PyCafe - Vizro - 24 25 netflix

24H2_Netflix Report_Movies.csv
24H2_Netflix Report_TV.csv
25_H1 Netflix Report_Movies.csv
25_H1 Netflix Report_Shows.csv
app.py
requirements.txt
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
import pandas as pd
import vizro.plotly.express as px
from vizro import Vizro
import vizro.models as vm
import vizro.tables as vt # Import vizro.tables

# --- 1. Data Loading and Preprocessing ---

# The names of the files uploaded from the Excel sheets have been updated
# New files are:
SHOWS_24H2_FILE = "24H2_Netflix Report_TV.csv"
MOVIES_24H2_FILE = "24H2_Netflix Report_Movies.csv"
SHOWS_25H1_FILE = "25_H1 Netflix Report_Shows.csv"
MOVIES_25H1_FILE = "25_H1 Netflix Report_Movies.csv"


# Load dataframes
try:
    # Adding thousands separator and decimal handling for robust loading of data like '313,000,000'
    
    # Load individual dataframes and tag them with the time period (NEW LOGIC)
    df_shows_24h2 = pd.read_csv(SHOWS_24H2_FILE, thousands=',')
    df_shows_24h2['Time_Period'] = '2024 H2'
    
    df_shows_25h1 = pd.read_csv(SHOWS_25H1_FILE, thousands=',')
    df_shows_25h1['Time_Period'] = '2025 H1'
    
    df_movies_24h2 = pd.read_csv(MOVIES_24H2_FILE, thousands=',')
    df_movies_24h2['Time_Period'] = '2024 H2'
    
    df_movies_25h1 = pd.read_csv(MOVIES_25H1_FILE, thousands=',')
    df_movies_25h1['Time_Period'] = '2025 H1'

    # CONCATENATE dataframes to ensure all data is loaded
    df_shows = pd.concat([df_shows_24h2, df_shows_25h1], ignore_index=True)
    df_movies = pd.concat([df_movies_24h2, df_movies_25h1], ignore_index=True)
    
except FileNotFoundError:
    print("Error: One or more data files were not found. Please ensure the file paths are correct.")
    # Create empty dataframes to prevent crash in the vizro part
    df_shows = pd.DataFrame(columns=['Title', 'Available Globally?', 'Release Date', 'Hours Viewed', 'Runtime', 'Views', 'Time_Period'])
    df_movies = pd.DataFrame(columns=['Title', 'Available Globally?', 'Release Date', 'Hours Viewed', 'Runtime', 'Views', 'Time_Period'])

def preprocess_data(df):
    """Cleans and transforms raw data for visualization."""
    
    # *** FIX: Standardize the original column names regardless of case ***
    cols_to_standardize = {
        "available globally?": "Available Globally?",
        "release date": "Release Date",
    }
    
    for original_col in list(df.columns):
        # Strip and lower the column name for case-insensitive matching
        col_lower = original_col.strip().lower()
        
        # Check if the column is one we need to standardize
        if col_lower in cols_to_standardize:
            desired_name = cols_to_standardize[col_lower]
            if original_col != desired_name:
                df.rename(columns={original_col: desired_name}, inplace=True)

    # --- Preprocessing that requires 'Release Date' ---
    # Convert 'Release Date' to datetime, coercing errors (e.g., if a date is missing)
    if 'Release Date' in df.columns:
        df['Release Date'] = pd.to_datetime(df['Release Date'], errors='coerce')
        df['Release Month'] = df['Release Date'].dt.to_period('M').astype(str).fillna('TBD')
    else:
        # Prevent downstream KeyError if 'Release Date' is still missing
        df['Release Month'] = 'TBD' 

    # Convert H:MM runtime string to total hours (float)
    def runtime_to_hours(runtime_str):
        if pd.isna(runtime_str):
            return 0
        try:
            # Handle cases where runtime might be a single number (e.g., if it was parsed as int/float)
            if isinstance(runtime_str, (int, float)):
                return runtime_str
            # Use split(':') only if it's a string, otherwise it may raise an error on float/int
            if isinstance(runtime_str, str):
                h, m = map(int, str(runtime_str).split(':'))
                return round(h + m / 60, 2)
            else: # If it's not a string and not NaN, treat it as hours already
                 return float(runtime_str)
        except:
            return 0

    df['Runtime (Hours)'] = df['Runtime'].apply(runtime_to_hours)

    # Standardize column names for easier use in Vizro/Plotly
    df.rename(columns={
        "Available Globally?": "Available_Globally",
        "Hours Viewed": "Hours_Viewed",
    }, inplace=True)

    # *** FIX: Fill NaN/Null values in the filtering column to provide values for the dropdown ***
    if 'Available_Globally' in df.columns:
        # Fills NaN values with 'UNKNOWN' so the filter can create a valid option
        df['Available_Globally'] = df['Available_Globally'].fillna('UNKNOWN')

    # Ensure Hours_Viewed and Views are numeric (needed especially after using thousands=',')
    for col in ['Hours_Viewed', 'Views']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    return df

df_shows = preprocess_data(df_shows)
df_movies = preprocess_data(df_movies)

# Prepare data for specific charts (Top N lists are often best calculated pre-chart)
shows_top_10 = df_shows.sort_values("Hours_Viewed", ascending=False).head(10).reset_index(drop=True)
movies_top_10 = df_movies.sort_values("Views", ascending=False).head(10).reset_index(drop=True)

# --- 2. Vizro Components (Charts and Tables) ---

# --- Shows Components ---
shows_top_hours_chart = vm.Graph(
    title="Top 10 Shows by Hours Viewed",
    header="Shows are ranked by their total **Hours Viewed** in the second half of 2024. This metric is the primary indicator of content consumption.",
    footer="The top performers, contributing significantly to overall viewing hours.",
    figure=px.bar(
        shows_top_10,
        x="Title",
        y="Hours_Viewed",
        color="Available_Globally",
        template="vizro_dark",
        labels={"Hours_Viewed": "Hours Viewed (Millions)", "Available_Globally": "Global Release"}
    )
)

shows_month_hours_chart = vm.Graph(
    title="Total Hours Viewed by Release Month",
    header="Analysis of total hours viewed grouped by the original content **release month** (H2 2024).",
    footer="Viewing hours show .",
    figure=px.bar(
        df_shows.groupby("Release Month", as_index=False)["Hours_Viewed"].sum().sort_values("Release Month"),
        x="Release Month",
        y="Hours_Viewed",
        template="vizro_dark",
        labels={"Hours_Viewed": "Total Hours Viewed (Millions)"}
    )
)

shows_table = vm.Table(
    id="shows_data_table",
    title="All TV Shows Data (2024 H2 and 2025 H1)",
    # ADDED Time_Period to the table columns
    figure=vt.dash_data_table(data_frame=df_shows[['Title', 'Time_Period', 'Release Date', 'Available_Globally', 'Hours_Viewed', 'Views', 'Runtime (Hours)']])
)

# --- Movies Components ---
movies_top_views_chart = vm.Graph(
    title="Top 10 Movies by Views",
    header="Movies are ranked by their total **Views** in the second half of 2024. This indicates overall audience reach and popularity.",
    footer="The movie 'Back in Action' dominated H2 2024 in terms of views, signaling a high-demand title with strong initial audience pull.",
    figure=px.bar(
        movies_top_10,
        x="Title",
        y="Views",
        color="Available_Globally",
        template="vizro_dark",
        labels={"Views": "Views (Millions)", "Available_Globally": "Global Release"}
    )
)

movies_avg_runtime_chart = vm.Graph(
    title="Average Runtime by Release Month",
    header="The average runtime (in hours) for movies released in each month of H2 2024.",
    footer="The consistency in average runtime suggests a controlled release strategy, with no dramatic variation in film length across the half-year.",
    figure=px.bar(
        df_movies.groupby("Release Month", as_index=False)["Runtime (Hours)"].mean().sort_values("Release Month"),
        x="Release Month",
        y="Runtime (Hours)",
        template="vizro_dark",
        labels={"Runtime (Hours)": "Avg Runtime (Hours)"}
    )
)

movies_table = vm.Table(
    id="movies_data_table",
    title="All Movies Data (2024 H2 and 2025 H1)",
    # ADDED Time_Period to the table columns
    figure=vt.dash_data_table(data_frame=df_movies[['Title', 'Time_Period', 'Release Date', 'Available_Globally', 'Hours_Viewed', 'Views', 'Runtime (Hours)']])
)

# --- 3. Vizro Pages and Dashboard ---

shows_page = vm.Page(
    title="TV Shows Analysis",
    layout=vm.Layout(grid=[[0, 1], [2, 2]]),
    components=[
        shows_top_hours_chart,
        shows_month_hours_chart,
        shows_table,
    ],
    controls=[
        # NEW: Filter by Time Period
        vm.Filter(column="Time_Period", selector=vm.Dropdown(title="Time Period")),
        # Existing Filters
        vm.Filter(column="Available_Globally", selector=vm.Dropdown(title="Global Availability")),
        vm.Filter(column="Release Date"),
    ],
)

movies_page = vm.Page(
    title="Movies Analysis",
    layout=vm.Layout(grid=[[0, 1], [2, 2]]),
    components=[
        movies_top_views_chart,
        movies_avg_runtime_chart,
        movies_table,
    ],
    controls=[
        # NEW: Filter by Time Period
        vm.Filter(column="Time_Period", selector=vm.Dropdown(title="Time Period")),
        # Existing Filters
        vm.Filter(column="Available_Globally", selector=vm.Dropdown(title="Global Availability")),
        vm.Filter(column="Release Date"),
    ],
)

# Combine pages into the final dashboard
dashboard = vm.Dashboard(
    title="Netflix 2024 H2 and 2025 H1 Report Dashboard",
    pages=[shows_page, movies_page]
)

# Build and run the dashboard
Vizro().build(dashboard).run()
Py.Cafe

24 25 netflix dashboard

24 25 netflix