import pandas as pd
import vizro.plotly.express as px
from vizro import Vizro
import vizro.models as vm
import vizro.tables as vt # Import vizro.tables
# --- 1. Data Loading and Preprocessing ---
# CSV exports of the two report sheets; paths are relative to the working directory.
SHOWS_FILE = "24H2_Netflix Report_TV.csv"
MOVIES_FILE = "24H2_Netflix Report_Movies.csv"

# Columns used to build an empty stand-in frame when a file is missing.
REPORT_COLUMNS = ['Title', 'Available Globally?', 'Release Date', 'Hours Viewed', 'Runtime', 'Views']


def load_report(path):
    """Read one report CSV, falling back to an empty frame if the file is missing.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The parsed DataFrame, or an empty DataFrame with ``REPORT_COLUMNS`` as
        its columns when ``path`` does not exist.
    """
    try:
        # thousands=',' lets values such as '313,000,000' load as numbers.
        return pd.read_csv(path, thousands=',')
    except FileNotFoundError:
        print(f"Error: data file '{path}' was not found. Please ensure the file path is correct.")
        # Keep the expected columns so the preprocessing step can still look them up.
        return pd.DataFrame(columns=REPORT_COLUMNS)


# Each file is loaded independently so one missing file does not discard the other.
df_shows = load_report(SHOWS_FILE)
df_movies = load_report(MOVIES_FILE)
def preprocess_data(df):
    """Clean and transform a raw report frame for visualization.

    The frame is modified in place and also returned.

    Adds 'Release Month' ('YYYY-MM', or 'TBD' when the release date is missing
    or cannot be parsed) and 'Runtime (Hours)', renames 'Available Globally?'
    and 'Hours Viewed' to underscore names, and coerces 'Hours_Viewed' and
    'Views' to numbers (values that cannot be converted become 0).

    Args:
        df: DataFrame containing at least 'Release Date' and 'Runtime' columns.

    Returns:
        The same DataFrame object after the transformations above.
    """

    def runtime_to_hours(runtime_str):
        """Convert an 'H:MM' runtime to fractional hours; 0 when missing or malformed."""
        if pd.isna(runtime_str):
            return 0
        # Values that were already parsed as numbers are passed through unchanged.
        if isinstance(runtime_str, (int, float)):
            return runtime_str
        try:
            h, m = map(int, str(runtime_str).split(':'))
        except (ValueError, TypeError):
            # Not exactly two integer fields separated by ':'.
            return 0
        return round(h + m / 60, 2)

    # Missing or unparseable dates become NaT instead of raising.
    df['Release Date'] = pd.to_datetime(df['Release Date'], errors='coerce')
    release_dates = df['Release Date']
    # Stringifying the period column renders NaT as text, so fillna() would never
    # match it; mask on the datetime column instead to get the 'TBD' bucket.
    month_labels = release_dates.dt.to_period('M').astype(str)
    df['Release Month'] = month_labels.where(release_dates.notna(), 'TBD')

    df['Runtime (Hours)'] = df['Runtime'].apply(runtime_to_hours)

    # Standardize column names for easier use in Vizro/Plotly
    df.rename(columns={
        "Available Globally?": "Available_Globally",
        "Hours Viewed": "Hours_Viewed",
    }, inplace=True)

    # Ensure Hours_Viewed and Views are numeric; anything unconvertible becomes 0.
    for col in ['Hours_Viewed', 'Views']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
    return df
# Run both raw frames through the shared cleaning step (shows first, then movies).
df_shows, df_movies = (preprocess_data(frame) for frame in (df_shows, df_movies))

# Top-10 rankings are computed once here and handed to the charts below.
shows_top_10 = (
    df_shows
    .sort_values(by="Hours_Viewed", ascending=False)
    .iloc[:10]
    .reset_index(drop=True)
)
movies_top_10 = (
    df_movies
    .sort_values(by="Views", ascending=False)
    .iloc[:10]
    .reset_index(drop=True)
)
# --- 2. Vizro Components (Charts and Tables) ---
# --- Shows Components ---
# Bar chart of the ten shows with the most hours viewed, coloured by global availability.
# NOTE(review): the axis label says "(Millions)" but the plotted column is not
# rescaled in this block — confirm the unit against the data.
shows_top_hours_chart = vm.Graph(
    title="Top 10 Shows by Hours Viewed",
    header="Shows are ranked by their total **Hours Viewed** in the second half of 2024. This metric is the primary indicator of content consumption.",
    footer="The top performers, contributing significantly to overall viewing hours.",
    figure=px.bar(
        data_frame=shows_top_10,
        x="Title",
        y="Hours_Viewed",
        color="Available_Globally",
        labels=dict(
            Hours_Viewed="Hours Viewed (Millions)",
            Available_Globally="Global Release",
        ),
        template="vizro_dark",
    ),
)
# Bar chart of hours viewed summed per release month.
# NOTE(review): the figure is built from a pre-aggregated frame that only has
# 'Release Month' and 'Hours_Viewed', so the page filters on 'Available_Globally'
# and 'Release Date' presumably cannot act on it — verify against Vizro's
# filter targeting rules.
shows_month_hours_chart = vm.Graph(
    title="Total Hours Viewed by Release Month",
    header="Analysis of total hours viewed grouped by the original content **release month** (H2 2024).",
    # The previous footer was a truncated sentence ("Viewing hours show .");
    # this one only states what the chart computes.
    footer="Each bar sums the hours viewed for all shows released in that month.",
    figure=px.bar(
        df_shows.groupby("Release Month", as_index=False)["Hours_Viewed"].sum().sort_values("Release Month"),
        x="Release Month",
        y="Hours_Viewed",
        template="vizro_dark",
        labels={"Hours_Viewed": "Total Hours Viewed (Millions)"},
    ),
)
# Table listing every show with the columns relevant to the dashboard.
shows_table = vm.Table(
    id="shows_data_table",
    title="All 2024 H2 Shows Data",
    figure=vt.dash_data_table(
        data_frame=df_shows.loc[
            :,
            ['Title', 'Release Date', 'Available_Globally', 'Hours_Viewed', 'Views', 'Runtime (Hours)'],
        ],
    ),
)
# --- Movies Components ---
# Bar chart of the ten movies with the most views, coloured by global availability.
# NOTE(review): the footer hard-codes a conclusion about a specific title, and the
# axis label says "(Millions)" although the column is not rescaled in this block —
# confirm both against the loaded report.
movies_top_views_chart = vm.Graph(
    title="Top 10 Movies by Views",
    header="Movies are ranked by their total **Views** in the second half of 2024. This indicates overall audience reach and popularity.",
    footer="The movie 'Back in Action' dominated H2 2024 in terms of views, signaling a high-demand title with strong initial audience pull.",
    figure=px.bar(
        data_frame=movies_top_10,
        x="Title",
        y="Views",
        color="Available_Globally",
        labels=dict(
            Views="Views (Millions)",
            Available_Globally="Global Release",
        ),
        template="vizro_dark",
    ),
)
# Bar chart of the mean movie runtime (in hours) per release month.
# NOTE(review): the footer hard-codes a conclusion about the data; confirm it
# still matches the loaded report.
movies_avg_runtime_chart = vm.Graph(
    title="Average Runtime by Release Month",
    header="The average runtime (in hours) for movies released in each month of H2 2024.",
    footer="The consistency in average runtime suggests a controlled release strategy, with no dramatic variation in film length across the half-year.",
    figure=px.bar(
        data_frame=(
            df_movies
            .groupby("Release Month", as_index=False)["Runtime (Hours)"]
            .mean()
            .sort_values(by="Release Month")
        ),
        x="Release Month",
        y="Runtime (Hours)",
        labels=dict([("Runtime (Hours)", "Avg Runtime (Hours)")]),
        template="vizro_dark",
    ),
)
# Table listing every movie with the columns relevant to the dashboard.
movies_table = vm.Table(
    id="movies_data_table",
    title="All H2 2024 Movies Data",
    figure=vt.dash_data_table(
        data_frame=df_movies.loc[
            :,
            ['Title', 'Release Date', 'Available_Globally', 'Hours_Viewed', 'Views', 'Runtime (Hours)'],
        ],
    ),
)
# --- 3. Vizro Pages and Dashboard ---
def _analysis_page(title, components):
    """Build a page with two charts on top, a full-width table below, and two filters.

    Args:
        title: Page title.
        components: Three components in order: left chart, right chart, table.

    Returns:
        The configured ``vm.Page``.
    """
    return vm.Page(
        title=title,
        # Grid cells hold component indices: 0 and 1 share the first row, 2 spans the second.
        layout=vm.Layout(grid=[[0, 1], [2, 2]]),
        components=components,
        controls=[
            vm.Filter(column="Available_Globally", selector=vm.Dropdown(title="Global Availability")),
            # No selector is given here, leaving the choice of date selector to Vizro.
            vm.Filter(column="Release Date"),
        ],
    )


shows_page = _analysis_page(
    "TV Shows Analysis",
    [
        shows_top_hours_chart,
        shows_month_hours_chart,
        shows_table,
    ],
)
movies_page = _analysis_page(
    "Movies Analysis",
    [
        movies_top_views_chart,
        movies_avg_runtime_chart,
        movies_table,
    ],
)
# Combine pages into the final dashboard
dashboard = vm.Dashboard(
    title="Netflix 2024 H2 Report Dashboard",
    pages=[shows_page, movies_page],
)

# Build once at module level so the built app is available as `app` (e.g. to an
# external server), but only start the built-in server when this file is
# executed directly rather than imported.
app = Vizro().build(dashboard)

if __name__ == "__main__":
    app.run()