import os, re, glob
import pandas as pd
import vizro.plotly.express as px
from vizro import Vizro
import vizro.models as vm
import vizro.tables as vt # vizro.tables
# =========================
# 1) AUTO-INGEST ALL CSVs
# =========================
DATA_DIRS = [".", "/mnt/data"] # scan both local and uploaded paths
GLOB_PAT = "*Netflix Report*.csv"
def infer_time_period_from_filename(fname: str) -> str:
"""
Matches: 23H1, 25_H1, 2024H2, etc. -> '2023 H1', '2025 H1', '2024 H2'
"""
m = re.search(r'(\d{2,4})[_ ]?H([12])', fname)
if not m:
return "Unknown"
year_token, half = m.groups()
year = int(year_token)
if year < 100: # e.g., "23" -> 2023
year += 2000
return f"{year} H{half}"
def load_csv(path: str) -> pd.DataFrame:
try:
return pd.read_csv(path, thousands=",")
except Exception:
return pd.DataFrame()
def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
"""
Robust coercions + guaranteed presence of required cols (prevents fillna crash).
"""
if df is None or df.empty:
return pd.DataFrame(columns=[
"Title","Time_Period","Release Date","Release Month",
"Available_Globally","Hours_Viewed","Views","Runtime (Hours)"
])
# Case-insensitive renames
ci = {c.lower(): c for c in df.columns}
def rename_ci(old, new):
if old in ci and ci[old] != new:
df.rename(columns={ci[old]: new}, inplace=True)
rename_ci("title", "Title")
rename_ci("release date", "Release Date")
rename_ci("available globally?", "Available Globally?")
rename_ci("hours viewed", "Hours Viewed")
rename_ci("runtime", "Runtime")
rename_ci("views", "Views")
# Dates -> Release Month
if "Release Date" in df.columns:
df["Release Date"] = pd.to_datetime(df["Release Date"], errors="coerce")
df["Release Month"] = df["Release Date"].dt.to_period("M").astype(str).fillna("TBD")
else:
df["Release Month"] = pd.Series(["TBD"] * len(df), index=df.index)
# Runtime -> hours (float)
def to_hours(x):
if pd.isna(x): return 0.0
try:
if isinstance(x, (int, float)): return float(x)
s = str(x)
if ":" in s:
h, m = s.split(":")
return round(float(h) + float(m)/60.0, 2)
return float(s)
except Exception:
return 0.0
if "Runtime" in df.columns:
df["Runtime (Hours)"] = df["Runtime"].apply(to_hours)
elif "Runtime (Hours)" not in df.columns:
df["Runtime (Hours)"] = pd.Series([0.0] * len(df), index=df.index, dtype="float64")
# Availability
if "Available Globally?" in df.columns:
df.rename(columns={"Available Globally?": "Available_Globally"}, inplace=True)
if "Available_Globally" in df.columns:
df["Available_Globly_tmp"] = df["Available_Globally"].astype(str)
df["Available_Globally"] = df["Available_Globly_tmp"].replace({"": "UNKNOWN"}).fillna("UNKNOWN")
df.drop(columns=["Available_Globly_tmp"], inplace=True)
else:
df["Available_Globally"] = pd.Series(["UNKNOWN"] * len(df), index=df.index)
# Hours_Viewed
if "Hours Viewed" in df.columns:
hv = pd.to_numeric(df["Hours Viewed"].astype(str).str.replace(",", ""), errors="coerce")
df.drop(columns=["Hours Viewed"], inplace=True)
df["Hours_Viewed"] = hv.fillna(0)
elif "Hours_Viewed" in df.columns:
df["Hours_Viewed"] = pd.to_numeric(df["Hours_Viewed"], errors="coerce").fillna(0)
else:
df["Hours_Viewed"] = pd.Series([0] * len(df), index=df.index, dtype="float64")
# Views
if "Views" in df.columns:
df["Views"] = pd.to_numeric(df["Views"].astype(str).str.replace(",", ""), errors="coerce").fillna(0)
else:
df["Views"] = pd.Series([0] * len(df), index=df.index, dtype="float64")
# Ensure required columns exist
for col in ["Title","Release Date","Release Month","Available_Globally","Hours_Viewed","Views","Runtime (Hours)"]:
if col not in df.columns:
df[col] = pd.Series(index=df.index, dtype="object")
return df
def detect_category_col(df: pd.DataFrame):
# flexible detector for Movies vs Shows
for c in ["Category","Type","Format","Content Type","Programme Category"]:
if c in df.columns: return c
lower = {c.lower(): c for c in df.columns}
for c in ["category","type","format","content type","programme category"]:
if c in lower: return lower[c]
return None
def normalize_cat(val: str) -> str:
s = ("" if pd.isna(val) else str(val)).strip().lower()
if any(k in s for k in ["movie","film","feature"]): return "Movies"
if any(k in s for k in ["show","series","tv"]): return "Shows"
return "Unknown"
def split_combined(df: pd.DataFrame):
col = detect_category_col(df)
if col is not None:
norm = df[col].map(normalize_cat)
else:
norm = pd.Series(["Unknown"] * len(df), index=df.index)
return (
df[norm.eq("Shows")].copy(),
df[norm.eq("Movies")].copy(),
df[norm.eq("Unknown")].copy(),
)
# Gather all matching files
paths = []
for d in DATA_DIRS:
if os.path.isdir(d):
paths.extend(glob.glob(os.path.join(d, GLOB_PAT)))
# Debug: which files did we find?
print("CSV files found:", sorted(set(paths)))
shows_list, movies_list, unknown_list = [], [], []
for path in sorted(set(paths)):
df_raw = load_csv(path)
if df_raw.empty:
continue
df_raw["Time_Period"] = infer_time_period_from_filename(os.path.basename(path))
df_raw = standardize_columns(df_raw)
fname = os.path.basename(path).lower()
if any(k in fname for k in ["_tv", "shows", "show"]):
shows_list.append(df_raw)
elif "movies" in fname or "movie" in fname or "film" in fname:
movies_list.append(df_raw)
else:
s, m, u = split_combined(df_raw)
if not s.empty: shows_list.append(s)
if not m.empty: movies_list.append(m)
if not u.empty: unknown_list.append(u)
# Final frames
df_shows = pd.concat(shows_list, ignore_index=True) if shows_list else pd.DataFrame()
df_movies = pd.concat(movies_list, ignore_index=True) if movies_list else pd.DataFrame()
df_unknown = pd.concat(unknown_list, ignore_index=True) if unknown_list else pd.DataFrame()
# Standardize (idempotent) and ensure required columns
df_shows = standardize_columns(df_shows)
df_movies = standardize_columns(df_movies)
df_unknown = standardize_columns(df_unknown)
# Span for titles
periods = sorted(set(
df_shows.get("Time_Period", pd.Series(dtype=object)).dropna().tolist() +
df_movies.get("Time_Period", pd.Series(dtype=object)).dropna().tolist()
))
print("Time periods detected:", periods)
span = f"{periods[0]} – {periods[-1]}" if periods else "All Periods"
# =========================
# 2) FIGURES (with empty-data fallback)
# =========================
def _empty_placeholder_table(title: str, message: str):
return vm.Table(
id=title.lower().replace(" ", "_") + "_placeholder",
title=title,
figure=vt.dash_data_table(data_frame=pd.DataFrame({"Info": [message]})),
)
no_data = df_shows.empty and df_movies.empty
if no_data:
# Minimal placeholder components so the app never renders blank
shows_top_hours_chart = _empty_placeholder_table(
"No Data Detected",
"Drop CSVs named like '23H1_Netflix Report.csv' or '24H2_Netflix Report_TV.csv' into '.' or '/mnt/data'."
)
shows_month_hours_chart = _empty_placeholder_table(
"How to name files",
"Pattern matched: *Netflix Report*.csv (combined or split)."
)
shows_table = _empty_placeholder_table(
"Debug",
"CSV files found: 0. Check console logs printed above."
)
movies_top_views_chart = _empty_placeholder_table("Movies", "Waiting for data…")
movies_avg_runtime_chart = _empty_placeholder_table("Movies (runtime)", "Waiting for data…")
movies_table = _empty_placeholder_table("Movies Data", "Waiting for data…")
unknown_table = _empty_placeholder_table("Unknown Category", "No rows (or no data).")
else:
shows_top10 = df_shows.sort_values("Hours_Viewed", ascending=False).head(10).reset_index(drop=True)
movies_top10 = df_movies.sort_values("Views", ascending=False).head(10).reset_index(drop=True)
shows_top_hours_chart = vm.Graph(
title="Top 10 Shows by Hours Viewed",
header="Shows ranked by **Hours Viewed**. Use filters to slice across time periods.",
footer="These titles contribute a large share of total viewing time for series content.",
figure=px.bar(
shows_top10, x="Title", y="Hours_Viewed", color="Available_Globally",
template="vizro_dark",
labels={"Hours_Viewed":"Hours Viewed (Millions)","Available_Globally":"Global Release"}
)
)
shows_month_hours_chart = vm.Graph(
title="Total Show Hours Viewed by Release Month",
header="Sum of **Hours Viewed** by **Release Month** for shows.",
figure=px.bar(
df_shows.groupby("Release Month", as_index=False)["Hours_Viewed"].sum().sort_values("Release Month"),
x="Release Month", y="Hours_Viewed", template="vizro_dark",
labels={"Hours_Viewed":"Total Hours Viewed (Millions)"}
)
)
shows_table = vm.Table(
id="shows_data_table",
title=f"All TV Shows Data ({span})",
figure=vt.dash_data_table(data_frame=df_shows[[
"Title","Time_Period","Release Date","Available_Globally","Hours_Viewed","Views","Runtime (Hours)"
]])
)
movies_top_views_chart = vm.Graph(
title="Top 10 Movies by Views",
header="Movies ranked by **Views**. Use filters to slice across time periods.",
figure=px.bar(
movies_top10, x="Title", y="Views", color="Available_Globally",
template="vizro_dark",
labels={"Views":"Views (Millions)","Available_Globally":"Global Release"}
)
)
movies_avg_runtime_chart = vm.Graph(
title="Average Movie Runtime by Release Month",
header="Average **Runtime (hours)** of movies by **Release Month**.",
figure=px.bar(
df_movies.groupby("Release Month", as_index=False)["Runtime (Hours)"].mean().sort_values("Release Month"),
x="Release Month", y="Runtime (Hours)", template="vizro_dark",
labels={"Runtime (Hours)":"Avg Runtime (Hours)"}
)
)
movies_table = vm.Table(
id="movies_data_table",
title=f"All Movies Data ({span})",
figure=vt.dash_data_table(data_frame=df_movies[[
"Title","Time_Period","Release Date","Available_Globally","Hours_Viewed","Views","Runtime (Hours)"
]])
)
unknown_table = vm.Table(
id="unknown_data_table",
title="Rows with Unknown Category (inspect source files / mappings)",
figure=vt.dash_data_table(
data_frame=(df_unknown[[
"Title","Time_Period","Release Date","Available_Globally","Hours_Viewed","Views","Runtime (Hours)"
]] if not df_unknown.empty else pd.DataFrame({"Info":["No unknown-category rows found."]}))
)
)
# =========================
# 3) PAGES & FILTERS (separate instances per page to avoid collisions)
# =========================
# Separate filter instances per page (reusing the same instance across pages can blank the UI)
time_period_filter_shows = vm.Filter(column="Time_Period", selector=vm.Dropdown(title="Time Period", multi=True))
availability_filter_shows = vm.Filter(column="Available_Globally", selector=vm.Dropdown(title="Global Availability", multi=True))
time_period_filter_movies = vm.Filter(column="Time_Period", selector=vm.Dropdown(title="Time Period", multi=True))
availability_filter_movies = vm.Filter(column="Available_Globally", selector=vm.Dropdown(title="Global Availability", multi=True))
# Home page at '/' so the app isn’t blank on load
shows_page = vm.Page(
title="TV Shows Analysis",
path="/", # root
layout=vm.Grid(grid=[[0, 1], [2, 2]]),
components=[shows_top_hours_chart, shows_month_hours_chart, shows_table],
controls=[time_period_filter_shows, availability_filter_shows, vm.Filter(column="Release Date")],
)
movies_page = vm.Page(
title="Movies Analysis",
path="/movies", # unique
layout=vm.Grid(grid=[[0, 1], [2, 2]]),
components=[movies_top_views_chart, movies_avg_runtime_chart, movies_table],
controls=[time_period_filter_movies, availability_filter_movies, vm.Filter(column="Release Date")],
)
quality_page = vm.Page(
title="Data Quality",
path="/dq", # unique
layout=vm.Grid(grid=[[0]]),
components=[unknown_table],
)
dashboard = vm.Dashboard(
title=f"Netflix Multi-Period Report Dashboard ({span})",
pages=[shows_page, movies_page, quality_page],
)
Vizro().build(dashboard).run()