import dash
from dash import dcc, html, Input, Output
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import re # For FRU date matching
# --- Step 0: Define Stage Order, Mappings, and Pass Criteria ---
ORIGINAL_PROCESS_STAGES_COLS = [
'Defensive Driving',
'Medical Clearance Form',
'WAV Course',
'Drug Test',
'FRU Interview Scheduled',
'Driver Exam'
]
FRU_PASS_LABEL = 'FRU: Cleared/N/A'
FRU_SCHEDULED_LABEL = 'FRU: Scheduled'
FRU_OTHER_LABEL = 'FRU: Other/Pending'
def map_fru_status(value):
if pd.isna(value) or value is None:
return FRU_OTHER_LABEL
value_str = str(value).strip().lower()
if value_str == 'not applicable' or value_str == 'completed':
return FRU_PASS_LABEL
if re.match(r'^\d{1,2}/\d{1,2}/\d{4}$', value_str):
return FRU_SCHEDULED_LABEL
return FRU_OTHER_LABEL
FRU_MAPPED_COL_NAME = 'FRU Interview Scheduled_Mapped'
PASS_CRITERIA = {
'Defensive Driving': ['Complete'],
'Medical Clearance Form': ['Complete', 'Not Applicable'],
'WAV Course': ['Complete', 'Not Applicable'],
'Drug Test': ['Complete', 'Not Applicable'],
FRU_MAPPED_COL_NAME: [FRU_PASS_LABEL],
'Driver Exam': ['Complete', 'Not Applicable']
}
SANKEY_STAGES_ORDER = [
'Defensive Driving',
'Medical Clearance Form',
'WAV Course',
'Drug Test',
FRU_MAPPED_COL_NAME,
'Driver Exam'
]
# --- Step 1: Load and Prepare Data ---
def generate_funnel_sample_data(num_rows=5000):
np.random.seed(42)
types = ['FHV Independent', 'FHV Base Affiliated', 'Medallion', 'SHL Permit', 'Commuter Van']
data = {'Type': np.random.choice(types, num_rows)}
data['Defensive Driving'] = np.random.choice(['Complete', 'Needed'], num_rows, p=[0.9, 0.1])
data['Medical Clearance Form'] = np.random.choice(['Complete', 'Not Applicable', 'Needed'], num_rows, p=[0.85, 0.1, 0.05]) # Higher pass
data['WAV Course'] = np.random.choice(['Complete', 'Not Applicable', 'Needed'], num_rows, p=[0.8, 0.1, 0.1])
data['Drug Test'] = np.random.choice(['Complete', 'Not Applicable', 'Needed'], num_rows, p=[0.75, 0.1, 0.15])
fru_raw_options = ['Not Applicable'] * 70 + ['4/1/2025'] * 20 + ['Pending Review'] * 10
data['FRU Interview Scheduled'] = np.random.choice(fru_raw_options, num_rows)
data['Driver Exam'] = np.random.choice(['Complete', 'Not Applicable', 'Needed'], num_rows, p=[0.7, 0.1, 0.2]) # Higher pass for more to reach final
# Final Status - make sure 'Denied' and 'Approved' are common for those passing Driver Exam
status_options = ['Denied', 'Approved - License Issued', 'Under Review', 'Incomplete', 'Pending Fitness Interview']
# Skew probabilities for more interesting final outcomes for passed applications
num_passed_approx = int(num_rows * 0.9 * 0.95 * 0.9 * 0.85 * 0.7 * 0.8) # Rough estimate
final_statuses = []
for _ in range(num_rows):
# Simple logic: if exam is needed, more likely incomplete/denied. Otherwise, more spread.
if data['Driver Exam'][_] == 'Needed':
final_statuses.append(np.random.choice(['Denied', 'Incomplete'], p=[0.6, 0.4]))
else:
final_statuses.append(np.random.choice(status_options, p=[0.25, 0.4, 0.15, 0.15, 0.05])) # Higher chance of approved/denied
data['Status'] = final_statuses
df = pd.DataFrame(data)
df['App Date'] = pd.to_datetime(np.random.choice(pd.date_range('2023-01-01', '2025-01-01'), num_rows))
df['App y-m'] = df['App Date'].dt.to_period('M')
cols_to_drop = ['App Date']
if 'App No' in df.columns: cols_to_drop.append('App No')
df = df.drop(columns=cols_to_drop, axis=1, errors='ignore') # Add errors='ignore' for robustness
return df
try:
df_global = pd.read_csv('data.csv')
df_global['App Date'] = pd.to_datetime(df_global['App Date'])
df_global['App y-m'] = df_global['App Date'].dt.to_period('M')
cols_to_drop = ['App Date']
if 'App No' in df_global.columns:
cols_to_drop.append('App No')
df_global = df_global.drop(columns=cols_to_drop, axis=1, errors='ignore')
print("Successfully loaded and preprocessed 'data.csv'")
except FileNotFoundError:
print("WARNING: 'data.csv' not found. Using generated sample data.")
df_global = generate_funnel_sample_data()
except KeyError as e:
print(f"WARNING: KeyError during data loading: {e}. Using generated sample data.")
df_global = generate_funnel_sample_data()
except Exception as e:
print(f"An unexpected error occurred during data loading: {e}. Using generated sample data.")
df_global = generate_funnel_sample_data()
available_types = ['All Types'] + sorted(df_global['Type'].unique())
app = dash.Dash(__name__, external_stylesheets=['https://codepen.io/chriddyp/pen/bWLwgP.css'])
app.title = "TLC Application Funnel"
app.layout = html.Div([
html.H1("NYC TLC Application Funnel" ,style={'margin': '2rem'}),
html.Div([
html.P("This Sankey diagram visualizes the funnel of TLC applications, showing how many 'pass' each sequential requirement stage."),
html.P("Data is analyzed across all application dates (jan 1, 2025 - may 5, 2025), since only the application date is available. Use the dropdown to filter by Application Type."),
html.P("Question: why do not all drivers who passed everything end up with status Approved?."),
html.P("Answer: not all requirements are available in the dataset."),
], style={'margin': '2rem'}),
html.Div([
html.Div([
html.Label("Select Application Type:"),
dcc.Dropdown(
id='type-dropdown',
options=[{'label': t, 'value': t} for t in available_types],
value='All Types',
clearable=False
)
], className="six columns offset-by-three"),
], className="row", style={'margin': '2rem'}),
html.Div(id='sankey-title', style={'textAlign': 'center', 'fontSize': '20px', 'marginBottom': '10px'}),
dcc.Loading(
id="loading-sankey",
type="circle",
children=dcc.Graph(id='sankey-diagram', style={'height': '40vh', 'margin':'2rem'})
)
])
@app.callback(
[Output('sankey-diagram', 'figure'),
Output('sankey-title', 'children')],
[Input('type-dropdown', 'value')]
)
def update_funnel_sankey(selected_type):
current_df_filtered_by_type = df_global.copy()
type_display = selected_type
if selected_type != 'All Types':
current_df_filtered_by_type = current_df_filtered_by_type[current_df_filtered_by_type['Type'] == selected_type]
else:
type_display = "All Application Types"
if current_df_filtered_by_type.empty:
fig = go.Figure()
fig.update_layout(
annotations=[{'text': f"No data available for Type: {type_display}.", 'xref': "paper", 'yref': "paper", 'showarrow': False, 'font': {'size': 16}}]
)
return fig, f"Application Funnel for {type_display} (0 Applications)"
df_for_funnel = current_df_filtered_by_type.copy()
df_for_funnel.loc[:, FRU_MAPPED_COL_NAME] = df_for_funnel['FRU Interview Scheduled'].apply(map_fru_status)
sankey_title_text = f"Application Funnel for {type_display}"
labels = []
source_indices = []
target_indices = []
values = []
node_colors = []
node_map = {}
def get_node_index(label, color=None):
if label not in node_map:
node_map[label] = len(labels)
labels.append(label)
default_color = 'rgba(200,200,200,0.8)'
if "Passed:" in label: default_color = 'rgba(188,218,188,0.8)'
node_colors.append(color if color else default_color)
return node_map[label]
initial_count = len(df_for_funnel)
initial_node_label = f"Applications Started ({initial_count})"
previous_node_idx = get_node_index(initial_node_label, 'rgba(100,100,100,0.8)')
df_passing_stages = df_for_funnel.copy()
for i, stage_col_name_in_sankey in enumerate(SANKEY_STAGES_ORDER):
display_stage_name = "FRU Interview Scheduled" if stage_col_name_in_sankey == FRU_MAPPED_COL_NAME else stage_col_name_in_sankey
pass_values_for_stage = PASS_CRITERIA[stage_col_name_in_sankey]
df_passing_stages = df_passing_stages[df_passing_stages[stage_col_name_in_sankey].isin(pass_values_for_stage)]
passed_count = len(df_passing_stages)
target_label = f"Passed: {display_stage_name} ({passed_count})"
target_node_idx = get_node_index(target_label)
source_indices.append(previous_node_idx)
target_indices.append(target_node_idx)
values.append(passed_count)
previous_node_idx = target_node_idx
if passed_count == 0: break
status_node_colors_map = {
"Approved - License Issued": 'rgba(44, 160, 44, 0.8)',
"Denied": 'rgba(214, 39, 40, 0.8)',
"Incomplete": 'rgba(255, 127, 14, 0.8)',
"Under Review": 'rgba(148, 103, 189, 0.8)',
"Pending Fitness Interview": 'rgba(23, 190, 207, 0.8)'
}
if not df_passing_stages.empty:
# Explicitly handle "Denied" status for those who passed all stages
denied_after_all_stages_df = df_passing_stages[df_passing_stages['Status'] == 'Denied']
denied_count = len(denied_after_all_stages_df)
if denied_count > 0:
target_label_denied = f"Final Status: Denied ({denied_count})"
target_node_idx_denied = get_node_index(target_label_denied, color=status_node_colors_map.get('Denied'))
source_indices.append(previous_node_idx)
target_indices.append(target_node_idx_denied)
values.append(denied_count)
# Handle other non-Denied final statuses for those who passed all stages
other_final_statuses_df = df_passing_stages[df_passing_stages['Status'] != 'Denied']
if not other_final_statuses_df.empty:
other_status_counts = other_final_statuses_df['Status'].value_counts().reset_index()
other_status_counts.columns = ['status_val', 'count']
for _, row in other_status_counts.iterrows():
final_status_val = row['status_val']
count = row['count']
target_label_other = f"Final Status: {final_status_val} ({count})"
target_node_idx_other = get_node_index(target_label_other, color=status_node_colors_map.get(final_status_val))
source_indices.append(previous_node_idx)
target_indices.append(target_node_idx_other)
values.append(count)
fig = go.Figure(data=[go.Sankey(
node=dict(
pad=25, thickness=20, line=dict(color="black", width=0.5),
label=labels, color=node_colors, customdata=labels,
hovertemplate='<b>%{customdata}</b><extra></extra>'
),
link=dict(
source=source_indices, target=target_indices, value=values,
hovertemplate='Applicants passing: %{value}<extra></extra>'
),
arrangement='snap', orientation='h'
)])
fig.update_layout(
font_size=10, margin=dict(l=20, r=20, t=35, b=20),
title=dict(text=f"Application Funnel - Started: {initial_count}", x=0.5, y=0.98, xanchor='center', yanchor='top', font=dict(size=14))
)
return fig, sankey_title_text
if __name__ == '__main__':
app.run_server(debug=True)