Py.Cafe

marie-anne/

2025-figurefriday-w19

Vibecoding with Gemini 2.5

DocsPricing
  • Starting-instructions.py
  • app.py
  • data.csv
  • requirements.txt
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
import dash
from dash import dcc, html, Input, Output
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import re # For FRU date matching

# --- Step 0: Define Stage Order, Mappings, and Pass Criteria ---
ORIGINAL_PROCESS_STAGES_COLS = [
    'Defensive Driving',
    'Medical Clearance Form',
    'WAV Course',
    'Drug Test',
    'FRU Interview Scheduled',
    'Driver Exam'
]

FRU_PASS_LABEL = 'FRU: Cleared/N/A'
FRU_SCHEDULED_LABEL = 'FRU: Scheduled'
FRU_OTHER_LABEL = 'FRU: Other/Pending'

def map_fru_status(value):
    if pd.isna(value) or value is None:
        return FRU_OTHER_LABEL
    value_str = str(value).strip().lower()
    if value_str == 'not applicable' or value_str == 'completed':
        return FRU_PASS_LABEL
    if re.match(r'^\d{1,2}/\d{1,2}/\d{4}$', value_str):
        return FRU_SCHEDULED_LABEL
    return FRU_OTHER_LABEL

FRU_MAPPED_COL_NAME = 'FRU Interview Scheduled_Mapped'

PASS_CRITERIA = {
    'Defensive Driving': ['Complete'],
    'Medical Clearance Form': ['Complete', 'Not Applicable'],
    'WAV Course': ['Complete', 'Not Applicable'],
    'Drug Test': ['Complete', 'Not Applicable'],
    FRU_MAPPED_COL_NAME: [FRU_PASS_LABEL],
    'Driver Exam': ['Complete', 'Not Applicable']
}

SANKEY_STAGES_ORDER = [
    'Defensive Driving',
    'Medical Clearance Form',
    'WAV Course',
    'Drug Test',
    FRU_MAPPED_COL_NAME,
    'Driver Exam'
]

# --- Step 1: Load and Prepare Data ---
def generate_funnel_sample_data(num_rows=5000):
    np.random.seed(42)
    types = ['FHV Independent', 'FHV Base Affiliated', 'Medallion', 'SHL Permit', 'Commuter Van']
    data = {'Type': np.random.choice(types, num_rows)}
    data['Defensive Driving'] = np.random.choice(['Complete', 'Needed'], num_rows, p=[0.9, 0.1])
    data['Medical Clearance Form'] = np.random.choice(['Complete', 'Not Applicable', 'Needed'], num_rows, p=[0.85, 0.1, 0.05]) # Higher pass
    data['WAV Course'] = np.random.choice(['Complete', 'Not Applicable', 'Needed'], num_rows, p=[0.8, 0.1, 0.1])
    data['Drug Test'] = np.random.choice(['Complete', 'Not Applicable', 'Needed'], num_rows, p=[0.75, 0.1, 0.15])
    fru_raw_options = ['Not Applicable'] * 70 + ['4/1/2025'] * 20 + ['Pending Review'] * 10
    data['FRU Interview Scheduled'] = np.random.choice(fru_raw_options, num_rows)
    data['Driver Exam'] = np.random.choice(['Complete', 'Not Applicable', 'Needed'], num_rows, p=[0.7, 0.1, 0.2]) # Higher pass for more to reach final
    
    # Final Status - make sure 'Denied' and 'Approved' are common for those passing Driver Exam
    status_options = ['Denied', 'Approved - License Issued', 'Under Review', 'Incomplete', 'Pending Fitness Interview']
    # Skew probabilities for more interesting final outcomes for passed applications
    num_passed_approx = int(num_rows * 0.9 * 0.95 * 0.9 * 0.85 * 0.7 * 0.8) # Rough estimate
    final_statuses = []
    for _ in range(num_rows):
        # Simple logic: if exam is needed, more likely incomplete/denied. Otherwise, more spread.
        if data['Driver Exam'][_] == 'Needed':
             final_statuses.append(np.random.choice(['Denied', 'Incomplete'], p=[0.6, 0.4]))
        else:
             final_statuses.append(np.random.choice(status_options, p=[0.25, 0.4, 0.15, 0.15, 0.05])) # Higher chance of approved/denied
    data['Status'] = final_statuses
    
    df = pd.DataFrame(data)
    df['App Date'] = pd.to_datetime(np.random.choice(pd.date_range('2023-01-01', '2025-01-01'), num_rows))
    df['App y-m'] = df['App Date'].dt.to_period('M')
    cols_to_drop = ['App Date']
    if 'App No' in df.columns: cols_to_drop.append('App No')
    df = df.drop(columns=cols_to_drop, axis=1, errors='ignore') # Add errors='ignore' for robustness
    return df

try:
    df_global = pd.read_csv('data.csv')
    df_global['App Date'] = pd.to_datetime(df_global['App Date'])
    df_global['App y-m'] = df_global['App Date'].dt.to_period('M')
    cols_to_drop = ['App Date']
    if 'App No' in df_global.columns:
        cols_to_drop.append('App No')
    df_global = df_global.drop(columns=cols_to_drop, axis=1, errors='ignore')
    print("Successfully loaded and preprocessed 'data.csv'")
except FileNotFoundError:
    print("WARNING: 'data.csv' not found. Using generated sample data.")
    df_global = generate_funnel_sample_data()
except KeyError as e:
    print(f"WARNING: KeyError during data loading: {e}. Using generated sample data.")
    df_global = generate_funnel_sample_data()
except Exception as e:
    print(f"An unexpected error occurred during data loading: {e}. Using generated sample data.")
    df_global = generate_funnel_sample_data()

available_types = ['All Types'] + sorted(df_global['Type'].unique())

app = dash.Dash(__name__, external_stylesheets=['https://codepen.io/chriddyp/pen/bWLwgP.css'])
app.title = "TLC Application Funnel"

app.layout = html.Div([
    html.H1("NYC TLC Application Funnel" ,style={'margin': '2rem'}),
    html.Div([
        html.P("This Sankey diagram visualizes the funnel of TLC applications, showing how many 'pass' each sequential requirement stage."),
        html.P("Data is analyzed across all application dates (jan 1, 2025 - may 5, 2025), since only the application date is available. Use the dropdown to filter by Application Type."),
        html.P("Question: why do not all drivers who passed everything end up with status Approved?."),
        html.P("Answer: not all requirements are available in the dataset."),
    ], style={'margin': '2rem'}),
    html.Div([
        html.Div([
            html.Label("Select Application Type:"),
            dcc.Dropdown(
                id='type-dropdown',
                options=[{'label': t, 'value': t} for t in available_types],
                value='All Types',
                clearable=False
            )
        ], className="six columns offset-by-three"),
    ], className="row", style={'margin': '2rem'}),
    html.Div(id='sankey-title', style={'textAlign': 'center', 'fontSize': '20px', 'marginBottom': '10px'}),
    dcc.Loading(
        id="loading-sankey",
        type="circle",
        children=dcc.Graph(id='sankey-diagram', style={'height': '40vh', 'margin':'2rem'})
    )
])

@app.callback(
    [Output('sankey-diagram', 'figure'),
     Output('sankey-title', 'children')],
    [Input('type-dropdown', 'value')]
)
def update_funnel_sankey(selected_type):
    current_df_filtered_by_type = df_global.copy()
    type_display = selected_type
    if selected_type != 'All Types':
        current_df_filtered_by_type = current_df_filtered_by_type[current_df_filtered_by_type['Type'] == selected_type]
    else:
        type_display = "All Application Types"

    if current_df_filtered_by_type.empty:
        fig = go.Figure()
        fig.update_layout(
            annotations=[{'text': f"No data available for Type: {type_display}.", 'xref': "paper", 'yref': "paper", 'showarrow': False, 'font': {'size': 16}}]
        )
        return fig, f"Application Funnel for {type_display} (0 Applications)"

    df_for_funnel = current_df_filtered_by_type.copy()
    df_for_funnel.loc[:, FRU_MAPPED_COL_NAME] = df_for_funnel['FRU Interview Scheduled'].apply(map_fru_status)
    
    sankey_title_text = f"Application Funnel for {type_display}"

    labels = []
    source_indices = []
    target_indices = []
    values = []
    node_colors = []
    node_map = {}

    def get_node_index(label, color=None):
        if label not in node_map:
            node_map[label] = len(labels)
            labels.append(label)
            default_color = 'rgba(200,200,200,0.8)'
            if "Passed:" in label: default_color = 'rgba(188,218,188,0.8)'
            node_colors.append(color if color else default_color)
        return node_map[label]

    initial_count = len(df_for_funnel)
    initial_node_label = f"Applications Started ({initial_count})"
    previous_node_idx = get_node_index(initial_node_label, 'rgba(100,100,100,0.8)')
    
    df_passing_stages = df_for_funnel.copy()

    for i, stage_col_name_in_sankey in enumerate(SANKEY_STAGES_ORDER):
        display_stage_name = "FRU Interview Scheduled" if stage_col_name_in_sankey == FRU_MAPPED_COL_NAME else stage_col_name_in_sankey
        pass_values_for_stage = PASS_CRITERIA[stage_col_name_in_sankey]
        df_passing_stages = df_passing_stages[df_passing_stages[stage_col_name_in_sankey].isin(pass_values_for_stage)]
        passed_count = len(df_passing_stages)
        
        target_label = f"Passed: {display_stage_name} ({passed_count})"
        target_node_idx = get_node_index(target_label)
        
        source_indices.append(previous_node_idx)
        target_indices.append(target_node_idx)
        values.append(passed_count)
        previous_node_idx = target_node_idx
        
        if passed_count == 0: break 

    status_node_colors_map = {
        "Approved - License Issued": 'rgba(44, 160, 44, 0.8)',
        "Denied": 'rgba(214, 39, 40, 0.8)',
        "Incomplete": 'rgba(255, 127, 14, 0.8)',
        "Under Review": 'rgba(148, 103, 189, 0.8)',
        "Pending Fitness Interview": 'rgba(23, 190, 207, 0.8)'
    }

    if not df_passing_stages.empty:
        # Explicitly handle "Denied" status for those who passed all stages
        denied_after_all_stages_df = df_passing_stages[df_passing_stages['Status'] == 'Denied']
        denied_count = len(denied_after_all_stages_df)
        if denied_count > 0:
            target_label_denied = f"Final Status: Denied ({denied_count})"
            target_node_idx_denied = get_node_index(target_label_denied, color=status_node_colors_map.get('Denied'))
            source_indices.append(previous_node_idx)
            target_indices.append(target_node_idx_denied)
            values.append(denied_count)

        # Handle other non-Denied final statuses for those who passed all stages
        other_final_statuses_df = df_passing_stages[df_passing_stages['Status'] != 'Denied']
        if not other_final_statuses_df.empty:
            other_status_counts = other_final_statuses_df['Status'].value_counts().reset_index()
            other_status_counts.columns = ['status_val', 'count']
            for _, row in other_status_counts.iterrows():
                final_status_val = row['status_val']
                count = row['count']
                target_label_other = f"Final Status: {final_status_val} ({count})"
                target_node_idx_other = get_node_index(target_label_other, color=status_node_colors_map.get(final_status_val))
                source_indices.append(previous_node_idx)
                target_indices.append(target_node_idx_other)
                values.append(count)

    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=25, thickness=20, line=dict(color="black", width=0.5),
            label=labels, color=node_colors, customdata=labels,
            hovertemplate='<b>%{customdata}</b><extra></extra>'
        ),
        link=dict(
            source=source_indices, target=target_indices, value=values,
            hovertemplate='Applicants passing: %{value}<extra></extra>'
        ),
        arrangement='snap', orientation='h'
    )])
    
    fig.update_layout(
        font_size=10, margin=dict(l=20, r=20, t=35, b=20),
        title=dict(text=f"Application Funnel - Started: {initial_count}", x=0.5, y=0.98, xanchor='center', yanchor='top', font=dict(size=14))
    )
    
    return fig, sankey_title_text

if __name__ == '__main__':
    app.run_server(debug=True)