import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2
import plotly.express as px
from dash import Dash, dcc, html, Input, Output
import dash_bootstrap_components as dbc
from dash.dash_table import DataTable
df = pd.read_csv('TLC_New_Driver_Application.csv')
# INITIAL EXPLORATION
def initial_exploration(df: pd.DataFrame) -> dict:
return {
'shape': df.shape,
'dtypes': df.dtypes.astype(str).to_dict(),
'missing_pct': (df.isna().mean()*100).round(2).to_dict(),
'stats': df.describe(include='all').T.to_dict(orient='index')
}
# CLEAN & PREPROCESS
def clean_preprocess(df: pd.DataFrame) -> pd.DataFrame:
df2 = df.copy()
num = df2.select_dtypes(include=np.number).columns
cat = df2.select_dtypes(include=['object', 'category']).columns
# Impute
df2[num] = SimpleImputer(strategy='median').fit_transform(df2[num])
df2[cat] = SimpleImputer(strategy='most_frequent').fit_transform(df2[cat])
# Outliers mask
mask = np.ones(len(df2), dtype=bool)
for c in num:
q1, q3 = df2[c].quantile([.25, .75])
iqr = q3 - q1
mask &= df2[c].between(q1 - 1.5*iqr, q3 + 1.5*iqr)
df2 = df2[mask]
# Scale
df2[num] = StandardScaler().fit_transform(df2[num])
# Encode except text
text = ['Other Requirements']
for c in cat:
if c in text:
continue
if df2[c].nunique() == 2:
df2[c] = LabelEncoder().fit_transform(df2[c])
else:
ohe = OneHotEncoder(
sparse_output=False,
drop='first'
)
arr = ohe.fit_transform(df2[[c]])
cols = [f"{c}_{v}" for v in ohe.categories_[0][1:]]
df2 = pd.concat(
[df2.reset_index(drop=True), pd.DataFrame(arr, columns=cols)],
axis=1
)
df2.drop(columns=[c], inplace=True)
return df2
# FEATURES
def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
d = df.copy()
d['year'] = d['App Date'].dt.year
d['quarter'] = d['App Date'].dt.quarter
d['month'] = d['App Date'].dt.month
d['day'] = d['App Date'].dt.day
d['dow'] = d['App Date'].dt.dayofweek
d['is_weekend'] = d['dow'].isin([5, 6]).astype(int)
# approval flag & monthly rate
d['flag'] = (d['Status']=='Approved - License Issued').astype(int)
rate = d.groupby('month')['flag'].mean().rename('mon_rate')
d = d.merge(rate, on='month')
num = d.select_dtypes(include=np.number).drop(columns['flag', 'mon_rate'])
if num.shape[1] > 10:
p = PCA(5).fit_transform(num)
sel = SelectKBest(chi2, k=min(10, num.shape[1]))
sel.fit_transform(np.abs(num), d['flag'])
keep = num.columns[sel.get_support()]
return d[list(keep) + ['flag', 'mon_rate']]
# INSIGHTS FUNCS
def acc_rate(df):
t = len(df)
return (df['Status'].eq('Approved - License Issued').sum()/t*100) if t else 0
def parse_rej(x):
if pd.isna(x):
return []
p = [i.strip().replace('needed','').strip() for i in x.split(';')]
return [i for i in p if i and i.lower() not in ['not applicable','open items']]
def top_rej(df, n=5):
r = df[df['Status']=='Denied']['Other Requirements'].apply(parse_rej).explode()
return r.value_counts().head(n)
# DASHBOARD
app = Dash(__name__, external_stylesheets=[dbc.themes.DARKLY])
server = app.server
# prep
if __name__ == '__main__':
# enforce datetime
df['App Date'] = pd.to_datetime(df['App Date'], errors='coerce')
if df['App Date'].isna().any():
raise ValueError("Some 'App Date' values could not be parsed as datetime.")
df_clean = clean_preprocess(df)
info=initial_exploration(df)
df = df.copy()
df['Month'] = df['App Date'].astype(str).str[:7]
# layout
app.layout = dbc.Container([
html.H1(
'TLC Dashboard',
className='text-center text-white my-2'
),
# filters
dbc.Row([
dbc.Col([
html.Label(
'Date Range',
className='text-white'
),
dcc.DatePickerRange(
id='date_rng',
start_date=df['App Date'].min(),
end_date = df['App Date'].max(),
display_format='YYYY-MM-DD'
)
], width=4),
dbc.Col([
html.Label(
'Status',
className='text-white'
),
dcc.Dropdown(
id='status_f',
options=[{'label':s,'value':s} for s in df['Status'].unique()],
multi=True,
style={'color': 'black'},
placeholder='All'
)
], width=4)
], className='mb-3'),
# cards
dbc.Row([
dbc.Col(
dbc.Card([
dbc.CardHeader(k),
dbc.CardBody(
html.H4(v,className='card-title'))],
className='mb-2'
),width=3)
for k,v in {
'Total':len(df),'Approved':int(df['Status'].eq('Approved - License Issued').sum()),
'Denied':int(df['Status'].eq('Denied').sum()),'Rate':f"{acc_rate(df):.2f}%"
}.items()
]),
# tabs
dcc.Tabs(
id='tabs',
value='tab1',
children=[
dcc.Tab(
label='Overview',
value='tab1',
style={'color': 'black'},
),
dcc.Tab(
label='Visuals',
value='tab2',
style={'color': 'black'},
),
dcc.Tab(
label='Insights',
value='tab3',
style={'color': 'black'},
)
]),html.Div(id='content')
], fluid=True,style={'backgroundColor':'#2c2f33'})
# callback
def filter_df(sd,ed,sts):
d=df[(df['App Date']>=sd)&(df['App Date']<=ed)]
return d[d['Status'].isin(sts)] if sts else d
@app.callback(
Output('content', 'children'),
[Input('tabs', 'value'), Input('date_rng', 'start_date'), Input('date_rng', 'end_date'), Input('status_f', 'value')]
)
def render(tab, sd, ed, sts):
dff = filter_df(sd, ed, sts)
if tab == 'tab1':
if dff.empty:
return html.Div("No data available for the selected filters.")
status_counts = dff['Status'].value_counts().reset_index()
status_counts.columns = ['Status', 'Count']
return dcc.Graph(figure=px.bar(status_counts, x='Status', y='Count', title='Status Distribution'))
if tab == 'tab2':
m = dff.groupby(['Month', 'Status']).size().reset_index(name='C')
bar = px.bar(m, x='Month', y='C', color='Status', barmode='stack')
return html.Div([dcc.Graph(figure=bar)])
# tab3: insights
tr = top_rej(dff)
return html.Div([
html.H4(f"Rate: {acc_rate(dff):.2f}%", className='text-white'),
html.Ul([html.Li(f"{r}: {c}") for r, c in tr.items()])
])
if __name__=='__main__':
app.run_server(debug=True)