# Import required libraries
from itertools import combinations, product

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from dash import Dash, dcc, html, Input, Output, State
import dash_bootstrap_components as dbc
from dash import dash_table
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from scipy.stats import chi2_contingency
from statsmodels.tsa.arima.model import ARIMA # Replaced Prophet with ARIMA from statsmodels
import networkx as nx
# Load the dataset
df = pd.read_csv('banknotesData.csv')

# Impute missing values in one pass: each numeric column gets its own median,
# each string (object) column gets the placeholder "Unknown".
num_cols = df.select_dtypes(include=[np.number]).columns
str_cols = df.select_dtypes(include=[object]).columns
fill_values = {col: df[col].median() for col in num_cols}
fill_values.update({col: "Unknown" for col in str_cols})
df = df.fillna(value=fill_values)

# Coerce 'deathDate' to numeric; anything unparsable (including the "Unknown"
# placeholder written above, if the column was read as text) becomes NaN.
df['deathDate'] = pd.to_numeric(df['deathDate'], errors='coerce')

# 1. Flag 'isPioneer': True only when 'knownForBeingFirst' reads "yes"
#    (compared after trimming whitespace and lower-casing).
df['isPioneer'] = df['knownForBeingFirst'].map(
    lambda value: str(value).strip().lower() == "yes"
)

# 2. Standardize the profession names to title case
df['profession_clean'] = df['profession'].str.title()
# 3. Simplified classification of professions for easier analysis
# Lookup from normalised (trimmed, lower-cased) profession to its category.
_PROFESSION_CATEGORIES = {
    # Creative
    'writer': 'Creative',
    'musician': 'Creative',
    'visual artist': 'Creative',
    'performer': 'Creative',
    # Political
    'politician': 'Political',
    'head of gov\'t': 'Political',
    'monarch': 'Political',
    'founder': 'Political',
    # One-to-one categories
    'revolutionary': 'Revolutionary',
    'military': 'Military',
    'religious figure': 'Religious',
    'stem': 'STEM',
    'activist': 'Activist',
    'educator': 'Educator',
    # "Other" is presented as a generic historical figure
    'other': 'Historical Figure',
}


def classify_profession(prof):
    """Map a profession name to a simplified category.

    The input is converted with ``str()``, stripped of surrounding whitespace
    and lower-cased before lookup, so casing and stray spaces do not matter.

    Args:
        prof: Profession value (any object; non-strings are stringified).

    Returns:
        The category name, or ``'Unknown'`` when the profession is not in the
        lookup table.
    """
    key = str(prof).strip().lower()
    return _PROFESSION_CATEGORIES.get(key, 'Unknown')
# Derive the simplified category column from the cleaned profession names
df['prof_category'] = df['profession_clean'].map(classify_profession)
# 4. Create a lifeSpan feature if data is available
def compute_lifespan(row):
    """Return |deathDate - firstAppearanceDate| for one record, or NaN.

    NOTE(review): despite the name, this is computed from the year of first
    banknote appearance, not a birth year -- confirm this is the intended
    definition of "lifeSpan".

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with 'deathDate' and
            'firstAppearanceDate' entries.

    Returns:
        The absolute difference as a float, or ``np.nan`` when either field is
        missing or cannot be converted to a number.
    """
    try:
        return abs(float(row['deathDate']) - float(row['firstAppearanceDate']))
    except (KeyError, TypeError, ValueError):
        # Missing key, None, or non-numeric text: treat as unavailable rather
        # than failing the whole DataFrame.apply.
        return np.nan
# Compute the lifeSpan column row by row
df['lifeSpan'] = df.apply(compute_lifespan, axis=1)


def _as_options(values):
    """Build a sorted list of {'label', 'value'} dropdown entries."""
    return [{'label': item, 'value': item} for item in sorted(values)]


# An "All" entry placed ahead of the real choices in single-select dropdowns
_ALL_OPTION = {'label': 'All', 'value': 'all'}

# Define filter options for the dashboard
country_options = _as_options(df['country'].unique())
gender_options = _as_options(df['gender'].unique())
pioneer_options = [
    {'label': 'Pioneers (Yes)', 'value': True},
    {'label': 'Non-Pioneers (No)', 'value': False},
]
profession_options = _as_options(df['profession_clean'].unique())

# Geography tab filter: built from the cleaned profession names, with "All" first
geo_category_options = [dict(_ALL_OPTION)] + _as_options(df['profession_clean'].unique())

# Define range slider limits
min_year = int(df['firstAppearanceDate'].min())
max_year = int(df['firstAppearanceDate'].max())
min_bill = float(df['currentBillValue'].min())
max_bill = float(df['currentBillValue'].max())

# Network tab filter: by node country, with "All" first
network_country_options = [dict(_ALL_OPTION)] + _as_options(df['country'].unique())
# Set up the Dash app with a dark theme
external_stylesheets = [dbc.themes.DARKLY]
app = Dash(__name__, external_stylesheets=external_stylesheets)
app.title = 'Banknotes Data Dashboard'

# Inline styles are passed to React, which expects camelCase property names
# ('marginRight'), not CSS kebab-case ('margin-right').
_RADIO_LABEL_STYLE = {'marginRight': '10px'}


def _graph(graph_id):
    """Return a dcc.Graph with the given id and the Plotly mode bar hidden."""
    return dcc.Graph(id=graph_id, config={'displayModeBar': False})


# Define the app layout
app.layout = dbc.Container([
    dbc.Row(
        dbc.Col(
            html.H1(
                'Banknotes Data Dashboard',
                className='text-center text-light my-4'),
            width=12
        )
    ),
    # Global Filters with a Reset Button
    dbc.Row([
        dbc.Col([
            html.Label('Country', className='text-light'),
            dcc.Dropdown(
                id='country-filter',
                options=country_options,
                multi=True,
                className='text-dark',
                placeholder='Select country...'
            )
        ], md=2),
        dbc.Col([
            html.Label('Gender', className='text-light'),
            dcc.Dropdown(
                id='gender-filter',
                options=gender_options,
                multi=True,
                className='text-dark',
                placeholder='Select gender...'
            )
        ], md=2),
        dbc.Col([
            html.Label('Pioneer', className='text-light'),
            dcc.RadioItems(
                id='pioneer-filter',
                options=pioneer_options,
                value=None,
                inline=True,
                labelStyle=_RADIO_LABEL_STYLE
            )
        ], md=2),
        dbc.Col([
            html.Label('Profession', className='text-light'),
            dcc.Dropdown(
                id='profession-filter',
                options=profession_options,
                multi=True,
                className='text-dark',
                placeholder='Select profession...'
            )
        ], md=2),
        dbc.Col([
            html.Label('First Appearance Year', className='text-light'),
            dcc.RangeSlider(
                id='year-slider',
                min=min_year,
                max=max_year,
                step=1,
                # Roughly ten evenly spaced year marks (at least one year apart)
                marks={
                    str(year): str(year)
                    for year in range(min_year, max_year + 1, max(1, (max_year - min_year) // 10))
                },
                value=[min_year, max_year]
            )
        ], md=2),
        dbc.Col([
            html.Label('Bill Value', className='text-light'),
            dcc.RangeSlider(
                id='bill-slider',
                min=min_bill,
                max=max_bill,
                # A zero step (all bill values identical) is not a usable
                # increment; fall back to None so only the marks are selectable.
                step=(max_bill - min_bill) / 100 or None,
                marks={
                    str(round(val, 1)): str(round(val, 1))
                    for val in np.linspace(min_bill, max_bill, num=5)
                },
                value=[min_bill, max_bill]
            )
        ], md=2)
    ], className='mb-4'),
    dbc.Row([
        dbc.Col(
            dbc.Button('Reset Filters', id='reset-button', color='secondary', className='mb-2'),
            width=2
        )
    ]),
    # Tabs for different analyses
    dbc.Tabs([
        dbc.Tab(label='Main Analysis', children=[
            dbc.Row([
                dbc.Col(_graph('bar-profession'), md=6),
                dbc.Col(_graph('scatter-bill-waiting'), md=6)
            ]),
            dbc.Row([
                dbc.Col(_graph('grouped-profession-gender'), md=6),
                dbc.Col(_graph('box-waiting-time'), md=6)
            ]),
            dbc.Row([
                dbc.Col(
                    dash_table.DataTable(
                        id='data-table',
                        columns=[{'name': i, 'id': i} for i in df.columns],
                        data=df.to_dict('records'),
                        filter_action='native',
                        sort_action='native',
                        page_action='native',
                        page_current=0,
                        page_size=10,
                        style_table={'overflowX': 'auto'},
                        style_header={'backgroundColor': '#303030', 'color': 'white'},
                        style_cell={'backgroundColor': '#424242', 'color': 'white', 'textAlign': 'left'}
                    ), md=12)
            ], className='mt-4')
        ]),
        dbc.Tab(label='Trends Analysis', children=[
            dbc.Row([
                dbc.Col([
                    html.Label('Group By:', className='text-light'),
                    dcc.RadioItems(
                        id='trend-groupby',
                        options=[
                            {'label': 'Gender', 'value': 'gender'},
                            {'label': 'Profession', 'value': 'profession_clean'}
                        ],
                        value='gender',
                        inline=True,
                        labelStyle=_RADIO_LABEL_STYLE
                    )
                ], md=4)
            ], className='mb-4'),
            dbc.Row([
                dbc.Col(_graph('trend-analysis'), md=12)
            ])
        ]),
        dbc.Tab(label='Geography Analysis', children=[
            dbc.Row([
                dbc.Col([
                    html.Label('Select Profession for Map:', className='text-light'),
                    dcc.Dropdown(
                        id='geo-category',
                        options=geo_category_options,
                        value='all',
                        className='text-dark',
                        clearable=False
                    )
                ], md=4)
            ], className='mb-4'),
            dbc.Row([
                dbc.Col(_graph('geo-map'), md=12)
            ])
        ]),
        dbc.Tab(label='Correlation Analysis', children=[
            dbc.Row([
                dbc.Col(_graph('corr-heatmap'), md=6),
                dbc.Col(_graph('cross-tab-heatmap'), md=6)
            ]),
            dbc.Row([
                dbc.Col(html.Div(id='chi2-result', className='text-light'), md=12)
            ])
        ]),
        dbc.Tab(label='Machine Learning', children=[
            dbc.Row([
                dbc.Col(_graph('ml-feature'), md=12)
            ]),
            dbc.Row([
                dbc.Col(html.Div(id='ml-metrics', className='text-light'), md=12)
            ])
        ]),
        dbc.Tab(label='Network Visualization', children=[
            dbc.Row([
                dbc.Col([
                    html.Label('Filter Nodes by Country:', className='text-light'),
                    dcc.Dropdown(
                        id='network-country-filter',
                        options=network_country_options,
                        value='all',
                        className='text-dark',
                        clearable=False
                    )
                ], md=4)
            ], className='mb-4'),
            dbc.Row([
                dbc.Col(_graph('network-graph'), md=12)
            ])
        ]),
        dbc.Tab(label='Forecasting', children=[
            dbc.Row([
                dbc.Col(_graph('forecast-graph'), md=12)
            ])
        ])
    ])
], fluid=True)
# Callback wiring the "Reset Filters" button to the global filter controls
@app.callback(
    [Output('country-filter', 'value'),
     Output('gender-filter', 'value'),
     Output('pioneer-filter', 'value'),
     Output('profession-filter', 'value'),
     Output('year-slider', 'value'),
     Output('bill-slider', 'value')],
    [Input('reset-button', 'n_clicks')],
    prevent_initial_call=True
)
def reset_filters(n_clicks):
    """Restore every global filter to its initial (unfiltered) state."""
    return None, None, None, None, [min_year, max_year], [min_bill, max_bill]


def _titled_empty_figure(title):
    """Return an empty dark-themed figure that only carries a title."""
    fig = go.Figure()
    fig.update_layout(title=title, template='plotly_dark')
    return fig


def _filter_data(selected_countries, selected_genders, selected_pioneer,
                 selected_professions, year_range, bill_range):
    """Return an independent copy of df restricted by the global filters."""
    mask = pd.Series(True, index=df.index)
    if selected_countries:
        mask &= df['country'].isin(selected_countries)
    if selected_genders:
        mask &= df['gender'].isin(selected_genders)
    if selected_pioneer is not None:
        mask &= df['isPioneer'] == selected_pioneer
    if selected_professions:
        mask &= df['profession_clean'].isin(selected_professions)
    # Series.between is inclusive on both ends, matching the slider semantics
    mask &= df['firstAppearanceDate'].between(year_range[0], year_range[1])
    mask &= df['currentBillValue'].between(bill_range[0], bill_range[1])
    return df[mask].copy()


def _build_main_figures(dff):
    """Build the four 'Main Analysis' figures (bar, scatter, grouped, box)."""
    # a) Bar Chart: Count of banknotes by simplified profession category
    prof_counts = dff['prof_category'].value_counts().reset_index()
    prof_counts.columns = ['Profession Category', 'Count']
    fig_bar = px.bar(
        prof_counts,
        x='Profession Category',
        y='Count',
        text='Count',
        title='Count of Banknotes by Profession Category',
        template='plotly_dark'
    )
    fig_bar.update_traces(textposition='outside')

    # b) Scatter Plot: Bill Value vs. Waiting Time (appearanceDeathDiff)
    scatter_df = dff.dropna(subset=['appearanceDeathDiff', 'currentBillValue'])
    fig_scatter = px.scatter(
        scatter_df,
        x='currentBillValue',
        y='appearanceDeathDiff',
        hover_data=['name', 'profession_clean'],
        title='Bill Value vs. Waiting Time',
        template='plotly_dark'
    )

    # c) Grouped Bar Chart: Distribution of Profession by Gender
    fig_grouped = px.histogram(
        dff,
        x='profession_clean',
        color='gender',
        barmode='group',
        title='Distribution of Profession by Gender',
        template='plotly_dark'
    )
    fig_grouped.update_layout(xaxis_tickangle=-45)

    # d) Box Plot: Waiting Time by Pioneer Status.
    # The label column lives on a throwaway frame so it neither mutates the
    # filtered data nor leaks into the data table.
    box_df = dff.assign(
        pioneer_label=np.where(dff['isPioneer'], 'Pioneer (Yes)', 'Non-Pioneer (No)')
    )
    fig_box = px.box(
        box_df,
        x='pioneer_label',
        y='appearanceDeathDiff',
        color='pioneer_label',
        title='Waiting Time by Pioneer Status',
        template='plotly_dark',
        labels={'pioneer_label': 'Pioneer Status', 'appearanceDeathDiff': 'Waiting Time (years)'}
    )
    fig_box.update_layout(showlegend=False)
    return fig_bar, fig_scatter, fig_grouped, fig_box


def _build_trend_figure(dff, year_range, trend_group):
    """Build the yearly count lines per group plus 3-year moving averages."""
    years = range(int(year_range[0]), int(year_range[1]) + 1)
    # Year x group count table, zero-filled for every year in the selected
    # range so that each group has a complete series.
    counts = pd.crosstab(dff['firstAppearanceDate'].astype(int), dff[trend_group])
    counts = counts.reindex(years, fill_value=0).rename_axis(index='firstAppearanceDate')
    trend_df = counts.reset_index().melt(
        id_vars='firstAppearanceDate', var_name=trend_group, value_name='count'
    )
    fig_trend = px.line(
        trend_df,
        x='firstAppearanceDate',
        y='count',
        color=trend_group,
        title=f"Trends: Distribution of Banknotes Over Years (Grouped by {trend_group})",
        template='plotly_dark'
    )
    # Add smoothing lines using moving average (window=3)
    for grp in counts.columns:
        smoothed = counts[grp].rolling(window=3, min_periods=1).mean()
        fig_trend.add_trace(go.Scatter(
            x=smoothed.index,
            y=smoothed.values,
            mode='lines',
            name=f"{grp} (Smoothed)"
        ))
    return fig_trend


def _build_geo_figure(dff, geo_profession):
    """Build the per-country choropleth, optionally for a single profession."""
    geo_df = dff if geo_profession == "all" else dff[dff['profession_clean'] == geo_profession]
    geo_group = geo_df.groupby('country').size().reset_index(name='count')
    return px.choropleth(
        geo_group,
        locations='country',
        locationmode='country names',
        color='count',
        hover_name='country',
        color_continuous_scale='Viridis',
        title=f"Distribution of Banknotes by Country ({'All Professions' if geo_profession == 'all' else geo_profession})",
        template='plotly_dark'
    )


def _build_correlation_outputs(dff):
    """Build the numeric correlation heatmap, the cross-tab heatmap and chi2 text."""
    # a) Heatmap for numerical variables
    corr_vars = ['currentBillValue', 'firstAppearanceDate', 'deathDate', 'appearanceDeathDiff', 'lifeSpan']
    fig_corr = px.imshow(
        dff[corr_vars].corr(),
        text_auto=True,
        title='Correlation between Numerical Variables',
        template='plotly_dark'
    )
    # b) Cross-tab frequency heatmap for categorical data (gender vs. profession)
    cross_tab = pd.crosstab(dff['gender'], dff['profession_clean'])
    fig_cross = px.imshow(
        cross_tab,
        text_auto=True,
        title='Cross-Tab Frequency: Gender vs. Profession',
        template='plotly_dark'
    )
    # c) Chi-squared test; a failure is reported in the UI instead of
    #    aborting the whole callback.
    try:
        chi2, p, _dof, _expected = chi2_contingency(cross_tab)
        chi2_text = f"χ² test (Gender vs. Profession): χ² = {chi2:.2f}, p-value = {p:.4f}"
    except Exception as e:
        chi2_text = f"Error performing χ² test: {e}"
    return fig_corr, fig_cross, chi2_text


def _build_ml_outputs(dff):
    """Train a RandomForest on 'isPioneer'; return (importance figure, metrics text)."""
    has_known_categories = (
        (dff['gender'] != 'Unknown')
        & (dff['profession_clean'] != 'Unknown')
        & (dff['country'] != 'Unknown')
    )
    ml_df = dff[has_known_categories]
    if ml_df.shape[0] <= 10:
        return _titled_empty_figure('Insufficient data for ML'), 'No ML results available'

    features = ml_df[['gender', 'profession_clean', 'country', 'currentBillValue', 'firstAppearanceDate']]
    target = ml_df['isPioneer']
    features_encoded = pd.get_dummies(features, drop_first=True)
    X_train, X_test, y_train, y_test = train_test_split(
        features_encoded, target, test_size=0.3, random_state=42
    )
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)

    imp_df = pd.DataFrame({'feature': features_encoded.columns, 'importance': rf.feature_importances_})
    imp_df = imp_df.sort_values('importance', ascending=False)
    fig_ml = px.bar(
        imp_df,
        x='importance',
        y='feature',
        orientation='h',
        title='Feature Importance (RandomForest)',
        template='plotly_dark'
    )

    # zero_division=0 keeps the scores defined (and warning-free) when the
    # test split contains or predicts no positive samples.
    y_pred = rf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    ml_metrics_text = (f"Accuracy: {accuracy:.2f} | F1-score: {f1:.2f} | "
                       f"Precision: {precision:.2f} | Recall: {recall:.2f}")
    return fig_ml, ml_metrics_text


def _build_network_figure(dff, network_country):
    """Build the graph linking figures that share a country or a profession."""
    net_df = dff if network_country == 'all' else dff[dff['country'] == network_country]
    G = nx.Graph()
    for _, row in net_df.iterrows():
        G.add_node(
            row['id'],
            label=row['name'],
            country=row['country'],
            profession=row['profession_clean'],
            gender=row['gender']
        )
    # combinations() snapshots the node list, so adding edges while looping is safe
    for (node_a, attr_a), (node_b, attr_b) in combinations(G.nodes(data=True), 2):
        if attr_a['country'] == attr_b['country'] or attr_a['profession'] == attr_b['profession']:
            G.add_edge(node_a, node_b)
    pos = nx.kamada_kawai_layout(G)

    node_x, node_y, node_text, node_color = [], [], [], []
    for node, attr in G.nodes(data=True):
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
        # Plotly hover text is HTML-like: line breaks must be <br>, not "\n"
        node_text.append(f"{attr['label']}<br>{attr['profession']}<br>{attr['country']}")
        node_color.append('cyan' if str(attr['gender']).lower() == 'female' else 'magenta')

    edge_x, edge_y = [], []
    for start, end in G.edges():
        x0, y0 = pos[start]
        x1, y1 = pos[end]
        # None separates the individual segments inside one Scatter trace
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    edge_trace = go.Scatter(
        x=edge_x,
        y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines'
    )
    node_trace = go.Scatter(
        x=node_x,
        y=node_y,
        mode='markers',
        marker=dict(size=10, color=node_color),
        text=node_text,
        hoverinfo='text'
    )
    fig_network = go.Figure(data=[edge_trace, node_trace])
    fig_network.update_layout(
        title='Network of Banknote Figures',
        template='plotly_dark',
        xaxis={'visible': False},
        yaxis={'visible': False}
    )
    return fig_network


def _find_best_arima_order(ts_data, p_values, d_values, q_values):
    """Return the (p, d, q) with the lowest AIC, or None if no model could be fitted."""
    best_aic = float('inf')
    best_order = None
    for order in product(p_values, d_values, q_values):
        try:
            aic = ARIMA(ts_data, order=order).fit().aic
        except Exception:
            # Best-effort grid search: skip orders that fail to fit
            continue
        if aic < best_aic:
            best_aic = aic
            best_order = order
    return best_order


def _build_forecast_figure(dff):
    """Forecast yearly banknote counts five years ahead with ARIMA."""
    appearance_years = dff['firstAppearanceDate'].astype(int)
    first_year = int(appearance_years.min())
    last_year = int(appearance_years.max())
    all_years = range(first_year, last_year + 1)

    # Yearly counts, zero-filled for years without any banknote, indexed by
    # year-start timestamps with an explicit yearly frequency.
    counts = appearance_years.value_counts().reindex(all_years, fill_value=0).astype(float)
    counts.index = pd.date_range(
        start=pd.Timestamp(year=first_year, month=1, day=1), periods=len(all_years), freq='YS'
    )
    counts.name = 'count'

    if len(counts) <= 5:
        return _titled_empty_figure('Insufficient data for forecasting')

    best_order = _find_best_arima_order(counts, range(0, 3), range(0, 2), range(0, 3))
    if best_order is None:
        return _titled_empty_figure('Error in forecasting: no ARIMA order could be fitted')

    try:
        model_fit = ARIMA(counts, order=best_order).fit()
        forecast_steps = 5
        forecast_values = model_fit.forecast(steps=forecast_steps)
        # 'YS' (year start) keeps the forecast dates aligned with the
        # 1 January timestamps of the historical series.
        future_dates = pd.date_range(
            start=counts.index.max() + pd.DateOffset(years=1), periods=forecast_steps, freq='YS'
        )
        fig_forecast = go.Figure()
        fig_forecast.add_trace(
            go.Scatter(
                x=counts.index,
                y=counts,
                mode='lines+markers',
                name='Historical Data'
            )
        )
        fig_forecast.add_trace(
            go.Scatter(
                x=future_dates,
                y=forecast_values,
                mode='lines',
                name='Forecast'
            )
        )
        fig_forecast.update_layout(
            title=f'Forecast of Banknote Counts by Year (ARIMA {best_order})',
            template='plotly_dark'
        )
    except Exception as e:
        # Surface the failure in the chart title instead of breaking the callback
        fig_forecast = _titled_empty_figure(f'Error in forecasting: {str(e)}')
    return fig_forecast


# Callback to update all visualizations based on filter inputs
@app.callback(
    [Output('bar-profession', 'figure'),
     Output('scatter-bill-waiting', 'figure'),
     Output('grouped-profession-gender', 'figure'),
     Output('box-waiting-time', 'figure'),
     Output('data-table', 'data'),
     Output('trend-analysis', 'figure'),
     Output('geo-map', 'figure'),
     Output('corr-heatmap', 'figure'),
     Output('cross-tab-heatmap', 'figure'),
     Output('chi2-result', 'children'),
     Output('ml-feature', 'figure'),
     Output('ml-metrics', 'children'),
     Output('network-graph', 'figure'),
     Output('forecast-graph', 'figure')],
    [Input('country-filter', 'value'),
     Input('gender-filter', 'value'),
     Input('pioneer-filter', 'value'),
     Input('profession-filter', 'value'),
     Input('year-slider', 'value'),
     Input('bill-slider', 'value'),
     Input('trend-groupby', 'value'),
     Input('geo-category', 'value'),
     Input('network-country-filter', 'value')]
)
def update_all(selected_countries, selected_genders, selected_pioneer, selected_professions, year_range, bill_range, trend_group, geo_profession, network_country):
    """Recompute every figure, the table data and the text outputs.

    Args:
        selected_countries: Country names to keep; empty/None keeps all.
        selected_genders: Gender values to keep; empty/None keeps all.
        selected_pioneer: True/False to filter on 'isPioneer'; None keeps all.
        selected_professions: Cleaned profession names to keep; empty/None keeps all.
        year_range: [low, high] bounds on 'firstAppearanceDate' (inclusive).
        bill_range: [low, high] bounds on 'currentBillValue' (inclusive).
        trend_group: Column used to split the trend lines.
        geo_profession: Profession shown on the map, or 'all'.
        network_country: Country shown in the network, or 'all'.

    Returns:
        A 14-tuple in the order of the callback's Output list.
    """
    dff = _filter_data(selected_countries, selected_genders, selected_pioneer,
                       selected_professions, year_range, bill_range)

    # If filtered data is empty, return empty figures to avoid errors
    if dff.empty:
        empty_fig = _titled_empty_figure('No data available')
        return (empty_fig, empty_fig, empty_fig, empty_fig, [], empty_fig, empty_fig,
                empty_fig, empty_fig, 'No data available', empty_fig,
                'No ML results available', empty_fig, empty_fig)

    # 1. Main Analysis (figures + table)
    fig_bar, fig_scatter, fig_grouped, fig_box = _build_main_figures(dff)
    table_data = dff.to_dict('records')
    # 2. Trends Analysis
    fig_trend = _build_trend_figure(dff, year_range, trend_group)
    # 3. Geography Analysis
    fig_geo = _build_geo_figure(dff, geo_profession)
    # 4. Correlation Analysis
    fig_corr, fig_cross, chi2_text = _build_correlation_outputs(dff)
    # 5. Machine Learning
    fig_ml, ml_metrics_text = _build_ml_outputs(dff)
    # 6. Network Visualization
    fig_network = _build_network_figure(dff, network_country)
    # 7. Forecasting
    fig_forecast = _build_forecast_figure(dff)

    # Return all updated figures and data
    return (fig_bar, fig_scatter, fig_grouped, fig_box, table_data, fig_trend, fig_geo,
            fig_corr, fig_cross, chi2_text, fig_ml, ml_metrics_text, fig_network, fig_forecast)
# Run the app
if __name__ == '__main__':
    # Newer Dash releases expose app.run and treat app.run_server as obsolete;
    # prefer run and fall back to run_server only on older releases without it.
    run_app = getattr(app, 'run', None) or app.run_server
    run_app(debug=True)