import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.manifold import TSNE
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from dash import Dash, html, dcc, Input, Output, State
import dash_bootstrap_components as dbc
# --- 1. Configuration and Module-Level State ---

# Survey export to load. The name is relative, so it is resolved against the
# directory the app is started from (expected to be the one holding app.py).
CSV_FILE = 'steak-risk-survey.csv'

# How many clusters (risk profiles) the clustering step produces.
N_CLUSTERS = 4

# Behavioral answers: these columns form the feature space for clustering.
BEHAVIORAL_VARS = [
    'Lottery_Choice',
    'Smoke_Cigarettes',
    'Drink_Alcohol',
    'Gamble',
    'Skydiving',
    'Speed_Limit',
    'Cheating',
    'Eat_Steak',
]

# Demographic answers: shown as context only, never fed to the clustering.
DEMOGRAPHIC_VARS = [
    'Gender',
    'Age',
    'Income',
    'Education',
    'Region',
    'Steak_Preparation',
]

# Placeholders for the processed data; load_and_prepare_data() fills them in.
df_cleaned_global = None
scaled_df_global = None
tsne_df_global = None
tsne_centroids_global = None  # cluster centroids expressed in t-SNE space

# Placeholders for the fitted preprocessing / model objects.
scaler_global = tsne_model_global = cluster_model_global = None
label_encoders_global = {}  # one fitted LabelEncoder per behavioral column

# Summary numbers shown in the UI.
original_survey_size = 0  # row count of the survey before any cleaning
cluster_statistics = {}  # per-cluster size statistics
# --- 2. Cluster Profile Definition ---
# Presentation metadata for every cluster id: display name, narrative
# description, plot color, plot marker symbol and an emoji icon.
CLUSTER_PROFILES_INFO = {
    0: dict(
        name='The Realistic Moderates',
        description='This group, the majority, prefers safety in lottery choices, avoids risks like smoking, gambling or cheating, and doesn\'t go skydiving. However, they do consume alcohol, eat steak and drive at the speed limit. Demographically, they are mostly women, over 60 years old, with medium-range income and education, and prefer medium-rare steak.',
        color='#E63946',  # red
        symbol='circle',
        icon='🎯',  # target: balanced approach
    ),
    1: dict(
        name='The Principled Cautious',
        description='Distinctively, this group is extremely prudent, especially in not exceeding the speed limit. They avoid most common risks like smoking, gambling or skydiving, although they show slight tolerance for calculated risk in the lottery (Lottery A). They are predominantly women, over 60 years old, with college or associate education, and mainly from the Middle Atlantic region.',
        color='#457B9D',  # blue
        symbol='diamond',
        icon='🛡️',  # shield: protective / cautious
    ),
    2: dict(
        name='The Selective Adventurers',
        description='This is a small group that combines aversion to habitual risks (don\'t smoke, don\'t gamble, etc.) with a clear and unique inclination for extreme high-risk experiences, like skydiving (100% have done it). They also drive at the limit. They are mostly women, but notably younger (30-44 years) than other groups, with average income and education.',
        color='#F4A261',  # orange
        symbol='triangle-up',
        icon='🪂',  # parachute: selective adventure
    ),
    3: dict(
        name='The Risk Hedonists',
        description='This group shows the highest propensity for "lifestyle" risk behaviors: all smoke, most drink alcohol, gamble and choose the lottery with the highest potential gain. They also drive at the speed limit. They are predominantly men and the youngest group (18-29 years), with medium-range income and education, and a strong preference for medium-rare steak.',
        color='#2A9D8F',  # green
        symbol='star',
        icon='🎲',  # dice: risk-taking / gambling
    ),
}
# --- 3. Data Loading and Preprocessing (runs once when the application starts) ---
def load_and_prepare_data(file_path):
    """Load the survey CSV, cluster the respondents and embed them in 2D.

    As a side effect the function fills the module-level ``*_global``
    variables, ``original_survey_size`` and ``cluster_statistics``; the same
    objects are also returned.

    Args:
        file_path: Path of the survey CSV file.

    Returns:
        A 10-tuple ``(df_cleaned, scaled_df, label_encoders, scaler,
        tsne_model, cluster_model, tsne_df, tsne_centroids,
        original_survey_size, cluster_statistics)``. When ``file_path`` does
        not exist, an error is printed and a 10-tuple of ``None`` is returned.
    """
    global df_cleaned_global, scaled_df_global, label_encoders_global, \
        scaler_global, tsne_model_global, cluster_model_global, \
        tsne_df_global, tsne_centroids_global, original_survey_size, \
        cluster_statistics

    # Long survey question headers -> short column names used everywhere else.
    column_aliases = {
        "Consider the following hypothetical situations: <br>In Lottery A, you have a 50% chance of success, with a payout of $100. <br>In Lottery B, you have a 90% chance of success, with a payout of $20. <br><br>Assuming you have $10 to bet, would you play Lottery A or Lottery B?": "Lottery_Choice",
        'Do you ever smoke cigarettes?': 'Smoke_Cigarettes',
        'Do you ever drink alcohol?': 'Drink_Alcohol',
        'Do you ever gamble?': 'Gamble',
        'Have you ever been skydiving?': 'Skydiving',
        'Do you ever drive above the speed limit?': 'Speed_Limit',
        'Have you ever cheated on your significant other?': 'Cheating',
        'Do you eat steak?': 'Eat_Steak',
        'How do you like your steak prepared?': 'Steak_Preparation',
        'Household Income': 'Income',
        'Location (Census Region)': 'Region',
    }

    try:
        raw_df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: '{file_path}' not found. Make sure the file is in the same directory as 'app.py'.")
        # Signal the critical failure with one None per expected return value.
        return (None,) * 10

    survey_df = raw_df.rename(columns=column_aliases)
    # Remember the survey size before any row is dropped.
    original_survey_size = survey_df.shape[0]

    # Keep only rows that answered at least 70% of the behavioral questions.
    min_answers = int(len(BEHAVIORAL_VARS) * 0.7)
    df_cleaned_global = survey_df.dropna(subset=BEHAVIORAL_VARS, thresh=min_answers).copy()

    # Missing demographic answers become an explicit 'Not specified' category.
    for demographic in DEMOGRAPHIC_VARS:
        if demographic not in df_cleaned_global.columns:
            continue
        df_cleaned_global[demographic] = df_cleaned_global[demographic].fillna('Not specified')

    # Turn each categorical behavioral answer into an integer code and keep
    # the fitted encoder so later user input can be encoded the same way.
    # NOTE(review): rows kept above may still have missing answers; astype(str)
    # turns those into the literal string 'nan', which then becomes its own
    # encoded category - confirm this is intended.
    encoded_answers = df_cleaned_global[BEHAVIORAL_VARS].copy()
    for question in BEHAVIORAL_VARS:
        encoder = LabelEncoder()
        encoded_answers[question] = encoder.fit_transform(encoded_answers[question].astype(str))
        label_encoders_global[question] = encoder

    # Standardize the encoded features.
    scaler_global = StandardScaler()
    scaled_df_global = pd.DataFrame(
        scaler_global.fit_transform(encoded_answers[BEHAVIORAL_VARS]),
        columns=BEHAVIORAL_VARS,
        index=df_cleaned_global.index,
    )

    # Group respondents into profiles with Ward-linkage agglomerative clustering.
    cluster_model_global = AgglomerativeClustering(n_clusters=N_CLUSTERS, linkage='ward')
    df_cleaned_global['Cluster'] = cluster_model_global.fit_predict(scaled_df_global)

    # Size and share of every cluster.
    cluster_counts = df_cleaned_global['Cluster'].value_counts().sort_index()
    total_participants = len(df_cleaned_global)
    cluster_statistics = {
        cluster_id: {
            'count': cluster_counts[cluster_id],
            'percentage': (cluster_counts[cluster_id] / total_participants) * 100,
        }
        for cluster_id in sorted(df_cleaned_global['Cluster'].unique())
    }

    # 2D embedding for plotting. t-SNE has no 'transform' for unseen points,
    # so a new user is later drawn at the centroid of the assigned cluster.
    tsne_model_global = TSNE(n_components=2, random_state=42, n_iter_without_progress=300, learning_rate='auto', init='random')
    embedding = tsne_model_global.fit_transform(scaled_df_global)

    # The embedding frame must share df_cleaned_global's index so that the
    # cluster labels (and later the hover lookups) align row by row.
    tsne_df_global = pd.DataFrame(
        data=embedding,
        columns=['t-SNE Component 1', 't-SNE Component 2'],
        index=df_cleaned_global.index,
    )
    tsne_df_global['Cluster'] = df_cleaned_global['Cluster']

    # Mean t-SNE position of every cluster, one row per cluster in label order.
    tsne_centroids_global = tsne_df_global.groupby('Cluster').mean().values

    print("Data and models loaded and preprocessed successfully.")

    # The globals are already set; returning them lets the caller confirm it.
    return (
        df_cleaned_global, scaled_df_global, label_encoders_global, scaler_global,
        tsne_model_global, cluster_model_global, tsne_df_global,
        tsne_centroids_global, original_survey_size, cluster_statistics,
    )
# Load the data and fit every model once, when the module is imported.
# load_and_prepare_data() already sets these globals itself; the assignment
# just makes the dependency explicit at module level.
(
    df_cleaned_global, scaled_df_global, label_encoders_global, scaler_global,
    tsne_model_global, cluster_model_global, tsne_df_global,
    tsne_centroids_global, original_survey_size, cluster_statistics,
) = load_and_prepare_data(CSV_FILE)

# Without data there is nothing to serve, so stop before building the app.
if df_cleaned_global is None:
    print("The application could not load the data. Please make sure the CSV file is present and accessible.")
    # Raise SystemExit directly instead of calling exit(): exit() is a helper
    # installed by the `site` module for interactive sessions, and calling it
    # without arguments reports success (status 0). A non-zero status lets a
    # process supervisor or shell script detect the failed startup.
    raise SystemExit(1)
# --- 4. Classifying a New User ---
def classify_new_user(user_responses_dict):
    """Assign one set of questionnaire answers to the nearest cluster.

    The answers are encoded and scaled with the encoders and scaler fitted at
    startup, then compared (Euclidean distance) with the mean of every cluster
    in the scaled feature space.

    Args:
        user_responses_dict: Mapping from each name in ``BEHAVIORAL_VARS`` to
            the chosen answer.

    Returns:
        ``(assigned_cluster, user_tsne_coords)``: the index of the closest
        cluster and that cluster's centroid in t-SNE space, which is where
        the user is drawn on the plot.

    Raises:
        ValueError: If an answer is not one of the categories the matching
            encoder was fitted on.
    """
    answers = pd.DataFrame([user_responses_dict], columns=BEHAVIORAL_VARS)

    # Encode every answer with the encoder fitted on the survey data.
    encoded_answers = answers.copy()
    for question in BEHAVIORAL_VARS:
        answer = user_responses_dict[question]
        encoder = label_encoders_global[question]
        # An unseen category cannot be encoded, so reject it explicitly.
        if answer not in encoder.classes_:
            raise ValueError(f"Value '{answer}' not recognized for question '{question}'. Please choose one of the valid options.")
        encoded_answers[question] = encoder.transform(answers[question].astype(str))

    # Apply the same standardization as the training data.
    scaled_answers = scaler_global.transform(encoded_answers[BEHAVIORAL_VARS])

    # One mean vector per cluster, ordered by cluster label.
    labels = df_cleaned_global['Cluster']
    cluster_means = np.array([
        scaled_df_global[labels == cluster_id].mean().values
        for cluster_id in sorted(labels.unique())
    ])

    # The closest cluster mean wins.
    distance_to_each = np.linalg.norm(scaled_answers - cluster_means, axis=1)
    assigned_cluster = np.argmin(distance_to_each)

    # The new point is drawn at the t-SNE centroid of its cluster.
    return assigned_cluster, tsne_centroids_global[assigned_cluster]
# --- 5. Building (or Refreshing) the t-SNE Scatter Plot ---
def create_tsne_plot(current_tsne_df, new_user_coords=None, new_user_cluster=None):
    """Build the t-SNE scatter figure, optionally highlighting a new user.

    Args:
        current_tsne_df: Frame with the columns 't-SNE Component 1',
            't-SNE Component 2' and 'Cluster'; its index is looked up in
            ``df_cleaned_global`` to build the hover text.
        new_user_coords: Optional (x, y) position of the classified user.
        new_user_cluster: Optional cluster id assigned to that user. The
            user marker is drawn only when both optional values are given.

    Returns:
        A ``plotly.graph_objects.Figure`` with one trace per cluster plus,
        if requested, one trace for the user.
    """
    figure = go.Figure()

    # One scatter trace per cluster.
    for cluster_id in sorted(current_tsne_df['Cluster'].unique()):
        members = current_tsne_df[current_tsne_df['Cluster'] == cluster_id]
        profile = CLUSTER_PROFILES_INFO.get(cluster_id, {})
        cluster_name = profile.get('name', f'Cluster {cluster_id}')

        # The population share appears both in the legend and in the hover text.
        stats = cluster_statistics.get(cluster_id, {'count': 0, 'percentage': 0})
        share = f"{stats['percentage']:.1f}%"

        # Hover text: profile, participant id, share and the demographics.
        hovertexts = []
        for participant_id in members.index:
            participant = df_cleaned_global.loc[participant_id]
            # Skip demographic columns that are absent from the data; the
            # label shows the column name with spaces and a capital letter.
            demographics = ''.join(
                f"<br>{column.replace('_', ' ').capitalize()}: {participant[column]}"
                for column in DEMOGRAPHIC_VARS
                if column in participant
            )
            hovertexts.append(
                f'Profile: {cluster_name}<br>'
                f'Participant ID: {participant_id}<br>'
                f'Population: {share}'
                f'{demographics}'
            )

        figure.add_trace(go.Scatter(
            x=members['t-SNE Component 1'],
            y=members['t-SNE Component 2'],
            mode='markers',
            name=f"{cluster_name} ({share})",
            marker=dict(
                symbol=profile.get('symbol', 'circle'),
                color=profile.get('color', 'grey'),
                size=12,
                opacity=0.8,
                line=dict(width=2, color='white'),
            ),
            hoverinfo='text',
            hovertext=hovertexts,
        ))

    # Extra, larger marker for the user once they have been classified.
    if new_user_coords is not None and new_user_cluster is not None:
        user_profile = CLUSTER_PROFILES_INFO.get(new_user_cluster, {})
        user_cluster_name = user_profile.get('name', f'Cluster {new_user_cluster}')
        user_stats = cluster_statistics.get(new_user_cluster, {'percentage': 0})
        user_share = f'{user_stats["percentage"]:.1f}%'

        figure.add_trace(go.Scatter(
            x=[new_user_coords[0]],
            y=[new_user_coords[1]],
            mode='markers',
            name=f'You ({user_share})',
            marker=dict(
                symbol='diamond-open',  # distinctive symbol for the user
                color=user_profile.get('color', 'black'),
                size=20,  # bigger than the survey points
                line=dict(width=4, color='black'),  # thick outline
                opacity=1,
            ),
            hoverinfo='text',
            hovertext=f'Your Profile<br>Assigned Cluster: {user_cluster_name}<br>Population: {user_share}',
        ))

    # Both axes are hidden: t-SNE coordinates carry no readable unit.
    hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False, visible=False)
    figure.update_layout(
        title={
            'text': 'Risk Profile Landscape',
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 24, 'color': '#2c3e50'},
        },
        height=600,
        hovermode='closest',
        template='plotly_white',
        plot_bgcolor='rgba(248,249,250,0.8)',
        paper_bgcolor='white',
        xaxis=dict(hidden_axis),
        yaxis=dict(hidden_axis),
        # Margins keep the figure away from the container edges.
        margin=dict(l=20, r=20, t=80, b=20),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.1,
            xanchor="center",
            x=0.5,
            font=dict(size=12),
        ),
    )
    return figure
# Figure shown when the page first loads: survey points only, no user marker.
initial_plot = create_tsne_plot(current_tsne_df=tsne_df_global)
# --- 6. Function to Create Statistics Chart ---
def create_statistics_chart():
    """Build the bar chart of population share per risk profile.

    Reads the module-level ``cluster_statistics`` and ``CLUSTER_PROFILES_INFO``.
    Clusters without profile metadata fall back to the name 'Cluster <id>'
    and a grey bar.

    Returns:
        A ``plotly.graph_objects.Figure`` with one bar per cluster, ordered
        by cluster id.
    """
    # Sort the ids once so names, values, colors and counts stay aligned.
    cluster_ids = sorted(cluster_statistics.keys())
    profiles = [CLUSTER_PROFILES_INFO.get(cluster_id, {}) for cluster_id in cluster_ids]

    cluster_names = [
        profile.get('name', f'Cluster {cluster_id}')
        for cluster_id, profile in zip(cluster_ids, profiles)
    ]
    colors = [profile.get('color', 'grey') for profile in profiles]
    percentages = [cluster_statistics[cluster_id]['percentage'] for cluster_id in cluster_ids]
    counts = [cluster_statistics[cluster_id]['count'] for cluster_id in cluster_ids]

    fig = go.Figure(data=[
        go.Bar(
            x=cluster_names,
            y=percentages,
            marker_color=colors,
            text=[f'{p:.1f}%' for p in percentages],
            textposition='auto',
            # The absolute count travels as customdata for the hover label.
            hovertemplate='<b>%{x}</b><br>Population: %{y:.1f}%<br>Count: %{customdata}<extra></extra>',
            customdata=counts,
        )
    ])
    fig.update_layout(
        title={
            'text': 'Population Distribution by Risk Profile',
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 20, 'color': '#2c3e50'},
        },
        xaxis_title="Risk Profiles",
        yaxis_title="Population Percentage (%)",
        template='plotly_white',
        height=400,
        margin=dict(l=20, r=20, t=60, b=20),
    )
    return fig
# --- 7. Initialize Dash Application ---
# The Font Awesome stylesheet is loaded next to the Bootstrap theme because
# the classification result card uses the "fas fa-info-circle" icon classes,
# which render nothing without it.
app = Dash(__name__, external_stylesheets=[dbc.themes.MINTY, dbc.icons.FONT_AWESOME])
# Expose the underlying Flask server so a WSGI host (e.g. Heroku or Vercel,
# as targeted by the original deployment note) can import it.
server = app.server
app.title = "Steak Risk Survey Dashboard"
# --- 8. Application Layout (Design) ---
# Defines the visual structure of the user interface.
#
# Style dictionaries use camelCase keys (fontWeight, boxShadow, ...), which is
# the form Dash documents for the `style` property.

# Shadow shared by every card in the layout.
_CARD_SHADOW = '0 4px 12px rgba(0,0,0,0.1)'

# Answer options shared by all Yes/No questions.
_YES_NO_OPTIONS = (
    {'label': ' Yes', 'value': 'Yes'},
    {'label': ' No', 'value': 'No'},
)


def _section_heading(text):
    """Return the H2 heading used at the top of each page section."""
    return html.H2(text, className="h3 mb-4", style={'color': '#34495e', 'fontWeight': '600'})


def _radio_question(label, component_id, default, options=_YES_NO_OPTIONS):
    """Return one questionnaire column: a bold label above a radio group."""
    return dbc.Col([
        html.Label(label, className="fw-bold mb-2"),
        dbc.RadioItems(
            id=component_id,
            # Copy the option dicts so components never share mutable state.
            options=[dict(option) for option in options],
            value=default,
            className="mb-3",
        ),
    ], width=12, lg=6)


def _key_insight(cluster_id):
    """Return the 'Key Insights' paragraph for one cluster.

    Profile metadata is optional: a cluster without an entry in
    CLUSTER_PROFILES_INFO falls back to a generic name and icon instead of
    raising KeyError while the layout is being built.
    """
    stats = cluster_statistics[cluster_id]
    profile = CLUSTER_PROFILES_INFO.get(cluster_id, {})
    profile_name = profile.get('name', f'Cluster {cluster_id}')
    return html.P([
        html.Strong(f"{stats['percentage']:.1f}%"),
        f" of participants are {profile_name} ",
        html.Span(profile.get('icon', '🎯')),
        html.Br(),
        html.Small(f"({stats['count']} participants)", className="text-muted"),
    ])


def _profile_card(cluster_id, info):
    """Return the description card for one entry of CLUSTER_PROFILES_INFO.

    A profile whose cluster id is missing from cluster_statistics (for
    example after N_CLUSTERS was lowered) is shown with zero participants
    instead of raising KeyError while the layout is being built.
    """
    stats = cluster_statistics.get(cluster_id, {'count': 0, 'percentage': 0})
    share = f'{stats["percentage"]:.1f}%'
    return dbc.Col([
        dbc.Card([
            dbc.CardHeader([
                html.H4([
                    html.Span(info["icon"], className="me-2"),
                    f'Profile {cluster_id + 1}: {info["name"]}',
                    dbc.Badge(share, color="light", text_color="dark", className="ms-2"),
                ], style={'color': info["color"], 'margin': '0', 'fontWeight': 'bold'}),
            # "15" appended to the hex color is an alpha channel (light tint).
            ], style={'backgroundColor': f'{info["color"]}15', 'border': 'none'}),
            dbc.CardBody([
                html.P(info["description"],
                       className="card-text",
                       style={'fontSize': '0.95em', 'lineHeight': '1.6'}),
                html.Small([
                    html.Strong("Population: "),
                    f'{stats["count"]} participants ({share})',
                ], className="text-muted"),
            ]),
        ],
            style={'border': f'2px solid {info["color"]}', 'boxShadow': _CARD_SHADOW},
            className="mb-4"),
    ], width=12, lg=6)


app.layout = dbc.Container([
    # Modal for methodology explanation
    dbc.Modal([
        dbc.ModalHeader(dbc.ModalTitle("🔬 Methodology Explanation")),
        dbc.ModalBody([
            html.H5("How the Analysis Works", className="mb-3"),
            html.P([
                "This dashboard uses machine learning techniques to identify distinct risk behavior profiles from survey data. Here's how it works:"
            ]),
            # Counts below are derived from the configuration so the text
            # cannot drift from what the pipeline actually does.
            html.Ol([
                html.Li([
                    html.Strong("Data Collection: "),
                    f"We analyzed responses from {original_survey_size:,} participants across {len(BEHAVIORAL_VARS)} key behavioral dimensions."
                ]),
                html.Li([
                    html.Strong("Feature Engineering: "),
                    "Categorical responses are encoded numerically and standardized to ensure equal weighting."
                ]),
                html.Li([
                    html.Strong("Clustering Algorithm: "),
                    f"We use Agglomerative Clustering with Ward linkage to group similar behavioral patterns into {N_CLUSTERS} distinct profiles."
                ]),
                html.Li([
                    html.Strong("Dimensionality Reduction: "),
                    f"t-SNE (t-Distributed Stochastic Neighbor Embedding) reduces the {len(BEHAVIORAL_VARS)}-dimensional data to 2D for visualization while preserving local structure."
                ]),
                html.Li([
                    html.Strong("Profile Assignment: "),
                    "New users are classified by calculating their distance to each cluster centroid in the scaled feature space."
                ])
            ]),
            html.Hr(),
            html.H5("Key Variables Analyzed", className="mb-3"),
            html.P("The analysis considers these behavioral dimensions:"),
            html.Ul([
                html.Li("🎰 Risk preference in financial decisions (lottery choice)"),
                html.Li("🚬 Smoking behavior"),
                html.Li("🍺 Alcohol consumption"),
                html.Li("🎲 Gambling habits"),
                html.Li("🪂 Extreme sports participation (skydiving)"),
                html.Li("🚗 Traffic rule compliance (speed limits)"),
                html.Li("💔 Relationship fidelity"),
                html.Li("🥩 Dietary choices (steak consumption)")
            ]),
            html.Hr(),
            html.H5("Statistical Validation", className="mb-3"),
            # NOTE(review): no silhouette or elbow computation is visible in
            # this file; confirm the claim below against the offline analysis.
            html.P([
                "The clustering solution was validated using silhouette analysis and elbow method. ",
                f"The {N_CLUSTERS}-cluster solution provides the optimal balance between interpretability and statistical significance."
            ])
        ]),
        dbc.ModalFooter(
            dbc.Button("Close", id="close-methodology", className="ms-auto", n_clicks=0)
        ),
    ], id="methodology-modal", is_open=False, size="lg"),

    # Header Section
    dbc.Row([
        dbc.Col([
            html.Div([
                html.H1("🎯 Risk Behavior Profile Analysis",
                        className="display-4 text-center mb-3",
                        style={'color': '#2c3e50', 'fontWeight': 'bold'}),
                html.P(f"Discover behavioral profiles derived from a comprehensive survey of {original_survey_size:,} participants. "
                       "Explore the identified profiles and see where you fit in the risk behavior spectrum.",
                       className="lead text-center text-muted mb-4"),
                # Action buttons
                dbc.Row([
                    dbc.Col([
                        dbc.Button(
                            "📊 View Methodology",
                            id="open-methodology",
                            color="info",
                            outline=True,
                            className="me-2"
                        ),
                        dbc.Button(
                            "📈 Population Statistics",
                            id="toggle-stats",
                            color="secondary",
                            outline=True
                        )
                    ], className="text-center")
                ], className="mb-4"),
                html.Hr(className="my-4")
            ])
        ])
    ]),

    # Statistics Section (collapsible)
    dbc.Collapse([
        dbc.Card([
            dbc.CardHeader([
                html.H4("📊 Population Statistics", className="mb-0", style={'color': '#34495e'})
            ]),
            dbc.CardBody([
                dbc.Row([
                    dbc.Col([
                        dcc.Graph(id='statistics-chart', figure=create_statistics_chart())
                    ], width=8),
                    dbc.Col([
                        html.H5("Key Insights", className="mb-3"),
                        html.Div([
                            _key_insight(cluster_id)
                            for cluster_id in sorted(cluster_statistics.keys())
                        ]),
                        html.Hr(),
                        html.P([
                            html.Strong("Total Analyzed: "),
                            f"{sum(stats['count'] for stats in cluster_statistics.values())} participants"
                        ], className="text-muted")
                    ], width=4)
                ])
            ])
        ], style={'boxShadow': _CARD_SHADOW})
    ], id="stats-collapse", is_open=False),
    html.Br(),

    # Risk Profiles Section
    dbc.Row([
        dbc.Col([
            _section_heading("🧩 Identified Risk Behavior Profiles")
        ])
    ]),
    dbc.Row([
        _profile_card(cluster_id, info)
        for cluster_id, info in CLUSTER_PROFILES_INFO.items()
    ]),

    # Visualization Section
    dbc.Row([
        dbc.Col([
            html.Hr(className="my-5"),
            _section_heading("📊 Interactive Profile Landscape"),
            html.P("Each point represents a survey participant, positioned based on their risk behavior patterns. "
                   "Similar profiles cluster together in this visualization. Percentages in the legend show population distribution.",
                   className="text-muted mb-4"),
            dbc.Card([
                dbc.CardBody([
                    dcc.Graph(id='tsne-graph', figure=initial_plot)
                ])
            ], style={'boxShadow': _CARD_SHADOW})
        ])
    ]),

    # User Classification Section
    dbc.Row([
        dbc.Col([
            html.Hr(className="my-5"),
            _section_heading("🔍 Discover Your Risk Profile"),
            html.P("Answer the questions below to see which risk behavior profile matches you best.",
                   className="text-muted mb-4")
        ])
    ]),
    dbc.Card([
        dbc.CardBody([
            # One column per question; defaults match reset_form().
            dbc.Row([
                _radio_question(
                    "🎰 Lottery Preference:", 'lottery-choice-input', 'Lottery A',
                    options=(
                        {'label': ' Lottery A (50% chance, $100 payout)', 'value': 'Lottery A'},
                        {'label': ' Lottery B (90% chance, $20 payout)', 'value': 'Lottery B'},
                    ),
                ),
                _radio_question("🚬 Do you smoke cigarettes?", 'smoke-cigarettes-input', 'No'),
                _radio_question("🍺 Do you drink alcohol?", 'drink-alcohol-input', 'Yes'),
                _radio_question("🎲 Do you gamble?", 'gamble-input', 'No'),
                _radio_question("🪂 Have you been skydiving?", 'skydiving-input', 'No'),
                _radio_question("🚗 Do you drive above speed limit?", 'speed-limit-input', 'No'),
                _radio_question("💔 Have you cheated on a partner?", 'cheating-input', 'No'),
                _radio_question("🥩 Do you eat steak?", 'eat-steak-input', 'Yes'),
            ]),
            # Classification Button
            dbc.Row([
                dbc.Col([
                    html.Div([
                        dbc.Button(
                            "🎯 Analyze My Profile",
                            id='classify-button',
                            color="primary",
                            size="lg",
                            className="me-3"
                        ),
                        # Same outline spelling as the header buttons.
                        dbc.Button(
                            "🔄 Reset Answers",
                            id='reset-button',
                            color="secondary",
                            outline=True,
                            size="lg"
                        )
                    ], className="text-center")
                ], width=12)
            ], className="mt-4")
        ])
    ], style={'boxShadow': _CARD_SHADOW}, className="mb-4"),

    # Results Section (filled by the classification callback)
    html.Div(id='classification-result', className="mb-4"),

    # Footer
    dbc.Row([
        dbc.Col([
            html.Hr(className="my-2"),
            html.Footer([
                html.P("Dashboard Created with Python-Plotly-Dash | Data Sourced: Thank you to FiveThirtyEight © 2025",
                       className="text-center text-muted small")
            ])
        ])
    ])
], fluid=True, className="py-4")
# --- 9. Callback Functions (Application Logic) ---
# Opens or closes the methodology modal.
@app.callback(
    Output("methodology-modal", "is_open"),
    [Input("open-methodology", "n_clicks"), Input("close-methodology", "n_clicks")],
    [State("methodology-modal", "is_open")],
)
def toggle_methodology_modal(n1, n2, is_open):
    """Flip the modal's open state once either button has been clicked.

    Args:
        n1: Click count of the "View Methodology" button.
        n2: Click count of the modal's "Close" button.
        is_open: Current open state of the modal.

    Returns:
        The negated state when either click count is truthy, otherwise the
        state unchanged.
    """
    any_click = n1 or n2
    return (not is_open) if any_click else is_open
# Expands or collapses the population statistics section.
@app.callback(
    Output("stats-collapse", "is_open"),
    [Input("toggle-stats", "n_clicks")],
    [State("stats-collapse", "is_open")],
)
def toggle_stats_collapse(n, is_open):
    """Flip the collapse state once the statistics button has been clicked.

    Args:
        n: Click count of the "Population Statistics" button.
        is_open: Current open state of the collapsible section.

    Returns:
        The negated state when the click count is truthy, otherwise the
        state unchanged.
    """
    return (not is_open) if n else is_open
# Callback to reset form inputs
@app.callback(
    [Output('lottery-choice-input', 'value'),
     Output('smoke-cigarettes-input', 'value'),
     Output('drink-alcohol-input', 'value'),
     Output('gamble-input', 'value'),
     Output('skydiving-input', 'value'),
     Output('speed-limit-input', 'value'),
     Output('cheating-input', 'value'),
     Output('eat-steak-input', 'value')],
    [Input('reset-button', 'n_clicks')]
)
def reset_form(n_clicks):
    """Return the default answer for every question, in Output order.

    Args:
        n_clicks: Click count of the reset button. It only triggers the
            callback; the result is the same defaults whether or not the
            button has been clicked, so the value itself is not inspected.

    Returns:
        An 8-tuple of default values, one per radio group, matching the
        defaults declared in the layout.
    """
    return 'Lottery A', 'No', 'Yes', 'No', 'No', 'No', 'No', 'Yes'
# Main callback for user classification and visualization update
def _build_result_card(assigned_cluster, answers):
    """Return the result card for a classified user.

    Args:
        assigned_cluster: Cluster id returned by classify_new_user().
        answers: Ordered mapping of display label -> chosen answer, listed
            back to the user under "Your Responses".
    """
    profile_info = CLUSTER_PROFILES_INFO.get(assigned_cluster, {})
    profile_name = profile_info.get('name', f'Cluster {assigned_cluster}')
    profile_description = profile_info.get('description', 'No description available.')
    profile_color = profile_info.get('color', 'grey')
    profile_icon = profile_info.get('icon', '🎯')
    stats = cluster_statistics.get(assigned_cluster, {'count': 0, 'percentage': 0})

    # Position of the user's cluster when clusters are ordered by share,
    # largest first (1 = most common profile).
    by_share = sorted(
        cluster_statistics.keys(),
        key=lambda cluster_id: cluster_statistics[cluster_id]['percentage'],
        reverse=True,
    )
    rank = by_share.index(assigned_cluster) + 1

    # Style dictionaries use camelCase keys, the form Dash documents for
    # the `style` property.
    return dbc.Card([
        dbc.CardHeader([
            html.H3([
                html.Span(profile_icon, className="me-3"),
                f"Your Profile: {profile_name}",
                dbc.Badge(
                    f'{stats["percentage"]:.1f}% of population',
                    color="light",
                    text_color="dark",
                    className="ms-3"
                )
            ], style={'color': profile_color, 'margin': '0', 'fontWeight': 'bold'})
        # "15" appended to the hex color is an alpha channel (light tint).
        ], style={'backgroundColor': f'{profile_color}15', 'border': 'none'}),
        dbc.CardBody([
            html.P(profile_description,
                   className="card-text mb-4",
                   style={'fontSize': '1.1em', 'lineHeight': '1.6'}),
            dbc.Row([
                dbc.Col([
                    html.H5("📊 Population Statistics", className="mb-3"),
                    html.P([
                        html.Strong("Your Profile Size: "),
                        f"{stats['count']} participants ({stats['percentage']:.1f}%)"
                    ]),
                    html.P([
                        html.Strong("Ranking: "),
                        f"#{rank} most common profile"
                    ])
                ], width=12, lg=6),
                dbc.Col([
                    html.H5("🎯 Your Responses", className="mb-3"),
                    html.Ul(
                        [html.Li(f"{label}: {answer}") for label, answer in answers.items()],
                        className="small"
                    )
                ], width=12, lg=6)
            ]),
            html.Hr(),
            html.P([
                html.I(className="fas fa-info-circle me-2"),
                "Your position in the visualization above shows how your risk profile compares to others. "
                "Points closer together represent similar behavioral patterns."
            ], className="text-muted small")
        ])
    ], style={'border': f'3px solid {profile_color}', 'boxShadow': '0 6px 20px rgba(0,0,0,0.15)'})


@app.callback(
    [Output('classification-result', 'children'),
     Output('tsne-graph', 'figure')],
    [Input('classify-button', 'n_clicks')],
    [State('lottery-choice-input', 'value'),
     State('smoke-cigarettes-input', 'value'),
     State('drink-alcohol-input', 'value'),
     State('gamble-input', 'value'),
     State('skydiving-input', 'value'),
     State('speed-limit-input', 'value'),
     State('cheating-input', 'value'),
     State('eat-steak-input', 'value')]
)
def classify_and_visualize(n_clicks, lottery, smoke, drink, gamble, skydive, speed, cheat, steak):
    """Classify the submitted answers and refresh the result card and plot.

    Args:
        n_clicks: Click count of the "Analyze My Profile" button.
        lottery, smoke, drink, gamble, skydive, speed, cheat, steak: Current
            values of the eight radio groups.

    Returns:
        ``(children, figure)`` for the result container and the t-SNE graph.
        Before the first click this is an empty Div and the initial plot; on
        any error it is an alert and the initial plot.
    """
    # Nothing to classify until the button has been pressed.
    if n_clicks is None or n_clicks == 0:
        return html.Div(), initial_plot

    try:
        # Answers keyed the way classify_new_user() expects them.
        user_responses = {
            'Lottery_Choice': lottery,
            'Smoke_Cigarettes': smoke,
            'Drink_Alcohol': drink,
            'Gamble': gamble,
            'Skydiving': skydive,
            'Speed_Limit': speed,
            'Cheating': cheat,
            'Eat_Steak': steak
        }
        assigned_cluster, user_tsne_coords = classify_new_user(user_responses)

        # Redraw the landscape with the user's marker on top.
        updated_plot = create_tsne_plot(tsne_df_global, user_tsne_coords, assigned_cluster)

        result_card = _build_result_card(assigned_cluster, {
            'Lottery': lottery,
            'Smoking': smoke,
            'Alcohol': drink,
            'Gambling': gamble,
            'Skydiving': skydive,
            'Speed Limit': speed,
            'Cheating': cheat,
            'Steak': steak,
        })
        return result_card, updated_plot
    except ValueError as e:
        # classify_new_user() raises ValueError for an unrecognized answer.
        error_alert = dbc.Alert([
            html.H4("⚠️ Classification Error", className="alert-heading"),
            html.P(str(e)),
            html.P("Please make sure all questions are answered with valid options.", className="mb-0")
        ], color="warning")
        return error_alert, initial_plot
    except Exception as e:
        # Top-level UI boundary: report the failure instead of breaking the page.
        error_alert = dbc.Alert([
            html.H4("❌ Unexpected Error", className="alert-heading"),
            html.P("An unexpected error occurred during classification. Please try again."),
            html.P(f"Error details: {str(e)}", className="small text-muted mb-0")
        ], color="danger")
        return error_alert, initial_plot