PyCafe - Dash - steak_risk

app.py
requirements.txt
steak-risk-survey.csv
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.manifold import TSNE
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from dash import Dash, html, dcc, Input, Output, State
import dash_bootstrap_components as dbc

# --- 1. Configuration and Global Variables Definition ---
# CSV file name that should be in the same directory as app.py
CSV_FILE = 'steak-risk-survey.csv'
# Number of clusters (profiles) to identify.
# CLUSTER_PROFILES_INFO defines display metadata for cluster ids 0-3, so the
# two must be kept in step if this value changes.
N_CLUSTERS = 4
# List of behavioral variables to be used for clustering.
# These are the short column names that load_and_prepare_data assigns when it
# renames the survey-question columns of the CSV.
BEHAVIORAL_VARS = [
    'Lottery_Choice', 'Smoke_Cigarettes', 'Drink_Alcohol',
    'Gamble', 'Skydiving', 'Speed_Limit', 'Cheating', 'Eat_Steak'
]
# List of demographic variables (used for context, not directly for clustering).
# Missing values in these columns are filled with 'Not specified', and the
# columns are shown in the hover text of the t-SNE plot.
DEMOGRAPHIC_VARS = [
    'Gender', 'Age', 'Income', 'Education', 'Region', 'Steak_Preparation'
]
# Global variables to store processed data and models.
# They are only placeholders here; load_and_prepare_data assigns the real
# values when the data is loaded.
df_cleaned_global = None  # Cleaned survey rows; gains a 'Cluster' column after clustering
scaled_df_global = None  # Label-encoded, standardized behavioral features
label_encoders_global = {} # Dictionary to save LabelEncoders for each column
scaler_global = None  # StandardScaler fitted on the encoded behavioral features
tsne_model_global = None  # TSNE instance used to build the 2D embedding
cluster_model_global = None  # AgglomerativeClustering instance used to label rows
tsne_df_global = None  # t-SNE coordinates plus 'Cluster', indexed like df_cleaned_global
tsne_centroids_global = None # Will store centroid coordinates in t-SNE space
original_survey_size = 0 # To store the total number of original survey participants
cluster_statistics = {} # To store cluster size statistics

# --- 2. Cluster Profile Definition ---
# Here we define names, descriptions, colors and symbols for each cluster.
# Keys are the integer cluster ids stored in the 'Cluster' column. Each entry holds:
#   'name'        - display name used in legends, cards and hover text
#   'description' - text shown in the body of the profile card
#   'color'       - hex color used for plot markers, bars and card styling
#   'symbol'      - Plotly marker symbol used in the t-SNE scatter
#   'icon'        - emoji shown next to the profile name
# NOTE(review): the descriptions are hand-written text; nothing in this file
# checks them against the clusters actually computed at startup.
CLUSTER_PROFILES_INFO = {
    0: {
        'name': 'The Realistic Moderates',
        'description': 'This group, the majority, prefers safety in lottery choices, avoids risks like smoking, gambling or cheating, and doesn\'t go skydiving. However, they do consume alcohol, eat steak and drive at the speed limit. Demographically, they are mostly women, over 60 years old, with medium-range income and education, and prefer medium-rare steak.',
        'color': '#E63946', # Red
        'symbol': 'circle',
        'icon': '🎯' # Target - balanced approach
    },
    1: {
        'name': 'The Principled Cautious',
        'description': 'Distinctively, this group is extremely prudent, especially in not exceeding the speed limit. They avoid most common risks like smoking, gambling or skydiving, although they show slight tolerance for calculated risk in the lottery (Lottery A). They are predominantly women, over 60 years old, with college or associate education, and mainly from the Middle Atlantic region.',
        'color': '#457B9D', # Blue
        'symbol': 'diamond',
        'icon': '🛡️' # Shield - protective/cautious
    },
    2: {
        'name': 'The Selective Adventurers',
        'description': 'This is a small group that combines aversion to habitual risks (don\'t smoke, don\'t gamble, etc.) with a clear and unique inclination for extreme high-risk experiences, like skydiving (100% have done it). They also drive at the limit. They are mostly women, but notably younger (30-44 years) than other groups, with average income and education.',
        'color': '#F4A261', # Orange
        'symbol': 'triangle-up',
        'icon': '🪂' # Parachute - selective adventure
    },
    3: {
        'name': 'The Risk Hedonists',
        'description': 'This group shows the highest propensity for "lifestyle" risk behaviors: all smoke, most drink alcohol, gamble and choose the lottery with the highest potential gain. They also drive at the speed limit. They are predominantly men and the youngest group (18-29 years), with medium-range income and education, and a strong preference for medium-rare steak.',
        'color': '#2A9D8F', # Green
        'symbol': 'star',
        'icon': '🎲' # Dice - risk-taking/gambling
    }
}
# --- 3. Data Loading and Preprocessing (Executed once when starting the application) ---
def load_and_prepare_data(file_path):
    """Load the survey CSV, cluster the respondents and build the t-SNE embedding.

    The function reads ``file_path``, renames the long survey-question columns
    to short names, cleans the rows, label-encodes and standardizes the
    behavioral answers, assigns each row to one of ``N_CLUSTERS`` clusters and
    projects the scaled features to two dimensions with t-SNE. Every
    intermediate result is stored in the module-level ``*_global`` variables,
    ``original_survey_size`` and ``cluster_statistics``.

    Args:
        file_path: Path of the survey CSV file to read.

    Returns:
        A 10-tuple ``(df_cleaned_global, scaled_df_global,
        label_encoders_global, scaler_global, tsne_model_global,
        cluster_model_global, tsne_df_global, tsne_centroids_global,
        original_survey_size, cluster_statistics)``. If the file does not
        exist, a message is printed and a tuple of ten ``None`` values is
        returned instead.
    """
    # Declare global variables that we will modify within this function
    global df_cleaned_global, scaled_df_global, label_encoders_global, \
           scaler_global, tsne_model_global, cluster_model_global, \
           tsne_df_global, tsne_centroids_global, original_survey_size, cluster_statistics
    try:
        # Load CSV and rename columns for easier handling
        df = pd.read_csv(file_path).rename(columns={
            "Consider the following hypothetical situations: <br>In Lottery A, you have a 50% chance of success, with a payout of $100. <br>In Lottery B, you have a 90% chance of success, with a payout of $20. <br><br>Assuming you have $10 to bet, would you play Lottery A or Lottery B?":"Lottery_Choice",
            'Do you ever smoke cigarettes?':'Smoke_Cigarettes',
            'Do you ever drink alcohol?':'Drink_Alcohol',
            'Do you ever gamble?':'Gamble',
            'Have you ever been skydiving?':'Skydiving',
            'Do you ever drive above the speed limit?':'Speed_Limit',
            'Have you ever cheated on your significant other?':'Cheating',
            'Do you eat steak?':'Eat_Steak',
            'How do you like your steak prepared?':'Steak_Preparation',
            'Household Income':'Income',
            'Location (Census Region)':'Region'
        })
        # Store original survey size before any cleaning
        original_survey_size = df.shape[0]
    except FileNotFoundError:
        print(f"Error: '{file_path}' not found. Make sure the file is in the same directory as 'app.py'.")
        # If file is not found, return None to indicate critical failure
        return None, None, None, None, None, None, None, None, None, None

    # Remove rows where data is missing in most behavioral variables.
    # The threshold is int(len(BEHAVIORAL_VARS) * 0.7) non-missing answers.
    df_cleaned_global = df.dropna(subset=BEHAVIORAL_VARS, thresh=int(len(BEHAVIORAL_VARS) * 0.7)).copy()

    # Fill null values in demographic variables with 'Not specified'
    for col in DEMOGRAPHIC_VARS:
        if col in df_cleaned_global.columns:
            df_cleaned_global[col] = df_cleaned_global[col].fillna('Not specified')

    # Encoding categorical behavioral variables to numeric
    encoded_behavioral_data = df_cleaned_global[BEHAVIORAL_VARS].copy()
    for col in BEHAVIORAL_VARS:
        le = LabelEncoder()
        # Fit and transform the column, ensuring it's treated as string.
        # NOTE(review): rows kept above may still have some missing answers;
        # after astype(str) those presumably become a text category of their
        # own rather than staying missing - confirm this is intended.
        encoded_behavioral_data[col] = le.fit_transform(encoded_behavioral_data[col].astype(str))
        # Save the LabelEncoder to use with new data
        label_encoders_global[col] = le

    # Feature scaling to normalize the data
    scaler_global = StandardScaler()
    scaled_features = scaler_global.fit_transform(encoded_behavioral_data[BEHAVIORAL_VARS])
    scaled_df_global = pd.DataFrame(scaled_features, columns=BEHAVIORAL_VARS, index=df_cleaned_global.index)

    # Apply Agglomerative Clustering to identify profiles
    cluster_model_global = AgglomerativeClustering(n_clusters=N_CLUSTERS, linkage='ward')
    df_cleaned_global['Cluster'] = cluster_model_global.fit_predict(scaled_df_global)

    # Calculate cluster statistics: row count and share of the cleaned rows per cluster
    cluster_counts = df_cleaned_global['Cluster'].value_counts().sort_index()
    total_participants = len(df_cleaned_global)
    
    cluster_statistics = {}
    for cluster_id in sorted(df_cleaned_global['Cluster'].unique()):
        count = cluster_counts[cluster_id]
        percentage = (count / total_participants) * 100
        cluster_statistics[cluster_id] = {
            'count': count,
            'percentage': percentage
        }
    # Dimensionality reduction with t-SNE for 2D visualization
    # (t-SNE doesn't have a direct 'transform' method for new points,
    # so new points will be visualized at their cluster centroid)
    tsne_model_global = TSNE(n_components=2, random_state=42, n_iter_without_progress=300, learning_rate='auto', init='random')
    tsne_results = tsne_model_global.fit_transform(scaled_df_global)

    # Create a DataFrame with t-SNE results and cluster labels.
    # It is crucial that tsne_df_global keeps the same index as df_cleaned_global,
    # because the plotting code looks participants up by that index.
    tsne_df_global = pd.DataFrame(data=tsne_results, columns=['t-SNE Component 1', 't-SNE Component 2'],
                                  index=df_cleaned_global.index)
    tsne_df_global['Cluster'] = df_cleaned_global['Cluster'] # Assign the Series directly; it already carries the correct index

    # Calculate cluster centroids in t-SNE space (to visualize new points)
    tsne_centroids_global = tsne_df_global.groupby('Cluster').mean().values

    print("Data and models loaded and preprocessed successfully.")
    # Return global elements for confirmation (although they are accessed directly afterwards)
    return df_cleaned_global, scaled_df_global, label_encoders_global, scaler_global, \
           tsne_model_global, cluster_model_global, tsne_df_global, tsne_centroids_global, original_survey_size, cluster_statistics

# Execute data loading and preparation at application startup.
# load_and_prepare_data already assigns these module-level names itself; the
# unpacking below re-binds them from its return value.
df_cleaned_global, scaled_df_global, label_encoders_global, scaler_global, \
tsne_model_global, cluster_model_global, tsne_df_global, tsne_centroids_global, original_survey_size, cluster_statistics = load_and_prepare_data(CSV_FILE)

# If data couldn't be loaded, the application shouldn't start
if df_cleaned_global is None:
    print("The application could not load the data. Please make sure the CSV file is present and accessible.")
    # Raise SystemExit directly rather than calling exit(): exit() is the
    # site-provided helper meant for interactive sessions, and it reports
    # status 0. A non-zero status lets the launcher see that startup failed.
    raise SystemExit(1)

# --- 4. Function to Classify a New User ---
def classify_new_user(user_responses_dict):
    """Assign a new respondent to the nearest cluster.

    Args:
        user_responses_dict: Mapping from each name in ``BEHAVIORAL_VARS`` to
            the answer the user selected.

    Returns:
        A tuple ``(assigned_cluster, user_tsne_coords)``: the position of the
        closest cluster centre and the t-SNE centroid stored at that position
        in ``tsne_centroids_global``.

    Raises:
        ValueError: If an answer is not among the classes known to the
            corresponding fitted LabelEncoder.
    """
    # Single-row frame holding the raw answers, one column per behavioral variable
    answers = pd.DataFrame([user_responses_dict], columns=BEHAVIORAL_VARS)

    # Encode every answer with the encoder fitted on the survey data
    encoded_answers = answers.copy()
    for question in BEHAVIORAL_VARS:
        answer = user_responses_dict[question]
        encoder = label_encoders_global[question]
        # Reject answers the encoder never saw during fitting
        if answer not in encoder.classes_:
            raise ValueError(f"Value '{answer}' not recognized for question '{question}'. Please choose one of the valid options.")
        encoded_answers[question] = encoder.transform(answers[question].astype(str))

    # Bring the encoded answers onto the same scale as the survey features
    scaled_answers = scaler_global.transform(encoded_answers[BEHAVIORAL_VARS])

    # Mean of the scaled features of each cluster, in ascending cluster-id order
    cluster_ids = sorted(df_cleaned_global['Cluster'].unique())
    cluster_means = [
        scaled_df_global[df_cleaned_global['Cluster'] == cluster_id].mean().values
        for cluster_id in cluster_ids
    ]
    cluster_centers_scaled = np.array(cluster_means)

    # The nearest centre (Euclidean distance) decides the assignment. The
    # argmin position is used directly as the cluster id.
    distances = np.linalg.norm(scaled_answers - cluster_centers_scaled, axis=1)
    assigned_cluster = np.argmin(distances)

    # A new point has no t-SNE coordinates of its own, so it is drawn at the
    # t-SNE centroid of its assigned cluster.
    return assigned_cluster, tsne_centroids_global[assigned_cluster]

# --- 5. Function to Create or Update the t-SNE Plot ---

def create_tsne_plot(current_tsne_df, new_user_coords=None, new_user_cluster=None):
    """Build the t-SNE scatter figure, optionally highlighting a new user.

    Args:
        current_tsne_df: DataFrame with 't-SNE Component 1',
            't-SNE Component 2' and 'Cluster' columns, indexed like
            ``df_cleaned_global``.
        new_user_coords: Optional pair of (x, y) coordinates for the user marker.
        new_user_cluster: Optional cluster id assigned to the user.

    Returns:
        A ``go.Figure`` with one marker trace per cluster and, when both
        optional arguments are given, one extra trace for the user.
    """
    figure = go.Figure()

    # One trace per cluster, in ascending cluster-id order
    for cluster_id in sorted(current_tsne_df['Cluster'].unique()):
        members = current_tsne_df[current_tsne_df['Cluster'] == cluster_id]
        profile = CLUSTER_PROFILES_INFO.get(cluster_id, {})
        profile_name = profile.get('name', f'Cluster {cluster_id}')

        # Population share of the cluster, shown in the legend and in every hover box
        stats = cluster_statistics.get(cluster_id, {'count': 0, 'percentage': 0})
        share_label = f"{stats['percentage']:.1f}%"

        # Hover text: profile, participant id, population share and demographics
        hover_labels = []
        for participant_id in members.index:
            record = df_cleaned_global.loc[participant_id]
            # Only demographic fields present in the record are listed;
            # underscores become spaces and the first letter is capitalized.
            demographics = "".join(
                f"<br>{field.replace('_', ' ').capitalize()}: {record[field]}"
                for field in DEMOGRAPHIC_VARS
                if field in record
            )
            hover_labels.append(
                f'Profile: {profile_name}<br>'
                f'Participant ID: {participant_id}<br>'
                f'Population: {share_label}'
                f'{demographics}'
            )

        figure.add_trace(go.Scatter(
            x=members['t-SNE Component 1'],
            y=members['t-SNE Component 2'],
            mode='markers',
            name=f"{profile_name} ({share_label})",
            marker={
                'symbol': profile.get('symbol', 'circle'),
                'color': profile.get('color', 'grey'),
                'size': 12,
                'opacity': 0.8,
                'line': {'width': 2, 'color': 'white'},
            },
            hoverinfo='text',
            hovertext=hover_labels,
        ))

    # Optional marker for a freshly classified user
    has_user_point = new_user_coords is not None and new_user_cluster is not None
    if has_user_point:
        user_profile = CLUSTER_PROFILES_INFO.get(new_user_cluster, {})
        user_profile_name = user_profile.get('name', f'Cluster {new_user_cluster}')
        user_stats = cluster_statistics.get(new_user_cluster, {'percentage': 0})
        user_share_label = f"{user_stats['percentage']:.1f}%"

        figure.add_trace(go.Scatter(
            x=[new_user_coords[0]],
            y=[new_user_coords[1]],
            mode='markers',
            name=f'You ({user_share_label})',
            marker={
                # Open diamond, larger size and thick black border make the user stand out
                'symbol': 'diamond-open',
                'color': user_profile.get('color', 'black'),
                'size': 20,
                'line': {'width': 4, 'color': 'black'},
                'opacity': 1,
            },
            hoverinfo='text',
            hovertext=f'Your Profile<br>Assigned Cluster: {user_profile_name}<br>Population: {user_share_label}',
        ))

    # Both axes are hidden completely: no grid, zero line or tick labels
    hidden_axis = {
        'showgrid': False,
        'zeroline': False,
        'showticklabels': False,
        'visible': False,
    }
    figure.update_layout(
        title={
            'text': 'Risk Profile Landscape',
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 24, 'color': '#2c3e50'},
        },
        height=600,
        hovermode='closest',
        template='plotly_white',
        plot_bgcolor='rgba(248,249,250,0.8)',
        paper_bgcolor='white',
        xaxis=dict(hidden_axis),
        yaxis=dict(hidden_axis),
        # Margins keep the figure away from the container edges
        margin={'l': 20, 'r': 20, 't': 80, 'b': 20},
        # Horizontal legend, centered below the plot area
        legend={
            'orientation': "h",
            'yanchor': "bottom",
            'y': -0.1,
            'xanchor': "center",
            'x': 0.5,
            'font': {'size': 12},
        },
    )
    return figure
# Create initial graph to show when loading the application.
# Built once at import time from the survey embedding only (no user marker);
# the layout uses it as the starting figure of the 'tsne-graph' component.
initial_plot = create_tsne_plot(tsne_df_global)

# --- 6. Function to Create Statistics Chart ---
def create_statistics_chart():
    """Build the bar chart of population share per risk profile.

    Reads ``cluster_statistics`` for the counts and percentages and
    ``CLUSTER_PROFILES_INFO`` for names and colors. Clusters without profile
    metadata fall back to the name ``'Cluster <id>'`` and the color ``'grey'``.

    Returns:
        A ``go.Figure`` with one bar per cluster, in ascending cluster-id order.
    """
    # Fix the cluster order once so that names, values, colors and counts stay aligned
    cluster_ids = sorted(cluster_statistics)
    profiles = [CLUSTER_PROFILES_INFO.get(cluster_id, {}) for cluster_id in cluster_ids]

    cluster_names = [
        profile.get('name', f'Cluster {cluster_id}')
        for cluster_id, profile in zip(cluster_ids, profiles)
    ]
    percentages = [cluster_statistics[cluster_id]['percentage'] for cluster_id in cluster_ids]
    colors = [profile.get('color', 'grey') for profile in profiles]
    counts = [cluster_statistics[cluster_id]['count'] for cluster_id in cluster_ids]

    # Create bar chart; the participant count is passed as customdata for the hover box
    fig = go.Figure(data=[
        go.Bar(
            x=cluster_names,
            y=percentages,
            marker_color=colors,
            text=[f'{p:.1f}%' for p in percentages],
            textposition='auto',
            hovertemplate='<b>%{x}</b><br>Population: %{y:.1f}%<br>Count: %{customdata}<extra></extra>',
            customdata=counts
        )
    ])

    fig.update_layout(
        title={
            'text': 'Population Distribution by Risk Profile',
            'x': 0.5, 'xanchor': 'center',
            'font': {'size': 20, 'color': '#2c3e50'}
        },
        xaxis_title="Risk Profiles", yaxis_title="Population Percentage (%)",
        template='plotly_white', height=400, margin=dict(l=20, r=20, t=60, b=20)
    )
    return fig

# --- 7. Initialize Dash Application ---
# The MINTY theme from dash_bootstrap_components is loaded as the external stylesheet.
app = Dash(__name__, external_stylesheets=[dbc.themes.MINTY])
# Necessary for deployment in environments like Heroku or Vercel:
# expose the app's underlying server object as a module-level name.
server = app.server
# Title assigned to the application
app.title = "Steak Risk Survey Dashboard"
# --- 8. Application Layout (Design) ---
# Defines the visual structure of the user interface.
# The layout is built once, when this module is executed, so the values read
# from original_survey_size, cluster_statistics and CLUSTER_PROFILES_INFO are
# the ones available at startup. Component ids defined here are referenced by
# the callbacks in section 9.
# Style dictionaries use camelCase keys (fontWeight, boxShadow, ...), the form
# documented for Dash components.

def _radio_question(label, component_id, default, options=None):
    """Return one form column holding a labelled radio-button question.

    Args:
        label: Text shown above the radio buttons.
        component_id: Id of the RadioItems component (read by the callbacks).
        default: Value selected initially.
        options: Optional list of option dicts; defaults to Yes / No.
    """
    if options is None:
        options = [
            {'label': ' Yes', 'value': 'Yes'},
            {'label': ' No', 'value': 'No'}
        ]
    return dbc.Col([
        html.Label(label, className="fw-bold mb-2"),
        dbc.RadioItems(
            id=component_id,
            options=options,
            value=default,
            className="mb-3"
        )
    ], width=12, lg=6)


app.layout = dbc.Container([
    # Modal for methodology explanation
    dbc.Modal([
        dbc.ModalHeader(dbc.ModalTitle("🔬 Methodology Explanation")),
        dbc.ModalBody([
            html.H5("How the Analysis Works", className="mb-3"),
            html.P([
                "This dashboard uses machine learning techniques to identify distinct risk behavior profiles from survey data. Here's how it works:"
            ]),
            html.Ol([
                html.Li([
                    html.Strong("Data Collection: "),
                    f"We analyzed responses from {original_survey_size:,} participants across 8 key behavioral dimensions."
                ]),
                html.Li([
                    html.Strong("Feature Engineering: "),
                    "Categorical responses are encoded numerically and standardized to ensure equal weighting."
                ]),
                html.Li([
                    html.Strong("Clustering Algorithm: "),
                    "We use Agglomerative Clustering with Ward linkage to group similar behavioral patterns into 4 distinct profiles."
                ]),
                html.Li([
                    html.Strong("Dimensionality Reduction: "),
                    "t-SNE (t-Distributed Stochastic Neighbor Embedding) reduces the 8-dimensional data to 2D for visualization while preserving local structure."
                ]),
                html.Li([
                    html.Strong("Profile Assignment: "),
                    "New users are classified by calculating their distance to each cluster centroid in the scaled feature space."
                ])
            ]),
            html.Hr(),
            html.H5("Key Variables Analyzed", className="mb-3"),
            html.P("The analysis considers these behavioral dimensions:"),
            html.Ul([
                html.Li("🎰 Risk preference in financial decisions (lottery choice)"),
                html.Li("🚬 Smoking behavior"),
                html.Li("🍺 Alcohol consumption"),
                html.Li("🎲 Gambling habits"),
                html.Li("🪂 Extreme sports participation (skydiving)"),
                html.Li("🚗 Traffic rule compliance (speed limits)"),
                html.Li("💔 Relationship fidelity"),
                html.Li("🥩 Dietary choices (steak consumption)")
            ]),
            html.Hr(),
            html.H5("Statistical Validation", className="mb-3"),
            html.P([
                "The clustering solution was validated using silhouette analysis and elbow method. ",
                "The 4-cluster solution provides the optimal balance between interpretability and statistical significance."
            ])
        ]),
        dbc.ModalFooter(
            dbc.Button("Close", id="close-methodology", className="ms-auto", n_clicks=0)
        ),
    ], id="methodology-modal", is_open=False, size="lg"),
    # Header Section
    dbc.Row([
        dbc.Col([
            html.Div([
                html.H1("🎯 Risk Behavior Profile Analysis",
                       className="display-4 text-center mb-3",
                       style={'color': '#2c3e50', 'fontWeight': 'bold'}),
                html.P(f"Discover behavioral profiles derived from a comprehensive survey of {original_survey_size:,} participants. "
                       "Explore the identified profiles and see where you fit in the risk behavior spectrum.",
                       className="lead text-center text-muted mb-4"),
                # Action buttons
                dbc.Row([
                    dbc.Col([
                        dbc.Button(
                            "📊 View Methodology",
                            id="open-methodology",
                            color="info",
                            outline=True,
                            className="me-2"
                        ),
                        dbc.Button(
                            "📈 Population Statistics",
                            id="toggle-stats",
                            color="secondary",
                            outline=True
                        )
                    ], className="text-center")
                ], className="mb-4"),

                html.Hr(className="my-4")
            ])
        ])
    ]),
    # Statistics Section (collapsible)
    dbc.Collapse([
        dbc.Card([
            dbc.CardHeader([
                html.H4("📊 Population Statistics", className="mb-0", style={'color': '#34495e'})
            ]),
            dbc.CardBody([
                dbc.Row([
                    dbc.Col([
                        dcc.Graph(id='statistics-chart', figure=create_statistics_chart())
                    ], width=8),
                    dbc.Col([
                        html.H5("Key Insights", className="mb-3"),
                        # One paragraph per cluster, in ascending cluster-id order
                        html.Div([
                            html.P([
                                html.Strong(f"{cluster_statistics[i]['percentage']:.1f}%"),
                                f" of participants are {CLUSTER_PROFILES_INFO[i]['name']} ",
                                html.Span(CLUSTER_PROFILES_INFO[i]['icon']),
                                html.Br(),
                                html.Small(f"({cluster_statistics[i]['count']} participants)", className="text-muted")
                            ]) for i in sorted(cluster_statistics.keys())
                        ]),
                        html.Hr(),
                        html.P([
                            html.Strong("Total Analyzed: "),
                            f"{sum(cluster_statistics[i]['count'] for i in cluster_statistics)} participants"
                        ], className="text-muted")
                    ], width=4)
                ])
            ])
        ], style={'boxShadow': '0 4px 12px rgba(0,0,0,0.1)'})
    ], id="stats-collapse", is_open=False),
    html.Br(),
    # Risk Profiles Section
    dbc.Row([
        dbc.Col([
            html.H2("🧩 Identified Risk Behavior Profiles",
                   className="h3 mb-4",
                   style={'color': '#34495e', 'fontWeight': '600'})
        ])
    ]),
    # One card per entry of CLUSTER_PROFILES_INFO; each entry's id must also
    # be a key of cluster_statistics.
    dbc.Row([
        dbc.Col([
            dbc.Card([
                dbc.CardHeader([
                    html.H4([
                        html.Span(info["icon"], className="me-2"),
                        f'Profile {i+1}: {info["name"]}',
                        dbc.Badge(
                            f'{cluster_statistics[i]["percentage"]:.1f}%',
                            color="light",
                            text_color="dark",
                            className="ms-2"
                        )
                    ],
                    style={'color': info["color"], 'margin': '0', 'fontWeight': 'bold'})
                # The '15' suffix appended to the hex color is an alpha channel (light tint)
                ], style={'backgroundColor': f'{info["color"]}15', 'border': 'none'}),
                dbc.CardBody([
                    html.P(info["description"],
                          className="card-text",
                          style={'fontSize': '0.95em', 'lineHeight': '1.6'}),
                    html.Small([
                        html.Strong("Population: "),
                        f'{cluster_statistics[i]["count"]} participants ({cluster_statistics[i]["percentage"]:.1f}%)'
                    ], className="text-muted")
                ])
            ],
            style={'border': f'2px solid {info["color"]}', 'boxShadow': '0 4px 12px rgba(0,0,0,0.1)'},
            className="mb-4")
        ], width=12, lg=6) for i, info in CLUSTER_PROFILES_INFO.items()
    ]),
    # Visualization Section
    dbc.Row([
        dbc.Col([
            html.Hr(className="my-5"),
            html.H2("📊 Interactive Profile Landscape",
                   className="h3 mb-4",
                   style={'color': '#34495e', 'fontWeight': '600'}),
            html.P("Each point represents a survey participant, positioned based on their risk behavior patterns. "
                   "Similar profiles cluster together in this visualization. Percentages in the legend show population distribution.",
                   className="text-muted mb-4"),
            dbc.Card([
                dbc.CardBody([
                    dcc.Graph(id='tsne-graph', figure=initial_plot)
                ])
            ], style={'boxShadow': '0 4px 12px rgba(0,0,0,0.1)'})
        ])
    ]),
    # User Classification Section
    dbc.Row([
        dbc.Col([
            html.Hr(className="my-5"),
            html.H2("🔍 Discover Your Risk Profile",
                   className="h3 mb-4",
                   style={'color': '#34495e', 'fontWeight': '600'}),
            html.P("Answer the questions below to see which risk behavior profile matches you best.",
                   className="text-muted mb-4")
        ])
    ]),
    dbc.Card([
        dbc.CardBody([
            # One column per behavioral question; the defaults match the
            # values returned by the reset_form callback.
            dbc.Row([
                # Lottery Choice
                _radio_question(
                    "🎰 Lottery Preference:", 'lottery-choice-input', 'Lottery A',
                    options=[
                        {'label': ' Lottery A (50% chance, $100 payout)', 'value': 'Lottery A'},
                        {'label': ' Lottery B (90% chance, $20 payout)', 'value': 'Lottery B'}
                    ]
                ),
                # Smoking
                _radio_question("🚬 Do you smoke cigarettes?", 'smoke-cigarettes-input', 'No'),
                # Drinking
                _radio_question("🍺 Do you drink alcohol?", 'drink-alcohol-input', 'Yes'),
                # Gambling
                _radio_question("🎲 Do you gamble?", 'gamble-input', 'No'),
                # Skydiving
                _radio_question("🪂 Have you been skydiving?", 'skydiving-input', 'No'),
                # Speed Limit
                _radio_question("🚗 Do you drive above speed limit?", 'speed-limit-input', 'No'),
                # Cheating
                _radio_question("💔 Have you cheated on a partner?", 'cheating-input', 'No'),
                # Steak Eating
                _radio_question("🥩 Do you eat steak?", 'eat-steak-input', 'Yes'),
            ]),
            # Classification Button
            dbc.Row([
                dbc.Col([
                    html.Div([
                        dbc.Button(
                            "🎯 Analyze My Profile",
                            id='classify-button',
                            color="primary",
                            size="lg",
                            className="me-3"
                        ),
                        # Uses a documented color name plus the outline flag,
                        # like the other outline buttons in this layout.
                        dbc.Button(
                            "🔄 Reset Answers",
                            id='reset-button',
                            color="secondary",
                            outline=True,
                            size="lg"
                        )
                    ], className="text-center")
                ], width=12)
            ], className="mt-4")
        ])
    ], style={'boxShadow': '0 4px 12px rgba(0,0,0,0.1)'}, className="mb-4"),
    # Results Section (filled by the classification callback)
    html.Div(id='classification-result', className="mb-4"),
    # Footer
    dbc.Row([
        dbc.Col([
            html.Hr(className="my-2"),
            html.Footer([
                html.P("Dashboard Created with Python-Plotly-Dash | Data Sourced: Thank you to FiveThirtyEight © 2025",
                               className="text-center text-muted small")
            ])
        ])
    ])
], fluid=True, className="py-4")
# --- 9. Callback Functions (Application Logic) ---

# Callback to open/close methodology modal
@app.callback(
    Output("methodology-modal", "is_open"),
    [Input("open-methodology", "n_clicks"), Input("close-methodology", "n_clicks")],
    [State("methodology-modal", "is_open")],
)
def toggle_methodology_modal(n1, n2, is_open):
    """Invert the methodology modal's open flag once either button has a click.

    Args:
        n1: Click count of the ``open-methodology`` button.
        n2: Click count of the ``close-methodology`` button.
        is_open: Current ``is_open`` state of ``methodology-modal``.

    Returns:
        ``not is_open`` when either click count is truthy; otherwise
        ``is_open`` unchanged.
    """
    clicked = n1 or n2
    return (not is_open) if clicked else is_open

# Callback to toggle statistics section
@app.callback(
    Output("stats-collapse", "is_open"),
    [Input("toggle-stats", "n_clicks")],
    [State("stats-collapse", "is_open")],
)
def toggle_stats_collapse(n, is_open):
    """Invert the statistics collapse's open flag when the toggle has a click.

    Args:
        n: Click count of the ``toggle-stats`` control.
        is_open: Current ``is_open`` state of ``stats-collapse``.

    Returns:
        ``is_open`` unchanged while ``n`` is falsy; otherwise ``not is_open``.
    """
    # Guard clause: no click registered, so leave the state as it is.
    if not n:
        return is_open
    return not is_open

# Callback to reset form inputs
@app.callback(
    [Output('lottery-choice-input', 'value'),
     Output('smoke-cigarettes-input', 'value'),
     Output('drink-alcohol-input', 'value'),
     Output('gamble-input', 'value'),
     Output('skydiving-input', 'value'),
     Output('speed-limit-input', 'value'),
     Output('cheating-input', 'value'),
     Output('eat-steak-input', 'value')],
    [Input('reset-button', 'n_clicks')]
)
def reset_form(n_clicks):
    """Return the default answer for every survey input.

    Args:
        n_clicks: Click count of ``reset-button``. It does not influence the
            result: the same defaults are returned whether or not the button
            has been clicked, so no branching on it is needed.

    Returns:
        An 8-tuple of default values, ordered like the callback's outputs.
    """
    return (
        'Lottery A',  # lottery-choice-input
        'No',         # smoke-cigarettes-input
        'Yes',        # drink-alcohol-input
        'No',         # gamble-input
        'No',         # skydiving-input
        'No',         # speed-limit-input
        'No',         # cheating-input
        'Yes',        # eat-steak-input
    )

# Main callback for user classification and visualization update
@app.callback(
    [Output('classification-result', 'children'),
     Output('tsne-graph', 'figure')],
    [Input('classify-button', 'n_clicks')],
    [State('lottery-choice-input', 'value'),
     State('smoke-cigarettes-input', 'value'),
     State('drink-alcohol-input', 'value'),
     State('gamble-input', 'value'),
     State('skydiving-input', 'value'),
     State('speed-limit-input', 'value'),
     State('cheating-input', 'value'),
     State('eat-steak-input', 'value')]
)
def classify_and_visualize(n_clicks, lottery, smoke, drink, gamble, skydive, speed, cheat, steak):
    """Classify the submitted survey answers and build the result card and plot.

    Args:
        n_clicks: Click count of the classify button; a falsy value
            (``None`` or ``0``) means it has not been pressed yet.
        lottery, smoke, drink, gamble, skydive, speed, cheat, steak: Current
            values of the eight survey inputs, in the order of the callback's
            ``State`` list.

    Returns:
        A ``(children, figure)`` tuple. Before the first click it is an empty
        ``html.Div`` and ``initial_plot``; on success it is the profile card
        and the figure returned by ``create_tsne_plot``; when an exception is
        raised it is an alert and ``initial_plot``.
    """
    # If button hasn't been clicked yet, return empty result and initial plot
    if not n_clicks:
        return html.Div(), initial_plot

    try:
        # Collect user responses in the expected format
        user_responses = {
            'Lottery_Choice': lottery,
            'Smoke_Cigarettes': smoke,
            'Drink_Alcohol': drink,
            'Gamble': gamble,
            'Skydiving': skydive,
            'Speed_Limit': speed,
            'Cheating': cheat,
            'Eat_Steak': steak
        }
        # Classify the new user
        assigned_cluster, user_tsne_coords = classify_new_user(user_responses)

        # Get profile information
        profile_info = CLUSTER_PROFILES_INFO.get(assigned_cluster, {})
        profile_name = profile_info.get('name', f'Cluster {assigned_cluster}')
        profile_description = profile_info.get('description', 'No description available.')
        # NOTE(review): the header background below appends '15' to this color,
        # presumably as a hex alpha for '#rrggbb' values; with the 'grey'
        # fallback that produces 'grey15' -- confirm this is acceptable.
        profile_color = profile_info.get('color', 'grey')
        profile_icon = profile_info.get('icon', '🎯')

        # Get cluster statistics
        stats = cluster_statistics.get(assigned_cluster, {'count': 0, 'percentage': 0})

        # Rank clusters from most to least common. The stats lookup above
        # tolerates a cluster with no statistics, so the ranking must too:
        # calling .index() on a missing cluster would raise ValueError and be
        # reported below as a misleading classification error.
        ranked_clusters = sorted(
            cluster_statistics.keys(),
            key=lambda cluster: cluster_statistics[cluster]['percentage'],
            reverse=True
        )
        if assigned_cluster in ranked_clusters:
            ranking_text = f"#{ranked_clusters.index(assigned_cluster) + 1} most common profile"
        else:
            ranking_text = "Not available"

        # Create updated visualization
        updated_plot = create_tsne_plot(tsne_df_global, user_tsne_coords, assigned_cluster)
        # Create result card
        result_card = dbc.Card([
            dbc.CardHeader([
                html.H3([
                    html.Span(profile_icon, className="me-3"),
                    f"Your Profile: {profile_name}",
                    dbc.Badge(
                        f'{stats["percentage"]:.1f}% of population',
                        color="light",
                        text_color="dark",
                        className="ms-3"
                    )
                ], style={'color': profile_color, 'margin': '0', 'font-weight': 'bold'})
            ], style={'background-color': f'{profile_color}15', 'border': 'none'}),
            dbc.CardBody([
                html.P(profile_description,
                      className="card-text mb-4",
                      style={'font-size': '1.1em', 'line-height': '1.6'}),
                dbc.Row([
                    dbc.Col([
                        html.H5("📊 Population Statistics", className="mb-3"),
                        html.P([
                            html.Strong("Your Profile Size: "),
                            f"{stats['count']} participants ({stats['percentage']:.1f}%)"
                        ]),
                        html.P([
                            html.Strong("Ranking: "),
                            ranking_text
                        ])
                    ], width=12, lg=6),
                    dbc.Col([
                        html.H5("🎯 Your Responses", className="mb-3"),
                        html.Ul([
                            html.Li(f"Lottery: {lottery}"),
                            html.Li(f"Smoking: {smoke}"),
                            html.Li(f"Alcohol: {drink}"),
                            html.Li(f"Gambling: {gamble}"),
                            html.Li(f"Skydiving: {skydive}"),
                            html.Li(f"Speed Limit: {speed}"),
                            html.Li(f"Cheating: {cheat}"),
                            html.Li(f"Steak: {steak}")
                        ], className="small")
                    ], width=12, lg=6)
                ]),

                html.Hr(),
                html.P([
                    html.I(className="fas fa-info-circle me-2"),
                    "Your position in the visualization above shows how your risk profile compares to others. "
                    "Points closer together represent similar behavioral patterns."
                ], className="text-muted small")
            ])
        ], style={'border': f'3px solid {profile_color}', 'box-shadow': '0 6px 20px rgba(0,0,0,0.15)'})

        return result_card, updated_plot

    except ValueError as e:
        # Handle errors in classification (presumably raised by
        # classify_new_user for invalid answers -- verify against that helper)
        error_alert = dbc.Alert([
            html.H4("⚠️ Classification Error", className="alert-heading"),
            html.P(str(e)),
            html.P("Please make sure all questions are answered with valid options.", className="mb-0")
        ], color="warning")

        return error_alert, initial_plot

    except Exception as e:
        # Handle unexpected errors: this is the callback's outermost boundary,
        # so show an alert instead of letting the exception propagate.
        error_alert = dbc.Alert([
            html.H4("❌ Unexpected Error", className="alert-heading"),
            html.P("An unexpected error occurred during classification. Please try again."),
            html.P(f"Error details: {str(e)}", className="small text-muted mb-0")
        ], color="danger")

        return error_alert, initial_plot
Py.Cafe

steak_risk_survey