import pandas as pd
import plotly.express as px
from sksurv.nonparametric import kaplan_meier_estimator
import dash
from dash import dcc, html, Input, Output
import dash_bootstrap_components as dbc
# CSV file read by load_and_preprocess_data().
DATA_FILE = 'Dallas_Animal_Shelter_Data_Fiscal_Year_Jan_2024.csv'

# Raw Outcome_Type values grouped by the two-way category they collapse to.
# load_and_preprocess_data() flags rows whose category is 'Exit' as events.
_EXIT_OUTCOMES = (
    'ADOPTION',
    'RETURNED TO OWNER',
    'TRANSFER',
    'FOSTER',
    'DISPOSAL',
    'TNR',
    'WILDLIFE',
    'SNR',
)
_STAY_OUTCOMES = (
    'EUTHANIZED',
    'DIED',
    'LOST EXP',
    'FOUND EXP',
    'TREATMENT',
    'MISSING',
)
OUTCOME_MAPPING = {
    **dict.fromkeys(_EXIT_OUTCOMES, 'Exit'),
    **dict.fromkeys(_STAY_OUTCOMES, 'Stay'),
}

# Number of most frequent breeds kept for the dashboard.
TOP_BREEDS_COUNT = 20
def load_and_preprocess_data():
    """Load the shelter CSV and prepare the dog records used by the dashboard.

    Keeps only rows whose ``Animal_Type`` is ``'DOG'``, derives the outcome
    category, intake/outcome timestamps, stay duration in days and the event
    flag, discards rows with a missing or negative duration, and finally
    restricts the data to the ``TOP_BREEDS_COUNT`` most frequent breeds.

    Returns:
        tuple: ``(df_top_breeds, top_breeds)`` where ``df_top_breeds`` is the
        filtered DataFrame and ``top_breeds`` is the pandas Index of breed
        names taken from ``value_counts().nlargest(...)``.
    """
    raw = pd.read_csv(DATA_FILE)

    # Work on an explicit copy so the column assignments below do not
    # trigger pandas chained-assignment warnings.
    dogs = raw.loc[raw['Animal_Type'] == 'DOG'].copy()

    # Outcome types missing from OUTCOME_MAPPING pass through unchanged.
    dogs['Outcome_Category'] = dogs['Outcome_Type'].replace(OUTCOME_MAPPING)

    intake_text = dogs['Intake_Date'] + ' ' + dogs['Intake_Time']
    dogs['Intake_DateTime'] = pd.to_datetime(intake_text)
    outcome_text = dogs['Outcome_Date'] + ' ' + dogs['Outcome_Time']
    dogs['Outcome_DateTime'] = pd.to_datetime(outcome_text)

    # Length of stay expressed in (fractional) days.
    stay = dogs['Outcome_DateTime'] - dogs['Intake_DateTime']
    dogs['Duration'] = stay.dt.total_seconds() / (60 * 60 * 24)
    dogs['Event'] = dogs['Outcome_Category'] == 'Exit'

    # Drop rows without a computable duration, then those where the outcome
    # precedes the intake.
    dogs = dogs.dropna(subset=['Duration'])
    dogs = dogs[dogs['Duration'] >= 0]

    breed_frequency = dogs['Animal_Breed'].value_counts()
    top_breeds = breed_frequency.nlargest(TOP_BREEDS_COUNT).index
    in_top_breeds = dogs['Animal_Breed'].isin(top_breeds)
    return dogs[in_top_breeds], top_breeds
# Load the data once at import time; the plotting helpers and the callback
# below read these module-level objects.
df_top_breeds, top_breeds = load_and_preprocess_data()

# Fixed breed -> colour assignment so a breed keeps its colour in every chart.
# The palette is reused cyclically when there are more breeds than colours.
_palette = px.colors.qualitative.D3
breed_colors = {
    breed: _palette[position % len(_palette)]
    for position, breed in enumerate(top_breeds)
}
def calculate_survival_curves(breeds):
    """Compute a Kaplan-Meier curve for each requested breed.

    Args:
        breeds: Iterable of breed names to look up in the module-level
            ``df_top_breeds`` frame (column ``Animal_Breed``).

    Returns:
        pandas.DataFrame: Long-format frame with columns ``Time``,
        ``Survival Probability`` and ``Breed``, holding the estimator output
        for every breed stacked together. The frame is empty (but has the
        same columns) when ``breeds`` is empty or none of the breeds has rows.
    """
    columns = ['Time', 'Survival Probability', 'Breed']
    survival_data = []
    for breed in breeds:
        breed_data = df_top_breeds[df_top_breeds['Animal_Breed'] == breed]
        if breed_data.empty:
            # Nothing to estimate for this breed; skip it instead of handing
            # empty arrays to the estimator.
            continue
        time, survival_prob = kaplan_meier_estimator(
            breed_data['Event'].astype(bool), breed_data['Duration']
        )
        survival_data.append(
            pd.DataFrame({'Time': time, 'Survival Probability': survival_prob, 'Breed': breed})
        )

    if not survival_data:
        # pd.concat raises ValueError on an empty list, so return an empty
        # frame that callers can detect with ``.empty``.
        return pd.DataFrame(columns=columns)
    return pd.concat(survival_data)
def create_survival_plot(survival_df):
    """Build the per-breed line chart from the survival-curve data.

    Args:
        survival_df: DataFrame with ``Time``, ``Survival Probability`` and
            ``Breed`` columns. An empty frame yields a placeholder figure.

    Returns:
        A Plotly figure with one line per breed, coloured via the
        module-level ``breed_colors`` mapping.
    """
    if survival_df.empty:
        return px.line(title="No data to show")

    def arial(size):
        # All text in this figure uses black Arial; only the size varies.
        return dict(size=size, family='Arial', color='black')

    # NOTE(review): the y axis is labelled '% Exit Probability' but the values
    # plotted come from the 'Survival Probability' column, and the curve is
    # drawn as a spline rather than a step line - confirm this is intended.
    axis_labels = {'Time': 'Days', 'Survival Probability': '% Exit Probability'}
    fig = px.line(
        survival_df,
        x='Time',
        y='Survival Probability',
        color='Breed',
        color_discrete_map=breed_colors,
        markers=True,
        labels=axis_labels,
        line_shape="spline",
        template='plotly_white',
    )
    fig.update_layout(
        title_font=arial(20),
        xaxis_title_font=arial(14),
        yaxis_title_font=arial(14),
        legend_title_font=arial(16),
    )
    return fig
def create_breed_count_plot(filtered_df):
    """Build a bar chart with the number of records per breed.

    Args:
        filtered_df: DataFrame containing an ``Animal_Breed`` column.

    Returns:
        A Plotly bar figure with one bar per breed (ordered by descending
        count, as produced by ``value_counts``), the count printed on each
        bar, the y axis hidden and no legend.
    """
    breed_counts = filtered_df['Animal_Breed'].value_counts().reset_index()
    # Rename positionally so this works regardless of the column names that
    # reset_index() produces.
    breed_counts.columns = ['Breed', 'Count']
    fig_bar_breedcount = px.bar(
        breed_counts,
        x='Breed',
        y='Count',
        # Counts are whole numbers: show them as-is instead of forcing two
        # decimals (e.g. "123.00").
        text_auto=True,
        template='plotly_white',
        labels={'Breed': ''},
        color='Breed',
        color_discrete_map=breed_colors,
    )
    # Bar labels already carry the values, so the y axis is redundant.
    fig_bar_breedcount.update_yaxes(visible=False)
    fig_bar_breedcount.update_layout(showlegend=False)
    return fig_bar_breedcount
def create_breed_duration_plot(filtered_df):
    """Build a bar chart of the mean stay duration per breed.

    Args:
        filtered_df: DataFrame with ``Animal_Breed`` and ``Duration`` columns.

    Returns:
        A Plotly bar figure with one bar per breed, sorted by descending mean
        duration. Each bar shows its mean value and carries an extra
        annotation with the breed's median duration.
    """
    per_breed = filtered_df.groupby('Animal_Breed')['Duration'].agg(['mean', 'median'])
    stats = per_breed.reset_index().sort_values('mean', ascending=False)

    fig = px.bar(
        stats,
        x='Animal_Breed',
        y='mean',
        text_auto='.2f',
        template='plotly_white',
        labels={'Animal_Breed': ''},
        color='Animal_Breed',
        color_discrete_map=breed_colors,
    )
    fig.update_yaxes(visible=False)
    fig.update_layout(showlegend=False)

    # Place the median label at the top of each bar, nudged upwards slightly.
    for breed, mean_days, median_days in zip(stats['Animal_Breed'], stats['mean'], stats['median']):
        fig.add_annotation(
            x=breed,
            y=mean_days,
            text=f"Median: {median_days:.2f}",
            showarrow=False,
            yshift=10,
        )
    return fig
# Styles: gradient divider bars and shared text styling.
style_space = {'border': 'none', 'height': '5px', 'background': 'linear-gradient(to right, #007bff, #ff7b00)', 'margin': '10px 0'}
# Same divider, only thinner.
_thin_divider_style = {**style_space, 'height': '2px'}
_centered = {'text-align': 'center'}
_intro_style = {'text-align': 'center', 'margin-top': '20px', 'font-style': 'italic', 'font-size': '24px', 'color': 'black'}

# Dash app
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.LUX])
app.title = ' Dallas Animal Shelter'

# Multi-select breed picker; the first three breeds are preselected.
_breed_picker = dcc.Dropdown(
    id='breed-dropdown',
    options=[{'label': breed, 'value': breed} for breed in top_breeds],
    value=top_breeds[:3],
    multi=True,
)

app.layout = dbc.Container(
    [
        # Page header.
        html.Hr(style=style_space),
        html.H2("Dog Diaries: Analyzing Breeds' Fate in Dallas Animal Shelter", style=_centered),
        html.Hr(style=style_space),
        # Introductory note.
        html.Div(
            [
                html.P("Analyzing only dogs, which make up 80% of the shelter animals, we selected the top 20 breeds, representing 90% of this group", style=_intro_style),
                html.Hr(style=style_space),
            ]
        ),
        # Breed selection.
        dbc.Row(
            [
                dbc.Col(dbc.Card(_breed_picker), width=12, class_name="btn-group dash-dropdown"),
            ]
        ),
        html.Hr(style=_thin_divider_style),
        # Full-width survival chart.
        dbc.Row(
            [
                html.H5("Exit Probability by Days: A Closer Look at Each Breed", style=_centered),
                dbc.Col(dcc.Graph(id='survival-plot'), width=12),
            ]
        ),
        # Two half-width bar charts side by side.
        dbc.Row(
            [
                dbc.Col(
                    [
                        html.H5("Top Dog Breeds at Dallas Shelter: A Count Analysis", style=_centered),
                        dcc.Graph(id='breed-count-plot'),
                    ],
                    width=6,
                ),
                dbc.Col(
                    [
                        html.H5("Shelter Stays: Average and Median Duration Dog Breeds", style=_centered),
                        dcc.Graph(id='breed-duration-plot'),
                    ],
                    width=6,
                ),
            ]
        ),
    ],
    fluid=True,
)
@app.callback(
    [
        Output('survival-plot', 'figure'),
        Output('breed-count-plot', 'figure'),
        Output('breed-duration-plot', 'figure'),
    ],
    Input('breed-dropdown', 'value'),
)
def update_plot(selected_breeds):
    """Refresh the three charts whenever the breed selection changes.

    Args:
        selected_breeds: Current value of the ``breed-dropdown`` component.

    Returns:
        tuple: ``(survival figure, breed-count figure, breed-duration
        figure)``. With nothing selected, the survival slot gets the
        "no data" placeholder and the two bar slots get empty bar figures.
    """
    if not selected_breeds:
        return create_survival_plot(pd.DataFrame()), px.bar(), px.bar()

    is_selected = df_top_breeds['Animal_Breed'].isin(selected_breeds)
    filtered_df = df_top_breeds[is_selected]

    survival_fig = create_survival_plot(calculate_survival_curves(selected_breeds))
    count_fig = create_breed_count_plot(filtered_df)
    duration_fig = create_breed_duration_plot(filtered_df)
    return survival_fig, count_fig, duration_fig