Py.Cafe

edumunozsala/

streamlit-on-pycafe-guide

Streamlit on Py.cafe: Interactive Guide

Docs | Pricing
  • app.py
  • listings.csv
  • requirements.txt
  • reviews_sample.csv
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
# Import section
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import datetime
import re
import pydeck as pdk

# Use the full browser width for the app layout
st.set_page_config(layout="wide")

# Main page title
st.title("Streamlit app")

# Sidebar: app title plus the section index that selects which page is rendered
st.sidebar.title("Streamlit Demo")

st.sidebar.header('Índice')
# Streamlit discourages empty widget labels (accessibility). The header above
# already acts as the visible caption, so the radio gets a real label that is
# hidden from view instead of an empty string.
radio = st.sidebar.radio(
    label="Índice",
    options=[
        "1. Cargar los datos",
        "2. Crear gráficos",
        "3. Crear mapas",
    ],
    label_visibility="collapsed",
)

# Functions to include
def read_data(rev_path, places_path):
    """Load the reviews and listings CSV files.

    Args:
        rev_path: Path (or file-like object) of the reviews CSV; it must
            contain a ``date`` column.
        places_path: Path (or file-like object) of the listings CSV.

    Returns:
        A ``(reviews, listings)`` tuple of dataframes. The reviews ``date``
        column holds ``datetime.date`` values, and the listings ``id`` column
        is renamed to ``listing_id``.
    """
    reviews = pd.read_csv(rev_path)
    # Parse to timestamps first, then keep only the calendar date part
    parsed_dates = pd.to_datetime(reviews['date'])
    reviews['date'] = parsed_dates.dt.date

    # Rename the key so it matches the ``listing_id`` column name
    listings = pd.read_csv(places_path).rename(columns={"id": "listing_id"})
    return reviews, listings

def enrich_dataset(df1, df2):
    """Merge two dataframes on ``listing_id`` and derive the analysis columns.

    The merged frame must end up with a ``date`` column (used to drop rows
    and to derive ``year``/``month``/``day``) and a ``price`` column.

    Args:
        df1: Left dataframe of the merge; must contain ``listing_id``.
        df2: Right dataframe of the merge; must contain ``listing_id``.

    Returns:
        The merged dataframe restricted to rows with a non-null ``date``,
        with ``year``, ``month`` and ``day`` columns added and ``price``
        converted to float. Prices that are missing or cannot be parsed
        become NaN instead of raising.
    """
    # Outer join keeps unmatched rows from both sides; ``_merge`` records
    # which side each row came from.
    enriched_df = pd.merge(left=df1, right=df2, on=['listing_id'], how='outer', indicator=True)

    # Clean null dates
    enriched_df = clean_null_dates(enriched_df)

    # Split time columns into year, month and day
    enriched_df = split_time_columns(enriched_df)

    # Format price: strip everything except digits and the decimal point.
    # Rows that had no match on the right side carry NaN here, which a
    # per-value ``re.sub`` cannot handle, so convert the column as a whole.
    price = enriched_df['price']
    if not pd.api.types.is_numeric_dtype(price):
        digits_only = price.astype(str).str.replace(r"[^\d.]", "", regex=True)
        price = pd.to_numeric(digits_only, errors='coerce')
    enriched_df['price'] = price.astype(float)

    return enriched_df

def clean_null_dates(df):
    """Return only the rows of ``df`` whose ``date`` value is not null."""
    has_date = df['date'].notna()
    return df[has_date]

def split_time_columns(df):
    """Add ``year``, ``month`` and ``day`` columns taken from ``df['date']``.

    The columns are written onto ``df`` itself, which is also returned.
    Every value in ``date`` must expose ``year``, ``month`` and ``day``
    attributes.
    """
    for part in ('year', 'month', 'day'):
        # Bind ``part`` as a default so each lambda reads its own attribute
        df[part] = df['date'].apply(lambda date, part=part: getattr(date, part))
    return df

def get_min_max_date(df):
    """Return the ``(earliest, latest)`` values of the ``date`` column."""
    dates = df['date']
    return dates.min(), dates.max()

def customize_interval(min_date, max_date):
    """Ask for a start and end date in the sidebar and report their validity.

    Args:
        min_date: Initial value of the start-date input.
        max_date: Initial value of the end-date input.

    Returns:
        The ``(start_date, end_date)`` pair chosen in the sidebar. The pair
        is returned even when it is invalid; in that case an error message
        is shown in the sidebar.
    """
    start_date = st.sidebar.date_input('Fecha de inicio ', min_date)
    end_date = st.sidebar.date_input('Fecha de fin ', max_date)
    # A single-day interval (start == end) is valid: the error message only
    # rules out an end date that falls before the start date.
    if start_date <= end_date:
        st.sidebar.success(f'Fecha de inicio: `{start_date}`\n\nFecha de fin: `{end_date}`')
    else:
        st.sidebar.error('Error: La fecha final no puede ser anterior a la de inicio')
    return start_date, end_date


# Load both datasets; every section below works from these two dataframes
reviews_df, places_df = read_data("reviews_sample.csv", "listings.csv")

# Section 1: preview the raw datasets, with optional column and listing filters
if radio == '1. Cargar los datos':
    # Show basic data
    st.subheader("Cargar los datos")
    st.write("Dataset de reviews")
    st.write(reviews_df.head(500))

    if st.checkbox('Filtrar columnas de reviews', value=False):
        # Filter columns in sidebar
        st.sidebar.subheader("Filtrado de columnas")
        rev_cols_to_show = st.sidebar.multiselect('Dataset de reviews:', reviews_df.columns)

        st.write("Dataset de reviews filtrado")
        st.write(reviews_df[rev_cols_to_show].head(500))

    st.write("Cargamos el dataset de alojamientos")
    st.write(places_df.head(500))

    if st.checkbox('Filtrar columnas de alojamientos', value=False):
        # Filter columns in sidebar
        st.sidebar.subheader("Filtrado de columnas")
        places_cols_to_show = st.sidebar.multiselect('Dataset de alojamientos:', places_df.columns)

        # Caption corrected: this table shows the listings, not the reviews
        st.write("Dataset de alojamientos filtrado")
        st.write(places_df[places_cols_to_show].head(500))

        if st.checkbox('Mostrar información de los alojamientos', value=False):
            st.sidebar.subheader("Filtrado de alojamientos por id")
            # Sorted unique ids give a stable option order across reruns
            # (iterating a set does not guarantee any particular order)
            listing_options = sorted(places_df['listing_id'].unique())
            listings_ids = st.sidebar.multiselect('Id de los alojamientos:', listing_options)
            for listing_id in listings_ids:
                st.markdown("- - -")
                place = places_df[places_df['listing_id'] == listing_id]
                st.markdown("### **__Nombre del alojamiento__**")
                st.write(place['name'].values[0])
                st.markdown("### **__Descripción del alojamiento__**")
                # NOTE(review): the description is rendered as raw HTML taken
                # from the CSV; confirm the file is a trusted source.
                st.write(place['description'].values[0], unsafe_allow_html=True)
                st.markdown("### **__Foto del alojamiento__**")
                picture_url = place['picture_url'].values[0]
                st.image(picture_url)
                st.markdown("- - -")

# Section 2: charts built from the reviews enriched with listing information
if radio == '2. Crear gráficos':
    st.subheader("Creación de gráficos")
    # Enrich the dataframe
    enriched_df = enrich_dataset(reviews_df, places_df)

    st.write("Resultado después de enriquecer los datos. Dataset de reviews con información de alojamientos")

    # Get the min and max review date
    min_date, max_date = get_min_max_date(enriched_df)

    if st.checkbox('Explorar los datos por fecha de review', value=False):
        st.sidebar.title("Filtrar por fecha de review")
        custom_min_date, custom_max_date = customize_interval(min_date, max_date)

        # Inclusive filter on both ends of the chosen interval
        in_interval = (enriched_df['date'] >= custom_min_date) & (enriched_df['date'] <= custom_max_date)
        enriched_df = enriched_df[in_interval]

        if enriched_df.empty:
            # An inverted or out-of-range interval selects nothing; skip the
            # charts rather than computing aggregates on an empty frame.
            st.warning('No hay reviews en el intervalo de fechas seleccionado')
        else:
            fig_dates = px.histogram(enriched_df, x="date", title="Distribución de reviews por día")
            st.plotly_chart(fig_dates)

            st.markdown("#### Evolución de la relación calidad-precio por barrio y año-mes")
            # One row per (year, month, neighbourhood): mean price, mean
            # value score and number of reviews
            reviews_neighbourhood_evol_df = (
                enriched_df
                .groupby(['year', 'month', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed'])
                .agg({'price': 'mean', 'review_scores_value': 'mean', 'listing_id': 'count'})
                .reset_index()
            )
            # Zero-padded "YYYY-MM" key used as the animation frame
            reviews_neighbourhood_evol_df['yearMonth'] = (
                reviews_neighbourhood_evol_df['year'].astype(str)
                + '-'
                + reviews_neighbourhood_evol_df['month'].astype(str).str.zfill(2)
            )
            reviews_neighbourhood_evol_df = reviews_neighbourhood_evol_df.rename(columns={"listing_id": "count"})
            reviews_neighbourhood_evol_df = reviews_neighbourhood_evol_df[
                ['yearMonth', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed',
                 'count', 'review_scores_value', 'price']
            ]

            st.write(reviews_neighbourhood_evol_df)
            fig_anim = px.scatter(
                reviews_neighbourhood_evol_df,
                x="review_scores_value",
                y="price",
                animation_frame="yearMonth",
                animation_group="neighbourhood_cleansed",
                size='count',
                color="neighbourhood_group_cleansed",
                hover_name="neighbourhood_cleansed",
            )
            st.plotly_chart(fig_anim)

# Section 3: maps built from the reviews enriched with listing information
if radio == '3. Crear mapas':
    st.subheader("Creación de mapas")

    # Enrich the dataframe
    enriched_df = enrich_dataset(reviews_df, places_df)
    st.write("Resultado después de enriquecer los datos. Dataset de reviews con información de alojamientos")

    # Plot a map. Only the coordinates are needed, and rows without them
    # (the outer merge may leave some) cannot be placed on the map.
    st.write("Mapa de todos los alojamientos")
    st.map(enriched_df[['latitude', 'longitude']].dropna())

    # Filter by zone and year
    if st.checkbox('Filtrar por zona y periodo de tiempo en Valencia', value=False):
        st.sidebar.subheader("Filtrado de las zonas")
        # Sorted, null-free option lists keep the selectboxes stable across
        # reruns (iterating a set does not guarantee any particular order)
        zone_options = sorted(enriched_df['neighbourhood_group_cleansed'].dropna().unique())
        year_options = sorted(enriched_df['year'].unique())
        zone = st.sidebar.selectbox('Selecciona la zona:', zone_options)
        year = st.sidebar.selectbox('Selecciona el año:', year_options)

        selected = (enriched_df['neighbourhood_group_cleansed'] == zone) & (enriched_df['year'] == year)
        filtered_df = enriched_df[selected]
        coords = filtered_df[['longitude', 'latitude']].dropna()

        if coords.empty:
            # Without points the view centre would be NaN; skip the chart
            st.warning('No hay alojamientos para la zona y el año seleccionados')
        else:
            # Centre the view on the mean position of the selected points
            zone_midpoint = (coords["latitude"].mean(), coords["longitude"].mean())
            st.pydeck_chart(pdk.Deck(
                map_style='mapbox://styles/mapbox/light-v9',
                initial_view_state=pdk.ViewState(
                    latitude=zone_midpoint[0],
                    longitude=zone_midpoint[1],
                    zoom=11.5,
                    pitch=50,
                ),
                layers=[
                    pdk.Layer(
                        'HexagonLayer',
                        data=coords,
                        get_position='[longitude, latitude]',
                        radius=200,
                        elevation_scale=4,
                        elevation_range=[0, 1000],
                        pickable=True,
                        extruded=True,
                    ),
                    pdk.Layer(
                        'ScatterplotLayer',
                        data=coords,
                        get_position='[longitude, latitude]',
                        get_color='[200, 30, 0, 160]',
                        get_radius=200,
                    ),
                ],
            ))
            if st.checkbox('Mostrar los datos', value=False):
                # Number of reviews per neighbourhood within the selection
                grouped_filtered_df = (
                    filtered_df
                    .groupby(['neighbourhood_group_cleansed', 'neighbourhood_cleansed'])
                    .agg(count_reviews=('listing_id', 'count'))
                )
                st.write(grouped_filtered_df)