PyCafe - Dash - ny_times_best

NYT Fiction Bestsellers.xlsx
app.py
requirements.txt
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import dash
from dash import dcc, html
import dash_bootstrap_components as dbc
from dash.dependencies import Input, Output
import pandas as pd
import plotly.express as px
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the NLTK resources this module relies on: tokenizer data for
# word_tokenize, the English stop-word list, WordNet for the lemmatizer and
# the VADER lexicon for sentiment scoring.
# 'punkt_tab' is the tokenizer data that newer NLTK releases load for
# word_tokenize; 'punkt' is kept so older releases keep working.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'vader_lexicon'):
    nltk.download(_resource)

# Prepare the data: load the second sheet (index 1) of the workbook and
# reverse its row order.
df = pd.read_excel("NYT Fiction Bestsellers.xlsx", sheet_name=1)[::-1]

# Keep the first row seen for each title (in the reversed order). The explicit
# copy makes df_subset own its data, so the column assignments below do not
# write into a slice of df and do not raise SettingWithCopyWarning.
df_subset = df.drop_duplicates(subset=['title']).copy()
df_subset['title'] = df_subset['title'].astype(str)
df_subset['desc'] = df_subset['desc'].astype(str)
# .dt requires 'bestsellers_date' to be a datetime column.
df_subset['year'] = df_subset['bestsellers_date'].dt.year

# Shared text-processing helpers used by generate_ngrams.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english')) | set(string.punctuation)

def generate_ngrams(text, n):
    """Return the n-grams of *text* as space-joined strings.

    The text is lower-cased and split with ``word_tokenize``. Tokens that are
    not purely alphabetic, or that appear in ``stop_words``, are discarded;
    the remaining tokens are lemmatized and then grouped into runs of *n*.
    """
    cleaned = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]
    return [' '.join(group) for group in ngrams(cleaned, n)]

def plot_top_ngrams_plotly(freq_counter):
    """Build a bar chart of the ten most common entries of *freq_counter*.

    Args:
        freq_counter: A ``collections.Counter`` mapping n-gram strings to
            their counts.

    Returns:
        A Plotly figure with one bar per n-gram, the count printed on each
        bar and the y axis hidden. An empty counter yields a chart with no
        bars instead of raising.
    """
    top_n = freq_counter.most_common(10)
    # zip(*[]) produces nothing, so unpacking it into two names would raise
    # ValueError; fall back to empty sequences when there is nothing to plot.
    if top_n:
        labels, counts = zip(*top_n)
    else:
        labels, counts = (), ()

    fig = px.bar(
        y=list(counts),
        x=list(labels),
        template="ggplot2",
        labels={'x': '', 'y': ''},
        text_auto=True,
    )
    fig.update_layout(
        paper_bgcolor='rgb(240, 240, 240)',
        plot_bgcolor='rgb(240, 240, 240)',
    )
    # The counts are already shown on the bars (text_auto), so the axis is hidden.
    fig.update_yaxes(visible=False)
    return fig

# Shared analyzer instance, created on first use by get_sentiment.
_SENTIMENT_ANALYZER = None


def get_sentiment(text):
    """Return the VADER compound sentiment score of *text*.

    The analyzer is created once and reused on later calls, because this
    function is applied to every row of a DataFrame and constructing a new
    ``SentimentIntensityAnalyzer`` per row repeats the same setup work.
    """
    global _SENTIMENT_ANALYZER
    if _SENTIMENT_ANALYZER is None:
        _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
    return _SENTIMENT_ANALYZER.polarity_scores(text)['compound']

# Inline style for the year selector. Dash style dictionaries use camelCased
# CSS property names, so every key follows that convention.
radio_style = {
    'display': 'flex',
    'flexDirection': 'row',
    'justifyContent': 'space-between',
    'padding': '5px',
    'border': '2px solid',
    'borderRadius': '5px',
    'boxShadow': '3px 3px 3px rgba(10, 10, 10, 0.3)',
    'fontFamily': 'Aharoni, sans-serif',
    'fontSize': '20px',
}

# Inline style shared by the section headings.
header_style = {'textAlign': 'center', 'margin': '10px', 'padding': '10px'}
    
# Initialize Dash app
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.JOURNAL])

app.title = "NY Times Fiction Best-Sellers"

# Page heading; the "title" CSS class styles it.
_heading_row = dbc.Row([
    dbc.Col(
        html.H1("NYT Bestseller Sentiment & Trends", className="title"),
        width=12,
    )
])

# One-sentence summary shown under the heading.
_intro_row = dbc.Row([
    dbc.Col(
        html.H5("Sentiment analysis reveals trends in NYT fiction bestsellers: titles, descriptions, sentiment, bigrams/trigrams, and author sentiment."),
        width=12,
    )
])

# Year selector: one radio option per distinct year present in df_subset.
_year_selector = dbc.RadioItems(
    id='radio-buttons',
    options=[{'label': str(year), 'value': year} for year in df_subset.year.unique()],
    value=2018,
    inline=True,
    style=radio_style,
)
_year_row = dbc.Row([dbc.Col(_year_selector, width=12)])

# Two half-width columns holding the n-gram bar charts.
_bigram_col = dbc.Col(
    [
        html.H5("Top 10 Title Bigrams", style=header_style),
        html.Hr(),
        dcc.Graph(id='bigrams-chart', className="dash-graph"),
    ],
    width=6,
)
_trigram_col = dbc.Col(
    [
        html.H5("Top 10 Description Trigrams", style=header_style),
        html.Hr(),
        dcc.Graph(id='trigrams-chart', className="dash-graph"),
    ],
    width=6,
)
_ngram_row = dbc.Row([_bigram_col, _trigram_col])

# Sentiment histogram, per-author box plot (with its refresh button) and a
# full-width note explaining the score scale.
_sentiment_col = dbc.Col(
    [
        html.H5("Book Description Sentiment by Year", style=header_style),
        html.Hr(),
        dcc.Graph(id='sentiment-chart'),
    ],
    width=5,
)
_author_col = dbc.Col(
    html.Div([
        html.H5("Comparative Sentiment Distribution of Bestselling Authors", style=header_style),
        html.Hr(),
        html.Button('Update Authors', id='update-authors-button', className="update-button"),
        dcc.Graph(id='sentiment_author-boxchart'),
    ]),
    width=7,
)
_scale_note_col = dbc.Col(
    html.H5("Sentiment scores range from -1 (VERY NEGATIVE) to +1 (VERY POSITIVE). Scores close to 0 indicate neutral sentiment.", style=header_style),
    width=12,
)
_sentiment_row = dbc.Row([_sentiment_col, _author_col, _scale_note_col])

app.layout = dbc.Container(
    [
        _heading_row,
        _intro_row,
        _year_row,
        _ngram_row,
        html.Hr(),
        _sentiment_row,
    ],
    fluid=True,
    style={'backgroundColor': '#f0f0f0'},
)

@app.callback(
    Output('bigrams-chart', 'figure'),
    Output('trigrams-chart', 'figure'),
    Output('sentiment-chart', 'figure'),
    Output('sentiment_author-boxchart', 'figure'),
    Input('radio-buttons', 'value'),
    Input('update-authors-button', 'n_clicks'),  # button input
)
def update_charts(year, n_clicks):
    """Rebuild all four charts for the selected year.

    Args:
        year: Value of the selected radio option; rows of ``df_subset`` whose
            ``year`` equals it are used.
        n_clicks: Click count of the "Update Authors" button. The value itself
            is not used; the input exists so that a click re-runs the callback
            and draws a fresh random set of authors.

    Returns:
        A tuple ``(bigrams_fig, trigrams_fig, sentiment_fig,
        sentiment_author_fig)`` in the order of the declared outputs.
    """
    # Work on an independent copy so that adding the 'sentiment' column below
    # does not write into a slice of df_subset (SettingWithCopyWarning).
    filtered_df = df_subset[df_subset.year == year].copy()

    # Top bigrams across all titles of the year.
    bigram_freq = Counter(
        gram
        for title in filtered_df['title']
        for gram in generate_ngrams(title, 2)
    )
    bigrams_fig = plot_top_ngrams_plotly(bigram_freq)

    # Top trigrams across all descriptions of the year.
    trigram_freq = Counter(
        gram
        for desc in filtered_df['desc']
        for gram in generate_ngrams(desc, 3)
    )
    trigrams_fig = plot_top_ngrams_plotly(trigram_freq)

    # Distribution of description sentiment, as percentages per bin.
    filtered_df['sentiment'] = filtered_df['desc'].apply(get_sentiment)
    sentiment_fig = px.histogram(
        filtered_df, x="sentiment", nbins=10, histnorm='percent', range_x=[-1, 1],
        template='ggplot2', text_auto='.2f', labels={'sentiment': ''}
    )
    sentiment_fig.update_layout(
        paper_bgcolor='rgb(240, 240, 240)',
        plot_bgcolor='rgb(240, 240, 240)',
    )
    sentiment_fig.update_yaxes(visible=False)

    # Pick up to five distinct authors at random. Sampling from the unique
    # names avoids drawing the same author twice, and capping the sample size
    # avoids the ValueError raised when fewer than five are available.
    authors = filtered_df['author'].drop_duplicates()
    author_to_watch = authors.sample(min(5, len(authors))).tolist()

    # 'sentiment' was already computed on filtered_df, so the selected rows
    # carry it along and no second scoring pass is needed.
    author_df = filtered_df[filtered_df['author'].isin(author_to_watch)]
    sentiment_author_fig = px.box(
        author_df, x="sentiment", y='author',
        range_x=[-1, 1],
        color_discrete_sequence=px.colors.sequential.Bluered_r,
        template='ggplot2', labels={'sentiment': '', 'author': ''}
    )
    sentiment_author_fig.update_layout(
        paper_bgcolor='rgb(240, 240, 240)',
        plot_bgcolor='rgb(240, 240, 240)',
    )

    return bigrams_fig, trigrams_fig, sentiment_fig, sentiment_author_fig
Py.Cafe

ny_times_best_sellers