import dash
from dash import dcc, html
import dash_bootstrap_components as dbc
from dash.dependencies import Input, Output
import pandas as pd
import plotly.express as px
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Fetch the NLTK resources used below: tokenizer models, the stop-word list,
# WordNet (for the lemmatizer) and the VADER lexicon (for sentiment scoring).
nltk.download('punkt')
# Recent NLTK releases load the 'punkt_tab' resource for word_tokenize instead
# of 'punkt'; requesting both keeps the script working on old and new versions.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')

# Prepare the data
# Read the second sheet of the workbook and reverse the row order.
df = pd.read_excel("NYT Fiction Bestsellers.xlsx", sheet_name=1)[::-1]
# Keep one row per title (the first one in the reversed order). The explicit
# copy makes df_subset an independent frame, so the column assignments below
# do not raise SettingWithCopyWarning or risk writing into a temporary view.
df_subset = df.drop_duplicates(subset=['title']).copy()
df_subset['title'] = df_subset['title'].astype(str)
df_subset['desc'] = df_subset['desc'].astype(str)
# Requires bestsellers_date to be a datetime column (the .dt accessor).
df_subset['year'] = df_subset.bestsellers_date.dt.year

# Shared text-processing helpers used by generate_ngrams.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english')) | set(string.punctuation)
def generate_ngrams(text, n):
    """Return the space-joined n-grams of size ``n`` found in ``text``.

    The text is lower-cased and tokenized; tokens that are not purely
    alphabetic or that appear in ``stop_words`` are dropped, and the
    remaining tokens are lemmatized before the n-grams are formed.

    Args:
        text: The string to analyse.
        n: Number of tokens per n-gram.

    Returns:
        A list of strings, each one n-gram with its tokens joined by a
        single space, in order of appearance.
    """
    lemmas = []
    for token in word_tokenize(text.lower()):
        # Skip punctuation/numbers and stop words before lemmatizing.
        if not token.isalpha() or token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return [' '.join(parts) for parts in ngrams(lemmas, n)]
def plot_top_ngrams_plotly(freq_counter, top_n=10):
    """Build a bar chart of the most frequent n-grams.

    Args:
        freq_counter: A ``collections.Counter`` mapping n-gram strings to
            their counts.
        top_n: How many of the most common n-grams to plot. Defaults to 10.

    Returns:
        A Plotly figure with one bar per n-gram, ordered from most to least
        frequent, with the count printed on each bar and the y axis hidden.
        An empty counter produces a chart without bars instead of raising.
    """
    most_common = freq_counter.most_common(top_n)
    if most_common:
        labels, counts = (list(column) for column in zip(*most_common))
    else:
        # zip(*[]) yields nothing, so unpacking it into two names would
        # raise ValueError; fall back to empty axes instead.
        labels, counts = [], []

    fig = px.bar(
        x=labels,
        y=counts,
        template="ggplot2",
        labels={'x': '', 'y': ''},
        text_auto=True,
    )
    fig.update_layout(
        paper_bgcolor='rgb(240, 240, 240)',
        plot_bgcolor='rgb(240, 240, 240)',
    )
    # The counts are already shown on the bars, so the axis is redundant.
    fig.update_yaxes(visible=False)
    return fig
# Shared VADER analyzer, created on first use. Constructing one loads the
# sentiment lexicon, so building it per call (once per DataFrame row when used
# with .apply) is needlessly slow.
_sentiment_analyzer = None


def get_sentiment(text):
    """Return the VADER compound sentiment score of ``text``.

    Args:
        text: The string to score.

    Returns:
        The ``'compound'`` entry of the analyzer's polarity scores.
    """
    global _sentiment_analyzer
    if _sentiment_analyzer is None:
        # Lazy creation keeps the failure point (e.g. missing lexicon) at
        # first use, exactly where it was before.
        _sentiment_analyzer = SentimentIntensityAnalyzer()
    return _sentiment_analyzer.polarity_scores(text)['compound']
# Inline styles passed to Dash components. Dash forwards these dictionaries to
# React, whose style objects use camelCased CSS property names, so every key is
# written in that form (previously hyphenated and camelCase keys were mixed).

# Style for the year selector: a bordered, shadowed horizontal flex row.
radio_style = {
    'display': 'flex',
    'flexDirection': 'row',
    'justifyContent': 'space-between',
    'padding': '5px',
    'border': '2px solid',
    'borderRadius': '5px',
    'boxShadow': '3px 3px 3px rgba(10, 10, 10, 0.3)',
    'fontFamily': 'Aharoni, sans-serif',
    'fontSize': '20px',
}

# Style shared by the section headings above each chart.
header_style = {'textAlign': 'center', 'margin': '10px', 'padding': '10px'}
# Initialize Dash app
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.JOURNAL])
app.title = "NY Times Fiction Best-Sellers"

# Years offered by the selector, in the order they first appear in df_subset.
_available_years = df_subset.year.unique().tolist()
# Pre-select 2018 when the data contains it; otherwise fall back to the last
# listed year so the initial callback run does not filter down to no rows.
if 2018 in _available_years:
    _default_year = 2018
elif _available_years:
    _default_year = _available_years[-1]
else:
    _default_year = None

# Page layout: title, subtitle, year selector, the two n-gram charts, then the
# sentiment histogram next to the per-author box plot and an explanatory note.
app.layout = dbc.Container([
    dbc.Row([
        # "title" is the CSS class for the page title.
        dbc.Col(html.H1("NYT Bestseller Sentiment & Trends", className="title"), width=12)
    ]),
    dbc.Row([
        dbc.Col(html.H5("Sentiment analysis reveals trends in NYT fiction bestsellers: titles, descriptions, sentiment, bigrams/trigrams, and author sentiment."), width=12)
    ]),
    dbc.Row([
        dbc.Col(dbc.RadioItems(
            id='radio-buttons',
            options=[{'label': str(year), 'value': year} for year in _available_years],
            value=_default_year,
            inline=True,
            style=radio_style
        ), width=12)
    ]),
    dbc.Row([
        dbc.Col([
            html.H5("Top 10 Title Bigrams", style=header_style), html.Hr(),
            dcc.Graph(id='bigrams-chart', className="dash-graph")], width=6),
        dbc.Col([
            html.H5("Top 10 Description Trigrams", style=header_style), html.Hr(),
            dcc.Graph(id='trigrams-chart', className="dash-graph")], width=6)
    ]),
    html.Hr(),
    dbc.Row([
        dbc.Col([
            html.H5("Book Description Sentiment by Year", style=header_style),
            html.Hr(),
            dcc.Graph(id='sentiment-chart')], width=5),
        dbc.Col(html.Div([
            html.H5("Comparative Sentiment Distribution of Bestselling Authors", style=header_style),
            html.Hr(),
            # Clicking this button re-runs the chart callback.
            html.Button('Update Authors', id='update-authors-button', className="update-button"),
            dcc.Graph(id='sentiment_author-boxchart')
        ]), width=7),
        dbc.Col(
            html.H5("Sentiment scores range from -1 (VERY NEGATIVE) to +1 (VERY POSITIVE). Scores close to 0 indicate neutral sentiment.", style=header_style), width=12)
    ]),
], fluid=True, style={'backgroundColor': '#f0f0f0'})
def _count_ngrams(texts, n):
    """Tally the n-grams of size ``n`` across every string in ``texts``."""
    return Counter(gram for text in texts for gram in generate_ngrams(text, n))


@app.callback(
    Output('bigrams-chart', 'figure'),
    Output('trigrams-chart', 'figure'),
    Output('sentiment-chart', 'figure'),
    Output('sentiment_author-boxchart', 'figure'),
    Input('radio-buttons', 'value'),
    Input('update-authors-button', 'n_clicks'),  # button input
)
def update_charts(year, n_clicks):
    """Rebuild all four charts for the selected year.

    Args:
        year: Year chosen in the radio selector; rows of ``df_subset`` whose
            ``year`` equals it are analysed.
        n_clicks: Click count of the "Update Authors" button. The value is
            not used; listing it as an Input makes every click re-run the
            callback, which draws a fresh random set of authors.

    Returns:
        A 4-tuple of figures: title bigram bar chart, description trigram
        bar chart, description sentiment histogram and per-author sentiment
        box plot. When no rows match ``year`` every output is
        ``dash.no_update`` so the current charts stay in place.
    """
    # Copy so the 'sentiment' column is added to an independent frame rather
    # than to a slice of df_subset (avoids SettingWithCopyWarning).
    filtered_df = df_subset[df_subset.year == year].copy()
    if filtered_df.empty:
        # Nothing to plot (e.g. a year absent from the data).
        return dash.no_update, dash.no_update, dash.no_update, dash.no_update

    bigrams_fig = plot_top_ngrams_plotly(_count_ngrams(filtered_df['title'], 2))
    trigrams_fig = plot_top_ngrams_plotly(_count_ngrams(filtered_df['desc'], 3))

    filtered_df['sentiment'] = filtered_df['desc'].apply(get_sentiment)
    sentiment_fig = px.histogram(
        filtered_df, x="sentiment", nbins=10, histnorm='percent', range_x=[-1, 1],
        template='ggplot2', text_auto='.2f', labels={'sentiment': ''}
    )
    sentiment_fig.update_layout(
        paper_bgcolor='rgb(240, 240, 240)', plot_bgcolor='rgb(240, 240, 240)'
    )
    sentiment_fig.update_yaxes(visible=False)

    # Pick up to five distinct authors at random. Sampling from the
    # de-duplicated names avoids drawing the same author twice, and capping
    # the sample size avoids the ValueError raised when fewer than five
    # authors are available.
    authors = filtered_df['author'].drop_duplicates()
    author_to_watch = authors.sample(min(5, len(authors))).tolist()
    # The 'sentiment' column computed above carries over to this subset, so
    # the descriptions do not need to be scored a second time.
    author_df = filtered_df[filtered_df['author'].isin(author_to_watch)]
    author_sentiment_fig = px.box(
        author_df, x="sentiment", y='author',
        range_x=[-1, 1],
        color_discrete_sequence=px.colors.sequential.Bluered_r,
        template='ggplot2', labels={'sentiment': '', 'author': ''}
    )
    author_sentiment_fig.update_layout(
        paper_bgcolor='rgb(240, 240, 240)', plot_bgcolor='rgb(240, 240, 240)'
    )
    return bigrams_fig, trigrams_fig, sentiment_fig, author_sentiment_fig