jsulz HF staff committed on
Commit
256f7c8
·
1 Parent(s): f239c56

switching to plotly graphs

Browse files
Files changed (2) hide show
  1. .gitignore +2 -0
  2. app.py +106 -83
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # ignore pycache
2
+ __pycache__/
app.py CHANGED
@@ -4,6 +4,8 @@ from nltk.util import ngrams
4
  from collections import Counter
5
  import pandas as pd
6
  import plotly.express as px
 
 
7
  import matplotlib.pyplot as plt
8
 
9
  # Load the dataset and convert it to a Pandas dataframe
@@ -28,7 +30,7 @@ df["ari"] = df["no-contractions"].apply(
28
  + (0.5 * (len(x.split()) / len(x.split("."))))
29
  - 21.43
30
  )
31
-
32
  written = df[df["categories"] == "Written"]
33
  spoken = df[df["categories"] == "Spoken"]
34
 
@@ -39,115 +41,136 @@ with gr.Blocks() as demo:
39
  # A Dashboard to Analyze the State of the Union Addresses
40
  """
41
  )
42
- gr.BarPlot(
43
  df,
44
  x="date",
45
  y="word_count",
46
- title="Total Number of Words in the Speeches",
47
- color="categories",
 
 
 
 
48
  )
 
49
  # group by president and category and calculate the average word count sort by date
50
  avg_word_count = (
51
- df.groupby(["date", "potus", "categories"])["word_count"].mean().reset_index()
52
  )
53
- # create a bar chart
54
- gr.BarPlot(
55
  avg_word_count,
56
  x="potus",
57
  y="word_count",
58
- title="Average Number of Words in the Speeches",
59
  color="categories",
60
- x_label_angle=-45,
61
- height=400,
62
- min_width=160,
63
- fill_height=True,
64
- container=True,
65
- scale=2,
66
  )
 
 
 
 
 
 
 
 
 
 
 
 
67
  with gr.Row():
68
  ari = df[["potus", "date", "ari", "categories"]]
69
- gr.LinePlot(
70
  ari,
71
  x="date",
72
  y="ari",
73
- title="Automated Readability Index",
 
 
 
 
 
74
  )
 
75
  # get all unique president names
76
  presidents = df["potus"].unique()
77
  # convert presidents to a list
78
  presidents = presidents.tolist()
79
  # create a dropdown to select a president
80
- president = gr.Dropdown(label="Select a President", choices=["All"] + presidents)
81
  grams = gr.Slider(minimum=1, maximum=4, step=1, label="N-grams", interactive=True)
82
- with gr.Row():
83
- # if president is not of type string
84
- @gr.render(inputs=president)
85
- def show_text(potus):
86
- if potus != "All" and potus is not None:
87
- ari = df[df["potus"] == potus][
88
- ["date", "categories", "word_count", "ari"]
89
- ]
90
- gr.DataFrame(ari, height=200)
91
 
92
- @gr.render(inputs=president)
93
- def word_length_bar(potus):
94
- # calculate the total number of words in the speech_html column and add it to a new column
95
- # if the president is "All", show the word count for all presidents
96
- # if the president is not "All", show the word count for the selected president
97
- if potus != "All" and potus is not None:
98
- gr.LinePlot(
99
- df[df["potus"] == potus],
100
- x="date",
101
- y="word_count",
102
- title="Total Number of Words in the Speeches",
103
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- with gr.Row():
106
- with gr.Column():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
- @gr.render(inputs=[president, grams])
109
- def ngram_bar(potus, n_grams):
110
- if potus != "All" and potus is not None:
111
- if type(n_grams) is not int:
112
- n_grams = 1
113
- print(n_grams)
114
- # create a Counter object from the trigrams
115
- potus_df = df[df["potus"] == potus]
116
- # decode the tokens-nostop column from a byte array to a list of string
117
- trigrams = (
118
- potus_df["tokens-nostop"]
119
- .apply(lambda x: list(ngrams(x, n_grams)))
120
- .apply(Counter)
121
- .sum()
122
- )
123
- # get the most common trigrams
124
- common_trigrams = trigrams.most_common(20)
125
- # unzip the list of tuples and plot the trigrams and counts as a bar chart
126
- trigrams, counts = zip(*common_trigrams)
127
- # join the trigrams into a single string
128
- trigrams = [" ".join(trigram) for trigram in trigrams]
129
- # create a dataframe from the trigrams and counts
130
- trigrams_df = pd.DataFrame({"trigrams": trigrams, "counts": counts})
131
- # plot the trigrams and counts as a bar chart from matplotlib
132
- """
133
- fig, ax = plt.subplots(figsize=(12, 4))
134
- ax.barh(trigrams_df["trigrams"], trigrams_df["counts"])
135
- ax.set_title("Top 20 Trigrams")
136
- ax.set_ylabel("Count")
137
- ax.set_xlabel("Trigrams")
138
- plt.xticks(rotation=45)
139
- # make it tight layout
140
- plt.tight_layout()
141
- """
142
- fig = px.scatter(
143
- trigrams_df,
144
- x="counts",
145
- y="trigrams",
146
- title="Top 20 Trigrams",
147
- orientation="h",
148
- )
149
- print(fig)
150
- gr.Plot(value=fig, container=True, visible=True)
151
 
152
 
153
  demo.launch(share=True)
 
4
  from collections import Counter
5
  import pandas as pd
6
  import plotly.express as px
7
+ import plotly.graph_objects as go
8
+ from plotly.subplots import make_subplots
9
  import matplotlib.pyplot as plt
10
 
11
  # Load the dataset and convert it to a Pandas dataframe
 
30
  + (0.5 * (len(x.split()) / len(x.split("."))))
31
  - 21.43
32
  )
33
+ df = df.sort_values(by="date")
34
  written = df[df["categories"] == "Written"]
35
  spoken = df[df["categories"] == "Spoken"]
36
 
 
41
  # A Dashboard to Analyze the State of the Union Addresses
42
  """
43
  )
44
+ fig1 = px.line(
45
  df,
46
  x="date",
47
  y="word_count",
48
+ title="Total Number of Words in Addresses",
49
+ line_shape="spline",
50
+ )
51
+ fig1.update_layout(
52
+ xaxis=dict(title="Date of Address"),
53
+ yaxis=dict(title="Word Count"),
54
  )
55
+ gr.Plot(fig1)
56
  # group by president and category and calculate the average word count sort by date
57
  avg_word_count = (
58
+ df.groupby(["potus", "categories"])["word_count"].mean().reset_index()
59
  )
60
+ fig2 = px.bar(
 
61
  avg_word_count,
62
  x="potus",
63
  y="word_count",
64
+ title="Average Number of Words in Addresses by President",
65
  color="categories",
66
+ barmode="group",
 
 
 
 
 
67
  )
68
+ fig2.update_layout(
69
+ xaxis=dict(
70
+ title="President",
71
+ tickangle=-45, # Rotate labels 45 degrees counterclockwise
72
+ ),
73
+ yaxis=dict(
74
+ title="Average Word Count",
75
+ tickangle=0, # Default label angle (horizontal)
76
+ ),
77
+ legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
78
+ )
79
+ gr.Plot(fig2)
80
  with gr.Row():
81
  ari = df[["potus", "date", "ari", "categories"]]
82
+ fig3 = px.line(
83
  ari,
84
  x="date",
85
  y="ari",
86
+ title="Automated Readability Index in each Address",
87
+ line_shape="spline",
88
+ )
89
+ fig3.update_layout(
90
+ xaxis=dict(title="Date of Address"),
91
+ yaxis=dict(title="ARI Score"),
92
  )
93
+ gr.Plot(fig3)
94
  # get all unique president names
95
  presidents = df["potus"].unique()
96
  # convert presidents to a list
97
  presidents = presidents.tolist()
98
  # create a dropdown to select a president
99
+ president = gr.Dropdown(label="Select a President", choices=presidents)
100
  grams = gr.Slider(minimum=1, maximum=4, step=1, label="N-grams", interactive=True)
 
 
 
 
 
 
 
 
 
101
 
102
+ def plotly_bar(n_grams, potus):
103
+ if potus is not None:
104
+ # keep only the addresses delivered by the selected president
105
+ potus_df = df[df["potus"] == potus]
106
+ # build the n-grams of each address from its tokens-nostop column, count them per address, and sum the counts
107
+ trigrams = (
108
+ potus_df["tokens-nostop"]
109
+ .apply(lambda x: list(ngrams(x, n_grams)))
110
+ .apply(Counter)
111
+ .sum()
112
+ )
113
+ # get the most common trigrams
114
+ common_trigrams = trigrams.most_common(10)
115
+ # unzip the list of tuples and plot the trigrams and counts as a bar chart
116
+ trigrams, counts = zip(*common_trigrams)
117
+ # join the trigrams into a single string
118
+ trigrams = [" ".join(trigram) for trigram in trigrams]
119
+ # create a dataframe from the trigrams and counts
120
+ trigrams_df = pd.DataFrame({"trigrams": trigrams, "counts": counts})
121
+ fig4 = px.bar(
122
+ trigrams_df,
123
+ x="counts",
124
+ y="trigrams",
125
+ title=f"Top {n_grams}-grams",
126
+ orientation="h",
127
+ height=400,
128
+ )
129
+ return fig4
130
 
131
+ if president != "All" and president is not None:
132
+ gr.Plot(plotly_bar, inputs=[grams, president])
133
+
134
+ def plotly_line(president):
135
+ if president != "All" and president is not None:
136
+ potus_df = df[df["potus"] == president]
137
+ fig5 = make_subplots(specs=[[{"secondary_y": True}]])
138
+ fig5.add_trace(
139
+ go.Scatter(
140
+ x=potus_df["date"],
141
+ y=potus_df["word_count"],
142
+ name="Word Count",
143
+ ),
144
+ secondary_y=False,
145
+ )
146
+ fig5.add_trace(
147
+ go.Scatter(
148
+ x=potus_df["date"],
149
+ y=potus_df["ari"],
150
+ name="ARI",
151
+ ),
152
+ secondary_y=True,
153
+ )
154
+ # Add figure title
155
+ fig5.update_layout(title_text="Double Y Axis Example")
156
+
157
+ # Set x-axis title
158
+ fig5.update_xaxes(title_text="xaxis title")
159
+
160
+ # Set y-axes titles
161
+ fig5.update_yaxes(
162
+ title_text="<b>primary</b> yaxis title", secondary_y=False
163
+ )
164
+ fig5.update_yaxes(
165
+ title_text="<b>secondary</b> yaxis title", secondary_y=True
166
+ )
167
+ return fig5
168
 
169
+ # render the word count / ARI line chart (plotly_line) for the president chosen in the dropdown
170
+ # the dropdown choices no longer include "All", so there is no all-presidents view at this point
171
+ # NOTE(review): `president` here appears to be the Dropdown component, not its selected value, so the check below is likely always true - confirm
172
+ if president != "All" and president is not None:
173
+ gr.Plot(plotly_line, inputs=[president])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
 
175
 
176
  demo.launch(share=True)