santarabantoosoo commited on
Commit
102b824
1 Parent(s): 571d313

added word frequency

Browse files
Files changed (2) hide show
  1. app.py +171 -22
  2. requirements.txt +3 -0
app.py CHANGED
@@ -6,42 +6,166 @@ import plotly.express as px
6
  from stop_words import get_stop_words
7
  from wordcloud import WordCloud
8
  from datasets import load_dataset
9
-
10
 
11
  ## import data
12
 
13
  dataset = load_dataset("Santarabantoosoo/italian_long_covid_tweets")
14
  data = pd.DataFrame.from_dict(dataset["train"])
15
 
16
- # formulate a wordcloud for each emotion
17
 
18
- stop = get_stop_words('italian')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  # Wordcloud with anger tweets
21
  angry_tweets = data['tweet'][data["emotion"] == 'anger']
22
- stop_words = ["https", "co", "RT"] + list(stop)
 
23
  anger_wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords = stop_words).generate(str(angry_tweets))
24
 
 
25
  # Wordcloud with sad tweets
26
  sad_tweets = data['tweet'][data["emotion"] == 'sadness']
27
- stop_words = ["https", "co", "RT"] + list(stop)
 
28
  sad_wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords = stop_words).generate(str(sad_tweets))
29
 
 
30
  # Wordcloud with joy tweets
31
  joy_tweets = data['tweet'][data["emotion"] == 'joy']
32
- stop_words = ["https", "co", "RT"] + list(stop)
 
33
  joy_wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords = stop_words).generate(str(joy_tweets))
34
 
35
 
36
  # Wordcloud with fear tweets
37
  fear_tweets = data['tweet'][data["emotion"] == 'fear']
38
- stop_words = ["https", "co", "RT"] + list(stop)
 
39
  fear_wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords = stop_words).generate(str(fear_tweets))
40
 
41
- # combine wordclouds in a single matplotlib figure
42
 
43
  wc_fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2,2)
44
 
 
 
45
  wc_fig.tight_layout()
46
 
47
  ax1.imshow(sad_wordcloud, interpolation="bilinear")
@@ -65,6 +189,7 @@ ax3.axis("off")
65
  ax3.set_title('Fear', {'fontsize': 30})
66
 
67
 
 
68
  ax4.imshow(anger_wordcloud, interpolation="bilinear")
69
 
70
  ax4.axis("off")
@@ -72,8 +197,6 @@ ax4.axis("off")
72
  ax4.set_title('Anger', {'fontsize': 30})
73
 
74
 
75
- plt.show()
76
-
77
  # plot a pie plot for emotions' distribution
78
 
79
  number_tweets_per_day = data.groupby(['date', 'emotion']).agg({'id': 'count'}).reset_index()
@@ -91,7 +214,6 @@ sent_fig = px.pie(sentiment_counts, values='count', names='emotion', title='Twee
91
  color_discrete_sequence=px.colors.qualitative.G10)
92
  sent_fig
93
 
94
-
95
  def display_plot(image_choice):
96
 
97
  if image_choice == 'Sentiment distribution':
@@ -103,22 +225,49 @@ def display_plot(image_choice):
103
  elif image_choice == 'Word clouds':
104
  return wc_fig
105
 
 
 
 
106
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  with gr.Blocks() as demo:
108
  gr.Markdown("## Choose your adventure")
 
109
  with gr.Tabs():
110
- with gr.TabItem("Sentiment analysis"):
111
- text_input = [gr.Radio(choices = ['Sentiment distribution', 'Word clouds', 'Time series'], label = 'Choose ur plot')]
112
- plot_output = gr.Plot()
113
- text_button = gr.Button("Submit")
114
-
115
- text_button.click(display_plot, inputs=text_input, outputs=plot_output)
116
-
117
- with gr.TabItem("Word frequency"):
118
- gr.Markdown("Nothing here yet")
119
-
120
  with gr.TabItem("Topic modeling"):
121
  gr.Markdown("Nothing here yet")
 
 
 
 
 
 
 
 
 
 
122
 
 
 
 
 
 
 
123
 
124
- demo.launch();
 
 
 
 
 
6
  from stop_words import get_stop_words
7
  from wordcloud import WordCloud
8
  from datasets import load_dataset
9
+ import re
10
 
11
  ## import data
12
 
13
  dataset = load_dataset("Santarabantoosoo/italian_long_covid_tweets")
14
  data = pd.DataFrame.from_dict(dataset["train"])
15
 
 
16
 
17
# Load the Italian stop-word dataset from the Hugging Face Hub.
it_stop_words = load_dataset("Santarabantoosoo/italian-stopwords")

# Keep only the "text" column of the train split, as a plain Python list.
it_stop = pd.DataFrame.from_dict(it_stop_words["train"])["text"].tolist()
23
+
24
+ ## Optimize stop words according to Luca's repo
25
+
26
def format_input(user_key, stopwords):
    """Normalise a piece of text so it can be matched against the frequency tables.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is replaced by a space, and tokens found in
    ``stopwords`` are removed.  The surviving tokens are joined with single
    spaces.

    Args:
        user_key: the string to clean (a user keyword or a tweet).
        stopwords: iterable of tokens to drop; compared against the
            already lower-cased tokens.

    Returns:
        The cleaned string; empty when nothing survives the cleaning.
    """
    # Build the lookup set once per call so each token test is O(1) instead
    # of a linear scan through the whole stop-word list.
    if isinstance(stopwords, (set, frozenset)):
        stopword_set = stopwords
    else:
        stopword_set = set(stopwords)

    key = re.sub(r'[^\w\s]', ' ', user_key.lower())

    return ' '.join(token for token in key.split() if token not in stopword_set)
44
+
45
+
46
### Loading TFIDF and whole-text frequency tables

def _load_train_frame(repo_id):
    """Download the ``train`` split of *repo_id* and return it as a DataFrame."""
    return pd.DataFrame.from_dict(load_dataset(repo_id)["train"])


TFIDF_21_Jul_Oct = _load_train_frame("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_21_Jul_Oct")
TFIDF_21_Nov_22_Jan = _load_train_frame("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_21_Nov_22_Jan")
TFIDF_22_Feb_Apr = _load_train_frame("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_22_Feb_Apr")
TFIDF_22_May_Jul = _load_train_frame("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_22_May_Jul")

whole_text_21_Jul_Oct = _load_train_frame("Santarabantoosoo/whole_text_TF_21_Jul_Oct")
whole_text_21_Nov_22_Jan = _load_train_frame("Santarabantoosoo/whole_text_TF_21_Nov_22_Jan")
whole_text_22_Feb_Apr = _load_train_frame("Santarabantoosoo/whole_text_TF_22_Feb_Apr")
whole_text_22_May_Jul = _load_train_frame("Santarabantoosoo/whole_text_TF_22_May_Jul")

# One Series per quarter: row 0 of each frame, indexed by the frame's column
# names.  The lists MUST be in chronological order because the time-series
# plot labels the entries Q1..Q4 purely by their position in the list.
ser_TFIDF = [
    TFIDF_21_Jul_Oct.transpose()[0],
    TFIDF_21_Nov_22_Jan.transpose()[0],
    TFIDF_22_Feb_Apr.transpose()[0],
    TFIDF_22_May_Jul.transpose()[0],
]

ser_whole_text = [
    whole_text_21_Jul_Oct.transpose()[0],
    whole_text_21_Nov_22_Jan.transpose()[0],
    whole_text_22_Feb_Apr.transpose()[0],
    whole_text_22_May_Jul.transpose()[0],
]
96
+
97
+
98
def plot_time_series(choice, keyword, user_keys, user_choice=None):
    """Draw one line per keyword showing its frequency in each period.

    Args:
        choice: sequence of per-period lookup tables, one per quarter, each
            indexable by keyword (the module builds them as pandas Series).
        keyword: cleaned keywords used for the lookups.
        user_keys: the raw strings the user typed, parallel to ``keyword``;
            used (lower-cased) as legend labels.
        user_choice: optional name of the frequency source, shown in the
            y-axis label.  The original body read an undefined
            ``user_choice`` name here and raised ``NameError`` on every call.

    Returns:
        The matplotlib figure containing the plot.
    """
    n_periods = len(choice)

    # Same positions as the original hard-coded arange(2, 10, 2) when there
    # are four periods, but follows the actual number of tables passed in.
    x = np.arange(2, 2 * n_periods + 2, 2)
    x_ticks_labels = [f'Q{period}' for period in range(1, n_periods + 1)]

    fig, ax = plt.subplots(1, 1)

    for word, label in zip(keyword, user_keys):
        freqs = []
        for table in choice:
            try:
                freqs.append(table[word])
            except (KeyError, IndexError):
                # Keyword absent from this period: plot it as zero.
                freqs.append(0.0)
        ax.plot(x, np.array(freqs), label=label.lower())

    # Set number of ticks for x-axis
    ax.set_xticks(x)
    ax.set_xticklabels(x_ticks_labels, fontsize=12)

    # Use the Axes API rather than pyplot's implicit "current axes" so the
    # labels always land on the figure created above.
    ax.legend(loc='best')
    ax.set_xlabel('Time')
    ax.set_title("keywords quartely analysis (July 2021 - July 2022)")
    ax.set_ylabel(f'Freq. from {user_choice}' if user_choice else 'Freq.')
    return fig
134
+
135
 
136
# Tokens excluded from every cloud: URL / retweet debris plus the Italian
# stop words.  Built once; it is identical for all four emotions.
stop_words = ["https", 'http', "co", "RT"] + list(it_stop)


def _emotion_wordcloud(tweets):
    """Return a WordCloud generated from all the strings in *tweets*."""
    # Join the tweet texts themselves.  str() of a pandas Series is only its
    # abbreviated display repr (index labels, elided rows, dtype footer), so
    # the cloud would be built from a handful of tweets plus repr noise.
    text = " ".join(tweets)
    return WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords=stop_words).generate(text)


# Wordcloud with anger tweets
angry_tweets = data['tweet'][data["emotion"] == 'anger']
angry_tweets = angry_tweets.apply(format_input, args=[it_stop])
anger_wordcloud = _emotion_wordcloud(angry_tweets)

# Wordcloud with sad tweets
sad_tweets = data['tweet'][data["emotion"] == 'sadness']
sad_tweets = sad_tweets.apply(format_input, args=[it_stop])
sad_wordcloud = _emotion_wordcloud(sad_tweets)

# Wordcloud with joy tweets
joy_tweets = data['tweet'][data["emotion"] == 'joy']
joy_tweets = joy_tweets.apply(format_input, args=[it_stop])
joy_wordcloud = _emotion_wordcloud(joy_tweets)

# Wordcloud with fear tweets
fear_tweets = data['tweet'][data["emotion"] == 'fear']
fear_tweets = fear_tweets.apply(format_input, args=[it_stop])
fear_wordcloud = _emotion_wordcloud(fear_tweets)
162
 
163
+ ## Combine all word clouds in a single figure
164
 
165
  wc_fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2,2)
166
 
167
+ # fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
168
+
169
  wc_fig.tight_layout()
170
 
171
  ax1.imshow(sad_wordcloud, interpolation="bilinear")
 
189
  ax3.set_title('Fear', {'fontsize': 30})
190
 
191
 
192
+
193
  ax4.imshow(anger_wordcloud, interpolation="bilinear")
194
 
195
  ax4.axis("off")
 
197
  ax4.set_title('Anger', {'fontsize': 30})
198
 
199
 
 
 
200
  # plot a pie plot for emotions' distribution
201
 
202
  number_tweets_per_day = data.groupby(['date', 'emotion']).agg({'id': 'count'}).reset_index()
 
214
  color_discrete_sequence=px.colors.qualitative.G10)
215
  sent_fig
216
 
 
217
  def display_plot(image_choice):
218
 
219
  if image_choice == 'Sentiment distribution':
 
225
  elif image_choice == 'Word clouds':
226
  return wc_fig
227
 
228
def display_freq_plot(choice, *args):
    """Plot the quarterly frequency of the words typed by the user.

    Args:
        choice: name of the frequency source, "TFIDF" or "Whole_text".
        *args: the raw contents of the word textboxes.

    Returns:
        The figure produced by ``plot_time_series``, or ``None`` when
        ``choice`` is neither of the two known sources.
    """
    if choice == "TFIDF":
        tables = ser_TFIDF
    elif choice == "Whole_text":
        tables = ser_whole_text
    else:
        return None

    # Clean input strings to match keywords in the database.  Entries that
    # are blank (or become blank once stop words are stripped) are skipped so
    # they do not show up as flat zero lines; the raw and cleaned lists are
    # filled together so they stay index-aligned.
    user_keys = []
    keyword = []
    for raw_key in args:
        if not raw_key:
            continue
        cleaned = format_input(raw_key, it_stop)
        if cleaned:
            user_keys.append(raw_key)
            keyword.append(cleaned)

    return plot_time_series(tables, keyword, user_keys)
242
+
243
+
244
# Gradio layout: one tab per analysis.  Components are rendered in the order
# they are created, and the click handlers are registered inside the Blocks
# context.
with gr.Blocks() as demo:
    gr.Markdown("## Choose your adventure")

    with gr.Tabs():

        with gr.TabItem("Topic modeling"):
            gr.Markdown("Nothing here yet")

        with gr.TabItem("Word frequency"):

            # First input is the frequency source; the remaining four are
            # forwarded to display_freq_plot as *args.
            inputs = [gr.Radio(choices=['TFIDF', 'Whole_text'], label='Choose ur method'),
                      gr.Textbox(label='word 1'),
                      gr.Textbox(label='word 2'),
                      gr.Textbox(label='word 3'),
                      gr.Textbox(label='word 4')]
            # elem_id is documented as a string; same id value as before.
            plot_output = gr.Plot(elem_id="1")
            freq_button = gr.Button("Submit")

            freq_button.click(display_freq_plot, inputs=inputs, outputs=plot_output)

        with gr.TabItem("Sentiment analysis"):
            text_input = gr.Radio(choices=['Sentiment distribution', 'Word clouds', 'Time series'], label='Choose ur plot')
            sent_plot = gr.Plot(label='jhg')
            sent_button = gr.Button("Submit")

            sent_button.click(display_plot, inputs=text_input, outputs=sent_plot)


demo.launch()
273
+
requirements.txt CHANGED
@@ -4,3 +4,6 @@ matplotlib
4
  plotly
5
  stop_words
6
  wordcloud
 
 
 
 
4
  plotly
5
  stop_words
6
  wordcloud
7
+ datasets
8
+ re
9
+