Spaces:
Runtime error
Runtime error
santarabantoosoo
committed on
Commit
•
102b824
1
Parent(s):
571d313
added word frequency
Browse files- app.py +171 -22
- requirements.txt +3 -0
app.py
CHANGED
@@ -6,42 +6,166 @@ import plotly.express as px
|
|
6 |
from stop_words import get_stop_words
|
7 |
from wordcloud import WordCloud
|
8 |
from datasets import load_dataset
|
9 |
-
|
10 |
|
11 |
## import data
|
12 |
|
13 |
dataset = load_dataset("Santarabantoosoo/italian_long_covid_tweets")
|
14 |
data = pd.DataFrame.from_dict(dataset["train"])
|
15 |
|
16 |
-
# formulate a wordcloud for each emotion
|
17 |
|
18 |
-
stop
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
# Wordcloud with anger tweets
|
21 |
angry_tweets = data['tweet'][data["emotion"] == 'anger']
|
22 |
-
|
|
|
23 |
anger_wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords = stop_words).generate(str(angry_tweets))
|
24 |
|
|
|
25 |
# Wordcloud with sad tweets
|
26 |
sad_tweets = data['tweet'][data["emotion"] == 'sadness']
|
27 |
-
|
|
|
28 |
sad_wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords = stop_words).generate(str(sad_tweets))
|
29 |
|
|
|
30 |
# Wordcloud with joy tweets
|
31 |
joy_tweets = data['tweet'][data["emotion"] == 'joy']
|
32 |
-
|
|
|
33 |
joy_wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords = stop_words).generate(str(joy_tweets))
|
34 |
|
35 |
|
36 |
# Wordcloud with fear tweets
|
37 |
fear_tweets = data['tweet'][data["emotion"] == 'fear']
|
38 |
-
|
|
|
39 |
fear_wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords = stop_words).generate(str(fear_tweets))
|
40 |
|
41 |
-
|
42 |
|
43 |
wc_fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2,2)
|
44 |
|
|
|
|
|
45 |
wc_fig.tight_layout()
|
46 |
|
47 |
ax1.imshow(sad_wordcloud, interpolation="bilinear")
|
@@ -65,6 +189,7 @@ ax3.axis("off")
|
|
65 |
ax3.set_title('Fear', {'fontsize': 30})
|
66 |
|
67 |
|
|
|
68 |
ax4.imshow(anger_wordcloud, interpolation="bilinear")
|
69 |
|
70 |
ax4.axis("off")
|
@@ -72,8 +197,6 @@ ax4.axis("off")
|
|
72 |
ax4.set_title('Anger', {'fontsize': 30})
|
73 |
|
74 |
|
75 |
-
plt.show()
|
76 |
-
|
77 |
# plot a pie plot for emotions' distribution
|
78 |
|
79 |
number_tweets_per_day = data.groupby(['date', 'emotion']).agg({'id': 'count'}).reset_index()
|
@@ -91,7 +214,6 @@ sent_fig = px.pie(sentiment_counts, values='count', names='emotion', title='Twee
|
|
91 |
color_discrete_sequence=px.colors.qualitative.G10)
|
92 |
sent_fig
|
93 |
|
94 |
-
|
95 |
def display_plot(image_choice):
|
96 |
|
97 |
if image_choice == 'Sentiment distribution':
|
@@ -103,22 +225,49 @@ def display_plot(image_choice):
|
|
103 |
elif image_choice == 'Word clouds':
|
104 |
return wc_fig
|
105 |
|
|
|
|
|
|
|
106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
with gr.Blocks() as demo:
|
108 |
gr.Markdown("## Choose your adventure")
|
|
|
109 |
with gr.Tabs():
|
110 |
-
|
111 |
-
text_input = [gr.Radio(choices = ['Sentiment distribution', 'Word clouds', 'Time series'], label = 'Choose ur plot')]
|
112 |
-
plot_output = gr.Plot()
|
113 |
-
text_button = gr.Button("Submit")
|
114 |
-
|
115 |
-
text_button.click(display_plot, inputs=text_input, outputs=plot_output)
|
116 |
-
|
117 |
-
with gr.TabItem("Word frequency"):
|
118 |
-
gr.Markdown("Nothing here yet")
|
119 |
-
|
120 |
with gr.TabItem("Topic modeling"):
|
121 |
gr.Markdown("Nothing here yet")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
122 |
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
|
124 |
-
|
|
|
|
|
|
|
|
|
|
6 |
from stop_words import get_stop_words
|
7 |
from wordcloud import WordCloud
|
8 |
from datasets import load_dataset
|
9 |
+
import re
|
10 |
|
11 |
## import data
|
12 |
|
13 |
dataset = load_dataset("Santarabantoosoo/italian_long_covid_tweets")
|
14 |
data = pd.DataFrame.from_dict(dataset["train"])
|
15 |
|
|
|
16 |
|
17 |
+
# load stop words
|
18 |
+
|
19 |
+
it_stop_words = load_dataset("Santarabantoosoo/italian-stopwords")
|
20 |
+
it_stop = pd.DataFrame.from_dict(it_stop_words["train"])
|
21 |
+
|
22 |
+
it_stop = it_stop.text.to_list()
|
23 |
+
|
24 |
+
## Optimize stop words according to Luca's repo
|
25 |
+
|
26 |
+
def format_input(user_key, stopwords):
    '''
    Format a user input request to look it up in the database of frequencies.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is replaced by a space, and tokens found in
    ``stopwords`` are dropped. The surviving tokens are joined with single
    spaces.

    input:
        user_key is a string
        stopwords is a collection of strings (list, set, ...)
    output:
        key is a string (empty when nothing survives the filtering)
    '''
    # Build the lookup set once so each token test is O(1) instead of a scan
    # over the whole stop-word list; reuse the caller's set when given one.
    if isinstance(stopwords, (set, frozenset)):
        blocked = stopwords
    else:
        blocked = set(stopwords)

    key = re.sub(r'[^\w\s]', ' ', user_key.lower())

    return ' '.join(token for token in key.split() if token not in blocked)
|
44 |
+
|
45 |
+
|
46 |
+
### Loading TFIDF and whole-text frequency tables

def _load_train_frame(repo_id):
    """Load dataset ``repo_id`` and return its "train" split as a DataFrame."""
    return pd.DataFrame.from_dict(load_dataset(repo_id)["train"])


TFIDF_21_Jul_Oct = _load_train_frame("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_21_Jul_Oct")
TFIDF_21_Nov_22_Jan = _load_train_frame("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_21_Nov_22_Jan")
TFIDF_22_Feb_Apr = _load_train_frame("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_22_Feb_Apr")
TFIDF_22_May_Jul = _load_train_frame("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_22_May_Jul")

## Loading whole_text

whole_text_21_Jul_Oct = _load_train_frame("Santarabantoosoo/whole_text_TF_21_Jul_Oct")
whole_text_21_Nov_22_Jan = _load_train_frame("Santarabantoosoo/whole_text_TF_21_Nov_22_Jan")
whole_text_22_Feb_Apr = _load_train_frame("Santarabantoosoo/whole_text_TF_22_Feb_Apr")
whole_text_22_May_Jul = _load_train_frame("Santarabantoosoo/whole_text_TF_22_May_Jul")

# One Series per period, taken as column 0 of the transposed frame.
# NOTE(review): presumably each frame holds a single row whose columns are
# the words, so the Series is indexed by word -- confirm against the datasets.
#
# The lists are in CHRONOLOGICAL order (Jul-Oct 21, Nov 21-Jan 22,
# Feb-Apr 22, May-Jul 22) because plot_time_series labels the entries
# Q1..Q4 by position; the previous order put Nov 21-Jan 22 last.
ser_TFIDF = [
    TFIDF_21_Jul_Oct.transpose()[0],
    TFIDF_21_Nov_22_Jan.transpose()[0],
    TFIDF_22_Feb_Apr.transpose()[0],
    TFIDF_22_May_Jul.transpose()[0],
]

ser_whole_text = [
    whole_text_21_Jul_Oct.transpose()[0],
    whole_text_21_Nov_22_Jan.transpose()[0],
    whole_text_22_Feb_Apr.transpose()[0],
    whole_text_22_May_Jul.transpose()[0],
]
|
96 |
+
|
97 |
+
|
98 |
+
def plot_time_series(choice, keyword, user_keys, user_choice=None):
    '''
    Plot the per-period frequency of each keyword as one line per keyword.

    input:
        choice is a sequence with one entry per period; each entry is
            looked up as ``entry[key]`` (the callers in this file pass
            lists of pandas Series)
        keyword is a list of cleaned keys used for the lookup
        user_keys is a list of the raw user strings, parallel to keyword,
            used (lower-cased) as legend labels
        user_choice is an optional name of the frequency source, shown in
            the y-axis label
    output:
        fig is the matplotlib figure holding the plot
    '''
    # One x position per period, spaced 2 apart (2, 4, 6, 8 for 4 periods).
    x = np.arange(2, 2 * len(choice) + 2, 2)

    y = []
    for key in keyword:
        values = []
        for period in choice:
            try:
                values.append(period[key])
            except (KeyError, IndexError, TypeError):
                # A keyword absent from a period counts as zero frequency.
                values.append(0.0)
        y.append(np.array(values))

    x_ticks_labels = [f'Q{i}' for i in range(1, len(choice) + 1)]

    fig, ax = plt.subplots(1, 1)

    for values, label in zip(y, user_keys):
        ax.plot(x, values, label=label.lower())

    # Set number of ticks for x-axis
    ax.set_xticks(x)
    ax.set_xticklabels(x_ticks_labels, fontsize=12)

    # Use the Axes methods rather than the pyplot "current axes" state so the
    # labels always land on the figure created above.
    ax.legend(loc='best')
    ax.set_xlabel('Time')
    ax.set_title("keywords quartely analysis (July 2021 - July 2022)")
    # The original referenced an undefined ``user_choice`` name here; it is
    # now an optional argument with a neutral fallback label.
    ax.set_ylabel(f'Freq. from {user_choice}' if user_choice else 'Freq.')
    return fig
|
134 |
+
|
135 |
|
136 |
# Word clouds, one per emotion.
#
# Each cloud is generated from the cleaned tweets joined into a single text.
# The previous code passed str(series), which is the truncated pandas repr
# (index numbers, "..." and the "Name: tweet, dtype: object" footer) rather
# than the tweets themselves.
# NOTE(review): WordCloud.generate is expected to fail on text without words,
# so an emotion with no tweets would now raise -- confirm every emotion is
# present in the dataset.

# URL / retweet residue plus the Italian stop words; built once and shared.
stop_words = ["https", 'http', "co", "RT"] + list(it_stop)

# Wordcloud with anger tweets
angry_tweets = data['tweet'][data["emotion"] == 'anger']
angry_tweets = angry_tweets.apply(format_input, args=(it_stop,))
anger_wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords=stop_words).generate(" ".join(angry_tweets))

# Wordcloud with sad tweets
sad_tweets = data['tweet'][data["emotion"] == 'sadness']
sad_tweets = sad_tweets.apply(format_input, args=(it_stop,))
sad_wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords=stop_words).generate(" ".join(sad_tweets))

# Wordcloud with joy tweets
joy_tweets = data['tweet'][data["emotion"] == 'joy']
joy_tweets = joy_tweets.apply(format_input, args=(it_stop,))
joy_wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords=stop_words).generate(" ".join(joy_tweets))

# Wordcloud with fear tweets
fear_tweets = data['tweet'][data["emotion"] == 'fear']
fear_tweets = fear_tweets.apply(format_input, args=(it_stop,))
fear_wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords=stop_words).generate(" ".join(fear_tweets))
|
162 |
|
163 |
+
## Combine all plots in a single plot
|
164 |
|
165 |
wc_fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2,2)
|
166 |
|
167 |
+
# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
|
168 |
+
|
169 |
wc_fig.tight_layout()
|
170 |
|
171 |
ax1.imshow(sad_wordcloud, interpolation="bilinear")
|
|
|
189 |
ax3.set_title('Fear', {'fontsize': 30})
|
190 |
|
191 |
|
192 |
+
|
193 |
ax4.imshow(anger_wordcloud, interpolation="bilinear")
|
194 |
|
195 |
ax4.axis("off")
|
|
|
197 |
ax4.set_title('Anger', {'fontsize': 30})
|
198 |
|
199 |
|
|
|
|
|
200 |
# plot a pie plot for emotions' distribution
|
201 |
|
202 |
number_tweets_per_day = data.groupby(['date', 'emotion']).agg({'id': 'count'}).reset_index()
|
|
|
214 |
color_discrete_sequence=px.colors.qualitative.G10)
|
215 |
sent_fig
|
216 |
|
|
|
217 |
def display_plot(image_choice):
|
218 |
|
219 |
if image_choice == 'Sentiment distribution':
|
|
|
225 |
elif image_choice == 'Word clouds':
|
226 |
return wc_fig
|
227 |
|
228 |
+
def display_freq_plot(choice, *args):
    '''
    Build the word-frequency plot for the selected method.

    input:
        choice is the method name, "TFIDF" or "Whole_text"
        *args are the raw words typed by the user
    output:
        the figure returned by plot_time_series, or None when choice is
        neither of the two known method names
    '''
    user_keys = list(args)

    # clean input strings to match keywords in the database
    keyword = [format_input(raw, it_stop) for raw in user_keys]

    if choice == "TFIDF":
        return plot_time_series(ser_TFIDF, keyword, user_keys)

    if choice == "Whole_text":
        return plot_time_series(ser_whole_text, keyword, user_keys)

    return None
|
242 |
+
|
243 |
+
|
244 |
# Gradio UI: three tabs; the two plotting tabs each wire a Submit button to
# the function that builds the figure for the current selection.
# NOTE(review): nesting below is reconstructed from the `with` statements --
# confirm it matches the intended layout.
with gr.Blocks() as demo:
    gr.Markdown("## Choose your adventure")

    with gr.Tabs():

        with gr.TabItem("Topic modeling"):
            gr.Markdown("Nothing here yet")

        with gr.TabItem("Word frequency"):

            # Order matters: the radio value becomes `choice`, the four
            # textboxes become `*args` of display_freq_plot.
            inputs = [
                gr.Radio(choices=['TFIDF', 'Whole_text'], label='Choose ur method'),
                gr.Textbox(label='word 1'),
                gr.Textbox(label='word 2'),
                gr.Textbox(label='word 3'),
                gr.Textbox(label='word 4'),
            ]
            # elem_id is passed as a string (was the int 1).
            plot_output = gr.Plot(elem_id="1")
            freq_button = gr.Button("Submit")

            freq_button.click(display_freq_plot, inputs=inputs, outputs=plot_output)

        with gr.TabItem("Sentiment analysis"):
            text_input = gr.Radio(choices=['Sentiment distribution', 'Word clouds', 'Time series'], label='Choose ur plot')
            sent_plot = gr.Plot(label='jhg')
            sent_button = gr.Button("Submit")

            sent_button.click(display_plot, inputs=text_input, outputs=sent_plot)


demo.launch()
|
273 |
+
|
requirements.txt
CHANGED
@@ -4,3 +4,6 @@ matplotlib
|
|
4 |
plotly
|
5 |
stop_words
|
6 |
wordcloud
|
|
|
|
|
|
|
|
4 |
plotly
|
5 |
stop_words
|
6 |
wordcloud
|
7 |
+
datasets
|
8 |
+
# re is part of the Python standard library and must not be listed as a pip requirement
|
9 |
+
|