lunadebruyne commited on
Commit
01e7b54
·
verified ·
1 Parent(s): 082233a

Delete app_dataset.py

Browse files
Files changed (1) hide show
  1. app_dataset.py +0 -381
app_dataset.py DELETED
@@ -1,381 +0,0 @@
1
- import gradio as gr
2
- import torch
3
- import numpy as np
4
- import pickle
5
-
6
- import pandas as pd
7
- from tqdm import tqdm
8
-
9
- import altair as alt
10
- import matplotlib.pyplot as plt
11
- from datetime import date, timedelta
12
-
13
- from transformers import AutoTokenizer, AutoConfig, AutoModel, AutoModelForSequenceClassification
14
-
15
- """
16
- description_sentence = "<h3>Demo EmotioNL</h3>\nThis demo allows you to analyse the emotion in a sentence."
17
- description_dataset = "<h3>Demo EmotioNL</h3>\nThis demo allows you to analyse the emotions in a dataset.\nThe data should be in tsv-format with two named columns: the first column (id) should contain the sentence IDs, and the second column (text) should contain the actual texts. Optionally, there is a third column named 'date', which specifies the date associated with the text (e.g., tweet date). This column is necessary when the options 'emotion distribution over time' and 'peaks' are selected."
18
- inference_modelpath = "model/checkpoint-128"
19
- def inference_sentence(text):
20
- tokenizer = AutoTokenizer.from_pretrained(inference_modelpath)
21
- model = AutoModelForSequenceClassification.from_pretrained(inference_modelpath)
22
- for text in tqdm([text]):
23
- inputs = tokenizer(text, return_tensors="pt")
24
- with torch.no_grad(): # run model
25
- logits = model(**inputs).logits
26
- predicted_class_id = logits.argmax().item()
27
- output = model.config.id2label[predicted_class_id]
28
- return output
29
- def frequencies(preds):
30
- preds_dict = {"neutral": 0, "anger": 0, "fear": 0, "joy": 0, "love": 0, "sadness": 0}
31
- for pred in preds:
32
- preds_dict[pred] = preds_dict[pred] + 1
33
- bars = list(preds_dict.keys())
34
- height = list(preds_dict.values())
35
- x_pos = np.arange(len(bars))
36
- plt.bar(x_pos, height, color=['lightgrey', 'firebrick', 'rebeccapurple', 'orange', 'palevioletred', 'cornflowerblue'])
37
- plt.xticks(x_pos, bars)
38
- return plt
39
-
40
- def inference_dataset(file_object, option_list):
41
- tokenizer = AutoTokenizer.from_pretrained(inference_modelpath)
42
- model = AutoModelForSequenceClassification.from_pretrained(inference_modelpath)
43
- data_path = open(file_object.name, 'r')
44
- df = pd.read_csv(data_path, delimiter='\t', header=0, names=['id', 'text'])
45
- ids = df["id"].tolist()
46
- texts = df["text"].tolist()
47
- preds = []
48
- for text in tqdm(texts): # progressbar
49
- inputs = tokenizer(text, return_tensors="pt")
50
- with torch.no_grad(): # run model
51
- logits = model(**inputs).logits
52
- predicted_class_id = logits.argmax().item()
53
- prediction = model.config.id2label[predicted_class_id]
54
- preds.append(prediction)
55
- predictions_content = list(zip(ids, texts, preds))
56
- # write predictions to file
57
- output = "output.txt"
58
- f = open(output, 'w')
59
- f.write("id\ttext\tprediction\n")
60
- for line in predictions_content:
61
- f.write(str(line[0]) + '\t' + str(line[1]) + '\t' + str(line[2]) + '\n')
62
- output1 = output
63
- output2 = output3 = output4 = output5 = "This option was not selected."
64
- if "emotion frequencies" in option_list:
65
- output2 = frequencies(preds)
66
- else:
67
- output2 = None
68
- if "emotion distribution over time" in option_list:
69
- output3 = "This option was selected."
70
- if "peaks" in option_list:
71
- output4 = "This option was selected."
72
- if "topics" in option_list:
73
- output5 = "This option was selected."
74
- return [output1, output2, output3, output4, output5]
75
- iface_sentence = gr.Interface(
76
- fn=inference_sentence,
77
- description = description_sentence,
78
- inputs = gr.Textbox(
79
- label="Enter a sentence",
80
- lines=1),
81
- outputs="text")
82
- inputs = [gr.File(
83
- label="Upload a dataset"),
84
- gr.CheckboxGroup(
85
- ["emotion frequencies", "emotion distribution over time", "peaks", "topics"],
86
- label = "Select options")]
87
- outputs = [gr.File(),
88
- gr.Plot(label="Emotion frequencies"),
89
- gr.Textbox(label="Emotion distribution over time"),
90
- gr.Textbox(label="Peaks"),
91
- gr.Textbox(label="Topics")]
92
- iface_dataset = gr.Interface(
93
- fn = inference_dataset,
94
- description = description_dataset,
95
- inputs=inputs,
96
- outputs = outputs)
97
- iface = gr.TabbedInterface([iface_sentence, iface_dataset], ["Sentence", "Dataset"])
98
- iface.queue().launch()
99
- """
100
-
101
- inference_modelpath = "model/checkpoint-128"
102
-
103
# Loaded (tokenizer, model) pairs keyed by checkpoint path, so the checkpoint
# is read from disk once per process instead of once per request.
_EMOTION_MODEL_CACHE = {}


def _load_emotion_model(model_path):
    """Return the ``(tokenizer, model)`` pair for *model_path*, loading it at most once."""
    if model_path not in _EMOTION_MODEL_CACHE:
        _EMOTION_MODEL_CACHE[model_path] = (
            AutoTokenizer.from_pretrained(model_path),
            AutoModelForSequenceClassification.from_pretrained(model_path),
        )
    return _EMOTION_MODEL_CACHE[model_path]


def inference_sentence(text):
    """Predict the emotion label of a single sentence.

    Args:
        text: The sentence to classify.

    Returns:
        The string ``"Predicted emotion:\\n"`` followed by the label that
        ``model.config.id2label`` assigns to the highest-scoring class.
    """
    tokenizer, model = _load_emotion_model(inference_modelpath)
    # truncation=True keeps over-long input within the model's maximum length
    # instead of failing inside the forward pass.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():  # inference only, no gradients needed
        logits = model(**inputs).logits
    predicted_class_id = logits.argmax().item()
    return "Predicted emotion:\n" + model.config.id2label[predicted_class_id]
113
- """
114
- def inference_sentence(text):
115
- output = "This sentence will be processed:\n" + text
116
- return output
117
- """
118
-
119
def unavailable(input_file, input_checks):
    """Report that submitting your own dataset is currently disabled.

    Neither argument is read; they are accepted so the handler matches the
    inputs it is wired to.

    Returns:
        Two ``gr.update`` payloads: the first makes its target visible, the
        second shows the notice text under the label "Oops!".
    """
    notice = "As we are currently updating this demo, submitting your own data is unavailable for the moment. However, you can try out the showcase mode 😊"
    reveal_heading = gr.update(visible=True)
    reveal_notice = gr.update(value=notice, label="Oops!", visible=True)
    return reveal_heading, reveal_notice
122
-
123
def showcase(input_file):
    """Start showcase mode by exposing the bundled example predictions.

    ``input_file`` is not read. The seven returned ``gr.update`` payloads are
    positional; the labels below follow the ``demo_btn.click`` wiring in this
    file.
    """
    predictions_path = "showcase/example_predictions.txt"
    return (
        gr.update(visible=True),                          # output heading
        gr.update(visible=False),                         # message box
        gr.update(value=predictions_path, visible=True),  # predictions file
        gr.update(visible=True),                          # next_button_freq becomes available
        gr.update(visible=False),                         # next_button_dist
        gr.update(visible=False),                         # next_button_peaks
        gr.update(visible=False),                         # next_button_topics
    )
126
-
127
def file(input_file, input_checks):
    """Expose the predictions file and reveal the first applicable "next" button.

    ``input_file`` is not read; the bundled example predictions are always
    returned. Of the four options, only the button belonging to the first one
    present in ``input_checks`` (in the fixed order below) is made visible;
    when none is selected, every button stays hidden.

    Returns:
        A 5-tuple of ``gr.update`` payloads: the predictions file followed by
        the visibility of the frequency, distribution, peaks and topics
        buttons.
    """
    predictions_path = "showcase/example_predictions.txt"
    option_order = (
        "emotion frequencies",
        "emotion distribution over time",
        "peaks",
        "topics",
    )
    # First selected option wins, mirroring an if/elif cascade.
    first_selected = next(
        (option for option in option_order if option in input_checks), None)
    button_updates = [
        gr.update(visible=(option == first_selected)) for option in option_order
    ]
    return (gr.update(value=predictions_path, visible=True), *button_updates)
143
-
144
def freq(output_file, input_checks):
    """Build the emotion-frequency bar chart and pick the next button to reveal.

    Reads ``showcase/example_predictions.txt`` (tab-separated, first line is a
    header) and counts the values found in the second column of each row,
    which are used as emotion labels.

    Args:
        output_file: The predictions file value; when its ``name`` starts with
            ``/tmp/example_predictions`` the demo is treated as running in
            showcase mode. May be ``None``.
        input_checks: The selected option names.

    Returns:
        A 4-tuple of ``gr.update`` payloads: the plot, then the visibility of
        the distribution, peaks and topics buttons (at most one is visible).
    """
    from collections import Counter

    emotions = ['neutral', 'anger', 'fear', 'joy', 'love', 'sadness']
    colours = ['#999999', '#b22222', '#663399', '#ffcc00', '#db7093', '#6495ed']

    # Skip the header and ignore blank lines, so the last record is kept
    # whether or not the file ends with a newline.
    with open("showcase/example_predictions.txt", 'r') as f:
        rows = [line.split("\t") for line in f.read().split("\n")[1:] if line]

    # Counter yields 0 for emotions that never occur instead of raising KeyError.
    counts = Counter(row[1] for row in rows)
    frequencies = [counts[emotion] for emotion in emotions]

    simple = pd.DataFrame({
        'Emotion category': emotions,
        'Frequency': frequencies})

    # Round the y-axis up to the next multiple of ten (at least 10, so the
    # axis is never degenerate when there is nothing to count).
    y_max = max(10, (max(frequencies) + 9) // 10 * 10)

    plot = alt.Chart(simple).mark_bar().encode(
        x=alt.X("Emotion category", sort=emotions),
        y=alt.Y("Frequency", axis=alt.Axis(grid=False), scale=alt.Scale(domain=[0, y_max])),
        color=alt.Color("Emotion category", scale=alt.Scale(domain=emotions, range=colours), legend=None),
        tooltip=['Emotion category', 'Frequency']).properties(
            width=600).configure_axis(
                grid=False).interactive()

    # Showcase mode is recognised by the temp-file name of the example predictions.
    is_showcase = (getattr(output_file, "name", None) or "").startswith('/tmp/example_predictions')

    if "emotion distribution over time" in input_checks or is_showcase:
        visible = (True, False, False)    # next_button_dist becomes available
    elif "peaks" in input_checks:
        visible = (False, True, False)    # next_button_peaks becomes available
    elif "topics" in input_checks:
        visible = (False, False, True)    # next_button_topics becomes available
    else:
        visible = (False, False, False)   # no next_button becomes available

    return (gr.update(value=plot, visible=True),
            *(gr.update(visible=flag) for flag in visible))
185
-
186
-
187
def _parse_showcase_date(text):
    """Convert a ``day/month/two-digit-year`` string into a ``date`` (year is offset by 2000)."""
    day, month, year = (int(part) for part in text.split("/")[:3])
    return date(2000 + year, month, day)


def dist(output_file, input_checks):
    """Build the emotion-over-time line chart and pick the next button to reveal.

    Reads ``showcase/data.txt`` (tab-separated, first line is a header). The
    first column of each row is parsed as a ``day/month/yy`` date and the
    second column is used as the emotion label. Every day between the earliest
    and latest date gets one data point per emotion; missing combinations
    count as 0.

    Args:
        output_file: The predictions file value; when its ``name`` starts with
            ``/tmp/example_predictions`` the demo is treated as running in
            showcase mode. May be ``None``.
        input_checks: The selected option names.

    Returns:
        A 3-tuple of ``gr.update`` payloads: the plot, then the visibility of
        the peaks and topics buttons (at most one is visible).

    Raises:
        ValueError: If the data file contains no data rows.
    """
    from collections import Counter

    emotions = ['neutral', 'anger', 'fear', 'joy', 'love', 'sadness']
    colours = ['#999999', '#b22222', '#663399', '#ffcc00', '#db7093', '#6495ed']

    # Skip the header and ignore blank lines, so the last record is kept
    # whether or not the file ends with a newline.
    with open("showcase/data.txt", 'r') as f:
        rows = [line.split("\t") for line in f.read().split("\n")[1:] if line]
    if not rows:
        raise ValueError("showcase/data.txt contains no data rows")

    # One counter keyed by (day, emotion) replaces the nested dict bookkeeping.
    counts = Counter((_parse_showcase_date(row[0]), row[1]) for row in rows)

    # Use the real earliest/latest day so the range is right even if the rows
    # are not sorted by date.
    observed_days = [day for day, _ in counts]
    start_date, end_date = min(observed_days), max(observed_days)
    date_range = [start_date + timedelta(days=offset)
                  for offset in range((end_date - start_date).days + 1)]

    # Long-format table: one row per (day, emotion) pair.
    dates = [str(day) for day in date_range for _ in emotions]
    frequency = [counts[(day, emotion)] for day in date_range for emotion in emotions]
    categories = [emotion for _ in date_range for emotion in emotions]

    data = pd.DataFrame({
        'Date': dates,
        'Frequency': frequency,
        'Emotion category': categories})

    # Round the y-axis up to the next multiple of ten (at least 10).
    y_max = max(10, (max(frequency) + 9) // 10 * 10)

    # NOTE(review): alt.selection(type='single') / add_selection look like the
    # Altair 4 API; confirm against the pinned Altair version before upgrading.
    highlight = alt.selection(
        type='single', on='mouseover', fields=["Emotion category"], nearest=True)

    base = alt.Chart(data).encode(
        x="Date:T",
        y=alt.Y("Frequency", scale=alt.Scale(domain=[0, y_max])),
        color=alt.Color("Emotion category", scale=alt.Scale(domain=emotions, range=colours), legend=alt.Legend(orient='bottom', direction='horizontal')))

    # Invisible points carry the tooltip and drive the hover selection.
    points = base.mark_circle().encode(
        opacity=alt.value(0),
        tooltip=[
            alt.Tooltip('Emotion category', title='Emotion category'),
            alt.Tooltip('Date:T', title='Date'),
            alt.Tooltip('Frequency', title='Frequency')
        ]).add_selection(highlight)

    # The hovered emotion's line is drawn thicker than the others.
    lines = base.mark_line().encode(
        size=alt.condition(~highlight, alt.value(1), alt.value(3)))

    plot = (points + lines).properties(width=600, height=350).interactive()

    # Showcase mode is recognised by the temp-file name of the example predictions.
    is_showcase = (getattr(output_file, "name", None) or "").startswith('/tmp/example_predictions')

    if "peaks" in input_checks or is_showcase:
        visible = (True, False)    # next_button_peaks becomes available
    elif "topics" in input_checks:
        visible = (False, True)    # next_button_topics becomes available
    else:
        visible = (False, False)   # no next_button becomes available

    # gr.update is used (as in freq) rather than the component-specific
    # gr.Plot.update, so every handler updates plots the same way.
    return (gr.update(value=plot, visible=True),
            *(gr.update(visible=flag) for flag in visible))
261
-
262
def peaks(output_file, input_checks):
    """Load the pre-computed peaks figure and decide whether to offer the topics step.

    Args:
        output_file: The predictions file value; when its ``name`` starts with
            ``/tmp/example_predictions`` the demo is treated as running in
            showcase mode. May be ``None``.
        input_checks: The selected option names.

    Returns:
        Two ``gr.update`` payloads: the plot, and the visibility of the topics
        button (visible when "topics" is selected or in showcase mode).
    """
    # SECURITY: pickle.load executes arbitrary code from the file; this is only
    # acceptable because the file is a fixed asset shipped with the app.
    with open('showcase/peaks_covid.p', 'rb') as f:
        plot = pickle.load(f)

    # Showcase mode is recognised by the temp-file name of the example predictions.
    is_showcase = (getattr(output_file, "name", None) or "").startswith('/tmp/example_predictions')
    show_topics_button = "topics" in input_checks or is_showcase

    # gr.update is used (as in freq) rather than the component-specific gr.Plot.update.
    return gr.update(value=plot, visible=True), gr.update(visible=show_topics_button)
268
-
269
def topics(output_file, input_checks):
    """Load the pre-computed topics figure, resize it and return it for display.

    Neither argument is read; they are accepted so the handler matches the
    inputs it is wired to.

    Returns:
        A ``gr.update`` payload that sets the plot and makes it visible.
    """
    # SECURITY: pickle.load executes arbitrary code from the file; this is only
    # acceptable because the file is a fixed asset shipped with the app.
    with open('showcase/vis_classes_covid.p', 'rb') as f:
        plot = pickle.load(f)
    plot.update_layout(width=600, height=400)
    # gr.update is used (as in freq) rather than the component-specific gr.Plot.update.
    return gr.update(value=plot, visible=True)  # no next_button becomes available
273
-
274
- # This demo was made to demonstrate the EmotioNL model, a transformer-based classification model that analyses emotions in Dutch texts. The model uses [RobBERT](https://github.com/iPieter/RobBERT), which was further fine-tuned on the [EmotioNL dataset](https://lt3.ugent.be/resources/emotionl/). The resulting model is a classifier that, given a sentence, predicts one of the following emotion categories: _anger_, _fear_, _joy_, _love_, _sadness_ or _neutral_. The demo can be used either in **sentence mode**, which allows you to enter a sentence for which an emotion will be predicted; or in **dataset mode**, which allows you to upload a dataset or see the full functionality with example data.
275
-
276
-
277
- with gr.Blocks() as demo:
278
- with gr.Column(scale=1, min_width=50):
279
- gr.Markdown("""
280
- """)
281
- with gr.Column(scale=5):
282
- gr.Markdown("""
283
- <div style="text-align: center"><h1>EmotioNL: A framework for Dutch emotion detection</h1></div>
284
-
285
- <div style="display: block;margin-left: auto;margin-right: auto;width: 60%;"><img alt="EmotioNL logo" src="https://users.ugent.be/~lundbruy/EmotioNL.png" width="100%"></div>
286
-
287
- <div style="display: block;margin-left: auto;margin-right: auto;width: 75%;">This demo was made to demonstrate the EmotioNL model, a transformer-based classification model that analyses emotions in Dutch texts. The model uses <a href="https://github.com/iPieter/RobBERT">RobBERT</a>, which was further fine-tuned on the <a href="https://lt3.ugent.be/resources/emotionl/">EmotioNL dataset</a>. The resulting model is a classifier that, given a sentence, predicts one of the following emotion categories: <i>anger</i>, <i>fear</i>, <i>joy</i>, <i>love</i>, <i>sadness</i> or <i>neutral</i>. The demo can be used either in <b>sentence mode</b>, which allows you to enter a sentence for which an emotion will be predicted; or in <b>dataset mode</b>, which allows you to upload a dataset or see the full functionality with example data.</div>
288
- """)
289
- with gr.Tab("Sentence"):
290
- gr.Markdown("""
291
- """)
292
- with gr.Row():
293
- with gr.Column():
294
- input = gr.Textbox(
295
- label="Enter a sentence",
296
- value="Jaaah! Volgende vakantie Barcelona en na het zomerseizoen naar de Algarve",
297
- lines=1)
298
- send_btn = gr.Button("Send")
299
- output = gr.Textbox()
300
- send_btn.click(fn=inference_sentence, inputs=input, outputs=output)
301
- with gr.Tab("Dataset"):
302
- gr.Markdown("""
303
- _As we are currently updating this demo, submitting your own data is unavailable for the moment._
304
- _Try out the showcase mode._
305
- """)
306
- with gr.Row():
307
- with gr.Column():
308
- demo_btn = gr.Button("Showcase with example data", variant="primary")
309
- with gr.Column():
310
- gr.Markdown("""
311
- **<font size="4">Run in showcase mode or use your own data</font>**
312
- Try out the demo in showcase mode, which uses example data (609,206 tweets about the COVID-19 pandemic) with all the options provided by the demo, or upload your own dataset.
313
- """)
314
- with gr.Row():
315
- with gr.Column():
316
- input_file = gr.File(
317
- label="Upload a dataset")
318
- input_checks = gr.CheckboxGroup(
319
- ["emotion frequencies", "emotion distribution over time", "peaks", "topics"],
320
- label = "Select options")
321
- send_btn = gr.Button("Submit data")
322
- with gr.Column():
323
- gr.Markdown("""
324
- **<font size="4">Data format</font>**
325
- The data should be in tsv-format with two named columns: the first column (id) should contain the sentence IDs, and the second column (text) should contain the actual texts. Optionally, there is a third column named 'date', which specifies the date associated with the text (e.g., tweet date). This column is necessary when the options 'emotion distribution over time' and 'peaks' are selected. For now, we only accept files with maximum 400 sentences and a limit of 300 tokens per sentence.
326
-
327
- **<font size="4">Options</font>**
328
- **Emotion frequencies** outputs a bar plot with the prediction frequencies of each emotion category (anger, fear, joy, love, sadness or neutral).
329
- **Emotion distribution over time** outputs a line plot that visualises the frequency of predicted emotions over time for each emotion category.
330
- **Peaks** outputs a step graph that only shows the significant fluctuations (upwards and downwards) in emotion frequencies over time.
331
- **Topics** uses [BERTopic](https://maartengr.github.io/BERTopic/index.html) to find topics in the datasets, and outputs a bar plot that shows the emotion distribution per topic.
332
- """)
333
-
334
-
335
- with gr.Row():
336
- gr.Markdown("""
337
- ___
338
- """)
339
- with gr.Row():
340
- with gr.Column():
341
- output_markdown = gr.Markdown("""
342
- **<font size="4">Output</font>**
343
- """, visible=False)
344
-
345
- message = gr.Textbox(label="Message", visible=False)
346
-
347
- output_file = gr.File(label="Predictions", visible=False)
348
- next_button_freq = gr.Button("Show emotion frequencies", visible=False)
349
-
350
- output_plot = gr.Plot(show_label=False, visible=False).style(container=True)
351
- next_button_dist = gr.Button("Show emotion distribution over time", visible=False)
352
-
353
- output_dist = gr.Plot(show_label=False, visible=False)
354
- next_button_peaks = gr.Button("Show peaks", visible=False)
355
-
356
- output_peaks = gr.Plot(show_label=False, visible=False)
357
- next_button_topics = gr.Button("Show topics", visible=False)
358
-
359
- output_topics = gr.Plot(show_label=False, visible=False)
360
-
361
- #send_btn.click(fn=file, inputs=[input_file,input_checks], outputs=[output_file,next_button_freq,next_button_dist,next_button_peaks,next_button_topics])
362
- next_button_freq.click(fn=freq, inputs=[output_file,input_checks], outputs=[output_plot,next_button_dist,next_button_peaks,next_button_topics])
363
- next_button_dist.click(fn=dist, inputs=[output_file,input_checks], outputs=[output_dist,next_button_peaks,next_button_topics])
364
- next_button_peaks.click(fn=peaks, inputs=[output_file,input_checks], outputs=[output_peaks,next_button_topics])
365
- next_button_topics.click(fn=topics, inputs=[output_file,input_checks], outputs=output_topics)
366
- send_btn.click(fn=unavailable, inputs=[input_file,input_checks], outputs=[output_markdown,message])
367
- demo_btn.click(fn=showcase, inputs=[input_file], outputs=[output_markdown,message,output_file,next_button_freq,next_button_dist,next_button_peaks,next_button_topics])
368
-
369
- with gr.Row():
370
- with gr.Column():
371
- gr.Markdown("""
372
- <font size="2">Both this demo and the dataset have been created by [LT3](https://lt3.ugent.be/), the Language and Translation Technology Team of Ghent University. The EmotioNL project has been carried out with support from the Research Foundation – Flanders (FWO). For any questions, please contact luna.debruyne@ugent.be.</font>
373
-
374
- <div style="display: grid;grid-template-columns:150px auto;"> <img style="margin-right: 1em" alt="LT3 logo" src="https://lt3.ugent.be/static/images/logo_v2_single.png" width="136" height="58"> <img style="margin-right: 1em" alt="FWO logo" src="https://www.fwo.be/images/logo_desktop.png" height="58"></div>
375
- """)
376
- with gr.Column(scale=1, min_width=50):
377
- gr.Markdown("""
378
- """)
379
-
380
- demo.launch()
381
- # <div style="display: grid;grid-template-columns:80px 150px auto;"><img style="margin-right: 1em" alt="UGent logo" src="https://lt3.ugent.be/static/images/logo_ugent_en.svg" height="58"> <img style="margin-right: 1em" alt="LT3 logo" src="https://lt3.ugent.be/static/images/logo_v2_single.png" width="136" height="58"> <img style="margin-right: 1em" alt="FWO logo" src="https://www.fwo.be/images/logo_desktop.png" height="58"></div>