Spaces:
Running
Running
Commit
·
9a74e03
1
Parent(s):
f625e51
Update app.py
Browse files
app.py
CHANGED
@@ -2,6 +2,9 @@ import gradio as gr
|
|
2 |
import torch
|
3 |
import numpy as np
|
4 |
|
|
|
|
|
|
|
5 |
from transformers import AutoTokenizer, AutoConfig, AutoModel, AutoModelForSequenceClassification
|
6 |
from transformers import TrainingArguments, Trainer
|
7 |
|
@@ -12,6 +15,8 @@ description_sentence = "<h3>Demo EmotioNL</h3>\nThis demo allows you to analyse
|
|
12 |
description2 = "<h3>Demo EmotioNL</h3>\nThis demo allows you to analyse the emotions in a dataset.\nThe data should be in tsv-format with two named columns: the first column (id) should contain the sentence IDs, and the second column (text) should contain the actual texts. Optionally, there is a third column named 'date', which specifies the date associated with the text (e.g., tweet date). This column is necessary when the options 'emotion distribution over time' and 'peaks' are selected."
|
13 |
|
14 |
inference_modelpath = "model/checkpoint-128"
|
|
|
|
|
15 |
output_dir = "model"
|
16 |
model_config = {
|
17 |
"model_weights": "pdelobelle/robbert-v2-dutch-base",
|
@@ -24,6 +29,7 @@ model_config = {
|
|
24 |
tokenizer = AutoTokenizer.from_pretrained(model_config["model_weights"])
|
25 |
model = AutoModelForSequenceClassification.from_pretrained(inference_modelpath)
|
26 |
|
|
|
27 |
# Function for encoding (tokenizing) data
|
28 |
def encode_data(data):
|
29 |
text = data["text"]
|
@@ -52,6 +58,7 @@ trainer = Trainer(
|
|
52 |
model = model,
|
53 |
args = test_args)
|
54 |
|
|
|
55 |
def inference_dataset(file_object):
|
56 |
#input_file = open(file_object.name, 'r')
|
57 |
input_file = file_object
|
@@ -76,6 +83,32 @@ def inference_dataset(file_object):
|
|
76 |
f.write(str(line[0]) + '\t' + str(line[1]) + '\t' + str(line[2]) + '\n')
|
77 |
f.close()
|
78 |
return output
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
|
80 |
def what_happened(text, file_object, option_list):
|
81 |
if file_object:
|
|
|
2 |
import torch
|
3 |
import numpy as np
|
4 |
|
5 |
+
import pandas as pd
|
6 |
+
from tqdm import tqdm
|
7 |
+
|
8 |
from transformers import AutoTokenizer, AutoConfig, AutoModel, AutoModelForSequenceClassification
|
9 |
from transformers import TrainingArguments, Trainer
|
10 |
|
|
|
15 |
description2 = "<h3>Demo EmotioNL</h3>\nThis demo allows you to analyse the emotions in a dataset.\nThe data should be in tsv-format with two named columns: the first column (id) should contain the sentence IDs, and the second column (text) should contain the actual texts. Optionally, there is a third column named 'date', which specifies the date associated with the text (e.g., tweet date). This column is necessary when the options 'emotion distribution over time' and 'peaks' are selected."
|
16 |
|
17 |
inference_modelpath = "model/checkpoint-128"
|
18 |
+
|
19 |
+
"""
|
20 |
output_dir = "model"
|
21 |
model_config = {
|
22 |
"model_weights": "pdelobelle/robbert-v2-dutch-base",
|
|
|
29 |
tokenizer = AutoTokenizer.from_pretrained(model_config["model_weights"])
|
30 |
model = AutoModelForSequenceClassification.from_pretrained(inference_modelpath)
|
31 |
|
32 |
+
|
33 |
# Function for encoding (tokenizing) data
|
34 |
def encode_data(data):
|
35 |
text = data["text"]
|
|
|
58 |
model = model,
|
59 |
args = test_args)
|
60 |
|
61 |
+
|
62 |
def inference_dataset(file_object):
|
63 |
#input_file = open(file_object.name, 'r')
|
64 |
input_file = file_object
|
|
|
83 |
f.write(str(line[0]) + '\t' + str(line[1]) + '\t' + str(line[2]) + '\n')
|
84 |
f.close()
|
85 |
return output
|
86 |
+
"""
|
87 |
+
|
88 |
+
def inference_dataset(file_object):
    """Classify the emotion of every text in a TSV dataset and write results to a file.

    The input is expected to be a tab-separated file with two columns
    (sentence id, text); a header row is consumed by ``header=0`` and the
    columns are renamed to 'id' and 'text'.

    Args:
        file_object: Path (or gradio file path) to the input TSV file.

    Returns:
        str: Path of the written predictions file ("output.txt"), a TSV with
        columns id / text / prediction.
    """
    # Model and tokenizer are (re)loaded per call from the local checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(inference_modelpath)
    model = AutoModelForSequenceClassification.from_pretrained(inference_modelpath)
    # Context manager closes the input handle (the original leaked it).
    with open(file_object, 'r') as data_path:
        df = pd.read_csv(data_path, delimiter='\t', header=0, names=['id', 'text'])
    ids = df["id"].tolist()
    texts = df["text"].tolist()
    preds = []
    for text in tqdm(texts):  # progressbar over the dataset rows
        inputs = tokenizer(text, return_tensors="pt")
        with torch.no_grad():  # inference only — no gradients needed
            logits = model(**inputs).logits
        predicted_class_id = logits.argmax().item()
        # Map the argmax class index to its human-readable label.
        preds.append(model.config.id2label[predicted_class_id])
    # write predictions to file; `with` guarantees the handle is closed
    # even if a write fails mid-way.
    output = "output.txt"
    with open(output, 'w') as f:
        f.write("id\ttext\tprediction\n")
        for row_id, row_text, row_pred in zip(ids, texts, preds):
            f.write(f"{row_id}\t{row_text}\t{row_pred}\n")
    return output
|
112 |
|
113 |
def what_happened(text, file_object, option_list):
|
114 |
if file_object:
|