Upload app.py
Browse files
app.py
CHANGED
@@ -4,7 +4,6 @@ import fasttext
|
|
4 |
from transformers import AutoModelForSequenceClassification
|
5 |
from transformers import AutoTokenizer
|
6 |
|
7 |
-
import random
|
8 |
import numpy as np
|
9 |
import pandas as pd
|
10 |
import torch
|
@@ -27,7 +26,7 @@ class LanguageIdentification:
|
|
27 |
self.model = fasttext.load_model(pretrained_lang_model)
|
28 |
|
29 |
def predict_lang(self, text):
|
30 |
-
predictions = self.model.predict(text, k=
|
31 |
return predictions
|
32 |
|
33 |
LANGUAGE = LanguageIdentification()
|
@@ -46,10 +45,13 @@ def tokenized_data(tokenizer, inputs):
|
|
46 |
|
47 |
examples = []
|
48 |
df = pd.read_csv('examples.csv', sep='\t', index_col='Unnamed: 0')
|
49 |
-
random.seed(100)
|
50 |
-
|
51 |
-
|
52 |
-
|
|
|
|
|
|
|
53 |
|
54 |
|
55 |
eng_model_name = "roberta-base"
|
@@ -75,23 +77,31 @@ kor_model = AutoModelForSequenceClassification.from_pretrained(
|
|
75 |
|
76 |
|
77 |
def builder(lang, text):
|
|
|
|
|
78 |
if lang == 'Any':
|
79 |
pred = LANGUAGE.predict_lang(text)
|
80 |
-
if pred[0]
|
81 |
-
lang = 'Kor'
|
82 |
-
else: # '__label__en'
|
83 |
lang = 'Eng'
|
84 |
-
|
85 |
-
|
|
|
|
|
|
|
|
|
|
|
86 |
if lang == 'Eng':
|
87 |
model = eng_model
|
88 |
tokenizer = eng_tokenizer
|
|
|
|
|
89 |
if lang == 'Kor':
|
90 |
model = kor_model
|
91 |
tokenizer = kor_tokenizer
|
|
|
92 |
|
|
|
93 |
inputs = tokenized_data(tokenizer, text)
|
94 |
-
|
95 |
model.eval()
|
96 |
with torch.no_grad():
|
97 |
logits = model(input_ids=inputs['input_ids'],
|
@@ -103,13 +113,13 @@ def builder(lang, text):
|
|
103 |
|
104 |
prediction = torch.argmax(logits, axis=1)
|
105 |
|
106 |
-
return {id2label[1]: output[0][1].item(), id2label[0]: output[0][0].item()}
|
107 |
return id2label[prediction.item()]
|
108 |
|
109 |
|
110 |
|
111 |
demo = gr.Interface(builder, inputs=[gr.inputs.Dropdown(['Any', 'Eng', 'Kor']), "text"],
|
112 |
-
outputs=gr.Label(num_top_classes=
|
113 |
# outputs='label',
|
114 |
title=title, description=description, examples=examples)
|
115 |
|
@@ -119,6 +129,7 @@ demo = gr.Interface(builder, inputs=[gr.inputs.Dropdown(['Any', 'Eng', 'Kor']),
|
|
119 |
# allow_flagging="auto",
|
120 |
# description=description, examples=examples)
|
121 |
|
|
|
122 |
if __name__ == "__main__":
|
123 |
# print(examples)
|
124 |
demo.launch()
|
|
|
4 |
from transformers import AutoModelForSequenceClassification
|
5 |
from transformers import AutoTokenizer
|
6 |
|
|
|
7 |
import numpy as np
|
8 |
import pandas as pd
|
9 |
import torch
|
|
|
26 |
self.model = fasttext.load_model(pretrained_lang_model)
|
27 |
|
28 |
def predict_lang(self, text):
|
29 |
+
predictions = self.model.predict(text, k=200) # returns top 200 matching languages
|
30 |
return predictions
|
31 |
|
32 |
LANGUAGE = LanguageIdentification()
|
|
|
45 |
|
46 |
examples = []
|
47 |
df = pd.read_csv('examples.csv', sep='\t', index_col='Unnamed: 0')
|
48 |
+
np.random.seed(100)
|
49 |
+
|
50 |
+
idx = np.random.choice(50, size=5, replace=False)
|
51 |
+
eng_examples = [ ['Eng', df.iloc[i, 0]] for i in idx ]
|
52 |
+
kor_examples = [ ['Kor', df.iloc[i, 1]] for i in idx ]
|
53 |
+
examples = eng_examples + kor_examples
|
54 |
+
|
55 |
|
56 |
|
57 |
eng_model_name = "roberta-base"
|
|
|
77 |
|
78 |
|
79 |
def builder(lang, text):
|
80 |
+
percent_kor, percent_eng = 0, 0
|
81 |
+
|
82 |
if lang == 'Any':
|
83 |
pred = LANGUAGE.predict_lang(text)
|
84 |
+
if '__label__en' in pred[0]:
|
|
|
|
|
85 |
lang = 'Eng'
|
86 |
+
idx = pred[0].index('__label__en')
|
87 |
+
percent_eng = pred[1][idx]
|
88 |
+
if '__label__ko' in pred[0]:
|
89 |
+
lang = 'Kor'
|
90 |
+
idx = pred[0].index('__label__ko')
|
91 |
+
percent_kor = pred[1][idx]
|
92 |
+
|
93 |
if lang == 'Eng':
|
94 |
model = eng_model
|
95 |
tokenizer = eng_tokenizer
|
96 |
+
if percent_eng==0: percent_eng=1
|
97 |
+
|
98 |
if lang == 'Kor':
|
99 |
model = kor_model
|
100 |
tokenizer = kor_tokenizer
|
101 |
+
if percent_kor==0: percent_kor=1
|
102 |
|
103 |
+
|
104 |
inputs = tokenized_data(tokenizer, text)
|
|
|
105 |
model.eval()
|
106 |
with torch.no_grad():
|
107 |
logits = model(input_ids=inputs['input_ids'],
|
|
|
113 |
|
114 |
prediction = torch.argmax(logits, axis=1)
|
115 |
|
116 |
+
return [ {'Kor': percent_kor, 'Eng': percent_eng, 'Other': 1-(percent_kor+percent_eng)}, {id2label[1]: output[0][1].item(), id2label[0]: output[0][0].item()} ]
|
117 |
return id2label[prediction.item()]
|
118 |
|
119 |
|
120 |
|
121 |
demo = gr.Interface(builder, inputs=[gr.inputs.Dropdown(['Any', 'Eng', 'Kor']), "text"],
|
122 |
+
outputs=[ gr.Label(num_top_classes=3, label='Lang'), gr.Label(num_top_classes=2, label='Result') ],
|
123 |
# outputs='label',
|
124 |
title=title, description=description, examples=examples)
|
125 |
|
|
|
129 |
# allow_flagging="auto",
|
130 |
# description=description, examples=examples)
|
131 |
|
132 |
+
|
133 |
if __name__ == "__main__":
|
134 |
# print(examples)
|
135 |
demo.launch()
|