Spaces:
Sleeping
Sleeping
sigmadream
committed on
Commit
•
123ce6b
1
Parent(s):
a06c62b
Update app.py
Browse files
app.py
CHANGED
@@ -8,19 +8,23 @@ import numpy as np
|
|
8 |
import pandas as pd
|
9 |
import torch
|
10 |
|
|
|
11 |
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
|
12 |
label2id = {"NEGATIVE": 0, "POSITIVE": 1}
|
13 |
|
14 |
-
title = "ํ๊ตญ์ด/์์ด ๊ฐ์ ๋ถ์ ์์ (๋ค์ด๋ฒ ์ํ ๋ฆฌ๋ทฐ๋ฅผ ํ์ฉ)"
|
15 |
-
description = "์ํํ์ ์
๋ ฅํ์ฌ ๊ธ์ ์ ์ธ์ง ๋ถ์ ์ ์ธ์ง๋ฅผ ๋ถ๋ฅํ๋ ๋ชจ๋ธ์
๋๋ค."
|
16 |
|
|
|
|
|
|
|
|
|
|
|
17 |
class LanguageIdentification:
|
18 |
def __init__(self):
|
19 |
pretrained_lang_model = "./lid.176.ftz"
|
20 |
self.model = fasttext.load_model(pretrained_lang_model)
|
21 |
|
22 |
def predict_lang(self, text):
|
23 |
-
predictions = self.model.predict(text, k=200)
|
24 |
return predictions
|
25 |
|
26 |
LANGUAGE = LanguageIdentification()
|
@@ -36,6 +40,17 @@ def tokenized_data(tokenizer, inputs):
|
|
36 |
truncation=True)
|
37 |
|
38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
eng_model_name = "roberta-base"
|
40 |
eng_step = 1900
|
41 |
eng_tokenizer = AutoTokenizer.from_pretrained(eng_model_name)
|
@@ -62,6 +77,8 @@ def builder(Lang, Text):
|
|
62 |
percent_kor, percent_eng = 0, 0
|
63 |
text_list = Text.split(' ')
|
64 |
|
|
|
|
|
65 |
if Lang == '์ธ์ด๊ฐ์ง ๊ธฐ๋ฅ ์ฌ์ฉ':
|
66 |
pred = LANGUAGE.predict_lang(Text)
|
67 |
if '__label__en' in pred[0]:
|
@@ -72,6 +89,7 @@ def builder(Lang, Text):
|
|
72 |
Lang = 'Kor'
|
73 |
idx = pred[0].index('__label__ko')
|
74 |
p_kor = pred[1][idx]
|
|
|
75 |
percent_kor = p_kor / (p_kor+p_eng)
|
76 |
percent_eng = p_eng / (p_kor+p_eng)
|
77 |
|
@@ -84,7 +102,9 @@ def builder(Lang, Text):
|
|
84 |
model = kor_model
|
85 |
tokenizer = kor_tokenizer
|
86 |
if percent_kor==0: percent_kor=1
|
|
|
87 |
|
|
|
88 |
inputs = tokenized_data(tokenizer, Text)
|
89 |
model.eval()
|
90 |
with torch.no_grad():
|
@@ -93,7 +113,10 @@ def builder(Lang, Text):
|
|
93 |
|
94 |
m = torch.nn.Softmax(dim=1)
|
95 |
output = m(logits)
|
|
|
|
|
96 |
|
|
|
97 |
output_analysis = []
|
98 |
for word in text_list:
|
99 |
tokenized_word = tokenized_data(tokenizer, word)
|
@@ -116,10 +139,12 @@ def builder(Lang, Text):
|
|
116 |
else:
|
117 |
output_analysis.append( (word, None) )
|
118 |
|
|
|
119 |
return [ {'Kor': percent_kor, 'Eng': percent_eng},
|
120 |
{id2label[1]: output[0][1].item(), id2label[0]: output[0][0].item()},
|
121 |
output_analysis ]
|
122 |
|
|
|
123 |
return id2label[prediction.item()]
|
124 |
|
125 |
|
@@ -127,15 +152,33 @@ with gr.Blocks() as demo1:
|
|
127 |
gr.Markdown(
|
128 |
"""
|
129 |
<h1 align="center">
|
130 |
-
ํ๊ตญ์ด/์์ด ๊ฐ์ ๋ถ์ ์์
|
131 |
</h1>
|
132 |
""")
|
133 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
134 |
with gr.Row():
|
135 |
with gr.Column():
|
136 |
-
inputs_1 = gr.Dropdown(choices=['์ธ์ด๊ฐ์ง', 'Eng', 'Kor'], value='์ธ์ด๊ฐ์ง ๊ธฐ๋ฅ ์ฌ์ฉ', label='Lang')
|
137 |
inputs_2 = gr.Textbox(placeholder="๋ฆฌ๋ทฐ๋ฅผ ์
๋ ฅํ์์ค.", label='Text')
|
138 |
with gr.Row():
|
|
|
139 |
btn = gr.Button("์ ์ถํ๊ธฐ")
|
140 |
with gr.Column():
|
141 |
output_1 = gr.Label(num_top_classes=3, label='Lang')
|
@@ -143,7 +186,13 @@ with gr.Blocks() as demo1:
|
|
143 |
output_3 = gr.HighlightedText(label="Analysis", combine_adjacent=False) \
|
144 |
.style(color_map={"+++": "#CF0000", "++": "#FF3232", "+": "#FFD4D4", "---": "#0004FE", "--": "#4C47FF", "-": "#BEBDFF"})
|
145 |
|
|
|
146 |
btn.click(fn=builder, inputs=[inputs_1, inputs_2], outputs=[output_1, output_2, output_3])
|
|
|
147 |
|
|
|
|
|
148 |
if __name__ == "__main__":
|
|
|
|
|
149 |
demo1.launch()
|
|
|
8 |
import pandas as pd
|
9 |
import torch
|
10 |
|
11 |
+
|
12 |
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
|
13 |
label2id = {"NEGATIVE": 0, "POSITIVE": 1}
|
14 |
|
|
|
|
|
15 |
|
16 |
+
title = "ํ๊ตญ์ด/์์ด ๊ฐ์ ๋ถ์ ์์ (๋ค์ด๋ฒ ์ํ ๋ฆฌ๋ทฐ๋ฅผ ํ์ฉ)"
|
17 |
+
description = "์ํํ์ ์
๋ ฅํ์ฌ ๊ธ์ ์ ์ธ์ง ๋ถ์ ์ ์ธ์ง๋ฅผ ๋ถ๋ฅํ๋ ๋ชจ๋ธ์
๋๋ค. \
|
18 |
+
ํ๊ตญ์ด์ธ์ง ์์ด์ธ์ง ํ๋จํ๊ณ ์์ธกํด์ฃผ๋ ""Default""๋ผ๋ ๋ฒ์ ๋ ์ ๊ณตํฉ๋๋ค." \
|
19 |
+
ํ๊ตญ์ด ๋ฒ์ ๊ณผ ์์ด ๋ฒ์ ์ค์์ ์ ํํ ์ ์์ต๋๋ค.
|
20 |
+
|
21 |
class LanguageIdentification:
|
22 |
def __init__(self):
|
23 |
pretrained_lang_model = "./lid.176.ftz"
|
24 |
self.model = fasttext.load_model(pretrained_lang_model)
|
25 |
|
26 |
def predict_lang(self, text):
|
27 |
+
predictions = self.model.predict(text, k=200) # returns top 200 matching languages
|
28 |
return predictions
|
29 |
|
30 |
LANGUAGE = LanguageIdentification()
|
|
|
40 |
truncation=True)
|
41 |
|
42 |
|
43 |
+
examples = []
|
44 |
+
df = pd.read_csv('examples.csv', sep='\t', index_col='Unnamed: 0')
|
45 |
+
np.random.seed(100)
|
46 |
+
|
47 |
+
idx = np.random.choice(50, size=5, replace=False)
|
48 |
+
eng_examples = [ ['Eng', df.iloc[i, 0]] for i in idx ]
|
49 |
+
kor_examples = [ ['Kor', df.iloc[i, 1]] for i in idx ]
|
50 |
+
examples = eng_examples + kor_examples
|
51 |
+
|
52 |
+
|
53 |
+
|
54 |
eng_model_name = "roberta-base"
|
55 |
eng_step = 1900
|
56 |
eng_tokenizer = AutoTokenizer.from_pretrained(eng_model_name)
|
|
|
77 |
percent_kor, percent_eng = 0, 0
|
78 |
text_list = Text.split(' ')
|
79 |
|
80 |
+
|
81 |
+
# [ output_1 ]
|
82 |
if Lang == '์ธ์ด๊ฐ์ง ๊ธฐ๋ฅ ์ฌ์ฉ':
|
83 |
pred = LANGUAGE.predict_lang(Text)
|
84 |
if '__label__en' in pred[0]:
|
|
|
89 |
Lang = 'Kor'
|
90 |
idx = pred[0].index('__label__ko')
|
91 |
p_kor = pred[1][idx]
|
92 |
+
# Normalize Percentage
|
93 |
percent_kor = p_kor / (p_kor+p_eng)
|
94 |
percent_eng = p_eng / (p_kor+p_eng)
|
95 |
|
|
|
102 |
model = kor_model
|
103 |
tokenizer = kor_tokenizer
|
104 |
if percent_kor==0: percent_kor=1
|
105 |
+
|
106 |
|
107 |
+
# [ output_2 ]
|
108 |
inputs = tokenized_data(tokenizer, Text)
|
109 |
model.eval()
|
110 |
with torch.no_grad():
|
|
|
113 |
|
114 |
m = torch.nn.Softmax(dim=1)
|
115 |
output = m(logits)
|
116 |
+
# print(logits, output)
|
117 |
+
|
118 |
|
119 |
+
# [ output_3 ]
|
120 |
output_analysis = []
|
121 |
for word in text_list:
|
122 |
tokenized_word = tokenized_data(tokenizer, word)
|
|
|
139 |
else:
|
140 |
output_analysis.append( (word, None) )
|
141 |
|
142 |
+
|
143 |
return [ {'Kor': percent_kor, 'Eng': percent_eng},
|
144 |
{id2label[1]: output[0][1].item(), id2label[0]: output[0][0].item()},
|
145 |
output_analysis ]
|
146 |
|
147 |
+
# prediction = torch.argmax(logits, axis=1)
|
148 |
return id2label[prediction.item()]
|
149 |
|
150 |
|
|
|
152 |
gr.Markdown(
|
153 |
"""
|
154 |
<h1 align="center">
|
155 |
+
ํ๊ตญ์ด/์์ด ๊ฐ์ ๋ถ์ ์์ (๋ค์ด๋ฒ ์ํ ๋ฆฌ๋ทฐ๋ฅผ ํ์ฉ)
|
156 |
</h1>
|
157 |
""")
|
158 |
|
159 |
+
gr.Markdown(
|
160 |
+
"""
|
161 |
+
์ํ ๋ฆฌ๋ทฐ๋ฅผ ์
๋ ฅํ๋ฉด, ๊ธ์ ์ ์ธ ๊ฐ์ ์ธ์ง ๋ถ์ ์ ์ธ ๊ฐ์ ์ธ์ง ํ๋ณํ๋ ๋ชจ๋ธ์
๋๋ค. \
|
162 |
+
์์ด์ ํ๊ธ์ ์ง์ํ๋ฉฐ, ์ธ์ด๋ฅผ ์ง์ ์ ํํ ์๋, ํน์ ๋ชจ๋ธ์ด ์ธ์ด๊ฐ์ง๋ฅผ ์ง์ ํ๋๋ก ํ ์ ์์ต๋๋ค.
|
163 |
+
๋ฆฌ๋ทฐ๋ฅผ ์
๋ ฅํ๋ฉด, (1) ๊ฐ์ง๋ ์ธ์ด, (2) ๊ธ์ ๋ฆฌ๋ทฐ์ผ ํ๋ฅ ๊ณผ ๋ถ์ ๋ฆฌ๋ทฐ์ผ ํ๋ฅ , (3) ์
๋ ฅ๋ ๋ฆฌ๋ทฐ์ ์ด๋ ๋จ์ด๊ฐ ๊ธ์ /๋ถ์ ๊ฒฐ์ ์ ์ํฅ์ ์ฃผ์๋์ง \
|
164 |
+
(๊ธ์ ์ผ ๊ฒฝ์ฐ ๋นจ๊ฐ์, ๋ถ์ ์ผ ๊ฒฝ์ฐ ํ๋์)๋ฅผ ํ์ธํ ์ ์์ต๋๋ค.
|
165 |
+
""")
|
166 |
+
|
167 |
+
with gr.Accordion(label="๋ชจ๋ธ์ ๋ํ ์ค๋ช
( ์ฌ๊ธฐ๋ฅผ ํด๋ฆญ ํ์์ค. )", open=False):
|
168 |
+
gr.Markdown(
|
169 |
+
"""
|
170 |
+
์์ด ๋ชจ๋ธ์ bert-base-uncased ๊ธฐ๋ฐ์ผ๋ก, ์์ด ์ํ ๋ฆฌ๋ทฐ ๋ถ์ ๋ฐ์ดํฐ์
์ธ SST-2๋ก ํ์ต ๋ฐ ํ๊ฐ๋์์ต๋๋ค.
|
171 |
+
ํ๊ธ ๋ชจ๋ธ์ klue/roberta-base ๊ธฐ๋ฐ์ด๋ค. ๊ธฐ์กด ํ๊ธ ์ํ ๋ฆฌ๋ทฐ ๋ถ์ ๋ฐ์ดํฐ์
์ด ์กด์ฌํ์ง ์์, ๋ค์ด๋ฒ ์ํ์ ๋ฆฌ๋ทฐ๋ฅผ ํฌ๋กค๋งํด์ ์ํ ๋ฆฌ๋ทฐ ๋ถ์ ๋ฐ์ดํฐ์
์ ์ ์ํ๊ณ , ์ด๋ฅผ ์ด์ฉํ์ฌ ๋ชจ๋ธ์ ํ์ต ๋ฐ ํ๊ฐํ์์ต๋๋ค.
|
172 |
+
์์ด ๋ชจ๋ธ์ SST-2์์ 92.8%, ํ๊ธ ๋ชจ๋ธ์ ๋ค์ด๋ฒ ์ํ ๋ฆฌ๋ทฐ ๋ฐ์ดํฐ์
์์ 94%์ ์ ํ๋๋ฅผ ๊ฐ์ง๋๋ค(test set ๊ธฐ์ค).
|
173 |
+
์ธ์ด๊ฐ์ง๋ fasttext์ language detector๋ฅผ ์ฌ์ฉํ์๋ค. ๋ฆฌ๋ทฐ์ ๋จ์ด๋ณ ์ํฅ๋ ฅ์, ๋จ์ด ๊ฐ๊ฐ์ ๋ชจ๋ธ์ ๋ฃ์์ ๋ ๊ฒฐ๊ณผ๊ฐ ๊ธ์ ์ผ๋ก ๋์ค๋์ง ๋ถ์ ์ผ๋ก ๋์ค๋์ง๋ฅผ ๋ฐํ์ผ๋ก ์ธก์ ํ์์ต๋๋ค.
|
174 |
+
""")
|
175 |
+
|
176 |
with gr.Row():
|
177 |
with gr.Column():
|
178 |
+
inputs_1 = gr.Dropdown(choices=['์ธ์ด๊ฐ์ง ๊ธฐ๋ฅ ์ฌ์ฉ', 'Eng', 'Kor'], value='์ธ์ด๊ฐ์ง ๊ธฐ๋ฅ ์ฌ์ฉ', label='Lang')
|
179 |
inputs_2 = gr.Textbox(placeholder="๋ฆฌ๋ทฐ๋ฅผ ์
๋ ฅํ์์ค.", label='Text')
|
180 |
with gr.Row():
|
181 |
+
# btn2 = gr.Button("ํด๋ฆฌ์ด")
|
182 |
btn = gr.Button("์ ์ถํ๊ธฐ")
|
183 |
with gr.Column():
|
184 |
output_1 = gr.Label(num_top_classes=3, label='Lang')
|
|
|
186 |
output_3 = gr.HighlightedText(label="Analysis", combine_adjacent=False) \
|
187 |
.style(color_map={"+++": "#CF0000", "++": "#FF3232", "+": "#FFD4D4", "---": "#0004FE", "--": "#4C47FF", "-": "#BEBDFF"})
|
188 |
|
189 |
+
# btn2.click(fn=fn2, inputs=[None, None], output=[output_1, output_2, output_3])
|
190 |
btn.click(fn=builder, inputs=[inputs_1, inputs_2], outputs=[output_1, output_2, output_3])
|
191 |
+
gr.Examples(examples, inputs=[inputs_1, inputs_2])
|
192 |
|
193 |
+
|
194 |
+
|
195 |
if __name__ == "__main__":
|
196 |
+
# print(examples)
|
197 |
+
# demo.launch()
|
198 |
demo1.launch()
|