sigmadream committed on
Commit
123ce6b
•
1 Parent(s): a06c62b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -5
app.py CHANGED
@@ -8,19 +8,23 @@ import numpy as np
8
  import pandas as pd
9
  import torch
10
 
 
11
  id2label = {0: "NEGATIVE", 1: "POSITIVE"}
12
  label2id = {"NEGATIVE": 0, "POSITIVE": 1}
13
 
14
- title = "한국어/영어 감정 분석 예제(네이버 영화 리뷰를 활용)"
15
- description = "영화평을 입력하여 긍정적인지 부정적인지를 분류하는 모델입니다."
16
 
 
 
 
 
 
17
  class LanguageIdentification:
18
  def __init__(self):
19
  pretrained_lang_model = "./lid.176.ftz"
20
  self.model = fasttext.load_model(pretrained_lang_model)
21
 
22
  def predict_lang(self, text):
23
- predictions = self.model.predict(text, k=200)
24
  return predictions
25
 
26
  LANGUAGE = LanguageIdentification()
@@ -36,6 +40,17 @@ def tokenized_data(tokenizer, inputs):
36
  truncation=True)
37
 
38
 
 
 
 
 
 
 
 
 
 
 
 
39
  eng_model_name = "roberta-base"
40
  eng_step = 1900
41
  eng_tokenizer = AutoTokenizer.from_pretrained(eng_model_name)
@@ -62,6 +77,8 @@ def builder(Lang, Text):
62
  percent_kor, percent_eng = 0, 0
63
  text_list = Text.split(' ')
64
 
 
 
65
  if Lang == '언어감지 기능 사용':
66
  pred = LANGUAGE.predict_lang(Text)
67
  if '__label__en' in pred[0]:
@@ -72,6 +89,7 @@ def builder(Lang, Text):
72
  Lang = 'Kor'
73
  idx = pred[0].index('__label__ko')
74
  p_kor = pred[1][idx]
 
75
  percent_kor = p_kor / (p_kor+p_eng)
76
  percent_eng = p_eng / (p_kor+p_eng)
77
 
@@ -84,7 +102,9 @@ def builder(Lang, Text):
84
  model = kor_model
85
  tokenizer = kor_tokenizer
86
  if percent_kor==0: percent_kor=1
 
87
 
 
88
  inputs = tokenized_data(tokenizer, Text)
89
  model.eval()
90
  with torch.no_grad():
@@ -93,7 +113,10 @@ def builder(Lang, Text):
93
 
94
  m = torch.nn.Softmax(dim=1)
95
  output = m(logits)
 
 
96
 
 
97
  output_analysis = []
98
  for word in text_list:
99
  tokenized_word = tokenized_data(tokenizer, word)
@@ -116,10 +139,12 @@ def builder(Lang, Text):
116
  else:
117
  output_analysis.append( (word, None) )
118
 
 
119
  return [ {'Kor': percent_kor, 'Eng': percent_eng},
120
  {id2label[1]: output[0][1].item(), id2label[0]: output[0][0].item()},
121
  output_analysis ]
122
 
 
123
  return id2label[prediction.item()]
124
 
125
 
@@ -127,15 +152,33 @@ with gr.Blocks() as demo1:
127
  gr.Markdown(
128
  """
129
  <h1 align="center">
130
- 한국어/영어 감정 분석 예제
131
  </h1>
132
  """)
133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  with gr.Row():
135
  with gr.Column():
136
- inputs_1 = gr.Dropdown(choices=['언어감지', 'Eng', 'Kor'], value='언어감지 기능 사용', label='Lang')
137
  inputs_2 = gr.Textbox(placeholder="리뷰를 입력하시오.", label='Text')
138
  with gr.Row():
 
139
  btn = gr.Button("제출하기")
140
  with gr.Column():
141
  output_1 = gr.Label(num_top_classes=3, label='Lang')
@@ -143,7 +186,13 @@ with gr.Blocks() as demo1:
143
  output_3 = gr.HighlightedText(label="Analysis", combine_adjacent=False) \
144
  .style(color_map={"+++": "#CF0000", "++": "#FF3232", "+": "#FFD4D4", "---": "#0004FE", "--": "#4C47FF", "-": "#BEBDFF"})
145
 
 
146
  btn.click(fn=builder, inputs=[inputs_1, inputs_2], outputs=[output_1, output_2, output_3])
 
147
 
 
 
148
  if __name__ == "__main__":
 
 
149
  demo1.launch()
 
8
  import pandas as pd
9
  import torch
10
 
11
+
12
  id2label = {0: "NEGATIVE", 1: "POSITIVE"}
13
  label2id = {"NEGATIVE": 0, "POSITIVE": 1}
14
 
 
 
15
 
16
+ title = "한국어/영어 감정 분석 예제(네이버 영화 리뷰를 활용)"
17
+ description = "영화평을 입력하여 긍정적인지 부정적인지를 분류하는 모델입니다. \
18
+ 한국어인지 영어인지 판단하고 예측해주는 ""Default""라는 버전도 제공합니다." \
19
+ 한국어 버전과 영어 버전 중에서 선택할 수 있습니다.
20
+
21
  class LanguageIdentification:
22
  def __init__(self):
23
  pretrained_lang_model = "./lid.176.ftz"
24
  self.model = fasttext.load_model(pretrained_lang_model)
25
 
26
  def predict_lang(self, text):
27
+ predictions = self.model.predict(text, k=200) # returns top 200 matching languages
28
  return predictions
29
 
30
  LANGUAGE = LanguageIdentification()
 
40
  truncation=True)
41
 
42
 
43
+ examples = []
44
+ df = pd.read_csv('examples.csv', sep='\t', index_col='Unnamed: 0')
45
+ np.random.seed(100)
46
+
47
+ idx = np.random.choice(50, size=5, replace=False)
48
+ eng_examples = [ ['Eng', df.iloc[i, 0]] for i in idx ]
49
+ kor_examples = [ ['Kor', df.iloc[i, 1]] for i in idx ]
50
+ examples = eng_examples + kor_examples
51
+
52
+
53
+
54
  eng_model_name = "roberta-base"
55
  eng_step = 1900
56
  eng_tokenizer = AutoTokenizer.from_pretrained(eng_model_name)
 
77
  percent_kor, percent_eng = 0, 0
78
  text_list = Text.split(' ')
79
 
80
+
81
+ # [ output_1 ]
82
  if Lang == '언어감지 기능 사용':
83
  pred = LANGUAGE.predict_lang(Text)
84
  if '__label__en' in pred[0]:
 
89
  Lang = 'Kor'
90
  idx = pred[0].index('__label__ko')
91
  p_kor = pred[1][idx]
92
+ # Normalize Percentage
93
  percent_kor = p_kor / (p_kor+p_eng)
94
  percent_eng = p_eng / (p_kor+p_eng)
95
 
 
102
  model = kor_model
103
  tokenizer = kor_tokenizer
104
  if percent_kor==0: percent_kor=1
105
+
106
 
107
+ # [ output_2 ]
108
  inputs = tokenized_data(tokenizer, Text)
109
  model.eval()
110
  with torch.no_grad():
 
113
 
114
  m = torch.nn.Softmax(dim=1)
115
  output = m(logits)
116
+ # print(logits, output)
117
+
118
 
119
+ # [ output_3 ]
120
  output_analysis = []
121
  for word in text_list:
122
  tokenized_word = tokenized_data(tokenizer, word)
 
139
  else:
140
  output_analysis.append( (word, None) )
141
 
142
+
143
  return [ {'Kor': percent_kor, 'Eng': percent_eng},
144
  {id2label[1]: output[0][1].item(), id2label[0]: output[0][0].item()},
145
  output_analysis ]
146
 
147
+ # prediction = torch.argmax(logits, axis=1)
148
  return id2label[prediction.item()]
149
 
150
 
 
152
  gr.Markdown(
153
  """
154
  <h1 align="center">
155
+ 한국어/영어 감정 분석 예제(네이버 영화 리뷰를 활용)
156
  </h1>
157
  """)
158
 
159
+ gr.Markdown(
160
+ """
161
+ 영화 리뷰를 입력하면, 긍정적인 감정인지 부정적인 감정인지 판별하는 모델입니다. \
162
+ 영어와 한글을 지원하며, 언어를 직접 선택할수도, 혹은 모델이 언어감지를 직접 하도록 할 수 있습니다.
163
+ 리뷰를 입력하면, (1) 감지된 언어, (2) 긍정 리뷰일 확률과 부정 리뷰일 확률, (3) 입력된 리뷰의 어느 단어가 긍정/부정 결정에 영향을 주었는지 \
164
+ (긍정일 경우 빨강색, 부정일 경우 파란색)를 확인할 수 있습니다.
165
+ """)
166
+
167
+ with gr.Accordion(label="모델에 대한 설명 ( 여기를 클릭 하시오. )", open=False):
168
+ gr.Markdown(
169
+ """
170
+ ์˜์–ด ๋ชจ๋ธ์€ bert-base-uncased ๊ธฐ๋ฐ˜์œผ๋กœ, ์˜์–ด ์˜ํ™” ๋ฆฌ๋ทฐ ๋ถ„์„ ๋ฐ์ดํ„ฐ์…‹์ธ SST-2๋กœ ํ•™์Šต ๋ฐ ํ‰๊ฐ€๋˜์—ˆ์Šต๋‹ˆ๋‹ค.
171
+ ํ•œ๊ธ€ ๋ชจ๋ธ์€ klue/roberta-base ๊ธฐ๋ฐ˜์ด๋‹ค. ๊ธฐ์กด ํ•œ๊ธ€ ์˜ํ™” ๋ฆฌ๋ทฐ ๋ถ„์„ ๋ฐ์ดํ„ฐ์…‹์ด ์กด์žฌํ•˜์ง€ ์•Š์•„, ๋„ค์ด๋ฒ„ ์˜ํ™”์˜ ๋ฆฌ๋ทฐ๋ฅผ ํฌ๋กค๋งํ•ด์„œ ์˜ํ™” ๋ฆฌ๋ทฐ ๋ถ„์„ ๋ฐ์ดํ„ฐ์…‹์„ ์ œ์ž‘ํ•˜๊ณ , ์ด๋ฅผ ์ด์šฉํ•˜์—ฌ ๋ชจ๋ธ์„ ํ•™์Šต ๋ฐ ํ‰๊ฐ€ํ•˜์˜€์Šต๋‹ˆ๋‹ค.
172
+ ์˜์–ด ๋ชจ๋ธ์€ SST-2์—์„œ 92.8%, ํ•œ๊ธ€ ๋ชจ๋ธ์€ ๋„ค์ด๋ฒ„ ์˜ํ™” ๋ฆฌ๋ทฐ ๋ฐ์ดํ„ฐ์…‹์—์„œ 94%์˜ ์ •ํ™•๋„๋ฅผ ๊ฐ€์ง‘๋‹ˆ๋‹ค(test set ๊ธฐ์ค€).
173
+ ์–ธ์–ด๊ฐ์ง€๋Š” fasttext์˜ language detector๋ฅผ ์‚ฌ์šฉํ•˜์˜€๋‹ค. ๋ฆฌ๋ทฐ์˜ ๋‹จ์–ด๋ณ„ ์˜ํ–ฅ๋ ฅ์€, ๋‹จ์–ด ๊ฐ๊ฐ์„ ๋ชจ๋ธ์— ๋„ฃ์—ˆ์„ ๋•Œ ๊ฒฐ๊ณผ๊ฐ€ ๊ธ์ •์œผ๋กœ ๋‚˜์˜ค๋Š”์ง€ ๋ถ€์ •์œผ๋กœ ๋‚˜์˜ค๋Š”์ง€๋ฅผ ๋ฐ”ํƒ•์œผ๋กœ ์ธก์ •ํ•˜์˜€์Šต๋‹ˆ๋‹ค.
174
+ """)
175
+
176
  with gr.Row():
177
  with gr.Column():
178
+ inputs_1 = gr.Dropdown(choices=['언어감지 기능 사용', 'Eng', 'Kor'], value='언어감지 기능 사용', label='Lang')
179
  inputs_2 = gr.Textbox(placeholder="리뷰를 입력하시오.", label='Text')
180
  with gr.Row():
181
+ # btn2 = gr.Button("클리어")
182
  btn = gr.Button("제출하기")
183
  with gr.Column():
184
  output_1 = gr.Label(num_top_classes=3, label='Lang')
 
186
  output_3 = gr.HighlightedText(label="Analysis", combine_adjacent=False) \
187
  .style(color_map={"+++": "#CF0000", "++": "#FF3232", "+": "#FFD4D4", "---": "#0004FE", "--": "#4C47FF", "-": "#BEBDFF"})
188
 
189
+ # btn2.click(fn=fn2, inputs=[None, None], output=[output_1, output_2, output_3])
190
  btn.click(fn=builder, inputs=[inputs_1, inputs_2], outputs=[output_1, output_2, output_3])
191
+ gr.Examples(examples, inputs=[inputs_1, inputs_2])
192
 
193
+
194
+
195
  if __name__ == "__main__":
196
+ # print(examples)
197
+ # demo.launch()
198
  demo1.launch()