Spaces:

sigmadream
/

ko-review

Sleeping

App Files Community

sigmadream committed on Oct 20, 2023

Commit

123ce6b

1 Parent(s): a06c62b

Update app.py

Browse files

Files changed (1) hide show

app.py +54 -5

app.py CHANGED Viewed

@@ -8,19 +8,23 @@ import numpy as np
 import pandas as pd
 import torch
 id2label = {0: "NEGATIVE", 1: "POSITIVE"}
 label2id = {"NEGATIVE": 0, "POSITIVE": 1}
-title = "한국어/영어 감정 분석 예제(네이버 영화 리뷰를 활용)"
-description = "영화평을 입력하여 긍정적인지 부정적인지를 분류하는 모델입니다."
 class LanguageIdentification:
     def __init__(self):
         pretrained_lang_model = "./lid.176.ftz"
         self.model = fasttext.load_model(pretrained_lang_model)
     def predict_lang(self, text):
-        predictions = self.model.predict(text, k=200)
         return predictions
 LANGUAGE = LanguageIdentification()
@@ -36,6 +40,17 @@ def tokenized_data(tokenizer, inputs):
         truncation=True)
 eng_model_name = "roberta-base"
 eng_step = 1900
 eng_tokenizer = AutoTokenizer.from_pretrained(eng_model_name)
@@ -62,6 +77,8 @@ def builder(Lang, Text):
     percent_kor, percent_eng = 0, 0
     text_list = Text.split(' ')
     if Lang == '언어감지 기능 사용':
         pred = LANGUAGE.predict_lang(Text)
         if '__label__en' in pred[0]:
@@ -72,6 +89,7 @@ def builder(Lang, Text):
             Lang = 'Kor'
             idx = pred[0].index('__label__ko')
             p_kor = pred[1][idx]
         percent_kor = p_kor / (p_kor+p_eng)
         percent_eng = p_eng / (p_kor+p_eng)
@@ -84,7 +102,9 @@ def builder(Lang, Text):
         model = kor_model
         tokenizer = kor_tokenizer
         if percent_kor==0: percent_kor=1
     inputs = tokenized_data(tokenizer, Text)
     model.eval()
     with torch.no_grad():
@@ -93,7 +113,10 @@ def builder(Lang, Text):
     m = torch.nn.Softmax(dim=1)
     output = m(logits)
     output_analysis = []
     for word in text_list:
         tokenized_word = tokenized_data(tokenizer, word)
@@ -116,10 +139,12 @@ def builder(Lang, Text):
         else:
             output_analysis.append( (word, None) )
     return [ {'Kor': percent_kor, 'Eng': percent_eng},
             {id2label[1]: output[0][1].item(), id2label[0]: output[0][0].item()},
             output_analysis ]
     return id2label[prediction.item()]
@@ -127,15 +152,33 @@ with gr.Blocks() as demo1:
     gr.Markdown(
     """
     <h1 align="center">
-    한국어/영어 감정 분석 예제
     </h1>
     """)
     with gr.Row():
         with gr.Column():
-            inputs_1 = gr.Dropdown(choices=['언어감지', 'Eng', 'Kor'], value='언어감지 기능 사용', label='Lang')
             inputs_2 = gr.Textbox(placeholder="리뷰를 입력하시오.", label='Text')
             with gr.Row():
                 btn = gr.Button("제출하기")
         with gr.Column():
             output_1 = gr.Label(num_top_classes=3, label='Lang')
@@ -143,7 +186,13 @@ with gr.Blocks() as demo1:
             output_3 = gr.HighlightedText(label="Analysis", combine_adjacent=False) \
                 .style(color_map={"+++": "#CF0000", "++": "#FF3232", "+": "#FFD4D4", "---": "#0004FE", "--": "#4C47FF", "-": "#BEBDFF"})
     btn.click(fn=builder, inputs=[inputs_1, inputs_2], outputs=[output_1, output_2, output_3])
 if __name__ == "__main__":
     demo1.launch()

 import pandas as pd
 import torch
 id2label = {0: "NEGATIVE", 1: "POSITIVE"}
 label2id = {"NEGATIVE": 0, "POSITIVE": 1}
+title = "한국어/영어 감정 분석 예제(네이버 영화 리뷰를 활용)"
+description = "영화평을 입력하여 긍정적인지 부정적인지를 분류하는 모델입니다. \
+                한국어인지 영어인지 판단하고 예측해주는 ""Default""라는 버전도 제공합니다." \
+                한국어 버전과 영어 버전 중에서 선택할 수 있습니다.
 class LanguageIdentification:
     def __init__(self):
         pretrained_lang_model = "./lid.176.ftz"
         self.model = fasttext.load_model(pretrained_lang_model)
     def predict_lang(self, text):
+        predictions = self.model.predict(text, k=200) # returns top 200 matching languages
         return predictions
 LANGUAGE = LanguageIdentification()
         truncation=True)
+examples = []
+df = pd.read_csv('examples.csv', sep='\t', index_col='Unnamed: 0')
+np.random.seed(100)
+idx = np.random.choice(50, size=5, replace=False)
+eng_examples = [ ['Eng', df.iloc[i, 0]] for i in idx ]
+kor_examples = [ ['Kor', df.iloc[i, 1]] for i in idx ]
+examples = eng_examples + kor_examples
 eng_model_name = "roberta-base"
 eng_step = 1900
 eng_tokenizer = AutoTokenizer.from_pretrained(eng_model_name)
     percent_kor, percent_eng = 0, 0
     text_list = Text.split(' ')
+    # [ output_1 ]
     if Lang == '언어감지 기능 사용':
         pred = LANGUAGE.predict_lang(Text)
         if '__label__en' in pred[0]:
             Lang = 'Kor'
             idx = pred[0].index('__label__ko')
             p_kor = pred[1][idx]
+        # Normalize Percentage
         percent_kor = p_kor / (p_kor+p_eng)
         percent_eng = p_eng / (p_kor+p_eng)
         model = kor_model
         tokenizer = kor_tokenizer
         if percent_kor==0: percent_kor=1
+    # [ output_2 ]
     inputs = tokenized_data(tokenizer, Text)
     model.eval()
     with torch.no_grad():
     m = torch.nn.Softmax(dim=1)
     output = m(logits)
+    # print(logits, output)
+    # [ output_3 ]
     output_analysis = []
     for word in text_list:
         tokenized_word = tokenized_data(tokenizer, word)
         else:
             output_analysis.append( (word, None) )
     return [ {'Kor': percent_kor, 'Eng': percent_eng},
             {id2label[1]: output[0][1].item(), id2label[0]: output[0][0].item()},
             output_analysis ]
+    # prediction = torch.argmax(logits, axis=1)
     return id2label[prediction.item()]
     gr.Markdown(
     """
     <h1 align="center">
+    한국어/영어 감정 분석 예제(네이버 영화 리뷰를 활용)
     </h1>
     """)
+    gr.Markdown(
+    """
+    영화 리뷰를 입력하면, 긍정적인 감정인지 부정적인 감정인지 판별하는 모델입니다. \
+    영어와 한글을 지원하며, 언어를 직접 선택할수도, 혹은 모델이 언어감지를 직접 하도록 할 수 있습니다.
+    리뷰를 입력하면, (1) 감지된 언어, (2) 긍정 리뷰일 확률과 부정 리뷰일 확률, (3) 입력된 리뷰의 어느 단어가 긍정/부정 결정에 영향을 주었는지 \
+    (긍정일 경우 빨강색, 부정일 경우 파란색)를 확인할 수 있습니다.
+    """)
+    with gr.Accordion(label="모델에 대한 설명 ( 여기를 클릭 하시오. )", open=False):
+        gr.Markdown(
+        """
+        영어 모델은 bert-base-uncased 기반으로, 영어 영화 리뷰 분석 데이터셋인 SST-2로 학습 및 평가되었습니다.
+        한글 모델은 klue/roberta-base 기반이다. 기존 한글 영화 리뷰 분석 데이터셋이 존재하지 않아, 네이버 영화의 리뷰를 크롤링해서 영화 리뷰 분석 데이터셋을 제작하고, 이를 이용하여 모델을 학습 및 평가하였습니다.
+        영어 모델은 SST-2에서 92.8%, 한글 모델은 네이버 영화 리뷰 데이터셋에서 94%의 정확도를 가집니다(test set 기준).
+        언어감지는 fasttext의 language detector를 사용하였다. 리뷰의 단어별 영향력은, 단어 각각을 모델에 넣었을 때 결과가 긍정으로 나오는지 부정으로 나오는지를 바탕으로 측정하였습니다.
+        """)
     with gr.Row():
         with gr.Column():
+            inputs_1 = gr.Dropdown(choices=['언어감지 기능 사용', 'Eng', 'Kor'], value='언어감지 기능 사용', label='Lang')
             inputs_2 = gr.Textbox(placeholder="리뷰를 입력하시오.", label='Text')
             with gr.Row():
+                # btn2 = gr.Button("클리어")
                 btn = gr.Button("제출하기")
         with gr.Column():
             output_1 = gr.Label(num_top_classes=3, label='Lang')
             output_3 = gr.HighlightedText(label="Analysis", combine_adjacent=False) \
                 .style(color_map={"+++": "#CF0000", "++": "#FF3232", "+": "#FFD4D4", "---": "#0004FE", "--": "#4C47FF", "-": "#BEBDFF"})
+    # btn2.click(fn=fn2, inputs=[None, None], output=[output_1, output_2, output_3])
     btn.click(fn=builder, inputs=[inputs_1, inputs_2], outputs=[output_1, output_2, output_3])
+    gr.Examples(examples, inputs=[inputs_1, inputs_2])
 if __name__ == "__main__":
+    # print(examples)
+    # demo.launch()
     demo1.launch()