alphahg committed on
Commit
9b7fea7
β€’
1 Parent(s): d11769d

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +174 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# %%
import gc
import os

import gradio as gr
import nltk
import openai
from konlpy.tag import Kkma
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Punkt models are needed by sent_tokenize for English sentence splitting.
nltk.download('punkt')

# SECURITY: the original committed a literal OpenAI API key here. A key that
# has ever been committed is compromised and must be rotated; load it from the
# environment instead of the source tree.
openai.api_key = os.environ.get('OPENAI_API_KEY', '')

# GPT-2 tokenizer is used only to estimate token counts for the OpenAI call.
gpt2_tokenizer = AutoTokenizer.from_pretrained('gpt2')

# Model checkpoints: en<->ko translation and en/ko summarization.
#en2ko = 'alphahg/m2m100_418M-finetuned-en-to-ko-4770260'
en2ko = 'alphahg/mbart-large-50-finetuned-en-to-ko-8603428-finetuned-en-to-ko-9914408'
ko2en = 'alphahg/opus-mt-ko-en-finetuned-ko-to-en-2780616'
ensum = 'allenai/led-large-16384-arxiv'
kosum = 'alphahg/pko-t5-small-finetuned-paper-4564652'  # 'lcw99/t5-base-korean-text-summary'

kkma = Kkma()  # Korean morphological analyzer; used for Korean sentence splitting.

# The en->ko mBART weights are shared by two pipelines below.
en2ko_model = AutoModelForSeq2SeqLM.from_pretrained(en2ko)

en_pipe = pipeline('translation', model=en2ko_model, tokenizer=en2ko, src_lang = "en_XX", tgt_lang = "ko_KR", device="cuda:0")
ko_pipe = pipeline('translation', model=ko2en, tokenizer=ko2en, device="cuda:0")
# NOTE(review): ko_KR -> ko_KR through the translation model — presumably a
# paraphrase/"style transfer" trick; confirm this is intentional.
style_pipe = pipeline('translation', model=en2ko_model, tokenizer=en2ko, src_lang = "ko_KR", tgt_lang = "ko_KR", device="cuda:0")

# Summarizers live on a second GPU.
en_sum = pipeline('summarization', model=ensum, tokenizer=ensum, device="cuda:1")
ko_sum = pipeline('summarization', model=kosum, tokenizer=kosum, device="cuda:1")
37
+
38
def len_tokens(text, pipe):
    """Return the number of tokens *text* encodes to under *pipe*'s tokenizer."""
    encoded = pipe.tokenizer(text)
    return len(encoded['input_ids'])
40
+
41
def split_sent(sentences, pipe, max_len=256):
    """Greedily pack consecutive sentences into chunks of at most *max_len*
    tokens, measured with *pipe*'s tokenizer via len_tokens().

    A single sentence that alone exceeds *max_len* still forms its own chunk.
    Returns a list of chunk strings; empty input yields an empty list.
    """
    if not sentences:
        return []

    chunks = []
    current = sentences[0]
    for sent in sentences[1:]:
        candidate = current + ' ' + sent
        if len_tokens(candidate, pipe) > max_len:
            # Current chunk is full — flush it and start a new one.
            chunks.append(current)
            current = sent
        else:
            current = candidate

    chunks.append(current)
    return chunks
57
+
58
# chatbot = Chatbot({
#     "session_token": "<REDACTED — a real session token was committed here; revoke it and load secrets from the environment instead>"
# }, conversation_id=None, parent_id=None) # You can start a custom conversation
62
+ # %%
63
# %%
def translate(text, lang, gpt_fix=False):
    """Translate *text* between English and Korean.

    Parameters:
        text: source text; returns '' when it contains no sentences.
        lang: 'μ˜ν•œ' (en→ko) or 'ν•œμ˜' (ko→en). Any other value is treated
            as en→ko, matching the original boolean test.
        gpt_fix: when True, post-edit the translation with the OpenAI API.

    Returns the translated (and optionally GPT-polished) string.
    """
    from_en = lang != 'ν•œμ˜'  # everything except 'ν•œμ˜' translates en→ko
    # English is split with NLTK's sent_tokenize, Korean with Kkma.
    sentences = sent_tokenize(text) if from_en else kkma.sentences(text)
    if not sentences:
        return ''

    # The en→ko model gets a tighter token budget (180) than the default.
    paragraphs = split_sent(sentences, en_pipe, max_len=180) if from_en else split_sent(sentences, ko_pipe)

    # BUGFIX: the original loop variable shadowed the *text* parameter.
    pipe = en_pipe if from_en else ko_pipe
    parts = [pipe(chunk)[0]['translation_text'] for chunk in paragraphs]

    translated = ' '.join(parts)
    gc.collect()  # release intermediate buffers between requests

    if gpt_fix:
        if lang == 'ν•œμ˜':
            prompt = 'Improve given formal article without adding:'
        else:
            # BUGFIX: the original only set *prompt* for 'μ˜ν•œ'/'ν•œμ˜', so any
            # other *lang* raised UnboundLocalError below. Default to the
            # Korean prompt, consistent with from_en above.
            prompt = "좔가적인 λ‚΄μš©μ—†μ΄ 주어진 글을 κ°œμ„ ν•΄:"

        def fix_sent(sent):
            # Budget the completion length relative to the input size.
            number_of_tokens = len(gpt2_tokenizer(sent)['input_ids'])
            # NOTE(review): openai.Completion / text-davinci-003 is the legacy
            # completions API; migrate to chat completions when upgrading the
            # openai package.
            response = openai.Completion.create(
                model="text-davinci-003",
                prompt=prompt + '\n' + sent,
                temperature=0,
                max_tokens=number_of_tokens + 128,
                top_p=1.0,
                frequency_penalty=0.0,
                presence_penalty=0.0,
            )
            return response.choices[0].text.strip()

        translated = fix_sent(translated)

    return translated
109
+
110
+ #%%
111
def translate_with_sum(text, lang, gpt_fix=False):
    """Summarize and translate *text*.

    'μ˜ν•œ': summarize the English input first, then translate en→ko.
    'ν•œμ˜': translate ko→en first, then summarize the English translation.

    Parameters:
        text: source text; returns '' when it contains no sentences.
        lang: 'μ˜ν•œ' or 'ν•œμ˜' (any other value behaves like 'μ˜ν•œ' minus
            the pre-summarization step, matching the original logic).
        gpt_fix: accepted for interface parity with translate() but
            currently UNUSED — kept so existing callers don't break.

    Returns the summarized translation.
    """
    from_en = lang != 'ν•œμ˜'

    if lang == 'μ˜ν•œ':
        # Summarize before translating; target roughly half the input tokens.
        summary = en_sum(text, max_length=int(len_tokens(text, en_sum) / 2) + 32)
        text = summary[0]['summary_text']

    sentences = sent_tokenize(text) if from_en else kkma.sentences(text)
    if not sentences:
        return ''

    pipe = en_pipe if from_en else ko_pipe
    paragraphs = split_sent(sentences, pipe)

    # BUGFIX: the original loop variable shadowed the *text* parameter.
    parts = [pipe(chunk)[0]['translation_text'] for chunk in paragraphs]
    translated = ' '.join(parts)
    gc.collect()  # original skipped this on the 'ν•œμ˜' path; harmless to run always

    if lang == 'ν•œμ˜':
        # Translate first, then summarize the English translation.
        summary = en_sum(translated, max_length=int(len_tokens(translated, en_sum) / 2) + 32)
        return summary[0]['summary_text']

    return translated
138
+
139
def summarize(text, lang):
    """Summarize *text* with the Korean or English summarization pipeline.

    Parameters:
        text: text to summarize.
        lang: 'Korean' or 'English'.

    Raises:
        ValueError: for any other *lang* value. (BUGFIX: the original left
        *summarizer* unbound and crashed with UnboundLocalError instead.)
    """
    if lang == 'Korean':
        summarizer = ko_sum
    elif lang == 'English':
        summarizer = en_sum
    else:
        raise ValueError(f"unsupported language: {lang!r}")

    # Target roughly 70% of the input token count for the summary length.
    summary = summarizer(text, max_length=int(len_tokens(text, summarizer) * 0.7))[0]['summary_text']
    return summary
147
+
148
def translate_styleonly(text):
    """Rewrite Korean *text* through the ko→ko "style" translation pipeline.

    The input is split into sentences with Kkma, packed into <=180-token
    chunks, passed through style_pipe, and the chunk outputs are re-joined
    with single spaces.
    """
    chunks = split_sent(kkma.sentences(text), style_pipe, max_len=180)

    outputs = []
    for chunk in chunks:
        styled = style_pipe(chunk)
        outputs.append(styled[0]['translation_text'])

    result = ' '.join(outputs)
    gc.collect()  # release intermediate buffers between requests
    return result
162
+
163
# %%
# Gradio UI wiring.
# NOTE(review): batch=True tells Gradio to pass *lists* of inputs to fn, but
# translate()/translate_with_sum()/summarize() above are written for single
# strings — confirm batching actually works before relying on max_batch_size.
interface1 = gr.Interface(fn=translate, inputs=["text", gr.Radio(["μ˜ν•œ", "ν•œμ˜"], value='μ˜ν•œ'), 'checkbox'], outputs="text", batch=True, max_batch_size=8)
interface2 = gr.Interface(fn=translate_with_sum, inputs=["text", gr.Radio(["μ˜ν•œ", "ν•œμ˜"], value='μ˜ν•œ')], outputs="text", batch=True, max_batch_size=8)
# Run plain translation and summarize-and-translate side by side on one input.
parallel_interface = gr.Parallel(interface1, interface2)

summarize_interface = gr.Interface(fn=summarize, inputs=["text", gr.Radio(["Korean", "English"], value='Korean')], outputs="text", batch=True, max_batch_size=8)
style_interface = gr.Interface(fn=translate_styleonly, inputs=["text"], outputs="text", batch=True, max_batch_size=8)
#%%
# Tab labels (Korean): 'translate & summarize', 'summarize', 'style translate'.
demo = gr.TabbedInterface([parallel_interface, summarize_interface, style_interface], ['λ²ˆμ—­ 및 μš”μ•½', 'μš”μ•½', 'μŠ€νƒ€μΌ λ²ˆμ—­'], css="footer {visibility: hidden}") # 'μš”μ•½'
demo.queue()
demo.launch(share=True) # Share the demo
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ nltk
2
+ konlpy
3
+ openai