pablocst committed on
Commit
89fac38
1 Parent(s): b841298

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -42
app.py CHANGED
@@ -1,14 +1,5 @@
1
- # OCR Translate v0.2
2
- # 创建人:曾逸夫
3
- # 创建时间:2022-07-19
4
-
5
- import os
6
-
7
- #os.system("apt-get install xclip")
8
-
9
  import gradio as gr
10
  import nltk
11
- import pyclip
12
  import pytesseract
13
  from nltk.tokenize import sent_tokenize
14
  from transformers import MarianMTModel, MarianTokenizer
@@ -24,17 +15,32 @@ img_dir = "./data"
24
  # 获取tesseract语言列表
25
  choices = os.popen('tesseract --list-langs').read().split('\n')[1:-1]
26
 
 
 
 
 
 
 
27
 
28
- # 翻译模型选择
29
- def model_choice(src='trans_src', trg='trans_trg'):
30
- # https://huggingface.co/Helsinki-NLP/opus-mt-tc-big-pt-en
31
- # https://huggingface.co/Helsinki-NLP/opus-mt-tc-big-en-pt
32
- model_name = f"Helsinki-NLP/opus-mt-tc-big-{src}-{trg}" # 模型名称
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
- tokenizer = MarianTokenizer.from_pretrained(model_name) # 分词器
35
- model = MarianMTModel.from_pretrained(model_name) # 模型
36
 
37
- return tokenizer, model
38
 
39
 
40
  # tesseract语言列表转pytesseract语言
@@ -77,32 +83,7 @@ def cp_clear():
77
  pyclip.clear()
78
 
79
 
80
- # 翻译
81
- def translate(input_text, inputs_transStyle):
82
- # 参考:https://huggingface.co/docs/transformers/model_doc/marian
83
- if input_text is None or input_text == "":
84
- return "System prompt: There is no content to translate!"
85
-
86
- # 选择翻译模型
87
- trans_src, trans_trg = inputs_transStyle.split("-")[0], inputs_transStyle.split("-")[1]
88
- tokenizer, model = model_choice(trans_src, trans_trg)
89
-
90
- translate_text = ""
91
- input_text_list = input_text.split("\n\n")
92
-
93
- translate_text_list_tmp = []
94
- for i in range(len(input_text_list)):
95
- if input_text_list[i] != "":
96
- translate_text_list_tmp.append(input_text_list[i])
97
-
98
- for i in range(len(translate_text_list_tmp)):
99
- translated_sub = model.generate(
100
- **tokenizer(sent_tokenize(translate_text_list_tmp[i]), return_tensors="pt", truncation=True, padding=True))
101
- tgt_text_sub = [tokenizer.decode(t, skip_special_tokens=True) for t in translated_sub]
102
- translate_text_sub = "".join(tgt_text_sub)
103
- translate_text = translate_text + "\n\n" + translate_text_sub
104
 
105
- return translate_text[2:]
106
 
107
 
108
  def main():
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import nltk
 
3
  import pytesseract
4
  from nltk.tokenize import sent_tokenize
5
  from transformers import MarianMTModel, MarianTokenizer
 
# Get the list of languages reported by the tesseract command-line tool.
def _list_tesseract_langs():
    """Return the language codes printed by ``tesseract --list-langs``.

    The first line of the command's standard output (a header line) and the
    empty string left after the trailing newline are dropped. If the
    ``tesseract`` executable cannot be started, an empty list is returned
    instead of raising, so importing this module never fails because of it.
    """
    # Function-scope import keeps this block self-contained: it does not rely
    # on a module-level ``import os`` being present.
    import subprocess

    try:
        # Argument list (no shell) avoids shell parsing of the command line.
        result = subprocess.run(
            ["tesseract", "--list-langs"],
            capture_output=True,
            text=True,
        )
    except OSError:
        # Executable missing or not runnable: behave like empty output.
        return []
    return result.stdout.split("\n")[1:-1]


choices = _list_tesseract_langs()
17
 
# Loaded (tokenizer, model) pairs keyed by (src, trg), so each language pair
# is loaded with from_pretrained only once per process.
_MODEL_CACHE = {}


def model_choice(src, trg):
    """Return the Marian tokenizer and model for translating ``src`` to ``trg``.

    The checkpoint name is built as ``Helsinki-NLP/opus-mt-tc-big-{src}-{trg}``.
    The first call for a given language pair loads the tokenizer and model;
    later calls with the same pair return the same cached objects.

    Args:
        src: Source language code used in the checkpoint name.
        trg: Target language code used in the checkpoint name.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    key = (src, trg)
    if key not in _MODEL_CACHE:
        model_name = f"Helsinki-NLP/opus-mt-tc-big-{src}-{trg}"
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        # Store only after both loads succeed so a failed load is retried.
        _MODEL_CACHE[key] = (tokenizer, model)
    return _MODEL_CACHE[key]
24
 
def translate(input_text, trans_style):
    """Translate ``input_text`` sentence by sentence.

    Args:
        input_text: Text to translate. A falsy value (``None`` or ``""``)
            yields a fixed notice instead of a translation.
        trans_style: Language pair written as ``"<src>-<trg>"``; it must split
            on ``"-"`` into exactly two parts.

    Returns:
        The translated sentences joined by single spaces, with surrounding
        whitespace stripped, or the notice string for empty input.
    """
    if not input_text:
        return "System prompt: There is no content to translate!"

    source_lang, target_lang = trans_style.split("-")
    tokenizer, model = model_choice(source_lang, target_lang)

    # Translate one sentence at a time and collect the decoded outputs.
    pieces = []
    for sentence in sent_tokenize(input_text):
        token_ids = tokenizer.encode(sentence, return_tensors="pt", truncation=True, padding=True)
        generated = model.generate(token_ids)
        pieces.append(tokenizer.decode(generated[0], skip_special_tokens=True))

    return " ".join(pieces).strip()
42
 
 
 
43
 
 
44
 
45
 
46
  # tesseract语言列表转pytesseract语言
 
83
  pyclip.clear()
84
 
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
 
87
 
88
 
89
  def main():