Spaces:
Runtime error
Runtime error
cointegrated
committed on
Commit
•
d663592
1
Parent(s):
2306f2d
the first commit
Browse files- .gitignore +1 -0
- lid.323.ftz +3 -0
- main.py +164 -0
- requirements.txt +6 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
.idea
|
lid.323.ftz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:221ce285041b0511ebc78050426df2bc05b0d37bacf72891bb6f82c80284af37
|
3 |
+
size 2219896
|
main.py
ADDED
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import torch
|
3 |
+
import numpy as np
|
4 |
+
import fasttext
|
5 |
+
import os
|
6 |
+
import urllib
|
7 |
+
from transformers import MBartForConditionalGeneration, MBart50Tokenizer
|
8 |
+
|
9 |
+
|
10 |
+
MODEL_URL_MYV_MUL = 'slone/mbart-large-51-myv-mul-v1'
|
11 |
+
MODEL_URL_MUL_MYV = 'slone/mbart-large-51-mul-myv-v1'
|
12 |
+
MODEL_URL_LANGID = 'https://huggingface.co/slone/fastText-LID-323/resolve/main/lid.323.ftz'
|
13 |
+
MODEL_PATH_LANGID = 'lid.323.ftz'
|
14 |
+
|
15 |
+
|
16 |
+
lang_to_code = {
|
17 |
+
'Эрзянь | Erzya': 'myv_XX',
|
18 |
+
'Русский | Рузонь | Russian': 'ru_RU',
|
19 |
+
'Suomi | Суоминь | Finnish': 'fi_FI',
|
20 |
+
'Deutsch | Немецень | German': 'de_DE',
|
21 |
+
'Español | Испанонь | Spanish': 'es_XX',
|
22 |
+
'English | Англань ': 'en_XX',
|
23 |
+
'हिन्दी | Хинди | Hindi': 'hi_IN',
|
24 |
+
'漢語 | Китаень | Chinese': 'zh_CN',
|
25 |
+
'Türkçe | Турконь | Turkish': 'tr_TR',
|
26 |
+
'Українська | Украинань | Ukrainian': 'uk_UA',
|
27 |
+
'Français | Французонь | French': 'fr_XX',
|
28 |
+
'العربية | Арабонь | Arabic': 'ar_AR',
|
29 |
+
}
|
30 |
+
|
31 |
+
|
32 |
+
def fix_tokenizer(tokenizer, extra_lang='myv_XX'):
    """Add a new language id to a MBART 50 tokenizer (because it is not serialized) and shift the mask token id."""
    # Vocabulary size before the extra language code; len(tokenizer) already
    # counts extra_lang if it was previously registered as an added token.
    old_len = len(tokenizer) - int(extra_lang in tokenizer.added_tokens_encoder)
    # Register the new language code at the last vocabulary position.
    tokenizer.lang_code_to_id[extra_lang] = old_len-1
    tokenizer.id_to_lang_code[old_len-1] = extra_lang
    # Recompute the <mask> id so it lands after the (now larger) set of
    # language codes in the fairseq id layout.
    tokenizer.fairseq_tokens_to_ids["<mask>"] = len(tokenizer.sp_model) + len(tokenizer.lang_code_to_id) + tokenizer.fairseq_offset

    # Rebuild both fairseq lookup directions so the new code resolves.
    tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
    tokenizer.fairseq_ids_to_tokens = {v: k for k, v in tokenizer.fairseq_tokens_to_ids.items()}
    if extra_lang not in tokenizer._additional_special_tokens:
        tokenizer._additional_special_tokens.append(extra_lang)
    # Clear added-token cache so it cannot shadow the ids assigned above.
    # NOTE(review): this pokes private MBart50Tokenizer internals — confirm it
    # still works with the installed transformers version.
    tokenizer.added_tokens_encoder = {}
|
44 |
+
|
45 |
+
|
46 |
+
def translate(
        text, model, tokenizer,
        src='ru_RU',
        trg='myv_XX',
        max_length='auto',
        num_beams=3,
        repetition_penalty=5.0,
        train_mode=False, n_out=None,
        **kwargs
):
    """Translate `text` from language `src` into language `trg` with an MBART model.

    Returns a single string when `text` is a string and `n_out` is None,
    otherwise a list of decoded hypotheses. Extra `kwargs` are forwarded
    to `model.generate`.
    """
    tokenizer.src_lang = src
    batch = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
    # 'auto' scales the output budget with the input length.
    limit = int(32 + 1.5 * batch.input_ids.shape[1]) if max_length == 'auto' else max_length
    # train_mode keeps dropout active (e.g. for sampling-style generation).
    (model.train if train_mode else model.eval)()
    outputs = model.generate(
        **batch.to(model.device),
        forced_bos_token_id=tokenizer.lang_code_to_id[trg],
        max_length=limit,
        num_beams=num_beams,
        repetition_penalty=repetition_penalty,
        num_return_sequences=n_out or 1,
        **kwargs
    )
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return decoded[0] if isinstance(text, str) and n_out is None else decoded
|
78 |
+
|
79 |
+
|
80 |
+
def translate_rerank(
        text, model, tokenizer,
        src='ru_RU', trg='myv_XX', max_length='auto', num_beams=3, repetition_penalty=5.0, train_mode=False,
        n=5, diversity_penalty=3.0, lang='myv', max_score=0.3, order_penalty=0.01,
        verbose=False,
        **kwargs
):
    """Generate `n` diverse hypotheses and return the one that looks most like
    `lang` to the language-id model, with a small penalty for candidates the
    generator ranked lower."""
    # Diverse beam search: one beam group per requested hypothesis.
    candidates = translate(
        text, model, tokenizer, src, trg,
        max_length=max_length, train_mode=train_mode, repetition_penalty=repetition_penalty,
        num_beams=n,
        num_beam_groups=n,
        diversity_penalty=diversity_penalty,
        n_out=n,
        **kwargs
    )
    lang_scores = [get_mean_lang_score(c, lang=lang, max_score=max_score) for c in candidates]
    # Break near-ties in favour of earlier (higher-probability) candidates.
    ranked = np.asarray(lang_scores) - order_penalty * np.arange(n)
    if verbose:
        print(candidates)
        print(lang_scores)
        print(ranked)
    return candidates[np.argmax(ranked)]
|
103 |
+
|
104 |
+
|
105 |
+
def get_mean_lang_score(text, lang='myv', k=300, max_score=0.3):
    """Mean confidence that `text` is in `lang`, per the global fastText LID model.

    Each whitespace token plus the whole sentence is scored; every score is
    clipped at `max_score` and rescaled to [0, 1] before averaging.
    """
    units = text.split() + [text]
    all_labels, all_probs = langid_model.predict(units, k=k)
    per_unit = []
    for labels, probs in zip(all_labels, all_probs):
        # fastText labels look like '__label__xxx'; drop the 9-char prefix.
        table = {label[9:]: p for label, p in zip(labels, probs)}
        per_unit.append(min(table.get(lang, 0), max_score) / max_score)
    return np.mean(per_unit)
|
114 |
+
|
115 |
+
|
116 |
+
def translate_wrapper(text, src, trg):
    """Gradio callback: map UI language labels to MBART codes, pick the model
    for the requested direction, and translate.

    Returns the translation, or an instruction string when the language pair
    is unsupported (same language twice, or neither side is Erzya).
    """
    src_code = lang_to_code.get(src)
    trg_code = lang_to_code.get(trg)
    if src_code == trg_code:
        return 'Please choose two different languages'
    if src_code == 'myv_XX':
        model = model_myv_mul
    elif trg_code == 'myv_XX':
        model = model_mul_myv
    else:
        return 'Please translate to or from Erzya'
    print(text, src_code, trg_code)
    # LID-based reranking only helps when the output is Erzya.
    translate_fn = translate_rerank if trg_code == 'myv_XX' else translate
    return translate_fn(
        text=text,
        model=model,
        tokenizer=tokenizer,
        src=src_code,
        trg=trg_code,
    )
|
137 |
+
|
138 |
+
|
139 |
+
# Gradio UI: a text box plus source/target language dropdowns (defaulting to
# the first two entries of lang_to_code), wired to translate_wrapper; the
# single output is plain text.
interface = gr.Interface(
    translate_wrapper,
    [
        gr.Textbox(label="Text / текстэнь", lines=2, placeholder='text to translate / текстэнь ютавтозь '),
        gr.Dropdown(list(lang_to_code.keys()), type="value", label='source language / васень келесь', value=list(lang_to_code.keys())[0]),
        gr.Dropdown(list(lang_to_code.keys()), type="value", label='target language / эряви келесь', value=list(lang_to_code.keys())[1]),
    ],
    "text",
)
|
148 |
+
|
149 |
+
|
150 |
+
if __name__ == '__main__':
    # Load both translation directions (weights come from the HF hub).
    model_mul_myv = MBartForConditionalGeneration.from_pretrained(MODEL_URL_MUL_MYV)
    model_myv_mul = MBartForConditionalGeneration.from_pretrained(MODEL_URL_MYV_MUL)
    if torch.cuda.is_available():
        model_mul_myv.cuda()
        model_myv_mul.cuda()
    # Both models share one tokenizer; patch it to know the Erzya code.
    tokenizer = MBart50Tokenizer.from_pretrained(MODEL_URL_MYV_MUL)
    fix_tokenizer(tokenizer)

    if not os.path.exists(MODEL_PATH_LANGID):
        # BUGFIX: the file-level `import urllib` does not import the `request`
        # submodule, so `urllib.request.urlretrieve` could fail with
        # AttributeError unless some other library happened to import it first;
        # import the submodule explicitly before use.
        import urllib.request
        print('downloading LID model...')
        urllib.request.urlretrieve(MODEL_URL_LANGID, MODEL_PATH_LANGID)
    langid_model = fasttext.load_model(MODEL_PATH_LANGID)

    interface.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
transformers
|
2 |
+
fasttext
|
3 |
+
sentencepiece
|
4 |
+
numpy
|
5 |
+
# gradio
|
6 |
+
# torch
|