cointegrated committed on
Commit
d663592
1 Parent(s): 2306f2d

the first commit

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. lid.323.ftz +3 -0
  3. main.py +164 -0
  4. requirements.txt +6 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .idea
lid.323.ftz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:221ce285041b0511ebc78050426df2bc05b0d37bacf72891bb6f82c80284af37
3
+ size 2219896
main.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import urllib
import urllib.request  # explicit: 'import urllib' alone does not load the 'request' submodule

import fasttext
import gradio as gr
import numpy as np
import torch
from transformers import MBartForConditionalGeneration, MBart50Tokenizer
# Hugging Face Hub ids of the two translation models (Erzya -> many, many -> Erzya).
MODEL_URL_MYV_MUL = 'slone/mbart-large-51-myv-mul-v1'
MODEL_URL_MUL_MYV = 'slone/mbart-large-51-mul-myv-v1'
# fastText language-identification model: download URL and local cache filename.
MODEL_URL_LANGID = 'https://huggingface.co/slone/fastText-LID-323/resolve/main/lid.323.ftz'
MODEL_PATH_LANGID = 'lid.323.ftz'
# Maps UI dropdown labels (Erzya / native / English names) to mBART-50 language
# codes. 'myv_XX' is a non-standard code added to the tokenizer by
# fix_tokenizer(); the rest are stock mBART-50 codes.
lang_to_code = {
    'Эрзянь | Erzya': 'myv_XX',
    'Русский | Рузонь | Russian': 'ru_RU',
    'Suomi | Суоминь | Finnish': 'fi_FI',
    'Deutsch | Немецень | German': 'de_DE',
    'Español | Испанонь | Spanish': 'es_XX',
    'English | Англань ': 'en_XX',
    'हिन्दी | Хинди | Hindi': 'hi_IN',
    '漢語 | Китаень | Chinese': 'zh_CN',
    'Türkçe | Турконь | Turkish': 'tr_TR',
    'Українська | Украинань | Ukrainian': 'uk_UA',
    'Français | Французонь | French': 'fr_XX',
    'العربية | Арабонь | Arabic': 'ar_AR',
}
def fix_tokenizer(tokenizer, extra_lang='myv_XX'):
    """Add a new language id to a MBART 50 tokenizer (because it is not serialized) and shift the mask token id."""
    # Vocabulary size before extra_lang was appended; subtract 1 if a previous
    # call already registered it in added_tokens_encoder, so the function is
    # idempotent with respect to the computed id.
    old_len = len(tokenizer) - int(extra_lang in tokenizer.added_tokens_encoder)
    # Assign the last existing id slot to the new language code, both ways.
    tokenizer.lang_code_to_id[extra_lang] = old_len-1
    tokenizer.id_to_lang_code[old_len-1] = extra_lang
    # Recompute the <mask> id from the enlarged language-code table
    # (presumably the mBART-50 id layout is: sp_model vocab, then language
    # codes, then <mask> — TODO confirm against MBart50Tokenizer source).
    tokenizer.fairseq_tokens_to_ids["<mask>"] = len(tokenizer.sp_model) + len(tokenizer.lang_code_to_id) + tokenizer.fairseq_offset

    # Rebuild the fairseq id mappings so they include the new language code.
    tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
    tokenizer.fairseq_ids_to_tokens = {v: k for k, v in tokenizer.fairseq_tokens_to_ids.items()}
    # Expose the new code as a special token; clearing added_tokens_encoder
    # prevents it from being double-counted as an "added" token.
    if extra_lang not in tokenizer._additional_special_tokens:
        tokenizer._additional_special_tokens.append(extra_lang)
    tokenizer.added_tokens_encoder = {}
def translate(
    text, model, tokenizer,
    src='ru_RU',
    trg='myv_XX',
    max_length='auto',
    num_beams=3,
    repetition_penalty=5.0,
    train_mode=False, n_out=None,
    **kwargs
):
    """Translate `text` from language `src` to language `trg` with an mBART model.

    Returns a single string when `text` is a string and `n_out` is None;
    otherwise returns the list of decoded hypotheses. Extra `kwargs` are
    forwarded to `model.generate`.
    """
    tokenizer.src_lang = src
    batch = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
    # Scale the generation budget with the input length unless set explicitly.
    if max_length == 'auto':
        max_length = int(32 + 1.5 * batch.input_ids.shape[1])
    # train_mode keeps dropout active (e.g. for sampling-style experiments).
    (model.train if train_mode else model.eval)()
    generated = model.generate(
        **batch.to(model.device),
        forced_bos_token_id=tokenizer.lang_code_to_id[trg],
        max_length=max_length,
        num_beams=num_beams,
        repetition_penalty=repetition_penalty,
        num_return_sequences=n_out or 1,
        **kwargs
    )
    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    wants_single = isinstance(text, str) and n_out is None
    return decoded[0] if wants_single else decoded
def translate_rerank(
    text, model, tokenizer,
    src='ru_RU', trg='myv_XX', max_length='auto', num_beams=3, repetition_penalty=5.0, train_mode=False,
    n=5, diversity_penalty=3.0, lang='myv', max_score=0.3, order_penalty=0.01,
    verbose=False,
    **kwargs
):
    """Generate `n` diverse hypotheses and keep the one that looks most like `lang`.

    Uses diverse beam search (`n` groups), scores each candidate with the
    language-identification model, and applies a small penalty that grows with
    the candidate's rank so earlier (higher-likelihood) hypotheses win ties.
    NOTE(review): the `num_beams` argument is accepted but unused here —
    generation always runs with `n` beams/groups; kept for signature parity.
    """
    candidates = translate(
        text, model, tokenizer, src, trg,
        max_length=max_length, train_mode=train_mode, repetition_penalty=repetition_penalty,
        num_beams=n,
        num_beam_groups=n,
        diversity_penalty=diversity_penalty,
        n_out=n,
        **kwargs
    )
    lid_scores = [get_mean_lang_score(cand, lang=lang, max_score=max_score) for cand in candidates]
    penalized = lid_scores - order_penalty * np.arange(n)
    if verbose:
        print(candidates)
        print(lid_scores)
        print(penalized)
    return candidates[np.argmax(penalized)]
def get_mean_lang_score(text, lang='myv', k=300, max_score=0.3):
    """Mean LID confidence that each word (and the full text) is in `lang`.

    Relies on the module-level `langid_model` fastText model loaded in
    __main__. Predicted labels carry a 9-character prefix (presumably
    '__label__' — TODO confirm) that is stripped before lookup. Each
    probability is clipped at `max_score` and rescaled to [0, 1].
    """
    units = text.split() + [text]
    all_labels, all_probs = langid_model.predict(units, k=k)
    per_unit = []
    for labels, probs in zip(all_labels, all_probs):
        prob_by_lang = dict(zip([label[9:] for label in labels], probs))
        clipped = min(prob_by_lang.get(lang, 0), max_score)
        per_unit.append(clipped / max_score)
    return np.mean(per_unit)
def translate_wrapper(text, src, trg):
    """Gradio callback: map UI labels to language codes and run translation.

    Relies on the module-level models and tokenizer loaded in __main__.
    Returns either the translation or an instructional message string.
    """
    src_code = lang_to_code.get(src)
    trg_code = lang_to_code.get(trg)
    if src_code == trg_code:
        return 'Please choose two different languages'
    if src_code == 'myv_XX':
        model = model_myv_mul
    elif trg_code == 'myv_XX':
        model = model_mul_myv
    else:
        return 'Please translate to or from Erzya'
    print(text, src_code, trg_code)  # lightweight request logging to stdout
    # LID-based reranking only helps when the output language is Erzya.
    translate_fn = translate_rerank if trg_code == 'myv_XX' else translate
    return translate_fn(
        text=text,
        model=model,
        tokenizer=tokenizer,
        src=src_code,
        trg=trg_code,
    )
# Build the Gradio UI: a free-text box plus source/target language dropdowns,
# defaulting to the first two languages (Erzya -> Russian).
_lang_choices = list(lang_to_code)
interface = gr.Interface(
    translate_wrapper,
    [
        gr.Textbox(label="Text / текстэнь", lines=2, placeholder='text to translate / текстэнь ютавтозь '),
        gr.Dropdown(_lang_choices, type="value", label='source language / васень келесь', value=_lang_choices[0]),
        gr.Dropdown(_lang_choices, type="value", label='target language / эряви келесь', value=_lang_choices[1]),
    ],
    "text",
)
if __name__ == '__main__':
    # Bug fix: the file only does 'import urllib', which does not guarantee
    # that the 'request' submodule is loaded, so urllib.request.urlretrieve
    # below could raise AttributeError. Import it explicitly here.
    import urllib.request

    # Load both translation directions from the Hugging Face Hub.
    model_mul_myv = MBartForConditionalGeneration.from_pretrained(MODEL_URL_MUL_MYV)
    model_myv_mul = MBartForConditionalGeneration.from_pretrained(MODEL_URL_MYV_MUL)
    if torch.cuda.is_available():
        model_mul_myv.cuda()
        model_myv_mul.cuda()
    # A single tokenizer serves both directions; patch it so it knows the
    # custom 'myv_XX' language code.
    tokenizer = MBart50Tokenizer.from_pretrained(MODEL_URL_MYV_MUL)
    fix_tokenizer(tokenizer)

    # Download the fastText LID model once and cache it next to the script.
    if not os.path.exists(MODEL_PATH_LANGID):
        print('downloading LID model...')
        urllib.request.urlretrieve(MODEL_URL_LANGID, MODEL_PATH_LANGID)
    langid_model = fasttext.load_model(MODEL_PATH_LANGID)

    interface.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ transformers
2
+ fasttext
3
+ sentencepiece
4
+ numpy
5
+ # gradio
6
+ # torch