Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -28,8 +28,10 @@ SPEAKER_KRC_TTS = 'b_krc'
|
|
28 |
REPO_TTS_PATH = "snakers4/silero-models"
|
29 |
MODEL_TTS_PATH = "silero_tts"
|
30 |
|
31 |
-
LANGUAGE = pd.DataFrame({"language": ["Къарачай-Малкъар тил", "Русский язык"], "token": ["krc_Cyrl", "rus_Cyrl"]})
|
32 |
-
|
|
|
|
|
33 |
TYPE = pd.DataFrame({"krc": ["Кёчюрюўчю", "Сёзлюк", "Сёлешиўчю"], "rus": ["Переводчик", "Словарь", "Озвучка"], "eng": ["Translator", "Dictionary", "Voice"], "tur": ["Çevirmen", "Sözlük", "Seslendirme"], "short_name": ["translator", "dictionary", "tts"]})
|
34 |
|
35 |
SYSTEM_LANG = "rus"
|
@@ -42,7 +44,7 @@ NAMES = pd.DataFrame({
|
|
42 |
})
|
43 |
|
44 |
|
45 |
-
DEVICE = '
|
46 |
|
47 |
device = torch.device(DEVICE)
|
48 |
|
@@ -73,7 +75,7 @@ model_tts, _ = torch.hub.load(repo_or_dir = REPO_TTS_PATH,
|
|
73 |
model_tts.to(device)
|
74 |
|
75 |
# 4. Fix tokenizer
|
76 |
-
#def fixTokenizer(tokenizer, new_lang='krc_Cyrl'):
|
77 |
# """
|
78 |
# Add a new language token to the tokenizer vocabulary
|
79 |
# (this should be done each time after its initialization)
|
@@ -94,10 +96,200 @@ model_tts.to(device)
|
|
94 |
|
95 |
#fixTokenizer(tokenizer)
|
96 |
|
97 |
-
# 5. Change letters
|
98 |
|
99 |
-
|
100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
str = str.replace("тюйюл", "тюл")
|
102 |
str = str.replace("Тюйюл", "Тюл")
|
103 |
str = str.replace("уку", "гылын qуш")
|
@@ -106,346 +298,252 @@ def fromModel(str, dialect = "qrc"):
|
|
106 |
str = str.replace("Хораз", "Гугурукку")
|
107 |
str = str.replace("юзмез", "qум")
|
108 |
str = str.replace("Юзмез", "Qум")
|
|
|
|
|
|
|
|
|
109 |
str = str.replace("jиля", "jыла")
|
|
|
|
|
110 |
str = str.replace("Jиля", "Jыла")
|
|
|
|
|
111 |
str = str.replace("ярабий", "арабин")
|
112 |
str = str.replace("арабий", "арабин")
|
113 |
str = str.replace("Ярабий", "Арабин")
|
114 |
str = str.replace("Арабий", "Арабин")
|
115 |
str = str.replace("нтта", "нтда")
|
116 |
str = str.replace("ртте", "ртде")
|
|
|
117 |
str = str.replace("jамауат", "jамаgат")
|
118 |
-
str = str.replace("j
|
|
|
|
|
119 |
str = str.replace("Jамауат", "Jамаgат")
|
120 |
-
str = str.replace("J
|
|
|
121 |
str = str.replace("шуёх", "шох")
|
122 |
str = str.replace("Шуёх", "Шох")
|
123 |
str = str.replace("шёндю", "бусаgат")
|
|
|
124 |
str = str.replace("Шёндю", "Бусаgат")
|
125 |
-
str = str.replace("
|
126 |
-
str = str.replace("
|
|
|
|
|
|
|
|
|
127 |
# str = str.replace("терк", "тез")
|
128 |
-
str = str.replace("
|
129 |
-
str = str.replace("
|
130 |
-
str = str.replace("
|
131 |
-
str = str.replace("
|
132 |
-
str = str.replace("
|
133 |
-
str = str.replace("
|
134 |
-
str = str.replace("
|
135 |
-
str = str.replace("
|
136 |
-
str = str.replace("
|
137 |
-
str = str.replace("
|
138 |
-
str = str.replace("
|
139 |
-
str = str.replace("
|
140 |
-
str = str.replace("
|
141 |
-
str = str.replace("
|
142 |
-
str = str.replace("
|
143 |
-
str = str.replace("
|
144 |
-
str = str.replace("
|
145 |
-
str = str.replace("j", "
|
146 |
-
str = str.replace("J", "
|
147 |
-
str = str.replace("
|
148 |
-
str = str.replace("
|
149 |
-
str = str.replace("
|
150 |
-
str = str.replace("
|
151 |
-
|
152 |
-
str = str.replace("
|
153 |
-
str = str.replace("
|
154 |
-
str = str.replace("
|
155 |
-
str = str.replace("
|
156 |
-
str = str.replace("
|
157 |
-
str = str.replace("
|
158 |
-
str = str.replace("
|
159 |
-
str = str.replace("
|
160 |
-
str = str.replace("
|
161 |
-
str = str.replace("
|
162 |
-
str = str.replace("
|
163 |
-
str = str.replace("
|
164 |
-
str = str.replace("
|
165 |
-
str = str.replace("
|
166 |
-
str = str.replace("
|
167 |
-
str = str.replace("
|
168 |
-
str = str.replace("
|
169 |
-
str = str.replace("
|
170 |
-
str = str.replace("
|
171 |
-
str = str.replace("
|
172 |
-
str = str.replace("
|
173 |
-
str = str.replace("
|
174 |
-
str = str.replace("
|
175 |
-
str = str.replace("
|
176 |
-
str = str.replace("
|
177 |
-
str = str.replace("
|
178 |
-
str = str.replace("
|
179 |
-
str = str.replace("
|
180 |
-
str = str.replace("
|
181 |
-
str = str.replace("
|
182 |
-
str = str.replace("
|
183 |
-
str = str.replace("
|
184 |
-
str = str.replace("
|
185 |
-
str = str.replace("
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
259 |
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
str = str.replace("ж", "j")
|
266 |
-
str = str.replace("Ж", "J")
|
267 |
-
str = str.replace("себеп", "себеb")
|
268 |
-
str = str.replace("себеб", "себеb")
|
269 |
-
str = str.replace("Себеп", "Себеb")
|
270 |
-
str = str.replace("Себеб", "Себеb")
|
271 |
-
str = str.replace("тюйюл", "тюл")
|
272 |
-
str = str.replace("Тюйюл", "Тюл")
|
273 |
-
str = str.replace("уку", "гылын qуш")
|
274 |
-
str = str.replace("Уку", "Гылын qуш")
|
275 |
-
str = str.replace("хораз", "гугурукку")
|
276 |
-
str = str.replace("Хораз", "Гугурукку")
|
277 |
-
str = str.replace("юзмез", "qум")
|
278 |
-
str = str.replace("Юзмез", "Qум")
|
279 |
-
str = str.replace("арап", "араb")
|
280 |
-
str = str.replace("араб", "араb")
|
281 |
-
str = str.replace("Арап", "Араb")
|
282 |
-
str = str.replace("Араб", "Араb")
|
283 |
-
str = str.replace("jиля", "jыла")
|
284 |
-
str = str.replace("jыла", "jыла")
|
285 |
-
str = str.replace("jыла", "jыла")
|
286 |
-
str = str.replace("Jиля", "Jыла")
|
287 |
-
str = str.replace("Jыла", "Jыла")
|
288 |
-
str = str.replace("Jыла", "Jыла")
|
289 |
-
str = str.replace("ярабий", "арабин")
|
290 |
-
str = str.replace("арабий", "арабин")
|
291 |
-
str = str.replace("Ярабий", "Арабин")
|
292 |
-
str = str.replace("Арабий", "Арабин")
|
293 |
-
str = str.replace("нтта", "нтда")
|
294 |
-
str = str.replace("ртте", "ртде")
|
295 |
-
str = str.replace("jамагъат", "jамаgат")
|
296 |
-
str = str.replace("jамауат", "jамаgат")
|
297 |
-
str = str.replace("jамагъат", "jамаgат")
|
298 |
-
str = str.replace("jамауат", "jамаgат")
|
299 |
-
str = str.replace("Jамагъат", "Jамаgат")
|
300 |
-
str = str.replace("Jамауат", "Jамаgат")
|
301 |
-
str = str.replace("Jамагъат", "Jамаgат")
|
302 |
-
str = str.replace("Jамаўат", "Jамаgат")
|
303 |
-
str = str.replace("шуёх", "шох")
|
304 |
-
str = str.replace("Шуёх", "Шох")
|
305 |
-
str = str.replace("шёндю", "бусаgат")
|
306 |
-
str = str.replace("бусагъат", "бусаgат")
|
307 |
-
str = str.replace("Шёндю", "Бусаgат")
|
308 |
-
str = str.replace("Бусагъат", "Бусаgат")
|
309 |
-
str = str.replace("угъай", "оgай")
|
310 |
-
str = str.replace("огъай", "оgай")
|
311 |
-
str = str.replace("Угъай", "Оgай")
|
312 |
-
str = str.replace("Огъай", "Оgай")
|
313 |
-
# str = str.replace("терк", "тез")
|
314 |
-
# str = str.replace("терк", "тез")
|
315 |
-
str = str.replace("санга", "сенnе")
|
316 |
-
str = str.replace("сенге", "сенnе")
|
317 |
-
str = str.replace("сеннге", "сенnе")
|
318 |
-
str = str.replace("Санга", "Сенnе")
|
319 |
-
str = str.replace("Сеннге", "Сенnе")
|
320 |
-
str = str.replace("Сенге", "Сенnе")
|
321 |
-
str = str.replace("манга", "менnе")
|
322 |
-
str = str.replace("меннге", "менnе")
|
323 |
-
str = str.replace("менге", "менnе")
|
324 |
-
str = str.replace("Манга", "Менnе")
|
325 |
-
str = str.replace("Меннге", "Менnе")
|
326 |
-
str = str.replace("Менге", "Менnе")
|
327 |
-
str = str.replace("аякъ jол", "jахтана")
|
328 |
-
str = str.replace("аякъ jол", "jахтана")
|
329 |
-
str = str.replace("jахтана", "jахтана")
|
330 |
-
str = str.replace("jахтана", "jахтана")
|
331 |
-
str = str.replace("Аякъ jол", "Jахтана")
|
332 |
-
str = str.replace("Аякъ jол", "Jахтана")
|
333 |
-
str = str.replace("Jахтана", "Jахтана")
|
334 |
-
str = str.replace("Jахтана", "Jахтана")
|
335 |
-
str = str.replace("къамж", "qамыzh")
|
336 |
-
str = str.replace("къамыж", "qамыzh")
|
337 |
-
str = str.replace("Къамж", "Qамыzh")
|
338 |
-
str = str.replace("Къамыж", "Qамыzh")
|
339 |
-
str = str.replace("къымыж", "qымыzh")
|
340 |
-
str = str.replace("къымыж", "qымыzh")
|
341 |
-
str = str.replace("Къымыж", "Qымыzh")
|
342 |
-
str = str.replace("Къымыж", "Qымыzh")
|
343 |
-
str = str.replace("хау", "хо")
|
344 |
-
str = str.replace("хаў", "хо")
|
345 |
-
str = str.replace("Хау", "Хо")
|
346 |
-
str = str.replace("Хаў", "Хо")
|
347 |
-
str = str.replace("уа", "wa")
|
348 |
-
str = str.replace("ўа", "wa")
|
349 |
-
str = str.replace("Уа", "Wa")
|
350 |
-
str = str.replace("Ўа", "Wa")
|
351 |
-
str = str.replace("п", "b")
|
352 |
-
str = str.replace("б", "b")
|
353 |
-
str = str.replace("къ", "q")
|
354 |
-
str = str.replace("Къ", "Q")
|
355 |
-
str = str.replace("КЪ", "Q")
|
356 |
-
str = str.replace("гъ", "g")
|
357 |
-
str = str.replace("Гъ", "G")
|
358 |
-
str = str.replace("ГЪ", "G")
|
359 |
-
str = str.replace("ц", "ч")
|
360 |
-
str = str.replace("Ц", "Ч")
|
361 |
-
str = str.replace("ф", "п")
|
362 |
-
str = str.replace("сыпат", "сыфат")
|
363 |
-
str = str.replace("Сыпат", "Сыфат")
|
364 |
-
str = str.replace("Ф", "П")
|
365 |
-
str = str.replace("(?<=[аыоуэеиёюя])у(?=[аыоуэеиёюя])|(?<=[аыоуэеиёюя])ў(?=[аыоуэеиёюя])|(?<=[АЫОУЭЕИЁЮЯ])у(?=[АЫОУЭЕИЁЮЯ])|(?<=[АЫОУЭЕИЁЮЯ])ў(?=[АЫОУЭЕИЁЮЯ])", "w")
|
366 |
-
str = str.replace("(?<=[аыоуэеиёюя])у|(?<=[аыоуэеиёюя])ў|(?<=[АЫОУЭЕИЁЮЯ])у|(?<=[АЫОУЭЕИЁЮЯ])ў", "w")
|
367 |
-
# str = str.replace("у(?=[аыоуэеиёюя])|ў(?=[аыоуэеиёюя])|у(?=[АЫОУЭЕИЁЮЯ])|ў(?=[АЫОУЭЕИЁЮЯ])", "w")
|
368 |
-
# str = str.replace("У(?=[аыоуэеиёюя])|Ў(?=[аыоуэеиёюя])|У(?=[АЫОУЭЕИЁЮЯ])|Ў(?=[АЫОУЭЕИЁЮЯ])", "W")
|
369 |
-
str = str.replace("zh", "ж")
|
370 |
-
str = str.replace("нг", "n")
|
371 |
-
str = str.replace("Нг", " N")
|
372 |
-
str = str.replace("НГ", " N")
|
373 |
-
return str
|
374 |
|
375 |
-
#
|
376 |
-
|
377 |
-
|
378 |
-
):
|
379 |
-
"""Turn a text or a list of texts into a list of translations"""
|
380 |
-
tokenizer.src_lang = src_lang
|
381 |
-
tokenizer.tgt_lang = tgt_lang
|
382 |
-
inputs = tokenizer(
|
383 |
-
text, return_tensors='pt', padding=True, truncation=True,
|
384 |
-
max_length=max_input_length
|
385 |
-
)
|
386 |
-
#print(f'Inputs: {inputs}')
|
387 |
-
result = model_translate.generate(
|
388 |
-
**inputs.to(model_translate.device),
|
389 |
-
forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
|
390 |
-
max_new_tokens=int(a + b * inputs.input_ids.shape[1]),
|
391 |
-
num_beams=num_beams, **kwargs
|
392 |
-
)
|
393 |
-
#print(f'Outputs: {result}')
|
394 |
-
return tokenizer.batch_decode(result, skip_special_tokens=True)
|
395 |
-
|
396 |
-
|
397 |
-
def translateDisp(text, from_, to, dialect):
|
398 |
-
# print(from_)
|
399 |
-
# print(to)
|
400 |
-
# print(dialect)
|
401 |
-
if dialect == "" or dialect is None:
|
402 |
-
dialect = DIALECT.dialect[0] # "дж\ч"
|
403 |
-
if from_ == "" or from_ is None:
|
404 |
-
from_ = LANGUAGE.language[1] # "Русский язык"
|
405 |
-
if to == "" or to is None:
|
406 |
-
to = LANGUAGE.language[0] # "Къарачай-Малкъар тил"
|
407 |
-
|
408 |
-
from_ = "".join(LANGUAGE[LANGUAGE.language == from_].token.to_list())
|
409 |
-
to = "".join(LANGUAGE[LANGUAGE.language == to].token.to_list())
|
410 |
-
dialect = "".join(DIALECT[DIALECT.dialect == dialect].short_name.to_list())
|
411 |
|
412 |
-
print(f'Input text: {text} - Time: {datetime.now(tz=TZ)}')
|
413 |
-
|
414 |
-
text = text.strip()
|
415 |
-
|
416 |
-
if from_ == 'krc_Cyrl':
|
417 |
-
text = toModel(text)
|
418 |
-
|
419 |
-
# Разбиваем текст на предложения, сохраняя знаки препинания
|
420 |
-
# .+?[.!?](?:\s|$): Захватывает предложения, которые заканчиваются точкой, восклицательным или вопросительным знаком.
|
421 |
-
# |.+?(?:\n|$): Добавляет поддержку для разрыва строки (\n) или конца текста ($), если предложение не заканчивается знаком препинания.
|
422 |
-
text = re.findall(r'.+?[.!?](?:\s|$)|.+?(?:\n|$)', text)
|
423 |
-
# text бош эсе
|
424 |
-
if len(text) == 0:
|
425 |
-
text = ""
|
426 |
-
|
427 |
-
#print(f'Split text: {text}')
|
428 |
-
|
429 |
-
str_ = translatePy(text, src_lang = from_, tgt_lang = to)
|
430 |
|
431 |
-
|
432 |
-
|
433 |
-
|
|
|
|
|
434 |
|
435 |
-
|
436 |
-
|
437 |
|
438 |
-
|
439 |
-
|
440 |
-
|
441 |
|
442 |
-
#
|
443 |
-
def dictionaryDisp(text,
|
444 |
|
445 |
-
if
|
446 |
-
|
447 |
|
448 |
-
|
449 |
|
450 |
text = text.strip()
|
451 |
str_l = text.lower()
|
@@ -454,10 +552,10 @@ def dictionaryDisp(text, from_):
|
|
454 |
df_from_to = pd.DataFrame()
|
455 |
df_to_from = pd.DataFrame()
|
456 |
|
457 |
-
if
|
458 |
df_from_to = dictionary_qm.copy()
|
459 |
df_to_from = dictionary_ru.copy()
|
460 |
-
elif
|
461 |
df_from_to = dictionary_ru.copy()
|
462 |
df_to_from = dictionary_qm.copy()
|
463 |
|
@@ -483,7 +581,7 @@ def dictionaryDisp(text, from_):
|
|
483 |
# len(sozluk)
|
484 |
|
485 |
|
486 |
-
#
|
487 |
def tts(text):
|
488 |
file_voice = ''.join(random.choices(string.ascii_letters, k=8))
|
489 |
file_voice = f'{file_voice}.wav'
|
@@ -498,7 +596,9 @@ def tts(text):
|
|
498 |
|
499 |
return file_voice
|
500 |
|
501 |
-
#
|
|
|
|
|
502 |
_title = "".join(NAMES[NAMES.id == "title"][SYSTEM_LANG].to_list())
|
503 |
_type = "".join(NAMES[NAMES.id == "type"][SYSTEM_LANG].to_list())
|
504 |
_from = "".join(NAMES[NAMES.id == "from"][SYSTEM_LANG].to_list())
|
@@ -523,16 +623,16 @@ with gr.Blocks() as demo:
|
|
523 |
# choice_type = gr.Dropdown(
|
524 |
# choices = TYPE[SYSTEM_LANG].to_list(), label=_type, value = TYPE[SYSTEM_LANG][0])
|
525 |
translate_lang_input = gr.Dropdown(
|
526 |
-
choices = LANGUAGE.
|
527 |
|
528 |
with gr.Column():
|
529 |
with gr.Row():
|
530 |
translate_lang_output = gr.Dropdown(
|
531 |
-
choices = LANGUAGE.
|
532 |
|
533 |
dialect = gr.Dropdown(
|
534 |
# choices = DIALECT.dialect.to_list(), label=_dialect, value = "дж\ч")
|
535 |
-
choices = DIALECT.
|
536 |
|
537 |
with gr.Row():
|
538 |
with gr.Column():
|
@@ -548,7 +648,7 @@ with gr.Blocks() as demo:
|
|
548 |
with gr.Column():
|
549 |
with gr.Row():
|
550 |
dict_lang_input = gr.Dropdown(
|
551 |
-
choices = LANGUAGE.
|
552 |
|
553 |
|
554 |
with gr.Row():
|
@@ -570,11 +670,12 @@ with gr.Blocks() as demo:
|
|
570 |
tts_button = gr.Button(_sound, variant = 'primary')
|
571 |
|
572 |
|
573 |
-
translate_button.click(
|
574 |
dict_button.click(dictionaryDisp, inputs=[dict_text_input, dict_lang_input], outputs=[dict_text_output]) # text, from
|
575 |
tts_button.click(tts, inputs=[tts_text_input], outputs=[tts_text_output]) # text
|
576 |
|
577 |
gr.Markdown(_annotation)
|
578 |
|
579 |
-
#
|
580 |
-
demo.launch()
|
|
|
|
28 |
REPO_TTS_PATH = "snakers4/silero-models"
|
29 |
MODEL_TTS_PATH = "silero_tts"
|
30 |
|
31 |
+
# Human-readable language name -> language token. The token values are the
# same strings Translator._translate uses as its src_lang / tgt_lang defaults.
LANGUAGE = {"Къарачай-Малкъар тил": "krc_Cyrl", "Русский язык": "rus_Cyrl"}

# Dialect label -> short dialect code. The codes are the values that
# Translator._fromModel compares its `dialect` argument against.
# The backslash in each label is escaped explicitly: "\ч" and "\ц" are not
# valid escape sequences and raise a SyntaxWarning on recent Python versions.
# The runtime strings are unchanged (each label still contains one backslash).
DIALECT = {"дж\\ч": "qrc", "ж\\ч": "hlm", "з\\ц": "mqr"}
|
35 |
TYPE = pd.DataFrame({"krc": ["Кёчюрюўчю", "Сёзлюк", "Сёлешиўчю"], "rus": ["Переводчик", "Словарь", "Озвучка"], "eng": ["Translator", "Dictionary", "Voice"], "tur": ["Çevirmen", "Sözlük", "Seslendirme"], "short_name": ["translator", "dictionary", "tts"]})
|
36 |
|
37 |
SYSTEM_LANG = "rus"
|
|
|
44 |
})
|
45 |
|
46 |
|
47 |
+
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
|
48 |
|
49 |
device = torch.device(DEVICE)
|
50 |
|
|
|
75 |
model_tts.to(device)
|
76 |
|
77 |
# 4. Fix tokenizer
|
78 |
+
# def fixTokenizer(tokenizer, new_lang='krc_Cyrl'):
|
79 |
# """
|
80 |
# Add a new language token to the tokenizer vocabulary
|
81 |
# (this should be done each time after its initialization)
|
|
|
96 |
|
97 |
#fixTokenizer(tokenizer)
|
98 |
|
|
|
99 |
|
100 |
+
class Translator:
|
101 |
+
"""
|
102 |
+
Class for translator NLLB-200.
|
103 |
+
|
104 |
+
Параметры:
|
105 |
+
- model: Модель
|
106 |
+
- tokenizer: Токенизатор
|
107 |
+
|
108 |
+
Функция translate алады:
|
109 |
+
- text (str): Текст
|
110 |
+
- src_lang (str): Тебреген тил
|
111 |
+
- tgt_lang (str): Тил таба
|
112 |
+
- dialect (int): Диалект
|
113 |
+
|
114 |
+
Чыгъарады:
|
115 |
+
- translated (str): Кёчюрюлгени
|
116 |
+
|
117 |
+
"""
|
118 |
+
def __init__(self, tokenizer, model) -> None:
|
119 |
+
self.model = model
|
120 |
+
self.tokenizer = tokenizer
|
121 |
+
# Change letters
|
122 |
+
def _fromModel(self, str: str, dialect: str = "qrc") -> str:
|
123 |
+
if dialect == "qrc":
|
124 |
+
str = str.replace("тюйюл", "тюл")
|
125 |
+
str = str.replace("Тюйюл", "Тюл")
|
126 |
+
str = str.replace("уку", "гылын qуш")
|
127 |
+
str = str.replace("Уку", "Гылын qуш")
|
128 |
+
str = str.replace("хораз", "гугурукку")
|
129 |
+
str = str.replace("Хораз", "Гугурукку")
|
130 |
+
str = str.replace("юзмез", "qум")
|
131 |
+
str = str.replace("Юзмез", "Qум")
|
132 |
+
str = str.replace("jиля", "jыла")
|
133 |
+
str = str.replace("Jиля", "Jыла")
|
134 |
+
str = str.replace("ярабий", "арабин")
|
135 |
+
str = str.replace("арабий", "арабин")
|
136 |
+
str = str.replace("Ярабий", "Арабин")
|
137 |
+
str = str.replace("Арабий", "Арабин")
|
138 |
+
str = str.replace("нтта", "нтда")
|
139 |
+
str = str.replace("ртте", "ртде")
|
140 |
+
str = str.replace("jамауат", "jамаgат")
|
141 |
+
str = str.replace("jамаwат", "jамаgат")
|
142 |
+
str = str.replace("Jамауат", "Jамаgат")
|
143 |
+
str = str.replace("Jамаwат", "Jамаgат")
|
144 |
+
str = str.replace("шуёх", "шох")
|
145 |
+
str = str.replace("Шуёх", "Шох")
|
146 |
+
str = str.replace("шёндю", "бусаgат")
|
147 |
+
str = str.replace("Шёндю", "Бусаgат")
|
148 |
+
str = str.replace("уgай", "оgай")
|
149 |
+
str = str.replace("Уgай", "Оgай")
|
150 |
+
# str = str.replace("терк", "тез")
|
151 |
+
str = str.replace("саnа", "сенnе")
|
152 |
+
str = str.replace("сеnе", "сенnе")
|
153 |
+
str = str.replace("Саnа", "Сенnе")
|
154 |
+
str = str.replace("Сеnе", "Сенnе")
|
155 |
+
str = str.replace("маnа", "менnе")
|
156 |
+
str = str.replace("меnе", "менnе")
|
157 |
+
str = str.replace("Маnа", "Менnе")
|
158 |
+
str = str.replace("Меnе", "Менnе")
|
159 |
+
str = str.replace("аяq jол", "jахтана")
|
160 |
+
str = str.replace("Аяq jол", "Jахтана")
|
161 |
+
str = str.replace("сыbат", "сыфат")
|
162 |
+
str = str.replace("Сыbат", "Сыфат")
|
163 |
+
str = str.replace("b", "б")
|
164 |
+
str = str.replace("q", "къ")
|
165 |
+
str = str.replace("Q", "Къ")
|
166 |
+
str = str.replace("g", "гъ")
|
167 |
+
str = str.replace("G", "Гъ")
|
168 |
+
str = str.replace("j", "дж")
|
169 |
+
str = str.replace("J", "Дж")
|
170 |
+
str = str.replace("w", "ў")
|
171 |
+
str = str.replace("W", "Ў")
|
172 |
+
str = str.replace("n", "нг")
|
173 |
+
str = str.replace("N", "Нг")
|
174 |
+
elif dialect == "hlm":
|
175 |
+
str = str.replace("тюл", "тюйюл")
|
176 |
+
str = str.replace("Тюл", "Тюйюл")
|
177 |
+
str = str.replace("гылын qуш", "уку")
|
178 |
+
str = str.replace("Гылын qуш", "Уку")
|
179 |
+
str = str.replace("гугурукку", "хораз")
|
180 |
+
str = str.replace("Гугурукку", "Хораз")
|
181 |
+
str = str.replace("qум", "юзмез")
|
182 |
+
str = str.replace("Qум", "Юзмез")
|
183 |
+
str = str.replace("jыла", "jиля")
|
184 |
+
str = str.replace("Jыла", "Jиля")
|
185 |
+
str = str.replace("арабин", "ярабий")
|
186 |
+
str = str.replace("арабий", "ярабий")
|
187 |
+
str = str.replace("Арабин", "Ярабий")
|
188 |
+
str = str.replace("Арабий", "Ярабий")
|
189 |
+
str = str.replace("нтда", "нтта")
|
190 |
+
str = str.replace("ртде", "ртте")
|
191 |
+
str = str.replace("jамаgат", "jамаwат")
|
192 |
+
str = str.replace("Jамаgат", "Jамаwат")
|
193 |
+
str = str.replace("шох", "шуёх")
|
194 |
+
str = str.replace("Шох", "Шуёх")
|
195 |
+
str = str.replace("бусаgат", "шёндю")
|
196 |
+
str = str.replace("Бусаgат", "Шёндю")
|
197 |
+
str = str.replace("оgай", "уgай")
|
198 |
+
str = str.replace("Оgай", "Уgай")
|
199 |
+
str = str.replace("тез", "терк")
|
200 |
+
str = str.replace("сенnе", "саnа")
|
201 |
+
str = str.replace("сеnе", "саnа")
|
202 |
+
str = str.replace("Сенnе", "Саnа")
|
203 |
+
str = str.replace("Сеnе", "Саnа")
|
204 |
+
str = str.replace("менnе", "маnа")
|
205 |
+
str = str.replace("меnе", "маnа")
|
206 |
+
str = str.replace("Менnе", "Маnа")
|
207 |
+
str = str.replace("Меnе", "Маnа")
|
208 |
+
str = str.replace("jахтана", "аяq jол")
|
209 |
+
str = str.replace("Jахтана", "аяq jол")
|
210 |
+
str = str.replace("хо", "хаw")
|
211 |
+
str = str.replace("Хо", "Хаw")
|
212 |
+
str = str.replace("сыbат", "сыфат")
|
213 |
+
str = str.replace("Сыbат", "Сыфат")
|
214 |
+
str = str.replace("b", "п")
|
215 |
+
str = str.replace("q", "къ")
|
216 |
+
str = str.replace("Q", "Къ")
|
217 |
+
str = str.replace("g", "гъ")
|
218 |
+
str = str.replace("G", "Гъ")
|
219 |
+
str = str.replace("j", "ж")
|
220 |
+
str = str.replace("J", "Ж")
|
221 |
+
str = str.replace("w", "ў")
|
222 |
+
str = str.replace("W", "Ў")
|
223 |
+
str = str.replace("n", "нг")
|
224 |
+
str = str.replace("N", "Нг")
|
225 |
+
elif dialect == "mqr":
|
226 |
+
str = str.replace("тюл", "тюйюл")
|
227 |
+
str = str.replace("Тюл", "Тюйюл")
|
228 |
+
str = str.replace("гылын qуш", "уку")
|
229 |
+
str = str.replace("Гылын qуш", "Уку")
|
230 |
+
str = str.replace("гугурукку", "хораз")
|
231 |
+
str = str.replace("Гугурукку", "Хораз")
|
232 |
+
str = str.replace("qум", "юзмез")
|
233 |
+
str = str.replace("Qум", "Юзмез")
|
234 |
+
str = str.replace("jыла", "jиля")
|
235 |
+
str = str.replace("Jыла", "Jиля")
|
236 |
+
str = str.replace("арабин", "ярабий")
|
237 |
+
str = str.replace("арабий", "ярабий")
|
238 |
+
str = str.replace("Арабин", "Ярабий")
|
239 |
+
str = str.replace("Арабий", "Ярабий")
|
240 |
+
str = str.replace("нтда", "нтта")
|
241 |
+
str = str.replace("ртде", "ртте")
|
242 |
+
str = str.replace("jамаgат", "жамаwат")
|
243 |
+
str = str.replace("Jамаgат", "Жамаwат")
|
244 |
+
str = str.replace("шох", "шуёх")
|
245 |
+
str = str.replace("Шох", "Шуёх")
|
246 |
+
str = str.replace("бусаgат", "шёндю")
|
247 |
+
str = str.replace("Бусаgат", "Шёндю")
|
248 |
+
str = str.replace("оgай", "уgай")
|
249 |
+
str = str.replace("Оgай", "Уgай")
|
250 |
+
str = str.replace("тез", "терк")
|
251 |
+
str = str.replace("сенnе", "саnа")
|
252 |
+
str = str.replace("сеnе", "саnа")
|
253 |
+
str = str.replace("Сенnе", "Саnа")
|
254 |
+
str = str.replace("Сеnе", "Саnа")
|
255 |
+
str = str.replace("менnе", "маnа")
|
256 |
+
str = str.replace("меnе", "маnа")
|
257 |
+
str = str.replace("Менnе", "Маnа")
|
258 |
+
str = str.replace("Меnе", "Маnа")
|
259 |
+
str = str.replace("jахтана", "аяq jол")
|
260 |
+
str = str.replace("Jахтана", "аяq jол")
|
261 |
+
str = str.replace("хо", "хаw")
|
262 |
+
str = str.replace("Хо", "Хаw")
|
263 |
+
str = str.replace("сыbат", "сыфат")
|
264 |
+
str = str.replace("Сыbат", "Сыфат")
|
265 |
+
str = str.replace("b", "п")
|
266 |
+
str = str.replace("q", "къ")
|
267 |
+
str = str.replace("Q", "Къ")
|
268 |
+
str = str.replace("g", "гъ")
|
269 |
+
str = str.replace("G", "Гъ")
|
270 |
+
str = str.replace("j", "з")
|
271 |
+
str = str.replace("J", "З")
|
272 |
+
str = str.replace("w", "ў")
|
273 |
+
str = str.replace("W", "Ў")
|
274 |
+
str = str.replace("n", "нг")
|
275 |
+
str = str.replace("N", "Нг")
|
276 |
+
str = str.replace("ч", "ц")
|
277 |
+
str = str.replace("Ч", "Ц")
|
278 |
+
str = str.replace("п", "ф")
|
279 |
+
str = str.replace("П", "Ф")
|
280 |
+
str = str.replace("къ|гъ", "х")
|
281 |
+
return str
|
282 |
+
|
283 |
+
def _toModel(self, str: str) -> str:
|
284 |
+
str = str.replace("дж", "j")
|
285 |
+
str = str.replace("Дж", "J")
|
286 |
+
str = str.replace("ДЖ", "J")
|
287 |
+
str = str.replace("ж", "j")
|
288 |
+
str = str.replace("Ж", "J")
|
289 |
+
str = str.replace("себеп", "себеb")
|
290 |
+
str = str.replace("себеб", "себеb")
|
291 |
+
str = str.replace("Себеп", "Себеb")
|
292 |
+
str = str.replace("Себеб", "Себеb")
|
293 |
str = str.replace("тюйюл", "тюл")
|
294 |
str = str.replace("Тюйюл", "Тюл")
|
295 |
str = str.replace("уку", "гылын qуш")
|
|
|
298 |
str = str.replace("Хораз", "Гугурукку")
|
299 |
str = str.replace("юзмез", "qум")
|
300 |
str = str.replace("Юзмез", "Qум")
|
301 |
+
str = str.replace("арап", "араb")
|
302 |
+
str = str.replace("араб", "араb")
|
303 |
+
str = str.replace("Арап", "Араb")
|
304 |
+
str = str.replace("Араб", "Араb")
|
305 |
str = str.replace("jиля", "jыла")
|
306 |
+
str = str.replace("jыла", "jыла")
|
307 |
+
str = str.replace("jыла", "jыла")
|
308 |
str = str.replace("Jиля", "Jыла")
|
309 |
+
str = str.replace("Jыла", "Jыла")
|
310 |
+
str = str.replace("Jыла", "Jыла")
|
311 |
str = str.replace("ярабий", "арабин")
|
312 |
str = str.replace("арабий", "арабин")
|
313 |
str = str.replace("Ярабий", "Арабин")
|
314 |
str = str.replace("Арабий", "Арабин")
|
315 |
str = str.replace("нтта", "нтда")
|
316 |
str = str.replace("ртте", "ртде")
|
317 |
+
str = str.replace("jамагъат", "jамаgат")
|
318 |
str = str.replace("jамауат", "jамаgат")
|
319 |
+
str = str.replace("jамагъат", "jамаgат")
|
320 |
+
str = str.replace("jамауат", "jамаgат")
|
321 |
+
str = str.replace("Jамагъат", "Jамаgат")
|
322 |
str = str.replace("Jамауат", "Jамаgат")
|
323 |
+
str = str.replace("Jамагъат", "Jамаgат")
|
324 |
+
str = str.replace("Jамаўат", "Jамаgат")
|
325 |
str = str.replace("шуёх", "шох")
|
326 |
str = str.replace("Шуёх", "Шох")
|
327 |
str = str.replace("шёндю", "бусаgат")
|
328 |
+
str = str.replace("бусагъат", "бусаgат")
|
329 |
str = str.replace("Шёндю", "Бусаgат")
|
330 |
+
str = str.replace("Бусагъат", "Бусаgат")
|
331 |
+
str = str.replace("угъай", "оgай")
|
332 |
+
str = str.replace("огъай", "оgай")
|
333 |
+
str = str.replace("Угъай", "Оgай")
|
334 |
+
str = str.replace("Огъай", "Оgай")
|
335 |
+
# str = str.replace("терк", "тез")
|
336 |
# str = str.replace("терк", "тез")
|
337 |
+
str = str.replace("санга", "сенnе")
|
338 |
+
str = str.replace("сенге", "сенnе")
|
339 |
+
str = str.replace("сеннге", "сенnе")
|
340 |
+
str = str.replace("Санга", "Сенnе")
|
341 |
+
str = str.replace("Сеннге", "Сенnе")
|
342 |
+
str = str.replace("Сенге", "Сенnе")
|
343 |
+
str = str.replace("манга", "менnе")
|
344 |
+
str = str.replace("меннге", "менnе")
|
345 |
+
str = str.replace("менге", "менnе")
|
346 |
+
str = str.replace("Манга", "Менnе")
|
347 |
+
str = str.replace("Меннге", "Менnе")
|
348 |
+
str = str.replace("Менге", "Менnе")
|
349 |
+
str = str.replace("аякъ jол", "jахтана")
|
350 |
+
str = str.replace("аякъ jол", "jахтана")
|
351 |
+
str = str.replace("jахтана", "jахтана")
|
352 |
+
str = str.replace("jахтана", "jахтана")
|
353 |
+
str = str.replace("Аякъ jол", "Jахтана")
|
354 |
+
str = str.replace("Аякъ jол", "Jахтана")
|
355 |
+
str = str.replace("Jахтана", "Jахтана")
|
356 |
+
str = str.replace("Jахтана", "Jахтана")
|
357 |
+
str = str.replace("къамж", "qамыzh")
|
358 |
+
str = str.replace("къамыж", "qамыzh")
|
359 |
+
str = str.replace("Къамж", "Qамыzh")
|
360 |
+
str = str.replace("Къамыж", "Qамыzh")
|
361 |
+
str = str.replace("къымыж", "qымыzh")
|
362 |
+
str = str.replace("къымыж", "qымыzh")
|
363 |
+
str = str.replace("Къымыж", "Qымыzh")
|
364 |
+
str = str.replace("Къымыж", "Qымыzh")
|
365 |
+
str = str.replace("хау", "хо")
|
366 |
+
str = str.replace("хаў", "хо")
|
367 |
+
str = str.replace("Хау", "Хо")
|
368 |
+
str = str.replace("Хаў", "Хо")
|
369 |
+
str = str.replace("уа", "wa")
|
370 |
+
str = str.replace("ўа", "wa")
|
371 |
+
str = str.replace("Уа", "Wa")
|
372 |
+
str = str.replace("Ўа", "Wa")
|
373 |
+
str = str.replace("п", "b")
|
374 |
+
str = str.replace("б", "b")
|
375 |
+
str = str.replace("къ", "q")
|
376 |
+
str = str.replace("Къ", "Q")
|
377 |
+
str = str.replace("КЪ", "Q")
|
378 |
+
str = str.replace("гъ", "g")
|
379 |
+
str = str.replace("Гъ", "G")
|
380 |
+
str = str.replace("ГЪ", "G")
|
381 |
+
str = str.replace("ц", "ч")
|
382 |
+
str = str.replace("Ц", "Ч")
|
383 |
+
str = str.replace("ф", "п")
|
384 |
+
str = str.replace("сыпат", "сыфат")
|
385 |
+
str = str.replace("Сыпат", "Сыфат")
|
386 |
+
str = str.replace("Ф", "П")
|
387 |
+
str = str.replace("(?<=[аыоуэеиёюя])у(?=[аыоуэеиёюя])|(?<=[аыоуэеиёюя])ў(?=[аыоуэеиёюя])|(?<=[АЫОУЭЕИЁЮЯ])у(?=[АЫОУЭЕИЁЮЯ])|(?<=[АЫОУЭЕИЁЮЯ])ў(?=[АЫОУЭЕИЁЮЯ])", "w")
|
388 |
+
str = str.replace("(?<=[аыоуэеиёюя])у|(?<=[аыоуэеиёюя])ў|(?<=[АЫОУЭЕИЁЮЯ])у|(?<=[АЫОУЭЕИЁЮЯ])ў", "w")
|
389 |
+
# str = str.replace("у(?=[аыоуэеиёюя])|ў(?=[аыоуэеиёюя])|у(?=[АЫОУЭЕИЁЮЯ])|ў(?=[АЫОУЭЕИЁЮЯ])", "w")
|
390 |
+
# str = str.replace("У(?=[аыоуэеиёюя])|Ў(?=[аыоуэеиёюя])|У(?=[АЫОУЭЕИЁЮЯ])|Ў(?=[АЫОУЭЕИЁЮЯ])", "W")
|
391 |
+
str = str.replace("zh", "ж")
|
392 |
+
str = str.replace("нг", "n")
|
393 |
+
str = str.replace("Нг", " N")
|
394 |
+
str = str.replace("НГ", " N")
|
395 |
+
return str
|
396 |
+
|
397 |
+
# structure
|
398 |
+
def _prepareTextAndStructure(self, text: str) -> tuple:
|
399 |
+
"""
|
400 |
+
The input text is divided into sentences, while maintaining the structure
|
401 |
+
"""
|
402 |
+
|
403 |
+
# Разбиваем текст на предложения, сохраняя знаки препинания
|
404 |
+
# .+?[.!?।ฯ؟](?:\s|$): Захватывает предложения, которые заканчиваются
|
405 |
+
# точкой, восклицательным или вопросительным знаком.
|
406 |
+
|
407 |
+
# |.+?(?:\n|$): Добавляет поддержку для разрыва строки (\n) или конца текста ($),
|
408 |
+
# если предложение не заканчивается знаком препинания.
|
409 |
+
segments = re.findall(pattern=r".+?[.!?।ฯ؟](?:\s|$)|.*?(?:\n|$)", string=text)
|
410 |
+
|
411 |
+
# Если последний элемент пустой, то его удаляем
|
412 |
+
if not segments[-1]:
|
413 |
+
segments = segments[:-1]
|
414 |
+
|
415 |
+
# Склеиваем разорванные предложения
|
416 |
+
merged_segments = []
|
417 |
+
buffer = ""
|
418 |
+
|
419 |
+
for i, segment in enumerate(segments):
|
420 |
+
# Проверяем, заканчивается ли текущий сегмент на .!? или пуст
|
421 |
+
if buffer:
|
422 |
+
buffer += " " + segment
|
423 |
+
else:
|
424 |
+
buffer = segment
|
425 |
+
|
426 |
+
# Если сегмент не заканчивается на .!? и следующий начинается с маленькой буквы
|
427 |
+
if ( # noqa: R507
|
428 |
+
not re.search(pattern=r"[.!?।ฯ؟](?:\s|$)", string=segment) # noqa: ECE001
|
429 |
+
and i + 1 < len(segments)
|
430 |
+
and segments[i + 1].strip()
|
431 |
+
and ((segments[i + 1].strip()[0].islower()) or (segments[i + 1].strip()[0] in ["'", '"']))
|
432 |
+
):
|
433 |
+
continue # Склеиваем с следующим сегментом
|
434 |
+
else:
|
435 |
+
merged_segments.append(buffer)
|
436 |
+
buffer = ""
|
437 |
+
|
438 |
+
# Удаляем пустые сегменты и сохраняем пробелы
|
439 |
+
original_structure = []
|
440 |
+
for segment in merged_segments:
|
441 |
+
match = re.match(pattern=r"^(\s*)(.*?)(\s*)$", string=segment, flags=re.DOTALL)
|
442 |
+
if match:
|
443 |
+
original_structure.append((match.group(1), match.group(2), match.group(3)))
|
444 |
+
|
445 |
+
# Токенизируем только текстовые части сегментов
|
446 |
+
texts_to_translate = [seg[1] for seg in original_structure if seg[1].strip()]
|
447 |
+
|
448 |
+
return texts_to_translate, original_structure
|
449 |
+
|
450 |
+
def _recoverTranslatedToStructure(self, translated_texts: str, original_structure: list) -> str:
|
451 |
+
"""
|
452 |
+
Translated sentences are embedded in the structure of the original text
|
453 |
+
"""
|
454 |
+
# Восстанавливаем исходную структуру текста
|
455 |
+
translated_segments = []
|
456 |
+
translated_index = 0
|
457 |
+
for seg in original_structure:
|
458 |
+
if seg[1].strip(): # Если сегмент был переведён
|
459 |
+
translated_segments.append(f"{seg[0]}{translated_texts[translated_index]}{seg[2]}")
|
460 |
+
translated_index += 1
|
461 |
+
else: # Если сегмент был пустым, оставляем его как есть
|
462 |
+
translated_segments.append(f"{seg[0]}{seg[1]}{seg[2]}")
|
463 |
+
|
464 |
+
return "".join(translated_segments)
|
465 |
+
|
466 |
+
|
467 |
+
# Translate function
|
468 |
+
def _translate(self, text: list | str, src_lang: str = 'rus_Cyrl', tgt_lang: str = 'krc_Cyrl',
|
469 |
+
a: int = 32, b: int = 3, max_input_length: int = 1024, num_beams: int = 3, **kwargs
|
470 |
+
) -> list:
|
471 |
+
"""Turn a text or a list of texts into a list of translations"""
|
472 |
+
self.tokenizer.src_lang = src_lang
|
473 |
+
self.tokenizer.tgt_lang = tgt_lang
|
474 |
+
inputs = self.tokenizer(
|
475 |
+
text, return_tensors='pt', padding=True, truncation=True,
|
476 |
+
max_length=max_input_length
|
477 |
+
)
|
478 |
+
#print(f'Inputs: {inputs}')
|
479 |
+
result = self.model.generate(
|
480 |
+
**inputs.to(self.model.device),
|
481 |
+
forced_bos_token_id=self.tokenizer.convert_tokens_to_ids(tgt_lang),
|
482 |
+
max_new_tokens=int(a + b * inputs.input_ids.shape[1]),
|
483 |
+
num_beams=num_beams, **kwargs
|
484 |
+
)
|
485 |
+
#print(f'Outputs: {result}')
|
486 |
+
return self.tokenizer.batch_decode(result, skip_special_tokens=True)
|
487 |
+
|
488 |
+
def translate(self, text: str, src_lang: str | None = None, tgt_lang: str | None = None, dialect: str | None = None) -> str:
    """
    Translate ``text`` between the languages selected in the UI.

    Args:
        text: Text to translate; surrounding whitespace is stripped.
        src_lang: Key of ``LANGUAGE`` for the source language. Empty or
            ``None`` falls back to the second key of ``LANGUAGE``.
        tgt_lang: Key of ``LANGUAGE`` for the target language. Empty or
            ``None`` falls back to the first key of ``LANGUAGE``.
        dialect: Key of ``DIALECT``. Empty or ``None`` falls back to the
            first key of ``DIALECT``.

    Returns:
        The translated text with the whitespace layout of the input kept
        between sentences.

    Raises:
        KeyError: If a non-empty language or dialect is not a known key.
    """
    # Fill in defaults for anything the caller left empty.
    if not dialect:
        dialect = list(DIALECT)[0]
    if not src_lang:
        src_lang = list(LANGUAGE)[1]
    if not tgt_lang:
        tgt_lang = list(LANGUAGE)[0]

    # Map the display names to the tokens / short names used internally.
    src_lang = LANGUAGE[src_lang]
    tgt_lang = LANGUAGE[tgt_lang]
    dialect = DIALECT[dialect]

    print(f'Input text: {text} - Time: {datetime.now(tz=TZ)}')

    text = text.strip()

    if src_lang == 'krc_Cyrl':
        text = self._toModel(text)

    # Split the text into sentences while remembering the surrounding whitespace.
    texts_to_translate, original_structure = self._prepareTextAndStructure(text=text)

    if texts_to_translate:
        translated_texts = self._translate(text=texts_to_translate, src_lang=src_lang, tgt_lang=tgt_lang)
    else:
        # Nothing translatable (empty input): skip the model call entirely.
        # The structure then has no non-blank segment, so no translation is
        # ever looked up and the result is the same as before.
        translated_texts = []

    translated = self._recoverTranslatedToStructure(
        translated_texts=translated_texts, original_structure=original_structure
    )

    if tgt_lang == 'krc_Cyrl':
        translated = self._fromModel(str=translated, dialect=dialect)

    print(f'Translated text: {translated} - Time: {datetime.now(tz=TZ)}')

    return translated
|
539 |
|
540 |
+
# Dictionary function
|
541 |
+
def dictionaryDisp(text, src_lang):
|
542 |
|
543 |
+
if src_lang == "" or src_lang is None:
|
544 |
+
src_lang = list(LANGUAGE.keys())[1] # "Русский язык"
|
545 |
|
546 |
+
src_lang = LANGUAGE[src_lang]
|
547 |
|
548 |
text = text.strip()
|
549 |
str_l = text.lower()
|
|
|
552 |
df_from_to = pd.DataFrame()
|
553 |
df_to_from = pd.DataFrame()
|
554 |
|
555 |
+
if src_lang == 'krc_Cyrl':
|
556 |
df_from_to = dictionary_qm.copy()
|
557 |
df_to_from = dictionary_ru.copy()
|
558 |
+
elif src_lang == 'rus_Cyrl':
|
559 |
df_from_to = dictionary_ru.copy()
|
560 |
df_to_from = dictionary_qm.copy()
|
561 |
|
|
|
581 |
# len(sozluk)
|
582 |
|
583 |
|
584 |
+
# Voice function
|
585 |
def tts(text):
|
586 |
file_voice = ''.join(random.choices(string.ascii_letters, k=8))
|
587 |
file_voice = f'{file_voice}.wav'
|
|
|
596 |
|
597 |
return file_voice
|
598 |
|
599 |
+
# 5. Definition ui
|
600 |
+
translator = Translator(tokenizer=tokenizer, model=model_translate)
|
601 |
+
|
602 |
_title = "".join(NAMES[NAMES.id == "title"][SYSTEM_LANG].to_list())
|
603 |
_type = "".join(NAMES[NAMES.id == "type"][SYSTEM_LANG].to_list())
|
604 |
_from = "".join(NAMES[NAMES.id == "from"][SYSTEM_LANG].to_list())
|
|
|
623 |
# choice_type = gr.Dropdown(
|
624 |
# choices = TYPE[SYSTEM_LANG].to_list(), label=_type, value = TYPE[SYSTEM_LANG][0])
|
625 |
translate_lang_input = gr.Dropdown(
|
626 |
+
choices = list(LANGUAGE.keys()), label=_from, value = list(LANGUAGE.keys())[1])
|
627 |
|
628 |
with gr.Column():
|
629 |
with gr.Row():
|
630 |
translate_lang_output = gr.Dropdown(
|
631 |
+
choices = list(LANGUAGE.keys()), label=_to, value = list(LANGUAGE.keys())[0])
|
632 |
|
633 |
dialect = gr.Dropdown(
|
634 |
# choices = DIALECT.dialect.to_list(), label=_dialect, value = "дж\ч")
|
635 |
+
choices = list(DIALECT.keys()), label=_dialect, value = list(DIALECT.keys())[0])
|
636 |
|
637 |
with gr.Row():
|
638 |
with gr.Column():
|
|
|
648 |
with gr.Column():
|
649 |
with gr.Row():
|
650 |
dict_lang_input = gr.Dropdown(
|
651 |
+
choices = list(LANGUAGE.keys()), label=_from, value = list(LANGUAGE.keys())[1])
|
652 |
|
653 |
|
654 |
with gr.Row():
|
|
|
670 |
tts_button = gr.Button(_sound, variant = 'primary')
|
671 |
|
672 |
|
673 |
+
translate_button.click(translator.translate, inputs=[translate_text_input, translate_lang_input, translate_lang_output, dialect], outputs=[translate_text_output]) # text, from, to, dialect
|
674 |
dict_button.click(dictionaryDisp, inputs=[dict_text_input, dict_lang_input], outputs=[dict_text_output]) # text, from
|
675 |
tts_button.click(tts, inputs=[tts_text_input], outputs=[tts_text_output]) # text
|
676 |
|
677 |
gr.Markdown(_annotation)
|
678 |
|
679 |
+
# 6. Launch
|
680 |
+
demo.launch()
|
681 |
+
# demo.launch(inbrowser=True)
|