TSjB committed on
Commit
b4eaf9f
·
verified ·
1 Parent(s): be98e5a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +435 -334
app.py CHANGED
@@ -28,8 +28,10 @@ SPEAKER_KRC_TTS = 'b_krc'
28
  REPO_TTS_PATH = "snakers4/silero-models"
29
  MODEL_TTS_PATH = "silero_tts"
30
 
31
- LANGUAGE = pd.DataFrame({"language": ["Къарачай-Малкъар тил", "Русский язык"], "token": ["krc_Cyrl", "rus_Cyrl"]})
32
- DIALECT = pd.DataFrame({"dialect": ["дж\ч", "ж\ч", "з\ц"], "short_name": ["qrc", "hlm", "mqr"]})
 
 
33
  TYPE = pd.DataFrame({"krc": ["Кёчюрюўчю", "Сёзлюк", "Сёлешиўчю"], "rus": ["Переводчик", "Словарь", "Озвучка"], "eng": ["Translator", "Dictionary", "Voice"], "tur": ["Çevirmen", "Sözlük", "Seslendirme"], "short_name": ["translator", "dictionary", "tts"]})
34
 
35
  SYSTEM_LANG = "rus"
@@ -42,7 +44,7 @@ NAMES = pd.DataFrame({
42
  })
43
 
44
 
45
- DEVICE = 'gpu' if torch.cuda.is_available() else 'cpu'
46
 
47
  device = torch.device(DEVICE)
48
 
@@ -73,7 +75,7 @@ model_tts, _ = torch.hub.load(repo_or_dir = REPO_TTS_PATH,
73
  model_tts.to(device)
74
 
75
  # 4. Fix tokenizer
76
- #def fixTokenizer(tokenizer, new_lang='krc_Cyrl'):
77
  # """
78
  # Add a new language token to the tokenizer vocabulary
79
  # (this should be done each time after its initialization)
@@ -94,10 +96,200 @@ model_tts.to(device)
94
 
95
  #fixTokenizer(tokenizer)
96
 
97
- # 5. Change letters
98
 
99
- def fromModel(str, dialect = "qrc"):
100
- if dialect == "qrc":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  str = str.replace("тюйюл", "тюл")
102
  str = str.replace("Тюйюл", "Тюл")
103
  str = str.replace("уку", "гылын qуш")
@@ -106,346 +298,252 @@ def fromModel(str, dialect = "qrc"):
106
  str = str.replace("Хораз", "Гугурукку")
107
  str = str.replace("юзмез", "qум")
108
  str = str.replace("Юзмез", "Qум")
 
 
 
 
109
  str = str.replace("jиля", "jыла")
 
 
110
  str = str.replace("Jиля", "Jыла")
 
 
111
  str = str.replace("ярабий", "арабин")
112
  str = str.replace("арабий", "арабин")
113
  str = str.replace("Ярабий", "Арабин")
114
  str = str.replace("Арабий", "Арабин")
115
  str = str.replace("нтта", "нтда")
116
  str = str.replace("ртте", "ртде")
 
117
  str = str.replace("jамауат", "jамаgат")
118
- str = str.replace("jамаwат", "jамаgат")
 
 
119
  str = str.replace("Jамауат", "Jамаgат")
120
- str = str.replace("Jамаwат", "Jамаgат")
 
121
  str = str.replace("шуёх", "шох")
122
  str = str.replace("Шуёх", "Шох")
123
  str = str.replace("шёндю", "бусаgат")
 
124
  str = str.replace("Шёндю", "Бусаgат")
125
- str = str.replace("уgай", "оgай")
126
- str = str.replace("Уgай", "Оgай")
 
 
 
 
127
  # str = str.replace("терк", "тез")
128
- str = str.replace("саnа", "сенnе")
129
- str = str.replace("сеnе", "сенnе")
130
- str = str.replace("Саnа", "Сенnе")
131
- str = str.replace("Сеnе", "Сенnе")
132
- str = str.replace("маnа", "менnе")
133
- str = str.replace("меnе", "менnе")
134
- str = str.replace("Маnа", "Менnе")
135
- str = str.replace("Меnе", "Менnе")
136
- str = str.replace("аяq jол", "jахтана")
137
- str = str.replace("Аяq jол", "Jахтана")
138
- str = str.replace("сыbат", "сыфат")
139
- str = str.replace("Сыbат", "Сыфат")
140
- str = str.replace("b", "б")
141
- str = str.replace("q", "къ")
142
- str = str.replace("Q", "Къ")
143
- str = str.replace("g", "гъ")
144
- str = str.replace("G", "Гъ")
145
- str = str.replace("j", "дж")
146
- str = str.replace("J", "Дж")
147
- str = str.replace("w", "ў")
148
- str = str.replace("W", "Ў")
149
- str = str.replace("n", "нг")
150
- str = str.replace("N", "Нг")
151
- elif dialect == "hlm":
152
- str = str.replace("тюл", "тюйюл")
153
- str = str.replace("Тюл", "Тюйюл")
154
- str = str.replace("гылын qуш", "уку")
155
- str = str.replace("Гылын qуш", "Уку")
156
- str = str.replace("гугурукку", "хораз")
157
- str = str.replace("Гугурукку", "Хораз")
158
- str = str.replace("qум", "юзмез")
159
- str = str.replace("Qум", "Юзмез")
160
- str = str.replace("jыла", "jиля")
161
- str = str.replace("Jыла", "Jиля")
162
- str = str.replace("арабин", "ярабий")
163
- str = str.replace("арабий", "ярабий")
164
- str = str.replace("Арабин", "Ярабий")
165
- str = str.replace("Арабий", "Ярабий")
166
- str = str.replace("нтда", "нтта")
167
- str = str.replace("ртде", "ртте")
168
- str = str.replace("jамаgат", "jамаwат")
169
- str = str.replace("Jамаgат", "Jамаwат")
170
- str = str.replace("шох", "шуёх")
171
- str = str.replace("Шох", "Шуёх")
172
- str = str.replace("бусаgат", "шёндю")
173
- str = str.replace("Бусаgат", "Шёндю")
174
- str = str.replace("оgай", "уgай")
175
- str = str.replace("Оgай", "Уgай")
176
- str = str.replace("тез", "терк")
177
- str = str.replace("сенnе", "саnа")
178
- str = str.replace("сеnе", "саnа")
179
- str = str.replace("Сенnе", "Саnа")
180
- str = str.replace("Сеnе", "Саnа")
181
- str = str.replace("менnе", "маnа")
182
- str = str.replace("меnе", "маnа")
183
- str = str.replace("Менnе", "Маnа")
184
- str = str.replace("Меnе", "Маnа")
185
- str = str.replace("jахтана", "аяq jол")
186
- str = str.replace("Jахтана", "аяq jол")
187
- str = str.replace("хо", "хаw")
188
- str = str.replace("Хо", "Хаw")
189
- str = str.replace("сыbат", "сыфат")
190
- str = str.replace("Сыbат", "Сыфат")
191
- str = str.replace("b", "п")
192
- str = str.replace("q", "къ")
193
- str = str.replace("Q", "Къ")
194
- str = str.replace("g", "гъ")
195
- str = str.replace("G", "Гъ")
196
- str = str.replace("j", "ж")
197
- str = str.replace("J", "Ж")
198
- str = str.replace("w", "ў")
199
- str = str.replace("W", "Ў")
200
- str = str.replace("n", "нг")
201
- str = str.replace("N", "Нг")
202
- elif dialect == "mqr":
203
- str = str.replace("тюл", "тюйюл")
204
- str = str.replace("Тюл", "Тюйюл")
205
- str = str.replace("гылын qуш", "уку")
206
- str = str.replace("Гылын qуш", "Уку")
207
- str = str.replace("гугурукку", "хораз")
208
- str = str.replace("Гугурукку", "Хораз")
209
- str = str.replace("qум", "юзмез")
210
- str = str.replace("Qум", "Юзмез")
211
- str = str.replace("jыла", "jиля")
212
- str = str.replace("Jыла", "Jиля")
213
- str = str.replace("арабин", "ярабий")
214
- str = str.replace("арабий", "ярабий")
215
- str = str.replace("Арабин", "Ярабий")
216
- str = str.replace("Арабий", "Ярабий")
217
- str = str.replace("нтда", "нтта")
218
- str = str.replace("ртде", "ртте")
219
- str = str.replace("jамаgат", "жамаwат")
220
- str = str.replace("Jамаgат", "Жамаwат")
221
- str = str.replace("шох", "шуёх")
222
- str = str.replace("Шох", "Шуёх")
223
- str = str.replace("бусаgат", "шёндю")
224
- str = str.replace("Бусаgат", "Шёндю")
225
- str = str.replace("оgай", "уgай")
226
- str = str.replace("Оgай", "Уgай")
227
- str = str.replace("тез", "терк")
228
- str = str.replace("сенnе", "саnа")
229
- str = str.replace("сеnе", "саnа")
230
- str = str.replace("Сенnе", "Саnа")
231
- str = str.replace("Сеnе", "Саnа")
232
- str = str.replace("менnе", "маnа")
233
- str = str.replace("меnе", "маnа")
234
- str = str.replace("Менnе", "Маnа")
235
- str = str.replace("Меnе", "Маnа")
236
- str = str.replace("jахтана", "аяq jол")
237
- str = str.replace("Jахтана", "аяq jол")
238
- str = str.replace("хо", "хаw")
239
- str = str.replace("Хо", "Хаw")
240
- str = str.replace("сыbат", "сыфат")
241
- str = str.replace("Сыbат", "Сыфат")
242
- str = str.replace("b", "п")
243
- str = str.replace("q", "къ")
244
- str = str.replace("Q", "Къ")
245
- str = str.replace("g", "гъ")
246
- str = str.replace("G", "Гъ")
247
- str = str.replace("j", "з")
248
- str = str.replace("J", "З")
249
- str = str.replace("w", "ў")
250
- str = str.replace("W", "Ў")
251
- str = str.replace("n", "нг")
252
- str = str.replace("N", "Нг")
253
- str = str.replace("ч", "ц")
254
- str = str.replace("Ч", "Ц")
255
- str = str.replace("п", "ф")
256
- str = str.replace("П", "Ф")
257
- str = str.replace("къ|гъ", "х")
258
- return str
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
 
260
-
261
- def toModel(str):
262
- str = str.replace("дж", "j")
263
- str = str.replace("Дж", "J")
264
- str = str.replace("ДЖ", "J")
265
- str = str.replace("ж", "j")
266
- str = str.replace("Ж", "J")
267
- str = str.replace("себеп", "себеb")
268
- str = str.replace("себеб", "себеb")
269
- str = str.replace("Себеп", "Себеb")
270
- str = str.replace("Себеб", "Себеb")
271
- str = str.replace("тюйюл", "тюл")
272
- str = str.replace("Тюйюл", "Тюл")
273
- str = str.replace("уку", "гылын qуш")
274
- str = str.replace("Уку", "Гылын qуш")
275
- str = str.replace("хораз", "гугурукку")
276
- str = str.replace("Хораз", "Гугурукку")
277
- str = str.replace("юзмез", "qум")
278
- str = str.replace("Юзмез", "Qум")
279
- str = str.replace("арап", "араb")
280
- str = str.replace("араб", "араb")
281
- str = str.replace("Арап", "Араb")
282
- str = str.replace("Араб", "Араb")
283
- str = str.replace("jиля", "jыла")
284
- str = str.replace("jыла", "jыла")
285
- str = str.replace("jыла", "jыла")
286
- str = str.replace("Jиля", "Jыла")
287
- str = str.replace("Jыла", "Jыла")
288
- str = str.replace("Jыла", "Jыла")
289
- str = str.replace("ярабий", "арабин")
290
- str = str.replace("арабий", "арабин")
291
- str = str.replace("Ярабий", "Арабин")
292
- str = str.replace("Арабий", "Арабин")
293
- str = str.replace("нтта", "нтда")
294
- str = str.replace("ртте", "ртде")
295
- str = str.replace("jамагъат", "jамаgат")
296
- str = str.replace("jамауат", "jамаgат")
297
- str = str.replace("jамагъат", "jамаgат")
298
- str = str.replace("jамауат", "jамаgат")
299
- str = str.replace("Jамагъат", "Jамаgат")
300
- str = str.replace("Jамауат", "Jамаgат")
301
- str = str.replace("Jамагъат", "Jамаgат")
302
- str = str.replace("Jамаўат", "Jамаgат")
303
- str = str.replace("шуёх", "шох")
304
- str = str.replace("Шуёх", "Шох")
305
- str = str.replace("шёндю", "бусаgат")
306
- str = str.replace("бусагъат", "бусаgат")
307
- str = str.replace("Шёндю", "Бусаgат")
308
- str = str.replace("Бусагъат", "Бусаgат")
309
- str = str.replace("угъай", "оgай")
310
- str = str.replace("огъай", "оgай")
311
- str = str.replace("Угъай", "Оgай")
312
- str = str.replace("Огъай", "Оgай")
313
- # str = str.replace("терк", "тез")
314
- # str = str.replace("терк", "тез")
315
- str = str.replace("санга", "сенnе")
316
- str = str.replace("сенге", "сенnе")
317
- str = str.replace("сеннге", "сенnе")
318
- str = str.replace("Санга", "Сенnе")
319
- str = str.replace("Сеннге", "Сенnе")
320
- str = str.replace("Сенге", "Сенnе")
321
- str = str.replace("манга", "менnе")
322
- str = str.replace("меннге", "менnе")
323
- str = str.replace("менге", "менnе")
324
- str = str.replace("Манга", "Менnе")
325
- str = str.replace("Меннге", "Менnе")
326
- str = str.replace("Менге", "Менnе")
327
- str = str.replace("аякъ jол", "jахтана")
328
- str = str.replace("аякъ jол", "jахтана")
329
- str = str.replace("jахтана", "jахтана")
330
- str = str.replace("jахтана", "jахтана")
331
- str = str.replace("Аякъ jол", "Jахтана")
332
- str = str.replace("Аякъ jол", "Jахтана")
333
- str = str.replace("Jахтана", "Jахтана")
334
- str = str.replace("Jахтана", "Jахтана")
335
- str = str.replace("къамж", "qамыzh")
336
- str = str.replace("къамыж", "qамыzh")
337
- str = str.replace("Къамж", "Qамыzh")
338
- str = str.replace("Къамыж", "Qамыzh")
339
- str = str.replace("къымыж", "qымыzh")
340
- str = str.replace("къымыж", "qымыzh")
341
- str = str.replace("Къымыж", "Qымыzh")
342
- str = str.replace("Къымыж", "Qымыzh")
343
- str = str.replace("хау", "хо")
344
- str = str.replace("хаў", "хо")
345
- str = str.replace("Хау", "Хо")
346
- str = str.replace("Хаў", "Хо")
347
- str = str.replace("уа", "wa")
348
- str = str.replace("ўа", "wa")
349
- str = str.replace("Уа", "Wa")
350
- str = str.replace("Ўа", "Wa")
351
- str = str.replace("п", "b")
352
- str = str.replace("б", "b")
353
- str = str.replace("къ", "q")
354
- str = str.replace("Къ", "Q")
355
- str = str.replace("КЪ", "Q")
356
- str = str.replace("гъ", "g")
357
- str = str.replace("Гъ", "G")
358
- str = str.replace("ГЪ", "G")
359
- str = str.replace("ц", "ч")
360
- str = str.replace("Ц", "Ч")
361
- str = str.replace("ф", "п")
362
- str = str.replace("сыпат", "сыфат")
363
- str = str.replace("Сыпат", "Сыфат")
364
- str = str.replace("Ф", "П")
365
- str = str.replace("(?<=[аыоуэеиёюя])у(?=[аыоуэеиёюя])|(?<=[аыоуэеиёюя])ў(?=[аыоуэеиёюя])|(?<=[АЫОУЭЕИЁЮЯ])у(?=[АЫОУЭЕИЁЮЯ])|(?<=[АЫОУЭЕИЁЮЯ])ў(?=[АЫОУЭЕИЁЮЯ])", "w")
366
- str = str.replace("(?<=[аыоуэеиёюя])у|(?<=[аыоуэеиёюя])ў|(?<=[АЫОУЭЕИЁЮЯ])у|(?<=[АЫОУЭЕИЁЮЯ])ў", "w")
367
- # str = str.replace("у(?=[аыоуэеиёюя])|ў(?=[аыоуэеиёюя])|у(?=[АЫОУЭЕИЁЮЯ])|ў(?=[АЫОУЭЕИЁЮЯ])", "w")
368
- # str = str.replace("У(?=[аыоуэеиёюя])|Ў(?=[аыоуэеиёюя])|У(?=[АЫОУЭЕИЁЮЯ])|Ў(?=[АЫОУЭЕИЁЮЯ])", "W")
369
- str = str.replace("zh", "ж")
370
- str = str.replace("нг", "n")
371
- str = str.replace("Нг", " N")
372
- str = str.replace("НГ", " N")
373
- return str
374
 
375
- # 6. Translate function
376
- def translatePy(text, src_lang='rus_Cyrl', tgt_lang='krc_Cyrl',
377
- a=32, b=3, max_input_length=1024, num_beams=3, **kwargs
378
- ):
379
- """Turn a text or a list of texts into a list of translations"""
380
- tokenizer.src_lang = src_lang
381
- tokenizer.tgt_lang = tgt_lang
382
- inputs = tokenizer(
383
- text, return_tensors='pt', padding=True, truncation=True,
384
- max_length=max_input_length
385
- )
386
- #print(f'Inputs: {inputs}')
387
- result = model_translate.generate(
388
- **inputs.to(model_translate.device),
389
- forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
390
- max_new_tokens=int(a + b * inputs.input_ids.shape[1]),
391
- num_beams=num_beams, **kwargs
392
- )
393
- #print(f'Outputs: {result}')
394
- return tokenizer.batch_decode(result, skip_special_tokens=True)
395
-
396
-
397
- def translateDisp(text, from_, to, dialect):
398
- # print(from_)
399
- # print(to)
400
- # print(dialect)
401
- if dialect == "" or dialect is None:
402
- dialect = DIALECT.dialect[0] # "дж\ч"
403
- if from_ == "" or from_ is None:
404
- from_ = LANGUAGE.language[1] # "Русский язык"
405
- if to == "" or to is None:
406
- to = LANGUAGE.language[0] # "Къарачай-Малкъар тил"
407
-
408
- from_ = "".join(LANGUAGE[LANGUAGE.language == from_].token.to_list())
409
- to = "".join(LANGUAGE[LANGUAGE.language == to].token.to_list())
410
- dialect = "".join(DIALECT[DIALECT.dialect == dialect].short_name.to_list())
411
 
412
- print(f'Input text: {text} - Time: {datetime.now(tz=TZ)}')
413
-
414
- text = text.strip()
415
-
416
- if from_ == 'krc_Cyrl':
417
- text = toModel(text)
418
-
419
- # Разбиваем текст на предложения, сохраняя знаки препинания
420
- # .+?[.!?](?:\s|$): Захватывает предложения, которые заканчиваются точкой, восклицательным или вопросительным знаком.
421
- # |.+?(?:\n|$): Добавляет поддержку для разрыва строки (\n) или конца текста ($), если предложение не заканчивается знаком препинания.
422
- text = re.findall(r'.+?[.!?](?:\s|$)|.+?(?:\n|$)', text)
423
- # text бош эсе
424
- if len(text) == 0:
425
- text = ""
426
-
427
- #print(f'Split text: {text}')
428
-
429
- str_ = translatePy(text, src_lang = from_, tgt_lang = to)
430
 
431
- #print(f'Translated text: {str_}')
432
- str_ = ' '.join(str_).strip()
433
- #print(f'Jointed text: {str_}')
 
 
434
 
435
- if to == 'krc_Cyrl':
436
- str_ = fromModel(str_, dialect = dialect)
437
 
438
- print(f'Translated text: {str_} - Time: {datetime.now(tz=TZ)}')
439
-
440
- return str_
441
 
442
- # 7. Dictionary function
443
- def dictionaryDisp(text, from_):
444
 
445
- if from_ == "" or from_ is None:
446
- from_ = LANGUAGE.language[1] # "Русский язык"
447
 
448
- from_ = "".join(LANGUAGE[LANGUAGE.language == from_].token.to_list())
449
 
450
  text = text.strip()
451
  str_l = text.lower()
@@ -454,10 +552,10 @@ def dictionaryDisp(text, from_):
454
  df_from_to = pd.DataFrame()
455
  df_to_from = pd.DataFrame()
456
 
457
- if from_ == 'krc_Cyrl':
458
  df_from_to = dictionary_qm.copy()
459
  df_to_from = dictionary_ru.copy()
460
- elif from_ == 'rus_Cyrl':
461
  df_from_to = dictionary_ru.copy()
462
  df_to_from = dictionary_qm.copy()
463
 
@@ -483,7 +581,7 @@ def dictionaryDisp(text, from_):
483
  # len(sozluk)
484
 
485
 
486
- # 8. Voice function
487
  def tts(text):
488
  file_voice = ''.join(random.choices(string.ascii_letters, k=8))
489
  file_voice = f'{file_voice}.wav'
@@ -498,7 +596,9 @@ def tts(text):
498
 
499
  return file_voice
500
 
501
- # 9. Definition ui
 
 
502
  _title = "".join(NAMES[NAMES.id == "title"][SYSTEM_LANG].to_list())
503
  _type = "".join(NAMES[NAMES.id == "type"][SYSTEM_LANG].to_list())
504
  _from = "".join(NAMES[NAMES.id == "from"][SYSTEM_LANG].to_list())
@@ -523,16 +623,16 @@ with gr.Blocks() as demo:
523
  # choice_type = gr.Dropdown(
524
  # choices = TYPE[SYSTEM_LANG].to_list(), label=_type, value = TYPE[SYSTEM_LANG][0])
525
  translate_lang_input = gr.Dropdown(
526
- choices = LANGUAGE.language.to_list(), label=_from, value = LANGUAGE["language"][1])
527
 
528
  with gr.Column():
529
  with gr.Row():
530
  translate_lang_output = gr.Dropdown(
531
- choices = LANGUAGE.language.to_list(), label=_to, value = LANGUAGE["language"][0])
532
 
533
  dialect = gr.Dropdown(
534
  # choices = DIALECT.dialect.to_list(), label=_dialect, value = "дж\ч")
535
- choices = DIALECT.dialect.to_list(), label=_dialect, value = DIALECT["dialect"][0])
536
 
537
  with gr.Row():
538
  with gr.Column():
@@ -548,7 +648,7 @@ with gr.Blocks() as demo:
548
  with gr.Column():
549
  with gr.Row():
550
  dict_lang_input = gr.Dropdown(
551
- choices = LANGUAGE.language.to_list(), label=_from, value = LANGUAGE["language"][1])
552
 
553
 
554
  with gr.Row():
@@ -570,11 +670,12 @@ with gr.Blocks() as demo:
570
  tts_button = gr.Button(_sound, variant = 'primary')
571
 
572
 
573
- translate_button.click(translateDisp, inputs=[translate_text_input, translate_lang_input, translate_lang_output, dialect], outputs=[translate_text_output]) # text, from, to, dialect
574
  dict_button.click(dictionaryDisp, inputs=[dict_text_input, dict_lang_input], outputs=[dict_text_output]) # text, from
575
  tts_button.click(tts, inputs=[tts_text_input], outputs=[tts_text_output]) # text
576
 
577
  gr.Markdown(_annotation)
578
 
579
- # 10. Launch
580
- demo.launch()
 
 
28
  REPO_TTS_PATH = "snakers4/silero-models"
29
  MODEL_TTS_PATH = "silero_tts"
30
 
31
+ # LANGUAGE = pd.DataFrame({"language": ["Къарачай-Малкъар тил", "Русский язык"], "token": ["krc_Cyrl", "rus_Cyrl"]})
32
+ LANGUAGE = {"Къарачай-Малкъар тил": "krc_Cyrl", "Русский язык": "rus_Cyrl"}
33
+ # DIALECT = pd.DataFrame({"dialect": ["дж\ч", "ж\ч", "з\ц"], "short_name": ["qrc", "hlm", "mqr"]})
34
# Dialect label -> short dialect code (the codes `_fromModel` switches on).
# The backslashes are escaped explicitly: "\ч" and "\ц" are not valid escape
# sequences, so the unescaped spelling only produced the intended keys by
# accident and triggers an invalid-escape warning on recent Python versions.
DIALECT = {"дж\\ч": "qrc", "ж\\ч": "hlm", "з\\ц": "mqr"}
35
  TYPE = pd.DataFrame({"krc": ["Кёчюрюўчю", "Сёзлюк", "Сёлешиўчю"], "rus": ["Переводчик", "Словарь", "Озвучка"], "eng": ["Translator", "Dictionary", "Voice"], "tur": ["Çevirmen", "Sözlük", "Seslendirme"], "short_name": ["translator", "dictionary", "tts"]})
36
 
37
  SYSTEM_LANG = "rus"
 
44
  })
45
 
46
 
47
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
48
 
49
  device = torch.device(DEVICE)
50
 
 
75
  model_tts.to(device)
76
 
77
  # 4. Fix tokenizer
78
+ # def fixTokenizer(tokenizer, new_lang='krc_Cyrl'):
79
  # """
80
  # Add a new language token to the tokenizer vocabulary
81
  # (this should be done each time after its initialization)
 
96
 
97
  #fixTokenizer(tokenizer)
98
 
 
99
 
100
+ class Translator:
101
+ """
102
+ Class for translator NLLB-200.
103
+
104
+ Параметры:
105
+ - model: Модель
106
+ - tokenizer: Токенизатор
107
+
108
+ Функция translate алады:
109
+ - text (str): Текст
110
+ - src_lang (str): Тебреген тил
111
+ - tgt_lang (str): Тил таба
112
+ - dialect (int): Диалект
113
+
114
+ Чыгъарады:
115
+ - translated (str): Кёчюрюлгени
116
+
117
+ """
118
+ def __init__(self, tokenizer, model) -> None:
119
+ self.model = model
120
+ self.tokenizer = tokenizer
121
+ # Change letters
122
+ def _fromModel(self, str: str, dialect: str = "qrc") -> str:
123
+ if dialect == "qrc":
124
+ str = str.replace("тюйюл", "тюл")
125
+ str = str.replace("Тюйюл", "Тюл")
126
+ str = str.replace("уку", "гылын qуш")
127
+ str = str.replace("Уку", "Гылын qуш")
128
+ str = str.replace("хораз", "гугурукку")
129
+ str = str.replace("Хораз", "Гугурукку")
130
+ str = str.replace("юзмез", "qум")
131
+ str = str.replace("Юзмез", "Qум")
132
+ str = str.replace("jиля", "jыла")
133
+ str = str.replace("Jиля", "Jыла")
134
+ str = str.replace("ярабий", "арабин")
135
+ str = str.replace("арабий", "арабин")
136
+ str = str.replace("Ярабий", "Арабин")
137
+ str = str.replace("Арабий", "Арабин")
138
+ str = str.replace("нтта", "нтда")
139
+ str = str.replace("ртте", "ртде")
140
+ str = str.replace("jамауат", "jамаgат")
141
+ str = str.replace("jамаwат", "jамаgат")
142
+ str = str.replace("Jамауат", "Jамаgат")
143
+ str = str.replace("Jамаwат", "Jамаgат")
144
+ str = str.replace("шуёх", "шох")
145
+ str = str.replace("Шуёх", "Шох")
146
+ str = str.replace("шёндю", "бусаgат")
147
+ str = str.replace("Шёндю", "Бусаgат")
148
+ str = str.replace("уgай", "оgай")
149
+ str = str.replace("Уgай", "Оgай")
150
+ # str = str.replace("терк", "тез")
151
+ str = str.replace("саnа", "сенnе")
152
+ str = str.replace("сеnе", "сенnе")
153
+ str = str.replace("Саnа", "Сенnе")
154
+ str = str.replace("Сеnе", "Сенnе")
155
+ str = str.replace("маnа", "менnе")
156
+ str = str.replace("меnе", "менnе")
157
+ str = str.replace("Маnа", "Менnе")
158
+ str = str.replace("Меnе", "Менnе")
159
+ str = str.replace("аяq jол", "jахтана")
160
+ str = str.replace("Аяq jол", "Jахтана")
161
+ str = str.replace("сыbат", "сыфат")
162
+ str = str.replace("Сыbат", "Сыфат")
163
+ str = str.replace("b", "б")
164
+ str = str.replace("q", "къ")
165
+ str = str.replace("Q", "Къ")
166
+ str = str.replace("g", "гъ")
167
+ str = str.replace("G", "Гъ")
168
+ str = str.replace("j", "дж")
169
+ str = str.replace("J", "Дж")
170
+ str = str.replace("w", "ў")
171
+ str = str.replace("W", "Ў")
172
+ str = str.replace("n", "нг")
173
+ str = str.replace("N", "Нг")
174
+ elif dialect == "hlm":
175
+ str = str.replace("тюл", "тюйюл")
176
+ str = str.replace("Тюл", "Тюйюл")
177
+ str = str.replace("гылын qуш", "уку")
178
+ str = str.replace("Гылын qуш", "Уку")
179
+ str = str.replace("гугурукку", "хораз")
180
+ str = str.replace("Гугурукку", "Хораз")
181
+ str = str.replace("qум", "юзмез")
182
+ str = str.replace("Qум", "Юзмез")
183
+ str = str.replace("jыла", "jиля")
184
+ str = str.replace("Jыла", "Jиля")
185
+ str = str.replace("арабин", "ярабий")
186
+ str = str.replace("арабий", "ярабий")
187
+ str = str.replace("Арабин", "Ярабий")
188
+ str = str.replace("Арабий", "Ярабий")
189
+ str = str.replace("нтда", "нтта")
190
+ str = str.replace("ртде", "ртте")
191
+ str = str.replace("jамаgат", "jамаwат")
192
+ str = str.replace("Jамаgат", "Jамаwат")
193
+ str = str.replace("шох", "шуёх")
194
+ str = str.replace("Шох", "Шуёх")
195
+ str = str.replace("бусаgат", "шёндю")
196
+ str = str.replace("Бусаgат", "Шёндю")
197
+ str = str.replace("оgай", "уgай")
198
+ str = str.replace("Оgай", "Уgай")
199
+ str = str.replace("тез", "терк")
200
+ str = str.replace("сенnе", "саnа")
201
+ str = str.replace("сеnе", "саnа")
202
+ str = str.replace("Сенnе", "Саnа")
203
+ str = str.replace("Сеnе", "Саnа")
204
+ str = str.replace("менnе", "маnа")
205
+ str = str.replace("меnе", "маnа")
206
+ str = str.replace("Менnе", "Маnа")
207
+ str = str.replace("Меnе", "Маnа")
208
+ str = str.replace("jахтана", "аяq jол")
209
+ str = str.replace("Jахтана", "аяq jол")
210
+ str = str.replace("хо", "хаw")
211
+ str = str.replace("Хо", "Хаw")
212
+ str = str.replace("сыbат", "сыфат")
213
+ str = str.replace("Сыbат", "Сыфат")
214
+ str = str.replace("b", "п")
215
+ str = str.replace("q", "къ")
216
+ str = str.replace("Q", "Къ")
217
+ str = str.replace("g", "гъ")
218
+ str = str.replace("G", "Гъ")
219
+ str = str.replace("j", "ж")
220
+ str = str.replace("J", "Ж")
221
+ str = str.replace("w", "ў")
222
+ str = str.replace("W", "Ў")
223
+ str = str.replace("n", "нг")
224
+ str = str.replace("N", "Нг")
225
+ elif dialect == "mqr":
226
+ str = str.replace("тюл", "тюйюл")
227
+ str = str.replace("Тюл", "Тюйюл")
228
+ str = str.replace("гылын qуш", "уку")
229
+ str = str.replace("Гылын qуш", "Уку")
230
+ str = str.replace("гугурукку", "хораз")
231
+ str = str.replace("Гугурукку", "Хораз")
232
+ str = str.replace("qум", "юзмез")
233
+ str = str.replace("Qум", "Юзмез")
234
+ str = str.replace("jыла", "jиля")
235
+ str = str.replace("Jыла", "Jиля")
236
+ str = str.replace("арабин", "ярабий")
237
+ str = str.replace("арабий", "ярабий")
238
+ str = str.replace("Арабин", "Ярабий")
239
+ str = str.replace("Арабий", "Ярабий")
240
+ str = str.replace("нтда", "нтта")
241
+ str = str.replace("ртде", "ртте")
242
+ str = str.replace("jамаgат", "жамаwат")
243
+ str = str.replace("Jамаgат", "Жамаwат")
244
+ str = str.replace("шох", "шуёх")
245
+ str = str.replace("Шох", "Шуёх")
246
+ str = str.replace("бусаgат", "шёндю")
247
+ str = str.replace("Бусаgат", "Шёндю")
248
+ str = str.replace("оgай", "уgай")
249
+ str = str.replace("Оgай", "Уgай")
250
+ str = str.replace("тез", "терк")
251
+ str = str.replace("сенnе", "саnа")
252
+ str = str.replace("сеnе", "саnа")
253
+ str = str.replace("Сенnе", "Саnа")
254
+ str = str.replace("Сеnе", "Саnа")
255
+ str = str.replace("менnе", "маnа")
256
+ str = str.replace("меnе", "маnа")
257
+ str = str.replace("Менnе", "Маnа")
258
+ str = str.replace("Меnе", "Маnа")
259
+ str = str.replace("jахтана", "аяq jол")
260
+ str = str.replace("Jахтана", "аяq jол")
261
+ str = str.replace("хо", "хаw")
262
+ str = str.replace("Хо", "Хаw")
263
+ str = str.replace("сыbат", "сыфат")
264
+ str = str.replace("Сыbат", "Сыфат")
265
+ str = str.replace("b", "п")
266
+ str = str.replace("q", "къ")
267
+ str = str.replace("Q", "Къ")
268
+ str = str.replace("g", "гъ")
269
+ str = str.replace("G", "Гъ")
270
+ str = str.replace("j", "з")
271
+ str = str.replace("J", "З")
272
+ str = str.replace("w", "ў")
273
+ str = str.replace("W", "Ў")
274
+ str = str.replace("n", "нг")
275
+ str = str.replace("N", "Нг")
276
+ str = str.replace("ч", "ц")
277
+ str = str.replace("Ч", "Ц")
278
+ str = str.replace("п", "ф")
279
+ str = str.replace("П", "Ф")
280
+ str = str.replace("къ|гъ", "х")
281
+ return str
282
+
283
+ def _toModel(self, str: str) -> str:
284
+ str = str.replace("дж", "j")
285
+ str = str.replace("Дж", "J")
286
+ str = str.replace("ДЖ", "J")
287
+ str = str.replace("ж", "j")
288
+ str = str.replace("Ж", "J")
289
+ str = str.replace("себеп", "себеb")
290
+ str = str.replace("себеб", "себеb")
291
+ str = str.replace("Себеп", "Себеb")
292
+ str = str.replace("Себеб", "Себеb")
293
  str = str.replace("тюйюл", "тюл")
294
  str = str.replace("Тюйюл", "Тюл")
295
  str = str.replace("уку", "гылын qуш")
 
298
  str = str.replace("Хораз", "Гугурукку")
299
  str = str.replace("юзмез", "qум")
300
  str = str.replace("Юзмез", "Qум")
301
+ str = str.replace("арап", "араb")
302
+ str = str.replace("араб", "араb")
303
+ str = str.replace("Арап", "Араb")
304
+ str = str.replace("Араб", "Араb")
305
  str = str.replace("jиля", "jыла")
306
+ str = str.replace("jыла", "jыла")
307
+ str = str.replace("jыла", "jыла")
308
  str = str.replace("Jиля", "Jыла")
309
+ str = str.replace("Jыла", "Jыла")
310
+ str = str.replace("Jыла", "Jыла")
311
  str = str.replace("ярабий", "арабин")
312
  str = str.replace("арабий", "арабин")
313
  str = str.replace("Ярабий", "Арабин")
314
  str = str.replace("Арабий", "Арабин")
315
  str = str.replace("нтта", "нтда")
316
  str = str.replace("ртте", "ртде")
317
+ str = str.replace("jамагъат", "jамаgат")
318
  str = str.replace("jамауат", "jамаgат")
319
+ str = str.replace("jамагъат", "jамаgат")
320
+ str = str.replace("jамауат", "jамаgат")
321
+ str = str.replace("Jамагъат", "Jамаgат")
322
  str = str.replace("Jамауат", "Jамаgат")
323
+ str = str.replace("Jамагъат", "Jамаgат")
324
+ str = str.replace("Jамаўат", "Jамаgат")
325
  str = str.replace("шуёх", "шох")
326
  str = str.replace("Шуёх", "Шох")
327
  str = str.replace("шёндю", "бусаgат")
328
+ str = str.replace("бусагъат", "бусаgат")
329
  str = str.replace("Шёндю", "Бусаgат")
330
+ str = str.replace("Бусагъат", "Бусаgат")
331
+ str = str.replace("угъай", "оgай")
332
+ str = str.replace("огъай", "оgай")
333
+ str = str.replace("Угъай", "Оgай")
334
+ str = str.replace("Огъай", "Оgай")
335
+ # str = str.replace("терк", "тез")
336
  # str = str.replace("терк", "тез")
337
+ str = str.replace("санга", "сенnе")
338
+ str = str.replace("сенге", "сенnе")
339
+ str = str.replace("сеннге", "сенnе")
340
+ str = str.replace("Санга", "Сенnе")
341
+ str = str.replace("Сеннге", "Сенnе")
342
+ str = str.replace("Сенге", "Сенnе")
343
+ str = str.replace("манга", "менnе")
344
+ str = str.replace("меннге", "менnе")
345
+ str = str.replace("менге", "менnе")
346
+ str = str.replace("Манга", "Менnе")
347
+ str = str.replace("Меннге", "Менnе")
348
+ str = str.replace("Менге", "Менnе")
349
+ str = str.replace("аякъ jол", "jахтана")
350
+ str = str.replace("аякъ jол", "jахтана")
351
+ str = str.replace("jахтана", "jахтана")
352
+ str = str.replace("jахтана", "jахтана")
353
+ str = str.replace("Аякъ jол", "Jахтана")
354
+ str = str.replace("Аякъ jол", "Jахтана")
355
+ str = str.replace("Jахтана", "Jахтана")
356
+ str = str.replace("Jахтана", "Jахтана")
357
+ str = str.replace("къамж", "qамыzh")
358
+ str = str.replace("къамыж", "qамыzh")
359
+ str = str.replace("Къамж", "Qамыzh")
360
+ str = str.replace("Къамыж", "Qамыzh")
361
+ str = str.replace("къымыж", "qымыzh")
362
+ str = str.replace("къымыж", "qымыzh")
363
+ str = str.replace("Къымыж", "Qымыzh")
364
+ str = str.replace("Къымыж", "Qымыzh")
365
+ str = str.replace("хау", "хо")
366
+ str = str.replace("хаў", "хо")
367
+ str = str.replace("Хау", "Хо")
368
+ str = str.replace("Хаў", "Хо")
369
+ str = str.replace("уа", "wa")
370
+ str = str.replace("ўа", "wa")
371
+ str = str.replace("Уа", "Wa")
372
+ str = str.replace("Ўа", "Wa")
373
+ str = str.replace("п", "b")
374
+ str = str.replace("б", "b")
375
+ str = str.replace("къ", "q")
376
+ str = str.replace("Къ", "Q")
377
+ str = str.replace("КЪ", "Q")
378
+ str = str.replace("гъ", "g")
379
+ str = str.replace("Гъ", "G")
380
+ str = str.replace("ГЪ", "G")
381
+ str = str.replace("ц", "ч")
382
+ str = str.replace("Ц", "Ч")
383
+ str = str.replace("ф", "п")
384
+ str = str.replace("сыпат", "сыфат")
385
+ str = str.replace("Сыпат", "Сыфат")
386
+ str = str.replace("Ф", "П")
387
+ str = str.replace("(?<=[аыоуэеиёюя])у(?=[аыоуэеиёюя])|(?<=[аыоуэеиёюя])ў(?=[аыоуэеиёюя])|(?<=[АЫОУЭЕИЁЮЯ])у(?=[АЫОУЭЕИЁЮЯ])|(?<=[АЫОУЭЕИЁЮЯ])ў(?=[АЫОУЭЕИЁЮЯ])", "w")
388
+ str = str.replace("(?<=[аыоуэеиёюя])у|(?<=[аыоуэеиёюя])ў|(?<=[АЫОУЭЕИЁЮЯ])у|(?<=[АЫОУЭЕИЁЮЯ])ў", "w")
389
+ # str = str.replace("у(?=[аыоуэеиёюя])|ў(?=[аыоуэеиёюя])|у(?=[АЫОУЭЕИЁЮЯ])|ў(?=[АЫОУЭЕИЁЮЯ])", "w")
390
+ # str = str.replace("У(?=[аыоуэеиёюя])|Ў(?=[аыоуэеиёюя])|У(?=[АЫОУЭЕИЁЮЯ])|Ў(?=[АЫОУЭЕИЁЮЯ])", "W")
391
+ str = str.replace("zh", "ж")
392
+ str = str.replace("нг", "n")
393
+ str = str.replace("Нг", " N")
394
+ str = str.replace("НГ", " N")
395
+ return str
396
+
397
+ # structure
398
+ def _prepareTextAndStructure(self, text: str) -> tuple:
399
+ """
400
+ The input text is divided into sentences, while maintaining the structure
401
+ """
402
+
403
+ # Разбиваем текст на предложения, сохраняя знаки препинания
404
+ # .+?[.!?।ฯ؟](?:\s|$): Захватывает предложения, которые заканчиваются
405
+ # точкой, восклицательным или вопросительным знаком.
406
+
407
+ # |.+?(?:\n|$): Добавляет поддержку для разрыва строки (\n) или конца текста ($),
408
+ # если предложение не заканчивается знаком препинания.
409
+ segments = re.findall(pattern=r".+?[.!?।ฯ؟](?:\s|$)|.*?(?:\n|$)", string=text)
410
+
411
+ # Если последний элемент пустой, то его удаляем
412
+ if not segments[-1]:
413
+ segments = segments[:-1]
414
+
415
+ # Склеиваем разорванные предложения
416
+ merged_segments = []
417
+ buffer = ""
418
+
419
+ for i, segment in enumerate(segments):
420
+ # Проверяем, заканчивается ли текущий сегмент на .!? или пуст
421
+ if buffer:
422
+ buffer += " " + segment
423
+ else:
424
+ buffer = segment
425
+
426
+ # Если сегмент не заканчивается на .!? и следующий начинается с маленькой буквы
427
+ if ( # noqa: R507
428
+ not re.search(pattern=r"[.!?।ฯ؟](?:\s|$)", string=segment) # noqa: ECE001
429
+ and i + 1 < len(segments)
430
+ and segments[i + 1].strip()
431
+ and ((segments[i + 1].strip()[0].islower()) or (segments[i + 1].strip()[0] in ["'", '"']))
432
+ ):
433
+ continue # Склеиваем с следующим сегментом
434
+ else:
435
+ merged_segments.append(buffer)
436
+ buffer = ""
437
+
438
+ # Удаляем пустые сегменты и сохраняем пробелы
439
+ original_structure = []
440
+ for segment in merged_segments:
441
+ match = re.match(pattern=r"^(\s*)(.*?)(\s*)$", string=segment, flags=re.DOTALL)
442
+ if match:
443
+ original_structure.append((match.group(1), match.group(2), match.group(3)))
444
+
445
+ # Токенизируем только текстовые части сегментов
446
+ texts_to_translate = [seg[1] for seg in original_structure if seg[1].strip()]
447
+
448
+ return texts_to_translate, original_structure
449
+
450
+ def _recoverTranslatedToStructure(self, translated_texts: str, original_structure: list) -> str:
451
+ """
452
+ Translated sentences are embedded in the structure of the original text
453
+ """
454
+ # Восстанавливаем исходную структуру текста
455
+ translated_segments = []
456
+ translated_index = 0
457
+ for seg in original_structure:
458
+ if seg[1].strip(): # Если сегмент был переведён
459
+ translated_segments.append(f"{seg[0]}{translated_texts[translated_index]}{seg[2]}")
460
+ translated_index += 1
461
+ else: # Если сегмент был пустым, оставляем его как есть
462
+ translated_segments.append(f"{seg[0]}{seg[1]}{seg[2]}")
463
+
464
+ return "".join(translated_segments)
465
+
466
+
467
+ # Translate function
468
def _translate(self, text: list | str, src_lang: str = 'rus_Cyrl', tgt_lang: str = 'krc_Cyrl',
               a: int = 32, b: int = 3, max_input_length: int = 1024, num_beams: int = 3, **kwargs
               ) -> list:
    """
    Turn a text or a list of texts into a list of translations.

    The tokenizer's ``src_lang``/``tgt_lang`` are set from the arguments, the
    input is tokenized (padded, truncated to ``max_input_length``) and passed
    to ``self.model.generate`` with ``tgt_lang`` as the forced first token.
    The generation budget is ``a + b * L`` new tokens, where ``L`` is the
    second dimension of the tokenized ``input_ids``. Extra ``kwargs`` are
    forwarded to ``generate``.
    """
    tokenizer = self.tokenizer
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang

    encoded = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_input_length,
    )

    # Move the batch to wherever the model lives before generating.
    on_device = encoded.to(self.model.device)
    target_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)
    new_token_budget = int(a + b * encoded.input_ids.shape[1])

    generated = self.model.generate(
        **on_device,
        forced_bos_token_id=target_token_id,
        max_new_tokens=new_token_budget,
        num_beams=num_beams,
        **kwargs,
    )
    return tokenizer.batch_decode(generated, skip_special_tokens=True)
487
+
488
def translate(self, text: str, src_lang: str | None = None, tgt_lang: str | None = None, dialect: str | None = None) -> str:
    """
    Translate ``text`` and return the result with the original layout kept.

    Args:
        text: Text to translate. ``None`` is treated as an empty string.
        src_lang: Display name of the source language (a key of ``LANGUAGE``).
            Empty/``None`` falls back to the second key of ``LANGUAGE``.
        tgt_lang: Display name of the target language (a key of ``LANGUAGE``).
            Empty/``None`` falls back to the first key of ``LANGUAGE``.
        dialect: Display name of the dialect (a key of ``DIALECT``).
            Empty/``None`` falls back to the first key of ``DIALECT``.

    Returns:
        The translated text. When the source is ``krc_Cyrl`` the input is
        first passed through ``_toModel``; when the target is ``krc_Cyrl``
        the output is passed through ``_fromModel`` with the chosen dialect.

    Raises:
        KeyError: If a given language or dialect name is not a known key.
    """
    # Fall back to defaults when the UI sends nothing for a selector.
    if not dialect:
        dialect = list(DIALECT.keys())[0]
    if not src_lang:
        src_lang = list(LANGUAGE.keys())[1]
    if not tgt_lang:
        tgt_lang = list(LANGUAGE.keys())[0]

    # Map display names to the codes used by the model / post-processing.
    src_lang = LANGUAGE[src_lang]
    tgt_lang = LANGUAGE[tgt_lang]
    dialect = DIALECT[dialect]

    print(f'Input text: {text} - Time: {datetime.now(tz=TZ)}')

    # Guard against a missing value so .strip() cannot fail on None.
    text = (text or "").strip()

    if src_lang == 'krc_Cyrl':
        text = self._toModel(text)

    # Split the text into sentences while remembering the surrounding whitespace.
    texts_to_translate, original_structure = self._prepareTextAndStructure(text=text)

    # If the text is empty there is nothing to translate: skip the model call
    # instead of generating for a dummy empty sentence whose output would be
    # discarded anyway (the structure then has no segment to fill).
    if texts_to_translate:
        translated_texts = self._translate(
            text=texts_to_translate, src_lang=src_lang, tgt_lang=tgt_lang
        )
    else:
        translated_texts = []

    translated = self._recoverTranslatedToStructure(
        translated_texts=translated_texts, original_structure=original_structure
    )

    if tgt_lang == 'krc_Cyrl':
        translated = self._fromModel(str=translated, dialect=dialect)

    print(f'Translated text: {translated} - Time: {datetime.now(tz=TZ)}')

    return translated
539
 
540
+ # Dictionary function
541
+ def dictionaryDisp(text, src_lang):
542
 
543
+ if src_lang == "" or src_lang is None:
544
+ src_lang = list(LANGUAGE.keys())[1] # "Русский язык"
545
 
546
+ src_lang = LANGUAGE[src_lang]
547
 
548
  text = text.strip()
549
  str_l = text.lower()
 
552
  df_from_to = pd.DataFrame()
553
  df_to_from = pd.DataFrame()
554
 
555
+ if src_lang == 'krc_Cyrl':
556
  df_from_to = dictionary_qm.copy()
557
  df_to_from = dictionary_ru.copy()
558
+ elif src_lang == 'rus_Cyrl':
559
  df_from_to = dictionary_ru.copy()
560
  df_to_from = dictionary_qm.copy()
561
 
 
581
  # len(sozluk)
582
 
583
 
584
+ # Voice function
585
  def tts(text):
586
  file_voice = ''.join(random.choices(string.ascii_letters, k=8))
587
  file_voice = f'{file_voice}.wav'
 
596
 
597
  return file_voice
598
 
599
+ # 5. Definition ui
600
+ translator = Translator(tokenizer=tokenizer, model=model_translate)
601
+
602
  _title = "".join(NAMES[NAMES.id == "title"][SYSTEM_LANG].to_list())
603
  _type = "".join(NAMES[NAMES.id == "type"][SYSTEM_LANG].to_list())
604
  _from = "".join(NAMES[NAMES.id == "from"][SYSTEM_LANG].to_list())
 
623
  # choice_type = gr.Dropdown(
624
  # choices = TYPE[SYSTEM_LANG].to_list(), label=_type, value = TYPE[SYSTEM_LANG][0])
625
  translate_lang_input = gr.Dropdown(
626
+ choices = list(LANGUAGE.keys()), label=_from, value = list(LANGUAGE.keys())[1])
627
 
628
  with gr.Column():
629
  with gr.Row():
630
  translate_lang_output = gr.Dropdown(
631
+ choices = list(LANGUAGE.keys()), label=_to, value = list(LANGUAGE.keys())[0])
632
 
633
  dialect = gr.Dropdown(
634
  # choices = DIALECT.dialect.to_list(), label=_dialect, value = "дж\ч")
635
+ choices = list(DIALECT.keys()), label=_dialect, value = list(DIALECT.keys())[0])
636
 
637
  with gr.Row():
638
  with gr.Column():
 
648
  with gr.Column():
649
  with gr.Row():
650
  dict_lang_input = gr.Dropdown(
651
+ choices = list(LANGUAGE.keys()), label=_from, value = list(LANGUAGE.keys())[1])
652
 
653
 
654
  with gr.Row():
 
670
  tts_button = gr.Button(_sound, variant = 'primary')
671
 
672
 
673
+ translate_button.click(translator.translate, inputs=[translate_text_input, translate_lang_input, translate_lang_output, dialect], outputs=[translate_text_output]) # text, from, to, dialect
674
  dict_button.click(dictionaryDisp, inputs=[dict_text_input, dict_lang_input], outputs=[dict_text_output]) # text, from
675
  tts_button.click(tts, inputs=[tts_text_input], outputs=[tts_text_output]) # text
676
 
677
  gr.Markdown(_annotation)
678
 
679
+ # 6. Launch
680
+ demo.launch()
681
+ # demo.launch(inbrowser=True)