Spaces:
Running
Running
# 1. Libraries | |
from datasets import load_dataset | |
import gradio as gr | |
import torch | |
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer | |
import pandas as pd | |
import random | |
import string | |
import re | |
from datetime import datetime | |
import pytz | |
# 2. Constants
# Translation: Hugging Face id of the fine-tuned NLLB checkpoint.
MODEL_TRANSLATE_PATH = 'TSjB/NLLB-201-600M-QM-V2'

# Dictionary: Hugging Face dataset id and the row cap applied to each of the
# three lookups performed by dictionaryDisp.
DATA_DICTIONARY_PATH = "TSjB/dictionary_krc_rus"
OUTPUT_ROW_BY_EVERY_DICTIONARY = 15

# TTS: arguments passed to torch.hub.load / save_wav for the Silero model.
LANGUAGE_KRC_TTS = 'cyrillic'
MODEL_ID_KRC_TTS = 'v4_cyrillic'
SAMPLE_RATE_TTS = 48000
SPEAKER_KRC_TTS = 'b_krc'
REPO_TTS_PATH = "snakers4/silero-models"
MODEL_TTS_PATH = "silero_tts"

# UI language name -> NLLB language token.
# LANGUAGE = pd.DataFrame({"language": ["Къарачай-Малкъар тил", "Русский язык"], "token": ["krc_Cyrl", "rus_Cyrl"]})
LANGUAGE = {"Къарачай-Малкъар тил": "krc_Cyrl", "Русский язык": "rus_Cyrl"}

# UI dialect label -> short dialect code used by Translator._fromModel.
# The labels contain a literal backslash; it is written as "\\" because a bare
# backslash followed by a Cyrillic letter is an invalid escape sequence
# (SyntaxWarning on recent Python versions). The runtime strings are unchanged.
DIALECT = {"дж\\ч": "qrc", "ж\\ч": "hlm", "з\\ц": "mqr"}

# Tab captions per interface language; row order is translator, dictionary, tts.
TYPE = pd.DataFrame({
    "krc": ["Кёчюрюўчю", "Сёзлюк", "Сёлешиўчю"],
    "rus": ["Переводчик", "Словарь", "Озвучка"],
    "eng": ["Translator", "Dictionary", "Voice"],
    "tur": ["Çevirmen", "Sözlük", "Seslendirme"],
    "short_name": ["translator", "dictionary", "tts"],
})

# Column of TYPE / NAMES used for every caption shown in the interface.
SYSTEM_LANG = "rus"
# Interface captions: one row per caption id, one column per interface language.
# Rows are looked up by `id` and the column is chosen with SYSTEM_LANG.
# Fix: the English annotation contained "[Ali Berberov] (https://...)"; the space
# between "]" and "(" prevents Markdown from rendering it as a link.
NAMES = pd.DataFrame({
    "id": [
        "title",
        "type",
        "from",
        "to",
        "your_sent",
        "your_sent_tts",
        "transl_sent",
        "dialect",
        "translate",
        "annotation",
        "word_absence",
        "sound",
    ],
    "krc": [
        "# Къарачай-Малкъар сёзлюк бла кёчюрюўчю",
        "Тюрлюсю",
        "тилден",
        "тилге",
        "Мында джаз...",
        "Къарачай-Малкъарча мында джаз...",
        "Кёчюрюлгени",
        "Къарачай-Малкъарны диалекти",
        "Кёчюр",
        "Къарачай-малкъар, орус тиллени арасында биринчи кёчюрюўчюдю. Сёзлюк да эмда Къарачай-Малкъар сёлешиўчю ичине салыннганды.\n\n[Богдан Теўуналаны](https://t.me/bogdan_tewunalany), [Али Берберлени](https://t.me/ali_berberov) къурагъандыла\n\nСоинвестированиени эмда спонсорлукъ болушлукъну юсюнден [Али Берберовгъа](https://t.me/ali_berberov) соругъуз",
        "Сорулгъаны сёзлюкде табылмагъанды.",
        "Сёлешдир",
    ],
    "rus": [
        "# Карачаево-балкарский словарь и переводчик",
        "Тип",
        "из",
        "на",
        "Напишите здесь...",
        "Напиши здесь по-карачаево-балкарски...",
        "Переведённый текст",
        "Карачаево-балкарский диалект",
        "Перевести",
        "Первый переводчик между карачаево-балкарским и русским языками. Встроен словарь для отдельных слов или коротких фраз и озвучка карачаево-балкарского текста.\n\nРазработчики: [Богдан Теунаев](https://t.me/bogdan_tewunalany), [Али Берберов](https://t.me/ali_berberov)\n\nПо вопросам соинвестирования и спонсорской поддержки обращайтесь к [Али Берберову](https://t.me/ali_berberov)",
        "Запрашиваемое в словаре не найдено.",
        "Озвучить",
    ],
    "tur": [
        "# Karaçayca-Balkarca sözlük ve çevirmen",
        "Tür",
        "dilden",
        "dile",
        "Buraya yaz...",
        "Buraya Karaçay-Balkarca yaz...",
        "Çevrilmiş metin burada",
        "Karaçay-Malkar lehçesi",
        "Tercüme edin",
        "Karaçay-Balkarca ve Rusça dilleri arasındaki ilk çevirmen. Tek tek kelimeler veya kısa ifadeler için bir sözlük ve Karaçay-Balkar metninin seslendirmesi de yerleşiktir.\n\nGeliştiriciler: [Bogdan Tewunalanı](https://t.me/bogdan_tewunalany), [Ali Berberov](https://t.me/ali_berberov)\n\nOrtak yatırım ve sponsorluk ile ilgili sorularınız için [Ali Berberov](https://t.me/ali_berberov) ile iletişime geçin",
        "Sorge sözlükte bulunmuyor.",
        "Ses vermek",
    ],
    "eng": [
        "# Qarachay-Malqar dictionary and translator",
        "Type",
        "from",
        "to",
        "Write here...",
        "Write here in Karachay-Balkar...",
        "Translated text is here",
        "Qarachay-Malqar dialect",
        "Translate",
        "The first translator between Qarachay-Malqar and Russian languages. There is also a built-in dictionary for individual words or short phrases and voice acting of the Karachay-Balkar text.\n\nDevelopers: [Bogdan Tewunalany](https://t.me/bogdan_tewunalany), [Ali Berberov](https://t.me/ali_berberov)\n\nFor co-investment and sponsorship, please contact [Ali Berberov](https://t.me/ali_berberov)",
        "The requested was not found in the dictionary.",
        "Voice over",
    ],
})
# Run on the GPU when one is visible to torch, otherwise on the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(DEVICE)

# Timezone used for the timestamps printed by Translator.translate.
TZ = pytz.timezone('Europe/Moscow')

# 3. Upload
# Dictionary: load the 'train' split into a DataFrame, upper-case the headword
# column and add lower-cased copies of headword and definition for
# case-insensitive matching, then split the rows by the `til` (language) column.
dictionary = load_dataset(DATA_DICTIONARY_PATH)
dictionary = pd.DataFrame(dictionary['train'])
dictionary["soz"] = dictionary.soz.str.upper()
dictionary["soz_l"] = dictionary.soz.str.lower()
dictionary["belgi_l"] = dictionary.belgi.str.lower()
dictionary_qm = dictionary[dictionary.til == "krc"]
dictionary_ru = dictionary[dictionary.til == "rus"]

# Translation
tokenizer = NllbTokenizer.from_pretrained(MODEL_TRANSLATE_PATH)
model_translate = AutoModelForSeq2SeqLM.from_pretrained(MODEL_TRANSLATE_PATH)
# Place the translation model on the same device as the TTS model; previously
# it always stayed on the CPU even when CUDA was available. Translator._translate
# already moves its inputs to `model.device`, so no caller changes are needed.
model_translate.to(device)
model_translate.eval()  # turn off training mode

# TTS
model_tts, _ = torch.hub.load(repo_or_dir=REPO_TTS_PATH,
                              model=MODEL_TTS_PATH,
                              language=LANGUAGE_KRC_TTS,
                              speaker=MODEL_ID_KRC_TTS)
model_tts.to(device)
# 4. Fix tokenizer | |
# def fixTokenizer(tokenizer, new_lang='krc_Cyrl'): | |
# """ | |
# Add a new language token to the tokenizer vocabulary | |
# (this should be done each time after its initialization) | |
# """ | |
# old_len = len(tokenizer) - int(new_lang in tokenizer.added_tokens_encoder) | |
# tokenizer.lang_code_to_id[new_lang] = old_len-1 | |
# tokenizer.id_to_lang_code[old_len-1] = new_lang | |
# # always move "mask" to the last position | |
# tokenizer.fairseq_tokens_to_ids["<mask>"] = len(tokenizer.sp_model) + len(tokenizer.lang_code_to_id) + tokenizer.fairseq_offset | |
# | |
# tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id) | |
# tokenizer.fairseq_ids_to_tokens = {v: k for k, v in tokenizer.fairseq_tokens_to_ids.items()} | |
# if new_lang not in tokenizer._additional_special_tokens: | |
# tokenizer._additional_special_tokens.append(new_lang) | |
# # clear the added token encoder; otherwise a new token may end up there by mistake | |
# tokenizer.added_tokens_encoder = {} | |
# tokenizer.added_tokens_decoder = {} | |
#fixTokenizer(tokenizer) | |
class Translator: | |
""" | |
Class for translator NLLB-200. | |
Параметры: | |
- model: Модель | |
- tokenizer: Токенизатор | |
Функция translate алады: | |
- text (str): Текст | |
- src_lang (str): Тебреген тил | |
- tgt_lang (str): Тил таба | |
- dialect (int): Диалект | |
Чыгъарады: | |
- translated (str): Кёчюрюлгени | |
""" | |
def __init__(self, tokenizer, model) -> None: | |
self.model = model | |
self.tokenizer = tokenizer | |
# Change letters | |
def _fromModel(self, str: str, dialect: str = "qrc") -> str: | |
if dialect == "qrc": | |
str = str.replace("тюйюл", "тюл") | |
str = str.replace("Тюйюл", "Тюл") | |
str = str.replace("уку", "гылын qуш") | |
str = str.replace("Уку", "Гылын qуш") | |
str = str.replace("хораз", "гугурукку") | |
str = str.replace("Хораз", "Гугурукку") | |
str = str.replace("юзмез", "qум") | |
str = str.replace("Юзмез", "Qум") | |
str = str.replace("jиля", "jыла") | |
str = str.replace("Jиля", "Jыла") | |
str = str.replace("ярабий", "арабин") | |
str = str.replace("арабий", "арабин") | |
str = str.replace("Ярабий", "Арабин") | |
str = str.replace("Арабий", "Арабин") | |
str = str.replace("нтта", "нтда") | |
str = str.replace("ртте", "ртде") | |
str = str.replace("jамауат", "jамаgат") | |
str = str.replace("jамаwат", "jамаgат") | |
str = str.replace("Jамауат", "Jамаgат") | |
str = str.replace("Jамаwат", "Jамаgат") | |
str = str.replace("шуёх", "шох") | |
str = str.replace("Шуёх", "Шох") | |
str = str.replace("шёндю", "бусаgат") | |
str = str.replace("Шёндю", "Бусаgат") | |
str = str.replace("уgай", "оgай") | |
str = str.replace("Уgай", "Оgай") | |
# str = str.replace("терк", "тез") | |
str = str.replace("саnа", "сенnе") | |
str = str.replace("сеnе", "сенnе") | |
str = str.replace("Саnа", "Сенnе") | |
str = str.replace("Сеnе", "Сенnе") | |
str = str.replace("маnа", "менnе") | |
str = str.replace("меnе", "менnе") | |
str = str.replace("Маnа", "Менnе") | |
str = str.replace("Меnе", "Менnе") | |
str = str.replace("аяq jол", "jахтана") | |
str = str.replace("Аяq jол", "Jахтана") | |
str = str.replace("сыbат", "сыфат") | |
str = str.replace("Сыbат", "Сыфат") | |
str = str.replace("b", "б") | |
str = str.replace("q", "къ") | |
str = str.replace("Q", "Къ") | |
str = str.replace("g", "гъ") | |
str = str.replace("G", "Гъ") | |
str = str.replace("j", "дж") | |
str = str.replace("J", "Дж") | |
str = str.replace("w", "ў") | |
str = str.replace("W", "Ў") | |
str = str.replace("n", "нг") | |
str = str.replace("N", "Нг") | |
elif dialect == "hlm": | |
str = str.replace("тюл", "тюйюл") | |
str = str.replace("Тюл", "Тюйюл") | |
str = str.replace("гылын qуш", "уку") | |
str = str.replace("Гылын qуш", "Уку") | |
str = str.replace("гугурукку", "хораз") | |
str = str.replace("Гугурукку", "Хораз") | |
str = str.replace("qум", "юзмез") | |
str = str.replace("Qум", "Юзмез") | |
str = str.replace("jыла", "jиля") | |
str = str.replace("Jыла", "Jиля") | |
str = str.replace("арабин", "ярабий") | |
str = str.replace("арабий", "ярабий") | |
str = str.replace("Арабин", "Ярабий") | |
str = str.replace("Арабий", "Ярабий") | |
str = str.replace("нтда", "нтта") | |
str = str.replace("ртде", "ртте") | |
str = str.replace("jамаgат", "jамаwат") | |
str = str.replace("Jамаgат", "Jамаwат") | |
str = str.replace("шох", "шуёх") | |
str = str.replace("Шох", "Шуёх") | |
str = str.replace("бусаgат", "шёндю") | |
str = str.replace("Бусаgат", "Шёндю") | |
str = str.replace("оgай", "уgай") | |
str = str.replace("Оgай", "Уgай") | |
str = str.replace("тез", "терк") | |
str = str.replace("сенnе", "саnа") | |
str = str.replace("сеnе", "саnа") | |
str = str.replace("Сенnе", "Саnа") | |
str = str.replace("Сеnе", "Саnа") | |
str = str.replace("менnе", "маnа") | |
str = str.replace("меnе", "маnа") | |
str = str.replace("Менnе", "Маnа") | |
str = str.replace("Меnе", "Маnа") | |
str = str.replace("jахтана", "аяq jол") | |
str = str.replace("Jахтана", "аяq jол") | |
str = str.replace("хо", "хаw") | |
str = str.replace("Хо", "Хаw") | |
str = str.replace("сыbат", "сыфат") | |
str = str.replace("Сыbат", "Сыфат") | |
str = str.replace("b", "п") | |
str = str.replace("q", "къ") | |
str = str.replace("Q", "Къ") | |
str = str.replace("g", "гъ") | |
str = str.replace("G", "Гъ") | |
str = str.replace("j", "ж") | |
str = str.replace("J", "Ж") | |
str = str.replace("w", "ў") | |
str = str.replace("W", "Ў") | |
str = str.replace("n", "нг") | |
str = str.replace("N", "Нг") | |
elif dialect == "mqr": | |
str = str.replace("тюл", "тюйюл") | |
str = str.replace("Тюл", "Тюйюл") | |
str = str.replace("гылын qуш", "уку") | |
str = str.replace("Гылын qуш", "Уку") | |
str = str.replace("гугурукку", "хораз") | |
str = str.replace("Гугурукку", "Хораз") | |
str = str.replace("qум", "юзмез") | |
str = str.replace("Qум", "Юзмез") | |
str = str.replace("jыла", "jиля") | |
str = str.replace("Jыла", "Jиля") | |
str = str.replace("арабин", "ярабий") | |
str = str.replace("арабий", "ярабий") | |
str = str.replace("Арабин", "Ярабий") | |
str = str.replace("Арабий", "Ярабий") | |
str = str.replace("нтда", "нтта") | |
str = str.replace("ртде", "ртте") | |
str = str.replace("jамаgат", "жамаwат") | |
str = str.replace("Jамаgат", "Жамаwат") | |
str = str.replace("шох", "шуёх") | |
str = str.replace("Шох", "Шуёх") | |
str = str.replace("бусаgат", "шёндю") | |
str = str.replace("Бусаgат", "Шёндю") | |
str = str.replace("оgай", "уgай") | |
str = str.replace("Оgай", "Уgай") | |
str = str.replace("тез", "терк") | |
str = str.replace("сенnе", "саnа") | |
str = str.replace("сеnе", "саnа") | |
str = str.replace("Сенnе", "Саnа") | |
str = str.replace("Сеnе", "Саnа") | |
str = str.replace("менnе", "маnа") | |
str = str.replace("меnе", "маnа") | |
str = str.replace("Менnе", "Маnа") | |
str = str.replace("Меnе", "Маnа") | |
str = str.replace("jахтана", "аяq jол") | |
str = str.replace("Jахтана", "аяq jол") | |
str = str.replace("хо", "хаw") | |
str = str.replace("Хо", "Хаw") | |
str = str.replace("сыbат", "сыфат") | |
str = str.replace("Сыbат", "Сыфат") | |
str = str.replace("b", "п") | |
str = str.replace("q", "къ") | |
str = str.replace("Q", "Къ") | |
str = str.replace("g", "гъ") | |
str = str.replace("G", "Гъ") | |
str = str.replace("j", "з") | |
str = str.replace("J", "З") | |
str = str.replace("w", "ў") | |
str = str.replace("W", "Ў") | |
str = str.replace("n", "нг") | |
str = str.replace("N", "Нг") | |
str = str.replace("ч", "ц") | |
str = str.replace("Ч", "Ц") | |
str = str.replace("п", "ф") | |
str = str.replace("П", "Ф") | |
str = str.replace("къ|гъ", "х") | |
return str | |
def _toModel(self, str: str) -> str: | |
str = str.replace("дж", "j") | |
str = str.replace("Дж", "J") | |
str = str.replace("ДЖ", "J") | |
str = str.replace("ж", "j") | |
str = str.replace("Ж", "J") | |
str = str.replace("себеп", "себеb") | |
str = str.replace("себеб", "себеb") | |
str = str.replace("Себеп", "Себеb") | |
str = str.replace("Себеб", "Себеb") | |
str = str.replace("тюйюл", "тюл") | |
str = str.replace("Тюйюл", "Тюл") | |
str = str.replace("уку", "гылын qуш") | |
str = str.replace("Уку", "Гылын qуш") | |
str = str.replace("хораз", "гугурукку") | |
str = str.replace("Хораз", "Гугурукку") | |
str = str.replace("юзмез", "qум") | |
str = str.replace("Юзмез", "Qум") | |
str = str.replace("арап", "араb") | |
str = str.replace("араб", "араb") | |
str = str.replace("Арап", "Араb") | |
str = str.replace("Араб", "Араb") | |
str = str.replace("jиля", "jыла") | |
str = str.replace("jыла", "jыла") | |
str = str.replace("jыла", "jыла") | |
str = str.replace("Jиля", "Jыла") | |
str = str.replace("Jыла", "Jыла") | |
str = str.replace("Jыла", "Jыла") | |
str = str.replace("ярабий", "арабин") | |
str = str.replace("арабий", "арабин") | |
str = str.replace("Ярабий", "Арабин") | |
str = str.replace("Арабий", "Арабин") | |
str = str.replace("нтта", "нтда") | |
str = str.replace("ртте", "ртде") | |
str = str.replace("jамагъат", "jамаgат") | |
str = str.replace("jамауат", "jамаgат") | |
str = str.replace("jамагъат", "jамаgат") | |
str = str.replace("jамауат", "jамаgат") | |
str = str.replace("Jамагъат", "Jамаgат") | |
str = str.replace("Jамауат", "Jамаgат") | |
str = str.replace("Jамагъат", "Jамаgат") | |
str = str.replace("Jамаўат", "Jамаgат") | |
str = str.replace("шуёх", "шох") | |
str = str.replace("Шуёх", "Шох") | |
str = str.replace("шёндю", "бусаgат") | |
str = str.replace("бусагъат", "бусаgат") | |
str = str.replace("Шёндю", "Бусаgат") | |
str = str.replace("Бусагъат", "Бусаgат") | |
str = str.replace("угъай", "оgай") | |
str = str.replace("огъай", "оgай") | |
str = str.replace("Угъай", "Оgай") | |
str = str.replace("Огъай", "Оgай") | |
# str = str.replace("терк", "тез") | |
# str = str.replace("терк", "тез") | |
str = str.replace("санга", "сенnе") | |
str = str.replace("сенге", "сенnе") | |
str = str.replace("сеннге", "сенnе") | |
str = str.replace("Санга", "Сенnе") | |
str = str.replace("Сеннге", "Сенnе") | |
str = str.replace("Сенге", "Сенnе") | |
str = str.replace("манга", "менnе") | |
str = str.replace("меннге", "менnе") | |
str = str.replace("менге", "менnе") | |
str = str.replace("Манга", "Менnе") | |
str = str.replace("Меннге", "Менnе") | |
str = str.replace("Менге", "Менnе") | |
str = str.replace("аякъ jол", "jахтана") | |
str = str.replace("аякъ jол", "jахтана") | |
str = str.replace("jахтана", "jахтана") | |
str = str.replace("jахтана", "jахтана") | |
str = str.replace("Аякъ jол", "Jахтана") | |
str = str.replace("Аякъ jол", "Jахтана") | |
str = str.replace("Jахтана", "Jахтана") | |
str = str.replace("Jахтана", "Jахтана") | |
str = str.replace("къамж", "qамыzh") | |
str = str.replace("къамыж", "qамыzh") | |
str = str.replace("Къамж", "Qамыzh") | |
str = str.replace("Къамыж", "Qамыzh") | |
str = str.replace("къымыж", "qымыzh") | |
str = str.replace("къымыж", "qымыzh") | |
str = str.replace("Къымыж", "Qымыzh") | |
str = str.replace("Къымыж", "Qымыzh") | |
str = str.replace("хау", "хо") | |
str = str.replace("хаў", "хо") | |
str = str.replace("Хау", "Хо") | |
str = str.replace("Хаў", "Хо") | |
str = str.replace("уа", "wa") | |
str = str.replace("ўа", "wa") | |
str = str.replace("Уа", "Wa") | |
str = str.replace("Ўа", "Wa") | |
str = str.replace("п", "b") | |
str = str.replace("б", "b") | |
str = str.replace("къ", "q") | |
str = str.replace("Къ", "Q") | |
str = str.replace("КЪ", "Q") | |
str = str.replace("гъ", "g") | |
str = str.replace("Гъ", "G") | |
str = str.replace("ГЪ", "G") | |
str = str.replace("ц", "ч") | |
str = str.replace("Ц", "Ч") | |
str = str.replace("ф", "п") | |
str = str.replace("сыпат", "сыфат") | |
str = str.replace("Сыпат", "Сыфат") | |
str = str.replace("Ф", "П") | |
str = str.replace("(?<=[аыоуэеиёюя])у(?=[аыоуэеиёюя])|(?<=[аыоуэеиёюя])ў(?=[аыоуэеиёюя])|(?<=[АЫОУЭЕИЁЮЯ])у(?=[АЫОУЭЕИЁЮЯ])|(?<=[АЫОУЭЕИЁЮЯ])ў(?=[АЫОУЭЕИЁЮЯ])", "w") | |
str = str.replace("(?<=[аыоуэеиёюя])у|(?<=[аыоуэеиёюя])ў|(?<=[АЫОУЭЕИЁЮЯ])у|(?<=[АЫОУЭЕИЁЮЯ])ў", "w") | |
# str = str.replace("у(?=[аыоуэеиёюя])|ў(?=[аыоуэеиёюя])|у(?=[АЫОУЭЕИЁЮЯ])|ў(?=[АЫОУЭЕИЁЮЯ])", "w") | |
# str = str.replace("У(?=[аыоуэеиёюя])|Ў(?=[аыоуэеиёюя])|У(?=[АЫОУЭЕИЁЮЯ])|Ў(?=[АЫОУЭЕИЁЮЯ])", "W") | |
str = str.replace("zh", "ж") | |
str = str.replace("нг", "n") | |
str = str.replace("Нг", " N") | |
str = str.replace("НГ", " N") | |
return str | |
# structure | |
def _prepareTextAndStructure(self, text: str) -> tuple: | |
""" | |
The input text is divided into sentences, while maintaining the structure | |
""" | |
# Разбиваем текст на предложения, сохраняя знаки препинания | |
# .+?[.!?।ฯ؟](?:\s|$): Захватывает предложения, которые заканчиваются | |
# точкой, восклицательным или вопросительным знаком. | |
# |.+?(?:\n|$): Добавляет поддержку для разрыва строки (\n) или конца текста ($), | |
# если предложение не заканчивается знаком препинания. | |
segments = re.findall(pattern=r".+?[.!?।ฯ؟](?:\s|$)|.*?(?:\n|$)", string=text) | |
# Если последний элемент пустой, то его удаляем | |
if not segments[-1]: | |
segments = segments[:-1] | |
# Склеиваем разорванные предложения | |
merged_segments = [] | |
buffer = "" | |
for i, segment in enumerate(segments): | |
# Проверяем, заканчивается ли текущий сегмент на .!? или пуст | |
if buffer: | |
buffer += " " + segment | |
else: | |
buffer = segment | |
# Если сегмент не заканчивается на .!? и следующий начинается с маленькой буквы | |
if ( # noqa: R507 | |
not re.search(pattern=r"[.!?।ฯ؟](?:\s|$)", string=segment) # noqa: ECE001 | |
and i + 1 < len(segments) | |
and segments[i + 1].strip() | |
and ((segments[i + 1].strip()[0].islower()) or (segments[i + 1].strip()[0] in ["'", '"'])) | |
): | |
continue # Склеиваем с следующим сегментом | |
else: | |
merged_segments.append(buffer) | |
buffer = "" | |
# Удаляем пустые сегменты и сохраняем пробелы | |
original_structure = [] | |
for segment in merged_segments: | |
match = re.match(pattern=r"^(\s*)(.*?)(\s*)$", string=segment, flags=re.DOTALL) | |
if match: | |
original_structure.append((match.group(1), match.group(2), match.group(3))) | |
# Токенизируем только текстовые части сегментов | |
texts_to_translate = [seg[1] for seg in original_structure if seg[1].strip()] | |
return texts_to_translate, original_structure | |
def _recoverTranslatedToStructure(self, translated_texts: str, original_structure: list) -> str: | |
""" | |
Translated sentences are embedded in the structure of the original text | |
""" | |
# Восстанавливаем исходную структуру текста | |
translated_segments = [] | |
translated_index = 0 | |
for seg in original_structure: | |
if seg[1].strip(): # Если сегмент был переведён | |
translated_segments.append(f"{seg[0]}{translated_texts[translated_index]}{seg[2]}") | |
translated_index += 1 | |
else: # Если сегмент был пустым, оставляем его как есть | |
translated_segments.append(f"{seg[0]}{seg[1]}{seg[2]}") | |
return "".join(translated_segments) | |
# Translate function | |
def _translate(self, text: list | str, src_lang: str = 'rus_Cyrl', tgt_lang: str = 'krc_Cyrl', | |
a: int = 32, b: int = 3, max_input_length: int = 1024, num_beams: int = 3, **kwargs | |
) -> list: | |
"""Turn a text or a list of texts into a list of translations""" | |
self.tokenizer.src_lang = src_lang | |
self.tokenizer.tgt_lang = tgt_lang | |
inputs = self.tokenizer( | |
text, return_tensors='pt', padding=True, truncation=True, | |
max_length=max_input_length | |
) | |
#print(f'Inputs: {inputs}') | |
result = self.model.generate( | |
**inputs.to(self.model.device), | |
forced_bos_token_id=self.tokenizer.convert_tokens_to_ids(tgt_lang), | |
max_new_tokens=int(a + b * inputs.input_ids.shape[1]), | |
num_beams=num_beams, **kwargs | |
) | |
#print(f'Outputs: {result}') | |
return self.tokenizer.batch_decode(result, skip_special_tokens=True) | |
def translate(self, text: str, src_lang: str | None = None, tgt_lang: str | None = None, dialect: str | None = None) -> str: | |
# print(src_lang) | |
# print(trg_lang) | |
# print(dialect) | |
if dialect == "" or dialect is None: | |
# dialect = DIALECT.dialect[0] # "дж\ч" | |
dialect = list(DIALECT.keys())[0] # "дж\ч" | |
if src_lang == "" or src_lang is None: | |
# src_lang = LANGUAGE.language[1] # "Русский язык" | |
src_lang = list(LANGUAGE.keys())[1] # "Русский язык" | |
if tgt_lang == "" or tgt_lang is None: | |
# tgt_lang = LANGUAGE.language[0] # "Къарачай-Малкъар тил" | |
tgt_lang = list(LANGUAGE.keys())[0] # "Къарачай-Малкъар тил" | |
# src_lang = "".join(LANGUAGE[LANGUAGE.language == src_lang].token.to_list()) | |
# tgt_lang = "".join(LANGUAGE[LANGUAGE.language == tgt_lang].token.to_list()) | |
# dialect = "".join(DIALECT[DIALECT.dialect == dialect].short_name.to_list()) | |
src_lang = LANGUAGE[src_lang] | |
tgt_lang = LANGUAGE[tgt_lang] | |
dialect = DIALECT[dialect] | |
print(f'Input text: {text} - Time: {datetime.now(tz=TZ)}') | |
text = text.strip() | |
if src_lang == 'krc_Cyrl': | |
text = self._toModel(text) | |
# Разбиваем текст на предложения, сохраняя знаки препинания | |
texts_to_translate, original_structure = self._prepareTextAndStructure(text=text) | |
# text бош эсе | |
if len(texts_to_translate) == 0: | |
texts_to_translate = [""] | |
#print(f'Split text: {texts_to_translate}') | |
translated_texts = self._translate(text=texts_to_translate, src_lang = src_lang, tgt_lang = tgt_lang) | |
translated = self._recoverTranslatedToStructure( | |
translated_texts=translated_texts, original_structure=original_structure | |
) | |
#print(f'Translated text: {translated}') | |
if tgt_lang == 'krc_Cyrl': | |
translated = self._fromModel(str=translated, dialect = dialect) | |
print(f'Translated text: {translated} - Time: {datetime.now(tz=TZ)}') | |
return translated | |
# Dictionary function
def dictionaryDisp(text, src_lang):
    """
    Look ``text`` up in the dictionary and return the matches as one string.

    Three lookups are made, each capped at OUTPUT_ROW_BY_EVERY_DICTIONARY rows:
    headwords of the source language that start with the query, definitions of
    the source language that contain the query at the start of a word, and the
    same for definitions of the other language. Duplicate rows are dropped.

    - text: the query; surrounding whitespace is ignored, matching is
      case-insensitive
    - src_lang: a key of LANGUAGE; empty/None falls back to the second key

    Returns the formatted entries, or the "word_absence" caption when nothing
    matched.
    """
    if src_lang == "" or src_lang is None:
        src_lang = list(LANGUAGE.keys())[1]  # "Русский язык"
    src_lang = LANGUAGE[src_lang]

    not_found = NAMES[NAMES.id == "word_absence"][SYSTEM_LANG].values[0]

    query = text.strip().lower()
    # The query is user input: escape it so characters such as "(", "+" or "."
    # are matched literally instead of raising re.error or acting as operators.
    escaped = re.escape(query)
    filter_ = r"\W+" + escaped + r"|^" + escaped

    if src_lang == 'krc_Cyrl':
        df_from_to, df_to_from = dictionary_qm, dictionary_ru
    elif src_lang == 'rus_Cyrl':
        df_from_to, df_to_from = dictionary_ru, dictionary_qm
    else:
        # No dictionary exists for any other language token.
        return not_found

    limit = OUTPUT_ROW_BY_EVERY_DICTIONARY
    # na=False: rows with a missing headword/definition simply do not match.
    sozluk_1 = df_from_to[df_from_to.soz_l.str.startswith(query, na=False)].iloc[:limit]
    sozluk_2 = df_from_to[df_from_to.belgi_l.str.contains(filter_, regex=True, na=False)].iloc[:limit]
    sozluk_3 = df_to_from[df_to_from.belgi_l.str.contains(filter_, regex=True, na=False)].iloc[:limit]

    # Concatenate the DataFrames and drop duplicates
    sozluk = pd.concat([sozluk_1, sozluk_2, sozluk_3], ignore_index=True).drop_duplicates()[["soz", "belgi"]]
    sozluk = "".join(
        x.soz + " ----- " + x.belgi + "\n\n----------\n\n" for x in sozluk.itertuples()
    )
    if not sozluk:
        sozluk = not_found
    return sozluk
# Voice function
def tts(text):
    """
    Synthesise ``text`` to a WAV file and return the file name.

    The file is written to the current working directory under a random
    eight-letter name. Surrounding whitespace is stripped first; when nothing
    is left there is nothing to speak, so the model is not called and None is
    returned.
    """
    text = text.strip()
    if not text:
        return None
    file_voice = ''.join(random.choices(string.ascii_letters, k=8))
    file_voice = f'{file_voice}.wav'
    model_tts.save_wav(
        audio_path=file_voice,
        text=text,
        speaker=SPEAKER_KRC_TTS,
        sample_rate=SAMPLE_RATE_TTS
    )
    return file_voice
# 5. Definition ui
translator = Translator(tokenizer=tokenizer, model=model_translate)


def _ui_caption(caption_id):
    """Return the SYSTEM_LANG caption stored in NAMES under ``caption_id``.

    All matching rows are concatenated, so an unknown id yields "".
    """
    return "".join(NAMES.loc[NAMES.id == caption_id, SYSTEM_LANG].to_list())


# Captions used by the interface below.
_title = _ui_caption("title")
_type = _ui_caption("type")
_from = _ui_caption("from")
_to = _ui_caption("to")
_your_sent = _ui_caption("your_sent")
_your_sent_tts = _ui_caption("your_sent_tts")
_transl_sent = _ui_caption("transl_sent")
_dialect = _ui_caption("dialect")
_translate = _ui_caption("translate")
_annotation = _ui_caption("annotation")
_sound = _ui_caption("sound")
# Interface: a title, three tabs (translator, dictionary, text-to-speech) and
# an annotation below them. Tab captions come from TYPE[SYSTEM_LANG] by row.
# NOTE(review): the nesting of the layout blocks below was reconstructed from a
# copy of this file that had lost its indentation - confirm against the
# rendered page.
with gr.Blocks() as demo:
    gr.Markdown(_title)
    # Translation
    with gr.Tab(TYPE[SYSTEM_LANG][0]):
        # Top row: source language on the left, target language and dialect on the right.
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    # choice_type = gr.Dropdown(
                    # choices = TYPE[SYSTEM_LANG].to_list(), label=_type, value = TYPE[SYSTEM_LANG][0])
                    translate_lang_input = gr.Dropdown(
                        choices = list(LANGUAGE.keys()), label=_from, value = list(LANGUAGE.keys())[1])
            with gr.Column():
                with gr.Row():
                    translate_lang_output = gr.Dropdown(
                        choices = list(LANGUAGE.keys()), label=_to, value = list(LANGUAGE.keys())[0])
                    dialect = gr.Dropdown(
                        # choices = DIALECT.dialect.to_list(), label=_dialect, value = "дж\ч")
                        choices = list(DIALECT.keys()), label=_dialect, value = list(DIALECT.keys())[0])
        # Second row: input text on the left, translated text on the right.
        with gr.Row():
            with gr.Column():
                translate_text_input = gr.Textbox(lines=15, placeholder=_your_sent, label = "", show_copy_button=True)
            with gr.Column():
                translate_text_output = gr.Textbox(lines=15, placeholder=_transl_sent, label = "", autoscroll=False, show_copy_button=True)
        translate_button = gr.Button(_translate, variant = 'primary')
    # Dictionary
    with gr.Tab(TYPE[SYSTEM_LANG][1]):
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    dict_lang_input = gr.Dropdown(
                        choices = list(LANGUAGE.keys()), label=_from, value = list(LANGUAGE.keys())[1])
        with gr.Row():
            with gr.Column():
                dict_text_input = gr.Textbox(lines=15, placeholder=_your_sent, label = "", show_copy_button=True)
            with gr.Column():
                dict_text_output = gr.Textbox(lines=15, placeholder=_transl_sent, label = "", autoscroll=False, show_copy_button=True)
        # The dictionary button reuses the "translate" caption.
        dict_button = gr.Button(_translate, variant = 'primary')
    # TTS
    with gr.Tab(TYPE[SYSTEM_LANG][2]):
        with gr.Row():
            with gr.Column():
                tts_text_input = gr.Textbox(lines=3, placeholder=_your_sent_tts, label = "", show_copy_button=True)
            with gr.Column():
                # The handler returns a file name, hence type='filepath'.
                tts_text_output = gr.Audio(label = "", type = 'filepath')
        tts_button = gr.Button(_sound, variant = 'primary')
    # Event wiring: each button passes its inputs to the matching function and
    # shows the return value in the matching output component.
    translate_button.click(translator.translate, inputs=[translate_text_input, translate_lang_input, translate_lang_output, dialect], outputs=[translate_text_output]) # text, from, to, dialect
    dict_button.click(dictionaryDisp, inputs=[dict_text_input, dict_lang_input], outputs=[dict_text_output]) # text, from
    tts_button.click(tts, inputs=[tts_text_input], outputs=[tts_text_output]) # text
    gr.Markdown(_annotation)
# 6. Launch
demo.launch()
# demo.launch(inbrowser=True)