musfiqdehan's picture
Refactor get_better_translation function in translators.py
75aab4f
raw
history blame
4.2 kB
"""
This file contains the functions to translate the text from one language to another.
"""
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from deep_translator import GoogleTranslator, MyMemoryTranslator, MicrosoftTranslator, YandexTranslator, ChatGptTranslator
from .text_preprocess import decontracting_words, space_punc
# Digit translation table: Bengali numerals -> ASCII digits.
# The two digit sequences are paired position by position, so
# '০' maps to '0', '১' maps to '1', ... and '৯' maps to '9'.
digit_converter = dict(zip('০১২৩৪৫৬৭৮৯', '0123456789'))
def get_translated_digit(sentence):
    """
    Replace every Bengali digit in the sentence with its English digit.

    Each item produced by iterating over `sentence` is looked up in
    `digit_converter`; items that have no entry there are kept as they are.
    The pieces are then joined back into a single string, which is returned.
    """
    return "".join(
        digit_converter.get(symbol, symbol) for symbol in sentence
    )
def google_translation(sentence: str, source="auto", target="en") -> str:
    """
    Translate a sentence from one language to another using Google Translator.

    At first install dependencies:
    `!pip install -U deep-translator`

    Args:
        sentence: Text to translate.
        source: Source language handed to `GoogleTranslator`
            (default "auto").
        target: Target language handed to `GoogleTranslator`
            (default "en").

    Returns:
        Whatever `GoogleTranslator.translate` returns for `sentence`.
    """
    # The language pair must be given to the constructor. Keyword arguments
    # passed to `translate()` do not select the languages, so the previous
    # call always ran with the translator's default pair and silently
    # ignored the `source` / `target` arguments of this function.
    translator = GoogleTranslator(source=source, target=target)
    return translator.translate(sentence)
def get_better_translation(src=""):
    """
    Translate `src` and post-process the result.

    Steps, in order:
    1. Bengali digits in `src` are converted with `get_translated_digit`.
    2. The text is translated by `google_translation` using that
       function's default source and target languages.
    3. The translation is passed through `decontracting_words`.
    4. Every occurrence of 'rupees' and then of 'Rs' is replaced with
       'takas'. These are plain substring replacements, so 'Rs' is also
       replaced when it appears inside a longer word.

    Returns the post-processed translation.
    """
    normalized_src = get_translated_digit(src)
    english_text = decontracting_words(google_translation(normalized_src))
    # Order matters only in that it mirrors the original chain:
    # 'rupees' first, then 'Rs'.
    for currency_word in ('rupees', 'Rs'):
        english_text = english_text.replace(currency_word, 'takas')
    return english_text
# Lookup table from a language's display name to its short language code.
# Entries are listed alphabetically by name, followed by Thai and Mongolian
# at the end; `select_target_lang_code` reads from this table.
target_lang_dict = {
    "Afrikaans": "af",
    "Albanian": "sq",
    "Arabic": "ar",
    "Aragonese": "an",
    "Armenian": "hy",
    "Asturian": "ast",
    "Azerbaijani": "az",
    "Bashkir": "ba",
    "Basque": "eu",
    "Bavarian": "bar",
    "Belarusian": "be",
    "Bengali": "bn",
    "Bishnupriya Manipuri": "bpy",
    "Bosnian": "bs",
    "Breton": "br",
    "Bulgarian": "bg",
    "Burmese": "my",
    "Catalan": "ca",
    "Cebuano": "ceb",
    "Chechen": "ce",
    "Chinese (Simplified)": "zh",
    "Chinese (Traditional)": "zh-tw",
    "Chuvash": "cv",
    "Croatian": "hr",
    "Czech": "cs",
    "Danish": "da",
    "Dutch": "nl",
    "English": "en",
    "Estonian": "et",
    "Finnish": "fi",
    "French": "fr",
    "Galician": "gl",
    "Georgian": "ka",
    "German": "de",
    "Greek": "el",
    "Gujarati": "gu",
    "Haitian": "ht",
    "Hebrew": "he",
    "Hindi": "hi",
    "Hungarian": "hu",
    "Icelandic": "is",
    "Ido": "io",
    "Indonesian": "id",
    "Irish": "ga",
    "Italian": "it",
    "Japanese": "ja",
    "Javanese": "jv",
    "Kannada": "kn",
    "Kazakh": "kk",
    "Kirghiz": "ky",
    "Korean": "ko",
    "Latin": "la",
    "Latvian": "lv",
    "Lithuanian": "lt",
    "Lombard": "lmo",
    "Low Saxon": "nds",
    "Luxembourgish": "lb",
    "Macedonian": "mk",
    "Malagasy": "mg",
    "Malay": "ms",
    "Malayalam": "ml",
    "Marathi": "mr",
    "Minangkabau": "min",
    "Nepali": "ne",
    "Newar": "new",
    "Norwegian (Bokmal)": "nb",
    "Norwegian (Nynorsk)": "nn",
    "Occitan": "oc",
    "Persian (Farsi)": "fa",
    "Piedmontese": "pms",
    "Polish": "pl",
    "Portuguese": "pt",
    "Punjabi": "pa",
    "Romanian": "ro",
    "Russian": "ru",
    "Scots": "sco",
    "Serbian": "sr",
    "Serbo-Croatian": "sh",
    "Sicilian": "scn",
    "Slovak": "sk",
    "Slovenian": "sl",
    "South Azerbaijani": "azb",
    "Spanish": "es",
    "Sundanese": "su",
    "Swahili": "sw",
    "Swedish": "sv",
    "Tagalog": "tl",
    "Tajik": "tg",
    "Tamil": "ta",
    "Tatar": "tt",
    "Telugu": "te",
    "Turkish": "tr",
    "Ukrainian": "uk",
    "Urdu": "ur",
    "Uzbek": "uz",
    "Vietnamese": "vi",
    "Volapük": "vo",
    "Waray-Waray": "war",
    "Welsh": "cy",
    "West Frisian": "fy",
    "Western Punjabi": "pnb",
    "Yoruba": "yo",
    "Thai": "th",
    "Mongolian": "mn",
}
def select_target_lang_code(lang):
    """
    Look up the language code for a language name.

    Returns the code stored in `target_lang_dict` under `lang`, or "en"
    when `lang` is not one of its keys.
    """
    return target_lang_dict.get(lang, "en")