import src.constants as constants_utils
import requests
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from mosestokenizer import *
from indicnlp.tokenize import sentence_tokenize
from googletrans import Translator, constants


class TRANSLATOR:
    """Translate text between English and Indic languages.

    Two backends are used:
      * facebook/nllb-200-distilled-600M (via transformers) for the
        ``get_in_hindi`` / ``get_in_indic`` methods.
      * The Google Translate API (via googletrans) for
        ``get_indic_google_translate``.
    """

    # Single checkpoint shared by both NLLB-based methods — kept in one
    # place so the two methods cannot drift apart.
    _MODEL_NAME = "facebook/nllb-200-distilled-600M"

    def __init__(self):
        # Preserved from the original: emits a blank line on construction.
        print()

    def split_sentences(self, paragraph, language):
        """Split *paragraph* into a list of sentences.

        Args:
            paragraph: Raw input text.
            language: ISO-style language code; ``"en"`` uses Moses, and any
                key of ``constants_utils.INDIC_LANGUAGE`` uses the Indic NLP
                sentence splitter.

        Returns:
            A list of sentence strings, or ``None`` for an unrecognized
            language (see note below).
        """
        if language == "en":
            with MosesSentenceSplitter(language) as splitter:
                return splitter([paragraph])
        elif language in constants_utils.INDIC_LANGUAGE:
            return sentence_tokenize.sentence_split(paragraph, lang=language)
        # NOTE(review): unsupported languages fall through to an implicit
        # None, as in the original. Callers must handle that (or this could
        # raise ValueError) — confirm desired behavior with callers.
        return None

    def get_in_hindi(self, payload):
        """Translate English text to Hindi with NLLB-200.

        Args:
            payload: Mapping with the English source text under ``'inputs'``.

        Returns:
            The Hindi translation, sentences joined by single spaces.
        """
        tokenizer = AutoTokenizer.from_pretrained(self._MODEL_NAME)
        model = AutoModelForSeq2SeqLM.from_pretrained(self._MODEL_NAME)
        sentences = self.split_sentences(payload['inputs'], 'en')
        translated = []
        for sentence in sentences:
            inputs = tokenizer(sentence, return_tensors="pt")
            translated_tokens = model.generate(
                **inputs,
                forced_bos_token_id=tokenizer.lang_code_to_id["hin_Deva"],
                max_length=100,
            )
            translated.append(
                tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
            )
        # BUG FIX: the original did `out_text = out_text.join(translated_sent)`,
        # which uses the accumulator as a *separator* between the characters of
        # the sentence (str.join semantics) and discards previously translated
        # sentences. Accumulate sentences and join them with spaces instead.
        return " ".join(translated)

    def get_in_indic(self, text, language='Hindi'):
        """Translate *text* to an Indic language with NLLB-200.

        Args:
            text: Source text (passed to the tokenizer as-is, untokenized).
            language: Target language name; ``'Hindi'`` or ``'Marathi'``.
                Any other value falls back to English (``eng_Latn``),
                matching the original behavior.

        Returns:
            The translated string.
        """
        tokenizer = AutoTokenizer.from_pretrained(self._MODEL_NAME)
        model = AutoModelForSeq2SeqLM.from_pretrained(self._MODEL_NAME)
        inputs = tokenizer(text, return_tensors="pt")

        code = "eng_Latn"
        if language == 'Hindi':
            code = "hin_Deva"
        elif language == 'Marathi':
            code = "mar_Deva"

        translated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=tokenizer.lang_code_to_id[code],
            max_length=1000,
        )
        return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

    def get_indic_google_translate(self, text, language='Hindi'):
        """Translate *text* via the Google Translate API.

        Args:
            text: Source text.
            language: Key into ``constants_utils.INDIC_LANGUAGE``; unknown
                keys fall back to English (``'en'``).

        Returns:
            The translated text as a plain ``str``.
        """
        # Init the Google API translator
        translator = Translator()
        translations = translator.translate(
            text, dest=constants_utils.INDIC_LANGUAGE.get(language, 'en')
        )
        return str(translations.text)