File size: 2,398 Bytes
04e306a
a447435
 
 
 
 
 
 
 
b16454e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import src.constants as constants_utils
import requests
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from mosestokenizer import *
from indicnlp.tokenize import sentence_tokenize
from googletrans import Translator, constants


class TRANSLATOR:
    """Translation helpers built on the NLLB-200 checkpoint and googletrans.

    The NLLB tokenizer and model are loaded lazily and cached on the class,
    so the checkpoint is read at most once per process rather than on every
    translation call.
    """

    # Hugging Face checkpoint used by the NLLB-based methods.
    NLLB_MODEL_NAME = "facebook/nllb-200-distilled-600M"

    # Language display name -> NLLB target code. Names that are not listed
    # fall back to DEFAULT_NLLB_CODE (English), as in the original logic.
    NLLB_LANGUAGE_CODES = {
        "Hindi": "hin_Deva",
        "Marathi": "mar_Deva",
    }
    DEFAULT_NLLB_CODE = "eng_Latn"

    # Lazily populated by _load_nllb().
    _tokenizer = None
    _model = None

    def __init__(self):
        # Kept from the original implementation: writes one blank line to
        # stdout when an instance is created.
        print()

    @classmethod
    def _load_nllb(cls):
        """Return the ``(tokenizer, model)`` pair, loading each on first use."""
        if cls._tokenizer is None:
            cls._tokenizer = AutoTokenizer.from_pretrained(cls.NLLB_MODEL_NAME)
        if cls._model is None:
            cls._model = AutoModelForSeq2SeqLM.from_pretrained(cls.NLLB_MODEL_NAME)
        return cls._tokenizer, cls._model

    def _translate(self, text, target_code, max_length):
        """Translate ``text`` with NLLB into ``target_code`` and return the string.

        Args:
            text: Text handed to the tokenizer in a single call.
            target_code: NLLB language code forced as the first generated
                token (for example ``"hin_Deva"``).
            max_length: ``max_length`` passed to ``model.generate``.

        Returns:
            The first decoded sequence, with special tokens removed.
        """
        tokenizer, model = self._load_nllb()
        inputs = tokenizer(text, return_tensors="pt")
        translated_tokens = model.generate(
            **inputs,
            # Look the language code up as a vocabulary token; the older
            # ``tokenizer.lang_code_to_id`` mapping is deprecated.
            forced_bos_token_id=tokenizer.convert_tokens_to_ids(target_code),
            max_length=max_length,
        )
        return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

    def split_sentences(self, paragraph, language):
        """Split ``paragraph`` into sentences for the given language.

        Args:
            paragraph: Text to split.
            language: ``"en"`` selects the Moses sentence splitter; any value
                contained in ``constants_utils.INDIC_LANGUAGE`` selects the
                Indic NLP splitter.

        Returns:
            The splitter's output for a supported language, or ``None`` when
            ``language`` matches neither branch.
        """
        if language == "en":
            with MosesSentenceSplitter(language) as splitter:
                return splitter([paragraph])
        if language in constants_utils.INDIC_LANGUAGE:
            # NOTE(review): ``language`` is forwarded unchanged as ``lang``;
            # confirm that the members of INDIC_LANGUAGE tested here are the
            # codes sentence_split expects.
            return sentence_tokenize.sentence_split(paragraph, lang=language)
        return None

    def get_in_hindi(self, payload):
        """Translate ``payload['inputs']`` to Hindi one sentence at a time.

        Args:
            payload: Mapping whose ``'inputs'`` entry holds the text; it is
                split with the ``'en'`` sentence splitter.

        Returns:
            The translated sentences joined with single spaces (an empty
            string when the splitter yields no sentences).
        """
        sentences = self.split_sentences(payload['inputs'], 'en')
        # Collect the sentences and join them once. The previous
        # ``out_text.join(translated_sent)`` used the accumulated text as a
        # separator between the characters of each new sentence, which
        # corrupted the result from the second sentence onwards.
        translated = [
            self._translate(sentence, "hin_Deva", 100) for sentence in sentences
        ]
        return " ".join(translated)

    def get_in_indic(self, text, language='Hindi'):
        """Translate ``text`` with NLLB into the named language in one pass.

        Args:
            text: Text to translate.
            language: ``'Hindi'`` or ``'Marathi'``; any other value selects
                the ``eng_Latn`` target code.

        Returns:
            The decoded translation string.
        """
        code = self.NLLB_LANGUAGE_CODES.get(language, self.DEFAULT_NLLB_CODE)
        return self._translate(text, code, 1000)

    def get_indic_google_translate(self, text, language='Hindi'):
        """Translate ``text`` through the googletrans ``Translator``.

        Args:
            text: Text to translate.
            language: Key looked up in ``constants_utils.INDIC_LANGUAGE`` to
                obtain the destination; ``'en'`` is used when it is missing.

        Returns:
            ``str`` of the ``text`` attribute of the translation result.
        """
        # Init the Google API translator
        translator = Translator()
        destination = constants_utils.INDIC_LANGUAGE.get(language, 'en')
        translations = translator.translate(text, dest=destination)
        return str(translations.text)