File size: 4,087 Bytes
fe9dbf9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import os
import openai

import locale
# Replace locale.getpreferredencoding with a stub that always returns "UTF-8",
# so any later caller in this process is told the preferred encoding is UTF-8.
# NOTE(review): presumably a workaround for a runtime whose default locale is
# not UTF-8 — confirm it is still required before keeping this global patch.
locale.getpreferredencoding = lambda: "UTF-8"

import dl_translate as dlt
from deep_translator import GoogleTranslator

from languages import LANGUAGES


# SECURITY: credentials must never be committed to source control. The key is
# read from the OPENAI_API_KEY environment variable instead; any key that was
# previously hard-coded here should be treated as leaked and revoked.
# The value is None when the variable is unset, which lets the openai client
# report a missing key instead of sending an invalid one.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
OPENAI_API_URL = 'https://api.openai.com/v1/chat/completions'
openai.api_key = OPENAI_API_KEY

class Translation:
    """Translate a transcript and its timed subtitle segments.

    The transcript dictionary is expected to provide a ``'text'`` entry (the
    full transcript) and a ``'segments'`` list whose items carry ``'start'``,
    ``'end'`` and ``'text'`` keys.

    The full transcript is punctuation-corrected through the OpenAI chat API
    and translated sentence by sentence with an NLLB model loaded through
    ``dl_translate``. The timed segments are translated with
    ``deep_translator.GoogleTranslator``.
    """

    # Language codes whose text is split and re-joined on the '۔' full stop
    # instead of the ASCII period.
    _ALTERNATE_STOP_LANGS = ('ar', 'ur')

    def __init__(self, transcript_dict, source_lang, target_lang, output_path):
        """Store the inputs, extract the subtitles and load the NLLB model.

        Args:
            transcript_dict: Mapping with ``'text'`` and ``'segments'`` keys.
            source_lang: Language code of the transcript (the language
                detected by Whisper).
            target_lang: Language code to translate into.
            output_path: Path that is joined onto the current working
                directory and stored as ``self.output_path``.
        """
        self.transcript_dict = transcript_dict
        self.output_path = os.path.join(os.getcwd(), output_path)

        # Languages
        self.source_lang = source_lang  # Whisper Detected Language
        self.target_lang = target_lang

        # Transcript
        self.transcript = transcript_dict['text'].strip()
        self.subtitles = self.__get_subtitles()

        # Translation Model. Larger checkpoints that can be swapped in:
        # 'facebook/nllb-200-1.3B', 'facebook/nllb-200-3.3B',
        # 'facebook/nllb-moe-54b'.
        nllb_model = 'facebook/nllb-200-distilled-600M'
        self.nllb = dlt.TranslationModel(nllb_model)

    @staticmethod
    def _sentence_terminator(lang):
        """Return the sentence terminator used for language code ``lang``."""
        return '۔' if lang in Translation._ALTERNATE_STOP_LANGS else '.'

    @staticmethod
    def _split_sentences(text, terminator):
        """Split ``text`` on ``terminator`` into stripped, non-empty sentences."""
        fragments = (fragment.strip() for fragment in text.split(terminator))
        return [fragment for fragment in fragments if fragment]

    @staticmethod
    def _join_sentences(sentences, terminator):
        """Join ``sentences`` into one string, each ending in ``terminator``.

        Surrounding whitespace and any terminator a sentence already ends with
        are removed first so the terminator is never doubled; sentences that
        are empty after cleaning are dropped.
        """
        cleaned = (s.strip().rstrip(terminator).rstrip() for s in sentences)
        return ' '.join(s + terminator for s in cleaned if s)

    @staticmethod
    def _strip_response_label(content):
        """Remove a leading ``Corrected Text:`` label from a model reply."""
        label = 'Corrected Text:'
        content = content.strip()
        if content.startswith(label):
            # Accept any whitespace (newline or space) after the label.
            content = content[len(label):].lstrip()
        return content

    def __get_subtitles(self):
        """Return the subtitles from the transcript dictionary.

        Returns:
            A list with one new ``{'start', 'end', 'text'}`` dict per segment;
            the segment text is stripped of surrounding whitespace.
        """
        return [
            {
                'start': segment['start'],
                'end': segment['end'],
                'text': segment['text'].strip(),
            }
            for segment in self.transcript_dict['segments']
        ]

    def __correct_punctuation_gpt(self):
        """Return ``self.transcript`` with spelling and punctuation corrected.

        The transcript is sent to the ``gpt-3.5-turbo`` chat model and the
        ``Corrected Text:`` label is removed from the reply. An empty
        transcript is returned as ``''`` without calling the API.
        """
        if not self.transcript:
            return ''

        system_prompt = """
        You are a helpful NLP assistant. 
        Your task is to identify language of the provided text, 
        correct any spelling discrepancies in the transcribed text 
        as well as add punctuation in the multilingual text if they are missing. 
        Only add necessary punctuation such as periods, commas, and capitalization, 
        and use only the context provided.

        You response should be as follows:
        Corrected Text:
        Here goes the corrected text with punctuation.
        """

        user_prompt = f"""
        Here is the text:
        {self.transcript}
        """

        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ]
        )

        return self._strip_response_label(response.choices[0].message.content)

    def get_translated_transcript(self):
        """Translate the transcript into the target language.

        The transcript is first punctuation-corrected, then split into
        sentences on the source-language terminator. Each sentence is
        translated with the NLLB model and the results are re-joined with the
        target-language terminator.

        Returns:
            The translated transcript as a single string (``''`` when there is
            nothing to translate).

        Raises:
            KeyError: If ``source_lang`` or ``target_lang`` is not a key of
                ``LANGUAGES``.
        """
        # Correcting Punctuation using GPT
        transcript = self.__correct_punctuation_gpt()

        # Splitting Text into Sentences; blank fragments (e.g. the one after
        # the final full stop) are dropped so they are never sent to the model.
        sentences = self._split_sentences(
            transcript, self._sentence_terminator(self.source_lang)
        )
        if not sentences:
            return ''

        # Getting Translation using NLLB
        source_name = LANGUAGES[self.source_lang]
        target_name = LANGUAGES[self.target_lang]
        translated = [
            self.nllb.translate(sentence, source=source_name, target=target_name)
            for sentence in sentences
        ]

        # The output is in the target language, so it is terminated with the
        # target language's full stop rather than the source language's.
        return self._join_sentences(
            translated, self._sentence_terminator(self.target_lang)
        )

    def get_translated_subtitles(self):
        """Translate the subtitles into the target language.

        Returns:
            A new list of segment dicts whose ``'text'`` is translated.
            ``self.subtitles`` and its segment dicts are left unmodified.
        """
        # Creating Instance for Google Translator
        gt = GoogleTranslator(source='auto', target=self.target_lang)

        translated = []
        for segment in self.subtitles:
            text = segment['text']
            # An empty segment has nothing to translate; skip the translator.
            new_text = gt.translate(text=text) if text else text
            # Build a fresh dict: a shallow list copy would share the segment
            # dicts and overwrite the original subtitle text.
            translated.append({**segment, 'text': new_text})

        return translated