import locale
import os

import openai

# Force the reported preferred encoding to UTF-8.  The original file applies
# this patch before importing the translation libraries, so that order is
# kept.  NOTE(review): presumably a workaround for hosts that report a
# non-UTF-8 locale -- confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"

import dl_translate as dlt
from deep_translator import GoogleTranslator

from languages import LANGUAGES

# NOTE(review): a secret is committed to source control here.  The
# OPENAI_API_KEY environment variable now takes precedence; the literal is
# kept only as a backward-compatible fallback and should be rotated and
# removed.
OPENAI_API_KEY = (
    os.environ.get('OPENAI_API_KEY')
    or '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'
)
OPENAI_API_URL = 'https://api.openai.com/v1/chat/completions'

openai.api_key = OPENAI_API_KEY


class Translation:
    '''
    Translates a transcript dictionary (full text plus timed segments)
    into a target language.

    The full transcript is punctuated with GPT and translated sentence by
    sentence with an NLLB model; the timed segments are translated with
    Google Translate.
    '''

    def __init__(self, transcript_dict: dict, source_lang: str,
                 target_lang: str, output_path: str):
        '''
        Stores the transcript and loads the NLLB translation model.

        transcript_dict: dictionary with a 'text' entry and a 'segments'
            list whose items carry 'start', 'end' and 'text'.
        source_lang: language code of the transcript; used as a key into
            LANGUAGES.
        target_lang: language code to translate into; used as a key into
            LANGUAGES and passed to GoogleTranslator.
        output_path: path joined onto the current working directory.
        '''
        self.transcript_dict = transcript_dict
        self.output_path = os.path.join(os.getcwd(), output_path)

        # Languages
        self.source_lang = source_lang  # Whisper Detected Language
        self.target_lang = target_lang

        # Transcript
        self.transcript = transcript_dict['text'].strip()
        self.subtitles = self.__get_subtitles()

        # Translation Model
        # Larger alternatives: 'facebook/nllb-200-1.3B',
        # 'facebook/nllb-200-3.3B', 'facebook/nllb-moe-54b'
        nllb_model = 'facebook/nllb-200-distilled-600M'
        self.nllb = dlt.TranslationModel(nllb_model)

    def __get_subtitles(self) -> list:
        '''
        Returns the subtitles from transcript dictionary

        Each subtitle is a new dictionary holding the segment's 'start',
        'end' and whitespace-stripped 'text'.
        '''
        return [
            {
                'start': segment['start'],
                'end': segment['end'],
                'text': segment['text'].strip(),
            }
            for segment in self.transcript_dict['segments']
        ]

    def __correct_punctuation_gpt(self) -> str:
        '''
        Corrects the Punctuation from GPT

        Sends the transcript to the chat completion endpoint and returns
        the corrected text with the "Corrected Text:" label removed.
        '''
        # The prompt text is reproduced verbatim from the original file.
        system_prompt = (
            " You are a helpful NLP assistant."
            " Your task is to identify language of the provided text,"
            " correct any spelling discrepancies in the transcribed text"
            " as well as add punctuation in the multilingual text if they"
            " are missing."
            " Only add necessary punctuation such as periods, commas, and"
            " capitalization, and use only the context provided."
            " You response should be as follows:"
            " Corrected Text: Here goes the corrected text with"
            " punctuation. \n"
        )
        user_prompt = f" Here is the text: {self.transcript} "

        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
        )

        text = response.choices[0].message.content.strip()

        # The label may be followed by a newline or a space, so keep
        # whatever comes after it instead of matching one exact spelling;
        # otherwise the label itself would be sent on for translation.
        label = 'Corrected Text:'
        _, found, remainder = text.partition(label)
        if found:
            text = remainder.strip()
        return text

    def get_translated_transcript(self) -> str:
        '''
        Translates the transcript into required language

        Returns the translated sentences, each followed by the sentence
        terminator and separated by single spaces.  Raises KeyError when
        either language code is missing from LANGUAGES.
        '''
        # Correcting Punctuation using GPT
        transcript = self.__correct_punctuation_gpt()

        # Splitting Text into Sentences.  This is a plain split on the
        # terminator, so a '.' inside a number or abbreviation also ends a
        # sentence.
        splitter = '۔' if self.source_lang in ('ar', 'ur') else '.'

        source = LANGUAGES[self.source_lang]
        target = LANGUAGES[self.target_lang]

        # Getting Translation using NLLB
        translated_sentences = []
        for sentence in transcript.split(splitter):
            sentence = sentence.strip()
            if not sentence:
                # Text ending with the terminator (or containing repeated
                # terminators) yields empty pieces; translating them would
                # only add stray terminators to the output.
                continue
            translated = self.nllb.translate(
                sentence, source=source, target=target
            )
            translated_sentences.append(translated + splitter)

        return ' '.join(translated_sentences)

    def get_translated_subtitles(self) -> list:
        '''
        Translates the subtitles into required language

        Returns a new list of new segment dictionaries; self.subtitles is
        left untouched so repeated calls always start from the source
        text.
        '''
        # Creating Instance for Google Translator
        gt = GoogleTranslator(source='auto', target=self.target_lang)

        # Build fresh dictionaries: list.copy() is shallow, so assigning
        # into its items would overwrite the original segments.
        return [
            dict(segment, text=gt.translate(text=segment['text']))
            for segment in self.subtitles
        ]