Spaces:
Runtime error
Runtime error
File size: 4,087 Bytes
fe9dbf9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
import os
import openai
import locale
locale.getpreferredencoding = lambda: "UTF-8"
import dl_translate as dlt
from deep_translator import GoogleTranslator
from languages import LANGUAGES
# SECURITY: the API key is read from the environment rather than being
# hard-coded, so the secret never lives in the repository. Any key that was
# previously committed here must be treated as leaked and revoked.
# When the variable is unset this is None and requests will fail to
# authenticate until a key is configured.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
OPENAI_API_URL = 'https://api.openai.com/v1/chat/completions'
openai.api_key = OPENAI_API_KEY
class Translation:
    '''
    Translates a transcript and its timed subtitle segments.

    The full transcript is punctuated with GPT and translated sentence by
    sentence with an NLLB model; subtitle segments are translated one by one
    with Google Translate.
    '''

    def __init__(self, transcript_dict, source_lang, target_lang, output_path):
        '''
        Stores the transcript, derives the subtitles and loads the NLLB model.

        transcript_dict: dictionary with a 'text' entry (full transcript) and
            a 'segments' entry (iterable of dictionaries that each provide
            'start', 'end' and 'text').
        source_lang: language code of the transcript (Whisper detected
            language); used as a key into LANGUAGES.
        target_lang: language code to translate into; used as a key into
            LANGUAGES and passed to GoogleTranslator.
        output_path: path that is joined onto the current working directory.
        '''
        self.transcript_dict = transcript_dict
        self.output_path = os.path.join(os.getcwd(), output_path)

        # Languages
        self.source_lang = source_lang  # Whisper Detected Language
        self.target_lang = target_lang

        # Transcript
        self.transcript = transcript_dict['text'].strip()
        self.subtitles = self.__get_subtitles()

        # Translation Model
        # Larger checkpoints that can be swapped in:
        #   'facebook/nllb-200-1.3B', 'facebook/nllb-200-3.3B',
        #   'facebook/nllb-moe-54b'
        nllb_model = 'facebook/nllb-200-distilled-600M'
        self.nllb = dlt.TranslationModel(nllb_model)

    def __get_subtitles(self):
        '''
        Returns the subtitles from transcript dictionary

        Each subtitle is a new dictionary holding only the 'start', 'end' and
        whitespace-stripped 'text' of the corresponding segment.
        '''
        return [
            {
                'start': segment['start'],
                'end': segment['end'],
                'text': segment['text'].strip(),
            }
            for segment in self.transcript_dict['segments']
        ]

    def __correct_punctuation_gpt(self):
        '''
        Corrects the Punctuation from GPT

        Sends the stored transcript to the chat completion endpoint and
        returns the reply with the 'Corrected Text:' header line removed.
        '''
        # The prompts are assembled line by line so that the text sent to the
        # model does not depend on how this method is indented.
        system_prompt = (
            "\n"
            "You are a helpful NLP assistant.\n"
            "Your task is to identify language of the provided text,\n"
            "correct any spelling discrepancies in the transcribed text\n"
            "as well as add punctuation in the multilingual text if they are missing.\n"
            "Only add necessary punctuation such as periods, commas, and capitalization,\n"
            "and use only the context provided.\n"
            "You response should be as follows:\n"
            "Corrected Text:\n"
            "Here goes the corrected text with punctuation.\n"
        )
        user_prompt = (
            "\n"
            "Here is the text:\n"
            f"{self.transcript}\n"
        )

        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ]
        )

        text = response.choices[0].message.content.replace('Corrected Text:\n', '')
        return text

    @staticmethod
    def _split_sentences(text, splitter):
        '''
        Splits text on splitter and returns the non-blank, stripped pieces

        Blank pieces (such as the one after the final sentence mark) are
        dropped so they are never sent to the translation model.
        '''
        pieces = (piece.strip() for piece in text.split(splitter))
        return [piece for piece in pieces if piece]

    def get_translated_transcript(self):
        '''
        Translates the transcript into required language

        Returns the translated sentences, each followed by the sentence mark
        used for splitting, joined by single spaces.
        Raises KeyError when a language code is missing from LANGUAGES.
        '''
        # Correcting Punctuation using GPT
        transcript = self.__correct_punctuation_gpt()

        # Splitting Text into Sentences
        # NOTE(review): the same mark is re-attached to the translated text,
        # whatever the target language is - confirm this is intended.
        splitter = '۔' if self.source_lang in ['ar', 'ur'] else '.'
        sentences = self._split_sentences(transcript, splitter)

        # Getting Translation using NLLB
        source = LANGUAGES[self.source_lang]
        target = LANGUAGES[self.target_lang]
        translated_sentences = [
            self.nllb.translate(sentence, source=source, target=target) + splitter
            for sentence in sentences
        ]

        return ' '.join(translated_sentences).strip()

    def get_translated_subtitles(self):
        '''
        Translates the subtitles into required language

        Returns a new list of subtitle dictionaries whose 'text' is
        translated; 'start' and 'end' are carried over unchanged.
        '''
        # Creating Instance for Google Translator
        gt = GoogleTranslator(source='auto', target=self.target_lang)

        # Fresh dictionaries are built for the result so the entries kept in
        # self.subtitles retain the source-language text; repeated calls then
        # always translate from the original rather than from a translation.
        return [
            {**segment, 'text': gt.translate(text=segment['text'])}
            for segment in self.subtitles
        ]