Spaces:

WhisperTube
/

whispertube_backend

Runtime error

App Files Files Community

whispertube_backend / translation.py

uzi007

Added GPU Models

a689179 12 months ago

raw

history blame

3.81 kB

	import os
	import openai

	import locale
	# Replace locale.getpreferredencoding so that every later call reports "UTF-8",
	# regardless of the host locale. The override is installed before the
	# third-party imports below.
	# NOTE(review): presumably a workaround for hosts whose default locale is not
	# UTF-8 -- confirm it is still required.
	locale.getpreferredencoding = lambda: "UTF-8"

	import dl_translate as dlt
	from deep_translator import GoogleTranslator

	from languages import CODE2LANG, r2l_languages
	# NOTE(review): OPENAI_API_URL is imported but not referenced in the visible code.
	from config import OPENAI_API_KEY, OPENAI_API_URL

	# Set the key on the openai module itself; the openai calls below rely on it.
	openai.api_key = OPENAI_API_KEY


	class Translation:
	    '''
	    Translates a transcript and its timed subtitle segments.

	    The transcript dictionary must provide a 'text' string and a 'segments'
	    list whose items carry 'start', 'end' and 'text' keys. The full transcript
	    is translated with the supplied model after a GPT punctuation pass; the
	    subtitle segments are translated one by one with GoogleTranslator.
	    '''

	    def __init__(self, model, transcript_dict, source_lang, target_lang, output_path):
	        '''
	        Stores the inputs and extracts the transcript text and subtitles

	        model:           object exposing translate(text, source=..., target=...)
	                         (presumably a dl_translate model -- confirm with caller)
	        transcript_dict: dictionary with 'text' and 'segments' keys
	        source_lang:     language code of the transcript
	        target_lang:     language code to translate into
	        output_path:     path joined onto the current working directory
	        '''
	        self.transcript_dict = transcript_dict
	        self.output_path = os.path.join(os.getcwd(), output_path)

	        # Languages
	        self.source_lang = source_lang  # Whisper Detected Language
	        self.target_lang = target_lang

	        # Transcript
	        self.transcript = transcript_dict['text'].strip()
	        self.subtitles = self.__get_subtitles()

	        # Translation Model
	        self.nllb = model

	    def __get_subtitles(self):
	        '''
	        Returns the subtitles from transcript dictionary

	        Each subtitle is a new dictionary holding only 'start', 'end' and the
	        whitespace-stripped 'text' of the corresponding segment.
	        '''
	        return [
	            {
	                'start': segment['start'],
	                'end': segment['end'],
	                'text': segment['text'].strip(),
	            }
	            for segment in self.transcript_dict['segments']
	        ]

	    def __correct_punctuation_gpt(self):
	        '''
	        Corrects the Punctuation from GPT

	        Sends the stored transcript to gpt-3.5-turbo and returns the reply
	        with the 'Corrected Text:' header line removed.
	        '''

	        system_prompt = """
	You are a helpful NLP assistant.
	Your task is to identify language of the provided text,
	correct any spelling discrepancies in the transcribed text
	as well as add punctuation in the multilingual text if they are missing.
	Only add necessary punctuation such as periods, commas, and capitalization,
	and use only the context provided.

	You response should be as follows:
	Corrected Text:
	Here goes the corrected text with punctuation.
	"""

	        user_prompt = f"""
	Here is the text:
	{self.transcript}
	"""

	        # NOTE(review): ChatCompletion.create is the interface of openai<1.0;
	        # confirm the pinned package version before upgrading.
	        response = openai.ChatCompletion.create(
	            model="gpt-3.5-turbo",
	            messages=[
	                {"role": "system", "content": system_prompt},
	                {"role": "user", "content": user_prompt},
	            ]
	        )

	        # The prompt asks for a fixed header line; drop it from the reply
	        text = response.choices[0].message.content.replace('Corrected Text:\n', '')
	        return text

	    @staticmethod
	    def _split_sentences(text, splitter):
	        '''
	        Splits text on splitter and returns the non-empty, stripped sentences
	        '''
	        stripped = (piece.strip() for piece in text.split(splitter))
	        return [piece for piece in stripped if piece]

	    def get_translated_transcript(self):
	        '''
	        Translates the transcript into required language

	        Returns the translated sentences joined by a space, each followed by
	        the sentence terminator used for splitting. A language code missing
	        from CODE2LANG makes the lookup fail before any translation happens.
	        '''

	        # Correcting Punctuation using GPT
	        transcript = self.__correct_punctuation_gpt()

	        # Splitting Text into Sentences
	        splitter = '۔' if self.source_lang in r2l_languages else '.'
	        # Empty pieces (e.g. the one after the final terminator) are dropped so
	        # they are neither sent to the model nor turned into a stray terminator
	        sentences = self._split_sentences(transcript, splitter)

	        # Language names are the same for every sentence; look them up once
	        source = CODE2LANG[self.source_lang]
	        target = CODE2LANG[self.target_lang]

	        # Getting Translation using NLLB
	        translated_sentences = [
	            self.nllb.translate(sentence, source=source, target=target)
	            for sentence in sentences
	        ]

	        return ' '.join(sentence + splitter for sentence in translated_sentences)

	    def get_translated_subtitles(self):
	        '''
	        Translates the subtitles into required language

	        Returns a new list of new dictionaries; self.subtitles keeps the
	        source-language text.
	        '''

	        # Creating Instance for Google Translator
	        gt = GoogleTranslator(source='auto', target=self.target_lang)

	        translated_subtitles = []
	        for segment in self.subtitles:
	            text = segment['text']
	            # Nothing to translate for an empty segment; skip the request
	            translated_text = gt.translate(text=text) if text else text
	            # Build a fresh dictionary: list.copy() is shallow, so writing
	            # into the copied list's items would overwrite self.subtitles
	            translated_subtitles.append({**segment, 'text': translated_text})

	        return translated_subtitles