# speech2speech/app.py
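"""Speech-to-speech translation demo.

Pipeline: Whisper ("base") transcribes the uploaded audio and detects its
language, SMALL-100 (alirezamsh/small100) translates the transcript into the
requested target language, and gTTS synthesizes the translated speech.
"""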
import tempfile

import gradio as gr
import whisper
from gtts import gTTS
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the ASR model, the translation model, and its tokenizer once at startup
whisper_model = whisper.load_model("base")
tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")


def translate_audio(input_file: str, to_lang: str) -> str:
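    """Transcribe the input audio with Whisper, translate the transcript
    with SMALL-100, synthesize the translation with gTTS, and return the
    path of the generated MP3 for Gradio's audio output to play.
    """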
    # Load the audio and fit it to Whisper's 30-second context window
    audio = whisper.load_audio(input_file)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)

    # Detect the source language from the mel spectrogram
    _, probs = whisper_model.detect_language(mel)
    lang = max(probs, key=probs.get)

    # Transcribe the audio to text
    options = whisper.DecodingOptions()
    result = whisper.decode(whisper_model, mel, options)
    text = result.text

    # Translate into the requested target language; with the M2M100-style
    # tokenizer that AutoTokenizer resolves for this checkpoint, the target
    # is selected by forcing its language token as the first generated token
    tokenizer.src_lang = lang
    encoded_text = tokenizer(text, return_tensors="pt")
    generated_tokens = model.generate(
        **encoded_text, forced_bos_token_id=tokenizer.get_lang_id(to_lang)
    )
    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    # Synthesize the translated text; keep the temp file on disk
    # (delete=False) so Gradio can still read it after this function returns
    tts = gTTS(text=translated_text, lang=to_lang)
    tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    tmp.close()
    tts.save(tmp.name)
    return tmp.name


# An Audio input with type="filepath" hands translate_audio the path string
# that whisper.load_audio expects, so no passthrough wrapper is needed
iface = gr.Interface(
    fn=translate_audio,
    inputs=[
        gr.Audio(type="filepath", label="Source speech"),
        gr.Textbox(label="Target language (ISO 639-1 code, e.g. 'fr')"),
    ],
    outputs=gr.Audio(label="Translated speech"),
    title="Audio Translation",
    description="Upload an audio clip and enter the target language code for the translation.",
    examples=[
        ["audio_example.mp3", "en"],
        ["speech_sample.mp3", "fr"],
    ],
)
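
# debug=True keeps the process attached and surfaces errors in the console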
iface.launch(debug=True)