# speech2speech/app.py
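"""Speech-to-speech translation demo.

Pipeline: Whisper ("base") transcribes the uploaded audio and detects its
language, SMALL-100 (alirezamsh/small100) translates the transcript into the
requested target language, and gTTS synthesizes the translated speech.
"""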
import tempfile

import gradio as gr
import whisper
from gtts import gTTS
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the ASR model, the translation model, and its tokenizer once at startup
whisper_model = whisper.load_model("base")
tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")


def translate_audio(input_file: str, to_lang: str) -> str:
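    """Transcribe the input audio with Whisper, translate the transcript
    with SMALL-100, synthesize the translation with gTTS, and return the
    path of the generated MP3 for Gradio's audio output to play.
    """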
    # Load the audio and fit it to Whisper's 30-second context window
    audio = whisper.load_audio(input_file)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)

    # Detect the source language from the mel spectrogram
    _, probs = whisper_model.detect_language(mel)
    lang = max(probs, key=probs.get)

    # Transcribe the audio to text
    options = whisper.DecodingOptions()
    result = whisper.decode(whisper_model, mel, options)
    text = result.text

    # Translate into the requested target language; with the M2M100-style
    # tokenizer that AutoTokenizer resolves for this checkpoint, the target
    # is selected by forcing its language token as the first generated token
    tokenizer.src_lang = lang
    encoded_text = tokenizer(text, return_tensors="pt")
    generated_tokens = model.generate(
        **encoded_text, forced_bos_token_id=tokenizer.get_lang_id(to_lang)
    )
    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    # Synthesize the translated text; keep the temp file on disk
    # (delete=False) so Gradio can still read it after this function returns
    tts = gTTS(text=translated_text, lang=to_lang)
    tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    tmp.close()
    tts.save(tmp.name)
    return tmp.name


# An Audio input with type="filepath" hands translate_audio the path string
# that whisper.load_audio expects, so no passthrough wrapper is needed
iface = gr.Interface(
    fn=translate_audio,
    inputs=[
        gr.Audio(type="filepath", label="Source speech"),
        gr.Textbox(label="Target language (ISO 639-1 code, e.g. 'fr')"),
    ],
    outputs=gr.Audio(label="Translated speech"),
    title="Audio Translation",
    description="Upload an audio clip and enter the target language code for the translation.",
    examples=[
        ["audio_example.mp3", "en"],
        ["speech_sample.mp3", "fr"],
    ],
)
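
# debug=True keeps the process attached and surfaces errors in the console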
iface.launch(debug=True)