speech2speech / app.py
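"""Speech-to-speech translation demo: record speech with Gradio, transcribe it
with Whisper, translate the transcript with SMALL-100, and synthesize the
translation with gTTS."""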
import gradio as gr
import whisper
from gtts import gTTS
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load Whisper STT model
whisper_model = whisper.load_model("base")

# Load the SMALL-100 translation model and its tokenizer
tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")
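# Note: the SMALL-100 model card recommends its custom SMALL100Tokenizer
# (tokenization_small100.py in the model repo); AutoTokenizer is used here for
# simplicity and may load the generic M2M100 tokenizer instead.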

def translate_speech(audio_path, target_lang):
    """Transcribe the recorded clip, translate it, and speak the result."""
    # Gradio passes the recording as a file path (type="filepath" below),
    # so Whisper can load it directly without re-saving it by hand.
    audio = whisper.load_audio(audio_path)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)
    # Detect the spoken language from the mel spectrogram
    _, probs = whisper_model.detect_language(mel)
    lang = max(probs, key=probs.get)

    # Decode the audio into text; fp16=False keeps decoding working on CPU-only hosts
    options = whisper.DecodingOptions(language=lang, fp16=False)
    result = whisper.decode(whisper_model, mel, options)
    text = result.text
    # Translate the transcript. SMALL-100 is target-language driven: per its
    # model card, the output language is chosen by setting tgt_lang on the
    # tokenizer, so the dropdown value actually controls the translation.
    tokenizer.tgt_lang = target_lang
    encoded_text = tokenizer(text, return_tensors="pt")
    generated_tokens = model.generate(**encoded_text)
    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    # Text-to-speech (TTS) for the translated text
    tts = gTTS(text=translated_text, lang=target_lang)
    out_path = "translated_audio.mp3"
    tts.save(out_path)
    return out_path


def translate_speech_interface(audio_path, target_lang):
    # Gradio callback: the Audio output below is configured with
    # type="filepath", so returning the mp3 path is all that is needed.
    return translate_speech(audio_path, target_lang)

# Define the Gradio interface (gr.inputs/gr.outputs are deprecated; Gradio 4+
# uses sources=["microphone"] instead of source="microphone")
audio_recording = gr.Audio(source="microphone", type="filepath", label="Record your speech")
target_language = gr.Dropdown(["en", "ru", "fr"], label="Target Language")
output_audio = gr.Audio(type="filepath", label="Translated Audio")

gr.Interface(fn=translate_speech_interface, inputs=[audio_recording, target_language],
             outputs=output_audio, title="Speech Translator").launch()
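
# To run locally (assuming ffmpeg is installed for Whisper's audio loading):
#   pip install torch gradio openai-whisper transformers sentencepiece gTTS
#   python app.py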