# s2s / app.py — speech-to-speech translation demo (Hugging Face Space by frogcho123)
# Revision: 8d7bec1 (file size ~2.16 kB); header reconstructed from blob-view chrome.
import os
import gradio as gr
import whisper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from gtts import gTTS
import sentencepiece
import sounddevice as sd
import soundfile as sf
import tempfile
def translate_voice(audio, target_lang):
    """Transcribe spoken audio, translate it, and synthesize the translation.

    Parameters
    ----------
    audio : numpy.ndarray or tuple[int, numpy.ndarray]
        Raw samples (assumed 16 kHz), or a ``(sample_rate, samples)`` tuple
        as delivered by gradio's ``"numpy"`` audio type.
    target_lang : str
        ISO 639-1 code of the language to translate into (e.g. ``'en'``).

    Returns
    -------
    tuple[str, str, str, str]
        (path to synthesized mp3, transcribed text, translated text,
        target_lang).
    """
    # Persist the samples to a WAV file so whisper.load_audio can read them.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
        temp_filename = temp_audio.name
    if isinstance(audio, tuple):
        # gradio "numpy" audio input: (sample_rate, samples)
        sample_rate, samples = audio
        sf.write(temp_filename, samples, sample_rate)
    else:
        # Bare sample array: assume 16 kHz, matching record_audio().
        sf.write(temp_filename, audio, 16000)

    try:
        # Speech-to-text. NOTE(review): the model is reloaded on every call;
        # hoisting the load to module level would speed up repeated requests.
        model = whisper.load_model("base").float()
        wav = whisper.load_audio(temp_filename)
        wav = whisper.pad_or_trim(wav)
        mel = whisper.log_mel_spectrogram(wav).to(model.device).float()
        _, probs = model.detect_language(mel)
        options = whisper.DecodingOptions(fp16=False)
        result = whisper.decode(model, mel, options)
        text = result.text
        # Detected source language; computed but not consumed downstream.
        lang = max(probs, key=probs.get)
    finally:
        os.remove(temp_filename)  # the original leaked this temp file

    # Translation with SMALL-100 (reloaded per call as well).
    tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
    model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")
    # NOTE(review): SMALL-100 expects the *target* language token prepended to
    # the source text; assigning target_lang to src_lang achieves that with
    # the stock M2M100 tokenizer — confirm against the model card.
    tokenizer.src_lang = target_lang
    encoded_bg = tokenizer(text, return_tensors="pt")
    generated_tokens = model.generate(**encoded_bg)
    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    # Synthesize the translated text with Google TTS.
    tts = gTTS(text=translated_text, lang=target_lang)
    filename = "to_speech.mp3"
    tts.save(filename)
    return filename, text, translated_text, target_lang
def record_audio():
    """Capture five seconds of mono microphone audio at 16 kHz.

    Returns a flat 1-D sample array suitable for translate_voice().
    """
    sample_rate = 16000
    seconds = 5  # recording length; adjust as needed
    recording = sd.rec(int(seconds * sample_rate), samplerate=sample_rate, channels=1)
    sd.wait()  # block until the capture completes
    return recording.flatten()
# Gradio UI wiring.
# NOTE(review): in the legacy gr.inputs API, "microphone" is a *source*, not a
# type — the original passed type="microphone", which gradio rejects.
# type="numpy" delivers a (sample_rate, samples) tuple to translate_voice.
iface = gr.Interface(
    fn=translate_voice,
    inputs=[
        gr.inputs.Audio(source="microphone", type="numpy", label="Speak"),
        gr.inputs.Dropdown(choices=['en', 'ru', 'de', 'fr'], label="Target Language"),
    ],
    outputs=[
        gr.outputs.Audio(type="filepath", label="Translated Audio"),
        gr.outputs.Textbox(label="Original Text"),
        gr.outputs.Textbox(label="Translated Text"),
        gr.outputs.Textbox(label="Target Language"),
    ],
)

iface.launch()