# s2s / app.py
# frogcho123: "Add application file" (36bec1c)
# [raw | history | blame] 1.94 kB
import gradio as gr
import os
import whisper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from gtts import gTTS
from tempfile import NamedTemporaryFile
# Define translation function
def _load_uploaded_audio(input_file):
    """Decode an uploaded audio file into the waveform returned by Whisper.

    The upload is accepted in three forms:

    * a filesystem path (``str`` / ``os.PathLike``);
    * an object whose ``name`` attribute is the path of an existing file,
      in which case that file is decoded directly;
    * any other object with a ``read()`` method, whose bytes are copied to a
      temporary ``.wav`` file first (the original behaviour).

    NOTE(review): which of these forms Gradio actually passes depends on the
    Gradio version and the ``gr.File`` configuration -- confirm against the
    deployed version.
    """
    if isinstance(input_file, (str, os.PathLike)):
        return whisper.load_audio(os.fspath(input_file))

    upload_path = getattr(input_file, "name", None)
    if isinstance(upload_path, str) and os.path.isfile(upload_path):
        return whisper.load_audio(upload_path)

    with NamedTemporaryFile(suffix=".wav") as temp_audio:
        temp_audio.write(input_file.read())
        # Push the bytes to disk before the path is handed to the decoder.
        temp_audio.flush()
        # Decode while the temporary file still exists; it is deleted when
        # the ``with`` block exits.
        return whisper.load_audio(temp_audio.name)


def translate_audio(input_file, target_language: str) -> str:
    """Transcribe uploaded speech, translate it, and speak the translation.

    Pipeline: Whisper ("base") speech-to-text -> "alirezamsh/small100"
    translation -> gTTS text-to-speech.

    Args:
        input_file: The uploaded audio, as a path or a file-like object
            (see ``_load_uploaded_audio``).
        target_language: Language code used both for the translation
            tokenizer and for gTTS (the UI offers 'en', 'es', 'fr', 'de',
            'ru').

    Returns:
        Path of a newly created ``.mp3`` file containing the spoken
        translation. The file is created with ``delete=False`` and is not
        removed by this function.

    Raises:
        ValueError: If no file or no target language was supplied, or if
            transcription / translation produced no text to speak.

    Note:
        Both models are loaded on every call, and ``whisper.pad_or_trim``
        fits the audio to Whisper's fixed-length input window (30 seconds
        according to Whisper's documentation), so longer recordings are
        truncated.
    """
    if input_file is None:
        raise ValueError("No audio file was provided.")
    if not target_language:
        raise ValueError("No target language was selected.")

    # Audio to text (STT)
    stt_model = whisper.load_model("base")
    audio = whisper.pad_or_trim(_load_uploaded_audio(input_file))
    mel = whisper.log_mel_spectrogram(audio).to(stt_model.device)
    # DecodingOptions defaults to fp16=True; only request half precision when
    # the model actually runs on a CUDA device.
    options = whisper.DecodingOptions(fp16=stt_model.device.type == "cuda")
    # No language is given, so whisper.decode detects the spoken language
    # itself; a separate detect_language() pass is not needed.
    text = whisper.decode(stt_model, mel, options).text
    if not text.strip():
        raise ValueError("No speech could be transcribed from the audio.")

    # Translate
    tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
    translator = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")
    # NOTE(review): the *target* language is assigned to ``src_lang``. This
    # looks intentional (presumably it puts the target-language token on the
    # encoder input, as the SMALL-100 model expects) -- confirm against the
    # model card before changing.
    tokenizer.src_lang = target_language
    encoded = tokenizer(text, return_tensors="pt")
    generated_tokens = translator.generate(**encoded)
    translated_text = tokenizer.batch_decode(
        generated_tokens, skip_special_tokens=True
    )[0]
    if not translated_text.strip():
        raise ValueError("Translation produced no text to speak.")

    # Text-to-audio (TTS)
    tts = gTTS(text=translated_text, lang=target_language)
    # Reserve a unique path that outlives this call; the handle is closed
    # before gTTS writes to the same path.
    with NamedTemporaryFile(suffix=".mp3", delete=False) as output_file:
        output_path = output_file.name
    tts.save(output_path)
    return output_path
# Gradio UI wiring: one upload widget and one language selector feed
# translate_audio; its return value is offered back as a downloadable file.
audio_upload = gr.File(label="Upload Audio File")
language_selector = gr.Dropdown(
    choices=['en', 'es', 'fr', 'de', 'ru'],
    label="Target Language",
)
inputs = [audio_upload, language_selector]

translated_download = gr.File(label="Translated Audio")
outputs = [translated_download]

# Text shown at the top of the generated page.
title = "Audio Translation"
description = "Upload an audio file, translate the speech to a target language, and download the translated audio."

demo = gr.Interface(
    fn=translate_audio,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
)
# share=True is passed through to launch() exactly as before.
demo.launch(share=True)