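"""Voice translation demo.

Transcribes an uploaded audio file with Whisper, translates the transcript
with the SMALL-100 model (alirezamsh/small100), synthesizes the translation
with gTTS, and serves the whole pipeline through a Gradio interface.
"""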
import os
import gradio as gr
import whisper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from gtts import gTTS
import sentencepiece  # needed at runtime by the SMALL-100 tokenizer
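
# Assumed runtime dependencies (not pinned in this file): gradio,
# openai-whisper, transformers, torch, sentencepiece, gtts.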


def translate_voice(file, target_lang):
    try:
        # Load the Whisper model and switch to float32 (full precision on CPU)
        model = whisper.load_model("base").float()

        # Load the audio and pad or trim it to the 30-second window Whisper expects
        audio = whisper.load_audio(file.name)
        audio = whisper.pad_or_trim(audio)

        # Convert the audio to a log-Mel spectrogram on the model's device, in float32
        mel = whisper.log_mel_spectrogram(audio).to(model.device).float()

        # Detect the spoken language and transcribe
        _, probs = model.detect_language(mel)
        options = whisper.DecodingOptions(fp16=False)
        result = whisper.decode(model, mel, options)
        text = result.text
        lang = max(probs, key=probs.get)  # most probable source language

        # Translate the transcript with SMALL-100
        tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
        translation_model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")
        tokenizer.src_lang = target_lang
        encoded_text = tokenizer(text, return_tensors="pt")
        generated_tokens = translation_model.generate(**encoded_text)
        translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

        # Text-to-speech (TTS) for the translated text
        tts = gTTS(text=translated_text, lang=target_lang)
        filename = "to_speech.mp3"
        tts.save(filename)

        return filename, text, translated_text, target_lang
    except Exception as e:
        # On failure, surface the error message in place of the audio path
        return str(e), "", "", ""

iface = gr.Interface(
    fn=translate_voice,
    inputs=[
        gr.File(label="Your Audio"),
        gr.Dropdown(choices=['en', 'ru', 'de', 'fr'], label="Target Language"),
    ],
    outputs=[
        gr.Audio(type="filepath", label="Translated Audio"),
        gr.Textbox(label="Original Text"),
        gr.Textbox(label="Translated Text"),
        gr.Textbox(label="Target Language"),
    ],
)

iface.launch()