# Hugging Face Space (status when captured: Running)
import gradio as gr | |
import torch | |
import torchaudio | |
import scipy.io.wavfile | |
import numpy as np | |
from transformers import AutoProcessor, SeamlessM4Tv2Model | |
from pathlib import Path | |
from typing import Optional, Union | |
class SeamlessTranslator:
    """Wrap a SeamlessM4T v2 checkpoint for speech-output translation.

    The instance holds the processor, the model and the model's output
    sampling rate, and exposes two entry points: ``translate_text``
    (text in, speech out) and ``translate_audio`` (speech in, speech out).
    Both return ``(sample_rate, waveform)``.
    """

    # Display name -> language code passed to the processor / model.
    # Modern Standard Arabic is "arb" in the SeamlessM4T language list;
    # the previous value "ara" is not a code the model recognises.
    LANGUAGE_CODES = {
        "English": "eng",
        "Spanish": "spa",
        "French": "fra",
        "German": "deu",
        "Italian": "ita",
        "Portuguese": "por",
        "Russian": "rus",
        "Chinese": "cmn",
        "Japanese": "jpn",
        "Korean": "kor",
        "Arabic": "arb",
        "Hindi": "hin",
    }

    # Rate (Hz) input speech is resampled to before it reaches the processor.
    INPUT_SAMPLE_RATE = 16_000

    def __init__(self, model_name: str = "facebook/seamless-m4t-v2-large"):
        """Load the processor and model for ``model_name``.

        Raises:
            RuntimeError: if the processor or model cannot be loaded; the
                original exception is attached as ``__cause__``.
        """
        try:
            self.processor = AutoProcessor.from_pretrained(model_name)
            self.model = SeamlessM4Tv2Model.from_pretrained(model_name)
            self.sample_rate = self.model.config.sampling_rate
        except Exception as e:
            raise RuntimeError(f"Failed to initialize model: {str(e)}") from e

        # Per-instance copy so callers that mutate the mapping do not
        # affect the class-level table.
        self.language_codes = dict(self.LANGUAGE_CODES)

    def _generate_waveform(self, inputs, tgt_lang: str) -> np.ndarray:
        """Run generation and return the first output as a squeezed NumPy array."""
        waveform = self.model.generate(**inputs, tgt_lang=tgt_lang)[0]
        return waveform.cpu().numpy().squeeze()

    def translate_text(self, text: str, src_lang: str, tgt_lang: str) -> tuple[int, np.ndarray]:
        """Translate ``text`` from ``src_lang`` into speech in ``tgt_lang``.

        Args:
            text: Source text; must contain at least one non-whitespace character.
            src_lang: Language code of ``text`` (a value of ``language_codes``).
            tgt_lang: Language code of the generated speech.

        Returns:
            ``(sample_rate, waveform)`` for the generated speech.

        Raises:
            RuntimeError: if ``text`` is empty or processing/generation fails.
        """
        if not text or not text.strip():
            raise RuntimeError("Text translation failed: input text is empty")
        try:
            inputs = self.processor(text=text, src_lang=src_lang, return_tensors="pt")
            return self.sample_rate, self._generate_waveform(inputs, tgt_lang)
        except Exception as e:
            raise RuntimeError(f"Text translation failed: {str(e)}") from e

    def translate_audio(self, audio_path: str, tgt_lang: str) -> tuple[int, np.ndarray]:
        """Translate the speech stored at ``audio_path`` into speech in ``tgt_lang``.

        Args:
            audio_path: Path of an audio file readable by ``torchaudio.load``.
            tgt_lang: Language code of the generated speech.

        Returns:
            ``(sample_rate, waveform)`` for the generated speech.

        Raises:
            RuntimeError: if no path is given or loading/processing/generation fails.
        """
        if not audio_path:
            raise RuntimeError("Audio translation failed: no audio file provided")
        try:
            # Load the file and bring it to the rate the processor is fed with.
            # NOTE(review): multi-channel files are passed through unchanged;
            # confirm how the processor treats stereo input.
            audio, orig_freq = torchaudio.load(audio_path)
            audio = torchaudio.functional.resample(
                audio,
                orig_freq=orig_freq,
                new_freq=self.INPUT_SAMPLE_RATE,
            )
            inputs = self.processor(audios=audio, return_tensors="pt")
            return self.sample_rate, self._generate_waveform(inputs, tgt_lang)
        except Exception as e:
            raise RuntimeError(f"Audio translation failed: {str(e)}") from e
class GradioInterface:
    """Gradio front end with a text-to-speech tab and a speech-to-speech tab.

    The event handlers validate UI input and re-raise translator failures
    as ``gr.Error`` so the message is displayed to the user.
    """

    def __init__(self):
        """Create the translator and cache the list of language display names."""
        self.translator = SeamlessTranslator()
        self.languages = list(self.translator.language_codes.keys())

    def _language_code(self, language: str) -> str:
        """Map a dropdown display name to its language code, or raise ``gr.Error``."""
        try:
            return self.translator.language_codes[language]
        except (KeyError, TypeError):
            # A cleared dropdown yields None; anything unknown is rejected too.
            raise gr.Error(f"Unsupported language: {language!r}") from None

    def text_to_speech(self, text: str, src_lang: str, tgt_lang: str) -> tuple[int, np.ndarray]:
        """Handler for the text tab: translate ``text`` into speech.

        Args:
            text: Text entered by the user.
            src_lang: Display name of the source language.
            tgt_lang: Display name of the target language.

        Returns:
            ``(sample_rate, waveform)`` as returned by the translator.

        Raises:
            gr.Error: on empty text, an unknown language, or a translation failure.
        """
        if not text or not text.strip():
            raise gr.Error("Please enter some text to translate.")
        src_code = self._language_code(src_lang)
        tgt_code = self._language_code(tgt_lang)
        try:
            return self.translator.translate_text(text, src_code, tgt_code)
        except RuntimeError as err:
            raise gr.Error(str(err)) from err

    def speech_to_speech(self, audio_path: Optional[str], tgt_lang: str) -> tuple[int, np.ndarray]:
        """Handler for the speech tab: translate an audio file into speech.

        Args:
            audio_path: File path supplied by the audio component; ``None``
                when the user has not provided audio.
            tgt_lang: Display name of the target language.

        Returns:
            ``(sample_rate, waveform)`` as returned by the translator.

        Raises:
            gr.Error: on missing audio, an unknown language, or a translation failure.
        """
        if not audio_path:
            raise gr.Error("Please upload or record an audio clip first.")
        tgt_code = self._language_code(tgt_lang)
        try:
            return self.translator.translate_audio(audio_path, tgt_code)
        except RuntimeError as err:
            raise gr.Error(str(err)) from err

    def _build_text_tab(self):
        """Create the "Text to Speech" tab and wire its button to ``text_to_speech``."""
        with gr.TabItem("Text to Speech"):
            with gr.Row():
                with gr.Column():
                    text_input = gr.Textbox(
                        label="Input Text",
                        placeholder="Enter text to translate...",
                        lines=3,
                    )
                    src_lang = gr.Dropdown(
                        choices=self.languages,
                        value="English",
                        label="Source Language",
                    )
                    tgt_lang_text = gr.Dropdown(
                        choices=self.languages,
                        value="Spanish",
                        label="Target Language",
                    )
                    translate_btn = gr.Button("Translate", variant="primary")
                with gr.Column():
                    audio_output = gr.Audio(
                        label="Translated Speech",
                        type="numpy",
                    )
            translate_btn.click(
                fn=self.text_to_speech,
                inputs=[text_input, src_lang, tgt_lang_text],
                outputs=audio_output,
            )

    def _build_speech_tab(self):
        """Create the "Speech to Speech" tab and wire its button to ``speech_to_speech``."""
        with gr.TabItem("Speech to Speech"):
            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(
                        label="Input Speech",
                        type="filepath",
                    )
                    tgt_lang_speech = gr.Dropdown(
                        choices=self.languages,
                        value="Spanish",
                        label="Target Language",
                    )
                    translate_audio_btn = gr.Button("Translate", variant="primary")
                with gr.Column():
                    audio_output_s2s = gr.Audio(
                        label="Translated Speech",
                        type="numpy",
                    )
            translate_audio_btn.click(
                fn=self.speech_to_speech,
                inputs=[audio_input, tgt_lang_speech],
                outputs=audio_output_s2s,
            )

    def _build_demo(self):
        """Assemble the full Blocks layout: header, both tabs, and usage notes."""
        with gr.Blocks(title="SeamlessM4T Translator") as demo:
            # NOTE(review): the "π" below looks like a mis-decoded emoji from
            # the original source; restore the intended glyph if it is known.
            gr.Markdown("# π SeamlessM4T Translator")
            gr.Markdown("Translate text or speech to different languages using Meta's SeamlessM4T model")
            with gr.Tabs():
                self._build_text_tab()
                self._build_speech_tab()
            gr.Markdown(
                """
                ### Notes
                - Text-to-Speech: Enter text and select source/target languages
                - Speech-to-Speech: Upload an audio file and select target language
                - Processing may take a few moments depending on input length
                """
            )
        return demo

    def launch(self, share: bool = True):
        """Build the interface and start the Gradio server.

        Args:
            share: Forwarded to ``Blocks.launch``; defaults to ``True`` to keep
                the previous behaviour.
        """
        demo = self._build_demo()
        demo.launch(share=share)
# Script entry point: build the UI (which loads the model) and serve it.
if __name__ == "__main__":
    GradioInterface().launch()