# Hugging Face file-viewer header (not part of the program): juangtzi — "Update app.py" — commit b7cd514 (verified) — raw / history / blame — 2.86 kB
import gradio as gr
import numpy as np
import torch
from transformers import pipeline, VitsModel, AutoTokenizer, AutoTokenizer
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor
# ---------------------------------------------------------------------------
# Model setup (runs once at import time).
# ---------------------------------------------------------------------------

# Prefer the first CUDA device when one is available, otherwise use the CPU.
# NOTE: only the ASR pipeline below is given `device`; the other models are
# loaded with the library defaults.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda:0"

# Speech recognition front-end (Whisper large-v2).
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-large-v2",
    device=device,
)

# Disabled alternative TTS backend (MMS VITS, Spanish):
#vist_model = VitsModel.from_pretrained("facebook/mms-tts-spa")
#vist_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-spa")

# Text-to-speech stack: SpeechT5 acoustic model, the processor of the base
# SpeechT5 checkpoint, and the HiFi-GAN vocoder that produces the waveform.
checkpoint = "microsoft/speecht5_tts"
model = SpeechT5ForTextToSpeech.from_pretrained(
    "juangtzi/speecht5_finetuned_voxpopuli_es",
)
processor = SpeechT5Processor.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Speaker embedding read from a local .npy file and converted to a tensor;
# it is passed to `model.generate_speech` on every synthesis call.
speaker_embeddings2 = torch.tensor(np.load('speaker_embeddings.npy'))
print(speaker_embeddings2)

# Text language classifier used by `language_detector`.
lang_detector = pipeline(
    "text-classification",
    model="papluca/xlm-roberta-base-language-detection",
)
def language_detector(text):
    """Classify the language of *text* and return the predicted label.

    The text is run through the module-level ``lang_detector`` pipeline and
    the ``'label'`` field of the first returned entry is used. The label is
    also printed to stdout.

    Args:
        text: The text to classify.

    Returns:
        The ``'label'`` value of the first classification result.
    """
    first_prediction = lang_detector(text)[0]
    detected_label = first_prediction['label']
    print(detected_label)
    return detected_label
def translate(audio):
    """Convert speech in *audio* to text with the Whisper ASR pipeline.

    Generation is capped at 256 new tokens and Whisper is invoked with
    ``task="transcribe"`` and ``language="es"``.

    NOTE(review): despite the function name, the Whisper task is
    "transcribe" (not "translate") with the language pinned to Spanish;
    presumably this is what yields Spanish text for the demo — confirm.

    Args:
        audio: Input forwarded unchanged to ``asr_pipe`` (the Gradio inputs
            in this app are configured with ``type="filepath"``).

    Returns:
        The ``"text"`` field of the pipeline output.
    """
    whisper_options = {"task": "transcribe", "language": "es"}
    result = asr_pipe(
        audio,
        max_new_tokens=256,
        generate_kwargs=whisper_options,
    )
    return result["text"]
def synthesise(text):
    """Generate speech for *text* with the SpeechT5 model and vocoder.

    The text is tokenised by the module-level ``processor``; the resulting
    ``input_ids`` are passed to ``model.generate_speech`` together with the
    module-level speaker embedding and the HiFi-GAN ``vocoder``.

    Args:
        text: The text to speak.

    Returns:
        The value returned by ``model.generate_speech`` (the caller in this
        file treats it as a torch tensor holding the waveform).
    """
    token_ids = processor(text=text, return_tensors="pt")["input_ids"]
    return model.generate_speech(
        token_ids,
        speaker_embeddings2,
        vocoder=vocoder,
    )
def speech_to_speech_translation(audio):
    """Run the full cascade: speech -> text -> synthesised speech.

    The input is converted to text with ``translate``, spoken with
    ``synthesise``, and the resulting waveform is peak-normalised before
    being handed back to Gradio.

    Args:
        audio: The audio input supplied by the Gradio component (a file
            path in this app), or ``None`` when nothing was recorded or
            uploaded.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 16000
        and ``samples`` is a NumPy array with singleton dimensions removed.

    Raises:
        gr.Error: If ``audio`` is ``None``.
    """
    if audio is None:
        # Gradio passes None when the user submits without any audio; fail
        # with a readable message instead of an error deep inside the ASR
        # pipeline.
        raise gr.Error("No audio was provided. Please record or upload a clip first.")

    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)

    audio_data = np.squeeze(synthesised_speech.cpu().numpy())

    # Peak-normalise, but only when there is a non-zero peak: dividing an
    # all-zero (silent) waveform by its peak would fill it with NaNs, and
    # np.max raises on an empty array.
    peak = np.max(np.abs(audio_data)) if audio_data.size else 0.0
    if peak > 0:
        audio_data = audio_data / peak

    sample_rate = 16000
    return (sample_rate, audio_data)
# Text shown at the top of each tab of the demo.
title = "Cascaded STST"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Spanish.
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""


def _build_translation_tab(source, examples=None):
    """Create one ``gr.Interface`` wired to ``speech_to_speech_translation``.

    Args:
        source: Value for the input ``gr.Audio`` component's ``sources``
            argument (``"microphone"`` or ``"upload"`` in this file).
        examples: Optional example inputs forwarded to ``gr.Interface``.

    Returns:
        The configured ``gr.Interface``.
    """
    return gr.Interface(
        fn=speech_to_speech_translation,
        inputs=gr.Audio(sources=source, type="filepath"),
        outputs=gr.Audio(label="Generated Speech", type="numpy"),
        examples=examples,
        title=title,
        description=description,
    )


demo = gr.Blocks()

# One tab per input method; both share the same processing function.
mic_translate = _build_translation_tab("microphone")
file_translate = _build_translation_tab("upload", examples=[["./example.wav"]])

with demo:
    gr.TabbedInterface(
        [mic_translate, file_translate],
        ["Microphone", "Audio File"],
    )

demo.launch()