"""Gradio demo: cascaded speech-to-speech pipeline with Spanish speech output.

The pipeline chains two models:

1. Whisper (``openai/whisper-large-v2``) turns the input audio into text, with
   decoding forced to Spanish.
2. A fine-tuned SpeechT5 checkpoint plus the SpeechT5 HiFi-GAN vocoder turns
   that text back into a waveform.

All models are loaded once at import time and shared by every request.
"""

import gradio as gr
import numpy as np
import torch
from transformers import AutoTokenizer, VitsModel, pipeline
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor


device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Sample rate reported to Gradio together with the synthesised waveform.
# NOTE(review): presumably matches the SpeechT5 vocoder output rate - confirm.
SAMPLE_RATE = 16000

asr_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v2",
    device=device,
)

# Alternative MMS text-to-speech backend, currently disabled.
# vist_model = VitsModel.from_pretrained("facebook/mms-tts-spa")
# vist_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-spa")

model = SpeechT5ForTextToSpeech.from_pretrained(
    "juangtzi/speecht5_finetuned_voxpopuli_es"
)
checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Speaker embedding used for every synthesis call, read from a local file.
# NOTE(review): generate_speech presumably expects shape (1, 512) - confirm
# against the contents of speaker_embeddings.npy.
speaker_embeddings2 = np.load('speaker_embeddings.npy')
speaker_embeddings2 = torch.tensor(speaker_embeddings2)
print(speaker_embeddings2)

lang_detector = pipeline(
    "text-classification",
    model="papluca/xlm-roberta-base-language-detection",
)


def language_detector(text):
    """Return the label of the classifier's first prediction for ``text``.

    The label is also printed to stdout.
    """
    predictions = lang_detector(text)
    detected_language = predictions[0]['label']
    print(detected_language)
    return detected_language


def translate(audio):
    """Run Whisper on ``audio`` and return the decoded text.

    Decoding uses ``task="transcribe"`` with the language forced to ``"es"``.

    Args:
        audio: Input accepted by the ASR pipeline; the UI below passes a
            file path.

    Returns:
        The ``"text"`` field of the pipeline output.
    """
    outputs = asr_pipe(
        audio,
        max_new_tokens=256,
        generate_kwargs={"task": "transcribe", "language": "es"},
    )
    return outputs["text"]


def synthesise(text):
    """Generate speech for ``text`` with SpeechT5 and the HiFi-GAN vocoder.

    Args:
        text: Text to be spoken.

    Returns:
        The tensor returned by ``model.generate_speech`` for the module-level
        speaker embedding.
    """
    inputs = processor(text=text, return_tensors="pt")
    return model.generate_speech(
        inputs["input_ids"], speaker_embeddings2, vocoder=vocoder
    )


def speech_to_speech_translation(audio):
    """Convert input speech into synthesised speech via ASR followed by TTS.

    Args:
        audio: Path to the recorded or uploaded audio file, or ``None`` when
            the user submitted the form without providing audio.

    Returns:
        A ``(sample_rate, waveform)`` tuple for a ``gr.Audio`` output of type
        ``"numpy"``. The waveform is peak-normalised when it has a non-zero
        peak.

    Raises:
        gr.Error: If no audio was provided.
    """
    if audio is None:
        # Surface a readable message in the UI instead of an opaque failure
        # from inside the ASR pipeline.
        raise gr.Error("No audio was provided. Please record or upload a clip.")

    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)

    audio_data = np.squeeze(synthesised_speech.cpu().numpy())

    # Peak-normalise, but only when there is a positive peak: dividing a
    # silent (all-zero) waveform by zero would fill it with NaNs, and taking
    # the max of an empty waveform would raise.
    peak = np.max(np.abs(audio_data)) if audio_data.size else 0.0
    if peak > 0:
        audio_data = audio_data / peak

    return (SAMPLE_RATE, audio_data)


title = "Cascaded STST"
description = """ Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Spanish. 
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation") """

demo = gr.Blocks()

# Tab 1: record from the microphone.
mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)

# Tab 2: upload an audio file (ships with one bundled example).
file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    examples=[["./example.wav"]],
    title=title,
    description=description,
)

with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

demo.launch()