import gradio as gr
import numpy as np
import torch
from transformers import pipeline, VitsModel, AutoTokenizer
from transformers import WhisperTokenizer, GenerationConfig
# from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Inspect the forced decoder ids Whisper uses to fix the task/language tokens
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-medium")
generation_config = GenerationConfig.from_pretrained("openai/whisper-medium")
print(generation_config.forced_decoder_ids)
print(tokenizer.decode(generation_config.forced_decoder_ids[1][1]))

# ---------------- Speech recognition: Whisper ----------------------------#
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-medium", device=device)

# ---------------- Speech generator mms-tts-spa --------------------------#
vits_model = VitsModel.from_pretrained("facebook/mms-tts-spa")
vits_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-spa")

# ---------------- Speech generator speecht5_tts (unused alternative) ----#
# model = SpeechT5ForTextToSpeech.from_pretrained(
#     "juangtzi/speecht5_finetuned_voxpopuli_es"
# )
# checkpoint = "microsoft/speecht5_tts"
# processor = SpeechT5Processor.from_pretrained(checkpoint)
# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
# speaker_embeddings2 = np.load('speaker_embeddings.npy')
# speaker_embeddings2 = torch.tensor(speaker_embeddings2)
# print(speaker_embeddings2)

# def language_detector(text):
#     resultado = lang_detector(text)
#     idioma_detectado = resultado[0]['label']
#     print(idioma_detectado)
#     return idioma_detectado


def translate(audio):
    # Transcribe the input audio straight into Spanish text with Whisper
    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "es"})
    print(outputs["text"])
    return outputs["text"]


# def synthesise(text):
#     inputs = processor(text=text, return_tensors="pt")
#     output = model.generate_speech(inputs["input_ids"], speaker_embeddings2, vocoder=vocoder)
#     return output

# def speech_to_speech_translation(audio):
#     translated_text = translate(audio)
#     synthesised_speech = synthesise(translated_text)
#     audio_data = synthesised_speech.cpu().numpy()
#     # audio_data = np.squeeze(audio_data)
#     # audio_data = audio_data / np.max(np.abs(audio_data))
#     sample_rate = 16000
#     return (sample_rate, audio_data)


def synthesise(text):
    # Generate a Spanish waveform from text with the MMS-TTS VITS model
    print(text)
    inputs = vits_tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        output = vits_model(**inputs).waveform[0]
    return output


def speech_to_speech_translation(audio):
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    # Convert the float waveform to 16-bit PCM for Gradio's numpy audio output
    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
    return 16000, synthesised_speech


title = "Cascaded STST"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Spanish.
"""
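# Optional: quick check of the TTS step on its own (a minimal sketch; the Spanish sample
# sentence below is illustrative and not part of the original app). MMS-TTS outputs a
# 1-D float waveform at 16 kHz. Uncomment to run at startup.
# test_wave = synthesise("Hola, esto es una prueba de síntesis de voz.")
# print(test_wave.shape)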
""" demo = gr.Blocks() mic_translate = gr.Interface( fn=speech_to_speech_translation, inputs=gr.Audio(sources="microphone", type="filepath"), outputs=gr.Audio(label="Generated Speech", type="numpy"), title=title, description=description, ) file_translate = gr.Interface( fn=speech_to_speech_translation, inputs=gr.Audio(sources="upload", type="filepath"), outputs=gr.Audio(label="Generated Speech", type="numpy"), examples=[["./example.wav"]], title=title, description=description, ) # Definir la estructura dentro de gr.Blocks() with demo: # Mostrar el título y la descripción gr.Markdown(f"# {title}") gr.Markdown(description) # Incluir la imagen gr.Image("https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png") # Tabbed Interface para las dos modalidades gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"]) # with demo: # gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"]) demo.launch()