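# Cascaded speech-to-speech translation (STST) demo:
# Whisper (ASR, forced to Spanish output) feeds MMS-TTS (Spanish speech synthesis),
# served through a tabbed Gradio interface.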
import gradio as gr
import numpy as np
import torch
from transformers import pipeline, VitsModel, AutoTokenizer
# from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor
from transformers import WhisperTokenizer, GenerationConfig
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Debugging aid: inspect the forced decoder IDs Whisper uses to control language/task.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-medium")
generation_config = GenerationConfig.from_pretrained("openai/whisper-medium")
print(tokenizer.decode(generation_config.forced_decoder_ids[1][1]))
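# Whisper-medium handles multilingual speech recognition; it runs on the GPU when one is available.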
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-medium", device=device)
# ---------------- Speech generator mms-tts-spa --------------------------#
vits_model = VitsModel.from_pretrained("facebook/mms-tts-spa")
vits_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-spa")
# ---------------- Speech generator specht5_tts --------------------------#
# model = SpeechT5ForTextToSpeech.from_pretrained(
# "juangtzi/speecht5_finetuned_voxpopuli_es"
# )
# checkpoint = "microsoft/speecht5_tts"
# processor = SpeechT5Processor.from_pretrained(checkpoint)
# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
# speaker_embeddings2 = np.load('speaker_embeddings.npy')
# speaker_embeddings2 = torch.tensor(speaker_embeddings2)
# print(speaker_embeddings2)
# def language_detector(text):
# resultado = lang_detector(text)
# idioma_detectado = resultado[0]['label']
# print(idioma_detectado)
# return idioma_detectado
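# Note: Whisper's own "translate" task only targets English, so this instead forces
# task="transcribe" with language="es", which makes the model emit Spanish text
# regardless of the source language.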
def translate(audio):
    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "es"})
    print(outputs["text"])
    return outputs["text"]
# def synthesise(text):
# inputs = processor(text=text, return_tensors="pt")
# output = model.generate_speech(inputs["input_ids"], speaker_embeddings2, vocoder=vocoder)
# return output
# def speech_to_speech_translation(audio):
# translated_text = translate(audio)
# synthesised_speech = synthesise(translated_text)
# audio_data = synthesised_speech.cpu().numpy()
# #audio_data = np.squeeze(audio_data)
# #audio_data = audio_data / np.max(np.abs(audio_data))
# sample_rate = 16000
# return (sample_rate, audio_data)
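# Active TTS path: VITS (MMS) generates a waveform directly from Spanish text,
# with no separate vocoder or speaker embeddings needed.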
def synthesise(text):
    print(text)
    inputs = vits_tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        output = vits_model(**inputs).waveform[0]
    return output
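# Full cascade: input audio -> Spanish text -> Spanish speech, returned as
# (sampling_rate, int16 PCM), the numpy format Gradio's Audio output expects.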
def speech_to_speech_translation(audio):
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
    return vits_model.config.sampling_rate, synthesised_speech
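# Quick local sanity check (assumes example.wav, the sample referenced in the
# file_translate examples below, sits next to this script):
#   rate, pcm = speech_to_speech_translation("./example.wav")
#   print(rate, pcm.shape, pcm.dtype)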
title = "Cascaded STST"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Spanish.
"""
demo = gr.Blocks()
mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)
file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(sources=["upload"], type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    examples=[["./example.wav"]],
    title=title,
    description=description,
)
# Define the layout inside gr.Blocks()
with demo:
    # Show the title and description
    gr.Markdown(f"# {title}")
    gr.Markdown(description)
    # Include the diagram of the cascaded pipeline
    gr.Image("https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png")
    # Tabbed interface for the two input modes
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
demo.launch()