File size: 2,481 Bytes
0a5f7db 768b8b7 37cb8cc 2960239 0a5f7db 94cf9c3 0a5f7db 4990310 23f9f81 0a5f7db 71c7a94 37cb8cc 4990310 0a5f7db b503d5a 0a5f7db 4990310 0a5f7db 37cb8cc 6746e58 bcbcb8f 4990310 37cb8cc 4990310 0a5f7db 4990310 37cb8cc 0a5f7db 4990310 0a5f7db 4990310 0a5f7db 4990310 0a5f7db 4990310 0a5f7db 37cb8cc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
import gradio as gr
import numpy as np
import torch
from transformers import pipeline, VitsModel, VitsTokenizer
# Run on the first CUDA GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Integer sample format for generated audio; max_range is the largest value
# that format can hold and is used to scale float samples.
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max
# Speech-to-text stage: Whisper checkpoint served through the transformers
# automatic-speech-recognition pipeline on the selected device.
ASR_MODEL_NAME = "openai/whisper-base"
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    device=device,
)

# Text-to-speech stage: VITS model and its tokenizer, both loaded from the
# same checkpoint.
TTS_MODEL_NAME = "Matthijs/mms-tts-deu"
model = VitsModel.from_pretrained(TTS_MODEL_NAME)
tokenizer = VitsTokenizer.from_pretrained(TTS_MODEL_NAME)
def translate(audio):
    """Run the Whisper ASR pipeline on ``audio`` and return the decoded text.

    Generation is capped at 256 new tokens, and Whisper is asked to
    ``transcribe`` with the language forced to German (``"de"``).
    NOTE(review): forcing German output for non-German speech is presumably
    how this demo obtains a German translation -- confirm.

    Args:
        audio: Input accepted by ``asr_pipe``; the Gradio interfaces in this
            file pass a path to an audio file.

    Returns:
        The ``"text"`` entry of the pipeline output.
    """
    generation_options = {"task": "transcribe", "language": "de"}
    result = asr_pipe(audio, max_new_tokens=256, generate_kwargs=generation_options)
    return result["text"]
def synthesise(text):
    """Generate speech for ``text`` with the module-level VITS model.

    Args:
        text: Text to tokenize and feed to the text-to-speech model.

    Returns:
        The first entry of the model output's ``audio`` field, moved to the
        CPU (presumably the waveform of the single input sentence).
    """
    token_ids = tokenizer(text, return_tensors="pt")["input_ids"]

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        waveform = model(token_ids).audio[0]

    return waveform.cpu()
def speech_to_speech_translation(audio):
    """Turn input speech into synthesised speech as 16-bit integer samples.

    The input is first converted to text by ``translate`` and that text is
    then voiced by ``synthesise``.

    Args:
        audio: Input audio, forwarded unchanged to ``translate``.

    Returns:
        A ``(sampling_rate, samples)`` tuple, where ``sampling_rate`` is the
        constant 16000 and ``samples`` is a NumPy array of ``target_dtype``
        (int16).
        NOTE(review): 16000 is hard-coded and presumably matches the output
        rate of the text-to-speech checkpoint -- confirm.
    """
    translated_text = translate(audio)
    waveform = synthesise(translated_text).numpy()

    # Clamp to [-1, 1] before scaling: a float outside the int16 range does
    # not convert to an integer in a defined way and can come out as a loud
    # wrapped-around sample instead of a saturated one.
    scaled = np.clip(waveform, -1.0, 1.0) * max_range
    return 16000, scaled.astype(target_dtype)
# Heading and Markdown blurb shown on each interface tab.
title = "Cascaded STST - Any language to German speech"
# MMS (Massively Multilingual Speech) is a Meta AI model family, so the
# text-to-speech credit names Meta rather than Microsoft.
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in German. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Meta's
[MMS TTS](https://huggingface.co/Matthijs/mms-tts-deu) model for text-to-speech:
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""
def _build_interface(audio_source, examples=None):
    """Create one speech-to-speech interface reading from ``audio_source``.

    Each call builds its own input and output ``gr.Audio`` components, so
    the resulting interfaces do not share component instances.
    """
    return gr.Interface(
        fn=speech_to_speech_translation,
        inputs=gr.Audio(source=audio_source, type="filepath"),
        outputs=gr.Audio(label="Generated Speech", type="numpy"),
        examples=examples,
        title=title,
        description=description,
    )


demo = gr.Blocks()

# One tab records from the microphone, the other takes an uploaded file and
# offers a bundled example clip.
mic_translate = _build_interface("microphone")
file_translate = _build_interface("upload", examples=[["./example.wav"]])

with demo:
    gr.TabbedInterface(
        [mic_translate, file_translate],
        ["Microphone", "Audio File"],
    )

# Queue requests (concurrency_count=2, max_size=10) and start the app.
demo.queue(concurrency_count=2, max_size=10)
demo.launch()