import gradio as gr
import numpy as np
import torch
from transformers import pipeline, VitsModel, VitsTokenizer

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Target format for the audio returned to Gradio: 16-bit PCM
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max

# load the speech recognition checkpoint (multilingual Whisper)
ASR_MODEL_NAME = "openai/whisper-base"
asr_pipe = pipeline("automatic-speech-recognition", model=ASR_MODEL_NAME, device=device)
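
# Minimal sanity check of the ASR half (assumes the bundled example.wav;
# uncomment to run outside Gradio):
# print(asr_pipe("example.wav", generate_kwargs={"task": "transcribe", "language": "de"})["text"])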


# load text-to-speech checkpoint
model = VitsModel.from_pretrained("Matthijs/mms-tts-deu")
tokenizer = VitsTokenizer.from_pretrained("Matthijs/mms-tts-deu")
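
# VitsConfig exposes the model's output sampling rate (16 kHz for the MMS
# checkpoints); deriving it here avoids hardcoding 16000 in the return below.
SAMPLE_RATE = model.config.sampling_rate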


def translate(audio):
    # Whisper's built-in "translate" task only targets English; forcing
    # task="transcribe" with language="de" instead makes the multilingual
    # model emit German text for source speech in any supported language.
    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "de"})
    return outputs["text"]
    
def synthesise(text):
    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs["input_ids"]

    with torch.no_grad():
        outputs = model(input_ids)

    # VitsModel returns the generated audio under the `waveform` field
    speech = outputs.waveform[0]
    return speech.cpu()
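
# Quick local check of the TTS half (uncomment to run outside Gradio):
# print(synthesise("Hallo, wie geht es dir?").shape)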
    
def speech_to_speech_translation(audio):
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    # Gradio's numpy audio output expects (sample_rate, samples); scale the
    # float waveform from [-1, 1] to the int16 range before returning it.
    synthesised_speech = (synthesised_speech.numpy() * max_range).astype(target_dtype)
    return SAMPLE_RATE, synthesised_speech
    
    
title = "Cascaded STST - Any language to German speech"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in German. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
[MMS TTS](https://huggingface.co/Matthijs/mms-tts-deu) model for text-to-speech:
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""
demo = gr.Blocks()

mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)

file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    examples=[["./example.wav"]],
    title=title,
    description=description,
)

with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

# Gradio 3.x queue API (concurrency_count was removed in Gradio 4)
demo.queue(concurrency_count=2, max_size=10)
demo.launch()