File size: 4,211 Bytes
423c1cd
 
 
 
 
 
 
0b0e130
460cdbb
0b0e130
 
423c1cd
 
 
 
 
 
 
59671be
 
 
 
 
 
 
 
0b0e130
460cdbb
 
 
 
 
 
 
 
 
 
59671be
0b0e130
59671be
 
 
423c1cd
 
460cdbb
 
423c1cd
 
 
59671be
 
 
 
423c1cd
59671be
423c1cd
7c35a39
423c1cd
 
460cdbb
 
 
 
 
 
423c1cd
 
460cdbb
 
 
 
 
423c1cd
460cdbb
423c1cd
 
 
 
 
 
 
 
 
 
59671be
 
 
 
 
 
 
423c1cd
59671be
 
 
 
 
 
423c1cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import gradio as gr
import numpy as np
import torch
from datasets import load_dataset

from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline

## Imports for MMS
from transformers import VitsModel, VitsTokenizer



# Run on the first CUDA GPU when one is available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Speech recognition pipeline built on the Whisper Base checkpoint, placed on `device`.
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)

# Original (English) SpeechT5 text-to-speech setup, kept for reference:
# processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

# model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)


# For Dutch

##### speecht5 #####
# Alternative Dutch SpeechT5 checkpoint, currently disabled:
# model_id = 'sanchit-gandhi/speecht5_tts_vox_nl'
# processor = SpeechT5Processor.from_pretrained(model_id)
# model = SpeechT5ForTextToSpeech.from_pretrained(model_id)

# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)


##### mms #####
# Active text-to-speech model and its tokenizer (Dutch MMS checkpoint, loaded as a VITS model).
# NOTE(review): unlike `vocoder` below, `model` is never moved to `device`;
# presumably it stays on the CPU -- confirm this is intended.
model = VitsModel.from_pretrained("Matthijs/mms-tts-nld")
tokenizer = VitsTokenizer.from_pretrained("Matthijs/mms-tts-nld")

# NOTE(review): in the code visible in this file, `vocoder` is only referenced by the
# commented-out SpeechT5 `synthesise`; the active MMS path does not use it.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)







# Speaker x-vectors: entry 7306 of the validation split is turned into a tensor and
# given a leading dimension with unsqueeze(0).
# NOTE(review): like `vocoder`, `speaker_embeddings` is only referenced by commented-out
# code in this file -- confirm whether this load is still needed.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# Original
# def translate(audio):
#     outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
#     return outputs["text"]

# Dutch
def translate(audio):
    """Run the Whisper ASR pipeline on ``audio`` and return the recognised text.

    The pipeline is invoked with the ``transcribe`` task and the language
    forced to ``"nl"``, generating at most 256 new tokens.

    Args:
        audio: Input handed straight to ``asr_pipe``; the Gradio interfaces in
            this file pass a file path.

    Returns:
        The ``"text"`` entry of the pipeline output.
    """
    generation_options = {"task": "transcribe", "language": "nl"}
    result = asr_pipe(audio, max_new_tokens=256, generate_kwargs=generation_options)
    return result["text"]

# Original
# def synthesise(text):
#     inputs = processor(text=text, return_tensors="pt")
#     speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
#     return speech.cpu()


def synthesise(text):
    """Convert ``text`` to speech with the module-level tokenizer and model.

    Args:
        text: The string to be spoken.

    Returns:
        The first element of the model's ``audio`` output, moved to the CPU.
    """
    token_ids = tokenizer(text, return_tensors="pt")["input_ids"]
    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        waveform = model(token_ids).audio[0]
    return waveform.cpu()
    


def speech_to_speech_translation(audio):
    """Turn input speech into synthesised speech for a Gradio audio output.

    The audio is first converted to text with ``translate``, the text is then
    spoken with ``synthesise``, and the resulting waveform is converted to
    16-bit integer samples.

    Args:
        audio: Input forwarded unchanged to ``translate``.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 16000 and
        ``samples`` is a NumPy ``int16`` array.
    """
    translated_text = translate(audio)
    waveform = synthesise(translated_text).numpy()
    # Clamp to [-1, 1] before scaling: a float sample outside that range would
    # overflow the int16 cast and wrap around, producing audible clicks.
    pcm_samples = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)
    # NOTE(review): the sample rate is hard-coded; presumably it matches the
    # output rate of the TTS model -- confirm if the model is swapped.
    return 16000, pcm_samples


# Heading shown at the top of each interface tab.
title = "Cascaded STST"

# Markdown blurb shown under the title.
# The text must not carry leading "# " markers: inside the string they are not
# Python comments, and Markdown would render each such line as a heading.
# The text-to-speech link points at the checkpoint this demo actually loads.
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Dutch. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and the
[MMS TTS](https://huggingface.co/Matthijs/mms-tts-nld) model for text-to-speech:

![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""



demo = gr.Blocks()


def _build_interface(audio_source, **extra_options):
    """Create one speech-to-speech interface reading audio from ``audio_source``.

    Each call builds fresh input and output audio components. Any
    ``extra_options`` are forwarded to ``gr.Interface`` unchanged.
    """
    return gr.Interface(
        fn=speech_to_speech_translation,
        inputs=gr.Audio(source=audio_source, type="filepath"),
        outputs=gr.Audio(label="Generated Speech", type="numpy"),
        title=title,
        description=description,
        **extra_options,
    )


# One tab records from the microphone, the other accepts an uploaded file
# and additionally offers a bundled example clip.
mic_translate = _build_interface("microphone")
file_translate = _build_interface("upload", examples=[["./example.wav"]])

with demo:
    tabs = [mic_translate, file_translate]
    tab_labels = ["Microphone", "Audio File"]
    gr.TabbedInterface(tabs, tab_labels)

demo.launch()