File size: 3,453 Bytes
e5dda8b
d347764
 
 
 
e5dda8b
d347764
 
 
e5dda8b
aa9a785
 
e5dda8b
d347764
b0adbe9
e5dda8b
b0adbe9
e5dda8b
d347764
aa9a785
b0adbe9
 
 
 
 
 
d347764
e5dda8b
3c32103
 
d347764
 
b0adbe9
e5dda8b
 
 
 
 
d347764
 
 
 
e5dda8b
 
d347764
e5dda8b
d347764
 
9ecfe83
f805e49
9ecfe83
 
 
83b5be8
aa9a785
 
f805e49
 
 
c737803
 
 
d347764
226ec3a
d347764
f805e49
 
d347764
c737803
 
 
 
 
 
 
 
 
 
 
3946ba6
c737803
d347764
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
"""Assignment 3: cascaded speech-to-speech translation demo with Russian speech output."""
import gradio as gr
import numpy as np
import torch

from transformers import pipeline, MarianMTModel, MarianTokenizer, VitsModel, VitsTokenizer

# Run inference on the first CUDA GPU when one is available, otherwise on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

import phonemizer
# Speech-recognition checkpoint. Alternatives that were considered:
# 'voidful/wav2vec2-xlsr-multilingual-56' and
# 'facebook/wav2vec2-lv-60-espeak-cv-ft', but the py-espeak-ng library
# fails to load in this environment.
# NOTE: despite the variable name, the checkpoint actually used is Whisper.
model_wav2vec = 'openai/whisper-small'
asr_pipe = pipeline("automatic-speech-recognition", model=model_wav2vec, device=device)

def translate_audio(audio):
    """Run the ASR pipeline on ``audio`` with the ``translate`` task.

    Args:
        audio: Any input accepted by ``asr_pipe``; the Gradio widgets in this
            file are configured with ``type="filepath"``.

    Returns:
        The ``"text"`` field of the pipeline output.
    """
    # The "translate" task asks Whisper to emit a translation rather than a
    # same-language transcript.
    generation_options = {"task": "translate"}
    result = asr_pipe(
        audio,
        max_new_tokens=256,
        generate_kwargs=generation_options,
    )
    return result["text"]

# Translation pipelines are expensive to construct (the model weights are
# loaded on creation), so each one is built once and reused across requests
# instead of being rebuilt on every call.
_TRANSLATION_PIPELINES = {}


def _get_translation_pipeline(model_name):
    """Return the cached translation pipeline for ``model_name``, creating it on first use."""
    if model_name not in _TRANSLATION_PIPELINES:
        _TRANSLATION_PIPELINES[model_name] = pipeline("translation", model=model_name)
    return _TRANSLATION_PIPELINES[model_name]


def translate_text(text):
    """Translate ``text`` into Russian, pivoting through English.

    The text is first passed through ``Helsinki-NLP/opus-mt-mul-en``
    (multilingual -> English) and the result through
    ``Helsinki-NLP/opus-mt-en-ru`` (English -> Russian).

    Args:
        text: Source text to translate.

    Returns:
        The ``translation_text`` of the first result produced by the
        English -> Russian pipeline.
    """
    to_english = _get_translation_pipeline("Helsinki-NLP/opus-mt-mul-en")
    to_russian = _get_translation_pipeline("Helsinki-NLP/opus-mt-en-ru")
    english_text = to_english(text)[0]['translation_text']
    return to_russian(english_text)[0]['translation_text']

# Load the text-to-speech checkpoint ("facebook/mms-tts-rus") together with its
# matching tokenizer; both are module-level and used by `synthesise`.
model = VitsModel.from_pretrained("facebook/mms-tts-rus")
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-rus")

def synthesise(text):
    """Translate ``text`` to Russian and synthesise it as speech.

    Args:
        text: Text to translate with ``translate_text`` and then voice.

    Returns:
        The ``"waveform"`` output of the TTS model, moved to the CPU.
    """
    russian_text = translate_text(text)
    token_ids = tokenizer(russian_text, return_tensors="pt")["input_ids"]
    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        tts_output = model(token_ids)
    waveform = tts_output["waveform"]
    return waveform.cpu()


def speech_to_speech_translation(audio):
    """Turn input speech into synthesised Russian speech.

    Pipeline: ``translate_audio`` (speech -> text) followed by ``synthesise``
    (text -> Russian translation -> waveform), then conversion of the float
    waveform to 16-bit PCM for the Gradio audio output.

    Args:
        audio: Audio input as delivered by the Gradio ``Audio`` component
            (configured with ``type="filepath"`` in this file). ``None`` when
            the user submits without recording or uploading anything.

    Returns:
        A ``(sampling_rate, samples)`` tuple where ``samples`` is the first
        waveform of the batch as an ``int16`` NumPy array.

    Raises:
        gr.Error: If no audio was provided.
    """
    if audio is None:
        # Surface a readable message in the UI instead of an opaque failure
        # from inside the ASR pipeline.
        raise gr.Error("No audio was provided. Please record or upload a clip first.")

    text_from_audio = translate_audio(audio)
    waveform = synthesise(text_from_audio).numpy()

    # Clip before scaling: any sample outside [-1, 1] would otherwise wrap
    # around when cast to int16 and produce loud clicks.
    max_int16 = np.iinfo(np.int16).max
    pcm = (np.clip(waveform, -1.0, 1.0) * max_int16).astype(np.int16)

    # Take the rate from the TTS model config rather than hard-coding it, so
    # playback stays correct if the checkpoint is swapped.
    return model.config.sampling_rate, pcm[0]


# Texts shown in the Gradio UI. `description` is Markdown: a Russian summary of
# the three pipeline stages (speech recognition, translation via English into
# Russian, speech synthesis), followed by an English summary and a diagram.
title = "Cascaded STST. **Russian** language version"
description = """
* В начале происходит распознавание речи с помощью модели `openai/whisper-small`.
* Затем полученный текст переводится сначала на английский с помощью `Helsinki-NLP/opus-mt-mul-en`, а потом на русский с помощью `Helsinki-NLP/opus-mt-en-ru`.
* На последнем шаге полученный текст озвучивается с помощью модели `facebook/mms-tts-rus model`.
  
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Russian.  
Demo uses `openai/whisper-small` for speech-to-text and `facebook/mms-tts-rus model` for text-to-speech:
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""

# Assemble the UI: two interfaces (microphone and file upload) that share the
# same handler and texts, shown as tabs inside a single Blocks app.
demo = gr.Blocks()

# Keyword arguments common to both interfaces.
_shared_interface_options = dict(
    fn=speech_to_speech_translation,
    title=title,
    description=description,
)

mic_translate = gr.Interface(
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    **_shared_interface_options,
)

file_translate = gr.Interface(
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    examples=[["./example.wav"]],
    **_shared_interface_options,
)

_tab_interfaces = [mic_translate, file_translate]
_tab_titles = ["Microphone", "Audio File"]

with demo:
    gr.TabbedInterface(_tab_interfaces, _tab_titles)

demo.launch()