File size: 3,780 Bytes
4127c5c
 
 
 
 
 
 
 
 
 
 
 
 
 
d347764
 
 
 
 
4127c5c
d347764
 
 
 
4127c5c
d347764
4127c5c
d347764
4127c5c
 
 
 
 
 
 
 
 
 
d347764
45c8117
 
d347764
4127c5c
 
 
 
d347764
 
 
 
 
 
 
4127c5c
 
 
 
45c8117
4127c5c
45c8117
d347764
 
 
 
 
4127c5c
d347764
 
4127c5c
d347764
f805e49
 
9bf9f5e
 
 
7a93956
 
9bf9f5e
f805e49
 
 
c737803
 
 
d347764
226ec3a
d347764
f805e49
 
d347764
c737803
 
 
 
 
 
 
 
 
 
4127c5c
c737803
7a93956
4127c5c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# -*- coding: utf-8 -*-
"""HW3_ml.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1z4ht7K9pttbgWmDDnrQhqoZ6SYAiaeUe
"""

# !pip -q uninstall gradio -y
# !pip -q install gradio==3.50.2

# !pip -q install datasets

import gradio as gr
import numpy as np
import torch
from datasets import load_dataset

from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline, WhisperProcessor

# Prefer the first CUDA GPU when one is available; otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load the automatic-speech-recognition (ASR) checkpoint. This is the only
# object in this section that is explicitly given `device`.
asr_pipe = pipeline("automatic-speech-recognition", model="voidful/wav2vec2-xlsr-multilingual-56", device=device)

# !pip -q install sentencepiece
# NOTE(review): `processor` is not referenced anywhere else in the visible
# file -- confirm whether loading the Whisper processor is still needed.
# processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small")

# Two text-translation pipelines that are chained by translator_mul_ru():
# translator1 uses the "mul-en" checkpoint, translator2 the "en-ru" checkpoint.
translator1 = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
translator2 = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ru")

from transformers import VitsModel, VitsTokenizer

# model = pipeline("text-to-speech", model="suno/bark-small")

# VITS text-to-speech model and its matching tokenizer (used by synthesise()).
model = VitsModel.from_pretrained("facebook/mms-tts-rus")
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-rus")

def translator_mul_ru(text):
    """Translate *text* by chaining the two module-level translation pipelines.

    The input is first run through ``translator1`` (the "mul-en" checkpoint);
    the ``translation_text`` of its first result is then run through
    ``translator2`` (the "en-ru" checkpoint).

    Args:
        text: Source text handed to ``translator1``.

    Returns:
        The ``translation_text`` field of the first result produced by
        ``translator2``.
    """
    first_pass = translator1(text)
    english_text = first_pass[0]['translation_text']
    second_pass = translator2(english_text)
    return second_pass[0]['translation_text']

def translate(audio):
    """Run the module-level ASR pipeline on *audio* and return its text.

    Args:
        audio: Audio input forwarded unchanged to ``asr_pipe``.

    Returns:
        The ``"text"`` entry of the pipeline result.
    """
    # NOTE(review): these generation options look aimed at a seq2seq model;
    # confirm whether they have any effect with the wav2vec2 checkpoint that
    # ``asr_pipe`` is built from.
    generation_options = {"task": "translate"}
    asr_result = asr_pipe(
        audio,
        max_new_tokens=256,
        generate_kwargs=generation_options,
    )
    return asr_result["text"]


def synthesise(text):
    """Translate *text* with ``translator_mul_ru`` and synthesise speech from it.

    Args:
        text: Text to translate and then voice.

    Returns:
        The model's ``"waveform"`` output, moved to the CPU.
    """
    russian_text = translator_mul_ru(text)
    token_ids = tokenizer(russian_text, return_tensors="pt")["input_ids"]

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        model_output = model(token_ids)

    waveform = model_output["waveform"]
    return waveform.cpu()


def speech_to_speech_translation(audio):
    """Turn input speech into synthesised speech via ASR, translation and TTS.

    The audio is transcribed with ``translate``, the text is printed (kept for
    log visibility), voiced with ``synthesise``, and the resulting float
    waveform is converted to 16-bit integer PCM.

    Args:
        audio: Audio input forwarded to ``translate``.

    Returns:
        A ``(sample_rate, samples)`` tuple, where ``samples`` is the first
        entry along the leading axis of the int16 waveform array
        (presumably the batch axis -- confirm against the VITS output shape).
    """
    translated_text = translate(audio)
    print(translated_text)

    waveform = synthesise(translated_text).numpy()
    # Clip before scaling: a sample outside [-1, 1] would exceed the int16
    # range and overflow on the cast instead of saturating.
    pcm = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)

    # Take the rate from the loaded TTS model's config rather than hard-coding
    # it, so it stays correct if the checkpoint is swapped.
    return model.config.sampling_rate, pcm[0]

# Title and Markdown description shown on both Gradio interfaces below.
title = "Cascaded STST"
description = """
* Сначала модель распознает речь с помощью voidful/wav2vec2-xlsr-multilingual-56 и возвращает текст на любом из 56 языков.
* Далее происходит перевод текста с любого на английский с помощью Helsinki-NLP/opus-mt-mul-en, а затем с английского на русский также с помощью Helsinki-NLP/opus-mt-en-ru
* В конце осуществляется воспроизведение русского текста моделью facebook/mms-tts-rus


Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Russian. Demo uses facebook/mms-tts-rus model for text-to-speech:
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""

# Container that hosts the tabbed UI assembled at the bottom of the file.
demo = gr.Blocks()

# Interface fed from the microphone. The audio is handed to the callback as a
# file path and the callback's (sample_rate, samples) result is played back.
# NOTE(review): the `source=` keyword presumably matches the gradio 3.x line
# pinned in the commented pip command near the top of the file -- confirm
# against the installed gradio version.
mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)

# Same pipeline as above, but the audio comes from an uploaded file.
file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)

# Place both interfaces inside the Blocks container as two named tabs.
with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "File"])

# Runs at import time: the script has no `if __name__ == "__main__"` guard.
demo.launch()