gsvann committed on
Commit
aa9a785
·
1 Parent(s): 83b5be8

Update for Russian speech app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -10
app.py CHANGED
@@ -8,9 +8,8 @@ from transformers import pipeline, MarianMTModel, MarianTokenizer, VitsModel, Vi
8
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
9
 
10
  import phonemizer
11
- # сначала использовала facebook/wav2vec2-lv-60-espeak-cv-ft в коллабе, но здесь не загружается библиотека py-espeak-ng
12
- model_wav2vec = 'openai/whisper-small' #'voidful/wav2vec2-xlsr-multilingual-56'
13
-
14
  asr_pipe = pipeline("automatic-speech-recognition", model=model_wav2vec, device=device)
15
 
16
  # load speech-to-text checkpoint
@@ -18,7 +17,7 @@ def translate_audio(audio):
18
  outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
19
  return outputs["text"]
20
 
21
- # translation to Russian
22
  def translate_text(text):
23
  # to English - mul en, to Russian - en ru
24
  model_mul_en = pipeline("translation", model = "Helsinki-NLP/opus-mt-mul-en")
@@ -27,8 +26,8 @@ def translate_text(text):
27
  return translated_text[0]['translation_text']
28
 
29
  # load text-to-speech checkpoint
30
- model = VitsModel.from_pretrained("facebook/mms-tts-rus")
31
- tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-rus")
32
 
33
  def synthesise(text):
34
  translated_text = translate_text(text)
@@ -47,13 +46,14 @@ def speech_to_speech_translation(audio):
47
  return 16000, synthesised_speech[0]
48
 
49
 
50
- title = "Cascaded STST for Russian"
51
  description = """
52
  * В начале происходит распознавание речи с помощью модели openai/whisper-small.
53
- * Затем полученный текст переводится сначала на английский с помощью Helsinki-NLP/opus-mt-mul-en, а потом на русский с помощью Helsinki-NLP/opus-mt-en-ru
54
- * На последнем шаге полученный текст озвучивается с помощью fine-tune-говой версии microsoft/speecht5_tts - voxxer/speecht5_finetuned_commonvoice_ru_translit
55
 
56
- Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Russian. Demo uses openai/whisper-small for speech-to-text and facebook/mms-tts-rus model for text-to-speech:
 
57
  ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
58
  """
59
 
 
8
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
9
 
10
  import phonemizer
11
+ # alternatives: 'voidful/wav2vec2-xlsr-multilingual-56'; 'facebook/wav2vec2-lv-60-espeak-cv-ft' (the latter cannot be used here because the py-espeak-ng library fails to load)
12
+ model_wav2vec = 'openai/whisper-small'
 
13
  asr_pipe = pipeline("automatic-speech-recognition", model=model_wav2vec, device=device)
14
 
15
  # load speech-to-text checkpoint
 
17
  outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
18
  return outputs["text"]
19
 
20
+ # translation into Russian
21
  def translate_text(text):
22
  # to English - mul en, to Russian - en ru
23
  model_mul_en = pipeline("translation", model = "Helsinki-NLP/opus-mt-mul-en")
 
26
  return translated_text[0]['translation_text']
27
 
28
  # load text-to-speech checkpoint
29
+ model = VitsModel.from_pretrained("coqui/XTTS-v2") # or facebook/mms-tts-rus
30
+ tokenizer = VitsTokenizer.from_pretrained("coqui/XTTS-v2") # or facebook/mms-tts-rus
31
 
32
  def synthesise(text):
33
  translated_text = translate_text(text)
 
46
  return 16000, synthesised_speech[0]
47
 
48
 
49
+ title = "Cascaded STST. Russian language version"
50
  description = """
51
  * В начале происходит распознавание речи с помощью модели openai/whisper-small.
52
+ * Затем полученный текст переводится сначала на английский с помощью Helsinki-NLP/opus-mt-mul-en, а потом на русский с помощью Helsinki-NLP/opus-mt-en-ru.
53
+ * На последнем шаге полученный текст озвучивается с помощью модели coqui/XTTS-v2.
54
 
55
+ Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Russian.
56
+ Demo uses `openai/whisper-small` for speech-to-text and the `facebook/mms-tts-rus` model for text-to-speech:
57
  ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
58
  """
59