elizabetvaganova committed
Commit 1196030
1 Parent(s): 6626f2f

Update app.py

Files changed (1): app.py +48 -1
app.py CHANGED
@@ -13,4 +13,51 @@ asr_pipe = pipeline("automatic-speech-recognition", model="alphacep/kaldi-ru", d
 processor = SpeechT5Processor.from_pretrained("ttskit/ttskit-tts-ljspeech")
 
 model = SpeechT5ForTextToSpeech.from_pretrained("ttskit/ttskit-tts-ljspeech").to(device)
-vocoder = SpeechT5HifiGan.from_pretrained("ljspeech/vocoder-cry
+vocoder = SpeechT5HifiGan.from_pretrained("ljspeech/vocoder-cryptron").to(device)
+
+embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
+def translate(audio):
+    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
+    return outputs["text"]
+
+def synthesise(text):
+    inputs = processor(text=text, return_tensors="pt")
+    speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
+    return speech.cpu()
+
+def speech_to_speech_translation(audio):
+    translated_text = translate(audio)
+    synthesised_speech = synthesise(translated_text)
+    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
+    return 16000, synthesised_speech
+
+title = "Cascaded STST"
+description = """
+Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses Vosk for automatic speech recognition, and lightweight text-to-speech and vocoder models.
+"""
+
+demo = gr.Blocks()
+
+mic_translate = gr.Interface(
+    fn=speech_to_speech_translation,
+    inputs=gr.Audio(source="microphone", type="filepath"),
+    outputs=gr.Audio(label="Generated Speech", type="numpy"),
+    title=title,
+    description=description,
+)
+
+file_translate = gr.Interface(
+    fn=speech_to_speech_translation,
+    inputs=gr.Audio(source="upload", type="filepath"),
+    outputs=gr.Audio(label="Generated Speech", type="numpy"),
+    examples=[["./example.wav"]],
+    title=title,
+    description=description,
+)
+
+with demo:
+    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
+
+demo.launch()
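
Note on the new return convention: speech_to_speech_translation returns a (sample_rate, int16 array) tuple, the numpy format gr.Audio accepts. Below is a minimal sketch of exercising that output outside Gradio, assuming the function and its models are already loaded in the current session (importing app.py directly would also run demo.launch()); the scipy dependency and the translated.wav path are illustrative, not part of the app.

```python
import numpy as np
from scipy.io import wavfile

# Assumes speech_to_speech_translation from app.py is defined in this session
# and that ./example.wav (also referenced by the Gradio examples) exists.
sample_rate, audio = speech_to_speech_translation("./example.wav")

# The app scales the float waveform by 32767 and casts to int16, i.e. 16-bit
# PCM, which is what scipy.io.wavfile.write expects for integer audio.
assert audio.dtype == np.int16
wavfile.write("translated.wav", sample_rate, audio)  # illustrative output path
print(f"wrote translated.wav: {sample_rate} Hz, {audio.shape[0]} samples")
```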