speech-to-speech-translation-test

Sleeping

App Files Community

juangtzi committed on Oct 10

Commit

c778606

•

1 Parent(s): 09708e9

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -18

app.py CHANGED Viewed

@@ -17,24 +17,35 @@ generation_config.forced_decoder_ids
 tokenizer.decode(generation_config.forced_decoder_ids[1][1])
 asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-medium", device=device)
 #vist_model = VitsModel.from_pretrained("facebook/mms-tts-spa")
 #vist_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-spa")
-model = SpeechT5ForTextToSpeech.from_pretrained(
-    "juangtzi/speecht5_finetuned_voxpopuli_es"
-)
-checkpoint = "microsoft/speecht5_tts"
-processor = SpeechT5Processor.from_pretrained(checkpoint)
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-speaker_embeddings2 = np.load('speaker_embeddings.npy')
-speaker_embeddings2 = torch.tensor(speaker_embeddings2)
-print(speaker_embeddings2)
-lang_detector = pipeline("text-classification", model="papluca/xlm-roberta-base-language-detection")
 def language_detector(text):
     resultado = lang_detector(text)
@@ -47,19 +58,32 @@ def translate(audio):
     print(outputs["text"])
     return outputs["text"]
 def synthesise(text):
-    inputs = processor(text=text, return_tensors="pt")
-    output = model.generate_speech(inputs["input_ids"], speaker_embeddings2, vocoder=vocoder)
-    return output
 def speech_to_speech_translation(audio):
     translated_text = translate(audio)
     synthesised_speech = synthesise(translated_text)
-    audio_data = synthesised_speech.cpu().numpy()
-    audio_data = np.squeeze(audio_data)
-    audio_data = audio_data / np.max(np.abs(audio_data))
     sample_rate = 16000
-    return (sample_rate, audio_data)
 title = "Cascaded STST"
 description = """

 tokenizer.decode(generation_config.forced_decoder_ids[1][1])
 asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-medium", device=device)
+# ---------------- Speech generator mms-tts-spa --------------------------#
 #vist_model = VitsModel.from_pretrained("facebook/mms-tts-spa")
 #vist_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-spa")
+# ---------------- Speech generator speecht5_tts --------------------------#
+# model = SpeechT5ForTextToSpeech.from_pretrained(
+#     "juangtzi/speecht5_finetuned_voxpopuli_es"
+# )
+# checkpoint = "microsoft/speecht5_tts"
+# processor = SpeechT5Processor.from_pretrained(checkpoint)
+# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+# speaker_embeddings2 = np.load('speaker_embeddings.npy')
+# speaker_embeddings2 = torch.tensor(speaker_embeddings2)
+# print(speaker_embeddings2)
+# lang_detector = pipeline("text-classification", model="papluca/xlm-roberta-base-language-detection")
+# ---------------- Speech generator bark --------------------------#
+from transformers import BarkModel, BarkProcessor
+model = BarkModel.from_pretrained("suno/bark-small")
+processor = BarkProcessor.from_pretrained("suno/bark-small")
 def language_detector(text):
     resultado = lang_detector(text)
     print(outputs["text"])
     return outputs["text"]
 def synthesise(text):
+    inputs = processor(text=text, voice_preset="v2/es_speaker_8")
+    speech_output = model.generate(**inputs).cpu().numpy()
+    return speech_output
 def speech_to_speech_translation(audio):
     translated_text = translate(audio)
     synthesised_speech = synthesise(translated_text)
     sample_rate = 16000
+    return (sample_rate, synthesised_speech)
+# def synthesise(text):  speecht5_tts
+#     inputs = processor(text=text, return_tensors="pt")
+#     output = model.generate_speech(inputs["input_ids"], speaker_embeddings2, vocoder=vocoder)
+#     return output
+# def speech_to_speech_translation(audio):  speecht5_tts
+#     translated_text = translate(audio)
+#     synthesised_speech = synthesise(translated_text)
+#     audio_data = synthesised_speech.cpu().numpy()
+#     audio_data = np.squeeze(audio_data)
+#     audio_data = audio_data / np.max(np.abs(audio_data))
+#     sample_rate = 16000
+#     return (sample_rate, audio_data)
 title = "Cascaded STST"
 description = """