speech-to-speech-translation

Sleeping

Everton Aleixo committed on Aug 29, 2023

Commit

9578405

•

1 Parent(s): b9f4b9a

Change asr

Files changed (1) hide show

app.py CHANGED Viewed

@@ -8,25 +8,7 @@ from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Proce
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 # load speech translation checkpoint
-# asr_pipe = pipeline("automatic-speech-recognition", model="jonatasgrosman/whisper-large-pt-cv11", device=device)
-# asr_pipe.model.config.forced_decoder_ids = (
-#   asr_pipe.tokenizer.get_decoder_prompt_ids(
-#     language="pt",
-#     task="transcribe"
-#   )
-# )
-asr_pipe = pipeline(
-  "automatic-speech-recognition",
-  model="jonatasgrosman/whisper-large-pt-cv11"
-)
-asr_pipe.model.config.forced_decoder_ids = (
-  asr_pipe.tokenizer.get_decoder_prompt_ids(
-    language="pt",
-    task="transcribe"
-  )
-)
 # load text-to-speech checkpoint and speaker embeddings
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
@@ -39,8 +21,8 @@ speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze
 def translate(audio):
-    outputs = asr_pipe(audio)
-    print('translate', outputs)
     return outputs["text"]

 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 # load speech translation checkpoint
+asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-medium", device=device)
 # load text-to-speech checkpoint and speaker embeddings
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 def translate(audio):
+    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language":"portuguese"})
+    print('outputs', outputs)
     return outputs["text"]