vineetsharma committed on
Commit
59671be
1 Parent(s): 423c1cd

Update app.py for Target Language Dutch

Browse files
Files changed (1) hide show
  1. app.py +35 -8
app.py CHANGED
@@ -12,17 +12,35 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
12
  asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
13
 
14
  # load text-to-speech checkpoint and speaker embeddings
15
- processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
- model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
18
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
19
 
20
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
21
  speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
22
 
 
 
 
 
23
 
 
24
  def translate(audio):
25
- outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
26
  return outputs["text"]
27
 
28
 
@@ -40,12 +58,21 @@ def speech_to_speech_translation(audio):
40
 
41
 
42
  title = "Cascaded STST"
 
 
 
 
 
 
 
43
  description = """
44
- Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
45
- [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
 
 
 
 
46
 
47
- ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
48
- """
49
 
50
  demo = gr.Blocks()
51
 
 
12
  asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
13
 
14
  # load text-to-speech checkpoint and speaker embeddings
15
+ # processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
16
+
17
+ # model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
18
+ # vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
19
+
20
+
21
+ # For Dutch
22
+ model_id = 'sanchit-gandhi/speecht5_tts_vox_nl'
23
+
24
+ processor = SpeechT5Processor.from_pretrained(model_id)
25
+ model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
26
+
27
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
28
+
29
+
30
+
31
 
 
 
32
 
33
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
34
  speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
35
 
36
+ # Original
37
+ # def translate(audio):
38
+ # outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
39
+ # return outputs["text"]
40
 
41
+ # Dutch
42
  def translate(audio):
43
+ outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "nl"})
44
  return outputs["text"]
45
 
46
 
 
58
 
59
 
60
  title = "Cascaded STST"
61
+ # description = """
62
+ # Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
63
+ # [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
64
+
65
+ # ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
66
+ # """
67
+
68
  description = """
69
+ # Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Dutch. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
70
+ # [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
71
+
72
+ # ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
73
+ # """
74
+
75
 
 
 
76
 
77
  demo = gr.Blocks()
78