preetam8 commited on
Commit
6ab6711
·
1 Parent(s): cc6d9dc

Change to MMS

Browse files
Files changed (1) hide show
  1. app.py +7 -8
app.py CHANGED
@@ -5,7 +5,7 @@ import numpy as np
5
  import torch
6
  from datasets import load_dataset
7
 
8
- from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor
9
  from transformers import WhisperForConditionalGeneration, WhisperProcessor
10
 
11
 
@@ -19,13 +19,10 @@ whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_na
19
  decoder_ids = whisper_processor.get_decoder_prompt_ids(language=target_language, task="transcribe")
20
 
21
  # load text-to-speech checkpoint and speaker embeddings
22
- processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 
23
 
24
- model = SpeechT5ForTextToSpeech.from_pretrained("preetam8/speecht5_finetuned_voxpopuli_fr").to(device)
25
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
26
 
27
- embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
28
- speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
29
 
30
 
31
  def translate(audio):
@@ -45,8 +42,10 @@ def translate(audio):
45
 
46
 
47
  def synthesise(text):
48
- inputs = processor(text=text, return_tensors="pt")
49
- speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
 
 
50
  return speech.cpu()
51
 
52
 
 
5
  import torch
6
  from datasets import load_dataset
7
 
8
+ from transformers import VitsModel, VitsTokenizer
9
  from transformers import WhisperForConditionalGeneration, WhisperProcessor
10
 
11
 
 
19
  decoder_ids = whisper_processor.get_decoder_prompt_ids(language=target_language, task="transcribe")
20
 
21
  # load text-to-speech checkpoint (MMS/VITS generates speech directly; no speaker embeddings needed)
22
# Fetch the French MMS text-to-speech checkpoint: the tokenizer turns text
# into input ids, and the VITS model turns those ids into a raw waveform.
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-fra")
model = VitsModel.from_pretrained("facebook/mms-tts-fra")
24
 
 
 
25
 
 
 
26
 
27
 
28
  def translate(audio):
 
42
 
43
 
44
def synthesise(text):
    """Synthesise speech from ``text`` with the MMS (VITS) French TTS model.

    Parameters
    ----------
    text : str
        The text to turn into speech.

    Returns
    -------
    torch.Tensor
        A 1-D waveform tensor on the CPU, matching the 1-D output shape the
        previous SpeechT5 ``generate_speech`` pipeline produced.
    """
    inputs = tokenizer(text, return_tensors="pt")
    # Inference only — disable autograd to avoid building a graph.
    with torch.no_grad():
        outputs = model(inputs["input_ids"])
    # VitsModel returns a batched waveform of shape (batch, num_samples).
    # Take the single batch item so callers keep receiving a 1-D waveform
    # as they did before the MMS migration.
    # NOTE(review): downstream consumer is outside this view — confirm it
    # expects a 1-D tensor.
    speech = outputs["waveform"][0]
    return speech.cpu()
50
 
51