juangtzi commited on
Commit
c778606
1 Parent(s): 09708e9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -18
app.py CHANGED
@@ -17,24 +17,35 @@ generation_config.forced_decoder_ids
17
  tokenizer.decode(generation_config.forced_decoder_ids[1][1])
18
 
19
 
20
-
21
  asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-medium", device=device)
22
 
 
 
 
23
  #vist_model = VitsModel.from_pretrained("facebook/mms-tts-spa")
24
  #vist_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-spa")
25
 
 
26
 
27
- model = SpeechT5ForTextToSpeech.from_pretrained(
28
- "juangtzi/speecht5_finetuned_voxpopuli_es"
29
- )
30
- checkpoint = "microsoft/speecht5_tts"
31
- processor = SpeechT5Processor.from_pretrained(checkpoint)
32
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
- speaker_embeddings2 = np.load('speaker_embeddings.npy')
35
- speaker_embeddings2 = torch.tensor(speaker_embeddings2)
36
- print(speaker_embeddings2)
37
- lang_detector = pipeline("text-classification", model="papluca/xlm-roberta-base-language-detection")
38
 
39
  def language_detector(text):
40
  resultado = lang_detector(text)
@@ -47,19 +58,32 @@ def translate(audio):
47
  print(outputs["text"])
48
  return outputs["text"]
49
 
 
50
  def synthesise(text):
51
- inputs = processor(text=text, return_tensors="pt")
52
- output = model.generate_speech(inputs["input_ids"], speaker_embeddings2, vocoder=vocoder)
53
- return output
54
 
55
  def speech_to_speech_translation(audio):
56
  translated_text = translate(audio)
57
  synthesised_speech = synthesise(translated_text)
58
- audio_data = synthesised_speech.cpu().numpy()
59
- audio_data = np.squeeze(audio_data)
60
- audio_data = audio_data / np.max(np.abs(audio_data))
61
  sample_rate = 16000
62
- return (sample_rate, audio_data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
  title = "Cascaded STST"
65
  description = """
 
17
  tokenizer.decode(generation_config.forced_decoder_ids[1][1])
18
 
19
 
 
20
  asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-medium", device=device)
21
 
22
+
23
+ # ---------------- Speech generator mms-tts-spa --------------------------#
24
+
25
  #vist_model = VitsModel.from_pretrained("facebook/mms-tts-spa")
26
  #vist_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-spa")
27
 
28
+ # ---------------- Speech generator specht5_tts --------------------------#
29
 
30
+ # model = SpeechT5ForTextToSpeech.from_pretrained(
31
+ # "juangtzi/speecht5_finetuned_voxpopuli_es"
32
+ # )
33
+ # checkpoint = "microsoft/speecht5_tts"
34
+ # processor = SpeechT5Processor.from_pretrained(checkpoint)
35
+ # vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
36
+
37
+ # speaker_embeddings2 = np.load('speaker_embeddings.npy')
38
+ # speaker_embeddings2 = torch.tensor(speaker_embeddings2)
39
+ # print(speaker_embeddings2)
40
+ # lang_detector = pipeline("text-classification", model="papluca/xlm-roberta-base-language-detection")
41
+
42
# ---------------- Speech generator bark--------------------------#

from transformers import BarkModel, BarkProcessor

# Bark small checkpoint: multilingual TTS that accepts a voice preset
# (used later in synthesise with "v2/es_speaker_8" for Spanish).
model = BarkModel.from_pretrained("suno/bark-small")
processor = BarkProcessor.from_pretrained("suno/bark-small")
48
 
 
 
 
 
49
 
50
  def language_detector(text):
51
  resultado = lang_detector(text)
 
58
  print(outputs["text"])
59
  return outputs["text"]
60
 
61
+
62
def synthesise(text):
    """Generate speech audio for *text* with the Bark model.

    Uses the module-level Bark ``processor``/``model`` pair and the
    Spanish voice preset "v2/es_speaker_8". Returns the generated
    waveform as a NumPy array (moved off the accelerator first).
    """
    bark_inputs = processor(text=text, voice_preset="v2/es_speaker_8")
    waveform = model.generate(**bark_inputs)
    return waveform.cpu().numpy()
66
 
67
def speech_to_speech_translation(audio):
    """Cascaded speech-to-speech translation: transcribe/translate, then TTS.

    Parameters
    ----------
    audio : the audio payload accepted by ``translate`` (Gradio input).

    Returns
    -------
    tuple
        ``(sample_rate, audio_data)`` in the format a Gradio Audio
        output component expects.
    """
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    # Bark's generate() returns a batched (1, samples) array; flatten to
    # 1-D for playback, matching how the previous SpeechT5 pipeline
    # squeezed its output before returning it.
    audio_data = np.squeeze(synthesised_speech)
    # Bug fix: suno/bark-small generates audio at its own configured rate
    # (24 kHz), not 16 kHz. Hard-coding 16000 here made the played-back
    # speech sound slowed down and pitched low.
    sample_rate = model.generation_config.sample_rate
    return (sample_rate, audio_data)
72
+
73
+
74
+ # def synthesise(text): speecht5_tts
75
+ # inputs = processor(text=text, return_tensors="pt")
76
+ # output = model.generate_speech(inputs["input_ids"], speaker_embeddings2, vocoder=vocoder)
77
+ # return output
78
+
79
+ # def speech_to_speech_translation(audio): speecht5_tts
80
+ # translated_text = translate(audio)
81
+ # synthesised_speech = synthesise(translated_text)
82
+ # audio_data = synthesised_speech.cpu().numpy()
83
+ # audio_data = np.squeeze(audio_data)
84
+ # audio_data = audio_data / np.max(np.abs(audio_data))
85
+ # sample_rate = 16000
86
+ # return (sample_rate, audio_data)
87
 
88
  title = "Cascaded STST"
89
  description = """