juangtzi committed on
Commit
e1c6d89
1 Parent(s): 968ccd7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -16
app.py CHANGED
@@ -2,9 +2,9 @@ import gradio as gr
2
  import numpy as np
3
  import torch
4
  from transformers import pipeline, VitsModel
5
- from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor
6
  from transformers import WhisperTokenizer, GenerationConfig
7
-
8
 
9
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
10
 
@@ -22,8 +22,8 @@ asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-medium
22
 
23
  # ---------------- Speech generator mms-tts-spa --------------------------#
24
 
25
- #vist_model = VitsModel.from_pretrained("facebook/mms-tts-spa")
26
- #vist_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-spa")
27
 
28
  # ---------------- Speech generator specht5_tts --------------------------#
29
 
@@ -39,30 +39,43 @@ speaker_embeddings2 = torch.tensor(speaker_embeddings2)
39
  print(speaker_embeddings2)
40
 
41
 
42
- def language_detector(text):
43
- resultado = lang_detector(text)
44
- idioma_detectado = resultado[0]['label']
45
- print(idioma_detectado)
46
- return idioma_detectado
47
 
48
  def translate(audio):
49
  outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "es"})
50
  print(outputs["text"])
51
  return outputs["text"]
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  def synthesise(text):
54
- inputs = processor(text=text, return_tensors="pt")
55
- output = model.generate_speech(inputs["input_ids"], speaker_embeddings2, vocoder=vocoder)
 
 
56
  return output
57
 
58
  def speech_to_speech_translation(audio):
59
  translated_text = translate(audio)
60
  synthesised_speech = synthesise(translated_text)
61
- audio_data = synthesised_speech.cpu().numpy()
62
- #audio_data = np.squeeze(audio_data)
63
- #audio_data = audio_data / np.max(np.abs(audio_data))
64
- sample_rate = 16000
65
- return (sample_rate, audio_data)
66
 
67
  title = "Cascaded STST"
68
  description = """
 
import numpy as np
import torch
# AutoTokenizer is required for the MMS-TTS tokenizer loaded below.
# (Previously `pipeline, VitsModel` were imported twice and `AutoTokenizer`
# was listed twice in one import — deduplicated into a single import line.)
from transformers import pipeline, VitsModel, AutoTokenizer
from transformers import WhisperTokenizer, GenerationConfig
# from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

# Run on GPU when available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
# ---------------- Speech generator mms-tts-spa --------------------------#

# Load Facebook's MMS Spanish text-to-speech model and its matching
# tokenizer; both are used by `synthesise` below.
vist_model = VitsModel.from_pretrained("facebook/mms-tts-spa")
vist_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-spa")
28
  # ---------------- Speech generator specht5_tts --------------------------#
29
 
 
39
  print(speaker_embeddings2)
40
 
41
 
42
+ # def language_detector(text):
43
+ # resultado = lang_detector(text)
44
+ # idioma_detectado = resultado[0]['label']
45
+ # print(idioma_detectado)
46
+ # return idioma_detectado
47
 
48
def translate(audio):
    """Transcribe *audio* to Spanish text via the Whisper ASR pipeline.

    Prints the transcription (for debugging in the Space logs) and
    returns it as a plain string.
    """
    generation_options = {"task": "transcribe", "language": "es"}
    result = asr_pipe(audio, max_new_tokens=256, generate_kwargs=generation_options)
    transcription = result["text"]
    print(transcription)
    return transcription
52
 
53
+ # def synthesise(text):
54
+ # inputs = processor(text=text, return_tensors="pt")
55
+ # output = model.generate_speech(inputs["input_ids"], speaker_embeddings2, vocoder=vocoder)
56
+ # return output
57
+
58
+ # def speech_to_speech_translation(audio):
59
+ # translated_text = translate(audio)
60
+ # synthesised_speech = synthesise(translated_text)
61
+ # audio_data = synthesised_speech.cpu().numpy()
62
+ # #audio_data = np.squeeze(audio_data)
63
+ # #audio_data = audio_data / np.max(np.abs(audio_data))
64
+ # sample_rate = 16000
65
+ # return (sample_rate, audio_data)
66
+
67
def synthesise(text):
    """Generate a speech waveform for *text* with the MMS Spanish TTS model.

    Returns the first (and only) waveform in the batch as a 1-D float tensor.
    """
    print(text)
    encoded = vist_tokenizer(text, return_tensors="pt")
    # Inference only — no gradients needed.
    with torch.no_grad():
        waveforms = vist_model(**encoded).waveform
    return waveforms[0]
73
 
74
  def speech_to_speech_translation(audio):
75
  translated_text = translate(audio)
76
  synthesised_speech = synthesise(translated_text)
77
+ synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
78
+ return 16000, synthesised_speech
 
 
 
79
 
80
  title = "Cascaded STST"
81
  description = """