susnato committed on
Commit
cb99941
1 Parent(s): e6e41df

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -35
app.py CHANGED
@@ -1,51 +1,47 @@
1
  import torch
2
  import numpy as np
3
  import gradio as gr
4
- from transformers import AutoProcessor, AutoModel, pipeline, MarianMTModel, MarianTokenizer
 
5
 
6
-
7
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
8
 
9
  # load speech translation checkpoint
10
  asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
11
 
12
- # load text-to-speech checkpoint and speaker embeddings
13
- processor = AutoProcessor.from_pretrained("suno/bark-small")
14
- model = AutoModel.from_pretrained("suno/bark-small").to(device)
15
-
16
- # load MartianMT model for translating English to Hindi.
17
- martian_mt_model = MarianMTModel.from_pretrained("AbhirupGhosh/opus-mt-finetuned-en-hi")
18
- martian_mt_tokenizer = MarianTokenizer.from_pretrained("AbhirupGhosh/opus-mt-finetuned-en-hi")
19
-
20
 
21
- def translate_english_to_hindi(english_text):
22
- tokenized_text = martian_mt_tokenizer.encode(english_text, return_tensors="pt")
23
- generated_token_ids = martian_mt_model.generate(tokenized_text, use_cache=True, max_new_tokens=128)
24
- hindi_text = martian_mt_tokenizer.decode(generated_token_ids.numpy()[0])
25
- hindi_text = hindi_text.replace("</s>", "")
26
- hindi_text = hindi_text.replace("<pad>", "")
27
 
28
- return hindi_text
29
 
30
-
31
- def translate_to_english(audio):
32
- outputs = asr_pipe(audio, generate_kwargs={"task": "transcribe", "use_cache":"True", "max_new_tokens":128})
 
 
33
  return outputs["text"]
34
 
35
  def synthesise(text):
36
- inputs = processor(text=text, return_tensors="pt").to(device)
37
- speech_values = model.generate(**inputs, use_cache=True, max_new_tokens=128)
38
- speech_values = speech_values.cpu().numpy()
39
-
40
- return speech_values
 
 
 
41
 
42
- def speech_to_hindi_translation(audio):
43
- english_text = translate_to_english(audio)
44
- hindi_text = translate_english_to_hindi(english_text)
45
- synthesised_speech = synthesise(hindi_text)[0]
46
- synthesised_speech = (synthesised_speech * 32767).astype(np.int16)
47
 
48
- return 22050, synthesised_speech
49
 
50
 
51
  title = "Speech-To-Speech-Translation for Hindi"
@@ -56,7 +52,7 @@ description = """
56
  demo = gr.Blocks()
57
 
58
  mic_translate = gr.Interface(
59
- fn=speech_to_hindi_translation,
60
  inputs=gr.Audio(source="microphone", type="filepath"),
61
  outputs=gr.Audio(label="Generated Speech", type="numpy"),
62
  title=title,
@@ -64,7 +60,7 @@ mic_translate = gr.Interface(
64
  )
65
 
66
  file_translate = gr.Interface(
67
- fn=speech_to_hindi_translation,
68
  inputs=gr.Audio(source="upload", type="filepath"),
69
  outputs=gr.Audio(label="Generated Speech", type="numpy"),
70
  # examples=["./example.wav"]],
@@ -75,4 +71,4 @@ file_translate = gr.Interface(
75
  with demo:
76
  gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
77
 
78
- demo.launch(debug=True)
 
1
  import torch
2
  import numpy as np
3
  import gradio as gr
4
+ from transformers import AutoProcessor, SpeechT5ForTextToSpeech, pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, SpeechT5HifiGan
5
+ from datasets import load_dataset
6
 
7
+ device = "cpu"
 
8
 
9
  # load speech translation checkpoint
10
  asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
11
 
12
+ # load text-to-speech checkpoint
13
+ tts_processor = AutoProcessor.from_pretrained("susnato/speecht5_finetuned_voxpopuli_nl")
14
+ tts_model = SpeechT5ForTextToSpeech.from_pretrained("susnato/speecht5_finetuned_voxpopuli_nl").to(device)
15
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
 
 
 
 
16
 
17
+ # load speaker embeddings
18
+ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
19
+ speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
 
 
 
20
 
 
21
 
22
+ def transcribe(audio):
23
+ outputs = asr_pipe(audio, generate_kwargs={"task": "transcribe",
24
+ "language":"nl",
25
+ "use_cache":True,
26
+ "max_new_tokens":128})
27
  return outputs["text"]
28
 
29
  def synthesise(text):
30
+ inputs = tts_processor(text=text,
31
+ truncation=True,
32
+ return_tensors="pt")
33
+ speech = tts_model.generate_speech(inputs["input_ids"].to(device),
34
+ speaker_embeddings.to(device),
35
+ vocoder=vocoder,
36
+ )
37
+ return speech.cpu().numpy()
38
 
39
+ def speech_to_dutch_translation(audio):
40
+ dutch_text = transcribe(audio)
41
+ speech = synthesise(dutch_text)
42
+ speech = (speech * 32767).astype(np.int16)
 
43
 
44
+ return 16_000, speech
45
 
46
 
47
  title = "Speech-To-Speech-Translation for Hindi"
 
52
  demo = gr.Blocks()
53
 
54
  mic_translate = gr.Interface(
55
+ fn=speech_to_dutch_translation,
56
  inputs=gr.Audio(source="microphone", type="filepath"),
57
  outputs=gr.Audio(label="Generated Speech", type="numpy"),
58
  title=title,
 
60
  )
61
 
62
  file_translate = gr.Interface(
63
+ fn=speech_to_dutch_translation,
64
  inputs=gr.Audio(source="upload", type="filepath"),
65
  outputs=gr.Audio(label="Generated Speech", type="numpy"),
66
  # examples=["./example.wav"]],
 
71
  with demo:
72
  gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
73
 
74
+ demo.launch(debug=False)