ihanif committed on
Commit
35fee4b
1 Parent(s): 8ed9d92

Clean up text with LT alphabets

Browse files
Files changed (1) hide show
  1. app.py +31 -4
app.py CHANGED
@@ -14,7 +14,8 @@ asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base",
14
  # load text-to-speech checkpoint and speaker embeddings
15
  #processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
16
  #"ihanif/speecht5_finetuned_voxpopuli_lt"
17
- model_id = "sanchit-gandhi/speecht5_tts_vox_nl"
 
18
  processor = SpeechT5Processor.from_pretrained(model_id)
19
 
20
  #model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
@@ -26,12 +27,38 @@ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validat
26
  speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
27
 
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  def translate(audio):
30
- outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate", "language": "nl"})
31
  return outputs["text"]
32
 
33
 
34
  def synthesise(text):
 
35
  inputs = processor(text=text, return_tensors="pt")
36
  speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
37
  return speech.cpu()
@@ -57,7 +84,7 @@ demo = gr.Blocks()
57
  mic_translate = gr.Interface(
58
  fn=speech_to_speech_translation,
59
  inputs=gr.Audio(source="microphone", type="filepath"),
60
- outputs=gr.Audio(label="Generated Speech", type="numpy"),
61
  title=title,
62
  description=description,
63
  )
@@ -65,7 +92,7 @@ mic_translate = gr.Interface(
65
  file_translate = gr.Interface(
66
  fn=speech_to_speech_translation,
67
  inputs=gr.Audio(source="upload", type="filepath"),
68
- outputs=gr.Audio(label="Generated Speech", type="numpy"),
69
  examples=[["./example.wav"]],
70
  title=title,
71
  description=description,
 
14
  # load text-to-speech checkpoint and speaker embeddings
15
  #processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
16
  #"ihanif/speecht5_finetuned_voxpopuli_lt"
17
+ #model_id = "sanchit-gandhi/speecht5_tts_vox_nl"
18
+ model_id = "ihanif/speecht5_finetuned_voxpopuli_lt"
19
  processor = SpeechT5Processor.from_pretrained(model_id)
20
 
21
  #model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
 
27
  speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
28
 
29
 
30
# Character substitutions used to strip diacritics that the fine-tuned
# SpeechT5 checkpoint's tokenizer does not cover (Lithuanian letters plus a
# few Western European ones). Kept as a list of (src, dst) pairs so any
# existing code that reads `replacements` keeps working.
replacements = [
    ("à", "a"),
    ("ą", "a"),
    ("ç", "c"),
    ("č", "c"),
    ("è", "e"),
    ("ë", "e"),
    ("ė", "e"),
    ("ę", "e"),
    ("í", "i"),
    ("ï", "i"),
    ("į", "i"),
    ("ö", "o"),
    ("š", "s"),
    ("ü", "u"),
    ("ū", "u"),
    ("ų", "u"),
    ("ž", "z"),
]

# Precompute the translation table once at import time: str.translate does
# all substitutions in a single C-level pass instead of 17 chained
# .replace() calls per invocation.
_CLEANUP_TABLE = str.maketrans(dict(replacements))


def cleanup_text(text):
    """Return *text* with the accented characters above mapped to ASCII.

    Only the lowercase characters listed in ``replacements`` are touched;
    everything else (including uppercase accented letters) passes through
    unchanged, matching the original per-pair replace behaviour.
    """
    return text.translate(_CLEANUP_TABLE)
54
+
55
def translate(audio):
    """Run the Whisper ASR pipeline on *audio* and return the decoded text.

    The generate kwargs request the "translate" task with language "lt",
    mirroring the app's Lithuanian target configuration.
    """
    generation_args = {"task": "translate", "language": "lt"}
    result = asr_pipe(audio, max_new_tokens=256, generate_kwargs=generation_args)
    return result["text"]
58
 
59
 
60
def synthesise(text):
    """Convert *text* to a speech waveform and return it as a CPU tensor.

    The text is first normalised with cleanup_text so it only contains
    characters the TTS tokenizer handles, then tokenised and voiced with
    the module-level SpeechT5 model, speaker embedding, and vocoder.
    """
    normalised = cleanup_text(text)
    tokenised = processor(text=normalised, return_tensors="pt")
    input_ids = tokenised["input_ids"].to(device)
    waveform = model.generate_speech(
        input_ids, speaker_embeddings.to(device), vocoder=vocoder
    )
    return waveform.cpu()
 
84
  mic_translate = gr.Interface(
85
  fn=speech_to_speech_translation,
86
  inputs=gr.Audio(source="microphone", type="filepath"),
87
+ outputs=[gr.Audio(label="Generated Speech", type="numpy"), gr.outputs.Textbox()],
88
  title=title,
89
  description=description,
90
  )
 
92
  file_translate = gr.Interface(
93
  fn=speech_to_speech_translation,
94
  inputs=gr.Audio(source="upload", type="filepath"),
95
+ outputs=[gr.Audio(label="Generated Speech", type="numpy"), gr.outputs.Textbox()],
96
  examples=[["./example.wav"]],
97
  title=title,
98
  description=description,