mms-tts-sherab

Sleeping

10zinten committed on Oct 30, 2024

Commit

5de81a9

verified ·

1 Parent(s): 883a948

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -4,30 +4,45 @@ import scipy.io.wavfile
 import numpy as np
 # Load the MMS-TTS model and processor for Tibetan (bod)
-model_id = "ganga4364/mms-tts-bod-finetune-sherab"  # Replace with your fine-tuned model if necessary
 # Use the text-to-speech pipeline with the model
 synthesiser = pipeline("text-to-speech", model_id) # add device=0 if you want to use a GPU
 # Function to perform TTS inference and save audio to a file
 def generate_audio(input_text):
     # Perform TTS inference
     speech = synthesiser(input_text)
-    file_path = "finetuned_output.wav"
-    # Save the audio to a file (e.g., 'output.wav')
-    scipy.io.wavfile.write(file_path, rate=speech["sampling_rate"], data=speech["audio"][0])
-    # Return the path to the audio file
-    return file_path
 # Create the Gradio interface
 iface = gr.Interface(
     fn=generate_audio,
     inputs="text",  # Text input for the TTS
     outputs="audio",  # Output will be an audio file
-    title="Tibetan Text-to-Speech (MMS-TTS)",
     description="Enter Tibetan text and generate speech using MMS-TTS."
 )

 import numpy as np
 # Load the MMS-TTS model and processor for Tibetan (bod)
+# model_id is handed straight to pipeline() below as the model to load
+# (presumably a Hugging Face Hub repository id -- TODO confirm).
+model_id = "openpecha/mms-tts-sherab"
 # Use the text-to-speech pipeline with the model
 synthesiser = pipeline("text-to-speech", model_id) # add device=0 if you want to use a GPU
+def replace_numbers_with_convert(sentence, wylie=True):
+    """Return *sentence* with every number substituted by ``convert``'s output.
+
+    A number is one or more ``\\d`` digits, optionally followed by a dot and
+    more digits. The matched text and the ``wylie`` flag are passed to
+    ``convert``; whatever it returns replaces the match.
+    """
+    number_re = re.compile(r'\d+(\.\d+)?')
+    return number_re.sub(lambda found: convert(found.group(), wylie), sentence)
+def num2letter(sentence):
+    """Normalise Tibetan digits to ASCII, then convert the numbers found.
+
+    Each of the ten Tibetan digit characters is mapped to the ASCII digit at
+    the same position (0-9); the result is then passed to
+    ``replace_numbers_with_convert`` with ``wylie=False``.
+    """
+    tibetan_to_ascii = str.maketrans("༠༡༢༣༤༥༦༧༨༩", "0123456789")
+    ascii_digits = sentence.translate(tibetan_to_ascii)
+    return replace_numbers_with_convert(ascii_digits, wylie=False)
 # Function to perform TTS inference and save audio to a file
 def generate_audio(input_text):
+    """Synthesise speech for *input_text* and return it for a Gradio audio output.
+
+    The text is first run through ``num2letter`` so digits are converted
+    before synthesis, then passed to ``synthesiser``; the resulting waveform
+    is denoised with ``noisereduce.reduce_noise``.
+
+    Returns:
+        A ``(sampling_rate, audio)`` tuple -- the order a Gradio ``"audio"``
+        output component expects.
+    """
+    # preprocess: rebind the parameter so the synthesiser call below receives
+    # the converted text (previously an undefined name ``text`` was read here,
+    # which raised UnboundLocalError on every call).
+    input_text = num2letter(input_text)
     # Perform TTS inference
     speech = synthesiser(input_text)
+    # postprocess
+    audio = noisereduce.reduce_noise(y=speech["audio"], sr=speech["sampling_rate"])
+    # The pipeline output presumably carries a leading batch axis of length 1
+    # (TODO confirm); squeeze is a no-op when the array is already 1-D.
+    audio = np.squeeze(audio)
+    # Gradio wants (sample_rate, data), not (data, sample_rate).
+    return speech["sampling_rate"], audio
 # Create the Gradio interface
+# generate_audio is the callback: it is given the contents of the text input,
+# and what it returns is handed to the "audio" output component.
 iface = gr.Interface(
     fn=generate_audio,
     inputs="text",  # Text input for the TTS
     outputs="audio",  # Output will be an audio file
+    title="Tibetan Text-to-Speech (MMS-TTS) Sherab",
     description="Enter Tibetan text and generate speech using MMS-TTS."
 )