import gradio as gr from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline import scipy.io.wavfile import numpy as np # Load the MMS-TTS model and processor for Tibetan (bod) model_id = "openpecha/mms-tts-sherab" # Use the text-to-speech pipeline with the model synthesiser = pipeline("text-to-speech", model_id) # add device=0 if you want to use a GPU def replace_numbers_with_convert(sentence, wylie=True): pattern = r'\d+(\.\d+)?' def replace(match): return convert(match.group(), wylie) result = re.sub(pattern, replace, sentence) return result def num2letter(sentence): tibetan_nums = "༠༡༢༣༤༥༦༧༨༩" for i, n in enumerate(tibetan_nums): sentence = sentence.replace(n, str(i)) result = replace_numbers_with_convert(sentence, wylie=False) return result # Function to perform TTS inference and save audio to a file def generate_audio(input_text): # preprocess text = num2letter(text) # Perform TTS inference speech = synthesiser(input_text) # postprocess audio = noisereduce.reduce_noise(y=speech["audio"], sr=speech["sampling_rate"]) return audio, speech["sampling_rate"] # Create the Gradio interface iface = gr.Interface( fn=generate_audio, inputs="text", # Text input for the TTS outputs="audio", # Output will be an audio file title="Tibetan Text-to-Speech (MMS-TTS) Sherab", description="Enter Tibetan text and generate speech using MMS-TTS." ) # Launch the Gradio interface iface.launch()