import re

import gradio as gr
import numpy as np
import scipy.io.wavfile
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Hugging Face checkpoint of the MMS-TTS model used for Tibetan speech synthesis
model_id = "openpecha/mms-tts-sherab"

# Build the text-to-speech pipeline once, at import time
# (pass device=0 as well if you want to run it on a GPU)
synthesiser = pipeline(task="text-to-speech", model=model_id)

def replace_numbers_with_convert(sentence, wylie=True):
    """Replace every number in *sentence* with the output of ``convert``.

    A number is a run of decimal digits with an optional fractional part
    (``12``, ``3.14``). Each match is passed, as text, to
    ``convert(number_text, wylie)`` and substituted by whatever it returns.
    Text that contains no digits is returned unchanged.

    Args:
        sentence: The text to scan for numbers.
        wylie: Forwarded unchanged as the second argument of ``convert``.

    Returns:
        The sentence with every matched number replaced.
    """
    # NOTE(review): `convert` is neither defined nor imported in this file;
    # it must be brought into scope (presumably a number-to-words converter
    # for Tibetan) or the first number encountered raises NameError.
    number_pattern = r'\d+(?:\.\d+)?'

    def spell_out(match):
        return convert(match.group(), wylie)

    return re.sub(number_pattern, spell_out, sentence)

def num2letter(sentence):
    """Spell out the numbers in *sentence*, accepting Tibetan or ASCII digits.

    Tibetan digits (༠-༩) are first mapped to their ASCII counterparts (0-9);
    the result is then handed to ``replace_numbers_with_convert`` with
    ``wylie=False``, and that function's return value is returned as is.
    """
    # One translation pass: the i-th Tibetan digit becomes the ASCII digit i.
    digit_table = str.maketrans("༠༡༢༣༤༥༦༧༨༩", "0123456789")
    ascii_digits = sentence.translate(digit_table)
    return replace_numbers_with_convert(ascii_digits, wylie=False)
    
def generate_audio(input_text):
    """Synthesise speech for *input_text* and return it in Gradio's audio format.

    The text is normalised with ``num2letter``, fed to the module-level
    ``synthesiser`` pipeline, and the resulting waveform is denoised with
    ``noisereduce.reduce_noise``. Nothing is written to disk.

    Args:
        input_text: The text to speak, as entered in the Gradio text box.

    Returns:
        A ``(sampling_rate, waveform)`` tuple, which is the order Gradio's
        audio output component expects for NumPy audio.
    """
    # Imported locally because the file's import block does not provide it.
    import noisereduce

    # Preprocess: spell out numbers before synthesis.
    text = num2letter(input_text)
    # Perform TTS inference on the normalised text, not the raw input.
    speech = synthesiser(text)
    sampling_rate = speech["sampling_rate"]
    # Drop singleton axes so Gradio receives a 1-D mono waveform.
    # NOTE(review): the pipeline's output shape is not visible here and may
    # depend on the transformers version - confirm this is still needed.
    waveform = np.squeeze(speech["audio"])
    # Postprocess: suppress background noise in the generated speech.
    audio = noisereduce.reduce_noise(y=waveform, sr=sampling_rate)

    return sampling_rate, audio


# Build the web UI: a text box in, synthesised audio out
iface = gr.Interface(
    title="Tibetan Text-to-Speech (MMS-TTS) Sherab",
    description="Enter Tibetan text and generate speech using MMS-TTS.",
    fn=generate_audio,
    inputs="text",  # plain text box for the sentence to speak
    outputs="audio",  # audio player for the generated speech
)

# Start serving the interface
iface.launch()