from transformers import VitsModel, AutoTokenizer
import torch
import scipy.io.wavfile
from parallel_wavegan.utils import load_model
from espnet2.bin.tts_inference import Text2Speech
from turkicTTS_utils import normalization
import util

# Run inference on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Registry of the available TTS back ends, keyed by the model id that callers
# pass to synthesize(). Building this dict loads the pretrained tokenizer and
# model at import time.
models_info = {
    "Meta-MMS": {
        # Tokenizer and VITS model both come from the same pretrained checkpoint.
        "processor": AutoTokenizer.from_pretrained("facebook/mms-tts-uig-script_arabic"),
        "model": VitsModel.from_pretrained("facebook/mms-tts-uig-script_arabic"),
        # When true, synthesize() converts the input text with
        # util.ug_latn_to_arab() before tokenizing it.
        "arabic_script": True
    },
    # No entry data: synthesize() routes this id to synthesize_turkic_tts(),
    # which uses the module-level text2speech/vocoder objects instead.
    "IS2AI-TurkicTTS": None
}

# Vocoder used by synthesize_turkic_tts() to turn the generated features into a
# waveform. The checkpoint path is relative to the working directory.
vocoder_checkpoint="parallelwavegan_male2_checkpoint/checkpoint-400000steps.pkl" ### specify vocoder path
# Load the checkpoint, move it to the selected device and switch to eval mode.
vocoder = load_model(vocoder_checkpoint).to(device).eval()
# One-time inference setup: strip weight normalization from the loaded vocoder.
vocoder.remove_weight_norm()

### Specify the path to the main model (transformer/tacotron2/fastspeech) and its config file.
### Both paths are relative to the working directory.
config_file = "exp/tts_train_raw_char/config.yaml"
model_path = "exp/tts_train_raw_char/train.loss.ave_5best.pth"

# ESPnet2 text-to-feature front end used by synthesize_turkic_tts(); it is
# built once at import time and reused for every request.
text2speech = Text2Speech(
    config_file,
    model_path,
    device=device, ## cuda when available, otherwise cpu (see `device` above)
    ### only for Tacotron 2
    threshold=0.5,
    minlenratio=0.0,
    maxlenratio=10.0,
    use_att_constraint=True,
    backward_window=1,
    forward_window=3,
    ### only for FastSpeech & FastSpeech2
    speed_control_alpha=1.0,
)
# Disable the built-in Griffin-Lim waveform generation: synthesize_turkic_tts()
# takes the 'feat_gen' output and feeds it to the separate `vocoder` instead.
text2speech.spc2wav = None  ### disable griffin-lim

def synthesize(text, model_id):
    """Synthesize speech for ``text`` with the selected model and save it as a WAV file.

    Args:
        text: Text to be spoken. For registry entries flagged with
            ``"arabic_script"`` it is first converted with
            ``util.ug_latn_to_arab``.
        model_id: Key into ``models_info``. The id ``'IS2AI-TurkicTTS'`` is
            delegated to ``synthesize_turkic_tts``.

    Returns:
        For ``'IS2AI-TurkicTTS'``, whatever ``synthesize_turkic_tts`` returns;
        otherwise the path of the written file, ``"tts_output.wav"``.

    Raises:
        KeyError: If ``model_id`` is not a key of ``models_info``.
    """
    # The TurkicTTS pipeline has no registry entry and is handled separately.
    if model_id == 'IS2AI-TurkicTTS':
        return synthesize_turkic_tts(text)

    entry = models_info[model_id]
    if entry["arabic_script"]:
        text = util.ug_latn_to_arab(text)

    tokenizer = entry["processor"]
    net = entry["model"].to(device)
    encoded = tokenizer(text, return_tensors="pt").to(device)

    # Inference only: skip gradient tracking, then bring the result back to the
    # CPU so it can be converted to NumPy for saving.
    with torch.no_grad():
        waveform = net(**encoded).waveform.cpu()

    wav_path = "tts_output.wav"
    # [0] picks the first item along the leading axis (presumably the batch
    # dimension of the single-sentence input -- confirm against the model docs).
    scipy.io.wavfile.write(
        wav_path,
        rate=net.config.sampling_rate,
        data=waveform.numpy()[0],
    )
    return wav_path

def synthesize_turkic_tts(text):
    """Synthesize Uyghur speech with the TurkicTTS pipeline and save it as a WAV file.

    The text is normalized with ``normalization(text, 'uyghur')``, passed
    through the module-level ``text2speech`` model to obtain its ``'feat_gen'``
    output, and that output is converted to a waveform by the module-level
    ``vocoder``.

    Args:
        text: Text to be spoken.

    Returns:
        The path of the written WAV file, ``"tts_output.wav"``.
    """
    text = normalization(text, 'uyghur')

    # Inference only: no gradient tracking needed.
    with torch.no_grad():
        c_mel = text2speech(text)['feat_gen']
        wav = vocoder.inference(c_mel)

    # Flatten to a 1-D sequence of samples and move it to the CPU so it can be
    # converted to a NumPy array.
    output = wav.view(-1).cpu()

    output_path = "tts_output.wav"
    # Write the whole flattened waveform. The tensor is already 1-D here, so
    # indexing it with [0] would hand a single scalar sample to the writer
    # instead of the audio signal.
    # NOTE(review): 22050 Hz is hard-coded; confirm it matches the vocoder's
    # training sample rate.
    scipy.io.wavfile.write(output_path, rate=22050, data=output.numpy())

    # Return the path so synthesize() yields the same kind of result for every
    # model id.
    return output_path