import torch
import soundfile as sf
import gradio as gr
import spaces
from clearvoice import ClearVoice
import os

@spaces.GPU
def fn_clearvoice_sr(input_wav, apply_se):
    """Run speech super-resolution on an audio file, optionally denoising first.

    Args:
        input_wav: Path of the audio file to process.
        apply_se: When truthy, the file is first run through the
            'MossFormer2_SE_48K' speech-enhancement model and the enhanced
            file ('enhanced.wav') becomes the super-resolution input.

    Returns:
        The path 'enhanced_high_res.wav', written with a 48000 Hz sample rate.
    """
    target_rate = 48000
    result_path = 'enhanced_high_res.wav'

    # The super-resolution model is constructed before the optional
    # enhancement model.
    sr_model = ClearVoice(task='speech_super_resolution', model_names=['MossFormer2_SR_48K'])

    source_path = input_wav
    if apply_se:
        denoised_path = 'enhanced.wav'
        se_model = ClearVoice(task='speech_enhancement', model_names=['MossFormer2_SE_48K'])
        # The return value is not needed: the next stage reads
        # `denoised_path`, so this call is relied on to have written it.
        se_model(input_path=source_path, online_write=True, output_path=denoised_path)
        source_path = denoised_path

    sr_result = sr_model(input_path=source_path, online_write=False)

    # A dict result is reduced to its first value; anything else is used as-is.
    waveform = next(iter(sr_result.values())) if isinstance(sr_result, dict) else sr_result

    # Only row 0 of the result is written out.
    # NOTE(review): presumably a (channels, samples) array - confirm.
    sf.write(result_path, waveform[0, :], target_rate)
    return result_path
    
@spaces.GPU
def fn_clearvoice_se(input_wav, sr):
    """Run speech enhancement on an audio file at the chosen sampling rate.

    Args:
        input_wav: Path of the audio file to enhance.
        sr: Sampling-rate choice. The exact string "16000 Hz" selects the
            'FRCRN_SE_16K' model at 16000 Hz; any other value selects
            'MossFormer2_SE_48K' at 48000 Hz.

    Returns:
        The path 'enhanced.wav', written with the selected sample rate.
    """
    use_16k = sr == "16000 Hz"
    model_name, rate = ('FRCRN_SE_16K', 16000) if use_16k else ('MossFormer2_SE_48K', 48000)

    enhancer = ClearVoice(task='speech_enhancement', model_names=[model_name])
    result = enhancer(input_path=input_wav, online_write=False)

    # A dict result is reduced to the value under its first key; anything
    # else is written out unchanged.
    if not isinstance(result, dict):
        waveform = result
    else:
        waveform = result[next(iter(result))]

    out_path = 'enhanced.wav'
    sf.write(out_path, waveform, rate)
    return out_path

# Top-level container that hosts the tabbed interface assembled below.
demo = gr.Blocks()

# Speech super-resolution tab: an audio file and a checkbox go in, one audio
# file comes out of fn_clearvoice_sr.
sr_demo = gr.Interface(
    fn=fn_clearvoice_sr,
    inputs=[
        gr.Audio(label="Input Audio", type="filepath"),
        # Checkbox's first positional parameter is its initial value (a bool),
        # not a list of choices. True keeps the truthy default the previous
        # list argument amounted to.
        gr.Checkbox(value=True, label="Apply Speech Enhancement"),
    ],
    outputs=[
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    title="<a href='https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice' target='_blank'>ClearVoice</a>: Speech Super Resolution",
    description=("ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice)) is AI-powered and transform low-resolution audio (effective sampling rate ≥ 16 kHz) into crystal-clear, high-resolution audio at 48 kHz. It supports most of audio types. "
                 "To try it, simply upload your audio, or click one of the examples. "),
    article=("<p style='text-align: center'><a href='https://arxiv.org/abs/2206.07293' target='_blank'>FRCRN: Boosting Feature Representation Using Frequency Recurrence for Monaural Speech Enhancement</a> </p>"
             "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> </p>"),
    # One row per example, one value per input: [audio path, apply_se flag].
    # Every row ends with a comma; a missing one makes Python index the
    # previous list with the next row and raise TypeError at import time.
    examples=[
        ["examples/mandarin_speech_16kHz.wav", True],
        ["examples/LJSpeech-001-0001-22k.wav", True],
        ["examples/LibriTTS_986_129388_24k.wav", True],
        ["examples/english_speech_48kHz.wav", True],
    ],
    cache_examples=True,
)

with demo:
    gr.TabbedInterface([sr_demo], ["Task 4: Speech Super Resolution"])

demo.launch()