"""Gradio demo for ClearVoice speech super-resolution.

Builds a single-tab Gradio app whose handler upsamples an uploaded audio
file to 48 kHz with ClearVoice, optionally running a speech-enhancement
pass first. A stand-alone speech-enhancement handler is also defined.
"""

# NOTE: import order is kept exactly as in the original file; no import is
# removed even if it is unused here, since other tooling may rely on it.
import torch
import soundfile as sf
import gradio as gr
import spaces
from clearvoice import ClearVoice
import os


def _first_output(result):
    """Return the waveform from a ClearVoice call result.

    The result is either used as-is, or, when it is a dict, its first value
    (in insertion order) is taken.
    """
    if isinstance(result, dict):
        return result[next(iter(result))]
    return result


@spaces.GPU
def fn_clearvoice_sr(input_wav, apply_se):
    """Run speech super-resolution on an audio file and return the output path.

    Args:
        input_wav: Path to the input audio file (Gradio ``filepath`` audio).
        apply_se: Truthy to run the 48 kHz speech-enhancement model first and
            feed its output into the super-resolution model.

    Returns:
        The path ``'enhanced_high_res.wav'``, written at 48 kHz.

    Raises:
        gr.Error: If no input audio was provided.
    """
    if input_wav is None:
        # Gradio passes None when the audio component is empty.
        raise gr.Error("Please upload an audio file first.")

    sr_model = ClearVoice(
        task='speech_super_resolution', model_names=['MossFormer2_SR_48K']
    )
    fs = 48000

    if apply_se:
        se_model = ClearVoice(
            task='speech_enhancement', model_names=['MossFormer2_SE_48K']
        )
        # online_write=True makes ClearVoice write the enhanced audio itself;
        # the return value is not needed, only the file on disk.
        se_model(
            input_path=input_wav, online_write=True, output_path='enhanced.wav'
        )
        input_wav = 'enhanced.wav'

    output_wav = _first_output(
        sr_model(input_path=input_wav, online_write=False)
    )
    # Only row 0 of the result is written; presumably the array is laid out
    # as (channels, samples) -- TODO confirm against ClearVoice.
    sf.write('enhanced_high_res.wav', output_wav[0, :], fs)
    return 'enhanced_high_res.wav'


@spaces.GPU
def fn_clearvoice_se(input_wav, sr):
    """Run speech enhancement on an audio file and return the output path.

    Args:
        input_wav: Path to the input audio file.
        sr: Sampling-rate choice; ``"16000 Hz"`` selects the 16 kHz FRCRN
            model, any other value selects the 48 kHz MossFormer2 model.

    Returns:
        The path ``'enhanced.wav'``, written at the selected sampling rate.

    Raises:
        gr.Error: If no input audio was provided.
    """
    if input_wav is None:
        # Gradio passes None when the audio component is empty.
        raise gr.Error("Please upload an audio file first.")

    if sr == "16000 Hz":
        model_name, fs = 'FRCRN_SE_16K', 16000
    else:
        model_name, fs = 'MossFormer2_SE_48K', 48000
    se_model = ClearVoice(task='speech_enhancement', model_names=[model_name])

    output_wav = _first_output(
        se_model(input_path=input_wav, online_write=False)
    )
    # NOTE(review): unlike fn_clearvoice_sr, the array is written without
    # selecting row 0 -- confirm the output layout of the SE models.
    sf.write('enhanced.wav', output_wav, fs)
    return 'enhanced.wav'


demo = gr.Blocks()

sr_demo = gr.Interface(
    fn=fn_clearvoice_sr,
    inputs=[
        gr.Audio(label="Input Audio", type="filepath"),
        # Checkbox takes a bool initial value (a list was passed before).
        gr.Checkbox(value=False, label="Apply Speech Enhancement"),
    ],
    outputs=[
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    title="ClearVoice: Speech Super Resolution",
    description=(
        "ClearVoice ([Github Repo](https://github.com/modelscope/"
        "ClearerVoice-Studio/tree/main/clearvoice)) is AI-powered and "
        "transform low-resolution audio (effective sampling rate ≥ 16 kHz) "
        "into crystal-clear, high-resolution audio at 48 kHz. "
        "It supports most of audio types. \n"
        "To try it, simply upload your audio, or click one of the examples. "
    ),
    article=(
        "\nFRCRN: Boosting Feature Representation Using Frequency Recurrence "
        "for Monaural Speech Enhancement\n"
        ""
    ),
    # Each row is [audio path, apply_se]. The second column feeds the
    # checkbox, so it must be a bool; the previous non-empty strings (e.g.
    # "16000 Hz") were truthy, so True keeps the same effective behavior.
    # A missing comma between the last two rows has also been restored.
    examples=[
        ["examples/mandarin_speech_16kHz.wav", True],
        ["examples/LJSpeech-001-0001-22k.wav", True],
        ["examples/LibriTTS_986_129388_24k.wav", True],
        ["examples/english_speech_48kHz.wav", True],
    ],
    cache_examples=True,
)

with demo:
    gr.TabbedInterface([sr_demo], ["Task 4: Speech Super Resolution"])

demo.launch()