"""Gradio demo: synthesize speech from text with a pretrained SynthesizerTrn.

On start-up the script builds the ``monotonic_align`` extension in place,
loads hyper-parameters from ``configs/config.json``, restores the generator
weights from ``G_98000.pth`` and exposes a single-tab Gradio UI.
"""

import os
import subprocess
import sys

import gradio as gr

# Build the monotonic_align extension in place BEFORE importing the project
# modules below; the original script ran the build at this exact point.
# NOTE(review): presumably `models` imports monotonic_align -- confirm.
# The build is best-effort (the original ignored its exit status), so a
# failure is reported but does not abort start-up.
try:
    _build = subprocess.run(
        [sys.executable or "python", "setup.py", "build_ext", "--inplace"],
        cwd="monotonic_align",
        check=False,
    )
    if _build.returncode != 0:
        print(
            f"warning: monotonic_align build exited with code {_build.returncode}",
            file=sys.stderr,
        )
except OSError as exc:
    print(f"warning: could not run monotonic_align build: {exc}", file=sys.stderr)

import torch  # noqa: E402

import commons  # noqa: E402
import utils  # noqa: E402
from models import SynthesizerTrn  # noqa: E402
from text import text_to_sequence  # noqa: E402
from text.symbols import symbols  # noqa: E402

CONFIG_PATH = "configs/config.json"
CHECKPOINT_PATH = "G_98000.pth"

# Sampling rate reported to Gradio together with the generated waveform.
# NOTE(review): hard-coded; verify it matches the rate the checkpoint was
# trained with (the config may carry the same value).
SAMPLING_RATE = 44100

# Values passed to ``net_g.infer``.
NOISE_SCALE = 0.667
NOISE_SCALE_W = 0.8
LENGTH_SCALE = 1


def get_text(text, hps):
    """Convert *text* into a ``torch.LongTensor`` of symbol ids.

    The text is run through ``text_to_sequence`` with the cleaners listed in
    ``hps.data.text_cleaners``. When ``hps.data.add_blank`` is truthy the
    resulting sequence is interspersed with ``0`` via ``commons.intersperse``.

    Args:
        text: Input string to convert.
        hps: Hyper-parameter object providing ``data.text_cleaners`` and
            ``data.add_blank``.

    Returns:
        A 1-D ``torch.LongTensor`` built from the id sequence.
    """
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    return torch.LongTensor(text_norm)


hps = utils.get_hparams_from_file(CONFIG_PATH)

net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model,
)
net_g.eval()
utils.load_checkpoint(CHECKPOINT_PATH, net_g, None)


def vc_fn(input):
    """Synthesize audio for the given text; used as the Gradio click handler.

    The parameter keeps its original name ``input`` (shadowing the builtin)
    so existing keyword callers are not broken.

    Args:
        input: Text entered in the "Input Message" textbox.

    Returns:
        A ``(sampling_rate, audio)`` tuple where ``audio`` is a float NumPy
        array taken from the model output and ``sampling_rate`` is
        ``SAMPLING_RATE``.
    """
    stn_tst = get_text(input, hps)
    with torch.no_grad():
        # Add a leading dimension of size 1 and pass the sequence length
        # alongside it, as ``net_g.infer`` is called with both.
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        outputs = net_g.infer(
            x_tst,
            x_tst_lengths,
            noise_scale=NOISE_SCALE,
            noise_scale_w=NOISE_SCALE_W,
            length_scale=LENGTH_SCALE,
        )
        # First element of the result, indexed at [0, 0]
        # (presumably first batch item / first channel -- TODO confirm).
        audio = outputs[0][0, 0].detach().cpu().float().numpy()
    return (SAMPLING_RATE, audio)


app = gr.Blocks()
with app:
    with gr.Tabs():
        with gr.TabItem("Basic"):
            vc_input = gr.Textbox(label="Input Message")
            vc_submit = gr.Button("Convert", variant="primary")
            vc_output = gr.Audio(label="Output Audio")
        vc_submit.click(vc_fn, [vc_input], [vc_output])

if __name__ == "__main__":
    app.launch()