{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "import IPython.display as ipd\n",
    "\n",
    "import os\n",
    "import json\n",
    "import math\n",
    "import torch\n",
    "from torch import nn\n",
    "from torch.nn import functional as F\n",
    "from torch.utils.data import DataLoader\n",
    "\n",
    "import commons\n",
    "import utils\n",
    "from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate\n",
    "from models import SynthesizerTrn\n",
    "from text.symbols import symbols\n",
    "from text import text_to_sequence\n",
    "\n",
    "from scipy.io.wavfile import write\n",
    "\n",
    "\n",
    "def get_text(text, hps):\n",
    "    # Convert raw text to a sequence of symbol ids using the configured cleaners.\n",
    "    text_norm = text_to_sequence(text, hps.data.text_cleaners)\n",
    "    if hps.data.add_blank:\n",
    "        # Intersperse a blank token (id 0) between symbols, as the model expects.\n",
    "        text_norm = commons.intersperse(text_norm, 0)\n",
    "    text_norm = torch.LongTensor(text_norm)\n",
    "    return text_norm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## LJ Speech"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the single-speaker (LJ Speech) hyperparameters.\n",
    "hps = utils.get_hparams_from_file(\"./configs/ljs_base.json\")"
   ]
  },
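  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick sanity check (not part of the original notebook), the cell below runs `get_text` on a short example string and prints the resulting symbol-id sequence and its length. With `add_blank` enabled in `ljs_base.json`, blank tokens (id 0) should appear between the symbol ids."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical example sentence; any short English string works here.\n",
    "sample = get_text(\"Hello world.\", hps)\n",
    "print(sample)\n",
    "print(\"sequence length:\", sample.size(0))"
   ]
  },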
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the generator and load the pretrained LJ Speech checkpoint.\n",
    "net_g = SynthesizerTrn(\n",
    "    len(symbols),\n",
    "    hps.data.filter_length // 2 + 1,\n",
    "    hps.train.segment_size // hps.data.hop_length,\n",
    "    **hps.model).cuda()\n",
    "_ = net_g.eval()\n",
    "\n",
    "_ = utils.load_checkpoint(\"/path/to/pretrained_ljs.pth\", net_g, None)"
   ]
  },
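  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optionally (not in the original notebook), a quick way to confirm the checkpoint loaded into a model of the expected size is to count its parameters:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rough size check: number of trainable parameters in the generator.\n",
    "n_params = sum(p.numel() for p in net_g.parameters() if p.requires_grad)\n",
    "print(\"Generator parameters: %.1fM\" % (n_params / 1e6))"
   ]
  },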
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "stn_tst = get_text(\"VITS is Awesome!\", hps)\n",
    "with torch.no_grad():\n",
    "    x_tst = stn_tst.cuda().unsqueeze(0)\n",
    "    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()\n",
    "    # noise_scale / noise_scale_w control sampling variability; length_scale > 1 slows speech down.\n",
    "    audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()\n",
    "ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))"
   ]
  },
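  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The notebook imports `write` from `scipy.io.wavfile` but never calls it. As an optional sketch, the cell below writes the generated float32 waveform to disk; the filename is arbitrary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: save the synthesized waveform as a WAV file (filename is arbitrary).\n",
    "write(\"ljs_sample.wav\", hps.data.sampling_rate, audio)"
   ]
  },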
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## VCTK"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the multi-speaker (VCTK) hyperparameters.\n",
    "hps = utils.get_hparams_from_file(\"./configs/vctk_base.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the multi-speaker generator; n_speakers enables the speaker embedding.\n",
    "net_g = SynthesizerTrn(\n",
    "    len(symbols),\n",
    "    hps.data.filter_length // 2 + 1,\n",
    "    hps.train.segment_size // hps.data.hop_length,\n",
    "    n_speakers=hps.data.n_speakers,\n",
    "    **hps.model).cuda()\n",
    "_ = net_g.eval()\n",
    "\n",
    "_ = utils.load_checkpoint(\"/path/to/pretrained_vctk.pth\", net_g, None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "stn_tst = get_text(\"VITS is Awesome!\", hps)\n",
    "with torch.no_grad():\n",
    "    x_tst = stn_tst.cuda().unsqueeze(0)\n",
    "    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()\n",
    "    # sid selects the target speaker embedding (speaker 4 here).\n",
    "    sid = torch.LongTensor([4]).cuda()\n",
    "    audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()\n",
    "ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))"
   ]
  },
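  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As an optional sketch (not in the original notebook), the loop below synthesizes the same sentence with a few different speaker ids. The ids used here are arbitrary; any id in `[0, hps.data.n_speakers)` that the checkpoint was trained on should work."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare a handful of arbitrary speaker ids on the same input sequence.\n",
    "for spk in [1, 2, 4]:\n",
    "    with torch.no_grad():\n",
    "        sid = torch.LongTensor([spk]).cuda()\n",
    "        audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()\n",
    "    print(\"Speaker ID: %d\" % spk)\n",
    "    ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))"
   ]
  },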
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Voice Conversion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the VCTK validation utterances one at a time for voice conversion.\n",
    "dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data)\n",
    "collate_fn = TextAudioSpeakerCollate()\n",
    "loader = DataLoader(dataset, num_workers=8, shuffle=False,\n",
    "    batch_size=1, pin_memory=True,\n",
    "    drop_last=True, collate_fn=collate_fn)\n",
    "data_list = list(loader)"
   ]
  },
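  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optionally (not in the original notebook), the cell below reports how many validation utterances were loaded and the shape of the first spectrogram; the tuple layout matches how the next cell unpacks a batch."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each item is (text, text_lengths, spec, spec_lengths, wav, wav_lengths, speaker_id).\n",
    "print(\"validation utterances:\", len(data_list))\n",
    "print(\"first spec shape:\", data_list[0][2].shape)"
   ]
  },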
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with torch.no_grad():\n",
    "    # Take the first validation utterance; spec is its linear spectrogram, sid_src its speaker id.\n",
    "    x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cuda() for x in data_list[0]]\n",
    "    sid_tgt1 = torch.LongTensor([1]).cuda()\n",
    "    sid_tgt2 = torch.LongTensor([2]).cuda()\n",
    "    sid_tgt3 = torch.LongTensor([4]).cuda()\n",
    "    audio1 = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data.cpu().float().numpy()\n",
    "    audio2 = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt2)[0][0,0].data.cpu().float().numpy()\n",
    "    audio3 = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt3)[0][0,0].data.cpu().float().numpy()\n",
    "print(\"Original SID: %d\" % sid_src.item())\n",
    "ipd.display(ipd.Audio(y[0].cpu().numpy(), rate=hps.data.sampling_rate, normalize=False))\n",
    "print(\"Converted SID: %d\" % sid_tgt1.item())\n",
    "ipd.display(ipd.Audio(audio1, rate=hps.data.sampling_rate, normalize=False))\n",
    "print(\"Converted SID: %d\" % sid_tgt2.item())\n",
    "ipd.display(ipd.Audio(audio2, rate=hps.data.sampling_rate, normalize=False))\n",
    "print(\"Converted SID: %d\" % sid_tgt3.item())\n",
    "ipd.display(ipd.Audio(audio3, rate=hps.data.sampling_rate, normalize=False))"
   ]
  },
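  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a final optional sketch, the converted clips can be written to disk with the already-imported `scipy.io.wavfile.write`; the filenames are arbitrary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: save each converted waveform; filenames are arbitrary.\n",
    "for tgt, wav in zip([sid_tgt1, sid_tgt2, sid_tgt3], [audio1, audio2, audio3]):\n",
    "    write(\"vc_sid%d.wav\" % tgt.item(), hps.data.sampling_rate, wav)"
   ]
  }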
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
} | |