{ "cells": [ { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import gradio as gr\n", "import os\n", "\n", "import logging\n", "\n", "import librosa\n", "import torch\n", "\n", "import commons\n", "import utils\n", "from models import SynthesizerTrn\n", "from text.symbols import symbols\n", "from text import text_to_sequence\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "G:\\AI\\so-vits-svc_v2\\VITS_WebUI\\monotonic_align\n", "G:\\AI\\so-vits-svc_v2\\VITS_WebUI\n" ] } ], "source": [ "%cd G:\\AI\\so-vits-svc_v2\\VITS_WebUI\\monotonic_align\n", "!python setup.py build_ext --inplace\n", "%cd .." ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 4, "outputs": [], "source": [ "def resize2d(source, target_len):\n", " source[source<0.001] = np.nan\n", " target = np.interp(np.arange(0, len(source)*target_len, len(source))/ target_len, np.arange(0, len(source)), source)\n", " return np.nan_to_num(target)\n", "def convert_wav_22050_to_f0(audio):\n", " tmp = librosa.pyin(audio,\n", " fmin=librosa.note_to_hz('C0'),\n", " fmax=librosa.note_to_hz('C7'),\n", " frame_length=1780)[0]\n", " f0 = np.zeros_like(tmp)\n", " f0[tmp>0] = tmp[tmp>0]\n", " return f0" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 5, "outputs": [], "source": [ "def get_text(text, hps):\n", " text_norm = text_to_sequence(text, hps.data.text_cleaners)\n", " if hps.data.add_blank:\n", " text_norm = commons.intersperse(text_norm, 0)\n", " text_norm = torch.LongTensor(text_norm)\n", " print(text_norm.shape)\n", " return text_norm" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 10, "outputs": [], "source": [ "CONFIG_PATH = \"configs/config.json\"\n", "MODEL_PATH = \"models/Yuuka/Yuuka.pth\"\n", "\n", "hps = utils.get_hparams_from_file(CONFIG_PATH)\n", "net_g_ms = SynthesizerTrn(\n", " len(hps.symbols),\n", " hps.data.filter_length // 2 + 1,\n", " hps.train.segment_size // hps.data.hop_length,\n", " n_speakers=hps.data.n_speakers,\n", " **hps.model).cuda()\n", "\n" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 8, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INFO:root:Loaded checkpoint 'models/Yuuka/Yuuka.pth' (iteration 445)\n" ] } ], "source": [ "_ = utils.load_checkpoint(MODEL_PATH, net_g_ms, None)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 11, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using cache found in C:\\Users\\l4227/.cache\\torch\\hub\\bshall_hubert_main\n" ] } ], "source": [ "hubert = torch.hub.load(\"bshall/hubert:main\", \"hubert_soft\")" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 12, "outputs": [], "source": [ "def vc_fn(input_audio,vc_transform):\n", " if input_audio is None:\n", " return \"You need to upload an audio\", None\n", " sampling_rate, audio = input_audio\n", " # print(audio.shape,sampling_rate)\n", " duration = audio.shape[0] / sampling_rate\n", " if duration > 30:\n", " return \"Error: Audio is too long\", None\n", " audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)\n", " if len(audio.shape) > 1:\n", " audio = librosa.to_mono(audio.transpose(1, 0))\n", " if sampling_rate != 16000:\n", " audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)\n", "\n", " audio22050 = librosa.resample(audio, orig_sr=16000, target_sr=22050)\n", " f0 = convert_wav_22050_to_f0(audio22050)\n", "\n", " source = torch.FloatTensor(audio).unsqueeze(0).unsqueeze(0)\n", " print(source.shape)\n", " with torch.inference_mode():\n", " units = hubert.units(source)\n", " soft = units.squeeze(0).numpy()\n", " print(sampling_rate)\n", " f0 = resize2d(f0, len(soft[:, 0])) * vc_transform\n", " soft[:, 0] = f0 / 10\n", " sid = torch.LongTensor([0])\n", " stn_tst = torch.FloatTensor(soft)\n", " with torch.no_grad():\n", " x_tst = stn_tst.unsqueeze(0)\n", " x_tst_lengths = torch.LongTensor([stn_tst.size(0)])\n", " audio = net_g_ms.infer(x_tst, x_tst_lengths,sid=sid, noise_scale=0.1, noise_scale_w=0.1, length_scale=1)[0][\n", " 0, 0].data.float().numpy()\n", "\n", " return \"Success\", (hps.data.sampling_rate, audio)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 19, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.gradio.app:443\n", "DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443\n", "DEBUG:asyncio:Using selector: SelectSelector\n", "DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): 127.0.0.1:7890\n", "DEBUG:urllib3.connectionpool:http://127.0.0.1:7890 \"GET http://127.0.0.1:7861/startup-events HTTP/1.1\" 200 5\n", "DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): 127.0.0.1:7890\n", "DEBUG:urllib3.connectionpool:http://127.0.0.1:7890 \"HEAD http://127.0.0.1:7861/ HTTP/1.1\" 200 0\n", "Running on local URL: http://127.0.0.1:7861\n", "\n", "To create a public link, set `share=True` in `launch()`.\n" ] }, { "data": { "text/plain": "", "text/html": "
" }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.gradio.app:443\n", "DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Traceback (most recent call last):\n", " File \"D:\\Anaconda\\envs\\ML\\lib\\site-packages\\soundfile.py\", line 161, in \n", " import _soundfile_data # ImportError if this doesn't exist\n", "ModuleNotFoundError: No module named '_soundfile_data'\n", "\n", "During handling of the above exception, another exception occurred:\n", "\n", "Traceback (most recent call last):\n", " File \"D:\\Anaconda\\envs\\ML\\lib\\site-packages\\soundfile.py\", line 171, in \n", " _snd = _ffi.dlopen(_libname)\n", "OSError: cannot load library 'D:\\Anaconda\\envs\\ML\\Library\\bin\\sndfile.dll': error 0x7e\n", "\n", "During handling of the above exception, another exception occurred:\n", "\n", "Traceback (most recent call last):\n", " File \"D:\\Anaconda\\envs\\ML\\lib\\site-packages\\gradio\\routes.py\", line 394, in run_predict\n", " output = await app.get_blocks().process_api(\n", " File \"D:\\Anaconda\\envs\\ML\\lib\\site-packages\\gradio\\blocks.py\", line 1075, in process_api\n", " result = await self.call_function(\n", " File \"D:\\Anaconda\\envs\\ML\\lib\\site-packages\\gradio\\blocks.py\", line 884, in call_function\n", " prediction = await anyio.to_thread.run_sync(\n", " File \"D:\\Anaconda\\envs\\ML\\lib\\site-packages\\anyio\\to_thread.py\", line 28, in run_sync\n", " return await get_asynclib().run_sync_in_worker_thread(func, *args, cancellable=cancellable,\n", " File \"D:\\Anaconda\\envs\\ML\\lib\\site-packages\\anyio\\_backends\\_asyncio.py\", line 818, in run_sync_in_worker_thread\n", " return await future\n", " File \"D:\\Anaconda\\envs\\ML\\lib\\site-packages\\anyio\\_backends\\_asyncio.py\", line 754, in run\n", " result = context.run(func, *args)\n", " File \"C:\\Users\\l4227\\AppData\\Local\\Temp\\ipykernel_23416\\731703501.py\", line 13, in vc_fn\n", " audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)\n", " File \"D:\\Anaconda\\envs\\ML\\lib\\site-packages\\lazy_loader\\__init__.py\", line 77, in __getattr__\n", " attr = getattr(submod, name)\n", " File \"D:\\Anaconda\\envs\\ML\\lib\\site-packages\\lazy_loader\\__init__.py\", line 76, in __getattr__\n", " submod = importlib.import_module(submod_path)\n", " File \"D:\\Anaconda\\envs\\ML\\lib\\importlib\\__init__.py\", line 127, in import_module\n", " return _bootstrap._gcd_import(name[level:], package, level)\n", " File \"\", line 1030, in _gcd_import\n", " File \"\", line 1007, in _find_and_load\n", " File \"\", line 986, in _find_and_load_unlocked\n", " File \"\", line 680, in _load_unlocked\n", " File \"\", line 850, in exec_module\n", " File \"\", line 228, in _call_with_frames_removed\n", " File \"D:\\Anaconda\\envs\\ML\\lib\\site-packages\\librosa\\core\\audio.py\", line 10, in \n", " import soundfile as sf\n", " File \"D:\\Anaconda\\envs\\ML\\lib\\site-packages\\soundfile.py\", line 192, in \n", " _snd = _ffi.dlopen(_explicit_libname)\n", "OSError: cannot load library 'libsndfile.dll': error 0x7e\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Keyboard interruption in main thread... closing server.\n" ] } ], "source": [ "app = gr.Blocks()\n", "with app:\n", " with gr.Tabs():\n", " with gr.TabItem(\"Basic\"):\n", " vc_input3 = gr.Audio(label=\"Input Audio (30s limitation)\")\n", " vc_transform = gr.Number(label=\"transform\",value=1.0)\n", " vc_submit = gr.Button(\"Convert\", variant=\"primary\")\n", " vc_output1 = gr.Textbox(label=\"Output Message\")\n", " vc_output2 = gr.Audio(label=\"Output Audio\")\n", " vc_submit.click(vc_fn, [ vc_input3,vc_transform], [vc_output1, vc_output2])\n", "\n", " app.launch(debug=True)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [], "metadata": { "collapsed": false } } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 0 }