{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "ParallelWaveGAN to paddle.ipynb", "provenance": [], "collapsed_sections": [], "private_outputs": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "id": "gZNDsJweNp1L" }, "outputs": [], "source": [ "!pip install parallel_wavegan paddlepaddle-gpu==2.2.2 \"paddlespeech<1\" pytest-runner" ] }, { "cell_type": "code", "source": [ "!gdown https://drive.google.com/uc?id=1q8oSAzwkqi99oOGXDZyLypCiz0Qzn3Ab\n", "!unzip -qq Vocoder.zip" ], "metadata": { "id": "HqA0VNKEOGfv" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# load torch vocoder\n", "import torch\n", "from parallel_wavegan.utils import load_model\n", "\n", "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n", "\n", "vocoder_torch = load_model(\"Vocoder/checkpoint-400000steps.pkl\").to(device).eval()\n", "vocoder_torch.remove_weight_norm()\n", "_ = vocoder_torch.eval()" ], "metadata": { "id": "9F0yA_dyPOVe" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "import yaml\n", "import paddle\n", "\n", "from yacs.config import CfgNode\n", "from paddlespeech.s2t.utils.dynamic_import import dynamic_import\n", "from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator\n", "\n", "with open('Vocoder/config.yml') as f:\n", " voc_config = CfgNode(yaml.safe_load(f))\n", "voc_config[\"generator_params\"].pop(\"upsample_net\")\n", "voc_config[\"generator_params\"][\"upsample_scales\"] = voc_config[\"generator_params\"].pop(\"upsample_params\")[\"upsample_scales\"]\n", "vocoder_paddle = PWGGenerator(**voc_config[\"generator_params\"])\n", "vocoder_paddle.remove_weight_norm()\n", "vocoder_paddle.eval()\n", "\n", "\n", "@paddle.no_grad()\n", "def convert_weights(torch_model, paddle_model):\n", " _ = torch_model.eval()\n", " _ = paddle_model.eval()\n", " dense_layers = []\n", " for name, layer in torch_model.named_modules():\n", " if isinstance(layer, torch.nn.Linear):\n", " dense_layers.append(name)\n", " torch_state_dict = torch_model.state_dict()\n", " for name, param in paddle_model.named_parameters():\n", " name = name.replace('._mean', '.running_mean')\n", " name = name.replace('._variance', '.running_var')\n", " name = name.replace('.scale', '.weight')\n", " target_param = torch_state_dict[name].detach().cpu().numpy()\n", " if '.'.join(name.split('.')[:-1]) in dense_layers:\n", " if len(param.shape) == 2:\n", " target_param = target_param.transpose((1,0))\n", " param.set_value(paddle.to_tensor(target_param))\n", "\n", "convert_weights(vocoder_torch, vocoder_paddle)" ], "metadata": { "id": "ch2uVW8OdKN0" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "import os\n", "import librosa\n", "import torchaudio\n", "import paddleaudio\n", "import numpy as np\n", "import IPython.display as ipd\n", "\n", "\n", "to_mel = torchaudio.transforms.MelSpectrogram(\n", " n_mels=80, n_fft=2048, win_length=1200, hop_length=300)\n", "fb = to_mel.mel_scale.fb.detach().cpu().numpy().transpose([1,0])\n", "to_mel = paddleaudio.features.MelSpectrogram(\n", " n_mels=80, n_fft=2048, win_length=1200, hop_length=300)\n", "to_mel.fbank_matrix[:] = fb\n", "mean, std = -4, 4\n", "\n", "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n", "\n", "def preprocess(wave):\n", " wave_tensor = paddle.to_tensor(wave).astype(paddle.float32)\n", " 
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": { "id": "Q9dK5j1CleJM" },
      "outputs": [],
      "source": [
        "import os\n",
        "import librosa\n",
        "import torchaudio\n",
        "import paddleaudio\n",
        "import numpy as np\n",
        "import IPython.display as ipd\n",
        "\n",
        "\n",
        "# copy torchaudio's mel filter bank into paddleaudio so both front ends agree\n",
        "to_mel = torchaudio.transforms.MelSpectrogram(\n",
        "    n_mels=80, n_fft=2048, win_length=1200, hop_length=300)\n",
        "fb = to_mel.mel_scale.fb.detach().cpu().numpy().transpose([1, 0])\n",
        "to_mel = paddleaudio.features.MelSpectrogram(\n",
        "    n_mels=80, n_fft=2048, win_length=1200, hop_length=300)\n",
        "to_mel.fbank_matrix[:] = fb\n",
        "mean, std = -4, 4\n",
        "\n",
        "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
        "\n",
        "def preprocess(wave):\n",
        "    wave_tensor = paddle.to_tensor(wave).astype(paddle.float32)\n",
        "    mel_tensor = 2 * to_mel(wave_tensor)\n",
        "    mel_tensor = (paddle.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std\n",
        "    return mel_tensor\n",
        "\n",
        "if not os.path.exists('p228_023.wav'):\n",
        "    !wget https://github.com/yl4579/StarGANv2-VC/raw/main/Demo/VCTK-corpus/p228/p228_023.wav\n",
        "audio, source_sr = librosa.load('p228_023.wav', sr=24000)\n",
        "audio = audio / np.max(np.abs(audio))\n",
        "audio = audio.astype(np.float32)  # cast properly; assigning to .dtype would reinterpret the raw buffer\n",
        "mel = preprocess(audio)\n",
        "\n",
        "# run both vocoders on the same mel and compare the reconstructions\n",
        "with torch.no_grad():\n",
        "    with paddle.no_grad():\n",
        "        c = mel.transpose([0, 2, 1]).squeeze()\n",
        "        recon_paddle = vocoder_paddle.inference(c)\n",
        "        recon_paddle = recon_paddle.reshape([-1]).numpy()\n",
        "        recon_torch = vocoder_torch.inference(torch.from_numpy(c.numpy()).to(device))\n",
        "        recon_torch = recon_torch.view(-1).cpu().numpy()\n",
        "        print(np.mean((recon_paddle - recon_torch)**2))\n",
        "\n",
        "print('Paddle recon:')\n",
        "display(ipd.Audio(recon_paddle, rate=24000))\n",
        "print('Torch recon:')\n",
        "display(ipd.Audio(recon_torch, rate=24000))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": { "id": "HwaLd_Eq3JrH" },
      "outputs": [],
      "source": [
        "# export the converted generator and the matched mel filter bank\n",
        "paddle.save(vocoder_paddle.state_dict(), 'checkpoint-400000steps.pd')\n",
        "paddle.save({'fbank_matrix': to_mel.fbank_matrix}, 'fbank_matrix.pd')"
      ]
    },
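    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "A reload sketch, not part of the original notebook: how a downstream script could restore the exported files. It assumes the same `voc_config` as above; `remove_weight_norm()` must run before `set_state_dict`, because the checkpoint was saved after weight norm was removed and its parameter names therefore match the norm-free layers."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import paddle\n",
        "import paddleaudio\n",
        "from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator\n",
        "\n",
        "# rebuild the generator with the same config used for the conversion\n",
        "generator = PWGGenerator(**voc_config[\"generator_params\"])\n",
        "generator.remove_weight_norm()  # checkpoint holds norm-free weights\n",
        "generator.set_state_dict(paddle.load('checkpoint-400000steps.pd'))\n",
        "generator.eval()\n",
        "\n",
        "# restore the torchaudio-matched mel filter bank\n",
        "mel_fn = paddleaudio.features.MelSpectrogram(\n",
        "    n_mels=80, n_fft=2048, win_length=1200, hop_length=300)\n",
        "mel_fn.fbank_matrix[:] = paddle.load('fbank_matrix.pd')['fbank_matrix']"
      ]
    }
  ]
}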