In [None]:
!pip install parallel_wavegan paddlepaddle-gpu==2.2.2 "paddlespeech<1" pytest-runner

In [None]:
!gdown https://drive.google.com/uc?id=1q8oSAzwkqi99oOGXDZyLypCiz0Qzn3Ab
!unzip -qq Vocoder.zip

In [None]:
# load torch vocoder
import torch
from parallel_wavegan.utils import load_model

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Load the checkpoint, move it to the chosen device and switch to eval mode.
vocoder_torch = load_model("Vocoder/checkpoint-400000steps.pkl")
vocoder_torch = vocoder_torch.to(device)
vocoder_torch = vocoder_torch.eval()
# Remove weight normalisation so the state dict exposes plain weights.
vocoder_torch.remove_weight_norm()
# Assigning to `_` keeps the cell from echoing the model repr.
_ = vocoder_torch.eval()

In [None]:
import yaml
import paddle

from yacs.config import CfgNode
from paddlespeech.s2t.utils.dynamic_import import dynamic_import
from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator

# Read the vocoder's YAML config and wrap it in a yacs CfgNode.
with open('Vocoder/config.yml') as f:
    voc_config = CfgNode(yaml.safe_load(f))

# Adapt the generator kwargs in place for PWGGenerator: drop "upsample_net"
# and lift "upsample_scales" out of "upsample_params" to the top level.
generator_params = voc_config["generator_params"]
generator_params.pop("upsample_net")
upsample_params = generator_params.pop("upsample_params")
generator_params["upsample_scales"] = upsample_params["upsample_scales"]

# Build the paddle generator, strip weight norm and put it in eval mode.
vocoder_paddle = PWGGenerator(**generator_params)
vocoder_paddle.remove_weight_norm()
vocoder_paddle.eval()


@paddle.no_grad()
def convert_weights(torch_model, paddle_model):
    """Copy parameters from a torch model into a matching paddle model.

    Each entry of ``paddle_model.named_parameters()`` is looked up in
    ``torch_model.state_dict()`` after translating the final component of its
    name (``_mean`` -> ``running_mean``, ``_variance`` -> ``running_var``,
    ``scale`` -> ``weight``). Only the final component is translated, so a
    sub-layer whose name merely starts with one of these words is left alone.

    Args:
        torch_model: source torch module; switched to eval mode.
        paddle_model: destination paddle layer; switched to eval mode and
            updated in place through ``param.set_value``.

    Raises:
        KeyError: if a paddle parameter has no counterpart in the torch
            state dict. The message names both the paddle and torch keys.
    """
    _ = torch_model.eval()
    _ = paddle_model.eval()

    # paddle leaf name -> torch leaf name.
    leaf_renames = {
        '_mean': 'running_mean',
        '_variance': 'running_var',
        'scale': 'weight',
    }

    # Qualified names of every torch Linear module (set for O(1) membership).
    linear_names = {
        name
        for name, layer in torch_model.named_modules()
        if isinstance(layer, torch.nn.Linear)
    }
    torch_state_dict = torch_model.state_dict()

    for paddle_name, param in paddle_model.named_parameters():
        owner, sep, leaf = paddle_name.rpartition('.')
        torch_name = owner + sep + leaf_renames.get(leaf, leaf)
        if torch_name not in torch_state_dict:
            raise KeyError(
                f"paddle parameter {paddle_name!r} has no torch counterpart "
                f"{torch_name!r} in the torch state dict"
            )
        target_param = torch_state_dict[torch_name].detach().cpu().numpy()
        # torch.nn.Linear stores weight as (out_features, in_features) while
        # paddle.nn.Linear stores it as [in_features, out_features].
        if owner in linear_names and len(param.shape) == 2:
            target_param = target_param.transpose((1, 0))
        param.set_value(paddle.to_tensor(target_param))

# Copy the torch vocoder's weights into the paddle generator (in place).
convert_weights(vocoder_torch, vocoder_paddle)

In [None]:
import os
import librosa
import torchaudio
import paddleaudio
import numpy as np
import IPython.display as ipd


# Spectrogram settings shared by the torchaudio and paddleaudio front-ends.
mel_kwargs = dict(n_mels=80, n_fft=2048, win_length=1200, hop_length=300)

# The torchaudio transform is built only to borrow its mel filterbank, which
# is copied (axes swapped) over paddleaudio's own so both use the same weights.
torch_to_mel = torchaudio.transforms.MelSpectrogram(**mel_kwargs)
fb = torch_to_mel.mel_scale.fb.detach().cpu().numpy().transpose([1, 0])

to_mel = paddleaudio.features.MelSpectrogram(**mel_kwargs)
to_mel.fbank_matrix[:] = fb

# Normalisation constants applied to the log-mel in preprocess().
mean = -4
std = 4

if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

def preprocess(wave):
    """Turn a waveform into a normalised log-mel tensor.

    The waveform is cast to a float32 paddle tensor, passed through the
    module-level ``to_mel`` transform, doubled, log-compressed with a 1e-5
    floor and standardised with the module-level ``mean`` and ``std``.

    Args:
        wave: audio samples accepted by ``paddle.to_tensor``.

    Returns:
        The normalised log-mel tensor with a leading axis added by
        ``unsqueeze(0)``.
    """
    waveform = paddle.to_tensor(wave).astype(paddle.float32)
    # NOTE(review): the factor 2 presumably compensates for a scaling
    # difference between the paddleaudio and torchaudio spectrograms; confirm.
    mel_power = 2 * to_mel(waveform)
    log_mel = paddle.log(1e-5 + mel_power.unsqueeze(0))
    return (log_mel - mean) / std

if not os.path.exists('p228_023.wav'):
 !wget https://github.com/yl4579/StarGANv2-VC/raw/main/Demo/VCTK-corpus/p228/p228_023.wav
audio, source_sr = librosa.load('p228_023.wav', sr=24000)
audio = audio / np.max(np.abs(audio))
audio.dtype = np.float32
mel = preprocess(audio)

import numpy as np
with torch.no_grad():
 with paddle.no_grad():
 c = mel.transpose([0, 2, 1]).squeeze()
 recon_paddle = vocoder_paddle.inference(c)
 recon_paddle = recon_paddle.reshape([-1]).numpy()
 recon_torch = vocoder_torch.inference(torch.from_numpy(c.numpy()).to(device))
 recon_torch = recon_torch.view(-1).cpu().numpy()
 print(np.mean((recon_paddle - recon_torch)**2))

print('Paddle recon:')
display(ipd.Audio(recon_paddle, rate=24000))
print('Torch recon:')
display(ipd.Audio(recon_torch, rate=24000))

In [None]:
# Save the converted generator weights in paddle format.
generator_state = vocoder_paddle.state_dict()
paddle.save(generator_state, 'checkpoint-400000steps.pd')

# Save the mel filterbank alongside them, keyed by 'fbank_matrix'.
mel_filterbank = {'fbank_matrix': to_mel.fbank_matrix}
paddle.save(mel_filterbank, 'fbank_matrix.pd')