# from nsf_hifigan.models import load_model
from modules.BigVGAN.inference import load_model
import librosa
import torch
import torch.nn.functional as F
import torchaudio
import torchaudio.transforms as transforms
import numpy as np
import soundfile as sf
class LogMelSpectrogram(torch.nn.Module):
    """Log-amplitude mel spectrogram matching BigVGAN's 22.05 kHz / 80-band setup.

    The waveform is reflect-padded by ``(n_fft - hop_length) // 2`` on each
    side so that, with ``center=False``, frame centers line up with the
    ``center=True`` convention used by HiFi-GAN-style vocoders.

    Args:
        sample_rate: input waveform sample rate (default 22050).
        n_fft: FFT size (default 1024).
        win_length: analysis window length (default 1024).
        hop_length: hop between frames (default 256).
        n_mels: number of mel bands (default 80).
        f_min: lowest mel filter frequency in Hz (default 0).
        f_max: highest mel filter frequency in Hz (default 8000).
    """

    def __init__(self, sample_rate=22050, n_fft=1024, win_length=1024,
                 hop_length=256, n_mels=80, f_min=0, f_max=8000):
        super().__init__()
        # Kept so forward() derives its padding from the actual transform
        # config instead of re-hardcoding the constants.
        self.n_fft = n_fft
        self.hop_length = hop_length
        # NOTE: attribute name keeps the original (misspelled) spelling for
        # backward compatibility with external code and saved state dicts.
        self.melspctrogram = transforms.MelSpectrogram(
            sample_rate=sample_rate,
            n_fft=n_fft,
            win_length=win_length,
            hop_length=hop_length,
            center=False,
            power=1.0,  # amplitude (magnitude) spectrogram, not power
            norm="slaney",
            n_mels=n_mels,
            mel_scale="slaney",
            f_max=f_max,
            f_min=f_min,
        )

    def forward(self, wav):
        """Return the log-mel spectrogram, shape (..., n_mels, frames).

        Args:
            wav: waveform tensor with time as the last dimension.
        """
        pad = (self.n_fft - self.hop_length) // 2
        wav = F.pad(wav, (pad, pad), "reflect")
        mel = self.melspctrogram(wav)
        # Clamp before log to avoid log(0); floor 1e-5 gives a log floor
        # of about -11.51 for silent frames.
        logmel = torch.log(torch.clamp(mel, min=1e-5))
        return logmel
# ---- Inference: audio file -> log-mel -> BigVGAN vocoder -> wav file ----
hifigan, cfg = load_model('modules/BigVGAN/ckpt/bigvgan_22khz_80band/g_05000000', device='cuda')
M = LogMelSpectrogram()

# Load the input audio and resample to the vocoder's 22.05 kHz rate.
source, sr = torchaudio.load("music.mp3")
source = torchaudio.functional.resample(source, sr, 22050)

# Downmix multichannel audio to mono; otherwise the channel axis would be
# misread as a batch axis below. No-op for mono input.
if source.size(0) > 1:
    source = source.mean(dim=0, keepdim=True)

source = source.unsqueeze(0)   # (1, 1, time)
mel = M(source).squeeze(0)     # (1, n_mels, frames)

with torch.no_grad():
    # Vocoder output is (batch, 1, time); drop the channel axis on CPU.
    y_hat = hifigan(mel.cuda()).cpu().numpy().squeeze(1)

sf.write('test.wav', y_hat[0], samplerate=22050)