import numpy as np
import torch
import yaml
import librosa
import soundfile as sf
from tqdm import tqdm
from diffusers import DDIMScheduler

from pitch_controller.models.unet import UNetPitcher
from pitch_controller.utils import minmax_norm_diff, reverse_minmax_norm_diff
from pitch_controller.modules.BigVGAN.inference import load_model
from utils import get_world_mel, get_matched_f0, log_f0
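
# Template-based pitch correction: denoise the source recording's mel
# spectrogram with the diffusion model, conditioned on an f0 contour matched
# from a reference track, then vocode the result with BigVGAN.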
@torch.no_grad()
def template_pitcher(source, pitch_ref, model, hifigan, steps=50, shift_semi=0):
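    # WORLD-processed source mel, plus a reference f0 contour matched to the
    # source timing; shift_semi transposes the contour in semitones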
    source_mel = get_world_mel(source, sr=sr)

    f0_ref = get_matched_f0(source, pitch_ref, 'world')
    f0_ref = f0_ref * 2 ** (shift_semi / 12)
    f0_ref = log_f0(f0_ref, {'f0_bin': 345,
                             'f0_min': librosa.note_to_hz('C2'),
                             'f0_max': librosa.note_to_hz('C#6')})

    source_mel = torch.from_numpy(source_mel).float().unsqueeze(0).to(device)
    f0_ref = torch.from_numpy(f0_ref).float().unsqueeze(0).to(device)
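
    # DDIM sampler: start from seeded Gaussian noise and denoise over `steps` iterations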
    noise_scheduler = DDIMScheduler(num_train_timesteps=1000)
    generator = torch.Generator(device=device).manual_seed(2024)
    noise_scheduler.set_timesteps(steps)
    noise = torch.randn(source_mel.shape, generator=generator, device=device)
    pred = noise
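
    # normalize the conditioning mel to the model's training range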
    source_x = minmax_norm_diff(source_mel, vmax=max_mel, vmin=min_mel)
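
    # reverse diffusion: each step predicts noise conditioned on the source
    # mel and the target f0, then takes a DDIM step (eta=1 adds stochasticity)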
    for t in tqdm(noise_scheduler.timesteps):
        pred = noise_scheduler.scale_model_input(pred, t)
        model_output = model(x=pred, mean=source_x, f0=f0_ref, t=t, ref=None, embed=None)
        pred = noise_scheduler.step(model_output=model_output,
                                    timestep=t,
                                    sample=pred,
                                    eta=1, generator=generator).prev_sample
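
    # undo normalization and vocode the denoised mel back to a waveform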
    pred = reverse_minmax_norm_diff(pred, vmax=max_mel, vmin=min_mel)
    pred_audio = hifigan(pred)
    pred_audio = pred_audio.cpu().squeeze().clamp(-1, 1)
    return pred_audio


if __name__ == '__main__':
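    # log-mel normalization bounds and sample rate (should match training)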
    min_mel = np.log(1e-5)
    max_mel = 2.5
    sr = 24000

    use_gpu = torch.cuda.is_available()
    device = 'cuda' if use_gpu else 'cpu'
    # load diffusion model
    with open('pitch_controller/config/DiffWorld_24k.yaml') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    mel_cfg = config['logmel']
    ddpm_cfg = config['ddpm']
    unet_cfg = config['unet']
    model = UNetPitcher(**unet_cfg)
    unet_path = 'ckpts/world_fixed_40.pt'
    state_dict = torch.load(unet_path, map_location='cpu')
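    # strip the '_orig_mod.' prefix that torch.compile adds to parameter names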
    for key in list(state_dict.keys()):
        state_dict[key.replace('_orig_mod.', '')] = state_dict.pop(key)
    model.load_state_dict(state_dict)

    if use_gpu:
        model.cuda()
    model.eval()
    # load vocoder
    hifi_path = 'ckpts/bigvgan_24khz_100band/g_05000000.pt'
    hifigan, cfg = load_model(hifi_path, device=device)
    hifigan.eval()

    pred_audio = template_pitcher('examples/off-key.wav', 'examples/reference.wav',
                                  model, hifigan, steps=50, shift_semi=0)
    sf.write('output_template.wav', pred_audio.numpy(), samplerate=sr)