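"""Template-based pitch correction inference.

Regenerates a source recording with the pitch contour of a reference
recording, using a diffusion UNet conditioned on WORLD mel features and a
quantized reference F0, then vocodes the result with BigVGAN.
"""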
import numpy as np
import torch
import yaml
import librosa
import soundfile as sf
from tqdm import tqdm

from diffusers import DDIMScheduler
from pitch_controller.models.unet import UNetPitcher
from pitch_controller.utils import minmax_norm_diff, reverse_minmax_norm_diff
from pitch_controller.modules.BigVGAN.inference import load_model
from utils import get_world_mel, get_matched_f0, log_f0


@torch.no_grad()
def template_pitcher(source, pitch_ref, model, hifigan, steps=50, shift_semi=0):
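    """Resynthesize `source` so it follows the pitch contour of `pitch_ref`.

    Runs DDIM sampling for `steps` steps; `shift_semi` transposes the
    reference contour in semitones. Relies on the module-level globals
    `sr`, `device`, `min_mel`, and `max_mel` set in the __main__ block.
    Returns the generated waveform as a 1-D CPU tensor in [-1, 1].
    """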

    # mel-spectrogram of the source utterance (WORLD-based feature extraction)
    source_mel = get_world_mel(source, sr=sr)

    # reference F0 contour, time-matched to the source, then transposed by
    # `shift_semi` semitones (a factor of 2**(n/12) per semitone)
    f0_ref = get_matched_f0(source, pitch_ref, 'world')
    f0_ref = f0_ref * 2 ** (shift_semi / 12)

    # quantize log-F0 into 345 bins spanning C2 to C#6
    f0_ref = log_f0(f0_ref, {'f0_bin': 345,
                             'f0_min': librosa.note_to_hz('C2'),
                             'f0_max': librosa.note_to_hz('C#6')})

    # add a batch dimension and move inputs to the target device
    source_mel = torch.from_numpy(source_mel).float().unsqueeze(0).to(device)
    f0_ref = torch.from_numpy(f0_ref).float().unsqueeze(0).to(device)

    # DDIM sampler with a fixed seed for reproducible generation
    noise_scheduler = DDIMScheduler(num_train_timesteps=1000)
    generator = torch.Generator(device=device).manual_seed(2024)

    noise_scheduler.set_timesteps(steps)

    # start from pure Gaussian noise; the min-max-normalised source mel
    # conditions every denoising step
    noise = torch.randn(source_mel.shape, generator=generator, device=device)
    pred = noise
    source_x = minmax_norm_diff(source_mel, vmax=max_mel, vmin=min_mel)

    # reverse diffusion loop: denoise step by step, conditioned on the source
    # mel and the reference F0; eta=1 makes each DDIM step fully stochastic
    for t in tqdm(noise_scheduler.timesteps):
        pred = noise_scheduler.scale_model_input(pred, t)
        model_output = model(x=pred, mean=source_x, f0=f0_ref, t=t, ref=None, embed=None)
        pred = noise_scheduler.step(model_output=model_output,
                                    timestep=t,
                                    sample=pred,
                                    eta=1, generator=generator).prev_sample

    # map the denoised mel back to the log-mel range and vocode to a waveform
    pred = reverse_minmax_norm_diff(pred, vmax=max_mel, vmin=min_mel)

    pred_audio = hifigan(pred)
    pred_audio = pred_audio.cpu().squeeze().clamp(-1, 1)

    return pred_audio


if __name__ == '__main__':
    # log-mel dynamic range used for min-max normalisation, and audio sample rate
    min_mel = np.log(1e-5)
    max_mel = 2.5
    sr = 24000

    use_gpu = torch.cuda.is_available()
    device = 'cuda' if use_gpu else 'cpu'

    # load diffusion model
    with open('pitch_controller/config/DiffWorld_24k.yaml') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    unet_cfg = config['unet']
    model = UNetPitcher(**unet_cfg)
    unet_path = 'ckpts/world_fixed_40.pt'

    # map_location lets a GPU-trained checkpoint load on CPU-only machines
    state_dict = torch.load(unet_path, map_location='cpu')
    # strip the '_orig_mod.' prefix that torch.compile adds to parameter names
    for key in list(state_dict.keys()):
        state_dict[key.replace('_orig_mod.', '')] = state_dict.pop(key)
    model.load_state_dict(state_dict)
    if use_gpu:
        model.cuda()
    model.eval()

    # load BigVGAN vocoder (24 kHz, 100 mel bands)
    hifi_path = 'ckpts/bigvgan_24khz_100band/g_05000000.pt'
    hifigan, _ = load_model(hifi_path, device=device)
    hifigan.eval()

    # run template-based pitch correction and write the result to disk
    pred_audio = template_pitcher('examples/off-key.wav', 'examples/reference.wav',
                                  model, hifigan, steps=50, shift_semi=0)
    sf.write('output_template.wav', pred_audio.numpy(), samplerate=sr)
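
# Possible variations (paths are illustrative):
#   transpose the reference contour up two semitones:
#       template_pitcher('examples/off-key.wav', 'examples/reference.wav',
#                        model, hifigan, steps=50, shift_semi=2)
#   fewer DDIM steps run faster at some cost in quality:
#       template_pitcher('examples/off-key.wav', 'examples/reference.wav',
#                        model, hifigan, steps=20)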