|
|
|
|
|
|
|
|
|
|
|
import torch |
|
import numpy as np |
|
|
|
from tqdm import tqdm |
|
from utils.util import pad_mels_to_tensors, pad_f0_to_tensors |
|
|
|
|
|
def vocoder_inference(cfg, model, mels, f0s=None, device=None, fast_inference=False):
    """Generate audio from mel-spectrograms by iterative reverse-diffusion sampling.

    Sampling starts from Gaussian noise and walks the inference noise
    schedule from its last step down to step 0. At every step the model
    output is subtracted from the current sample, fresh noise is re-injected
    (except at step 0) and the sample is clamped to [-1, 1].

    Args:
        cfg: Config object. Reads ``cfg.model.diffwave.noise_schedule``,
            ``cfg.model.diffwave.inference_noise_schedule`` (only when
            ``fast_inference`` is true) and ``cfg.preprocess.hop_size``.
        model: The network, called as ``model(audio, step, mels)``. It is
            switched to eval mode. Its output is squeezed on dim 1, so it
            presumably returns (batch_size, 1, seq_len) -- TODO confirm.
        mels: A tensor of mel-specs with the shape (batch_size, num_mels, frames)
        f0s: Not used by this function; accepted so that callers can pass
            it uniformly.
        device: Device on which sampling runs. When ``None``, the device of
            the model's first parameter is used (or ``mels.device`` if the
            model has no parameters) so that the model, the mels and the
            noise never end up on different devices.
        fast_inference: If true, sample with the (shorter) inference noise
            schedule from the config instead of the training schedule.

    Returns:
        audios: A CPU tensor of audios with the shape (batch_size, seq_len),
            where ``seq_len = hop_size * frames``.
    """
    model.eval()

    if device is None:
        # Without an explicit device, follow the model instead of silently
        # allocating the initial noise on the default device.
        first_param = next(model.parameters(), None)
        device = mels.device if first_param is None else first_param.device

    with torch.no_grad():
        diffwave_cfg = cfg.model.diffwave
        training_noise_schedule = np.array(diffwave_cfg.noise_schedule)
        inference_noise_schedule = (
            np.array(diffwave_cfg.inference_noise_schedule)
            if fast_inference
            else np.array(diffwave_cfg.noise_schedule)
        )

        talpha = 1 - training_noise_schedule
        talpha_cum = np.cumprod(talpha)

        beta = inference_noise_schedule
        alpha = 1 - beta
        alpha_cum = np.cumprod(alpha)

        # Map every inference step s to a (fractional) training step: find
        # the training interval [t, t + 1] whose cumulative alphas bracket
        # alpha_cum[s] and interpolate inside it on the sqrt scale.
        # NOTE: an inference step whose alpha_cum lies outside the range
        # covered by the training schedule appends nothing, leaving T
        # shorter than the inference schedule (T[n] below then fails).
        T = []
        for s in range(len(inference_noise_schedule)):
            for t in range(len(training_noise_schedule) - 1):
                if talpha_cum[t + 1] <= alpha_cum[s] <= talpha_cum[t]:
                    twiddle = (talpha_cum[t] ** 0.5 - alpha_cum[s] ** 0.5) / (
                        talpha_cum[t] ** 0.5 - talpha_cum[t + 1] ** 0.5
                    )
                    T.append(t + twiddle)
                    break
        T = np.array(T, dtype=np.float32)

        mels = mels.to(device)
        # Initial sample: pure noise, hop_size samples per mel frame.
        audio = torch.randn(
            mels.shape[0],
            cfg.preprocess.hop_size * mels.shape[-1],
            device=device,
        )

        for n in tqdm(range(len(alpha) - 1, -1, -1)):
            c1 = 1 / alpha[n] ** 0.5
            c2 = beta[n] / (1 - alpha_cum[n]) ** 0.5
            step = torch.tensor([T[n]], device=audio.device)
            predicted = model(audio, step, mels).squeeze(1)
            audio = c1 * (audio - c2 * predicted)
            if n > 0:
                # Every step except the final one re-injects scaled noise.
                noise = torch.randn_like(audio)
                sigma = (
                    (1.0 - alpha_cum[n - 1]) / (1.0 - alpha_cum[n]) * beta[n]
                ) ** 0.5
                audio += sigma * noise
            audio = torch.clamp(audio, -1.0, 1.0)

    return audio.detach().cpu()
|
|
|
|
|
def synthesis_audios(cfg, model, mels, f0s=None, batch_size=None, fast_inference=False):
    """Synthesize one audio per mel-spec, running the vocoder item by item.

    The mels (and, when given, the f0s) are padded into batches by the
    project helpers; every item of every batch is then passed on its own
    through ``vocoder_inference`` and the result is cut back to the item's
    unpadded length.

    Args:
        cfg: Config object. ``cfg.preprocess.hop_size`` is read here and the
            whole object is forwarded to ``vocoder_inference``.
        model: The vocoder; inference runs on the device of its first
            parameter.
        mels: A list of mel-specs
        f0s: Optional f0 sequences, padded with ``pad_f0_to_tensors`` and
            forwarded item-wise to ``vocoder_inference``.
        batch_size: Forwarded to the padding helpers.
        fast_inference: Forwarded to ``vocoder_inference``.

    Returns:
        audios: A list of audios, each trimmed to
            ``frames * hop_size`` samples.
    """
    device = next(model.parameters()).device

    mel_batches, mel_frames = pad_mels_to_tensors(mels, batch_size)

    # Identity test on purpose: `f0s != None` is elementwise/ambiguous for
    # array-like inputs.
    if f0s is None:
        batches = (
            (mel_batch, None, mel_frame)
            for mel_batch, mel_frame in zip(mel_batches, mel_frames)
        )
    else:
        f0_batches = pad_f0_to_tensors(f0s, batch_size)
        batches = zip(mel_batches, f0_batches, mel_frames)

    audios = []
    for mel_batch, f0_batch, mel_frame in batches:
        for i in range(mel_batch.shape[0]):
            f0 = None if f0_batch is None else f0_batch[i].unsqueeze(0)
            audio = vocoder_inference(
                cfg,
                model,
                mel_batch[i].unsqueeze(0),
                f0s=f0,
                device=device,
                fast_inference=fast_inference,
            ).squeeze(0)

            # Drop the samples that only exist because of mel padding.
            audio_length = mel_frame[i] * cfg.preprocess.hop_size
            audios.append(audio[:audio_length])

    return audios
|
|