"""Datasets and batch collation for training and evaluating the VC decoder
with LPC-based content features (mel spectrogram, speaker embedding, F0,
content)."""

import os
import random

import librosa
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset

# F0 quantization settings shared by the datasets below: 256 bins spanning
# C2..C6 on the mel scale, with bin 0 reserved for unvoiced frames.
F0_HPARAMS = {'f0_bin': 256,
              'f0_min': librosa.note_to_hz('C2'),
              'f0_max': librosa.note_to_hz('C6')}


def f0_to_coarse(f0, hparams):
    """Quantize an F0 contour (Hz) into coarse mel-scale bins.

    Unvoiced frames (f0 == 0) map to bin 0; voiced frames map to
    bins 1 .. f0_bin - 1.
    """
    f0_bin = hparams['f0_bin']
    f0_max = hparams['f0_max']
    f0_min = hparams['f0_min']
    is_torch = isinstance(f0, torch.Tensor)

    # Hz -> mel: mel = 1127 * ln(1 + f / 700).
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)
    f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)

    # Remember unvoiced frames (mel == 0) before rescaling so they can be
    # restored to bin 0 after clipping.
    unvoiced = (f0_mel == 0)

    # Rescale voiced values linearly into [1, f0_bin - 1] and clip.
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
    f0_mel[unvoiced] = 0

    # Round to the nearest integer bin.
    f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(int)
    assert f0_coarse.max() <= f0_bin - 1 and f0_coarse.min() >= 0, (f0_coarse.max(), f0_coarse.min())
    return f0_coarse
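# Example (illustrative only, not part of the pipeline): with F0_HPARAMS
# above, an unvoiced frame stays at bin 0 and a 440 Hz (A4) frame lands
# mid-range:
#
#   >>> f0_to_coarse(np.array([0.0, 440.0]), F0_HPARAMS)
#   array([  0, 124])   # approximate; the exact voiced bin depends on rounding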
class VCDecLPCDataset(Dataset):
    """Training dataset: loads mel, speaker embedding, F0 and content features
    for each utterance listed in the metadata CSV."""

    def __init__(self, data_dir, subset, content_dir='lpc_mel_512', extract_emb=False):
        self.path = data_dir  # expected to end with a path separator
        meta = pd.read_csv(os.path.join(data_dir, 'meta_fix.csv'))
        self.meta = meta[meta['subset'] == subset]
        self.content_dir = content_dir
        self.extract_emb = extract_emb

    def get_vc_data(self, audio_path, mel_id):
        # Each feature lives in a sibling directory of the 'vocal' audio dir.
        mel_dir = audio_path.replace('vocal', 'mel')
        embed_dir = audio_path.replace('vocal', 'embed')
        pitch_dir = audio_path.replace('vocal', 'f0')
        content_dir = audio_path.replace('vocal', self.content_dir)

        mel = np.load(os.path.join(mel_dir, mel_id + '.npy'))
        if self.extract_emb:
            embed = np.load(os.path.join(embed_dir, mel_id + '.npy'))
        else:
            embed = np.zeros(1)  # placeholder when embeddings are not used
        pitch = np.load(os.path.join(pitch_dir, mel_id + '.npy'))
        content = np.load(os.path.join(content_dir, mel_id + '.npy'))

        # Replace NaNs (unvoiced frames) with zeros, then quantize F0.
        pitch = np.nan_to_num(pitch)
        pitch = f0_to_coarse(pitch, F0_HPARAMS)

        mel = torch.from_numpy(mel).float()
        embed = torch.from_numpy(embed).float()
        pitch = torch.from_numpy(pitch).float()
        content = torch.from_numpy(content).float()

        return mel, embed, pitch, content

    def __getitem__(self, index):
        row = self.meta.iloc[index]
        mel_id = row['file_name']
        audio_path = self.path + row['folder'] + row['subfolder']
        mel, embed, pitch, content = self.get_vc_data(audio_path, mel_id)
        return {'mel': mel, 'embed': embed, 'f0': pitch, 'content': content}

    def __len__(self):
        return len(self.meta)


class VCDecLPCBatchCollate(object):
    """Collates variable-length items into fixed-length training batches.

    Two random `train_frames`-frame segments are cut from each utterance:
    mels1/f0s1/contents1 come from the first segment, mels2 from the second.
    Padding uses eps = log(1e-5) for mels and log(1e-12) for content features.
    """

    def __init__(self, train_frames, eps=np.log(1e-5), content_eps=np.log(1e-12)):
        self.train_frames = train_frames
        self.eps = eps
        self.content_eps = content_eps

    def __call__(self, batch):
        train_frames = self.train_frames
        eps = self.eps
        content_eps = self.content_eps

        B = len(batch)
        embed = torch.stack([item['embed'] for item in batch], 0)

        n_mels = batch[0]['mel'].shape[0]
        content_dim = batch[0]['content'].shape[0]

        # Pre-fill with padding values; shorter utterances keep the padding.
        mels1 = torch.ones((B, n_mels, train_frames), dtype=torch.float32) * eps
        mels2 = torch.ones((B, n_mels, train_frames), dtype=torch.float32) * eps
        contents1 = torch.ones((B, content_dim, train_frames), dtype=torch.float32) * content_eps
        f0s1 = torch.zeros((B, train_frames), dtype=torch.float32)

        # Random segment starts; 0 if the utterance is shorter than train_frames.
        max_starts = [max(item['mel'].shape[-1] - train_frames, 0)
                      for item in batch]
        starts1 = [random.randrange(m) if m > 0 else 0 for m in max_starts]
        starts2 = [random.randrange(m) if m > 0 else 0 for m in max_starts]

        mel_lengths = []
        for i, item in enumerate(batch):
            mel = item['mel']
            f0 = item['f0']
            content = item['content']
            mel_length = min(mel.shape[-1], train_frames)

            mels1[i, :, :mel_length] = mel[:, starts1[i]:starts1[i] + mel_length]
            f0s1[i, :mel_length] = f0[starts1[i]:starts1[i] + mel_length]
            contents1[i, :, :mel_length] = content[:, starts1[i]:starts1[i] + mel_length]

            mels2[i, :, :mel_length] = mel[:, starts2[i]:starts2[i] + mel_length]
            mel_lengths.append(mel_length)

        mel_lengths = torch.LongTensor(mel_lengths)

        return {'mel1': mels1, 'mel2': mels2, 'mel_lengths': mel_lengths,
                'embed': embed,
                'f0_1': f0s1,
                'content1': contents1}
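# Sketch of how these pieces are typically wired together (the path and the
# batch_size/train_frames values below are assumptions, not fixed here):
#
#   from torch.utils.data import DataLoader
#
#   train_set = VCDecLPCDataset('/path/to/data/', subset='train')
#   collate = VCDecLPCBatchCollate(train_frames=128)
#   loader = DataLoader(train_set, batch_size=16, shuffle=True,
#                       collate_fn=collate)
#   batch = next(iter(loader))
#   # batch['mel1']: (16, n_mels, 128), batch['f0_1']: (16, 128), ...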
class VCDecLPCTest(Dataset):
    """Test dataset: pairs a content utterance with a timbre reference.

    Each metadata row names a content file (mel1, F0, content features) and a
    timbre file (mel2, embedding). F0 is multiplied by `pitch_shift` before
    quantization, and all features are cut or padded to `test_frames` frames.
    """

    def __init__(self, data_dir, subset='test', eps=np.log(1e-5), content_eps=np.log(1e-12),
                 test_frames=256, content_dir='lpc_mel_512', extract_emb=False):
        self.path = data_dir  # expected to end with a path separator
        meta = pd.read_csv(os.path.join(data_dir, 'meta_test.csv'))
        self.meta = meta[meta['subset'] == subset]
        self.content_dir = content_dir
        self.extract_emb = extract_emb
        self.eps = eps
        self.content_eps = content_eps
        self.test_frames = test_frames

    def get_vc_data(self, audio_path, mel_id, pitch_shift):
        # Each feature lives in a sibling directory of the 'vocal' audio dir.
        mel_dir = audio_path.replace('vocal', 'mel')
        embed_dir = audio_path.replace('vocal', 'embed')
        pitch_dir = audio_path.replace('vocal', 'f0')
        content_dir = audio_path.replace('vocal', self.content_dir)

        mel = np.load(os.path.join(mel_dir, mel_id + '.npy'))
        if self.extract_emb:
            embed = np.load(os.path.join(embed_dir, mel_id + '.npy'))
        else:
            embed = np.zeros(1)  # placeholder when embeddings are not used
        pitch = np.load(os.path.join(pitch_dir, mel_id + '.npy'))
        content = np.load(os.path.join(content_dir, mel_id + '.npy'))

        # Replace NaNs with zeros, apply the pitch shift, then quantize F0.
        pitch = np.nan_to_num(pitch)
        pitch = pitch * pitch_shift
        pitch = f0_to_coarse(pitch, F0_HPARAMS)

        mel = torch.from_numpy(mel).float()
        embed = torch.from_numpy(embed).float()
        pitch = torch.from_numpy(pitch).float()
        content = torch.from_numpy(content).float()

        return mel, embed, pitch, content

    def __getitem__(self, index):
        row = self.meta.iloc[index]

        # Content source: provides mel1, F0 and content features.
        mel_id = row['content_file_name']
        audio_path = self.path + row['content_folder'] + row['content_subfolder']
        pitch_shift = row['pitch_shift']
        mel1, _, f0, content = self.get_vc_data(audio_path, mel_id, pitch_shift)

        # Timbre reference: provides mel2 and the speaker embedding.
        mel_id = row['timbre_file_name']
        audio_path = self.path + row['timbre_folder'] + row['timbre_subfolder']
        mel2, embed, _, _ = self.get_vc_data(audio_path, mel_id, pitch_shift)

        n_mels = mel1.shape[0]
        content_dim = content.shape[0]

        # Pre-fill with padding values, then copy in up to test_frames frames.
        mels1 = torch.ones((n_mels, self.test_frames), dtype=torch.float32) * self.eps
        mels2 = torch.ones((n_mels, self.test_frames), dtype=torch.float32) * self.eps
        lpcs1 = torch.ones((content_dim, self.test_frames), dtype=torch.float32) * self.content_eps
        f0s1 = torch.zeros(self.test_frames, dtype=torch.float32)

        mel_length = min(mel1.shape[-1], self.test_frames)
        mels1[:, :mel_length] = mel1[:, :mel_length]
        f0s1[:mel_length] = f0[:mel_length]
        lpcs1[:, :mel_length] = content[:, :mel_length]

        mel_length = min(mel2.shape[-1], self.test_frames)
        mels2[:, :mel_length] = mel2[:, :mel_length]

        return {'mel1': mels1, 'mel2': mels2, 'embed': embed, 'f0_1': f0s1, 'content1': lpcs1}

    def __len__(self):
        return len(self.meta)
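# Sketch (path and frame count are assumed values): the test set is indexed
# directly, one content/timbre pair per metadata row, so no collate function
# is needed:
#
#   test_set = VCDecLPCTest('/path/to/data/', subset='test', test_frames=256)
#   item = test_set[0]
#   # item['mel1'], item['mel2']: (n_mels, 256); item['f0_1']: (256,)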