import os
import random

import numpy as np
import pandas as pd
import librosa

import torch
from torch.utils.data import Dataset


def align_mapping(content, target_len):
    # Nearest-neighbour alignment of a [channels, src_len] feature map onto
    # target_len frames.
    src_len = content.shape[-1]
    target = torch.zeros([content.shape[0], target_len], dtype=torch.float, device=content.device)
    # Boundaries of each source frame projected onto the target time axis.
    temp = torch.arange(src_len + 1) * target_len / src_len

    for i in range(target_len):
        # Nearest source boundary; clamp so index src_len (one past the last
        # frame, reachable when upsampling by more than 2x) stays in range.
        cur_idx = torch.argmin(torch.abs(temp - i)).clamp(max=src_len - 1)
        target[:, i] = content[:, cur_idx]
    return target
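

def _demo_align_mapping():
    # Minimal sketch with made-up shapes: stretch an 80-bin feature map from
    # 120 frames to 200 so it lines up with a longer pitch track.
    feats = torch.randn(80, 120)
    aligned = align_mapping(feats, 200)
    assert aligned.shape == (80, 200)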


def midi_to_hz(midi):
    # Convert a [128, T] piano roll into a per-frame pitch contour in Hz,
    # with 0.0 marking unvoiced frames.
    hz_track = torch.zeros(midi.shape[-1])
    for frame in range(midi.shape[-1]):
        midi_frame = midi[:, frame]
        non_zero = midi_frame.nonzero()
        if len(non_zero) != 0:
            # Use the lowest active MIDI note in this frame.
            hz_track[frame] = float(librosa.midi_to_hz(non_zero[0].item()))
    return hz_track
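

def _demo_midi_to_hz():
    # Minimal sketch: a two-frame piano roll with A4 (MIDI note 69) active in
    # the first frame and silence in the second; the expected contour is
    # approximately [440.0, 0.0].
    roll = torch.zeros(128, 2)
    roll[69, 0] = 1.0
    f0 = midi_to_hz(roll)
    assert abs(f0[0].item() - 440.0) < 1e-3 and f0[1].item() == 0.0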


class DiffPitch(Dataset):
    def __init__(self, data_dir, subset, frames, content='world', shift=True, log_scale=False):
        meta = pd.read_csv(os.path.join(data_dir, 'meta.csv'))
        self.data_dir = data_dir
        self.meta = meta[meta['subset'] == subset]
        self.frames = frames
        self.content = content
        self.shift = shift
        self.log_scale = log_scale

    def __getitem__(self, index):
        row = self.meta.iloc[index]
        # Path layout: <data_dir>/<folder>/<subfolder>/vocal/<file_name>
        folder = os.path.join(self.data_dir, row['folder'], str(row['subfolder']),
                              'vocal', row['file_name'])

        # Content features (default 'world', e.g. a WORLD spectral envelope),
        # stored as .npy alongside the audio.
        content_path = folder.replace('vocal', self.content).replace('.wav', '.npy')
        content = torch.tensor(np.load(content_path), dtype=torch.float32)

        # Frame-aligned note pitch track and extracted f0 contour; the
        # 2 ** (shift / 12) scaling below treats both as Hz values.
        midi_path = folder.replace('vocal', 'roll_align').replace('.wav', '.npy')
        midi = torch.tensor(np.load(midi_path), dtype=torch.float32)

        f0_path = folder.replace('vocal', 'f0').replace('.wav', '.npy')
        f0 = torch.tensor(np.load(f0_path), dtype=torch.float32)

        # Random crop of self.frames frames; clips shorter than that start at 0
        # and are padded below.
        max_start = max(content.shape[-1] - self.frames, 0)
        start = random.randrange(max_start) if max_start > 0 else 0
        end = min(start + self.frames, content.shape[-1])

        # Pad content with log(1e-5) (a log-domain silence floor) and the pitch
        # tracks with zeros (unvoiced).
        out_content = torch.ones((content.shape[0], self.frames)) * np.log(1e-5)
        out_midi = torch.zeros(self.frames)
        out_f0 = torch.zeros(self.frames)

        out_content[:, :end - start] = content[:, start:end]
        out_midi[:end - start] = midi[start:end]
        out_f0[:end - start] = f0[start:end]

        # Random pitch shift of up to ±12 semitones, applied multiplicatively in Hz.
        if self.shift:
            semitones = np.random.randint(-12, 13)
            factor = 2 ** (semitones / 12)
            out_midi = out_midi * factor
            out_f0 = out_f0 * factor

        if self.log_scale:
            # Map Hz to the mel scale: m = 1127 * ln(1 + f / 700).
            out_midi = 1127 * torch.log(1 + out_midi / 700)
            out_f0 = 1127 * torch.log(1 + out_f0 / 700)

        return out_content, out_midi, out_f0

    def __len__(self):
        return len(self.meta)
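

# Minimal usage sketch (the 'data/' path and batch size are placeholders;
# meta.csv needs 'subset', 'folder', 'subfolder', and 'file_name' columns, with
# the .npy features already extracted as assumed in __getitem__):
if __name__ == '__main__':
    from torch.utils.data import DataLoader

    dataset = DiffPitch('data/', subset='train', frames=256)
    loader = DataLoader(dataset, batch_size=4, shuffle=True)
    content, midi, f0 = next(iter(loader))
    print(content.shape, midi.shape, f0.shape)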