import os
import random
import numpy as np
import pandas as pd
import librosa
import torch
import torchaudio
from torch.utils.data import Dataset
def algin_mapping(content, target_len):
    # Align content features with the mel length by nearest-neighbour
    # mapping of source frames onto target frames.
    src_len = content.shape[-1]
    target = torch.zeros([content.shape[0], target_len], dtype=torch.float).to(content.device)
    temp = torch.arange(src_len + 1) * target_len / src_len
    for i in range(target_len):
        cur_idx = torch.argmin(torch.abs(temp - i))
        # Clamp so the boundary entry of `temp` never indexes past the last source frame.
        cur_idx = min(int(cur_idx), src_len - 1)
        target[:, i] = content[:, cur_idx]
    return target
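
# Usage sketch (hypothetical shapes), assuming content is laid out as (feature_dim, src_frames):
#   content = torch.randn(256, 100)
#   aligned = algin_mapping(content, 160)   # -> (256, 160), stretched to the mel length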

def midi_to_hz(midi):
    # Convert a piano roll (pitch_bins x frames) to a per-frame pitch curve in Hz,
    # taking the lowest active MIDI note of each frame; silent frames stay at 0.
    hz_out = torch.zeros(midi.shape[-1])
    for frame in range(midi.shape[-1]):
        midi_frame = midi[:, frame]
        non_zero = midi_frame.nonzero()
        if len(non_zero) != 0:
            hz = librosa.midi_to_hz(int(non_zero[0]))
            hz_out[frame] = float(hz)
    return hz_out
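
# Example (hypothetical roll), assuming 128 MIDI pitch bins:
#   roll = torch.zeros(128, 4); roll[60, 0] = 1.0
#   midi_to_hz(roll)   # -> tensor([261.63, 0., 0., 0.]) (middle C active at frame 0)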

# Dataset of fixed-length (content, MIDI, F0) segments for pitch modelling.
class DiffPitch(Dataset):
    def __init__(self, data_dir, subset, frames, content='world', shift=True, log_scale=False):
        meta = pd.read_csv(os.path.join(data_dir, 'meta.csv'))
        self.data_dir = data_dir
        self.meta = meta[meta['subset'] == subset]
        self.frames = frames
        self.content = content          # name of the content-feature folder, default 'world'
        self.shift = shift              # apply a random pitch shift of up to +/-12 semitones
        self.log_scale = log_scale      # return pitch on the mel scale instead of Hz

    def __getitem__(self, index):
        row = self.meta.iloc[index]
        folder = row['folder']
        subfolder = row['subfolder']
        file_id = row['file_name']
        # Path of the vocal wav; the feature paths below are derived from it.
        wav_path = os.path.join(self.data_dir, folder, str(subfolder), 'vocal', file_id)

        # Content features, stored as .npy in the folder named by self.content.
        content_path = wav_path.replace('vocal', self.content).replace('.wav', '.npy')
        content = torch.tensor(np.load(content_path), dtype=torch.float32)
        # Frame-aligned MIDI roll.
        midi_path = wav_path.replace('vocal', 'roll_align').replace('.wav', '.npy')
        midi = torch.tensor(np.load(midi_path), dtype=torch.float32)
        # midi = algin_mapping(midi, content.shape[-1])
        # Extracted F0 contour in Hz.
        f0_path = wav_path.replace('vocal', 'f0').replace('.wav', '.npy')
        f0 = torch.tensor(np.load(f0_path), dtype=torch.float32)

        # Crop a random window of self.frames frames; shorter clips are padded.
        max_start = max(content.shape[-1] - self.frames, 0)
        start = random.choice(range(max_start)) if max_start > 0 else 0
        end = min(int(start + self.frames), content.shape[-1])
        out_content = torch.ones((content.shape[0], self.frames)) * np.log(1e-5)  # log-domain padding floor
        out_midi = torch.zeros(self.frames)
        out_f0 = torch.zeros(self.frames)
        out_content[:, :end - start] = content[:, start:end]
        out_midi[:end - start] = midi[start:end]
        out_f0[:end - start] = f0[start:end]
        # out_midi = midi_to_hz(out_midi)

        if self.shift:
            # Random pitch shift of up to +/-12 semitones, applied multiplicatively.
            shift = np.random.choice(25, 1)[0] - 12
            # midi[midi != 0] += shift
            out_midi = out_midi * (2 ** (shift / 12))
            out_f0 = out_f0 * (2 ** (shift / 12))
        if self.log_scale:
            # Hz -> mel scale: m = 1127 * ln(1 + f / 700).
            out_midi = 1127 * torch.log(1 + out_midi / 700)
            out_f0 = 1127 * torch.log(1 + out_f0 / 700)
        return out_content, out_midi, out_f0

    def __len__(self):
        return len(self.meta)
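

if __name__ == '__main__':
    # Minimal smoke-test sketch, not part of the original training pipeline.
    # The data root, subset name, and frame count below are illustrative assumptions;
    # point them at wherever meta.csv and the feature folders actually live.
    from torch.utils.data import DataLoader

    dataset = DiffPitch(data_dir='data/', subset='train', frames=256,
                        content='world', shift=True, log_scale=False)
    loader = DataLoader(dataset, batch_size=4, shuffle=True)
    content, midi, f0 = next(iter(loader))
    # Expected shapes: (4, feature_dim, 256), (4, 256), (4, 256)
    print(content.shape, midi.shape, f0.shape)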