PortaSpeech / utils /metrics /pitch_distance.py
RayeRen's picture
init
d1b91e7
import numpy as np
import matplotlib.pyplot as plt
from numba import jit
import torch
@jit
def time_warp(costs):
dtw = np.zeros_like(costs)
dtw[0, 1:] = np.inf
dtw[1:, 0] = np.inf
eps = 1e-4
for i in range(1, costs.shape[0]):
for j in range(1, costs.shape[1]):
dtw[i, j] = costs[i, j] + min(dtw[i - 1, j], dtw[i, j - 1], dtw[i - 1, j - 1])
return dtw
def align_from_distances(distance_matrix, debug=False, return_mindist=False):
# for each position in spectrum 1, returns best match position in spectrum2
# using monotonic alignment
dtw = time_warp(distance_matrix)
i = distance_matrix.shape[0] - 1
j = distance_matrix.shape[1] - 1
results = [0] * distance_matrix.shape[0]
while i > 0 and j > 0:
results[i] = j
i, j = min([(i - 1, j), (i, j - 1), (i - 1, j - 1)], key=lambda x: dtw[x[0], x[1]])
if debug:
visual = np.zeros_like(dtw)
visual[range(len(results)), results] = 1
plt.matshow(visual)
plt.show()
if return_mindist:
return results, dtw[-1, -1]
return results
def get_local_context(input_f, max_window=32, scale_factor=1.):
# input_f: [S, 1], support numpy array or torch tensor
# return hist: [S, max_window * 2], list of list
T = input_f.shape[0]
# max_window = int(max_window * scale_factor)
derivative = [[0 for _ in range(max_window * 2)] for _ in range(T)]
for t in range(T): # travel the time series
for feat_idx in range(-max_window, max_window):
if t + feat_idx < 0 or t + feat_idx >= T:
value = 0
else:
value = input_f[t + feat_idx]
derivative[t][feat_idx + max_window] = value
return derivative
def cal_localnorm_dist(src, tgt, src_len, tgt_len):
local_src = torch.tensor(get_local_context(src))
local_tgt = torch.tensor(get_local_context(tgt, scale_factor=tgt_len / src_len))
local_norm_src = (local_src - local_src.mean(-1).unsqueeze(-1)) # / local_src.std(-1).unsqueeze(-1) # [T1, 32]
local_norm_tgt = (local_tgt - local_tgt.mean(-1).unsqueeze(-1)) # / local_tgt.std(-1).unsqueeze(-1) # [T2, 32]
dists = torch.cdist(local_norm_src[None, :, :], local_norm_tgt[None, :, :]) # [1, T1, T2]
return dists
## here is API for one sample
def LoNDTWDistance(src, tgt):
# src: [S]
# tgt: [T]
dists = cal_localnorm_dist(src, tgt, src.shape[0], tgt.shape[0]) # [1, S, T]
costs = dists.squeeze(0) # [S, T]
alignment, min_distance = align_from_distances(costs.T.cpu().detach().numpy(), return_mindist=True) # [T]
return alignment, min_distance
# if __name__ == '__main__':
# # utils from ns
# from utils.pitch_utils import denorm_f0
# from tasks.singing.fsinging import FastSingingDataset
# from utils.hparams import hparams, set_hparams
#
# set_hparams()
#
# train_ds = FastSingingDataset('test')
#
# # Test One sample case
# sample = train_ds[0]
# amateur_f0 = sample['f0']
# prof_f0 = sample['prof_f0']
#
# amateur_uv = sample['uv']
# amateur_padding = sample['mel2ph'] == 0
# prof_uv = sample['prof_uv']
# prof_padding = sample['prof_mel2ph'] == 0
# amateur_f0_denorm = denorm_f0(amateur_f0, amateur_uv, hparams, pitch_padding=amateur_padding)
# prof_f0_denorm = denorm_f0(prof_f0, prof_uv, hparams, pitch_padding=prof_padding)
# alignment, min_distance = LoNDTWDistance(amateur_f0_denorm, prof_f0_denorm)
# print(min_distance)
# python utils/pitch_distance.py --config egs/datasets/audio/molar/svc_ppg.yaml