JustinLin610
update
10b0761
raw
history blame
8.33 kB
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Signal processing-based evaluation using waveforms
"""
import numpy as np
import os.path as op
import torchaudio
import tqdm
from tabulate import tabulate
from examples.speech_synthesis.utils import (
gross_pitch_error, voicing_decision_error, f0_frame_error
)
from examples.speech_synthesis.evaluation.eval_sp import load_eval_spec
def difference_function(x, n, tau_max):
    """
    Compute the difference function of data x with a NumPy FFT autocorrelation.

    For every lag ``tau`` in ``[0, tau_max)`` the returned value equals
    ``sum((x[j] - x[j + tau]) ** 2 for j in range(len(x) - tau))``.

    :param x: audio data (converted to a float64 NumPy array)
    :param n: length of data (unused; the length is read from ``x`` itself)
    :param tau_max: integration window size, clamped to the length of ``x``
    :return: difference function, one value per lag
    :rtype: numpy.ndarray
    """
    samples = np.array(x, np.float64)
    w = samples.size
    tau_max = min(tau_max, w)
    # energy_cumsum[k] is the sum of the first k squared samples.
    energy_cumsum = np.concatenate(
        (np.array([0.]), (samples * samples).cumsum())
    )
    size = w + tau_max
    # Pad to the smallest "nice" FFT length (small prime factors) >= size.
    p2 = (size // 32).bit_length()
    nice_numbers = (16, 18, 20, 24, 25, 27, 30, 32)
    size_pad = min(
        k * 2 ** p2 for k in nice_numbers if k * 2 ** p2 >= size
    )
    fc = np.fft.rfft(samples, size_pad)
    # The output length must be given explicitly: size_pad can be odd (25 or
    # 27), and irfft otherwise assumes an even length of 2 * (len(fc) - 1),
    # which would corrupt the autocorrelation.
    conv = np.fft.irfft(fc * fc.conjugate(), size_pad)[:tau_max]
    return energy_cumsum[w:w - tau_max:-1] + energy_cumsum[w] - \
        energy_cumsum[:tau_max] - 2 * conv
def cumulative_mean_normalized_difference_function(df, n):
    """
    Compute the cumulative mean normalized difference function (CMND).

    Each lag ``tau >= 1`` of the difference function is multiplied by ``tau``
    and divided by the running sum of ``df[1:tau + 1]``; lag 0 is set to 1.

    :param df: Difference function (NumPy array of length ``n``)
    :param n: length of data
    :return: cumulative mean normalized difference function
    :rtype: numpy.ndarray
    """
    tail = df[1:]
    lags = range(1, n)
    running_total = np.cumsum(tail).astype(float)
    normalized = tail * lags / running_total
    # Lag 0 has no running mean to normalize by; it is fixed to 1.
    return np.insert(normalized, 0, 1)
def get_pitch(cmdf, tau_min, tau_max, harmo_th=0.1):
    """
    Pick the fundamental period of a frame from its CMND function.

    Scans lags from ``tau_min`` up to (not including) ``tau_max`` for the first
    value below ``harmo_th``, then follows the curve downhill to the local
    minimum that follows it.

    :param cmdf: Cumulative Mean Normalized Difference function
    :param tau_min: minimum period for speech
    :param tau_max: maximum period for speech
    :param harmo_th: harmonicity threshold to determine if it is necessary to
        compute pitch frequency
    :return: fundamental period (lag index) if a value lies under the
        threshold, 0 otherwise
    :rtype: int
    """
    for candidate in range(tau_min, tau_max):
        if not cmdf[candidate] < harmo_th:
            continue
        # First dip under the threshold: walk down to its local minimum.
        best = candidate
        while best + 1 < tau_max and cmdf[best + 1] < cmdf[best]:
            best += 1
        return best
    return 0  # if unvoiced
def compute_yin(sig, sr, w_len=512, w_step=256, f0_min=100, f0_max=500,
                harmo_thresh=0.1):
    """
    Compute the Yin Algorithm. Return fundamental frequency and harmonic rate.

    https://github.com/NVIDIA/mellotron adaption of
    https://github.com/patriceguyot/Yin

    :param sig: Audio signal (sequence of float)
    :param sr: sampling rate (int)
    :param w_len: size of the analysis window (samples)
    :param w_step: size of the lag between two consecutive windows (samples)
    :param f0_min: Minimum fundamental frequency that can be detected (hertz)
    :param f0_max: Maximum fundamental frequency that can be detected (hertz)
    :param harmo_thresh: Threshold of detection. The algorithm returns the
        first minimum of the CMND function below this threshold.
    :returns:
        * pitches: list of fundamental frequencies (0.0 where no pitch was
          found)
        * harmonic_rates: list of harmonic rate values for each fundamental
          frequency value (= confidence value)
        * argmins: ``sr / lag`` at the global minimum of the Cumulative Mean
          Normalized Difference function, or 0.0 when that lag is not above
          the minimum period
        * times: list of time of each estimation (seconds)
    :rtype: tuple
    """
    tau_min = int(sr / f0_max)
    # A frame holds only w_len samples, so the difference function never has
    # more than w_len lags. Clamp here so the CMND and pitch search use the
    # same length instead of failing on a shape mismatch at high sample rates.
    tau_max = min(int(sr / f0_min), w_len)

    # start sample of each analysis window
    time_scale = range(0, len(sig) - w_len, w_step)
    times = [t / float(sr) for t in time_scale]

    n_frames = len(time_scale)
    pitches = [0.0] * n_frames
    harmonic_rates = [0.0] * n_frames
    argmins = [0.0] * n_frames

    for i, start in enumerate(time_scale):
        frame = sig[start:start + w_len]

        # Compute YIN
        df = difference_function(frame, w_len, tau_max)
        cm_df = cumulative_mean_normalized_difference_function(df, tau_max)
        p = get_pitch(cm_df, tau_min, tau_max, harmo_thresh)

        # Get results
        best_lag = np.argmin(cm_df)
        if best_lag > tau_min:
            argmins[i] = float(sr / best_lag)
        if p != 0:  # A pitch was found
            pitches[i] = float(sr / p)
            harmonic_rates[i] = cm_df[p]
        else:  # No pitch, but we compute a value of the harmonic rate
            harmonic_rates[i] = min(cm_df)

    return pitches, harmonic_rates, argmins, times
def extract_f0(samples):
    """
    Run ``compute_yin`` on the reference and synthesized audio of each sample.

    :param samples: iterable of mappings holding audio file paths under the
        keys "ref" and "syn"
    :return: list aligned with ``samples``; each entry is a dict with the
        ``compute_yin`` output under "ref" and "syn", or ``None`` when either
        file does not exist
    :rtype: list
    """
    extracted = []
    for item in tqdm.tqdm(samples):
        both_present = op.isfile(item["ref"]) and op.isfile(item["syn"])
        if not both_present:
            # Keep a placeholder so results stay aligned with the input.
            extracted.append(None)
            continue

        ref_wave, ref_sr = torchaudio.load(item["ref"])
        syn_wave, syn_sr = torchaudio.load(item["syn"])
        # assume single channel: only the first channel is analysed
        ref_wave = ref_wave[0]
        syn_wave = syn_wave[0]
        assert ref_sr == syn_sr, f"{ref_sr} != {syn_sr}"

        extracted.append({
            "ref": compute_yin(ref_wave, ref_sr),
            "syn": compute_yin(syn_wave, ref_sr),
        })
    return extracted
def eval_f0_error(samples, distortion_fn):
    """
    Apply an F0 distortion measure to every extracted sample.

    :param samples: list of dicts whose "ref" and "syn" values are 4-tuples
        with pitches first and times last (as returned by ``compute_yin``);
        ``None`` entries are passed through
    :param distortion_fn: callable invoked as
        ``distortion_fn(ref_times, ref_f0, syn_times, syn_f0)`` whose result
        exposes ``.item()``
    :return: list aligned with ``samples``; each entry is ``None`` or a tuple
        ``(distortion, number of ref frames, number of syn frames)``
    :rtype: list
    """
    scored = []
    for entry in tqdm.tqdm(samples):
        if entry is None:
            scored.append(None)
            continue

        # Only the pitch track and its time axis are needed here.
        ref_f0, _, _, ref_times = entry["ref"]
        syn_f0, _, _, syn_times = entry["syn"]
        ref_f0 = np.array(ref_f0)
        ref_times = np.array(ref_times)
        syn_f0 = np.array(syn_f0)
        syn_times = np.array(syn_times)

        distortion = distortion_fn(ref_times, ref_f0, syn_times, syn_f0)
        row = (distortion.item(), len(ref_f0), len(syn_f0))
        scored.append(row)
    return scored
def eval_gross_pitch_error(samples):
    """Score ``samples`` with ``gross_pitch_error`` through ``eval_f0_error``."""
    return eval_f0_error(samples, distortion_fn=gross_pitch_error)
def eval_voicing_decision_error(samples):
    """Score ``samples`` with ``voicing_decision_error`` through ``eval_f0_error``."""
    return eval_f0_error(samples, distortion_fn=voicing_decision_error)
def eval_f0_frame_error(samples):
    """Score ``samples`` with ``f0_frame_error`` through ``eval_f0_error``."""
    return eval_f0_error(samples, distortion_fn=f0_frame_error)
def print_results(results, show_bin):
    """
    Print summary tables for a list of per-sample evaluation results.

    :param results: list whose entries are ``None`` (skipped sample) or a
        triple ``(error, ref length, syn length)``; ``None`` entries are
        ignored
    :param show_bin: when true, additionally print one table per
        reference-length bin that contains at least one sample
    :return: None
    """
    valid = [r for r in results if r is not None]
    np.set_printoptions(precision=3)

    if not valid:
        # An empty list becomes a 1-D array, which cannot be indexed by
        # column; report the situation instead of raising IndexError.
        print(">>>> ALL")
        print("no valid samples to report")
        return

    table = np.array(valid)

    def _print_summary(rows):
        summary = {
            "nutt": len(rows),
            "error": rows[:, 0].mean(),
            "std": rows[:, 0].std(),
            "dur_ref": int(rows[:, 1].sum()),
            "dur_syn": int(rows[:, 2].sum()),
        }
        print(tabulate([summary.values()], summary.keys(), floatfmt=".4f"))

    print(">>>> ALL")
    _print_summary(table)

    if show_bin:
        edges = [0, 200, 400, 600, 800, 1000, 2000, 4000]
        # Bins are half-open on the reference length: lower <= length < upper.
        for lower, upper in zip(edges, edges[1:]):
            mask = np.logical_and(table[:, 1] >= lower, table[:, 1] < upper)
            if not mask.any():
                continue
            print(f">>>> ({lower}, {upper})")
            _print_summary(table[mask])
def main(eval_f0, gpe, vde, ffe, show_bin):
    """
    Load the evaluation spec and report every requested F0 metric.

    :param eval_f0: argument forwarded to ``load_eval_spec``
    :param gpe: evaluate Gross Pitch Error when true
    :param vde: evaluate Voicing Decision Error when true
    :param ffe: evaluate F0 Frame Error when true
    :param show_bin: forwarded to ``print_results``
    """
    samples = load_eval_spec(eval_f0)
    if not (gpe or vde or ffe):
        # Nothing requested: skip the F0 extraction entirely.
        return

    f0_samples = extract_f0(samples)

    metrics = (
        (gpe, "===== Evaluate Gross Pitch Error =====",
         eval_gross_pitch_error),
        (vde, "===== Evaluate Voicing Decision Error =====",
         eval_voicing_decision_error),
        (ffe, "===== Evaluate F0 Frame Error =====",
         eval_f0_frame_error),
    )
    for enabled, banner, evaluate in metrics:
        if not enabled:
            continue
        print(banner)
        results = evaluate(f0_samples)
        print_results(results, show_bin)
if __name__ == "__main__":
    import argparse

    # Command line: one positional argument plus one on/off switch per option.
    parser = argparse.ArgumentParser()
    parser.add_argument("eval_f0")
    for switch in ("--gpe", "--vde", "--ffe", "--show-bin"):
        parser.add_argument(switch, action="store_true")
    args = parser.parse_args()

    main(
        eval_f0=args.eval_f0,
        gpe=args.gpe,
        vde=args.vde,
        ffe=args.ffe,
        show_bin=args.show_bin,
    )