File size: 3,357 Bytes
ee21b96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import numpy as np
import torch
from scipy.interpolate import interp1d
import torchaudio

from fairseq.tasks.text_to_speech import (
    batch_compute_distortion, compute_rms_dist
)


def batch_mel_spectral_distortion(
        y1, y2, sr, normalize_type="path", mel_fn=None
):
    """
    https://arxiv.org/pdf/2011.03568.pdf

    Same as Mel Cepstral Distortion, but computed on log-mel spectrograms.
    """
    if mel_fn is None or mel_fn.sample_rate != sr:
        mel_fn = torchaudio.transforms.MelSpectrogram(
            sr, n_fft=int(0.05 * sr), win_length=int(0.05 * sr),
            hop_length=int(0.0125 * sr), f_min=20, n_mels=80,
            window_fn=torch.hann_window
        ).to(y1[0].device)
    offset = 1e-6
    return batch_compute_distortion(
        y1, y2, sr, lambda y: torch.log(mel_fn(y) + offset).transpose(-1, -2),
        compute_rms_dist, normalize_type
    )


# This code is based on
# "https://github.com/bastibe/MAPS-Scripts/blob/master/helper.py"
def _same_t_in_true_and_est(func):
    def new_func(true_t, true_f, est_t, est_f):
        assert type(true_t) is np.ndarray
        assert type(true_f) is np.ndarray
        assert type(est_t) is np.ndarray
        assert type(est_f) is np.ndarray

        interpolated_f = interp1d(
            est_t, est_f, bounds_error=False, kind='nearest', fill_value=0
        )(true_t)
        return func(true_t, true_f, true_t, interpolated_f)

    return new_func


@_same_t_in_true_and_est
def gross_pitch_error(true_t, true_f, est_t, est_f):
    """The relative frequency in percent of pitch estimates that are
    outside a threshold around the true pitch. Only frames that are
    considered pitched by both the ground truth and the estimator (if
    applicable) are considered.
    """

    correct_frames = _true_voiced_frames(true_t, true_f, est_t, est_f)
    gross_pitch_error_frames = _gross_pitch_error_frames(
        true_t, true_f, est_t, est_f
    )
    return np.sum(gross_pitch_error_frames) / np.sum(correct_frames)


def _gross_pitch_error_frames(true_t, true_f, est_t, est_f, eps=1e-8):
    voiced_frames = _true_voiced_frames(true_t, true_f, est_t, est_f)
    true_f_p_eps = [x + eps for x in true_f]
    pitch_error_frames = np.abs(est_f / true_f_p_eps - 1) > 0.2
    return voiced_frames & pitch_error_frames


def _true_voiced_frames(true_t, true_f, est_t, est_f):
    return (est_f != 0) & (true_f != 0)


def _voicing_decision_error_frames(true_t, true_f, est_t, est_f):
    return (est_f != 0) != (true_f != 0)


@_same_t_in_true_and_est
def f0_frame_error(true_t, true_f, est_t, est_f):
    gross_pitch_error_frames = _gross_pitch_error_frames(
        true_t, true_f, est_t, est_f
    )
    voicing_decision_error_frames = _voicing_decision_error_frames(
        true_t, true_f, est_t, est_f
    )
    return (np.sum(gross_pitch_error_frames) +
            np.sum(voicing_decision_error_frames)) / (len(true_t))


@_same_t_in_true_and_est
def voicing_decision_error(true_t, true_f, est_t, est_f):
    voicing_decision_error_frames = _voicing_decision_error_frames(
        true_t, true_f, est_t, est_f
    )
    return np.sum(voicing_decision_error_frames) / (len(true_t))