File size: 3,135 Bytes
8c92a11 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import math
import librosa
import torch
import numpy as np
from utils.util import JsonHParams
from utils.f0 import get_f0_features_using_parselmouth, get_pitch_sub_median
# Small positive constant (1e-8). It is not referenced by the code visible in
# this file -- presumably an epsilon intended for numerical guards; TODO confirm
# whether anything imports it before removing.
ZERO = 1e-8
def extract_f0rmse(
    audio_ref,
    audio_deg,
    hop_length=256,
    f0_min=50,
    f0_max=1100,
    **kwargs,
):
    """Compute the F0 Root Mean Square Error (RMSE) between two audio files.

    Args:
        audio_ref: path to the ground truth audio.
        audio_deg: path to the predicted audio.
        hop_length: hop length, stored as ``hop_size`` in the F0 extraction
            config.
        f0_min: lower limit for F0; also used as ``pitch_min`` in the config.
        f0_max: upper limit for F0; also used as ``pitch_max`` in the config.
        **kwargs: the options below. They are normally passed as a single
            nested mapping, ``extract_f0rmse(ref, deg, kwargs={...})``; as a
            convenience they may also be given directly as keyword arguments
            (the nested form wins when both are present).

            fs: sampling rate used to load both files. ``None`` loads them
                with ``librosa.load``'s default rate.
            method: ``"dtw"`` aligns the two F0 tracks along the DTW warping
                path; ``"cut"`` truncates both tracks to the shorter length.
                Any other value performs no alignment.
            need_mean: if truthy, both F0 tracks are passed through
                ``get_pitch_sub_median`` before comparison.

    Returns:
        The RMSE between the two aligned F0 tracks as a float, or ``0`` when
        either track has at most one frame.

    Raises:
        KeyError: if ``fs``, ``method`` or ``need_mean`` is missing.
        ValueError: if the two F0 tracks still differ in length after the
            alignment step (only possible when ``method`` is neither
            ``"cut"`` nor ``"dtw"``).
    """
    # Load hyperparameters. Callers traditionally pass them wrapped in a
    # single ``kwargs=`` keyword; fall back to the flat keywords otherwise.
    options = kwargs.get("kwargs", kwargs)
    fs = options["fs"]
    method = options["method"]
    need_mean = options["need_mean"]

    # Load audio
    if fs is not None:
        audio_ref, _ = librosa.load(audio_ref, sr=fs)
        audio_deg, _ = librosa.load(audio_deg, sr=fs)
    else:
        # No rate requested: take whatever rate librosa resampled to.
        audio_ref, fs = librosa.load(audio_ref)
        audio_deg, fs = librosa.load(audio_deg)

    # Initialize config for f0 extraction
    cfg = JsonHParams()
    cfg.sample_rate = fs
    cfg.hop_size = hop_length
    cfg.f0_min = f0_min
    cfg.f0_max = f0_max
    cfg.pitch_bin = 256
    cfg.pitch_max = f0_max
    cfg.pitch_min = f0_min

    # Extract f0
    f0_ref = get_f0_features_using_parselmouth(audio_ref, cfg)
    f0_deg = get_f0_features_using_parselmouth(audio_deg, cfg)

    # Optional per-track normalisation (the helper works on torch tensors).
    if need_mean:
        f0_ref = get_pitch_sub_median(torch.from_numpy(f0_ref)).numpy()
        f0_deg = get_pitch_sub_median(torch.from_numpy(f0_deg)).numpy()

    # Avoid silence: nothing meaningful to compare with <= 1 frame.
    if min(len(f0_ref), len(f0_deg)) <= 1:
        return 0

    # F0 length alignment
    if method == "cut":
        length = min(len(f0_ref), len(f0_deg))
        f0_ref = f0_ref[:length]
        f0_deg = f0_deg[:length]
    elif method == "dtw":
        _, wp = librosa.sequence.dtw(f0_ref, f0_deg, backtrack=True)
        # Each row of the warping path is (reference index, degraded index);
        # gather both tracks along the path in one vectorised step.
        f0_ref = f0_ref[wp[:, 0]]
        f0_deg = f0_deg[wp[:, 1]]

    # Explicit check instead of ``assert`` so it survives ``python -O``.
    if len(f0_ref) != len(f0_deg):
        raise ValueError(
            f"F0 tracks have different lengths ({len(f0_ref)} vs "
            f"{len(f0_deg)}) after alignment with method={method!r}; "
            'use "cut" or "dtw".'
        )

    # Compute RMSE
    f0_mse = np.square(np.subtract(f0_ref, f0_deg)).mean()
    return math.sqrt(f0_mse)
|