OFA-OCR-dedao-demo001

Runtime error

File size: 4,149 Bytes

ee21b96

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.


"""
Signal processing-based evaluation using waveforms
"""

import csv
import numpy as np
import os.path as op

import torch
import tqdm
from tabulate import tabulate
import torchaudio

from examples.speech_synthesis.utils import batch_mel_spectral_distortion
from fairseq.tasks.text_to_speech import batch_mel_cepstral_distortion


def load_eval_spec(path):
    with open(path) as f:
        reader = csv.DictReader(f, delimiter='\t')
        samples = list(reader)
    return samples


def eval_distortion(samples, distortion_fn, device="cuda"):
    nmiss = 0
    results = []
    for sample in tqdm.tqdm(samples):
        if not op.isfile(sample["ref"]) or not op.isfile(sample["syn"]):
            nmiss += 1
            results.append(None)
            continue
        # assume single channel
        yref, sr = torchaudio.load(sample["ref"])
        ysyn, _sr = torchaudio.load(sample["syn"])
        yref, ysyn = yref[0].to(device), ysyn[0].to(device)
        assert sr == _sr, f"{sr} != {_sr}"

        distortion, extra = distortion_fn([yref], [ysyn], sr, None)[0]
        _, _, _, _, _, pathmap = extra
        nins = torch.sum(pathmap.sum(dim=1) - 1)  # extra frames in syn
        ndel = torch.sum(pathmap.sum(dim=0) - 1)  # missing frames from syn
        results.append(
            (distortion.item(),  # path distortion
             pathmap.size(0),  # yref num frames
             pathmap.size(1),  # ysyn num frames
             pathmap.sum().item(),  # path length
             nins.item(),  # insertion
             ndel.item(),  # deletion
             )
        )
    return results


def eval_mel_cepstral_distortion(samples, device="cuda"):
    return eval_distortion(samples, batch_mel_cepstral_distortion, device)


def eval_mel_spectral_distortion(samples, device="cuda"):
    return eval_distortion(samples, batch_mel_spectral_distortion, device)


def print_results(results, show_bin):
    results = np.array(list(filter(lambda x: x is not None, results)))

    np.set_printoptions(precision=3)

    def _print_result(results):
        dist, dur_ref, dur_syn, dur_ali, nins, ndel = results.sum(axis=0)
        res = {
            "nutt": len(results),
            "dist": dist,
            "dur_ref": int(dur_ref),
            "dur_syn": int(dur_syn),
            "dur_ali": int(dur_ali),
            "dist_per_ref_frm": dist/dur_ref,
            "dist_per_syn_frm": dist/dur_syn,
            "dist_per_ali_frm": dist/dur_ali,
            "ins": nins/dur_ref,
            "del": ndel/dur_ref,
        }
        print(tabulate(
            [res.values()],
            res.keys(),
            floatfmt=".4f"
        ))

    print(">>>> ALL")
    _print_result(results)

    if show_bin:
        edges = [0, 200, 400, 600, 800, 1000, 2000, 4000]
        for i in range(1, len(edges)):
            mask = np.logical_and(results[:, 1] >= edges[i-1],
                                  results[:, 1] < edges[i])
            if not mask.any():
                continue
            bin_results = results[mask]
            print(f">>>> ({edges[i-1]}, {edges[i]})")
            _print_result(bin_results)


def main(eval_spec, mcd, msd, show_bin):
    samples = load_eval_spec(eval_spec)
    device = "cpu"
    if mcd:
        print("===== Evaluate Mean Cepstral Distortion =====")
        results = eval_mel_cepstral_distortion(samples, device)
        print_results(results, show_bin)
    if msd:
        print("===== Evaluate Mean Spectral Distortion =====")
        results = eval_mel_spectral_distortion(samples, device)
        print_results(results, show_bin)


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("eval_spec")
    parser.add_argument("--mcd", action="store_true")
    parser.add_argument("--msd", action="store_true")
    parser.add_argument("--show-bin", action="store_true")
    args = parser.parse_args()

    main(args.eval_spec, args.mcd, args.msd, args.show_bin)