Spaces:

amphion
/

singing_voice_conversion

Running on A10G

File size: 5,410 Bytes

0883aa1

# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
from tqdm import tqdm


def cal_metadata(cfg):
    """
    Dump metadata (singers.json, meta_info.json, utt2singer) for singer dataset or multi-datasets.

    For every dataset name in ``cfg.dataset`` the function works inside
    ``os.path.join(cfg.preprocess.processed_dir, dataset)`` and:

    1. loads ``train.json`` and ``test.json``, sorts each list by its
       ``"Duration"`` field (ascending) and writes the sorted lists back;
    2. writes the utterance-to-singer table to ``cfg.preprocess.utt2spk``
       (one ``<Dataset>_<Uid>\\t<singer>`` line per utterance);
    3. writes the singer-name-to-index lookup to ``cfg.preprocess.spk2id``;
    4. writes ``meta_info.json`` with utterance counts, hours and the
       per-singer minutes (training-only and total);
    5. prints the training minutes of every singer.

    A singer name is ``<dataset>_<Singer>`` where ``<dataset>`` is the
    utterance's ``"Dataset"`` field passed through ``replace_augment_name``.

    Args:
        cfg: Configuration object providing ``dataset`` (list of dataset
            names), ``preprocess.processed_dir``, ``preprocess.spk2id`` and
            ``preprocess.utt2spk``. Each utterance in the JSON files must
            carry the keys ``"Duration"``, ``"Dataset"``, ``"Singer"`` and
            ``"Uid"``.

    Raises:
        AssertionError: If the processed directory of a dataset is missing.
    """
    from collections import Counter

    datasets = cfg.dataset

    print("-" * 10)
    print("Preparing metadata...")
    print("Including: \n{}\n".format("\n".join(datasets)))

    # NOTE: in-place sort, so the list held by ``cfg.dataset`` is reordered
    # too. Kept as-is because later stages may rely on that ordering.
    datasets.sort()

    for dataset in tqdm(datasets):
        save_dir = os.path.join(cfg.preprocess.processed_dir, dataset)
        assert os.path.exists(save_dir), f"Processed directory not found: {save_dir}"

        # 'train.json' and 'test.json' of target dataset
        train_metadata = os.path.join(save_dir, "train.json")
        test_metadata = os.path.join(save_dir, "test.json")

        # Sort the metadata as the duration order
        with open(train_metadata, "r", encoding="utf-8") as f:
            train_utterances = json.load(f)
        with open(test_metadata, "r", encoding="utf-8") as f:
            test_utterances = json.load(f)

        train_utterances = sorted(train_utterances, key=lambda x: x["Duration"])
        test_utterances = sorted(test_utterances, key=lambda x: x["Duration"])

        # Write back the sorted metadata. The files are dumped with
        # ensure_ascii=False, so they must be written as UTF-8 explicitly:
        # the platform default encoding may be unable to encode the text,
        # and the files are read back as UTF-8 above.
        with open(train_metadata, "w", encoding="utf-8") as f:
            json.dump(train_utterances, f, indent=4, ensure_ascii=False)
        with open(test_metadata, "w", encoding="utf-8") as f:
            json.dump(test_utterances, f, indent=4, ensure_ascii=False)

        # Paths of metadata needed to be generated
        singer_dict_file = os.path.join(save_dir, cfg.preprocess.spk2id)
        utt2singer_file = os.path.join(save_dir, cfg.preprocess.utt2spk)

        # Get the total duration for train and test utterances
        train_total_duration = sum(utt["Duration"] for utt in train_utterances)
        test_total_duration = sum(utt["Duration"] for utt in test_utterances)
        train_hours = round(train_total_duration / 3600, 4)
        test_hours = round(test_total_duration / 3600, 4)

        # Resolve the singer name of every utterance once (train first, then
        # test) and reuse it for utt2singer, the lookup table and the stats.
        all_utterances = train_utterances + test_utterances
        singer_keys = [
            f"{replace_augment_name(utt['Dataset'])}_{utt['Singer']}"
            for utt in all_utterances
        ]

        # Write the utt2singer file
        with open(utt2singer_file, "w", encoding="utf-8") as f:
            f.writelines(
                f"{utt['Dataset']}_{utt['Uid']}\t{singer}\n"
                for utt, singer in zip(all_utterances, singer_keys)
            )

        # Singer indices follow the alphabetical order of the singer names
        singer_lut = {name: i for i, name in enumerate(sorted(set(singer_keys)))}

        # dump singers.json
        with open(singer_dict_file, "w", encoding="utf-8") as f:
            json.dump(singer_lut, f, indent=4, ensure_ascii=False)

        meta_info = {
            "dataset": dataset,
            "statistics": {
                "size": len(all_utterances),
                # Re-round the sum so float noise (e.g. 0.30000000000000004)
                # does not leak into the report.
                "hours": round(train_hours + test_hours, 4),
            },
            "train": {
                "size": len(train_utterances),
                "hours": train_hours,
            },
            "test": {
                "size": len(test_utterances),
                "hours": test_hours,
            },
            "singers": {"size": len(singer_lut)},
        }

        # Use Counter to count the minutes for each singer
        total_singer2mins = Counter()
        training_singer2mins = Counter()
        num_train = len(train_utterances)
        for idx, (utt, singer) in enumerate(zip(all_utterances, singer_keys)):
            minutes = utt["Duration"] / 60
            total_singer2mins[singer] += minutes
            # The first ``num_train`` entries of all_utterances are the train split
            if idx < num_train:
                training_singer2mins[singer] += minutes

        # Longest-first ordering, minutes rounded to two decimals
        training_singer2mins = {
            singer: round(minutes, 2)
            for singer, minutes in sorted(
                training_singer2mins.items(), key=lambda x: x[1], reverse=True
            )
        }
        meta_info["singers"]["training_minutes"] = training_singer2mins

        total_singer2mins = {
            singer: round(minutes, 2)
            for singer, minutes in sorted(
                total_singer2mins.items(), key=lambda x: x[1], reverse=True
            )
        }
        meta_info["singers"]["minutes"] = total_singer2mins

        with open(os.path.join(save_dir, "meta_info.json"), "w", encoding="utf-8") as f:
            json.dump(meta_info, f, indent=4, ensure_ascii=False)

        for singer, minutes in training_singer2mins.items():
            print(f"Singer {singer}: {minutes} mins for training")
        print("-" * 10, "\n")


def replace_augment_name(dataset: str) -> str:
    """Map an augmented dataset name back to the name of its source dataset.

    The augmentation tags are checked in a fixed order (equalizer,
    formant_shift, pitch_shift, time_stretch). Only the first tag found in
    ``dataset`` is handled: every occurrence of ``"_<tag>"`` is removed and
    the result is returned. A name containing none of the tags is returned
    unchanged.

    >>> replace_augment_name("dataset_equalizer")
    'dataset'
    """
    augment_tags = ("equalizer", "formant_shift", "pitch_shift", "time_stretch")
    for tag in augment_tags:
        if tag in dataset:
            # Stop at the first matching tag; later tags are left untouched.
            return dataset.replace("_" + tag, "")
    return dataset