# Spaces: OFA-Visual_Grounding (Runtime error)
# Path: fairseq/examples/speech_synthesis/preprocessing/get_common_voice_audio_manifest.py
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import logging
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import torchaudio
from tqdm import tqdm

from examples.speech_to_text.data_utils import load_df_from_tsv, save_df_to_tsv

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Names of the data splits; one "<split>.audio.tsv" manifest is written per
# entry, and samples are assigned to splits in this order.
SPLITS = ["train", "dev", "test"]
def get_top_n(
    root: Path, n_speakers: int = 10, min_n_tokens: int = 5
) -> pd.DataFrame:
    """Select the utterances of the speakers with the most audio.

    Reads ``validated.tsv`` under ``root``, drops sentences with fewer than
    ``min_n_tokens`` whitespace-separated tokens, measures every remaining
    clip under ``root / "clips"`` and keeps only the rows belonging to the
    ``n_speakers`` speakers (``client_id``) whose clips add up to the most
    frames.

    Args:
        root: Directory holding ``validated.tsv`` and the ``clips`` folder.
        n_speakers: Number of top speakers to keep.
        min_n_tokens: Minimum token count a sentence needs to be kept.

    Returns:
        The retained rows, with the columns ``n_tokens``, ``n_frames`` and
        ``id`` (the clip file name without its extension) added.
    """
    df = load_df_from_tsv(root / "validated.tsv")
    df["n_tokens"] = [len(s.split()) for s in df["sentence"]]
    # Take an explicit copy: the columns added below would otherwise be
    # assigned onto a filtered slice of the loaded frame, which pandas flags
    # as chained assignment (SettingWithCopyWarning).
    df = df[df["n_tokens"] >= min_n_tokens].copy()

    clips_root = root / "clips"
    df["n_frames"] = [
        torchaudio.info((clips_root / p).as_posix()).num_frames
        for p in tqdm(df["path"])
    ]
    df["id"] = [Path(p).stem for p in df["path"]]

    # Speakers are ranked by their summed frame counts. These are raw frame
    # counts (not milliseconds), so the ranking only reflects duration if
    # all clips share one sample rate.
    frames_per_speaker = df.groupby("client_id")["n_frames"].sum()
    top_speakers = frames_per_speaker.sort_values(ascending=False).head(
        n_speakers
    )
    top_n_client_ids = set(top_speakers.index.tolist())
    return df[df["client_id"].isin(top_n_client_ids)]
def get_splits(
    df, train_split_ratio=0.99, speaker_in_all_splits=False, rand_seed=0
) -> Tuple[Dict[str, str], List[str]]:
    """Randomly assign every utterance to train/dev/test, speaker by speaker.

    Each speaker's utterances are shuffled and cut into three consecutive
    parts: ``int(n * train_split_ratio)`` go to "train",
    ``int(n * (1 - train_split_ratio) / 3)`` go to "dev" and whatever is left
    goes to "test".

    Args:
        df: Frame with at least the columns ``client_id`` and ``id``.
        train_split_ratio: Fraction of each speaker's utterances for "train".
        speaker_in_all_splits: If true, a speaker whose dev or test share
            would be empty gets exactly one utterance in each of them, and
            speakers with fewer than 3 utterances are skipped (their ids are
            then absent from the returned mapping).
        rand_seed: Seed of the shuffling.

    Returns:
        A pair ``(id_to_split, speakers)``: the mapping from utterance ``id``
        to split name, and the sorted unique ``client_id`` values of ``df``
        (skipped speakers included).
    """
    # A private generator gives the same permutations as reseeding the global
    # NumPy RNG with ``rand_seed`` would, without altering that global state
    # for the rest of the process.
    rng = np.random.RandomState(rand_seed)
    dev_split_ratio = (1. - train_split_ratio) / 3
    id_to_split = {}
    for _, cur_df in tqdm(list(df.groupby("client_id"))):
        n_examples = len(cur_df)
        if speaker_in_all_splits and n_examples < 3:
            continue
        n_train = int(n_examples * train_split_ratio)
        n_dev = int(n_examples * dev_split_ratio)
        n_test = n_examples - n_dev - n_train
        if speaker_in_all_splits and n_dev * n_test == 0:
            n_dev, n_test = 1, 1
            n_train = n_examples - n_dev - n_test

        # Shuffle by position inside the speaker's group. This avoids one
        # label lookup into the full frame per utterance and also works when
        # the frame's index labels are not unique.
        cur_ids = cur_df["id"].tolist()
        shuffled_ids = [cur_ids[i] for i in rng.permutation(n_examples)]
        ids_by_split = {
            "train": shuffled_ids[:n_train],
            "dev": shuffled_ids[n_train: n_train + n_dev],
            "test": shuffled_ids[n_train + n_dev:],
        }
        for split in SPLITS:
            for id_ in ids_by_split[split]:
                id_to_split[id_] = split
    return id_to_split, sorted(df["client_id"].unique())
def convert_to_wav(root: Path, filenames: List[str], target_sr=16_000):
    """Write a resampled, single-channel WAV copy of each listed clip.

    Every file name is read from ``root / "clips"``, passed through the sox
    "rate" and "channels" effects, and saved under ``root / "wav"`` (created
    if missing) with a ``.wav`` suffix as 16-bit signed PCM.

    Args:
        root: Directory containing the ``clips`` folder.
        filenames: Clip file names relative to ``root / "clips"``.
        target_sr: Sample rate handed to the sox "rate" effect.
    """
    wav_dir = root / "wav"
    wav_dir.mkdir(exist_ok=True, parents=True)
    rate = str(target_sr)
    print("Converting to WAV...")
    for filename in tqdm(filenames):
        source = root / "clips" / filename
        waveform, sr = torchaudio.load(source.as_posix())
        converted, converted_sr = torchaudio.sox_effects.apply_effects_tensor(
            waveform, sr, [["rate", rate], ["channels", "1"]]
        )
        # Only the final path component is kept, so all outputs land
        # directly in the "wav" directory.
        target = wav_dir / Path(filename).with_suffix(".wav").name
        torchaudio.save(
            target.as_posix(),
            converted,
            converted_sr,
            encoding="PCM_S",
            bits_per_sample=16,
        )
def process(args):
    """Build the per-split audio manifests for one language.

    Selects the top speakers' utterances under
    ``args.data_root / args.lang``, splits them into train/dev/test,
    optionally converts the clips to WAV, and writes one
    ``<split>.audio.tsv`` file per split into ``args.output_manifest_root``.

    Args:
        args: Parsed command-line options providing ``data_root``, ``lang``,
            ``convert_to_wav`` and ``output_manifest_root``.

    Note:
        ``n_frames`` is copied from the frame returned by ``get_top_n``; it
        is not re-measured here after a WAV conversion.
    """
    data_root = Path(args.data_root).absolute() / args.lang

    # Generate TSV manifest
    print("Generating manifest...")
    df_top_n = get_top_n(data_root)
    id_to_split, _ = get_splits(df_top_n)

    if args.convert_to_wav:
        convert_to_wav(data_root, df_top_n["path"].tolist())

    # Create every column up front (in output order) so that a split which
    # receives no sample still yields a TSV with a header row.
    columns = ("id", "audio", "n_frames", "tgt_text", "speaker", "src_text")
    manifest_by_split = {
        split: {column: [] for column in columns} for split in SPLITS
    }
    for sample in tqdm(df_top_n.to_dict(orient="index").values()):
        sample_id = sample["id"]
        manifest = manifest_by_split[id_to_split[sample_id]]
        if args.convert_to_wav:
            audio_path = data_root / "wav" / f"{sample_id}.wav"
        else:
            # Use the clip's recorded file name rather than assuming an
            # ".mp3" extension; for ".mp3" clips the result is unchanged.
            audio_path = data_root / "clips" / sample["path"]
        manifest["id"].append(sample_id)
        manifest["audio"].append(audio_path.as_posix())
        manifest["n_frames"].append(sample["n_frames"])
        manifest["tgt_text"].append(sample["sentence"])
        manifest["speaker"].append(sample["client_id"])
        manifest["src_text"].append(sample["sentence"])

    output_root = Path(args.output_manifest_root).absolute()
    output_root.mkdir(parents=True, exist_ok=True)
    for split in SPLITS:
        save_df_to_tsv(
            pd.DataFrame.from_dict(manifest_by_split[split]),
            output_root / f"{split}.audio.tsv"
        )
def main():
    """Parse the command-line options and hand them to ``process``."""
    parser = argparse.ArgumentParser()
    # The three required string options share the same settings.
    required_options = (
        ("--data-root", "-d"),
        ("--output-manifest-root", "-m"),
        ("--lang", "-l"),
    )
    for long_flag, short_flag in required_options:
        parser.add_argument(long_flag, short_flag, required=True, type=str)
    parser.add_argument("--convert-to-wav", action="store_true")
    process(parser.parse_args())
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()