"""
Generate VoxCeleb1 SID manifest for SpeechT5.

Each line of the officially released ``iden_split.txt`` holds a split id
(1 = train, 2 = valid, 3 = test) followed by a wav path relative to the
dataset root, e.g.::

    1 id11251/s4R4hvqrhFw/00002.wav
    1 id11251/gFfcgOVmiO0/00006.wav
    3 id11251/7GtZpUtReJ8/00001.wav
    2 id11251/5-6lI5JQtb8/00001.wav
    3 id11251/7GtZpUtReJ8/00006.wav

The script writes ``<output>/<split>.tsv``. Its first line is the waveform
root directory; every following line is ``<wav path>\\t<n_frames>\\t<speaker>``.
"""

import argparse
import logging
import os

from scipy.io import wavfile

try:
    from tqdm import tqdm
except ImportError:  # tqdm only draws a progress bar; keep working without it

    def tqdm(iterable, **kwargs):
        """Return *iterable* unchanged (no progress bar available)."""
        return iterable


logger = logging.getLogger(__name__)

# Numeric split ids used in the first column of iden_split.txt.
SPLITS = {
    "train": 1,
    "valid": 2,
    "test": 3,
}

# Every wav file listed in the manifest must have this sampling rate.
EXPECTED_SAMPLE_RATE = 16000


class VoxCeleb1SID:
    """Index of the VoxCeleb1 utterances that belong to one split.

    Args:
        root: directory that the paths in the split file are relative to.
        split: numeric split id to keep (one of the values of ``SPLITS``).
        iden_path: path to the identification split file.

    Raises:
        ValueError: if a non-blank line of the split file does not have the
            form ``<split id> <wav path>``.
    """

    def __init__(self, root, split, iden_path):
        self.root = root
        self.speakers = []
        self.paths = []
        with open(iden_path, "r", encoding="utf-8") as f:
            for line_no, line in enumerate(f, start=1):
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                if len(fields) < 2:
                    raise ValueError(
                        f"{iden_path}:{line_no}: expected '<split> <path>', "
                        f"got {line.rstrip()!r}"
                    )
                split_type = int(fields[0])
                wav_path = fields[1]
                if split_type == split:
                    # The speaker id is the first component of the wav path.
                    self.speakers.append(wav_path.split("/")[0])
                    self.paths.append(wav_path)

    def __len__(self):
        """Return the number of utterances in the selected split."""
        return len(self.paths)

    def __iter__(self):
        """Yield every item in order (explicit, instead of the legacy
        ``__getitem__``/``IndexError`` iteration fallback)."""
        for index in range(len(self)):
            yield self[index]

    def __getitem__(self, index):
        """Read utterance *index* from disk.

        Returns:
            ``(n_frames, sample_rate, speaker, relative_wav_path)``.

        Raises:
            IndexError: if *index* is out of range.
        """
        speaker = self.speakers[index]
        rel_path = self.paths[index]
        sample_rate, wav = wavfile.read(os.path.join(self.root, rel_path))
        # First axis of the array returned by scipy is the sample axis.
        n_frames = wav.shape[0]
        return n_frames, sample_rate, speaker, rel_path


def get_parser():
    """Build the command-line argument parser of this script."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "root",
        metavar="DIR",
        help="root directory containing wav files to index",
    )
    parser.add_argument(
        "--output",
        default=".",
        type=str,
        metavar="DIR",
        help="output directory of manifest",
    )
    parser.add_argument(
        "--split",
        required=True,
        type=str,
        choices=["train", "valid", "test"],
        help="dataset splits",
    )
    parser.add_argument(
        "--wav-root",
        default=None,
        type=str,
        metavar="DIR",
        help="saved waveform root directory for tsv",
    )
    parser.add_argument(
        "--iden-split",
        required=True,
        type=str,
        help="officially released split for identification",
    )
    return parser


def main(args):
    """Write ``<args.output>/<args.split>.tsv`` for the requested split.

    Args:
        args: namespace with ``root``, ``output``, ``split``, ``wav_root``
            and ``iden_split`` attributes (see ``get_parser``).

    Raises:
        FileNotFoundError: if the split file or a listed wav file is missing.
        ValueError: if a wav file is not sampled at 16 kHz.
    """
    if not os.path.exists(args.iden_split):
        # Fail before touching the output directory instead of logging and
        # crashing later when the file is opened.
        logger.error("split %s does not exist", args.iden_split)
        raise FileNotFoundError(f"split {args.iden_split} does not exist")

    dest_dir = args.output
    os.makedirs(dest_dir, exist_ok=True)

    # Without --wav-root the header used to be the literal string "None";
    # fall back to the directory the wav files are actually read from.
    header_root = args.wav_root if args.wav_root is not None else args.root

    dataset = VoxCeleb1SID(args.root, SPLITS[args.split], args.iden_split)
    tsv_path = os.path.join(dest_dir, f"{args.split}.tsv")
    # The context manager closes the file even if a wav read or the
    # sampling-rate check fails mid-way.
    with open(tsv_path, "w", encoding="utf-8") as tsv:
        print(header_root, file=tsv)
        for n_frames, sr, spk_id, wav_path in tqdm(dataset, desc="tsv/txt/wav"):
            # Explicit raise: an ``assert`` would be stripped under ``-O``.
            if sr != EXPECTED_SAMPLE_RATE:
                raise ValueError(
                    f"sampling rate {sr} != {EXPECTED_SAMPLE_RATE}"
                )
            print(f"{wav_path}\t{n_frames}\t{spk_id}", file=tsv)


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()
    main(args)