|
""" |
|
Generate VoxCeleb1 SID manifest for SpeechT5. |
|
|
|
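Expected format of iden_split.txt, one utterance per line as
"<split id> <speaker>/<video>/<utterance>.wav" (1 = train, 2 = valid, 3 = test):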
|
1 id11251/s4R4hvqrhFw/00002.wav |
|
1 id11251/gFfcgOVmiO0/00006.wav |
|
3 id11251/7GtZpUtReJ8/00001.wav |
|
2 id11251/5-6lI5JQtb8/00001.wav |
|
3 id11251/7GtZpUtReJ8/00006.wav |
|
""" |
|
|
|
import logging |
|
import argparse |
|
import os |
|
|
|
from scipy.io import wavfile |
|
from tqdm import tqdm |
|
|
|
|
|
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Command-line split name -> integer id used in the first column of the
# identification split file.
SPLITS = dict(train=1, valid=2, test=3)
|
|
|
class VoxCeleb1SID:
    """Index of the VoxCeleb1 speaker-identification utterances of one split.

    The identification split file holds one utterance per line in the form
    ``<split id> <speaker>/<video>/<utterance>.wav``. Only lines whose split
    id equals ``split`` are kept; the speaker label is the first component of
    the wav path.

    Args:
        root: Directory that the wav paths in the split file are relative to.
        split: Integer split id to keep (compared with the first column).
        iden_path: Path to the identification split file.

    Raises:
        ValueError: If a non-blank line has no wav path, or its split id is
            not an integer.
    """

    def __init__(self, root, split, iden_path):
        self.root = root
        self.speakers = []
        self.paths = []
        with open(iden_path, "r", encoding="utf-8") as f:
            for line_no, line in enumerate(f, start=1):
                # split() with no separator tolerates tabs, repeated spaces
                # and trailing whitespace, and yields [] for blank lines.
                items = line.split()
                if not items:
                    continue
                if len(items) < 2:
                    raise ValueError(
                        f"{iden_path}:{line_no}: expected '<split> <wav path>', "
                        f"got {line.strip()!r}"
                    )
                split_type = int(items[0])
                wav_path = items[1]
                if split_type == split:
                    self.speakers.append(wav_path.split("/")[0])
                    self.paths.append(wav_path)

    def __len__(self):
        """Return the number of utterances kept for the selected split."""
        return len(self.paths)

    def __getitem__(self, index):
        """Read one utterance and describe it.

        Returns:
            A tuple ``(n_frames, sample_rate, speaker, wav_path)`` where
            ``n_frames`` is the length of the first axis of the decoded
            waveform and ``wav_path`` is the path relative to ``root``.

        Raises:
            IndexError: If ``index`` is out of range (this is also what ends
                a plain ``for`` loop over the dataset).
        """
        wav_path = self.paths[index]
        speaker = self.speakers[index]
        sample_rate, wav = wavfile.read(os.path.join(self.root, wav_path))
        return wav.shape[0], sample_rate, speaker, wav_path
|
|
|
|
|
|
|
def get_parser():
    """Build the command-line parser for the manifest generator.

    Positional ``root`` is the directory holding the wav files; ``--split``
    and ``--iden-split`` are required, ``--output`` defaults to ``"."`` and
    ``--wav-root`` defaults to ``None``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "root", metavar="DIR", help="root directory containing wav files to index"
    )

    # Optional flags, registered in the order they appear in --help.
    option_specs = (
        ("--output", dict(default=".", type=str, metavar="DIR", help="output directory of manifest")),
        ("--split", dict(required=True, type=str, choices=["train", "valid", "test"], help="dataset splits")),
        ("--wav-root", dict(default=None, type=str, metavar="DIR", help="saved waveform root directory for tsv")),
        ("--iden-split", dict(required=True, type=str, help="officially released split for identification")),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    return cli
|
|
|
|
|
def main(args):
    """Write the ``<split>.tsv`` manifest for one VoxCeleb1 SID split.

    The first line of the manifest is the waveform root directory; every
    following line is ``<wav path>\\t<n_frames>\\t<speaker id>``.

    Args:
        args: Parsed arguments from ``get_parser()`` (``root``, ``output``,
            ``split``, ``wav_root``, ``iden_split``).

    Raises:
        FileNotFoundError: If ``args.iden_split`` does not exist.
        ValueError: If an utterance is not sampled at 16 kHz.
    """
    dest_dir = args.output
    # Without --wav-root the header used to be the literal string "None";
    # fall back to the directory that was actually indexed.
    wav_root = args.wav_root if args.wav_root is not None else args.root

    if not os.path.exists(args.iden_split):
        message = f"split {args.iden_split} does not exist"
        logger.error(message)
        # Stop here instead of creating the output directory and failing
        # later inside the dataset constructor.
        raise FileNotFoundError(message)
    os.makedirs(dest_dir, exist_ok=True)

    dataset = VoxCeleb1SID(args.root, SPLITS[args.split], args.iden_split)

    # The context manager closes the manifest even if a wav file is rejected.
    with open(os.path.join(dest_dir, f"{args.split}.tsv"), "w") as tsv:
        print(wav_root, file=tsv)
        for n_frames, sr, spk_id, wav_path in tqdm(dataset, desc="tsv/txt/wav"):
            # Explicit raise rather than assert so the check survives `python -O`.
            # File existence needs no separate check: the dataset has already
            # read the wav to produce n_frames.
            if sr != 16000:
                raise ValueError(f"sampling rate {sr} != 16000")
            print(f"{wav_path}\t{n_frames}\t{spk_id}", file=tsv)
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: parse the command line and build the manifest.
    main(get_parser().parse_args())
|
|