from speaker_encoder import inference as encoder from multiprocessing.pool import Pool from functools import partial from pathlib import Path # from utils import logmmse # from tqdm import tqdm # import numpy as np # import librosa def embed_utterance(fpaths, encoder_model_fpath): if not encoder.is_loaded(): encoder.load_model(encoder_model_fpath) # Compute the speaker embedding of the utterance wav_fpath, embed_fpath = fpaths wav = np.load(wav_fpath) wav = encoder.preprocess_wav(wav) embed = encoder.embed_utterance(wav) np.save(embed_fpath, embed, allow_pickle=False) def create_embeddings(outdir_root: Path, wav_dir: Path, encoder_model_fpath: Path, n_processes: int): wav_dir = outdir_root.joinpath("audio") metadata_fpath = synthesizer_root.joinpath("train.txt") assert wav_dir.exists() and metadata_fpath.exists() embed_dir = synthesizer_root.joinpath("embeds") embed_dir.mkdir(exist_ok=True) # Gather the input wave filepath and the target output embed filepath with metadata_fpath.open("r") as metadata_file: metadata = [line.split("|") for line in metadata_file] fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata] # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here. # Embed the utterances in separate threads func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath) job = Pool(n_processes).imap(func, fpaths) list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))