|
from functools import partial
from multiprocessing.pool import Pool
from pathlib import Path

import numpy as np
from tqdm import tqdm

from speaker_encoder import inference as encoder
|
|
|
|
|
|
|
|
|
|
|
|
|
def embed_utterance(fpaths, encoder_model_fpath):
    """Compute the speaker embedding for one utterance and save it to disk.

    :param fpaths: a (wav_fpath, embed_fpath) pair — the preprocessed audio
        .npy file to read and the destination path for the embedding
    :param encoder_model_fpath: path to the speaker encoder model weights
    """
    # Each Pool worker process has its own module state, so the encoder is
    # loaded lazily on the first call inside every worker.
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    wav_fpath, embed_fpath = fpaths
    waveform = np.load(wav_fpath)
    waveform = encoder.preprocess_wav(waveform)
    embedding = encoder.embed_utterance(waveform)
    np.save(embed_fpath, embedding, allow_pickle=False)
|
|
|
|
|
def create_embeddings(outdir_root: Path, wav_dir: Path, encoder_model_fpath: Path, n_processes: int):
    """Compute speaker embeddings for every utterance listed in train.txt.

    Reads <outdir_root>/train.txt, and for each row embeds the audio file
    found in *wav_dir*, writing the result into <outdir_root>/embeds.

    :param outdir_root: synthesizer output root containing train.txt; the
        "embeds" directory is created here
    :param wav_dir: directory holding the preprocessed .npy audio files
    :param encoder_model_fpath: path to the speaker encoder model weights
    :param n_processes: number of worker processes for the multiprocessing pool
    :raises FileNotFoundError: if *wav_dir* or train.txt does not exist
    """
    metadata_fpath = outdir_root.joinpath("train.txt")
    # Validate with explicit exceptions rather than assert — asserts are
    # stripped under `python -O` and would let a broken layout slip through.
    if not wav_dir.exists():
        raise FileNotFoundError(f"Audio directory not found: {wav_dir}")
    if not metadata_fpath.exists():
        raise FileNotFoundError(f"Metadata file not found: {metadata_fpath}")

    embed_dir = outdir_root.joinpath("embeds")
    embed_dir.mkdir(exist_ok=True)

    # train.txt rows are pipe-separated; field 0 is the audio filename and
    # field 2 is the embedding filename.
    with metadata_fpath.open("r") as metadata_file:
        metadata = [line.split("|") for line in metadata_file]
    fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]

    # Run the embedding jobs in parallel; the context manager guarantees the
    # pool is terminated even if an exception is raised mid-iteration.
    func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
    with Pool(n_processes) as pool:
        job = pool.imap(func, fpaths)
        # list() drains the lazy imap iterator so all tasks actually run.
        list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))