// NOTE: this file contains `//` comments and trailing commas, so it must be
// loaded with a JSON5/JSONC-capable parser rather than a strict JSON parser.
{
  "base_config": "config/vits.json",
  "model_type": "VITS",
  "dataset": [
    "libritts"
  ],
  "dataset_path": {
    // TODO: Fill in your dataset path
    "libritts": "/mnt/workspace/xueliumeng/data/libritts/raw/LibriTTS"
  },
  // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts"
  "log_dir": "/mnt/workspace/wangmingxuan/vits_on_libritts_hifitts/logs",
  "preprocess": {
    "extract_audio": true,
    "use_phone": true,
    // Linguistic features
    "extract_phone": true,
    // Options: "espeak", "pypinyin", "pypinyin_initials_finals", "lexicon"
    // (only for language=en-us right now)
    "phone_extractor": "espeak",
    // TODO: Fill in the output data path. The default value is "Amphion/data"
    "processed_dir": "/mnt/workspace/wangmingxuan/vits_on_libritts_hifitts/processed_data",
    "sample_rate": 24000,
    "train_file": "train_seen_spk_5k_cleaned.json",
    // Validation set
    "valid_file": "valid_100.json",
    // true: use speaker id for multi-speaker dataset
    "use_spkid": true,
  },
  "train": {
    "batch_size": 16,
    // true: train a multi-speaker model; false: train a single-speaker model
    "multi_speaker_training": true,
    // Number of speakers; will be set automatically if n_speakers is 0 and
    // multi_speaker_training is true.
    // "n_speakers": 2500,
  }
}