---
# Configuration with three top-level sections: general, preprocess, train.
# Booleans are written canonically (true/false); string values stay quoted.

# Stage selection, corpus type, input/output paths and feature options.
general:
  stage: "pretrain"
  corpus_type: "multi-unseen"  # one of: single, multi-seen, multi-unseen
  source_path: "./data/jvs_22k-low"
  aux_path: "./data/jvs_22k"
  preprocessed_path: "./preprocessed/jvs"
  output_path: "./output/vocfeats/pretrain"
  test_wav_path: null
  feature_type: "vocfeats"
  hifigan_path: "./hifigan/hifigan_jvs_40d_600k"
  power_norm: true
  use_gst: false

# Data split sizes and signal-analysis parameters.
# NOTE(review): units (samples vs. seconds, Hz) are not stated in this file;
# confirm against the code that reads these keys.
preprocess:
  n_train: 90
  n_val: 5
  n_test: 5
  sampling_rate: 22050
  frame_length: 1024
  frame_shift: 256
  fft_length: 1024
  fmin: 0
  fmax: 8000
  n_mels: 80
  cep_order: 40
  f0_extractor: "dio"
  comp_factor: 1.0
  min_magnitude: 0.00001
  max_wav_value: 32768.0
  segment_length: 2

# Optimisation settings, logging, checkpoint loading and loss options.
train:
  batchsize: 8
  epoch: 50
  alpha: 0.1
  augment: true
  multi_gpu_mode: false
  num_workers: 4
  learning_rate: 0.005
  grad_clip_thresh: 1.0
  logger_step: 1000
  load_pretrained: false
  pretrained_path: null
  early_stopping: false
  # NOTE(review): the two loss sub-sections below are assumed to be nested
  # under `train` (they follow the train keys with no other section header);
  # confirm against the consumer's lookups.
  multi_scale_loss:
    use_linear: true
    gamma: 1.0
  feature_loss:
    type: "mae"