---
# Training configuration for a ProDiff TTS model with GST and x-vector
# speaker embeddings (see `tts`, `use_gst` and the `spembs` data entries).
#
# NOTE(review): this file was restored from a whitespace-collapsed copy.
# Nesting under the `*_conf` keys and inside the list-of-lists entries was
# reconstructed from key names and list markers -- verify against the
# generating run before relying on it.

# --- Run / logging ---------------------------------------------------------
config: conf/tuning/prodiff_gst_xvector_base.yaml
print_config: false
log_level: INFO
dry_run: false
iterator_type: sequence
output_dir: exp/tts_prodiff_gst_xvector_base_raw_phn_none
ngpu: 1
seed: 0
num_workers: 6
num_att_plot: 3
num_valid_artifacts: 5

# --- Distributed training --------------------------------------------------
dist_backend: nccl
dist_init_method: env://
dist_world_size: 2
dist_rank: 0
local_rank: 0
dist_master_addr: localhost
dist_master_port: 32945
dist_launcher: null
multiprocessing_distributed: true
unused_parameters: false
sharded_ddp: false
growth_interval: 0
min_grad_scale: -1

# --- cuDNN -----------------------------------------------------------------
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: true

# --- Trainer ---------------------------------------------------------------
collect_stats: false
write_collected_feats: false
max_epoch: 600
patience: null
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
best_model_criterion:
- - valid
  - loss
  - min
- - train
  - loss
  - min
keep_nbest_models: 5
nbest_averaging_interval: 0
grad_clip: 1.0
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: null
use_matplotlib: true
use_tensorboard: true
detect_anomaly: false

# --- Initialisation --------------------------------------------------------
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param: []

# --- Batching --------------------------------------------------------------
num_iters_per_epoch: 250
batch_size: 20
valid_batch_size: null
valid_num_batches: null
batch_bins: 6000000
valid_batch_bins: null
train_shape_file:
- exp/tts_stats_raw_phn_none/train/text_shape.phn
- exp/tts_stats_raw_phn_none/train/speech_shape
valid_shape_file:
- exp/tts_stats_raw_phn_none/valid/text_shape.phn
- exp/tts_stats_raw_phn_none/valid/speech_shape
batch_type: numel
valid_batch_type: null
fold_length:
- 150
- 240000
sort_in_batch: descending
sort_batch: descending
multiple_iterator: false
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024

# --- Data ------------------------------------------------------------------
# Each entry is a [path, name, type] triplet.
train_data_path_and_name_and_type:
- - dump/raw/tr_no_dev_phn/text
  - text
  - text
- - data/tr_no_dev_phn/durations
  - durations
  - text_int
- - dump/raw/tr_no_dev_phn/wav.scp
  - speech
  - sound
- - exp/tts_stats_raw_phn_none/train/collect_feats/pitch.scp
  - pitch
  - npy
- - exp/tts_stats_raw_phn_none/train/collect_feats/energy.scp
  - energy
  - npy
- - xvector/tr_no_dev_phn/xvector.scp
  - spembs
  - kaldi_ark
valid_data_path_and_name_and_type:
- - dump/raw/dev_phn/text
  - text
  - text
- - data/dev_phn/durations
  - durations
  - text_int
- - dump/raw/dev_phn/wav.scp
  - speech
  - sound
- - exp/tts_stats_raw_phn_none/valid/collect_feats/pitch.scp
  - pitch
  - npy
- - exp/tts_stats_raw_phn_none/valid/collect_feats/energy.scp
  - energy
  - npy
# The path below previously read "-xvector/dev_phn/xvector.scp" (missing
# space after the list marker); it now mirrors the train-side entry.
- - xvector/dev_phn/xvector.scp
  - spembs
  - kaldi_ark
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
valid_max_cache_size: null

# --- Optimiser / scheduler -------------------------------------------------
optim: adamw
optim_conf:
  lr: 1.0
  betas:
  - 0.9
  - 0.98
scheduler: noamlr
scheduler_conf:
  model_size: 384
  warmup_steps: 2000

# --- Tokens ----------------------------------------------------------------
# NOTE(review): the first two entries and the last entry were empty in the
# collapsed copy. They are restored here as <blank>, <unk> and <sos/eos> on
# the assumption that angle-bracket tokens were stripped -- confirm against
# the token list produced by the data-preparation stage.
# 'n', 'N' and 'y' are quoted so YAML 1.1 parsers cannot read them as
# booleans.
token_list:
- <blank>
- <unk>
- o
- a
- u
- i
- e
- k
- r
- t
- 'n'
- 、
- 'N'
- s
- sh
- d
- m
- g
- b
- w
- cl
- j
- ch
- sil
- h
- 'y'
- p
- ts
- z
- f
- ky
- U
- ny
- gy
- ry
- I
- hy
- my
- by
- py
- v
- <sos/eos>
odim: null

# --- Model wrapper / text front-end ----------------------------------------
# NOTE(review): `model_conf` had no inline value in the collapsed copy, so
# the key that followed it is taken to be its only child -- confirm.
model_conf:
  requires_word_duration: false
use_preprocessor: true
token_type: phn
bpemodel: null
non_linguistic_symbols: null
cleaner: null
g2p: null

# --- Acoustic features -----------------------------------------------------
feats_extract: fbank
feats_extract_conf:
  n_fft: 2048
  hop_length: 300
  win_length: 1200
  fs: 24000
  fmin: 80
  fmax: 7600
  n_mels: 80
normalize: global_mvn
normalize_conf:
  stats_file: stats/feats_stats.npz

# --- TTS model -------------------------------------------------------------
tts: prodiff
tts_conf:
  adim: 384
  aheads: 2
  elayers: 4
  eunits: 1536
  positionwise_layer_type: conv1d-linear
  positionwise_conv_kernel_size: 9
  use_masking: true
  use_scaled_pos_enc: true
  encoder_normalize_before: true
  reduction_factor: 1
  init_type: xavier_uniform
  init_enc_alpha: 1.0
  transformer_enc_dropout_rate: 0.05
  transformer_enc_positional_dropout_rate: 0.05
  transformer_enc_attn_dropout_rate: 0.05
  duration_predictor_layers: 2
  duration_predictor_chans: 512
  duration_predictor_kernel_size: 3
  pitch_predictor_layers: 2
  pitch_predictor_chans: 512
  pitch_predictor_kernel_size: 3
  pitch_predictor_dropout: 0.5
  pitch_embed_kernel_size: 1
  pitch_embed_dropout: 0.0
  stop_gradient_from_pitch_predictor: true
  energy_predictor_layers: 2
  energy_predictor_chans: 512
  energy_predictor_kernel_size: 3
  energy_predictor_dropout: 0.5
  energy_embed_kernel_size: 1
  energy_embed_dropout: 0.0
  stop_gradient_from_energy_predictor: false
  spks: -1
  spk_embed_dim: 192
  denoiser_layers: 20
  denoiser_channels: 256
  diffusion_steps: 4
  diffusion_timescale: 1
  diffusion_beta: 40.0
  diffusion_scheduler: vpsde
  diffusion_cycle_ln: 1
  use_gst: true
  gst_heads: 8
  gst_tokens: 256

# --- Pitch features --------------------------------------------------------
pitch_extract: dio
pitch_extract_conf:
  fs: 24000
  n_fft: 2048
  hop_length: 300
  f0max: 400
  f0min: 80
  reduction_factor: 1
pitch_normalize: global_mvn
pitch_normalize_conf:
  stats_file: stats/pitch_stats.npz

# --- Energy features -------------------------------------------------------
energy_extract: energy
energy_extract_conf:
  fs: 24000
  n_fft: 2048
  hop_length: 300
  win_length: 1200
  reduction_factor: 1
energy_normalize: global_mvn
energy_normalize_conf:
  stats_file: stats/energy_stats.npz

# --- Bookkeeping -----------------------------------------------------------
required:
- output_dir
- token_list
version: '202207'
distributed: true