---
# Training / model configuration.
#
# NOTE(review): this document was restored from a version in which all line
# breaks and indentation had been lost. Keys, values and their order are
# unchanged; the nesting below was reconstructed from key order and from
# repeated key names (e.g. `type`, `in_channels`, `f0_condition`), which can
# only coexist in separate sub-mappings. Confirm the hierarchy against the
# code that consumes this file.

# --- Run / logging -----------------------------------------------------------
log_dir: "runs/run_mel_seed_uvit_xlsr_tiny"
save_freq: 1
log_interval: 10
save_interval: 500

# --- Training ----------------------------------------------------------------
device: "cuda"
epochs: 1000  # number of epochs for first stage training (pre-training)
batch_size: 2
batch_length: 100  # maximum duration of audio in a batch (in seconds)
max_len: 80  # maximum number of frames

# --- Checkpoints -------------------------------------------------------------
pretrained_model: "DiT_uvit_tat_xlsr_ema.pth"
pretrained_encoder: ""
# Set to true if you do not want to load epoch numbers and optimizer
# parameters.
load_only_params: false

# --- Audio preprocessing -----------------------------------------------------
preprocess_params:
  sr: 22050
  spect_params:
    n_fft: 1024
    win_length: 1024
    hop_length: 256
    n_mels: 80
    fmin: 0
    fmax: 8000

# --- Model -------------------------------------------------------------------
model_params:
  dit_type: "DiT"  # uDiT or DiT
  reg_loss_type: "l1"  # l1 or l2
  diffusion_type: "flow"

  timbre_shifter:
    se_db_path: "./modules/openvoice/checkpoints_v2/converter/se_db.pt"
    ckpt_path: './modules/openvoice/checkpoints_v2/converter'

  vocoder:
    type: "hifigan"

  speech_tokenizer:
    type: 'xlsr'
    output_layer: 12
    name: 'facebook/wav2vec2-xls-r-300m'

  style_encoder:
    dim: 192
    campplus_path: "campplus_cn_common.bin"

  length_regulator:
    channels: 384
    is_discrete: false
    in_channels: 1024
    content_codebook_size: 1024
    sampling_ratios: [1, 1, 1, 1]
    vector_quantize: false
    n_codebooks: 2
    quantizer_dropout: 0.0
    f0_condition: false
    n_f0_bins: 512

  DiT:
    hidden_dim: 384
    num_heads: 6
    depth: 9
    class_dropout_prob: 0.1
    block_size: 8192
    in_channels: 80
    style_condition: true
    final_layer_type: 'mlp'
    target: 'mel'  # mel or betavae
    content_dim: 384
    content_codebook_size: 1024
    content_type: 'discrete'
    f0_condition: false
    n_f0_bins: 512
    content_codebooks: 1
    is_causal: false
    long_skip_connection: false
    # For the prompt component, do not input the corresponding speech token.
    zero_prompt_speech_token: false
    time_as_token: true
    style_as_token: true
    uvit_skip_connection: true
    add_resblock_in_transformer: false

# --- Optimisation ------------------------------------------------------------
loss_params:
  base_lr: 0.0001