Spaces:
Build error
Build error
accum_grad: 1
allow_variable_data_keys: false
batch_bins: 5000000
batch_size: 20
batch_type: numel
best_model_criterion:
- - train
  - total_count
  - max
bpemodel: null
chunk_length: 500
chunk_shift_ratio: 0.5
cleaner: tacotron
collect_stats: false
config: ./conf/tuning/train_xvector_vits.yaml
cudnn_benchmark: false
cudnn_deterministic: false
cudnn_enabled: true
detect_anomaly: false
dist_backend: nccl
dist_init_method: env://
dist_launcher: null
dist_master_addr: localhost
dist_master_port: 60056
dist_rank: 0
dist_world_size: 4
distributed: true
dry_run: false
early_stopping_criterion:
- valid
- loss
- min
energy_extract: null
energy_extract_conf: {}
energy_normalize: null
energy_normalize_conf: {}
feats_extract: linear_spectrogram
feats_extract_conf:
  hop_length: 256
  n_fft: 1024
  win_length: null
fold_length:
- 150
- 204800
freeze_param: []
g2p: g2p_en_no_space
generator_first: false
grad_clip: -1
grad_clip_type: 2.0
grad_noise: false
ignore_init_mismatch: false
init_param: []
iterator_type: sequence
keep_nbest_models: 10
local_rank: 0
log_interval: 50
log_level: INFO
max_cache_fd: 32
max_cache_size: 0.0
max_epoch: 100
model_conf: {}
multiple_iterator: false
multiprocessing_distributed: true
ngpu: 1
no_forward_run: false
non_linguistic_symbols: null
normalize: null
normalize_conf: {}
num_att_plot: 3
num_cache_chunks: 1024
num_iters_per_epoch: 10000
num_workers: 4
odim: null
optim: adamw
optim2: adamw
optim2_conf:
  betas:
  - 0.8
  - 0.99
  eps: 1.0e-09
  lr: 0.0002
  weight_decay: 0.0
optim_conf:
  betas:
  - 0.8
  - 0.99
  eps: 1.0e-09
  lr: 0.0002
  weight_decay: 0.0
output_dir: exp/tts_train_xvector_vits_raw_phn_tacotron_g2p_en_no_space
patience: null
pitch_extract: null
pitch_extract_conf: {}
pitch_normalize: null
pitch_normalize_conf: {}
pretrain_path: null
print_config: false
required:
- output_dir
- token_list
resume: true
scheduler: exponentiallr
scheduler2: exponentiallr
scheduler2_conf:
  gamma: 0.999875
scheduler_conf:
  gamma: 0.999875
seed: 777
sharded_ddp: false
sort_batch: descending
sort_in_batch: descending
token_list:
- <blank>
- <unk>
- AH0
- T
- N
- D
- S
- R
- L
- IH1
- DH
- M
- K
- Z
- EH1
- AE1
- IH0
- AH1
- W
- ','
- HH
- ER0
- P
- IY1
- V
- F
- B
- UW1
- AA1
- AY1
- AO1
- .
- EY1
- IY0
- OW1
- NG
- G
- SH
- Y
- AW1
- CH
- ER1
- UH1
- TH
- JH
- ''''
- '?'
- OW0
- EH2
- '!'
- IH2
- OY1
- EY2
- AY2
- EH0
- UW0
- AA2
- AE2
- OW2
- AO2
- AE0
- AH2
- ZH
- AA0
- UW2
- IY2
- AY0
- AO0
- AW2
- EY0
- UH2
- ER2
- AW0
- '...'
- UH0
- OY2
- . . .
- OY0
- . . . .
- ..
- . ...
- . .
- . . . . .
- .. ..
- '... .'
- <sos/eos>
token_type: phn
train_data_path_and_name_and_type:
- - dump/22k/raw/train-clean-460/text
  - text
  - text
- - dump/22k/raw/train-clean-460/wav.scp
  - speech
  - sound
- - dump/22k/xvector/train-clean-460/xvector.scp
  - spembs
  - kaldi_ark
train_dtype: float32
train_shape_file:
- exp/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en_no_space/train/text_shape.phn
- exp/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en_no_space/train/speech_shape
tts: vits
tts_conf:
  cache_generator_outputs: true
  discriminator_adv_loss_params:
    average_by_discriminators: false
    loss_type: mse
  discriminator_params:
    follow_official_norm: false
    period_discriminator_params:
      bias: true
      channels: 32
      downsample_scales:
      - 3
      - 3
      - 3
      - 3
      - 1
      in_channels: 1
      kernel_sizes:
      - 5
      - 3
      max_downsample_channels: 1024
      nonlinear_activation: LeakyReLU
      nonlinear_activation_params:
        negative_slope: 0.1
      out_channels: 1
      use_spectral_norm: false
      use_weight_norm: true
    periods:
    - 2
    - 3
    - 5
    - 7
    - 11
    scale_discriminator_params:
      bias: true
      channels: 128
      downsample_scales:
      - 2
      - 2
      - 4
      - 4
      - 1
      in_channels: 1
      kernel_sizes:
      - 15
      - 41
      - 5
      - 3
      max_downsample_channels: 1024
      max_groups: 16
      nonlinear_activation: LeakyReLU
      nonlinear_activation_params:
        negative_slope: 0.1
      out_channels: 1
      use_spectral_norm: false
      use_weight_norm: true
    scale_downsample_pooling: AvgPool1d
    scale_downsample_pooling_params:
      kernel_size: 4
      padding: 2
      stride: 2
    scales: 1
  discriminator_type: hifigan_multi_scale_multi_period_discriminator
  feat_match_loss_params:
    average_by_discriminators: false
    average_by_layers: false
    include_final_outputs: true
  generator_adv_loss_params:
    average_by_discriminators: false
    loss_type: mse
  generator_params:
    aux_channels: 513
    decoder_channels: 512
    decoder_kernel_size: 7
    decoder_resblock_dilations:
    - - 1
      - 3
      - 5
    - - 1
      - 3
      - 5
    - - 1
      - 3
      - 5
    decoder_resblock_kernel_sizes:
    - 3
    - 7
    - 11
    decoder_upsample_kernel_sizes:
    - 16
    - 16
    - 4
    - 4
    decoder_upsample_scales:
    - 8
    - 8
    - 2
    - 2
    flow_base_dilation: 1
    flow_dropout_rate: 0.0
    flow_flows: 4
    flow_kernel_size: 5
    flow_layers: 4
    global_channels: 256
    hidden_channels: 192
    posterior_encoder_base_dilation: 1
    posterior_encoder_dropout_rate: 0.0
    posterior_encoder_kernel_size: 5
    posterior_encoder_layers: 16
    posterior_encoder_stacks: 1
    segment_size: 32
    spk_embed_dim: 512
    spks: -1
    stochastic_duration_predictor_dds_conv_layers: 3
    stochastic_duration_predictor_dropout_rate: 0.5
    stochastic_duration_predictor_flows: 4
    stochastic_duration_predictor_kernel_size: 3
    text_encoder_activation_type: swish
    text_encoder_attention_dropout_rate: 0.1
    text_encoder_attention_heads: 2
    text_encoder_blocks: 6
    text_encoder_conformer_kernel_size: -1
    text_encoder_dropout_rate: 0.1
    text_encoder_ffn_expand: 4
    text_encoder_normalize_before: true
    text_encoder_positional_dropout_rate: 0.0
    text_encoder_positional_encoding_layer_type: rel_pos
    text_encoder_positionwise_conv_kernel_size: 3
    text_encoder_positionwise_layer_type: conv1d
    text_encoder_self_attention_layer_type: rel_selfattn
    use_conformer_conv_in_text_encoder: false
    use_macaron_style_in_text_encoder: true
    use_only_mean_in_flow: true
    use_weight_norm_in_decoder: true
    use_weight_norm_in_flow: true
    use_weight_norm_in_posterior_encoder: true
    vocabs: 86
  generator_type: vits_generator
  lambda_adv: 1.0
  lambda_dur: 1.0
  lambda_feat_match: 2.0
  lambda_kl: 1.0
  lambda_mel: 45.0
  mel_loss_params:
    fmax: null
    fmin: 0
    fs: 22050
    hop_length: 256
    log_base: null
    n_fft: 1024
    n_mels: 80
    win_length: null
    window: hann
  sampling_rate: 22050
unused_parameters: true
use_amp: false
use_preprocessor: true
use_tensorboard: true
use_wandb: false
val_scheduler_criterion:
- valid
- loss
valid_batch_bins: null
valid_batch_size: null
valid_batch_type: null
valid_data_path_and_name_and_type:
- - dump/22k/raw/dev-clean/text
  - text
  - text
- - dump/22k/raw/dev-clean/wav.scp
  - speech
  - sound
- - dump/22k/xvector/dev-clean/xvector.scp
  - spembs
  - kaldi_ark
valid_max_cache_size: null
valid_shape_file:
- exp/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en_no_space/valid/text_shape.phn
- exp/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en_no_space/valid/speech_shape
version: 0.10.3a2
wandb_entity: null
wandb_id: null
wandb_model_log_interval: -1
wandb_name: null
wandb_project: null
write_collected_feats: false