# python3 -m espnet2.bin.gan_tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type char --token_list dump/token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train_nodev/text,text,text --train_data_path_and_name_and_type dump/raw/train_nodev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/train_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/train_dev/wav.scp,speech,sound --train_shape_file exp-vits-lr-3e-4/tts_stats_raw_char/logdir/train.17.scp --valid_shape_file exp-vits-lr-3e-4/tts_stats_raw_char/logdir/valid.17.scp --output_dir exp-vits-lr-3e-4/tts_stats_raw_char/logdir/stats.17 --config conf/train_vits.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null --train_data_path_and_name_and_type dump/raw/train_nodev/utt2sid,sids,text_int --valid_data_path_and_name_and_type dump/raw/train_dev/utt2sid,sids,text_int --use_wandb true --wandb_project GROTTS --wandb_name VITS_lr_3.0e-4 --init_param downloads/espnet/kan-bayashi_ljspeech_vits/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/train.total_count.ave_10best.pth:tts:tts:tts.generator.text_encoder,tts.generator.posterior_encoder.input_conv --batch_size 40 --batch_bins 10000000 # Started at Fri Dec 1 15:58:50 UTC 2023 # /data2/p280965/tts/espnet/tools/venv/bin/python3 /data2/p280965/tts/espnet/espnet2/bin/gan_tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type char --token_list dump/token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train_nodev/text,text,text --train_data_path_and_name_and_type dump/raw/train_nodev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/train_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/train_dev/wav.scp,speech,sound --train_shape_file exp-vits-lr-3e-4/tts_stats_raw_char/logdir/train.17.scp --valid_shape_file exp-vits-lr-3e-4/tts_stats_raw_char/logdir/valid.17.scp --output_dir exp-vits-lr-3e-4/tts_stats_raw_char/logdir/stats.17 --config conf/train_vits.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null --train_data_path_and_name_and_type dump/raw/train_nodev/utt2sid,sids,text_int --valid_data_path_and_name_and_type dump/raw/train_dev/utt2sid,sids,text_int --use_wandb true --wandb_project GROTTS --wandb_name VITS_lr_3.0e-4 --init_param downloads/espnet/kan-bayashi_ljspeech_vits/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/train.total_count.ave_10best.pth:tts:tts:tts.generator.text_encoder,tts.generator.posterior_encoder.input_conv --batch_size 40 --batch_bins 10000000 [wieling-3-a100] 2023-12-01 15:58:56,542 (gan_tts:293) INFO: Vocabulary size: 46 [wieling-3-a100] 2023-12-01 15:58:56,656 (encoder:174) INFO: encoder self-attention layer type = relative self-attention /data2/p280965/tts/espnet/tools/venv/lib/python3.9/site-packages/torch/nn/utils/weight_norm.py:30: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm. warnings.warn("torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.") /data2/p280965/tts/espnet/espnet2/gan_tts/vits/monotonic_align/__init__.py:19: UserWarning: Cython version is not available. Fallback to 'EXPERIMETAL' numba version. If you want to use the cython version, please build it as follows: `cd espnet2/gan_tts/vits/monotonic_align; python setup.py build_ext --inplace` warnings.warn( [wieling-3-a100] 2023-12-01 15:58:57,906 (abs_task:1268) INFO: pytorch.version=2.1.0+cu121, cuda.available=True, cudnn.version=8902, cudnn.benchmark=False, cudnn.deterministic=False [wieling-3-a100] 2023-12-01 15:58:57,921 (abs_task:1269) INFO: Model structure: ESPnetGANTTSModel( (feats_extract): LogMelFbank( (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False) ) (tts): VITS( (generator): VITSGenerator( (text_encoder): TextEncoder( (emb): Embedding(46, 192) (encoder): Encoder( (embed): Sequential( (0): RelPositionalEncoding( (dropout): Dropout(p=0.0, inplace=False) ) ) (encoders): MultiSequential( (0): EncoderLayer( (self_attn): RelPositionMultiHeadedAttention( (linear_q): Linear(in_features=192, out_features=192, bias=True) (linear_k): Linear(in_features=192, out_features=192, bias=True) (linear_v): Linear(in_features=192, out_features=192, bias=True) (linear_out): Linear(in_features=192, out_features=192, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear_pos): Linear(in_features=192, out_features=192, bias=False) ) (feed_forward): MultiLayeredConv1d( (w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,)) (w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,)) (dropout): Dropout(p=0.1, inplace=False) ) (feed_forward_macaron): MultiLayeredConv1d( (w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,)) (w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,)) (dropout): Dropout(p=0.1, inplace=False) ) (norm_ff): LayerNorm((192,), eps=1e-12, elementwise_affine=True) (norm_mha): LayerNorm((192,), eps=1e-12, elementwise_affine=True) (norm_ff_macaron): LayerNorm((192,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (1): EncoderLayer( (self_attn): RelPositionMultiHeadedAttention( (linear_q): Linear(in_features=192, out_features=192, bias=True) (linear_k): Linear(in_features=192, out_features=192, bias=True) (linear_v): Linear(in_features=192, out_features=192, bias=True) (linear_out): Linear(in_features=192, out_features=192, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear_pos): Linear(in_features=192, out_features=192, bias=False) ) (feed_forward): MultiLayeredConv1d( (w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,)) (w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,)) (dropout): Dropout(p=0.1, inplace=False) ) (feed_forward_macaron): MultiLayeredConv1d( (w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,)) (w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,)) (dropout): Dropout(p=0.1, inplace=False) ) (norm_ff): LayerNorm((192,), eps=1e-12, elementwise_affine=True) (norm_mha): LayerNorm((192,), eps=1e-12, elementwise_affine=True) (norm_ff_macaron): LayerNorm((192,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (2): EncoderLayer( (self_attn): RelPositionMultiHeadedAttention( (linear_q): Linear(in_features=192, out_features=192, bias=True) (linear_k): Linear(in_features=192, out_features=192, bias=True) (linear_v): Linear(in_features=192, out_features=192, bias=True) (linear_out): Linear(in_features=192, out_features=192, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear_pos): Linear(in_features=192, out_features=192, bias=False) ) (feed_forward): MultiLayeredConv1d( (w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,)) (w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,)) (dropout): Dropout(p=0.1, inplace=False) ) (feed_forward_macaron): MultiLayeredConv1d( (w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,)) (w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,)) (dropout): Dropout(p=0.1, inplace=False) ) (norm_ff): LayerNorm((192,), eps=1e-12, elementwise_affine=True) (norm_mha): LayerNorm((192,), eps=1e-12, elementwise_affine=True) (norm_ff_macaron): LayerNorm((192,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (3): EncoderLayer( (self_attn): RelPositionMultiHeadedAttention( (linear_q): Linear(in_features=192, out_features=192, bias=True) (linear_k): Linear(in_features=192, out_features=192, bias=True) (linear_v): Linear(in_features=192, out_features=192, bias=True) (linear_out): Linear(in_features=192, out_features=192, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear_pos): Linear(in_features=192, out_features=192, bias=False) ) (feed_forward): MultiLayeredConv1d( (w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,)) (w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,)) (dropout): Dropout(p=0.1, inplace=False) ) (feed_forward_macaron): MultiLayeredConv1d( (w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,)) (w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,)) (dropout): Dropout(p=0.1, inplace=False) ) (norm_ff): LayerNorm((192,), eps=1e-12, elementwise_affine=True) (norm_mha): LayerNorm((192,), eps=1e-12, elementwise_affine=True) (norm_ff_macaron): LayerNorm((192,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (4): EncoderLayer( (self_attn): RelPositionMultiHeadedAttention( (linear_q): Linear(in_features=192, out_features=192, bias=True) (linear_k): Linear(in_features=192, out_features=192, bias=True) (linear_v): Linear(in_features=192, out_features=192, bias=True) (linear_out): Linear(in_features=192, out_features=192, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear_pos): Linear(in_features=192, out_features=192, bias=False) ) (feed_forward): MultiLayeredConv1d( (w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,)) (w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,)) (dropout): Dropout(p=0.1, inplace=False) ) (feed_forward_macaron): MultiLayeredConv1d( (w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,)) (w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,)) (dropout): Dropout(p=0.1, inplace=False) ) (norm_ff): LayerNorm((192,), eps=1e-12, elementwise_affine=True) (norm_mha): LayerNorm((192,), eps=1e-12, elementwise_affine=True) (norm_ff_macaron): LayerNorm((192,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (5): EncoderLayer( (self_attn): RelPositionMultiHeadedAttention( (linear_q): Linear(in_features=192, out_features=192, bias=True) (linear_k): Linear(in_features=192, out_features=192, bias=True) (linear_v): Linear(in_features=192, out_features=192, bias=True) (linear_out): Linear(in_features=192, out_features=192, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear_pos): Linear(in_features=192, out_features=192, bias=False) ) (feed_forward): MultiLayeredConv1d( (w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,)) (w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,)) (dropout): Dropout(p=0.1, inplace=False) ) (feed_forward_macaron): MultiLayeredConv1d( (w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,)) (w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,)) (dropout): Dropout(p=0.1, inplace=False) ) (norm_ff): LayerNorm((192,), eps=1e-12, elementwise_affine=True) (norm_mha): LayerNorm((192,), eps=1e-12, elementwise_affine=True) (norm_ff_macaron): LayerNorm((192,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (after_norm): LayerNorm((192,), eps=1e-12, elementwise_affine=True) ) (proj): Conv1d(192, 384, kernel_size=(1,), stride=(1,)) ) (decoder): HiFiGANGenerator( (input_conv): Conv1d(192, 512, kernel_size=(7,), stride=(1,), padding=(3,)) (upsamples): ModuleList( (0): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): ConvTranspose1d(512, 256, kernel_size=(16,), stride=(8,), padding=(4,)) ) (1): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): ConvTranspose1d(256, 128, kernel_size=(16,), stride=(8,), padding=(4,)) ) (2): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): ConvTranspose1d(128, 64, kernel_size=(4,), stride=(2,), padding=(1,)) ) (3): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): ConvTranspose1d(64, 32, kernel_size=(4,), stride=(2,), padding=(1,)) ) ) (blocks): ModuleList( (0): ResidualBlock( (convs1): ModuleList( (0): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) ) (1): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) ) (2): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) ) ) (convs2): ModuleList( (0-2): 3 x Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) ) ) ) (1): ResidualBlock( (convs1): ModuleList( (0): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) ) (1): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) ) (2): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) ) ) (convs2): ModuleList( (0-2): 3 x Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) ) ) ) (2): ResidualBlock( (convs1): ModuleList( (0): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) ) (1): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) ) (2): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) ) ) (convs2): ModuleList( (0-2): 3 x Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) ) ) ) (3): ResidualBlock( (convs1): ModuleList( (0): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) ) (1): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) ) (2): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) ) ) (convs2): ModuleList( (0-2): 3 x Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) ) ) ) (4): ResidualBlock( (convs1): ModuleList( (0): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) ) (1): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) ) (2): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) ) ) (convs2): ModuleList( (0-2): 3 x Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) ) ) ) (5): ResidualBlock( (convs1): ModuleList( (0): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) ) (1): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) ) (2): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) ) ) (convs2): ModuleList( (0-2): 3 x Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) ) ) ) (6): ResidualBlock( (convs1): ModuleList( (0): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) ) (1): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) ) (2): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) ) ) (convs2): ModuleList( (0-2): 3 x Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) ) ) ) (7): ResidualBlock( (convs1): ModuleList( (0): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) ) (1): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) ) (2): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) ) ) (convs2): ModuleList( (0-2): 3 x Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) ) ) ) (8): ResidualBlock( (convs1): ModuleList( (0): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) ) (1): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) ) (2): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) ) ) (convs2): ModuleList( (0-2): 3 x Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) ) ) ) (9): ResidualBlock( (convs1): ModuleList( (0): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) ) (1): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) ) (2): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) ) ) (convs2): ModuleList( (0-2): 3 x Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) ) ) ) (10): ResidualBlock( (convs1): ModuleList( (0): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) ) (1): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) ) (2): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) ) ) (convs2): ModuleList( (0-2): 3 x Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) ) ) ) (11): ResidualBlock( (convs1): ModuleList( (0): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) ) (1): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) ) (2): Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) ) ) (convs2): ModuleList( (0-2): 3 x Sequential( (0): LeakyReLU(negative_slope=0.1) (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) ) ) ) ) (output_conv): Sequential( (0): LeakyReLU(negative_slope=0.01) (1): Conv1d(32, 1, kernel_size=(7,), stride=(1,), padding=(3,)) (2): Tanh() ) (global_conv): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) ) (posterior_encoder): PosteriorEncoder( (input_conv): Conv1d(80, 192, kernel_size=(1,), stride=(1,)) (encoder): WaveNet( (conv_layers): ModuleList( (0-15): 16 x ResidualBlock( (conv): Conv1d(192, 384, kernel_size=(5,), stride=(1,), padding=(2,)) (conv1x1_glo): Conv1d1x1(256, 384, kernel_size=(1,), stride=(1,), bias=False) (conv1x1_out): Conv1d1x1(192, 384, kernel_size=(1,), stride=(1,)) ) ) ) (proj): Conv1d(192, 384, kernel_size=(1,), stride=(1,)) ) (flow): ResidualAffineCouplingBlock( (flows): ModuleList( (0): ResidualAffineCouplingLayer( (input_conv): Conv1d(96, 192, kernel_size=(1,), stride=(1,)) (encoder): WaveNet( (conv_layers): ModuleList( (0-3): 4 x ResidualBlock( (conv): Conv1d(192, 384, kernel_size=(5,), stride=(1,), padding=(2,)) (conv1x1_glo): Conv1d1x1(256, 384, kernel_size=(1,), stride=(1,), bias=False) (conv1x1_out): Conv1d1x1(192, 384, kernel_size=(1,), stride=(1,)) ) ) ) (proj): Conv1d(192, 96, kernel_size=(1,), stride=(1,)) ) (1): FlipFlow() (2): ResidualAffineCouplingLayer( (input_conv): Conv1d(96, 192, kernel_size=(1,), stride=(1,)) (encoder): WaveNet( (conv_layers): ModuleList( (0-3): 4 x ResidualBlock( (conv): Conv1d(192, 384, kernel_size=(5,), stride=(1,), padding=(2,)) (conv1x1_glo): Conv1d1x1(256, 384, kernel_size=(1,), stride=(1,), bias=False) (conv1x1_out): Conv1d1x1(192, 384, kernel_size=(1,), stride=(1,)) ) ) ) (proj): Conv1d(192, 96, kernel_size=(1,), stride=(1,)) ) (3): FlipFlow() (4): ResidualAffineCouplingLayer( (input_conv): Conv1d(96, 192, kernel_size=(1,), stride=(1,)) (encoder): WaveNet( (conv_layers): ModuleList( (0-3): 4 x ResidualBlock( (conv): Conv1d(192, 384, kernel_size=(5,), stride=(1,), padding=(2,)) (conv1x1_glo): Conv1d1x1(256, 384, kernel_size=(1,), stride=(1,), bias=False) (conv1x1_out): Conv1d1x1(192, 384, kernel_size=(1,), stride=(1,)) ) ) ) (proj): Conv1d(192, 96, kernel_size=(1,), stride=(1,)) ) (5): FlipFlow() (6): ResidualAffineCouplingLayer( (input_conv): Conv1d(96, 192, kernel_size=(1,), stride=(1,)) (encoder): WaveNet( (conv_layers): ModuleList( (0-3): 4 x ResidualBlock( (conv): Conv1d(192, 384, kernel_size=(5,), stride=(1,), padding=(2,)) (conv1x1_glo): Conv1d1x1(256, 384, kernel_size=(1,), stride=(1,), bias=False) (conv1x1_out): Conv1d1x1(192, 384, kernel_size=(1,), stride=(1,)) ) ) ) (proj): Conv1d(192, 96, kernel_size=(1,), stride=(1,)) ) (7): FlipFlow() ) ) (duration_predictor): StochasticDurationPredictor( (pre): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (dds): DilatedDepthSeparableConv( (convs): ModuleList( (0): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.5, inplace=False) ) (1): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.5, inplace=False) ) (2): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.5, inplace=False) ) ) ) (proj): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (log_flow): LogFlow() (flows): ModuleList( (0): ElementwiseAffineFlow() (1): ConvFlow( (input_conv): Conv1d(1, 192, kernel_size=(1,), stride=(1,)) (dds_conv): DilatedDepthSeparableConv( (convs): ModuleList( (0): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.0, inplace=False) ) (1): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.0, inplace=False) ) (2): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.0, inplace=False) ) ) ) (proj): Conv1d(192, 29, kernel_size=(1,), stride=(1,)) ) (2): FlipFlow() (3): ConvFlow( (input_conv): Conv1d(1, 192, kernel_size=(1,), stride=(1,)) (dds_conv): DilatedDepthSeparableConv( (convs): ModuleList( (0): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.0, inplace=False) ) (1): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.0, inplace=False) ) (2): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.0, inplace=False) ) ) ) (proj): Conv1d(192, 29, kernel_size=(1,), stride=(1,)) ) (4): FlipFlow() (5): ConvFlow( (input_conv): Conv1d(1, 192, kernel_size=(1,), stride=(1,)) (dds_conv): DilatedDepthSeparableConv( (convs): ModuleList( (0): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.0, inplace=False) ) (1): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.0, inplace=False) ) (2): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.0, inplace=False) ) ) ) (proj): Conv1d(192, 29, kernel_size=(1,), stride=(1,)) ) (6): FlipFlow() (7): ConvFlow( (input_conv): Conv1d(1, 192, kernel_size=(1,), stride=(1,)) (dds_conv): DilatedDepthSeparableConv( (convs): ModuleList( (0): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.0, inplace=False) ) (1): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.0, inplace=False) ) (2): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.0, inplace=False) ) ) ) (proj): Conv1d(192, 29, kernel_size=(1,), stride=(1,)) ) (8): FlipFlow() ) (post_pre): Conv1d(1, 192, kernel_size=(1,), stride=(1,)) (post_dds): DilatedDepthSeparableConv( (convs): ModuleList( (0): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.5, inplace=False) ) (1): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.5, inplace=False) ) (2): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.5, inplace=False) ) ) ) (post_proj): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (post_flows): ModuleList( (0): ElementwiseAffineFlow() (1): ConvFlow( (input_conv): Conv1d(1, 192, kernel_size=(1,), stride=(1,)) (dds_conv): DilatedDepthSeparableConv( (convs): ModuleList( (0): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.0, inplace=False) ) (1): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.0, inplace=False) ) (2): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.0, inplace=False) ) ) ) (proj): Conv1d(192, 29, kernel_size=(1,), stride=(1,)) ) (2): FlipFlow() (3): ConvFlow( (input_conv): Conv1d(1, 192, kernel_size=(1,), stride=(1,)) (dds_conv): DilatedDepthSeparableConv( (convs): ModuleList( (0): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.0, inplace=False) ) (1): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.0, inplace=False) ) (2): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.0, inplace=False) ) ) ) (proj): Conv1d(192, 29, kernel_size=(1,), stride=(1,)) ) (4): FlipFlow() (5): ConvFlow( (input_conv): Conv1d(1, 192, kernel_size=(1,), stride=(1,)) (dds_conv): DilatedDepthSeparableConv( (convs): ModuleList( (0): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.0, inplace=False) ) (1): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.0, inplace=False) ) (2): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.0, inplace=False) ) ) ) (proj): Conv1d(192, 29, kernel_size=(1,), stride=(1,)) ) (6): FlipFlow() (7): ConvFlow( (input_conv): Conv1d(1, 192, kernel_size=(1,), stride=(1,)) (dds_conv): DilatedDepthSeparableConv( (convs): ModuleList( (0): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.0, inplace=False) ) (1): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.0, inplace=False) ) (2): Sequential( (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192) (1): Transpose() (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (3): Transpose() (4): GELU(approximate='none') (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,)) (6): Transpose() (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (8): Transpose() (9): GELU(approximate='none') (10): Dropout(p=0.0, inplace=False) ) ) ) (proj): Conv1d(192, 29, kernel_size=(1,), stride=(1,)) ) (8): FlipFlow() ) (global_conv): Conv1d(256, 192, kernel_size=(1,), stride=(1,)) ) (global_emb): Embedding(4, 256) ) (discriminator): HiFiGANMultiScaleMultiPeriodDiscriminator( (msd): HiFiGANMultiScaleDiscriminator( (discriminators): ModuleList( (0): HiFiGANScaleDiscriminator( (layers): ModuleList( (0): Sequential( (0): Conv1d(1, 128, kernel_size=(15,), stride=(1,), padding=(7,)) (1): LeakyReLU(negative_slope=0.1) ) (1): Sequential( (0): Conv1d(128, 128, kernel_size=(41,), stride=(2,), padding=(20,), groups=4) (1): LeakyReLU(negative_slope=0.1) ) (2): Sequential( (0): Conv1d(128, 256, kernel_size=(41,), stride=(2,), padding=(20,), groups=16) (1): LeakyReLU(negative_slope=0.1) ) (3): Sequential( (0): Conv1d(256, 512, kernel_size=(41,), stride=(4,), padding=(20,), groups=16) (1): LeakyReLU(negative_slope=0.1) ) (4): Sequential( (0): Conv1d(512, 1024, kernel_size=(41,), stride=(4,), padding=(20,), groups=16) (1): LeakyReLU(negative_slope=0.1) ) (5): Sequential( (0): Conv1d(1024, 1024, kernel_size=(41,), stride=(1,), padding=(20,), groups=16) (1): LeakyReLU(negative_slope=0.1) ) (6): Sequential( (0): Conv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,)) (1): LeakyReLU(negative_slope=0.1) ) (7): Conv1d(1024, 1, kernel_size=(3,), stride=(1,), padding=(1,)) ) ) ) ) (mpd): HiFiGANMultiPeriodDiscriminator( (discriminators): ModuleList( (0-4): 5 x HiFiGANPeriodDiscriminator( (convs): ModuleList( (0): Sequential( (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) (1): LeakyReLU(negative_slope=0.1) ) (1): Sequential( (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) (1): LeakyReLU(negative_slope=0.1) ) (2): Sequential( (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) (1): LeakyReLU(negative_slope=0.1) ) (3): Sequential( (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) (1): LeakyReLU(negative_slope=0.1) ) (4): Sequential( (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) (1): LeakyReLU(negative_slope=0.1) ) ) (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) ) ) ) ) (generator_adv_loss): GeneratorAdversarialLoss() (discriminator_adv_loss): DiscriminatorAdversarialLoss() (feat_match_loss): FeatureMatchLoss() (mel_loss): MelSpectrogramLoss( (wav_to_mel): LogMelFbank( (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=0, fmax=11025.0, htk=False) ) ) (kl_loss): KLDivergenceLoss() ) ) Model summary: Class Name: ESPnetGANTTSModel Total Number of model parameters: 96.24 M Number of trainable parameters: 96.24 M (100.0%) Size: 384.96 MB Type: torch.float32 [wieling-3-a100] 2023-12-01 15:58:57,921 (abs_task:1272) INFO: Optimizer: AdamW ( Parameter Group 0 amsgrad: False betas: [0.8, 0.99] capturable: False differentiable: False eps: 1e-09 foreach: None fused: None initial_lr: 0.0003 lr: 0.0003 maximize: False weight_decay: 0.0 ) [wieling-3-a100] 2023-12-01 15:58:57,921 (abs_task:1273) INFO: Scheduler: [wieling-3-a100] 2023-12-01 15:58:57,922 (abs_task:1272) INFO: Optimizer2: AdamW ( Parameter Group 0 amsgrad: False betas: [0.8, 0.99] capturable: False differentiable: False eps: 1e-09 foreach: None fused: None initial_lr: 0.0003 lr: 0.0003 maximize: False weight_decay: 0.0 ) [wieling-3-a100] 2023-12-01 15:58:57,922 (abs_task:1273) INFO: Scheduler2: [wieling-3-a100] 2023-12-01 15:58:57,922 (abs_task:1282) INFO: Saving the configuration in exp-vits-lr-3e-4/tts_stats_raw_char/logdir/stats.17/config.yaml [wieling-3-a100] 2023-12-01 15:58:57,940 (abs_task:1293) INFO: Namespace(config='conf/train_vits.yaml', print_config=False, log_level='INFO', drop_last_iter=False, dry_run=False, iterator_type='sequence', valid_iterator_type=None, output_dir='exp-vits-lr-3e-4/tts_stats_raw_char/logdir/stats.17', ngpu=0, seed=67823, num_workers=4, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=True, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=False, collect_stats=True, write_collected_feats=False, max_epoch=1000, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['train', 'total_count', 'max']], keep_nbest_models=10, nbest_averaging_interval=0, grad_clip=-1, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=50, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=True, wandb_project='GROTTS', wandb_id=None, wandb_entity=None, wandb_name='VITS_lr_3.0e-4', wandb_model_log_interval=-1, detect_anomaly=False, use_lora=False, save_lora_only=True, lora_conf={}, pretrain_path=None, init_param=['downloads/espnet/kan-bayashi_ljspeech_vits/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/train.total_count.ave_10best.pth:tts:tts:tts.generator.text_encoder,tts.generator.posterior_encoder.input_conv'], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=1000, batch_size=40, valid_batch_size=None, batch_bins=10000000, valid_batch_bins=None, train_shape_file=['exp-vits-lr-3e-4/tts_stats_raw_char/logdir/train.17.scp'], valid_shape_file=['exp-vits-lr-3e-4/tts_stats_raw_char/logdir/valid.17.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', shuffle_within_batch=False, sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], chunk_default_fs=None, train_data_path_and_name_and_type=[('dump/raw/train_nodev/text', 'text', 'text'), ('dump/raw/train_nodev/wav.scp', 'speech', 'sound'), ('dump/raw/train_nodev/utt2sid', 'sids', 'text_int')], valid_data_path_and_name_and_type=[('dump/raw/train_dev/text', 'text', 'text'), ('dump/raw/train_dev/wav.scp', 'speech', 'sound'), ('dump/raw/train_dev/utt2sid', 'sids', 'text_int')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, allow_multi_rates=False, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adamw', optim_conf={'lr': 0.0003, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, scheduler='exponentiallr', scheduler_conf={'gamma': 0.999875}, optim2='adamw', optim2_conf={'lr': 0.0003, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, scheduler2='exponentiallr', scheduler2_conf={'gamma': 0.999875}, generator_first=False, token_list=['', '', '', 'e', 'n', 'a', 'o', 't', 'i', 'r', 'd', 's', 'k', 'l', 'm', 'u', 'g', 'h', 'w', 'v', '.', 'z', 'b', 'p', ',', 'j', 'c', 'f', '‘', '’', ':', '?', 'ö', "'", '!', '-', ';', 'ò', 'è', 'ì', 'é', 'y', 'ë', 'x', 'q', ''], odim=None, model_conf={}, use_preprocessor=True, token_type='char', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='vits', tts_conf={'generator_type': 'vits_generator', 'generator_params': {'hidden_channels': 192, 'spks': 4, 'global_channels': 256, 'segment_size': 32, 'text_encoder_attention_heads': 2, 'text_encoder_ffn_expand': 4, 'text_encoder_blocks': 6, 'text_encoder_positionwise_layer_type': 'conv1d', 'text_encoder_positionwise_conv_kernel_size': 3, 'text_encoder_positional_encoding_layer_type': 'rel_pos', 'text_encoder_self_attention_layer_type': 'rel_selfattn', 'text_encoder_activation_type': 'swish', 'text_encoder_normalize_before': True, 'text_encoder_dropout_rate': 0.1, 'text_encoder_positional_dropout_rate': 0.0, 'text_encoder_attention_dropout_rate': 0.1, 'use_macaron_style_in_text_encoder': True, 'use_conformer_conv_in_text_encoder': False, 'text_encoder_conformer_kernel_size': -1, 'decoder_kernel_size': 7, 'decoder_channels': 512, 'decoder_upsample_scales': [8, 8, 2, 2], 'decoder_upsample_kernel_sizes': [16, 16, 4, 4], 'decoder_resblock_kernel_sizes': [3, 7, 11], 'decoder_resblock_dilations': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'use_weight_norm_in_decoder': True, 'posterior_encoder_kernel_size': 5, 'posterior_encoder_layers': 16, 'posterior_encoder_stacks': 1, 'posterior_encoder_base_dilation': 1, 'posterior_encoder_dropout_rate': 0.0, 'use_weight_norm_in_posterior_encoder': True, 'flow_flows': 4, 'flow_kernel_size': 5, 'flow_base_dilation': 1, 'flow_layers': 4, 'flow_dropout_rate': 0.0, 'use_weight_norm_in_flow': True, 'use_only_mean_in_flow': True, 'stochastic_duration_predictor_kernel_size': 3, 'stochastic_duration_predictor_dropout_rate': 0.5, 'stochastic_duration_predictor_flows': 4, 'stochastic_duration_predictor_dds_conv_layers': 3, 'vocabs': 46, 'aux_channels': 80}, 'discriminator_type': 'hifigan_multi_scale_multi_period_discriminator', 'discriminator_params': {'scales': 1, 'scale_downsample_pooling': 'AvgPool1d', 'scale_downsample_pooling_params': {'kernel_size': 4, 'stride': 2, 'padding': 2}, 'scale_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [15, 41, 5, 3], 'channels': 128, 'max_downsample_channels': 1024, 'max_groups': 16, 'bias': True, 'downsample_scales': [2, 2, 4, 4, 1], 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': False, 'use_spectral_norm': False}, 'follow_official_norm': False, 'periods': [2, 3, 5, 7, 11], 'period_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [5, 3], 'channels': 32, 'downsample_scales': [3, 3, 3, 3, 1], 'max_downsample_channels': 1024, 'bias': True, 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}}, 'generator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'discriminator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'feat_match_loss_params': {'average_by_discriminators': False, 'average_by_layers': False, 'include_final_outputs': True}, 'mel_loss_params': {'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'window': 'hann', 'n_mels': 80, 'fmin': 0, 'fmax': None, 'log_base': None}, 'lambda_adv': 1.0, 'lambda_mel': 45.0, 'lambda_feat_match': 2.0, 'lambda_dur': 1.0, 'lambda_kl': 1.0, 'sampling_rate': 22050, 'cache_generator_outputs': True}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202310', distributed=False) # Accounting: time=18 threads=1 # Ended (code 0) at Fri Dec 1 15:59:08 UTC 2023, elapsed time 18 seconds