# python3 -m espnet2.bin.gan_tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type char --token_list dump/token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train_nodev/text,text,text --train_data_path_and_name_and_type dump/raw/train_nodev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/train_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/train_dev/wav.scp,speech,sound --train_shape_file exp-vits-lr-3e-4/tts_stats_raw_char/logdir/train.1.scp --valid_shape_file exp-vits-lr-3e-4/tts_stats_raw_char/logdir/valid.1.scp --output_dir exp-vits-lr-3e-4/tts_stats_raw_char/logdir/stats.1 --config conf/train_vits.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null --train_data_path_and_name_and_type dump/raw/train_nodev/utt2sid,sids,text_int --valid_data_path_and_name_and_type dump/raw/train_dev/utt2sid,sids,text_int --use_wandb true --wandb_project GROTTS --wandb_name VITS_lr_3.0e-4 --init_param downloads/espnet/kan-bayashi_ljspeech_vits/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/train.total_count.ave_10best.pth:tts:tts:tts.generator.text_encoder,tts.generator.posterior_encoder.input_conv --batch_size 40 --batch_bins 10000000 
# Started at Fri Dec  1 15:58:34 UTC 2023
#
/data2/p280965/tts/espnet/tools/venv/bin/python3 /data2/p280965/tts/espnet/espnet2/bin/gan_tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type char --token_list dump/token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train_nodev/text,text,text --train_data_path_and_name_and_type dump/raw/train_nodev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/train_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/train_dev/wav.scp,speech,sound --train_shape_file exp-vits-lr-3e-4/tts_stats_raw_char/logdir/train.1.scp --valid_shape_file exp-vits-lr-3e-4/tts_stats_raw_char/logdir/valid.1.scp --output_dir exp-vits-lr-3e-4/tts_stats_raw_char/logdir/stats.1 --config conf/train_vits.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null --train_data_path_and_name_and_type dump/raw/train_nodev/utt2sid,sids,text_int --valid_data_path_and_name_and_type dump/raw/train_dev/utt2sid,sids,text_int --use_wandb true --wandb_project GROTTS --wandb_name VITS_lr_3.0e-4 --init_param downloads/espnet/kan-bayashi_ljspeech_vits/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/train.total_count.ave_10best.pth:tts:tts:tts.generator.text_encoder,tts.generator.posterior_encoder.input_conv --batch_size 40 --batch_bins 10000000
[wieling-3-a100] 2023-12-01 15:58:40,398 (gan_tts:293) INFO: Vocabulary size: 46
[wieling-3-a100] 2023-12-01 15:58:40,545 (encoder:174) INFO: encoder self-attention layer type = relative self-attention
/data2/p280965/tts/espnet/tools/venv/lib/python3.9/site-packages/torch/nn/utils/weight_norm.py:30: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.
  warnings.warn("torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.")
/data2/p280965/tts/espnet/espnet2/gan_tts/vits/monotonic_align/__init__.py:19: UserWarning: Cython version is not available. Fallback to 'EXPERIMETAL' numba version. If you want to use the cython version, please build it as follows: `cd espnet2/gan_tts/vits/monotonic_align; python setup.py build_ext --inplace`
  warnings.warn(
[wieling-3-a100] 2023-12-01 15:58:41,774 (abs_task:1268) INFO: pytorch.version=2.1.0+cu121, cuda.available=True, cudnn.version=8902, cudnn.benchmark=False, cudnn.deterministic=False
[wieling-3-a100] 2023-12-01 15:58:41,789 (abs_task:1269) INFO: Model structure:
ESPnetGANTTSModel(
  (feats_extract): LogMelFbank(
    (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
    (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False)
  )
  (tts): VITS(
    (generator): VITSGenerator(
      (text_encoder): TextEncoder(
        (emb): Embedding(46, 192)
        (encoder): Encoder(
          (embed): Sequential(
            (0): RelPositionalEncoding(
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (encoders): MultiSequential(
            (0): EncoderLayer(
              (self_attn): RelPositionMultiHeadedAttention(
                (linear_q): Linear(in_features=192, out_features=192, bias=True)
                (linear_k): Linear(in_features=192, out_features=192, bias=True)
                (linear_v): Linear(in_features=192, out_features=192, bias=True)
                (linear_out): Linear(in_features=192, out_features=192, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
                (linear_pos): Linear(in_features=192, out_features=192, bias=False)
              )
              (feed_forward): MultiLayeredConv1d(
                (w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
                (w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (feed_forward_macaron): MultiLayeredConv1d(
                (w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
                (w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (norm_ff): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
              (norm_mha): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
              (norm_ff_macaron): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): EncoderLayer(
              (self_attn): RelPositionMultiHeadedAttention(
                (linear_q): Linear(in_features=192, out_features=192, bias=True)
                (linear_k): Linear(in_features=192, out_features=192, bias=True)
                (linear_v): Linear(in_features=192, out_features=192, bias=True)
                (linear_out): Linear(in_features=192, out_features=192, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
                (linear_pos): Linear(in_features=192, out_features=192, bias=False)
              )
              (feed_forward): MultiLayeredConv1d(
                (w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
                (w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (feed_forward_macaron): MultiLayeredConv1d(
                (w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
                (w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (norm_ff): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
              (norm_mha): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
              (norm_ff_macaron): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (2): EncoderLayer(
              (self_attn): RelPositionMultiHeadedAttention(
                (linear_q): Linear(in_features=192, out_features=192, bias=True)
                (linear_k): Linear(in_features=192, out_features=192, bias=True)
                (linear_v): Linear(in_features=192, out_features=192, bias=True)
                (linear_out): Linear(in_features=192, out_features=192, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
                (linear_pos): Linear(in_features=192, out_features=192, bias=False)
              )
              (feed_forward): MultiLayeredConv1d(
                (w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
                (w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (feed_forward_macaron): MultiLayeredConv1d(
                (w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
                (w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (norm_ff): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
              (norm_mha): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
              (norm_ff_macaron): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (3): EncoderLayer(
              (self_attn): RelPositionMultiHeadedAttention(
                (linear_q): Linear(in_features=192, out_features=192, bias=True)
                (linear_k): Linear(in_features=192, out_features=192, bias=True)
                (linear_v): Linear(in_features=192, out_features=192, bias=True)
                (linear_out): Linear(in_features=192, out_features=192, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
                (linear_pos): Linear(in_features=192, out_features=192, bias=False)
              )
              (feed_forward): MultiLayeredConv1d(
                (w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
                (w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (feed_forward_macaron): MultiLayeredConv1d(
                (w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
                (w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (norm_ff): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
              (norm_mha): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
              (norm_ff_macaron): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (4): EncoderLayer(
              (self_attn): RelPositionMultiHeadedAttention(
                (linear_q): Linear(in_features=192, out_features=192, bias=True)
                (linear_k): Linear(in_features=192, out_features=192, bias=True)
                (linear_v): Linear(in_features=192, out_features=192, bias=True)
                (linear_out): Linear(in_features=192, out_features=192, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
                (linear_pos): Linear(in_features=192, out_features=192, bias=False)
              )
              (feed_forward): MultiLayeredConv1d(
                (w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
                (w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (feed_forward_macaron): MultiLayeredConv1d(
                (w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
                (w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (norm_ff): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
              (norm_mha): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
              (norm_ff_macaron): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (5): EncoderLayer(
              (self_attn): RelPositionMultiHeadedAttention(
                (linear_q): Linear(in_features=192, out_features=192, bias=True)
                (linear_k): Linear(in_features=192, out_features=192, bias=True)
                (linear_v): Linear(in_features=192, out_features=192, bias=True)
                (linear_out): Linear(in_features=192, out_features=192, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
                (linear_pos): Linear(in_features=192, out_features=192, bias=False)
              )
              (feed_forward): MultiLayeredConv1d(
                (w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
                (w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (feed_forward_macaron): MultiLayeredConv1d(
                (w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
                (w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (norm_ff): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
              (norm_mha): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
              (norm_ff_macaron): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (after_norm): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
        )
        (proj): Conv1d(192, 384, kernel_size=(1,), stride=(1,))
      )
      (decoder): HiFiGANGenerator(
        (input_conv): Conv1d(192, 512, kernel_size=(7,), stride=(1,), padding=(3,))
        (upsamples): ModuleList(
          (0): Sequential(
            (0): LeakyReLU(negative_slope=0.1)
            (1): ConvTranspose1d(512, 256, kernel_size=(16,), stride=(8,), padding=(4,))
          )
          (1): Sequential(
            (0): LeakyReLU(negative_slope=0.1)
            (1): ConvTranspose1d(256, 128, kernel_size=(16,), stride=(8,), padding=(4,))
          )
          (2): Sequential(
            (0): LeakyReLU(negative_slope=0.1)
            (1): ConvTranspose1d(128, 64, kernel_size=(4,), stride=(2,), padding=(1,))
          )
          (3): Sequential(
            (0): LeakyReLU(negative_slope=0.1)
            (1): ConvTranspose1d(64, 32, kernel_size=(4,), stride=(2,), padding=(1,))
          )
        )
        (blocks): ModuleList(
          (0): ResidualBlock(
            (convs1): ModuleList(
              (0): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
              )
              (1): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
              )
              (2): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
              )
            )
            (convs2): ModuleList(
              (0-2): 3 x Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
              )
            )
          )
          (1): ResidualBlock(
            (convs1): ModuleList(
              (0): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,))
              )
              (1): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
              )
              (2): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
              )
            )
            (convs2): ModuleList(
              (0-2): 3 x Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,))
              )
            )
          )
          (2): ResidualBlock(
            (convs1): ModuleList(
              (0): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,))
              )
              (1): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
              )
              (2): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
              )
            )
            (convs2): ModuleList(
              (0-2): 3 x Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,))
              )
            )
          )
          (3): ResidualBlock(
            (convs1): ModuleList(
              (0): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
              )
              (1): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
              )
              (2): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
              )
            )
            (convs2): ModuleList(
              (0-2): 3 x Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
              )
            )
          )
          (4): ResidualBlock(
            (convs1): ModuleList(
              (0): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,))
              )
              (1): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
              )
              (2): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
              )
            )
            (convs2): ModuleList(
              (0-2): 3 x Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,))
              )
            )
          )
          (5): ResidualBlock(
            (convs1): ModuleList(
              (0): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,))
              )
              (1): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
              )
              (2): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
              )
            )
            (convs2): ModuleList(
              (0-2): 3 x Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,))
              )
            )
          )
          (6): ResidualBlock(
            (convs1): ModuleList(
              (0): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
              )
              (1): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
              )
              (2): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
              )
            )
            (convs2): ModuleList(
              (0-2): 3 x Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
              )
            )
          )
          (7): ResidualBlock(
            (convs1): ModuleList(
              (0): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,))
              )
              (1): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
              )
              (2): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
              )
            )
            (convs2): ModuleList(
              (0-2): 3 x Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,))
              )
            )
          )
          (8): ResidualBlock(
            (convs1): ModuleList(
              (0): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,))
              )
              (1): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
              )
              (2): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
              )
            )
            (convs2): ModuleList(
              (0-2): 3 x Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,))
              )
            )
          )
          (9): ResidualBlock(
            (convs1): ModuleList(
              (0): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,))
              )
              (1): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
              )
              (2): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
              )
            )
            (convs2): ModuleList(
              (0-2): 3 x Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,))
              )
            )
          )
          (10): ResidualBlock(
            (convs1): ModuleList(
              (0): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,))
              )
              (1): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
              )
              (2): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
              )
            )
            (convs2): ModuleList(
              (0-2): 3 x Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,))
              )
            )
          )
          (11): ResidualBlock(
            (convs1): ModuleList(
              (0): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,))
              )
              (1): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
              )
              (2): Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
              )
            )
            (convs2): ModuleList(
              (0-2): 3 x Sequential(
                (0): LeakyReLU(negative_slope=0.1)
                (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,))
              )
            )
          )
        )
        (output_conv): Sequential(
          (0): LeakyReLU(negative_slope=0.01)
          (1): Conv1d(32, 1, kernel_size=(7,), stride=(1,), padding=(3,))
          (2): Tanh()
        )
        (global_conv): Conv1d(256, 512, kernel_size=(1,), stride=(1,))
      )
      (posterior_encoder): PosteriorEncoder(
        (input_conv): Conv1d(80, 192, kernel_size=(1,), stride=(1,))
        (encoder): WaveNet(
          (conv_layers): ModuleList(
            (0-15): 16 x ResidualBlock(
              (conv): Conv1d(192, 384, kernel_size=(5,), stride=(1,), padding=(2,))
              (conv1x1_glo): Conv1d1x1(256, 384, kernel_size=(1,), stride=(1,), bias=False)
              (conv1x1_out): Conv1d1x1(192, 384, kernel_size=(1,), stride=(1,))
            )
          )
        )
        (proj): Conv1d(192, 384, kernel_size=(1,), stride=(1,))
      )
      (flow): ResidualAffineCouplingBlock(
        (flows): ModuleList(
          (0): ResidualAffineCouplingLayer(
            (input_conv): Conv1d(96, 192, kernel_size=(1,), stride=(1,))
            (encoder): WaveNet(
              (conv_layers): ModuleList(
                (0-3): 4 x ResidualBlock(
                  (conv): Conv1d(192, 384, kernel_size=(5,), stride=(1,), padding=(2,))
                  (conv1x1_glo): Conv1d1x1(256, 384, kernel_size=(1,), stride=(1,), bias=False)
                  (conv1x1_out): Conv1d1x1(192, 384, kernel_size=(1,), stride=(1,))
                )
              )
            )
            (proj): Conv1d(192, 96, kernel_size=(1,), stride=(1,))
          )
          (1): FlipFlow()
          (2): ResidualAffineCouplingLayer(
            (input_conv): Conv1d(96, 192, kernel_size=(1,), stride=(1,))
            (encoder): WaveNet(
              (conv_layers): ModuleList(
                (0-3): 4 x ResidualBlock(
                  (conv): Conv1d(192, 384, kernel_size=(5,), stride=(1,), padding=(2,))
                  (conv1x1_glo): Conv1d1x1(256, 384, kernel_size=(1,), stride=(1,), bias=False)
                  (conv1x1_out): Conv1d1x1(192, 384, kernel_size=(1,), stride=(1,))
                )
              )
            )
            (proj): Conv1d(192, 96, kernel_size=(1,), stride=(1,))
          )
          (3): FlipFlow()
          (4): ResidualAffineCouplingLayer(
            (input_conv): Conv1d(96, 192, kernel_size=(1,), stride=(1,))
            (encoder): WaveNet(
              (conv_layers): ModuleList(
                (0-3): 4 x ResidualBlock(
                  (conv): Conv1d(192, 384, kernel_size=(5,), stride=(1,), padding=(2,))
                  (conv1x1_glo): Conv1d1x1(256, 384, kernel_size=(1,), stride=(1,), bias=False)
                  (conv1x1_out): Conv1d1x1(192, 384, kernel_size=(1,), stride=(1,))
                )
              )
            )
            (proj): Conv1d(192, 96, kernel_size=(1,), stride=(1,))
          )
          (5): FlipFlow()
          (6): ResidualAffineCouplingLayer(
            (input_conv): Conv1d(96, 192, kernel_size=(1,), stride=(1,))
            (encoder): WaveNet(
              (conv_layers): ModuleList(
                (0-3): 4 x ResidualBlock(
                  (conv): Conv1d(192, 384, kernel_size=(5,), stride=(1,), padding=(2,))
                  (conv1x1_glo): Conv1d1x1(256, 384, kernel_size=(1,), stride=(1,), bias=False)
                  (conv1x1_out): Conv1d1x1(192, 384, kernel_size=(1,), stride=(1,))
                )
              )
            )
            (proj): Conv1d(192, 96, kernel_size=(1,), stride=(1,))
          )
          (7): FlipFlow()
        )
      )
      (duration_predictor): StochasticDurationPredictor(
        (pre): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
        (dds): DilatedDepthSeparableConv(
          (convs): ModuleList(
            (0): Sequential(
              (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192)
              (1): Transpose()
              (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
              (3): Transpose()
              (4): GELU(approximate='none')
              (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
              (6): Transpose()
              (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
              (8): Transpose()
              (9): GELU(approximate='none')
              (10): Dropout(p=0.5, inplace=False)
            )
            (1): Sequential(
              (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192)
              (1): Transpose()
              (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
              (3): Transpose()
              (4): GELU(approximate='none')
              (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
              (6): Transpose()
              (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
              (8): Transpose()
              (9): GELU(approximate='none')
              (10): Dropout(p=0.5, inplace=False)
            )
            (2): Sequential(
              (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192)
              (1): Transpose()
              (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
              (3): Transpose()
              (4): GELU(approximate='none')
              (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
              (6): Transpose()
              (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
              (8): Transpose()
              (9): GELU(approximate='none')
              (10): Dropout(p=0.5, inplace=False)
            )
          )
        )
        (proj): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
        (log_flow): LogFlow()
        (flows): ModuleList(
          (0): ElementwiseAffineFlow()
          (1): ConvFlow(
            (input_conv): Conv1d(1, 192, kernel_size=(1,), stride=(1,))
            (dds_conv): DilatedDepthSeparableConv(
              (convs): ModuleList(
                (0): Sequential(
                  (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192)
                  (1): Transpose()
                  (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (3): Transpose()
                  (4): GELU(approximate='none')
                  (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
                  (6): Transpose()
                  (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (8): Transpose()
                  (9): GELU(approximate='none')
                  (10): Dropout(p=0.0, inplace=False)
                )
                (1): Sequential(
                  (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192)
                  (1): Transpose()
                  (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (3): Transpose()
                  (4): GELU(approximate='none')
                  (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
                  (6): Transpose()
                  (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (8): Transpose()
                  (9): GELU(approximate='none')
                  (10): Dropout(p=0.0, inplace=False)
                )
                (2): Sequential(
                  (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192)
                  (1): Transpose()
                  (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (3): Transpose()
                  (4): GELU(approximate='none')
                  (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
                  (6): Transpose()
                  (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (8): Transpose()
                  (9): GELU(approximate='none')
                  (10): Dropout(p=0.0, inplace=False)
                )
              )
            )
            (proj): Conv1d(192, 29, kernel_size=(1,), stride=(1,))
          )
          (2): FlipFlow()
          (3): ConvFlow(
            (input_conv): Conv1d(1, 192, kernel_size=(1,), stride=(1,))
            (dds_conv): DilatedDepthSeparableConv(
              (convs): ModuleList(
                (0): Sequential(
                  (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192)
                  (1): Transpose()
                  (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (3): Transpose()
                  (4): GELU(approximate='none')
                  (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
                  (6): Transpose()
                  (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (8): Transpose()
                  (9): GELU(approximate='none')
                  (10): Dropout(p=0.0, inplace=False)
                )
                (1): Sequential(
                  (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192)
                  (1): Transpose()
                  (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (3): Transpose()
                  (4): GELU(approximate='none')
                  (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
                  (6): Transpose()
                  (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (8): Transpose()
                  (9): GELU(approximate='none')
                  (10): Dropout(p=0.0, inplace=False)
                )
                (2): Sequential(
                  (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192)
                  (1): Transpose()
                  (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (3): Transpose()
                  (4): GELU(approximate='none')
                  (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
                  (6): Transpose()
                  (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (8): Transpose()
                  (9): GELU(approximate='none')
                  (10): Dropout(p=0.0, inplace=False)
                )
              )
            )
            (proj): Conv1d(192, 29, kernel_size=(1,), stride=(1,))
          )
          (4): FlipFlow()
          (5): ConvFlow(
            (input_conv): Conv1d(1, 192, kernel_size=(1,), stride=(1,))
            (dds_conv): DilatedDepthSeparableConv(
              (convs): ModuleList(
                (0): Sequential(
                  (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192)
                  (1): Transpose()
                  (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (3): Transpose()
                  (4): GELU(approximate='none')
                  (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
                  (6): Transpose()
                  (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (8): Transpose()
                  (9): GELU(approximate='none')
                  (10): Dropout(p=0.0, inplace=False)
                )
                (1): Sequential(
                  (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192)
                  (1): Transpose()
                  (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (3): Transpose()
                  (4): GELU(approximate='none')
                  (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
                  (6): Transpose()
                  (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (8): Transpose()
                  (9): GELU(approximate='none')
                  (10): Dropout(p=0.0, inplace=False)
                )
                (2): Sequential(
                  (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192)
                  (1): Transpose()
                  (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (3): Transpose()
                  (4): GELU(approximate='none')
                  (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
                  (6): Transpose()
                  (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (8): Transpose()
                  (9): GELU(approximate='none')
                  (10): Dropout(p=0.0, inplace=False)
                )
              )
            )
            (proj): Conv1d(192, 29, kernel_size=(1,), stride=(1,))
          )
          (6): FlipFlow()
          (7): ConvFlow(
            (input_conv): Conv1d(1, 192, kernel_size=(1,), stride=(1,))
            (dds_conv): DilatedDepthSeparableConv(
              (convs): ModuleList(
                (0): Sequential(
                  (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192)
                  (1): Transpose()
                  (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (3): Transpose()
                  (4): GELU(approximate='none')
                  (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
                  (6): Transpose()
                  (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (8): Transpose()
                  (9): GELU(approximate='none')
                  (10): Dropout(p=0.0, inplace=False)
                )
                (1): Sequential(
                  (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192)
                  (1): Transpose()
                  (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (3): Transpose()
                  (4): GELU(approximate='none')
                  (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
                  (6): Transpose()
                  (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (8): Transpose()
                  (9): GELU(approximate='none')
                  (10): Dropout(p=0.0, inplace=False)
                )
                (2): Sequential(
                  (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192)
                  (1): Transpose()
                  (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (3): Transpose()
                  (4): GELU(approximate='none')
                  (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
                  (6): Transpose()
                  (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (8): Transpose()
                  (9): GELU(approximate='none')
                  (10): Dropout(p=0.0, inplace=False)
                )
              )
            )
            (proj): Conv1d(192, 29, kernel_size=(1,), stride=(1,))
          )
          (8): FlipFlow()
        )
        (post_pre): Conv1d(1, 192, kernel_size=(1,), stride=(1,))
        (post_dds): DilatedDepthSeparableConv(
          (convs): ModuleList(
            (0): Sequential(
              (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192)
              (1): Transpose()
              (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
              (3): Transpose()
              (4): GELU(approximate='none')
              (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
              (6): Transpose()
              (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
              (8): Transpose()
              (9): GELU(approximate='none')
              (10): Dropout(p=0.5, inplace=False)
            )
            (1): Sequential(
              (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192)
              (1): Transpose()
              (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
              (3): Transpose()
              (4): GELU(approximate='none')
              (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
              (6): Transpose()
              (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
              (8): Transpose()
              (9): GELU(approximate='none')
              (10): Dropout(p=0.5, inplace=False)
            )
            (2): Sequential(
              (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192)
              (1): Transpose()
              (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
              (3): Transpose()
              (4): GELU(approximate='none')
              (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
              (6): Transpose()
              (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
              (8): Transpose()
              (9): GELU(approximate='none')
              (10): Dropout(p=0.5, inplace=False)
            )
          )
        )
        (post_proj): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
        (post_flows): ModuleList(
          (0): ElementwiseAffineFlow()
          (1): ConvFlow(
            (input_conv): Conv1d(1, 192, kernel_size=(1,), stride=(1,))
            (dds_conv): DilatedDepthSeparableConv(
              (convs): ModuleList(
                (0): Sequential(
                  (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192)
                  (1): Transpose()
                  (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (3): Transpose()
                  (4): GELU(approximate='none')
                  (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
                  (6): Transpose()
                  (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (8): Transpose()
                  (9): GELU(approximate='none')
                  (10): Dropout(p=0.0, inplace=False)
                )
                (1): Sequential(
                  (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192)
                  (1): Transpose()
                  (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (3): Transpose()
                  (4): GELU(approximate='none')
                  (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
                  (6): Transpose()
                  (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (8): Transpose()
                  (9): GELU(approximate='none')
                  (10): Dropout(p=0.0, inplace=False)
                )
                (2): Sequential(
                  (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192)
                  (1): Transpose()
                  (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (3): Transpose()
                  (4): GELU(approximate='none')
                  (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
                  (6): Transpose()
                  (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (8): Transpose()
                  (9): GELU(approximate='none')
                  (10): Dropout(p=0.0, inplace=False)
                )
              )
            )
            (proj): Conv1d(192, 29, kernel_size=(1,), stride=(1,))
          )
          (2): FlipFlow()
          (3): ConvFlow(
            (input_conv): Conv1d(1, 192, kernel_size=(1,), stride=(1,))
            (dds_conv): DilatedDepthSeparableConv(
              (convs): ModuleList(
                (0): Sequential(
                  (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192)
                  (1): Transpose()
                  (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (3): Transpose()
                  (4): GELU(approximate='none')
                  (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
                  (6): Transpose()
                  (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (8): Transpose()
                  (9): GELU(approximate='none')
                  (10): Dropout(p=0.0, inplace=False)
                )
                (1): Sequential(
                  (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192)
                  (1): Transpose()
                  (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (3): Transpose()
                  (4): GELU(approximate='none')
                  (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
                  (6): Transpose()
                  (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (8): Transpose()
                  (9): GELU(approximate='none')
                  (10): Dropout(p=0.0, inplace=False)
                )
                (2): Sequential(
                  (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192)
                  (1): Transpose()
                  (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (3): Transpose()
                  (4): GELU(approximate='none')
                  (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
                  (6): Transpose()
                  (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (8): Transpose()
                  (9): GELU(approximate='none')
                  (10): Dropout(p=0.0, inplace=False)
                )
              )
            )
            (proj): Conv1d(192, 29, kernel_size=(1,), stride=(1,))
          )
          (4): FlipFlow()
          (5): ConvFlow(
            (input_conv): Conv1d(1, 192, kernel_size=(1,), stride=(1,))
            (dds_conv): DilatedDepthSeparableConv(
              (convs): ModuleList(
                (0): Sequential(
                  (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192)
                  (1): Transpose()
                  (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (3): Transpose()
                  (4): GELU(approximate='none')
                  (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
                  (6): Transpose()
                  (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (8): Transpose()
                  (9): GELU(approximate='none')
                  (10): Dropout(p=0.0, inplace=False)
                )
                (1): Sequential(
                  (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192)
                  (1): Transpose()
                  (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (3): Transpose()
                  (4): GELU(approximate='none')
                  (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
                  (6): Transpose()
                  (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (8): Transpose()
                  (9): GELU(approximate='none')
                  (10): Dropout(p=0.0, inplace=False)
                )
                (2): Sequential(
                  (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192)
                  (1): Transpose()
                  (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (3): Transpose()
                  (4): GELU(approximate='none')
                  (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
                  (6): Transpose()
                  (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (8): Transpose()
                  (9): GELU(approximate='none')
                  (10): Dropout(p=0.0, inplace=False)
                )
              )
            )
            (proj): Conv1d(192, 29, kernel_size=(1,), stride=(1,))
          )
          (6): FlipFlow()
          (7): ConvFlow(
            (input_conv): Conv1d(1, 192, kernel_size=(1,), stride=(1,))
            (dds_conv): DilatedDepthSeparableConv(
              (convs): ModuleList(
                (0): Sequential(
                  (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192)
                  (1): Transpose()
                  (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (3): Transpose()
                  (4): GELU(approximate='none')
                  (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
                  (6): Transpose()
                  (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (8): Transpose()
                  (9): GELU(approximate='none')
                  (10): Dropout(p=0.0, inplace=False)
                )
                (1): Sequential(
                  (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192)
                  (1): Transpose()
                  (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (3): Transpose()
                  (4): GELU(approximate='none')
                  (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
                  (6): Transpose()
                  (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (8): Transpose()
                  (9): GELU(approximate='none')
                  (10): Dropout(p=0.0, inplace=False)
                )
                (2): Sequential(
                  (0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192)
                  (1): Transpose()
                  (2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (3): Transpose()
                  (4): GELU(approximate='none')
                  (5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
                  (6): Transpose()
                  (7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                  (8): Transpose()
                  (9): GELU(approximate='none')
                  (10): Dropout(p=0.0, inplace=False)
                )
              )
            )
            (proj): Conv1d(192, 29, kernel_size=(1,), stride=(1,))
          )
          (8): FlipFlow()
        )
        (global_conv): Conv1d(256, 192, kernel_size=(1,), stride=(1,))
      )
      (global_emb): Embedding(4, 256)
    )
    (discriminator): HiFiGANMultiScaleMultiPeriodDiscriminator(
      (msd): HiFiGANMultiScaleDiscriminator(
        (discriminators): ModuleList(
          (0): HiFiGANScaleDiscriminator(
            (layers): ModuleList(
              (0): Sequential(
                (0): Conv1d(1, 128, kernel_size=(15,), stride=(1,), padding=(7,))
                (1): LeakyReLU(negative_slope=0.1)
              )
              (1): Sequential(
                (0): Conv1d(128, 128, kernel_size=(41,), stride=(2,), padding=(20,), groups=4)
                (1): LeakyReLU(negative_slope=0.1)
              )
              (2): Sequential(
                (0): Conv1d(128, 256, kernel_size=(41,), stride=(2,), padding=(20,), groups=16)
                (1): LeakyReLU(negative_slope=0.1)
              )
              (3): Sequential(
                (0): Conv1d(256, 512, kernel_size=(41,), stride=(4,), padding=(20,), groups=16)
                (1): LeakyReLU(negative_slope=0.1)
              )
              (4): Sequential(
                (0): Conv1d(512, 1024, kernel_size=(41,), stride=(4,), padding=(20,), groups=16)
                (1): LeakyReLU(negative_slope=0.1)
              )
              (5): Sequential(
                (0): Conv1d(1024, 1024, kernel_size=(41,), stride=(1,), padding=(20,), groups=16)
                (1): LeakyReLU(negative_slope=0.1)
              )
              (6): Sequential(
                (0): Conv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,))
                (1): LeakyReLU(negative_slope=0.1)
              )
              (7): Conv1d(1024, 1, kernel_size=(3,), stride=(1,), padding=(1,))
            )
          )
        )
      )
      (mpd): HiFiGANMultiPeriodDiscriminator(
        (discriminators): ModuleList(
          (0-4): 5 x HiFiGANPeriodDiscriminator(
            (convs): ModuleList(
              (0): Sequential(
                (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
                (1): LeakyReLU(negative_slope=0.1)
              )
              (1): Sequential(
                (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
                (1): LeakyReLU(negative_slope=0.1)
              )
              (2): Sequential(
                (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
                (1): LeakyReLU(negative_slope=0.1)
              )
              (3): Sequential(
                (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
                (1): LeakyReLU(negative_slope=0.1)
              )
              (4): Sequential(
                (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0))
                (1): LeakyReLU(negative_slope=0.1)
              )
            )
            (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0))
          )
        )
      )
    )
    (generator_adv_loss): GeneratorAdversarialLoss()
    (discriminator_adv_loss): DiscriminatorAdversarialLoss()
    (feat_match_loss): FeatureMatchLoss()
    (mel_loss): MelSpectrogramLoss(
      (wav_to_mel): LogMelFbank(
        (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
        (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=0, fmax=11025.0, htk=False)
      )
    )
    (kl_loss): KLDivergenceLoss()
  )
)

Model summary:
    Class Name: ESPnetGANTTSModel
    Total Number of model parameters: 96.24 M
    Number of trainable parameters: 96.24 M (100.0%)
    Size: 384.96 MB
    Type: torch.float32
[wieling-3-a100] 2023-12-01 15:58:41,789 (abs_task:1272) INFO: Optimizer:
AdamW (
Parameter Group 0
    amsgrad: False
    betas: [0.8, 0.99]
    capturable: False
    differentiable: False
    eps: 1e-09
    foreach: None
    fused: None
    initial_lr: 0.0003
    lr: 0.0003
    maximize: False
    weight_decay: 0.0
)
[wieling-3-a100] 2023-12-01 15:58:41,789 (abs_task:1273) INFO: Scheduler: <torch.optim.lr_scheduler.ExponentialLR object at 0x7ff08e5c38b0>
[wieling-3-a100] 2023-12-01 15:58:41,790 (abs_task:1272) INFO: Optimizer2:
AdamW (
Parameter Group 0
    amsgrad: False
    betas: [0.8, 0.99]
    capturable: False
    differentiable: False
    eps: 1e-09
    foreach: None
    fused: None
    initial_lr: 0.0003
    lr: 0.0003
    maximize: False
    weight_decay: 0.0
)
[wieling-3-a100] 2023-12-01 15:58:41,790 (abs_task:1273) INFO: Scheduler2: <torch.optim.lr_scheduler.ExponentialLR object at 0x7ff08e5c3850>
[wieling-3-a100] 2023-12-01 15:58:41,790 (abs_task:1282) INFO: Saving the configuration in exp-vits-lr-3e-4/tts_stats_raw_char/logdir/stats.1/config.yaml
[wieling-3-a100] 2023-12-01 15:58:41,807 (abs_task:1293) INFO: Namespace(config='conf/train_vits.yaml', print_config=False, log_level='INFO', drop_last_iter=False, dry_run=False, iterator_type='sequence', valid_iterator_type=None, output_dir='exp-vits-lr-3e-4/tts_stats_raw_char/logdir/stats.1', ngpu=0, seed=67823, num_workers=4, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=True, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=False, collect_stats=True, write_collected_feats=False, max_epoch=1000, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['train', 'total_count', 'max']], keep_nbest_models=10, nbest_averaging_interval=0, grad_clip=-1, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=50, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=True, wandb_project='GROTTS', wandb_id=None, wandb_entity=None, wandb_name='VITS_lr_3.0e-4', wandb_model_log_interval=-1, detect_anomaly=False, use_lora=False, save_lora_only=True, lora_conf={}, pretrain_path=None, init_param=['downloads/espnet/kan-bayashi_ljspeech_vits/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/train.total_count.ave_10best.pth:tts:tts:tts.generator.text_encoder,tts.generator.posterior_encoder.input_conv'], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=1000, batch_size=40, valid_batch_size=None, batch_bins=10000000, valid_batch_bins=None, train_shape_file=['exp-vits-lr-3e-4/tts_stats_raw_char/logdir/train.1.scp'], valid_shape_file=['exp-vits-lr-3e-4/tts_stats_raw_char/logdir/valid.1.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', shuffle_within_batch=False, sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], chunk_default_fs=None, train_data_path_and_name_and_type=[('dump/raw/train_nodev/text', 'text', 'text'), ('dump/raw/train_nodev/wav.scp', 'speech', 'sound'), ('dump/raw/train_nodev/utt2sid', 'sids', 'text_int')], valid_data_path_and_name_and_type=[('dump/raw/train_dev/text', 'text', 'text'), ('dump/raw/train_dev/wav.scp', 'speech', 'sound'), ('dump/raw/train_dev/utt2sid', 'sids', 'text_int')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, allow_multi_rates=False, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adamw', optim_conf={'lr': 0.0003, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, scheduler='exponentiallr', scheduler_conf={'gamma': 0.999875}, optim2='adamw', optim2_conf={'lr': 0.0003, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, scheduler2='exponentiallr', scheduler2_conf={'gamma': 0.999875}, generator_first=False, token_list=['<blank>', '<unk>', '<space>', 'e', 'n', 'a', 'o', 't', 'i', 'r', 'd', 's', 'k', 'l', 'm', 'u', 'g', 'h', 'w', 'v', '.', 'z', 'b', 'p', ',', 'j', 'c', 'f', '‘', '’', ':', '?', 'ö', "'", '!', '-', ';', 'ò', 'è', 'ì', 'é', 'y', 'ë', 'x', 'q', '<sos/eos>'], odim=None, model_conf={}, use_preprocessor=True, token_type='char', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='vits', tts_conf={'generator_type': 'vits_generator', 'generator_params': {'hidden_channels': 192, 'spks': 4, 'global_channels': 256, 'segment_size': 32, 'text_encoder_attention_heads': 2, 'text_encoder_ffn_expand': 4, 'text_encoder_blocks': 6, 'text_encoder_positionwise_layer_type': 'conv1d', 'text_encoder_positionwise_conv_kernel_size': 3, 'text_encoder_positional_encoding_layer_type': 'rel_pos', 'text_encoder_self_attention_layer_type': 'rel_selfattn', 'text_encoder_activation_type': 'swish', 'text_encoder_normalize_before': True, 'text_encoder_dropout_rate': 0.1, 'text_encoder_positional_dropout_rate': 0.0, 'text_encoder_attention_dropout_rate': 0.1, 'use_macaron_style_in_text_encoder': True, 'use_conformer_conv_in_text_encoder': False, 'text_encoder_conformer_kernel_size': -1, 'decoder_kernel_size': 7, 'decoder_channels': 512, 'decoder_upsample_scales': [8, 8, 2, 2], 'decoder_upsample_kernel_sizes': [16, 16, 4, 4], 'decoder_resblock_kernel_sizes': [3, 7, 11], 'decoder_resblock_dilations': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'use_weight_norm_in_decoder': True, 'posterior_encoder_kernel_size': 5, 'posterior_encoder_layers': 16, 'posterior_encoder_stacks': 1, 'posterior_encoder_base_dilation': 1, 'posterior_encoder_dropout_rate': 0.0, 'use_weight_norm_in_posterior_encoder': True, 'flow_flows': 4, 'flow_kernel_size': 5, 'flow_base_dilation': 1, 'flow_layers': 4, 'flow_dropout_rate': 0.0, 'use_weight_norm_in_flow': True, 'use_only_mean_in_flow': True, 'stochastic_duration_predictor_kernel_size': 3, 'stochastic_duration_predictor_dropout_rate': 0.5, 'stochastic_duration_predictor_flows': 4, 'stochastic_duration_predictor_dds_conv_layers': 3, 'vocabs': 46, 'aux_channels': 80}, 'discriminator_type': 'hifigan_multi_scale_multi_period_discriminator', 'discriminator_params': {'scales': 1, 'scale_downsample_pooling': 'AvgPool1d', 'scale_downsample_pooling_params': {'kernel_size': 4, 'stride': 2, 'padding': 2}, 'scale_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [15, 41, 5, 3], 'channels': 128, 'max_downsample_channels': 1024, 'max_groups': 16, 'bias': True, 'downsample_scales': [2, 2, 4, 4, 1], 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': False, 'use_spectral_norm': False}, 'follow_official_norm': False, 'periods': [2, 3, 5, 7, 11], 'period_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [5, 3], 'channels': 32, 'downsample_scales': [3, 3, 3, 3, 1], 'max_downsample_channels': 1024, 'bias': True, 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}}, 'generator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'discriminator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'feat_match_loss_params': {'average_by_discriminators': False, 'average_by_layers': False, 'include_final_outputs': True}, 'mel_loss_params': {'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'window': 'hann', 'n_mels': 80, 'fmin': 0, 'fmax': None, 'log_base': None}, 'lambda_adv': 1.0, 'lambda_mel': 45.0, 'lambda_feat_match': 2.0, 'lambda_dur': 1.0, 'lambda_kl': 1.0, 'sampling_rate': 22050, 'cache_generator_outputs': True}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202310', distributed=False)
# Accounting: time=16 threads=1
# Ended (code 0) at Fri Dec  1 15:58:50 UTC 2023, elapsed time 16 seconds