Martijn Bartelds
Update files
9c82e23
# python3 -m espnet2.bin.gan_tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type char --token_list dump/token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train_nodev/text,text,text --train_data_path_and_name_and_type dump/raw/train_nodev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/train_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/train_dev/wav.scp,speech,sound --train_shape_file exp-vits-lr-3e-4/tts_stats_raw_char/logdir/train.13.scp --valid_shape_file exp-vits-lr-3e-4/tts_stats_raw_char/logdir/valid.13.scp --output_dir exp-vits-lr-3e-4/tts_stats_raw_char/logdir/stats.13 --config conf/train_vits.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null --train_data_path_and_name_and_type dump/raw/train_nodev/utt2sid,sids,text_int --valid_data_path_and_name_and_type dump/raw/train_dev/utt2sid,sids,text_int --use_wandb true --wandb_project GROTTS --wandb_name VITS_lr_3.0e-4 --init_param downloads/espnet/kan-bayashi_ljspeech_vits/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/train.total_count.ave_10best.pth:tts:tts:tts.generator.text_encoder,tts.generator.posterior_encoder.input_conv --batch_size 40 --batch_bins 10000000
# Started at Fri Dec 1 15:58:34 UTC 2023
#
/data2/p280965/tts/espnet/tools/venv/bin/python3 /data2/p280965/tts/espnet/espnet2/bin/gan_tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type char --token_list dump/token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train_nodev/text,text,text --train_data_path_and_name_and_type dump/raw/train_nodev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/train_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/train_dev/wav.scp,speech,sound --train_shape_file exp-vits-lr-3e-4/tts_stats_raw_char/logdir/train.13.scp --valid_shape_file exp-vits-lr-3e-4/tts_stats_raw_char/logdir/valid.13.scp --output_dir exp-vits-lr-3e-4/tts_stats_raw_char/logdir/stats.13 --config conf/train_vits.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null --train_data_path_and_name_and_type dump/raw/train_nodev/utt2sid,sids,text_int --valid_data_path_and_name_and_type dump/raw/train_dev/utt2sid,sids,text_int --use_wandb true --wandb_project GROTTS --wandb_name VITS_lr_3.0e-4 --init_param downloads/espnet/kan-bayashi_ljspeech_vits/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/train.total_count.ave_10best.pth:tts:tts:tts.generator.text_encoder,tts.generator.posterior_encoder.input_conv --batch_size 40 --batch_bins 10000000
[wieling-3-a100] 2023-12-01 15:58:40,400 (gan_tts:293) INFO: Vocabulary size: 46
[wieling-3-a100] 2023-12-01 15:58:40,546 (encoder:174) INFO: encoder self-attention layer type = relative self-attention
/data2/p280965/tts/espnet/tools/venv/lib/python3.9/site-packages/torch/nn/utils/weight_norm.py:30: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.
warnings.warn("torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.")
/data2/p280965/tts/espnet/espnet2/gan_tts/vits/monotonic_align/__init__.py:19: UserWarning: Cython version is not available. Fallback to 'EXPERIMETAL' numba version. If you want to use the cython version, please build it as follows: `cd espnet2/gan_tts/vits/monotonic_align; python setup.py build_ext --inplace`
warnings.warn(
[wieling-3-a100] 2023-12-01 15:58:41,851 (abs_task:1268) INFO: pytorch.version=2.1.0+cu121, cuda.available=True, cudnn.version=8902, cudnn.benchmark=False, cudnn.deterministic=False
[wieling-3-a100] 2023-12-01 15:58:41,866 (abs_task:1269) INFO: Model structure:
ESPnetGANTTSModel(
(feats_extract): LogMelFbank(
(stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
(logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False)
)
(tts): VITS(
(generator): VITSGenerator(
(text_encoder): TextEncoder(
(emb): Embedding(46, 192)
(encoder): Encoder(
(embed): Sequential(
(0): RelPositionalEncoding(
(dropout): Dropout(p=0.0, inplace=False)
)
)
(encoders): MultiSequential(
(0): EncoderLayer(
(self_attn): RelPositionMultiHeadedAttention(
(linear_q): Linear(in_features=192, out_features=192, bias=True)
(linear_k): Linear(in_features=192, out_features=192, bias=True)
(linear_v): Linear(in_features=192, out_features=192, bias=True)
(linear_out): Linear(in_features=192, out_features=192, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(linear_pos): Linear(in_features=192, out_features=192, bias=False)
)
(feed_forward): MultiLayeredConv1d(
(w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
(w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
(dropout): Dropout(p=0.1, inplace=False)
)
(feed_forward_macaron): MultiLayeredConv1d(
(w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
(w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
(dropout): Dropout(p=0.1, inplace=False)
)
(norm_ff): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
(norm_mha): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
(norm_ff_macaron): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(1): EncoderLayer(
(self_attn): RelPositionMultiHeadedAttention(
(linear_q): Linear(in_features=192, out_features=192, bias=True)
(linear_k): Linear(in_features=192, out_features=192, bias=True)
(linear_v): Linear(in_features=192, out_features=192, bias=True)
(linear_out): Linear(in_features=192, out_features=192, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(linear_pos): Linear(in_features=192, out_features=192, bias=False)
)
(feed_forward): MultiLayeredConv1d(
(w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
(w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
(dropout): Dropout(p=0.1, inplace=False)
)
(feed_forward_macaron): MultiLayeredConv1d(
(w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
(w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
(dropout): Dropout(p=0.1, inplace=False)
)
(norm_ff): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
(norm_mha): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
(norm_ff_macaron): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(2): EncoderLayer(
(self_attn): RelPositionMultiHeadedAttention(
(linear_q): Linear(in_features=192, out_features=192, bias=True)
(linear_k): Linear(in_features=192, out_features=192, bias=True)
(linear_v): Linear(in_features=192, out_features=192, bias=True)
(linear_out): Linear(in_features=192, out_features=192, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(linear_pos): Linear(in_features=192, out_features=192, bias=False)
)
(feed_forward): MultiLayeredConv1d(
(w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
(w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
(dropout): Dropout(p=0.1, inplace=False)
)
(feed_forward_macaron): MultiLayeredConv1d(
(w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
(w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
(dropout): Dropout(p=0.1, inplace=False)
)
(norm_ff): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
(norm_mha): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
(norm_ff_macaron): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(3): EncoderLayer(
(self_attn): RelPositionMultiHeadedAttention(
(linear_q): Linear(in_features=192, out_features=192, bias=True)
(linear_k): Linear(in_features=192, out_features=192, bias=True)
(linear_v): Linear(in_features=192, out_features=192, bias=True)
(linear_out): Linear(in_features=192, out_features=192, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(linear_pos): Linear(in_features=192, out_features=192, bias=False)
)
(feed_forward): MultiLayeredConv1d(
(w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
(w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
(dropout): Dropout(p=0.1, inplace=False)
)
(feed_forward_macaron): MultiLayeredConv1d(
(w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
(w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
(dropout): Dropout(p=0.1, inplace=False)
)
(norm_ff): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
(norm_mha): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
(norm_ff_macaron): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(4): EncoderLayer(
(self_attn): RelPositionMultiHeadedAttention(
(linear_q): Linear(in_features=192, out_features=192, bias=True)
(linear_k): Linear(in_features=192, out_features=192, bias=True)
(linear_v): Linear(in_features=192, out_features=192, bias=True)
(linear_out): Linear(in_features=192, out_features=192, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(linear_pos): Linear(in_features=192, out_features=192, bias=False)
)
(feed_forward): MultiLayeredConv1d(
(w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
(w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
(dropout): Dropout(p=0.1, inplace=False)
)
(feed_forward_macaron): MultiLayeredConv1d(
(w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
(w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
(dropout): Dropout(p=0.1, inplace=False)
)
(norm_ff): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
(norm_mha): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
(norm_ff_macaron): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(5): EncoderLayer(
(self_attn): RelPositionMultiHeadedAttention(
(linear_q): Linear(in_features=192, out_features=192, bias=True)
(linear_k): Linear(in_features=192, out_features=192, bias=True)
(linear_v): Linear(in_features=192, out_features=192, bias=True)
(linear_out): Linear(in_features=192, out_features=192, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(linear_pos): Linear(in_features=192, out_features=192, bias=False)
)
(feed_forward): MultiLayeredConv1d(
(w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
(w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
(dropout): Dropout(p=0.1, inplace=False)
)
(feed_forward_macaron): MultiLayeredConv1d(
(w_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,), padding=(1,))
(w_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,))
(dropout): Dropout(p=0.1, inplace=False)
)
(norm_ff): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
(norm_mha): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
(norm_ff_macaron): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(after_norm): LayerNorm((192,), eps=1e-12, elementwise_affine=True)
)
(proj): Conv1d(192, 384, kernel_size=(1,), stride=(1,))
)
(decoder): HiFiGANGenerator(
(input_conv): Conv1d(192, 512, kernel_size=(7,), stride=(1,), padding=(3,))
(upsamples): ModuleList(
(0): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): ConvTranspose1d(512, 256, kernel_size=(16,), stride=(8,), padding=(4,))
)
(1): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): ConvTranspose1d(256, 128, kernel_size=(16,), stride=(8,), padding=(4,))
)
(2): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): ConvTranspose1d(128, 64, kernel_size=(4,), stride=(2,), padding=(1,))
)
(3): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): ConvTranspose1d(64, 32, kernel_size=(4,), stride=(2,), padding=(1,))
)
)
(blocks): ModuleList(
(0): ResidualBlock(
(convs1): ModuleList(
(0): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
)
(1): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
)
(2): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
)
)
(convs2): ModuleList(
(0-2): 3 x Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
)
)
)
(1): ResidualBlock(
(convs1): ModuleList(
(0): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,))
)
(1): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
)
(2): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
)
)
(convs2): ModuleList(
(0-2): 3 x Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,))
)
)
)
(2): ResidualBlock(
(convs1): ModuleList(
(0): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,))
)
(1): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
)
(2): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
)
)
(convs2): ModuleList(
(0-2): 3 x Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,))
)
)
)
(3): ResidualBlock(
(convs1): ModuleList(
(0): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
)
(1): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
)
(2): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
)
)
(convs2): ModuleList(
(0-2): 3 x Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
)
)
)
(4): ResidualBlock(
(convs1): ModuleList(
(0): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,))
)
(1): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
)
(2): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
)
)
(convs2): ModuleList(
(0-2): 3 x Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,))
)
)
)
(5): ResidualBlock(
(convs1): ModuleList(
(0): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,))
)
(1): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
)
(2): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
)
)
(convs2): ModuleList(
(0-2): 3 x Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,))
)
)
)
(6): ResidualBlock(
(convs1): ModuleList(
(0): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
)
(1): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
)
(2): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
)
)
(convs2): ModuleList(
(0-2): 3 x Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
)
)
)
(7): ResidualBlock(
(convs1): ModuleList(
(0): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,))
)
(1): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
)
(2): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
)
)
(convs2): ModuleList(
(0-2): 3 x Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,))
)
)
)
(8): ResidualBlock(
(convs1): ModuleList(
(0): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,))
)
(1): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
)
(2): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
)
)
(convs2): ModuleList(
(0-2): 3 x Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,))
)
)
)
(9): ResidualBlock(
(convs1): ModuleList(
(0): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,))
)
(1): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
)
(2): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
)
)
(convs2): ModuleList(
(0-2): 3 x Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,))
)
)
)
(10): ResidualBlock(
(convs1): ModuleList(
(0): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,))
)
(1): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
)
(2): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
)
)
(convs2): ModuleList(
(0-2): 3 x Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,))
)
)
)
(11): ResidualBlock(
(convs1): ModuleList(
(0): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,))
)
(1): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
)
(2): Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
)
)
(convs2): ModuleList(
(0-2): 3 x Sequential(
(0): LeakyReLU(negative_slope=0.1)
(1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,))
)
)
)
)
(output_conv): Sequential(
(0): LeakyReLU(negative_slope=0.01)
(1): Conv1d(32, 1, kernel_size=(7,), stride=(1,), padding=(3,))
(2): Tanh()
)
(global_conv): Conv1d(256, 512, kernel_size=(1,), stride=(1,))
)
(posterior_encoder): PosteriorEncoder(
(input_conv): Conv1d(80, 192, kernel_size=(1,), stride=(1,))
(encoder): WaveNet(
(conv_layers): ModuleList(
(0-15): 16 x ResidualBlock(
(conv): Conv1d(192, 384, kernel_size=(5,), stride=(1,), padding=(2,))
(conv1x1_glo): Conv1d1x1(256, 384, kernel_size=(1,), stride=(1,), bias=False)
(conv1x1_out): Conv1d1x1(192, 384, kernel_size=(1,), stride=(1,))
)
)
)
(proj): Conv1d(192, 384, kernel_size=(1,), stride=(1,))
)
(flow): ResidualAffineCouplingBlock(
(flows): ModuleList(
(0): ResidualAffineCouplingLayer(
(input_conv): Conv1d(96, 192, kernel_size=(1,), stride=(1,))
(encoder): WaveNet(
(conv_layers): ModuleList(
(0-3): 4 x ResidualBlock(
(conv): Conv1d(192, 384, kernel_size=(5,), stride=(1,), padding=(2,))
(conv1x1_glo): Conv1d1x1(256, 384, kernel_size=(1,), stride=(1,), bias=False)
(conv1x1_out): Conv1d1x1(192, 384, kernel_size=(1,), stride=(1,))
)
)
)
(proj): Conv1d(192, 96, kernel_size=(1,), stride=(1,))
)
(1): FlipFlow()
(2): ResidualAffineCouplingLayer(
(input_conv): Conv1d(96, 192, kernel_size=(1,), stride=(1,))
(encoder): WaveNet(
(conv_layers): ModuleList(
(0-3): 4 x ResidualBlock(
(conv): Conv1d(192, 384, kernel_size=(5,), stride=(1,), padding=(2,))
(conv1x1_glo): Conv1d1x1(256, 384, kernel_size=(1,), stride=(1,), bias=False)
(conv1x1_out): Conv1d1x1(192, 384, kernel_size=(1,), stride=(1,))
)
)
)
(proj): Conv1d(192, 96, kernel_size=(1,), stride=(1,))
)
(3): FlipFlow()
(4): ResidualAffineCouplingLayer(
(input_conv): Conv1d(96, 192, kernel_size=(1,), stride=(1,))
(encoder): WaveNet(
(conv_layers): ModuleList(
(0-3): 4 x ResidualBlock(
(conv): Conv1d(192, 384, kernel_size=(5,), stride=(1,), padding=(2,))
(conv1x1_glo): Conv1d1x1(256, 384, kernel_size=(1,), stride=(1,), bias=False)
(conv1x1_out): Conv1d1x1(192, 384, kernel_size=(1,), stride=(1,))
)
)
)
(proj): Conv1d(192, 96, kernel_size=(1,), stride=(1,))
)
(5): FlipFlow()
(6): ResidualAffineCouplingLayer(
(input_conv): Conv1d(96, 192, kernel_size=(1,), stride=(1,))
(encoder): WaveNet(
(conv_layers): ModuleList(
(0-3): 4 x ResidualBlock(
(conv): Conv1d(192, 384, kernel_size=(5,), stride=(1,), padding=(2,))
(conv1x1_glo): Conv1d1x1(256, 384, kernel_size=(1,), stride=(1,), bias=False)
(conv1x1_out): Conv1d1x1(192, 384, kernel_size=(1,), stride=(1,))
)
)
)
(proj): Conv1d(192, 96, kernel_size=(1,), stride=(1,))
)
(7): FlipFlow()
)
)
(duration_predictor): StochasticDurationPredictor(
(pre): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(dds): DilatedDepthSeparableConv(
(convs): ModuleList(
(0): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.5, inplace=False)
)
(1): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.5, inplace=False)
)
(2): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.5, inplace=False)
)
)
)
(proj): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(log_flow): LogFlow()
(flows): ModuleList(
(0): ElementwiseAffineFlow()
(1): ConvFlow(
(input_conv): Conv1d(1, 192, kernel_size=(1,), stride=(1,))
(dds_conv): DilatedDepthSeparableConv(
(convs): ModuleList(
(0): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.0, inplace=False)
)
(1): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.0, inplace=False)
)
(2): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.0, inplace=False)
)
)
)
(proj): Conv1d(192, 29, kernel_size=(1,), stride=(1,))
)
(2): FlipFlow()
(3): ConvFlow(
(input_conv): Conv1d(1, 192, kernel_size=(1,), stride=(1,))
(dds_conv): DilatedDepthSeparableConv(
(convs): ModuleList(
(0): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.0, inplace=False)
)
(1): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.0, inplace=False)
)
(2): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.0, inplace=False)
)
)
)
(proj): Conv1d(192, 29, kernel_size=(1,), stride=(1,))
)
(4): FlipFlow()
(5): ConvFlow(
(input_conv): Conv1d(1, 192, kernel_size=(1,), stride=(1,))
(dds_conv): DilatedDepthSeparableConv(
(convs): ModuleList(
(0): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.0, inplace=False)
)
(1): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.0, inplace=False)
)
(2): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.0, inplace=False)
)
)
)
(proj): Conv1d(192, 29, kernel_size=(1,), stride=(1,))
)
(6): FlipFlow()
(7): ConvFlow(
(input_conv): Conv1d(1, 192, kernel_size=(1,), stride=(1,))
(dds_conv): DilatedDepthSeparableConv(
(convs): ModuleList(
(0): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.0, inplace=False)
)
(1): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.0, inplace=False)
)
(2): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.0, inplace=False)
)
)
)
(proj): Conv1d(192, 29, kernel_size=(1,), stride=(1,))
)
(8): FlipFlow()
)
(post_pre): Conv1d(1, 192, kernel_size=(1,), stride=(1,))
(post_dds): DilatedDepthSeparableConv(
(convs): ModuleList(
(0): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.5, inplace=False)
)
(1): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.5, inplace=False)
)
(2): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.5, inplace=False)
)
)
)
(post_proj): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(post_flows): ModuleList(
(0): ElementwiseAffineFlow()
(1): ConvFlow(
(input_conv): Conv1d(1, 192, kernel_size=(1,), stride=(1,))
(dds_conv): DilatedDepthSeparableConv(
(convs): ModuleList(
(0): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.0, inplace=False)
)
(1): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.0, inplace=False)
)
(2): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.0, inplace=False)
)
)
)
(proj): Conv1d(192, 29, kernel_size=(1,), stride=(1,))
)
(2): FlipFlow()
(3): ConvFlow(
(input_conv): Conv1d(1, 192, kernel_size=(1,), stride=(1,))
(dds_conv): DilatedDepthSeparableConv(
(convs): ModuleList(
(0): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.0, inplace=False)
)
(1): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.0, inplace=False)
)
(2): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.0, inplace=False)
)
)
)
(proj): Conv1d(192, 29, kernel_size=(1,), stride=(1,))
)
(4): FlipFlow()
(5): ConvFlow(
(input_conv): Conv1d(1, 192, kernel_size=(1,), stride=(1,))
(dds_conv): DilatedDepthSeparableConv(
(convs): ModuleList(
(0): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.0, inplace=False)
)
(1): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.0, inplace=False)
)
(2): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.0, inplace=False)
)
)
)
(proj): Conv1d(192, 29, kernel_size=(1,), stride=(1,))
)
(6): FlipFlow()
(7): ConvFlow(
(input_conv): Conv1d(1, 192, kernel_size=(1,), stride=(1,))
(dds_conv): DilatedDepthSeparableConv(
(convs): ModuleList(
(0): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(1,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.0, inplace=False)
)
(1): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.0, inplace=False)
)
(2): Sequential(
(0): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,), groups=192)
(1): Transpose()
(2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(3): Transpose()
(4): GELU(approximate='none')
(5): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
(6): Transpose()
(7): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(8): Transpose()
(9): GELU(approximate='none')
(10): Dropout(p=0.0, inplace=False)
)
)
)
(proj): Conv1d(192, 29, kernel_size=(1,), stride=(1,))
)
(8): FlipFlow()
)
(global_conv): Conv1d(256, 192, kernel_size=(1,), stride=(1,))
)
(global_emb): Embedding(4, 256)
)
(discriminator): HiFiGANMultiScaleMultiPeriodDiscriminator(
(msd): HiFiGANMultiScaleDiscriminator(
(discriminators): ModuleList(
(0): HiFiGANScaleDiscriminator(
(layers): ModuleList(
(0): Sequential(
(0): Conv1d(1, 128, kernel_size=(15,), stride=(1,), padding=(7,))
(1): LeakyReLU(negative_slope=0.1)
)
(1): Sequential(
(0): Conv1d(128, 128, kernel_size=(41,), stride=(2,), padding=(20,), groups=4)
(1): LeakyReLU(negative_slope=0.1)
)
(2): Sequential(
(0): Conv1d(128, 256, kernel_size=(41,), stride=(2,), padding=(20,), groups=16)
(1): LeakyReLU(negative_slope=0.1)
)
(3): Sequential(
(0): Conv1d(256, 512, kernel_size=(41,), stride=(4,), padding=(20,), groups=16)
(1): LeakyReLU(negative_slope=0.1)
)
(4): Sequential(
(0): Conv1d(512, 1024, kernel_size=(41,), stride=(4,), padding=(20,), groups=16)
(1): LeakyReLU(negative_slope=0.1)
)
(5): Sequential(
(0): Conv1d(1024, 1024, kernel_size=(41,), stride=(1,), padding=(20,), groups=16)
(1): LeakyReLU(negative_slope=0.1)
)
(6): Sequential(
(0): Conv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,))
(1): LeakyReLU(negative_slope=0.1)
)
(7): Conv1d(1024, 1, kernel_size=(3,), stride=(1,), padding=(1,))
)
)
)
)
(mpd): HiFiGANMultiPeriodDiscriminator(
(discriminators): ModuleList(
(0-4): 5 x HiFiGANPeriodDiscriminator(
(convs): ModuleList(
(0): Sequential(
(0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
(1): LeakyReLU(negative_slope=0.1)
)
(1): Sequential(
(0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
(1): LeakyReLU(negative_slope=0.1)
)
(2): Sequential(
(0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
(1): LeakyReLU(negative_slope=0.1)
)
(3): Sequential(
(0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
(1): LeakyReLU(negative_slope=0.1)
)
(4): Sequential(
(0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0))
(1): LeakyReLU(negative_slope=0.1)
)
)
(output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0))
)
)
)
)
(generator_adv_loss): GeneratorAdversarialLoss()
(discriminator_adv_loss): DiscriminatorAdversarialLoss()
(feat_match_loss): FeatureMatchLoss()
(mel_loss): MelSpectrogramLoss(
(wav_to_mel): LogMelFbank(
(stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
(logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=0, fmax=11025.0, htk=False)
)
)
(kl_loss): KLDivergenceLoss()
)
)
Model summary:
Class Name: ESPnetGANTTSModel
Total Number of model parameters: 96.24 M
Number of trainable parameters: 96.24 M (100.0%)
Size: 384.96 MB
Type: torch.float32
[wieling-3-a100] 2023-12-01 15:58:41,867 (abs_task:1272) INFO: Optimizer:
AdamW (
Parameter Group 0
amsgrad: False
betas: [0.8, 0.99]
capturable: False
differentiable: False
eps: 1e-09
foreach: None
fused: None
initial_lr: 0.0003
lr: 0.0003
maximize: False
weight_decay: 0.0
)
[wieling-3-a100] 2023-12-01 15:58:41,867 (abs_task:1273) INFO: Scheduler: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f9c38f5f8e0>
[wieling-3-a100] 2023-12-01 15:58:41,867 (abs_task:1272) INFO: Optimizer2:
AdamW (
Parameter Group 0
amsgrad: False
betas: [0.8, 0.99]
capturable: False
differentiable: False
eps: 1e-09
foreach: None
fused: None
initial_lr: 0.0003
lr: 0.0003
maximize: False
weight_decay: 0.0
)
[wieling-3-a100] 2023-12-01 15:58:41,867 (abs_task:1273) INFO: Scheduler2: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f9c38f5f880>
[wieling-3-a100] 2023-12-01 15:58:41,867 (abs_task:1282) INFO: Saving the configuration in exp-vits-lr-3e-4/tts_stats_raw_char/logdir/stats.13/config.yaml
[wieling-3-a100] 2023-12-01 15:58:41,885 (abs_task:1293) INFO: Namespace(config='conf/train_vits.yaml', print_config=False, log_level='INFO', drop_last_iter=False, dry_run=False, iterator_type='sequence', valid_iterator_type=None, output_dir='exp-vits-lr-3e-4/tts_stats_raw_char/logdir/stats.13', ngpu=0, seed=67823, num_workers=4, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=True, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=False, collect_stats=True, write_collected_feats=False, max_epoch=1000, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['train', 'total_count', 'max']], keep_nbest_models=10, nbest_averaging_interval=0, grad_clip=-1, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=50, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=True, wandb_project='GROTTS', wandb_id=None, wandb_entity=None, wandb_name='VITS_lr_3.0e-4', wandb_model_log_interval=-1, detect_anomaly=False, use_lora=False, save_lora_only=True, lora_conf={}, pretrain_path=None, init_param=['downloads/espnet/kan-bayashi_ljspeech_vits/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/train.total_count.ave_10best.pth:tts:tts:tts.generator.text_encoder,tts.generator.posterior_encoder.input_conv'], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=1000, batch_size=40, valid_batch_size=None, batch_bins=10000000, valid_batch_bins=None, train_shape_file=['exp-vits-lr-3e-4/tts_stats_raw_char/logdir/train.13.scp'], valid_shape_file=['exp-vits-lr-3e-4/tts_stats_raw_char/logdir/valid.13.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', shuffle_within_batch=False, sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], chunk_default_fs=None, train_data_path_and_name_and_type=[('dump/raw/train_nodev/text', 'text', 'text'), ('dump/raw/train_nodev/wav.scp', 'speech', 'sound'), ('dump/raw/train_nodev/utt2sid', 'sids', 'text_int')], valid_data_path_and_name_and_type=[('dump/raw/train_dev/text', 'text', 'text'), ('dump/raw/train_dev/wav.scp', 'speech', 'sound'), ('dump/raw/train_dev/utt2sid', 'sids', 'text_int')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, allow_multi_rates=False, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adamw', optim_conf={'lr': 0.0003, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, scheduler='exponentiallr', scheduler_conf={'gamma': 0.999875}, optim2='adamw', optim2_conf={'lr': 0.0003, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, scheduler2='exponentiallr', scheduler2_conf={'gamma': 0.999875}, generator_first=False, token_list=['<blank>', '<unk>', '<space>', 'e', 'n', 'a', 'o', 't', 'i', 'r', 'd', 's', 'k', 'l', 'm', 'u', 'g', 'h', 'w', 'v', '.', 'z', 'b', 'p', ',', 'j', 'c', 'f', '‘', '’', ':', '?', 'ö', "'", '!', '-', ';', 'ò', 'è', 'ì', 'é', 'y', 'ë', 'x', 'q', '<sos/eos>'], odim=None, model_conf={}, use_preprocessor=True, token_type='char', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='vits', tts_conf={'generator_type': 'vits_generator', 'generator_params': {'hidden_channels': 192, 'spks': 4, 'global_channels': 256, 'segment_size': 32, 'text_encoder_attention_heads': 2, 'text_encoder_ffn_expand': 4, 'text_encoder_blocks': 6, 'text_encoder_positionwise_layer_type': 'conv1d', 'text_encoder_positionwise_conv_kernel_size': 3, 'text_encoder_positional_encoding_layer_type': 'rel_pos', 'text_encoder_self_attention_layer_type': 'rel_selfattn', 'text_encoder_activation_type': 'swish', 'text_encoder_normalize_before': True, 'text_encoder_dropout_rate': 0.1, 'text_encoder_positional_dropout_rate': 0.0, 'text_encoder_attention_dropout_rate': 0.1, 'use_macaron_style_in_text_encoder': True, 'use_conformer_conv_in_text_encoder': False, 'text_encoder_conformer_kernel_size': -1, 'decoder_kernel_size': 7, 'decoder_channels': 512, 'decoder_upsample_scales': [8, 8, 2, 2], 'decoder_upsample_kernel_sizes': [16, 16, 4, 4], 'decoder_resblock_kernel_sizes': [3, 7, 11], 'decoder_resblock_dilations': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'use_weight_norm_in_decoder': True, 'posterior_encoder_kernel_size': 5, 'posterior_encoder_layers': 16, 'posterior_encoder_stacks': 1, 'posterior_encoder_base_dilation': 1, 'posterior_encoder_dropout_rate': 0.0, 'use_weight_norm_in_posterior_encoder': True, 'flow_flows': 4, 'flow_kernel_size': 5, 'flow_base_dilation': 1, 'flow_layers': 4, 'flow_dropout_rate': 0.0, 'use_weight_norm_in_flow': True, 'use_only_mean_in_flow': True, 'stochastic_duration_predictor_kernel_size': 3, 'stochastic_duration_predictor_dropout_rate': 0.5, 'stochastic_duration_predictor_flows': 4, 'stochastic_duration_predictor_dds_conv_layers': 3, 'vocabs': 46, 'aux_channels': 80}, 'discriminator_type': 'hifigan_multi_scale_multi_period_discriminator', 'discriminator_params': {'scales': 1, 'scale_downsample_pooling': 'AvgPool1d', 'scale_downsample_pooling_params': {'kernel_size': 4, 'stride': 2, 'padding': 2}, 'scale_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [15, 41, 5, 3], 'channels': 128, 'max_downsample_channels': 1024, 'max_groups': 16, 'bias': True, 'downsample_scales': [2, 2, 4, 4, 1], 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': False, 'use_spectral_norm': False}, 'follow_official_norm': False, 'periods': [2, 3, 5, 7, 11], 'period_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [5, 3], 'channels': 32, 'downsample_scales': [3, 3, 3, 3, 1], 'max_downsample_channels': 1024, 'bias': True, 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}}, 'generator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'discriminator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'feat_match_loss_params': {'average_by_discriminators': False, 'average_by_layers': False, 'include_final_outputs': True}, 'mel_loss_params': {'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'window': 'hann', 'n_mels': 80, 'fmin': 0, 'fmax': None, 'log_base': None}, 'lambda_adv': 1.0, 'lambda_mel': 45.0, 'lambda_feat_match': 2.0, 'lambda_dur': 1.0, 'lambda_kl': 1.0, 'sampling_rate': 22050, 'cache_generator_outputs': True}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202310', distributed=False)
# Accounting: time=18 threads=1
# Ended (code 0) at Fri Dec 1 15:58:52 UTC 2023, elapsed time 18 seconds