File size: 3,373 Bytes
d8714d0 182ff19 d8714d0 182ff19 d8714d0 182ff19 d8714d0 182ff19 d8714d0 182ff19 d8714d0 182ff19 d8714d0 182ff19 d8714d0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
# ################################
# Model: FastSpeech2 for TTS
# Authors: Sathvik Udupa, Yingzhi Wang, Pradnya Kandarkar
# ################################
# Input parameters
# Symbol inventory: 40 entries, 39 upper-case labels followed by `spn`
# (the labels look like ARPAbet phonemes; confirm against the training lexicon).
lexicon:
    - AA
    - AE
    - AH
    - AO
    - AW
    - AY
    - B
    - CH
    - D
    - DH
    - EH
    - ER
    - EY
    - F
    - G
    - HH
    - IH
    - IY
    - JH
    - K
    - L
    - M
    - N
    - NG
    - OW
    - OY
    - P
    - R
    - S
    - SH
    - T
    - TH
    - UH
    - UW
    - V
    - W
    - Y
    - Z
    - ZH
    - spn

# Fixed by the lexicon: its 40 symbols + 1 dummy symbol used for padding.
n_symbols: 41
# Passed as `padding_idx` to both spn_predictor and model.
padding_idx: 0
# Passed to model as `n_mels`.
n_mel_channels: 80
# Encoder hyperparameters (referenced by both spn_predictor and model)
enc_num_layers: 4
enc_num_head: 2
enc_d_model: 256
enc_ffn_dim: 1024
enc_k_dim: 256
enc_v_dim: 256
enc_dropout: 0.2

# Decoder hyperparameters (referenced by model only)
dec_num_layers: 4
dec_num_head: 2
dec_d_model: 256
dec_ffn_dim: 1024
dec_k_dim: 256
dec_v_dim: 256
dec_dropout: 0.2

# Postnet hyperparameters (referenced by model only)
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5
postnet_dropout: 0.5

# Settings shared by spn_predictor and model
normalize_before: True
ffn_type: 1dcnn  # accepted values: 1dcnn or ffn
ffn_cnn_kernel_size_list:
    - 9
    - 1

# Variance predictor hyperparameters (referenced by model only)
dur_pred_kernel_size: 3
pitch_pred_kernel_size: 3
energy_pred_kernel_size: 3
variance_predictor_dropout: 0.5
# SPN predictor
# Instantiates SPNPredictor from the encoder-side and shared hyperparameters
# defined earlier in this file. The keys below are its constructor arguments and
# must stay indented under the `!new:` tag; at column 0 they would be parsed as
# top-level keys that duplicate (and self-reference) the hyperparameters.
spn_predictor: !new:speechbrain.lobes.models.FastSpeech2.SPNPredictor
    enc_num_layers: !ref <enc_num_layers>
    enc_num_head: !ref <enc_num_head>
    enc_d_model: !ref <enc_d_model>
    enc_ffn_dim: !ref <enc_ffn_dim>
    enc_k_dim: !ref <enc_k_dim>
    enc_v_dim: !ref <enc_v_dim>
    enc_dropout: !ref <enc_dropout>
    normalize_before: !ref <normalize_before>
    ffn_type: !ref <ffn_type>
    ffn_cnn_kernel_size_list: !ref <ffn_cnn_kernel_size_list>
    n_char: !ref <n_symbols>  # vocabulary size comes from n_symbols
    padding_idx: !ref <padding_idx>
# Model
# Instantiates FastSpeech2 from the hyperparameters defined earlier in this
# file. The keys below are its constructor arguments and must stay indented
# under the `!new:` tag; at column 0 they would be parsed as top-level keys that
# duplicate (and self-reference) the hyperparameters.
model: !new:speechbrain.lobes.models.FastSpeech2.FastSpeech2
    # Encoder
    enc_num_layers: !ref <enc_num_layers>
    enc_num_head: !ref <enc_num_head>
    enc_d_model: !ref <enc_d_model>
    enc_ffn_dim: !ref <enc_ffn_dim>
    enc_k_dim: !ref <enc_k_dim>
    enc_v_dim: !ref <enc_v_dim>
    enc_dropout: !ref <enc_dropout>
    # Decoder
    dec_num_layers: !ref <dec_num_layers>
    dec_num_head: !ref <dec_num_head>
    dec_d_model: !ref <dec_d_model>
    dec_ffn_dim: !ref <dec_ffn_dim>
    dec_k_dim: !ref <dec_k_dim>
    dec_v_dim: !ref <dec_v_dim>
    dec_dropout: !ref <dec_dropout>
    # Shared settings
    normalize_before: !ref <normalize_before>
    ffn_type: !ref <ffn_type>
    ffn_cnn_kernel_size_list: !ref <ffn_cnn_kernel_size_list>
    n_char: !ref <n_symbols>  # vocabulary size comes from n_symbols
    n_mels: !ref <n_mel_channels>
    # Postnet
    postnet_embedding_dim: !ref <postnet_embedding_dim>
    postnet_kernel_size: !ref <postnet_kernel_size>
    postnet_n_convolutions: !ref <postnet_n_convolutions>
    postnet_dropout: !ref <postnet_dropout>
    padding_idx: !ref <padding_idx>
    # Variance predictors
    dur_pred_kernel_size: !ref <dur_pred_kernel_size>
    pitch_pred_kernel_size: !ref <pitch_pred_kernel_size>
    energy_pred_kernel_size: !ref <energy_pred_kernel_size>
    variance_predictor_dropout: !ref <variance_predictor_dropout>
# Text encoder object: a TextEncoder instantiated with no constructor arguments.
input_encoder: !new:speechbrain.dataio.encoder.TextEncoder
# Mapping of module name -> instantiated object. The entries must be nested
# under `modules`; at column 0 they would collide with the top-level
# `spn_predictor` and `model` keys.
modules:
    spn_predictor: !ref <spn_predictor>
    model: !ref <model>
# Pretrainer built with a `loadables` mapping that names the two objects
# defined in this file (spn_predictor and model). `loadables` is the
# constructor argument, so it is nested under the `!new:` tag and its entries
# are nested one level further.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        spn_predictor: !ref <spn_predictor>
        model: !ref <model>