base_config: ./base.yaml  # inherits shared defaults; keys below override base.yaml
task_cls: tasks.tts.fs.FastSpeechTask  # dotted Python path to the task class

# model
hidden_size: 256
dropout: 0.0
encoder_type: rel_fft  # rel_fft|fft|tacotron|tacotron2|conformer
decoder_type: conv  # fft|rnn|conv|conformer|wn

# rnn enc/dec
encoder_K: 8
decoder_rnn_dim: 0  # for rnn decoder, 0 -> hidden_size * 2

# fft enc/dec
enc_layers: 4
enc_ffn_kernel_size: 9
enc_prenet: true
enc_pre_ln: true
dec_layers: 4
dec_ffn_kernel_size: 9
num_heads: 2
ffn_act: gelu
ffn_hidden_size: 1024
use_pos_embed: true

# conv enc/dec
enc_dec_norm: ln
conv_use_pos: false
layers_in_block: 2
enc_dilations: [ 1, 1, 1, 1 ]
enc_kernel_size: 5
enc_post_net_kernel: 3
dec_dilations: [ 1, 1, 1, 1 ]  # for conv decoder
dec_kernel_size: 5
dec_post_net_kernel: 3

# duration
# NOTE(review): -1 presumably means "fall back to hidden_size" — confirm in predictor code
predictor_hidden: -1
predictor_kernel: 5
predictor_layers: 2
dur_predictor_kernel: 3
dur_predictor_layers: 2
predictor_dropout: 0.5

# pitch and energy
use_pitch_embed: false
pitch_type: frame  # frame|ph|cwt
use_uv: true

# reference encoder and speaker embedding
lambda_commit: 0.25
ref_norm_layer: bn
dec_inp_add_noise: false

# mel
# quoted: the value contains ':' characters, so an unquoted plain scalar is fragile
# across parsers; the parsed string value is unchanged by quoting
mel_losses: "l1:0.5|ssim:0.5"  # l1|l2|gdl|ssim or l1:0.5|ssim:0.5

# loss lambda
lambda_f0: 1.0
lambda_uv: 1.0
lambda_energy: 0.1
lambda_ph_dur: 0.1
lambda_sent_dur: 1.0
lambda_word_dur: 1.0
predictor_grad: 0.1

# train and eval
warmup_updates: 4000
max_tokens: 40000
max_sentences: 128
max_valid_sentences: 1
max_updates: 160000
use_gt_dur: false
use_gt_f0: false
ds_workers: 2