therealvul committed on
Commit 6f69c14 · 1 Parent(s): fd247fe

Upload config.yml

Files changed (1)
  1. Twilight0/config.yml +118 -0
Twilight0/config.yml ADDED
@@ -0,0 +1,118 @@
+ log_dir: "Models/Twilight0"
+ first_stage_path: "epoch_1st_00066.pth"
+ save_freq: 1
+ log_interval: 10
+ device: "cuda"
+ epochs_1st: 200 # number of epochs for first stage training (pre-training)
+ epochs_2nd: 100 # number of epochs for second stage training (joint training)
+ batch_size: 2
+ segmented_batch_size: [3, 2, 2]
+ max_len: 175 # maximum number of frames
+ pretrained_model: "Models/Twilight0/epoch_2nd_00006.pth.bak"
+ #pretrained_model: "Models/Twilight0/epoch_1st_00067.pth"
+ second_stage_load_pretrained: true # set to true if the pre-trained model is for the 2nd stage
+ load_only_params: false # set to true if you do not want to load epoch numbers and optimizer parameters
+
+ F0_path: "Utils/JDC/bst.t7"
+ ASR_config: "Utils/ASR/config.yml"
+ ASR_path: "Utils/ASR/epoch_00080.pth"
+ PLBERT_dir: 'Utils/PLBERT/'
+
+ data_params:
+   train_data: "Data/train_list_small.txt"
+   val_data: "Data/val_list_small.txt"
+   root_path: "twilight_data"
+   OOD_data: "Data/OOD_texts.txt"
+   min_length: 50 # sample until texts of at least this length are obtained for OOD texts
+
+ preprocess_params:
+   sr: 24000
+   spect_params:
+     n_fft: 2048
+     win_length: 1200
+     hop_length: 300
+
+ model_params:
+   multispeaker: false
+
+   dim_in: 64
+   hidden_dim: 512
+   max_conv_dim: 512
+   n_layer: 3
+   n_mels: 80
+
+   n_token: 178 # number of phoneme tokens
+   max_dur: 50 # maximum duration of a single phoneme
+   style_dim: 128 # style vector size
+
+   dropout: 0.2
+
+   # config for decoder
+   decoder:
+     type: 'istftnet' # either hifigan or istftnet
+     resblock_kernel_sizes: [3, 7, 11]
+     upsample_rates: [10, 6]
+     upsample_initial_channel: 512
+     resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+     upsample_kernel_sizes: [20, 12]
+     gen_istft_n_fft: 20
+     gen_istft_hop_size: 5
+
+   # speech language model config
+   slm:
+     model: 'microsoft/wavlm-base-plus'
+     sr: 16000 # sampling rate of SLM
+     hidden: 768 # hidden size of SLM
+     nlayers: 13 # number of layers of SLM
+     initial_channel: 64 # initial channels of SLM discriminator head
+
+   # style diffusion model config
+   diffusion:
+     embedding_mask_proba: 0.1
+     # transformer config
+     transformer:
+       num_layers: 3
+       num_heads: 8
+       head_features: 64
+       multiplier: 2
+
+     # diffusion distribution config
+     dist:
+       sigma_data: 0.681720565168225 # placeholder; only used when estimate_sigma_data is set to false
+       estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
+       mean: -3.0
+       std: 1.0
+
+ loss_params:
+   lambda_mel: 5. # mel reconstruction loss
+   lambda_gen: 1. # generator loss
+   lambda_slm: 1. # SLM feature matching loss
+
+   lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
+   lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
+   TMA_epoch: 50 # TMA starting epoch (1st stage)
+
+   lambda_F0: 1. # F0 reconstruction loss (2nd stage)
+   lambda_norm: 1. # norm reconstruction loss (2nd stage)
+   lambda_dur: 1. # duration loss (2nd stage)
+   lambda_ce: 20. # duration predictor probability output CE loss (2nd stage)
+   lambda_sty: 1. # style reconstruction loss (2nd stage)
+   lambda_diff: 1. # score matching loss (2nd stage)
+
+   diff_epoch: 8 # style diffusion starting epoch (2nd stage)
+   joint_epoch: 9 # joint training starting epoch (2nd stage)
+
+ optimizer_params:
+   lr: 0.0001 # general learning rate
+   bert_lr: 0.00001 # learning rate for PLBERT
+   ft_lr: 0.0001 # learning rate for acoustic modules
+
+ slmadv_params:
+   min_len: 100 # minimum length of samples
+   max_len: 500 # maximum length of samples
+   batch_percentage: 0.5 # to prevent out-of-memory errors, use only half of the original batch size
+   iter: 10 # update the discriminator every this many generator updates
+   thresh: 5 # gradient norm above which the gradient is scaled
+   scale: 0.01 # gradient scaling factor for predictors from SLM discriminators
+   sig: 1.5 # sigma for differentiable duration modeling
+
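
For reference, the values above can be loaded and sanity-checked with a short Python snippet. This is a minimal sketch assuming PyYAML is installed; the file path, variable names, and printout are illustrative, not part of the training code in this repo.

    # Sketch: load the uploaded config and derive a few timing quantities.
    import yaml

    with open("Twilight0/config.yml") as f:
        config = yaml.safe_load(f)

    sr = config["preprocess_params"]["sr"]                            # 24000 Hz
    hop = config["preprocess_params"]["spect_params"]["hop_length"]   # 300 samples
    max_len = config["max_len"]                                       # 175 frames

    # Each mel frame advances hop/sr seconds: 300 / 24000 = 12.5 ms,
    # so max_len caps training segments at 175 * 12.5 ms = ~2.19 s of audio.
    frame_ms = 1000 * hop / sr
    print(f"frame hop: {frame_ms:.1f} ms, max segment: {max_len * frame_ms / 1000:.2f} s")

    # 2nd-stage schedule in this config: style diffusion loss starts at
    # epoch 8 and joint training at epoch 9, out of epochs_2nd = 100.
    assert config["loss_params"]["diff_epoch"] < config["loss_params"]["joint_epoch"]

Note that this config resumes second-stage training (pretrained_model points to an epoch_2nd checkpoint with second_stage_load_pretrained: true), which is why the first-stage TMA_epoch setting has no effect on this run.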