not-lain committed on
Commit
45908a9
1 Parent(s): 9669bb6

Push model using huggingface_hub.

Browse files
Files changed (3) hide show
  1. README.md +11 -0
  2. config.json +203 -0
  3. model.safetensors +3 -0
README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: styletts2
3
+ tags:
4
+ - model_hub_mixin
5
+ - pytorch_model_hub_mixin
6
+ - text-to-speech
7
+ ---
8
+
9
+ This model has been pushed to the Hub using the [PyTorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration:
10
+ - Library: https://github.com/korakoe/StyleTTS2lib.git
11
+ - Docs: [More Information Needed]
config.json ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "ASR_config": {
3
+ "batch_size": 64,
4
+ "dataset_params": {
5
+ "data_augmentation": false
6
+ },
7
+ "device": "cuda",
8
+ "epochs": 180,
9
+ "log_dir": "logs/20201006",
10
+ "model_params": {
11
+ "hidden_dim": 256,
12
+ "input_dim": 80,
13
+ "n_token": 178,
14
+ "token_embedding_dim": 512
15
+ },
16
+ "optimizer_params": {
17
+ "lr": 0.0005
18
+ },
19
+ "preprocess_parasm": {
20
+ "mel_params": {
21
+ "n_mels": 80
22
+ },
23
+ "spect_params": {
24
+ "hop_length": 300,
25
+ "n_fft": 2048,
26
+ "win_length": 1200
27
+ },
28
+ "sr": 24000
29
+ },
30
+ "pretrained_model": "",
31
+ "save_freq": 5,
32
+ "train_data": "ASRDataset/train_list.txt",
33
+ "val_data": "ASRDataset/val_list.txt"
34
+ },
35
+ "BERT_CONFIG": {
36
+ "batch_size": 32,
37
+ "data_folder": "wikipedia_20220301.en.processed",
38
+ "dataset_params": {
39
+ "max_mel_length": 512,
40
+ "phoneme_mask_prob": 0.1,
41
+ "replace_prob": 0.2,
42
+ "token_maps": "token_maps.pkl",
43
+ "token_mask": "M",
44
+ "token_separator": " ",
45
+ "tokenizer": "bert-base-multilingual-cased",
46
+ "word_mask_prob": 0.15,
47
+ "word_separator": 102
48
+ },
49
+ "log_dir": "Checkpoint_all_phonemes",
50
+ "log_interval": 10,
51
+ "mixed_precision": "fp16",
52
+ "model_params": {
53
+ "dropout": 0.1,
54
+ "hidden_size": 768,
55
+ "intermediate_size": 2048,
56
+ "max_position_embeddings": 512,
57
+ "num_attention_heads": 12,
58
+ "num_hidden_layers": 12,
59
+ "vocab_size": 178
60
+ },
61
+ "num_process": 1,
62
+ "num_steps": 2000000,
63
+ "save_interval": 20000
64
+ },
65
+ "LIBRI_TTS_CONFIG": {
66
+ "ASR_config": "Utils/ASR/config.yml",
67
+ "ASR_path": "Utils/ASR/epoch_00080.pth",
68
+ "F0_path": "Utils/JDC/bst.t7",
69
+ "PLBERT_dir": "Utils/PLBERT/",
70
+ "batch_size": 8,
71
+ "data_params": {
72
+ "OOD_data": "Data/OOD_texts.txt",
73
+ "min_length": 50,
74
+ "root_path": "",
75
+ "train_data": "Data/train_list.txt",
76
+ "val_data": "Data/val_list.txt"
77
+ },
78
+ "device": "cuda",
79
+ "epochs_1st": 40,
80
+ "epochs_2nd": 25,
81
+ "first_stage_path": "first_stage.pth",
82
+ "load_only_params": false,
83
+ "log_dir": "Models/LibriTTS",
84
+ "log_interval": 10,
85
+ "loss_params": {
86
+ "TMA_epoch": 4,
87
+ "diff_epoch": 0,
88
+ "joint_epoch": 0,
89
+ "lambda_F0": 1.0,
90
+ "lambda_ce": 20.0,
91
+ "lambda_diff": 1.0,
92
+ "lambda_dur": 1.0,
93
+ "lambda_gen": 1.0,
94
+ "lambda_mel": 5.0,
95
+ "lambda_mono": 1.0,
96
+ "lambda_norm": 1.0,
97
+ "lambda_s2s": 1.0,
98
+ "lambda_slm": 1.0,
99
+ "lambda_sty": 1.0
100
+ },
101
+ "max_len": 300,
102
+ "model_params": {
103
+ "decoder": {
104
+ "resblock_dilation_sizes": [
105
+ [
106
+ 1,
107
+ 3,
108
+ 5
109
+ ],
110
+ [
111
+ 1,
112
+ 3,
113
+ 5
114
+ ],
115
+ [
116
+ 1,
117
+ 3,
118
+ 5
119
+ ]
120
+ ],
121
+ "resblock_kernel_sizes": [
122
+ 3,
123
+ 7,
124
+ 11
125
+ ],
126
+ "type": "hifigan",
127
+ "upsample_initial_channel": 512,
128
+ "upsample_kernel_sizes": [
129
+ 20,
130
+ 10,
131
+ 6,
132
+ 4
133
+ ],
134
+ "upsample_rates": [
135
+ 10,
136
+ 5,
137
+ 3,
138
+ 2
139
+ ]
140
+ },
141
+ "diffusion": {
142
+ "dist": {
143
+ "estimate_sigma_data": true,
144
+ "mean": -3.0,
145
+ "sigma_data": 0.19926648961191362,
146
+ "std": 1.0
147
+ },
148
+ "embedding_mask_proba": 0.1,
149
+ "transformer": {
150
+ "head_features": 64,
151
+ "multiplier": 2,
152
+ "num_heads": 8,
153
+ "num_layers": 3
154
+ }
155
+ },
156
+ "dim_in": 64,
157
+ "dropout": 0.2,
158
+ "hidden_dim": 512,
159
+ "max_conv_dim": 512,
160
+ "max_dur": 50,
161
+ "multispeaker": true,
162
+ "n_layer": 3,
163
+ "n_mels": 80,
164
+ "n_token": 178,
165
+ "slm": {
166
+ "hidden": 768,
167
+ "initial_channel": 64,
168
+ "model": "microsoft/wavlm-base-plus",
169
+ "nlayers": 13,
170
+ "sr": 16000
171
+ },
172
+ "style_dim": 128
173
+ },
174
+ "optimizer_params": {
175
+ "bert_lr": 1e-05,
176
+ "ft_lr": 1e-05,
177
+ "lr": 0.0001
178
+ },
179
+ "preprocess_params": {
180
+ "spect_params": {
181
+ "hop_length": 300,
182
+ "n_fft": 2048,
183
+ "win_length": 1200
184
+ },
185
+ "sr": 24000
186
+ },
187
+ "pretrained_model": "Models/LibriTTS/epoch_2nd_00002.pth",
188
+ "save_freq": 1,
189
+ "second_stage_load_pretrained": true,
190
+ "slmadv_params": {
191
+ "batch_percentage": 0.5,
192
+ "iter": 20,
193
+ "max_len": 500,
194
+ "min_len": 400,
195
+ "scale": 0.01,
196
+ "sig": 1.5,
197
+ "thresh": 5
198
+ }
199
+ },
200
+ "config_path": null,
201
+ "model_checkpoint_path": null,
202
+ "phoneme_converter": "gruut"
203
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59e8062d4ccdae479bb85a38d8c77be4c3bc533329a0b2c7780e6ad9d1689a20
3
+ size 77676036