TheStinger committed on
Commit 82d966b
1 Parent(s): aa7549b

Upload 9 files

configs/__pycache__/config.cpython-39.pyc ADDED
Binary file (5.69 kB)
 
configs/config.json ADDED
@@ -0,0 +1 @@
+ {"pth_path": "assets/weights/kikiV1.pth", "index_path": "logs/kikiV1.index", "sg_input_device": "VoiceMeeter Output (VB-Audio Vo (MME)", "sg_output_device": "VoiceMeeter Input (VB-Audio Voi (MME)", "threhold": -45.0, "pitch": 2.0, "rms_mix_rate": 0.0, "index_rate": 0.0, "block_time": 0.52, "crossfade_length": 0.15, "extra_time": 2.46, "n_cpu": 6.0, "use_jit": false, "f0method": "rmvpe"}
configs/config.py ADDED
@@ -0,0 +1,251 @@
+ import argparse
+ import os
+ import sys
+ import json
+ from multiprocessing import cpu_count
+
+ import torch
+
+ try:
+     import intel_extension_for_pytorch as ipex  # pylint: disable=import-error, unused-import
+
+     if torch.xpu.is_available():
+         from infer.modules.ipex import ipex_init
+
+         ipex_init()
+ except Exception:  # pylint: disable=broad-exception-caught
+     pass
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+
+ version_config_list = [
+     "v1/32k.json",
+     "v1/40k.json",
+     "v1/48k.json",
+     "v2/48k.json",
+     "v2/32k.json",
+ ]
+
+
+ def singleton_variable(func):
+     def wrapper(*args, **kwargs):
+         if not wrapper.instance:
+             wrapper.instance = func(*args, **kwargs)
+         return wrapper.instance
+
+     wrapper.instance = None
+     return wrapper
+
+
+ @singleton_variable
+ class Config:
+     def __init__(self):
+         self.device = "cuda:0"
+         self.is_half = True
+         self.use_jit = False
+         self.n_cpu = 0
+         self.gpu_name = None
+         self.json_config = self.load_config_json()
+         self.gpu_mem = None
+         (
+             self.python_cmd,
+             self.listen_port,
+             self.iscolab,
+             self.noparallel,
+             self.noautoopen,
+             self.dml,
+         ) = self.arg_parse()
+         self.instead = ""
+         self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
+
+     @staticmethod
+     def load_config_json() -> dict:
+         d = {}
+         for config_file in version_config_list:
+             with open(f"configs/{config_file}", "r") as f:
+                 d[config_file] = json.load(f)
+         return d
+
+     @staticmethod
+     def arg_parse() -> tuple:
+         exe = sys.executable or "python"
+         parser = argparse.ArgumentParser()
+         parser.add_argument("--port", type=int, default=7865, help="Listen port")
+         parser.add_argument("--pycmd", type=str, default=exe, help="Python command")
+         parser.add_argument("--colab", action="store_true", help="Launch in colab")
+         parser.add_argument(
+             "--noparallel", action="store_true", help="Disable parallel processing"
+         )
+         parser.add_argument(
+             "--noautoopen",
+             action="store_true",
+             help="Do not open in browser automatically",
+         )
+         parser.add_argument(
+             "--dml",
+             action="store_true",
+             help="torch_dml",
+         )
+         cmd_opts = parser.parse_args()
+
+         cmd_opts.port = cmd_opts.port if 0 <= cmd_opts.port <= 65535 else 7865
+
+         return (
+             cmd_opts.pycmd,
+             cmd_opts.port,
+             cmd_opts.colab,
+             cmd_opts.noparallel,
+             cmd_opts.noautoopen,
+             cmd_opts.dml,
+         )
+
+     # has_mps is only available in nightly pytorch (for now) and macOS 12.3+.
+     # check `getattr` and try it for compatibility
+     @staticmethod
+     def has_mps() -> bool:
+         if not torch.backends.mps.is_available():
+             return False
+         try:
+             torch.zeros(1).to(torch.device("mps"))
+             return True
+         except Exception:
+             return False
+
+     @staticmethod
+     def has_xpu() -> bool:
+         if hasattr(torch, "xpu") and torch.xpu.is_available():
+             return True
+         else:
+             return False
+
+     def use_fp32_config(self):
+         for config_file in version_config_list:
+             self.json_config[config_file]["train"]["fp16_run"] = False
+             with open(f"configs/{config_file}", "r") as f:
+                 strr = f.read().replace("true", "false")
+             with open(f"configs/{config_file}", "w") as f:
+                 f.write(strr)
+         with open("infer/modules/train/preprocess.py", "r") as f:
+             strr = f.read().replace("3.7", "3.0")
+         with open("infer/modules/train/preprocess.py", "w") as f:
+             f.write(strr)
+         print("overwrite preprocess and configs.json")
+
+     def device_config(self) -> tuple:
+         if torch.cuda.is_available():
+             if self.has_xpu():
+                 self.device = self.instead = "xpu:0"
+                 self.is_half = True
+             i_device = int(self.device.split(":")[-1])
+             self.gpu_name = torch.cuda.get_device_name(i_device)
+             if (
+                 ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
+                 or "P40" in self.gpu_name.upper()
+                 or "P10" in self.gpu_name.upper()
+                 or "1060" in self.gpu_name
+                 or "1070" in self.gpu_name
+                 or "1080" in self.gpu_name
+             ):
+                 logger.info("Found GPU %s, force to fp32", self.gpu_name)
+                 self.is_half = False
+                 self.use_fp32_config()
+             else:
+                 logger.info("Found GPU %s", self.gpu_name)
+             self.gpu_mem = int(
+                 torch.cuda.get_device_properties(i_device).total_memory
+                 / 1024
+                 / 1024
+                 / 1024
+                 + 0.4
+             )
+             if self.gpu_mem <= 4:
+                 with open("infer/modules/train/preprocess.py", "r") as f:
+                     strr = f.read().replace("3.7", "3.0")
+                 with open("infer/modules/train/preprocess.py", "w") as f:
+                     f.write(strr)
+         elif self.has_mps():
+             logger.info("No supported Nvidia GPU found")
+             self.device = self.instead = "mps"
+             self.is_half = False
+             self.use_fp32_config()
+         else:
+             logger.info("No supported Nvidia GPU found")
+             self.device = self.instead = "cpu"
+             self.is_half = False
+             self.use_fp32_config()
+
+         if self.n_cpu == 0:
+             self.n_cpu = cpu_count()
+
+         if self.is_half:
+             # Config for 6 GB of VRAM
+             x_pad = 3
+             x_query = 10
+             x_center = 60
+             x_max = 65
+         else:
+             # Config for 5 GB of VRAM
+             x_pad = 1
+             x_query = 6
+             x_center = 38
+             x_max = 41
+
+         if self.gpu_mem is not None and self.gpu_mem <= 4:
+             x_pad = 1
+             x_query = 5
+             x_center = 30
+             x_max = 32
+         if self.dml:
+             logger.info("Use DirectML instead")
+             if (
+                 os.path.exists(
+                     r"runtime\Lib\site-packages\onnxruntime\capi\DirectML.dll"
+                 )
+                 == False
+             ):
+                 try:
+                     os.rename(
+                         r"runtime\Lib\site-packages\onnxruntime",
+                         r"runtime\Lib\site-packages\onnxruntime-cuda",
+                     )
+                 except OSError:
+                     pass
+                 try:
+                     os.rename(
+                         r"runtime\Lib\site-packages\onnxruntime-dml",
+                         r"runtime\Lib\site-packages\onnxruntime",
+                     )
+                 except OSError:
+                     pass
+             # if self.device != "cpu":
+             import torch_directml
+
+             self.device = torch_directml.device(torch_directml.default_device())
+             self.is_half = False
+         else:
+             if self.instead:
+                 logger.info(f"Use {self.instead} instead")
+             if (
+                 os.path.exists(
+                     r"runtime\Lib\site-packages\onnxruntime\capi\onnxruntime_providers_cuda.dll"
+                 )
+                 == False
+             ):
+                 try:
+                     os.rename(
+                         r"runtime\Lib\site-packages\onnxruntime",
+                         r"runtime\Lib\site-packages\onnxruntime-dml",
+                     )
+                 except OSError:
+                     pass
+                 try:
+                     os.rename(
+                         r"runtime\Lib\site-packages\onnxruntime-cuda",
+                         r"runtime\Lib\site-packages\onnxruntime",
+                     )
+                 except OSError:
+                     pass
+         print("is_half:%s, device:%s" % (self.is_half, self.device))
+         return x_pad, x_query, x_center, x_max
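
For orientation, a minimal usage sketch of the Config singleton added above. This snippet is illustrative and not part of the commit; it assumes the repository root is the current working directory and that configs/ is importable as a package, so the JSON presets below can be found.

from configs.config import Config

config = Config()   # parses CLI flags, probes the GPU, loads the JSON presets below
again = Config()    # @singleton_variable returns the cached instance on repeat calls
assert config is again

print(config.device)    # e.g. "cuda:0", "xpu:0", "mps", or "cpu"
print(config.is_half)   # False whenever use_fp32_config() was triggered
print(config.json_config["v1/40k.json"]["data"]["sampling_rate"])  # 40000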
configs/v1/32k.json ADDED
@@ -0,0 +1,46 @@
+ {
+   "train": {
+     "log_interval": 200,
+     "seed": 1234,
+     "epochs": 20000,
+     "learning_rate": 1e-4,
+     "betas": [0.8, 0.99],
+     "eps": 1e-9,
+     "batch_size": 4,
+     "fp16_run": true,
+     "lr_decay": 0.999875,
+     "segment_size": 12800,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0
+   },
+   "data": {
+     "max_wav_value": 32768.0,
+     "sampling_rate": 32000,
+     "filter_length": 1024,
+     "hop_length": 320,
+     "win_length": 1024,
+     "n_mel_channels": 80,
+     "mel_fmin": 0.0,
+     "mel_fmax": null
+   },
+   "model": {
+     "inter_channels": 192,
+     "hidden_channels": 192,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0,
+     "resblock": "1",
+     "resblock_kernel_sizes": [3,7,11],
+     "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+     "upsample_rates": [10,4,2,2,2],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [16,16,4,4,4],
+     "use_spectral_norm": false,
+     "gin_channels": 256,
+     "spk_embed_dim": 109
+   }
+ }
configs/v1/40k.json ADDED
@@ -0,0 +1,46 @@
+ {
+   "train": {
+     "log_interval": 200,
+     "seed": 1234,
+     "epochs": 20000,
+     "learning_rate": 1e-4,
+     "betas": [0.8, 0.99],
+     "eps": 1e-9,
+     "batch_size": 4,
+     "fp16_run": true,
+     "lr_decay": 0.999875,
+     "segment_size": 12800,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0
+   },
+   "data": {
+     "max_wav_value": 32768.0,
+     "sampling_rate": 40000,
+     "filter_length": 2048,
+     "hop_length": 400,
+     "win_length": 2048,
+     "n_mel_channels": 125,
+     "mel_fmin": 0.0,
+     "mel_fmax": null
+   },
+   "model": {
+     "inter_channels": 192,
+     "hidden_channels": 192,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0,
+     "resblock": "1",
+     "resblock_kernel_sizes": [3,7,11],
+     "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+     "upsample_rates": [10,10,2,2],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [16,16,4,4],
+     "use_spectral_norm": false,
+     "gin_channels": 256,
+     "spk_embed_dim": 109
+   }
+ }
configs/v1/48k.json ADDED
@@ -0,0 +1,46 @@
+ {
+   "train": {
+     "log_interval": 200,
+     "seed": 1234,
+     "epochs": 20000,
+     "learning_rate": 1e-4,
+     "betas": [0.8, 0.99],
+     "eps": 1e-9,
+     "batch_size": 4,
+     "fp16_run": true,
+     "lr_decay": 0.999875,
+     "segment_size": 11520,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0
+   },
+   "data": {
+     "max_wav_value": 32768.0,
+     "sampling_rate": 48000,
+     "filter_length": 2048,
+     "hop_length": 480,
+     "win_length": 2048,
+     "n_mel_channels": 128,
+     "mel_fmin": 0.0,
+     "mel_fmax": null
+   },
+   "model": {
+     "inter_channels": 192,
+     "hidden_channels": 192,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0,
+     "resblock": "1",
+     "resblock_kernel_sizes": [3,7,11],
+     "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+     "upsample_rates": [10,6,2,2,2],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [16,16,4,4,4],
+     "use_spectral_norm": false,
+     "gin_channels": 256,
+     "spk_embed_dim": 109
+   }
+ }
configs/v2/32k.json ADDED
@@ -0,0 +1,46 @@
+ {
+   "train": {
+     "log_interval": 50,
+     "seed": 1234,
+     "epochs": 20000,
+     "learning_rate": 1e-4,
+     "betas": [0.8, 0.99],
+     "eps": 1e-9,
+     "batch_size": 4,
+     "fp16_run": true,
+     "lr_decay": 0.99975,
+     "segment_size": 12800,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 100,
+     "c_mel": 45,
+     "c_kl": 1.0
+   },
+   "data": {
+     "max_wav_value": 32768.0,
+     "sampling_rate": 32000,
+     "filter_length": 1024,
+     "hop_length": 320,
+     "win_length": 1024,
+     "n_mel_channels": 80,
+     "mel_fmin": 0.0,
+     "mel_fmax": null
+   },
+   "model": {
+     "inter_channels": 192,
+     "hidden_channels": 192,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0,
+     "resblock": "1",
+     "resblock_kernel_sizes": [3,7,11],
+     "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+     "upsample_rates": [10,8,2,2],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [20,16,4,4],
+     "use_spectral_norm": false,
+     "gin_channels": 256,
+     "spk_embed_dim": 109
+   }
+ }
configs/v2/40k.json ADDED
@@ -0,0 +1,46 @@
+ {
+   "train": {
+     "log_interval": 50,
+     "seed": 1234,
+     "epochs": 20000,
+     "learning_rate": 1e-4,
+     "betas": [0.8, 0.99],
+     "eps": 1e-9,
+     "batch_size": 4,
+     "fp16_run": true,
+     "lr_decay": 0.99975,
+     "segment_size": 12800,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 100,
+     "c_mel": 45,
+     "c_kl": 1.0
+   },
+   "data": {
+     "max_wav_value": 32768.0,
+     "sampling_rate": 40000,
+     "filter_length": 2048,
+     "hop_length": 400,
+     "win_length": 2048,
+     "n_mel_channels": 125,
+     "mel_fmin": 0.0,
+     "mel_fmax": null
+   },
+   "model": {
+     "inter_channels": 192,
+     "hidden_channels": 192,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0,
+     "resblock": "1",
+     "resblock_kernel_sizes": [3,7,11],
+     "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+     "upsample_rates": [10,10,2,2],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [16,16,4,4],
+     "use_spectral_norm": false,
+     "gin_channels": 256,
+     "spk_embed_dim": 109
+   }
+ }
configs/v2/48k.json ADDED
@@ -0,0 +1,46 @@
+ {
+   "train": {
+     "log_interval": 50,
+     "seed": 1234,
+     "epochs": 20000,
+     "learning_rate": 1e-4,
+     "betas": [0.8, 0.99],
+     "eps": 1e-9,
+     "batch_size": 4,
+     "fp16_run": true,
+     "lr_decay": 0.99975,
+     "segment_size": 17280,
+     "init_lr_ratio": 2,
+     "warmup_epochs": 100,
+     "c_mel": 45,
+     "c_kl": 1.0
+   },
+   "data": {
+     "max_wav_value": 32768.0,
+     "sampling_rate": 48000,
+     "filter_length": 2048,
+     "hop_length": 480,
+     "win_length": 2048,
+     "n_mel_channels": 128,
+     "mel_fmin": 0.0,
+     "mel_fmax": null
+   },
+   "model": {
+     "inter_channels": 192,
+     "hidden_channels": 192,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0,
+     "resblock": "1",
+     "resblock_kernel_sizes": [3,7,11],
+     "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+     "upsample_rates": [12,10,2,2],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [24,20,4,4],
+     "use_spectral_norm": false,
+     "gin_channels": 256,
+     "spk_embed_dim": 109
+   }
+ }
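
The six presets above share most model hyperparameters; they differ in sampling rate, STFT settings (filter_length, hop_length, win_length, n_mel_channels), upsampling factors, and the v2 training schedule (log_interval 50, lr_decay 0.99975, warmup_epochs 100). Below is a small illustrative sanity-check sketch, not part of the commit, assuming the repository root as the working directory and the file names from version_config_list in configs/config.py (which, as committed, does not include v2/40k.json).

import json

# Same keys as version_config_list in configs/config.py.
for name in ["v1/32k.json", "v1/40k.json", "v1/48k.json", "v2/48k.json", "v2/32k.json"]:
    with open(f"configs/{name}") as f:
        cfg = json.load(f)
    sr = cfg["data"]["sampling_rate"]
    hop = cfg["data"]["hop_length"]
    # Every preset keeps a 10 ms hop: 320/32000 = 400/40000 = 480/48000 = 0.01 s.
    assert hop / sr == 0.01
    print(name, sr, cfg["train"]["fp16_run"])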