feat: major update
- app.py +84 -68
- config.py +0 -105
- configs/32k.json +0 -46
- configs/40k.json +0 -46
- configs/48k.json +0 -46
- infer_pack/models_onnx_moess.py +0 -849
- {infer_pack → lib/infer_pack}/attentions.py +3 -3
- {infer_pack → lib/infer_pack}/commons.py +0 -0
- {infer_pack → lib/infer_pack}/models.py +6 -6
- {infer_pack → lib/infer_pack}/models_onnx.py +6 -6
- {infer_pack → lib/infer_pack}/modules.py +3 -3
- {infer_pack → lib/infer_pack}/modules/F0Predictor/DioF0Predictor.py +1 -1
- {infer_pack → lib/infer_pack}/modules/F0Predictor/F0Predictor.py +0 -0
- {infer_pack → lib/infer_pack}/modules/F0Predictor/HarvestF0Predictor.py +1 -1
- {infer_pack → lib/infer_pack}/modules/F0Predictor/PMF0Predictor.py +1 -1
- {infer_pack → lib/infer_pack}/modules/F0Predictor/__init__.py +0 -0
- {infer_pack → lib/infer_pack}/onnx_inference.py +4 -3
- {infer_pack → lib/infer_pack}/transforms.py +0 -0
- requirements.txt +12 -17
- vc_infer_pipeline.py +3 -3
app.py
CHANGED

@@ -17,7 +17,7 @@ import io
 import wave
 from datetime import datetime
 from fairseq import checkpoint_utils
-from infer_pack.models import (
+from lib.infer_pack.models import (
     SynthesizerTrnMs256NSFsid,
     SynthesizerTrnMs256NSFsid_nono,
     SynthesizerTrnMs768NSFsid,
@@ -29,14 +29,25 @@ config = Config()
 logging.getLogger("numba").setLevel(logging.WARNING)
 limitation = os.getenv("SYSTEM") == "spaces"
 
-def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index):
+audio_mode = []
+f0method_mode = []
+f0method_info = ""
+if limitation is True:
+    audio_mode = ["Upload audio", "TTS Audio"]
+    f0method_mode = ["pm", "harvest"]
+    f0method_info = "PM is fast, Harvest is good but extremely slow. (Default: PM)"
+else:
+    audio_mode = ["Input path", "Upload audio", "Youtube", "TTS Audio"]
+    f0method_mode = ["pm", "harvest", "crepe"]
+    f0method_info = "PM is fast, Harvest is good but extremely slow, and Crepe effect is good but requires GPU (Default: PM)"
+
+def create_vc_fn(model_title, tgt_sr, net_g, vc, if_f0, version, file_index):
     def vc_fn(
         vc_audio_mode,
         vc_input,
         vc_upload,
         tts_text,
         tts_voice,
-        spk_item,
         f0_up_key,
         f0_method,
         index_rate,
@@ -73,13 +84,14 @@ def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index):
             audio_opt = vc.pipeline(
                 hubert_model,
                 net_g,
-                spk_item,
+                0,
                 audio,
                 vc_input,
                 times,
                 f0_up_key,
                 f0_method,
                 file_index,
+                # file_big_npy,
                 index_rate,
                 if_f0,
                 filter_radius,
@@ -91,7 +103,7 @@ def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index):
                 f0_file=None,
             )
            info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
-            print(info)
+            print(f"{model_title} | {info}")
            return info, (tgt_sr, audio_opt)
        except:
            info = traceback.format_exc()
@@ -99,6 +111,57 @@ def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index):
             return info, (None, None)
     return vc_fn
 
+def load_model():
+    categories = []
+    with open("weights/folder_info.json", "r", encoding="utf-8") as f:
+        folder_info = json.load(f)
+    for category_name, category_info in folder_info.items():
+        if not category_info['enable']:
+            continue
+        category_title = category_info['title']
+        category_folder = category_info['folder_path']
+        description = category_info['description']
+        models = []
+        with open(f"weights/{category_folder}/model_info.json", "r", encoding="utf-8") as f:
+            models_info = json.load(f)
+        for character_name, info in models_info.items():
+            if not info['enable']:
+                continue
+            model_title = info['title']
+            model_name = info['model_path']
+            model_author = info.get("author", None)
+            model_cover = f"weights/{category_folder}/{character_name}/{info['cover']}"
+            model_index = f"weights/{category_folder}/{character_name}/{info['feature_retrieval_library']}"
+            cpt = torch.load(f"weights/{category_folder}/{character_name}/{model_name}", map_location="cpu")
+            tgt_sr = cpt["config"][-1]
+            cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
+            if_f0 = cpt.get("f0", 1)
+            version = cpt.get("version", "v1")
+            if version == "v1":
+                if if_f0 == 1:
+                    net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
+                else:
+                    net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+                model_version = "V1"
+            elif version == "v2":
+                if if_f0 == 1:
+                    net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
+                else:
+                    net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+                model_version = "V2"
+            del net_g.enc_q
+            print(net_g.load_state_dict(cpt["weight"], strict=False))
+            net_g.eval().to(config.device)
+            if config.is_half:
+                net_g = net_g.half()
+            else:
+                net_g = net_g.float()
+            vc = VC(tgt_sr, config)
+            print(f"Model loaded: {character_name} / {info['feature_retrieval_library']} | ({model_version})")
+            models.append((character_name, model_title, model_author, model_cover, model_version, create_vc_fn(model_title, tgt_sr, net_g, vc, if_f0, version, model_index)))
+        categories.append([category_title, category_folder, description, models])
+    return categories
+
 def cut_vocal_and_inst(url, audio_provider, split_model):
     if url != "":
         if not os.path.exists("dl_audio"):
@@ -275,61 +338,15 @@ def change_audio_mode(vc_audio_mode):
 
 if __name__ == '__main__':
     load_hubert()
-    categories = []
+    categories = load_model()
     tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
     voices = [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]
-    with open("weights/folder_info.json", "r", encoding="utf-8") as f:
-        folder_info = json.load(f)
-    for category_name, category_info in folder_info.items():
-        if not category_info['enable']:
-            continue
-        category_title = category_info['title']
-        category_folder = category_info['folder_path']
-        description = category_info['description']
-        models = []
-        with open(f"weights/{category_folder}/model_info.json", "r", encoding="utf-8") as f:
-            models_info = json.load(f)
-        for model_name, info in models_info.items():
-            if not info['enable']:
-                continue
-            model_title = info['title']
-            model_author = info.get("author", None)
-            model_cover = f"weights/{category_folder}/{model_name}/{info['cover']}"
-            model_index = f"weights/{category_folder}/{model_name}/{info['feature_retrieval_library']}"
-            cpt = torch.load(f"weights/{category_folder}/{model_name}/{model_name}.pth", map_location="cpu")
-            tgt_sr = cpt["config"][-1]
-            cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
-            if_f0 = cpt.get("f0", 1)
-            version = cpt.get("version", "v1")
-            if version == "v1":
-                if if_f0 == 1:
-                    net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
-                else:
-                    net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
-                nodel_version = "V1"
-            elif version == "v2":
-                if if_f0 == 1:
-                    net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
-                else:
-                    net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
-                nodel_version = "V2"
-            del net_g.enc_q
-            print(net_g.load_state_dict(cpt["weight"], strict=False))
-            net_g.eval().to(config.device)
-            if config.is_half:
-                net_g = net_g.half()
-            else:
-                net_g = net_g.float()
-            vc = VC(tgt_sr, config)
-            print(f"Model loaded: {model_name}")
-            models.append((model_name, model_title, model_author, model_cover, nodel_version, create_vc_fn(tgt_sr, net_g, vc, if_f0, model_index)))
-        categories.append([category_title, category_folder, description, models])
     with gr.Blocks() as app:
         gr.Markdown(
             "# <center> RVC Genshin Impact Inference\n"
             "### <center> [Recommended to use Google Colab to use more character & more feature](https://colab.research.google.com/drive/110kiMZTdP6Ri1lY9-NbQf17GVPPhHyeT?usp=sharing)\n"
             "#### From [Retrieval-based-Voice-Conversion](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)\n"
-            "[
+            "### [This spaces use Multi Model RVC Inference](https://github.com/ArkanDash/Multi-Model-RVC-Inference)"
         )
         for (folder_title, folder, description, models) in categories:
             with gr.TabItem(folder_title):
@@ -353,7 +370,7 @@ if __name__ == '__main__':
                )
                with gr.Row():
                    with gr.Column():
-                        vc_audio_mode = gr.Dropdown(label="Input voice", choices=
+                        vc_audio_mode = gr.Dropdown(label="Input voice", choices=audio_mode, allow_custom_value=False, value="Upload audio")
                        # Input and Upload
                        vc_input = gr.Textbox(label="Input audio path", visible=False)
                        vc_upload = gr.Audio(label="Upload audio file", visible=True, interactive=True)
@@ -369,22 +386,13 @@ if __name__ == '__main__':
                        tts_text = gr.Textbox(visible=False, label="TTS text", info="Text to speech input")
                        tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female")
                    with gr.Column():
-                        spk_item = gr.Slider(
-                            minimum=0,
-                            maximum=2333,
-                            step=1,
-                            label="Speaker ID",
-                            info="(Default: 0)",
-                            value=0,
-                            interactive=True,
-                        )
                        vc_transform0 = gr.Number(label="Transpose", value=0, info='Type "12" to change from male to female voice. Type "-12" to change female to male voice')
                        f0method0 = gr.Radio(
                            label="Pitch extraction algorithm",
-                            info=
-                            choices=
+                            info=f0method_info,
+                            choices=f0method_mode,
                            value="pm",
-                            interactive=True
+                            interactive=True
                        )
                        index_rate1 = gr.Slider(
                            minimum=0,
@@ -425,7 +433,16 @@ if __name__ == '__main__':
                            maximum=0.5,
                            label="Voice Protection",
                            info="Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy",
-                            value=0.
+                            value=0.4,
+                            step=0.01,
+                            interactive=True,
+                        )
+                        protect0 = gr.Slider(
+                            minimum=0,
+                            maximum=0.5,
+                            label="Voice Protection",
+                            info="Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy",
+                            value=0.5,
                            step=0.01,
                            interactive=True,
                        )
@@ -453,7 +470,6 @@ if __name__ == '__main__':
                        vc_upload,
                        tts_text,
                        tts_voice,
-                        spk_item,
                        vc_transform0,
                        f0method0,
                        index_rate1,
config.py
DELETED

@@ -1,105 +0,0 @@
-import argparse
-import torch
-from multiprocessing import cpu_count
-
-class Config:
-    def __init__(self):
-        self.device = "cuda:0"
-        self.is_half = True
-        self.n_cpu = 0
-        self.gpu_name = None
-        self.gpu_mem = None
-        (
-            self.python_cmd,
-            self.listen_port,
-            self.colab,
-            self.noparallel,
-            self.noautoopen,
-            self.api
-        ) = self.arg_parse()
-        self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
-
-    @staticmethod
-    def arg_parse() -> tuple:
-        parser = argparse.ArgumentParser()
-        parser.add_argument("--port", type=int, default=7865, help="Listen port")
-        parser.add_argument(
-            "--pycmd", type=str, default="python", help="Python command"
-        )
-        parser.add_argument("--colab", action="store_true", help="Launch in colab")
-        parser.add_argument(
-            "--noparallel", action="store_true", help="Disable parallel processing"
-        )
-        parser.add_argument(
-            "--noautoopen",
-            action="store_true",
-            help="Do not open in browser automatically",
-        )
-        parser.add_argument("--api", action="store_true", help="Launch with api")
-        cmd_opts = parser.parse_args()
-
-        cmd_opts.port = cmd_opts.port if 0 <= cmd_opts.port <= 65535 else 7865
-
-        return (
-            cmd_opts.pycmd,
-            cmd_opts.port,
-            cmd_opts.colab,
-            cmd_opts.noparallel,
-            cmd_opts.noautoopen,
-            cmd_opts.api
-        )
-
-    def device_config(self) -> tuple:
-        if torch.cuda.is_available():
-            i_device = int(self.device.split(":")[-1])
-            self.gpu_name = torch.cuda.get_device_name(i_device)
-            if (
-                ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
-                or "P40" in self.gpu_name.upper()
-                or "1060" in self.gpu_name
-                or "1070" in self.gpu_name
-                or "1080" in self.gpu_name
-            ):
-                print("16-series/10-series GPUs and the P40 are forced to single precision")
-                self.is_half = False
-            else:
-                self.gpu_name = None
-            self.gpu_mem = int(
-                torch.cuda.get_device_properties(i_device).total_memory
-                / 1024
-                / 1024
-                / 1024
-                + 0.4
-            )
-        elif torch.backends.mps.is_available():
-            print("No supported NVIDIA GPU found, using MPS for inference")
-            self.device = "mps"
-            self.is_half = False
-        else:
-            print("No supported NVIDIA GPU found, using CPU for inference")
-            self.device = "cpu"
-            self.is_half = False
-
-        if self.n_cpu == 0:
-            self.n_cpu = cpu_count()
-
-        if self.is_half:
-            # 6 GB VRAM configuration
-            x_pad = 3
-            x_query = 10
-            x_center = 60
-            x_max = 65
-        else:
-            # 5 GB VRAM configuration
-            x_pad = 1
-            x_query = 6
-            x_center = 38
-            x_max = 41
-
-        if self.gpu_mem != None and self.gpu_mem <= 4:
-            x_pad = 1
-            x_query = 5
-            x_center = 30
-            x_max = 32
-
-        return x_pad, x_query, x_center, x_max
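The deleted Config class did two jobs: parse CLI flags and pick a device/precision. A minimal standalone sketch of just the device-selection logic it implemented, with the same decision order (CUDA, then Apple MPS, then CPU, and fp16 disabled on 16-series/10-series/P40 cards):

import torch

def pick_device_and_precision():
    # Same order as the deleted Config.device_config(): CUDA first,
    # then MPS, then CPU; half precision only on GPUs that support it.
    if torch.cuda.is_available():
        name = torch.cuda.get_device_name(0)
        force_fp32 = (
            ("16" in name and "V100" not in name.upper())
            or "P40" in name.upper()
            or any(x in name for x in ("1060", "1070", "1080"))
        )
        return "cuda:0", not force_fp32
    if torch.backends.mps.is_available():
        return "mps", False
    return "cpu", False

device, is_half = pick_device_and_precision()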
configs/32k.json
DELETED

@@ -1,46 +0,0 @@
-{
-  "train": {
-    "log_interval": 200,
-    "seed": 1234,
-    "epochs": 20000,
-    "learning_rate": 1e-4,
-    "betas": [0.8, 0.99],
-    "eps": 1e-9,
-    "batch_size": 4,
-    "fp16_run": true,
-    "lr_decay": 0.999875,
-    "segment_size": 12800,
-    "init_lr_ratio": 1,
-    "warmup_epochs": 0,
-    "c_mel": 45,
-    "c_kl": 1.0
-  },
-  "data": {
-    "max_wav_value": 32768.0,
-    "sampling_rate": 32000,
-    "filter_length": 1024,
-    "hop_length": 320,
-    "win_length": 1024,
-    "n_mel_channels": 80,
-    "mel_fmin": 0.0,
-    "mel_fmax": null
-  },
-  "model": {
-    "inter_channels": 192,
-    "hidden_channels": 192,
-    "filter_channels": 768,
-    "n_heads": 2,
-    "n_layers": 6,
-    "kernel_size": 3,
-    "p_dropout": 0,
-    "resblock": "1",
-    "resblock_kernel_sizes": [3,7,11],
-    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
-    "upsample_rates": [10,4,2,2,2],
-    "upsample_initial_channel": 512,
-    "upsample_kernel_sizes": [16,16,4,4,4],
-    "use_spectral_norm": false,
-    "gin_channels": 256,
-    "spk_embed_dim": 109
-  }
-}
configs/40k.json
DELETED

@@ -1,46 +0,0 @@
-{
-  "train": {
-    "log_interval": 200,
-    "seed": 1234,
-    "epochs": 20000,
-    "learning_rate": 1e-4,
-    "betas": [0.8, 0.99],
-    "eps": 1e-9,
-    "batch_size": 4,
-    "fp16_run": true,
-    "lr_decay": 0.999875,
-    "segment_size": 12800,
-    "init_lr_ratio": 1,
-    "warmup_epochs": 0,
-    "c_mel": 45,
-    "c_kl": 1.0
-  },
-  "data": {
-    "max_wav_value": 32768.0,
-    "sampling_rate": 40000,
-    "filter_length": 2048,
-    "hop_length": 400,
-    "win_length": 2048,
-    "n_mel_channels": 125,
-    "mel_fmin": 0.0,
-    "mel_fmax": null
-  },
-  "model": {
-    "inter_channels": 192,
-    "hidden_channels": 192,
-    "filter_channels": 768,
-    "n_heads": 2,
-    "n_layers": 6,
-    "kernel_size": 3,
-    "p_dropout": 0,
-    "resblock": "1",
-    "resblock_kernel_sizes": [3,7,11],
-    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
-    "upsample_rates": [10,10,2,2],
-    "upsample_initial_channel": 512,
-    "upsample_kernel_sizes": [16,16,4,4],
-    "use_spectral_norm": false,
-    "gin_channels": 256,
-    "spk_embed_dim": 109
-  }
-}
configs/48k.json
DELETED

@@ -1,46 +0,0 @@
-{
-  "train": {
-    "log_interval": 200,
-    "seed": 1234,
-    "epochs": 20000,
-    "learning_rate": 1e-4,
-    "betas": [0.8, 0.99],
-    "eps": 1e-9,
-    "batch_size": 4,
-    "fp16_run": true,
-    "lr_decay": 0.999875,
-    "segment_size": 11520,
-    "init_lr_ratio": 1,
-    "warmup_epochs": 0,
-    "c_mel": 45,
-    "c_kl": 1.0
-  },
-  "data": {
-    "max_wav_value": 32768.0,
-    "sampling_rate": 48000,
-    "filter_length": 2048,
-    "hop_length": 480,
-    "win_length": 2048,
-    "n_mel_channels": 128,
-    "mel_fmin": 0.0,
-    "mel_fmax": null
-  },
-  "model": {
-    "inter_channels": 192,
-    "hidden_channels": 192,
-    "filter_channels": 768,
-    "n_heads": 2,
-    "n_layers": 6,
-    "kernel_size": 3,
-    "p_dropout": 0,
-    "resblock": "1",
-    "resblock_kernel_sizes": [3,7,11],
-    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
-    "upsample_rates": [10,6,2,2,2],
-    "upsample_initial_channel": 512,
-    "upsample_kernel_sizes": [16,16,4,4,4],
-    "use_spectral_norm": false,
-    "gin_channels": 256,
-    "spk_embed_dim": 109
-  }
-}
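One property ties the three deleted configs together: the product of model.upsample_rates must equal data.hop_length, because the generator upsamples one frame of features into one hop of audio; all three configs therefore run at 100 frames per second. A quick sanity check, with the values copied from the JSON above:

import math

# (sampling_rate, hop_length, upsample_rates) from the deleted configs
configs = {
    "32k": (32000, 320, [10, 4, 2, 2, 2]),
    "40k": (40000, 400, [10, 10, 2, 2]),
    "48k": (48000, 480, [10, 6, 2, 2, 2]),
}

for name, (sr, hop, ups) in configs.items():
    assert math.prod(ups) == hop  # 320, 400, 480 respectively
    print(name, sr // hop, "frames per second")  # 100 for all three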
infer_pack/models_onnx_moess.py
DELETED

@@ -1,849 +0,0 @@
-import math, pdb, os
-from time import time as ttime
-import torch
-from torch import nn
-from torch.nn import functional as F
-from infer_pack import modules
-from infer_pack import attentions
-from infer_pack import commons
-from infer_pack.commons import init_weights, get_padding
-from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
-from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
-from infer_pack.commons import init_weights
-import numpy as np
-from infer_pack import commons
-
-
-class TextEncoder256(nn.Module):
-    def __init__(
-        self,
-        out_channels,
-        hidden_channels,
-        filter_channels,
-        n_heads,
-        n_layers,
-        kernel_size,
-        p_dropout,
-        f0=True,
-    ):
-        super().__init__()
-        self.out_channels = out_channels
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.emb_phone = nn.Linear(256, hidden_channels)
-        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
-        if f0 == True:
-            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
-        self.encoder = attentions.Encoder(
-            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
-        )
-        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-
-    def forward(self, phone, pitch, lengths):
-        if pitch == None:
-            x = self.emb_phone(phone)
-        else:
-            x = self.emb_phone(phone) + self.emb_pitch(pitch)
-        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
-        x = self.lrelu(x)
-        x = torch.transpose(x, 1, -1)  # [b, h, t]
-        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
-            x.dtype
-        )
-        x = self.encoder(x * x_mask, x_mask)
-        stats = self.proj(x) * x_mask
-
-        m, logs = torch.split(stats, self.out_channels, dim=1)
-        return m, logs, x_mask
-
-
-class TextEncoder256Sim(nn.Module):
-    def __init__(
-        self,
-        out_channels,
-        hidden_channels,
-        filter_channels,
-        n_heads,
-        n_layers,
-        kernel_size,
-        p_dropout,
-        f0=True,
-    ):
-        super().__init__()
-        self.out_channels = out_channels
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.emb_phone = nn.Linear(256, hidden_channels)
-        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
-        if f0 == True:
-            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
-        self.encoder = attentions.Encoder(
-            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
-        )
-        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
-
-    def forward(self, phone, pitch, lengths):
-        if pitch == None:
-            x = self.emb_phone(phone)
-        else:
-            x = self.emb_phone(phone) + self.emb_pitch(pitch)
-        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
-        x = self.lrelu(x)
-        x = torch.transpose(x, 1, -1)  # [b, h, t]
-        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
-            x.dtype
-        )
-        x = self.encoder(x * x_mask, x_mask)
-        x = self.proj(x) * x_mask
-        return x, x_mask
-
-
-class ResidualCouplingBlock(nn.Module):
-    def __init__(
-        self,
-        channels,
-        hidden_channels,
-        kernel_size,
-        dilation_rate,
-        n_layers,
-        n_flows=4,
-        gin_channels=0,
-    ):
-        super().__init__()
-        self.channels = channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.dilation_rate = dilation_rate
-        self.n_layers = n_layers
-        self.n_flows = n_flows
-        self.gin_channels = gin_channels
-
-        self.flows = nn.ModuleList()
-        for i in range(n_flows):
-            self.flows.append(
-                modules.ResidualCouplingLayer(
-                    channels,
-                    hidden_channels,
-                    kernel_size,
-                    dilation_rate,
-                    n_layers,
-                    gin_channels=gin_channels,
-                    mean_only=True,
-                )
-            )
-            self.flows.append(modules.Flip())
-
-    def forward(self, x, x_mask, g=None, reverse=False):
-        if not reverse:
-            for flow in self.flows:
-                x, _ = flow(x, x_mask, g=g, reverse=reverse)
-        else:
-            for flow in reversed(self.flows):
-                x = flow(x, x_mask, g=g, reverse=reverse)
-        return x
-
-    def remove_weight_norm(self):
-        for i in range(self.n_flows):
-            self.flows[i * 2].remove_weight_norm()
-
-
-class PosteriorEncoder(nn.Module):
-    def __init__(
-        self,
-        in_channels,
-        out_channels,
-        hidden_channels,
-        kernel_size,
-        dilation_rate,
-        n_layers,
-        gin_channels=0,
-    ):
-        super().__init__()
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.hidden_channels = hidden_channels
-        self.kernel_size = kernel_size
-        self.dilation_rate = dilation_rate
-        self.n_layers = n_layers
-        self.gin_channels = gin_channels
-
-        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
-        self.enc = modules.WN(
-            hidden_channels,
-            kernel_size,
-            dilation_rate,
-            n_layers,
-            gin_channels=gin_channels,
-        )
-        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-
-    def forward(self, x, x_lengths, g=None):
-        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
-            x.dtype
-        )
-        x = self.pre(x) * x_mask
-        x = self.enc(x, x_mask, g=g)
-        stats = self.proj(x) * x_mask
-        m, logs = torch.split(stats, self.out_channels, dim=1)
-        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
-        return z, m, logs, x_mask
-
-    def remove_weight_norm(self):
-        self.enc.remove_weight_norm()
-
-
-class Generator(torch.nn.Module):
-    def __init__(
-        self,
-        initial_channel,
-        resblock,
-        resblock_kernel_sizes,
-        resblock_dilation_sizes,
-        upsample_rates,
-        upsample_initial_channel,
-        upsample_kernel_sizes,
-        gin_channels=0,
-    ):
-        super(Generator, self).__init__()
-        self.num_kernels = len(resblock_kernel_sizes)
-        self.num_upsamples = len(upsample_rates)
-        self.conv_pre = Conv1d(
-            initial_channel, upsample_initial_channel, 7, 1, padding=3
-        )
-        resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
-
-        self.ups = nn.ModuleList()
-        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
-            self.ups.append(
-                weight_norm(
-                    ConvTranspose1d(
-                        upsample_initial_channel // (2**i),
-                        upsample_initial_channel // (2 ** (i + 1)),
-                        k,
-                        u,
-                        padding=(k - u) // 2,
-                    )
-                )
-            )
-
-        self.resblocks = nn.ModuleList()
-        for i in range(len(self.ups)):
-            ch = upsample_initial_channel // (2 ** (i + 1))
-            for j, (k, d) in enumerate(
-                zip(resblock_kernel_sizes, resblock_dilation_sizes)
-            ):
-                self.resblocks.append(resblock(ch, k, d))
-
-        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
-        self.ups.apply(init_weights)
-
-        if gin_channels != 0:
-            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
-
-    def forward(self, x, g=None):
-        x = self.conv_pre(x)
-        if g is not None:
-            x = x + self.cond(g)
-
-        for i in range(self.num_upsamples):
-            x = F.leaky_relu(x, modules.LRELU_SLOPE)
-            x = self.ups[i](x)
-            xs = None
-            for j in range(self.num_kernels):
-                if xs is None:
-                    xs = self.resblocks[i * self.num_kernels + j](x)
-                else:
-                    xs += self.resblocks[i * self.num_kernels + j](x)
-            x = xs / self.num_kernels
-        x = F.leaky_relu(x)
-        x = self.conv_post(x)
-        x = torch.tanh(x)
-
-        return x
-
-    def remove_weight_norm(self):
-        for l in self.ups:
-            remove_weight_norm(l)
-        for l in self.resblocks:
-            l.remove_weight_norm()
-
-
-class SineGen(torch.nn.Module):
-    """Definition of sine generator
-    SineGen(samp_rate, harmonic_num = 0,
-            sine_amp = 0.1, noise_std = 0.003,
-            voiced_threshold = 0,
-            flag_for_pulse=False)
-    samp_rate: sampling rate in Hz
-    harmonic_num: number of harmonic overtones (default 0)
-    sine_amp: amplitude of sine-wavefrom (default 0.1)
-    noise_std: std of Gaussian noise (default 0.003)
-    voiced_thoreshold: F0 threshold for U/V classification (default 0)
-    flag_for_pulse: this SinGen is used inside PulseGen (default False)
-    Note: when flag_for_pulse is True, the first time step of a voiced
-        segment is always sin(np.pi) or cos(0)
-    """
-
-    def __init__(
-        self,
-        samp_rate,
-        harmonic_num=0,
-        sine_amp=0.1,
-        noise_std=0.003,
-        voiced_threshold=0,
-        flag_for_pulse=False,
-    ):
-        super(SineGen, self).__init__()
-        self.sine_amp = sine_amp
-        self.noise_std = noise_std
-        self.harmonic_num = harmonic_num
-        self.dim = self.harmonic_num + 1
-        self.sampling_rate = samp_rate
-        self.voiced_threshold = voiced_threshold
-
-    def _f02uv(self, f0):
-        # generate uv signal
-        uv = torch.ones_like(f0)
-        uv = uv * (f0 > self.voiced_threshold)
-        return uv
-
-    def forward(self, f0, upp):
-        """sine_tensor, uv = forward(f0)
-        input F0: tensor(batchsize=1, length, dim=1)
-            f0 for unvoiced steps should be 0
-        output sine_tensor: tensor(batchsize=1, length, dim)
-        output uv: tensor(batchsize=1, length, 1)
-        """
-        with torch.no_grad():
-            f0 = f0[:, None].transpose(1, 2)
-            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
-            # fundamental component
-            f0_buf[:, :, 0] = f0[:, :, 0]
-            for idx in np.arange(self.harmonic_num):
-                f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
-                    idx + 2
-                )  # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
-            rad_values = (f0_buf / self.sampling_rate) % 1  # the % 1 means the n_har product cannot be optimized away afterwards
-            rand_ini = torch.rand(
-                f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
-            )
-            rand_ini[:, 0] = 0
-            rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
-            tmp_over_one = torch.cumsum(rad_values, 1)  # % 1  # a % 1 here would keep the following cumsum from being optimized
-            tmp_over_one *= upp
-            tmp_over_one = F.interpolate(
-                tmp_over_one.transpose(2, 1),
-                scale_factor=upp,
-                mode="linear",
-                align_corners=True,
-            ).transpose(2, 1)
-            rad_values = F.interpolate(
-                rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
-            ).transpose(
-                2, 1
-            )  #######
-            tmp_over_one %= 1
-            tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
-            cumsum_shift = torch.zeros_like(rad_values)
-            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
-            sine_waves = torch.sin(
-                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
-            )
-            sine_waves = sine_waves * self.sine_amp
-            uv = self._f02uv(f0)
-            uv = F.interpolate(
-                uv.transpose(2, 1), scale_factor=upp, mode="nearest"
-            ).transpose(2, 1)
-            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
-            noise = noise_amp * torch.randn_like(sine_waves)
-            sine_waves = sine_waves * uv + noise
-        return sine_waves, uv, noise
-
-
-class SourceModuleHnNSF(torch.nn.Module):
-    """SourceModule for hn-nsf
-    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
-                 add_noise_std=0.003, voiced_threshod=0)
-    sampling_rate: sampling_rate in Hz
-    harmonic_num: number of harmonic above F0 (default: 0)
-    sine_amp: amplitude of sine source signal (default: 0.1)
-    add_noise_std: std of additive Gaussian noise (default: 0.003)
-        note that amplitude of noise in unvoiced is decided
-        by sine_amp
-    voiced_threshold: threhold to set U/V given F0 (default: 0)
-    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
-    F0_sampled (batchsize, length, 1)
-    Sine_source (batchsize, length, 1)
-    noise_source (batchsize, length 1)
-    uv (batchsize, length, 1)
-    """
-
-    def __init__(
-        self,
-        sampling_rate,
-        harmonic_num=0,
-        sine_amp=0.1,
-        add_noise_std=0.003,
-        voiced_threshod=0,
-        is_half=True,
-    ):
-        super(SourceModuleHnNSF, self).__init__()
-
-        self.sine_amp = sine_amp
-        self.noise_std = add_noise_std
-        self.is_half = is_half
-        # to produce sine waveforms
-        self.l_sin_gen = SineGen(
-            sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
-        )
-
-        # to merge source harmonics into a single excitation
-        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
-        self.l_tanh = torch.nn.Tanh()
-
-    def forward(self, x, upp=None):
-        sine_wavs, uv, _ = self.l_sin_gen(x, upp)
-        if self.is_half:
-            sine_wavs = sine_wavs.half()
-        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
-        return sine_merge, None, None  # noise, uv
-
-
-class GeneratorNSF(torch.nn.Module):
-    def __init__(
-        self,
-        initial_channel,
-        resblock,
-        resblock_kernel_sizes,
-        resblock_dilation_sizes,
-        upsample_rates,
-        upsample_initial_channel,
-        upsample_kernel_sizes,
-        gin_channels,
-        sr,
-        is_half=False,
-    ):
-        super(GeneratorNSF, self).__init__()
-        self.num_kernels = len(resblock_kernel_sizes)
-        self.num_upsamples = len(upsample_rates)
-
-        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
-        self.m_source = SourceModuleHnNSF(
-            sampling_rate=sr, harmonic_num=0, is_half=is_half
-        )
-        self.noise_convs = nn.ModuleList()
-        self.conv_pre = Conv1d(
-            initial_channel, upsample_initial_channel, 7, 1, padding=3
-        )
-        resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
-
-        self.ups = nn.ModuleList()
-        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
-            c_cur = upsample_initial_channel // (2 ** (i + 1))
-            self.ups.append(
-                weight_norm(
-                    ConvTranspose1d(
-                        upsample_initial_channel // (2**i),
-                        upsample_initial_channel // (2 ** (i + 1)),
-                        k,
-                        u,
-                        padding=(k - u) // 2,
-                    )
-                )
-            )
-            if i + 1 < len(upsample_rates):
-                stride_f0 = np.prod(upsample_rates[i + 1 :])
-                self.noise_convs.append(
-                    Conv1d(
-                        1,
-                        c_cur,
-                        kernel_size=stride_f0 * 2,
-                        stride=stride_f0,
-                        padding=stride_f0 // 2,
-                    )
-                )
-            else:
-                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
-
-        self.resblocks = nn.ModuleList()
-        for i in range(len(self.ups)):
-            ch = upsample_initial_channel // (2 ** (i + 1))
-            for j, (k, d) in enumerate(
-                zip(resblock_kernel_sizes, resblock_dilation_sizes)
-            ):
-                self.resblocks.append(resblock(ch, k, d))
-
-        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
-        self.ups.apply(init_weights)
-
-        if gin_channels != 0:
-            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
-
-        self.upp = np.prod(upsample_rates)
-
-    def forward(self, x, f0, g=None):
-        har_source, noi_source, uv = self.m_source(f0, self.upp)
-        har_source = har_source.transpose(1, 2)
-        x = self.conv_pre(x)
-        if g is not None:
-            x = x + self.cond(g)
-
-        for i in range(self.num_upsamples):
-            x = F.leaky_relu(x, modules.LRELU_SLOPE)
-            x = self.ups[i](x)
-            x_source = self.noise_convs[i](har_source)
-            x = x + x_source
-            xs = None
-            for j in range(self.num_kernels):
-                if xs is None:
-                    xs = self.resblocks[i * self.num_kernels + j](x)
-                else:
-                    xs += self.resblocks[i * self.num_kernels + j](x)
-            x = xs / self.num_kernels
-        x = F.leaky_relu(x)
-        x = self.conv_post(x)
-        x = torch.tanh(x)
-        return x
-
-    def remove_weight_norm(self):
-        for l in self.ups:
-            remove_weight_norm(l)
-        for l in self.resblocks:
-            l.remove_weight_norm()
-
-
-sr2sr = {
-    "32k": 32000,
-    "40k": 40000,
-    "48k": 48000,
-}
-
-
-class SynthesizerTrnMs256NSFsidM(nn.Module):
-    def __init__(
-        self,
-        spec_channels,
-        segment_size,
-        inter_channels,
-        hidden_channels,
-        filter_channels,
-        n_heads,
-        n_layers,
-        kernel_size,
-        p_dropout,
-        resblock,
-        resblock_kernel_sizes,
-        resblock_dilation_sizes,
-        upsample_rates,
-        upsample_initial_channel,
-        upsample_kernel_sizes,
-        spk_embed_dim,
-        gin_channels,
-        sr,
-        **kwargs
-    ):
-        super().__init__()
-        if type(sr) == type("strr"):
-            sr = sr2sr[sr]
-        self.spec_channels = spec_channels
-        self.inter_channels = inter_channels
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.resblock = resblock
-        self.resblock_kernel_sizes = resblock_kernel_sizes
-        self.resblock_dilation_sizes = resblock_dilation_sizes
-        self.upsample_rates = upsample_rates
-        self.upsample_initial_channel = upsample_initial_channel
-        self.upsample_kernel_sizes = upsample_kernel_sizes
-        self.segment_size = segment_size
-        self.gin_channels = gin_channels
-        # self.hop_length = hop_length#
-        self.spk_embed_dim = spk_embed_dim
-        self.enc_p = TextEncoder256(
-            inter_channels,
-            hidden_channels,
-            filter_channels,
-            n_heads,
-            n_layers,
-            kernel_size,
-            p_dropout,
-        )
-        self.dec = GeneratorNSF(
-            inter_channels,
-            resblock,
-            resblock_kernel_sizes,
-            resblock_dilation_sizes,
-            upsample_rates,
-            upsample_initial_channel,
-            upsample_kernel_sizes,
-            gin_channels=gin_channels,
-            sr=sr,
-            is_half=kwargs["is_half"],
-        )
-        self.enc_q = PosteriorEncoder(
-            spec_channels,
-            inter_channels,
-            hidden_channels,
-            5,
-            1,
-            16,
-            gin_channels=gin_channels,
-        )
-        self.flow = ResidualCouplingBlock(
-            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
-        )
-        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
-        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
-
-    def remove_weight_norm(self):
-        self.dec.remove_weight_norm()
-        self.flow.remove_weight_norm()
-        self.enc_q.remove_weight_norm()
-
-    def forward(self, phone, phone_lengths, pitch, nsff0, sid, rnd, max_len=None):
-        g = self.emb_g(sid).unsqueeze(-1)
-        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
-        z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
-        z = self.flow(z_p, x_mask, g=g, reverse=True)
-        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
-        return o
-
-
-class SynthesizerTrnMs256NSFsid_sim(nn.Module):
-    """
-    Synthesizer for Training
-    """
-
-    def __init__(
-        self,
-        spec_channels,
-        segment_size,
-        inter_channels,
-        hidden_channels,
-        filter_channels,
-        n_heads,
-        n_layers,
-        kernel_size,
-        p_dropout,
-        resblock,
-        resblock_kernel_sizes,
-        resblock_dilation_sizes,
-        upsample_rates,
-        upsample_initial_channel,
-        upsample_kernel_sizes,
-        spk_embed_dim,
-        # hop_length,
-        gin_channels=0,
-        use_sdp=True,
-        **kwargs
-    ):
-        super().__init__()
-        self.spec_channels = spec_channels
-        self.inter_channels = inter_channels
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.resblock = resblock
-        self.resblock_kernel_sizes = resblock_kernel_sizes
-        self.resblock_dilation_sizes = resblock_dilation_sizes
-        self.upsample_rates = upsample_rates
-        self.upsample_initial_channel = upsample_initial_channel
-        self.upsample_kernel_sizes = upsample_kernel_sizes
-        self.segment_size = segment_size
-        self.gin_channels = gin_channels
-        # self.hop_length = hop_length#
-        self.spk_embed_dim = spk_embed_dim
-        self.enc_p = TextEncoder256Sim(
-            inter_channels,
-            hidden_channels,
-            filter_channels,
-            n_heads,
-            n_layers,
-            kernel_size,
-            p_dropout,
-        )
-        self.dec = GeneratorNSF(
-            inter_channels,
-            resblock,
-            resblock_kernel_sizes,
-            resblock_dilation_sizes,
-            upsample_rates,
-            upsample_initial_channel,
-            upsample_kernel_sizes,
-            gin_channels=gin_channels,
-            is_half=kwargs["is_half"],
-        )
-
-        self.flow = ResidualCouplingBlock(
-            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
-        )
-        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
-        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
-
-    def remove_weight_norm(self):
-        self.dec.remove_weight_norm()
-        self.flow.remove_weight_norm()
-        self.enc_q.remove_weight_norm()
-
-    def forward(
-        self, phone, phone_lengths, pitch, pitchf, ds, max_len=None
-    ):  # y is the spec, no longer needed now
-        g = self.emb_g(ds.unsqueeze(0)).unsqueeze(-1)  # [b, 256, 1]  # the 1 is t, broadcast
-        x, x_mask = self.enc_p(phone, pitch, phone_lengths)
-        x = self.flow(x, x_mask, g=g, reverse=True)
-        o = self.dec((x * x_mask)[:, :, :max_len], pitchf, g=g)
-        return o
-
-
-class MultiPeriodDiscriminator(torch.nn.Module):
-    def __init__(self, use_spectral_norm=False):
-        super(MultiPeriodDiscriminator, self).__init__()
-        periods = [2, 3, 5, 7, 11, 17]
-        # periods = [3, 5, 7, 11, 17, 23, 37]
-
-        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
-        discs = discs + [
-            DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
-        ]
-        self.discriminators = nn.ModuleList(discs)
-
-    def forward(self, y, y_hat):
-        y_d_rs = []  #
-        y_d_gs = []
-        fmap_rs = []
-        fmap_gs = []
-        for i, d in enumerate(self.discriminators):
-            y_d_r, fmap_r = d(y)
-            y_d_g, fmap_g = d(y_hat)
-            # for j in range(len(fmap_r)):
-            #     print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
-            y_d_rs.append(y_d_r)
-            y_d_gs.append(y_d_g)
-            fmap_rs.append(fmap_r)
-            fmap_gs.append(fmap_g)
-
-        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-
-
-class DiscriminatorS(torch.nn.Module):
-    def __init__(self, use_spectral_norm=False):
-        super(DiscriminatorS, self).__init__()
-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-        self.convs = nn.ModuleList(
-            [
-                norm_f(Conv1d(1, 16, 15, 1, padding=7)),
-                norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
-                norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
-                norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
-                norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
-                norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
-            ]
-        )
-        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
-
-    def forward(self, x):
-        fmap = []
-
-        for l in self.convs:
-            x = l(x)
-            x = F.leaky_relu(x, modules.LRELU_SLOPE)
-            fmap.append(x)
-        x = self.conv_post(x)
-        fmap.append(x)
-        x = torch.flatten(x, 1, -1)
-
-        return x, fmap
-
-
-class DiscriminatorP(torch.nn.Module):
-    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
-        super(DiscriminatorP, self).__init__()
-        self.period = period
-        self.use_spectral_norm = use_spectral_norm
-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-        self.convs = nn.ModuleList(
-            [
-                norm_f(
-                    Conv2d(
-                        1,
-                        32,
-                        (kernel_size, 1),
-                        (stride, 1),
-                        padding=(get_padding(kernel_size, 1), 0),
-                    )
-                ),
-                norm_f(
-                    Conv2d(
-                        32,
-                        128,
-                        (kernel_size, 1),
-                        (stride, 1),
-                        padding=(get_padding(kernel_size, 1), 0),
-                    )
-                ),
-                norm_f(
-                    Conv2d(
-                        128,
-                        512,
-                        (kernel_size, 1),
-                        (stride, 1),
-                        padding=(get_padding(kernel_size, 1), 0),
-                    )
-                ),
-                norm_f(
-                    Conv2d(
-                        512,
-                        1024,
-                        (kernel_size, 1),
-                        (stride, 1),
-                        padding=(get_padding(kernel_size, 1), 0),
-                    )
-                ),
-                norm_f(
-                    Conv2d(
-                        1024,
-                        1024,
-                        (kernel_size, 1),
-                        1,
-                        padding=(get_padding(kernel_size, 1), 0),
-                    )
-                ),
-            ]
-        )
-        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
-
-    def forward(self, x):
-        fmap = []
-
-        # 1d to 2d
-        b, c, t = x.shape
-        if t % self.period != 0:  # pad first
-            n_pad = self.period - (t % self.period)
-            x = F.pad(x, (0, n_pad), "reflect")
-            t = t + n_pad
-        x = x.view(b, c, t // self.period, self.period)
-
-        for l in self.convs:
-            x = l(x)
-            x = F.leaky_relu(x, modules.LRELU_SLOPE)
-            fmap.append(x)
-        x = self.conv_post(x)
-        fmap.append(x)
-        x = torch.flatten(x, 1, -1)
-
-        return x, fmap
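For context on the removed module: SineGen turns a frame-level F0 contour into a sample-level harmonic sine excitation, which SourceModuleHnNSF then merges into a single source signal for the NSF generator. A minimal usage sketch of the class as defined above; note that despite its docstring, forward() takes f0 of shape (batch, frames), and the F0 values here are made up:

import torch

# Assumes SineGen from the deleted file is in scope.
gen = SineGen(samp_rate=40000, harmonic_num=0)

f0 = torch.full((1, 100), 220.0)  # 100 frames of a 220 Hz contour
f0[:, 50:] = 0.0                  # second half unvoiced (f0 == 0)

# upp is the samples-per-frame upsampling factor, i.e. the hop length
sine_waves, uv, noise = gen(f0, upp=400)
print(sine_waves.shape)  # torch.Size([1, 40000, 1])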
{infer_pack → lib/infer_pack}/attentions.py
RENAMED

@@ -5,9 +5,9 @@ import torch
 from torch import nn
 from torch.nn import functional as F
 
-from infer_pack import commons
-from infer_pack import modules
-from infer_pack.modules import LayerNorm
+from lib.infer_pack import commons
+from lib.infer_pack import modules
+from lib.infer_pack.modules import LayerNorm
 
 
 class Encoder(nn.Module):
{infer_pack → lib/infer_pack}/commons.py
RENAMED
File without changes
{infer_pack → lib/infer_pack}/models.py
RENAMED
@@ -3,15 +3,15 @@ from time import time as ttime
 import torch
 from torch import nn
 from torch.nn import functional as F
-from infer_pack import modules
-from infer_pack import attentions
-from infer_pack import commons
-from infer_pack.commons import init_weights, get_padding
+from lib.infer_pack import modules
+from lib.infer_pack import attentions
+from lib.infer_pack import commons
+from lib.infer_pack.commons import init_weights, get_padding
 from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
 from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
-from infer_pack.commons import init_weights
+from lib.infer_pack.commons import init_weights
 import numpy as np
-from infer_pack import commons
+from lib.infer_pack import commons
 
 
 class TextEncoder256(nn.Module):
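The hunk is a mechanical prefix rewrite: it deliberately preserves the file's pre-existing duplication (commons is imported twice, init_weights twice). A deduplicated import block would look like this sketch (a cleanup suggestion, not what the commit does):

# Suggested deduplicated imports for models.py; behavior would be identical.
from lib.infer_pack import attentions, commons, modules
from lib.infer_pack.commons import get_padding, init_weights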
{infer_pack → lib/infer_pack}/models_onnx.py
RENAMED
@@ -3,15 +3,15 @@ from time import time as ttime
 import torch
 from torch import nn
 from torch.nn import functional as F
-from infer_pack import modules
-from infer_pack import attentions
-from infer_pack import commons
-from infer_pack.commons import init_weights, get_padding
+from lib.infer_pack import modules
+from lib.infer_pack import attentions
+from lib.infer_pack import commons
+from lib.infer_pack.commons import init_weights, get_padding
 from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
 from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
-from infer_pack.commons import init_weights
+from lib.infer_pack.commons import init_weights
 import numpy as np
-from infer_pack import commons
+from lib.infer_pack import commons
 
 
 class TextEncoder256(nn.Module):
{infer_pack → lib/infer_pack}/modules.py
RENAMED
@@ -9,9 +9,9 @@ from torch.nn import functional as F
 from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
 from torch.nn.utils import weight_norm, remove_weight_norm
 
-from infer_pack import commons
-from infer_pack.commons import init_weights, get_padding
-from infer_pack.transforms import piecewise_rational_quadratic_transform
+from lib.infer_pack import commons
+from lib.infer_pack.commons import init_weights, get_padding
+from lib.infer_pack.transforms import piecewise_rational_quadratic_transform
 
 
 LRELU_SLOPE = 0.1
{infer_pack → lib/infer_pack}/modules/F0Predictor/DioF0Predictor.py
RENAMED
@@ -1,4 +1,4 @@
-from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
+from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
 import pyworld
 import numpy as np
 
{infer_pack → lib/infer_pack}/modules/F0Predictor/F0Predictor.py
RENAMED
File without changes
{infer_pack → lib/infer_pack}/modules/F0Predictor/HarvestF0Predictor.py
RENAMED
@@ -1,4 +1,4 @@
-from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
+from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
 import pyworld
 import numpy as np
 
{infer_pack → lib/infer_pack}/modules/F0Predictor/PMF0Predictor.py
RENAMED
@@ -1,4 +1,4 @@
-from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
+from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
 import parselmouth
 import numpy as np
 
{infer_pack → lib/infer_pack}/modules/F0Predictor/__init__.py
RENAMED
File without changes
{infer_pack → lib/infer_pack}/onnx_inference.py
RENAMED
@@ -3,6 +3,7 @@ import librosa
 import numpy as np
 import soundfile
 
+
 class ContentVec:
     def __init__(self, vec_path="pretrained/vec-768-layer-12.onnx", device=None):
         print("load model(s) from {}".format(vec_path))
@@ -32,19 +33,19 @@ class ContentVec:
 
 def get_f0_predictor(f0_predictor, hop_length, sampling_rate, **kargs):
     if f0_predictor == "pm":
-        from infer_pack.modules.F0Predictor.PMF0Predictor import PMF0Predictor
+        from lib.infer_pack.modules.F0Predictor.PMF0Predictor import PMF0Predictor
 
         f0_predictor_object = PMF0Predictor(
             hop_length=hop_length, sampling_rate=sampling_rate
         )
     elif f0_predictor == "harvest":
-        from infer_pack.modules.F0Predictor.HarvestF0Predictor import HarvestF0Predictor
+        from lib.infer_pack.modules.F0Predictor.HarvestF0Predictor import HarvestF0Predictor
 
         f0_predictor_object = HarvestF0Predictor(
             hop_length=hop_length, sampling_rate=sampling_rate
         )
     elif f0_predictor == "dio":
-        from infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor
+        from lib.infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor
 
         f0_predictor_object = DioF0Predictor(
             hop_length=hop_length, sampling_rate=sampling_rate
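get_f0_predictor imports each predictor lazily, so only the selected backend's dependencies are loaded; the rename touches exactly those three import sites. A usage sketch under the new paths (hop_length and sampling_rate are illustrative values and must match the model configuration):

# Sketch: select a pitch predictor by name through the relocated module.
from lib.infer_pack.onnx_inference import get_f0_predictor

f0_predictor = get_f0_predictor("harvest", hop_length=160, sampling_rate=16000)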
{infer_pack → lib/infer_pack}/transforms.py
RENAMED
File without changes
requirements.txt
CHANGED
@@ -1,26 +1,21 @@
-setuptools
 wheel
-
-fairseq==0.12.2
-gradio
+setuptools
 ffmpeg
-praat-parselmouth
-pyworld
-numpy==1.23.5
 numba==0.56.4
-
-faiss-cpu==1.7.3
-faiss-gpu
+numpy==1.23.5
 scipy==1.9.3
+librosa==0.9.1
+fairseq==0.12.2
+faiss-cpu==1.7.3
+gradio==3.34.0
 pyworld>=0.3.2
+soundfile>=0.12.1
+praat-parselmouth>=0.4.2
+httpx==0.23.0
 tensorboard
 tensorboardX
-onnxruntime
-pyngrok==4.1.12
-soundfile>=0.12.1
-tqdm>=4.63.1
 torchcrepe
-
-edge-tts
+onnxruntime
 demucs
-
+edge-tts
+yt_dlp
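The list now pins versions that previously floated (gradio==3.34.0, librosa==0.9.1, httpx==0.23.0), drops faiss-gpu, pyngrok, and tqdm, and adds yt_dlp, presumably for the URL audio-download path. A quick post-install sanity check (a sketch to run after pip install -r requirements.txt):

# Sketch: confirm the pinned versions actually resolved in the environment.
from importlib.metadata import version

for pkg in ("gradio", "librosa", "httpx", "numpy", "scipy"):
    print(f"{pkg}=={version(pkg)}")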
vc_infer_pipeline.py
CHANGED
@@ -184,7 +184,7 @@ class VC(object):
         with torch.no_grad():
             logits = model.extract_features(**inputs)
             feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
-        if protect < 0.5 and pitch!=None and pitchf!=None:
+        if protect < 0.5 and pitch != None and pitchf != None:
             feats0 = feats.clone()
         if (
             isinstance(index, type(None)) == False
@@ -211,7 +211,7 @@ class VC(object):
             )
 
         feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
-        if protect < 0.5 and pitch!=None and pitchf!=None:
+        if protect < 0.5 and pitch != None and pitchf != None:
             feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
                 0, 2, 1
             )
@@ -223,7 +223,7 @@ class VC(object):
             pitch = pitch[:, :p_len]
             pitchf = pitchf[:, :p_len]
 
-        if protect < 0.5 and pitch!=None and pitchf!=None:
+        if protect < 0.5 and pitch != None and pitchf != None:
             pitchff = pitchf.clone()
             pitchff[pitchf > 0] = 1
             pitchff[pitchf < 1] = protect
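These three hunks only normalize the spacing around !=; the guard's logic is unchanged. For reference, the idiomatic Python form tests identity rather than equality, which also avoids dispatching to any overloaded __ne__ on tensor-like inputs (a sketch of the equivalent, not what the commit does):

# Suggestion only: an identity test is the idiomatic None check in Python.
def protection_enabled(protect: float, pitch, pitchf) -> bool:
    return protect < 0.5 and pitch is not None and pitchf is not None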