Commit 6f5bbf2 ("Upload 22 files") by darksakura
Parent: 01ec3d2
Changed files:
- auto_slicer.py +107 -0
- flask_api.py +1 -1
- inference/__pycache__/__init__.cpython-38.pyc +0 -0
- inference/__pycache__/infer_tool.cpython-38.pyc +0 -0
- inference/__pycache__/infer_tool_webui.cpython-38.pyc +0 -0
- inference/__pycache__/slicer.cpython-38.pyc +0 -0
- inference/infer_tool_webui.py +537 -0
- preprocess_flist_config.py +36 -7
- preprocess_hubert_f0.py +97 -35
- resample.py +76 -26
- spkmix.py +11 -0
- train_diff.py +76 -0
- train_index.py +30 -0
- utils.py +95 -28
auto_slicer.py
ADDED
@@ -0,0 +1,107 @@
import os
from typing import final
import numpy as np
import librosa
import soundfile as sf
from modules.slicer2 import Slicer

class AutoSlicer:
    def __init__(self):
        self.slicer_params = {
            "threshold": -40,
            "min_length": 5000,
            "min_interval": 300,
            "hop_size": 10,
            "max_sil_kept": 500,
        }
        self.original_min_interval = self.slicer_params["min_interval"]

    def auto_slice(self, filename, input_dir, output_dir, max_sec):
        audio, sr = librosa.load(os.path.join(input_dir, filename), sr=None, mono=False)
        slicer = Slicer(sr=sr, **self.slicer_params)
        chunks = slicer.slice(audio)
        files_to_delete = []
        for i, chunk in enumerate(chunks):
            if len(chunk.shape) > 1:
                chunk = chunk.T
            output_filename = f"{os.path.splitext(filename)[0]}_{i}"
            output_filename = "".join(c for c in output_filename if c.isascii() or c == "_") + ".wav"
            output_filepath = os.path.join(output_dir, output_filename)
            sf.write(output_filepath, chunk, sr)
            # Check and re-slice audio that is longer than max_sec.
            while True:
                new_audio, sr = librosa.load(output_filepath, sr=None, mono=False)
                if librosa.get_duration(y=new_audio, sr=sr) <= max_sec:
                    break
                self.slicer_params["min_interval"] = self.slicer_params["min_interval"] // 2
                if self.slicer_params["min_interval"] >= self.slicer_params["hop_size"]:
                    new_chunks = Slicer(sr=sr, **self.slicer_params).slice(new_audio)
                    for j, new_chunk in enumerate(new_chunks):
                        if len(new_chunk.shape) > 1:
                            new_chunk = new_chunk.T
                        new_output_filename = f"{os.path.splitext(output_filename)[0]}_{j}.wav"
                        sf.write(os.path.join(output_dir, new_output_filename), new_chunk, sr)
                    files_to_delete.append(output_filepath)
                else:
                    break
            self.slicer_params["min_interval"] = self.original_min_interval
        for file_path in files_to_delete:
            if os.path.exists(file_path):
                os.remove(file_path)

    def merge_short(self, output_dir, max_sec, min_sec):
        short_files = []
        for filename in os.listdir(output_dir):
            filepath = os.path.join(output_dir, filename)
            if filename.endswith(".wav"):
                audio, sr = librosa.load(filepath, sr=None, mono=False)
                duration = librosa.get_duration(y=audio, sr=sr)
                if duration < min_sec:
                    short_files.append((filepath, audio, duration))
        short_files.sort(key=lambda x: x[2], reverse=True)
        merged_audio = []
        current_duration = 0
        for filepath, audio, duration in short_files:
            if current_duration + duration <= max_sec:
                merged_audio.append(audio)
                current_duration += duration
                os.remove(filepath)
            else:
                if merged_audio:
                    output_audio = np.concatenate(merged_audio, axis=-1)
                    if len(output_audio.shape) > 1:
                        output_audio = output_audio.T
                    output_filename = f"merged_{len(os.listdir(output_dir))}.wav"
                    sf.write(os.path.join(output_dir, output_filename), output_audio, sr)
                merged_audio = [audio]
                current_duration = duration
                os.remove(filepath)
        if merged_audio and current_duration >= min_sec:
            output_audio = np.concatenate(merged_audio, axis=-1)
            if len(output_audio.shape) > 1:
                output_audio = output_audio.T
            output_filename = f"merged_{len(os.listdir(output_dir))}.wav"
            sf.write(os.path.join(output_dir, output_filename), output_audio, sr)

    def slice_count(self, input_dir, output_dir):
        orig_duration = final_duration = 0
        for file in os.listdir(input_dir):
            if file.endswith(".wav"):
                _audio, _sr = librosa.load(os.path.join(input_dir, file), sr=None, mono=False)
                orig_duration += librosa.get_duration(y=_audio, sr=_sr)
        wav_files = [file for file in os.listdir(output_dir) if file.endswith(".wav")]
        num_files = len(wav_files)
        max_duration = -1
        min_duration = float("inf")
        for file in wav_files:
            file_path = os.path.join(output_dir, file)
            audio, sr = librosa.load(file_path, sr=None, mono=False)
            duration = librosa.get_duration(y=audio, sr=sr)
            final_duration += float(duration)
            if duration > max_duration:
                max_duration = float(duration)
            if duration < min_duration:
                min_duration = float(duration)
        return num_files, max_duration, min_duration, orig_duration, final_duration
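A minimal usage sketch of the class above. It is not part of the commit, and the directory names "raw" and "sliced" are hypothetical: slice every WAV to at most 15 s, merge leftovers shorter than 2 s, then report the totals.

import os
from auto_slicer import AutoSlicer

slicer = AutoSlicer()
os.makedirs("sliced", exist_ok=True)
for name in os.listdir("raw"):
    if name.endswith(".wav"):
        slicer.auto_slice(name, input_dir="raw", output_dir="sliced", max_sec=15)
slicer.merge_short("sliced", max_sec=15, min_sec=2)
num, longest, shortest, orig, final = slicer.slice_count("raw", "sliced")
print(f"{num} clips between {shortest:.1f}s and {longest:.1f}s, {orig:.1f}s in, {final:.1f}s out")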
flask_api.py
CHANGED
@@ -7,7 +7,7 @@ import torchaudio
 from flask import Flask, request, send_file
 from flask_cors import CORS
 
-from inference.infer_tool import Svc, RealTimeVC
+from inference.infer_tool import RealTimeVC, Svc
 
 app = Flask(__name__)
 
inference/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (131 Bytes).

inference/__pycache__/infer_tool.cpython-38.pyc
ADDED
Binary file (14.8 kB).

inference/__pycache__/infer_tool_webui.cpython-38.pyc
ADDED
Binary file (14.9 kB).

inference/__pycache__/slicer.cpython-38.pyc
ADDED
Binary file (3.83 kB).
inference/infer_tool_webui.py
ADDED
@@ -0,0 +1,537 @@
import gc
import hashlib
import io
import json
import logging
import os
import pickle
import time
from pathlib import Path

import librosa
import numpy as np

# import onnxruntime
import soundfile
import torch
import torchaudio
from tqdm import tqdm

import cluster
import utils
from diffusion.unit2mel import load_model_vocoder
from inference import slicer
from models import SynthesizerTrn

logging.getLogger('matplotlib').setLevel(logging.WARNING)


def read_temp(file_name):
    if not os.path.exists(file_name):
        with open(file_name, "w") as f:
            f.write(json.dumps({"info": "temp_dict"}))
        return {}
    else:
        try:
            with open(file_name, "r") as f:
                data = f.read()
            data_dict = json.loads(data)
            if os.path.getsize(file_name) > 50 * 1024 * 1024:
                f_name = file_name.replace("\\", "/").split("/")[-1]
                print(f"clean {f_name}")
                for wav_hash in list(data_dict.keys()):
                    if int(time.time()) - int(data_dict[wav_hash]["time"]) > 14 * 24 * 3600:
                        del data_dict[wav_hash]
        except Exception as e:
            print(e)
            print(f"{file_name} error,auto rebuild file")
            data_dict = {"info": "temp_dict"}
        return data_dict


def write_temp(file_name, data):
    with open(file_name, "w") as f:
        f.write(json.dumps(data))


def timeit(func):
    def run(*args, **kwargs):
        t = time.time()
        res = func(*args, **kwargs)
        print('executing \'%s\' costed %.3fs' % (func.__name__, time.time() - t))
        return res

    return run


def format_wav(audio_path):
    if Path(audio_path).suffix == '.wav':
        return
    raw_audio, raw_sample_rate = librosa.load(audio_path, mono=True, sr=None)
    soundfile.write(Path(audio_path).with_suffix(".wav"), raw_audio, raw_sample_rate)


def get_end_file(dir_path, end):
    file_lists = []
    for root, dirs, files in os.walk(dir_path):
        files = [f for f in files if f[0] != '.']
        dirs[:] = [d for d in dirs if d[0] != '.']
        for f_file in files:
            if f_file.endswith(end):
                file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
    return file_lists


def get_md5(content):
    return hashlib.new("md5", content).hexdigest()

def fill_a_to_b(a, b):
    if len(a) < len(b):
        for _ in range(0, len(b) - len(a)):
            a.append(a[0])

def mkdir(paths: list):
    for path in paths:
        if not os.path.exists(path):
            os.mkdir(path)

def pad_array(arr, target_length):
    current_length = arr.shape[0]
    if current_length >= target_length:
        return arr
    else:
        pad_width = target_length - current_length
        pad_left = pad_width // 2
        pad_right = pad_width - pad_left
        padded_arr = np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0))
        return padded_arr

def split_list_by_n(list_collection, n, pre=0):
    for i in range(0, len(list_collection), n):
        yield list_collection[i-pre if i-pre>=0 else i: i + n]


class F0FilterException(Exception):
    pass

class Svc(object):
    def __init__(self, net_g_path, config_path,
                 device=None,
                 cluster_model_path="logs/44k/kmeans_10000.pt",
                 nsf_hifigan_enhance = False,
                 diffusion_model_path="logs/44k/diffusion/model_0.pt",
                 diffusion_config_path="configs/diffusion.yaml",
                 shallow_diffusion = False,
                 only_diffusion = False,
                 spk_mix_enable = False,
                 feature_retrieval = False
                 ):
        self.net_g_path = net_g_path
        self.only_diffusion = only_diffusion
        self.shallow_diffusion = shallow_diffusion
        self.feature_retrieval = feature_retrieval
        if device is None:
            self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.dev = torch.device(device)
        self.net_g_ms = None
        if not self.only_diffusion:
            self.hps_ms = utils.get_hparams_from_file(config_path,True)
            self.target_sample = self.hps_ms.data.sampling_rate
            self.hop_size = self.hps_ms.data.hop_length
            self.spk2id = self.hps_ms.spk
            self.unit_interpolate_mode = self.hps_ms.data.unit_interpolate_mode if self.hps_ms.data.unit_interpolate_mode is not None else 'left'
            self.vol_embedding = self.hps_ms.model.vol_embedding if self.hps_ms.model.vol_embedding is not None else False
            self.speech_encoder = self.hps_ms.model.speech_encoder if self.hps_ms.model.speech_encoder is not None else 'vec768l12'

        self.nsf_hifigan_enhance = nsf_hifigan_enhance
        if self.shallow_diffusion or self.only_diffusion:
            if os.path.exists(diffusion_model_path) and os.path.exists(diffusion_model_path):
                self.diffusion_model,self.vocoder,self.diffusion_args = load_model_vocoder(diffusion_model_path,self.dev,config_path=diffusion_config_path)
                if self.only_diffusion:
                    self.target_sample = self.diffusion_args.data.sampling_rate
                    self.hop_size = self.diffusion_args.data.block_size
                    self.spk2id = self.diffusion_args.spk
                    self.speech_encoder = self.diffusion_args.data.encoder
                    self.unit_interpolate_mode = self.diffusion_args.data.unit_interpolate_mode if self.diffusion_args.data.unit_interpolate_mode is not None else 'left'
                if spk_mix_enable:
                    self.diffusion_model.init_spkmix(len(self.spk2id))
            else:
                print("No diffusion model or config found. Shallow diffusion mode will False")
                self.shallow_diffusion = self.only_diffusion = False

        # load hubert and model
        if not self.only_diffusion:
            self.load_model(spk_mix_enable)
            self.hubert_model = utils.get_speech_encoder(self.speech_encoder,device=self.dev)
            self.volume_extractor = utils.Volume_Extractor(self.hop_size)
        else:
            self.hubert_model = utils.get_speech_encoder(self.diffusion_args.data.encoder,device=self.dev)
            self.volume_extractor = utils.Volume_Extractor(self.diffusion_args.data.block_size)

        if os.path.exists(cluster_model_path):
            if self.feature_retrieval:
                with open(cluster_model_path,"rb") as f:
                    self.cluster_model = pickle.load(f)
                self.big_npy = None
                self.now_spk_id = -1
            else:
                self.cluster_model = cluster.get_cluster_model(cluster_model_path)
        else:
            self.feature_retrieval=False

        if self.shallow_diffusion :
            self.nsf_hifigan_enhance = False
        if self.nsf_hifigan_enhance:
            from modules.enhancer import Enhancer
            self.enhancer = Enhancer('nsf-hifigan', 'pretrain/nsf_hifigan/model',device=self.dev)

    def load_model(self, spk_mix_enable=False):
        # get model configuration
        self.net_g_ms = SynthesizerTrn(
            self.hps_ms.data.filter_length // 2 + 1,
            self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
            **self.hps_ms.model)
        _ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None)
        self.dtype = list(self.net_g_ms.parameters())[0].dtype
        if "half" in self.net_g_path and torch.cuda.is_available():
            _ = self.net_g_ms.half().eval().to(self.dev)
        else:
            _ = self.net_g_ms.eval().to(self.dev)
        if spk_mix_enable:
            self.net_g_ms.EnableCharacterMix(len(self.spk2id), self.dev)

    def get_unit_f0(self, wav, tran, cluster_infer_ratio, speaker, f0_filter ,f0_predictor,cr_threshold=0.05):

        f0_predictor_object = utils.get_f0_predictor(f0_predictor,hop_length=self.hop_size,sampling_rate=self.target_sample,device=self.dev,threshold=cr_threshold)

        f0, uv = f0_predictor_object.compute_f0_uv(wav)
        if f0_filter and sum(f0) == 0:
            raise F0FilterException("No voice detected")
        f0 = torch.FloatTensor(f0).to(self.dev)
        uv = torch.FloatTensor(uv).to(self.dev)

        f0 = f0 * 2 ** (tran / 12)
        f0 = f0.unsqueeze(0)
        uv = uv.unsqueeze(0)

        wav16k = librosa.resample(wav, orig_sr=self.target_sample, target_sr=16000)
        wav16k = torch.from_numpy(wav16k).to(self.dev)
        c = self.hubert_model.encoder(wav16k)
        c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1],self.unit_interpolate_mode)

        if cluster_infer_ratio !=0:
            if self.feature_retrieval:
                speaker_id = self.spk2id.get(speaker)
                if speaker_id is None:
                    raise RuntimeError("The name you entered is not in the speaker list!")
                if not speaker_id and type(speaker) is int:
                    if len(self.spk2id.__dict__) >= speaker:
                        speaker_id = speaker
                feature_index = self.cluster_model[speaker_id]
                feat_np = c.transpose(0,1).cpu().numpy()
                if self.big_npy is None or self.now_spk_id != speaker_id:
                    self.big_npy = feature_index.reconstruct_n(0, feature_index.ntotal)
                    self.now_spk_id = speaker_id
                print("starting feature retrieval...")
                score, ix = feature_index.search(feat_np, k=8)
                weight = np.square(1 / score)
                weight /= weight.sum(axis=1, keepdims=True)
                npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
                c = cluster_infer_ratio * npy + (1 - cluster_infer_ratio) * feat_np
                c = torch.FloatTensor(c).to(self.dev).transpose(0,1)
                print("end feature retrieval...")
            else:
                cluster_c = cluster.get_cluster_center_result(self.cluster_model, c.cpu().numpy().T, speaker).T
                cluster_c = torch.FloatTensor(cluster_c).to(self.dev)
                c = cluster_infer_ratio * cluster_c + (1 - cluster_infer_ratio) * c

        c = c.unsqueeze(0)
        return c, f0, uv

    def infer(self, speaker, tran, raw_path,
              cluster_infer_ratio=0,
              auto_predict_f0=False,
              noice_scale=0.4,
              f0_filter=False,
              f0_predictor='pm',
              enhancer_adaptive_key = 0,
              cr_threshold = 0.05,
              k_step = 100,
              frame = 0,
              spk_mix = False,
              second_encoding = False,
              loudness_envelope_adjustment = 1
              ):
        wav, sr = librosa.load(raw_path, sr=self.target_sample)
        if spk_mix:
            c, f0, uv = self.get_unit_f0(wav, tran, 0, None, f0_filter,f0_predictor,cr_threshold=cr_threshold)
            n_frames = f0.size(1)
            sid = speaker[:, frame:frame+n_frames].transpose(0,1)
        else:
            speaker_id = self.spk2id.get(speaker)
            if not speaker_id and type(speaker) is int:
                if len(self.spk2id.__dict__) >= speaker:
                    speaker_id = speaker
            if speaker_id is None:
                raise RuntimeError("The name you entered is not in the speaker list!")
            sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
            c, f0, uv = self.get_unit_f0(wav, tran, cluster_infer_ratio, speaker, f0_filter,f0_predictor,cr_threshold=cr_threshold)
            n_frames = f0.size(1)
        c = c.to(self.dtype)
        f0 = f0.to(self.dtype)
        uv = uv.to(self.dtype)
        with torch.no_grad():
            start = time.time()
            vol = None
            if not self.only_diffusion:
                vol = self.volume_extractor.extract(torch.FloatTensor(wav).to(self.dev)[None,:])[None,:].to(self.dev) if self.vol_embedding else None
                audio,f0 = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale,vol=vol)
                audio = audio[0,0].data.float()
                audio_mel = self.vocoder.extract(audio[None,:],self.target_sample) if self.shallow_diffusion else None
            else:
                audio = torch.FloatTensor(wav).to(self.dev)
                audio_mel = None
            if self.dtype != torch.float32:
                c = c.to(torch.float32)
                f0 = f0.to(torch.float32)
                uv = uv.to(torch.float32)
            if self.only_diffusion or self.shallow_diffusion:
                vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev) if vol is None else vol[:,:,None]
                if self.shallow_diffusion and second_encoding:
                    audio16k = librosa.resample(audio.detach().cpu().numpy(), orig_sr=self.target_sample, target_sr=16000)
                    audio16k = torch.from_numpy(audio16k).to(self.dev)
                    c = self.hubert_model.encoder(audio16k)
                    c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1],self.unit_interpolate_mode)
                f0 = f0[:,:,None]
                c = c.transpose(-1,-2)
                audio_mel = self.diffusion_model(
                    c,
                    f0,
                    vol,
                    spk_id = sid,
                    spk_mix_dict = None,
                    gt_spec=audio_mel,
                    infer=True,
                    infer_speedup=self.diffusion_args.infer.speedup,
                    method=self.diffusion_args.infer.method,
                    k_step=k_step)
                audio = self.vocoder.infer(audio_mel, f0).squeeze()
            if self.nsf_hifigan_enhance:
                audio, _ = self.enhancer.enhance(
                    audio[None,:],
                    self.target_sample,
                    f0[:,:,None],
                    self.hps_ms.data.hop_length,
                    adaptive_key = enhancer_adaptive_key)
            if loudness_envelope_adjustment != 1:
                audio = utils.change_rms(wav,self.target_sample,audio,self.target_sample,loudness_envelope_adjustment)
            use_time = time.time() - start
            print("vits use time:{}".format(use_time))
        return audio, audio.shape[-1], n_frames

    def clear_empty(self):
        # clean up vram
        torch.cuda.empty_cache()

    def unload_model(self):
        # unload model
        self.net_g_ms = self.net_g_ms.to("cpu")
        del self.net_g_ms
        if hasattr(self,"enhancer"):
            self.enhancer.enhancer = self.enhancer.enhancer.to("cpu")
            del self.enhancer.enhancer
            del self.enhancer
        gc.collect()

    def slice_inference(self,
                        raw_audio_path,
                        spk,
                        tran,
                        slice_db,
                        cluster_infer_ratio,
                        auto_predict_f0,
                        noice_scale,
                        pad_seconds=0.5,
                        clip_seconds=0,
                        lg_num=0,
                        lgr_num =0.75,
                        f0_predictor='pm',
                        enhancer_adaptive_key = 0,
                        cr_threshold = 0.05,
                        k_step = 100,
                        use_spk_mix = False,
                        second_encoding = False,
                        loudness_envelope_adjustment = 1
                        ):
        if use_spk_mix:
            if len(self.spk2id) == 1:
                spk = self.spk2id.keys()[0]
                use_spk_mix = False
        wav_path = Path(raw_audio_path).with_suffix('.wav')
        chunks = slicer.cut(wav_path, db_thresh=slice_db)
        audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
        per_size = int(clip_seconds*audio_sr)
        lg_size = int(lg_num*audio_sr)
        lg_size_r = int(lg_size*lgr_num)
        lg_size_c_l = (lg_size-lg_size_r)//2
        lg_size_c_r = lg_size-lg_size_r-lg_size_c_l
        lg = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0

        if use_spk_mix:
            assert len(self.spk2id) == len(spk)
            audio_length = 0
            for (slice_tag, data) in audio_data:
                aud_length = int(np.ceil(len(data) / audio_sr * self.target_sample))
                if slice_tag:
                    audio_length += aud_length // self.hop_size
                    continue
                if per_size != 0:
                    datas = split_list_by_n(data, per_size,lg_size)
                else:
                    datas = [data]
                for k,dat in enumerate(datas):
                    pad_len = int(audio_sr * pad_seconds)
                    per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample))
                    a_length = per_length + 2 * pad_len
                    audio_length += a_length // self.hop_size
                audio_length += len(audio_data)
            spk_mix_tensor = torch.zeros(size=(len(spk), audio_length)).to(self.dev)
            for i in range(len(spk)):
                last_end = None
                for mix in spk[i]:
                    if mix[3]<0. or mix[2]<0.:
                        raise RuntimeError("mix value must higer Than zero!")
                    begin = int(audio_length * mix[0])
                    end = int(audio_length * mix[1])
                    length = end - begin
                    if length<=0:
                        raise RuntimeError("begin Must lower Than end!")
                    step = (mix[3] - mix[2])/length
                    if last_end is not None:
                        if last_end != begin:
                            raise RuntimeError("[i]EndTime Must Equal [i+1]BeginTime!")
                    last_end = end
                    if step == 0.:
                        spk_mix_data = torch.zeros(length).to(self.dev) + mix[2]
                    else:
                        spk_mix_data = torch.arange(mix[2],mix[3],step).to(self.dev)
                    if(len(spk_mix_data)<length):
                        num_pad = length - len(spk_mix_data)
                        spk_mix_data = torch.nn.functional.pad(spk_mix_data, [0, num_pad], mode="reflect").to(self.dev)
                    spk_mix_tensor[i][begin:end] = spk_mix_data[:length]

            spk_mix_ten = torch.sum(spk_mix_tensor,dim=0).unsqueeze(0).to(self.dev)
            # spk_mix_tensor[0][spk_mix_ten<0.001] = 1.0
            for i, x in enumerate(spk_mix_ten[0]):
                if x == 0.0:
                    spk_mix_ten[0][i] = 1.0
                    spk_mix_tensor[:,i] = 1.0 / len(spk)
            spk_mix_tensor = spk_mix_tensor / spk_mix_ten
            if not ((torch.sum(spk_mix_tensor,dim=0) - 1.)<0.0001).all():
                raise RuntimeError("sum(spk_mix_tensor) not equal 1")
            spk = spk_mix_tensor

        global_frame = 0
        audio = []
        for (slice_tag, data) in tqdm(audio_data):
            print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
            # padd
            length = int(np.ceil(len(data) / audio_sr * self.target_sample))
            if slice_tag:
                print('jump empty segment')
                _audio = np.zeros(length)
                audio.extend(list(pad_array(_audio, length)))
                global_frame += length // self.hop_size
                continue
            if per_size != 0:
                datas = split_list_by_n(data, per_size,lg_size)
            else:
                datas = [data]
            for k,dat in enumerate(datas):
                per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds!=0 else length
                if clip_seconds!=0:
                    print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
                # padd
                pad_len = int(audio_sr * pad_seconds)
                dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
                raw_path = io.BytesIO()
                soundfile.write(raw_path, dat, audio_sr, format="wav")
                raw_path.seek(0)
                out_audio, out_sr, out_frame = self.infer(spk, tran, raw_path,
                                                          cluster_infer_ratio=cluster_infer_ratio,
                                                          auto_predict_f0=auto_predict_f0,
                                                          noice_scale=noice_scale,
                                                          f0_predictor = f0_predictor,
                                                          enhancer_adaptive_key = enhancer_adaptive_key,
                                                          cr_threshold = cr_threshold,
                                                          k_step = k_step,
                                                          frame = global_frame,
                                                          spk_mix = use_spk_mix,
                                                          second_encoding = second_encoding,
                                                          loudness_envelope_adjustment = loudness_envelope_adjustment
                                                          )
                global_frame += out_frame
                _audio = out_audio.cpu().numpy()
                pad_len = int(self.target_sample * pad_seconds)
                _audio = _audio[pad_len:-pad_len]
                _audio = pad_array(_audio, per_length)
                if lg_size!=0 and k!=0:
                    lg1 = audio[-(lg_size_r+lg_size_c_r):-lg_size_c_r] if lgr_num != 1 else audio[-lg_size:]
                    lg2 = _audio[lg_size_c_l:lg_size_c_l+lg_size_r] if lgr_num != 1 else _audio[0:lg_size]
                    lg_pre = lg1*(1-lg)+lg2*lg
                    audio = audio[0:-(lg_size_r+lg_size_c_r)] if lgr_num != 1 else audio[0:-lg_size]
                    audio.extend(lg_pre)
                    _audio = _audio[lg_size_c_l+lg_size_r:] if lgr_num != 1 else _audio[lg_size:]
                audio.extend(list(_audio))
        return np.array(audio)

class RealTimeVC:
    def __init__(self):
        self.last_chunk = None
        self.last_o = None
        self.chunk_len = 16000  # chunk length
        self.pre_len = 3840  # cross fade length, multiples of 640

    # Input and output are 1-dimensional numpy waveform arrays

    def process(self, svc_model, speaker_id, f_pitch_change, input_wav_path,
                cluster_infer_ratio=0,
                auto_predict_f0=False,
                noice_scale=0.4,
                f0_filter=False):

        import maad
        audio, sr = torchaudio.load(input_wav_path)
        audio = audio.cpu().numpy()[0]
        temp_wav = io.BytesIO()
        if self.last_chunk is None:
            input_wav_path.seek(0)

            audio, sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path,
                                        cluster_infer_ratio=cluster_infer_ratio,
                                        auto_predict_f0=auto_predict_f0,
                                        noice_scale=noice_scale,
                                        f0_filter=f0_filter)

            audio = audio.cpu().numpy()
            self.last_chunk = audio[-self.pre_len:]
            self.last_o = audio
            return audio[-self.chunk_len:]
        else:
            audio = np.concatenate([self.last_chunk, audio])
            soundfile.write(temp_wav, audio, sr, format="wav")
            temp_wav.seek(0)

            audio, sr = svc_model.infer(speaker_id, f_pitch_change, temp_wav,
                                        cluster_infer_ratio=cluster_infer_ratio,
                                        auto_predict_f0=auto_predict_f0,
                                        noice_scale=noice_scale,
                                        f0_filter=f0_filter)

            audio = audio.cpu().numpy()
            ret = maad.util.crossfade(self.last_o, audio, self.pre_len)
            self.last_chunk = audio[-self.pre_len:]
            self.last_o = audio
            return ret[self.chunk_len:2 * self.chunk_len]
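A minimal usage sketch of the Svc class above (not part of the commit). The checkpoint name "logs/44k/G_30000.pth", the speaker name "speaker0", and the input/output paths are hypothetical; it converts one file, transposed up 5 semitones, with the default diffusion and enhancer options left off.

import os
import soundfile
from inference.infer_tool_webui import Svc

model = Svc("logs/44k/G_30000.pth", "configs/config.json")
audio = model.slice_inference(
    raw_audio_path="raw/input.wav",  # hypothetical input file
    spk="speaker0",                  # must be a key of the spk dict in config.json
    tran=5,                          # pitch shift in semitones
    slice_db=-40,
    cluster_infer_ratio=0,
    auto_predict_f0=False,
    noice_scale=0.4,
)
os.makedirs("results", exist_ok=True)
soundfile.write("results/output.wav", audio, model.target_sample)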
preprocess_flist_config.py
CHANGED
@@ -1,11 +1,13 @@
-import os
 import argparse
+import json
+import os
 import re
+import wave
+from random import shuffle
 
 from tqdm import tqdm
-
-import
-import wave
+
+import diffusion.logger.utils as du
 
 config_template = json.load(open("configs_template/config_template.json"))
 
@@ -26,6 +28,8 @@ if __name__ == "__main__":
     parser.add_argument("--train_list", type=str, default="./filelists/train.txt", help="path to train list")
     parser.add_argument("--val_list", type=str, default="./filelists/val.txt", help="path to val list")
     parser.add_argument("--source_dir", type=str, default="./dataset/44k", help="path to source dir")
+    parser.add_argument("--speech_encoder", type=str, default="vec768l12", help="choice a speech encoder|'vec768l12','vec256l9','hubertsoft','whisper-ppg','cnhubertlarge','dphubert','whisper-ppg-large','wavlmbase+'")
+    parser.add_argument("--vol_aug", action="store_true", help="Whether to use volume embedding and volume augmentation")
     args = parser.parse_args()
 
     train = []
@@ -41,8 +45,8 @@
         for file in wavs:
            if not file.endswith("wav"):
                continue
-
-
+            if not pattern.match(file):
+                print(f"warning: filename {file} contains characters other than letters, digits, and underscores, which may cause errors (or may not).")
            if get_wav_duration(file) < 0.3:
                print("skip too short audio:", file)
                continue
@@ -67,9 +71,34 @@
             wavpath = fname
             f.write(wavpath + "\n")
 
+
+    d_config_template = du.load_config("configs_template/diffusion_template.yaml")
+    d_config_template["model"]["n_spk"] = spk_id
+    d_config_template["data"]["encoder"] = args.speech_encoder
+    d_config_template["spk"] = spk_dict
+
     config_template["spk"] = spk_dict
     config_template["model"]["n_speakers"] = spk_id
-
+    config_template["model"]["speech_encoder"] = args.speech_encoder
+
+    if args.speech_encoder == "vec768l12" or args.speech_encoder == "dphubert" or args.speech_encoder == "wavlmbase+":
+        config_template["model"]["ssl_dim"] = config_template["model"]["filter_channels"] = config_template["model"]["gin_channels"] = 768
+        d_config_template["data"]["encoder_out_channels"] = 768
+    elif args.speech_encoder == "vec256l9" or args.speech_encoder == 'hubertsoft':
+        config_template["model"]["ssl_dim"] = config_template["model"]["filter_channels"] = config_template["model"]["gin_channels"] = 256
+        d_config_template["data"]["encoder_out_channels"] = 256
+    elif args.speech_encoder == "whisper-ppg" or args.speech_encoder == 'cnhubertlarge':
+        config_template["model"]["ssl_dim"] = config_template["model"]["filter_channels"] = config_template["model"]["gin_channels"] = 1024
+        d_config_template["data"]["encoder_out_channels"] = 1024
+    elif args.speech_encoder == "whisper-ppg-large":
+        config_template["model"]["ssl_dim"] = config_template["model"]["filter_channels"] = config_template["model"]["gin_channels"] = 1280
+        d_config_template["data"]["encoder_out_channels"] = 1280
+
+    if args.vol_aug:
+        config_template["train"]["vol_aug"] = config_template["model"]["vol_embedding"] = True
+
     print("Writing configs/config.json")
     with open("configs/config.json", "w") as f:
         json.dump(config_template, f, indent=2)
+    print("Writing configs/diffusion.yaml")
+    du.save_config("configs/diffusion.yaml",d_config_template)
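The if/elif chain added above picks one channel width per speech encoder and writes it into both config files. Restated as a lookup table for reference (values taken directly from the diff, not new behaviour):

# Channel widths the script writes for each --speech_encoder choice.
ENCODER_DIMS = {
    "vec768l12": 768, "dphubert": 768, "wavlmbase+": 768,
    "vec256l9": 256, "hubertsoft": 256,
    "whisper-ppg": 1024, "cnhubertlarge": 1024,
    "whisper-ppg-large": 1280,
}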
preprocess_hubert_f0.py
CHANGED
@@ -1,43 +1,54 @@
-import
+import argparse
+import logging
 import multiprocessing
 import os
-import
+import random
+from concurrent.futures import ProcessPoolExecutor
+from glob import glob
 from random import shuffle
 
+import librosa
+import numpy as np
 import torch
-from glob import glob
 from tqdm import tqdm
-from modules.mel_processing import spectrogram_torch
 
+import diffusion.logger.utils as du
 import utils
-import
+from diffusion.vocoder import Vocoder
+from modules.mel_processing import spectrogram_torch
 
 logging.getLogger("numba").setLevel(logging.WARNING)
-
-import numpy as np
+logging.getLogger("matplotlib").setLevel(logging.WARNING)
 
 hps = utils.get_hparams_from_file("configs/config.json")
+dconfig = du.load_config("configs/diffusion.yaml")
 sampling_rate = hps.data.sampling_rate
 hop_length = hps.data.hop_length
+speech_encoder = hps["model"]["speech_encoder"]
 
 
-def process_one(filename, hmodel):
+def process_one(filename, hmodel,f0p,diff=False,mel_extractor=None):
     # print(filename)
     wav, sr = librosa.load(filename, sr=sampling_rate)
+    audio_norm = torch.FloatTensor(wav)
+    audio_norm = audio_norm.unsqueeze(0)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
     soft_path = filename + ".soft.pt"
     if not os.path.exists(soft_path):
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         wav16k = librosa.resample(wav, orig_sr=sampling_rate, target_sr=16000)
         wav16k = torch.from_numpy(wav16k).to(device)
-        c =
+        c = hmodel.encoder(wav16k)
        torch.save(c.cpu(), soft_path)
 
     f0_path = filename + ".f0.npy"
     if not os.path.exists(f0_path):
-
-
+        f0_predictor = utils.get_f0_predictor(f0p,sampling_rate=sampling_rate, hop_length=hop_length,device=None,threshold=0.05)
+        f0,uv = f0_predictor.compute_f0_uv(
+            wav
         )
-        np.save(f0_path, f0)
+        np.save(f0_path, np.asanyarray((f0,uv),dtype=object))
+
 
     spec_path = filename.replace(".wav", ".spec.pt")
     if not os.path.exists(spec_path):
@@ -45,7 +56,6 @@ def process_one(filename, hmodel):
         # The following code can't be replaced by torch.FloatTensor(wav)
         # because load_wav_to_torch return a tensor that need to be normalized
 
-        audio, sr = utils.load_wav_to_torch(filename)
         if sr != hps.data.sampling_rate:
             raise ValueError(
                 "{} SR doesn't match target {} SR".format(
@@ -53,8 +63,7 @@ def process_one(filename, hmodel):
                 )
             )
 
-        audio_norm = audio / hps.data.max_wav_value
-        audio_norm = audio_norm.unsqueeze(0)
+        #audio_norm = audio / hps.data.max_wav_value
 
         spec = spectrogram_torch(
             audio_norm,
@@ -67,35 +76,88 @@ def process_one(filename, hmodel):
         spec = torch.squeeze(spec, 0)
         torch.save(spec, spec_path)
 
-
-
-
+    if diff or hps.model.vol_embedding:
+        volume_path = filename + ".vol.npy"
+        volume_extractor = utils.Volume_Extractor(hop_length)
+        if not os.path.exists(volume_path):
+            volume = volume_extractor.extract(audio_norm)
+            np.save(volume_path, volume.to('cpu').numpy())
+
+    if diff:
+        mel_path = filename + ".mel.npy"
+        if not os.path.exists(mel_path) and mel_extractor is not None:
+            mel_t = mel_extractor.extract(audio_norm.to(device), sampling_rate)
+            mel = mel_t.squeeze().to('cpu').numpy()
+            np.save(mel_path, mel)
+        aug_mel_path = filename + ".aug_mel.npy"
+        aug_vol_path = filename + ".aug_vol.npy"
+        max_amp = float(torch.max(torch.abs(audio_norm))) + 1e-5
+        max_shift = min(1, np.log10(1/max_amp))
+        log10_vol_shift = random.uniform(-1, max_shift)
+        keyshift = random.uniform(-5, 5)
+        if mel_extractor is not None:
+            aug_mel_t = mel_extractor.extract(audio_norm * (10 ** log10_vol_shift), sampling_rate, keyshift = keyshift)
+        aug_mel = aug_mel_t.squeeze().to('cpu').numpy()
+        aug_vol = volume_extractor.extract(audio_norm * (10 ** log10_vol_shift))
+        if not os.path.exists(aug_mel_path):
+            np.save(aug_mel_path,np.asanyarray((aug_mel,keyshift),dtype=object))
+        if not os.path.exists(aug_vol_path):
+            np.save(aug_vol_path,aug_vol.to('cpu').numpy())
+
+def process_batch(file_chunk, f0p, diff=False, mel_extractor=None):
+    print("Loading speech encoder for content...")
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    hmodel = utils.
-    print("Loaded
-    for filename in tqdm(filenames):
-        process_one(filename, hmodel)
+    hmodel = utils.get_speech_encoder(speech_encoder, device=device)
+    print("Loaded speech encoder.")
 
+    for filename in tqdm(file_chunk):
+        process_one(filename, hmodel, f0p, diff, mel_extractor)
+
+def parallel_process(filenames, num_processes, f0p, diff, mel_extractor):
+    with ProcessPoolExecutor(max_workers=num_processes) as executor:
+        tasks = []
+        for i in range(num_processes):
+            start = int(i * len(filenames) / num_processes)
+            end = int((i + 1) * len(filenames) / num_processes)
+            file_chunk = filenames[start:end]
+            tasks.append(executor.submit(process_batch, file_chunk, f0p, diff, mel_extractor))
+
+        for task in tqdm(tasks):
+            task.result()
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--in_dir", type=str, default="dataset/44k", help="path to input dir"
     )
-
+    parser.add_argument(
+        '--use_diff',action='store_true', help='Whether to use the diffusion model'
+    )
+    parser.add_argument(
+        '--f0_predictor', type=str, default="dio", help='Select F0 predictor, can select crepe,pm,dio,harvest,rmvpe, default pm(note: crepe is original F0 using mean filter)'
+    )
+    parser.add_argument(
+        '--num_processes', type=int, default=1, help='You are advised to set the number of processes to the same as the number of CPU cores'
+    )
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     args = parser.parse_args()
+    f0p = args.f0_predictor
+    print(speech_encoder)
+    print(f0p)
+    print(args.use_diff)
+    if args.use_diff:
+        print("use_diff")
+        print("Loading Mel Extractor...")
+        mel_extractor = Vocoder(dconfig.vocoder.type, dconfig.vocoder.ckpt, device = device)
+        print("Loaded Mel Extractor.")
+    else:
+        mel_extractor = None
     filenames = glob(f"{args.in_dir}/*/*.wav", recursive=True)  # [:10]
     shuffle(filenames)
     multiprocessing.set_start_method("spawn", force=True)
 
-    num_processes =
-
-
-
-
-    print([len(c) for c in chunks])
-    processes = [
-        multiprocessing.Process(target=process_batch, args=(chunk,)) for chunk in chunks
-    ]
-    for p in processes:
-        p.start()
+    num_processes = args.num_processes
+    if num_processes == 0:
+        num_processes = os.cpu_count()
+
+    parallel_process(filenames, num_processes, f0p, args.use_diff, mel_extractor)
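A small illustration (not part of the script) of how parallel_process partitions the file list: worker i receives the slice [i*N/num_processes, (i+1)*N/num_processes), so the chunks cover every file exactly once even when the division is not even. The dummy file names below are made up.

filenames = [f"dataset/44k/spk/{i}.wav" for i in range(10)]
num_processes = 3
chunks = [filenames[int(i * len(filenames) / num_processes):
                    int((i + 1) * len(filenames) / num_processes)]
          for i in range(num_processes)]
print([len(c) for c in chunks])  # -> [3, 3, 4]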
resample.py
CHANGED
@@ -1,48 +1,98 @@
-import os
 import argparse
+import concurrent.futures
+import os
+from concurrent.futures import ProcessPoolExecutor
+from multiprocessing import cpu_count
+
 import librosa
 import numpy as np
-from multiprocessing import Pool, cpu_count
 from scipy.io import wavfile
 from tqdm import tqdm
 
 
+def load_wav(wav_path):
+    return librosa.load(wav_path, sr=None)
+
+
+def trim_wav(wav, top_db=40):
+    return librosa.effects.trim(wav, top_db=top_db)
+
+
+def normalize_peak(wav, threshold=1.0):
+    peak = np.abs(wav).max()
+    if peak > threshold:
+        wav = 0.98 * wav / peak
+    return wav
+
+
+def resample_wav(wav, sr, target_sr):
+    return librosa.resample(wav, orig_sr=sr, target_sr=target_sr)
+
+
+def save_wav_to_path(wav, save_path, sr):
+    wavfile.write(
+        save_path,
+        sr,
+        (wav * np.iinfo(np.int16).max).astype(np.int16)
+    )
+
+
 def process(item):
     spkdir, wav_name, args = item
-    # speaker 's5', 'p280', 'p315' are excluded,
     speaker = spkdir.replace("\\", "/").split("/")[-1]
+
     wav_path = os.path.join(args.in_dir, speaker, wav_name)
     if os.path.exists(wav_path) and '.wav' in wav_path:
         os.makedirs(os.path.join(args.out_dir2, speaker), exist_ok=True)
-
-        wav,
-
-
-
-
-
-
-
-
-
-            args.sr2,
-            (wav2 * np.iinfo(np.int16).max).astype(np.int16)
-        )
+
+        wav, sr = load_wav(wav_path)
+        wav, _ = trim_wav(wav)
+        wav = normalize_peak(wav)
+        resampled_wav = resample_wav(wav, sr, args.sr2)
+
+        if not args.skip_loudnorm:
+            resampled_wav /= np.max(np.abs(resampled_wav))
+
+        save_path2 = os.path.join(args.out_dir2, speaker, wav_name)
+        save_wav_to_path(resampled_wav, save_path2, args.sr2)
 
 
+"""
+def process_all_speakers():
+    process_count = 30 if os.cpu_count() > 60 else (os.cpu_count() - 2 if os.cpu_count() > 4 else 1)
+
+    with ThreadPoolExecutor(max_workers=process_count) as executor:
+        for speaker in speakers:
+            spk_dir = os.path.join(args.in_dir, speaker)
+            if os.path.isdir(spk_dir):
+                print(spk_dir)
+                futures = [executor.submit(process, (spk_dir, i, args)) for i in os.listdir(spk_dir) if i.endswith("wav")]
+                for _ in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
+                    pass
+"""
+# multi process
+
+
+def process_all_speakers():
+    process_count = 30 if os.cpu_count() > 60 else (os.cpu_count() - 2 if os.cpu_count() > 4 else 1)
+    with ProcessPoolExecutor(max_workers=process_count) as executor:
+        for speaker in speakers:
+            spk_dir = os.path.join(args.in_dir, speaker)
+            if os.path.isdir(spk_dir):
+                print(spk_dir)
+                futures = [executor.submit(process, (spk_dir, i, args)) for i in os.listdir(spk_dir) if i.endswith("wav")]
+                for _ in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
+                    pass
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--sr2", type=int, default=44100, help="sampling rate")
     parser.add_argument("--in_dir", type=str, default="./dataset_raw", help="path to source dir")
     parser.add_argument("--out_dir2", type=str, default="./dataset/44k", help="path to target dir")
+    parser.add_argument("--skip_loudnorm", action="store_true", help="Skip loudness matching if you have done it")
     args = parser.parse_args()
-
-
-
-
-        spk_dir = os.path.join(args.in_dir, speaker)
-        if os.path.isdir(spk_dir):
-            print(spk_dir)
-            for _ in tqdm(pool.imap_unordered(process, [(spk_dir, i, args) for i in os.listdir(spk_dir) if i.endswith("wav")])):
-                pass
+
+    print(f"CPU count: {cpu_count()}")
+    speakers = os.listdir(args.in_dir)
+    process_all_speakers()
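A small check (not part of the script) of the normalize_peak helper added above: it rescales only when the absolute peak exceeds the threshold (1.0 by default) and leaves quieter audio untouched.

import numpy as np
from resample import normalize_peak

loud = np.array([0.5, -2.0, 1.0])
print(normalize_peak(loud))         # peak 2.0 > 1.0, scaled to [0.245, -0.98, 0.49]
print(normalize_peak(loud * 0.25))  # peak 0.5 <= 1.0, returned unchanged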
spkmix.py
ADDED
@@ -0,0 +1,11 @@
# Speaker mix track format:
# speaker_id : [[start_time_1, end_time_1, start_value_1, end_value_1], [start_time_2, end_time_2, start_value_2, end_value_2]]
# Each segment's start time must equal the previous segment's end time; the first start time must be 0 and the last end time must be 1 (times are normalized to the range 0-1).
# Every speaker must be listed; for unused speakers just use [[0., 1., 0., 0.]].
# The mix values can be chosen freely: within each segment the value changes linearly from the start value to the end value, and the code normalizes the linear combination to 1 internally, so any values are safe to use.

spk_mix_map = {
    0 : [[0., 0.5, 1, 0.5], [0.5, 1, 0.5, 1]],
    1 : [[0., 0.35, 1, 0.5], [0.35, 0.75, 0.75, 1], [0.75, 1, 0.45, 1]],
    2 : [[0., 0.35, 1, 0.5], [0.35, 0.75, 0.75, 1], [0.75, 1, 0.45, 1]]
}
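The comments above describe piecewise-linear mix tracks. The sketch below is not part of the commit; it only illustrates how one track would be evaluated at a normalized position t, assuming linear interpolation inside each segment (the inference code additionally renormalizes across speakers so the weights sum to 1, as the comments state).

def mix_value(track, t):
    # Find the segment containing t and interpolate between its start/end values.
    for begin, end, v_start, v_end in track:
        if begin <= t <= end:
            if end == begin:
                return v_start
            return v_start + (v_end - v_start) * (t - begin) / (end - begin)
    raise ValueError("segments must cover the range 0-1")

print(mix_value([[0., 0.5, 1, 0.5], [0.5, 1, 0.5, 1]], 0.25))  # -> 0.75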
train_diff.py
ADDED
@@ -0,0 +1,76 @@
import argparse

import torch
from torch.optim import lr_scheduler

from diffusion.data_loaders import get_data_loaders
from diffusion.logger import utils
from diffusion.solver import train
from diffusion.unit2mel import Unit2Mel
from diffusion.vocoder import Vocoder


def parse_args(args=None, namespace=None):
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-c",
        "--config",
        type=str,
        required=True,
        help="path to the config file")
    return parser.parse_args(args=args, namespace=namespace)


if __name__ == '__main__':
    # parse commands
    cmd = parse_args()

    # load config
    args = utils.load_config(cmd.config)
    print(' > config:', cmd.config)
    print(' > exp:', args.env.expdir)

    # load vocoder
    vocoder = Vocoder(args.vocoder.type, args.vocoder.ckpt, device=args.device)

    # load model
    model = Unit2Mel(
        args.data.encoder_out_channels,
        args.model.n_spk,
        args.model.use_pitch_aug,
        vocoder.dimension,
        args.model.n_layers,
        args.model.n_chans,
        args.model.n_hidden,
        args.model.timesteps,
        args.model.k_step_max
    )

    print(f' > INFO: now model timesteps is {model.timesteps}, and k_step_max is {model.k_step_max}')

    # load parameters
    optimizer = torch.optim.AdamW(model.parameters())
    initial_global_step, model, optimizer = utils.load_model(args.env.expdir, model, optimizer, device=args.device)
    for param_group in optimizer.param_groups:
        param_group['initial_lr'] = args.train.lr
        param_group['lr'] = args.train.lr * (args.train.gamma ** max(((initial_global_step-2)//args.train.decay_step),0) )
        param_group['weight_decay'] = args.train.weight_decay
    scheduler = lr_scheduler.StepLR(optimizer, step_size=args.train.decay_step, gamma=args.train.gamma,last_epoch=initial_global_step-2)

    # device
    if args.device == 'cuda':
        torch.cuda.set_device(args.env.gpu_id)
    model.to(args.device)

    for state in optimizer.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.to(args.device)

    # datas
    loader_train, loader_valid = get_data_loaders(args, whole_audio=False)

    # run
    train(args, initial_global_step, model, optimizer, scheduler, vocoder, loader_train, loader_valid)
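The resume logic above rescales the learning rate by gamma once per decay_step steps already trained, so a restarted run continues on the same schedule. A quick numeric check (not part of the script; the hyperparameter values below are made up):

lr, gamma, decay_step = 1e-4, 0.5, 10000
initial_global_step = 25000
resumed_lr = lr * (gamma ** max((initial_global_step - 2) // decay_step, 0))
print(resumed_lr)  # 2.5e-05: two decays have already happened by step 25000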
train_index.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import argparse
import os
import pickle

import utils

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--root_dir", type=str, default="dataset/44k", help="path to root dir"
    )
    parser.add_argument('-c', '--config', type=str, default="./configs/config.json",
                        help='JSON file for configuration')
    parser.add_argument(
        "--output_dir", type=str, default="logs/44k", help="path to output dir"
    )

    args = parser.parse_args()

    hps = utils.get_hparams_from_file(args.config)
    spk_dic = hps.spk
    result = {}

    for k,v in spk_dic.items():
        print(f"now, index {k} feature...")
        index = utils.train_index(k,args.root_dir)
        result[v] = index

    with open(os.path.join(args.output_dir,"feature_and_index.pkl"),"wb") as f:
        pickle.dump(result,f)
utils.py
CHANGED
@@ -1,22 +1,21 @@
-import os
-import glob
-import re
-import sys
 import argparse
-import
+import glob
 import json
+import logging
+import os
+import re
 import subprocess
-import
-import
-import
+import sys
+import traceback
+from multiprocessing import cpu_count
+
+import faiss
 import librosa
 import numpy as np
-from scipy.io.wavfile import read
 import torch
+from scipy.io.wavfile import read
+from sklearn.cluster import MiniBatchKMeans
 from torch.nn import functional as F
-from modules.commons import sequence_mask
-import faiss
-import tqdm
 
 MATPLOTLIB_FLAG = False
 
@@ -97,7 +96,10 @@ def get_f0_predictor(f0_predictor,hop_length,sampling_rate,**kargs):
         f0_predictor_object = HarvestF0Predictor(hop_length=hop_length,sampling_rate=sampling_rate)
     elif f0_predictor == "dio":
         from modules.F0Predictor.DioF0Predictor import DioF0Predictor
-        f0_predictor_object = DioF0Predictor(hop_length=hop_length,sampling_rate=sampling_rate)
+        f0_predictor_object = DioF0Predictor(hop_length=hop_length,sampling_rate=sampling_rate)
+    elif f0_predictor == "rmvpe":
+        from modules.F0Predictor.RMVPEF0Predictor import RMVPEF0Predictor
+        f0_predictor_object = RMVPEF0Predictor(hop_length=hop_length,sampling_rate=sampling_rate,dtype=torch.float32 ,device=kargs["device"],threshold=kargs["threshold"])
     else:
         raise Exception("Unknown f0 predictor")
     return f0_predictor_object
@@ -130,6 +132,18 @@ def get_speech_encoder(speech_encoder,device=None,**kargs):
     elif speech_encoder == "whisper-ppg":
         from vencoder.WhisperPPG import WhisperPPG
         speech_encoder_object = WhisperPPG(device = device)
+    elif speech_encoder == "cnhubertlarge":
+        from vencoder.CNHubertLarge import CNHubertLarge
+        speech_encoder_object = CNHubertLarge(device = device)
+    elif speech_encoder == "dphubert":
+        from vencoder.DPHubert import DPHubert
+        speech_encoder_object = DPHubert(device = device)
+    elif speech_encoder == "whisper-ppg-large":
+        from vencoder.WhisperPPGLarge import WhisperPPGLarge
+        speech_encoder_object = WhisperPPGLarge(device = device)
+    elif speech_encoder == "wavlmbase+":
+        from vencoder.WavLMBasePlus import WavLMBasePlus
+        speech_encoder_object = WavLMBasePlus(device = device)
     else:
         raise Exception("Unknown speech encoder")
     return speech_encoder_object
@@ -142,6 +156,7 @@ def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False
    if optimizer is not None and not skip_optimizer and checkpoint_dict['optimizer'] is not None:
        optimizer.load_state_dict(checkpoint_dict['optimizer'])
    saved_state_dict = checkpoint_dict['model']
+   model = model.to(list(saved_state_dict.values())[0].dtype)
    if hasattr(model, 'module'):
        state_dict = model.module.state_dict()
    else:
@@ -153,10 +168,11 @@ def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False
            # print("load", k)
            new_state_dict[k] = saved_state_dict[k]
            assert saved_state_dict[k].shape == v.shape, (saved_state_dict[k].shape, v.shape)
-       except:
-
-
-
+       except Exception:
+           if "enc_q" not in k or "emb_g" not in k:
+               print("error, %s is not in the checkpoint" % k)
+               logger.info("%s is not in the checkpoint" % k)
+               new_state_dict[k] = v
    if hasattr(model, 'module'):
        model.module.load_state_dict(new_state_dict)
    else:
@@ -189,15 +205,20 @@ def clean_checkpoints(path_to_models='logs/44k/', n_ckpts_to_keep=2, sort_by_tim
                      False -> lexicographically delete ckpts
    """
    ckpts_files = [f for f in os.listdir(path_to_models) if os.path.isfile(os.path.join(path_to_models, f))]
-   name_key
-
+   def name_key(_f):
+       return int(re.compile("._(\\d+)\\.pth").match(_f).group(1))
+   def time_key(_f):
+       return os.path.getmtime(os.path.join(path_to_models, _f))
    sort_key = time_key if sort_by_time else name_key
-   x_sorted
+   def x_sorted(_x):
+       return sorted([f for f in ckpts_files if f.startswith(_x) and not f.endswith("_0.pth")], key=sort_key)
    to_del = [os.path.join(path_to_models, fn) for fn in
              (x_sorted('G')[:-n_ckpts_to_keep] + x_sorted('D')[:-n_ckpts_to_keep])]
-   del_info
-
-
+   def del_info(fn):
+       return logger.info(f".. Free up space by deleting ckpt {fn}")
+   def del_routine(x):
+       return [os.remove(x), del_info(x)]
+   [del_routine(fn) for fn in to_del]
 
 def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
    for k, v in scalars.items():
@@ -325,11 +346,11 @@ def get_hparams_from_dir(model_dir):
    return hparams
 
 
-def get_hparams_from_file(config_path):
+def get_hparams_from_file(config_path, infer_mode = False):
    with open(config_path, "r") as f:
        data = f.read()
    config = json.loads(data)
-   hparams =HParams(**config)
+   hparams =HParams(**config) if not infer_mode else InferHParams(**config)
    return hparams
 
 
@@ -368,7 +389,13 @@ def get_logger(model_dir, filename="train.log"):
    return logger
 
 
-def repeat_expand_2d(content, target_len):
+def repeat_expand_2d(content, target_len, mode = 'left'):
+   # content : [h, t]
+   return repeat_expand_2d_left(content, target_len) if mode == 'left' else repeat_expand_2d_other(content, target_len, mode)
+
+
+
+def repeat_expand_2d_left(content, target_len):
    # content : [h, t]
 
    src_len = content.shape[-1]
@@ -385,6 +412,14 @@ def repeat_expand_2d(content, target_len):
    return target
 
 
+# mode : 'nearest'| 'linear'| 'bilinear'| 'bicubic'| 'trilinear'| 'area'
+def repeat_expand_2d_other(content, target_len, mode = 'nearest'):
+   # content : [h, t]
+   content = content[None,:,:]
+   target = F.interpolate(content,size=target_len,mode=mode)[0]
+   return target
+
+
 def mix_model(model_paths,mix_rate,mode):
    mix_rate = torch.FloatTensor(mix_rate)/100
    model_tem = torch.load(model_paths[0])
@@ -420,6 +455,7 @@ def change_rms(data1, sr1, data2, sr2, rate): # 1是输入音频,2是输出
    return data2
 
 def train_index(spk_name,root_dir = "dataset/44k/"): #from: RVC https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI
+   n_cpu = cpu_count()
    print("The feature index is constructing.")
    exp_dir = os.path.join(root_dir,spk_name)
    listdir_res = []
@@ -436,6 +472,25 @@ def train_index(spk_name,root_dir = "dataset/44k/"): #from: RVC https://github.
    big_npy_idx = np.arange(big_npy.shape[0])
    np.random.shuffle(big_npy_idx)
    big_npy = big_npy[big_npy_idx]
+   if big_npy.shape[0] > 2e5:
+       # if(1):
+       info = "Trying doing kmeans %s shape to 10k centers." % big_npy.shape[0]
+       print(info)
+       try:
+           big_npy = (
+               MiniBatchKMeans(
+                   n_clusters=10000,
+                   verbose=True,
+                   batch_size=256 * n_cpu,
+                   compute_labels=False,
+                   init="random",
+               )
+               .fit(big_npy)
+               .cluster_centers_
+           )
+       except Exception:
+           info = traceback.format_exc()
+           print(info)
    n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
    index = faiss.index_factory(big_npy.shape[1] , "IVF%s,Flat" % n_ivf)
    index_ivf = faiss.extract_index_ivf(index) #
@@ -486,6 +541,18 @@ class HParams():
    def get(self,index):
        return self.__dict__.get(index)
 
+
+class InferHParams(HParams):
+   def __init__(self, **kwargs):
+       for k, v in kwargs.items():
+           if type(v) == dict:
+               v = InferHParams(**v)
+           self[k] = v
+
+   def __getattr__(self,index):
+       return self.get(index)
+
+
 class Volume_Extractor:
    def __init__(self, hop_size = 512):
        self.hop_size = hop_size
@@ -496,6 +563,6 @@ class Volume_Extractor:
        n_frames = int(audio.size(-1) // self.hop_size)
        audio2 = audio ** 2
        audio2 = torch.nn.functional.pad(audio2, (int(self.hop_size // 2), int((self.hop_size + 1) // 2)), mode = 'reflect')
-       volume = torch.
+       volume = torch.nn.functional.unfold(audio2[:,None,None,:],(1,self.hop_size),stride=self.hop_size)[:,:,:n_frames].mean(dim=1)[0]
        volume = torch.sqrt(volume)
-       return volume
+       return volume
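As a quick sanity check of the new mode switch on repeat_expand_2d, a small illustrative snippet; the tensor sizes are arbitrary, and only the behaviour shown in the hunks above is relied on:

import torch

from utils import repeat_expand_2d

content = torch.randn(256, 100)                             # [h, t] content features
stretched = repeat_expand_2d(content, 180, mode='nearest')  # dispatches to repeat_expand_2d_other
print(stretched.shape)                                      # torch.Size([256, 180])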