SayaSS committed
Commit
e302df6
1 Parent(s): 762c569
app.py CHANGED
@@ -90,7 +90,7 @@ if __name__ == '__main__':
         voices.append(f"{r['ShortName']}-{r['Gender']}")
     for f in os.listdir("models"):
         name = f
-        model = Svc(fr"models/{f}/{f}.pth", f"models/{f}/config.json", device=args.device, hubert_model=hubert_model)
+        model = Svc(fr"models/{f}/{f}.pth", f"models/{f}/config.json", device=args.device)
         cover = f"models/{f}/cover.png" if os.path.exists(f"models/{f}/cover.png") else None
         models.append((name, cover, create_vc_fn(model, name)))
     with gr.Blocks() as app:
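Note: with this commit app.py no longer passes a shared hubert_model into Svc, since Svc now loads HuBERT internally (see the infer_tool.py change below). A minimal construction sketch; the speaker folder name "ExampleSpeaker" and device "cpu" are placeholders, not part of the commit:

    from inference.infer_tool import Svc

    # assumes models/ExampleSpeaker/ contains ExampleSpeaker.pth and config.json
    model = Svc("models/ExampleSpeaker/ExampleSpeaker.pth",
                "models/ExampleSpeaker/config.json",
                device="cpu")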
cluster/__pycache__/__init__.cpython-38.pyc CHANGED
Binary files a/cluster/__pycache__/__init__.cpython-38.pyc and b/cluster/__pycache__/__init__.cpython-38.pyc differ
 
cvec/checkpoint_best_legacy_500.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:294a2e8c98136070a999e040ec98dfa5a99b88a7938181c56cc2ab0e2f6ce0e8
- size 48501067

data_utils.py ADDED
@@ -0,0 +1,155 @@
+ import time
+ import os
+ import random
+ import numpy as np
+ import torch
+ import torch.utils.data
+
+ import modules.commons as commons
+ import utils
+ from modules.mel_processing import spectrogram_torch, spec_to_mel_torch
+ from utils import load_wav_to_torch, load_filepaths_and_text
+
+ # import h5py
+
+
+ """Multi speaker version"""
+
+
+ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
+     """
+     1) loads audio, speaker_id, text pairs
+     2) normalizes text and converts them to sequences of integers
+     3) computes spectrograms from audio files.
+     """
+
+     def __init__(self, audiopaths, hparams, all_in_mem: bool = False):
+         self.audiopaths = load_filepaths_and_text(audiopaths)
+         self.max_wav_value = hparams.data.max_wav_value
+         self.sampling_rate = hparams.data.sampling_rate
+         self.filter_length = hparams.data.filter_length
+         self.hop_length = hparams.data.hop_length
+         self.win_length = hparams.data.win_length
+         self.sampling_rate = hparams.data.sampling_rate
+         self.use_sr = hparams.train.use_sr
+         self.spec_len = hparams.train.max_speclen
+         self.spk_map = hparams.spk
+
+         random.seed(1234)
+         random.shuffle(self.audiopaths)
+
+         self.all_in_mem = all_in_mem
+         if self.all_in_mem:
+             self.cache = [self.get_audio(p[0]) for p in self.audiopaths]
+
+     def get_audio(self, filename):
+         filename = filename.replace("\\", "/")
+         audio, sampling_rate = load_wav_to_torch(filename)
+         if sampling_rate != self.sampling_rate:
+             raise ValueError("{} SR doesn't match target {} SR".format(
+                 sampling_rate, self.sampling_rate))
+         audio_norm = audio / self.max_wav_value
+         audio_norm = audio_norm.unsqueeze(0)
+         spec_filename = filename.replace(".wav", ".spec.pt")
+
+         # Ideally, all data generated after Mar 25 should have .spec.pt
+         if os.path.exists(spec_filename):
+             spec = torch.load(spec_filename)
+         else:
+             spec = spectrogram_torch(audio_norm, self.filter_length,
+                                      self.sampling_rate, self.hop_length, self.win_length,
+                                      center=False)
+             spec = torch.squeeze(spec, 0)
+             torch.save(spec, spec_filename)
+
+         spk = filename.split("/")[-2]
+         spk = torch.LongTensor([self.spk_map[spk]])
+
+         f0 = np.load(filename + ".f0.npy")
+         f0, uv = utils.interpolate_f0(f0)
+         f0 = torch.FloatTensor(f0)
+         uv = torch.FloatTensor(uv)
+
+         c = torch.load(filename+ ".soft.pt")
+         c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[0])
+
+
+         lmin = min(c.size(-1), spec.size(-1))
+         assert abs(c.size(-1) - spec.size(-1)) < 3, (c.size(-1), spec.size(-1), f0.shape, filename)
+         assert abs(audio_norm.shape[1]-lmin * self.hop_length) < 3 * self.hop_length
+         spec, c, f0, uv = spec[:, :lmin], c[:, :lmin], f0[:lmin], uv[:lmin]
+         audio_norm = audio_norm[:, :lmin * self.hop_length]
+
+         return c, f0, spec, audio_norm, spk, uv
+
+     def random_slice(self, c, f0, spec, audio_norm, spk, uv):
+         # if spec.shape[1] < 30:
+         #     print("skip too short audio:", filename)
+         #     return None
+         if spec.shape[1] > 800:
+             start = random.randint(0, spec.shape[1]-800)
+             end = start + 790
+             spec, c, f0, uv = spec[:, start:end], c[:, start:end], f0[start:end], uv[start:end]
+             audio_norm = audio_norm[:, start * self.hop_length : end * self.hop_length]
+
+         return c, f0, spec, audio_norm, spk, uv
+
+     def __getitem__(self, index):
+         if self.all_in_mem:
+             return self.random_slice(*self.cache[index])
+         else:
+             return self.random_slice(*self.get_audio(self.audiopaths[index][0]))
+
+     def __len__(self):
+         return len(self.audiopaths)
+
+
+ class TextAudioCollate:
+
+     def __call__(self, batch):
+         batch = [b for b in batch if b is not None]
+
+         input_lengths, ids_sorted_decreasing = torch.sort(
+             torch.LongTensor([x[0].shape[1] for x in batch]),
+             dim=0, descending=True)
+
+         max_c_len = max([x[0].size(1) for x in batch])
+         max_wav_len = max([x[3].size(1) for x in batch])
+
+         lengths = torch.LongTensor(len(batch))
+
+         c_padded = torch.FloatTensor(len(batch), batch[0][0].shape[0], max_c_len)
+         f0_padded = torch.FloatTensor(len(batch), max_c_len)
+         spec_padded = torch.FloatTensor(len(batch), batch[0][2].shape[0], max_c_len)
+         wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
+         spkids = torch.LongTensor(len(batch), 1)
+         uv_padded = torch.FloatTensor(len(batch), max_c_len)
+
+         c_padded.zero_()
+         spec_padded.zero_()
+         f0_padded.zero_()
+         wav_padded.zero_()
+         uv_padded.zero_()
+
+         for i in range(len(ids_sorted_decreasing)):
+             row = batch[ids_sorted_decreasing[i]]
+
+             c = row[0]
+             c_padded[i, :, :c.size(1)] = c
+             lengths[i] = c.size(1)
+
+             f0 = row[1]
+             f0_padded[i, :f0.size(0)] = f0
+
+             spec = row[2]
+             spec_padded[i, :, :spec.size(1)] = spec
+
+             wav = row[3]
+             wav_padded[i, :, :wav.size(1)] = wav
+
+             spkids[i, 0] = row[4]
+
+             uv = row[5]
+             uv_padded[i, :uv.size(0)] = uv
+
+         return c_padded, f0_padded, spec_padded, wav_padded, spkids, lengths, uv_padded
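Note: the new TextAudioSpeakerLoader and TextAudioCollate plug into a standard PyTorch DataLoader. A minimal sketch, assuming hps is the parsed hparams object used elsewhere in the repo and "filelists/train.txt" is a placeholder file list:

    from torch.utils.data import DataLoader
    from data_utils import TextAudioSpeakerLoader, TextAudioCollate

    dataset = TextAudioSpeakerLoader("filelists/train.txt", hps)  # hps: hparams object (assumed)
    loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=TextAudioCollate())
    # padded batch, in the order returned by TextAudioCollate above
    c, f0, spec, wav, spkids, lengths, uv = next(iter(loader))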
hubert/__pycache__/__init__.cpython-38.pyc CHANGED
Binary files a/hubert/__pycache__/__init__.cpython-38.pyc and b/hubert/__pycache__/__init__.cpython-38.pyc differ
 
hubert/__pycache__/hubert_model.cpython-38.pyc CHANGED
Binary files a/hubert/__pycache__/hubert_model.cpython-38.pyc and b/hubert/__pycache__/hubert_model.cpython-38.pyc differ
 
inference/__pycache__/infer_tool.cpython-38.pyc CHANGED
Binary files a/inference/__pycache__/infer_tool.cpython-38.pyc and b/inference/__pycache__/infer_tool.cpython-38.pyc differ
 
inference/infer_tool.py CHANGED
@@ -108,8 +108,11 @@ def split_list_by_n(list_collection, n, pre=0):
         yield list_collection[i-pre if i-pre>=0 else i: i + n]


+class F0FilterException(Exception):
+    pass
+
 class Svc(object):
-    def __init__(self, net_g_path, config_path, hubert_model,
+    def __init__(self, net_g_path, config_path,
                  device=None,
                  cluster_model_path="logs/44k/kmeans_10000.pt"):
         self.net_g_path = net_g_path
@@ -123,7 +126,7 @@ class Svc(object):
         self.hop_size = self.hps_ms.data.hop_length
         self.spk2id = self.hps_ms.spk
         # load hubert
-        self.hubert_model = hubert_model
+        self.hubert_model = utils.get_hubert_model().to(self.dev)
         self.load_model()
         if os.path.exists(cluster_model_path):
             self.cluster_model = cluster.get_cluster_model(cluster_model_path)
@@ -142,12 +145,24 @@ class Svc(object):



-    def get_unit_f0(self, in_path, tran, cluster_infer_ratio, speaker):
+    def get_unit_f0(self, in_path, tran, cluster_infer_ratio, speaker, f0_filter ,F0_mean_pooling):
+
         wav, sr = librosa.load(in_path, sr=self.target_sample)
-        f0 = utils.compute_f0_parselmouth(wav, sampling_rate=self.target_sample, hop_length=self.hop_size)
-        f0, uv = utils.interpolate_f0(f0)
-        f0 = torch.FloatTensor(f0)
-        uv = torch.FloatTensor(uv)
+
+        if F0_mean_pooling == True:
+            f0, uv = utils.compute_f0_uv_torchcrepe(torch.FloatTensor(wav), sampling_rate=self.target_sample, hop_length=self.hop_size, device=self.dev)
+            if f0_filter and sum(f0) == 0:
+                raise F0FilterException("No voice detected")
+            f0 = torch.FloatTensor(list(f0))
+            uv = torch.FloatTensor(list(uv))
+        if F0_mean_pooling == False:
+            f0 = utils.compute_f0_parselmouth(wav, sampling_rate=self.target_sample, hop_length=self.hop_size)
+            if f0_filter and sum(f0) == 0:
+                raise F0FilterException("No voice detected")
+            f0, uv = utils.interpolate_f0(f0)
+            f0 = torch.FloatTensor(f0)
+            uv = torch.FloatTensor(uv)
+
         f0 = f0 * 2 ** (tran / 12)
         f0 = f0.unsqueeze(0).to(self.dev)
         uv = uv.unsqueeze(0).to(self.dev)
@@ -157,7 +172,7 @@ class Svc(object):
         c = utils.get_hubert_content(self.hubert_model, wav_16k_tensor=wav16k)
         c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])

-        if cluster_infer_ratio != 0:
+        if cluster_infer_ratio !=0:
            cluster_c = cluster.get_cluster_center_result(self.cluster_model, c.cpu().numpy().T, speaker).T
            cluster_c = torch.FloatTensor(cluster_c).to(self.dev)
            c = cluster_infer_ratio * cluster_c + (1 - cluster_infer_ratio) * c
@@ -168,13 +183,17 @@ class Svc(object):
     def infer(self, speaker, tran, raw_path,
               cluster_infer_ratio=0,
               auto_predict_f0=False,
-              noice_scale=0.4):
+              noice_scale=0.4,
+              f0_filter=False,
+              F0_mean_pooling=False
+              ):
+
         speaker_id = self.spk2id.__dict__.get(speaker)
         if not speaker_id and type(speaker) is int:
             if len(self.spk2id.__dict__) >= speaker:
                 speaker_id = speaker
         sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
-        c, f0, uv = self.get_unit_f0(raw_path, tran, cluster_infer_ratio, speaker)
+        c, f0, uv = self.get_unit_f0(raw_path, tran, cluster_infer_ratio, speaker, f0_filter,F0_mean_pooling)
         if "half" in self.net_g_path and torch.cuda.is_available():
             c = c.half()
         with torch.no_grad():
@@ -183,23 +202,35 @@ class Svc(object):
         use_time = time.time() - start
         print("vits use time:{}".format(use_time))
         return audio, audio.shape[-1]
-
+
     def clear_empty(self):
         # free GPU memory
         torch.cuda.empty_cache()

-    def slice_inference(self, raw_audio_path, spk, tran, slice_db, cluster_infer_ratio, auto_predict_f0, noice_scale,
-                        pad_seconds=0.5, clip_seconds=0, lg_num=0, lgr_num=0.75):
+    def slice_inference(self,
+                        raw_audio_path,
+                        spk,
+                        tran,
+                        slice_db,
+                        cluster_infer_ratio,
+                        auto_predict_f0,
+                        noice_scale,
+                        pad_seconds=0.5,
+                        clip_seconds=0,
+                        lg_num=0,
+                        lgr_num =0.75,
+                        F0_mean_pooling = False
+                        ):
         wav_path = raw_audio_path
         chunks = slicer.cut(wav_path, db_thresh=slice_db)
         audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
-        per_size = int(clip_seconds * audio_sr)
-        lg_size = int(lg_num * audio_sr)
-        lg_size_r = int(lg_size * lgr_num)
-        lg_size_c_l = (lg_size - lg_size_r) // 2
-        lg_size_c_r = lg_size - lg_size_r - lg_size_c_l
-        lg = np.linspace(0, 1, lg_size_r) if lg_size != 0 else 0
-
+        per_size = int(clip_seconds*audio_sr)
+        lg_size = int(lg_num*audio_sr)
+        lg_size_r = int(lg_size*lgr_num)
+        lg_size_c_l = (lg_size-lg_size_r)//2
+        lg_size_c_r = lg_size-lg_size_r-lg_size_c_l
+        lg = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0
+
         audio = []
         for (slice_tag, data) in audio_data:
             print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
@@ -211,12 +242,12 @@ class Svc(object):
                 audio.extend(list(pad_array(_audio, length)))
                 continue
             if per_size != 0:
-                datas = split_list_by_n(data, per_size, lg_size)
+                datas = split_list_by_n(data, per_size,lg_size)
             else:
                 datas = [data]
-            for k, dat in enumerate(datas):
-                per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds != 0 else length
-                if clip_seconds != 0: print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
+            for k,dat in enumerate(datas):
+                per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds!=0 else length
+                if clip_seconds!=0: print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
                 # padd
                 pad_len = int(audio_sr * pad_seconds)
                 dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
@@ -224,25 +255,25 @@ class Svc(object):
                 soundfile.write(raw_path, dat, audio_sr, format="wav")
                 raw_path.seek(0)
                 out_audio, out_sr = self.infer(spk, tran, raw_path,
-                                               cluster_infer_ratio=cluster_infer_ratio,
-                                               auto_predict_f0=auto_predict_f0,
-                                               noice_scale=noice_scale
-                                               )
+                                               cluster_infer_ratio=cluster_infer_ratio,
+                                               auto_predict_f0=auto_predict_f0,
+                                               noice_scale=noice_scale,
+                                               F0_mean_pooling = F0_mean_pooling
+                                               )
                 _audio = out_audio.cpu().numpy()
                 pad_len = int(self.target_sample * pad_seconds)
                 _audio = _audio[pad_len:-pad_len]
                 _audio = pad_array(_audio, per_length)
-                if lg_size != 0 and k != 0:
-                    lg1 = audio[-(lg_size_r + lg_size_c_r):-lg_size_c_r] if lgr_num != 1 else audio[-lg_size:]
-                    lg2 = _audio[lg_size_c_l:lg_size_c_l + lg_size_r] if lgr_num != 1 else _audio[0:lg_size]
-                    lg_pre = lg1 * (1 - lg) + lg2 * lg
-                    audio = audio[0:-(lg_size_r + lg_size_c_r)] if lgr_num != 1 else audio[0:-lg_size]
+                if lg_size!=0 and k!=0:
+                    lg1 = audio[-(lg_size_r+lg_size_c_r):-lg_size_c_r] if lgr_num != 1 else audio[-lg_size:]
+                    lg2 = _audio[lg_size_c_l:lg_size_c_l+lg_size_r] if lgr_num != 1 else _audio[0:lg_size]
+                    lg_pre = lg1*(1-lg)+lg2*lg
+                    audio = audio[0:-(lg_size_r+lg_size_c_r)] if lgr_num != 1 else audio[0:-lg_size]
                     audio.extend(lg_pre)
-                _audio = _audio[lg_size_c_l + lg_size_r:] if lgr_num != 1 else _audio[lg_size:]
+                _audio = _audio[lg_size_c_l+lg_size_r:] if lgr_num != 1 else _audio[lg_size:]
                 audio.extend(list(_audio))
         return np.array(audio)

-
 class RealTimeVC:
     def __init__(self):
         self.last_chunk = None
@@ -252,14 +283,25 @@ class RealTimeVC:

     """Input and output are both 1-D numpy audio waveform arrays"""

-    def process(self, svc_model, speaker_id, f_pitch_change, input_wav_path):
+    def process(self, svc_model, speaker_id, f_pitch_change, input_wav_path,
+                cluster_infer_ratio=0,
+                auto_predict_f0=False,
+                noice_scale=0.4,
+                f0_filter=False):
+
         import maad
         audio, sr = torchaudio.load(input_wav_path)
         audio = audio.cpu().numpy()[0]
         temp_wav = io.BytesIO()
         if self.last_chunk is None:
             input_wav_path.seek(0)
-            audio, sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path)
+
+            audio, sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path,
+                                        cluster_infer_ratio=cluster_infer_ratio,
+                                        auto_predict_f0=auto_predict_f0,
+                                        noice_scale=noice_scale,
+                                        f0_filter=f0_filter)
+
             audio = audio.cpu().numpy()
             self.last_chunk = audio[-self.pre_len:]
             self.last_o = audio
@@ -268,7 +310,13 @@ class RealTimeVC:
             audio = np.concatenate([self.last_chunk, audio])
             soundfile.write(temp_wav, audio, sr, format="wav")
             temp_wav.seek(0)
-            audio, sr = svc_model.infer(speaker_id, f_pitch_change, temp_wav)
+
+            audio, sr = svc_model.infer(speaker_id, f_pitch_change, temp_wav,
+                                        cluster_infer_ratio=cluster_infer_ratio,
+                                        auto_predict_f0=auto_predict_f0,
+                                        noice_scale=noice_scale,
+                                        f0_filter=f0_filter)
+
             audio = audio.cpu().numpy()
             ret = maad.util.crossfade(self.last_o, audio, self.pre_len)
             self.last_chunk = audio[-self.pre_len:]
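Note: a minimal usage sketch of the updated API. The checkpoint path "logs/44k/G_0.pth", the input file "raw/example.wav", and the speaker name "nen" are placeholders; F0_mean_pooling=True routes pitch extraction through the new torchcrepe-based path:

    from inference.infer_tool import Svc

    svc = Svc("logs/44k/G_0.pth", "configs/config.json")
    audio = svc.slice_inference("raw/example.wav", spk="nen", tran=0, slice_db=-40,
                                cluster_infer_ratio=0, auto_predict_f0=False,
                                noice_scale=0.4, F0_mean_pooling=True)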
inference_main.py CHANGED
@@ -23,17 +23,19 @@ def main():
     parser = argparse.ArgumentParser(description='sovits4 inference')

     # required settings
-    parser.add_argument('-m', '--model_path', type=str, default="/Volumes/Extend/下载/G_20800.pth", help='path to the model')
+    parser.add_argument('-m', '--model_path', type=str, default="logs/44k/G_0.pth", help='path to the model')
     parser.add_argument('-c', '--config_path', type=str, default="configs/config.json", help='path to the config file')
-    parser.add_argument('-n', '--clean_names', type=str, nargs='+', default=["君の知らない物語-src"], help='list of wav file names placed under the raw folder')
+    parser.add_argument('-cl', '--clip', type=float, default=0, help='force-slice the audio every N seconds; 0 (default) means automatic slicing')
+    parser.add_argument('-n', '--clean_names', type=str, nargs='+', default=["君の知らない物語-src.wav"], help='list of wav file names placed under the raw folder')
     parser.add_argument('-t', '--trans', type=int, nargs='+', default=[0], help='pitch shift in semitones, positive or negative')
-    parser.add_argument('-s', '--spk_list', type=str, nargs='+', default=['nyaru'], help='target speaker name(s) for synthesis')
+    parser.add_argument('-s', '--spk_list', type=str, nargs='+', default=['nen'], help='target speaker name(s) for synthesis')

     # optional settings
-    parser.add_argument('-a', '--auto_predict_f0', action='store_true', default=False,
-                        help='automatically predict pitch for speech conversion; do not enable it when converting singing or it will go badly off-key')
-    parser.add_argument('-cm', '--cluster_model_path', type=str, default="/Volumes/Extend/下载/so-vits-svc-4.0/logs/44k/kmeans_10000.pt", help='path to the cluster model; any value works if no cluster model was trained')
-    parser.add_argument('-cr', '--cluster_infer_ratio', type=float, default=1, help='ratio of the cluster scheme, range 0-1; use 0 if no cluster model was trained')
+    parser.add_argument('-a', '--auto_predict_f0', action='store_true', default=False, help='automatically predict pitch for speech conversion; do not enable it when converting singing or it will go badly off-key')
+    parser.add_argument('-cm', '--cluster_model_path', type=str, default="logs/44k/kmeans_10000.pt", help='path to the cluster model; any value works if no cluster model was trained')
+    parser.add_argument('-cr', '--cluster_infer_ratio', type=float, default=0, help='ratio of the cluster scheme, range 0-1; just leave it at 0 if no cluster model was trained')
+    parser.add_argument('-lg', '--linear_gradient', type=float, default=0, help='crossfade length in seconds between two audio slices; adjust this value if forced slicing makes the voice discontinuous, otherwise keep the default 0')
+    parser.add_argument('-fmp', '--f0_mean_pooling', type=bool, default=False, help='whether to apply mean filtering (pooling) to F0, which can improve some hoarse segments; note that enabling it slows down inference; off by default')

     # usually no need to change
     parser.add_argument('-sd', '--slice_db', type=int, default=-40, help='default -40; -30 for noisy audio, -50 for dry vocals where breaths should be kept')
@@ -41,6 +43,7 @@ def main():
     parser.add_argument('-ns', '--noice_scale', type=float, default=0.4, help='noise level; affects articulation and audio quality, somewhat arcane')
     parser.add_argument('-p', '--pad_seconds', type=float, default=0.5, help='seconds of silence padded around the inference audio; for unknown reasons artifacts appear at the start and end, and they disappear after padding a short silent segment')
     parser.add_argument('-wf', '--wav_format', type=str, default='flac', help='audio output format')
+    parser.add_argument('-lgr', '--linear_gradient_retain', type=float, default=0.75, help='after automatic slicing the head and tail of each slice are discarded; this sets the proportion of the crossfade length to keep, range (0, 1]')

     args = parser.parse_args()

@@ -55,6 +58,10 @@ def main():
     cluster_infer_ratio = args.cluster_infer_ratio
     noice_scale = args.noice_scale
     pad_seconds = args.pad_seconds
+    clip = args.clip
+    lg = args.linear_gradient
+    lgr = args.linear_gradient_retain
+    F0_mean_pooling = args.f0_mean_pooling

     infer_tool.fill_a_to_b(trans, clean_names)
     for clean_name, tran in zip(clean_names, trans):
@@ -65,35 +72,58 @@ def main():
         wav_path = Path(raw_audio_path).with_suffix('.wav')
         chunks = slicer.cut(wav_path, db_thresh=slice_db)
         audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
+        per_size = int(clip*audio_sr)
+        lg_size = int(lg*audio_sr)
+        lg_size_r = int(lg_size*lgr)
+        lg_size_c_l = (lg_size-lg_size_r)//2
+        lg_size_c_r = lg_size-lg_size_r-lg_size_c_l
+        lg = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0

         for spk in spk_list:
             audio = []
             for (slice_tag, data) in audio_data:
                 print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
-                # padd
-                pad_len = int(audio_sr * pad_seconds)
-                data = np.concatenate([np.zeros([pad_len]), data, np.zeros([pad_len])])
+
                 length = int(np.ceil(len(data) / audio_sr * svc_model.target_sample))
-                raw_path = io.BytesIO()
-                soundfile.write(raw_path, data, audio_sr, format="wav")
-                raw_path.seek(0)
                 if slice_tag:
                     print('jump empty segment')
                     _audio = np.zeros(length)
+                    audio.extend(list(infer_tool.pad_array(_audio, length)))
+                    continue
+                if per_size != 0:
+                    datas = infer_tool.split_list_by_n(data, per_size,lg_size)
                 else:
+                    datas = [data]
+                for k,dat in enumerate(datas):
+                    per_length = int(np.ceil(len(dat) / audio_sr * svc_model.target_sample)) if clip!=0 else length
+                    if clip!=0: print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
+                    # padd
+                    pad_len = int(audio_sr * pad_seconds)
+                    dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
+                    raw_path = io.BytesIO()
+                    soundfile.write(raw_path, dat, audio_sr, format="wav")
+                    raw_path.seek(0)
                     out_audio, out_sr = svc_model.infer(spk, tran, raw_path,
                                                         cluster_infer_ratio=cluster_infer_ratio,
                                                         auto_predict_f0=auto_predict_f0,
-                                                        noice_scale=noice_scale
+                                                        noice_scale=noice_scale,
+                                                        F0_mean_pooling = F0_mean_pooling
                                                         )
                     _audio = out_audio.cpu().numpy()
-
-                    pad_len = int(svc_model.target_sample * pad_seconds)
-                    _audio = _audio[pad_len:-pad_len]
-                    audio.extend(list(_audio))
+                    pad_len = int(svc_model.target_sample * pad_seconds)
+                    _audio = _audio[pad_len:-pad_len]
+                    _audio = infer_tool.pad_array(_audio, per_length)
+                    if lg_size!=0 and k!=0:
+                        lg1 = audio[-(lg_size_r+lg_size_c_r):-lg_size_c_r] if lgr != 1 else audio[-lg_size:]
+                        lg2 = _audio[lg_size_c_l:lg_size_c_l+lg_size_r] if lgr != 1 else _audio[0:lg_size]
+                        lg_pre = lg1*(1-lg)+lg2*lg
+                        audio = audio[0:-(lg_size_r+lg_size_c_r)] if lgr != 1 else audio[0:-lg_size]
+                        audio.extend(lg_pre)
+                    _audio = _audio[lg_size_c_l+lg_size_r:] if lgr != 1 else _audio[lg_size:]
+                    audio.extend(list(_audio))
             key = "auto" if auto_predict_f0 else f"{tran}key"
             cluster_name = "" if cluster_infer_ratio == 0 else f"_{cluster_infer_ratio}"
-            res_path = f'./results/old——{clean_name}_{key}_{spk}{cluster_name}.{wav_format}'
+            res_path = f'./results/{clean_name}_{key}_{spk}{cluster_name}.{wav_format}'
            soundfile.write(res_path, audio, svc_model.target_sample, format=wav_format)

 if __name__ == '__main__':
modules/crepe.py ADDED
@@ -0,0 +1,327 @@
+ from typing import Optional,Union
+ try:
+     from typing import Literal
+ except Exception as e:
+     from typing_extensions import Literal
+ import numpy as np
+ import torch
+ import torchcrepe
+ from torch import nn
+ from torch.nn import functional as F
+ import scipy
+
+ # from: https://github.com/fishaudio/fish-diffusion
+
+ def repeat_expand(
+     content: Union[torch.Tensor, np.ndarray], target_len: int, mode: str = "nearest"
+ ):
+     """Repeat content to target length.
+     This is a wrapper of torch.nn.functional.interpolate.
+
+     Args:
+         content (torch.Tensor): tensor
+         target_len (int): target length
+         mode (str, optional): interpolation mode. Defaults to "nearest".
+
+     Returns:
+         torch.Tensor: tensor
+     """
+
+     ndim = content.ndim
+
+     if content.ndim == 1:
+         content = content[None, None]
+     elif content.ndim == 2:
+         content = content[None]
+
+     assert content.ndim == 3
+
+     is_np = isinstance(content, np.ndarray)
+     if is_np:
+         content = torch.from_numpy(content)
+
+     results = torch.nn.functional.interpolate(content, size=target_len, mode=mode)
+
+     if is_np:
+         results = results.numpy()
+
+     if ndim == 1:
+         return results[0, 0]
+     elif ndim == 2:
+         return results[0]
+
+
+ class BasePitchExtractor:
+     def __init__(
+         self,
+         hop_length: int = 512,
+         f0_min: float = 50.0,
+         f0_max: float = 1100.0,
+         keep_zeros: bool = True,
+     ):
+         """Base pitch extractor.
+
+         Args:
+             hop_length (int, optional): Hop length. Defaults to 512.
+             f0_min (float, optional): Minimum f0. Defaults to 50.0.
+             f0_max (float, optional): Maximum f0. Defaults to 1100.0.
+             keep_zeros (bool, optional): Whether keep zeros in pitch. Defaults to True.
+         """
+
+         self.hop_length = hop_length
+         self.f0_min = f0_min
+         self.f0_max = f0_max
+         self.keep_zeros = keep_zeros
+
+     def __call__(self, x, sampling_rate=44100, pad_to=None):
+         raise NotImplementedError("BasePitchExtractor is not callable.")
+
+     def post_process(self, x, sampling_rate, f0, pad_to):
+         if isinstance(f0, np.ndarray):
+             f0 = torch.from_numpy(f0).float().to(x.device)
+
+         if pad_to is None:
+             return f0
+
+         f0 = repeat_expand(f0, pad_to)
+
+         if self.keep_zeros:
+             return f0
+
+         vuv_vector = torch.zeros_like(f0)
+         vuv_vector[f0 > 0.0] = 1.0
+         vuv_vector[f0 <= 0.0] = 0.0
+
+         # drop zero frequencies and interpolate linearly
+         nzindex = torch.nonzero(f0).squeeze()
+         f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy()
+         time_org = self.hop_length / sampling_rate * nzindex.cpu().numpy()
+         time_frame = np.arange(pad_to) * self.hop_length / sampling_rate
+
+         if f0.shape[0] <= 0:
+             return torch.zeros(pad_to, dtype=torch.float, device=x.device),torch.zeros(pad_to, dtype=torch.float, device=x.device)
+
+         if f0.shape[0] == 1:
+             return torch.ones(pad_to, dtype=torch.float, device=x.device) * f0[0],torch.ones(pad_to, dtype=torch.float, device=x.device)
+
+         # could probably be rewritten with torch?
+         f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
+         vuv_vector = vuv_vector.cpu().numpy()
+         vuv_vector = np.ceil(scipy.ndimage.zoom(vuv_vector,pad_to/len(vuv_vector),order = 0))
+
+         return f0,vuv_vector
+
+
+ class MaskedAvgPool1d(nn.Module):
+     def __init__(
+         self, kernel_size: int, stride: Optional[int] = None, padding: Optional[int] = 0
+     ):
+         """An implementation of mean pooling that supports masked values.
+
+         Args:
+             kernel_size (int): The size of the median pooling window.
+             stride (int, optional): The stride of the median pooling window. Defaults to None.
+             padding (int, optional): The padding of the median pooling window. Defaults to 0.
+         """
+
+         super(MaskedAvgPool1d, self).__init__()
+         self.kernel_size = kernel_size
+         self.stride = stride or kernel_size
+         self.padding = padding
+
+     def forward(self, x, mask=None):
+         ndim = x.dim()
+         if ndim == 2:
+             x = x.unsqueeze(1)
+
+         assert (
+             x.dim() == 3
+         ), "Input tensor must have 2 or 3 dimensions (batch_size, channels, width)"
+
+         # Apply the mask by setting masked elements to zero, or make NaNs zero
+         if mask is None:
+             mask = ~torch.isnan(x)
+
+         # Ensure mask has the same shape as the input tensor
+         assert x.shape == mask.shape, "Input tensor and mask must have the same shape"
+
+         masked_x = torch.where(mask, x, torch.zeros_like(x))
+         # Create a ones kernel with the same number of channels as the input tensor
+         ones_kernel = torch.ones(x.size(1), 1, self.kernel_size, device=x.device)
+
+         # Perform sum pooling
+         sum_pooled = nn.functional.conv1d(
+             masked_x,
+             ones_kernel,
+             stride=self.stride,
+             padding=self.padding,
+             groups=x.size(1),
+         )
+
+         # Count the non-masked (valid) elements in each pooling window
+         valid_count = nn.functional.conv1d(
+             mask.float(),
+             ones_kernel,
+             stride=self.stride,
+             padding=self.padding,
+             groups=x.size(1),
+         )
+         valid_count = valid_count.clamp(min=1)  # Avoid division by zero
+
+         # Perform masked average pooling
+         avg_pooled = sum_pooled / valid_count
+
+         # Fill zero values with NaNs
+         avg_pooled[avg_pooled == 0] = float("nan")
+
+         if ndim == 2:
+             return avg_pooled.squeeze(1)
+
+         return avg_pooled
+
+
+ class MaskedMedianPool1d(nn.Module):
+     def __init__(
+         self, kernel_size: int, stride: Optional[int] = None, padding: Optional[int] = 0
+     ):
+         """An implementation of median pooling that supports masked values.
+
+         This implementation is inspired by the median pooling implementation in
+         https://gist.github.com/rwightman/f2d3849281624be7c0f11c85c87c1598
+
+         Args:
+             kernel_size (int): The size of the median pooling window.
+             stride (int, optional): The stride of the median pooling window. Defaults to None.
+             padding (int, optional): The padding of the median pooling window. Defaults to 0.
+         """
+
+         super(MaskedMedianPool1d, self).__init__()
+         self.kernel_size = kernel_size
+         self.stride = stride or kernel_size
+         self.padding = padding
+
+     def forward(self, x, mask=None):
+         ndim = x.dim()
+         if ndim == 2:
+             x = x.unsqueeze(1)
+
+         assert (
+             x.dim() == 3
+         ), "Input tensor must have 2 or 3 dimensions (batch_size, channels, width)"
+
+         if mask is None:
+             mask = ~torch.isnan(x)
+
+         assert x.shape == mask.shape, "Input tensor and mask must have the same shape"
+
+         masked_x = torch.where(mask, x, torch.zeros_like(x))
+
+         x = F.pad(masked_x, (self.padding, self.padding), mode="reflect")
+         mask = F.pad(
+             mask.float(), (self.padding, self.padding), mode="constant", value=0
+         )
+
+         x = x.unfold(2, self.kernel_size, self.stride)
+         mask = mask.unfold(2, self.kernel_size, self.stride)
+
+         x = x.contiguous().view(x.size()[:3] + (-1,))
+         mask = mask.contiguous().view(mask.size()[:3] + (-1,)).to(x.device)
+
+         # Combine the mask with the input tensor
+         #x_masked = torch.where(mask.bool(), x, torch.fill_(torch.zeros_like(x),float("inf")))
+         x_masked = torch.where(mask.bool(), x, torch.FloatTensor([float("inf")]).to(x.device))
+
+         # Sort the masked tensor along the last dimension
+         x_sorted, _ = torch.sort(x_masked, dim=-1)
+
+         # Compute the count of non-masked (valid) values
+         valid_count = mask.sum(dim=-1)
+
+         # Calculate the index of the median value for each pooling window
+         median_idx = (torch.div((valid_count - 1), 2, rounding_mode='trunc')).clamp(min=0)
+
+         # Gather the median values using the calculated indices
+         median_pooled = x_sorted.gather(-1, median_idx.unsqueeze(-1).long()).squeeze(-1)
+
+         # Fill infinite values with NaNs
+         median_pooled[torch.isinf(median_pooled)] = float("nan")
+
+         if ndim == 2:
+             return median_pooled.squeeze(1)
+
+         return median_pooled
+
+
+ class CrepePitchExtractor(BasePitchExtractor):
+     def __init__(
+         self,
+         hop_length: int = 512,
+         f0_min: float = 50.0,
+         f0_max: float = 1100.0,
+         threshold: float = 0.05,
+         keep_zeros: bool = False,
+         device = None,
+         model: Literal["full", "tiny"] = "full",
+         use_fast_filters: bool = True,
+     ):
+         super().__init__(hop_length, f0_min, f0_max, keep_zeros)
+
+         self.threshold = threshold
+         self.model = model
+         self.use_fast_filters = use_fast_filters
+         self.hop_length = hop_length
+         if device is None:
+             self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         else:
+             self.dev = torch.device(device)
+         if self.use_fast_filters:
+             self.median_filter = MaskedMedianPool1d(3, 1, 1).to(device)
+             self.mean_filter = MaskedAvgPool1d(3, 1, 1).to(device)
+
+     def __call__(self, x, sampling_rate=44100, pad_to=None):
+         """Extract pitch using crepe.
+
+         Args:
+             x (torch.Tensor): Audio signal, shape (1, T).
+             sampling_rate (int, optional): Sampling rate. Defaults to 44100.
+             pad_to (int, optional): Pad to length. Defaults to None.
+
+         Returns:
+             torch.Tensor: Pitch, shape (T // hop_length,).
+         """
+
+         assert x.ndim == 2, f"Expected 2D tensor, got {x.ndim}D tensor."
+         assert x.shape[0] == 1, f"Expected 1 channel, got {x.shape[0]} channels."
+
+         x = x.to(self.dev)
+         f0, pd = torchcrepe.predict(
+             x,
+             sampling_rate,
+             self.hop_length,
+             self.f0_min,
+             self.f0_max,
+             pad=True,
+             model=self.model,
+             batch_size=1024,
+             device=x.device,
+             return_periodicity=True,
+         )
+
+         # Filter, remove silence, set uv threshold, refer to the original warehouse readme
+         if self.use_fast_filters:
+             pd = self.median_filter(pd)
+         else:
+             pd = torchcrepe.filter.median(pd, 3)
+
+         pd = torchcrepe.threshold.Silence(-60.0)(pd, x, sampling_rate, 512)
+         f0 = torchcrepe.threshold.At(self.threshold)(f0, pd)
+
+         if self.use_fast_filters:
+             f0 = self.mean_filter(f0)
+         else:
+             f0 = torchcrepe.filter.mean(f0, 3)
+
+         f0 = torch.where(torch.isnan(f0), torch.full_like(f0, 0), f0)[0]
+
+         return self.post_process(x, sampling_rate, f0, pad_to)
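Note: in this commit CrepePitchExtractor is presumably consumed through utils.compute_f0_uv_torchcrepe (utils.py is not part of this diff). A standalone sketch of the extractor itself, with a dummy one-second input and "cpu" chosen only for illustration:

    import torch
    from modules.crepe import CrepePitchExtractor

    extractor = CrepePitchExtractor(hop_length=512, device="cpu")
    wav = torch.zeros(1, 44100)                    # dummy mono audio, shape (1, T)
    n_frames = wav.shape[-1] // 512                # one frame per hop
    f0, uv = extractor(wav, sampling_rate=44100, pad_to=n_frames)  # per-frame pitch and voiced/unvoiced flags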
onnx/model_onnx.py DELETED
@@ -1,328 +0,0 @@
- import copy
- import math
- import torch
- from torch import nn
- from torch.nn import functional as F
-
- import modules.attentions as attentions
- import modules.commons as commons
- import modules.modules as modules
-
- from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
- from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
- from modules.commons import init_weights, get_padding
- from vdecoder.hifigan.models import Generator
- from utils import f0_to_coarse
-
- class ResidualCouplingBlock(nn.Module):
-     def __init__(self,
-                  channels,
-                  hidden_channels,
-                  kernel_size,
-                  dilation_rate,
-                  n_layers,
-                  n_flows=4,
-                  gin_channels=0):
-         super().__init__()
-         self.channels = channels
-         self.hidden_channels = hidden_channels
-         self.kernel_size = kernel_size
-         self.dilation_rate = dilation_rate
-         self.n_layers = n_layers
-         self.n_flows = n_flows
-         self.gin_channels = gin_channels
-
-         self.flows = nn.ModuleList()
-         for i in range(n_flows):
-             self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
-             self.flows.append(modules.Flip())
-
-     def forward(self, x, x_mask, g=None, reverse=False):
-         if not reverse:
-             for flow in self.flows:
-                 x, _ = flow(x, x_mask, g=g, reverse=reverse)
-         else:
-             for flow in reversed(self.flows):
-                 x = flow(x, x_mask, g=g, reverse=reverse)
-         return x
-
-
- class Encoder(nn.Module):
-     def __init__(self,
-                  in_channels,
-                  out_channels,
-                  hidden_channels,
-                  kernel_size,
-                  dilation_rate,
-                  n_layers,
-                  gin_channels=0):
-         super().__init__()
-         self.in_channels = in_channels
-         self.out_channels = out_channels
-         self.hidden_channels = hidden_channels
-         self.kernel_size = kernel_size
-         self.dilation_rate = dilation_rate
-         self.n_layers = n_layers
-         self.gin_channels = gin_channels
-
-         self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
-         self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
-         self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-
-     def forward(self, x, x_lengths, g=None):
-         # print(x.shape,x_lengths.shape)
-         x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
-         x = self.pre(x) * x_mask
-         x = self.enc(x, x_mask, g=g)
-         stats = self.proj(x) * x_mask
-         m, logs = torch.split(stats, self.out_channels, dim=1)
-         z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
-         return z, m, logs, x_mask
-
-
- class TextEncoder(nn.Module):
-     def __init__(self,
-                  in_channels,
-                  out_channels,
-                  hidden_channels,
-                  kernel_size,
-                  dilation_rate,
-                  n_layers,
-                  gin_channels=0,
-                  filter_channels=None,
-                  n_heads=None,
-                  p_dropout=None):
-         super().__init__()
-         self.in_channels = in_channels
-         self.out_channels = out_channels
-         self.hidden_channels = hidden_channels
-         self.kernel_size = kernel_size
-         self.dilation_rate = dilation_rate
-         self.n_layers = n_layers
-         self.gin_channels = gin_channels
-         self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
-         self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-         self.f0_emb = nn.Embedding(256, hidden_channels)
-
-         self.enc_ = attentions.Encoder(
-             hidden_channels,
-             filter_channels,
-             n_heads,
-             n_layers,
-             kernel_size,
-             p_dropout)
-
-     def forward(self, x, x_lengths, f0=None):
-         x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
-         x = self.pre(x) * x_mask
-         x = x + self.f0_emb(f0.long()).transpose(1,2)
-         x = self.enc_(x * x_mask, x_mask)
-         stats = self.proj(x) * x_mask
-         m, logs = torch.split(stats, self.out_channels, dim=1)
-         z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
-
-         return z, m, logs, x_mask
-
-
-
- class DiscriminatorP(torch.nn.Module):
-     def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
-         super(DiscriminatorP, self).__init__()
-         self.period = period
-         self.use_spectral_norm = use_spectral_norm
-         norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-         self.convs = nn.ModuleList([
-             norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
-             norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
-             norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
-             norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
-             norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
-         ])
-         self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
-
-     def forward(self, x):
-         fmap = []
-
-         # 1d to 2d
-         b, c, t = x.shape
-         if t % self.period != 0: # pad first
-             n_pad = self.period - (t % self.period)
-             x = F.pad(x, (0, n_pad), "reflect")
-             t = t + n_pad
-         x = x.view(b, c, t // self.period, self.period)
-
-         for l in self.convs:
-             x = l(x)
-             x = F.leaky_relu(x, modules.LRELU_SLOPE)
-             fmap.append(x)
-         x = self.conv_post(x)
-         fmap.append(x)
-         x = torch.flatten(x, 1, -1)
-
-         return x, fmap
-
-
- class DiscriminatorS(torch.nn.Module):
-     def __init__(self, use_spectral_norm=False):
-         super(DiscriminatorS, self).__init__()
-         norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-         self.convs = nn.ModuleList([
-             norm_f(Conv1d(1, 16, 15, 1, padding=7)),
-             norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
-             norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
-             norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
-             norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
-             norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
-         ])
-         self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
-
-     def forward(self, x):
-         fmap = []
-
-         for l in self.convs:
-             x = l(x)
-             x = F.leaky_relu(x, modules.LRELU_SLOPE)
-             fmap.append(x)
-         x = self.conv_post(x)
-         fmap.append(x)
-         x = torch.flatten(x, 1, -1)
-
-         return x, fmap
-
-
- class MultiPeriodDiscriminator(torch.nn.Module):
-     def __init__(self, use_spectral_norm=False):
-         super(MultiPeriodDiscriminator, self).__init__()
-         periods = [2,3,5,7,11]
-
-         discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
-         discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
-         self.discriminators = nn.ModuleList(discs)
-
-     def forward(self, y, y_hat):
-         y_d_rs = []
-         y_d_gs = []
-         fmap_rs = []
-         fmap_gs = []
-         for i, d in enumerate(self.discriminators):
-             y_d_r, fmap_r = d(y)
-             y_d_g, fmap_g = d(y_hat)
-             y_d_rs.append(y_d_r)
-             y_d_gs.append(y_d_g)
-             fmap_rs.append(fmap_r)
-             fmap_gs.append(fmap_g)
-
-         return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-
-
- class SpeakerEncoder(torch.nn.Module):
-     def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256):
-         super(SpeakerEncoder, self).__init__()
-         self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True)
-         self.linear = nn.Linear(model_hidden_size, model_embedding_size)
-         self.relu = nn.ReLU()
-
-     def forward(self, mels):
-         self.lstm.flatten_parameters()
-         _, (hidden, _) = self.lstm(mels)
-         embeds_raw = self.relu(self.linear(hidden[-1]))
-         return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
-
-     def compute_partial_slices(self, total_frames, partial_frames, partial_hop):
-         mel_slices = []
-         for i in range(0, total_frames-partial_frames, partial_hop):
-             mel_range = torch.arange(i, i+partial_frames)
-             mel_slices.append(mel_range)
-
-         return mel_slices
-
-     def embed_utterance(self, mel, partial_frames=128, partial_hop=64):
-         mel_len = mel.size(1)
-         last_mel = mel[:,-partial_frames:]
-
-         if mel_len > partial_frames:
-             mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop)
-             mels = list(mel[:,s] for s in mel_slices)
-             mels.append(last_mel)
-             mels = torch.stack(tuple(mels), 0).squeeze(1)
-
-             with torch.no_grad():
-                 partial_embeds = self(mels)
-             embed = torch.mean(partial_embeds, axis=0).unsqueeze(0)
-             #embed = embed / torch.linalg.norm(embed, 2)
-         else:
-             with torch.no_grad():
-                 embed = self(last_mel)
-
-         return embed
-
-
- class SynthesizerTrn(nn.Module):
-     """
-     Synthesizer for Training
-     """
-
-     def __init__(self,
-                  spec_channels,
-                  segment_size,
-                  inter_channels,
-                  hidden_channels,
-                  filter_channels,
-                  n_heads,
-                  n_layers,
-                  kernel_size,
-                  p_dropout,
-                  resblock,
-                  resblock_kernel_sizes,
-                  resblock_dilation_sizes,
-                  upsample_rates,
-                  upsample_initial_channel,
-                  upsample_kernel_sizes,
-                  gin_channels,
-                  ssl_dim,
-                  n_speakers,
-                  **kwargs):
-
-         super().__init__()
-         self.spec_channels = spec_channels
-         self.inter_channels = inter_channels
-         self.hidden_channels = hidden_channels
-         self.filter_channels = filter_channels
-         self.n_heads = n_heads
-         self.n_layers = n_layers
-         self.kernel_size = kernel_size
-         self.p_dropout = p_dropout
-         self.resblock = resblock
-         self.resblock_kernel_sizes = resblock_kernel_sizes
-         self.resblock_dilation_sizes = resblock_dilation_sizes
-         self.upsample_rates = upsample_rates
-         self.upsample_initial_channel = upsample_initial_channel
-         self.upsample_kernel_sizes = upsample_kernel_sizes
-         self.segment_size = segment_size
-         self.gin_channels = gin_channels
-         self.ssl_dim = ssl_dim
-         self.emb_g = nn.Embedding(n_speakers, gin_channels)
-
-         self.enc_p_ = TextEncoder(ssl_dim, inter_channels, hidden_channels, 5, 1, 16,0, filter_channels, n_heads, p_dropout)
-         hps = {
-             "sampling_rate": 32000,
-             "inter_channels": 192,
-             "resblock": "1",
-             "resblock_kernel_sizes": [3, 7, 11],
-             "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
-             "upsample_rates": [10, 8, 2, 2],
-             "upsample_initial_channel": 512,
-             "upsample_kernel_sizes": [16, 16, 4, 4],
-             "gin_channels": 256,
-         }
-         self.dec = Generator(h=hps)
-         self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
-         self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
-
-     def forward(self, c, c_lengths, f0, g=None):
-         g = self.emb_g(g.unsqueeze(0)).transpose(1,2)
-         z_p, m_p, logs_p, c_mask = self.enc_p_(c.transpose(1,2), c_lengths, f0=f0_to_coarse(f0))
-         z = self.flow(z_p, c_mask, g=g, reverse=True)
-         o = self.dec(z * c_mask, g=g, f0=f0.float())
-         return o
-
onnx/model_onnx_48k.py DELETED
@@ -1,328 +0,0 @@
- import copy
- import math
- import torch
- from torch import nn
- from torch.nn import functional as F
-
- import modules.attentions as attentions
- import modules.commons as commons
- import modules.modules as modules
-
- from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
- from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
- from modules.commons import init_weights, get_padding
- from vdecoder.hifigan.models import Generator
- from utils import f0_to_coarse
-
- class ResidualCouplingBlock(nn.Module):
-     def __init__(self,
-                  channels,
-                  hidden_channels,
-                  kernel_size,
-                  dilation_rate,
-                  n_layers,
-                  n_flows=4,
-                  gin_channels=0):
-         super().__init__()
-         self.channels = channels
-         self.hidden_channels = hidden_channels
-         self.kernel_size = kernel_size
-         self.dilation_rate = dilation_rate
-         self.n_layers = n_layers
-         self.n_flows = n_flows
-         self.gin_channels = gin_channels
-
-         self.flows = nn.ModuleList()
-         for i in range(n_flows):
-             self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
-             self.flows.append(modules.Flip())
-
-     def forward(self, x, x_mask, g=None, reverse=False):
-         if not reverse:
-             for flow in self.flows:
-                 x, _ = flow(x, x_mask, g=g, reverse=reverse)
-         else:
-             for flow in reversed(self.flows):
-                 x = flow(x, x_mask, g=g, reverse=reverse)
-         return x
-
-
- class Encoder(nn.Module):
-     def __init__(self,
-                  in_channels,
-                  out_channels,
-                  hidden_channels,
-                  kernel_size,
-                  dilation_rate,
-                  n_layers,
-                  gin_channels=0):
-         super().__init__()
-         self.in_channels = in_channels
-         self.out_channels = out_channels
-         self.hidden_channels = hidden_channels
-         self.kernel_size = kernel_size
-         self.dilation_rate = dilation_rate
-         self.n_layers = n_layers
-         self.gin_channels = gin_channels
-
-         self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
-         self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
-         self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-
-     def forward(self, x, x_lengths, g=None):
-         # print(x.shape,x_lengths.shape)
-         x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
-         x = self.pre(x) * x_mask
-         x = self.enc(x, x_mask, g=g)
-         stats = self.proj(x) * x_mask
-         m, logs = torch.split(stats, self.out_channels, dim=1)
-         z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
-         return z, m, logs, x_mask
-
-
- class TextEncoder(nn.Module):
-     def __init__(self,
-                  in_channels,
-                  out_channels,
-                  hidden_channels,
-                  kernel_size,
-                  dilation_rate,
-                  n_layers,
-                  gin_channels=0,
-                  filter_channels=None,
-                  n_heads=None,
-                  p_dropout=None):
-         super().__init__()
-         self.in_channels = in_channels
-         self.out_channels = out_channels
-         self.hidden_channels = hidden_channels
-         self.kernel_size = kernel_size
-         self.dilation_rate = dilation_rate
-         self.n_layers = n_layers
-         self.gin_channels = gin_channels
-         self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
-         self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-         self.f0_emb = nn.Embedding(256, hidden_channels)
-
-         self.enc_ = attentions.Encoder(
-             hidden_channels,
-             filter_channels,
-             n_heads,
-             n_layers,
-             kernel_size,
-             p_dropout)
-
-     def forward(self, x, x_lengths, f0=None):
-         x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
-         x = self.pre(x) * x_mask
-         x = x + self.f0_emb(f0.long()).transpose(1,2)
-         x = self.enc_(x * x_mask, x_mask)
-         stats = self.proj(x) * x_mask
-         m, logs = torch.split(stats, self.out_channels, dim=1)
-         z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
-
-         return z, m, logs, x_mask
-
-
-
- class DiscriminatorP(torch.nn.Module):
-     def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
-         super(DiscriminatorP, self).__init__()
-         self.period = period
-         self.use_spectral_norm = use_spectral_norm
-         norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-         self.convs = nn.ModuleList([
-             norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
-             norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
-             norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
-             norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
-             norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
-         ])
-         self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
-
-     def forward(self, x):
-         fmap = []
-
-         # 1d to 2d
-         b, c, t = x.shape
-         if t % self.period != 0: # pad first
-             n_pad = self.period - (t % self.period)
-             x = F.pad(x, (0, n_pad), "reflect")
-             t = t + n_pad
-         x = x.view(b, c, t // self.period, self.period)
-
-         for l in self.convs:
-             x = l(x)
-             x = F.leaky_relu(x, modules.LRELU_SLOPE)
-             fmap.append(x)
-         x = self.conv_post(x)
-         fmap.append(x)
-         x = torch.flatten(x, 1, -1)
-
-         return x, fmap
-
-
- class DiscriminatorS(torch.nn.Module):
-     def __init__(self, use_spectral_norm=False):
-         super(DiscriminatorS, self).__init__()
-         norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-         self.convs = nn.ModuleList([
-             norm_f(Conv1d(1, 16, 15, 1, padding=7)),
-             norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
-             norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
-             norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
-             norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
-             norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
-         ])
-         self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
-
-     def forward(self, x):
-         fmap = []
-
-         for l in self.convs:
-             x = l(x)
-             x = F.leaky_relu(x, modules.LRELU_SLOPE)
-             fmap.append(x)
-         x = self.conv_post(x)
-         fmap.append(x)
-         x = torch.flatten(x, 1, -1)
-
-         return x, fmap
-
-
- class MultiPeriodDiscriminator(torch.nn.Module):
-     def __init__(self, use_spectral_norm=False):
-         super(MultiPeriodDiscriminator, self).__init__()
-         periods = [2,3,5,7,11]
-
-         discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
-         discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
-         self.discriminators = nn.ModuleList(discs)
-
-     def forward(self, y, y_hat):
-         y_d_rs = []
-         y_d_gs = []
-         fmap_rs = []
-         fmap_gs = []
-         for i, d in enumerate(self.discriminators):
-             y_d_r, fmap_r = d(y)
-             y_d_g, fmap_g = d(y_hat)
-             y_d_rs.append(y_d_r)
-             y_d_gs.append(y_d_g)
-             fmap_rs.append(fmap_r)
-             fmap_gs.append(fmap_g)
-
-         return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-
-
- class SpeakerEncoder(torch.nn.Module):
-     def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256):
-         super(SpeakerEncoder, self).__init__()
-         self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True)
-         self.linear = nn.Linear(model_hidden_size, model_embedding_size)
-         self.relu = nn.ReLU()
-
-     def forward(self, mels):
-         self.lstm.flatten_parameters()
-         _, (hidden, _) = self.lstm(mels)
-         embeds_raw = self.relu(self.linear(hidden[-1]))
-         return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
-
-     def compute_partial_slices(self, total_frames, partial_frames, partial_hop):
-         mel_slices = []
-         for i in range(0, total_frames-partial_frames, partial_hop):
-             mel_range = torch.arange(i, i+partial_frames)
-             mel_slices.append(mel_range)
-
-         return mel_slices
-
-     def embed_utterance(self, mel, partial_frames=128, partial_hop=64):
-         mel_len = mel.size(1)
-         last_mel = mel[:,-partial_frames:]
242
-
243
- if mel_len > partial_frames:
244
- mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop)
245
- mels = list(mel[:,s] for s in mel_slices)
246
- mels.append(last_mel)
247
- mels = torch.stack(tuple(mels), 0).squeeze(1)
248
-
249
- with torch.no_grad():
250
- partial_embeds = self(mels)
251
- embed = torch.mean(partial_embeds, axis=0).unsqueeze(0)
252
- #embed = embed / torch.linalg.norm(embed, 2)
253
- else:
254
- with torch.no_grad():
255
- embed = self(last_mel)
256
-
257
- return embed
258
-
259
-
260
- class SynthesizerTrn(nn.Module):
261
- """
262
- Synthesizer for Training
263
- """
264
-
265
- def __init__(self,
266
- spec_channels,
267
- segment_size,
268
- inter_channels,
269
- hidden_channels,
270
- filter_channels,
271
- n_heads,
272
- n_layers,
273
- kernel_size,
274
- p_dropout,
275
- resblock,
276
- resblock_kernel_sizes,
277
- resblock_dilation_sizes,
278
- upsample_rates,
279
- upsample_initial_channel,
280
- upsample_kernel_sizes,
281
- gin_channels,
282
- ssl_dim,
283
- n_speakers,
284
- **kwargs):
285
-
286
- super().__init__()
287
- self.spec_channels = spec_channels
288
- self.inter_channels = inter_channels
289
- self.hidden_channels = hidden_channels
290
- self.filter_channels = filter_channels
291
- self.n_heads = n_heads
292
- self.n_layers = n_layers
293
- self.kernel_size = kernel_size
294
- self.p_dropout = p_dropout
295
- self.resblock = resblock
296
- self.resblock_kernel_sizes = resblock_kernel_sizes
297
- self.resblock_dilation_sizes = resblock_dilation_sizes
298
- self.upsample_rates = upsample_rates
299
- self.upsample_initial_channel = upsample_initial_channel
300
- self.upsample_kernel_sizes = upsample_kernel_sizes
301
- self.segment_size = segment_size
302
- self.gin_channels = gin_channels
303
- self.ssl_dim = ssl_dim
304
- self.emb_g = nn.Embedding(n_speakers, gin_channels)
305
-
306
- self.enc_p_ = TextEncoder(ssl_dim, inter_channels, hidden_channels, 5, 1, 16,0, filter_channels, n_heads, p_dropout)
307
- hps = {
308
- "sampling_rate": 48000,
309
- "inter_channels": 192,
310
- "resblock": "1",
311
- "resblock_kernel_sizes": [3, 7, 11],
312
- "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
313
- "upsample_rates": [10, 8, 2, 2],
314
- "upsample_initial_channel": 512,
315
- "upsample_kernel_sizes": [16, 16, 4, 4],
316
- "gin_channels": 256,
317
- }
318
- self.dec = Generator(h=hps)
319
- self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
320
- self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
321
-
322
- def forward(self, c, c_lengths, f0, g=None):
323
- g = self.emb_g(g.unsqueeze(0)).transpose(1,2)
324
- z_p, m_p, logs_p, c_mask = self.enc_p_(c.transpose(1,2), c_lengths, f0=f0_to_coarse(f0))
325
- z = self.flow(z_p, c_mask, g=g, reverse=True)
326
- o = self.dec(z * c_mask, g=g, f0=f0.float())
327
- return o
328
-
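For reference, the `SynthesizerTrn.forward` removed above was the inference entry point of the old ONNX model definition: it takes hubert content features, their lengths, a frame-level f0 curve and a speaker id, and returns a waveform. A minimal sketch of the dummy inputs it expects; the shapes and the 256-dim hubert feature size are taken from the exporter scripts deleted below, and the model construction itself is elided:

```python
import torch

# Dummy inputs matching the deleted forward(c, c_lengths, f0, g) signature
# (shapes mirror the dummy tensors used by onnx_export.py below).
c = torch.rand(1, 50, 256)          # hubert content features [batch, frames, ssl_dim]
c_lengths = torch.LongTensor([50])  # valid frame count per batch item
f0 = torch.rand(1, 50) * 200.0      # frame-level pitch in Hz
sid = torch.LongTensor([0])         # speaker id

# model = SynthesizerTrn(...)       # built from a checkpoint config, as in onnx_export.py
# with torch.no_grad():
#     audio = model(c, c_lengths, f0, g=sid)   # waveform, shape [1, 1, samples]
```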
onnx/onnx_export.py DELETED
@@ -1,73 +0,0 @@
- import argparse
- import time
- import numpy as np
- import onnx
- from onnxsim import simplify
- import onnxruntime as ort
- import onnxoptimizer
- import torch
- from model_onnx import SynthesizerTrn
- import utils
- from hubert import hubert_model_onnx
-
- def main(HubertExport,NetExport):
-
-     path = "NyaruTaffy"
-
-     if(HubertExport):
-         device = torch.device("cuda")
-         hubert_soft = utils.get_hubert_model()
-         test_input = torch.rand(1, 1, 16000)
-         input_names = ["source"]
-         output_names = ["embed"]
-         torch.onnx.export(hubert_soft.to(device),
-                           test_input.to(device),
-                           "hubert3.0.onnx",
-                           dynamic_axes={
-                               "source": {
-                                   2: "sample_length"
-                               }
-                           },
-                           verbose=False,
-                           opset_version=13,
-                           input_names=input_names,
-                           output_names=output_names)
-     if(NetExport):
-         device = torch.device("cuda")
-         hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
-         SVCVITS = SynthesizerTrn(
-             hps.data.filter_length // 2 + 1,
-             hps.train.segment_size // hps.data.hop_length,
-             **hps.model)
-         _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
-         _ = SVCVITS.eval().to(device)
-         for i in SVCVITS.parameters():
-             i.requires_grad = False
-         test_hidden_unit = torch.rand(1, 50, 256)
-         test_lengths = torch.LongTensor([50])
-         test_pitch = torch.rand(1, 50)
-         test_sid = torch.LongTensor([0])
-         input_names = ["hidden_unit", "lengths", "pitch", "sid"]
-         output_names = ["audio", ]
-         SVCVITS.eval()
-         torch.onnx.export(SVCVITS,
-                           (
-                               test_hidden_unit.to(device),
-                               test_lengths.to(device),
-                               test_pitch.to(device),
-                               test_sid.to(device)
-                           ),
-                           f"checkpoints/{path}/model.onnx",
-                           dynamic_axes={
-                               "hidden_unit": [0, 1],
-                               "pitch": [1]
-                           },
-                           do_constant_folding=False,
-                           opset_version=16,
-                           verbose=False,
-                           input_names=input_names,
-                           output_names=output_names)
-
-
- if __name__ == '__main__':
-     main(False,True)
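The deleted exporter imports `onnx`, `onnxsim`, `onnxruntime` and `onnxoptimizer` but never calls them; the usual follow-up to an export like the one above is to check and simplify the saved graph. A minimal sketch, assuming the same output path the script writes to:

```python
import onnx
from onnxsim import simplify

# Path assumed to match the exporter above (checkpoints/<name>/model.onnx).
model = onnx.load("checkpoints/NyaruTaffy/model.onnx")
onnx.checker.check_model(model)   # structural sanity check of the exported graph
model_simp, ok = simplify(model)  # fold constants and fuse ops where possible
assert ok, "simplified model failed onnxsim's validation"
onnx.save(model_simp, "checkpoints/NyaruTaffy/model_sim.onnx")
```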
onnx/onnx_export_48k.py DELETED
@@ -1,73 +0,0 @@
- import argparse
- import time
- import numpy as np
- import onnx
- from onnxsim import simplify
- import onnxruntime as ort
- import onnxoptimizer
- import torch
- from model_onnx_48k import SynthesizerTrn
- import utils
- from hubert import hubert_model_onnx
-
- def main(HubertExport,NetExport):
-
-     path = "NyaruTaffy"
-
-     if(HubertExport):
-         device = torch.device("cuda")
-         hubert_soft = hubert_model_onnx.hubert_soft("hubert/model.pt")
-         test_input = torch.rand(1, 1, 16000)
-         input_names = ["source"]
-         output_names = ["embed"]
-         torch.onnx.export(hubert_soft.to(device),
-                           test_input.to(device),
-                           "hubert3.0.onnx",
-                           dynamic_axes={
-                               "source": {
-                                   2: "sample_length"
-                               }
-                           },
-                           verbose=False,
-                           opset_version=13,
-                           input_names=input_names,
-                           output_names=output_names)
-     if(NetExport):
-         device = torch.device("cuda")
-         hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
-         SVCVITS = SynthesizerTrn(
-             hps.data.filter_length // 2 + 1,
-             hps.train.segment_size // hps.data.hop_length,
-             **hps.model)
-         _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
-         _ = SVCVITS.eval().to(device)
-         for i in SVCVITS.parameters():
-             i.requires_grad = False
-         test_hidden_unit = torch.rand(1, 50, 256)
-         test_lengths = torch.LongTensor([50])
-         test_pitch = torch.rand(1, 50)
-         test_sid = torch.LongTensor([0])
-         input_names = ["hidden_unit", "lengths", "pitch", "sid"]
-         output_names = ["audio", ]
-         SVCVITS.eval()
-         torch.onnx.export(SVCVITS,
-                           (
-                               test_hidden_unit.to(device),
-                               test_lengths.to(device),
-                               test_pitch.to(device),
-                               test_sid.to(device)
-                           ),
-                           f"checkpoints/{path}/model.onnx",
-                           dynamic_axes={
-                               "hidden_unit": [0, 1],
-                               "pitch": [1]
-                           },
-                           do_constant_folding=False,
-                           opset_version=16,
-                           verbose=False,
-                           input_names=input_names,
-                           output_names=output_names)
-
-
- if __name__ == '__main__':
-     main(False,True)
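This 48 kHz variant differs from the exporter above only in importing `model_onnx_48k` and loading the soft-hubert checkpoint directly. Once either script has produced `model.onnx`, the graph can be driven from `onnxruntime` using the input and output names declared at export time; a minimal sketch with the same dummy shapes and an assumed path:

```python
import numpy as np
import onnxruntime as ort

# Path, shapes and dtypes mirror the dummy export tensors above.
sess = ort.InferenceSession("checkpoints/NyaruTaffy/model.onnx")
audio, = sess.run(
    ["audio"],
    {
        "hidden_unit": np.random.rand(1, 50, 256).astype(np.float32),
        "lengths": np.array([50], dtype=np.int64),
        "pitch": np.random.rand(1, 50).astype(np.float32),
        "sid": np.array([0], dtype=np.int64),
    },
)
print(audio.shape)  # waveform produced by the exported graph
```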
onnxexport/model_onnx.py ADDED
@@ -0,0 +1,335 @@
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+
+ import modules.attentions as attentions
+ import modules.commons as commons
+ import modules.modules as modules
+
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+
+ import utils
+ from modules.commons import init_weights, get_padding
+ from vdecoder.hifigan.models import Generator
+ from utils import f0_to_coarse
+
+
+ class ResidualCouplingBlock(nn.Module):
+     def __init__(self,
+                  channels,
+                  hidden_channels,
+                  kernel_size,
+                  dilation_rate,
+                  n_layers,
+                  n_flows=4,
+                  gin_channels=0):
+         super().__init__()
+         self.channels = channels
+         self.hidden_channels = hidden_channels
+         self.kernel_size = kernel_size
+         self.dilation_rate = dilation_rate
+         self.n_layers = n_layers
+         self.n_flows = n_flows
+         self.gin_channels = gin_channels
+
+         self.flows = nn.ModuleList()
+         for i in range(n_flows):
+             self.flows.append(
+                 modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers,
+                                               gin_channels=gin_channels, mean_only=True))
+             self.flows.append(modules.Flip())
+
+     def forward(self, x, x_mask, g=None, reverse=False):
+         if not reverse:
+             for flow in self.flows:
+                 x, _ = flow(x, x_mask, g=g, reverse=reverse)
+         else:
+             for flow in reversed(self.flows):
+                 x = flow(x, x_mask, g=g, reverse=reverse)
+         return x
+
+
+ class Encoder(nn.Module):
+     def __init__(self,
+                  in_channels,
+                  out_channels,
+                  hidden_channels,
+                  kernel_size,
+                  dilation_rate,
+                  n_layers,
+                  gin_channels=0):
+         super().__init__()
+         self.in_channels = in_channels
+         self.out_channels = out_channels
+         self.hidden_channels = hidden_channels
+         self.kernel_size = kernel_size
+         self.dilation_rate = dilation_rate
+         self.n_layers = n_layers
+         self.gin_channels = gin_channels
+
+         self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
+         self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
+         self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+     def forward(self, x, x_lengths, g=None):
+         # print(x.shape,x_lengths.shape)
+         x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
+         x = self.pre(x) * x_mask
+         x = self.enc(x, x_mask, g=g)
+         stats = self.proj(x) * x_mask
+         m, logs = torch.split(stats, self.out_channels, dim=1)
+         z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
+         return z, m, logs, x_mask
+
+
+ class TextEncoder(nn.Module):
+     def __init__(self,
+                  out_channels,
+                  hidden_channels,
+                  kernel_size,
+                  n_layers,
+                  gin_channels=0,
+                  filter_channels=None,
+                  n_heads=None,
+                  p_dropout=None):
+         super().__init__()
+         self.out_channels = out_channels
+         self.hidden_channels = hidden_channels
+         self.kernel_size = kernel_size
+         self.n_layers = n_layers
+         self.gin_channels = gin_channels
+         self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+         self.f0_emb = nn.Embedding(256, hidden_channels)
+
+         self.enc_ = attentions.Encoder(
+             hidden_channels,
+             filter_channels,
+             n_heads,
+             n_layers,
+             kernel_size,
+             p_dropout)
+
+     def forward(self, x, x_mask, f0=None, z=None):
+         x = x + self.f0_emb(f0).transpose(1, 2)
+         x = self.enc_(x * x_mask, x_mask)
+         stats = self.proj(x) * x_mask
+         m, logs = torch.split(stats, self.out_channels, dim=1)
+         z = (m + z * torch.exp(logs)) * x_mask
+         return z, m, logs, x_mask
+
+
+ class DiscriminatorP(torch.nn.Module):
+     def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
+         super(DiscriminatorP, self).__init__()
+         self.period = period
+         self.use_spectral_norm = use_spectral_norm
+         norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+         self.convs = nn.ModuleList([
+             norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+             norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+             norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+             norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+             norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
+         ])
+         self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+
+     def forward(self, x):
+         fmap = []
+
+         # 1d to 2d
+         b, c, t = x.shape
+         if t % self.period != 0:  # pad first
+             n_pad = self.period - (t % self.period)
+             x = F.pad(x, (0, n_pad), "reflect")
+             t = t + n_pad
+         x = x.view(b, c, t // self.period, self.period)
+
+         for l in self.convs:
+             x = l(x)
+             x = F.leaky_relu(x, modules.LRELU_SLOPE)
+             fmap.append(x)
+         x = self.conv_post(x)
+         fmap.append(x)
+         x = torch.flatten(x, 1, -1)
+
+         return x, fmap
+
+
+ class DiscriminatorS(torch.nn.Module):
+     def __init__(self, use_spectral_norm=False):
+         super(DiscriminatorS, self).__init__()
+         norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+         self.convs = nn.ModuleList([
+             norm_f(Conv1d(1, 16, 15, 1, padding=7)),
+             norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
+             norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
+             norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
+             norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
+             norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
+         ])
+         self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
+
+     def forward(self, x):
+         fmap = []
+
+         for l in self.convs:
+             x = l(x)
+             x = F.leaky_relu(x, modules.LRELU_SLOPE)
+             fmap.append(x)
+         x = self.conv_post(x)
+         fmap.append(x)
+         x = torch.flatten(x, 1, -1)
+
+         return x, fmap
+
+
+ class F0Decoder(nn.Module):
+     def __init__(self,
+                  out_channels,
+                  hidden_channels,
+                  filter_channels,
+                  n_heads,
+                  n_layers,
+                  kernel_size,
+                  p_dropout,
+                  spk_channels=0):
+         super().__init__()
+         self.out_channels = out_channels
+         self.hidden_channels = hidden_channels
+         self.filter_channels = filter_channels
+         self.n_heads = n_heads
+         self.n_layers = n_layers
+         self.kernel_size = kernel_size
+         self.p_dropout = p_dropout
+         self.spk_channels = spk_channels
+
+         self.prenet = nn.Conv1d(hidden_channels, hidden_channels, 3, padding=1)
+         self.decoder = attentions.FFT(
+             hidden_channels,
+             filter_channels,
+             n_heads,
+             n_layers,
+             kernel_size,
+             p_dropout)
+         self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+         self.f0_prenet = nn.Conv1d(1, hidden_channels, 3, padding=1)
+         self.cond = nn.Conv1d(spk_channels, hidden_channels, 1)
+
+     def forward(self, x, norm_f0, x_mask, spk_emb=None):
+         x = torch.detach(x)
+         if spk_emb is not None:
+             x = x + self.cond(spk_emb)
+         x += self.f0_prenet(norm_f0)
+         x = self.prenet(x) * x_mask
+         x = self.decoder(x * x_mask, x_mask)
+         x = self.proj(x) * x_mask
+         return x
+
+
+ class SynthesizerTrn(nn.Module):
+     """
+     Synthesizer for Training
+     """
+
+     def __init__(self,
+                  spec_channels,
+                  segment_size,
+                  inter_channels,
+                  hidden_channels,
+                  filter_channels,
+                  n_heads,
+                  n_layers,
+                  kernel_size,
+                  p_dropout,
+                  resblock,
+                  resblock_kernel_sizes,
+                  resblock_dilation_sizes,
+                  upsample_rates,
+                  upsample_initial_channel,
+                  upsample_kernel_sizes,
+                  gin_channels,
+                  ssl_dim,
+                  n_speakers,
+                  sampling_rate=44100,
+                  **kwargs):
+         super().__init__()
+         self.spec_channels = spec_channels
+         self.inter_channels = inter_channels
+         self.hidden_channels = hidden_channels
+         self.filter_channels = filter_channels
+         self.n_heads = n_heads
+         self.n_layers = n_layers
+         self.kernel_size = kernel_size
+         self.p_dropout = p_dropout
+         self.resblock = resblock
+         self.resblock_kernel_sizes = resblock_kernel_sizes
+         self.resblock_dilation_sizes = resblock_dilation_sizes
+         self.upsample_rates = upsample_rates
+         self.upsample_initial_channel = upsample_initial_channel
+         self.upsample_kernel_sizes = upsample_kernel_sizes
+         self.segment_size = segment_size
+         self.gin_channels = gin_channels
+         self.ssl_dim = ssl_dim
+         self.emb_g = nn.Embedding(n_speakers, gin_channels)
+
+         self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2)
+
+         self.enc_p = TextEncoder(
+             inter_channels,
+             hidden_channels,
+             filter_channels=filter_channels,
+             n_heads=n_heads,
+             n_layers=n_layers,
+             kernel_size=kernel_size,
+             p_dropout=p_dropout
+         )
+         hps = {
+             "sampling_rate": sampling_rate,
+             "inter_channels": inter_channels,
+             "resblock": resblock,
+             "resblock_kernel_sizes": resblock_kernel_sizes,
+             "resblock_dilation_sizes": resblock_dilation_sizes,
+             "upsample_rates": upsample_rates,
+             "upsample_initial_channel": upsample_initial_channel,
+             "upsample_kernel_sizes": upsample_kernel_sizes,
+             "gin_channels": gin_channels,
+         }
+         self.dec = Generator(h=hps)
+         self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
+         self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
+         self.f0_decoder = F0Decoder(
+             1,
+             hidden_channels,
+             filter_channels,
+             n_heads,
+             n_layers,
+             kernel_size,
+             p_dropout,
+             spk_channels=gin_channels
+         )
+         self.emb_uv = nn.Embedding(2, hidden_channels)
+         self.predict_f0 = False
+
+     def forward(self, c, f0, mel2ph, uv, noise=None, g=None):
+
+         decoder_inp = F.pad(c, [0, 0, 1, 0])
+         mel2ph_ = mel2ph.unsqueeze(2).repeat([1, 1, c.shape[-1]])
+         c = torch.gather(decoder_inp, 1, mel2ph_).transpose(1, 2)  # [B, T, H]
+
+         c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
+         g = g.unsqueeze(0)
+         g = self.emb_g(g).transpose(1, 2)
+         x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
+         x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2)
+
+         if self.predict_f0:
+             lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500
+             norm_lf0 = utils.normalize_f0(lf0, x_mask, uv, random_scale=False)
+             pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
+             f0 = (700 * (torch.pow(10, pred_lf0 * 500 / 2595) - 1)).squeeze(1)
+
+         z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), z=noise)
+         z = self.flow(z_p, c_mask, g=g, reverse=True)
+         o = self.dec(z * c_mask, g=g, f0=f0)
+         return o
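The new ONNX-oriented `SynthesizerTrn` replaces the old `(c, c_lengths, f0, g)` signature with `(c, f0, mel2ph, uv, noise, g)`: the `mel2ph` alignment, the voiced/unvoiced flags `uv` and the sampling noise are now explicit inputs, so the exported graph takes its randomness from the caller instead of sampling internally. A companion exporter is not part of this commit; the following is only a sketch of how this graph could be exported, where the checkpoint path, the dummy shapes and the `ssl_dim`/`inter_channels` config fields are assumptions modeled on the exporters deleted above:

```python
import torch
import utils
from onnxexport.model_onnx import SynthesizerTrn

# Hypothetical checkpoint; config layout follows the deleted exporter scripts.
hps = utils.get_hparams_from_file("checkpoints/MyModel/config.json")
model = SynthesizerTrn(hps.data.filter_length // 2 + 1,
                       hps.train.segment_size // hps.data.hop_length,
                       **hps.model)
utils.load_checkpoint("checkpoints/MyModel/model.pth", model, None)
model.eval()

n_frames = 50
c = torch.rand(1, n_frames, hps.model.ssl_dim)              # hubert content features
f0 = torch.rand(1, n_frames) * 200.0                        # frame-level pitch in Hz
mel2ph = torch.arange(n_frames).unsqueeze(0)                # frame -> feature alignment
uv = torch.ones(1, n_frames)                                # voiced/unvoiced flags
noise = torch.randn(1, hps.model.inter_channels, n_frames)  # z fed to TextEncoder
sid = torch.LongTensor([0])                                 # speaker id

torch.onnx.export(model,
                  (c, f0, mel2ph, uv, noise, sid),
                  "model.onnx",
                  input_names=["c", "f0", "mel2ph", "uv", "noise", "sid"],
                  output_names=["audio"],
                  dynamic_axes={"c": [1], "f0": [1], "mel2ph": [1], "uv": [1], "noise": [2]},
                  opset_version=16)
```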
vdecoder/__pycache__/__init__.cpython-38.pyc CHANGED
Binary files a/vdecoder/__pycache__/__init__.cpython-38.pyc and b/vdecoder/__pycache__/__init__.cpython-38.pyc differ
 
vdecoder/hifigan/__pycache__/env.cpython-38.pyc CHANGED
Binary files a/vdecoder/hifigan/__pycache__/env.cpython-38.pyc and b/vdecoder/hifigan/__pycache__/env.cpython-38.pyc differ
 
vdecoder/hifigan/__pycache__/models.cpython-38.pyc CHANGED
Binary files a/vdecoder/hifigan/__pycache__/models.cpython-38.pyc and b/vdecoder/hifigan/__pycache__/models.cpython-38.pyc differ
 
vdecoder/hifigan/__pycache__/utils.cpython-38.pyc CHANGED
Binary files a/vdecoder/hifigan/__pycache__/utils.cpython-38.pyc and b/vdecoder/hifigan/__pycache__/utils.cpython-38.pyc differ