jonluca committed
Commit 61a2d96 · unverified · 1 Parent(s): 1373f78

use remote url to load pth

Files changed (3):
  1. _run.py +0 -368
  2. compute.py +0 -132
  3. styletts2importable.py +1 -5
_run.py DELETED
@@ -1,368 +0,0 @@
-from cached_path import cached_path
-
-from dp.phonemizer import Phonemizer
-print("NLTK")
-import nltk
-nltk.download('punkt')
-print("SCIPY")
-from scipy.io.wavfile import write
-print("TORCH STUFF")
-import torch
-print("START")
-torch.manual_seed(0)
-torch.backends.cudnn.benchmark = False
-torch.backends.cudnn.deterministic = True
-
-import random
-random.seed(0)
-
-import numpy as np
-np.random.seed(0)
-
-# load packages
-import time
-import random
-import yaml
-import numpy as np
-import torch
-import torchaudio
-import librosa
-from nltk.tokenize import word_tokenize
-
-from models import *
-from utils import *
-from text_utils import TextCleaner
-textclenaer = TextCleaner()
-
-
-to_mel = torchaudio.transforms.MelSpectrogram(
-    n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
-mean, std = -4, 4
-
-def length_to_mask(lengths):
-    mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
-    mask = torch.gt(mask+1, lengths.unsqueeze(1))
-    return mask
-
-def preprocess(wave):
-    wave_tensor = torch.from_numpy(wave).float()
-    mel_tensor = to_mel(wave_tensor)
-    mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
-    return mel_tensor
-
-def compute_style(path):
-    wave, sr = librosa.load(path, sr=24000)
-    audio, index = librosa.effects.trim(wave, top_db=30)
-    if sr != 24000:
-        audio = librosa.resample(audio, sr, 24000)
-    mel_tensor = preprocess(audio).to(device)
-
-    with torch.no_grad():
-        ref_s = model.style_encoder(mel_tensor.unsqueeze(1))
-        ref_p = model.predictor_encoder(mel_tensor.unsqueeze(1))
-
-    return torch.cat([ref_s, ref_p], dim=1)
-
-device = 'cpu'
-if torch.cuda.is_available():
-    device = 'cuda'
-elif torch.backends.mps.is_available():
-    print("MPS would be available but cannot be used rn")
-    # device = 'mps'
-
-
-# global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
-phonemizer = Phonemizer.from_checkpoint(str(cached_path('https://public-asai-dl-models.s3.eu-central-1.amazonaws.com/DeepPhonemizer/en_us_cmudict_ipa_forward.pt')))
-
-
-config = yaml.safe_load(open("Models/LibriTTS/config.yml"))
-
-# load pretrained ASR model
-ASR_config = config.get('ASR_config', False)
-ASR_path = config.get('ASR_path', False)
-text_aligner = load_ASR_models(ASR_path, ASR_config)
-
-# load pretrained F0 model
-F0_path = config.get('F0_path', False)
-pitch_extractor = load_F0_models(F0_path)
-
-# load BERT model
-from Utils.PLBERT.util import load_plbert
-BERT_path = config.get('PLBERT_dir', False)
-plbert = load_plbert(BERT_path)
-
-model_params = recursive_munch(config['model_params'])
-model = build_model(model_params, text_aligner, pitch_extractor, plbert)
-_ = [model[key].eval() for key in model]
-_ = [model[key].to(device) for key in model]
-
-params_whole = torch.load("Models/LibriTTS/epochs_2nd_00020.pth", map_location='cpu')
-params = params_whole['net']
-
-for key in model:
-    if key in params:
-        print('%s loaded' % key)
-        try:
-            model[key].load_state_dict(params[key])
-        except:
-            from collections import OrderedDict
-            state_dict = params[key]
-            new_state_dict = OrderedDict()
-            for k, v in state_dict.items():
-                name = k[7:] # remove `module.`
-                new_state_dict[name] = v
-            # load params
-            model[key].load_state_dict(new_state_dict, strict=False)
-            # except:
-            #     _load(params[key], model[key])
-_ = [model[key].eval() for key in model]
-
-from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule
-
-sampler = DiffusionSampler(
-    model.diffusion.diffusion,
-    sampler=ADPM2Sampler(),
-    sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters
-    clamp=False
-)
-
-def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):
-    text = text.strip()
-    ps = phonemizer([text], lang='en_us')
-    ps = word_tokenize(ps[0])
-    ps = ' '.join(ps)
-    tokens = textclenaer(ps)
-    tokens.insert(0, 0)
-    tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
-
-    with torch.no_grad():
-        input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
-        text_mask = length_to_mask(input_lengths).to(device)
-
-        t_en = model.text_encoder(tokens, input_lengths, text_mask)
-        bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
-        d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
-
-        s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device),
-                         embedding=bert_dur,
-                         embedding_scale=embedding_scale,
-                         features=ref_s, # reference from the same speaker as the embedding
-                         num_steps=diffusion_steps).squeeze(1)
-
-        s = s_pred[:, 128:]
-        ref = s_pred[:, :128]
-
-        ref = alpha * ref + (1 - alpha) * ref_s[:, :128]
-        s = beta * s + (1 - beta) * ref_s[:, 128:]
-
-        d = model.predictor.text_encoder(d_en,
-                                         s, input_lengths, text_mask)
-
-        x, _ = model.predictor.lstm(d)
-        duration = model.predictor.duration_proj(x)
-
-        duration = torch.sigmoid(duration).sum(axis=-1)
-        pred_dur = torch.round(duration.squeeze()).clamp(min=1)
-
-        pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
-        c_frame = 0
-        for i in range(pred_aln_trg.size(0)):
-            pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
-            c_frame += int(pred_dur[i].data)
-
-        # encode prosody
-        en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))
-        if model_params.decoder.type == "hifigan":
-            asr_new = torch.zeros_like(en)
-            asr_new[:, :, 0] = en[:, :, 0]
-            asr_new[:, :, 1:] = en[:, :, 0:-1]
-            en = asr_new
-
-        F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
-
-        asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))
-        if model_params.decoder.type == "hifigan":
-            asr_new = torch.zeros_like(asr)
-            asr_new[:, :, 0] = asr[:, :, 0]
-            asr_new[:, :, 1:] = asr[:, :, 0:-1]
-            asr = asr_new
-
-        out = model.decoder(asr,
-                            F0_pred, N_pred, ref.squeeze().unsqueeze(0))
-
-    return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later
-
-def LFinference(text, s_prev, ref_s, alpha = 0.3, beta = 0.7, t = 0.7, diffusion_steps=5, embedding_scale=1):
-    text = text.strip()
-    ps = phonemizer([text], lang='en_us')
-    ps = word_tokenize(ps[0])
-    ps = ' '.join(ps)
-    ps = ps.replace('``', '"')
-    ps = ps.replace("''", '"')
-
-    tokens = textclenaer(ps)
-    tokens.insert(0, 0)
-    tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
-
-    with torch.no_grad():
-        input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
-        text_mask = length_to_mask(input_lengths).to(device)
-
-        t_en = model.text_encoder(tokens, input_lengths, text_mask)
-        bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
-        d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
-
-        s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device),
-                         embedding=bert_dur,
-                         embedding_scale=embedding_scale,
-                         features=ref_s, # reference from the same speaker as the embedding
-                         num_steps=diffusion_steps).squeeze(1)
-
-        if s_prev is not None:
-            # convex combination of previous and current style
-            s_pred = t * s_prev + (1 - t) * s_pred
-
-        s = s_pred[:, 128:]
-        ref = s_pred[:, :128]
-
-        ref = alpha * ref + (1 - alpha) * ref_s[:, :128]
-        s = beta * s + (1 - beta) * ref_s[:, 128:]
-
-        s_pred = torch.cat([ref, s], dim=-1)
-
-        d = model.predictor.text_encoder(d_en,
-                                         s, input_lengths, text_mask)
-
-        x, _ = model.predictor.lstm(d)
-        duration = model.predictor.duration_proj(x)
-
-        duration = torch.sigmoid(duration).sum(axis=-1)
-        pred_dur = torch.round(duration.squeeze()).clamp(min=1)
-
-        pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
-        c_frame = 0
-        for i in range(pred_aln_trg.size(0)):
-            pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
-            c_frame += int(pred_dur[i].data)
-
-        # encode prosody
-        en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))
-        if model_params.decoder.type == "hifigan":
-            asr_new = torch.zeros_like(en)
-            asr_new[:, :, 0] = en[:, :, 0]
-            asr_new[:, :, 1:] = en[:, :, 0:-1]
-            en = asr_new
-
-        F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
-
-        asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))
-        if model_params.decoder.type == "hifigan":
-            asr_new = torch.zeros_like(asr)
-            asr_new[:, :, 0] = asr[:, :, 0]
-            asr_new[:, :, 1:] = asr[:, :, 0:-1]
-            asr = asr_new
-
-        out = model.decoder(asr,
-                            F0_pred, N_pred, ref.squeeze().unsqueeze(0))
-
-    return out.squeeze().cpu().numpy()[..., :-100], s_pred # weird pulse at the end of the model, need to be fixed later
-
-def STinference(text, ref_s, ref_text, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):
-    text = text.strip()
-    ps = phonemizer([text], lang='en_us')
-    ps = word_tokenize(ps[0])
-    ps = ' '.join(ps)
-
-    tokens = textclenaer(ps)
-    tokens.insert(0, 0)
-    tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
-
-    ref_text = ref_text.strip()
-    ps = phonemizer([ref_text], lang='en_us')
-    ps = word_tokenize(ps[0])
-    ps = ' '.join(ps)
-
-    ref_tokens = textclenaer(ps)
-    ref_tokens.insert(0, 0)
-    ref_tokens = torch.LongTensor(ref_tokens).to(device).unsqueeze(0)
-
-    with torch.no_grad():
-        input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
-        text_mask = length_to_mask(input_lengths).to(device)
-
-        t_en = model.text_encoder(tokens, input_lengths, text_mask)
-        bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
-        d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
-
-        ref_input_lengths = torch.LongTensor([ref_tokens.shape[-1]]).to(device)
-        ref_text_mask = length_to_mask(ref_input_lengths).to(device)
-        model.bert(ref_tokens, attention_mask=(~ref_text_mask).int())
-        s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device),
-                         embedding=bert_dur,
-                         embedding_scale=embedding_scale,
-                         features=ref_s, # reference from the same speaker as the embedding
-                         num_steps=diffusion_steps).squeeze(1)
-
-        s = s_pred[:, 128:]
-        ref = s_pred[:, :128]
-
-        ref = alpha * ref + (1 - alpha) * ref_s[:, :128]
-        s = beta * s + (1 - beta) * ref_s[:, 128:]
-
-        d = model.predictor.text_encoder(d_en,
-                                         s, input_lengths, text_mask)
-
-        x, _ = model.predictor.lstm(d)
-        duration = model.predictor.duration_proj(x)
-
-        duration = torch.sigmoid(duration).sum(axis=-1)
-        pred_dur = torch.round(duration.squeeze()).clamp(min=1)
-
-        pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
-        c_frame = 0
-        for i in range(pred_aln_trg.size(0)):
-            pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
-            c_frame += int(pred_dur[i].data)
-
-        # encode prosody
-        en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))
-        if model_params.decoder.type == "hifigan":
-            asr_new = torch.zeros_like(en)
-            asr_new[:, :, 0] = en[:, :, 0]
-            asr_new[:, :, 1:] = en[:, :, 0:-1]
-            en = asr_new
-
-        F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
-
-        asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))
-        if model_params.decoder.type == "hifigan":
-            asr_new = torch.zeros_like(asr)
-            asr_new[:, :, 0] = asr[:, :, 0]
-            asr_new[:, :, 1:] = asr[:, :, 0:-1]
-            asr = asr_new
-
-        out = model.decoder(asr,
-                            F0_pred, N_pred, ref.squeeze().unsqueeze(0))
-
-    return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later
-print("Time to synthesize!")
-ref_s = compute_style('./voice/voice.wav')
-while True:
-    text = input("What to say? > ")
-    start = time.time()
-    wav = inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=15, embedding_scale=1)
-    rtf = (time.time() - start) / (len(wav) / 24000)
-    print(f"RTF = {rtf:5f}")
-    print(k + ' Synthesized:')
-    # display(ipd.Audio(wav, rate=24000, normalize=False))
-    write('result.wav', 24000, wav)
-    print("Saved to result.wav")
compute.py DELETED
@@ -1,132 +0,0 @@
-from cached_path import cached_path
-
-# from dp.phonemizer import Phonemizer
-print("NLTK")
-import nltk
-nltk.download('punkt')
-print("SCIPY")
-print("TORCH STUFF")
-import torch
-print("START")
-torch.manual_seed(0)
-torch.backends.cudnn.benchmark = False
-torch.backends.cudnn.deterministic = True
-
-import random
-random.seed(0)
-
-import numpy as np
-np.random.seed(0)
-
-# load packages
-import random
-import yaml
-import numpy as np
-import torch
-import torchaudio
-import librosa
-
-from models import *
-from utils import *
-from text_utils import TextCleaner
-textclenaer = TextCleaner()
-
-
-to_mel = torchaudio.transforms.MelSpectrogram(
-    n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
-mean, std = -4, 4
-
-def length_to_mask(lengths):
-    mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
-    mask = torch.gt(mask+1, lengths.unsqueeze(1))
-    return mask
-
-def preprocess(wave):
-    wave_tensor = torch.from_numpy(wave).float()
-    mel_tensor = to_mel(wave_tensor)
-    mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
-    return mel_tensor
-
-def compute_style(path):
-    wave, sr = librosa.load(path, sr=24000)
-    audio, index = librosa.effects.trim(wave, top_db=30)
-    if sr != 24000:
-        audio = librosa.resample(audio, sr, 24000)
-    mel_tensor = preprocess(audio).to(device)
-
-    with torch.no_grad():
-        ref_s = model.style_encoder(mel_tensor.unsqueeze(1))
-        ref_p = model.predictor_encoder(mel_tensor.unsqueeze(1))
-
-    return torch.cat([ref_s, ref_p], dim=1)
-
-device = 'cpu'
-if torch.cuda.is_available():
-    device = 'cuda'
-elif torch.backends.mps.is_available():
-    print("MPS would be available but cannot be used rn")
-    # device = 'mps'
-
-
-
-# config = yaml.safe_load(open("Models/LibriTTS/config.yml"))
-config = yaml.safe_load(open(str(cached_path("hf://yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/config.yml"))))
-
-# load pretrained ASR model
-ASR_config = config.get('ASR_config', False)
-ASR_path = config.get('ASR_path', False)
-text_aligner = load_ASR_models(ASR_path, ASR_config)
-
-# load pretrained F0 model
-F0_path = config.get('F0_path', False)
-pitch_extractor = load_F0_models(F0_path)
-
-# load BERT model
-from Utils.PLBERT.util import load_plbert
-BERT_path = config.get('PLBERT_dir', False)
-plbert = load_plbert(BERT_path)
-
-model_params = recursive_munch(config['model_params'])
-model = build_model(model_params, text_aligner, pitch_extractor, plbert)
-_ = [model[key].eval() for key in model]
-_ = [model[key].to(device) for key in model]
-
-# params_whole = torch.load("Models/LibriTTS/epochs_2nd_00020.pth", map_location='cpu')
-params_whole = torch.load(str(cached_path("hf://yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/epochs_2nd_00020.pth")), map_location='cpu')
-params = params_whole['net']
-
-for key in model:
-    if key in params:
-        print('%s loaded' % key)
-        try:
-            model[key].load_state_dict(params[key])
-        except:
-            from collections import OrderedDict
-            state_dict = params[key]
-            new_state_dict = OrderedDict()
-            for k, v in state_dict.items():
-                name = k[7:] # remove `module.`
-                new_state_dict[name] = v
-            # load params
-            model[key].load_state_dict(new_state_dict, strict=False)
-            # except:
-            #     _load(params[key], model[key])
-_ = [model[key].eval() for key in model]
-
-from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule
-
-sampler = DiffusionSampler(
-    model.diffusion.diffusion,
-    sampler=ADPM2Sampler(),
-    sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters
-    clamp=False
-)
-voicelist = ['f-us-1', 'f-us-2', 'f-us-3', 'f-us-4', 'm-us-1', 'm-us-2', 'm-us-3', 'm-us-4']
-voices = {}
-# todo: cache computed style, load using pickle
-for v in voicelist:
-    print(f"Loading voice {v}")
-    voices[v] = compute_style(f'voices/{v}.wav')
-import pickle
-with open('voices.pkl', 'wb') as f:
-    pickle.dump(voices, f)
styletts2importable.py CHANGED
@@ -189,11 +189,7 @@ _ = [model[key].to(device) for key in model]
 
 
 params_whole = torch.load(
-    str(
-        cached_path(
-            "hf://yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/epochs_2nd_00020.pth"
-        )
-    ),
+    str(cached_path("https://base-weights.weights.gg/epochs_2nd_00020.pth")),
     map_location="cpu",
 )
 params = params_whole["net"]
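
The change swaps the hf:// Hub reference for a plain HTTPS URL; in both cases cached_path resolves the remote file to a locally cached copy, so torch.load only ever sees a local path. A minimal, self-contained sketch of the pattern, assuming the cached-path package is installed (the URL is the one from the diff; the cache location note reflects that library's usual default):

# Sketch of the remote-URL checkpoint load this commit adopts.
# Assumes `pip install cached-path`; CHECKPOINT_URL comes from the diff.
import torch
from cached_path import cached_path

CHECKPOINT_URL = "https://base-weights.weights.gg/epochs_2nd_00020.pth"

# First call downloads the file into the local cache (typically under
# ~/.cache/cached_path); later calls return the cached path immediately,
# so the network is hit only once per machine.
local_file = cached_path(CHECKPOINT_URL)

params_whole = torch.load(str(local_file), map_location="cpu")
params = params_whole["net"]
print(f"loaded {len(params)} sub-module state dicts from {local_file}")

A direct HTTPS URL skips Hugging Face Hub resolution (and any auth or rate limiting that entails), which appears to be the point of pointing at a mirror here.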