update to v2 models (clvp pending)
Browse files
api.py
CHANGED
@@ -23,9 +23,11 @@ from utils.tokenizer import VoiceBpeTokenizer, lev_distance
|
|
23 |
pbar = None
|
24 |
def download_models():
|
25 |
MODELS = {
|
26 |
-
'
|
27 |
-
'
|
28 |
-
'
|
|
|
|
|
29 |
}
|
30 |
os.makedirs('.models', exist_ok=True)
|
31 |
def show_progress(block_num, block_size, total_size):
|
@@ -162,25 +164,12 @@ class TextToSpeech:
|
|
162 |
train_solo_embeddings=False,
|
163 |
average_conditioning_embeddings=True).cpu().eval()
|
164 |
self.autoregressive.load_state_dict(torch.load('.models/autoregressive.pth'))
|
165 |
-
'''
|
166 |
-
self.autoregressive = UnifiedVoice(max_mel_tokens=2048, max_text_tokens=1024, max_conditioning_inputs=1, layers=42,
|
167 |
-
model_dim=1152, heads=18, number_text_tokens=256, train_solo_embeddings=False,
|
168 |
-
average_conditioning_embeddings=True, types=2).cpu().eval()
|
169 |
-
self.autoregressive.load_state_dict(torch.load('X:\\dlas\\experiments\\train_gpt_tts_xl\\models\\15250_gpt_ema.pth'))
|
170 |
-
'''
|
171 |
-
|
172 |
-
self.autoregressive_for_diffusion = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
|
173 |
-
model_dim=1024,
|
174 |
-
heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
|
175 |
-
train_solo_embeddings=False,
|
176 |
-
average_conditioning_embeddings=True).cpu().eval()
|
177 |
-
self.autoregressive_for_diffusion.load_state_dict(torch.load('.models/autoregressive.pth'))
|
178 |
|
179 |
self.clvp = CLVP(dim_text=512, dim_speech=512, dim_latent=512, num_text_tokens=256, text_enc_depth=12,
|
180 |
text_seq_len=350, text_heads=8,
|
181 |
num_speech_tokens=8192, speech_enc_depth=12, speech_heads=8, speech_seq_len=430,
|
182 |
use_xformers=True).cpu().eval()
|
183 |
-
self.clvp.load_state_dict(torch.load('.models/
|
184 |
|
185 |
self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8, cond_mask_percentage=0,
|
186 |
speech_enc_depth=8, speech_mask_percentage=0, latent_multiplier=1).cpu().eval()
|
@@ -281,11 +270,11 @@ class TextToSpeech:
|
|
281 |
# The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
|
282 |
# inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
|
283 |
# results, but will increase memory usage.
|
284 |
-
self.
|
285 |
-
best_latents = self.
|
286 |
-
torch.tensor([best_results.shape[-1]*self.
|
287 |
return_latent=True, clip_inputs=False)
|
288 |
-
self.
|
289 |
|
290 |
print("Performing vocoding..")
|
291 |
wav_candidates = []
|
|
|
23 |
pbar = None
|
24 |
def download_models():
|
25 |
MODELS = {
|
26 |
+
'autoregressive.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/autoregressive.pth',
|
27 |
+
'clvp.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/clip.pth',
|
28 |
+
'clip.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/cvvp.pth',
|
29 |
+
'diffusion_decoder.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/diffusion_decoder.pth',
|
30 |
+
'vocoder.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/vocoder.pth',
|
31 |
}
|
32 |
os.makedirs('.models', exist_ok=True)
|
33 |
def show_progress(block_num, block_size, total_size):
|
|
|
164 |
train_solo_embeddings=False,
|
165 |
average_conditioning_embeddings=True).cpu().eval()
|
166 |
self.autoregressive.load_state_dict(torch.load('.models/autoregressive.pth'))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
|
168 |
self.clvp = CLVP(dim_text=512, dim_speech=512, dim_latent=512, num_text_tokens=256, text_enc_depth=12,
|
169 |
text_seq_len=350, text_heads=8,
|
170 |
num_speech_tokens=8192, speech_enc_depth=12, speech_heads=8, speech_seq_len=430,
|
171 |
use_xformers=True).cpu().eval()
|
172 |
+
self.clvp.load_state_dict(torch.load('.models/clvp.pth'))
|
173 |
|
174 |
self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8, cond_mask_percentage=0,
|
175 |
speech_enc_depth=8, speech_mask_percentage=0, latent_multiplier=1).cpu().eval()
|
|
|
270 |
# The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
|
271 |
# inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
|
272 |
# results, but will increase memory usage.
|
273 |
+
self.autoregressive = self.autoregressive.cuda()
|
274 |
+
best_latents = self.autoregressive(conds, text, torch.tensor([text.shape[-1]], device=conds.device), best_results,
|
275 |
+
torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=conds.device),
|
276 |
return_latent=True, clip_inputs=False)
|
277 |
+
self.autoregressive = self.autoregressive.cpu()
|
278 |
|
279 |
print("Performing vocoding..")
|
280 |
wav_candidates = []
|