Spaces: Running on T4
Commit: fix memory leak
Browse files - tortoise/api.py (+32 -51)
tortoise/api.py
CHANGED
@@ -243,28 +243,22 @@ class TextToSpeech:
|
|
243 |
self.autoregressive = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
|
244 |
model_dim=1024,
|
245 |
heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
|
246 |
-
train_solo_embeddings=False).
|
247 |
self.autoregressive.load_state_dict(torch.load(get_model_path('autoregressive.pth', models_dir)), strict=False)
|
248 |
self.autoregressive.post_init_gpt2_config(use_deepspeed=use_deepspeed, kv_cache=kv_cache, half=self.half)
|
249 |
|
250 |
self.diffusion = DiffusionTts(model_channels=1024, num_layers=10, in_channels=100, out_channels=200,
|
251 |
in_latent_channels=1024, in_tokens=8193, dropout=0, use_fp16=False, num_heads=16,
|
252 |
-
layer_drop=0, unconditioned_percentage=0).
|
253 |
self.diffusion.load_state_dict(torch.load(get_model_path('diffusion_decoder.pth', models_dir)))
|
254 |
|
255 |
-
self.vocoder = UnivNetGenerator().
|
256 |
self.vocoder.load_state_dict(torch.load(get_model_path('vocoder.pth', models_dir), map_location=torch.device('cpu'))['model_g'])
|
257 |
self.vocoder.eval(inference=True)
|
258 |
|
259 |
# Random latent generators (RLGs) are loaded lazily.
|
260 |
self.rlg_auto = None
|
261 |
self.rlg_diffusion = None
|
262 |
-
@contextmanager
|
263 |
-
def temporary_cuda(self, model):
|
264 |
-
m = model.to(self.device)
|
265 |
-
yield m
|
266 |
-
m = model.cpu()
|
267 |
-
|
268 |
def get_conditioning_latents(self, voice_samples, return_mels=False):
|
269 |
"""
|
270 |
Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
|
@@ -328,7 +322,6 @@ class TextToSpeech:
|
|
328 |
# Presets are defined here.
|
329 |
presets = {
|
330 |
'ultra_fast': {'num_autoregressive_samples': 1, 'diffusion_iterations': 15},
|
331 |
-
# 'ultra_fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30},
|
332 |
'fast': {'num_autoregressive_samples': 32, 'diffusion_iterations': 50},
|
333 |
'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 200},
|
334 |
'high_quality': {'num_autoregressive_samples': 256, 'diffusion_iterations': 400},
|
@@ -409,57 +402,45 @@ class TextToSpeech:
|
|
409 |
diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k)
|
410 |
|
411 |
with torch.no_grad():
|
412 |
-
|
413 |
-
stop_mel_token = self.autoregressive.stop_mel_token
|
414 |
calm_token = 83 # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output"
|
415 |
if verbose:
|
416 |
print("Generating autoregressive samples..")
|
417 |
-
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
|
423 |
-
|
424 |
-
|
425 |
-
|
426 |
-
max_generate_length=max_mel_tokens,
|
427 |
-
**hf_generate_kwargs)
|
428 |
# The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
|
429 |
# inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
|
430 |
# results, but will increase memory usage.
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
|
435 |
-
|
436 |
-
best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
|
437 |
-
torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), codes,
|
438 |
-
torch.tensor([codes.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
|
439 |
-
return_latent=True, clip_inputs=False)
|
440 |
-
del auto_conditioning
|
441 |
|
442 |
if verbose:
|
443 |
print("Transforming autoregressive outputs into audio..")
|
444 |
wav_candidates = []
|
445 |
-
|
446 |
-
|
447 |
-
|
448 |
-
|
449 |
-
|
450 |
-
|
451 |
-
|
452 |
-
|
453 |
-
|
454 |
-
|
455 |
-
|
456 |
-
|
457 |
-
|
458 |
-
|
459 |
-
|
460 |
-
verbose=verbose)
|
461 |
-
wav = vocoder.inference(mel)
|
462 |
-
wav_candidates.append(wav.cpu())
|
463 |
|
464 |
def potentially_redact(clip, text):
|
465 |
if self.enable_redaction:
|
|
|
243 |
self.autoregressive = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
|
244 |
model_dim=1024,
|
245 |
heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
|
246 |
+
train_solo_embeddings=False).cuda().eval()
|
247 |
self.autoregressive.load_state_dict(torch.load(get_model_path('autoregressive.pth', models_dir)), strict=False)
|
248 |
self.autoregressive.post_init_gpt2_config(use_deepspeed=use_deepspeed, kv_cache=kv_cache, half=self.half)
|
249 |
|
250 |
self.diffusion = DiffusionTts(model_channels=1024, num_layers=10, in_channels=100, out_channels=200,
|
251 |
in_latent_channels=1024, in_tokens=8193, dropout=0, use_fp16=False, num_heads=16,
|
252 |
+
layer_drop=0, unconditioned_percentage=0).cuda().eval()
|
253 |
self.diffusion.load_state_dict(torch.load(get_model_path('diffusion_decoder.pth', models_dir)))
|
254 |
|
255 |
+
self.vocoder = UnivNetGenerator().cuda()
|
256 |
self.vocoder.load_state_dict(torch.load(get_model_path('vocoder.pth', models_dir), map_location=torch.device('cpu'))['model_g'])
|
257 |
self.vocoder.eval(inference=True)
|
258 |
|
259 |
# Random latent generators (RLGs) are loaded lazily.
|
260 |
self.rlg_auto = None
|
261 |
self.rlg_diffusion = None
|
|
|
|
|
|
|
|
|
|
|
|
|
262 |
def get_conditioning_latents(self, voice_samples, return_mels=False):
|
263 |
"""
|
264 |
Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
|
|
|
322 |
# Presets are defined here.
|
323 |
presets = {
|
324 |
'ultra_fast': {'num_autoregressive_samples': 1, 'diffusion_iterations': 15},
|
|
|
325 |
'fast': {'num_autoregressive_samples': 32, 'diffusion_iterations': 50},
|
326 |
'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 200},
|
327 |
'high_quality': {'num_autoregressive_samples': 256, 'diffusion_iterations': 400},
|
|
|
402 |
diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k)
|
403 |
|
404 |
with torch.no_grad():
|
|
|
|
|
405 |
calm_token = 83 # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output"
|
406 |
if verbose:
|
407 |
print("Generating autoregressive samples..")
|
408 |
+
codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens,
|
409 |
+
do_sample=True,
|
410 |
+
top_p=top_p,
|
411 |
+
temperature=temperature,
|
412 |
+
num_return_sequences=num_autoregressive_samples,
|
413 |
+
length_penalty=length_penalty,
|
414 |
+
repetition_penalty=repetition_penalty,
|
415 |
+
max_generate_length=max_mel_tokens,
|
416 |
+
**hf_generate_kwargs)
|
|
|
|
|
417 |
# The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
|
418 |
# inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
|
419 |
# results, but will increase memory usage.
|
420 |
+
best_latents = self.autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
|
421 |
+
torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), codes,
|
422 |
+
torch.tensor([codes.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
|
423 |
+
return_latent=True, clip_inputs=False)
|
424 |
+
del auto_conditioning
|
|
|
|
|
|
|
|
|
|
|
425 |
|
426 |
if verbose:
|
427 |
print("Transforming autoregressive outputs into audio..")
|
428 |
wav_candidates = []
|
429 |
+
latents = best_latents
|
430 |
+
# Find the first occurrence of the "calm" token and trim the codes to that.
|
431 |
+
ctokens = 0
|
432 |
+
for k in range(codes.shape[-1]):
|
433 |
+
if codes[0, k] == calm_token:
|
434 |
+
ctokens += 1
|
435 |
+
else:
|
436 |
+
ctokens = 0
|
437 |
+
if ctokens > 8: # 8 tokens gives the diffusion model some "breathing room" to terminate speech.
|
438 |
+
latents = latents[:, :k]
|
439 |
+
break
|
440 |
+
mel = do_spectrogram_diffusion(self.diffusion, diffuser, latents, diffusion_conditioning, temperature=diffusion_temperature,
|
441 |
+
verbose=verbose)
|
442 |
+
wav = self.vocoder.inference(mel)
|
443 |
+
wav_candidates.append(wav.cpu())
|
|
|
|
|
|
|
444 |
|
445 |
def potentially_redact(clip, text):
|
446 |
if self.enable_redaction:
|