Spaces: Running on T4
Remove CLVP for lower GPU usage and increased speed.

Files changed:
- app.py: +3 -15
- tortoise/api.py: +42 -180
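For context: in upstream Tortoise, CLVP is a CLIP-style model that scores candidate autoregressive mel-code sequences against the input text so the best of many samples can be kept; the api.py hunks below delete exactly that reranking. A minimal sketch of the step being removed (names taken from the diff; `rerank_with_clvp` is a hypothetical helper, not a function in the repo):

import torch

def rerank_with_clvp(clvp, text_tokens, samples, k=1):
    """Hypothetical condensation of the deleted loop: score each batch of
    candidate mel codes against the text and keep the top-k sequences."""
    scores = []
    for batch in samples:  # batch: (batch_size, seq_len) codes from inference_speech
        scores.append(clvp(text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False))
    scores = torch.cat(scores, dim=0)
    samples = torch.cat(samples, dim=0)
    return samples[torch.topk(scores, k=k).indices]

With the 'ultra_fast' preset now generating a single candidate (num_autoregressive_samples: 1), there is nothing to rank, so CLVP (and the optional CVVP mix-in) can be dropped along with the clvp2.pth weights it loads.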
app.py
CHANGED
@@ -40,7 +40,6 @@ VOICE_OPTIONS = [
     "william",
     "jane_eyre",
     "random",  # special option for random voice
-    "disabled",  # special option for disabled voice
 ]
 
 
@@ -49,7 +48,6 @@ def inference(
     script,
     voice,
     voice_b,
-    preset,
     seed,
     split_by_newline,
 ):
@@ -81,7 +79,7 @@ def inference(
         text,
         voice_samples=voice_samples,
         conditioning_latents=conditioning_latents,
-        preset=preset,
+        preset="ultra_fast",
         k=1,
         use_deterministic_seed=seed,
     )
@@ -91,12 +89,9 @@ def inference(
 
     full_audio = torch.cat(all_parts, dim=-1)
 
-    # os.makedirs("outputs", exist_ok=True)
-    # torchaudio.save(os.path.join("outputs", f"{name}.wav"), full_audio, 24000)
-
     with open("Tortoise_TTS_Runs_Scripts.log", "a") as f:
         f.write(
-            f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | …
+            f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Time Taken (s): {time.time()-start_time} | Seed: {seed}\n"
         )
 
     output_texts = [f"({j+1}) {texts[j]}" for j in range(len(texts))]
@@ -120,14 +115,8 @@ def main():
             )
             script = gr.File(label="Upload a text file")
 
-            preset = gr.Radio(
-                ["ultra_fast", "fast", "standard", "high_quality"],
-                value="fast",
-                label="Preset mode (determines quality with tradeoff over speed):",
-                type="value",
-            )
             voice = gr.Dropdown(
-                VOICE_OPTIONS, value="…
+                VOICE_OPTIONS, value="jane_eyre", label="Select voice:", type="value"
             )
             voice_b = gr.Dropdown(
                 VOICE_OPTIONS,
@@ -154,7 +143,6 @@ def main():
                     script,
                     voice,
                     voice_b,
-                    preset,
                     seed,
                     split_by_newline,
                 ],
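After these edits the Gradio UI no longer exposes a preset control; inference() always requests the cheapest preset. Assuming the call site is Tortoise's TextToSpeech.tts_with_preset (the function name sits outside the hunk context shown above), the call now reads:

gen = tts.tts_with_preset(
    text,
    voice_samples=voice_samples,
    conditioning_latents=conditioning_latents,
    preset="ultra_fast",  # hard-coded; the gr.Radio preset selector was removed
    k=1,
    use_deterministic_seed=seed,
)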
tortoise/api.py
CHANGED
@@ -252,13 +252,6 @@ class TextToSpeech:
                                           layer_drop=0, unconditioned_percentage=0).cpu().eval()
         self.diffusion.load_state_dict(torch.load(get_model_path('diffusion_decoder.pth', models_dir)))
 
-        self.clvp = CLVP(dim_text=768, dim_speech=768, dim_latent=768, num_text_tokens=256, text_enc_depth=20,
-                         text_seq_len=350, text_heads=12,
-                         num_speech_tokens=8192, speech_enc_depth=20, speech_heads=12, speech_seq_len=430,
-                         use_xformers=True).cpu().eval()
-        self.clvp.load_state_dict(torch.load(get_model_path('clvp2.pth', models_dir)))
-        self.cvvp = None  # CVVP model is only loaded if used.
-
         self.vocoder = UnivNetGenerator().cpu()
         self.vocoder.load_state_dict(torch.load(get_model_path('vocoder.pth', models_dir), map_location=torch.device('cpu'))['model_g'])
         self.vocoder.eval(inference=True)
@@ -272,13 +265,6 @@ class TextToSpeech:
             yield m
             m = model.cpu()
 
-
-    def load_cvvp(self):
-        """Load CVVP model."""
-        self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8, cond_mask_percentage=0,
-                         speech_enc_depth=8, speech_mask_percentage=0, latent_multiplier=1).cpu().eval()
-        self.cvvp.load_state_dict(torch.load(get_model_path('cvvp.pth', self.models_dir)))
-
     def get_conditioning_latents(self, voice_samples, return_mels=False):
         """
         Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
@@ -341,8 +327,9 @@ class TextToSpeech:
                     'cond_free_k': 2.0, 'diffusion_temperature': 1.0}
         # Presets are defined here.
         presets = {
-            'ultra_fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30},
-            'fast': {'num_autoregressive_samples': …, 'diffusion_iterations': …},
+            'ultra_fast': {'num_autoregressive_samples': 1, 'diffusion_iterations': 15},
+            # 'ultra_fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30},
+            'fast': {'num_autoregressive_samples': 32, 'diffusion_iterations': 50},
             'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 200},
             'high_quality': {'num_autoregressive_samples': 256, 'diffusion_iterations': 400},
         }
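The new 'ultra_fast' numbers cut the work sharply: one autoregressive candidate instead of 16, and 15 diffusion steps instead of 30 (the old setting is kept as a comment). A sketch of how a preset is consumed, assuming upstream Tortoise's tts_with_preset semantics (not shown in this diff):

def tts_with_preset(self, text, preset='fast', **kwargs):
    # Sketch based on upstream Tortoise: the preset dict overrides the
    # defaults, and explicit kwargs override the preset.
    settings = {'temperature': .8, 'length_penalty': 1.0, 'repetition_penalty': 2.0,
                'top_p': .8, 'cond_free_k': 2.0, 'diffusion_temperature': 1.0}
    settings.update(presets[preset])  # e.g. {'num_autoregressive_samples': 1, 'diffusion_iterations': 15}
    settings.update(kwargs)           # explicit overrides still win
    return self.tts(text, **settings)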
@@ -422,182 +409,57 @@ class TextToSpeech:
         diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k)
 
         with torch.no_grad():
-            samples = []
-            num_batches = num_autoregressive_samples // self.autoregressive_batch_size
+
             stop_mel_token = self.autoregressive.stop_mel_token
             calm_token = 83  # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output"
             if verbose:
                 print("Generating autoregressive samples..")
-            if not torch.backends.mps.is_available():
-                with self.temporary_cuda(self.autoregressive
-                ) as autoregressive, torch.autocast(device_type="cuda", dtype=torch.float16, enabled=self.half):
-                    for b in tqdm(range(num_batches), disable=not verbose):
-                        codes = autoregressive.inference_speech(auto_conditioning, text_tokens,
-                                                                do_sample=True,
-                                                                top_p=top_p,
-                                                                temperature=temperature,
-                                                                num_return_sequences=self.autoregressive_batch_size,
-                                                                length_penalty=length_penalty,
-                                                                repetition_penalty=repetition_penalty,
-                                                                max_generate_length=max_mel_tokens,
-                                                                **hf_generate_kwargs)
-                        padding_needed = max_mel_tokens - codes.shape[1]
-                        codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
-                        samples.append(codes)
-            else:
-                with self.temporary_cuda(self.autoregressive) as autoregressive:
-                    for b in tqdm(range(num_batches), disable=not verbose):
-                        codes = autoregressive.inference_speech(auto_conditioning, text_tokens,
-                                                                do_sample=True,
-                                                                top_p=top_p,
-                                                                temperature=temperature,
-                                                                num_return_sequences=self.autoregressive_batch_size,
-                                                                length_penalty=length_penalty,
-                                                                repetition_penalty=repetition_penalty,
-                                                                max_generate_length=max_mel_tokens,
-                                                                **hf_generate_kwargs)
-                        padding_needed = max_mel_tokens - codes.shape[1]
-                        codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
-                        samples.append(codes)
+            with self.temporary_cuda(self.autoregressive
+            ) as autoregressive, torch.autocast(device_type="cuda", dtype=torch.float16, enabled=self.half):
+                codes = autoregressive.inference_speech(auto_conditioning, text_tokens,
+                                                        do_sample=True,
+                                                        top_p=top_p,
+                                                        temperature=temperature,
+                                                        num_return_sequences=num_autoregressive_samples,
+                                                        length_penalty=length_penalty,
+                                                        repetition_penalty=repetition_penalty,
+                                                        max_generate_length=max_mel_tokens,
+                                                        **hf_generate_kwargs)
-
-            clip_results = []
-
-            if not torch.backends.mps.is_available():
-                with self.temporary_cuda(self.clvp) as clvp, torch.autocast(
-                    device_type="cuda" if not torch.backends.mps.is_available() else 'mps', dtype=torch.float16, enabled=self.half
-                ):
-                    if cvvp_amount > 0:
-                        if self.cvvp is None:
-                            self.load_cvvp()
-                        self.cvvp = self.cvvp.to(self.device)
-                    if verbose:
-                        if self.cvvp is None:
-                            print("Computing best candidates using CLVP")
-                        else:
-                            print(f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%")
-                    for batch in tqdm(samples, disable=not verbose):
-                        for i in range(batch.shape[0]):
-                            batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
-                        if cvvp_amount != 1:
-                            clvp_out = clvp(text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False)
-                        if auto_conds is not None and cvvp_amount > 0:
-                            cvvp_accumulator = 0
-                            for cl in range(auto_conds.shape[1]):
-                                cvvp_accumulator = cvvp_accumulator + self.cvvp(auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False)
-                            cvvp = cvvp_accumulator / auto_conds.shape[1]
-                            if cvvp_amount == 1:
-                                clip_results.append(cvvp)
-                            else:
-                                clip_results.append(cvvp * cvvp_amount + clvp_out * (1-cvvp_amount))
-                        else:
-                            clip_results.append(clvp_out)
-                    clip_results = torch.cat(clip_results, dim=0)
-                    samples = torch.cat(samples, dim=0)
-                    best_results = samples[torch.topk(clip_results, k=k).indices]
-            else:
-                with self.temporary_cuda(self.clvp) as clvp:
-                    if cvvp_amount > 0:
-                        if self.cvvp is None:
-                            self.load_cvvp()
-                        self.cvvp = self.cvvp.to(self.device)
-                    if verbose:
-                        if self.cvvp is None:
-                            print("Computing best candidates using CLVP")
-                        else:
-                            print(f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%")
-                    for batch in tqdm(samples, disable=not verbose):
-                        for i in range(batch.shape[0]):
-                            batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
-                        if cvvp_amount != 1:
-                            clvp_out = clvp(text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False)
-                        if auto_conds is not None and cvvp_amount > 0:
-                            cvvp_accumulator = 0
-                            for cl in range(auto_conds.shape[1]):
-                                cvvp_accumulator = cvvp_accumulator + self.cvvp(auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False)
-                            cvvp = cvvp_accumulator / auto_conds.shape[1]
-                            if cvvp_amount == 1:
-                                clip_results.append(cvvp)
-                            else:
-                                clip_results.append(cvvp * cvvp_amount + clvp_out * (1-cvvp_amount))
-                        else:
-                            clip_results.append(clvp_out)
-                    clip_results = torch.cat(clip_results, dim=0)
-                    samples = torch.cat(samples, dim=0)
-                    best_results = samples[torch.topk(clip_results, k=k).indices]
-            if self.cvvp is not None:
-                self.cvvp = self.cvvp.cpu()
-            del samples
-
             # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
             # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
             # results, but will increase memory usage.
-            if not torch.backends.mps.is_available():
-                with self.temporary_cuda(
-                    self.autoregressive
-                ) as autoregressive, torch.autocast(
-                    device_type="cuda", dtype=torch.float16, enabled=self.half
-                ):
-                    best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
-                                                  torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results,
-                                                  torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
-                                                  return_latent=True, clip_inputs=False)
-                    del auto_conditioning
-            else:
-                with self.temporary_cuda(
-                    self.autoregressive
-                ) as autoregressive:
-                    best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
-                                                  torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results,
-                                                  torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
-                                                  return_latent=True, clip_inputs=False)
-                    del auto_conditioning
+            with self.temporary_cuda(
+                self.autoregressive
+            ) as autoregressive, torch.autocast(
+                device_type="cuda" if not torch.backends.mps.is_available() else 'mps', dtype=torch.float16, enabled=self.half
+            ):
+                best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
+                                              torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), codes,
+                                              torch.tensor([codes.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
+                                              return_latent=True, clip_inputs=False)
+                del auto_conditioning
 
             if verbose:
                 print("Transforming autoregressive outputs into audio..")
             wav_candidates = []
-            if not torch.backends.mps.is_available():
-                with self.temporary_cuda(self.diffusion) as diffusion, self.temporary_cuda(
-                    self.vocoder
-                ) as vocoder:
-                    for b in range(best_results.shape[0]):
-                        codes = best_results[b].unsqueeze(0)
-                        latents = best_latents[b].unsqueeze(0)
-
-                        # Find the first occurrence of the "calm" token and trim the codes to that.
                         ctokens = 0
-                        for k in range(codes.shape[-1]):
-                            if codes[0, k] == calm_token:
-                                ctokens += 1
-                            else:
-                                ctokens = 0
-                            if ctokens > 8:  # 8 tokens gives the diffusion model some "breathing room" to terminate speech.
-                                latents = latents[:, :k]
-                                break
-                        mel = do_spectrogram_diffusion(diffusion, diffuser, latents, diffusion_conditioning, temperature=diffusion_temperature,
-                                                       verbose=verbose)
-                        wav = vocoder.inference(mel)
-                        wav_candidates.append(wav.cpu())
-            else:
-                diffusion, vocoder = self.diffusion, self.vocoder
-                diffusion_conditioning = diffusion_conditioning.cpu()
-                for b in range(best_results.shape[0]):
-                    codes = best_results[b].unsqueeze(0).cpu()
-                    latents = best_latents[b].unsqueeze(0).cpu()
-
-                    # Find the first occurrence of the "calm" token and trim the codes to that.
-                    ctokens = 0
-                    for k in range(codes.shape[-1]):
-                        if codes[0, k] == calm_token:
-                            ctokens += 1
-                        else:
-                            ctokens = 0
-                        if ctokens > 8:  # 8 tokens gives the diffusion model some "breathing room" to terminate speech.
-                            latents = latents[:, :k]
-                            break
-                    mel = do_spectrogram_diffusion(diffusion, diffuser, latents, diffusion_conditioning, temperature=diffusion_temperature,
-                                                   verbose=verbose)
-                    wav = vocoder.inference(mel)
-                    wav_candidates.append(wav.cpu())
+            with self.temporary_cuda(self.diffusion) as diffusion, self.temporary_cuda(
+                self.vocoder
+            ) as vocoder:
+                latents = best_latents
+                # Find the first occurrence of the "calm" token and trim the codes to that.
+                ctokens = 0
+                for k in range(codes.shape[-1]):
+                    if codes[0, k] == calm_token:
+                        ctokens += 1
+                    else:
+                        ctokens = 0
+                    if ctokens > 8:  # 8 tokens gives the diffusion model some "breathing room" to terminate speech.
+                        latents = latents[:, :k]
+                        break
+                mel = do_spectrogram_diffusion(diffusion, diffuser, latents, diffusion_conditioning, temperature=diffusion_temperature,
+                                               verbose=verbose)
+                wav = vocoder.inference(mel)
+                wav_candidates.append(wav.cpu())
 
         def potentially_redact(clip, text):
             if self.enable_redaction:
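Net effect in api.py: the multi-batch sample / CLVP-rerank / top-k pipeline collapses into a single pass per call. A condensed sketch of the new control flow (method and variable names from the diff above; tensor plumbing and most generation kwargs elided for brevity):

with torch.no_grad():
    # 1. One batch of candidate mel codes; no num_batches loop, no CLVP/CVVP reranking.
    with self.temporary_cuda(self.autoregressive) as ar:
        codes = ar.inference_speech(auto_conditioning, text_tokens, do_sample=True,
                                    num_return_sequences=num_autoregressive_samples,
                                    max_generate_length=max_mel_tokens)
    # 2. Re-run the AR model on those codes to recover the hidden-layer latents
    #    the diffusion decoder conditions on (previously done only for the top-k).
    with self.temporary_cuda(self.autoregressive) as ar:
        best_latents = ar(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
                          torch.tensor([text_tokens.shape[-1]]), codes,
                          torch.tensor([codes.shape[-1] * ar.mel_length_compression]),
                          return_latent=True, clip_inputs=False)
    # 3. Diffusion decoder plus UnivNet vocoder turn the latents into a waveform.
    with self.temporary_cuda(self.diffusion) as diffusion, self.temporary_cuda(self.vocoder) as vocoder:
        mel = do_spectrogram_diffusion(diffusion, diffuser, best_latents, diffusion_conditioning)
        wav_candidates = [vocoder.inference(mel).cpu()]

With num_autoregressive_samples=1 and k=1 these shapes line up one-to-one; in the deleted code, this stage instead consumed best_results, the CLVP-selected top-k of many sampled batches.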