Spaces: Running on T4
Remove CLVP for lower GPU usage and increased speed.

Files changed:
- app.py: +3 -15
- tortoise/api.py: +42 -180
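For context: in upstream Tortoise, CLVP is a CLIP-style model that scores candidate autoregressive mel-code sequences against the input text so the best of many samples can be kept; the api.py hunks below delete exactly that reranking. A minimal sketch of the step being removed (names taken from the diff; `rerank_with_clvp` is a hypothetical helper, not a function in the repo):

import torch

def rerank_with_clvp(clvp, text_tokens, samples, k=1):
    """Hypothetical condensation of the deleted loop: score each batch of
    candidate mel codes against the text and keep the top-k sequences."""
    scores = []
    for batch in samples:  # batch: (batch_size, seq_len) codes from inference_speech
        scores.append(clvp(text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False))
    scores = torch.cat(scores, dim=0)
    samples = torch.cat(samples, dim=0)
    return samples[torch.topk(scores, k=k).indices]

With the 'ultra_fast' preset now generating a single candidate (num_autoregressive_samples: 1), there is nothing to rank, so CLVP (and the optional CVVP mix-in) can be dropped along with the clvp2.pth weights it loads.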
app.py
CHANGED
@@ -40,7 +40,6 @@ VOICE_OPTIONS = [
     "william",
     "jane_eyre",
     "random",  # special option for random voice
-    "disabled",  # special option for disabled voice
 ]
 
 
@@ -49,7 +48,6 @@ def inference(
     script,
     voice,
     voice_b,
-    preset,
     seed,
     split_by_newline,
 ):
@@ -81,7 +79,7 @@ def inference(
         text,
         voice_samples=voice_samples,
         conditioning_latents=conditioning_latents,
-        preset=preset,
+        preset="ultra_fast",
         k=1,
         use_deterministic_seed=seed,
     )
@@ -91,12 +89,9 @@ def inference(
 
     full_audio = torch.cat(all_parts, dim=-1)
 
-    # os.makedirs("outputs", exist_ok=True)
-    # torchaudio.save(os.path.join("outputs", f"{name}.wav"), full_audio, 24000)
-
     with open("Tortoise_TTS_Runs_Scripts.log", "a") as f:
         f.write(
-            f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | …
+            f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Time Taken (s): {time.time()-start_time} | Seed: {seed}\n"
         )
 
     output_texts = [f"({j+1}) {texts[j]}" for j in range(len(texts))]
@@ -120,14 +115,8 @@ def main():
             )
             script = gr.File(label="Upload a text file")
 
-            preset = gr.Radio(
-                ["ultra_fast", "fast", "standard", "high_quality"],
-                value="fast",
-                label="Preset mode (determines quality with tradeoff over speed):",
-                type="value",
-            )
             voice = gr.Dropdown(
-                VOICE_OPTIONS, value="…
+                VOICE_OPTIONS, value="jane_eyre", label="Select voice:", type="value"
             )
             voice_b = gr.Dropdown(
                 VOICE_OPTIONS,
@@ -154,7 +143,6 @@ def main():
                     script,
                     voice,
                     voice_b,
-                    preset,
                     seed,
                     split_by_newline,
                 ],
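After these edits the Gradio UI no longer exposes a preset control; inference() always requests the cheapest preset. Assuming the call site is Tortoise's TextToSpeech.tts_with_preset (the function name sits outside the hunk context shown above), the call now reads:

gen = tts.tts_with_preset(
    text,
    voice_samples=voice_samples,
    conditioning_latents=conditioning_latents,
    preset="ultra_fast",  # hard-coded; the gr.Radio preset selector was removed
    k=1,
    use_deterministic_seed=seed,
)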
tortoise/api.py
CHANGED
@@ -252,13 +252,6 @@ class TextToSpeech:
                                           layer_drop=0, unconditioned_percentage=0).cpu().eval()
         self.diffusion.load_state_dict(torch.load(get_model_path('diffusion_decoder.pth', models_dir)))
 
-        self.clvp = CLVP(dim_text=768, dim_speech=768, dim_latent=768, num_text_tokens=256, text_enc_depth=20,
-                         text_seq_len=350, text_heads=12,
-                         num_speech_tokens=8192, speech_enc_depth=20, speech_heads=12, speech_seq_len=430,
-                         use_xformers=True).cpu().eval()
-        self.clvp.load_state_dict(torch.load(get_model_path('clvp2.pth', models_dir)))
-        self.cvvp = None  # CVVP model is only loaded if used.
-
         self.vocoder = UnivNetGenerator().cpu()
         self.vocoder.load_state_dict(torch.load(get_model_path('vocoder.pth', models_dir), map_location=torch.device('cpu'))['model_g'])
         self.vocoder.eval(inference=True)
@@ -272,13 +265,6 @@ class TextToSpeech:
             yield m
             m = model.cpu()
 
-
-    def load_cvvp(self):
-        """Load CVVP model."""
-        self.cvvp = CVVP(model_dim=512, transformer_heads=8, dropout=0, mel_codes=8192, conditioning_enc_depth=8, cond_mask_percentage=0,
-                         speech_enc_depth=8, speech_mask_percentage=0, latent_multiplier=1).cpu().eval()
-        self.cvvp.load_state_dict(torch.load(get_model_path('cvvp.pth', self.models_dir)))
-
     def get_conditioning_latents(self, voice_samples, return_mels=False):
         """
         Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
@@ -341,8 +327,9 @@ class TextToSpeech:
                     'cond_free_k': 2.0, 'diffusion_temperature': 1.0}
         # Presets are defined here.
         presets = {
-            'ultra_fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30},
-            'fast': {'num_autoregressive_samples': …, 'diffusion_iterations': …},
+            'ultra_fast': {'num_autoregressive_samples': 1, 'diffusion_iterations': 15},
+            # 'ultra_fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30},
+            'fast': {'num_autoregressive_samples': 32, 'diffusion_iterations': 50},
             'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 200},
             'high_quality': {'num_autoregressive_samples': 256, 'diffusion_iterations': 400},
         }
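The new 'ultra_fast' numbers cut the work sharply: one autoregressive candidate instead of 16, and 15 diffusion steps instead of 30 (the old setting is kept as a comment). A sketch of how a preset is consumed, assuming upstream Tortoise's tts_with_preset semantics (not shown in this diff):

def tts_with_preset(self, text, preset='fast', **kwargs):
    # Sketch based on upstream Tortoise: the preset dict overrides the
    # defaults, and explicit kwargs override the preset.
    settings = {'temperature': .8, 'length_penalty': 1.0, 'repetition_penalty': 2.0,
                'top_p': .8, 'cond_free_k': 2.0, 'diffusion_temperature': 1.0}
    settings.update(presets[preset])  # e.g. {'num_autoregressive_samples': 1, 'diffusion_iterations': 15}
    settings.update(kwargs)           # explicit overrides still win
    return self.tts(text, **settings)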
@@ -422,182 +409,57 @@ class TextToSpeech:
         diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k)
 
         with torch.no_grad():
-            samples = []
-            num_batches = num_autoregressive_samples // self.autoregressive_batch_size
+
             stop_mel_token = self.autoregressive.stop_mel_token
             calm_token = 83  # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output"
             if verbose:
                 print("Generating autoregressive samples..")
-            if not torch.backends.mps.is_available():
-                with self.temporary_cuda(self.autoregressive
-                ) as autoregressive, torch.autocast(device_type="cuda", dtype=torch.float16, enabled=self.half):
-                    for b in tqdm(range(num_batches), disable=not verbose):
-                        codes = autoregressive.inference_speech(auto_conditioning, text_tokens,
-                                                                do_sample=True,
-                                                                top_p=top_p,
-                                                                temperature=temperature,
-                                                                num_return_sequences=self.autoregressive_batch_size,
-                                                                length_penalty=length_penalty,
-                                                                repetition_penalty=repetition_penalty,
-                                                                max_generate_length=max_mel_tokens,
-                                                                **hf_generate_kwargs)
-                        padding_needed = max_mel_tokens - codes.shape[1]
-                        codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
-                        samples.append(codes)
-            else:
-                with self.temporary_cuda(self.autoregressive) as autoregressive:
-                    for b in tqdm(range(num_batches), disable=not verbose):
-                        codes = autoregressive.inference_speech(auto_conditioning, text_tokens,
-                                                                do_sample=True,
-                                                                top_p=top_p,
-                                                                temperature=temperature,
-                                                                num_return_sequences=self.autoregressive_batch_size,
-                                                                length_penalty=length_penalty,
-                                                                repetition_penalty=repetition_penalty,
-                                                                max_generate_length=max_mel_tokens,
-                                                                **hf_generate_kwargs)
-                        padding_needed = max_mel_tokens - codes.shape[1]
-                        codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
-                        samples.append(codes)
+            with self.temporary_cuda(self.autoregressive
+            ) as autoregressive, torch.autocast(device_type="cuda", dtype=torch.float16, enabled=self.half):
+                codes = autoregressive.inference_speech(auto_conditioning, text_tokens,
+                                                        do_sample=True,
+                                                        top_p=top_p,
+                                                        temperature=temperature,
+                                                        num_return_sequences=num_autoregressive_samples,
+                                                        length_penalty=length_penalty,
+                                                        repetition_penalty=repetition_penalty,
+                                                        max_generate_length=max_mel_tokens,
+                                                        **hf_generate_kwargs)
-
-            clip_results = []
-
-            if not torch.backends.mps.is_available():
-                with self.temporary_cuda(self.clvp) as clvp, torch.autocast(
-                    device_type="cuda" if not torch.backends.mps.is_available() else 'mps', dtype=torch.float16, enabled=self.half
-                ):
-                    if cvvp_amount > 0:
-                        if self.cvvp is None:
-                            self.load_cvvp()
-                        self.cvvp = self.cvvp.to(self.device)
-                    if verbose:
-                        if self.cvvp is None:
-                            print("Computing best candidates using CLVP")
-                        else:
-                            print(f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%")
-                    for batch in tqdm(samples, disable=not verbose):
-                        for i in range(batch.shape[0]):
-                            batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
-                        if cvvp_amount != 1:
-                            clvp_out = clvp(text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False)
-                        if auto_conds is not None and cvvp_amount > 0:
-                            cvvp_accumulator = 0
-                            for cl in range(auto_conds.shape[1]):
-                                cvvp_accumulator = cvvp_accumulator + self.cvvp(auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False)
-                            cvvp = cvvp_accumulator / auto_conds.shape[1]
-                            if cvvp_amount == 1:
-                                clip_results.append(cvvp)
-                            else:
-                                clip_results.append(cvvp * cvvp_amount + clvp_out * (1-cvvp_amount))
-                        else:
-                            clip_results.append(clvp_out)
-                    clip_results = torch.cat(clip_results, dim=0)
-                    samples = torch.cat(samples, dim=0)
-                    best_results = samples[torch.topk(clip_results, k=k).indices]
-            else:
-                with self.temporary_cuda(self.clvp) as clvp:
-                    if cvvp_amount > 0:
-                        if self.cvvp is None:
-                            self.load_cvvp()
-                        self.cvvp = self.cvvp.to(self.device)
-                    if verbose:
-                        if self.cvvp is None:
-                            print("Computing best candidates using CLVP")
-                        else:
-                            print(f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%")
-                    for batch in tqdm(samples, disable=not verbose):
-                        for i in range(batch.shape[0]):
-                            batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
-                        if cvvp_amount != 1:
-                            clvp_out = clvp(text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False)
-                        if auto_conds is not None and cvvp_amount > 0:
-                            cvvp_accumulator = 0
-                            for cl in range(auto_conds.shape[1]):
-                                cvvp_accumulator = cvvp_accumulator + self.cvvp(auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False)
-                            cvvp = cvvp_accumulator / auto_conds.shape[1]
-                            if cvvp_amount == 1:
-                                clip_results.append(cvvp)
-                            else:
-                                clip_results.append(cvvp * cvvp_amount + clvp_out * (1-cvvp_amount))
-                        else:
-                            clip_results.append(clvp_out)
-                    clip_results = torch.cat(clip_results, dim=0)
-                    samples = torch.cat(samples, dim=0)
-                    best_results = samples[torch.topk(clip_results, k=k).indices]
-            if self.cvvp is not None:
-                self.cvvp = self.cvvp.cpu()
-            del samples
-
             # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
             # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
             # results, but will increase memory usage.
-            if not torch.backends.mps.is_available():
-                with self.temporary_cuda(
-                    self.autoregressive
-                ) as autoregressive, torch.autocast(
-                    device_type="cuda", dtype=torch.float16, enabled=self.half
-                ):
-                    best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
-                                                  torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results,
-                                                  torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
-                                                  return_latent=True, clip_inputs=False)
-                    del auto_conditioning
-            else:
-                with self.temporary_cuda(
-                    self.autoregressive
-                ) as autoregressive:
-                    best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
-                                                  torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results,
-                                                  torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
-                                                  return_latent=True, clip_inputs=False)
-                    del auto_conditioning
+            with self.temporary_cuda(
+                self.autoregressive
+            ) as autoregressive, torch.autocast(
+                device_type="cuda" if not torch.backends.mps.is_available() else 'mps', dtype=torch.float16, enabled=self.half
+            ):
+                best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
+                                              torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), codes,
+                                              torch.tensor([codes.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
+                                              return_latent=True, clip_inputs=False)
+                del auto_conditioning
 
             if verbose:
                 print("Transforming autoregressive outputs into audio..")
             wav_candidates = []
-            if not torch.backends.mps.is_available():
-                with self.temporary_cuda(self.diffusion) as diffusion, self.temporary_cuda(
-                    self.vocoder
-                ) as vocoder:
-                    for b in range(best_results.shape[0]):
-                        codes = best_results[b].unsqueeze(0)
-                        latents = best_latents[b].unsqueeze(0)
-
-                        # Find the first occurrence of the "calm" token and trim the codes to that.
                         ctokens = 0
-                        for k in range(codes.shape[-1]):
-                            if codes[0, k] == calm_token:
-                                ctokens += 1
-                            else:
-                                ctokens = 0
-                            if ctokens > 8:  # 8 tokens gives the diffusion model some "breathing room" to terminate speech.
-                                latents = latents[:, :k]
-                                break
-                        mel = do_spectrogram_diffusion(diffusion, diffuser, latents, diffusion_conditioning, temperature=diffusion_temperature,
-                                                       verbose=verbose)
-                        wav = vocoder.inference(mel)
-                        wav_candidates.append(wav.cpu())
-            else:
-                diffusion, vocoder = self.diffusion, self.vocoder
-                diffusion_conditioning = diffusion_conditioning.cpu()
-                for b in range(best_results.shape[0]):
-                    codes = best_results[b].unsqueeze(0).cpu()
-                    latents = best_latents[b].unsqueeze(0).cpu()
-
-                    # Find the first occurrence of the "calm" token and trim the codes to that.
-                    ctokens = 0
-                    for k in range(codes.shape[-1]):
-                        if codes[0, k] == calm_token:
-                            ctokens += 1
-                        else:
-                            ctokens = 0
-                        if ctokens > 8:  # 8 tokens gives the diffusion model some "breathing room" to terminate speech.
-                            latents = latents[:, :k]
-                            break
-                    mel = do_spectrogram_diffusion(diffusion, diffuser, latents, diffusion_conditioning, temperature=diffusion_temperature,
-                                                   verbose=verbose)
-                    wav = vocoder.inference(mel)
-                    wav_candidates.append(wav.cpu())
+            with self.temporary_cuda(self.diffusion) as diffusion, self.temporary_cuda(
+                self.vocoder
+            ) as vocoder:
+                latents = best_latents
+                # Find the first occurrence of the "calm" token and trim the codes to that.
+                ctokens = 0
+                for k in range(codes.shape[-1]):
+                    if codes[0, k] == calm_token:
+                        ctokens += 1
+                    else:
+                        ctokens = 0
+                    if ctokens > 8:  # 8 tokens gives the diffusion model some "breathing room" to terminate speech.
+                        latents = latents[:, :k]
+                        break
+                mel = do_spectrogram_diffusion(diffusion, diffuser, latents, diffusion_conditioning, temperature=diffusion_temperature,
+                                               verbose=verbose)
+                wav = vocoder.inference(mel)
+                wav_candidates.append(wav.cpu())
 
         def potentially_redact(clip, text):
             if self.enable_redaction:
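Net effect in api.py: the multi-batch sample / CLVP-rerank / top-k pipeline collapses into a single pass per call. A condensed sketch of the new control flow (method and variable names from the diff above; tensor plumbing and most generation kwargs elided for brevity):

with torch.no_grad():
    # 1. One batch of candidate mel codes; no num_batches loop, no CLVP/CVVP reranking.
    with self.temporary_cuda(self.autoregressive) as ar:
        codes = ar.inference_speech(auto_conditioning, text_tokens, do_sample=True,
                                    num_return_sequences=num_autoregressive_samples,
                                    max_generate_length=max_mel_tokens)
    # 2. Re-run the AR model on those codes to recover the hidden-layer latents
    #    the diffusion decoder conditions on (previously done only for the top-k).
    with self.temporary_cuda(self.autoregressive) as ar:
        best_latents = ar(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
                          torch.tensor([text_tokens.shape[-1]]), codes,
                          torch.tensor([codes.shape[-1] * ar.mel_length_compression]),
                          return_latent=True, clip_inputs=False)
    # 3. Diffusion decoder plus UnivNet vocoder turn the latents into a waveform.
    with self.temporary_cuda(self.diffusion) as diffusion, self.temporary_cuda(self.vocoder) as vocoder:
        mel = do_spectrogram_diffusion(diffusion, diffuser, best_latents, diffusion_conditioning)
        wav_candidates = [vocoder.inference(mel).cpu()]

With num_autoregressive_samples=1 and k=1 these shapes line up one-to-one; in the deleted code, this stage instead consumed best_results, the CLVP-selected top-k of many sampled batches.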