Spaces:

Flux9665
/

EnglishToucan

Running on Zero

File size: 3,053 Bytes

6faeba1
 
 
 
 
6a79837
6faeba1
 
 
 
 
 
 
 
 
 
 
 
6a79837
6faeba1
 
 
6a79837
 
6faeba1
 
 
6a79837
6faeba1
 
 
 
 
 
 
 
 
 
 
 
 
 
6a79837
23208c6
 
 
 
 
 
 
 
 
 
6a79837
6faeba1
 
 
c255993
6faeba1
6a79837
6faeba1

import os

import torch

from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface
from Modules.ControllabilityGAN.GAN import GanWrapper
from Utility.storage_config import MODELS_DIR


class ControllableInterface:
    """Couple a ``ToucanTTSInterface`` with a ``GanWrapper`` behind one ``read`` call.

    The speaker embedding used for synthesis is taken either from a reference
    audio or from the GAN wrapper, steered by a seed and six slider values.
    """

    def __init__(self, gpu_id="cpu", available_artificial_voices=1000):
        """Configure CUDA visibility via environment variables and load both models.

        Args:
            gpu_id: ``"cpu"`` hides all CUDA devices and selects the CPU; any
                other value is written verbatim into ``CUDA_VISIBLE_DEVICES``
                and the device string becomes ``"cuda"``.
            available_artificial_voices: stored on the instance unchanged; it
                is not used anywhere else in this class.
        """
        if gpu_id == "cpu":
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
            self.device = "cpu"
        else:
            os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
            os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"
            self.device = "cuda"

        # The TTS model is built first, then the embedding GAN.
        self.model = ToucanTTSInterface(device=self.device, tts_model_path="Meta")
        gan_checkpoint = os.path.join(MODELS_DIR, "Embedding", "embedding_gan.pt")
        self.wgan = GanWrapper(gan_checkpoint, device=self.device)

        # Bookkeeping attributes; only initialised here.
        self.generated_speaker_embeds = []
        self.available_artificial_voices = available_artificial_voices
        self.current_language = ""
        self.current_accent = ""

    def read(self,
             prompt,
             reference_audio,
             voice_seed,
             prosody_creativity,
             duration_scaling_factor,
             pause_duration_scaling_factor,
             pitch_variance_scale,
             energy_variance_scale,
             emb_slider_1,
             emb_slider_2,
             emb_slider_3,
             emb_slider_4,
             emb_slider_5,
             emb_slider_6,
             loudness_in_db
             ):
        """Synthesise ``prompt`` and return ``(sr, wav, fig)``.

        Args:
            prompt: text handed to the TTS model (``input_is_phones=False``).
            reference_audio: when not ``None`` it is passed to the model's
                ``set_utterance_embedding``; when ``None`` the embedding is
                produced by the GAN wrapper instead.
            voice_seed: forwarded to ``wgan.set_latent`` (GAN branch only).
            prosody_creativity, duration_scaling_factor,
            pause_duration_scaling_factor, pitch_variance_scale,
            energy_variance_scale, loudness_in_db: forwarded to the model call
                under the same keyword names.
            emb_slider_1 .. emb_slider_6: packed, in order, into a float32
                tensor that is given to ``wgan.modify_embed``.

        Returns:
            The model's three outputs reordered as ``(sr, wav, fig)``. The
            model is called with ``return_plot_as_filepath=True``, so ``fig``
            is presumably a file path -- confirm against ToucanTTSInterface.
        """
        if reference_audio is not None:
            self.model.set_utterance_embedding(reference_audio)
        else:
            self.wgan.set_latent(voice_seed)
            slider_values = [
                emb_slider_1,
                emb_slider_2,
                emb_slider_3,
                emb_slider_4,
                emb_slider_5,
                emb_slider_6,
            ]
            controllability_vector = torch.tensor(slider_values, dtype=torch.float32)
            self.model.set_utterance_embedding(
                embedding=self.wgan.modify_embed(controllability_vector)
            )

        # Over-long inputs are not synthesised; a fixed notice is spoken instead.
        too_long_notice = "Your input was too long. Please try either a shorter text or split it into several parts."
        phone_count = len(self.model.text2phone.get_phone_string(prompt))
        if phone_count > 1800:
            prompt = too_long_notice

        print(prompt + "\n\n")

        synthesis_options = {
            "input_is_phones": False,
            "duration_scaling_factor": duration_scaling_factor,
            "pitch_variance_scale": pitch_variance_scale,
            "energy_variance_scale": energy_variance_scale,
            "pause_duration_scaling_factor": pause_duration_scaling_factor,
            "return_plot_as_filepath": True,
            "prosody_creativity": prosody_creativity,
            "loudness_in_db": loudness_in_db,
        }
        wav, sr, fig = self.model(prompt, **synthesis_options)
        return sr, wav, fig