import os
import random

import gradio as gr
import numpy as np
import soundfile as sf
import spaces
import torch

from consistencytta import ConsistencyTTA


def seed_all(seed):
    """Seed all random number generators for reproducible generation."""
    seed = int(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


device = torch.device(
    "cuda:0" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)
sr = 16000  # ConsistencyTTA generates 16-kHz audio

# Build the ConsistencyTTA model and freeze it for inference
consistencytta = ConsistencyTTA().to(device)
consistencytta.eval()
consistencytta.requires_grad_(False)


@spaces.GPU()
def generate(prompt: str, seed: str = '', cfg_weight: float = 4.):
    """Generate audio from a given prompt.

    Args:
        prompt (str): Text prompt to generate audio from.
        seed (str, optional): Random seed. Defaults to '', which means no seed.
        cfg_weight (float, optional): Classifier-free guidance strength.
            Defaults to 4.
    """
    if seed != '':
        try:
            seed_all(int(seed))
        except ValueError:
            pass  # Ignore seeds that cannot be parsed as integers

    with torch.no_grad():
        with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
            # A single forward pass (num_steps=1) replaces the iterative
            # diffusion sampling loop; this is the consistency-model speedup
            wav = consistencytta(
                [prompt], num_steps=1, cfg_scale_input=cfg_weight,
                cfg_scale_post=1., sr=sr
            )
    sf.write("output.wav", wav.T, samplerate=sr, subtype='PCM_16')
    return "output.wav"


# Generate test audio
print("Generating test audio...")
generate("A dog barks as a train passes by.", seed='1')
print("Test audio generated successfully! Starting Gradio interface...")

# Launch Gradio interface
iface = gr.Interface(
    fn=generate,
    inputs=[
        gr.Textbox(
            label="Text",
            value="Several people cheer and scream and speak as water flows hard."
        ),
        gr.Textbox(label="Random Seed (Optional)", value=''),
        gr.Slider(
            minimum=0., maximum=8., value=3.5,
            label="Classifier-Free Guidance Strength"
        )
    ],
    outputs="audio",
    title="ConsistencyTTA: Accelerating Diffusion-Based Text-to-Audio "
          "Generation with Consistency Distillation",
    description="This is the official demo page for ConsistencyTTA, a model that accelerates "
                "diffusion-based text-to-audio generation hundreds of times with consistency "
                "models. Here, the audio is generated within a single non-autoregressive "
                "forward pass from the CLAP-finetuned ConsistencyTTA checkpoint. Since "
                "the training dataset does not include speech, the model is not expected to "
                "generate coherent speech. Have fun!"
)
iface.launch()
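
# Usage sketch (assumptions: this file is saved as app.py next to
# consistencytta.py, with torch, gradio, soundfile, and spaces installed):
#
#     python app.py
#
# Gradio serves the demo locally (http://127.0.0.1:7860 by default); passing
# `share=True` to `iface.launch()` above would also create a temporary
# public link.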