Music-TTA

Runtime error

File size: 6,280 Bytes

ffead1e
 
 
559b00c
ffead1e
 
 
 
 
 
559b00c
 
 
 
 
 
 
 
ffead1e
 
559b00c
ffead1e
559b00c
ffead1e
 
 
 
 
 
 
 
 
559b00c
 
 
ffead1e
 
 
 
 
 
 
 
 
 
 
559b00c
ffead1e
 
 
 
 
 
559b00c
 
ffead1e
559b00c
ffead1e
 
 
 
559b00c
 
ffead1e
 
559b00c
ffead1e
559b00c
ffead1e
 
 
 
 
559b00c
ffead1e
 
a258601
559b00c
 
 
 
fa7808c
ffead1e
 
fa7808c
ffead1e
05118bf
 
 
559b00c
 
 
 
 
 
05118bf
559b00c
 
05118bf
559b00c
 
 
 
 
 
 
05118bf
 
 
 
ffead1e
05118bf
fa7808c
05118bf
ffead1e
05118bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7b7c424
05118bf

import gradio as gr
import json
import torch

from tqdm import tqdm
from huggingface_hub import snapshot_download
from models import AudioDiffusion, DDPMScheduler
from audioldm.audio.stft import TacotronSTFT
from audioldm.variational_autoencoder import AutoencoderKL
from gradio import Markdown

# Automatic device detection
# Resolved once at import time: `device_type` is the bare backend name and
# `device_selection` the concrete device string (first GPU when CUDA is usable).
device_type, device_selection = (
    ("cuda", "cuda:0") if torch.cuda.is_available() else ("cpu", "cpu")
)

class Tango:
    """Text-to-audio generator built from a pretrained Tango checkpoint.

    On construction the checkpoint snapshot is downloaded from the Hugging
    Face Hub, the VAE, STFT and diffusion model are built from the JSON
    configs found in that snapshot, their weights are loaded, and all three
    modules are switched to evaluation mode.
    """

    def __init__(self, name="declare-lab/tango2", device=device_selection):
        """Download the checkpoint `name` and load its modules onto `device`.

        Args:
            name: Hugging Face Hub repository id of the checkpoint.
            device: Device the modules are moved to and the weights are
                mapped onto while loading.
        """
        path = snapshot_download(repo_id=name)

        vae_config = self._read_json("{}/vae_config.json".format(path))
        stft_config = self._read_json("{}/stft_config.json".format(path))
        main_config = self._read_json("{}/main_config.json".format(path))

        self.vae = AutoencoderKL(**vae_config).to(device)
        self.stft = TacotronSTFT(**stft_config).to(device)
        self.model = AudioDiffusion(**main_config).to(device)

        # NOTE(review): torch.load unpickles the downloaded files, which can
        # execute arbitrary code. Only point `name` at trusted repositories,
        # or confirm the checkpoints load with `weights_only=True`.
        vae_weights = torch.load("{}/pytorch_model_vae.bin".format(path), map_location=device)
        stft_weights = torch.load("{}/pytorch_model_stft.bin".format(path), map_location=device)
        main_weights = torch.load("{}/pytorch_model_main.bin".format(path), map_location=device)

        self.vae.load_state_dict(vae_weights)
        self.stft.load_state_dict(stft_weights)
        self.model.load_state_dict(main_weights)

        print("Successfully loaded checkpoint from:", name)

        # Inference only: put every module in evaluation mode.
        self.vae.eval()
        self.stft.eval()
        self.model.eval()

        self.scheduler = DDPMScheduler.from_pretrained(main_config["scheduler_name"], subfolder="scheduler")

    @staticmethod
    def _read_json(file_path):
        """Parse one JSON file and close its handle before returning."""
        with open(file_path, encoding="utf-8") as handle:
            return json.load(handle)

    def chunks(self, lst, n):
        """Yield successive n-sized chunks from a list.

        The last chunk is shorter when ``len(lst)`` is not a multiple of `n`.
        """
        for start in range(0, len(lst), n):
            yield lst[start:start + n]

    def generate(self, prompt, steps=100, guidance=3, samples=1, disable_progress=True):
        """Generate audio for a single prompt string.

        Args:
            prompt: Text prompt; it is passed to the model as a one-item list.
            steps: Forwarded to ``self.model.inference``.
            guidance: Forwarded to ``self.model.inference``.
            samples: Forwarded to ``self.model.inference``.
            disable_progress: Forwarded to ``self.model.inference``.

        Returns:
            The first decoded waveform only, whatever `samples` is.
        """
        with torch.no_grad():
            latents = self.model.inference(
                [prompt], self.scheduler, steps, guidance, samples, disable_progress=disable_progress
            )
            mel = self.vae.decode_first_stage(latents)
            wave = self.vae.decode_to_waveform(mel)
        return wave[0]

    def generate_for_batch(self, prompts, steps=200, guidance=3, samples=1, batch_size=8, disable_progress=True):
        """Generate audio for a list of prompt strings.

        The prompts are processed in slices of `batch_size`.

        Returns:
            A flat list of decoded waveforms when ``samples == 1``; otherwise
            the same waveforms grouped into consecutive lists of `samples`
            items each.
        """
        outputs = []
        for start in tqdm(range(0, len(prompts), batch_size)):
            batch = prompts[start:start + batch_size]
            with torch.no_grad():
                latents = self.model.inference(
                    batch, self.scheduler, steps, guidance, samples, disable_progress=disable_progress
                )
                mel = self.vae.decode_first_stage(latents)
                wave = self.vae.decode_to_waveform(mel)
            outputs.extend(wave)
        if samples == 1:
            return outputs
        return list(self.chunks(outputs, samples))

# Initialize TANGO

# The checkpoint is loaded on the CPU first; the three networks are then
# relocated, in order, to whichever backend was detected at import time.
tango = Tango(device="cpu")
for _network in (tango.vae, tango.stft, tango.model):
    _network.to(device_type)

def gradio_generate(prompt, steps, guidance):
    """Generate audio for the Gradio UI from a text prompt.

    Args:
        prompt: Text description of the sound to synthesize.
        steps: Number of denoising steps, forwarded to ``tango.generate``.
        guidance: Guidance scale, forwarded to ``tango.generate``.

    Returns:
        A ``(sample_rate, waveform)`` tuple, a value format that a
        ``gr.Audio`` output component accepts directly.
    """
    sample_rate = 16000
    output_wave = tango.generate(prompt, steps, guidance)
    # Hand the samples straight to the gr.Audio output. gr.make_waveform
    # rendered a video file instead, and is deprecated in recent Gradio
    # releases.
    # NOTE(review): assumes `output_wave` is a NumPy array, which the previous
    # gr.make_waveform call required as well; confirm.
    return sample_rate, output_wave

# Gradio interface
# The page is declared top to bottom inside one Blocks context: intro text,
# the three inputs, the trigger button, the audio output and the examples.
with gr.Blocks() as interface:
    gr.Markdown("""
        <p style="text-align: center;">
        <b><big><big><big>Text-to-Audio</big></big></big></b>
        <br/>Generates an audio file, freely, without account, without watermark, that you can download.
        </p>
        <br/>
        <br/>
        ✨ Powered by <i>Tango 2</i> AI.
        <br/>
        <ul>
        <li>If you need to generate <b>music</b>, I recommend to use <i>MusicGen</i>,</li>
        </ul>
        <br/>
        🐌 Slow process... Your computer must <b><u>not</u></b> enter into standby mode.<br/>You can duplicate this space on a free account, it works on CPU.<br/>
        <a href='https://huggingface.co/spaces/Fabrice-TIERCELIN/Text-to-Audio?duplicate=true'><img src='https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14'></a>
        <br/>
        ⚖️ You can use, modify and share the generated sounds but not for commercial uses.
        """
    )

    # Generation parameters exposed to the user.
    input_text = gr.Textbox(label="Prompt", value="Snort of a horse", lines=2, autofocus=True)
    denoising_steps = gr.Slider(label="Steps", minimum=100, maximum=200, value=100, step=1, interactive=True)
    guidance_scale = gr.Slider(label="Guidance Scale", minimum=1, maximum=10, value=3, step=0.1, interactive=True)

    submit = gr.Button("Generate 🚀", variant="primary")

    output_audio = gr.Audio(label="Generated Audio")

    # The button and the examples both route through gradio_generate with the
    # same inputs and the same output component.
    submit.click(
        fn=gradio_generate,
        inputs=[input_text, denoising_steps, guidance_scale],
        outputs=[output_audio],
        scroll_to_output=True,
    )

    gr.Examples(
        fn=gradio_generate,
        inputs=[input_text, denoising_steps, guidance_scale],
        outputs=[output_audio],
        examples=[
            ["A hammer is hitting a wooden surface", 100, 3],
            ["Peaceful and calming ambient music with singing bowl and other instruments.", 100, 3],
            ["A man is speaking in a small room.", 100, 3],
            ["A female is speaking followed by footstep sound", 100, 3],
            ["Wooden table tapping sound followed by water pouring sound.", 100, 3],
        ],
        cache_examples="lazy",
    )

# Start serving only once the Blocks context has been closed.
# NOTE(review): the meaning of the positional argument to queue() appears to
# differ between Gradio major versions (concurrency_count in 3.x,
# status_update_rate in 4.x). Confirm the intent and pass it by keyword.
interface.queue(10).launch()