Spaces:

amphion
/

PicoAudio

Running on Zero

App Files Files Community

PicoAudio / app.py

ZeyuXie

Update app.py

b409041 verified 3 months ago

raw

history blame contribute delete

8.63 kB


	import spaces
	import os
	import json
	import numpy as np
	import torch
	import soundfile as sf
	import gradio as gr
	from diffusers import DDPMScheduler
	from pico_model import PicoDiffusion
	from audioldm.variational_autoencoder.autoencoder import AutoencoderKL
	from llm_preprocess import get_event, preprocess_gemini, preprocess_gpt



	class dotdict(dict):
	    """Dictionary whose keys can also be read, written and deleted as attributes.

	    Reading a key that is absent yields ``None`` instead of raising, while
	    deleting an absent key raises ``KeyError`` (the plain ``dict`` behaviour).
	    """

	    def __getattr__(self, key):
	        # Only reached when normal attribute lookup fails; missing keys give None.
	        return self.get(key)

	    def __setattr__(self, key, value):
	        # Attribute assignment stores into the mapping, not the instance __dict__.
	        self[key] = value

	    def __delattr__(self, key):
	        del self[key]


	class InferRunner:
	    """Bundle of the models needed for inference, loaded from ``ckpts/``.

	    Attributes set by ``__init__``:
	        vae: ``AutoencoderKL`` built from ``ckpts/ldm/vae_config.json`` with
	            weights from ``ckpts/ldm/pytorch_model_vae.bin``.
	        pico_model: ``PicoDiffusion`` in eval mode, configured from the first
	            record of ``ckpts/pico_model/summary.jsonl``.
	        scheduler: ``DDPMScheduler`` loaded from the ``scheduler`` subfolder of
	            the scheduler name stored in that same record.
	    """

	    def __init__(self, device):
	        """Load all checkpoints and move the models to ``device``.

	        Args:
	            device: Target passed to ``.to(...)`` for the VAE and the
	                diffusion model (e.g. ``"cuda"``).
	        """
	        # Context managers close the config files deterministically instead of
	        # leaving the handles to the garbage collector.
	        with open("ckpts/ldm/vae_config.json") as config_file:
	            vae_config = json.load(config_file)
	        self.vae = AutoencoderKL(**vae_config)
	        # Weights are materialised on CPU first, then the module is moved.
	        vae_weights = torch.load("ckpts/ldm/pytorch_model_vae.bin", map_location="cpu")
	        self.vae.load_state_dict(vae_weights)
	        # NOTE(review): unlike pico_model, the VAE is never switched to eval();
	        # confirm whether that is intentional.
	        self.vae = self.vae.to(device)

	        # Only the first JSONL record is used, so read a single line rather
	        # than the whole file.
	        with open("ckpts/pico_model/summary.jsonl") as summary_file:
	            train_args = dotdict(json.loads(summary_file.readline()))
	        self.pico_model = PicoDiffusion(
	            scheduler_name=train_args.scheduler_name,
	            unet_model_config_path=train_args.unet_model_config,
	            snr_gamma=train_args.snr_gamma,
	            freeze_text_encoder_ckpt="ckpts/laion_clap/630k-audioset-best.pt",
	            diffusion_pt="ckpts/pico_model/diffusion.pt",
	        ).eval().to(device)
	        self.scheduler = DDPMScheduler.from_pretrained(train_args.scheduler_name, subfolder="scheduler")

	# Module-level setup: these statements run once, at import time, and load
	# the models before the UI below is built.
	# NOTE(review): the availability-based fallback is disabled in favour of a
	# hard-coded "cuda"; presumably the GPU is only attached inside
	# @spaces.GPU calls on this host, so the check would pick "cpu" — confirm.
	#device = "cuda" if torch.cuda.is_available() else "cpu"
	device = "cuda"
	# Shared model bundle read by infer().
	runner = InferRunner(device)
	# Event names used as labels for the shortcut buttons (indexed 0..17 by the UI).
	event_list = get_event()

	@spaces.GPU(duration=240)
	def infer(caption, num_steps=200, guidance_scale=3.0, audio_len=16000*10):
	    """Generate audio for a timestamp caption and return the path of a WAV file.

	    Args:
	        caption: Timestamp caption, e.g.
	            ``"dog_barking at 0.562-2.562_4.25-6.25."``.
	        num_steps: Number of diffusion steps forwarded to ``demo_inference``.
	        guidance_scale: Guidance scale forwarded to ``demo_inference``.
	        audio_len: Maximum number of samples kept from the decoded waveform;
	            the default is 10 seconds at the 16 kHz rate used when writing.

	    Returns:
	        Path of a newly written 16-bit PCM WAV file.
	    """
	    import tempfile  # local import: only this function needs it

	    with torch.no_grad():
	        latents = runner.pico_model.demo_inference(
	            caption,
	            runner.scheduler,
	            num_steps=num_steps,
	            guidance_scale=guidance_scale,
	            num_samples_per_prompt=1,
	            disable_progress=True,
	        )
	        mel = runner.vae.decode_first_stage(latents)
	        # Keep the first generated sample only and trim it to audio_len samples.
	        wave = runner.vae.decode_to_waveform(mel)[0][:audio_len]

	    # Write to a unique file per call: a fixed "output.wav" would let
	    # overlapping requests overwrite each other's result before it is served.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
	        outpath = tmp_file.name
	    sf.write(outpath, wave, samplerate=16000, subtype='PCM_16')
	    return outpath

	def preprocess(caption):
	    """Convert a free-text caption into a timestamp caption via ``preprocess_gemini``.

	    Returns the converted text twice, as a 2-tuple, because the click handler
	    that calls this function is wired to two output components.
	    """
	    timestamp_caption = preprocess_gemini(caption)
	    return (timestamp_caption, timestamp_caption)

	def update_textbox(event_name, current_text):
	    """Append ``"<event_name> two times."`` to the free-text prompt.

	    Args:
	        event_name: Name of the event whose button was clicked.
	        current_text: Current content of the prompt box; may be empty or None.

	    Returns:
	        ``"<existing text> then <event_name> two times."`` when there is
	        existing text, otherwise just ``"<event_name> two times."``.
	    """
	    event = event_name + ' two times.'
	    # Strip surrounding whitespace and periods together so that input such as
	    # "a dog barks. " or "." does not leave a stray period or produce a
	    # caption that starts with a dangling " then ".
	    existing = (current_text or '').strip(' \t\r\n.')
	    if not existing:
	        return event
	    return existing + ' then ' + event

	# Gradio UI: a title and badge links, one shortcut button per supported
	# event, a "Step1-Preprocess" section (free text -> timestamp caption) and a
	# "Step2-Generate" section (timestamp caption -> audio).
	# NOTE(review): this copy has lost the nesting indentation of the `with`
	# layout blocks; restore it from the upstream file before running.
	with gr.Blocks() as demo:
	with gr.Row():
	gr.Markdown("## PicoAudio")
	with gr.Row():
	gr.Markdown("""
	[![arXiv](https://img.shields.io/badge/arXiv-2407.02869v2-brightgreen.svg?style=flat-square)](https://arxiv.org/abs/2407.02869v2)

	[![githubio](https://img.shields.io/badge/GitHub.io-Audio_Samples-blue?logo=Github&style=flat-square)](https://zeyuxie29.github.io/PicoAudio.github.io/)

	[![github](https://img.shields.io/badge/GitHub-Code-blue?logo=Github&style=flat-square)](https://github.com/zeyuxie29/PicoAudio)

	[![Hugging Face data](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Dataset-blue)](https://huggingface.co/datasets/ZeyuXie/PicoAudio/tree/main)
	""")
	with gr.Row():
	description_text = f"18 events supported:"
	gr.Markdown(description_text)


	# One button per event, created as three groups of six (indices 0-5, 6-11,
	# 12-17); this requires event_list to hold at least 18 names.
	btn_event = []
	with gr.Row():
	for i in range(6):
	event_name = f"{event_list[i]}"
	btn_event.append(gr.Button(event_name))
	with gr.Row():
	for i in range(6, 12):
	event_name = f"{event_list[i]}"
	btn_event.append(gr.Button(event_name))
	with gr.Row():
	for i in range(12, 18):
	event_name = f"{event_list[i]}"
	btn_event.append(gr.Button(event_name))


	with gr.Row():
	gr.Markdown("## Step1-Preprocess")
	with gr.Row():
	# NOTE(review): the text below mentions 'Files/llm_reprocessing.py' while this
	# file imports from `llm_preprocess`; the displayed file name may be a typo — confirm.
	preprocess_description_text = f"Transfer free-text into timestamp caption via LLM. "+\
	"This demo uses Gemini as the preprocessor. If any errors occur, please try a few more times. "+\
	"We also provide the GPT version consistent with the paper in the file 'Files/llm_reprocessing.py'. You can use your own api_key to modify and run 'Files/inference.py' for local inference."
	gr.Markdown(preprocess_description_text)
	with gr.Row():
	with gr.Column():
	freetext_prompt = gr.Textbox(label="Free-text Prompt: Input your free-text caption here. (e.g. a dog barks three times.)",
	value="a dog barks three times.",)
	with gr.Row():
	# Both buttons are created without an explicit label.
	preprocess_run_button = gr.Button()
	preprocess_run_clear = gr.ClearButton([freetext_prompt])
	# NOTE(review): `prompt` is only a placeholder here; the real Textbox is
	# created in Step 2, so the Examples below are given outputs=[None].
	prompt = None
	with gr.Column():
	freetext_prompt_out = gr.Textbox(label="Timestamp Caption: Preprocess output")
	with gr.Row():
	with gr.Column():
	# Clickable free-text examples that fill the free-text prompt box.
	gr.Examples(
	examples = [["spraying two times then gunshot three times."],
	["a dog barks three times."],
	["cow mooing two times."],],
	inputs = [freetext_prompt],
	outputs = [prompt]
	)
	with gr.Column():
	# Intentionally empty column; presumably a spacer beside the examples.
	pass


	with gr.Row():
	gr.Markdown("## Step2-Generate")
	with gr.Row():
	generate_description_text = f"Generate audio based on timestamp caption."
	gr.Markdown(generate_description_text)
	with gr.Row():
	with gr.Column():
	# Timestamp caption box; also filled by the preprocess handler wired below.
	prompt = gr.Textbox(label="Timestamp Caption: Specify your timestamp caption formatted as 'event1 at onset1-offset1_onset2-offset2 and event2 at onset1-offset1'.",
	value="spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031.",)
	with gr.Row():
	generate_run_button = gr.Button()
	generate_run_clear = gr.ClearButton([prompt])
	with gr.Accordion("Advanced options", open=False):
	# Slider defaults (200 steps, scale 3.0) match the defaults of infer().
	num_steps = gr.Slider(label="num_steps", minimum=1, maximum=300, value=200, step=1)
	guidance_scale = gr.Slider(label="guidance_scale", minimum=0.1, maximum=8.0, value=3.0, step=0.1)
	with gr.Column():
	outaudio = gr.Audio()

	# Event wiring. Each event button passes its own name (as a constant
	# gr.State) plus the current free-text prompt to update_textbox and writes
	# the result back into the free-text prompt.
	for i in range(18):
	event_name = f"{event_list[i]}"
	btn_event[i].click(fn=update_textbox, inputs=[gr.State(event_name), freetext_prompt], outputs=freetext_prompt)
	# Preprocess fills both the Step 2 prompt box and the preprocess output box.
	preprocess_run_button.click(fn=preprocess, inputs=[freetext_prompt], outputs=[prompt, freetext_prompt_out])
	# Generate runs infer() on the timestamp caption and the two slider values.
	generate_run_button.click(fn=infer, inputs=[prompt, num_steps, guidance_scale], outputs=[outaudio])


	with gr.Row():
	with gr.Column():
	# NOTE(review): three input components are listed, but every example row
	# supplies only the caption; no example values are given for the sliders.
	gr.Examples(
	examples = [["spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031."],
	["dog_barking at 0.562-2.562_4.25-6.25."],
	["cow_mooing at 0.958-3.582_5.272-7.896."],
	["tapping_clicking_clanking at 0.579-4.019_5.882-9.322"],
	["duck_quacking at 1.51-2.51_4.904-5.904"],
	],
	inputs = [prompt, num_steps, guidance_scale],
	outputs = [outaudio]
	)
	with gr.Column():
	# Intentionally empty column; presumably a spacer beside the examples.
	pass


	# Start serving the app.
	demo.launch()


	# description_text = f"18 events: {', '.join(event_list)}"
	# prompt = gr.Textbox(label="Prompt: Input your caption formatted as 'event1 at onset1-offset1_onset2-offset2 and event2 at onset1-offset1'.",
	# value="spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031.",)
	# outaudio = gr.Audio()
	# num_steps = gr.Slider(label="num_steps", minimum=1, maximum=300, value=200, step=1)
	# guidance_scale = gr.Slider(label="guidance_scale", minimum=0.1, maximum=8.0, value=3.0, step=0.1)
	# gr_interface = gr.Interface(
	# fn=infer,
	# inputs=[prompt, num_steps, guidance_scale],
	# outputs=[outaudio],
	# title="PicoAudio",
	# description=description_text,
	# allow_flagging=False,
	# examples=[
	# ["spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031."],
	# ["dog_barking at 0.562-2.562_4.25-6.25."],
	# ["cow_mooing at 0.958-3.582_5.272-7.896."],
	# ],
	# cache_examples="lazy", # Turn on to cache.
	# )
	# gr_interface.queue(10).launch()