Update app.py
app.py CHANGED
@@ -1,112 +1,169 @@
  1 - """
  2 - At the command line, only need to run once to install the package via pip:
  3
  4 - $ pip install google-generativeai
  5 - """
  6 -
  7 - from pathlib import Path
  8   import os
  9   import json
 10 - import
 11 -
 12 -
 13 -
 14
 15 -
 16 -
 17 -
 18 -
 19 -
 20 -
 21 -         "dog_barking", #
 22 -         "door_knocking", #
 23 -         "door_slamming", #
 24 -         "explosion", #
 25 -         "gunshot", # 8
 26 -         "sheep_goat_bleating", #
 27 -         "sneeze", #
 28 -         "spraying", #
 29 -         "thump_thud", #
 30 -         "train_horn", #
 31 -         "tapping_clicking_clanking", #
 32 -         "woman_laughing", #
 33 -         "duck_quacking", # 16
 34 -         "whistling", #
 35 -     ]
 36 -     return event_list
 37
 38 -
 39 -
 40 -
 41 -
 42 -
 43 -
 44 -
 45 -
 46 -
 47 -         if idx >= 100: break
 48 -         data = json.loads(line.strip())
 49 -         learn_pair += f"{str(idx)}:{data['captions']}~{data['onset']}. "
 50 -     preffix_prompt = "I'm doing an audio event generation, which is a harmless job that will contain some sound events. For example, a gunshot is a sound that is harmless." +\
 51 -         "You need to convert the input sentence into the following standard timing format: 'event1--event2-- ... --eventN', " +\
 52 -         "where the 'eventN' format is 'eventN__onset1-offset1_onset2-offset2_ ... _onsetK-offsetK'. " +\
 53 -         "The 'onset-offset' inside needs to be determined based on common sense and the examples I provide, with a duration not less than 1 and not greater than 4. All format 'onsetk-offsetk' should replaced by number. " +\
 54 -         "The very strict constraints are that the total duration is less than 10 seconds, meaning all times are less than 10. It is preferred that events do not overlap as much as possible. " +\
 55 -         "Now, I will provide you with 300 examples in training set for your learning, each example in the format 'index: input~output'. " +\
 56 -         learn_pair
 57 -
 58 -     print(len(preffix_prompt))
 59 -     return preffix_prompt
 60 -
 61
 62 -
 63 -
 64 -
 65 -
 66
 67 - def
 68 -
 69 -
 70 -     genai.configure(api_key="AIzaSyDfGKPQtS9qExCfl3bnfxC1rLPzvORz3E4")
 71 -     print(free_text_caption)
 72 -     # Set up the model
 73 -     generation_config = {
 74 -         "temperature": 1,
 75 -         "top_p": 0.95,
 76 -         "top_k": 64,
 77 -         "max_output_tokens": 8192,
 78 -     }
 79
 80 -
 81 -
 82
 83 -
 84 -
 85 -
 86 -
 87
 88 -
 89 -
 90 -
 91
 92 -
 93 -
 94 -
 95 -
 96 -
 97 -
 98 -
 99 -
100 -
101 -
102 -
103 -
104 -     )
105
106 -
107
108 -     return postprocess(timestampCaption)
109
110 -
111 -
112 -
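Note on the removed preprocessing code above: its Gemini prompt asks the LLM to emit captions in the timing format 'event1--event2-- ... --eventN', where each event is written as 'eventN__onset1-offset1_..._onsetK-offsetK' with all times under 10 seconds. A minimal parsing sketch of that format follows; the helper name parse_timing_caption and the sample caption are illustrative only and not part of the commit.

# Hypothetical helper, not in app.py: parse the timing format the removed
# Gemini prompt asks for, i.e. "event1--event2" where each event is
# "name__onset1-offset1_onset2-offset2" and all times stay below 10 seconds.
def parse_timing_caption(caption):
    events = {}
    for chunk in caption.split("--"):
        name, _, spans = chunk.partition("__")
        events[name] = [tuple(float(t) for t in span.split("-"))
                        for span in spans.split("_") if span]
    return events

# Example (numbers invented for illustration):
# parse_timing_caption("dog_barking__0.5-1.5_3.0-4.5--gunshot__6.0-7.0")
# -> {'dog_barking': [(0.5, 1.5), (3.0, 4.5)], 'gunshot': [(6.0, 7.0)]}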
  1
  2   import os
  3   import json
  4 + import numpy as np
  5 + import torch
  6 + import soundfile as sf
  7 + import gradio as gr
  8 + from diffusers import DDPMScheduler
  9 + from pico_model import PicoDiffusion
 10 + from audioldm.variational_autoencoder.autoencoder import AutoencoderKL
 11 + from llm_preprocess import get_event, preprocess_gemini, preprocess_gpt
 12 + class dotdict(dict):
 13 +     """dot.notation access to dictionary attributes"""
 14 +     __getattr__ = dict.get
 15 +     __setattr__ = dict.__setitem__
 16 +     __delattr__ = dict.__delitem__
 17
 18 + class InferRunner:
 19 +     def __init__(self, device):
 20 +         vae_config = json.load(open("ckpts/ldm/vae_config.json"))
 21 +         self.vae = AutoencoderKL(**vae_config).to(device)
 22 +         vae_weights = torch.load("ckpts/ldm/pytorch_model_vae.bin", map_location=device)
 23 +         self.vae.load_state_dict(vae_weights)
 24
 25 +         train_args = dotdict(json.loads(open("ckpts/pico_model/summary.jsonl").readlines()[0]))
 26 +         self.pico_model = PicoDiffusion(
 27 +             scheduler_name=train_args.scheduler_name,
 28 +             unet_model_config_path=train_args.unet_model_config,
 29 +             snr_gamma=train_args.snr_gamma,
 30 +             freeze_text_encoder_ckpt="ckpts/laion_clap/630k-audioset-best.pt",
 31 +             diffusion_pt="ckpts/pico_model/diffusion.pt",
 32 +         ).eval().to(device)
 33 +         self.scheduler = DDPMScheduler.from_pretrained(train_args.scheduler_name, subfolder="scheduler")
 34
 35 + device = "cuda" if torch.cuda.is_available() else "cpu"
 36 + # runner = InferRunner(device)
 37 + event_list = get_event()
 38 + def infer(caption, num_steps=200, guidance_scale=3.0, audio_len=16000*10):
 39 +     with torch.no_grad():
 40 +         latents = runner.pico_model.demo_inference(caption, runner.scheduler, num_steps=num_steps, guidance_scale=guidance_scale, num_samples_per_prompt=1, disable_progress=True)
 41 +         mel = runner.vae.decode_first_stage(latents)
 42 +         wave = runner.vae.decode_to_waveform(mel)[0][:audio_len]
 43 +     outpath = f"output.wav"
 44 +     sf.write(outpath, wave, samplerate=16000, subtype='PCM_16')
 45 +     return outpath
 46
 47 + def preprocess(caption):
 48 +     output = preprocess_gemini(caption)
 49 +     return output, output
 50
 51 + def update_textbox(event_name, current_text):
 52 +     print(event_name, current_text)
 53 +     event = event_name + ' two times.'
 54 +     if current_text:
 55 +         return current_text.strip('.') + ' then ' + event
 56 +     else:
 57 +         return event
 58
 59 + with gr.Blocks() as demo:
 60 +     with gr.Row():
 61 +         gr.Markdown("## PicoAudio")
 62 +     with gr.Row():
 63 +         description_text = f"18 events supported :"
 64 +         gr.Markdown(description_text)
 65
 66 +
 67 +     btn_event = []
 68 +     with gr.Row():
 69 +         for i in range(6):
 70 +             event_name = f"{event_list[i]}"
 71 +             btn_event.append(gr.Button(event_name))
 72 +     with gr.Row():
 73 +         for i in range(6, 12):
 74 +             event_name = f"{event_list[i]}"
 75 +             btn_event.append(gr.Button(event_name))
 76 +     with gr.Row():
 77 +         for i in range(12, 18):
 78 +             event_name = f"{event_list[i]}"
 79 +             btn_event.append(gr.Button(event_name))
 80 +
 81 +
 82 +     with gr.Row():
 83 +         gr.Markdown("## Step1")
 84 +     with gr.Row():
 85 +         preprocess_description_text = f"Preprocess: transfer free-text into timestamp caption via LLM. "+\
 86 +             "This demo uses Gemini as the preprocessor. If any errors occur, please try a few more times. "+\
 87 +             "We also provide the GPT version consistent with the paper in the file 'Files/llm_reprocessing.py'. You can use your own api_key to modify and run 'Files/inference.py' for local inference."
 88 +         gr.Markdown(preprocess_description_text)
 89 +     with gr.Row():
 90 +         with gr.Column():
 91 +             freetext_prompt = gr.Textbox(label="Prompt: Input your free-text caption here. (e.g. a dog barks three times.)",
 92 +                                          value="a dog barks three times.",)
 93 +             preprocess_run_button = gr.Button()
 94 +             prompt = None
 95 +         with gr.Column():
 96 +             freetext_prompt_out = gr.Textbox(label="Preprocess output")
 97 +     with gr.Row():
 98 +         with gr.Column():
 99 +             gr.Examples(
100 +                 examples = [["spraying two times then gunshot three times."],
101 +                             ["a dog barks three times."],
102 +                             ["cow mooing two times."],],
103 +                 inputs = [freetext_prompt],
104 +                 outputs = [prompt]
105 +             )
106 +         with gr.Column():
107 +             pass
108 +
109
110 +     with gr.Row():
111 +         gr.Markdown("## Step2")
112 +     with gr.Row():
113 +         generate_description_text = f"Generate audio based on timestamp caption."
114 +         gr.Markdown(generate_description_text)
115 +     with gr.Row():
116 +         with gr.Column():
117 +             prompt = gr.Textbox(label="Prompt: Input your caption formatted as 'event1 at onset1-offset1_onset2-offset2 and event2 at onset1-offset1'.",
118 +                                 value="spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031.",)
119 +             generate_run_button = gr.Button()
120 +             with gr.Accordion("Advanced options", open=False):
121 +                 num_steps = gr.Slider(label="num_steps", minimum=1, maximum=300, value=200, step=1)
122 +                 guidance_scale = gr.Slider(label="guidance_scale", minimum=0.1, maximum=8.0, value=3.0, step=0.1)
123 +         with gr.Column():
124 +             outaudio = gr.Audio()
125 +
126 +     for i in range(18):
127 +         event_name = f"{event_list[i]}"
128 +         btn_event[i].click(fn=update_textbox, inputs=[gr.State(event_name), freetext_prompt], outputs=freetext_prompt)
129 +     preprocess_run_button.click(fn=preprocess, inputs=[freetext_prompt], outputs=[prompt, freetext_prompt_out])
130 +     generate_run_button.click(fn=infer, inputs=[prompt, num_steps, guidance_scale], outputs=[outaudio])
131 +
132
133 +     with gr.Row():
134 +         with gr.Column():
135 +             gr.Examples(
136 +                 examples = [["spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031."],
137 +                             ["dog_barking at 0.562-2.562_4.25-6.25."],
138 +                             ["cow_mooing at 0.958-3.582_5.272-7.896."],],
139 +                 inputs = [prompt, num_steps, guidance_scale],
140 +                 outputs = [outaudio]
141 +             )
142 +         with gr.Column():
143 +             pass
144
145
146 + demo.launch()
147 +
148 +
149 + # description_text = f"18 events: {', '.join(event_list)}"
150 + # prompt = gr.Textbox(label="Prompt: Input your caption formatted as 'event1 at onset1-offset1_onset2-offset2 and event2 at onset1-offset1'.",
151 + #                     value="spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031.",)
152 + # outaudio = gr.Audio()
153 + # num_steps = gr.Slider(label="num_steps", minimum=1, maximum=300, value=200, step=1)
154 + # guidance_scale = gr.Slider(label="guidance_scale", minimum=0.1, maximum=8.0, value=3.0, step=0.1)
155 + # gr_interface = gr.Interface(
156 + #     fn=infer,
157 + #     inputs=[prompt, num_steps, guidance_scale],
158 + #     outputs=[outaudio],
159 + #     title="PicoAudio",
160 + #     description=description_text,
161 + #     allow_flagging=False,
162 + #     examples=[
163 + #         ["spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031."],
164 + #         ["dog_barking at 0.562-2.562_4.25-6.25."],
165 + #         ["cow_mooing at 0.958-3.582_5.272-7.896."],
166 + #     ],
167 + #     cache_examples="lazy", # Turn on to cache.
168 + # )
169 + # gr_interface.queue(10).launch()
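Note on the added demo code: infer uses a module-level runner, but the line runner = InferRunner(device) is left commented out in this revision. A minimal sketch of driving the same two-step flow (LLM preprocessing, then generation) without the Gradio UI follows; it assumes the definitions from app.py above are in scope, the checkpoints under ckpts/ are downloaded, and a Gemini key is configured for preprocess_gemini. The sketch is illustrative and not part of the commit.

# Sketch only, not part of the commit: run Step1 and Step2 from app.py directly.
runner = InferRunner(device)                      # this revision leaves this line commented out above
free_text = "a dog barks three times."            # same default as the Step1 textbox
timestamp_caption = preprocess_gemini(free_text)  # Step1: timestamp caption, e.g. "dog_barking at 0.562-2.562_4.25-6.25."
wav_path = infer(timestamp_caption, num_steps=200, guidance_scale=3.0)  # Step2: diffusion + vocoder
print(wav_path)                                   # "output.wav", 16 kHz PCM_16 written by sf.write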