Spaces:

TomTumo
/

stable-diffusion-video-gen

Paused

App Files Files Community

Tom Auger commited on Jun 26, 2023

Commit

5ba30fd

unverified ·

1 Parent(s): ec91093

Change to just plain video generation

Browse files

Files changed (1) hide show

app.py +4 -346

app.py CHANGED Viewed

@@ -1,24 +1,10 @@
-# Experimental app to help with the process of generating music videos
-# Requires youtube-dl to be installed
-# pip install youtube-dl
-import gradio as gr
-import librosa
-from pathlib import Path
-import numpy as np
-import random
-from io import BytesIO
-import soundfile as sf
-from matplotlib import pyplot as plt
-from stable_diffusion_videos import StableDiffusionWalkPipeline, generate_images, get_timesteps_arr
 from diffusers.models import AutoencoderKL
 from diffusers.schedulers import LMSDiscreteScheduler
 from diffusers.utils.import_utils import is_xformers_available
 import torch
-import youtube_dl
-import os
 pipe = StableDiffusionWalkPipeline.from_pretrained(
     'runwayml/stable-diffusion-v1-5',
@@ -33,335 +19,7 @@ pipe = StableDiffusionWalkPipeline.from_pretrained(
 if is_xformers_available():
     pipe.enable_xformers_memory_efficient_attention()
-def download_example_clip(url, output_dir='./', output_filename='%(title)s.%(ext)s'):
-    if (Path(output_dir) / output_filename).exists():
-        return str(Path(output_dir) / output_filename)
-    files_before = os.listdir(output_dir) if os.path.exists(output_dir) else []
-    ydl_opts = {
-        'outtmpl': str(Path(output_dir) / output_filename),
-        'format': 'bestaudio',
-        'extract-audio': True,
-        'audio-format': 'mp3',
-        'audio-quality': 0,
-    }
-    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
-        ydl.download([url])
-    files_after = os.listdir(output_dir)
-    return str(Path(output_dir) / list(set(files_after) - set(files_before))[0])
-def audio_data_to_buffer(y, sr):
-    audio_filepath = BytesIO()
-    audio_filepath.name = 'audio.wav'
-    sf.write(audio_filepath, y, samplerate=sr, format='WAV')
-    audio_filepath.seek(0)
-    return audio_filepath
-def plot_array(y):
-    fig = plt.figure()
-    x = np.arange(y.shape[0])
-    plt.title("Line graph")
-    plt.xlabel("X axis")
-    plt.ylabel("Y axis")
-    plt.plot(x, y, color ="red")
-    plt.savefig('timesteps_chart.png')
-    return fig
-def on_slice_btn_click(audio, audio_start_sec, duration, fps, smooth, margin):
-    if audio is None:
-        return [
-            gr.update(visible=False),
-            gr.update(visible=False),
-        ]
-    y, sr = librosa.load(audio, offset=audio_start_sec, duration=duration)
-    T = get_timesteps_arr(
-        audio_data_to_buffer(y, sr),
-        0,
-        duration,
-        fps=fps,
-        margin=margin,
-        smooth=smooth,
-    )
-    return [gr.update(value=(sr, y), visible=True), gr.update(value=plot_array(T), visible=True)]
-def on_audio_change_or_clear(audio):
-    if audio is None:
-        return [
-            gr.update(visible=False),
-            gr.update(visible=False)
-        ]
-    duration = librosa.get_duration(filename=audio)
-    return [
-        gr.update(maximum=int(duration), visible=True),
-        gr.update(maximum=int(min(10, duration)), visible=True)
-    ]
-def on_update_weight_settings_btn_click(sliced_audio, duration, fps, smooth, margin):
-    if sliced_audio is None:
-        return gr.update(visible=False)
-    T = get_timesteps_arr(
-        sliced_audio,
-        0,
-        duration,
-        fps=fps,
-        margin=margin,
-        smooth=smooth,
-    )
-    return gr.update(value=plot_array(T), visible=True)
-def on_generate_images_btn_click(
-    prompt_a,
-    prompt_b,
-    seed_a,
-    seed_b,
-    output_dir,
-    num_inference_steps,
-    guidance_scale,
-    height,
-    width,
-    upsample,
-):
-    output_dir = Path(output_dir) / 'images'
-    if seed_a == -1:
-        seed_a = random.randint(0, 9999999)
-    if seed_b == -1:
-        seed_b = random.randint(0, 9999999)
-    image_a_fpath = generate_images(
-        pipe,
-        prompt_a,
-        seeds=[seed_a],
-        num_inference_steps=num_inference_steps,
-        guidance_scale=guidance_scale,
-        height=height,
-        width=width,
-        upsample=upsample,
-        output_dir=output_dir
-    )[0]
-    image_b_fpath = generate_images(
-        pipe,
-        prompt_b,
-        seeds=[seed_b],
-        num_inference_steps=num_inference_steps,
-        guidance_scale=guidance_scale,
-        height=height,
-        width=width,
-        upsample=upsample,
-        output_dir=output_dir
-    )[0]
-    return [
-        gr.update(value=image_a_fpath, visible=True),
-        gr.update(value=image_b_fpath, visible=True),
-        gr.update(value=seed_a),
-        gr.update(value=seed_b),
-    ]
-def on_generate_music_video_btn_click(
-    audio_filepath,
-    audio_start_sec,
-    duration,
-    fps,
-    smooth,
-    margin,
-    prompt_a,
-    prompt_b,
-    seed_a,
-    seed_b,
-    batch_size,
-    output_dir,
-    num_inference_steps,
-    guidance_scale,
-    height,
-    width,
-    upsample,
-):
-    if audio_filepath is None:
-        return gr.update(visible=False)
-    video_filepath = pipe.walk(
-        prompts=[prompt_a, prompt_b],
-        seeds=[seed_a, seed_b],
-        num_interpolation_steps=int(duration * fps),
-        output_dir=output_dir,
-        fps=fps,
-        num_inference_steps=num_inference_steps,
-        guidance_scale=guidance_scale,
-        height=height,
-        width=width,
-        upsample=upsample,
-        batch_size=batch_size,
-        audio_filepath=audio_filepath,
-        audio_start_sec=audio_start_sec,
-        margin=margin,
-        smooth=smooth,
-    )
-    return gr.update(value=video_filepath, visible=True)
-audio_start_sec = gr.Slider(0, 10, 0, step=1, label="Start (sec)", interactive=True)
-duration = gr.Slider(0, 10, 1, step=1, label="Duration (sec)", interactive=True)
-slice_btn = gr.Button("Slice Audio")
-sliced_audio = gr.Audio(type='filepath')
-wav_plot = gr.Plot(label="Interpolation Weights Per Frame")
-fps = gr.Slider(1, 60, 12, step=1, label="FPS", interactive=True)
-smooth = gr.Slider(0, 1, 0.0, label="Smoothing", interactive=True)
-margin = gr.Slider(1.0, 20.0, 1.0, step=0.5, label="Margin Max", interactive=True)
-update_weight_settings_btn = gr.Button("Update Interpolation Weights")
-prompt_a = gr.Textbox(value='blueberry spaghetti', label="Prompt A")
-prompt_b = gr.Textbox(value='strawberry spaghetti', label="Prompt B")
-seed_a = gr.Number(-1, label="Seed A", precision=0, interactive=True)
-seed_b = gr.Number(-1, label="Seed B", precision=0, interactive=True)
-generate_images_btn = gr.Button("Generate Images")
-image_a = gr.Image(visible=False, label="Image A")
-image_b = gr.Image(visible=False, label="Image B")
-batch_size = gr.Slider(1, 32, 1, step=1, label="Batch Size", interactive=True)
-generate_music_video_btn = gr.Button("Generate Music Video")
-video = gr.Video(visible=False, label="Video")
-STEP_1_MARKDOWN = """
-## 1. Upload Some Audio
-Upload an audio file to use as the source for the music video.
-"""
-STEP_2_MARKDOWN = """
-## 2. Slice Portion of Audio for Generated Clip
-Here you can slice a portion of the audio to use for the generated music video. The longer the audio, the more frames will be generated (which will take longer).
-I suggest you use this app to make music videos in segments of 5-10 seconds at a time. Then, you can stitch the videos together using a video editor or ffmpeg later.
-**Warning**: If your audio file is short, I do no check that the duration you chose is not longer than the audio. It may cause some issues, so just be mindful of that.
-"""
-STEP_3_MARKDOWN = """
-## 3. Set Interpolation Weight Settings
-This section lets you play with the settings used to configure how we move through the latent space given the audio you sliced.
-If you look at the graph on the right, you'll see in the X-axis how many frames. The Y-axis is the weight of Image A as we move through the latent space.
-If you listen to the audio slice and look at the graph, you should see bumps at points where the audio energy is high (in our case, percussive energy).
-"""
-STEP_4_MARKDOWN = """
-## 4. Select Prompts, Seeds, Settings, and Generate Images
-Here you can select the settings for image generation.
-Then, you can select prompts and seeds for generating images.
-  - Image A will be first frame of the generated video.
-  - Image B will be last frame of the generated video.
-  - The video will be generated by interpolating between the two images using the audio you provided.
-If you set the seeds to -1, a random seed will be used and saved for you, so you can explore different images given the same prompt.
-"""
-with gr.Blocks() as demo:
-    gr.Markdown(STEP_1_MARKDOWN)
-    audio = gr.Audio(type='filepath', interactive=True)
-    gr.Examples(
-        [
-            download_example_clip(
-                url='https://soundcloud.com/nateraw/thoughts',
-                output_dir='./music',
-                output_filename='thoughts.mp3'
-            )
-        ],
-        inputs=audio,
-        outputs=[audio_start_sec, duration],
-        fn=on_audio_change_or_clear,
-        cache_examples=False
-    )
-    audio.change(on_audio_change_or_clear, audio, [audio_start_sec, duration])
-    audio.clear(on_audio_change_or_clear, audio, [audio_start_sec, duration])
-    gr.Markdown(STEP_2_MARKDOWN)
-    audio_start_sec.render()
-    duration.render()
-    slice_btn.render()
-    slice_btn.click(on_slice_btn_click, [audio, audio_start_sec, duration, fps, smooth, margin], [sliced_audio, wav_plot])
-    sliced_audio.render()
-    gr.Markdown(STEP_3_MARKDOWN)
-    with gr.Row():
-        with gr.Column(scale=4):
-            fps.render()
-            smooth.render()
-            margin.render()
-            update_weight_settings_btn.render()
-            update_weight_settings_btn.click(
-                on_update_weight_settings_btn_click,
-                [sliced_audio, duration, fps, smooth, margin],
-                wav_plot
-            )
-        with gr.Column(scale=3):
-            wav_plot.render()
-    gr.Markdown(STEP_4_MARKDOWN)
-    with gr.Accordion("Additional Settings", open=False):
-        output_dir = gr.Textbox(value='./dreams', label="Output Directory")
-        num_inference_steps = gr.Slider(1, 200, 50, step=10, label="Diffusion Inference Steps", interactive=True)
-        guidance_scale = gr.Slider(1.0, 25.0, 7.5, step=0.5, label="Guidance Scale", interactive=True)
-        height = gr.Slider(512, 1024, 512, step=64, label="Height", interactive=True)
-        width = gr.Slider(512, 1024, 512, step=64, label="Width", interactive=True)
-        upsample = gr.Checkbox(value=False, label="Upsample with Real-ESRGAN")
-    with gr.Row():
-        with gr.Column(scale=4):
-            prompt_a.render()
-        with gr.Column(scale=1):
-            seed_a.render()
-    with gr.Row():
-        with gr.Column(scale=4):
-            prompt_b.render()
-        with gr.Column(scale=1):
-            seed_b.render()
-    generate_images_btn.render()
-    with gr.Row():
-        with gr.Column(scale=1):
-            image_a.render()
-        with gr.Column(scale=1):
-            image_b.render()
-    generate_images_btn.click(
-        on_generate_images_btn_click,
-        [prompt_a, prompt_b, seed_a, seed_b, output_dir, num_inference_steps, guidance_scale, height, width, upsample],
-        [image_a, image_b, seed_a, seed_b]
-    )
-    gr.Markdown("## 5. Generate Music Video")
-    # TODO - add equivalent code snippet to generate music video
-    batch_size.render()
-    generate_music_video_btn.render()
-    generate_music_video_btn.click(
-        on_generate_music_video_btn_click,
-        [audio, audio_start_sec, duration, fps, smooth, margin, prompt_a, prompt_b, seed_a, seed_b, batch_size, output_dir, num_inference_steps, guidance_scale, height, width, upsample],
-        video
-    )
-    video.render()
 if __name__ == '__main__':
-    demo.launch(debug=True)

+from stable_diffusion_videos import StableDiffusionWalkPipeline, Interface
 from diffusers.models import AutoencoderKL
 from diffusers.schedulers import LMSDiscreteScheduler
 from diffusers.utils.import_utils import is_xformers_available
 import torch
 pipe = StableDiffusionWalkPipeline.from_pretrained(
     'runwayml/stable-diffusion-v1-5',
 if is_xformers_available():
     pipe.enable_xformers_memory_efficient_attention()
+interface = Interface(pipe)
 if __name__ == '__main__':
+    interface.launch(debug=True)