Tom Auger committed
Commit 5ba30fd · unverified · 1 Parent(s): ec91093

Change to just plain video generation

Files changed (1)
  1. app.py +4 -346
app.py CHANGED
@@ -1,24 +1,10 @@
-# Experimental app to help with the process of generating music videos
-# Requires youtube-dl to be installed
-# pip install youtube-dl
-
-import gradio as gr
-import librosa
-from pathlib import Path
-import numpy as np
-import random
-from io import BytesIO
-import soundfile as sf
-from matplotlib import pyplot as plt
-
-from stable_diffusion_videos import StableDiffusionWalkPipeline, generate_images, get_timesteps_arr
+from stable_diffusion_videos import StableDiffusionWalkPipeline, Interface
 
 from diffusers.models import AutoencoderKL
 from diffusers.schedulers import LMSDiscreteScheduler
 from diffusers.utils.import_utils import is_xformers_available
 import torch
-import youtube_dl
-import os
+
 
 pipe = StableDiffusionWalkPipeline.from_pretrained(
     'runwayml/stable-diffusion-v1-5',
@@ -33,335 +19,7 @@ pipe = StableDiffusionWalkPipeline.from_pretrained(
 if is_xformers_available():
     pipe.enable_xformers_memory_efficient_attention()
 
-def download_example_clip(url, output_dir='./', output_filename='%(title)s.%(ext)s'):
-    if (Path(output_dir) / output_filename).exists():
-        return str(Path(output_dir) / output_filename)
-
-    files_before = os.listdir(output_dir) if os.path.exists(output_dir) else []
-    ydl_opts = {
-        'outtmpl': str(Path(output_dir) / output_filename),
-        'format': 'bestaudio',
-        'extract-audio': True,
-        'audio-format': 'mp3',
-        'audio-quality': 0,
-    }
-    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
-        ydl.download([url])
-
-    files_after = os.listdir(output_dir)
-    return str(Path(output_dir) / list(set(files_after) - set(files_before))[0])
-
-def audio_data_to_buffer(y, sr):
-    audio_filepath = BytesIO()
-    audio_filepath.name = 'audio.wav'
-    sf.write(audio_filepath, y, samplerate=sr, format='WAV')
-    audio_filepath.seek(0)
-    return audio_filepath
-
-
-def plot_array(y):
-    fig = plt.figure()
-    x = np.arange(y.shape[0])
-    plt.title("Line graph")
-    plt.xlabel("X axis")
-    plt.ylabel("Y axis")
-    plt.plot(x, y, color ="red")
-    plt.savefig('timesteps_chart.png')
-    return fig
-
-def on_slice_btn_click(audio, audio_start_sec, duration, fps, smooth, margin):
-    if audio is None:
-        return [
-            gr.update(visible=False),
-            gr.update(visible=False),
-        ]
-
-    y, sr = librosa.load(audio, offset=audio_start_sec, duration=duration)
-    T = get_timesteps_arr(
-        audio_data_to_buffer(y, sr),
-        0,
-        duration,
-        fps=fps,
-        margin=margin,
-        smooth=smooth,
-    )
-    return [gr.update(value=(sr, y), visible=True), gr.update(value=plot_array(T), visible=True)]
-
-def on_audio_change_or_clear(audio):
-    if audio is None:
-        return [
-            gr.update(visible=False),
-            gr.update(visible=False)
-        ]
-
-    duration = librosa.get_duration(filename=audio)
-    return [
-        gr.update(maximum=int(duration), visible=True),
-        gr.update(maximum=int(min(10, duration)), visible=True)
-    ]
-
-def on_update_weight_settings_btn_click(sliced_audio, duration, fps, smooth, margin):
-    if sliced_audio is None:
-        return gr.update(visible=False)
-
-    T = get_timesteps_arr(
-        sliced_audio,
-        0,
-        duration,
-        fps=fps,
-        margin=margin,
-        smooth=smooth,
-    )
-    return gr.update(value=plot_array(T), visible=True)
-
-
-def on_generate_images_btn_click(
-    prompt_a,
-    prompt_b,
-    seed_a,
-    seed_b,
-    output_dir,
-    num_inference_steps,
-    guidance_scale,
-    height,
-    width,
-    upsample,
-):
-    output_dir = Path(output_dir) / 'images'
-
-    if seed_a == -1:
-        seed_a = random.randint(0, 9999999)
-    if seed_b == -1:
-        seed_b = random.randint(0, 9999999)
-
-    image_a_fpath = generate_images(
-        pipe,
-        prompt_a,
-        seeds=[seed_a],
-        num_inference_steps=num_inference_steps,
-        guidance_scale=guidance_scale,
-        height=height,
-        width=width,
-        upsample=upsample,
-        output_dir=output_dir
-    )[0]
-    image_b_fpath = generate_images(
-        pipe,
-        prompt_b,
-        seeds=[seed_b],
-        num_inference_steps=num_inference_steps,
-        guidance_scale=guidance_scale,
-        height=height,
-        width=width,
-        upsample=upsample,
-        output_dir=output_dir
-    )[0]
-
-    return [
-        gr.update(value=image_a_fpath, visible=True),
-        gr.update(value=image_b_fpath, visible=True),
-        gr.update(value=seed_a),
-        gr.update(value=seed_b),
-    ]
-
-def on_generate_music_video_btn_click(
-    audio_filepath,
-    audio_start_sec,
-    duration,
-    fps,
-    smooth,
-    margin,
-    prompt_a,
-    prompt_b,
-    seed_a,
-    seed_b,
-    batch_size,
-    output_dir,
-    num_inference_steps,
-    guidance_scale,
-    height,
-    width,
-    upsample,
-):
-
-    if audio_filepath is None:
-        return gr.update(visible=False)
-
-    video_filepath = pipe.walk(
-        prompts=[prompt_a, prompt_b],
-        seeds=[seed_a, seed_b],
-        num_interpolation_steps=int(duration * fps),
-        output_dir=output_dir,
-        fps=fps,
-        num_inference_steps=num_inference_steps,
-        guidance_scale=guidance_scale,
-        height=height,
-        width=width,
-        upsample=upsample,
-        batch_size=batch_size,
-        audio_filepath=audio_filepath,
-        audio_start_sec=audio_start_sec,
-        margin=margin,
-        smooth=smooth,
-    )
-    return gr.update(value=video_filepath, visible=True)
-
-
-audio_start_sec = gr.Slider(0, 10, 0, step=1, label="Start (sec)", interactive=True)
-duration = gr.Slider(0, 10, 1, step=1, label="Duration (sec)", interactive=True)
-slice_btn = gr.Button("Slice Audio")
-
-sliced_audio = gr.Audio(type='filepath')
-wav_plot = gr.Plot(label="Interpolation Weights Per Frame")
-
-fps = gr.Slider(1, 60, 12, step=1, label="FPS", interactive=True)
-smooth = gr.Slider(0, 1, 0.0, label="Smoothing", interactive=True)
-margin = gr.Slider(1.0, 20.0, 1.0, step=0.5, label="Margin Max", interactive=True)
-update_weight_settings_btn = gr.Button("Update Interpolation Weights")
-
-prompt_a = gr.Textbox(value='blueberry spaghetti', label="Prompt A")
-prompt_b = gr.Textbox(value='strawberry spaghetti', label="Prompt B")
-seed_a = gr.Number(-1, label="Seed A", precision=0, interactive=True)
-seed_b = gr.Number(-1, label="Seed B", precision=0, interactive=True)
-generate_images_btn = gr.Button("Generate Images")
-image_a = gr.Image(visible=False, label="Image A")
-image_b = gr.Image(visible=False, label="Image B")
-
-batch_size = gr.Slider(1, 32, 1, step=1, label="Batch Size", interactive=True)
-generate_music_video_btn = gr.Button("Generate Music Video")
-video = gr.Video(visible=False, label="Video")
-
-STEP_1_MARKDOWN = """
-## 1. Upload Some Audio
-
-Upload an audio file to use as the source for the music video.
-"""
-
-STEP_2_MARKDOWN = """
-## 2. Slice Portion of Audio for Generated Clip
-
-Here you can slice a portion of the audio to use for the generated music video. The longer the audio, the more frames will be generated (which will take longer).
-
-I suggest you use this app to make music videos in segments of 5-10 seconds at a time. Then, you can stitch the videos together using a video editor or ffmpeg later.
-
-**Warning**: If your audio file is short, I do no check that the duration you chose is not longer than the audio. It may cause some issues, so just be mindful of that.
-"""
-
-STEP_3_MARKDOWN = """
-## 3. Set Interpolation Weight Settings
-
-This section lets you play with the settings used to configure how we move through the latent space given the audio you sliced.
-
-If you look at the graph on the right, you'll see in the X-axis how many frames. The Y-axis is the weight of Image A as we move through the latent space.
-
-If you listen to the audio slice and look at the graph, you should see bumps at points where the audio energy is high (in our case, percussive energy).
-"""
-
-STEP_4_MARKDOWN = """
-## 4. Select Prompts, Seeds, Settings, and Generate Images
-
-Here you can select the settings for image generation.
-
-Then, you can select prompts and seeds for generating images.
-
-- Image A will be first frame of the generated video.
-- Image B will be last frame of the generated video.
-- The video will be generated by interpolating between the two images using the audio you provided.
-
-If you set the seeds to -1, a random seed will be used and saved for you, so you can explore different images given the same prompt.
-"""
-
-
-with gr.Blocks() as demo:
-    gr.Markdown(STEP_1_MARKDOWN)
-    audio = gr.Audio(type='filepath', interactive=True)
-    gr.Examples(
-        [
-            download_example_clip(
-                url='https://soundcloud.com/nateraw/thoughts',
-                output_dir='./music',
-                output_filename='thoughts.mp3'
-            )
-        ],
-        inputs=audio,
-        outputs=[audio_start_sec, duration],
-        fn=on_audio_change_or_clear,
-        cache_examples=False
-    )
-    audio.change(on_audio_change_or_clear, audio, [audio_start_sec, duration])
-    audio.clear(on_audio_change_or_clear, audio, [audio_start_sec, duration])
-
-    gr.Markdown(STEP_2_MARKDOWN)
-    audio_start_sec.render()
-    duration.render()
-    slice_btn.render()
-
-    slice_btn.click(on_slice_btn_click, [audio, audio_start_sec, duration, fps, smooth, margin], [sliced_audio, wav_plot])
-    sliced_audio.render()
-
-    gr.Markdown(STEP_3_MARKDOWN)
-
-    with gr.Row():
-        with gr.Column(scale=4):
-            fps.render()
-            smooth.render()
-            margin.render()
-            update_weight_settings_btn.render()
-            update_weight_settings_btn.click(
-                on_update_weight_settings_btn_click,
-                [sliced_audio, duration, fps, smooth, margin],
-                wav_plot
-            )
-        with gr.Column(scale=3):
-            wav_plot.render()
-
-    gr.Markdown(STEP_4_MARKDOWN)
-
-    with gr.Accordion("Additional Settings", open=False):
-        output_dir = gr.Textbox(value='./dreams', label="Output Directory")
-        num_inference_steps = gr.Slider(1, 200, 50, step=10, label="Diffusion Inference Steps", interactive=True)
-        guidance_scale = gr.Slider(1.0, 25.0, 7.5, step=0.5, label="Guidance Scale", interactive=True)
-        height = gr.Slider(512, 1024, 512, step=64, label="Height", interactive=True)
-        width = gr.Slider(512, 1024, 512, step=64, label="Width", interactive=True)
-        upsample = gr.Checkbox(value=False, label="Upsample with Real-ESRGAN")
-
-    with gr.Row():
-        with gr.Column(scale=4):
-            prompt_a.render()
-        with gr.Column(scale=1):
-            seed_a.render()
-
-    with gr.Row():
-        with gr.Column(scale=4):
-            prompt_b.render()
-        with gr.Column(scale=1):
-            seed_b.render()
-
-    generate_images_btn.render()
-
-    with gr.Row():
-        with gr.Column(scale=1):
-            image_a.render()
-        with gr.Column(scale=1):
-            image_b.render()
-
-    generate_images_btn.click(
-        on_generate_images_btn_click,
-        [prompt_a, prompt_b, seed_a, seed_b, output_dir, num_inference_steps, guidance_scale, height, width, upsample],
-        [image_a, image_b, seed_a, seed_b]
-    )
-
-    gr.Markdown("## 5. Generate Music Video")
-    # TODO - add equivalent code snippet to generate music video
-    batch_size.render()
-    generate_music_video_btn.render()
-    generate_music_video_btn.click(
-        on_generate_music_video_btn_click,
-        [audio, audio_start_sec, duration, fps, smooth, margin, prompt_a, prompt_b, seed_a, seed_b, batch_size, output_dir, num_inference_steps, guidance_scale, height, width, upsample],
-        video
-    )
-    video.render()
-
+interface = Interface(pipe)
 
 if __name__ == '__main__':
-    demo.launch(debug=True)
+    interface.launch(debug=True)
 
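With this change, app.py no longer builds its own gr.Blocks UI and instead hands the pipeline to the Interface class that ships with stable_diffusion_videos. A minimal sketch of the resulting file follows. The keyword arguments to from_pretrained sit in the part of the file collapsed between the two hunks, so the vae, scheduler, torch_dtype, and .to('cuda') values below are illustrative assumptions rather than values taken from the repository; the imports, the xformers check, Interface(pipe), and the launch call are the lines the diff actually keeps or adds.

```python
# Sketch of app.py after this commit. Lines marked "assumed" are NOT shown in
# the diff (they live in the collapsed portion of the from_pretrained call)
# and are filled in only for illustration.
import torch

from diffusers.models import AutoencoderKL
from diffusers.schedulers import LMSDiscreteScheduler
from diffusers.utils.import_utils import is_xformers_available
from stable_diffusion_videos import Interface, StableDiffusionWalkPipeline

pipe = StableDiffusionWalkPipeline.from_pretrained(
    'runwayml/stable-diffusion-v1-5',
    vae=AutoencoderKL.from_pretrained('stabilityai/sd-vae-ft-ema'),  # assumed
    scheduler=LMSDiscreteScheduler(
        beta_start=0.00085, beta_end=0.012, beta_schedule='scaled_linear'
    ),  # assumed
    torch_dtype=torch.float16,  # assumed
).to('cuda')  # assumed: a CUDA device is available

# Kept from the previous revision: memory-efficient attention when xformers is installed.
if is_xformers_available():
    pipe.enable_xformers_memory_efficient_attention()

# The library's bundled Gradio UI replaces the hand-rolled gr.Blocks() app removed above.
interface = Interface(pipe)

if __name__ == '__main__':
    interface.launch(debug=True)
```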