Yw22 committed on
Commit
443c891
·
1 Parent(s): 80bffe7
Files changed (2) hide show
  1. ImageConductor_app.py +0 -586
  2. app.py +9 -0
ImageConductor_app.py DELETED
@@ -1,586 +0,0 @@
1
- import os
2
- import gradio as gr
3
- import numpy as np
4
- import cv2
5
- import uuid
6
- import torch
7
- import torchvision
8
- import json
9
-
10
- from PIL import Image
11
- from omegaconf import OmegaConf
12
- from einops import rearrange, repeat
13
- from torchvision import transforms
14
- from transformers import CLIPTextModel, CLIPTokenizer
15
- from diffusers import AutoencoderKL, DDIMScheduler
16
-
17
- from pipelines.pipeline_imagecoductor import ImageConductorPipeline
18
- from modules.unet import UNet3DConditionFlowModel
19
- from utils.gradio_utils import ensure_dirname, split_filename, visualize_drag, image2pil, image2arr
20
- from utils.utils import create_image_controlnet, create_flow_controlnet, interpolate_trajectory, load_weights, load_model, bivariate_Gaussian
21
- from utils.lora_utils import add_LoRA_to_controlnet
22
- from utils.visualizer import Visualizer, vis_flow_to_video
23
- #### Description ####
24
# Static text blocks shown in the demo UI.
# The title previously carried the heading of a different project (CustomNet).
title = r"""<h1 align="center">Image Conductor: Precision Control for Interactive Video Synthesis</h1>"""

# HTML banner with the project, paper and code badges.
head = r"""
<div style="text-align: center;">
<h1>Image Conductor: Precision Control for Interactive Video Synthesis</h1>
<div style="display: flex; justify-content: center; align-items: center; text-align: center;">
<a href=""></a>
<a href='https://liyaowei-stu.github.io/project/ImageConductor/'><img src='https://img.shields.io/badge/Project_Page-ImageConductor-green' alt='Project Page'></a>
<a href='https://arxiv.org/pdf/2406.15339'><img src='https://img.shields.io/badge/Paper-Arxiv-blue'></a>
<a href='https://github.com/liyaowei-stu/ImageConductor'><img src='https://img.shields.io/badge/Code-Github-orange'></a>


</div>
</br>
</div>
"""



# Short markdown introduction.
descriptions = r"""
Official Gradio Demo for <a href='https://github.com/liyaowei-stu/ImageConductor'><b>Image Conductor: Precision Control for Interactive Video Synthesis</b></a>.<br>
🧙Image Conductor enables precise, fine-grained control for generating motion-controllable videos from images, advancing the practical application of interactive video synthesis.<br>
"""


# Step-by-step usage notes.
instructions = r"""
⭐️ <b>step1: </b>Upload or select one image from Example.
⭐️ <b>step2: </b>Click 'Add Drag' to draw some drags.
⭐️ <b>step3: </b>Input text prompt that complements the image (Necessary).
⭐️ <b>step4: </b>Select 'Drag Mode' to specify the control of camera transition or object movement.
⭐️ <b>step5: </b>Click 'Run' button to generate video assets.
⭐️ <b>others: </b>Click 'Delete last drag' to delete the whole latest path. Click 'Delete last step' to delete the latest clicked control point.
"""

# Citation and contact footer.
citation = r"""
If Image Conductor is helpful, please help to ⭐ the <a href='https://github.com/liyaowei-stu/ImageConductor' target='_blank'>Github Repo</a>. Thanks!
[![GitHub Stars](https://img.shields.io/github/stars/liyaowei-stu%2FImageConductor)](https://github.com/liyaowei-stu/ImageConductor)
---

📝 **Citation**
<br>
If our work is useful for your research, please consider citing:
```bibtex
@misc{li2024imageconductor,
title={Image Conductor: Precision Control for Interactive Video Synthesis},
author={Li, Yaowei and Wang, Xintao and Zhang, Zhaoyang and Wang, Zhouxia and Yuan, Ziyang and Xie, Liangbin and Zou, Yuexian and Shan, Ying},
year={2024},
eprint={2406.15339},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
```

📧 **Contact**
<br>
If you have any questions, please feel free to reach out to me at <b>ywl@stu.pku.edu.cn</b>.

# """
82
-
83
# Fetch the pretrained weights at start-up.
# Hugging Face serves raw file contents under /resolve/<revision>/...; the
# /blob/<revision>/... URLs used before return the HTML file-viewer page,
# which is then saved under the checkpoint's name and cannot be loaded.
_HF_WEIGHTS_URL = "https://huggingface.co/TencentARC/ImageConductor/resolve/main"
_WEIGHT_FILES = (
    ("flow_controlnet.ckpt", "models/"),
    ("image_controlnet.ckpt", "models/"),
    ("unet.ckpt", "models/"),
    ("helloobjects_V12c.safetensors", "models/personalized"),
    ("TUSUN.safetensors", "models/personalized"),
)

# exist_ok: restarting the app must not fail because the folder is already there.
os.makedirs("models/personalized", exist_ok=True)
for _weight_name, _weight_dir in _WEIGHT_FILES:
    # Skip files already on disk; wget would otherwise store a duplicate
    # under a numbered name on every start.
    if os.path.exists(os.path.join(_weight_dir, _weight_name)):
        continue
    if os.system(f'wget {_HF_WEIGHTS_URL}/{_weight_name} -P {_weight_dir}') != 0:
        print(f"warning: failed to download {_weight_name}")
89
-
90
-
91
-
92
-
93
- # - - - - - examples - - - - - #
94
def _load_trajectory(json_path):
    """Read one example trajectory file and close it afterwards."""
    with open(json_path) as traj_file:
        return json.load(traj_file)


# Each example row is:
# [input image, prompt, drag mode, seed, personalized template,
#  tracking points, first-frame path]
# (same order as the `inputs` list handed to gr.Examples).
image_examples = [
    [
        "__asset__/images/object/turtle-1.jpg",
        "a sea turtle gracefully swimming over a coral reef in the clear blue ocean.",
        "object",
        11318446767408804497,
        "",
        _load_trajectory("__asset__/trajs/object/turtle-1.json"),
        "__asset__/images/object/turtle-1.jpg",
    ],

    [
        "__asset__/images/object/rose-1.jpg",
        "a red rose engulfed in flames.",
        "object",
        6854275249656120509,
        "",
        _load_trajectory("__asset__/trajs/object/rose-1.json"),
        "__asset__/images/object/rose-1.jpg",
    ],

    [
        "__asset__/images/object/jellyfish-1.jpg",
        "intricate detailing,photorealism,hyperrealistic, glowing jellyfish mushroom, flying, starry sky, bokeh, golden ratio composition.",
        "object",
        17966188172968903484,
        "HelloObject",
        _load_trajectory("__asset__/trajs/object/jellyfish-1.json"),
        "__asset__/images/object/jellyfish-1.jpg",
    ],

    [
        "__asset__/images/camera/lush-1.jpg",
        "detailed craftsmanship, photorealism, hyperrealistic, roaring waterfall, misty spray, lush greenery, vibrant rainbow, golden ratio composition.",
        "camera",
        7970487946960948963,
        "HelloObject",
        _load_trajectory("__asset__/trajs/camera/lush-1.json"),
        "__asset__/images/camera/lush-1.jpg",
    ],

    [
        "__asset__/images/camera/tusun-1.jpg",
        "tusuncub with its mouth open, blurry, open mouth, fangs, photo background, looking at viewer, tongue, full body, solo, cute and lovely, Beautiful and realistic eye details, perfect anatomy, Nonsense, pure background, Centered-Shot, realistic photo, photograph, 4k, hyper detailed, DSLR, 24 Megapixels, 8mm Lens, Full Frame, film grain, Global Illumination, studio Lighting, Award Winning Photography, diffuse reflection, ray tracing.",
        "camera",
        996953226890228361,
        "TUSUN",
        _load_trajectory("__asset__/trajs/camera/tusun-1.json"),
        "__asset__/images/camera/tusun-1.jpg",
    ],

    [
        "__asset__/images/camera/painting-1.jpg",
        "A oil painting.",
        "camera",
        16867854766769816385,
        "",
        _load_trajectory("__asset__/trajs/camera/painting-1.json"),
        "__asset__/images/camera/painting-1.jpg",
    ],

]
151
-
152
-
153
# Lookup tables keyed by the "Personalized template" name.

# Template name -> DreamBooth checkpoint path.
DREAM_BOOTH = dict(
    HelloObject='models/personalized/helloobjects_V12c.safetensors',
)

# Template name -> LoRA checkpoint path.
LORA = dict(
    TUSUN='models/personalized/TUSUN.safetensors',
)

# Template name -> LoRA alpha value.
LORA_ALPHA = dict(
    TUSUN=0.6,
)

# Template name -> negative prompt that replaces the user-supplied one.
NPROMPT = dict(
    HelloObject='FastNegativeV2,(bad-artist:1),(worst quality, low quality:1.4),(bad_prompt_version2:0.8),bad-hands-5,lowres,bad anatomy,bad hands,((text)),(watermark),error,missing fingers,extra digit,fewer digits,cropped,worst quality,low quality,normal quality,((username)),blurry,(extra limbs),bad-artist-anime,badhandv4,EasyNegative,ng_deepnegative_v1_75t,verybadimagenegative_v1.3,BadDream,(three hands:1.6),(three legs:1.2),(more than two hands:1.4),(more than two legs,:1.2)',
)
168
-
169
- output_dir = "outputs"
170
- ensure_dirname(output_dir)
171
-
172
def points_to_flows(track_points, model_length, height, width):
    """Rasterise drag trajectories into a sparse per-frame displacement field.

    Args:
        track_points: iterable of trajectories, each a sequence of (x, y)
            points. Empty trajectories are ignored.
        model_length: number of frames; each trajectory is resampled with
            ``interpolate_trajectory`` and then cut or padded to this length.
        height: number of rows of the output field.
        width: number of columns of the output field.

    Returns:
        Array of shape ``(model_length - 1, height, width, 2)``. For frame
        ``i`` the pixel under the trajectory's i-th point holds the (dx, dy)
        step to its next point; every other pixel is zero.
    """
    input_drag = np.zeros((model_length - 1, height, width, 2))
    for splited_track in track_points:
        if len(splited_track) == 0:
            # A drag that was started but never received a point.
            continue
        if len(splited_track) == 1:  # stationary point
            displacement_point = tuple([splited_track[0][0] + 1, splited_track[0][1] + 1])
            splited_track = tuple([splited_track[0], displacement_point])
        # interpolate the track
        splited_track = interpolate_trajectory(splited_track, model_length)
        splited_track = splited_track[:model_length]
        if len(splited_track) == 0:
            continue
        if len(splited_track) < model_length:
            # Pad by repeating the final point so the tail has zero motion.
            splited_track = splited_track + [splited_track[-1]] * (model_length - len(splited_track))
        for i in range(model_length - 1):
            start_point = splited_track[i]
            end_point = splited_track[i + 1]
            # Clamp to the image: an index equal to width/height would raise
            # IndexError and a negative one would silently wrap around.
            row = min(max(int(start_point[1]), 0), height - 1)
            col = min(max(int(start_point[0]), 0), width - 1)
            input_drag[i][row][col][0] = end_point[0] - start_point[0]
            input_drag[i][row][col][1] = end_point[1] - start_point[1]
    return input_drag
189
-
190
class ImageConductor:
    """Hold the Image Conductor models and generate a video from drags.

    The constructor loads the Stable Diffusion v1.5 tokenizer, text encoder
    and VAE, builds the 3D UNet plus the image and flow ControlNets from the
    given checkpoints, and combines them into an ``ImageConductorPipeline``.
    """

    def __init__(self, device, unet_path, image_controlnet_path, flow_controlnet_path, height, width, model_length, lora_rank=64):
        """Load every model component and move it to ``device``.

        Args:
            device: torch device all modules and inputs are placed on.
            unet_path: checkpoint loaded into the UNet.
            image_controlnet_path: checkpoint loaded into the image ControlNet.
            flow_controlnet_path: checkpoint loaded into the flow ControlNet.
            height: output frame height in pixels.
            width: output frame width in pixels.
            model_length: number of frames to generate.
            lora_rank: rank passed to ``add_LoRA_to_controlnet``.
        """
        self.device = device
        tokenizer = CLIPTokenizer.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="tokenizer")
        # Use the requested device instead of a hard-coded .cuda() so the CPU
        # fallback chosen by the caller does not crash here.
        text_encoder = CLIPTextModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="text_encoder").to(device)
        vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae").to(device)
        inference_config = OmegaConf.load("configs/inference/inference.yaml")
        unet = UNet3DConditionFlowModel.from_pretrained_2d("runwayml/stable-diffusion-v1-5", subfolder="unet", unet_additional_kwargs=OmegaConf.to_container(inference_config.unet_additional_kwargs))

        self.vae = vae

        ### >>> Initialize UNet module >>> ###
        load_model(unet, unet_path)

        ### >>> Initialize image controlnet module >>> ###
        image_controlnet = create_image_controlnet("configs/inference/image_condition.yaml", unet)
        load_model(image_controlnet, image_controlnet_path)
        ### >>> Initialize flow controlnet module >>> ###
        flow_controlnet = create_flow_controlnet("configs/inference/flow_condition.yaml", unet)
        # LoRA layers are added before the checkpoint is loaded into the module.
        add_LoRA_to_controlnet(lora_rank, flow_controlnet)
        load_model(flow_controlnet, flow_controlnet_path)

        unet.eval().to(device)
        image_controlnet.eval().to(device)
        flow_controlnet.eval().to(device)

        self.pipeline = ImageConductorPipeline(
            unet=unet,
            vae=vae,
            tokenizer=tokenizer,
            text_encoder=text_encoder,
            scheduler=DDIMScheduler(**OmegaConf.to_container(inference_config.noise_scheduler_kwargs)),
            image_controlnet=image_controlnet,
            flow_controlnet=flow_controlnet,
        ).to(device)

        self.height = height
        self.width = width
        self.model_length = model_length

        # Kernel used to blur the sparse flow maps in run().
        self.blur_kernel = bivariate_Gaussian(kernel_size=99, sig_x=10, sig_y=10, theta=0, grid=None, isotropic=True)

    @torch.no_grad()
    def run(self, first_frame_path, tracking_points, prompt, drag_mode, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps, personalized):
        """Generate a video from the first frame, the drags and the prompt.

        Args:
            first_frame_path: path of the first-frame image on disk.
            tracking_points: list of trajectories, or an object exposing them
                as ``constructor_args['value']``.
            prompt: text prompt.
            drag_mode: forwarded to the pipeline as ``control_mode``.
            negative_prompt: negative prompt; replaced by the template's own
                negative prompt when ``NPROMPT`` has one for ``personalized``.
            seed: seed used when ``randomize_seed`` is false.
            randomize_seed: draw a fresh seed with ``torch.seed()`` when true.
            guidance_scale: guidance scale passed to the pipeline.
            num_inference_steps: number of denoising steps.
            personalized: key into ``DREAM_BOOTH`` / ``LORA`` / ``NPROMPT``.

        Returns:
            ``(visualized_drag, outputs_path)``: the drag visualisation from
            ``visualize_drag`` and the path of the written mp4.
        """
        # Click coordinates are rescaled from this size to the model size.
        original_width, original_height = 384, 256
        if isinstance(tracking_points, list):
            input_all_points = tracking_points
        else:
            input_all_points = tracking_points.constructor_args['value']

        # Drop drags that never received a point; they carry no motion.
        resized_all_points = [
            tuple(
                (float(point[0] * self.width / original_width), float(point[1] * self.height / original_height))
                for point in track
            )
            for track in input_all_points
            if track
        ]

        _, base, _ = split_filename(first_frame_path)
        sample_id = base.split('_')[-1]

        with open(f'{output_dir}/points-{sample_id}.json', 'w') as f:
            json.dump(input_all_points, f)

        visualized_drag, _ = visualize_drag(first_frame_path, resized_all_points, drag_mode, self.width, self.height, self.model_length)

        ## image condition
        image_transforms = transforms.Compose([
            transforms.RandomResizedCrop(
                (self.height, self.width), (1.0, 1.0),
                ratio=(self.width / self.height, self.width / self.height)
            ),
            transforms.ToTensor(),
        ])

        image_paths = [first_frame_path]
        controlnet_images = [image_transforms(Image.open(path).convert("RGB")) for path in image_paths]
        controlnet_images = torch.stack(controlnet_images).unsqueeze(0).to(self.device)
        controlnet_images = rearrange(controlnet_images, "b f c h w -> b c f h w")
        num_controlnet_images = controlnet_images.shape[2]
        controlnet_images = rearrange(controlnet_images, "b c f h w -> (b f) c h w")
        # Map [0, 1] pixels to [-1, 1] before encoding, then scale the latents.
        controlnet_images = self.vae.encode(controlnet_images * 2. - 1.).latent_dist.sample() * 0.18215
        controlnet_images = rearrange(controlnet_images, "(b f) c h w -> b c f h w", f=num_controlnet_images)

        # flow condition
        controlnet_flows = points_to_flows(resized_all_points, self.model_length, self.height, self.width)
        for frame_idx in range(0, self.model_length - 1):
            controlnet_flows[frame_idx] = cv2.filter2D(controlnet_flows[frame_idx], -1, self.blur_kernel)
        controlnet_flows = np.concatenate([np.zeros_like(controlnet_flows[0])[np.newaxis, ...], controlnet_flows], axis=0)  # pad the first frame with zero flow
        os.makedirs(os.path.join(output_dir, "control_flows"), exist_ok=True)
        trajs_video = vis_flow_to_video(controlnet_flows, num_frames=self.model_length)  # T-1 x H x W x 3
        torchvision.io.write_video(f'{output_dir}/control_flows/sample-{sample_id}-train_flow.mp4', trajs_video, fps=8, video_codec='h264', options={'crf': '10'})
        controlnet_flows = torch.from_numpy(controlnet_flows)[None].to(controlnet_images)[:, :self.model_length, ...]
        controlnet_flows = rearrange(controlnet_flows, "b f h w c-> b c f h w")

        dreambooth_model_path = DREAM_BOOTH.get(personalized, '')
        lora_model_path = LORA.get(personalized, '')
        lora_alpha = LORA_ALPHA.get(personalized, 0.6)
        # self.device, not the module-level `device`, so the class does not
        # depend on a global defined by the launching script.
        self.pipeline = load_weights(
            self.pipeline,
            dreambooth_model_path=dreambooth_model_path,
            lora_model_path=lora_model_path,
            lora_alpha=lora_alpha,
        ).to(self.device)

        preset_negative_prompt = NPROMPT.get(personalized, '')
        if preset_negative_prompt != '':
            negative_prompt = preset_negative_prompt

        if randomize_seed:
            random_seed = torch.seed()
        else:
            seed = int(seed)
            random_seed = seed
        torch.manual_seed(random_seed)
        torch.cuda.manual_seed_all(random_seed)
        print(f"current seed: {torch.initial_seed()}")
        sample = self.pipeline(
            prompt,
            negative_prompt=negative_prompt,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            width=self.width,
            height=self.height,
            video_length=self.model_length,
            controlnet_images=controlnet_images,  # 1 4 1 32 48
            controlnet_image_index=[0],
            controlnet_flows=controlnet_flows,  # [1, 2, 16, 256, 384]
            control_mode=drag_mode,
            eval_mode=True,
        ).videos

        # The name used to embed the leftover index of the blur loop above,
        # which is undefined when that loop runs zero times.
        outputs_path = os.path.join(output_dir, f'output_{sample_id}.mp4')
        vis_video = (rearrange(sample[0], 'c t h w -> t h w c') * 255.).clip(0, 255)
        torchvision.io.write_video(outputs_path, vis_video, fps=8, video_codec='h264', options={'crf': '10'})

        return visualized_drag, outputs_path
331
-
332
-
333
def reset_states(first_frame_path, tracking_points):
    """Return the cleared values for the image, first-frame and drag outputs.

    Both arguments are ignored. The result is ``None`` followed by a new
    empty ``gr.State`` and a new ``gr.State`` holding an empty list.
    """
    cleared_frame_path = gr.State()
    cleared_points = gr.State([])
    return (None, cleared_frame_path, cleared_points)
337
-
338
-
339
def preprocess_image(image):
    """Resize and centre-crop an uploaded image to 384x256 and save it.

    Args:
        image: uploaded file object; only its ``name`` attribute is read.

    Returns:
        ``(first_frame_path, first_frame_path, gr.State([]))``: the saved
        JPEG path twice, plus a fresh empty drag state.
    """
    image_pil = image2pil(image.name)
    raw_w, raw_h = image_pil.size
    # Scale so that both sides cover the 384x256 target.
    resize_ratio = max(384 / raw_w, 256 / raw_h)
    # int() truncation of a float product can land one pixel below the
    # target; CenterCrop would then pad the missing pixels with black.
    new_w = max(384, int(raw_w * resize_ratio))
    new_h = max(256, int(raw_h * resize_ratio))
    image_pil = image_pil.resize((new_w, new_h), Image.BILINEAR)
    image_pil = transforms.CenterCrop((256, 384))(image_pil.convert('RGB'))
    # Short random suffix for the saved file name.
    frame_id = str(uuid.uuid4())[:4]
    first_frame_path = os.path.join(output_dir, f"first_frame_{frame_id}.jpg")
    image_pil.save(first_frame_path, quality=95)
    return first_frame_path, first_frame_path, gr.State([])
349
-
350
-
351
# RGBA colour used to draw the trajectories of each drag mode.
_DRAG_COLORS = {
    'object': (255, 0, 0, 255),
    'camera': (0, 0, 255, 255),
}


def _drag_color(drag_mode):
    """Return the RGBA colour for ``drag_mode`` or raise ``ValueError``."""
    if drag_mode not in _DRAG_COLORS:
        raise ValueError(f"Unsupported drag mode: {drag_mode!r}")
    return _DRAG_COLORS[drag_mode]


def _stored_tracks(tracking_points):
    """Return the mutable list of tracks behind ``tracking_points``.

    Accepts either a plain list or an object exposing the list as
    ``constructor_args['value']``.
    """
    if isinstance(tracking_points, list):
        return tracking_points
    return tracking_points.constructor_args['value']


def _render_trajectory_map(tracks, first_frame_path, color):
    """Draw every track on top of the first frame and return the RGBA image."""
    transparent_background = Image.open(first_frame_path).convert('RGBA')
    w, h = transparent_background.size
    transparent_layer = np.zeros((h, w, 4))
    for track in tracks:
        if not track:
            # Drag without points yet: nothing to draw.
            continue
        if len(track) == 1:
            cv2.circle(transparent_layer, tuple(track[0]), 5, color, -1)
            continue
        last_segment = len(track) - 2
        for i in range(len(track) - 1):
            start_point = track[i]
            end_point = track[i + 1]
            vx = end_point[0] - start_point[0]
            vy = end_point[1] - start_point[1]
            arrow_length = np.sqrt(vx**2 + vy**2)
            # Only the final segment gets an arrow head. A zero-length
            # segment (same pixel clicked twice) is drawn as a plain line
            # because the tip length would divide by zero.
            if i == last_segment and arrow_length > 0:
                cv2.arrowedLine(transparent_layer, tuple(start_point), tuple(end_point), color, 2, tipLength=8 / arrow_length)
            else:
                cv2.line(transparent_layer, tuple(start_point), tuple(end_point), color, 2,)

    transparent_layer = Image.fromarray(transparent_layer.astype(np.uint8))
    return Image.alpha_composite(transparent_background, transparent_layer)


def add_tracking_points(tracking_points, first_frame_path, drag_mode, evt: gr.SelectData):  # SelectData is a subclass of EventData
    """Append the clicked pixel to the current drag and redraw the paths.

    Starts a first drag automatically when none exists yet.

    Returns:
        ``(tracking_points, trajectory_map)``.
    """
    color = _drag_color(drag_mode)
    tracks = _stored_tracks(tracking_points)

    print(f"You selected {evt.value} at {evt.index} from {evt.target}")
    if not tracks:
        tracks.append([])
    tracks[-1].append(evt.index)
    print(tracks)

    trajectory_map = _render_trajectory_map(tracks, first_frame_path, color)
    return tracking_points, trajectory_map


def add_drag(tracking_points):
    """Start a new, empty drag and return the updated state."""
    tracks = _stored_tracks(tracking_points)
    tracks.append([])
    print(tracks)
    return tracking_points


def delete_last_drag(tracking_points, first_frame_path, drag_mode):
    """Remove the most recent drag, if any, and redraw the remaining paths.

    Returns:
        ``(tracking_points, trajectory_map)``.
    """
    color = _drag_color(drag_mode)
    tracks = _stored_tracks(tracking_points)
    if tracks:
        tracks.pop()
    trajectory_map = _render_trajectory_map(tracks, first_frame_path, color)
    return tracking_points, trajectory_map


def delete_last_step(tracking_points, first_frame_path, drag_mode):
    """Remove the last point of the most recent drag, if any, and redraw.

    Returns:
        ``(tracking_points, trajectory_map)``.
    """
    color = _drag_color(drag_mode)
    tracks = _stored_tracks(tracking_points)
    if tracks and tracks[-1]:
        tracks[-1].pop()
    trajectory_map = _render_trajectory_map(tracks, first_frame_path, color)
    return tracking_points, trajectory_map
447
-
448
-
449
# Build and launch the Gradio demo.
# NOTE(review): SOURCE lost its indentation, so the nesting of the layout
# containers below is reconstructed from context -- confirm against the repo.
block = gr.Blocks(
    theme=gr.themes.Soft(
        radius_size=gr.themes.sizes.radius_none,
        text_size=gr.themes.sizes.text_md
    )
).queue()
with block as demo:
    # Header: banner, description and usage instructions.
    with gr.Row():
        with gr.Column():
            gr.HTML(head)

            gr.Markdown(descriptions)

            with gr.Accordion(label="🛠️ Instructions:", open=True, elem_id="accordion"):
                with gr.Row(equal_height=True):
                    gr.Markdown(instructions)


    # Model construction happens while the UI is being built; fall back to
    # the CPU when CUDA is not available.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    unet_path = 'models/unet.ckpt'
    image_controlnet_path = 'models/image_controlnet.ckpt'
    flow_controlnet_path = 'models/flow_controlnet.ckpt'
    ImageConductor_net = ImageConductor(device=device,
                                        unet_path=unet_path,
                                        image_controlnet_path=image_controlnet_path,
                                        flow_controlnet_path=flow_controlnet_path,
                                        height=256,
                                        width=384,
                                        model_length=16
                                        )
    # Session state: path of the saved first frame and the list of drags.
    first_frame_path = gr.State()
    tracking_points = gr.State([])


    # Controls on the left, input image and motion-path preview on the right.
    with gr.Row():
        with gr.Column(scale=1):
            image_upload_button = gr.UploadButton(label="Upload Image",file_types=["image"])
            add_drag_button = gr.Button(value="Add Drag")
            reset_button = gr.Button(value="Reset")
            delete_last_drag_button = gr.Button(value="Delete last drag")
            delete_last_step_button = gr.Button(value="Delete last step")



        with gr.Column(scale=7):
            with gr.Row():
                with gr.Column(scale=6):
                    input_image = gr.Image(label=None,
                                           interactive=True,
                                           height=256,
                                           width=384,)
                with gr.Column(scale=6):
                    output_image = gr.Image(label="Motion Path",
                                            interactive=False,
                                            height=256,
                                            width=384,)
    # Prompt and generation parameters, with the output video beside them.
    with gr.Row():
        with gr.Column(scale=1):
            prompt = gr.Textbox(value="a wonderful elf.", label="Prompt (highly-recommended)", interactive=True, visible=True)
            negative_prompt = gr.Text(
                label="Negative Prompt",
                max_lines=5,
                placeholder="Please input your negative prompt",
                value='worst quality, low quality, letterboxed',lines=1
            )
            drag_mode = gr.Radio(['camera', 'object'], label='Drag mode: ', value='object', scale=2)
            run_button = gr.Button(value="Run")

            with gr.Accordion("More input params", open=False, elem_id="accordion1"):
                with gr.Group():
                    # The seed is a textbox; ImageConductor.run converts it with int().
                    seed = gr.Textbox(
                        label="Seed: ", value=561793204,
                    )
                    randomize_seed = gr.Checkbox(label="Randomize seed", value=False)

                with gr.Group():
                    with gr.Row():
                        guidance_scale = gr.Slider(
                            label="Guidance scale",
                            minimum=1,
                            maximum=12,
                            step=0.1,
                            value=8.5,
                        )
                        num_inference_steps = gr.Slider(
                            label="Number of inference steps",
                            minimum=1,
                            maximum=50,
                            step=1,
                            value=25,
                        )

                with gr.Group():
                    # The default "" matches none of the listed choices; the
                    # lookup tables in run() fall back to defaults for it.
                    personalized = gr.Dropdown(label="Personalized template", choices=['HelloObject', 'TUSUN'], value="")

        with gr.Column(scale=7):
            output_video = gr.Video(value=None,
                                    label="Output Video",
                                    width=384,
                                    height=256)


    with gr.Row():
        # Identity function: clicking an example copies its row into the inputs.
        def process_example(input_image, prompt, drag_mode, seed, personalized, tracking_points, first_frame_path):

            return input_image, prompt, drag_mode, seed, personalized, tracking_points, first_frame_path

        example = gr.Examples(
            label="Input Example",
            examples=image_examples,
            inputs=[input_image, prompt, drag_mode, seed, personalized, tracking_points, first_frame_path],
            outputs=[input_image, prompt, drag_mode, seed, personalized, tracking_points, first_frame_path],
            fn=process_example,
            run_on_click=True,
            examples_per_page=10
        )

    with gr.Row():
        gr.Markdown(citation)


    # Event wiring.
    image_upload_button.upload(preprocess_image, image_upload_button, [input_image, first_frame_path, tracking_points])

    add_drag_button.click(add_drag, tracking_points, tracking_points)

    delete_last_drag_button.click(delete_last_drag, [tracking_points, first_frame_path, drag_mode], [tracking_points, input_image])

    delete_last_step_button.click(delete_last_step, [tracking_points, first_frame_path, drag_mode], [tracking_points, input_image])

    reset_button.click(reset_states, [first_frame_path, tracking_points], [input_image, first_frame_path, tracking_points])

    # Clicking on the input image adds a point to the current drag.
    input_image.select(add_tracking_points, [tracking_points, first_frame_path, drag_mode], [tracking_points, input_image])

    run_button.click(ImageConductor_net.run, [first_frame_path, tracking_points, prompt, drag_mode,
                                              negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps, personalized],
                                              [output_image, output_video])

    # Listens on all interfaces, port 12345.
    demo.launch(server_name="0.0.0.0", debug=True, server_port=12345)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -80,6 +80,15 @@ If you have any questions, please feel free to reach me out at <b>ywl@stu.pku.ed
80
 
81
  # """
82
 
 
 
 
 
 
 
 
 
 
83
 
84
  # - - - - - examples - - - - - #
85
  image_examples = [
 
80
 
81
  # """
82
 
83
# Fetch the pretrained weights at start-up.
# Hugging Face serves raw file contents under /resolve/<revision>/...; the
# /blob/<revision>/... URLs used before return the HTML file-viewer page,
# which is then saved under the checkpoint's name and cannot be loaded.
_HF_WEIGHTS_URL = "https://huggingface.co/TencentARC/ImageConductor/resolve/main"
_WEIGHT_FILES = (
    ("flow_controlnet.ckpt", "models/"),
    ("image_controlnet.ckpt", "models/"),
    ("unet.ckpt", "models/"),
    ("helloobjects_V12c.safetensors", "models/personalized"),
    ("TUSUN.safetensors", "models/personalized"),
)

# exist_ok: restarting the app must not fail because the folder is already there.
os.makedirs("models/personalized", exist_ok=True)
for _weight_name, _weight_dir in _WEIGHT_FILES:
    # Skip files already on disk; wget would otherwise store a duplicate
    # under a numbered name on every start.
    if os.path.exists(os.path.join(_weight_dir, _weight_name)):
        continue
    if os.system(f'wget {_HF_WEIGHTS_URL}/{_weight_name} -P {_weight_dir}') != 0:
        print(f"warning: failed to download {_weight_name}")
89
+
90
+
91
+
92
 
93
  # - - - - - examples - - - - - #
94
  image_examples = [