Spaces: Running on Zero

sdsdsdadasd3 committed
Commit • dd60833
Parent(s): 7c2f6e2

[Release] v1.0.1
- improve the performance
- improve efficiency

Files changed:
- app.py +21 -12
- depthcrafter/utils.py +44 -0
- run.py +2 -50
app.py
CHANGED
@@ -17,11 +17,11 @@ from huggingface_hub import hf_hub_download
 from depthcrafter.utils import read_video_frames, vis_sequence_depth, save_video
 
 examples = [
-    ["examples/example_01.mp4",
-    ["examples/example_02.mp4",
-    ["examples/example_03.mp4",
-    ["examples/example_04.mp4",
-    ["examples/example_05.mp4",
+    ["examples/example_01.mp4", 5, 1.0, 1024, -1, -1],
+    ["examples/example_02.mp4", 5, 1.0, 1024, -1, -1],
+    ["examples/example_03.mp4", 5, 1.0, 1024, -1, -1],
+    ["examples/example_04.mp4", 5, 1.0, 1024, -1, -1],
+    ["examples/example_05.mp4", 5, 1.0, 1024, -1, -1],
 ]
 
 
@@ -39,18 +39,18 @@ pipe = DepthCrafterPipeline.from_pretrained(
 pipe.to("cuda")
 
 
-@spaces.GPU(duration=
+@spaces.GPU(duration=120)
 def infer_depth(
     video: str,
     num_denoising_steps: int,
     guidance_scale: float,
     max_res: int = 1024,
-    process_length: int =
+    process_length: int = -1,
     #
     save_folder: str = "./demo_output",
     window_size: int = 110,
     overlap: int = 25,
-    target_fps: int =
+    target_fps: int = -1,
     seed: int = 42,
     track_time: bool = True,
     save_npz: bool = False,
@@ -59,7 +59,6 @@ def infer_depth(
     pipe.enable_xformers_memory_efficient_attention()
 
     frames, target_fps = read_video_frames(video, process_length, target_fps, max_res)
-    print(f"==> video name: {video}, frames shape: {frames.shape}")
 
     # inference the depth map using the DepthCrafter pipeline
     with torch.inference_mode():
@@ -82,6 +81,7 @@ def infer_depth(
     vis = vis_sequence_depth(res)
     # save the depth map and visualization with the target FPS
     save_path = os.path.join(save_folder, os.path.splitext(os.path.basename(video))[0])
+    print(f"==> saving results to {save_path}")
     os.makedirs(os.path.dirname(save_path), exist_ok=True)
     if save_npz:
         np.savez_compressed(save_path + ".npz", depth=res)
@@ -155,14 +155,14 @@ def construct_demo():
                     label="num denoising steps",
                     minimum=1,
                     maximum=25,
-                    value=
+                    value=5,
                     step=1,
                 )
                 guidance_scale = gr.Slider(
                     label="cfg scale",
                     minimum=1.0,
                     maximum=1.2,
-                    value=1.
+                    value=1.0,
                     step=0.1,
                 )
                 max_res = gr.Slider(
@@ -174,11 +174,18 @@ def construct_demo():
                 )
                 process_length = gr.Slider(
                     label="process length",
-                    minimum
+                    minimum=-1,
                     maximum=280,
                     value=60,
                     step=1,
                 )
+                process_target_fps = gr.Slider(
+                    label="target FPS",
+                    minimum=-1,
+                    maximum=30,
+                    value=15,
+                    step=1,
+                )
                 generate_btn = gr.Button("Generate")
             with gr.Column(scale=2):
                 pass
@@ -191,6 +198,7 @@ def construct_demo():
             guidance_scale,
             max_res,
             process_length,
+            process_target_fps,
         ],
         outputs=[output_video_1, output_video_2],
         fn=infer_depth,
@@ -216,6 +224,7 @@ def construct_demo():
             guidance_scale,
             max_res,
             process_length,
+            process_target_fps,
        ],
        outputs=[output_video_1, output_video_2],
    )
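For orientation only (not part of the commit): a minimal, self-contained sketch of the pattern the app.py changes rely on, namely reserving a ZeroGPU slot with @spaces.GPU(duration=120) and feeding the new "target FPS" slider into the inference function. It assumes the huggingface `spaces` package and Gradio are available; the trimmed signature, placeholder body, and output labels are illustrative, not the actual DepthCrafter code.

    import gradio as gr
    import spaces  # available on Hugging Face ZeroGPU Spaces

    @spaces.GPU(duration=120)  # request a ZeroGPU slot for up to 120 s per call
    def infer_depth(video: str, process_length: int, target_fps: int):
        # placeholder body; the real app runs the DepthCrafter pipeline here
        return video, video

    with gr.Blocks() as demo:
        input_video = gr.Video(label="Input Video")
        process_length = gr.Slider(label="process length", minimum=-1, maximum=280, value=60, step=1)
        process_target_fps = gr.Slider(label="target FPS", minimum=-1, maximum=30, value=15, step=1)
        output_video_1 = gr.Video(label="output 1")
        output_video_2 = gr.Video(label="output 2")
        gr.Button("Generate").click(
            fn=infer_depth,
            inputs=[input_video, process_length, process_target_fps],
            outputs=[output_video_1, output_video_2],
        )

    demo.launch()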
depthcrafter/utils.py
CHANGED
@@ -5,6 +5,50 @@ import PIL.Image
 import matplotlib.cm as cm
 import mediapy
 import torch
+from decord import VideoReader, cpu
+
+dataset_res_dict = {
+    "sintel": [448, 1024],
+    "scannet": [640, 832],
+    "KITTI": [384, 1280],
+    "bonn": [512, 640],
+    "NYUv2": [448, 640],
+}
+
+
+def read_video_frames(video_path, process_length, target_fps, max_res, dataset="open"):
+    if dataset == "open":
+        print("==> processing video: ", video_path)
+        vid = VideoReader(video_path, ctx=cpu(0))
+        print("==> original video shape: ", (len(vid), *vid.get_batch([0]).shape[1:]))
+        original_height, original_width = vid.get_batch([0]).shape[1:3]
+        height = round(original_height / 64) * 64
+        width = round(original_width / 64) * 64
+        if max(height, width) > max_res:
+            scale = max_res / max(original_height, original_width)
+            height = round(original_height * scale / 64) * 64
+            width = round(original_width * scale / 64) * 64
+    else:
+        height = dataset_res_dict[dataset][0]
+        width = dataset_res_dict[dataset][1]
+
+    vid = VideoReader(video_path, ctx=cpu(0), width=width, height=height)
+
+    fps = vid.get_avg_fps() if target_fps == -1 else target_fps
+    stride = round(vid.get_avg_fps() / fps)
+    stride = max(stride, 1)
+    frames_idx = list(range(0, len(vid), stride))
+    print(
+        f"==> downsampled shape: {len(frames_idx), *vid.get_batch([0]).shape[1:]}, with stride: {stride}"
+    )
+    if process_length != -1 and process_length < len(frames_idx):
+        frames_idx = frames_idx[:process_length]
+    print(
+        f"==> final processing shape: {len(frames_idx), *vid.get_batch([0]).shape[1:]}"
+    )
+    frames = vid.get_batch(frames_idx).asnumpy().astype("float32") / 255.0
+
+    return frames, fps
 
 
 def save_video(
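For reference, a small usage sketch of the read_video_frames helper added above. It assumes decord is installed and the example clip shipped with the demo exists; the argument values simply mirror the demo defaults.

    from depthcrafter.utils import read_video_frames

    # target_fps=-1 keeps the source frame rate; process_length=-1 keeps every strided frame.
    frames, fps = read_video_frames(
        "examples/example_01.mp4",
        process_length=60,   # cap processing at 60 frames (the demo slider default)
        target_fps=15,       # subsample to roughly 15 fps via an integer frame stride
        max_res=1024,        # longest side scaled to <= 1024 and snapped to a multiple of 64
    )
    print(frames.shape, fps)  # float32 array in [0, 1], shape (num_frames, H, W, 3)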
run.py
CHANGED
@@ -3,21 +3,12 @@ import os
 import numpy as np
 import torch
 
-from decord import VideoReader, cpu
 from diffusers.training_utils import set_seed
 from fire import Fire
 
 from depthcrafter.depth_crafter_ppl import DepthCrafterPipeline
 from depthcrafter.unet import DiffusersUNetSpatioTemporalConditionModelDepthCrafter
-from depthcrafter.utils import vis_sequence_depth, save_video
-
-dataset_res_dict = {
-    "sintel": [448, 1024],
-    "scannet": [640, 832],
-    "KITTI": [384, 1280],
-    "bonn": [512, 640],
-    "NYUv2": [448, 640],
-}
+from depthcrafter.utils import vis_sequence_depth, save_video, read_video_frames
 
 
 class DepthCrafterDemo:
@@ -59,45 +50,6 @@ class DepthCrafterDemo:
             print("Xformers is not enabled")
             self.pipe.enable_attention_slicing()
 
-    @staticmethod
-    def read_video_frames(
-        video_path, process_length, target_fps, max_res, dataset="open"
-    ):
-        if dataset == "open":
-            print("==> processing video: ", video_path)
-            vid = VideoReader(video_path, ctx=cpu(0))
-            print(
-                "==> original video shape: ", (len(vid), *vid.get_batch([0]).shape[1:])
-            )
-            original_height, original_width = vid.get_batch([0]).shape[1:3]
-            height = round(original_height / 64) * 64
-            width = round(original_width / 64) * 64
-            if max(height, width) > max_res:
-                scale = max_res / max(original_height, original_width)
-                height = round(original_height * scale / 64) * 64
-                width = round(original_width * scale / 64) * 64
-        else:
-            height = dataset_res_dict[dataset][0]
-            width = dataset_res_dict[dataset][1]
-
-        vid = VideoReader(video_path, ctx=cpu(0), width=width, height=height)
-
-        fps = vid.get_avg_fps() if target_fps == -1 else target_fps
-        stride = round(vid.get_avg_fps() / fps)
-        stride = max(stride, 1)
-        frames_idx = list(range(0, len(vid), stride))
-        print(
-            f"==> downsampled shape: {len(frames_idx), *vid.get_batch([0]).shape[1:]}, with stride: {stride}"
-        )
-        if process_length != -1 and process_length < len(frames_idx):
-            frames_idx = frames_idx[:process_length]
-        print(
-            f"==> final processing shape: {len(frames_idx), *vid.get_batch([0]).shape[1:]}"
-        )
-        frames = vid.get_batch(frames_idx).asnumpy().astype("float32") / 255.0
-
-        return frames, fps
-
     def infer(
         self,
         video: str,
@@ -116,7 +68,7 @@ class DepthCrafterDemo:
     ):
         set_seed(seed)
 
-        frames, target_fps =
+        frames, target_fps = read_video_frames(
             video,
             process_length,
             target_fps,
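With this refactor, read_video_frames and the dataset_res_dict presets live only in depthcrafter/utils.py, so run.py and app.py share one implementation. As a hedged illustration of the dataset presets that moved along with the helper (the clip path below is hypothetical):

    from depthcrafter.utils import read_video_frames

    # With dataset != "open", the output resolution comes from dataset_res_dict rather than
    # the clip's native size, e.g. KITTI clips are resized to 384 x 1280.
    frames, fps = read_video_frames(
        "path/to/kitti_clip.mp4",  # hypothetical path, for illustration only
        process_length=-1,         # -1: keep all strided frames
        target_fps=-1,             # -1: keep the source frame rate
        max_res=1024,              # not used by the preset-dataset branch
        dataset="KITTI",
    )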