Spaces: Running on Zero

sdsdsdadasd3 committed
Commit • dd60833
Parent(s): 7c2f6e2

[Release] v1.0.1
- improve the performance
- improve efficiency

Files changed:
- app.py +21 -12
- depthcrafter/utils.py +44 -0
- run.py +2 -50
app.py
CHANGED
@@ -17,11 +17,11 @@ from huggingface_hub import hf_hub_download
 from depthcrafter.utils import read_video_frames, vis_sequence_depth, save_video
 
 examples = [
-    ["examples/example_01.mp4",
-    ["examples/example_02.mp4",
-    ["examples/example_03.mp4",
-    ["examples/example_04.mp4",
-    ["examples/example_05.mp4",
+    ["examples/example_01.mp4", 5, 1.0, 1024, -1, -1],
+    ["examples/example_02.mp4", 5, 1.0, 1024, -1, -1],
+    ["examples/example_03.mp4", 5, 1.0, 1024, -1, -1],
+    ["examples/example_04.mp4", 5, 1.0, 1024, -1, -1],
+    ["examples/example_05.mp4", 5, 1.0, 1024, -1, -1],
 ]
 
 
@@ -39,18 +39,18 @@ pipe = DepthCrafterPipeline.from_pretrained(
 pipe.to("cuda")
 
 
-@spaces.GPU(duration=
+@spaces.GPU(duration=120)
 def infer_depth(
     video: str,
     num_denoising_steps: int,
     guidance_scale: float,
     max_res: int = 1024,
-    process_length: int =
+    process_length: int = -1,
     #
     save_folder: str = "./demo_output",
     window_size: int = 110,
     overlap: int = 25,
-    target_fps: int =
+    target_fps: int = -1,
     seed: int = 42,
     track_time: bool = True,
     save_npz: bool = False,
@@ -59,7 +59,6 @@ def infer_depth(
     pipe.enable_xformers_memory_efficient_attention()
 
     frames, target_fps = read_video_frames(video, process_length, target_fps, max_res)
-    print(f"==> video name: {video}, frames shape: {frames.shape}")
 
     # inference the depth map using the DepthCrafter pipeline
     with torch.inference_mode():
@@ -82,6 +81,7 @@ def infer_depth(
     vis = vis_sequence_depth(res)
     # save the depth map and visualization with the target FPS
     save_path = os.path.join(save_folder, os.path.splitext(os.path.basename(video))[0])
+    print(f"==> saving results to {save_path}")
     os.makedirs(os.path.dirname(save_path), exist_ok=True)
     if save_npz:
         np.savez_compressed(save_path + ".npz", depth=res)
@@ -155,14 +155,14 @@ def construct_demo():
                     label="num denoising steps",
                     minimum=1,
                     maximum=25,
-                    value=
+                    value=5,
                     step=1,
                 )
                 guidance_scale = gr.Slider(
                     label="cfg scale",
                     minimum=1.0,
                     maximum=1.2,
-                    value=1.
+                    value=1.0,
                     step=0.1,
                 )
                 max_res = gr.Slider(
@@ -174,11 +174,18 @@ def construct_demo():
                 )
                 process_length = gr.Slider(
                     label="process length",
-                    minimum
+                    minimum=-1,
                     maximum=280,
                     value=60,
                     step=1,
                 )
+                process_target_fps = gr.Slider(
+                    label="target FPS",
+                    minimum=-1,
+                    maximum=30,
+                    value=15,
+                    step=1,
+                )
                 generate_btn = gr.Button("Generate")
             with gr.Column(scale=2):
                 pass
@@ -191,6 +198,7 @@ def construct_demo():
             guidance_scale,
             max_res,
             process_length,
+            process_target_fps,
         ],
         outputs=[output_video_1, output_video_2],
         fn=infer_depth,
@@ -216,6 +224,7 @@ def construct_demo():
             guidance_scale,
             max_res,
             process_length,
+            process_target_fps,
        ],
        outputs=[output_video_1, output_video_2],
    )
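For orientation only (not part of the commit): a minimal, self-contained sketch of the pattern the app.py changes rely on, namely reserving a ZeroGPU slot with @spaces.GPU(duration=120) and feeding the new "target FPS" slider into the inference function. It assumes the huggingface `spaces` package and Gradio are available; the trimmed signature, placeholder body, and output labels are illustrative, not the actual DepthCrafter code.

    import gradio as gr
    import spaces  # available on Hugging Face ZeroGPU Spaces

    @spaces.GPU(duration=120)  # request a ZeroGPU slot for up to 120 s per call
    def infer_depth(video: str, process_length: int, target_fps: int):
        # placeholder body; the real app runs the DepthCrafter pipeline here
        return video, video

    with gr.Blocks() as demo:
        input_video = gr.Video(label="Input Video")
        process_length = gr.Slider(label="process length", minimum=-1, maximum=280, value=60, step=1)
        process_target_fps = gr.Slider(label="target FPS", minimum=-1, maximum=30, value=15, step=1)
        output_video_1 = gr.Video(label="output 1")
        output_video_2 = gr.Video(label="output 2")
        gr.Button("Generate").click(
            fn=infer_depth,
            inputs=[input_video, process_length, process_target_fps],
            outputs=[output_video_1, output_video_2],
        )

    demo.launch()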
depthcrafter/utils.py
CHANGED
@@ -5,6 +5,50 @@ import PIL.Image
 import matplotlib.cm as cm
 import mediapy
 import torch
+from decord import VideoReader, cpu
+
+dataset_res_dict = {
+    "sintel": [448, 1024],
+    "scannet": [640, 832],
+    "KITTI": [384, 1280],
+    "bonn": [512, 640],
+    "NYUv2": [448, 640],
+}
+
+
+def read_video_frames(video_path, process_length, target_fps, max_res, dataset="open"):
+    if dataset == "open":
+        print("==> processing video: ", video_path)
+        vid = VideoReader(video_path, ctx=cpu(0))
+        print("==> original video shape: ", (len(vid), *vid.get_batch([0]).shape[1:]))
+        original_height, original_width = vid.get_batch([0]).shape[1:3]
+        height = round(original_height / 64) * 64
+        width = round(original_width / 64) * 64
+        if max(height, width) > max_res:
+            scale = max_res / max(original_height, original_width)
+            height = round(original_height * scale / 64) * 64
+            width = round(original_width * scale / 64) * 64
+    else:
+        height = dataset_res_dict[dataset][0]
+        width = dataset_res_dict[dataset][1]
+
+    vid = VideoReader(video_path, ctx=cpu(0), width=width, height=height)
+
+    fps = vid.get_avg_fps() if target_fps == -1 else target_fps
+    stride = round(vid.get_avg_fps() / fps)
+    stride = max(stride, 1)
+    frames_idx = list(range(0, len(vid), stride))
+    print(
+        f"==> downsampled shape: {len(frames_idx), *vid.get_batch([0]).shape[1:]}, with stride: {stride}"
+    )
+    if process_length != -1 and process_length < len(frames_idx):
+        frames_idx = frames_idx[:process_length]
+    print(
+        f"==> final processing shape: {len(frames_idx), *vid.get_batch([0]).shape[1:]}"
+    )
+    frames = vid.get_batch(frames_idx).asnumpy().astype("float32") / 255.0
+
+    return frames, fps
 
 
 def save_video(
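For reference, a small usage sketch of the read_video_frames helper added above. It assumes decord is installed and the example clip shipped with the demo exists; the argument values simply mirror the demo defaults.

    from depthcrafter.utils import read_video_frames

    # target_fps=-1 keeps the source frame rate; process_length=-1 keeps every strided frame.
    frames, fps = read_video_frames(
        "examples/example_01.mp4",
        process_length=60,   # cap processing at 60 frames (the demo slider default)
        target_fps=15,       # subsample to roughly 15 fps via an integer frame stride
        max_res=1024,        # longest side scaled to <= 1024 and snapped to a multiple of 64
    )
    print(frames.shape, fps)  # float32 array in [0, 1], shape (num_frames, H, W, 3)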
run.py
CHANGED
@@ -3,21 +3,12 @@ import os
 import numpy as np
 import torch
 
-from decord import VideoReader, cpu
 from diffusers.training_utils import set_seed
 from fire import Fire
 
 from depthcrafter.depth_crafter_ppl import DepthCrafterPipeline
 from depthcrafter.unet import DiffusersUNetSpatioTemporalConditionModelDepthCrafter
-from depthcrafter.utils import vis_sequence_depth, save_video
-
-dataset_res_dict = {
-    "sintel": [448, 1024],
-    "scannet": [640, 832],
-    "KITTI": [384, 1280],
-    "bonn": [512, 640],
-    "NYUv2": [448, 640],
-}
+from depthcrafter.utils import vis_sequence_depth, save_video, read_video_frames
 
 
 class DepthCrafterDemo:
@@ -59,45 +50,6 @@ class DepthCrafterDemo:
             print("Xformers is not enabled")
             self.pipe.enable_attention_slicing()
 
-    @staticmethod
-    def read_video_frames(
-        video_path, process_length, target_fps, max_res, dataset="open"
-    ):
-        if dataset == "open":
-            print("==> processing video: ", video_path)
-            vid = VideoReader(video_path, ctx=cpu(0))
-            print(
-                "==> original video shape: ", (len(vid), *vid.get_batch([0]).shape[1:])
-            )
-            original_height, original_width = vid.get_batch([0]).shape[1:3]
-            height = round(original_height / 64) * 64
-            width = round(original_width / 64) * 64
-            if max(height, width) > max_res:
-                scale = max_res / max(original_height, original_width)
-                height = round(original_height * scale / 64) * 64
-                width = round(original_width * scale / 64) * 64
-        else:
-            height = dataset_res_dict[dataset][0]
-            width = dataset_res_dict[dataset][1]
-
-        vid = VideoReader(video_path, ctx=cpu(0), width=width, height=height)
-
-        fps = vid.get_avg_fps() if target_fps == -1 else target_fps
-        stride = round(vid.get_avg_fps() / fps)
-        stride = max(stride, 1)
-        frames_idx = list(range(0, len(vid), stride))
-        print(
-            f"==> downsampled shape: {len(frames_idx), *vid.get_batch([0]).shape[1:]}, with stride: {stride}"
-        )
-        if process_length != -1 and process_length < len(frames_idx):
-            frames_idx = frames_idx[:process_length]
-        print(
-            f"==> final processing shape: {len(frames_idx), *vid.get_batch([0]).shape[1:]}"
-        )
-        frames = vid.get_batch(frames_idx).asnumpy().astype("float32") / 255.0
-
-        return frames, fps
-
     def infer(
         self,
         video: str,
@@ -116,7 +68,7 @@ class DepthCrafterDemo:
     ):
         set_seed(seed)
 
-        frames, target_fps =
+        frames, target_fps = read_video_frames(
             video,
             process_length,
             target_fps,
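With this refactor, read_video_frames and the dataset_res_dict presets live only in depthcrafter/utils.py, so run.py and app.py share one implementation. As a hedged illustration of the dataset presets that moved along with the helper (the clip path below is hypothetical):

    from depthcrafter.utils import read_video_frames

    # With dataset != "open", the output resolution comes from dataset_res_dict rather than
    # the clip's native size, e.g. KITTI clips are resized to 384 x 1280.
    frames, fps = read_video_frames(
        "path/to/kitti_clip.mp4",  # hypothetical path, for illustration only
        process_length=-1,         # -1: keep all strided frames
        target_fps=-1,             # -1: keep the source frame rate
        max_res=1024,              # not used by the preset-dataset branch
        dataset="KITTI",
    )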