Commit acd6fd7
Parent(s): 85331ff
sdsdsdadasd3 committed

[Release] v1.0.1
- improve performance
- improve efficiency

Files changed:
- depthcrafter/utils.py +15 -67
- requirements.txt +3 -1
- run.py +99 -84
depthcrafter/utils.py
CHANGED
@@ -1,79 +1,27 @@
+from typing import Union, List
+import tempfile
 import numpy as np
-import cv2
+import PIL.Image
 import matplotlib.cm as cm
+import mediapy
 import torch
 
-dataset_res_dict = {
-    "sintel":[448, 1024],
-    "scannet":[640, 832],
-    "kitti":[384, 1280],
-    "bonn":[512, 640],
-    "nyu":[448, 640],
-}
-
-def read_video_frames(video_path, process_length, target_fps, max_res, dataset):
-    # a simple function to read video frames
-    cap = cv2.VideoCapture(video_path)
-    original_fps = cap.get(cv2.CAP_PROP_FPS)
-    original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-    original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-    # round the height and width to the nearest multiple of 64
-
-    if dataset=="open":
-        height = round(original_height / 64) * 64
-        width = round(original_width / 64) * 64
-    else:
-        height = dataset_res_dict[dataset][0]
-        width = dataset_res_dict[dataset][1]
-
-    # resize the video if the height or width is larger than max_res
-    if max(height, width) > max_res:
-        scale = max_res / max(original_height, original_width)
-        height = round(original_height * scale / 64) * 64
-        width = round(original_width * scale / 64) * 64
-
-    if target_fps < 0:
-        target_fps = original_fps
-
-    stride = max(round(original_fps / target_fps), 1)
-
-    frames = []
-    frame_count = 0
-    while cap.isOpened():
-        ret, frame = cap.read()
-        if not ret or (process_length > 0 and frame_count >= process_length):
-            break
-        if frame_count % stride == 0:
-            frame = cv2.resize(frame, (width, height))
-            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
-            frames.append(frame.astype("float32") / 255.0)
-        frame_count += 1
-    cap.release()
-
-    frames = np.array(frames)
-    return frames, target_fps
-
 
 def save_video(
-    video_frames,
-    output_video_path,
-    fps: int = 10,
+    video_frames: Union[List[np.ndarray], List[PIL.Image.Image]],
+    output_video_path: str = None,
+    fps: int = 10,
+    crf: int = 18,
 ) -> str:
-    height, width = video_frames[0].shape[:2]
-    is_color = video_frames[0].ndim == 3
-    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
-    video_writer = cv2.VideoWriter(
-        output_video_path, fourcc, fps, (width, height), isColor=is_color
-    )
-
-    for frame in video_frames:
-        frame = (frame * 255).astype(np.uint8)
-        if is_color:
-            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
-        video_writer.write(frame)
-
-    video_writer.release()
+    if output_video_path is None:
+        output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4").name
+
+    if isinstance(video_frames[0], np.ndarray):
+        video_frames = [(frame * 255).astype(np.uint8) for frame in video_frames]
+
+    elif isinstance(video_frames[0], PIL.Image.Image):
+        video_frames = [np.array(frame) for frame in video_frames]
+    mediapy.write_video(output_video_path, video_frames, fps=fps, crf=crf)
     return output_video_path
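For reference, a minimal usage sketch of the rewritten save_video (not part of the commit): it assumes the package is importable as depthcrafter.utils, uses random frames purely to illustrate the expected float-in-[0, 1] numpy input, and writes the file through mediapy (which calls ffmpeg).

import numpy as np
from depthcrafter.utils import save_video

# 24 dummy RGB frames in [0, 1]; real callers pass the visualized depth sequence
frames = [np.random.rand(256, 448, 3).astype("float32") for _ in range(24)]
out_path = save_video(frames, "demo.mp4", fps=10, crf=18)
print(out_path)  # "demo.mp4"; omitting the path writes to a temporary .mp4 instead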
|
requirements.txt
CHANGED
@@ -2,7 +2,9 @@ torch==2.0.1
 diffusers==0.29.1
 numpy==1.26.4
 matplotlib==3.8.4
-opencv-python==4.8.1.78
 transformers==4.41.2
 accelerate==0.30.1
 xformers==0.0.20
+mediapy==1.2.0
+fire==0.6.0
+decord==0.6.0
run.py
CHANGED
@@ -2,12 +2,22 @@ import gc
 import os
 import numpy as np
 import torch
-import argparse
+
+from decord import VideoReader, cpu
 from diffusers.training_utils import set_seed
+from fire import Fire
 
 from depthcrafter.depth_crafter_ppl import DepthCrafterPipeline
 from depthcrafter.unet import DiffusersUNetSpatioTemporalConditionModelDepthCrafter
-from depthcrafter.utils import vis_sequence_depth, save_video, read_video_frames
+from depthcrafter.utils import vis_sequence_depth, save_video
+
+dataset_res_dict = {
+    "sintel": [448, 1024],
+    "scannet": [640, 832],
+    "KITTI": [384, 1280],
+    "bonn": [512, 640],
+    "NYUv2": [448, 640],
+}
 
 
 class DepthCrafterDemo:

@@ -49,6 +59,45 @@
             print("Xformers is not enabled")
         self.pipe.enable_attention_slicing()
 
+    @staticmethod
+    def read_video_frames(
+        video_path, process_length, target_fps, max_res, dataset="open"
+    ):
+        if dataset == "open":
+            print("==> processing video: ", video_path)
+            vid = VideoReader(video_path, ctx=cpu(0))
+            print(
+                "==> original video shape: ", (len(vid), *vid.get_batch([0]).shape[1:])
+            )
+            original_height, original_width = vid.get_batch([0]).shape[1:3]
+            height = round(original_height / 64) * 64
+            width = round(original_width / 64) * 64
+            if max(height, width) > max_res:
+                scale = max_res / max(original_height, original_width)
+                height = round(original_height * scale / 64) * 64
+                width = round(original_width * scale / 64) * 64
+        else:
+            height = dataset_res_dict[dataset][0]
+            width = dataset_res_dict[dataset][1]
+
+        vid = VideoReader(video_path, ctx=cpu(0), width=width, height=height)
+
+        fps = vid.get_avg_fps() if target_fps == -1 else target_fps
+        stride = round(vid.get_avg_fps() / fps)
+        stride = max(stride, 1)
+        frames_idx = list(range(0, len(vid), stride))
+        print(
+            f"==> downsampled shape: {len(frames_idx), *vid.get_batch([0]).shape[1:]}, with stride: {stride}"
+        )
+        if process_length != -1 and process_length < len(frames_idx):
+            frames_idx = frames_idx[:process_length]
+        print(
+            f"==> final processing shape: {len(frames_idx), *vid.get_batch([0]).shape[1:]}"
+        )
+        frames = vid.get_batch(frames_idx).asnumpy().astype("float32") / 255.0
+
+        return frames, fps
+
     def infer(
         self,
         video: str,

@@ -67,11 +116,13 @@
     ):
         set_seed(seed)
 
-        frames, target_fps = read_video_frames(
-            video, process_length, target_fps, max_res, dataset
+        frames, target_fps = self.read_video_frames(
+            video,
+            process_length,
+            target_fps,
+            max_res,
+            dataset,
         )
-        print(f"==> video name: {video}, frames shape: {frames.shape}")
-
         # inference the depth map using the DepthCrafter pipeline
         with torch.inference_mode():
             res = self.pipe(

@@ -128,91 +179,55 @@
         return res_path[:2]
 
 
[old lines 131-148, covering the "if __name__ == '__main__':" guard, the argparse parser construction, and the first argument definitions, are not legible in this view]
-        default="tencent/DepthCrafter",
-        help="Path to the UNet model",
-    )
-    parser.add_argument(
-        "--pre-train-path",
-        type=str,
-        default="stabilityai/stable-video-diffusion-img2vid-xt",
-        help="Path to the pre-trained model",
-    )
-    parser.add_argument(
-        "--process-length", type=int, default=195, help="Number of frames to process"
-    )
-    parser.add_argument(
-        "--cpu-offload",
-        type=str,
-        default="model",
-        choices=["model", "sequential", None],
-        help="CPU offload option",
-    )
-    parser.add_argument(
-        "--target-fps", type=int, default=15, help="Target FPS for the output video"
-    )  # -1 for original fps
-    parser.add_argument("--seed", type=int, default=42, help="Random seed")
-    parser.add_argument(
-        "--num-inference-steps", type=int, default=25, help="Number of inference steps"
-    )
-    parser.add_argument(
-        "--guidance-scale", type=float, default=1.2, help="Guidance scale"
-    )
-    parser.add_argument("--window-size", type=int, default=110, help="Window size")
-    parser.add_argument("--overlap", type=int, default=25, help="Overlap size")
-    parser.add_argument("--max-res", type=int, default=1024, help="Maximum resolution")
-    parser.add_argument(
-        "--dataset",
-        type=str,
-        default="open",
-        choices=["open", "sintel", "scannet", "kitti", "bonn", 'nyu'],
-        help="Assigned resolution for specific dataset evaluation"
-    )
-    parser.add_argument("--save_npz", type=bool, default=True, help="Save npz file")
-    parser.add_argument("--track_time", type=bool, default=False, help="Track time")
-
-    args = parser.parse_args()
-
+def main(
+    video_path: str,
+    save_folder: str = "./demo_output",
+    unet_path: str = "tencent/DepthCrafter",
+    pre_train_path: str = "stabilityai/stable-video-diffusion-img2vid-xt",
+    process_length: int = -1,
+    cpu_offload: str = "model",
+    target_fps: int = -1,
+    seed: int = 42,
+    num_inference_steps: int = 5,
+    guidance_scale: float = 1.0,
+    window_size: int = 110,
+    overlap: int = 25,
+    max_res: int = 1024,
+    dataset: str = "open",
+    save_npz: bool = True,
+    track_time: bool = False,
+):
     depthcrafter_demo = DepthCrafterDemo(
-        unet_path=args.unet_path,
-        pre_train_path=args.pre_train_path,
-        cpu_offload=args.cpu_offload,
+        unet_path=unet_path,
+        pre_train_path=pre_train_path,
+        cpu_offload=cpu_offload,
     )
     # process the videos, the video paths are separated by comma
-    video_paths = args.video_path.split(",")
+    video_paths = video_path.split(",")
     for video in video_paths:
         depthcrafter_demo.infer(
             video,
-            args.num_inference_steps,
-            args.guidance_scale,
-            save_folder=args.save_folder,
-            window_size=args.window_size,
-            process_length=args.process_length,
-            overlap=args.overlap,
-            max_res=args.max_res,
-            dataset=args.dataset,
-            target_fps=args.target_fps,
-            seed=args.seed,
-            track_time=args.track_time,
-            save_npz=args.save_npz,
+            num_inference_steps,
+            guidance_scale,
+            save_folder=save_folder,
+            window_size=window_size,
+            process_length=process_length,
+            overlap=overlap,
+            max_res=max_res,
+            dataset=dataset,
+            target_fps=target_fps,
+            seed=seed,
+            track_time=track_time,
+            save_npz=save_npz,
         )
         # clear the cache for the next video
         gc.collect()
         torch.cuda.empty_cache()
+
+
+if __name__ == "__main__":
+    # running configs
+    # the most important arguments for memory saving are `cpu_offload`, `enable_xformers`, `max_res`, and `window_size`
+    # the most important arguments for trade-off between quality and speed are
+    # `num_inference_steps`, `guidance_scale`, and `max_res`
+    Fire(main)
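With Fire(main) as the entry point, every parameter of main becomes a command-line flag, e.g. python run.py --video-path demo.mp4 --save-folder ./demo_output (the video path here is a placeholder). A minimal programmatic sketch of the same call (not part of the commit), assuming run.py is importable and the pretrained weights can be fetched:

from run import main

main(
    video_path="path/to/your_video.mp4",  # placeholder; join multiple videos with commas
    save_folder="./demo_output",
    num_inference_steps=5,
    guidance_scale=1.0,
)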