jhshao committed
Commit 78c99d1 (1 parent: 8df4a7a)

minor bug fixed

Files changed (2)
  1. .lh/app.py.json +18 -0
  2. app.py +1 -1
.lh/app.py.json ADDED
@@ -0,0 +1,18 @@
+ {
+ "sourceFile": "app.py",
+ "activeCommit": 0,
+ "commits": [
+ {
+ "activePatchIndex": 0,
+ "patches": [
+ {
+ "date": 1718785420663,
+ "content": "Index: \n===================================================================\n--- \n+++ \n"
+ }
+ ],
+ "date": 1718785420663,
+ "name": "Commit-0",
+ "content": "# MIT License\n\n# Copyright (c) 2024 Jiahao Shao\n\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nimport functools\nimport os\nimport zipfile\nimport tempfile\nfrom io import BytesIO\n\nimport spaces\nimport gradio as gr\nimport numpy as np\nimport torch as torch\nfrom PIL import Image\nfrom tqdm import tqdm\nimport mediapy as media\n\nfrom huggingface_hub import login\n\nfrom chronodepth_pipeline import ChronoDepthPipeline\nfrom gradio_patches.examples import Examples\n\ndefault_seed = 2024\n\ndefault_num_inference_steps = 5\ndefault_num_frames = 10\ndefault_window_size = 9\ndefault_video_processing_resolution = 768\ndefault_video_out_max_frames = 90\ndefault_decode_chunk_size = 10\n\ndef process_video(\n pipe,\n path_input,\n num_inference_steps=default_num_inference_steps,\n num_frames=default_num_frames,\n window_size=default_window_size,\n out_max_frames=default_video_out_max_frames,\n progress=gr.Progress(),\n):\n if path_input is None:\n raise gr.Error(\n \"Missing video in the first pane: upload a file or use one from the gallery below.\"\n )\n\n name_base, name_ext = os.path.splitext(os.path.basename(path_input))\n print(f\"Processing video {name_base}{name_ext}\")\n\n path_output_dir = tempfile.mkdtemp()\n path_out_vis = os.path.join(path_output_dir, f\"{name_base}_depth_colored.mp4\")\n path_out_16bit = os.path.join(path_output_dir, f\"{name_base}_depth_16bit.zip\")\n\n generator = torch.Generator(device=pipe.device).manual_seed(default_seed)\n\n import time\n start_time = time.time()\n zipf = None\n try:\n if window_size is None or window_size == num_frames:\n inpaint_inference = False\n else:\n inpaint_inference = True\n data_ls = []\n video_data = media.read_video(path_input)\n video_length = len(video_data)\n fps = video_data.metadata.fps\n\n duration_sec = video_length / fps\n\n out_duration_sec = out_max_frames / fps\n if duration_sec > out_duration_sec:\n gr.Warning(\n f\"Only the first ~{int(out_duration_sec)} seconds will be processed; \"\n f\"use alternative setups such as ChronoDepth on github for full processing\"\n )\n video_length = out_max_frames\n\n for i in tqdm(range(video_length-num_frames+1)):\n is_first_clip = i == 0\n is_last_clip = i == video_length - num_frames\n is_new_clip = (\n (inpaint_inference and i % window_size == 0)\n or (inpaint_inference == False and i % num_frames == 0)\n )\n if is_first_clip or is_last_clip or is_new_clip:\n data_ls.append(np.array(video_data[i: i+num_frames])) # [t, H, W, 3]\n\n zipf = 
zipfile.ZipFile(path_out_16bit, \"w\", zipfile.ZIP_DEFLATED)\n\n depth_colored_pred = []\n depth_pred = []\n # -------------------- Inference and saving --------------------\n with torch.no_grad():\n for iter, batch in enumerate(tqdm(data_ls)):\n rgb_int = batch\n input_images = [Image.fromarray(rgb_int[i]) for i in range(num_frames)]\n\n # Predict depth\n if iter == 0: # First clip\n pipe_out = pipe(\n input_images,\n num_frames=len(input_images),\n num_inference_steps=num_inference_steps,\n decode_chunk_size=default_decode_chunk_size,\n motion_bucket_id=127,\n fps=7,\n noise_aug_strength=0.0,\n generator=generator,\n )\n elif inpaint_inference and (iter == len(data_ls) - 1): # temporal inpaint inference for last clip\n last_window_size = window_size if video_length%window_size == 0 else video_length%window_size\n pipe_out = pipe(\n input_images,\n num_frames=num_frames,\n num_inference_steps=num_inference_steps,\n decode_chunk_size=default_decode_chunk_size,\n motion_bucket_id=127,\n fps=7,\n noise_aug_strength=0.0,\n generator=generator,\n depth_pred_last=depth_frames_pred_ts[last_window_size:],\n )\n elif inpaint_inference and iter > 0: # temporal inpaint inference\n pipe_out = pipe(\n input_images,\n num_frames=num_frames,\n num_inference_steps=num_inference_steps,\n decode_chunk_size=default_decode_chunk_size,\n motion_bucket_id=127,\n fps=7,\n noise_aug_strength=0.0,\n generator=generator,\n depth_pred_last=depth_frames_pred_ts[window_size:],\n )\n else: # separate inference\n pipe_out = pipe(\n input_images,\n num_frames=num_frames,\n num_inference_steps=num_inference_steps,\n decode_chunk_size=default_decode_chunk_size,\n motion_bucket_id=127,\n fps=7,\n noise_aug_strength=0.0,\n generator=generator,\n )\n\n depth_frames_pred = [pipe_out.depth_np[i] for i in range(num_frames)]\n\n depth_frames_colored_pred = []\n for i in range(num_frames):\n depth_frame_colored_pred = np.array(pipe_out.depth_colored[i])\n depth_frames_colored_pred.append(depth_frame_colored_pred)\n depth_frames_colored_pred = np.stack(depth_frames_colored_pred, axis=0)\n\n depth_frames_pred = np.stack(depth_frames_pred, axis=0)\n depth_frames_pred_ts = torch.from_numpy(depth_frames_pred).to(pipe.device)\n depth_frames_pred_ts = depth_frames_pred_ts * 2 - 1\n\n if inpaint_inference == False:\n if iter == len(data_ls) - 1:\n last_window_size = num_frames if video_length%num_frames == 0 else video_length%num_frames\n depth_colored_pred.append(depth_frames_colored_pred[-last_window_size:])\n depth_pred.append(depth_frames_pred[-last_window_size:])\n else:\n depth_colored_pred.append(depth_frames_colored_pred)\n depth_pred.append(depth_frames_pred)\n else:\n if iter == 0:\n depth_colored_pred.append(depth_frames_colored_pred)\n depth_pred.append(depth_frames_pred)\n elif iter == len(data_ls) - 1:\n depth_colored_pred.append(depth_frames_colored_pred[-last_window_size:])\n depth_pred.append(depth_frames_pred[-last_window_size:])\n else:\n depth_colored_pred.append(depth_frames_colored_pred[-window_size:])\n depth_pred.append(depth_frames_pred[-window_size:])\n\n depth_colored_pred = np.concatenate(depth_colored_pred, axis=0)\n depth_pred = np.concatenate(depth_pred, axis=0)\n\n # -------------------- Save results --------------------\n # Save images\n for i in tqdm(range(len(depth_pred))):\n archive_path = os.path.join(\n f\"{name_base}_depth_16bit\", f\"{i:05d}.png\"\n )\n img_byte_arr = BytesIO()\n depth_16bit = Image.fromarray((depth_pred[i] * 65535.0).astype(np.uint16))\n depth_16bit.save(img_byte_arr, 
format=\"png\")\n img_byte_arr.seek(0)\n zipf.writestr(archive_path, img_byte_arr.read())\n\n # Export to video\n media.write_video(path_out_vis, depth_colored_pred, fps=fps)\n finally:\n if zipf is not None:\n zipf.close()\n\n end_time = time.time()\n print(f\"Processing time: {end_time - start_time} seconds\")\n return (\n path_out_vis,\n [path_out_vis, path_out_16bit],\n )\n\n\ndef run_demo_server(pipe):\n process_pipe_video = spaces.GPU(\n functools.partial(process_video, pipe), duration=220\n )\n os.environ[\"GRADIO_ALLOW_FLAGGING\"] = \"never\"\n\n with gr.Blocks(\n analytics_enabled=False,\n title=\"ChronoDepth Video Depth Estimation\",\n css=\"\"\"\n #download {\n height: 118px;\n }\n .slider .inner {\n width: 5px;\n background: #FFF;\n }\n .viewport {\n aspect-ratio: 4/3;\n }\n h1 {\n text-align: center;\n display: block;\n }\n h2 {\n text-align: center;\n display: block;\n }\n h3 {\n text-align: center;\n display: block;\n }\n \"\"\",\n ) as demo:\n gr.Markdown(\n \"\"\"\n # ChronoDepth Video Depth Estimation\n\n <p align=\"center\">\n <a title=\"Website\" href=\"https://jhaoshao.github.io/ChronoDepth/\" target=\"_blank\" rel=\"noopener noreferrer\" style=\"display: inline-block;\">\n <img src=\"https://img.shields.io/website?url=https%3A%2F%2Fjhaoshao.github.io%2FChronoDepth%2F&up_message=ChronoDepth&up_color=blue&style=flat&logo=timescale&logoColor=%23FFDC0F\">\n </a>\n <a title=\"arXiv\" href=\"https://arxiv.org/abs/2312.02145\" target=\"_blank\" rel=\"noopener noreferrer\" style=\"display: inline-block;\">\n <img src=\"https://img.shields.io/badge/arXiv-PDF-b31b1b\">\n </a>\n <a title=\"Github\" href=\"https://github.com/jhaoshao/ChronoDepth\" target=\"_blank\" rel=\"noopener noreferrer\" style=\"display: inline-block;\">\n <img src=\"https://img.shields.io/github/stars/jhaoshao/ChronoDepth?label=GitHub%20%E2%98%85&logo=github&color=C8C\" alt=\"badge-github-stars\">\n </a>\n </p>\n\n ChronoDepth is the state-of-the-art video depth estimator for videos in the wild. 
\n Upload your video and have a try!<br>\n We set denoising steps to 5, number of frames for each video clip to 10, and overlap between clips to 1.\n\n \"\"\"\n )\n\n with gr.Row():\n with gr.Column():\n video_input = gr.Video(\n label=\"Input Video\",\n sources=[\"upload\"],\n )\n with gr.Row():\n video_submit_btn = gr.Button(\n value=\"Compute Depth\", variant=\"primary\"\n )\n video_reset_btn = gr.Button(value=\"Reset\")\n with gr.Column():\n video_output_video = gr.Video(\n label=\"Output video depth (red-near, blue-far)\",\n interactive=False,\n )\n video_output_files = gr.Files(\n label=\"Depth outputs\",\n elem_id=\"download\",\n interactive=False,\n )\n Examples(\n fn=process_pipe_video,\n examples=[\n os.path.join(\"files\", name)\n for name in [\n \"sora_e2.mp4\",\n \"sora_1758192960116785459.mp4\",\n ]\n ],\n inputs=[video_input],\n outputs=[video_output_video, video_output_files],\n cache_examples=True,\n directory_name=\"examples_video\",\n )\n\n video_submit_btn.click(\n fn=process_pipe_video,\n inputs=[video_input],\n outputs=[video_output_video, video_output_files],\n concurrency_limit=1,\n )\n\n video_reset_btn.click(\n fn=lambda: (None, None, None),\n inputs=[],\n outputs=[video_input, video_output_video],\n concurrency_limit=1,\n )\n\n demo.queue(\n api_open=False,\n ).launch(\n server_name=\"0.0.0.0\",\n server_port=7860,\n )\n\n\ndef main():\n CHECKPOINT = \"jhshao/ChronoDepth\"\n\n if \"HF_TOKEN_LOGIN\" in os.environ:\n login(token=os.environ[\"HF_TOKEN_LOGIN\"])\n\n device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n print(f\"Running on device: {device}\")\n pipe = ChronoDepthPipeline.from_pretrained(CHECKPOINT)\n try:\n import xformers\n\n pipe.enable_xformers_memory_efficient_attention()\n except:\n pass # run without xformers\n\n pipe = pipe.to(device)\n run_demo_server(pipe)\n\n\nif __name__ == \"__main__\":\n main()\n"
+ }
+ ]
+ }
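
The added .lh/app.py.json appears to be an editor artifact rather than application code: its sourceFile / commits / patches layout matches the local-history records written by the VS Code Local History extension, with each commit holding a snapshot of app.py plus a list of patches. A minimal sketch of inspecting such a file, assuming only the keys visible in the diff above (the loop and print formatting are illustrative):

# Illustrative sketch: list the snapshots recorded in the local-history file.
# Key names ("sourceFile", "commits", "name", "date", "patches") are taken from
# the JSON added above; "date" is a millisecond Unix timestamp.
import json

with open(".lh/app.py.json") as f:
    history = json.load(f)

print("history of", history["sourceFile"])
for commit in history["commits"]:
    print(commit["name"], commit["date"], len(commit["patches"]), "patch(es)")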
app.py CHANGED
@@ -225,7 +225,7 @@ def process_video(
 
 def run_demo_server(pipe):
     process_pipe_video = spaces.GPU(
-        functools.partial(process_video, pipe), duration=210
+        functools.partial(process_video, pipe), duration=220
     )
     os.environ["GRADIO_ALLOW_FLAGGING"] = "never"
 
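
The one-line change in app.py bumps the duration argument of spaces.GPU from 210 to 220. On a ZeroGPU Space this value is the GPU time, in seconds, requested per call of the wrapped handler, so the bump presumably gives video processing a slightly larger budget before the allocation expires. A minimal sketch of the wrapping pattern used here, with the handler body elided (the return statement is illustrative; the real code wires the handler to Gradio events instead):

# Sketch of the pattern from app.py, not a full copy: bind the loaded pipeline
# with functools.partial, then request a 220-second GPU allocation per call.
import functools
import spaces

def process_video(pipe, path_input):
    ...  # GPU-heavy depth inference over the uploaded video

def run_demo_server(pipe):
    process_pipe_video = spaces.GPU(
        functools.partial(process_video, pipe), duration=220
    )
    return process_pipe_video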