Spaces:

XiangpengYang
/

VideoGrain

Runtime error

File size: 11,343 Bytes

#!/usr/bin/env python

from __future__ import annotations

import os

import gradio as gr

from webui.merge_config_gradio import merge_config_then_run

import huggingface_hub
import shutil
import os
import torch

# Hugging Face access token taken from the environment; None when the
# variable is not set. It is not referenced again in this part of the file.
HF_TOKEN = os.environ.get('HF_TOKEN')

# Single shared pipeline object for the whole app. Its `run` method is the
# callback bound to the Generate button and to prompt submission further down.
pipe = merge_config_then_run()


# Markdown footer (star request, BibTeX citation, license and contact details).
# It is rendered with gr.Markdown(ARTICLE) at the bottom of the Blocks layout.
# Kept as a raw string so nothing inside it is treated as an escape sequence.
ARTICLE = r"""
If VideoGrain is helpful, please help to ⭐ the <a href='https://github.com/knightyxp/VideoGrain' target='_blank'>Github Repo</a>. Thanks! 
[![GitHub Stars](https://img.shields.io/github/stars/knightyxp/VideoGrain?style=social)](https://github.com/knightyxp/VideoGrain)
---
📝 **Citation**
If our work is useful for your research, please consider citing:
```bibtex
@article{yang2025videograin,
  title={VideoGrain: Modulating Space-Time Attention for Multi-grained Video Editing},
  author={Yang, Xiangpeng and Zhu, Linchao and Fan, Hehe and Yang, Yi},
  journal={ICLR},
  year={2025}
}
```
📋 **License**
This project is licensed under <a rel="license" href="https://github.com/knightyxp/VideoGrain?tab=License-1-ov-file#readme">ReLER-Lab License 1.0</a>. 
Redistribution and use for non-commercial purposes should follow this license.
📧 **Contact**
If you have any questions, please feel free to reach me out at <b>knightyxp@gmail.com</b>.
"""


def update_layout_visibility(selected_num):
    """Return one visibility update per layout video component.

    Args:
        selected_num: Number of editing areas chosen by the user; converted
            with ``int()``, so the string values of the radio ("2".."5")
            are accepted.

    Returns:
        A list with one ``gr.update(visible=...)`` entry for every component
        in the module-level ``layout_files`` list, in order. The first
        ``selected_num`` entries are visible, the remaining ones hidden.
    """
    shown_count = int(selected_num)
    updates = []
    for position, _component in enumerate(layout_files):
        updates.append(gr.update(visible=position < shown_count))
    return updates


# Build the demo page. Top to bottom: an HTML header, a usage hint, a row with
# an input column (source video, layout videos, prompt, model) and an output
# column (result, sampling, control and editing settings), then the footer.
# The Generate button and prompt submission are both wired to `pipe.run`.
with gr.Blocks(css='style.css') as demo:
    # Page header: title, author, project links and a one-line summary.
    gr.HTML(
    """
    <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
    <h1 style="font-weight: 900; font-size: 2rem; margin: 0rem">
        VideoGrain: Modulating Space-Time Attention for Multi-Grained Video Editing
    </h1>
    <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
            <a href="https://github.com/knightyxp">Xiangpeng Yang</a>
    </h2>
    <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
                  <span class="link-block">
                    [<a href="https://arxiv.org/abs/2502.17258" target="_blank"
                    class="external-link ">
                    <span class="icon">
                      <i class="ai ai-arxiv"></i>
                    </span>
                    <span>arXiv</span>
                  </a>]
                </span>
                  <!-- Github link -->
                  <span class="link-block">
                    [<a href="https://github.com/knightyxp/VideoGrain" target="_blank"
                    class="external-link ">
                    <span class="icon">
                      <i class="fab fa-github"></i>
                    </span>
                    <span>Code</span>
                  </a>]
                </span>
                <!-- Github link -->
                  <span class="link-block">
                    [<a href="https://knightyxp.github.io/VideoGrain_project_page" target="_blank"
                    class="external-link ">
                    <span class="icon">
                      <i class="fab fa-github"></i>
                    </span>
                    <span>Homepage</span>
                  </a>]
                </span>
                <!-- Github link -->
                <span class="link-block">
                  [<a href="https://www.youtube.com/watch?v=XEM4Pex7F9E" target="_blank"
                  class="external-link ">
                  <span class="icon">
                    <i class="fab fa-youtube"></i>
                  </span>
                  <span>Youtube Video</span>
                </a>]
              </span>
    </h2>
    <h2 style="font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
       📕 TL;DR: VideoGrain is a zero-shot method for class-level, instance-level, and part-level video editing
    </h2>
    <h2 style="font-weight: 450; font-size: 1rem;">
        Note that this page is a limited demo of VideoGrain. To run with more configurations, please check out our <a href="https://github.com/knightyxp/VideoGrain">github page</a>.
    </h2>
    </div>
    """)

    # Usage hint: link to the editing guidance and a "Duplicate Space" badge.
    gr.HTML("""
    <p>We provide an <a href="https://github.com/knightyxp/VideoGrain?tab=readme-ov-file#editing-guidance-for-your-video"> Editing Guidance </a> to help users to choose hyperparameters when editing in-the-wild video.</p>
    <p>To remove the limitations or avoid queue on your own hardware, you may <a href="https://huggingface.co/spaces/XiangpengYang/VideoGrain?duplicate=true" style="display: inline-block; vertical-align: middle;"><img style="margin-top: 0em; margin-bottom: 0em; display: inline-block;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a></p>
    """)

    with gr.Row():
        # ---- Left column: everything the user provides ----
        with gr.Column():
            with gr.Accordion('Input Video', open=True):
                # NOTE(review): `type` is not among the documented gr.Video
                # constructor arguments; confirm the pinned Gradio version
                # accepts it (here and on the layout videos below).
                user_input_video = gr.Video(
                    label='Input Source Video',
                    source='upload',
                    type='numpy',
                    format="mp4",
                    visible=True,
                ).style(height="auto")

                # Radio to choose how many layout videos to show
                num_layouts = gr.Radio(
                    choices=["2", "3", "4", "5"],
                    label="Select Number of Editing Areas",
                    value="2",  # default
                    info="Please select the number of editing areas"
                )

                # Create all five layout video components in a loop and keep
                # them in `layout_files`; update_layout_visibility reads this
                # module-level list.
                layout_files = []
                with gr.Row():
                    for i in range(5):
                        video = gr.Video(
                            label=f"Layout Video {i+1}",
                            type="numpy",
                            format="mp4",
                            visible=(i < 2)  # show the first two by default
                        )
                        layout_files.append(video)

                # When num_layouts changes, the callback updates the `visible`
                # property of every video component in layout_files.
                num_layouts.change(
                    fn=update_layout_visibility,
                    inputs=num_layouts,
                    outputs=layout_files
                )

                # Adjacent string literals are concatenated by the parser, so
                # the help text contains a single space between sentences and
                # no source-code indentation.
                prompt = gr.Textbox(
                    label='Prompt',
                    info=(
                        'Change the prompt, and extract each local prompt in the editing prompts. '
                        '(the local prompt order should be same as layout masks order.)'
                    ),
                )

                model_id = gr.Dropdown(
                    label='Model ID',
                    choices=[
                        'stable-diffusion-v1-5/stable-diffusion-v1-5',
                        # add shape editing ckpt here
                    ],
                    value='stable-diffusion-v1-5/stable-diffusion-v1-5')

        # ---- Right column: result and generation settings ----
        with gr.Column():
            result = gr.Video(label='Result')
            # result.style(height=512, width=512)
            with gr.Accordion('Temporal Crop offset and Sampling Stride', open=False):
                # NOTE(review): both sliders allow 0 (zero frames / zero
                # stride); confirm pipe.run handles those values.
                n_sample_frame = gr.Slider(label='Number of Frames',
                                           minimum=0,
                                           maximum=32,
                                           step=1,
                                           value=16)
                sampling_rate = gr.Slider(label='sampling_rate',
                                          minimum=0,
                                          maximum=20,
                                          step=1,
                                          value=1)
                start_sample_frame = gr.Number(label='Start frame in the video',
                                               value=0,
                                               precision=0)

            with gr.Row():
                control_list = ['dwpose', 'depth_zoe', 'depth_midas']
                control_type = gr.Dropdown(
                    choices=control_list,
                    label='Control type',
                    value='dwpose'
                )

                # Checkbox group for "dwpose" options; default: hand selected, face not selected.
                dwpose_options = gr.CheckboxGroup(
                    choices=["hand", "face"],
                    label="DW Pose Options",
                    value=["hand"],
                    visible=True  # Initially visible since default control_type is "dwpose"
                )

                # Update the visibility of the dwpose_options based on the selected control type
                control_type.change(
                    fn=lambda x: gr.update(visible=(x == "dwpose")),
                    inputs=control_type,
                    outputs=dwpose_options
                )

                controlnet_conditioning_scale = gr.Slider(label='ControlNet conditioning scale',
                                                          minimum=0.0,
                                                          maximum=1.0,
                                                          value=1.0,
                                                          step=0.1)

            with gr.Accordion('Editing config for VideoGrain', open=True):
                use_pnp = gr.Checkbox(
                    label="Use PnP",
                    value=False,
                    info="Check to enable PnP functionality."
                )

                pnp_inject_steps = gr.Slider(label='pnp inject steps',
                                             info='PnP inject steps for temporal consistency',
                                             minimum=0,
                                             maximum=10,
                                             step=1,
                                             value=0)

                flatten_res = gr.CheckboxGroup(
                    choices=["1", "2", "4", "8"],
                    label="Flatten Resolution",
                    value=["1"],
                    info="Select one or more flatten resolution factors. Mapping: 1 -> 64, 2 -> 32 (64/2), 4 -> 16 (64/4), 8 -> 8 (64/8)."
                )

            run_button = gr.Button('Generate')

    with gr.Row():
        # The example rows are loaded, but the gallery below is currently
        # disabled, so `examples` is not consumed anywhere in this block.
        from example import style_example
        examples = style_example

        # gr.Examples(examples=examples,
        #             inputs=[
        #                 model_id,
        #                 user_input_video,
        #                 layout_files,
        #                 prompt,
        #                 model_id,
        #                 control_type,
        #                 dwpose_options,
        #                 controlnet_conditioning_scale,
        #                 use_pnp,
        #                 pnp_inject_steps,
        #                 flatten_res,
        #             ],
        #             outputs=result,
        #             fn=pipe.run,
        #             cache_examples=True,
        #             # cache_examples=os.getenv('SYSTEM') == 'spaces'
        #             )

    # Footer with citation, license and contact information.
    gr.Markdown(ARTICLE)

    # Components whose values are passed positionally to pipe.run. The five
    # layout videos are unpacked individually, hidden ones included.
    # NOTE(review): this order has to match the signature of pipe.run, which
    # is defined outside this file; verify before reordering.
    inputs = [
        user_input_video,
        num_layouts,
        *layout_files,
        prompt,
        model_id,
        n_sample_frame,
        start_sample_frame,
        sampling_rate,
        control_type,
        dwpose_options,
        controlnet_conditioning_scale,
        use_pnp,
        pnp_inject_steps,
        flatten_res,
    ]

    # Pressing Enter in the prompt box and clicking Generate run the same job.
    prompt.submit(fn=pipe.run, inputs=inputs, outputs=result)
    run_button.click(fn=pipe.run, inputs=inputs, outputs=result)

# Turn on the request queue, then start the server and ask for a public
# share link. queue() hands back the same Blocks object, so calling the two
# methods separately on `demo` is the same as chaining them.
demo.queue()
demo.launch(share=True)