#!/usr/bin/env python
from __future__ import annotations
import os
import gradio as gr
from webui.merge_config_gradio import merge_config_then_run
import huggingface_hub
import shutil
import torch
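# Optional Hugging Face access token, read from the environment (may be None).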
HF_TOKEN = os.getenv('HF_TOKEN')
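# Build the editing pipeline once at startup; pipe.run serves every request below.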
pipe = merge_config_then_run()
ARTICLE = r"""
If VideoGrain is helpful, please help to ⭐ the <a href='https://github.com/knightyxp/VideoGrain' target='_blank'>Github Repo</a>. Thanks!
[![GitHub Stars](https://img.shields.io/github/stars/knightyxp/VideoGrain?style=social)](https://github.com/knightyxp/VideoGrain)
---
📝 **Citation**
If our work is useful for your research, please consider citing:
```bibtex
@article{yang2025videograin,
title={VideoGrain: Modulating Space-Time Attention for Multi-grained Video Editing},
author={Yang, Xiangpeng and Zhu, Linchao and Fan, Hehe and Yang, Yi},
journal={ICLR},
year={2025}
}
```
📋 **License**
This project is licensed under <a rel="license" href="https://github.com/knightyxp/VideoGrain?tab=License-1-ov-file#readme">ReLER-Lab License 1.0</a>.
Redistribution and use for non-commercial purposes should follow this license.
📧 **Contact**
If you have any questions, please feel free to reach out to me at <b>knightyxp@gmail.com</b>.
"""
def update_layout_visibility(selected_num):
    """Show the first `selected_num` layout-video inputs and hide the rest."""
    # `layout_files` is populated inside the gr.Blocks context below; this callback
    # only fires after the UI is built, so the forward reference is safe.
    num = int(selected_num)
    return [gr.update(visible=(i < num)) for i in range(len(layout_files))]
with gr.Blocks(css='style.css') as demo:
# gr.Markdown(TITLE)
gr.HTML(
"""
<div style="text-align: center; max-width: 1200px; margin: 20px auto;">
<h1 style="font-weight: 900; font-size: 2rem; margin: 0rem">
VideoGrain: Modulating Space-Time Attention for Multi-Grained Video Editing
</h1>
<h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
<a href="https://github.com/knightyxp">Xiangpeng Yang</a>
</h2>
<h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
<span class="link-block">
[<a href="https://arxiv.org/abs/2502.17258" target="_blank"
class="external-link ">
<span class="icon">
<i class="ai ai-arxiv"></i>
</span>
<span>arXiv</span>
</a>]
</span>
<!-- Github link -->
<span class="link-block">
[<a href="https://github.com/knightyxp/VideoGrain" target="_blank"
class="external-link ">
<span class="icon">
<i class="fab fa-github"></i>
</span>
<span>Code</span>
</a>]
</span>
<!-- Github link -->
<span class="link-block">
[<a href="https://knightyxp.github.io/VideoGrain_project_page" target="_blank"
class="external-link ">
<span class="icon">
<i class="fab fa-github"></i>
</span>
<span>Homepage</span>
</a>]
</span>
<!-- Github link -->
<span class="link-block">
[<a href="https://www.youtube.com/watch?v=XEM4Pex7F9E" target="_blank"
class="external-link ">
<span class="icon">
<i class="fab fa-youtube"></i>
</span>
<span>Youtube Video</span>
</a>]
</span>
</h2>
<h2 style="font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
📕 TL;DR: VideoGrain is a zero-shot method for class-level, instance-level, and part-level video editing
</h2>
<h2 style="font-weight: 450; font-size: 1rem;">
        Note that this page is a limited demo of VideoGrain. To run with more configurations, please check out our <a href="https://github.com/knightyxp/VideoGrain">GitHub page</a>.
</h2>
</div>
""")
gr.HTML("""
    <p>We provide an <a href="https://github.com/knightyxp/VideoGrain?tab=readme-ov-file#editing-guidance-for-your-video">Editing Guidance</a> to help users choose hyperparameters when editing in-the-wild videos.</p>
    <p>To lift the demo limits or avoid the queue by running on your own hardware, you may <a href="https://huggingface.co/spaces/XiangpengYang/VideoGrain?duplicate=true" style="display: inline-block; vertical-align: middle;"><img style="margin-top: 0em; margin-bottom: 0em; display: inline-block;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a></p>
""")
with gr.Row():
with gr.Column():
with gr.Accordion('Input Video', open=True):
# user_input_video = gr.File(label='Input Source Video')
user_input_video = gr.Video(label='Input Source Video', source='upload', type='numpy', format="mp4", visible=True).style(height="auto")
# Radio to choose how many layout videos to show
num_layouts = gr.Radio(
choices=["2", "3", "4", "5"],
label="Select Number of Editing Areas",
value="2", # default
info="Please select the number of editing areas"
)
                # Create the five layout-video components in a loop and collect them in layout_files
layout_files = []
with gr.Row():
for i in range(5):
video = gr.Video(
label=f"Layout Video {i+1}",
type="numpy",
format="mp4",
                        visible=(i < 2)  # show the first two by default
)
layout_files.append(video)
                # When num_layouts changes, update the visibility of each video component in layout_files
num_layouts.change(
fn=update_layout_visibility,
inputs=num_layouts,
outputs=layout_files
)
                prompt = gr.Textbox(label='Prompt',
                                    info='Edit the prompt and give one local prompt per editing area '
                                         '(the local prompt order must match the layout mask order).',
                                    )
model_id = gr.Dropdown(
label='Model ID',
choices=[
'stable-diffusion-v1-5/stable-diffusion-v1-5',
# add shape editing ckpt here
],
value='stable-diffusion-v1-5/stable-diffusion-v1-5')
with gr.Column():
result = gr.Video(label='Result')
# result.style(height=512, width=512)
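            # Temporal sampling: number of frames to edit, the starting frame, and the sampling stride.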
            with gr.Accordion('Temporal Crop Offset and Sampling Stride', open=False):
n_sample_frame = gr.Slider(label='Number of Frames',
minimum=0,
maximum=32,
step=1,
value=16)
                sampling_rate = gr.Slider(label='Sampling rate',
minimum=0,
maximum=20,
step=1,
value=1)
start_sample_frame = gr.Number(label='Start frame in the video',
value=0,
precision=0)
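            # ControlNet conditioning: pick the control signal; DW Pose exposes extra hand/face toggles.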
with gr.Row():
control_list = ['dwpose', 'depth_zoe', 'depth_midas']
control_type = gr.Dropdown(
choices=control_list,
label='Control type',
value='dwpose'
)
# Checkbox group for "dwpose" options; default: hand selected, face not selected.
dwpose_options = gr.CheckboxGroup(
choices=["hand", "face"],
label="DW Pose Options",
value=["hand"],
visible=True # Initially visible since default control_type is "dwpose"
)
# Update the visibility of the dwpose_options based on the selected control type
control_type.change(
fn=lambda x: gr.update(visible=(x == "dwpose")),
inputs=control_type,
outputs=dwpose_options
)
controlnet_conditioning_scale = gr.Slider(label='ControlNet conditioning scale',
minimum=0.0,
maximum=1.0,
value=1.0,
step=0.1)
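            # VideoGrain editing options: PnP feature injection and flatten resolution factors.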
            with gr.Accordion('Editing config for VideoGrain', open=True):
use_pnp = gr.Checkbox(
label="Use PnP",
value=False,
info="Check to enable PnP functionality."
)
                pnp_inject_steps = gr.Slider(label='PnP inject steps',
info='PnP inject steps for temporal consistency',
minimum=0,
maximum=10,
step=1,
value=0)
flatten_res = gr.CheckboxGroup(
choices=["1", "2", "4", "8"],
label="Flatten Resolution",
value=["1"],
info="Select one or more flatten resolution factors. Mapping: 1 -> 64, 2 -> 32 (64/2), 4 -> 16 (64/4), 8 -> 8 (64/8)."
)
run_button = gr.Button('Generate')
with gr.Row():
from example import style_example
examples = style_example
        # gr.Examples is currently disabled; if re-enabled, its inputs must
        # mirror the live `inputs` list defined below.
        # gr.Examples(examples=examples,
        #             inputs=[user_input_video,
        #                     num_layouts,
        #                     *layout_files,
        #                     prompt,
        #                     model_id,
        #                     n_sample_frame,
        #                     start_sample_frame,
        #                     sampling_rate,
        #                     control_type,
        #                     dwpose_options,
        #                     controlnet_conditioning_scale,
        #                     use_pnp,
        #                     pnp_inject_steps,
        #                     flatten_res],
        #             outputs=result,
        #             fn=pipe.run,
        #             cache_examples=os.getenv('SYSTEM') == 'spaces')
gr.Markdown(ARTICLE)
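    # The order of this list must match the positional argument order expected by pipe.run.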
inputs = [user_input_video, num_layouts,
*layout_files,
prompt,
model_id,
n_sample_frame,
start_sample_frame,
sampling_rate,
control_type,
dwpose_options,
controlnet_conditioning_scale,
use_pnp,
pnp_inject_steps,
flatten_res,
]
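    # Submitting the prompt or clicking Generate both trigger the same pipeline run.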
prompt.submit(fn=pipe.run, inputs=inputs, outputs=result)
run_button.click(fn=pipe.run, inputs=inputs, outputs=result)
demo.queue().launch(share=True)
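
# A minimal headless sketch (assumes pipe.run accepts the same positional values
# Gradio passes from `inputs`; all paths and prompts below are hypothetical):
#
# result_video = pipe.run(
#     'data/source.mp4', '2',                                # source video, number of editing areas
#     'data/mask1.mp4', 'data/mask2.mp4', None, None, None,  # up to five layout videos
#     'a red car and a blue car driving down the road',      # editing prompt with local prompts
#     'stable-diffusion-v1-5/stable-diffusion-v1-5',         # model id
#     16, 0, 1,                                              # n_sample_frame, start frame, sampling rate
#     'dwpose', ['hand'], 1.0,                               # control type, DW Pose options, conditioning scale
#     False, 0, ['1'],                                       # use_pnp, pnp_inject_steps, flatten_res
# )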