Spaces:
Running
on
Zero
Running
on
Zero
import os | |
import gradio as gr | |
from utils.gradio_utils import * | |
import argparse | |
# Cache location handed to the video helpers; starts empty and is rebound to
# the demo's own cache path once the Blocks app is created further down.
GRADIO_CACHE = ""

# Command-line interface: --public_access is a boolean flag that is later
# passed to launch(share=...) when the demo is not running on the PAIR Space.
parser = argparse.ArgumentParser()
parser.add_argument("--public_access", action="store_true")
args = parser.parse_args()

# One StreamingSVD instance, created at import time and passed to every
# helper call made by the request handlers.
streaming_svd = StreamingSVD(load_argv=False)

# True only when the SPACE_AUTHOR_NAME environment variable equals "PAIR".
on_huggingspace = os.getenv("SPACE_AUTHOR_NAME") == "PAIR"
# Each example row is ordered like the Gradio inputs it is bound to:
# [prompt, frame-count choice, seed, image path, video].
_EXAMPLE_NUM_FRAMES = "200 - frames (recommended)"
_EXAMPLE_SEED = 33
_EXAMPLE_IMAGE_DIR = "__assets__/gradio_cached_examples/"

# Text-to-video examples: a prompt and no image.
_EXAMPLE_PROMPTS = (
    "Experience the dance of jellyfish: float through mesmerizing swarms of jellyfish, pulsating with otherworldly grace and beauty.",
    "Dive into the depths of the ocean: explore vibrant coral reefs, mysterious underwater caves, and the mesmerizing creatures that call the sea home.",
    "A cute cat.",
)

# Image-to-video examples: an image file and an empty prompt.
_EXAMPLE_IMAGE_FILES = (
    "test1.jpg",
    "test2.jpg",
    "test3.png",
    "test4.png",
    "test5.jpg",
    "test6.png",
    "test7.jpg",
    "test8.jpg",
    "test9.jpg",
    "test10.jpg",
    "test11.jpg",
)

examples = [
    [prompt, _EXAMPLE_NUM_FRAMES, _EXAMPLE_SEED, None, None]
    for prompt in _EXAMPLE_PROMPTS
] + [
    ["", _EXAMPLE_NUM_FRAMES, _EXAMPLE_SEED, _EXAMPLE_IMAGE_DIR + file_name, None]
    for file_name in _EXAMPLE_IMAGE_FILES
]
# Frame count used when the dropdown delivers no selection.
_DEFAULT_NUM_FRAMES = 50
# Upper bound applied to every requested frame count.
# NOTE(review): the original carried a commented-out "and on_huggingspace"
# next to this cap, so it is deliberately applied everywhere for now.
_MAX_NUM_FRAMES = 200


def _parse_num_frames(num_frames):
    """Turn a frame-count dropdown value into a bounded integer.

    Args:
        num_frames: A choice such as "200 - frames (recommended)"; the text
            before the first space is read as the number. Any empty value
            (None, [], "") selects the default.

    Returns:
        The requested frame count, limited to ``_MAX_NUM_FRAMES``, or
        ``_DEFAULT_NUM_FRAMES`` when nothing was selected.

    Raises:
        ValueError: If the leading token of a non-empty choice is not an
            integer.
    """
    if not num_frames:
        return _DEFAULT_NUM_FRAMES
    requested = int(num_frames.split(" ")[0])
    return min(requested, _MAX_NUM_FRAMES)


def generate(prompt, num_frames, seed, image: np.ndarray):
    """Handle the "faster preview" button.

    Args:
        prompt: Text prompt; only used to create an image when ``image`` is
            None.
        num_frames: Frame-count dropdown value (see ``_parse_num_frames``).
        seed: Seed forwarded to every helper call.
        image: Input image, or None to generate one from ``prompt``.

    Returns:
        A tuple ``(image, video_file_stage_one, video_file_stage_two)``
        matching the three Gradio outputs bound to the preview button.
    """
    num_frames = _parse_num_frames(num_frames)

    # No image supplied: create the starting image from the text prompt.
    if image is None:
        image = text_to_image_gradio(
            prompt=prompt, streaming_svd=streaming_svd, seed=seed)

    video_file_stage_one = image_to_video_vfi_gradio(
        img=image,
        num_frames=num_frames,
        streaming_svd=streaming_svd,
        seed=seed,
        gradio_cache=GRADIO_CACHE,
    )

    expanded_size, orig_size, scaled_outpainted_image = retrieve_intermediate_data(
        video_file_stage_one)

    # The enhancement step works on the "__expanded__" counterpart of the
    # stage-one file and, in this preview path, is always given 24 frames.
    video_file_stage_two = enhance_video_vfi_gradio(
        img=scaled_outpainted_image,
        video=video_file_stage_one.replace("__cropped__", "__expanded__"),
        num_frames=24,
        streaming_svd=streaming_svd,
        seed=seed,
        expanded_size=expanded_size,
        orig_size=orig_size,
        gradio_cache=GRADIO_CACHE,
    )
    return image, video_file_stage_one, video_file_stage_two


def enhance(prompt, num_frames, seed, image: np.ndarray, video: str):
    """Handle the "full high-quality" button (also used for the examples).

    Args:
        prompt: Text prompt; only used to create an image when ``image`` is
            None.
        num_frames: Frame-count dropdown value (see ``_parse_num_frames``).
        seed: Seed forwarded to every helper call.
        image: Input image, or None to generate one from ``prompt``.
        video: Path of an existing preview video, or None when the user
            skipped the preview step.

    Returns:
        A tuple ``(image, video_file_stage_two)`` matching the two Gradio
        outputs bound to the full-quality button.
    """
    num_frames = _parse_num_frames(num_frames)

    # User directly applied Long Video Generation (without preview) with Flux.
    if image is None:
        image = text_to_image_gradio(
            prompt=prompt, streaming_svd=streaming_svd, seed=seed)

    # User directly applied Long Video Generation (without preview) with or
    # without Flux: build the base video first, with half the frames
    # (rounded up).
    if video is None:
        video = image_to_video_gradio(
            img=image,
            num_frames=(num_frames + 1) // 2,
            streaming_svd=streaming_svd,
            seed=seed,
            gradio_cache=GRADIO_CACHE,
        )

    expanded_size, orig_size, scaled_outpainted_image = retrieve_intermediate_data(video)

    # Here the video is a path and the image is a numpy array.
    video_file_stage_two = enhance_video_vfi_gradio(
        img=scaled_outpainted_image,
        video=video.replace("__cropped__", "__expanded__"),
        num_frames=num_frames,
        streaming_svd=streaming_svd,
        seed=seed,
        expanded_size=expanded_size,
        orig_size=orig_size,
        gradio_cache=GRADIO_CACHE,
    )
    return image, video_file_stage_two
# Build the Gradio UI: header, input column (frame count, prompt, image,
# preview video, seed), output column (high-quality video), event wiring,
# cached examples and the footer notices.
# NOTE(review): the copy under review had lost its indentation, so the
# Row/Column nesting below is a reconstruction - confirm against the
# deployed layout.
with gr.Blocks() as demo:
    # Rebind the module-level cache path; the handlers read GRADIO_CACHE when
    # they are called, so they pick up the demo's cache location.
    GRADIO_CACHE = demo.GRADIO_CACHE

    gr.HTML("""
        <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
        <h1 style="font-weight: 900; font-size: 3rem; margin: 0rem">
        <a href="https://github.com/Picsart-AI-Research/StreamingT2V" style="color:blue;">StreamingSVD</a>
        </h1>
        <h2 style="font-weight: 650; font-size: 2rem; margin: 0rem">
        A StreamingT2V method for high-quality long video generation
        </h2>
        <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
        Roberto Henschel<sup>1*</sup>, Levon Khachatryan<sup>1*</sup>, Daniil Hayrapetyan<sup>1*</sup>, Hayk Poghosyan<sup>1</sup>, Vahram Tadevosyan<sup>1</sup>, Zhangyang Wang<sup>1,2</sup>, Shant Navasardyan<sup>1</sup>, <a href="https://www.humphreyshi.com/" style="color:blue;">Humphrey Shi</a><sup>1,3</sup>
        </h2>
        <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
        <sup>1</sup>Picsart AI Research (PAIR), <sup>2</sup>UT Austin, <sup>3</sup>SHI Labs @ Georgia Tech, Oregon & UIUC
        </h2>
        <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
        *Equal Contribution
        </h2>
        <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
        [<a href="https://arxiv.org/abs/2403.14773" style="color:blue;">arXiv</a>]
        [<a href="https://github.com/Picsart-AI-Research/StreamingT2V" style="color:blue;">GitHub</a>]
        </h2>
        <h2 style="font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
        <b>StreamingSVD</b> is an advanced autoregressive technique for text-to-video and image-to-video generation,
        generating long high-quality videos with rich motion dynamics, turning SVD into a long video generator.
        Our method ensures temporal consistency throughout the video, aligns closely to the input text/image,
        and maintains high frame-level image quality. Our demonstrations include successful examples of videos
        up to 200 frames, spanning 8 seconds, and can be extended for even longer durations.
        </h2>
        </div>
        """)

    # The "duplicate this Space" hint is only shown on the PAIR-hosted Space.
    if on_huggingspace:
        gr.HTML("""
            <p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings.
            <br/>
            <a href="https://huggingface.co/spaces/PAIR/StreamingT2V?duplicate=true">
            <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
            </p>""")

    with gr.Row():
        # Input side: frame count, prompt, image, preview video and seed.
        with gr.Column(scale=1):
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        num_frames = gr.Dropdown(["50 - frames (recommended)", "80 - frames (recommended)", "140 - frames (recommended)", "200 - frames (recommended)", "500 - frames", "1000 - frames", "10000 - frames"],
                                                 label="Number of Video Frames", info="For >200 frames use local workstation!", value="50 - frames (recommended)")
                    with gr.Row():
                        prompt_stage1 = gr.Textbox(label='Text-to-Video (Enter text prompt here)',
                                                   interactive=True, max_lines=1)
                    with gr.Row():
                        image_stage1 = gr.Image(label='Image-to-Video (Upload Image here, text prompt will be ignored for I2V if entered)',
                                                show_label=True, show_download_button=True, interactive=True, height=250)
                with gr.Column():
                    video_stage1 = gr.Video(label='Long Video Preview', show_label=True,
                                            interactive=False, show_download_button=True, height=203)
            with gr.Row():
                run_button_stage1 = gr.Button("Long Video Generation (faster preview)")
            with gr.Row():
                with gr.Column():
                    with gr.Accordion('Advanced options', open=False):
                        seed = gr.Slider(label='Seed', minimum=0,
                                         maximum=65536, value=33, step=1,)

        # Output side: the enhanced video and the button that produces it.
        with gr.Column(scale=3):
            with gr.Row():
                video_stage2 = gr.Video(label='High-Quality Long Video (Preview or Full)', show_label=True,
                                        interactive=False, show_download_button=True, height=700)
            with gr.Row():
                run_button_stage2 = gr.Button("Long Video Generation (full high-quality)")

    # Input lists follow the parameter order of generate() and enhance().
    inputs_t2v = [prompt_stage1, num_frames,
                  seed, image_stage1]
    inputs_v2v = [prompt_stage1, num_frames, seed,
                  image_stage1, video_stage1]

    run_button_stage1.click(fn=generate, inputs=inputs_t2v,
                            outputs=[image_stage1, video_stage1, video_stage2])
    run_button_stage2.click(fn=enhance, inputs=inputs_v2v,
                            outputs=[image_stage1, video_stage2])

    # Example rows are bound to the same inputs/outputs as the full-quality
    # button and their results are cached.
    gr.Examples(examples=examples,
                inputs=inputs_v2v,
                outputs=[image_stage1, video_stage2],
                fn=enhance,
                cache_examples=True,
                run_on_click=False,
                )

    gr.HTML("""
        <div style="text-align: justify; max-width: 1200px; margin: 20px auto;">
        <h3 style="font-weight: 450; font-size: 0.8rem; margin: 0rem">
        <b>Version: v1.0</b>
        </h3>
        <h3 style="font-weight: 450; font-size: 0.8rem; margin: 0rem">
        <b>Caution</b>:
        We would like to raise the awareness of users of this demo of its potential issues and concerns.
        Like previous large foundation models, StreamingSVD could be problematic in some cases, partially we use pretrained ModelScope, therefore StreamingSVD can Inherit Its Imperfections.
        So far, we keep all features available for research testing both to show the great potential of the StreamingSVD framework and to collect important feedback to improve the model in the future.
        We welcome researchers and users to report issues with the HuggingFace community discussion feature or email the authors.
        </h3>
        <h3 style="font-weight: 450; font-size: 0.8rem; margin: 0rem">
        <b>Biases and content acknowledgement</b>:
        Beware that StreamingSVD may output content that reinforces or exacerbates societal biases, as well as realistic faces, pornography, and violence.
        StreamingSVD in this demo is meant only for research purposes.
        </h3>
        </div>
        """)
# Choose queue/launch settings: the PAIR-hosted Space uses a bounded queue and
# debug mode, any other run closes the queue API and shares the app only when
# --public_access was given.
if on_huggingspace:
    queue_options = {"max_size": 20}
    launch_options = {"debug": True}
else:
    queue_options = {"api_open": False}
    launch_options = {"share": args.public_access}

# Blocks.queue() returns the Blocks instance itself, so configuring the queue
# and launching in two calls is equivalent to the chained form.
demo.queue(**queue_options)
demo.launch(**launch_options)