Spaces:

chenyangqi
/

FateZero

Runtime error

File size: 11,643 Bytes

#!/usr/bin/env python

from __future__ import annotations

import os

import gradio as gr

from inference_fatezero import merge_config_then_run


# Presumably a Hugging Face access token; read from the environment and left
# as None when the variable is unset. Nothing in the visible part of this
# script consumes it.
HF_TOKEN = os.environ.get('HF_TOKEN')

# Inference entry point. Its ``run`` method is what the example gallery and
# the UI event handlers further down call.
pipe = merge_config_then_run()

with gr.Blocks(css='style.css') as demo:
    # Page banner rendered from raw HTML: paper title, author list, resource
    # links (arXiv, code, homepage, video) and a one-line summary.
    # The author list uses ", " consistently between every pair of names, and
    # each link block carries an HTML comment naming what it links to.
    gr.HTML(
    """
    <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
    <h1 style="font-weight: 900; font-size: 2rem; margin: 0rem">
        FateZero : Fusing Attentions for Zero-shot Text-based Video Editing
    </h1>
    <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
            <a href="https://chenyangqiqi.github.io/">Chenyang Qi</a>,
            <a href="https://vinthony.github.io/academic/">Xiaodong Cun</a>, <a href="https://yzhang2016.github.io/">Yong Zhang</a>,
            <a href="https://chenyanglei.github.io">Chenyang Lei</a>, <a href="https://xinntao.github.io/"> Xintao Wang </a>,
            <a href="https://scholar.google.com/citations?user=4oXBp9UAAAAJ&hl=zh-CN">Ying Shan</a>,
            <a href="http://cqf.io">Qifeng Chen</a>
    </h2>

    <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
                  <!-- arXiv link -->
                  <span class="link-block">
                    [<a href="https://arxiv.org/abs/2303.09535" target="_blank"
                    class="external-link ">
                    <span class="icon">
                      <i class="ai ai-arxiv"></i>
                    </span>
                    <span>arXiv</span>
                  </a>]
                </span>

                  <!-- Github link -->
                  <span class="link-block">
                    [<a href="https://github.com/ChenyangQiQi/FateZero" target="_blank"
                    class="external-link ">
                    <span class="icon">
                      <i class="fab fa-github"></i>
                    </span>
                    <span>Code</span>
                  </a>]
                </span>

                <!-- Homepage link -->
                  <span class="link-block">
                    [<a href="http://fate-zero-edit.github.io/" target="_blank"
                    class="external-link ">
                    <span class="icon">
                      <i class="fab fa-github"></i>
                    </span>
                    <span>Homepage</span>
                  </a>]
                </span>

                <!-- Video link -->
                <span class="link-block">
                  [<a href="https://hkustconnect-my.sharepoint.com/:v:/g/personal/cqiaa_connect_ust_hk/EXKDI_nahEhKtiYPvvyU9SkBDTG2W4G1AZ_vkC7ekh3ENw?e=ficp9t" target="_blank"
                  class="external-link ">
                  <span class="icon">
                    <i class="fab fa-youtube"></i>
                  </span>
                  <span>Video</span>
                </a>]
              </span>
    </h2>
    <h2 style="font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
        TL;DR: FateZero is the first zero-shot framework for text-driven video editing via pretrained diffusion models without training.
    </h2>
    </div>
    """)


    # Usage notes shown under the banner: a pointer to the editing guide, a
    # caveat about the hosted demo's limited resources, a "Duplicate Space"
    # badge, and a link to the GitHub code.
    gr.HTML(value="""
    <p>We provide an <a href="https://github.com/ChenyangQiQi/FateZero/blob/main/docs/EditingGuidance.md"> Editing Guidance </a> to help users to choose hyperparameters when editing in-the-wild video.
    <p>Note that due to the limits of memory and computing resources on hugging-face, the results here are only toy examples and take longer to edit.
    <p>You may duplicate the space and upgrade to GPU in settings for better performance and faster inference without waiting in the queue.
    <br/>
    <a href="https://huggingface.co/spaces/chenyangqi/FateZero?duplicate=true">
    <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
    <p>Alternatively, try our GitHub <a href=https://github.com/ChenyangQiQi/FateZero> code  </a> on your GPU.
    </p>""")

    # Main editor row. This first column gathers everything the user supplies:
    # the source video, temporal/spatial cropping, the model choice, the two
    # prompts and the "Generate" button.
    with gr.Row():
        with gr.Column():
            with gr.Accordion('Input Video', open=True):
                # NOTE(review): `type='numpy'` is forwarded as-is; confirm the
                # pinned gradio version's Video component actually honors it.
                user_input_video = gr.Video(
                    label='Input Source Video',
                    source='upload',
                    type='numpy',
                    format="mp4",
                    visible=True,
                ).style(height="auto")

                # NOTE(review): both sliders allow 0 (zero frames / zero
                # stride); confirm pipe.run tolerates those values.
                with gr.Accordion('Temporal Crop offset and Sampling Stride', open=False):
                    n_sample_frame = gr.Slider(
                        label='Number of Frames',
                        minimum=0, maximum=32, step=1, value=8)
                    stride = gr.Slider(
                        label='Temporal stride',
                        minimum=0, maximum=20, step=1, value=1)
                    start_sample_frame = gr.Number(
                        label='Start frame in the video', value=0, precision=0)

                with gr.Accordion('Spatial Crop offset', open=False):
                    # Four integer fields created in display order:
                    # left, right, top, bottom.
                    offset_list = [
                        gr.Number(label=crop_label, value=0, precision=0)
                        for crop_label in (
                            'Left crop',
                            'Right crop',
                            'Top crop',
                            'Bottom crop',
                        )
                    ]
                    left_crop, right_crop, top_crop, bottom_crop = offset_list

                # Frame-sampling and cropping controls, in the order they are
                # appended to the callback's input list later in the script.
                ImageSequenceDataset_list = [
                    start_sample_frame,
                    n_sample_frame,
                    stride,
                    *offset_list,
                ]

                model_id = gr.Dropdown(
                    label='Model ID',
                    value='CompVis/stable-diffusion-v1-4',
                    choices=[
                        'CompVis/stable-diffusion-v1-4',
                        # add shape editing ckpt here
                    ],
                )

            with gr.Accordion('Text Prompt', open=True):
                source_prompt = gr.Textbox(
                    label='Source Prompt',
                    info='A good prompt describes each frame and most objects in video. Especially, it has the object or attribute that we want to edit or preserve.',
                    max_lines=1,
                    placeholder='Example: "a silver jeep driving down a curvy road in the countryside"',
                    value='a silver jeep driving down a curvy road in the countryside',
                )
                target_prompt = gr.Textbox(
                    label='Target Prompt',
                    info='A reasonable composition of video may achieve better results(e.g., "sunflower" video with "Van Gogh" prompt is better than "sunflower" with "Monet")',
                    max_lines=1,
                    placeholder='Example: "watercolor painting of a silver jeep driving down a curvy road in the countryside"',
                    value='watercolor painting of a silver jeep driving down a curvy road in the countryside',
                )

            run_button = gr.Button('Generate')

        # Right-hand column: the edited video plus the attention-fusing and
        # DDIM controls whose values are handed to pipe.run.
        with gr.Column():
            result = gr.Video(label='Result')

            with gr.Accordion('FateZero Parameters for attention fusing', open=True):
                cross_replace_steps = gr.Slider(
                    label='Cross-att replace steps',
                    info='More steps, replace more cross attention to preserve semantic layout.',
                    minimum=0.0,
                    maximum=1.0,
                    step=0.1,
                    value=0.7,
                )

                self_replace_steps = gr.Slider(
                    label='Self-att replace steps',
                    info='More steps, replace more spatial-temporal self-attention to preserve geometry and motion.',
                    minimum=0.0,
                    maximum=1.0,
                    step=0.1,
                    value=0.7,
                )

                enhance_words = gr.Textbox(
                    label='Enhanced words',
                    info='Amplify the target-words cross attention',
                    max_lines=1,
                    placeholder='Example: "watercolor "',
                    value='watercolor',
                )

                enhance_words_value = gr.Slider(
                    label='Target cross-att amplification',
                    info='larger value, more elements of target words',
                    minimum=0.0,
                    maximum=20.0,
                    step=1,
                    value=10,
                )

            with gr.Accordion('DDIM Parameters', open=True):
                # NOTE(review): minimum=0 lets a user request zero steps;
                # confirm pipe.run tolerates that or raise the minimum.
                num_steps = gr.Slider(
                    label='Number of Steps',
                    # Help text grammar fixed: "may produce" (was "may produces").
                    info='larger value has better editing capacity, but takes more time and memory. (50 steps may produce memory errors)',
                    minimum=0,
                    maximum=30,
                    step=1,
                    value=10,
                )
                guidance_scale = gr.Slider(
                    label='CFG Scale',
                    minimum=0,
                    maximum=50,
                    step=0.1,
                    value=7.5,
                )
    # Components whose values are passed, in this order, to pipe.run.
    # The list is defined once and shared by the example gallery and both
    # event handlers so the three call sites cannot drift out of sync.
    # NOTE(review): user_input_video appears twice (2nd and 11th position),
    # presumably feeding two distinct pipe.run parameters; confirm against
    # inference_fatezero before "deduplicating" it.
    inputs = [
        model_id,
        user_input_video,
        source_prompt,
        target_prompt,
        cross_replace_steps,
        self_replace_steps,
        enhance_words,
        enhance_words_value,
        num_steps,
        guidance_scale,
        user_input_video,
        *ImageSequenceDataset_list,
    ]

    with gr.Row():
        from example import style_example
        examples = style_example

        # Example gallery with caching always enabled. An alternative left by
        # the original author was to cache only when the SYSTEM environment
        # variable equals 'spaces'.
        gr.Examples(examples=examples,
                    inputs=inputs,
                    outputs=result,
                    fn=pipe.run,
                    cache_examples=True,
                    )

    # Run the pipeline when the target prompt is submitted or when the
    # "Generate" button is clicked.
    target_prompt.submit(fn=pipe.run, inputs=inputs, outputs=result)
    run_button.click(fn=pipe.run, inputs=inputs, outputs=result)

# Turn on request queuing for the app, then start serving it.
queued_demo = demo.queue()
queued_demo.launch()