# Captured from the Hugging Face Space "Make-A-Protagonist/Make-A-Protagonist-inference".
# Space status at capture time: Runtime error.
# File size: 11,786 bytes. Commit hashes listed on the page: 9ae2d76, 681faa4.

#!/usr/bin/env python

from __future__ import annotations

import os
import sys
import warnings

def _pip_install_editable(path: str) -> int:
    """Install the local package at *path* in editable mode; return pip's exit code.

    The install is best-effort: a non-zero exit code is reported on stderr but
    does not raise, so the rest of the script still runs.
    """
    # Function-scope import keeps this block self-contained.
    import subprocess

    # Use the interpreter that is running this script so the packages land in
    # the same environment, and pass an argument list so no shell is involved.
    interpreter = sys.executable or "python"
    returncode = subprocess.run(
        [interpreter, "-m", "pip", "install", "-e", path],
        check=False,
    ).returncode
    if returncode != 0:
        print(f"pip install -e {path} exited with status {returncode}", file=sys.stderr)
    return returncode


# These installs run before the `gradio` / `inference` imports that follow.
for _expert_path in (
    "Make-A-Protagonist/experts/GroundedSAM/segment_anything",
    "Make-A-Protagonist/experts/GroundedSAM/GroundingDINO",
):
    _pip_install_editable(_expert_path)

# Silence all Python warnings for the remainder of the process.
warnings.filterwarnings("ignore")

import gradio as gr

from inference import InferencePipeline


class InferenceUtil:
    """Helper that reads display metadata from a model's card for the UI."""

    def __init__(self, hf_token: str | None):
        # Token forwarded to InferencePipeline.get_model_card; may be None.
        self.hf_token = hf_token

    def load_model_info(self, model_id: str) -> tuple[str, str]:
        """Return ``(protagonist, training_prompt)`` for *model_id*.

        Both values are read from attributes of the model card's ``data``
        object. A missing or empty attribute yields ``''``. If fetching the
        card fails for any reason, ``('', '')`` is returned so the caller
        never sees an exception from the lookup.
        """
        # TODO: the model card lives in the README of the Hugging Face repo;
        # the fields expected there still need to be documented.
        try:
            card = InferencePipeline.get_model_card(model_id, self.hf_token)
        except Exception as exc:
            # Best-effort lookup: report the failure instead of hiding it,
            # then fall back to empty fields.
            print(f'Could not load model card for {model_id!r}: {exc!r}',
                  file=sys.stderr)
            return '', ''

        data = getattr(card, 'data', None)
        # `or ''` maps None (field present but unset) to an empty string so
        # the result matches the declared tuple[str, str].
        protagonist = getattr(data, 'protagonist', '') or ''
        training_prompt = getattr(data, 'training_prompt', '') or ''
        return protagonist, training_prompt

# Access token read from the environment; None when HF_TOKEN is unset.
HF_TOKEN = os.environ.get('HF_TOKEN')

# Module-level objects used by the UI callbacks defined further down.
pipe = InferencePipeline(HF_TOKEN)
app = InferenceUtil(HF_TOKEN)

# Gradio UI: page header, input controls, example presets and event wiring.
# NOTE(review): `gr.Box` and `Component.style()` exist in Gradio 3.x only —
# confirm the pinned Gradio version before upgrading.
with gr.Blocks(css='style.css') as demo:
    # Static page header: title, authors, affiliations and project links.
    gr.HTML(
    """
    <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
    <h1 style="font-weight: 900; font-size: 2rem; margin: 0rem">
            Make-A-Protagonist:
            <br>
    Generic Video Editing with An Ensemble of Experts
    </h1>
    <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
            <a href="https://yuyangzhao.com">Yuyang Zhao</a><sup>1</sup>
            <a href="https://xieenze.github.io/">Enze Xie</a><sup>2</sup> 
            <a href="https://scholar.google.com.sg/citations?user=2p7x6OUAAAAJ&hl=en">Lanqing Hong</a><sup>2</sup>
            <a href="https://scholar.google.com.sg/citations?user=XboZC1AAAAAJ&hl=en">Zhenguo Li</a><sup>2</sup>
            <a href="https://www.comp.nus.edu.sg/~leegh/">Gim Hee Lee</a><sup>1</sup>
    </h2>

    <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
            <sup>1 </sup>National University of Singapore
            <sup>2 </sup>Huawei Noah's Ark Lab
    </h2>

    <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
                  <span class="link-block">
                    [<a href="https://arxiv.org/abs/2305.08850" target="_blank"
                    class="external-link ">
                    <span class="icon">
                      <i class="ai ai-arxiv"></i>
                    </span>
                    <span>arXiv</span>
                  </a>]
                </span>

                  <!-- Github link -->
                  <span class="link-block">
                    [<a href="https://github.com/Make-A-Protagonist/Make-A-Protagonist" target="_blank"
                    class="external-link ">
                    <span class="icon">
                      <i class="fab fa-github"></i>
                    </span>
                    <span>Code</span>
                  </a>]
                </span>

                <!-- Github link -->
                  <span class="link-block">
                    [<a href="https://make-a-protagonist.github.io/" target="_blank"
                    class="external-link ">
                    <span class="icon">
                      <i class="fab fa-github"></i>
                    </span>
                    <span>Homepage</span>
                  </a>]
                </span>

    </h2>
    <h2 style="font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
        TL;DR: The first framework for generic video editing with both visual and textual clues.
    </h2>
    </div>
    """)

    with gr.Row():
        # Left column: every input control plus the "Generate" button.
        with gr.Column():
            with gr.Box():
                model_id = gr.Dropdown(
                    label='Model ID',
                    choices=[
                        'Make-A-Protagonist/ikun',
                        'Make-A-Protagonist/huaqiang',
                        'Make-A-Protagonist/yanzi',
                        'Make-A-Protagonist/car-turn',
                    ],
                    value='Make-A-Protagonist/ikun')

                with gr.Row():
                    # Despite its name, this read-only textbox shows the
                    # protagonist: it is labelled 'Protagonist' and receives
                    # the first value returned by app.load_model_info.
                    # The initial values presumably describe the default
                    # model selected above — verify against its model card.
                    base_model_used_for_training = gr.Textbox(
                        label='Protagonist', interactive=False, value='man')
                    prompt_used_for_training = gr.Textbox(
                        label='Training prompt', interactive=False, value='A man is playing basketball')
            with gr.Box():
                ref_image = gr.Image(label='Reference Image', type='pil', visible=True).style(height="auto")
                ref_pro_prompt = gr.Textbox(label='Reference Image Protagonist Prompt',
                    max_lines=1,
                    placeholder='Example: "man"')

            prompt = gr.Textbox(label='Prompt',
                                max_lines=1,
                                placeholder='Example: "A panda is surfing"')
            video_length = gr.Slider(label='Video length',
                                     minimum=4,
                                     maximum=8,
                                     step=1,
                                     value=8)
            fps = gr.Slider(label='FPS',
                            minimum=1,
                            maximum=8,
                            step=1,
                            value=4)
            seed = gr.Slider(label='Seed',
                             minimum=0,
                             maximum=100000,
                             step=1,
                             value=0)

            with gr.Accordion('ControlNet Parameters', open=True):
                control_pose = gr.Slider(label='Pose',
                                      minimum=0,
                                      maximum=1,
                                      step=0.1,
                                      value=.5)
                control_depth = gr.Slider(label='Depth',
                                      minimum=0,
                                      maximum=1,
                                      step=0.1,
                                      value=.5)

            with gr.Accordion('Editing Function', open=True):
                with gr.Row():
                    # Both sliders only take the values 0 and 1 (step=1).
                    source_pro = gr.Slider(label='Source Protagonist',
                                      minimum=0,
                                      maximum=1,
                                      step=1,
                                      value=0)
                    source_bg = gr.Slider(label='Source Background',
                                      minimum=0,
                                      maximum=1,
                                      step=1,
                                      value=0)

            with gr.Accordion('Other Parameters', open=False):
                num_steps = gr.Slider(label='Number of Steps',
                                      minimum=0,
                                      maximum=100,
                                      step=1,
                                      value=50)
                guidance_scale = gr.Slider(label='CFG Scale',
                                           minimum=0,
                                           maximum=50,
                                           step=0.1,
                                           value=12.5)

                noise_level = gr.Slider(label='Noise Level',
                                           minimum=0,
                                           maximum=999,
                                           step=1,
                                           value=0)


            run_button = gr.Button('Generate')

            gr.Markdown('''
            - It takes a few minutes to download model first.
            - It takes one minute to load model and conduct DDIM inverse
            ''')
        # Right column: the generated video.
        with gr.Column():
            result = gr.Video(label='Result')

    # Components whose values are handed to pipe.run, in this order. Defined
    # once and shared by the examples and both triggers so the orderings cannot
    # drift apart. The order presumably mirrors the parameters of pipe.run —
    # verify against InferencePipeline.run.
    inputs = [
        model_id,
        prompt,
        video_length,
        fps,
        seed,
        num_steps,
        guidance_scale,
        ref_image,
        ref_pro_prompt,
        noise_level,
        control_pose,
        control_depth,
        source_pro,
        source_bg,
    ]

    with gr.Row():
        # Each example row lists one value per component, in `inputs` order.
        examples = [
            [
                'Make-A-Protagonist/ikun',
                'A man is playing basketball on the beach, anime style.',
                8, 4, 33, 50, 12.5,  # video_length, fps, seed, num_steps, guidance_scale
                'data/ikun/reference_images/zhongli.jpg',
                'man',
                0, 0.5, 0.5,  # noise_level, control_pose, control_depth
                0, 0,  # source_pro, source_bg
            ],
            [
                'Make-A-Protagonist/huaqiang',
                'Elon Musk walking down the street.',
                8, 4, 33, 50, 12.5,  # video_length, fps, seed, num_steps, guidance_scale
                'data/huaqiang/reference_images/musk.jpg',
                'man',
                0, 0.5, 0.5,  # noise_level, control_pose, control_depth
                0, 1,  # source_pro, source_bg
            ],
            [
                'Make-A-Protagonist/yanzi',
                'A panda walking down the snowy street.',
                8, 4, 33, 50, 12.5,  # video_length, fps, seed, num_steps, guidance_scale
                'data/yanzi/reference_images/panda.jpeg',
                'panda',
                0, 0.5, 0.5,  # noise_level, control_pose, control_depth
                0, 0,  # source_pro, source_bg
            ],
            [
                'Make-A-Protagonist/car-turn',
                'A car moving in the desert.',
                8, 4, 33, 50, 12.5,  # video_length, fps, seed, num_steps, guidance_scale
                'data/car-turn/reference_images/audi.jpeg',
                'car',
                0, 0.0, 1.0,  # noise_level, control_pose, control_depth
                0, 0,  # source_pro, source_bg
            ],
            [
                'Make-A-Protagonist/car-turn',
                'A Suzuki Jimny driving down a mountain road in the rain.',
                8, 4, 33, 50, 12.5,  # video_length, fps, seed, num_steps, guidance_scale
                'data/car-turn/images/0000.jpg',
                'car',
                0, 0.0, 1.0,  # noise_level, control_pose, control_depth
                1, 0,  # source_pro, source_bg
            ],
        ]
        # Example caching is enabled only when the SYSTEM environment
        # variable equals 'spaces'.
        gr.Examples(examples=examples,
                    inputs=inputs,
                    outputs=result,
                    fn=pipe.run,
                    cache_examples=os.getenv('SYSTEM') == 'spaces')

    # Refresh the two read-only textboxes whenever another model is selected.
    model_id.change(fn=app.load_model_info,
                    inputs=model_id,
                    outputs=[
                        base_model_used_for_training,
                        prompt_used_for_training,
                    ])

    # Submitting the prompt textbox and clicking "Generate" run the same call.
    prompt.submit(fn=pipe.run, inputs=inputs, outputs=result)
    run_button.click(fn=pipe.run, inputs=inputs, outputs=result)

demo.queue().launch(share=True)