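"""Image-to-Video script for the img2img tab.

Diffusers backend only: temporarily swaps the loaded pipeline for an
image-to-video pipeline (PIA with a motion adapter, or Alibaba I2VGen-XL),
generates frames from the init image, and optionally assembles them into a
GIF, PNG, or MP4 file.
"""
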
import torch
import gradio as gr
import diffusers
from modules import scripts, processing, shared, images, sd_models, devices


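# model registry: 'name' appears in the dropdown, 'url' is the HuggingFace repo id,
# 'info' is the HTML shown under the dropdown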
MODELS = [
    { 'name': 'None', 'info': '' },
    # { 'name': 'PIA', 'url': 'openmmlab/PIA-condition-adapter', 'info': '<a href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/pia" target="_blank">Open MMLab Personalized Image Animator</a>' },
    { 'name': 'VGen', 'url': 'ali-vilab/i2vgen-xl', 'info': '<a href="https://huggingface.co/ali-vilab/i2vgen-xl" target="_blank">Alibaba VGen</a>' },
]


class Script(scripts.Script):
    def title(self):
        return 'Image-to-Video'

    def show(self, is_img2img):
        return is_img2img if shared.backend == shared.Backend.DIFFUSERS else False

    # return signature is array of gradio components
    def ui(self, _is_img2img):

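        # toggle visibility of the duration and per-format controls for the selected output type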
        def video_change(video_type):
            return [
                gr.update(visible=video_type != 'None'),
                gr.update(visible=video_type == 'GIF' or video_type == 'PNG'),
                gr.update(visible=video_type == 'MP4'),
                gr.update(visible=video_type == 'MP4'),
            ]

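        # update the info link and reveal the parameter accordion matching the selected model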
        def model_change(model_name):
            model = next(m for m in MODELS if m['name'] == model_name)
            return gr.update(value=model['info']), gr.update(visible=model_name == 'PIA'), gr.update(visible=model_name == 'VGen')

        with gr.Row():
            model_name = gr.Dropdown(label='Model', value='None', choices=[m['name'] for m in MODELS])
        with gr.Row():
            model_info = gr.HTML()
        with gr.Row():
            num_frames = gr.Slider(label='Frames', minimum=0, maximum=50, step=1, value=16)
        with gr.Row():
            video_type = gr.Dropdown(label='Video file', choices=['None', 'GIF', 'PNG', 'MP4'], value='None')
            duration = gr.Slider(label='Duration', minimum=0.25, maximum=10, step=0.25, value=2, visible=False)
        with gr.Accordion('FreeInit', open=False, visible=False) as fi_accordion:
            with gr.Row():
                fi_method = gr.Dropdown(label='Method', choices=['none', 'butterworth', 'ideal', 'gaussian'], value='none')
            with gr.Row():
                # fi_fast = gr.Checkbox(label='Fast sampling', value=False)
                fi_iters = gr.Slider(label='Iterations', minimum=1, maximum=10, step=1, value=3)
                fi_order = gr.Slider(label='Order', minimum=1, maximum=10, step=1, value=4)
            with gr.Row():
                fi_spatial = gr.Slider(label='Spatial frequency', minimum=0.0, maximum=1.0, step=0.05, value=0.25)
                fi_temporal = gr.Slider(label='Temporal frequency', minimum=0.0, maximum=1.0, step=0.05, value=0.25)
        with gr.Accordion('VGen params', open=True, visible=False) as vgen_accordion:
            with gr.Row():
                vg_chunks = gr.Slider(label='Decode chunks', minimum=0.1, maximum=1.0, step=0.1, value=0.5)
                vg_fps = gr.Slider(label='Change rate', minimum=0.1, maximum=1.0, step=0.1, value=0.5)
        with gr.Row():
            gif_loop = gr.Checkbox(label='Loop', value=True, visible=False)
            mp4_pad = gr.Slider(label='Pad frames', minimum=0, maximum=24, step=1, value=1, visible=False)
            mp4_interpolate = gr.Slider(label='Interpolate frames', minimum=0, maximum=24, step=1, value=0, visible=False)
        model_name.change(fn=model_change, inputs=[model_name], outputs=[model_info, fi_accordion, vgen_accordion])
        video_type.change(fn=video_change, inputs=[video_type], outputs=[duration, gif_loop, mp4_pad, mp4_interpolate])
        return [model_name, num_frames, video_type, duration, gif_loop, mp4_pad, mp4_interpolate, fi_method, fi_iters, fi_order, fi_spatial, fi_temporal, vg_chunks, vg_fps]

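    # values returned from ui() arrive here as positional arguments in the same order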
    def run(self, p: processing.StableDiffusionProcessing, model_name, num_frames, video_type, duration, gif_loop, mp4_pad, mp4_interpolate, fi_method, fi_iters, fi_order, fi_spatial, fi_temporal, vg_chunks, vg_fps): # pylint: disable=arguments-differ, unused-argument
        if model_name == 'None':
            return
        if p.init_images is None or len(p.init_images) == 0:
            return
        model = next(m for m in MODELS if m['name'] == model_name)
        repo_id = model['url']
        shared.log.debug(f'Image2Video: model={model_name} frames={num_frames} video={video_type} duration={duration} loop={gif_loop} pad={mp4_pad} interpolate={mp4_interpolate}')
        p.ops.append('image2video')
        p.do_not_save_grid = True
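        # remember the active pipeline so it can be restored after generation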
        orig_pipeline = shared.sd_model
        processed = None

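        # PIA: attaches the OpenMMLab motion adapter to the loaded SD15 pipeline and switches it
        # to PIAPipeline (unreachable from the UI while the 'PIA' entry in MODELS is commented out)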
        if model_name == 'PIA':
            if shared.sd_model_type != 'sd':
                shared.log.error('Image2Video PIA: base model must be SD15')
                return
            shared.log.info(f'Image2Video PIA load: model={repo_id}')
            motion_adapter = diffusers.MotionAdapter.from_pretrained(repo_id)
            sd_models.move_model(motion_adapter, devices.device)
            shared.sd_model = sd_models.switch_pipe(diffusers.PIAPipeline, shared.sd_model, { 'motion_adapter': motion_adapter })
            sd_models.move_model(shared.sd_model, devices.device, force=True) # move pipeline to device
            if num_frames > 0:
                p.task_args['num_frames'] = num_frames
                p.task_args['image'] = p.init_images[0]
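            # FreeInit noise re-initialization as exposed by diffusers' enable_free_init(); 'none' leaves it disabled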
            if hasattr(shared.sd_model, 'enable_free_init') and fi_method != 'none':
                shared.sd_model.enable_free_init(
                    num_iters=fi_iters,
                    use_fast_sampling=False,
                    method=fi_method,
                    order=fi_order,
                    spatial_stop_frequency=fi_spatial,
                    temporal_stop_frequency=fi_temporal,
                )
            shared.log.debug(f'Image2Video PIA: args={p.task_args}')
            processed = processing.process_images(p)
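            # drop the adapter reference so it can be garbage-collected with the PIA pipeline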
            shared.sd_model.motion_adapter = None

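        # VGen: I2VGen-XL is a self-contained pipeline, so it is loaded from its own repo
        # rather than derived from the currently loaded model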
        if model_name == 'VGen':
            if not isinstance(shared.sd_model, diffusers.I2VGenXLPipeline):
                shared.log.info(f'Image2Video VGen load: model={repo_id}')
                pipe = diffusers.I2VGenXLPipeline.from_pretrained(repo_id, torch_dtype=devices.dtype, cache_dir=shared.opts.diffusers_dir)
                sd_models.copy_diffuser_options(pipe, shared.sd_model)
                sd_models.set_diffuser_options(pipe)
                shared.sd_model = pipe
                sd_models.move_model(shared.sd_model, devices.device) # move pipeline to device
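                # cast to fp32 after loading; presumably a numerical-stability workaround for I2VGen-XL (assumption)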
                shared.sd_model.to(dtype=torch.float32)
            if num_frames > 0:
                p.task_args['image'] = p.init_images[0]
                p.task_args['num_frames'] = num_frames
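                # both sliders are fractions of the frame count: target fps and decode chunk size scale with num_frames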
                p.task_args['target_fps'] = max(1, int(num_frames * vg_fps))
                p.task_args['decode_chunk_size'] = max(1, int(num_frames * vg_chunks))
                p.task_args['output_type'] = 'pil'
            shared.log.debug(f'Image2Video VGen: args={p.task_args}')
            processed = processing.process_images(p)

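        # restore the original pipeline; if a video format was selected, encode the generated frames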
        shared.sd_model = orig_pipeline
        if video_type != 'None' and processed is not None:
            images.save_video(p, filename=None, images=processed.images, video_type=video_type, duration=duration, loop=gif_loop, pad=mp4_pad, interpolate=mp4_interpolate)
        return processed