ai-tube-model-als-6

Running on A10G

File size: 5,688 Bytes

import spaces

import gradio as gr
# import gradio.helpers
import torch
import os
from glob import glob
from pathlib import Path
from typing import Optional

from PIL import Image
from diffusers.utils import load_image, export_to_video
from pipeline import StableVideoDiffusionPipeline

import random
from safetensors import safe_open
from lcm_scheduler import AnimateLCMSVDStochasticIterativeScheduler


def get_safetensors_files():
    models_dir = "./safetensors"
    safetensors_files = [
        f for f in os.listdir(models_dir) if f.endswith(".safetensors")
    ]
    return safetensors_files


def model_select(selected_file):
    print("load model weights", selected_file)
    pipe.unet.cpu()
    file_path = os.path.join("./safetensors", selected_file)
    state_dict = {}
    with safe_open(file_path, framework="pt", device="cpu") as f:
        for key in f.keys():
            state_dict[key] = f.get_tensor(key)
    missing, unexpected = pipe.unet.load_state_dict(state_dict, strict=True)
    pipe.unet.cuda()
    del state_dict
    return


noise_scheduler = AnimateLCMSVDStochasticIterativeScheduler(
    num_train_timesteps=40,
    sigma_min=0.002,
    sigma_max=700.0,
    sigma_data=1.0,
    s_noise=1.0,
    rho=7,
    clip_denoised=False,
)
pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    scheduler=noise_scheduler,
    torch_dtype=torch.float16,
    variant="fp16",
)
pipe.to("cuda")
pipe.enable_model_cpu_offload()  # for smaller cost
model_select("AnimateLCM-SVD-xt-1.1.safetensors")
# pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) # for faster inference


max_64_bit_int = 2**63 - 1

@spaces.GPU
def sample(
    image: Image,
    seed: Optional[int] = 42,
    randomize_seed: bool = False,
    motion_bucket_id: int = 80,
    fps_id: int = 8,
    max_guidance_scale: float = 1.2,
    min_guidance_scale: float = 1,
    width: int = 1024,
    height: int = 576,
    num_inference_steps: int = 4,
    decoding_t: int = 4,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
    output_folder: str = "outputs_gradio",
):
    if image.mode == "RGBA":
        image = image.convert("RGB")

    if randomize_seed:
        seed = random.randint(0, max_64_bit_int)
    generator = torch.manual_seed(seed)

    os.makedirs(output_folder, exist_ok=True)
    base_count = len(glob(os.path.join(output_folder, "*.mp4")))
    video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")

    with torch.autocast("cuda"):
        frames = pipe(
            image,
            decode_chunk_size=decoding_t,
            generator=generator,
            motion_bucket_id=motion_bucket_id,
            height=height,
            width=width,
            num_inference_steps=num_inference_steps,
            min_guidance_scale=min_guidance_scale,
            max_guidance_scale=max_guidance_scale,
        ).frames[0]
    export_to_video(frames, video_path, fps=fps_id)
    torch.manual_seed(seed)

    return video_path, seed


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            image = gr.Image(label="Upload your image", type="pil")
            generate_btn = gr.Button("Generate")
        video = gr.Video()

        seed = gr.Slider(
            label="Seed",
            value=42,
            randomize=False,
            minimum=0,
            maximum=max_64_bit_int,
            step=1,
        )
        randomize_seed = gr.Checkbox(label="Randomize seed", value=False)
        motion_bucket_id = gr.Slider(
            label="Motion bucket id",
            info="Controls how much motion to add/remove from the image",
            value=80,
            minimum=1,
            maximum=255,
        )
        fps_id = gr.Slider(
            label="Frames per second",
            info="The length of your video in seconds will be 25/fps",
            value=8,
            minimum=5,
            maximum=30,
        )
        # note: we want something that is close to 16:9 (1.7777)
        # 576 / 320 = 1.8
        #  448 / 256 = 1.75
        width = gr.Slider(
            label="Width of input image",
            info="It should be divisible by 64",
            value=576, # 256, 320, 384, 448
            minimum=256,
            maximum=2048,
            step=64,
        )
        height = gr.Slider(
            label="Height of input image",
            info="It should be divisible by 64",
            value=320, # 256, 320, 384, 448
            minimum=256,
            maximum=1152,
        )
        max_guidance_scale = gr.Slider(
            label="Max guidance scale",
            info="classifier-free guidance strength",
            value=1.2,
            minimum=1,
            maximum=2,
        )
        min_guidance_scale = gr.Slider(
            label="Min guidance scale",
            info="classifier-free guidance strength",
            value=1,
            minimum=1,
            maximum=1.5,
        )
        num_inference_steps = gr.Slider(
            label="Num inference steps",
            info="steps for inference",
            value=4,
            minimum=1,
            maximum=20,
            step=1,
        )

    generate_btn.click(
        fn=sample,
        inputs=[
            image,
            seed,
            randomize_seed,
            motion_bucket_id,
            fps_id,
            max_guidance_scale,
            min_guidance_scale,
            width,
            height,
            num_inference_steps,
        ],
        outputs=[video, seed],
        api_name="video",
    )

if __name__ == "__main__":
    demo.queue()
    demo.launch(show_error=True)