import logging
import os
from typing import Any

import numpy as np
import torch
import torchvision
from diffusers import AutoencoderKL, DDPMScheduler, DDIMScheduler
from diffusers.utils.import_utils import is_xformers_available
from einops import rearrange
from huggingface_hub import hf_hub_download
from omegaconf import OmegaConf
from transformers import CLIPTextModel, CLIPTokenizer

from animatediff.models.unet import UNet3DConditionModel
from animatediff.pipelines.pipeline_animation import AnimationPipeline
from animatediff.utils.util import save_videos_grid
from animatediff.utils.util import load_weights


class EndpointHandler():
    """Request handler that builds an ``AnimationPipeline`` once and serves it.

    ``__init__`` loads the tokenizer, text encoder, VAE, 3D UNet and motion
    module and moves the assembled pipeline to CUDA; ``__call__`` runs one
    generation and returns the frames as ``uint8`` arrays.
    """

    # Hub repository holding the inference config, UNet files and motion module.
    _REPO_ID = "bluestarburst/AnimateDiff-SceneFusion"
    # Number of images per row when a frame's batch is tiled into one grid.
    _GRID_COLUMNS = 6

    def __init__(self, model_path: str = "bluestarburst/AnimateDiff-SceneFusion"):
        """Load every model component and assemble the pipeline on CUDA.

        Args:
            model_path: Directory or Hub repo id handed to ``from_pretrained``
                for the tokenizer, text encoder and VAE sub-folders.
        """
        inference_config = OmegaConf.load(
            self._resolve_file("configs/inference/inference-v3.yaml")
        )

        tokenizer = CLIPTokenizer.from_pretrained(
            model_path, subfolder="models/StableDiffusion/tokenizer"
        )
        text_encoder = CLIPTextModel.from_pretrained(
            model_path, subfolder="models/StableDiffusion/text_encoder"
        )
        vae = AutoencoderKL.from_pretrained(
            model_path, subfolder="models/StableDiffusion/vae"
        )

        # The UNet loader is given a folder, so the config and the weights must
        # sit side by side. Use the local folder when it is complete; otherwise
        # download both files and use the folder the Hub client placed them in.
        unet_model_path = "models/StableDiffusion/unet"
        unet_files = [
            f"{unet_model_path}/{name}"
            for name in ("config.json", "diffusion_pytorch_model.bin")
        ]
        if not all(os.path.isfile(path) for path in unet_files):
            downloaded = [
                hf_hub_download(repo_id=self._REPO_ID, filename=path)
                for path in unet_files
            ]
            unet_model_path = os.path.dirname(downloaded[-1])

        unet = UNet3DConditionModel.from_pretrained_2d(
            pretrained_model_path=unet_model_path,
            unet_additional_kwargs=OmegaConf.to_container(
                inference_config.unet_additional_kwargs
            ),
        )

        if is_xformers_available():
            unet.enable_xformers_memory_efficient_attention()
        else:
            # Previously ``assert False``: no message, and stripped under -O.
            logging.getLogger(__name__).warning(
                "xformers is not available; using the default attention "
                "implementation."
            )

        scheduler = DDIMScheduler(
            **OmegaConf.to_container(
                inference_config.noise_scheduler_kwargs.DDIMScheduler
            )
        )
        pipeline = AnimationPipeline(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=unet,
            scheduler=scheduler,
        ).to("cuda")

        # The original code read "models/MotionModule/..." but downloaded
        # "models/Motion_Module/..."; accept either local spelling, then fall
        # back to the path returned by the download.
        motion_module = self._resolve_file(
            "models/Motion_Module/mm_sd_v15.ckpt",
            local_paths=(
                "models/MotionModule/mm_sd_v15.ckpt",
                "models/Motion_Module/mm_sd_v15.ckpt",
            ),
        )

        self.pipeline = load_weights(
            pipeline,
            motion_module_path=motion_module,
            motion_module_lora_configs=[],
            dreambooth_model_path="",
            lora_model_path="",
            lora_alpha=0.8,
        ).to("cuda")

    @classmethod
    def _resolve_file(cls, filename, local_paths=None):
        """Return a readable path for ``filename``, downloading it if needed.

        Args:
            filename: Path of the file inside the Hub repository.
            local_paths: Candidate local paths checked in order; defaults to
                ``filename`` itself (relative to the working directory).

        Returns:
            The first candidate that exists on disk, otherwise the path
            returned by ``hf_hub_download``.
        """
        candidates = (filename,) if local_paths is None else local_paths
        for candidate in candidates:
            if os.path.isfile(candidate):
                return candidate
        return hf_hub_download(repo_id=cls._REPO_ID, filename=filename)

    @staticmethod
    def _frame_to_uint8(frame: torch.Tensor, rescale: bool = False) -> np.ndarray:
        """Turn a 3-D float image tensor into a ``uint8`` array.

        The first axis is moved to the end (channel-first to channel-last for
        a ``make_grid`` result) and a trailing axis of size 1 is dropped.

        Args:
            frame: 3-D tensor; values are multiplied by 255 without clamping.
            rescale: When true, map values with ``(x + 1) / 2`` first.
        """
        # detach/cpu so ``.numpy()`` also works for CUDA or autograd tensors.
        image = frame.detach().cpu().permute(1, 2, 0).squeeze(-1)
        if rescale:
            image = (image + 1.0) / 2.0
        return (image * 255).numpy().astype(np.uint8)

    def __call__(self, prompt, negative_prompt, steps, guidance_scale,
                 *, width=256, height=256, video_length=5):
        """Run one generation; intended to be called once per request.

        Args:
            prompt: Forwarded to the pipeline as ``prompt``.
            negative_prompt: Forwarded as ``negative_prompt``.
            steps: Forwarded as ``num_inference_steps``.
            guidance_scale: Forwarded as ``guidance_scale``.
            width: Forwarded as ``width`` (default 256).
            height: Forwarded as ``height`` (default 256).
            video_length: Forwarded as ``video_length`` (default 5).

        Returns:
            A list with one ``uint8`` array per time step; each array is the
            batch for that step tiled into a single grid image.
        """
        vids = self.pipeline(
            prompt=prompt,
            negative_prompt=negative_prompt,
            num_inference_steps=steps,
            guidance_scale=guidance_scale,
            width=width,
            height=height,
            video_length=video_length,
        ).videos

        # Time axis first, so iterating yields one batch of images per step.
        frames = rearrange(vids, "b c t h w -> t b c h w")

        # No rescale is applied, as in the original.
        # NOTE(review): assumes the pipeline already returns values in [0, 1]; confirm.
        return [
            self._frame_to_uint8(
                torchvision.utils.make_grid(frame, nrow=self._GRID_COLUMNS)
            )
            for frame in frames
        ]