# Hugging Face Inference Endpoints handler for AnimateDiff-SceneFusion (custom handler.py).
from diffusers import AutoencoderKL, DDPMScheduler, DDIMScheduler
from transformers import CLIPTextModel, CLIPTokenizer
from omegaconf import OmegaConf
from huggingface_hub import hf_hub_download, try_to_load_from_cache
import os
import json
import base64
from safetensors import safe_open
from diffusers.utils.import_utils import is_xformers_available
from typing import Any
import torch
import imageio
import torchvision
import numpy as np
from einops import rearrange
from animatediff.models.unet import UNet3DConditionModel
from animatediff.pipelines.pipeline_animation import AnimationPipeline
from animatediff.utils.util import save_videos_grid
from animatediff.utils.util import load_weights
from animatediff.utils.convert_from_ckpt import convert_ldm_unet_checkpoint, convert_ldm_clip_checkpoint, convert_ldm_vae_checkpoint
from animatediff.utils.convert_lora_safetensor_to_diffusers import convert_lora
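
# `current_model` selects which models/Motion_Module/<name>/ folder on the Hub the
# motion-module weights and DDIM-inverted latents are pulled from (see the
# hf_hub_download calls below).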
current_model = "backup"


class EndpointHandler():
    def __init__(self, model_path: str = "bluestarburst/AnimateDiff-SceneFusion"):
        # inference_config_path = "configs/inference/inference-v3.yaml"
        inference_config_path = hf_hub_download(repo_id="bluestarburst/AnimateDiff-SceneFusion", filename="configs/inference/inference-v3.yaml")
        print(inference_config_path)
        inference_config = OmegaConf.load(inference_config_path)
        # inference_config = {'unet_additional_kwargs': {'unet_use_cross_frame_attention': False, 'unet_use_temporal_attention': False, 'use_motion_module': True, 'motion_module_resolutions': [1, 2, 4, 8], 'motion_module_mid_block': False, 'motion_module_decoder_only': False, 'motion_module_type': 'Vanilla', 'motion_module_kwargs': {'num_attention_heads': 8, 'num_transformer_block': 1, 'attention_block_types': ['Temporal_Self', 'Temporal_Self'], 'temporal_position_encoding': True, 'temporal_position_encoding_max_len': 24, 'temporal_attention_dim_div': 1}}, 'noise_scheduler_kwargs': {'DDIMScheduler': {'num_train_timesteps': 1000, 'beta_start': 0.00085, 'beta_end': 0.012, 'beta_schedule': 'linear', 'steps_offset': 1, 'clip_sample': False}, 'EulerAncestralDiscreteScheduler': {'num_train_timesteps': 1000, 'beta_start': 0.00085, 'beta_end': 0.012, 'beta_schedule': 'linear'}, 'KDPM2AncestralDiscreteScheduler': {'num_train_timesteps': 1000, 'beta_start': 0.00085, 'beta_end': 0.012, 'beta_schedule': 'linear'}}}
        ### >>> create validation pipeline >>> ###
        tokenizer = CLIPTokenizer.from_pretrained(model_path, subfolder="models/StableDiffusion/tokenizer")
        text_encoder = CLIPTextModel.from_pretrained(model_path, subfolder="models/StableDiffusion/text_encoder")
        vae = AutoencoderKL.from_pretrained(model_path, subfolder="models/StableDiffusion/vae")

        unet_model_path = hf_hub_download(repo_id="bluestarburst/AnimateDiff-SceneFusion", filename="models/StableDiffusion/unet/diffusion_pytorch_model.bin")
        unet_config_path = hf_hub_download(repo_id="bluestarburst/AnimateDiff-SceneFusion", filename="models/StableDiffusion/unet/config.json")
        print(unet_model_path)
        unet = UNet3DConditionModel.from_pretrained_2d(
            pretrained_model_path=unet_model_path,
            unet_additional_kwargs=OmegaConf.to_container(inference_config.unet_additional_kwargs),
            config_path=unet_config_path,
        )
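
        # The DDIM-inverted latents downloaded below are reused as the initial noise for
        # every request (passed as `latents=` in the pipeline call inside __call__), so
        # sampling starts from the inverted video latents rather than random noise.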
        # inv_latent_path = f"{OUTPUT_DIR}/inv_latents/ddim_latent-1.pt"
        inv_latent_path = hf_hub_download(repo_id="bluestarburst/AnimateDiff-SceneFusion", filename=f"models/Motion_Module/{current_model}/inv_latents/ddim_latent-1.pt")
        self.latents = torch.load(inv_latent_path).to(torch.float)
        print(self.latents.shape, self.latents.dtype)

        # torch.backends.cuda.enable_mem_efficient_sdp(True)
        torch.backends.cuda.enable_flash_sdp(True)
        torch.backends.cuda.enable_math_sdp(True)

        if is_xformers_available():
            unet.enable_xformers_memory_efficient_attention()
        else:
            assert False, "xformers is required for memory-efficient attention"
        self.pipeline = AnimationPipeline(
            vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet,
            scheduler=DDIMScheduler(**OmegaConf.to_container(inference_config.noise_scheduler_kwargs.DDIMScheduler)),
        ).to("cuda")
        # Download the motion module from bluestarburst/AnimateDiff-SceneFusion (models/Motion_Module/<current_model>/mm.pth).
        # motion_module = hf_hub_download(repo_id="bluestarburst/AnimateDiff-SceneFusion", filename="models/Motion_Module/mm_sd_v15.ckpt")
        motion_module = hf_hub_download(repo_id="bluestarburst/AnimateDiff-SceneFusion", filename=f"models/Motion_Module/{current_model}/mm.pth")

        # LORA_DREAMBOOTH_PATH = "models/DreamBooth_LoRA/toonyou_beta3.safetensors"
        LORA_DREAMBOOTH_PATH = None
        LORA_DREAMBOOTH_PATH = hf_hub_download(repo_id="bluestarburst/AnimateDiff-SceneFusion", filename="models/DreamBooth_LoRA/toonyou_beta3.safetensors")

        # self.pipeline = load_weights(
        #     self.pipeline,
        #     # motion module
        #     motion_module_path = motion_module,
        #     motion_module_lora_configs = [],
        #     # image layers
        #     dreambooth_model_path = "",
        #     lora_model_path = "",
        #     lora_alpha = 0.8,
        # ).to("cuda")
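
        # Load only the motion-module (temporal) weights into the 3D UNet. strict=False is
        # needed because this checkpoint does not cover the image layers; any *unexpected*
        # keys would indicate a mismatched checkpoint, hence the assert below.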
        motion_module_state_dict = torch.load(motion_module, map_location="cpu")
        missing, unexpected = self.pipeline.unet.load_state_dict(motion_module_state_dict, strict=False)
        assert len(unexpected) == 0
        # FIX THIS
        if LORA_DREAMBOOTH_PATH:  # was `!= ""`; also guards against None
            if LORA_DREAMBOOTH_PATH.endswith(".ckpt"):
                state_dict = torch.load(LORA_DREAMBOOTH_PATH)
                self.pipeline.unet.load_state_dict(state_dict)
            elif LORA_DREAMBOOTH_PATH.endswith(".safetensors"):
                state_dict = {}
                with safe_open(LORA_DREAMBOOTH_PATH, framework="pt", device="cpu") as f:
                    for key in f.keys():
                        state_dict[key] = f.get_tensor(key)

                is_lora = all("lora" in k for k in state_dict.keys())
                if not is_lora:
                    base_state_dict = state_dict
                else:
                    base_state_dict = {}
                    # NOTE: the base-model checkpoint path is still missing here (part of the FIX THIS above).
                    with safe_open("", framework="pt", device="cpu") as f:
                        for key in f.keys():
                            base_state_dict[key] = f.get_tensor(key)

                # vae
                converted_vae_checkpoint = convert_ldm_vae_checkpoint(base_state_dict, self.pipeline.vae.config)
                self.pipeline.vae.load_state_dict(converted_vae_checkpoint)
                # unet
                converted_unet_checkpoint = convert_ldm_unet_checkpoint(base_state_dict, self.pipeline.unet.config)
                self.pipeline.unet.load_state_dict(converted_unet_checkpoint, strict=False)
                # text_model (TODO: problem here)
                # converted_text_encoder_checkpoint = convert_ldm_clip_checkpoint(base_state_dict)
                # pipeline.text_encoder = converted_text_encoder_checkpoint

                # import pdb
                # pdb.set_trace()

                if is_lora:
                    self.pipeline = convert_lora(self.pipeline, state_dict)
                    # self.pipeline = convert_lora(self.pipeline, state_dict, alpha=model_config.lora_alpha)

        self.pipeline.to("cuda")
    def __call__(self, data: Any):
        """
        The __call__ method is invoked once per request; this is where inference runs.
        """
        prompt = data.pop("prompt", "")
        negative_prompt = data.pop("negative_prompt", "")
        negative_prompt += ",easynegative,bad_construction,bad_structure,bad_wail,bad_windows,blurry,cloned_window,cropped,deformed,disfigured,error,extra_windows,extra_chimney,extra_door,extra_structure,extra_frame,fewer_digits,fused_structure,gross_proportions,jpeg_artifacts,long_roof,low_quality,structure_limbs,missing_windows,missing_doors,missing_roofs,mutated_structure,mutation,normal_quality,out_of_frame,owres,poorly_drawn_structure,poorly_drawn_house,signature,text,too_many_windows,ugly,username,uta,watermark,worst_quality"
        steps = data.pop("steps", 25)
        guidance_scale = data.pop("guidance_scale", 12.5)

        print(f"current seed: {torch.initial_seed()}")
        print(f"sampling {prompt} ...")
        vids = self.pipeline(
            prompt,
            negative_prompt=negative_prompt,
            num_inference_steps=steps,
            guidance_scale=guidance_scale,
            width=256,
            height=256,
            video_length=5,
            latents=self.latents,
        ).videos

        # vids = self.pipeline(
        #     prompt=prompt,
        #     negative_prompt=negative_prompt,
        #     num_inference_steps=steps,
        #     guidance_scale=guidance_scale,
        #     width=256,
        #     height=256,
        #     video_length=5,
        # ).videos
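
        # The pipeline returns videos shaped (batch, channels, time, height, width).
        # Rearrange to frame-major order, tile each frame's batch into a grid, and
        # convert to uint8 so imageio can write the frames out as an animated GIF.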
        videos = rearrange(vids, "b c t h w -> t b c h w")
        n_rows = 6
        fps = 1
        loop = True
        rescale = False
        outputs = []
        for x in videos:
            x = torchvision.utils.make_grid(x, nrow=n_rows)
            x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
            if rescale:
                x = (x + 1.0) / 2.0  # -1,1 -> 0,1
            x = (x * 255).numpy().astype(np.uint8)
            outputs.append(x)

        path = "output.gif"
        imageio.mimsave(path, outputs, fps=fps)
        # open the file as binary and read the data
        with open(path, mode="rb") as file:
            file_content = file.read()

        # return json response with binary data
        # Encode the binary data using Base64
        base64_encoded_content = base64.b64encode(file_content).decode("utf-8")

        # Create a JSON object with the Base64-encoded content
        json_data = {
            "filename": "output.gif",
            "content": base64_encoded_content
        }

        # Convert the JSON object to a JSON-formatted string
        return json.dumps(json_data)
# This is the entry point for the serverless function.
# This function will be called during inference time.
# new_handler = EndpointHandler()
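
# A minimal local smoke test (a sketch, not part of the endpoint contract): it assumes a
# CUDA GPU is available and that the repo weights referenced above can be fetched from the
# Hub. The prompt string is just an illustrative placeholder.
if __name__ == "__main__":
    handler = EndpointHandler()
    response = json.loads(handler({"prompt": "a cozy cottage at dusk", "steps": 25}))
    # Decode the base64 payload back into a GIF file on disk.
    with open(response["filename"], "wb") as f:
        f.write(base64.b64decode(response["content"]))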