Spaces:

multimodalart
/

Ip-Adapter-FaceID

Running on Zero

App Files Files Community

Ip-Adapter-FaceID / ip_adapter /ip_adapter_faceid.py

multimodalart HF staff

Upload 9 files

e60abae about 1 year ago

raw

history blame

6.86 kB

	import os
	from typing import List

	import torch
	from diffusers import StableDiffusionPipeline
	from diffusers.pipelines.controlnet import MultiControlNetModel
	from PIL import Image
	from safetensors import safe_open
	from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

	from .attention_processor_faceid import LoRAAttnProcessor, LoRAIPAttnProcessor


	class MLPProjModel(torch.nn.Module):
	    """Project an identity embedding into a short sequence of attention tokens.

	    A two-layer MLP (Linear -> GELU -> Linear) maps an embedding of size
	    ``id_embeddings_dim`` to ``num_tokens * cross_attention_dim`` features.
	    The result is reshaped into ``num_tokens`` tokens of width
	    ``cross_attention_dim`` and layer-normalised over the last dimension.

	    Args:
	        cross_attention_dim: Width of each output token.
	        id_embeddings_dim: Size of the incoming embedding (last dimension).
	        num_tokens: Number of tokens produced per input embedding.
	    """

	    def __init__(self, cross_attention_dim=768, id_embeddings_dim=512, num_tokens=4):
	        super().__init__()

	        self.cross_attention_dim = cross_attention_dim
	        self.num_tokens = num_tokens

	        # The hidden layer is twice as wide as the input; the output layer
	        # emits all tokens at once as one flat vector that forward() splits.
	        self.proj = torch.nn.Sequential(
	            torch.nn.Linear(id_embeddings_dim, id_embeddings_dim * 2),
	            torch.nn.GELU(),
	            torch.nn.Linear(id_embeddings_dim * 2, cross_attention_dim * num_tokens),
	        )
	        self.norm = torch.nn.LayerNorm(cross_attention_dim)

	    def forward(self, id_embeds):
	        """Return tokens of shape ``(-1, num_tokens, cross_attention_dim)``.

	        Args:
	            id_embeds: Tensor whose last dimension is ``id_embeddings_dim``.
	        """
	        x = self.proj(id_embeds)
	        # Split the flat projection into individual tokens before normalising
	        # so LayerNorm acts on each token separately.
	        x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
	        x = self.norm(x)
	        return x


	class IPAdapterFaceID:
	    """Wrap a diffusion pipeline so generation is conditioned on face-ID embeddings.

	    On construction the pipeline is moved to ``device``, every UNet attention
	    processor is replaced with a LoRA variant, an ``MLPProjModel`` is built,
	    and weights for both are loaded from ``ip_ckpt``.

	    Args:
	        sd_pipe: Pipeline object exposing ``to``, ``unet``, ``encode_prompt``
	            and ``__call__`` (presumably a ``StableDiffusionPipeline`` --
	            confirm against callers).
	        ip_ckpt: Path to the adapter checkpoint. A ``.safetensors`` file is
	            read with ``safe_open``; anything else goes through ``torch.load``.
	        device: Device for the pipeline, the adapter modules and the RNG.
	        lora_rank: Rank passed to the LoRA attention processors.
	        num_tokens: Number of image tokens produced per face embedding.
	        torch_dtype: dtype for the adapter modules and the incoming
	            embeddings. Defaults to ``torch.float16``, the previously
	            hard-coded value.
	    """

	    def __init__(self, sd_pipe, ip_ckpt, device, lora_rank=128, num_tokens=4,
	                 torch_dtype=torch.float16):
	        self.device = device
	        self.ip_ckpt = ip_ckpt
	        self.lora_rank = lora_rank
	        self.num_tokens = num_tokens
	        self.torch_dtype = torch_dtype

	        self.pipe = sd_pipe.to(self.device)
	        self.set_ip_adapter()

	        # image proj model
	        self.image_proj_model = self.init_proj()

	        # Must run last: it fills both the projection model and the attention
	        # processors installed above.
	        self.load_ip_adapter()

	    def init_proj(self):
	        """Build the embedding-to-token projection sized for this UNet."""
	        image_proj_model = MLPProjModel(
	            cross_attention_dim=self.pipe.unet.config.cross_attention_dim,
	            id_embeddings_dim=512,
	            num_tokens=self.num_tokens,
	        ).to(self.device, dtype=self.torch_dtype)
	        return image_proj_model

	    def set_ip_adapter(self):
	        """Replace every UNet attention processor with its LoRA counterpart.

	        Processors whose name ends in ``attn1.processor`` get a
	        ``LoRAAttnProcessor`` with no cross-attention width; all others get a
	        ``LoRAIPAttnProcessor`` that also receives ``num_tokens``.

	        Raises:
	            ValueError: If a processor name is not under ``mid_block``,
	                ``up_blocks`` or ``down_blocks``, so no hidden size can be
	                derived for it.
	        """
	        unet = self.pipe.unet
	        block_out_channels = unet.config.block_out_channels
	        attn_procs = {}
	        for name in unet.attn_processors.keys():
	            cross_attention_dim = (
	                None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
	            )
	            if name.startswith("mid_block"):
	                hidden_size = block_out_channels[-1]
	            elif name.startswith("up_blocks"):
	                # Up blocks use the channel list in reverse order.
	                # Parse the whole dotted component so indices >= 10 also work.
	                block_id = int(name.split(".")[1])
	                hidden_size = list(reversed(block_out_channels))[block_id]
	            elif name.startswith("down_blocks"):
	                block_id = int(name.split(".")[1])
	                hidden_size = block_out_channels[block_id]
	            else:
	                # Previously this fell through and reused a stale (or
	                # undefined) hidden_size from an earlier iteration.
	                raise ValueError(f"Unexpected attention processor name: {name!r}")

	            if cross_attention_dim is None:
	                processor = LoRAAttnProcessor(
	                    hidden_size=hidden_size,
	                    cross_attention_dim=cross_attention_dim,
	                    rank=self.lora_rank,
	                )
	            else:
	                processor = LoRAIPAttnProcessor(
	                    hidden_size=hidden_size,
	                    cross_attention_dim=cross_attention_dim,
	                    scale=1.0,
	                    rank=self.lora_rank,
	                    num_tokens=self.num_tokens,
	                )
	            attn_procs[name] = processor.to(self.device, dtype=self.torch_dtype)
	        unet.set_attn_processor(attn_procs)

	    def load_ip_adapter(self):
	        """Load projection and attention-processor weights from ``self.ip_ckpt``.

	        The checkpoint must provide two groups: ``image_proj`` and
	        ``ip_adapter``. In a ``.safetensors`` file they are flat keys with the
	        prefixes ``image_proj.`` / ``ip_adapter.``; otherwise the loaded
	        object is indexed by those two names directly.
	        """
	        if os.path.splitext(self.ip_ckpt)[-1] == ".safetensors":
	            state_dict = {"image_proj": {}, "ip_adapter": {}}
	            with safe_open(self.ip_ckpt, framework="pt", device="cpu") as f:
	                for key in f.keys():
	                    for group in ("image_proj", "ip_adapter"):
	                        prefix = group + "."
	                        if key.startswith(prefix):
	                            # Slice off only the leading prefix; str.replace
	                            # would also strip later occurrences in the key.
	                            state_dict[group][key[len(prefix):]] = f.get_tensor(key)
	                            break
	        else:
	            # NOTE(security): torch.load unpickles the file, which can execute
	            # arbitrary code. Only load checkpoints from a trusted source.
	            state_dict = torch.load(self.ip_ckpt, map_location="cpu")
	        self.image_proj_model.load_state_dict(state_dict["image_proj"])
	        # Wrap the processors in a ModuleList so the checkpoint's keys line up
	        # with the processors' iteration order.
	        ip_layers = torch.nn.ModuleList(self.pipe.unet.attn_processors.values())
	        ip_layers.load_state_dict(state_dict["ip_adapter"])

	    @torch.inference_mode()
	    def get_image_embeds(self, faceid_embeds):
	        """Project face embeddings into conditional and unconditional tokens.

	        Args:
	            faceid_embeds: Tensor of face embeddings; it is moved to
	                ``self.device`` and cast to ``self.torch_dtype``.

	        Returns:
	            ``(image_prompt_embeds, uncond_image_prompt_embeds)``. The second
	            is the projection of an all-zero tensor shaped like the input.
	        """
	        faceid_embeds = faceid_embeds.to(self.device, dtype=self.torch_dtype)
	        image_prompt_embeds = self.image_proj_model(faceid_embeds)
	        uncond_image_prompt_embeds = self.image_proj_model(torch.zeros_like(faceid_embeds))
	        return image_prompt_embeds, uncond_image_prompt_embeds

	    def set_scale(self, scale):
	        """Set ``scale`` on every ``LoRAIPAttnProcessor`` installed in the UNet."""
	        for attn_processor in self.pipe.unet.attn_processors.values():
	            if isinstance(attn_processor, LoRAIPAttnProcessor):
	                attn_processor.scale = scale

	    def generate(
	        self,
	        faceid_embeds=None,
	        prompt=None,
	        negative_prompt=None,
	        scale=1.0,
	        num_samples=4,
	        seed=None,
	        guidance_scale=7.5,
	        num_inference_steps=30,
	        **kwargs,
	    ):
	        """Run the pipeline conditioned on text prompts and face embeddings.

	        Args:
	            faceid_embeds: Tensor of face embeddings, one row per prompt.
	                Required despite the ``None`` default.
	            prompt: A string, or a list of strings; a single string is
	                repeated once per embedding row. Defaults to a generic
	                quality prompt.
	            negative_prompt: Same convention as ``prompt``.
	            scale: Value assigned to each ``LoRAIPAttnProcessor.scale``.
	            num_samples: Images generated per prompt.
	            seed: Optional seed for a ``torch.Generator`` on ``self.device``.
	            guidance_scale: Forwarded to the pipeline.
	            num_inference_steps: Forwarded to the pipeline.
	            **kwargs: Extra keyword arguments forwarded to the pipeline call.

	        Returns:
	            The ``images`` attribute of the pipeline output.

	        Raises:
	            ValueError: If ``faceid_embeds`` is ``None``.
	        """
	        # Fail early with a clear message, before mutating processor scales.
	        if faceid_embeds is None:
	            raise ValueError("faceid_embeds is required")

	        self.set_scale(scale)

	        num_prompts = faceid_embeds.size(0)

	        if prompt is None:
	            prompt = "best quality, high quality"
	        if negative_prompt is None:
	            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"

	        if not isinstance(prompt, list):
	            prompt = [prompt] * num_prompts
	        if not isinstance(negative_prompt, list):
	            negative_prompt = [negative_prompt] * num_prompts

	        image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(faceid_embeds)

	        # Duplicate each prompt's image tokens num_samples times: repeat along
	        # the token axis, then fold the copies into the batch axis.
	        bs_embed, seq_len, _ = image_prompt_embeds.shape
	        image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
	        image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
	        uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
	        uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(
	            bs_embed * num_samples, seq_len, -1
	        )

	        with torch.inference_mode():
	            prompt_embeds_, negative_prompt_embeds_ = self.pipe.encode_prompt(
	                prompt,
	                device=self.device,
	                num_images_per_prompt=num_samples,
	                do_classifier_free_guidance=True,
	                negative_prompt=negative_prompt,
	            )
	            # Image tokens are appended after the text tokens on the sequence axis.
	            prompt_embeds = torch.cat([prompt_embeds_, image_prompt_embeds], dim=1)
	            negative_prompt_embeds = torch.cat(
	                [negative_prompt_embeds_, uncond_image_prompt_embeds], dim=1
	            )

	        generator = torch.Generator(self.device).manual_seed(seed) if seed is not None else None
	        images = self.pipe(
	            prompt_embeds=prompt_embeds,
	            negative_prompt_embeds=negative_prompt_embeds,
	            guidance_scale=guidance_scale,
	            num_inference_steps=num_inference_steps,
	            generator=generator,
	            **kwargs,
	        ).images

	        return images