---
license: mit
---
|
|
|
|
|
|
|
Usage of InstantID-XS:
|
|
|
# 1. Download models
|
|
|
```bash
# InstantID-XS
huggingface-cli download --resume-download RED-AIGC/InstantID-XS --local-dir ./checkpoints/RED-AIGC/InstantID-XS

# VAE: madebyollin/sdxl-vae-fp16-fix
huggingface-cli download --resume-download madebyollin/sdxl-vae-fp16-fix --local-dir ./checkpoints/madebyollin/sdxl-vae-fp16-fix

# Base model: RealVisXL V4.0
huggingface-cli download --resume-download frankjoshua/realvisxlV40_v40Bakedvae --local-dir ./checkpoints/frankjoshua/realvisxlV40_v40Bakedvae
```
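After these commands finish, `./checkpoints` should contain one subfolder per repo, matching the paths the loading code in step 2 expects (the three `.bin` files are the ones loaded there):

```
checkpoints/
├── RED-AIGC/InstantID-XS/                  # image_proj.bin, controlnetxs.bin, cross_attn.bin
├── madebyollin/sdxl-vae-fp16-fix/
└── frankjoshua/realvisxlV40_v40Bakedvae/
```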
|
|
|
|
|
|
|
# 2. Build the pipeline
|
|
|
|
|
|
|
Note: In ControlNetXS, the ControlNet branch receives the same `encoder_hidden_states` input as the UNet by default, namely the prompt embeddings. We decouple the two inputs: the UNet keeps the prompt embeddings as its `encoder_hidden_states`, while the ControlNet branch receives the face embeddings instead.
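As a shape-level sketch of what each branch consumes: SDXL prompt embeddings and the insightface identity embedding have the shapes shown below, while the number of face tokens produced by `image_proj` is an illustrative assumption.

```python
import torch

batch = 1
# SDXL prompt embeddings: 77 tokens x 2048 dims (both text encoders concatenated)
prompt_embeds = torch.randn(batch, 77, 2048)
# antelopev2 identity embedding: a single 512-dim vector per face
face_emb = torch.randn(batch, 512)
# image_proj.bin maps the 512-dim embedding into a short sequence of
# cross-attention tokens (the token count here is an assumption)
face_tokens = torch.randn(batch, 4, 2048)

# UNet branch:       encoder_hidden_states = prompt_embeds  (text conditioning)
# ControlNet branch: encoder_hidden_states = face_tokens    (identity conditioning)
```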
|
|
|
```python
import os

import torch
from diffusers import AutoencoderKL, UNet2DConditionModel, UniPCMultistepScheduler

from controlnet_xs import ControlNetXSAdapter, UNetControlNetXSModel
# The pipeline class ships with this repo; the module path below is an
# assumption, adjust it to match where the pipeline file lives in your checkout.
from pipeline_stable_diffusion_xl_instantid_xs import StableDiffusionXLInstantIDXSPipeline

device = 'cuda'
weight_dtype = torch.float16

base_model = './checkpoints/frankjoshua/realvisxlV40_v40Bakedvae'
vae_path = './checkpoints/madebyollin/sdxl-vae-fp16-fix'
ckpt = './checkpoints/RED-AIGC/InstantID-XS'

image_proj_path = os.path.join(ckpt, "image_proj.bin")
cnxs_path = os.path.join(ckpt, "controlnetxs.bin")
cross_attn_path = os.path.join(ckpt, "cross_attn.bin")

# Get ControlNetXS:
unet = UNet2DConditionModel.from_pretrained(base_model, subfolder="unet").to(device, dtype=weight_dtype)
controlnet = ControlNetXSAdapter.from_unet(unet, size_ratio=0.125, learn_time_embedding=True)

# The checkpoint stores the control-branch weights with a 'ctrl_' prefix (from
# the combined UNetControlNetXSModel). Strip it to match the adapter's own key
# names (the 'ctrl_to_base' connections keep theirs), and rename 'up_blocks'
# to 'up_connections' to fit the adapter's layout.
state_dict = torch.load(cnxs_path, map_location="cpu", weights_only=True)
ctrl_state_dict = {}
for key, value in state_dict.items():
    if 'ctrl_' in key and 'ctrl_to_base' not in key:
        key = key.replace('ctrl_', '')
    if 'up_blocks' in key:
        key = key.replace('up_blocks', 'up_connections')
    ctrl_state_dict[key] = value
controlnet.load_state_dict(ctrl_state_dict, strict=True)
controlnet.to(device, dtype=weight_dtype)
ControlNetXS = UNetControlNetXSModel.from_unet(unet, controlnet).to(device, dtype=weight_dtype)

# Get pipeline
vae = AutoencoderKL.from_pretrained(vae_path)

pipe = StableDiffusionXLInstantIDXSPipeline.from_pretrained(
    base_model,
    vae=vae,
    unet=ControlNetXS,
    controlnet=None,
    torch_dtype=weight_dtype,
)

pipe.cuda(device=device, dtype=weight_dtype, use_xformers=True)
pipe.load_ip_adapter(image_proj_path, cross_attn_path)

pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
# Match learn_time_embedding=True used when the adapter was created above.
pipe.unet.config.ctrl_learn_time_embedding = True
pipe = pipe.to(device)
```
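`size_ratio=0.125` sizes the control branch at one eighth of the base UNet's channel widths, which is what keeps InstantID-XS lightweight. A quick sanity check on the resulting sizes (run after the snippet above, which defines `unet` and `controlnet`):

```python
# Compare parameter counts of the base UNet and the slim control branch.
unet_params = sum(p.numel() for p in unet.parameters()) / 1e6
ctrl_params = sum(p.numel() for p in controlnet.parameters()) / 1e6
print(f"base UNet: {unet_params:.0f}M params | control branch: {ctrl_params:.0f}M params")
```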
|
|
|
|
|
|
|
# 3. Inference
|
|
|
```python
import cv2
import numpy as np
import torch
from PIL import Image
from insightface.app import FaceAnalysis

# antelopev2 must be placed under ./models/antelopev2 (insightface resolves
# model packs relative to `root`).
app = FaceAnalysis(name='antelopev2', root='./', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
app.prepare(ctx_id=0, det_size=(640, 640))

img_path = './image.jpg'
image = cv2.imread(img_path)
image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
image = resize_img(image)  # utility function from this repo

# Detect faces and keep the one with the largest bounding box.
face_infos = app.get(cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR))
face_info = sorted(face_infos, key=lambda x: (x['bbox'][2] - x['bbox'][0]) * (x['bbox'][3] - x['bbox'][1]))[-1]
face_emb = torch.from_numpy(face_info.normed_embedding)  # 512-dim identity embedding
face_kps = draw_kps_pil(image, face_info['kps'])         # keypoint condition image (repo utility)

prompt = 'a woman, (looking at the viewer), portrait, daily wear, 8K texture, realistic, symmetrical hyperdetailed texture, masterpiece, enhanced details, (eye highlight:2), perfect composition, natural lighting, best quality, authentic, natural posture'
n_prompt = '(worst quality:2), (low quality:2), (normal quality:2), lowres, bad anatomy, bad hands, normal quality, long neck, hunchback, narrow shoulder, wall, (blurry), vague, indistinct, (shiny face:2), (buffing:2), (face highlight:2), pale skin'

seed = 0
image = pipe(
    prompt=prompt,
    negative_prompt=n_prompt,
    image=face_kps,
    face_emb=face_emb,
    num_images_per_prompt=1,
    num_inference_steps=20,
    generator=torch.Generator(device=device).manual_seed(seed),
    ip_adapter_scale=0.8,
    guidance_scale=4.0,
    controlnet_conditioning_scale=0.8,
).images[0]
```
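The returned `image` is a standard PIL image and can be saved directly:

```python
image.save('./result.jpg')
```

`resize_img` and `draw_kps_pil` used above are utility functions from this repository. For reference, a minimal stand-in for `resize_img` might look like the following (an assumption about its behavior, not the repo's exact implementation):

```python
from PIL import Image

def resize_img(image: Image.Image, max_side: int = 1280, min_side: int = 1024) -> Image.Image:
    # Scale so the short side reaches min_side (capped so the long side stays
    # within max_side), then snap both sides to multiples of 64 for SDXL.
    w, h = image.size
    ratio = min_side / min(w, h)
    if max(w, h) * ratio > max_side:
        ratio = max_side / max(w, h)
    w, h = round(w * ratio), round(h * ratio)
    return image.resize((w - w % 64, h - h % 64), Image.BILINEAR)
```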
|
|
|
|