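"""Conditional image synthesis with a T2I-Adapter plus MasaCtrl.

Generates a (source, target) image pair in a single batch: the source comes
from a condition image/prompt, a real image (via DDIM inversion), or a saved
start code, while MasaCtrl's mutual self-attention keeps the target image
consistent with the source.
"""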
import os

import cv2
import torch
import torch.nn.functional as F
from basicsr.utils import tensor2img
from pytorch_lightning import seed_everything
from torch import autocast
from torchvision.io import read_image

from ldm.inference_base import (diffusion_inference, get_adapters, get_base_argument_parser, get_sd_models)
from ldm.modules.extra_condition import api
from ldm.modules.extra_condition.api import (ExtraCondition, get_adapter_feature, get_cond_model)
from ldm.util import fix_cond_shapes

from masactrl.masactrl_utils import regiter_attention_editor_ldm
from masactrl.masactrl import MutualSelfAttentionControl
from masactrl.masactrl import MutualSelfAttentionControlMask
from masactrl.masactrl import MutualSelfAttentionControlMaskAuto
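
# This script only runs inference, so autograd is disabled globally.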
torch.set_grad_enabled(False)


def main():
    supported_cond = [e.name for e in ExtraCondition]
    parser = get_base_argument_parser()
    parser.add_argument(
        '--which_cond',
        type=str,
        required=True,
        choices=supported_cond,
        help='which condition modality you want to test',
    )
    parser.add_argument(
        "--cond_path_src",
        type=str,
        default=None,
        help="path to the condition image used to synthesize the source image",
    )
    parser.add_argument(
        "--prompt_src",
        type=str,
        default=None,
        help="the prompt used to synthesize the source image",
    )
    parser.add_argument(
        "--src_img_path",
        type=str,
        default=None,
        help="path to the input real source image",
    )
    parser.add_argument(
        "--start_code_path",
        type=str,
        default=None,
        help="path to the inverted start code used to synthesize the source image",
    )
    parser.add_argument(
        "--masa_step",
        type=int,
        default=4,
        help="the denoising step from which MasaCtrl starts",
    )
    parser.add_argument(
        "--masa_layer",
        type=int,
        default=10,
        help="the U-Net layer from which MasaCtrl starts",
    )

    opt = parser.parse_args()
    which_cond = opt.which_cond
    if opt.outdir is None:
        opt.outdir = f'outputs/test-{which_cond}'
    os.makedirs(opt.outdir, exist_ok=True)
    if opt.resize_short_edge is None:
        print(f"resize_short_edge is not specified, so the maximum resolution is set to {opt.max_resolution}")
    opt.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    if os.path.isdir(opt.cond_path):
        image_paths = [os.path.join(opt.cond_path, f) for f in os.listdir(opt.cond_path)]
    else:
        image_paths = [opt.cond_path]
    print(image_paths)
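
    # Load the SD model and sampler, plus the adapter (and, if the input type
    # is a raw image, the condition-extraction model) for the chosen modality.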
    sd_model, sampler = get_sd_models(opt)
    adapter = get_adapters(opt, getattr(ExtraCondition, which_cond))
    cond_model = None
    if opt.cond_inp_type == 'image':
        cond_model = get_cond_model(opt, getattr(ExtraCondition, which_cond))

    process_cond_module = getattr(api, f'get_cond_{which_cond}')
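
    # MasaCtrl hyperparameters: mutual self-attention is enabled from denoising
    # step STEP and U-Net layer LAYER onward.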
    STEP = opt.masa_step if opt.masa_step is not None else 4
    LAYER = opt.masa_layer if opt.masa_layer is not None else 10

    with torch.inference_mode(), \
            sd_model.ema_scope(), \
            autocast('cuda'):
        for test_idx, cond_path in enumerate(image_paths):
            seed_everything(opt.seed)
            for v_idx in range(opt.n_samples):
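                # Build the condition map(s): an optional one for the source
                # branch and one for the target branch.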
                if opt.cond_path_src:
                    cond_src = process_cond_module(opt, opt.cond_path_src, opt.cond_inp_type, cond_model)
                cond = process_cond_module(opt, cond_path, opt.cond_inp_type, cond_model)

                base_count = len(os.listdir(opt.outdir)) // 2
                cv2.imwrite(os.path.join(opt.outdir, f'{base_count:05}_{which_cond}.png'), tensor2img(cond))
                if opt.cond_path_src:
                    cv2.imwrite(os.path.join(opt.outdir, f'{base_count:05}_{which_cond}_src.png'), tensor2img(cond_src))

                adapter_features, append_to_context = get_adapter_feature(cond, adapter)
                if opt.cond_path_src:
                    adapter_features_src, append_to_context_src = get_adapter_feature(cond_src, adapter)
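
                # Source and target are denoised as one batch of two: row 0 carries
                # the source adapter features (zeros when no source condition is
                # given) and row 1 carries the target features.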
                if opt.cond_path_src:
                    print("using reference guidance to synthesize the image")
                    adapter_features = [torch.cat([adapter_features_src[i], adapter_features[i]]) for i in range(len(adapter_features))]
                else:
                    adapter_features = [torch.cat([torch.zeros_like(feats), feats]) for feats in adapter_features]
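
                # With classifier-free guidance the batch is doubled (unconditional +
                # conditional), so the adapter features must be repeated to match.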
                if opt.scale > 1.:
                    adapter_features = [torch.cat([feats] * 2) for feats in adapter_features]

                if opt.prompt_src is not None:
                    prompts = [opt.prompt_src, opt.prompt]
                else:
                    prompts = [opt.prompt] * 2
                print("prompts: ", prompts)

                c = sd_model.get_learned_conditioning(prompts)
                if opt.scale != 1.0:
                    uc = sd_model.get_learned_conditioning([""] * len(prompts))
                else:
                    uc = None
                c, uc = fix_cond_shapes(sd_model, c, uc)

                if not hasattr(opt, 'H'):
                    opt.H = 512
                    opt.W = 512
                shape = [opt.C, opt.H // opt.f, opt.W // opt.f]
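
                # The start code comes from one of three sources: DDIM inversion of
                # a real image, a previously saved inversion, or Gaussian noise.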
                if opt.src_img_path:
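                    # Load the source image and normalize it to [-1, 1], NCHW, at (H, W).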
                    src_img = read_image(opt.src_img_path)
                    src_img = src_img.float() / 255.
                    src_img = src_img * 2 - 1
                    if src_img.dim() == 3:
                        src_img = src_img.unsqueeze(0)
                    src_img = F.interpolate(src_img, (opt.H, opt.W))
                    src_img = src_img.to(opt.device)
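
                    # Encode to latents and invert with DDIM to recover the start code.
                    # Note: this branch assumes opt.scale != 1.0, so that uc is not None.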
                    encoder_posterior = sd_model.encode_first_stage(src_img)
                    src_x_0 = sd_model.get_first_stage_encoding(encoder_posterior)
                    start_code, latents_dict = sampler.ddim_sampling_reverse(
                        num_steps=opt.steps,
                        x_0=src_x_0,
                        conditioning=uc[:1],
                        unconditional_guidance_scale=opt.scale,
                        unconditional_conditioning=uc[:1],
                    )
                    torch.save(
                        {
                            "start_code": start_code
                        },
                        os.path.join(opt.outdir, "start_code.pth"),
                    )
                elif opt.start_code_path:
                    start_code_dict = torch.load(opt.start_code_path)
                    start_code = start_code_dict.get("start_code").to(opt.device)
                else:
                    start_code = torch.randn([1, *shape], device=opt.device)
                start_code = start_code.expand(len(prompts), -1, -1, -1)
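
                # Hook MasaCtrl into the U-Net: the registered editor makes the target
                # branch query the source branch's self-attention keys/values from
                # step STEP and layer LAYER onward.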
                editor = MutualSelfAttentionControl(STEP, LAYER)
                regiter_attention_editor_ldm(sd_model, editor)
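
                # Sample the source and target images jointly in one batch.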
                samples_latents, _ = sampler.sample(
                    S=opt.steps,
                    conditioning=c,
                    batch_size=len(prompts),
                    shape=shape,
                    verbose=False,
                    unconditional_guidance_scale=opt.scale,
                    unconditional_conditioning=uc,
                    x_T=start_code,
                    features_adapter=adapter_features,
                    append_to_context=append_to_context,
                    cond_tau=opt.cond_tau,
                    style_cond_tau=opt.style_cond_tau,
                )

                x_samples = sd_model.decode_first_stage(samples_latents)
                x_samples = torch.clamp((x_samples + 1.0) / 2.0, min=0.0, max=1.0)

                cv2.imwrite(os.path.join(opt.outdir, f'{base_count:05}_all_result.png'), tensor2img(x_samples))

                with open(os.path.join(opt.outdir, "log.txt"), "w") as f:
                    for prom in prompts:
                        f.write(prom)
                        f.write("\n")
                    f.write(f"seed: {opt.seed}")
                for i in range(len(x_samples)):
                    base_count += 1
                    cv2.imwrite(os.path.join(opt.outdir, f'{base_count:05}_result.png'), tensor2img(x_samples[i]))


if __name__ == '__main__':
    main()