# -*- coding: utf-8 -*-
# Copyright (c) Alibaba, Inc. and its affiliates.
import math
from collections import OrderedDict
from functools import partial
import warnings
from contextlib import nullcontext
import torch
from einops import rearrange, repeat
from scepter.modules.model.base_model import BaseModel
from scepter.modules.model.registry import BACKBONES
from scepter.modules.utils.config import dict_to_yaml
from scepter.modules.utils.distribute import we
from scepter.modules.utils.file_system import FS
from torch import Tensor, nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.checkpoint import checkpoint_sequential
import torch.nn.functional as F
import torch.utils.dlpack
import transformers
from scepter.modules.model.embedder.base_embedder import BaseEmbedder
from scepter.modules.model.registry import EMBEDDERS
from scepter.modules.model.tokenizer.tokenizer_component import (
basic_clean, canonicalize, heavy_clean, whitespace_clean)
try:
from transformers import AutoTokenizer, T5EncoderModel
except Exception as e:
warnings.warn(
f'Import transformers error, please deal with this problem: {e}')
from .layers import (DoubleStreamBlock, EmbedND, LastLayer,
MLPEmbedder, SingleStreamBlock,
timestep_embedding)
@EMBEDDERS.register_class()
class ACETextEmbedder(BaseEmbedder):
"""
Uses the OpenCLIP transformer encoder for text
"""
"""
Uses the OpenCLIP transformer encoder for text
"""
para_dict = {
'PRETRAINED_MODEL': {
'value':
'google/umt5-small',
'description':
'Pretrained Model for umt5, modelcard path or local path.'
},
'TOKENIZER_PATH': {
'value': 'google/umt5-small',
'description':
'Tokenizer Path for umt5, modelcard path or local path.'
},
'FREEZE': {
'value': True,
'description': ''
},
'USE_GRAD': {
'value': False,
'description': 'Compute grad or not.'
},
'CLEAN': {
'value':
'whitespace',
'description':
            'Set the clean strategy for tokenizer, used when TOKENIZER_PATH is not None.'
},
'LAYER': {
'value': 'last',
'description': ''
},
'LEGACY': {
'value':
True,
'description':
            'Whether to use the legacy returned feature or not, default True.'
}
}
def __init__(self, cfg, logger=None):
super().__init__(cfg, logger=logger)
pretrained_path = cfg.get('PRETRAINED_MODEL', None)
self.t5_dtype = cfg.get('T5_DTYPE', 'float32')
assert pretrained_path
with FS.get_dir_to_local_dir(pretrained_path,
wait_finish=True) as local_path:
self.model = T5EncoderModel.from_pretrained(
local_path,
torch_dtype=getattr(
torch,
'float' if self.t5_dtype == 'float32' else self.t5_dtype))
tokenizer_path = cfg.get('TOKENIZER_PATH', None)
self.length = cfg.get('LENGTH', 77)
self.use_grad = cfg.get('USE_GRAD', False)
self.clean = cfg.get('CLEAN', 'whitespace')
self.added_identifier = cfg.get('ADDED_IDENTIFIER', None)
if tokenizer_path:
self.tokenize_kargs = {'return_tensors': 'pt'}
with FS.get_dir_to_local_dir(tokenizer_path,
wait_finish=True) as local_path:
                # added_identifier is kept for config compatibility; it does not
                # change how the tokenizer is built in this embedder.
                self.tokenizer = AutoTokenizer.from_pretrained(local_path)
if self.length is not None:
self.tokenize_kargs.update({
'padding': 'max_length',
'truncation': True,
'max_length': self.length
})
self.eos_token = self.tokenizer(
self.tokenizer.eos_token)['input_ids'][0]
else:
self.tokenizer = None
self.tokenize_kargs = {}
self.use_grad = cfg.get('USE_GRAD', False)
self.clean = cfg.get('CLEAN', 'whitespace')
def freeze(self):
self.model = self.model.eval()
for param in self.parameters():
param.requires_grad = False
# encode && encode_text
def forward(self, tokens, return_mask=False, use_mask=True):
        # inputs are already tokenized (see encode()); here we only run the T5 encoder
embedding_context = nullcontext if self.use_grad else torch.no_grad
with embedding_context():
if use_mask:
x = self.model(tokens.input_ids.to(we.device_id),
tokens.attention_mask.to(we.device_id))
else:
x = self.model(tokens.input_ids.to(we.device_id))
x = x.last_hidden_state
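            # `.detach() + 0.0` hands back a gradient-free copy of the hidden states.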
if return_mask:
return x.detach() + 0.0, tokens.attention_mask.to(we.device_id)
else:
return x.detach() + 0.0, None
def _clean(self, text):
if self.clean == 'whitespace':
text = whitespace_clean(basic_clean(text))
elif self.clean == 'lower':
text = whitespace_clean(basic_clean(text)).lower()
elif self.clean == 'canonicalize':
text = canonicalize(basic_clean(text))
elif self.clean == 'heavy':
text = heavy_clean(basic_clean(text))
return text
def encode(self, text, return_mask=False, use_mask=True):
if isinstance(text, str):
text = [text]
if self.clean:
text = [self._clean(u) for u in text]
assert self.tokenizer is not None
cont, mask = [], []
with torch.autocast(device_type='cuda',
enabled=self.t5_dtype in ('float16', 'bfloat16'),
dtype=getattr(torch, self.t5_dtype)):
for tt in text:
tokens = self.tokenizer([tt], **self.tokenize_kargs)
one_cont, one_mask = self(tokens,
return_mask=return_mask,
use_mask=use_mask)
cont.append(one_cont)
mask.append(one_mask)
if return_mask:
return torch.cat(cont, dim=0), torch.cat(mask, dim=0)
else:
return torch.cat(cont, dim=0)
def encode_list(self, text_list, return_mask=True):
cont_list = []
mask_list = []
for pp in text_list:
cont, cont_mask = self.encode(pp, return_mask=return_mask)
cont_list.append(cont)
mask_list.append(cont_mask)
if return_mask:
return cont_list, mask_list
else:
return cont_list
@staticmethod
def get_config_template():
return dict_to_yaml('MODELS',
__class__.__name__,
ACETextEmbedder.para_dict,
set_name=True)
@EMBEDDERS.register_class()
class ACEHFEmbedder(BaseEmbedder):
para_dict = {
"HF_MODEL_CLS": {
"value": None,
"description": "huggingface cls in transfomer"
},
"MODEL_PATH": {
"value": None,
"description": "model folder path"
},
"HF_TOKENIZER_CLS": {
"value": None,
"description": "huggingface cls in transfomer"
},
"TOKENIZER_PATH": {
"value": None,
"description": "tokenizer folder path"
},
"MAX_LENGTH": {
"value": 77,
"description": "max length of input"
},
"OUTPUT_KEY": {
"value": "last_hidden_state",
"description": "output key"
},
"D_TYPE": {
"value": "float",
"description": "dtype"
},
"BATCH_INFER": {
"value": False,
"description": "batch infer"
}
}
para_dict.update(BaseEmbedder.para_dict)
def __init__(self, cfg, logger=None):
super().__init__(cfg, logger=logger)
hf_model_cls = cfg.get('HF_MODEL_CLS', None)
model_path = cfg.get("MODEL_PATH", None)
hf_tokenizer_cls = cfg.get('HF_TOKENIZER_CLS', None)
tokenizer_path = cfg.get('TOKENIZER_PATH', None)
self.max_length = cfg.get('MAX_LENGTH', 77)
self.output_key = cfg.get("OUTPUT_KEY", "last_hidden_state")
self.d_type = cfg.get("D_TYPE", "float")
self.clean = cfg.get("CLEAN", "whitespace")
self.batch_infer = cfg.get("BATCH_INFER", False)
self.added_identifier = cfg.get('ADDED_IDENTIFIER', None)
torch_dtype = getattr(torch, self.d_type)
assert hf_model_cls is not None and hf_tokenizer_cls is not None
assert model_path is not None and tokenizer_path is not None
with FS.get_dir_to_local_dir(tokenizer_path, wait_finish=True) as local_path:
self.tokenizer = getattr(transformers, hf_tokenizer_cls).from_pretrained(local_path,
max_length = self.max_length,
torch_dtype = torch_dtype,
additional_special_tokens=self.added_identifier)
with FS.get_dir_to_local_dir(model_path, wait_finish=True) as local_path:
self.hf_module = getattr(transformers, hf_model_cls).from_pretrained(local_path, torch_dtype = torch_dtype)
self.hf_module = self.hf_module.eval().requires_grad_(False)
def forward(self, text: list[str], return_mask = False):
batch_encoding = self.tokenizer(
text,
truncation=True,
max_length=self.max_length,
return_length=False,
return_overflowing_tokens=False,
padding="max_length",
return_tensors="pt",
)
outputs = self.hf_module(
input_ids=batch_encoding["input_ids"].to(self.hf_module.device),
attention_mask=None,
output_hidden_states=False,
)
if return_mask:
return outputs[self.output_key], batch_encoding['attention_mask'].to(self.hf_module.device)
else:
return outputs[self.output_key], None
def encode(self, text, return_mask = False):
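        # With BATCH_INFER off, prompts are encoded one at a time (each padded to
        # max_length); otherwise the whole list is tokenized and encoded as a batch.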
if isinstance(text, str):
text = [text]
if self.clean:
text = [self._clean(u) for u in text]
if not self.batch_infer:
cont, mask = [], []
for tt in text:
one_cont, one_mask = self([tt], return_mask=return_mask)
cont.append(one_cont)
mask.append(one_mask)
if return_mask:
return torch.cat(cont, dim=0), torch.cat(mask, dim=0)
else:
return torch.cat(cont, dim=0)
else:
ret_data = self(text, return_mask = return_mask)
if return_mask:
return ret_data
else:
return ret_data[0]
def encode_list(self, text_list, return_mask=True):
cont_list = []
mask_list = []
for pp in text_list:
cont = self.encode(pp, return_mask=return_mask)
            if return_mask:
                cont_list.append(cont[0])
                mask_list.append(cont[1])
            else:
                cont_list.append(cont)
                mask_list.append(None)
if return_mask:
return cont_list, mask_list
else:
return cont_list
def encode_list_of_list(self, text_list, return_mask=True):
cont_list = []
mask_list = []
for pp in text_list:
cont = self.encode_list(pp, return_mask=return_mask)
            if return_mask:
                cont_list.append(cont[0])
                mask_list.append(cont[1])
            else:
                cont_list.append(cont)
                mask_list.append(None)
if return_mask:
return cont_list, mask_list
else:
return cont_list
def _clean(self, text):
if self.clean == 'whitespace':
text = whitespace_clean(basic_clean(text))
elif self.clean == 'lower':
text = whitespace_clean(basic_clean(text)).lower()
elif self.clean == 'canonicalize':
text = canonicalize(basic_clean(text))
return text
@staticmethod
def get_config_template():
return dict_to_yaml('EMBEDDER',
__class__.__name__,
ACEHFEmbedder.para_dict,
set_name=True)
@EMBEDDERS.register_class()
class T5ACEPlusClipFluxEmbedder(BaseEmbedder):
"""
Uses the OpenCLIP transformer encoder for text
"""
para_dict = {
'T5_MODEL': {},
'CLIP_MODEL': {}
}
def __init__(self, cfg, logger=None):
super().__init__(cfg, logger=logger)
self.t5_model = EMBEDDERS.build(cfg.T5_MODEL, logger=logger)
self.clip_model = EMBEDDERS.build(cfg.CLIP_MODEL, logger=logger)
def encode(self, text, return_mask = False):
t5_embeds = self.t5_model.encode(text, return_mask = return_mask)
clip_embeds = self.clip_model.encode(text, return_mask = return_mask)
# change embedding strategy here
return {
'context': t5_embeds,
'y': clip_embeds,
}
def encode_list(self, text, return_mask = False):
t5_embeds = self.t5_model.encode_list(text, return_mask = return_mask)
clip_embeds = self.clip_model.encode_list(text, return_mask = return_mask)
# change embedding strategy here
return {
'context': t5_embeds,
'y': clip_embeds,
}
def encode_list_of_list(self, text, return_mask = False):
t5_embeds = self.t5_model.encode_list_of_list(text, return_mask = return_mask)
clip_embeds = self.clip_model.encode_list_of_list(text, return_mask = return_mask)
# change embedding strategy here
return {
'context': t5_embeds,
'y': clip_embeds,
}
@staticmethod
def get_config_template():
return dict_to_yaml('EMBEDDER',
__class__.__name__,
T5ACEPlusClipFluxEmbedder.para_dict,
set_name=True)
@BACKBONES.register_class()
class Flux(BaseModel):
"""
Transformer backbone Diffusion model with RoPE.
"""
para_dict = {
"IN_CHANNELS": {
"value": 64,
"description": "model's input channels."
},
"OUT_CHANNELS": {
"value": 64,
"description": "model's output channels."
},
"HIDDEN_SIZE": {
"value": 1024,
"description": "model's hidden size."
},
"NUM_HEADS": {
"value": 16,
"description": "number of heads in the transformer."
},
"AXES_DIM": {
"value": [16, 56, 56],
"description": "dimensions of the axes of the positional encoding."
},
"THETA": {
"value": 10_000,
"description": "theta for positional encoding."
},
"VEC_IN_DIM": {
"value": 768,
"description": "dimension of the vector input."
},
"GUIDANCE_EMBED": {
"value": False,
"description": "whether to use guidance embedding."
},
"CONTEXT_IN_DIM": {
"value": 4096,
"description": "dimension of the context input."
},
"MLP_RATIO": {
"value": 4.0,
"description": "ratio of mlp hidden size to hidden size."
},
"QKV_BIAS": {
"value": True,
"description": "whether to use bias in qkv projection."
},
"DEPTH": {
"value": 19,
"description": "number of transformer blocks."
},
"DEPTH_SINGLE_BLOCKS": {
"value": 38,
"description": "number of transformer blocks in the single stream block."
},
"USE_GRAD_CHECKPOINT": {
"value": False,
"description": "whether to use gradient checkpointing."
},
"ATTN_BACKEND": {
"value": "pytorch",
"description": "backend for the transformer blocks, 'pytorch' or 'flash_attn'."
}
}
def __init__(
self,
cfg,
logger = None
):
super().__init__(cfg, logger=logger)
self.in_channels = cfg.IN_CHANNELS
self.out_channels = cfg.get("OUT_CHANNELS", self.in_channels)
hidden_size = cfg.get("HIDDEN_SIZE", 1024)
num_heads = cfg.get("NUM_HEADS", 16)
axes_dim = cfg.AXES_DIM
theta = cfg.THETA
vec_in_dim = cfg.VEC_IN_DIM
self.guidance_embed = cfg.GUIDANCE_EMBED
context_in_dim = cfg.CONTEXT_IN_DIM
mlp_ratio = cfg.MLP_RATIO
qkv_bias = cfg.QKV_BIAS
depth = cfg.DEPTH
depth_single_blocks = cfg.DEPTH_SINGLE_BLOCKS
self.use_grad_checkpoint = cfg.get("USE_GRAD_CHECKPOINT", False)
self.attn_backend = cfg.get("ATTN_BACKEND", "pytorch")
self.lora_model = cfg.get("DIFFUSERS_LORA_MODEL", None)
self.swift_lora_model = cfg.get("SWIFT_LORA_MODEL", None)
self.pretrain_adapter = cfg.get("PRETRAIN_ADAPTER", None)
if hidden_size % num_heads != 0:
raise ValueError(
f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}"
)
pe_dim = hidden_size // num_heads
if sum(axes_dim) != pe_dim:
raise ValueError(f"Got {axes_dim} but expected positional dim {pe_dim}")
self.hidden_size = hidden_size
self.num_heads = num_heads
self.pe_embedder = EmbedND(dim=pe_dim, theta=theta, axes_dim= axes_dim)
self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
self.vector_in = MLPEmbedder(vec_in_dim, self.hidden_size)
self.guidance_in = (
MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if self.guidance_embed else nn.Identity()
)
self.txt_in = nn.Linear(context_in_dim, self.hidden_size)
self.double_blocks = nn.ModuleList(
[
DoubleStreamBlock(
self.hidden_size,
self.num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
backend=self.attn_backend
)
for _ in range(depth)
]
)
self.single_blocks = nn.ModuleList(
[
SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=mlp_ratio, backend=self.attn_backend)
for _ in range(depth_single_blocks)
]
)
self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
def prepare_input(self, x, context, y, x_shape=None):
        # x: latent of shape (b, c, h, w); pack 2x2 patches so each token
        # carries c * 2 * 2 channels.
bs, c, h, w = x.shape
x = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
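        # Build 3-axis RoPE position ids: axis 0 stays zero, axis 1 is the patch
        # row index, axis 2 is the patch column index.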
x_id = torch.zeros(h // 2, w // 2, 3)
x_id[..., 1] = x_id[..., 1] + torch.arange(h // 2)[:, None]
x_id[..., 2] = x_id[..., 2] + torch.arange(w // 2)[None, :]
x_ids = repeat(x_id, "h w c -> b (h w) c", b=bs)
txt_ids = torch.zeros(bs, context.shape[1], 3)
return x, x_ids.to(x), context.to(x), txt_ids.to(x), y.to(x), h, w
def unpack(self, x: Tensor, height: int, width: int) -> Tensor:
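        # Inverse of prepare_input's 2x2 patchify: fold the token sequence back
        # into a (b, c, h, w) latent.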
return rearrange(
x,
"b (h w) (c ph pw) -> b c (h ph) (w pw)",
h=math.ceil(height/2),
w=math.ceil(width/2),
ph=2,
pw=2,
)
def merge_diffuser_lora(self, ori_sd, lora_sd, scale = 1.0):
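        # Merge a diffusers-format LoRA into this model's state dict. For every
        # mapped base weight W, the dense update is lora_B @ lora_A (the permutes
        # below undo the transposed storage); updates for fused projections
        # (e.g. qkv, linear1) are concatenated along the output dimension, then
        # W += scale * update.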
key_map = {
"single_blocks.{}.linear1.weight": {"key_list": [
["transformer.single_transformer_blocks.{}.attn.to_q.lora_A.weight",
"transformer.single_transformer_blocks.{}.attn.to_q.lora_B.weight"],
["transformer.single_transformer_blocks.{}.attn.to_k.lora_A.weight",
"transformer.single_transformer_blocks.{}.attn.to_k.lora_B.weight"],
["transformer.single_transformer_blocks.{}.attn.to_v.lora_A.weight",
"transformer.single_transformer_blocks.{}.attn.to_v.lora_B.weight"],
["transformer.single_transformer_blocks.{}.proj_mlp.lora_A.weight",
"transformer.single_transformer_blocks.{}.proj_mlp.lora_B.weight"]
], "num": 38},
"single_blocks.{}.modulation.lin.weight": {"key_list": [
["transformer.single_transformer_blocks.{}.norm.linear.lora_A.weight",
"transformer.single_transformer_blocks.{}.norm.linear.lora_B.weight"],
], "num": 38},
"single_blocks.{}.linear2.weight": {"key_list": [
["transformer.single_transformer_blocks.{}.proj_out.lora_A.weight",
"transformer.single_transformer_blocks.{}.proj_out.lora_B.weight"],
], "num": 38},
"double_blocks.{}.txt_attn.qkv.weight": {"key_list": [
["transformer.transformer_blocks.{}.attn.add_q_proj.lora_A.weight",
"transformer.transformer_blocks.{}.attn.add_q_proj.lora_B.weight"],
["transformer.transformer_blocks.{}.attn.add_k_proj.lora_A.weight",
"transformer.transformer_blocks.{}.attn.add_k_proj.lora_B.weight"],
["transformer.transformer_blocks.{}.attn.add_v_proj.lora_A.weight",
"transformer.transformer_blocks.{}.attn.add_v_proj.lora_B.weight"],
], "num": 19},
"double_blocks.{}.img_attn.qkv.weight": {"key_list": [
["transformer.transformer_blocks.{}.attn.to_q.lora_A.weight",
"transformer.transformer_blocks.{}.attn.to_q.lora_B.weight"],
["transformer.transformer_blocks.{}.attn.to_k.lora_A.weight",
"transformer.transformer_blocks.{}.attn.to_k.lora_B.weight"],
["transformer.transformer_blocks.{}.attn.to_v.lora_A.weight",
"transformer.transformer_blocks.{}.attn.to_v.lora_B.weight"],
], "num": 19},
"double_blocks.{}.img_attn.proj.weight": {"key_list": [
["transformer.transformer_blocks.{}.attn.to_out.0.lora_A.weight",
"transformer.transformer_blocks.{}.attn.to_out.0.lora_B.weight"]
], "num": 19},
"double_blocks.{}.txt_attn.proj.weight": {"key_list": [
["transformer.transformer_blocks.{}.attn.to_add_out.lora_A.weight",
"transformer.transformer_blocks.{}.attn.to_add_out.lora_B.weight"]
], "num": 19},
"double_blocks.{}.img_mlp.0.weight": {"key_list": [
["transformer.transformer_blocks.{}.ff.net.0.proj.lora_A.weight",
"transformer.transformer_blocks.{}.ff.net.0.proj.lora_B.weight"]
], "num": 19},
"double_blocks.{}.img_mlp.2.weight": {"key_list": [
["transformer.transformer_blocks.{}.ff.net.2.lora_A.weight",
"transformer.transformer_blocks.{}.ff.net.2.lora_B.weight"]
], "num": 19},
"double_blocks.{}.txt_mlp.0.weight": {"key_list": [
["transformer.transformer_blocks.{}.ff_context.net.0.proj.lora_A.weight",
"transformer.transformer_blocks.{}.ff_context.net.0.proj.lora_B.weight"]
], "num": 19},
"double_blocks.{}.txt_mlp.2.weight": {"key_list": [
["transformer.transformer_blocks.{}.ff_context.net.2.lora_A.weight",
"transformer.transformer_blocks.{}.ff_context.net.2.lora_B.weight"]
], "num": 19},
"double_blocks.{}.img_mod.lin.weight": {"key_list": [
["transformer.transformer_blocks.{}.norm1.linear.lora_A.weight",
"transformer.transformer_blocks.{}.norm1.linear.lora_B.weight"]
], "num": 19},
"double_blocks.{}.txt_mod.lin.weight": {"key_list": [
["transformer.transformer_blocks.{}.norm1_context.linear.lora_A.weight",
"transformer.transformer_blocks.{}.norm1_context.linear.lora_B.weight"]
], "num": 19}
}
for k, v in key_map.items():
key_list = v["key_list"]
block_num = v["num"]
for block_id in range(block_num):
current_weight_list = []
for k_list in key_list:
current_weight = torch.matmul(lora_sd[k_list[0].format(block_id)].permute(1, 0),
lora_sd[k_list[1].format(block_id)].permute(1, 0)).permute(1, 0)
current_weight_list.append(current_weight)
current_weight = torch.cat(current_weight_list, dim=0)
ori_sd[k.format(block_id)] += scale*current_weight
return ori_sd
def merge_swift_lora(self, ori_sd, lora_sd, scale = 1.0):
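        # Merge a SWIFT LoRA checkpoint: each "...lora_A/lora_B..." key is mapped
        # back to its base key by splitting on "lora", then W += scale * (B @ A).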
have_lora_keys = {}
for k, v in lora_sd.items():
k = k[len("model."):] if k.startswith("model.") else k
ori_key = k.split("lora")[0] + "weight"
if ori_key not in ori_sd:
                raise KeyError(f"{ori_key} should be in the original state dict")
if ori_key not in have_lora_keys:
have_lora_keys[ori_key] = {}
if "lora_A" in k:
have_lora_keys[ori_key]["lora_A"] = v
elif "lora_B" in k:
have_lora_keys[ori_key]["lora_B"] = v
else:
raise NotImplementedError
for key, v in have_lora_keys.items():
current_weight = torch.matmul(v["lora_A"].permute(1, 0), v["lora_B"].permute(1, 0)).permute(1, 0)
ori_sd[key] += scale * current_weight
return ori_sd
def load_pretrained_model(self, pretrained_model):
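        # Load a base checkpoint (safetensors or torch), optionally merge
        # diffusers / SWIFT LoRA deltas and adapter weights into it, and handle
        # the case where the checkpoint's img_in.weight has fewer input channels
        # than this model (only the first 64 columns are copied, the rest stay zero).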
if next(self.parameters()).device.type == 'meta':
map_location = we.device_id
else:
map_location = "cpu"
if self.lora_model is not None:
map_location = we.device_id
if pretrained_model is not None:
with FS.get_from(pretrained_model, wait_finish=True) as local_model:
if local_model.endswith('safetensors'):
from safetensors.torch import load_file as load_safetensors
sd = load_safetensors(local_model, device=map_location)
else:
sd = torch.load(local_model, map_location=map_location)
if "state_dict" in sd:
sd = sd["state_dict"]
if "model" in sd:
sd = sd["model"]["model"]
if self.lora_model is not None:
with FS.get_from(self.lora_model, wait_finish=True) as local_model:
if local_model.endswith('safetensors'):
from safetensors.torch import load_file as load_safetensors
lora_sd = load_safetensors(local_model, device=map_location)
else:
lora_sd = torch.load(local_model, map_location=map_location)
sd = self.merge_diffuser_lora(sd, lora_sd)
if self.swift_lora_model is not None:
with FS.get_from(self.swift_lora_model, wait_finish=True) as local_model:
if local_model.endswith('safetensors'):
from safetensors.torch import load_file as load_safetensors
lora_sd = load_safetensors(local_model, device=map_location)
else:
lora_sd = torch.load(local_model, map_location=map_location)
sd = self.merge_swift_lora(sd, lora_sd)
adapter_ckpt = {}
if self.pretrain_adapter is not None:
with FS.get_from(self.pretrain_adapter, wait_finish=True) as local_adapter:
                    if local_adapter.endswith('safetensors'):
from safetensors.torch import load_file as load_safetensors
adapter_ckpt = load_safetensors(local_adapter, device=map_location)
else:
adapter_ckpt = torch.load(local_adapter, map_location=map_location)
sd.update(adapter_ckpt)
new_ckpt = OrderedDict()
for k, v in sd.items():
                if k == "img_in.weight":
model_p = self.state_dict()[k]
if v.shape != model_p.shape:
model_p.zero_()
model_p[:, :64].copy_(v[:, :64])
new_ckpt[k] = torch.nn.parameter.Parameter(model_p)
else:
new_ckpt[k] = v
else:
new_ckpt[k] = v
missing, unexpected = self.load_state_dict(new_ckpt, strict=False, assign=True)
self.logger.info(
f'Restored from {pretrained_model} with {len(missing)} missing and {len(unexpected)} unexpected keys'
)
if len(missing) > 0:
self.logger.info(f'Missing Keys:\n {missing}')
if len(unexpected) > 0:
self.logger.info(f'\nUnexpected Keys:\n {unexpected}')
def forward(
self,
x: Tensor,
t: Tensor,
cond: dict = {},
guidance: Tensor | None = None,
gc_seg: int = 0
) -> Tensor:
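        # gc_seg controls gradient checkpointing: > 0 sets the number of
        # checkpoint segments, 0 checkpoints every block, and any negative value
        # disables checkpointing even if USE_GRAD_CHECKPOINT is on.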
x, x_ids, txt, txt_ids, y, h, w = self.prepare_input(x, cond["context"], cond["y"])
# running on sequences img
x = self.img_in(x)
vec = self.time_in(timestep_embedding(t, 256))
if self.guidance_embed:
if guidance is None:
raise ValueError("Didn't get guidance strength for guidance distilled model.")
vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
vec = vec + self.vector_in(y)
txt = self.txt_in(txt)
ids = torch.cat((txt_ids, x_ids), dim=1)
pe = self.pe_embedder(ids)
kwargs = dict(
vec=vec,
pe=pe,
txt_length=txt.shape[1],
)
x = torch.cat((txt, x), 1)
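        # Text tokens are prepended to image tokens; txt_length lets each
        # double-stream block split the fused sequence back into its two streams.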
if self.use_grad_checkpoint and gc_seg >= 0:
x = checkpoint_sequential(
functions=[partial(block, **kwargs) for block in self.double_blocks],
segments=gc_seg if gc_seg > 0 else len(self.double_blocks),
input=x,
use_reentrant=False
)
else:
for block in self.double_blocks:
x = block(x, **kwargs)
kwargs = dict(
vec=vec,
pe=pe,
)
if self.use_grad_checkpoint and gc_seg >= 0:
x = checkpoint_sequential(
functions=[partial(block, **kwargs) for block in self.single_blocks],
segments=gc_seg if gc_seg > 0 else len(self.single_blocks),
input=x,
use_reentrant=False
)
else:
for block in self.single_blocks:
x = block(x, **kwargs)
x = x[:, txt.shape[1] :, ...]
        x = self.final_layer(x, vec)  # (N, T, patch_size ** 2 * out_channels)
x = self.unpack(x, h, w)
return x
@staticmethod
def get_config_template():
return dict_to_yaml('MODEL',
__class__.__name__,
Flux.para_dict,
set_name=True)
@BACKBONES.register_class()
class FluxMR(Flux):
def prepare_input(self, x, cond):
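        # Multi-resolution variant: every sample keeps its own latent shape
        # (cond["x_shapes"]). Each latent is un-flattened, 2x2-patchified,
        # projected by img_in, then padded to a common sequence length; boolean
        # masks record which tokens are real.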
if isinstance(cond['context'], list):
context, y = torch.cat(cond["context"], dim=0).to(x), torch.cat(cond["y"], dim=0).to(x)
else:
context, y = cond['context'].to(x), cond['y'].to(x)
batch_frames, batch_frames_ids = [], []
for ix, shape in zip(x, cond["x_shapes"]):
# unpack image from sequence
ix = ix[:, :shape[0] * shape[1]].view(-1, shape[0], shape[1])
c, h, w = ix.shape
ix = rearrange(ix, "c (h ph) (w pw) -> (h w) (c ph pw)", ph=2, pw=2)
ix_id = torch.zeros(h // 2, w // 2, 3)
ix_id[..., 1] = ix_id[..., 1] + torch.arange(h // 2)[:, None]
ix_id[..., 2] = ix_id[..., 2] + torch.arange(w // 2)[None, :]
ix_id = rearrange(ix_id, "h w c -> (h w) c")
batch_frames.append([ix])
batch_frames_ids.append([ix_id])
x_list, x_id_list, mask_x_list, x_seq_length = [], [], [], []
for frames, frame_ids in zip(batch_frames, batch_frames_ids):
proj_frames = []
for idx, one_frame in enumerate(frames):
one_frame = self.img_in(one_frame)
proj_frames.append(one_frame)
ix = torch.cat(proj_frames, dim=0)
if_id = torch.cat(frame_ids, dim=0)
x_list.append(ix)
x_id_list.append(if_id)
mask_x_list.append(torch.ones(ix.shape[0]).to(ix.device, non_blocking=True).bool())
x_seq_length.append(ix.shape[0])
x = pad_sequence(tuple(x_list), batch_first=True)
        x_ids = pad_sequence(tuple(x_id_list), batch_first=True).to(x)  # [b, pad_seq, 3]; padded positions get all-zero ids
mask_x = pad_sequence(tuple(mask_x_list), batch_first=True)
txt = self.txt_in(context)
txt_ids = torch.zeros(context.shape[0], context.shape[1], 3).to(x)
mask_txt = torch.ones(context.shape[0], context.shape[1]).to(x.device, non_blocking=True).bool()
return x, x_ids, txt, txt_ids, y, mask_x, mask_txt, x_seq_length
def unpack(self, x: Tensor, cond: dict = None, x_seq_length: list = None) -> Tensor:
x_list = []
image_shapes = cond["x_shapes"]
for u, shape, seq_length in zip(x, image_shapes, x_seq_length):
height, width = shape
h, w = math.ceil(height / 2), math.ceil(width / 2)
u = rearrange(
u[seq_length-h*w:seq_length, ...],
"(h w) (c ph pw) -> (h ph w pw) c",
h=h,
w=w,
ph=2,
pw=2,
)
x_list.append(u)
x = pad_sequence(tuple(x_list), batch_first=True).permute(0, 2, 1)
return x
def forward(
self,
x: Tensor,
t: Tensor,
cond: dict = {},
guidance: Tensor | None = None,
gc_seg: int = 0,
**kwargs
) -> Tensor:
x, x_ids, txt, txt_ids, y, mask_x, mask_txt, seq_length_list = self.prepare_input(x, cond)
# running on sequences img
vec = self.time_in(timestep_embedding(t, 256))
if self.guidance_embed:
if guidance is None:
raise ValueError("Didn't get guidance strength for guidance distilled model.")
vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
vec = vec + self.vector_in(y)
ids = torch.cat((txt_ids, x_ids), dim=1)
pe = self.pe_embedder(ids)
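        # Joint attention mask over [txt, img] tokens: a pair of positions may
        # attend to each other only if both are real (outer product of the
        # per-token validity masks).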
mask_aside = torch.cat((mask_txt, mask_x), dim=1)
mask = mask_aside[:, None, :] * mask_aside[:, :, None]
kwargs = dict(
vec=vec,
pe=pe,
mask=mask,
txt_length = txt.shape[1],
)
x = torch.cat((txt, x), 1)
if self.use_grad_checkpoint and gc_seg >= 0:
x = checkpoint_sequential(
functions=[partial(block, **kwargs) for block in self.double_blocks],
segments=gc_seg if gc_seg > 0 else len(self.double_blocks),
input=x,
use_reentrant=False
)
else:
for block in self.double_blocks:
x = block(x, **kwargs)
kwargs = dict(
vec=vec,
pe=pe,
mask=mask,
)
if self.use_grad_checkpoint and gc_seg >= 0:
x = checkpoint_sequential(
functions=[partial(block, **kwargs) for block in self.single_blocks],
segments=gc_seg if gc_seg > 0 else len(self.single_blocks),
input=x,
use_reentrant=False
)
else:
for block in self.single_blocks:
x = block(x, **kwargs)
x = x[:, txt.shape[1]:, ...]
        x = self.final_layer(x, vec)  # (N, T, patch_size ** 2 * out_channels)
x = self.unpack(x, cond, seq_length_list)
return x
@staticmethod
def get_config_template():
return dict_to_yaml('MODEL',
__class__.__name__,
                            FluxMR.para_dict,
set_name=True)
@BACKBONES.register_class()
class FluxEdit(FluxMR):
def prepare_input(self, x, cond, *args, **kwargs):
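        # Like FluxMR.prepare_input, but optional edit/reference latents
        # (cond['edit_x']) are patchified and appended after the target tokens,
        # with their row position ids offset by the target's height in patches
        # (batch_shift) so they occupy a separate positional range.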
context, y = cond["context"], cond["y"]
batch_frames, batch_frames_ids, batch_shift = [], [], []
for ix, shape, is_align in zip(x, cond["x_shapes"], cond['align']):
# unpack image from sequence
ix = ix[:, :shape[0] * shape[1]].view(-1, shape[0], shape[1])
c, h, w = ix.shape
ix = rearrange(ix, "c (h ph) (w pw) -> (h w) (c ph pw)", ph=2, pw=2)
ix_id = torch.zeros(h // 2, w // 2, 3)
ix_id[..., 1] = ix_id[..., 1] + torch.arange(h // 2)[:, None]
ix_id[..., 2] = ix_id[..., 2] + torch.arange(w // 2)[None, :]
batch_shift.append(h // 2) #if is_align < 1 else batch_shift.append(0)
ix_id = rearrange(ix_id, "h w c -> (h w) c")
batch_frames.append([ix])
batch_frames_ids.append([ix_id])
if 'edit_x' in cond:
for i, edit in enumerate(cond['edit_x']):
if edit is None:
continue
for ie in edit:
ie = ie.squeeze(0)
c, h, w = ie.shape
ie = rearrange(ie, "c (h ph) (w pw) -> (h w) (c ph pw)", ph=2, pw=2)
ie_id = torch.zeros(h // 2, w // 2, 3)
ie_id[..., 1] = ie_id[..., 1] + torch.arange(batch_shift[i], h // 2 + batch_shift[i])[:, None]
ie_id[..., 2] = ie_id[..., 2] + torch.arange(w // 2)[None, :]
ie_id = rearrange(ie_id, "h w c -> (h w) c")
batch_frames[i].append(ie)
batch_frames_ids[i].append(ie_id)
x_list, x_id_list, mask_x_list, x_seq_length = [], [], [], []
for frames, frame_ids in zip(batch_frames, batch_frames_ids):
proj_frames = []
for idx, one_frame in enumerate(frames):
one_frame = self.img_in(one_frame)
proj_frames.append(one_frame)
ix = torch.cat(proj_frames, dim=0)
if_id = torch.cat(frame_ids, dim=0)
x_list.append(ix)
x_id_list.append(if_id)
mask_x_list.append(torch.ones(ix.shape[0]).to(ix.device, non_blocking=True).bool())
x_seq_length.append(ix.shape[0])
x = pad_sequence(tuple(x_list), batch_first=True)
        x_ids = pad_sequence(tuple(x_id_list), batch_first=True).to(x)  # [b, pad_seq, 3]; padded positions get all-zero ids
mask_x = pad_sequence(tuple(mask_x_list), batch_first=True)
txt_list, mask_txt_list, y_list = [], [], []
for sample_id, (ctx, yy) in enumerate(zip(context, y)):
ctx_batch = []
for frame_id, one_ctx in enumerate(ctx):
one_ctx = self.txt_in(one_ctx.to(x))
ctx_batch.append(one_ctx)
txt_list.append(torch.cat(ctx_batch, dim=0))
mask_txt_list.append(torch.ones(txt_list[-1].shape[0]).to(ctx.device, non_blocking=True).bool())
y_list.append(yy.mean(dim = 0, keepdim=True))
txt = pad_sequence(tuple(txt_list), batch_first=True)
txt_ids = torch.zeros(txt.shape[0], txt.shape[1], 3).to(x)
mask_txt = pad_sequence(tuple(mask_txt_list), batch_first=True)
y = torch.cat(y_list, dim=0)
return x, x_ids, txt, txt_ids, y, mask_x, mask_txt, x_seq_length
def unpack(self, x: Tensor, cond: dict = None, x_seq_length: list = None) -> Tensor:
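        # The target image tokens come first in each sequence here, so take the
        # leading h*w tokens (FluxMR.unpack takes the trailing ones instead).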
x_list = []
image_shapes = cond["x_shapes"]
for u, shape, seq_length in zip(x, image_shapes, x_seq_length):
height, width = shape
h, w = math.ceil(height / 2), math.ceil(width / 2)
u = rearrange(
u[:h*w, ...],
"(h w) (c ph pw) -> (h ph w pw) c",
h=h,
w=w,
ph=2,
pw=2,
)
x_list.append(u)
x = pad_sequence(tuple(x_list), batch_first=True).permute(0, 2, 1)
return x
def forward(
self,
x: Tensor,
t: Tensor,
cond: dict = {},
guidance: Tensor | None = None,
gc_seg: int = 0,
text_position_embeddings = None
) -> Tensor:
x, x_ids, txt, txt_ids, y, mask_x, mask_txt, seq_length_list = self.prepare_input(x, cond, text_position_embeddings)
# running on sequences img
vec = self.time_in(timestep_embedding(t, 256))
if self.guidance_embed:
if guidance is None:
raise ValueError("Didn't get guidance strength for guidance distilled model.")
vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
vec = vec + self.vector_in(y)
ids = torch.cat((txt_ids, x_ids), dim=1)
pe = self.pe_embedder(ids)
mask_aside = torch.cat((mask_txt, mask_x), dim=1)
mask = mask_aside[:, None, :] * mask_aside[:, :, None]
kwargs = dict(
vec=vec,
pe=pe,
mask=mask,
txt_length = txt.shape[1],
)
x = torch.cat((txt, x), 1)
if self.use_grad_checkpoint and gc_seg >= 0:
x = checkpoint_sequential(
functions=[partial(block, **kwargs) for block in self.double_blocks],
segments=gc_seg if gc_seg > 0 else len(self.double_blocks),
input=x,
use_reentrant=False
)
else:
for block in self.double_blocks:
x = block(x, **kwargs)
kwargs = dict(
vec=vec,
pe=pe,
mask=mask,
)
if self.use_grad_checkpoint and gc_seg >= 0:
x = checkpoint_sequential(
functions=[partial(block, **kwargs) for block in self.single_blocks],
segments=gc_seg if gc_seg > 0 else len(self.single_blocks),
input=x,
use_reentrant=False
)
else:
for block in self.single_blocks:
x = block(x, **kwargs)
x = x[:, txt.shape[1]:, ...]
        x = self.final_layer(x, vec)  # (N, T, patch_size ** 2 * out_channels)
x = self.unpack(x, cond, seq_length_list)
return x
@staticmethod
def get_config_template():
return dict_to_yaml('MODEL',
__class__.__name__,
FluxEdit.para_dict,
set_name=True)