chaojiemao committed
Commit d1a539d · 1 Parent(s): f2838d1

modify ace plus

config/ace_plus_fft.yaml ADDED
@@ -0,0 +1,192 @@
+ NAME: ACEInference
+ DTYPE: bfloat16
+ VERSION: fft
+ IS_DEFAULT: True
+ MAX_SEQ_LEN: 4096
+ MODEL:
+ NAME: LatentDiffusionACEPlus
+ PARAMETERIZATION: rf
+ TIMESTEPS: 1000
+ GUIDE_SCALE: 1.0
+ PRETRAINED_MODEL:
+ IGNORE_KEYS: [ ]
+ USE_EMA: False
+ EVAL_EMA: False
+ SIZE_FACTOR: 8
+ DIFFUSION:
+ NAME: DiffusionFluxRF
+ PREDICTION_TYPE: raw
+ NOISE_NORM: True
+ # NOISE_SCHEDULER DESCRIPTION: TYPE: default: ''
+ NOISE_SCHEDULER:
+ NAME: FlowMatchFluxShiftScheduler
+ SHIFT: False
+ PRE_T_SAMPLE: True
+ PRE_T_SAMPLE_FOLD: 1
+ SIGMOID_SCALE: 1
+ BASE_SHIFT: 0.5
+ MAX_SHIFT: 1.15
+ SAMPLER_SCHEDULER:
+ NAME: FlowMatchFluxShiftScheduler
+ SHIFT: True
+ PRE_T_SAMPLE: False
+ SIGMOID_SCALE: 1
+ BASE_SHIFT: 0.5
+ MAX_SHIFT: 1.15
+
+ #
+ DIFFUSION_MODEL:
+ # NAME DESCRIPTION: TYPE: default: 'Flux'
+ NAME: FluxMRModiACEPlus
+ PRETRAINED_MODEL: ${ACE_PLUS_FFT_MODEL}
+ # IN_CHANNELS DESCRIPTION: model's input channels. TYPE: int default: 64
+ IN_CHANNELS: 448
+ # OUT_CHANNELS DESCRIPTION: model's output channels. TYPE: int default: 64
+ OUT_CHANNELS: 64
+ # HIDDEN_SIZE DESCRIPTION: model's hidden size. TYPE: int default: 1024
+ HIDDEN_SIZE: 3072
+ REDUX_DIM: 1152
+ # NUM_HEADS DESCRIPTION: number of heads in the transformer. TYPE: int default: 16
+ NUM_HEADS: 24
+ # AXES_DIM DESCRIPTION: dimensions of the axes of the positional encoding. TYPE: list default: [16, 56, 56]
+ AXES_DIM: [ 16, 56, 56 ]
+ # THETA DESCRIPTION: theta for positional encoding. TYPE: int default: 10000
+ THETA: 10000
+ # VEC_IN_DIM DESCRIPTION: dimension of the vector input. TYPE: int default: 768
+ VEC_IN_DIM: 768
+ # GUIDANCE_EMBED DESCRIPTION: whether to use guidance embedding. TYPE: bool default: False
+ GUIDANCE_EMBED: True
+ # CONTEXT_IN_DIM DESCRIPTION: dimension of the context input. TYPE: int default: 4096
+ CONTEXT_IN_DIM: 4096
+ # MLP_RATIO DESCRIPTION: ratio of mlp hidden size to hidden size. TYPE: float default: 4.0
+ MLP_RATIO: 4.0
+ # QKV_BIAS DESCRIPTION: whether to use bias in qkv projection. TYPE: bool default: True
+ QKV_BIAS: True
+ # DEPTH DESCRIPTION: number of transformer blocks. TYPE: int default: 19
+ DEPTH: 19
+ # DEPTH_SINGLE_BLOCKS DESCRIPTION: number of transformer blocks in the single stream block. TYPE: int default: 38
+ DEPTH_SINGLE_BLOCKS: 38
+ ATTN_BACKEND: flash_attn
+
+ #
+ FIRST_STAGE_MODEL:
+ NAME: AutoencoderKLFlux
+ EMBED_DIM: 16
+ PRETRAINED_MODEL: ${FLUX_FILL_PATH}/ae.safetensors
+ IGNORE_KEYS: [ ]
+ BATCH_SIZE: 8
+ USE_CONV: False
+ SCALE_FACTOR: 0.3611
+ SHIFT_FACTOR: 0.1159
+ #
+ ENCODER:
+ NAME: Encoder
+ CH: 128
+ OUT_CH: 3
+ NUM_RES_BLOCKS: 2
+ IN_CHANNELS: 3
+ ATTN_RESOLUTIONS: [ ]
+ CH_MULT: [ 1, 2, 4, 4 ]
+ Z_CHANNELS: 16
+ DOUBLE_Z: True
+ DROPOUT: 0.0
+ RESAMP_WITH_CONV: True
+ #
+ DECODER:
+ NAME: Decoder
+ CH: 128
+ OUT_CH: 3
+ NUM_RES_BLOCKS: 2
+ IN_CHANNELS: 3
+ ATTN_RESOLUTIONS: [ ]
+ CH_MULT: [ 1, 2, 4, 4 ]
+ Z_CHANNELS: 16
+ DROPOUT: 0.0
+ RESAMP_WITH_CONV: True
+ GIVE_PRE_END: False
+ TANH_OUT: False
+ #
+ COND_STAGE_MODEL:
+ # NAME DESCRIPTION: TYPE: default: 'T5PlusClipFluxEmbedder'
+ NAME: T5PlusClipFluxEmbedder
+ # T5_MODEL DESCRIPTION: TYPE: default: ''
+ T5_MODEL:
+ # NAME DESCRIPTION: TYPE: default: 'HFEmbedder'
+ NAME: HFEmbedder
+ # HF_MODEL_CLS DESCRIPTION: huggingface cls in transformers TYPE: NoneType default: None
+ HF_MODEL_CLS: T5EncoderModel
+ # MODEL_PATH DESCRIPTION: model folder path TYPE: NoneType default: None
+ MODEL_PATH: ${FLUX_FILL_PATH}/text_encoder_2/
+ # HF_TOKENIZER_CLS DESCRIPTION: huggingface cls in transformers TYPE: NoneType default: None
+ HF_TOKENIZER_CLS: T5Tokenizer
+ # TOKENIZER_PATH DESCRIPTION: tokenizer folder path TYPE: NoneType default: None
+ TOKENIZER_PATH: ${FLUX_FILL_PATH}/tokenizer_2/
+ ADDED_IDENTIFIER: [ '<img>','{image}', '{caption}', '{mask}', '{ref_image}', '{image1}', '{image2}', '{image3}', '{image4}', '{image5}', '{image6}', '{image7}', '{image8}', '{image9}' ]
+ # MAX_LENGTH DESCRIPTION: max length of input TYPE: int default: 77
+ MAX_LENGTH: 512
+ # OUTPUT_KEY DESCRIPTION: output key TYPE: str default: 'last_hidden_state'
+ OUTPUT_KEY: last_hidden_state
+ # D_TYPE DESCRIPTION: dtype TYPE: str default: 'bfloat16'
+ D_TYPE: bfloat16
+ # BATCH_INFER DESCRIPTION: batch infer TYPE: bool default: False
+ BATCH_INFER: False
+ CLEAN: whitespace
+ # CLIP_MODEL DESCRIPTION: TYPE: default: ''
+ CLIP_MODEL:
+ # NAME DESCRIPTION: TYPE: default: 'HFEmbedder'
+ NAME: HFEmbedder
+ # HF_MODEL_CLS DESCRIPTION: huggingface cls in transformers TYPE: NoneType default: None
+ HF_MODEL_CLS: CLIPTextModel
+ # MODEL_PATH DESCRIPTION: model folder path TYPE: NoneType default: None
+ MODEL_PATH: ${FLUX_FILL_PATH}/text_encoder/
+ # HF_TOKENIZER_CLS DESCRIPTION: huggingface cls in transformers TYPE: NoneType default: None
+ HF_TOKENIZER_CLS: CLIPTokenizer
+ # TOKENIZER_PATH DESCRIPTION: tokenizer folder path TYPE: NoneType default: None
+ TOKENIZER_PATH: ${FLUX_FILL_PATH}/tokenizer/
+ # MAX_LENGTH DESCRIPTION: max length of input TYPE: int default: 77
+ MAX_LENGTH: 77
+ # OUTPUT_KEY DESCRIPTION: output key TYPE: str default: 'last_hidden_state'
+ OUTPUT_KEY: pooler_output
+ # D_TYPE DESCRIPTION: dtype TYPE: str default: 'bfloat16'
+ D_TYPE: bfloat16
+ # BATCH_INFER DESCRIPTION: batch infer TYPE: bool default: False
+ BATCH_INFER: True
+ CLEAN: whitespace
+
+ PREPROCESSOR:
+ - TYPE: repainting
+ REPAINTING_SCALE: 1.0
+ ANNOTATOR:
+ - TYPE: no_preprocess
+ REPAINTING_SCALE: 0.0
+ ANNOTATOR:
+ - TYPE: mosaic_repainting
+ REPAINTING_SCALE: 0.0
+ ANNOTATOR:
+ NAME: ColorAnnotator
+ RATIO: 64
+ - TYPE: contour_repainting
+ REPAINTING_SCALE: 0.0
+ ANNOTATOR:
+ NAME: InfoDrawContourAnnotator
+ INPUT_NC: 3
+ OUTPUT_NC: 1
+ N_RESIDUAL_BLOCKS: 3
+ SIGMOID: True
+ PRETRAINED_MODEL: "ms://iic/scepter_annotator@annotator/ckpts/informative_drawing_contour_style.pth"
+ - TYPE: depth_repainting
+ REPAINTING_SCALE: 0.0
+ ANNOTATOR:
+ NAME: MidasDetector
+ PRETRAINED_MODEL: "ms://iic/scepter_annotator@annotator/ckpts/dpt_hybrid-midas-501f0c75.pt"
+ - TYPE: recolorizing
+ REPAINTING_SCALE: 0.0
+ ANNOTATOR:
+ NAME: GrayAnnotator
+
+ SAMPLE_ARGS:
+ SAMPLE_STEPS: 28
+ SAMPLER: flow_euler
+ SEED: 42
+ IMAGE_SIZE: [ 1024, 1024 ]
+ GUIDE_SCALE: 50
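The `${FLUX_FILL_PATH}` and `${ACE_PLUS_FFT_MODEL}` placeholders above are resolved outside this file. A minimal sketch of loading the config with those placeholders expanded from environment variables (the paths are hypothetical and the real scepter loader may resolve them differently):

import os
import yaml

# Hypothetical locations; the actual checkpoints are supplied by the user.
os.environ.setdefault("FLUX_FILL_PATH", "/models/FLUX.1-Fill-dev")
os.environ.setdefault("ACE_PLUS_FFT_MODEL", "/models/ace_plus_fft.safetensors")

with open("config/ace_plus_fft.yaml") as f:
    # Expand ${FLUX_FILL_PATH} / ${ACE_PLUS_FFT_MODEL} before parsing the YAML.
    cfg = yaml.safe_load(os.path.expandvars(f.read()))

print(cfg["NAME"], cfg["VERSION"])  # ACEInference fft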
modules/__init__.py ADDED
@@ -0,0 +1,6 @@
+ from .flux import FluxMRACEPlus, FluxMRModiACEPlus
+ from .ace_plus_dataset import ACEPlusDataset
+ from .ace_plus_ldm import LatentDiffusionACEPlus
+ from .ace_plus_solver import FormalACEPlusSolver
+ from .embedder import ACEHFEmbedder, T5ACEPlusClipFluxEmbedder
+ from .checkpoint import ACECheckpointHook, ACEBackwardHook
modules/ace_plus_dataset.py ADDED
@@ -0,0 +1,280 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ import math
4
+ import re, io
5
+ import numpy as np
6
+ import random, torch
7
+ from PIL import Image
8
+ import torchvision.transforms as T
9
+ from collections import defaultdict
10
+ from scepter.modules.data.dataset.registry import DATASETS
11
+ from scepter.modules.data.dataset.base_dataset import BaseDataset
12
+ from scepter.modules.transform.io import pillow_convert
13
+ from scepter.modules.utils.directory import osp_path
14
+ from scepter.modules.utils.file_system import FS
15
+ from torchvision.transforms import InterpolationMode
16
+ def load_image(prefix, img_path, cvt_type=None):
17
+ if img_path is None or img_path == '':
18
+ return None
19
+ img_path = osp_path(prefix, img_path)
20
+ with FS.get_object(img_path) as image_bytes:
21
+ image = Image.open(io.BytesIO(image_bytes))
22
+ if cvt_type is not None:
23
+ image = pillow_convert(image, cvt_type)
24
+ return image
25
+ def transform_image(image, std = 0.5, mean = 0.5):
26
+ return (image.permute(2, 0, 1)/255. - mean)/std
27
+ def transform_mask(mask):
28
+ return mask.unsqueeze(0)/255.
29
+ def ensure_src_align_target_h_mode(src_image, size, image_id, interpolation=InterpolationMode.BILINEAR):
30
+ # padding mode
31
+ H, W = size
32
+ ret_image = []
33
+ for one_id in image_id:
34
+ edit_image = src_image[one_id]
35
+ _, eH, eW = edit_image.shape
36
+ scale = H/eH
37
+ tH, tW = H, int(eW * scale)
38
+ ret_image.append(T.Resize((tH, tW), interpolation=interpolation, antialias=True)(edit_image))
39
+ return ret_image
40
+
41
+ def ensure_src_align_target_padding_mode(src_image, size, image_id, size_h = [], interpolation=InterpolationMode.BILINEAR):
42
+ # padding mode
43
+ H, W = size
44
+
45
+ ret_data = []
46
+ ret_h = []
47
+ for idx, one_id in enumerate(image_id):
48
+ if len(size_h) < 1:
49
+ rH = random.randint(int(H / 3), int(H))
50
+ else:
51
+ rH = size_h[idx]
52
+ ret_h.append(rH)
53
+ edit_image = src_image[one_id]
54
+ _, eH, eW = edit_image.shape
55
+ scale = rH/eH
56
+ tH, tW = rH, int(eW * scale)
57
+ edit_image = T.Resize((tH, tW), interpolation=interpolation, antialias=True)(edit_image)
58
+ # padding
59
+ delta_w = 0
60
+ delta_h = H - tH
61
+ padding = (delta_w // 2, delta_h // 2, delta_w - (delta_w // 2), delta_h - (delta_h // 2))
62
+ ret_data.append(T.Pad(padding, fill=0, padding_mode="constant")(edit_image).float())
63
+ return ret_data, ret_h
64
+
65
+ def ensure_limit_sequence(image, max_seq_len = 4096, d = 16, interpolation=InterpolationMode.BILINEAR):
66
+ # resize image for max_seq_len, while keep the aspect ratio
67
+ H, W = image.shape[-2:]
68
+ scale = min(1.0, math.sqrt(max_seq_len / ((H / d) * (W / d))))
69
+ rH = int(H * scale) // d * d # ensure divisible by self.d
70
+ rW = int(W * scale) // d * d
71
+ # print(f"{H} {W} -> {rH} {rW}")
72
+ image = T.Resize((rH, rW), interpolation=interpolation, antialias=True)(image)
73
+ return image
74
+
75
+ @DATASETS.register_class()
76
+ class ACEPlusDataset(BaseDataset):
77
+ para_dict = {
78
+ "DELIMITER": {
79
+ "value": "#;#",
80
+ "description": "The delimiter for records of data list."
81
+ },
82
+ "FIELDS": {
83
+ "value": ["data_type", "edit_image", "edit_mask", "ref_image", "target_image", "prompt"],
84
+ "description": "The fields for every record."
85
+ },
86
+ "PATH_PREFIX": {
87
+ "value": "",
88
+ "description": "The path prefix for every input image."
89
+ },
90
+ "EDIT_TYPE_LIST": {
91
+ "value": [],
92
+ "description": "The edit type list to be trained for data list."
93
+ },
94
+ "MAX_SEQ_LEN": {
95
+ "value": 4096,
96
+ "description": "The max sequence length for input image."
97
+ },
98
+ "D": {
99
+ "value": 16,
100
+ "description": "Patch size for resized image."
101
+ }
102
+ }
103
+ para_dict.update(BaseDataset.para_dict)
104
+ def __init__(self, cfg, logger=None):
105
+ super().__init__(cfg, logger=logger)
106
+ delimiter = cfg.get("DELIMITER", "#;#")
107
+ fields = cfg.get("FIELDS", [])
108
+ prefix = cfg.get("PATH_PREFIX", "")
109
+ edit_type_list = cfg.get("EDIT_TYPE_LIST", [])
110
+ self.modify_mode = cfg.get("MODIFY_MODE", True)
111
+ self.max_seq_len = cfg.get("MAX_SEQ_LEN", 4096)
112
+ self.repaiting_scale = cfg.get("REPAINTING_SCALE", 0.5)
113
+ self.d = cfg.get("D", 16)
114
+ prompt_file = cfg.DATA_LIST
115
+ self.items = self.read_data_list(delimiter,
116
+ fields,
117
+ prefix,
118
+ edit_type_list,
119
+ prompt_file)
120
+ random.shuffle(self.items)
121
+ use_num = int(cfg.get('USE_NUM', -1))
122
+ if use_num > 0:
123
+ self.items = self.items[:use_num]
124
+ def read_data_list(self, delimiter,
125
+ fields,
126
+ prefix,
127
+ edit_type_list,
128
+ prompt_file):
129
+ with FS.get_object(prompt_file) as local_data:
130
+ rows = local_data.decode('utf-8').strip().split('\n')
131
+ items = list()
132
+ dtype_level_num = {}
133
+ for i, row in enumerate(rows):
134
+ item = {"prefix": prefix}
135
+ for key, val in zip(fields, row.split(delimiter)):
136
+ item[key] = val
137
+ edit_type = item["data_type"]
138
+ if len(edit_type_list) > 0:
139
+ for re_pattern in edit_type_list:
140
+ if re.match(re_pattern, edit_type):
141
+ items.append(item)
142
+ if edit_type not in dtype_level_num:
143
+ dtype_level_num[edit_type] = 0
144
+ dtype_level_num[edit_type] += 1
145
+ break
146
+ else:
147
+ items.append(item)
148
+ if edit_type not in dtype_level_num:
149
+ dtype_level_num[edit_type] = 0
150
+ dtype_level_num[edit_type] += 1
151
+ for edit_type in dtype_level_num:
152
+ self.logger.info(f"{edit_type} has {dtype_level_num[edit_type]} samples.")
153
+ return items
154
+ def __len__(self):
155
+ return len(self.items)
156
+
157
+ def __getitem__(self, index):
158
+ item = self._get(index)
159
+ return self.pipeline(item)
160
+
161
+ def _get(self, index):
162
+ # normalize
163
+ sample_id = index%len(self)
164
+ index = self.items[index%len(self)]
165
+ prefix = index.get("prefix", "")
166
+ edit_image = index.get("edit_image", "")
167
+ edit_mask = index.get("edit_mask", "")
168
+ ref_image = index.get("ref_image", "")
169
+ target_image = index.get("target_image", "")
170
+ prompt = index.get("prompt", "")
171
+
172
+ edit_image = load_image(prefix, edit_image, cvt_type="RGB") if edit_image != "" else None
173
+ edit_mask = load_image(prefix, edit_mask, cvt_type="L") if edit_mask != "" else None
174
+ ref_image = load_image(prefix, ref_image, cvt_type="RGB") if ref_image != "" else None
175
+ target_image = load_image(prefix, target_image, cvt_type="RGB") if target_image != "" else None
176
+ assert target_image is not None
177
+
178
+ edit_id, ref_id, src_image_list, src_mask_list = [], [], [], []
179
+ # parse editing image
180
+ if edit_image is None:
181
+ edit_image = Image.new("RGB", target_image.size, (255, 255, 255))
182
+ edit_mask = Image.new("L", edit_image.size, 255)
183
+ elif edit_mask is None:
184
+ edit_mask = Image.new("L", edit_image.size, 255)
185
+ src_image_list.append(edit_image)
186
+ edit_id.append(0)
187
+ src_mask_list.append(edit_mask)
188
+ # parse reference image
189
+ if ref_image is not None:
190
+ src_image_list.append(ref_image)
191
+ ref_id.append(1)
192
+ src_mask_list.append(Image.new("L", ref_image.size, 0))
193
+
194
+ image = transform_image(torch.tensor(np.array(target_image).astype(np.float32)))
195
+ if edit_mask is not None:
196
+ image_mask = transform_mask(torch.tensor(np.array(edit_mask).astype(np.float32)))
197
+ else:
198
+ image_mask = Image.new("L", target_image.size, 255)
199
+ image_mask = transform_mask(torch.tensor(np.array(image_mask).astype(np.float32)))
200
+
201
+
202
+ src_image_list = [transform_image(torch.tensor(np.array(im).astype(np.float32))) for im in src_image_list]
203
+ src_mask_list = [transform_mask(torch.tensor(np.array(im).astype(np.float32))) for im in src_mask_list]
204
+
205
+ # decide the repainting scale for the editing task
206
+ if len(ref_id) > 0:
207
+ repainting_scale = 1.0
208
+ else:
209
+ repainting_scale = self.repaiting_scale
210
+ for e_i in edit_id:
211
+ src_image_list[e_i] = src_image_list[e_i] * (1 - repainting_scale * src_mask_list[e_i])
212
+ size = image.shape[1:]
213
+ ref_image_list, ret_h = ensure_src_align_target_padding_mode(src_image_list, size,
214
+ image_id=ref_id,
215
+ interpolation=InterpolationMode.NEAREST_EXACT)
216
+ ref_mask_list, ret_h = ensure_src_align_target_padding_mode(src_mask_list, size,
217
+ size_h=ret_h,
218
+ image_id=ref_id,
219
+ interpolation=InterpolationMode.NEAREST_EXACT)
220
+
221
+ edit_image_list = ensure_src_align_target_h_mode(src_image_list, size,
222
+ image_id=edit_id,
223
+ interpolation=InterpolationMode.NEAREST_EXACT)
224
+ edit_mask_list = ensure_src_align_target_h_mode(src_mask_list, size,
225
+ image_id=edit_id,
226
+ interpolation=InterpolationMode.NEAREST_EXACT)
227
+
228
+
229
+
230
+ src_image_list = [torch.cat(ref_image_list + edit_image_list, dim=-1)]
231
+ src_mask_list = [torch.cat(ref_mask_list + edit_mask_list, dim=-1)]
232
+ image = torch.cat(ref_image_list + [image], dim=-1)
233
+ image_mask = torch.cat(ref_mask_list + [image_mask], dim=-1)
234
+
235
+ # limit max sequence length
236
+ image = ensure_limit_sequence(image, max_seq_len = self.max_seq_len,
237
+ d = self.d, interpolation=InterpolationMode.BILINEAR)
238
+ image_mask = ensure_limit_sequence(image_mask, max_seq_len = self.max_seq_len,
239
+ d = self.d, interpolation=InterpolationMode.NEAREST_EXACT)
240
+ src_image_list = [ensure_limit_sequence(i, max_seq_len = self.max_seq_len,
241
+ d = self.d, interpolation=InterpolationMode.BILINEAR) for i in src_image_list]
242
+ src_mask_list = [ensure_limit_sequence(i, max_seq_len = self.max_seq_len,
243
+ d = self.d, interpolation=InterpolationMode.NEAREST_EXACT) for i in src_mask_list]
244
+
245
+ if self.modify_mode:
246
+ # To be modified regions according to mask
247
+ modify_image_list = [ii * im for ii, im in zip(src_image_list, src_mask_list)]
248
+ # To be edited regions according to mask
249
+ src_image_list = [ii * (1 - im) for ii, im in zip(src_image_list, src_mask_list)]
250
+ else:
251
+ src_image_list = src_image_list
252
+ modify_image_list = src_image_list
253
+
254
+ item = {
255
+ "src_image_list": src_image_list,
256
+ "src_mask_list": src_mask_list,
257
+ "modify_image_list": modify_image_list,
258
+ "image": image,
259
+ "image_mask": image_mask,
260
+ "edit_id": edit_id,
261
+ "ref_id": ref_id,
262
+ "prompt": prompt,
263
+ "edit_key": index["edit_key"] if "edit_key" in index else "",
264
+ "sample_id": sample_id
265
+ }
266
+ return item
267
+
268
+ @staticmethod
269
+ def collate_fn(batch):
270
+ collect = defaultdict(list)
271
+ for sample in batch:
272
+ for k, v in sample.items():
273
+ collect[k].append(v)
274
+ new_batch = dict()
275
+ for k, v in collect.items():
276
+ if all([i is None for i in v]):
277
+ new_batch[k] = None
278
+ else:
279
+ new_batch[k] = v
280
+ return new_batch
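For reference, `ACEPlusDataset` reads `DATA_LIST` as plain text, one record per line, with fields joined by `DELIMITER` in the configured `FIELDS` order. A hypothetical record (edit type and file names are invented for illustration) and how it is split:

# Hypothetical DATA_LIST record using the default DELIMITER ("#;#") and FIELDS order.
fields = ["data_type", "edit_image", "edit_mask", "ref_image", "target_image", "prompt"]
record = "#;#".join([
    "portrait_repainting",        # data_type (made-up edit type)
    "images/0001_edit.jpg",       # edit_image
    "masks/0001_mask.png",        # edit_mask
    "refs/0001_ref.jpg",          # ref_image
    "images/0001_target.jpg",     # target_image
    "fill the masked region with the person from the reference image",  # prompt
])
item = dict(zip(fields, record.split("#;#")))  # mirrors the parsing in read_data_list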
modules/ace_plus_ldm.py ADDED
@@ -0,0 +1,451 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ import torch
4
+ import torch.nn.functional as F
5
+ import copy
6
+ import math
7
+ import random
8
+ from contextlib import nullcontext
9
+ from einops import rearrange
10
+ from scepter.modules.model.network.ldm import LatentDiffusion
11
+ from scepter.modules.model.registry import MODELS, DIFFUSIONS, BACKBONES, LOSSES, TOKENIZERS, EMBEDDERS
12
+ from scepter.modules.model.utils.basic_utils import check_list_of_list, to_device, pack_imagelist_into_tensor, \
13
+ limit_batch_data, unpack_tensor_into_imagelist, count_params, disabled_train
14
+ from scepter.modules.utils.config import dict_to_yaml
15
+ from scepter.modules.utils.distribute import we
16
+
17
+ @MODELS.register_class()
18
+ class LatentDiffusionACEPlus(LatentDiffusion):
19
+ para_dict = LatentDiffusion.para_dict
20
+ def __init__(self, cfg, logger=None):
21
+ super().__init__(cfg, logger=logger)
22
+ self.guide_scale = cfg.get('GUIDE_SCALE', 1.0)
23
+
24
+ def init_params(self):
25
+ self.parameterization = self.cfg.get('PARAMETERIZATION', 'rf')
26
+ assert self.parameterization in [
27
+ 'eps', 'x0', 'v', 'rf'
28
+ ], 'currently only supporting "eps" and "x0" and "v" and "rf"'
29
+
30
+ diffusion_cfg = self.cfg.get("DIFFUSION", None)
31
+ assert diffusion_cfg is not None
32
+ if self.cfg.have("WORK_DIR"):
33
+ diffusion_cfg.WORK_DIR = self.cfg.WORK_DIR
34
+ self.diffusion = DIFFUSIONS.build(diffusion_cfg, logger=self.logger)
35
+
36
+ self.pretrained_model = self.cfg.get('PRETRAINED_MODEL', None)
37
+ self.ignore_keys = self.cfg.get('IGNORE_KEYS', [])
38
+
39
+ self.model_config = self.cfg.DIFFUSION_MODEL
40
+ self.first_stage_config = self.cfg.FIRST_STAGE_MODEL
41
+ self.cond_stage_config = self.cfg.COND_STAGE_MODEL
42
+ self.tokenizer_config = self.cfg.get('TOKENIZER', None)
43
+ self.loss_config = self.cfg.get('LOSS', None)
44
+
45
+ self.scale_factor = self.cfg.get('SCALE_FACTOR', 0.18215)
46
+ self.size_factor = self.cfg.get('SIZE_FACTOR', 16)
47
+ self.default_n_prompt = self.cfg.get('DEFAULT_N_PROMPT', '')
48
+ self.default_n_prompt = '' if self.default_n_prompt is None else self.default_n_prompt
49
+ self.p_zero = self.cfg.get('P_ZERO', 0.0)
50
+ self.train_n_prompt = self.cfg.get('TRAIN_N_PROMPT', '')
51
+ if self.default_n_prompt is None:
52
+ self.default_n_prompt = ''
53
+ if self.train_n_prompt is None:
54
+ self.train_n_prompt = ''
55
+ self.use_ema = self.cfg.get('USE_EMA', False)
56
+ self.model_ema_config = self.cfg.get('DIFFUSION_MODEL_EMA', None)
57
+
58
+ def construct_network(self):
59
+ # embedding_context = torch.device("meta") if self.model_config.get("PRETRAINED_MODEL", None) else nullcontext()
60
+ # with embedding_context:
61
+ self.model = BACKBONES.build(self.model_config, logger=self.logger).to(torch.bfloat16)
62
+ self.logger.info('all parameters:{}'.format(count_params(self.model)))
63
+ if self.use_ema:
64
+ if self.model_ema_config:
65
+ self.model_ema = BACKBONES.build(self.model_ema_config,
66
+ logger=self.logger)
67
+ else:
68
+ self.model_ema = copy.deepcopy(self.model)
69
+ self.model_ema = self.model_ema.eval()
70
+ for param in self.model_ema.parameters():
71
+ param.requires_grad = False
72
+ if self.loss_config:
73
+ self.loss = LOSSES.build(self.loss_config, logger=self.logger)
74
+ if self.tokenizer_config is not None:
75
+ self.tokenizer = TOKENIZERS.build(self.tokenizer_config,
76
+ logger=self.logger)
77
+ if self.first_stage_config:
78
+ self.first_stage_model = MODELS.build(self.first_stage_config,
79
+ logger=self.logger)
80
+ self.first_stage_model = self.first_stage_model.eval()
81
+ self.first_stage_model.train = disabled_train
82
+ for param in self.first_stage_model.parameters():
83
+ param.requires_grad = False
84
+ else:
85
+ self.first_stage_model = None
86
+ if self.tokenizer_config is not None:
87
+ self.cond_stage_config.KWARGS = {
88
+ 'vocab_size': self.tokenizer.vocab_size
89
+ }
90
+ if self.cond_stage_config == '__is_unconditional__':
91
+ print(
92
+ f'Training {self.__class__.__name__} as an unconditional model.'
93
+ )
94
+ self.cond_stage_model = None
95
+ else:
96
+ model = EMBEDDERS.build(self.cond_stage_config, logger=self.logger)
97
+ self.cond_stage_model = model.eval().requires_grad_(False)
98
+ self.cond_stage_model.train = disabled_train
99
+
100
+ @torch.no_grad()
101
+ def encode_first_stage(self, x, **kwargs):
102
+ def run_one_image(u):
103
+ zu = self.first_stage_model.encode(u)
104
+ if isinstance(zu, (tuple, list)):
105
+ zu = zu[0]
106
+ return zu
107
+
108
+ z = [run_one_image(u.unsqueeze(0) if u.dim() == 3 else u) for u in x]
109
+ return z
110
+
111
+ @torch.no_grad()
112
+ def decode_first_stage(self, z):
113
+ return [self.first_stage_model.decode(zu) for zu in z]
114
+ def noise_sample(self, num_samples, h, w, seed, dtype=torch.bfloat16):
115
+ noise = torch.randn(
116
+ num_samples,
117
+ 16,
118
+ # allow for packing
119
+ 2 * math.ceil(h / 16),
120
+ 2 * math.ceil(w / 16),
121
+ device=we.device_id,
122
+ dtype=dtype,
123
+ generator=torch.Generator(device=we.device_id).manual_seed(seed),
124
+ )
125
+ return noise
126
+ def resize_func(self, x, size):
127
+ if x is None: return x
128
+ return F.interpolate(x.unsqueeze(0), size = size, mode='nearest-exact')
129
+ def parse_ref_and_edit(self, src_image,
130
+ modify_image,
131
+ src_image_mask,
132
+ text_embedding,
133
+ #text_mask,
134
+ edit_id):
135
+ edit_image = []
136
+ modi_image = []
137
+ edit_mask = []
138
+ ref_image = []
139
+ ref_mask = []
140
+ ref_context = []
141
+ ref_y = []
142
+ ref_id = []
143
+ txt = []
144
+ txt_y = []
145
+ for sample_id, (one_src,
146
+ one_modify,
147
+ one_src_mask,
148
+ one_text_embedding,
149
+ one_text_y,
150
+ # one_text_mask,
151
+ one_edit_id) in enumerate(zip(src_image,
152
+ modify_image,
153
+ src_image_mask,
154
+ text_embedding["context"],
155
+ text_embedding["y"],
156
+ #text_mask,
157
+ edit_id)
158
+ ):
159
+ ref_id.append([i for i in range(len(one_src))])
160
+ if hasattr(self, "ref_cond_stage_model") and self.ref_cond_stage_model:
161
+ ref_image.append(self.ref_cond_stage_model.encode_list([((i + 1.0) / 2.0 * 255).type(torch.uint8) for i in one_src]))
162
+ else:
163
+ ref_image.append(one_src)
164
+ ref_mask.append(one_src_mask)
165
+ # process edit image & edit image mask
166
+ current_edit_image = to_device([one_src[i] for i in one_edit_id], strict=False)
167
+ current_edit_image = [v.squeeze(0) for v in self.encode_first_stage(current_edit_image)]
168
+ # process modi image
169
+ current_modify_image = to_device([one_modify[i] for i in one_edit_id],
170
+ strict=False)
171
+ current_modify_image = [
172
+ v.squeeze(0)
173
+ for v in self.encode_first_stage(current_modify_image)
174
+ ]
175
+ current_edit_image_mask = to_device(
176
+ [one_src_mask[i] for i in one_edit_id], strict=False)
177
+ current_edit_image_mask = [
178
+ self.reshape_func(m).squeeze(0)
179
+ for m in current_edit_image_mask
180
+ ]
181
+
182
+ edit_image.append(current_edit_image)
183
+ modi_image.append(current_modify_image)
184
+ edit_mask.append(current_edit_image_mask)
185
+ ref_context.append(one_text_embedding[:len(ref_id[-1])])
186
+ ref_y.append(one_text_y[:len(ref_id[-1])])
187
+ if not sum(len(src_) for src_ in src_image) > 0:
188
+ ref_image = None
189
+ ref_context = None
190
+ ref_y = None
191
+ for sample_id, (one_text_embedding, one_text_y) in enumerate(zip(text_embedding["context"],
192
+ text_embedding["y"])):
193
+ txt.append(one_text_embedding[-1].squeeze(0))
194
+ txt_y.append(one_text_y[-1])
195
+ return {
196
+ "edit": edit_image,
197
+ 'modify': modi_image,
198
+ "edit_mask": edit_mask,
199
+ "edit_id": edit_id,
200
+ "ref_context": ref_context,
201
+ "ref_y": ref_y,
202
+ "context": txt,
203
+ "y": txt_y,
204
+ "ref_x": ref_image,
205
+ "ref_mask": ref_mask,
206
+ "ref_id": ref_id
207
+ }
208
+
209
+
210
+ def reshape_func(self, mask):
211
+ mask = mask.to(torch.bfloat16)
212
+ mask = mask.view((-1, mask.shape[-2], mask.shape[-1]))
213
+ mask = rearrange(
214
+ mask,
215
+ "c (h ph) (w pw) -> c (ph pw) h w",
216
+ ph=8,
217
+ pw=8,
218
+ )
219
+ return mask
220
+
221
+ def forward_train(self,
222
+ src_image_list=[],
223
+ modify_image_list=[],
224
+ src_mask_list=[],
225
+ edit_id=[],
226
+ image=None,
227
+ image_mask=None,
228
+ noise=None,
229
+ prompt=[],
230
+ **kwargs):
231
+ '''
232
+ Args:
233
+ src_image: list of list of src_image
234
+ src_image_mask: list of list of src_image_mask
235
+ image: target image
236
+ image_mask: target image mask
237
+ noise: default is None, generate automaticly
238
+ ref_prompt: list of list of text
239
+ prompt: list of text
240
+ **kwargs:
241
+ Returns:
242
+ '''
243
+ assert check_list_of_list(src_image_list) and check_list_of_list(
244
+ src_mask_list)
245
+ assert self.cond_stage_model is not None
246
+
247
+ gc_seg = kwargs.pop("gc_seg", [])
248
+ gc_seg = int(gc_seg[0]) if len(gc_seg) > 0 else 0
249
+ align = kwargs.pop("align", [])
250
+ prompt_ = [[pp] if isinstance(pp, str) else pp for pp in prompt]
251
+ if len(align) < 1: align = [0] * len(prompt_)
252
+ context = getattr(self.cond_stage_model, 'encode_list_of_list')(prompt_)
253
+ guide_scale = self.guide_scale
254
+ if guide_scale is not None:
255
+ guide_scale = torch.full((len(prompt_),), guide_scale, device=we.device_id)
256
+ else:
257
+ guide_scale = None
258
+ # image and image_mask
259
+ # print("is list of list", check_list_of_list(image))
260
+ if check_list_of_list(image):
261
+ image = [to_device(ix) for ix in image]
262
+ x_start = [self.encode_first_stage(ix, **kwargs) for ix in image]
263
+ noise = [[torch.randn_like(ii) for ii in ix] for ix in x_start]
264
+ x_start = [torch.cat(ix, dim=-1) for ix in x_start]
265
+ noise = [torch.cat(ix, dim=-1) for ix in noise]
266
+
267
+ noise, _ = pack_imagelist_into_tensor(noise)
268
+
269
+ image_mask = [to_device(im, strict=False) for im in image_mask]
270
+ x_mask = [[self.reshape_func(i).squeeze(0) for i in im] if im is not None else [None] * len(ix) for ix, im in zip(image, image_mask)]
271
+ x_mask = [torch.cat(im, dim=-1) for im in x_mask]
272
+ else:
273
+ image = to_device(image)
274
+ x_start = self.encode_first_stage(image, **kwargs)
275
+ image_mask = to_device(image_mask, strict=False)
276
+ x_mask = [self.reshape_func(i).squeeze(0) for i in image_mask] if image_mask is not None else [None] * len(
277
+ image)
278
+ loss_mask, _ = pack_imagelist_into_tensor(
279
+ tuple(torch.ones_like(ix, dtype=torch.bool, device=ix.device) for ix in x_start))
280
+ x_start, x_shapes = pack_imagelist_into_tensor(x_start)
281
+ context['x_shapes'] = x_shapes
282
+ context['align'] = align
283
+ # process image mask
284
+
285
+ context['x_mask'] = x_mask
286
+ ref_edit_context = self.parse_ref_and_edit(src_image_list, modify_image_list, src_mask_list, context, edit_id)
287
+ context.update(ref_edit_context)
288
+
289
+ teacher_context = copy.deepcopy(context)
290
+ teacher_context["context"] = torch.cat(teacher_context["context"], dim=0)
291
+ teacher_context["y"] = torch.cat(teacher_context["y"], dim=0)
292
+ loss = self.diffusion.loss(x_0=x_start,
293
+ model=self.model,
294
+ model_kwargs={"cond": context,
295
+ "gc_seg": gc_seg,
296
+ "guidance": guide_scale},
297
+ noise=noise,
298
+ reduction='none',
299
+ **kwargs)
300
+ loss = loss[loss_mask].mean()
301
+ ret = {'loss': loss, 'probe_data': {'prompt': prompt}}
302
+ return ret
303
+
304
+ @torch.no_grad()
305
+ def forward_test(self,
306
+ src_image_list=[],
307
+ modify_image_list=[],
308
+ src_mask_list=[],
309
+ edit_id=[],
310
+ image=None,
311
+ image_mask=None,
312
+ prompt=[],
313
+ sampler='flow_euler',
314
+ sample_steps=20,
315
+ seed=2023,
316
+ guide_scale=3.5,
317
+ guide_rescale=0.0,
318
+ show_process=False,
319
+ log_num=-1,
320
+ **kwargs):
321
+ outputs = self.forward_editing(
322
+ src_image_list=src_image_list,
323
+ src_mask_list=src_mask_list,
324
+ modify_image_list=modify_image_list,
325
+ edit_id=edit_id,
326
+ image=image,
327
+ image_mask=image_mask,
328
+ prompt=prompt,
329
+ sampler=sampler,
330
+ sample_steps=sample_steps,
331
+ seed=seed,
332
+ guide_scale=guide_scale,
333
+ guide_rescale=guide_rescale,
334
+ show_process=show_process,
335
+ log_num=log_num,
336
+ **kwargs
337
+ )
338
+ return outputs
339
+
340
+ @torch.no_grad()
341
+ def forward_editing(self,
342
+ src_image_list=[],
343
+ modify_image_list=None,
344
+ src_mask_list=[],
345
+ edit_id=[],
346
+ image=None,
347
+ image_mask=None,
348
+ prompt=[],
349
+ sampler='flow_euler',
350
+ sample_steps=20,
351
+ seed=2023,
352
+ guide_scale=3.5,
353
+ log_num=-1,
354
+ **kwargs
355
+ ):
356
+ # gc_seg is unused
357
+ prompt, image, image_mask, src_image, modify_image, src_image_mask, edit_id = limit_batch_data(
358
+ [prompt, image, image_mask, src_image_list, modify_image_list, src_mask_list, edit_id], log_num)
359
+ assert check_list_of_list(src_image) and check_list_of_list(src_image_mask)
360
+ assert self.cond_stage_model is not None
361
+ align = kwargs.pop("align", [])
362
+ prompt_ = [[pp] if isinstance(pp, str) else pp for pp in prompt]
363
+ if len(align) < 1: align = [0] * len(prompt_)
364
+ context = getattr(self.cond_stage_model, 'encode_list_of_list')(prompt_)
365
+ guide_scale = guide_scale or self.guide_scale
366
+ if guide_scale is not None:
367
+ guide_scale = torch.full((len(prompt),), guide_scale, device=we.device_id)
368
+ else:
369
+ guide_scale = None
370
+ # image and image_mask
371
+ seed = seed if seed >= 0 else random.randint(0, 2 ** 32 - 1)
372
+ if image is not None:
373
+ if check_list_of_list(image):
374
+ image = [torch.cat(ix, dim=-1) for ix in image]
375
+ image_mask = [torch.cat(im, dim=-1) for im in image_mask]
376
+ noise = [self.noise_sample(1, ix.shape[1], ix.shape[2], seed) for ix in image]
377
+ else:
378
+ height, width = kwargs.pop("height"), kwargs.pop("width")
379
+ noise = [self.noise_sample(1, height, width, seed) for _ in prompt]
380
+ noise, x_shapes = pack_imagelist_into_tensor(noise)
381
+ context['x_shapes'] = x_shapes
382
+ context['align'] = align
383
+ # process image mask
384
+ image_mask = to_device(image_mask, strict=False)
385
+ x_mask = [self.reshape_func(i).squeeze(0) for i in image_mask]
386
+ context['x_mask'] = x_mask
387
+ ref_edit_context = self.parse_ref_and_edit(src_image, modify_image, src_image_mask, context, edit_id)
388
+ context.update(ref_edit_context)
389
+ # UNet use input n_prompt
390
+ # model = self.model_ema if self.use_ema and self.eval_ema else self.model
391
+ # import pdb;pdb.set_trace()
392
+ model = self.model
393
+ embedding_context = model.no_sync if isinstance(model, torch.distributed.fsdp.FullyShardedDataParallel) \
394
+ else nullcontext
395
+ with embedding_context():
396
+ samples = self.diffusion.sample(
397
+ noise=noise,
398
+ sampler=sampler,
399
+ model=self.model,
400
+ model_kwargs={"cond": context, "guidance": guide_scale, "gc_seg": -1
401
+ },
402
+ steps=sample_steps,
403
+ show_progress=True,
404
+ guide_scale=guide_scale,
405
+ return_intermediate=None,
406
+ **kwargs).float()
407
+ samples = unpack_tensor_into_imagelist(samples, x_shapes)
408
+ with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
409
+ x_samples = self.decode_first_stage(samples)
410
+ outputs = list()
411
+ for i in range(len(prompt)):
412
+ rec_img = torch.clamp((x_samples[i].float() + 1.0) / 2.0, min=0.0, max=1.0)
413
+ rec_img = rec_img.squeeze(0)
414
+ edit_imgs, modify_imgs, edit_img_masks = [], [], []
415
+ if src_image is not None and src_image[i] is not None:
416
+ if src_image_mask[i] is None:
417
+ src_image_mask[i] = [None] * len(src_image[i])
418
+ for edit_img, modify_img, edit_mask in zip(src_image[i], modify_image_list[i], src_image_mask[i]):
419
+ edit_img = torch.clamp((edit_img.float() + 1.0) / 2.0, min=0.0, max=1.0)
420
+ edit_imgs.append(edit_img.squeeze(0))
421
+ modify_img = torch.clamp((modify_img.float() + 1.0) / 2.0,
422
+ min=0.0,
423
+ max=1.0)
424
+ modify_imgs.append(modify_img.squeeze(0))
425
+ if edit_mask is None:
426
+ edit_mask = torch.ones_like(edit_img[[0], :, :])
427
+ edit_img_masks.append(edit_mask)
428
+ one_tup = {
429
+ 'reconstruct_image': rec_img,
430
+ 'instruction': prompt[i],
431
+ 'edit_image': edit_imgs if len(edit_imgs) > 0 else None,
432
+ 'modify_image': modify_imgs if len(modify_imgs) > 0 else None,
433
+ 'edit_mask': edit_img_masks if len(edit_imgs) > 0 else None
434
+ }
435
+ if image is not None:
436
+ if image_mask is None:
437
+ image_mask = [None] * len(image)
438
+ ori_img = torch.clamp((image[i] + 1.0) / 2.0, min=0.0, max=1.0)
439
+ one_tup['target_image'] = ori_img.squeeze(0)
440
+ one_tup['target_mask'] = image_mask[i] if image_mask[i] is not None else torch.ones_like(
441
+ ori_img[[0], :, :])
442
+ outputs.append(one_tup)
443
+ return outputs
444
+
445
+ @staticmethod
446
+ def get_config_template():
447
+ return dict_to_yaml('MODEL',
448
+ __class__.__name__,
449
+ LatentDiffusionACEPlus.para_dict,
450
+ set_name=True)
451
+
modules/ace_plus_solver.py ADDED
@@ -0,0 +1,181 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ import numpy as np
4
+ import torch
5
+ from scepter.modules.solver import LatentDiffusionSolver
6
+ from scepter.modules.solver.registry import SOLVERS
7
+ from scepter.modules.utils.data import transfer_data_to_cuda
8
+ from scepter.modules.utils.distribute import we
9
+ from scepter.modules.utils.probe import ProbeData
10
+ from tqdm import tqdm
11
+ @SOLVERS.register_class()
12
+ class FormalACEPlusSolver(LatentDiffusionSolver):
13
+ def __init__(self, cfg, logger=None):
14
+ super().__init__(cfg, logger=logger)
15
+ self.probe_prompt = cfg.get("PROBE_PROMPT", None)
16
+ self.probe_hw = cfg.get("PROBE_HW", [])
17
+
18
+ @torch.no_grad()
19
+ def run_eval(self):
20
+ self.eval_mode()
21
+ self.before_all_iter(self.hooks_dict[self._mode])
22
+ all_results = []
23
+ for batch_idx, batch_data in tqdm(
24
+ enumerate(self.datas[self._mode].dataloader)):
25
+ self.before_iter(self.hooks_dict[self._mode])
26
+ if self.sample_args:
27
+ batch_data.update(self.sample_args.get_lowercase_dict())
28
+ with torch.autocast(device_type='cuda',
29
+ enabled=self.use_amp,
30
+ dtype=self.dtype):
31
+ results = self.run_step_eval(transfer_data_to_cuda(batch_data),
32
+ batch_idx,
33
+ step=self.total_iter,
34
+ rank=we.rank)
35
+ all_results.extend(results)
36
+ self.after_iter(self.hooks_dict[self._mode])
37
+ log_data, log_label = self.save_results(all_results)
38
+ self.register_probe({'eval_label': log_label})
39
+ self.register_probe({
40
+ 'eval_image':
41
+ ProbeData(log_data,
42
+ is_image=True,
43
+ build_html=True,
44
+ build_label=log_label)
45
+ })
46
+ self.after_all_iter(self.hooks_dict[self._mode])
47
+
48
+ @torch.no_grad()
49
+ def run_test(self):
50
+ self.test_mode()
51
+ self.before_all_iter(self.hooks_dict[self._mode])
52
+ all_results = []
53
+ for batch_idx, batch_data in tqdm(
54
+ enumerate(self.datas[self._mode].dataloader)):
55
+ self.before_iter(self.hooks_dict[self._mode])
56
+ if self.sample_args:
57
+ batch_data.update(self.sample_args.get_lowercase_dict())
58
+ with torch.autocast(device_type='cuda',
59
+ enabled=self.use_amp,
60
+ dtype=self.dtype):
61
+ results = self.run_step_eval(transfer_data_to_cuda(batch_data),
62
+ batch_idx,
63
+ step=self.total_iter,
64
+ rank=we.rank)
65
+ all_results.extend(results)
66
+ self.after_iter(self.hooks_dict[self._mode])
67
+ log_data, log_label = self.save_results(all_results)
68
+ self.register_probe({'test_label': log_label})
69
+ self.register_probe({
70
+ 'test_image':
71
+ ProbeData(log_data,
72
+ is_image=True,
73
+ build_html=True,
74
+ build_label=log_label)
75
+ })
76
+
77
+ self.after_all_iter(self.hooks_dict[self._mode])
78
+
79
+ def run_step_val(self, batch_data, batch_idx=0, step=None, rank=None):
80
+ sample_id_list = batch_data['sample_id']
81
+ loss_dict = {}
82
+ with torch.autocast(device_type='cuda',
83
+ enabled=self.use_amp,
84
+ dtype=self.dtype):
85
+ results = self.model.forward_train(**batch_data)
86
+ loss = results['loss']
87
+ for sample_id in sample_id_list:
88
+ loss_dict[sample_id] = loss.detach().cpu().numpy()
89
+ return loss_dict
90
+
91
+ def save_results(self, results):
92
+ log_data, log_label = [], []
93
+ for result in results:
94
+ ret_images, ret_labels = [], []
95
+ edit_image = result.get('edit_image', None)
96
+ modify_image = result.get('modify_image', None)
97
+ edit_mask = result.get('edit_mask', None)
98
+ if edit_image is not None:
99
+ for i, edit_img in enumerate(result['edit_image']):
100
+ if edit_img is None:
101
+ continue
102
+ ret_images.append((edit_img.permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8))
103
+ ret_labels.append(f'edit_image{i}; ')
104
+ ret_images.append((modify_image[i].permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8))
105
+ ret_labels.append(f'modify_image{i}; ')
106
+ if edit_mask is not None:
107
+ ret_images.append((edit_mask[i].permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8))
108
+ ret_labels.append(f'edit_mask{i}; ')
109
+
110
+ target_image = result.get('target_image', None)
111
+ target_mask = result.get('target_mask', None)
112
+ if target_image is not None:
113
+ ret_images.append((target_image.permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8))
114
+ ret_labels.append(f'target_image; ')
115
+ if target_mask is not None:
116
+ ret_images.append((target_mask.permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8))
117
+ ret_labels.append(f'target_mask; ')
118
+ teacher_image = result.get('image', None)
119
+ if teacher_image is not None:
120
+ ret_images.append((teacher_image.permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8))
121
+ ret_labels.append(f"teacher_image")
122
+ reconstruct_image = result.get('reconstruct_image', None)
123
+ if reconstruct_image is not None:
124
+ ret_images.append((reconstruct_image.permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8))
125
+ ret_labels.append(f"{result['instruction']}")
126
+ log_data.append(ret_images)
127
+ log_label.append(ret_labels)
128
+ return log_data, log_label
129
+ @property
130
+ def probe_data(self):
131
+ if not we.debug and self.mode == 'train':
132
+ batch_data = transfer_data_to_cuda(self.current_batch_data[self.mode])
133
+ self.eval_mode()
134
+ with torch.autocast(device_type='cuda',
135
+ enabled=self.use_amp,
136
+ dtype=self.dtype):
137
+ batch_data['log_num'] = self.log_train_num
138
+ batch_data.update(self.sample_args.get_lowercase_dict())
139
+ results = self.run_step_eval(batch_data)
140
+ self.train_mode()
141
+ log_data, log_label = self.save_results(results)
142
+ self.register_probe({
143
+ 'train_image':
144
+ ProbeData(log_data,
145
+ is_image=True,
146
+ build_html=True,
147
+ build_label=log_label)
148
+ })
149
+ self.register_probe({'train_label': log_label})
150
+ if self.probe_prompt:
151
+ self.eval_mode()
152
+ all_results = []
153
+ for prompt in self.probe_prompt:
154
+ with torch.autocast(device_type='cuda',
155
+ enabled=self.use_amp,
156
+ dtype=self.dtype):
157
+ batch_data = {
158
+ "prompt": [[prompt]],
159
+ "image": [torch.zeros(3, self.probe_hw[0], self.probe_hw[1])],
160
+ "image_mask": [torch.ones(1, self.probe_hw[0], self.probe_hw[1])],
161
+ "src_image_list": [[]],
162
+ "modify_image_list": [[]],
163
+ "src_mask_list": [[]],
164
+ "edit_id": [[]],
165
+ "height": self.probe_hw[0],
166
+ "width": self.probe_hw[1]
167
+ }
168
+ batch_data.update(self.sample_args.get_lowercase_dict())
169
+ results = self.run_step_eval(batch_data)
170
+ all_results.extend(results)
171
+ self.train_mode()
172
+ log_data, log_label = self.save_results(all_results)
173
+ self.register_probe({
174
+ 'probe_image':
175
+ ProbeData(log_data,
176
+ is_image=True,
177
+ build_html=True,
178
+ build_label=log_label)
179
+ })
180
+
181
+ return super(LatentDiffusionSolver, self).probe_data
modules/checkpoint.py ADDED
@@ -0,0 +1,135 @@
+ # -*- coding: utf-8 -*-
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import os, torch
+ import os.path as osp
+ import warnings
+ from collections import OrderedDict
+ from safetensors.torch import save_file
+ from scepter.modules.solver.hooks import CheckpointHook, BackwardHook
+ from scepter.modules.solver.hooks.registry import HOOKS
+ from scepter.modules.utils.config import dict_to_yaml
+ from scepter.modules.utils.distribute import we
+ from scepter.modules.utils.file_system import FS
+
+ _DEFAULT_CHECKPOINT_PRIORITY = 300
+
+ def convert_to_comfyui_lora(ori_sd, prefix = "lora_unet"):
+     new_ckpt = OrderedDict()
+     for k, v in ori_sd.items():
+         new_k = k.replace(".lora_A.0_SwiftLoRA.", ".lora_down.").replace(".lora_B.0_SwiftLoRA.", ".lora_up.")
+         new_k = prefix + "_" + new_k.split(".lora")[0].replace("model.", "").replace(".", "_") + ".lora" + new_k.split(".lora")[1]
+         alpha_k = new_k.split(".lora")[0] + ".alpha"
+         new_ckpt[new_k] = v
+         if "lora_up" in new_k:
+             alpha = v.shape[-1]
+         elif "lora_down" in new_k:
+             alpha = v.shape[0]
+         new_ckpt[alpha_k] = torch.tensor(float(alpha)).to(v)
+     return new_ckpt
+
+ @HOOKS.register_class()
+ class ACECheckpointHook(CheckpointHook):
+     """ Checkpoint resume or save hook.
+     Args:
+         interval (int): Save interval, by epoch.
+         save_best (bool): Save the best checkpoint by a metric key, default is False.
+         save_best_by (str): How to get the best the checkpoint by the metric key, default is ''.
+             + means the higher the best (default).
+             - means the lower the best.
+             E.g. +acc@1, -err@1, acc@5(same as +acc@5)
+     """
+
+     def __init__(self, cfg, logger=None):
+         super(ACECheckpointHook, self).__init__(cfg, logger=logger)
+
+     def after_iter(self, solver):
+         super().after_iter(solver)
+         if solver.total_iter != 0 and (
+                 (solver.total_iter + 1) % self.interval == 0
+                 or solver.total_iter == solver.max_steps - 1):
+             from swift import SwiftModel
+             if isinstance(solver.model, SwiftModel) or (
+                     hasattr(solver.model, 'module')
+                     and isinstance(solver.model.module, SwiftModel)):
+                 save_path = osp.join(
+                     solver.work_dir,
+                     'checkpoints/{}-{}'.format(self.save_name_prefix,
+                                                solver.total_iter + 1))
+                 if we.rank == 0:
+                     tuner_model = os.path.join(save_path, '0_SwiftLoRA', 'adapter_model.bin')
+                     save_model = os.path.join(save_path, '0_SwiftLoRA', 'comfyui_model.safetensors')
+                     if FS.exists(tuner_model):
+                         with FS.get_from(tuner_model) as local_file:
+                             swift_lora_sd = torch.load(local_file, weights_only=True)
+                         safetensor_lora_sd = convert_to_comfyui_lora(swift_lora_sd)
+                         with FS.put_to(save_model) as local_file:
+                             save_file(safetensor_lora_sd, local_file)
+
+     @staticmethod
+     def get_config_template():
+         return dict_to_yaml('hook',
+                             __class__.__name__,
+                             ACECheckpointHook.para_dict,
+                             set_name=True)
+
+ @HOOKS.register_class()
+ class ACEBackwardHook(BackwardHook):
+     def grad_clip(self, optimizer):
+         for params_group in optimizer.param_groups:
+             train_params = []
+             for param in params_group['params']:
+                 if param.requires_grad:
+                     train_params.append(param)
+             # print(len(train_params), self.gradient_clip)
+             torch.nn.utils.clip_grad_norm_(parameters=train_params,
+                                            max_norm=self.gradient_clip)
+
+     def after_iter(self, solver):
+         if solver.optimizer is not None and solver.is_train_mode:
+             if solver.loss is None:
+                 warnings.warn(
+                     'solver.loss should not be None in train mode, remember to call solver._reduce_scalar()!'
+                 )
+                 return
+             if solver.scaler is not None:
+                 solver.scaler.scale(solver.loss /
+                                     self.accumulate_step).backward()
+                 self.current_step += 1
+                 # Suppose profiler run after backward, so we need to set backward_prev_step
+                 # as the previous one step before the backward step
+                 if self.current_step % self.accumulate_step == 0:
+                     solver.scaler.unscale_(solver.optimizer)
+                     if self.gradient_clip > 0:
+                         self.grad_clip(solver.optimizer)
+                     self.profile(solver)
+                     solver.scaler.step(solver.optimizer)
+                     solver.scaler.update()
+                     solver.optimizer.zero_grad()
+             else:
+                 (solver.loss / self.accumulate_step).backward()
+                 self.current_step += 1
+                 # Suppose profiler run after backward, so we need to set backward_prev_step
+                 # as the previous one step before the backward step
+                 if self.current_step % self.accumulate_step == 0:
+                     if self.gradient_clip > 0:
+                         self.grad_clip(solver.optimizer)
+                     self.profile(solver)
+                     solver.optimizer.step()
+                     solver.optimizer.zero_grad()
+             if solver.lr_scheduler:
+                 if self.current_step % self.accumulate_step == 0:
+                     solver.lr_scheduler.step()
+             if self.current_step % self.accumulate_step == 0:
+                 setattr(solver, 'backward_step', True)
+                 self.current_step = 0
+             else:
+                 setattr(solver, 'backward_step', False)
+             solver.loss = None
+         if self.empty_cache_step > 0 and solver.total_iter % self.empty_cache_step == 0:
+             torch.cuda.empty_cache()
+
+     @staticmethod
+     def get_config_template():
+         return dict_to_yaml('hook',
+                             __class__.__name__,
+                             ACEBackwardHook.para_dict,
+                             set_name=True)
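As a rough illustration of the key remapping performed by `convert_to_comfyui_lora` above (the layer name and rank below are invented for the example, not taken from the commit):

import torch
from collections import OrderedDict

# One SWIFT-style LoRA pair for a hypothetical FLUX attention projection, rank 16.
sd = OrderedDict({
    "model.double_blocks.0.img_attn.qkv.lora_A.0_SwiftLoRA.weight": torch.zeros(16, 3072),
    "model.double_blocks.0.img_attn.qkv.lora_B.0_SwiftLoRA.weight": torch.zeros(9216, 16),
})
out = convert_to_comfyui_lora(sd)
# Resulting keys:
#   lora_unet_double_blocks_0_img_attn_qkv.lora_down.weight
#   lora_unet_double_blocks_0_img_attn_qkv.lora_up.weight
#   lora_unet_double_blocks_0_img_attn_qkv.alpha  -> tensor(16.), i.e. the LoRA rank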
modules/embedder.py ADDED
@@ -0,0 +1,219 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ # This file contains code that is adapted from
4
+ # https://github.com/black-forest-labs/flux.git
5
+ import warnings
6
+
7
+ import torch
8
+ import torch.utils.dlpack
9
+ import transformers
10
+ from scepter.modules.model.embedder.base_embedder import BaseEmbedder
11
+ from scepter.modules.model.registry import EMBEDDERS
12
+ from scepter.modules.model.tokenizer.tokenizer_component import (
13
+ basic_clean, canonicalize, whitespace_clean)
14
+ from scepter.modules.utils.config import dict_to_yaml
15
+ from scepter.modules.utils.file_system import FS
16
+
17
+ try:
18
+ from transformers import AutoTokenizer, T5EncoderModel
19
+ except Exception as e:
20
+ warnings.warn(
21
+ f'Import transformers error, please deal with this problem: {e}')
22
+
23
+ @EMBEDDERS.register_class()
24
+ class ACEHFEmbedder(BaseEmbedder):
25
+ para_dict = {
26
+ "HF_MODEL_CLS": {
27
+ "value": None,
28
+ "description": "huggingface cls in transfomer"
29
+ },
30
+ "MODEL_PATH": {
31
+ "value": None,
32
+ "description": "model folder path"
33
+ },
34
+ "HF_TOKENIZER_CLS": {
35
+ "value": None,
36
+ "description": "huggingface cls in transfomer"
37
+ },
38
+
39
+ "TOKENIZER_PATH": {
40
+ "value": None,
41
+ "description": "tokenizer folder path"
42
+ },
43
+ "MAX_LENGTH": {
44
+ "value": 77,
45
+ "description": "max length of input"
46
+ },
47
+ "OUTPUT_KEY": {
48
+ "value": "last_hidden_state",
49
+ "description": "output key"
50
+ },
51
+ "D_TYPE": {
52
+ "value": "float",
53
+ "description": "dtype"
54
+ },
55
+ "BATCH_INFER": {
56
+ "value": False,
57
+ "description": "batch infer"
58
+ }
59
+ }
60
+ para_dict.update(BaseEmbedder.para_dict)
61
+ def __init__(self, cfg, logger=None):
62
+ super().__init__(cfg, logger=logger)
63
+ hf_model_cls = cfg.get('HF_MODEL_CLS', None)
64
+ model_path = cfg.get("MODEL_PATH", None)
65
+ hf_tokenizer_cls = cfg.get('HF_TOKENIZER_CLS', None)
66
+ tokenizer_path = cfg.get('TOKENIZER_PATH', None)
67
+ self.max_length = cfg.get('MAX_LENGTH', 77)
68
+ self.output_key = cfg.get("OUTPUT_KEY", "last_hidden_state")
69
+ self.d_type = cfg.get("D_TYPE", "float")
70
+ self.clean = cfg.get("CLEAN", "whitespace")
71
+ self.batch_infer = cfg.get("BATCH_INFER", False)
72
+ self.added_identifier = cfg.get('ADDED_IDENTIFIER', None)
73
+ torch_dtype = getattr(torch, self.d_type)
74
+
75
+ assert hf_model_cls is not None and hf_tokenizer_cls is not None
76
+ assert model_path is not None and tokenizer_path is not None
77
+ with FS.get_dir_to_local_dir(tokenizer_path, wait_finish=True) as local_path:
78
+ self.tokenizer = getattr(transformers, hf_tokenizer_cls).from_pretrained(local_path,
79
+ max_length = self.max_length,
80
+ torch_dtype = torch_dtype,
81
+ additional_special_tokens=self.added_identifier)
82
+
83
+ with FS.get_dir_to_local_dir(model_path, wait_finish=True) as local_path:
84
+ self.hf_module = getattr(transformers, hf_model_cls).from_pretrained(local_path, torch_dtype = torch_dtype)
85
+
86
+
87
+ self.hf_module = self.hf_module.eval().requires_grad_(False)
88
+
89
+ def forward(self, text: list[str], return_mask = False):
90
+ batch_encoding = self.tokenizer(
91
+ text,
92
+ truncation=True,
93
+ max_length=self.max_length,
94
+ return_length=False,
95
+ return_overflowing_tokens=False,
96
+ padding="max_length",
97
+ return_tensors="pt",
98
+ )
99
+
100
+ outputs = self.hf_module(
101
+ input_ids=batch_encoding["input_ids"].to(self.hf_module.device),
102
+ attention_mask=None,
103
+ output_hidden_states=False,
104
+ )
105
+ if return_mask:
106
+ return outputs[self.output_key], batch_encoding['attention_mask'].to(self.hf_module.device)
107
+ else:
108
+ return outputs[self.output_key], None
109
+
110
+ def encode(self, text, return_mask = False):
111
+ if isinstance(text, str):
112
+ text = [text]
113
+ if self.clean:
114
+ text = [self._clean(u) for u in text]
115
+ if not self.batch_infer:
116
+ cont, mask = [], []
117
+ for tt in text:
118
+ one_cont, one_mask = self([tt], return_mask=return_mask)
119
+ cont.append(one_cont)
120
+ mask.append(one_mask)
121
+ if return_mask:
122
+ return torch.cat(cont, dim=0), torch.cat(mask, dim=0)
123
+ else:
124
+ return torch.cat(cont, dim=0)
125
+ else:
126
+ ret_data = self(text, return_mask = return_mask)
127
+ if return_mask:
128
+ return ret_data
129
+ else:
130
+ return ret_data[0]
131
+
132
+ def encode_list(self, text_list, return_mask=True):
133
+ cont_list = []
134
+ mask_list = []
135
+ for pp in text_list:
136
+ cont = self.encode(pp, return_mask=return_mask)
137
+ cont_list.append(cont[0]) if return_mask else cont_list.append(cont)
138
+ mask_list.append(cont[1]) if return_mask else mask_list.append(None)
139
+ if return_mask:
140
+ return cont_list, mask_list
141
+ else:
142
+ return cont_list
143
+
144
+ def encode_list_of_list(self, text_list, return_mask=True):
145
+ cont_list = []
146
+ mask_list = []
147
+ for pp in text_list:
148
+ cont = self.encode_list(pp, return_mask=return_mask)
149
+ cont_list.append(cont[0]) if return_mask else cont_list.append(cont)
150
+ mask_list.append(cont[1]) if return_mask else mask_list.append(None)
151
+ if return_mask:
152
+ return cont_list, mask_list
153
+ else:
154
+ return cont_list
155
+
156
+ def _clean(self, text):
157
+ if self.clean == 'whitespace':
158
+ text = whitespace_clean(basic_clean(text))
159
+ elif self.clean == 'lower':
160
+ text = whitespace_clean(basic_clean(text)).lower()
161
+ elif self.clean == 'canonicalize':
162
+ text = canonicalize(basic_clean(text))
163
+ return text
164
+ @staticmethod
165
+ def get_config_template():
166
+ return dict_to_yaml('EMBEDDER',
167
+ __class__.__name__,
168
+ ACEHFEmbedder.para_dict,
169
+ set_name=True)
170
+
171
+ @EMBEDDERS.register_class()
172
+ class T5ACEPlusClipFluxEmbedder(BaseEmbedder):
173
+ """
174
+ Uses the OpenCLIP transformer encoder for text
175
+ """
176
+ para_dict = {
177
+ 'T5_MODEL': {},
178
+ 'CLIP_MODEL': {}
179
+ }
180
+
181
+ def __init__(self, cfg, logger=None):
182
+ super().__init__(cfg, logger=logger)
183
+ self.t5_model = EMBEDDERS.build(cfg.T5_MODEL, logger=logger)
184
+ self.clip_model = EMBEDDERS.build(cfg.CLIP_MODEL, logger=logger)
185
+
186
+ def encode(self, text, return_mask = False):
187
+ t5_embeds = self.t5_model.encode(text, return_mask = return_mask)
188
+ clip_embeds = self.clip_model.encode(text, return_mask = return_mask)
189
+ # change embedding strategy here
190
+ return {
191
+ 'context': t5_embeds,
192
+ 'y': clip_embeds,
193
+ }
194
+
195
+ def encode_list(self, text, return_mask = False):
196
+ t5_embeds = self.t5_model.encode_list(text, return_mask = return_mask)
197
+ clip_embeds = self.clip_model.encode_list(text, return_mask = return_mask)
198
+ # change embedding strategy here
199
+ return {
200
+ 'context': t5_embeds,
201
+ 'y': clip_embeds,
202
+ }
203
+
204
+ def encode_list_of_list(self, text, return_mask = False):
205
+ t5_embeds = self.t5_model.encode_list_of_list(text, return_mask = return_mask)
206
+ clip_embeds = self.clip_model.encode_list_of_list(text, return_mask = return_mask)
207
+ # change embedding strategy here
208
+ return {
209
+ 'context': t5_embeds,
210
+ 'y': clip_embeds,
211
+ }
212
+
213
+
214
+ @staticmethod
215
+ def get_config_template():
216
+ return dict_to_yaml('EMBEDDER',
217
+ __class__.__name__,
218
+ T5ACEPlusClipFluxEmbedder.para_dict,
219
+ set_name=True)
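Editorial sketch (not part of this commit) of how the two outputs of T5ACEPlusClipFluxEmbedder are consumed downstream: 'context' feeds a per-token projection and 'y' feeds a pooled-vector MLP. Layer sizes follow the defaults in Flux.para_dict below (CONTEXT_IN_DIM=4096, VEC_IN_DIM=768, HIDDEN_SIZE=1024); the random tensors stand in for real encoder outputs.

import torch
from torch import nn

# 'context': T5 token features, 'y': pooled CLIP embedding (shapes assumed).
context = torch.randn(2, 512, 4096)
y = torch.randn(2, 768)

txt_in = nn.Linear(4096, 1024)          # mirrors Flux.txt_in (CONTEXT_IN_DIM -> HIDDEN_SIZE)
vector_in = nn.Sequential(              # mirrors the MLPEmbedder used for Flux.vector_in
    nn.Linear(768, 1024), nn.SiLU(), nn.Linear(1024, 1024))

txt = txt_in(context)                   # [2, 512, 1024] per-token text stream
vec = vector_in(y)                      # [2, 1024] global modulation vector
print(txt.shape, vec.shape)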
modules/flux.py ADDED
@@ -0,0 +1,812 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ # This file contains code that is adapted from
4
+ # https://github.com/black-forest-labs/flux.git
5
+ import math
6
+ import torch
7
+ from torch import Tensor, nn
8
+ from collections import OrderedDict
9
+ from functools import partial
10
+ from einops import rearrange, repeat
11
+ from scepter.modules.model.base_model import BaseModel
12
+ from scepter.modules.model.registry import BACKBONES
13
+ from scepter.modules.utils.config import dict_to_yaml
14
+ from scepter.modules.utils.distribute import we
15
+ from scepter.modules.utils.file_system import FS
16
+ from torch.utils.checkpoint import checkpoint_sequential
17
+ from torch.nn.utils.rnn import pad_sequence
18
+ from .layers import (DoubleStreamBlock, EmbedND, LastLayer, MLPEmbedder,
19
+ SingleStreamBlock, timestep_embedding)
20
+ @BACKBONES.register_class()
21
+ class Flux(BaseModel):
22
+ """
23
+ Transformer backbone Diffusion model with RoPE.
24
+ """
25
+ para_dict = {
26
+ 'IN_CHANNELS': {
27
+ 'value': 64,
28
+ 'description': "model's input channels."
29
+ },
30
+ 'OUT_CHANNELS': {
31
+ 'value': 64,
32
+ 'description': "model's output channels."
33
+ },
34
+ 'HIDDEN_SIZE': {
35
+ 'value': 1024,
36
+ 'description': "model's hidden size."
37
+ },
38
+ 'NUM_HEADS': {
39
+ 'value': 16,
40
+ 'description': 'number of heads in the transformer.'
41
+ },
42
+ 'AXES_DIM': {
43
+ 'value': [16, 56, 56],
44
+ 'description': 'dimensions of the axes of the positional encoding.'
45
+ },
46
+ 'THETA': {
47
+ 'value': 10_000,
48
+ 'description': 'theta for positional encoding.'
49
+ },
50
+ 'VEC_IN_DIM': {
51
+ 'value': 768,
52
+ 'description': 'dimension of the vector input.'
53
+ },
54
+ 'GUIDANCE_EMBED': {
55
+ 'value': False,
56
+ 'description': 'whether to use guidance embedding.'
57
+ },
58
+ 'CONTEXT_IN_DIM': {
59
+ 'value': 4096,
60
+ 'description': 'dimension of the context input.'
61
+ },
62
+ 'MLP_RATIO': {
63
+ 'value': 4.0,
64
+ 'description': 'ratio of mlp hidden size to hidden size.'
65
+ },
66
+ 'QKV_BIAS': {
67
+ 'value': True,
68
+ 'description': 'whether to use bias in qkv projection.'
69
+ },
70
+ 'DEPTH': {
71
+ 'value': 19,
72
+ 'description': 'number of transformer blocks.'
73
+ },
74
+ 'DEPTH_SINGLE_BLOCKS': {
75
+ 'value':
76
+ 38,
77
+ 'description':
78
+ 'number of transformer blocks in the single stream block.'
79
+ },
80
+ 'USE_GRAD_CHECKPOINT': {
81
+ 'value': False,
82
+ 'description': 'whether to use gradient checkpointing.'
83
+ }
84
+ }
85
+
86
+ def __init__(self, cfg, logger=None):
87
+ super().__init__(cfg, logger=logger)
88
+ self.in_channels = cfg.IN_CHANNELS
89
+ self.out_channels = cfg.get('OUT_CHANNELS', self.in_channels)
90
+ hidden_size = cfg.get('HIDDEN_SIZE', 1024)
91
+ num_heads = cfg.get('NUM_HEADS', 16)
92
+ axes_dim = cfg.AXES_DIM
93
+ theta = cfg.THETA
94
+ vec_in_dim = cfg.VEC_IN_DIM
95
+ self.guidance_embed = cfg.GUIDANCE_EMBED
96
+ context_in_dim = cfg.CONTEXT_IN_DIM
97
+ mlp_ratio = cfg.MLP_RATIO
98
+ qkv_bias = cfg.QKV_BIAS
99
+ depth = cfg.DEPTH
100
+ depth_single_blocks = cfg.DEPTH_SINGLE_BLOCKS
101
+ self.use_grad_checkpoint = cfg.get("USE_GRAD_CHECKPOINT", False)
102
+ self.attn_backend = cfg.get("ATTN_BACKEND", "pytorch")
103
+ self.cache_pretrain_model = cfg.get("CACHE_PRETRAIN_MODEL", False)
104
+ self.lora_model = cfg.get("DIFFUSERS_LORA_MODEL", None)
105
+ self.comfyui_lora_model = cfg.get("COMFYUI_LORA_MODEL", None)
106
+ self.swift_lora_model = cfg.get("SWIFT_LORA_MODEL", None)
107
+ self.blackforest_lora_model = cfg.get("BLACKFOREST_LORA_MODEL", None)
108
+ self.pretrain_adapter = cfg.get("PRETRAIN_ADAPTER", None)
109
+
110
+ if hidden_size % num_heads != 0:
111
+ raise ValueError(
112
+ f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}"
113
+ )
114
+ pe_dim = hidden_size // num_heads
115
+ if sum(axes_dim) != pe_dim:
116
+ raise ValueError(
117
+ f"Got {axes_dim} but expected positional dim {pe_dim}")
118
+ self.hidden_size = hidden_size
119
+ self.num_heads = num_heads
120
+ self.pe_embedder = EmbedND(dim=pe_dim, theta=theta, axes_dim=axes_dim)
121
+ self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
122
+ self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
123
+ self.vector_in = MLPEmbedder(vec_in_dim, self.hidden_size)
124
+ self.guidance_in = (MLPEmbedder(in_dim=256,
125
+ hidden_dim=self.hidden_size)
126
+ if self.guidance_embed else nn.Identity())
127
+ self.txt_in = nn.Linear(context_in_dim, self.hidden_size)
128
+
129
+ self.double_blocks = nn.ModuleList(
130
+ [
131
+ DoubleStreamBlock(
132
+ self.hidden_size,
133
+ self.num_heads,
134
+ mlp_ratio=mlp_ratio,
135
+ qkv_bias=qkv_bias,
136
+ backend=self.attn_backend
137
+ )
138
+ for _ in range(depth)
139
+ ]
140
+ )
141
+
142
+ self.single_blocks = nn.ModuleList(
143
+ [
144
+ SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=mlp_ratio, backend=self.attn_backend)
145
+ for _ in range(depth_single_blocks)
146
+ ]
147
+ )
148
+
149
+ self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
150
+
151
+ def prepare_input(self, x, context, y, x_shape=None):
152
+ # x.shape [6, 16, 16, 16] target is [6, 16, 768, 1360]
153
+ bs, c, h, w = x.shape
154
+ x = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
155
+ x_id = torch.zeros(h // 2, w // 2, 3)
156
+ x_id[..., 1] = x_id[..., 1] + torch.arange(h // 2)[:, None]
157
+ x_id[..., 2] = x_id[..., 2] + torch.arange(w // 2)[None, :]
158
+ x_ids = repeat(x_id, "h w c -> b (h w) c", b=bs)
159
+ txt_ids = torch.zeros(bs, context.shape[1], 3)
160
+ return x, x_ids.to(x), context.to(x), txt_ids.to(x), y.to(x), h, w
161
+
162
+ def unpack(self, x: Tensor, height: int, width: int) -> Tensor:
163
+ return rearrange(
164
+ x,
165
+ "b (h w) (c ph pw) -> b c (h ph) (w pw)",
166
+ h=math.ceil(height/2),
167
+ w=math.ceil(width/2),
168
+ ph=2,
169
+ pw=2,
170
+ )
171
+
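The prepare_input()/unpack() pair above packs latents into 2x2 patches and back; a quick editorial round-trip check with einops (sizes arbitrary, not part of the diff):

import torch
from einops import rearrange

# latent [B, C, H, W] -> token sequence [B, (H/2)*(W/2), C*4] -> latent again.
x = torch.randn(2, 16, 64, 64)
tokens = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
print(tokens.shape)                                   # torch.Size([2, 1024, 64])
x_back = rearrange(tokens, "b (h w) (c ph pw) -> b c (h ph) (w pw)",
                   h=32, w=32, ph=2, pw=2)
assert torch.equal(x, x_back)                         # lossless round trip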
172
+ def merge_diffuser_lora(self, ori_sd, lora_sd, scale=1.0):
173
+ key_map = {
174
+ "single_blocks.{}.linear1.weight": {"key_list": [
175
+ ["transformer.single_transformer_blocks.{}.attn.to_q.lora_A.weight",
176
+ "transformer.single_transformer_blocks.{}.attn.to_q.lora_B.weight", [0, 3072]],
177
+ ["transformer.single_transformer_blocks.{}.attn.to_k.lora_A.weight",
178
+ "transformer.single_transformer_blocks.{}.attn.to_k.lora_B.weight", [3072, 6144]],
179
+ ["transformer.single_transformer_blocks.{}.attn.to_v.lora_A.weight",
180
+ "transformer.single_transformer_blocks.{}.attn.to_v.lora_B.weight", [6144, 9216]],
181
+ ["transformer.single_transformer_blocks.{}.proj_mlp.lora_A.weight",
182
+ "transformer.single_transformer_blocks.{}.proj_mlp.lora_B.weight", [9216, 21504]]
183
+ ], "num": 38},
184
+ "single_blocks.{}.modulation.lin.weight": {"key_list": [
185
+ ["transformer.single_transformer_blocks.{}.norm.linear.lora_A.weight",
186
+ "transformer.single_transformer_blocks.{}.norm.linear.lora_B.weight", [0, 9216]],
187
+ ], "num": 38},
188
+ "single_blocks.{}.linear2.weight": {"key_list": [
189
+ ["transformer.single_transformer_blocks.{}.proj_out.lora_A.weight",
190
+ "transformer.single_transformer_blocks.{}.proj_out.lora_B.weight", [0, 3072]],
191
+ ], "num": 38},
192
+ "double_blocks.{}.txt_attn.qkv.weight": {"key_list": [
193
+ ["transformer.transformer_blocks.{}.attn.add_q_proj.lora_A.weight",
194
+ "transformer.transformer_blocks.{}.attn.add_q_proj.lora_B.weight", [0, 3072]],
195
+ ["transformer.transformer_blocks.{}.attn.add_k_proj.lora_A.weight",
196
+ "transformer.transformer_blocks.{}.attn.add_k_proj.lora_B.weight", [3072, 6144]],
197
+ ["transformer.transformer_blocks.{}.attn.add_v_proj.lora_A.weight",
198
+ "transformer.transformer_blocks.{}.attn.add_v_proj.lora_B.weight", [6144, 9216]],
199
+ ], "num": 19},
200
+ "double_blocks.{}.img_attn.qkv.weight": {"key_list": [
201
+ ["transformer.transformer_blocks.{}.attn.to_q.lora_A.weight",
202
+ "transformer.transformer_blocks.{}.attn.to_q.lora_B.weight", [0, 3072]],
203
+ ["transformer.transformer_blocks.{}.attn.to_k.lora_A.weight",
204
+ "transformer.transformer_blocks.{}.attn.to_k.lora_B.weight", [3072, 6144]],
205
+ ["transformer.transformer_blocks.{}.attn.to_v.lora_A.weight",
206
+ "transformer.transformer_blocks.{}.attn.to_v.lora_B.weight", [6144, 9216]],
207
+ ], "num": 19},
208
+ "double_blocks.{}.img_attn.proj.weight": {"key_list": [
209
+ ["transformer.transformer_blocks.{}.attn.to_out.0.lora_A.weight",
210
+ "transformer.transformer_blocks.{}.attn.to_out.0.lora_B.weight", [0, 3072]]
211
+ ], "num": 19},
212
+ "double_blocks.{}.txt_attn.proj.weight": {"key_list": [
213
+ ["transformer.transformer_blocks.{}.attn.to_add_out.lora_A.weight",
214
+ "transformer.transformer_blocks.{}.attn.to_add_out.lora_B.weight", [0, 3072]]
215
+ ], "num": 19},
216
+ "double_blocks.{}.img_mlp.0.weight": {"key_list": [
217
+ ["transformer.transformer_blocks.{}.ff.net.0.proj.lora_A.weight",
218
+ "transformer.transformer_blocks.{}.ff.net.0.proj.lora_B.weight", [0, 12288]]
219
+ ], "num": 19},
220
+ "double_blocks.{}.img_mlp.2.weight": {"key_list": [
221
+ ["transformer.transformer_blocks.{}.ff.net.2.lora_A.weight",
222
+ "transformer.transformer_blocks.{}.ff.net.2.lora_B.weight", [0, 3072]]
223
+ ], "num": 19},
224
+ "double_blocks.{}.txt_mlp.0.weight": {"key_list": [
225
+ ["transformer.transformer_blocks.{}.ff_context.net.0.proj.lora_A.weight",
226
+ "transformer.transformer_blocks.{}.ff_context.net.0.proj.lora_B.weight", [0, 12288]]
227
+ ], "num": 19},
228
+ "double_blocks.{}.txt_mlp.2.weight": {"key_list": [
229
+ ["transformer.transformer_blocks.{}.ff_context.net.2.lora_A.weight",
230
+ "transformer.transformer_blocks.{}.ff_context.net.2.lora_B.weight", [0, 3072]]
231
+ ], "num": 19},
232
+ "double_blocks.{}.img_mod.lin.weight": {"key_list": [
233
+ ["transformer.transformer_blocks.{}.norm1.linear.lora_A.weight",
234
+ "transformer.transformer_blocks.{}.norm1.linear.lora_B.weight", [0, 18432]]
235
+ ], "num": 19},
236
+ "double_blocks.{}.txt_mod.lin.weight": {"key_list": [
237
+ ["transformer.transformer_blocks.{}.norm1_context.linear.lora_A.weight",
238
+ "transformer.transformer_blocks.{}.norm1_context.linear.lora_B.weight", [0, 18432]]
239
+ ], "num": 19}
240
+ }
241
+ cover_lora_keys = set()
242
+ cover_ori_keys = set()
243
+ for k, v in key_map.items():
244
+ key_list = v["key_list"]
245
+ block_num = v["num"]
246
+ for block_id in range(block_num):
247
+ for k_list in key_list:
248
+ if k_list[0].format(block_id) in lora_sd and k_list[1].format(block_id) in lora_sd:
249
+ cover_lora_keys.add(k_list[0].format(block_id))
250
+ cover_lora_keys.add(k_list[1].format(block_id))
251
+ current_weight = torch.matmul(lora_sd[k_list[0].format(block_id)].permute(1, 0),
252
+ lora_sd[k_list[1].format(block_id)].permute(1, 0)).permute(1, 0)
253
+ ori_sd[k.format(block_id)][k_list[2][0]:k_list[2][1], ...] += scale * current_weight
254
+ cover_ori_keys.add(k.format(block_id))
255
+ # lora_sd.pop(k_list[0].format(block_id))
256
+ # lora_sd.pop(k_list[1].format(block_id))
257
+ self.logger.info(f"merge_diffuser_lora loaded LoRA parameters: \n"
258
+ f"cover-{len(cover_lora_keys)} vs total {len(lora_sd)} \n"
259
+ f"cover ori-{len(cover_ori_keys)} vs total {len(ori_sd)}")
260
+ return ori_sd
261
+
262
+ def merge_swift_lora(self, ori_sd, lora_sd, scale = 1.0):
263
+ have_lora_keys = {}
264
+ for k, v in lora_sd.items():
265
+ k = k[len("model."):] if k.startswith("model.") else k
266
+ ori_key = k.split("lora")[0] + "weight"
267
+ if ori_key not in ori_sd:
268
+ raise KeyError(f"{ori_key} should be in the original state dict")
269
+ if ori_key not in have_lora_keys:
270
+ have_lora_keys[ori_key] = {}
271
+ if "lora_A" in k:
272
+ have_lora_keys[ori_key]["lora_A"] = v
273
+ elif "lora_B" in k:
274
+ have_lora_keys[ori_key]["lora_B"] = v
275
+ else:
276
+ raise NotImplementedError
277
+ self.logger.info(f"merge_swift_lora loaded {len(have_lora_keys)} LoRA parameter groups")
278
+ for key, v in have_lora_keys.items():
279
+ current_weight = torch.matmul(v["lora_A"].permute(1, 0), v["lora_B"].permute(1, 0)).permute(1, 0)
280
+ ori_sd[key] += scale * current_weight
281
+ return ori_sd
282
+
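All of the merge_* helpers in this file fold a low-rank pair back into the base weight as W <- W + scale * (lora_B @ lora_A); an editorial sketch (arbitrary sizes, not part of the diff) showing that the permute/matmul/permute used in the code is exactly that product:

import torch

out_dim, in_dim, rank, scale = 32, 64, 4, 1.0
W = torch.randn(out_dim, in_dim)
lora_A = torch.randn(rank, in_dim)        # down projection
lora_B = torch.randn(out_dim, rank)       # up projection

delta_permuted = torch.matmul(lora_A.permute(1, 0), lora_B.permute(1, 0)).permute(1, 0)
assert torch.allclose(delta_permuted, lora_B @ lora_A, atol=1e-5)
W_merged = W + scale * (lora_B @ lora_A)
print(W_merged.shape)                     # torch.Size([32, 64])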
283
+
284
+ def merge_blackforest_lora(self, ori_sd, lora_sd, scale = 1.0):
285
+ have_lora_keys = {}
286
+ cover_lora_keys = set()
287
+ cover_ori_keys = set()
288
+ for k, v in lora_sd.items():
289
+ if "lora" in k:
290
+ ori_key = k.split("lora")[0] + "weight"
291
+ if ori_key not in ori_sd:
292
+ raise KeyError(f"{ori_key} should be in the original state dict")
293
+ if ori_key not in have_lora_keys:
294
+ have_lora_keys[ori_key] = {}
295
+ if "lora_A" in k:
296
+ have_lora_keys[ori_key]["lora_A"] = v
297
+ cover_lora_keys.add(k)
298
+ cover_ori_keys.add(ori_key)
299
+ elif "lora_B" in k:
300
+ have_lora_keys[ori_key]["lora_B"] = v
301
+ cover_lora_keys.add(k)
302
+ cover_ori_keys.add(ori_key)
303
+ else:
304
+ if k in ori_sd:
305
+ ori_sd[k] = v
306
+ cover_lora_keys.add(k)
307
+ cover_ori_keys.add(k)
308
+ else:
309
+ print("unsupported keys: ", k)
310
+ self.logger.info(f"merge_blackforest_lora loaded LoRA parameters: \n"
311
+ f"cover-{len(cover_lora_keys)} vs total {len(lora_sd)} \n"
312
+ f"cover ori-{len(cover_ori_keys)} vs total {len(ori_sd)}")
313
+
314
+ for key, v in have_lora_keys.items():
315
+ current_weight = torch.matmul(v["lora_A"].permute(1, 0), v["lora_B"].permute(1, 0)).permute(1, 0)
316
+ # print(key, ori_sd[key].shape, current_weight.shape)
317
+ ori_sd[key] += scale * current_weight
318
+ return ori_sd
319
+
320
+ def merge_comfyui_lora(self, ori_sd, lora_sd, scale = 1.0):
321
+ ori_key_map = {key.replace("_", ".") : key for key in ori_sd.keys()}
322
+ parse_ckpt = OrderedDict()
323
+ for k, v in lora_sd.items():
324
+ if "alpha" in k:
325
+ continue
326
+ k = k.replace("lora_unet_", "").replace("_", ".")
327
+ map_k = ori_key_map[k.split(".lora")[0] + ".weight"]
328
+ if map_k not in parse_ckpt:
329
+ parse_ckpt[map_k] = {}
330
+ if "lora.up" in k:
331
+ parse_ckpt[map_k]["lora_up"] = v
332
+ elif "lora.down" in k:
333
+ parse_ckpt[map_k]["lora_down"] = v
334
+ if self.cache_pretrain_model:
335
+ self.lora_dict[self.comfyui_lora_model] = {}
336
+
337
+ for key, v in parse_ckpt.items():
338
+ current_weight = torch.matmul(v["lora_down"].permute(1, 0), v["lora_up"].permute(1, 0)).permute(1, 0)
339
+ if self.cache_pretrain_model:
+ self.lora_dict[self.comfyui_lora_model][key] = current_weight
340
+ ori_sd[key] += scale * current_weight
341
+ return ori_sd
342
+
343
+ def easy_lora_merge(self, ori_sd, lora_sd, scale = 1.0):
344
+ for key, v in lora_sd.items():
345
+ ori_sd[key] += scale * v
346
+ return ori_sd
347
+
348
+ def load_pretrained_model(self, pretrained_model, lora_scale = 1.0):
349
+ if next(self.parameters()).device.type == 'meta':
350
+ map_location = torch.device(we.device_id)
351
+ safe_device = we.device_id
352
+ else:
353
+ map_location = "cpu"
354
+ safe_device = "cpu"
355
+
356
+ if pretrained_model is not None:
357
+ if not hasattr(self, "ckpt"):
358
+ with FS.get_from(pretrained_model, wait_finish=True) as local_model:
359
+ if local_model.endswith('safetensors'):
360
+ from safetensors.torch import load_file as load_safetensors
361
+ ckpt = load_safetensors(local_model, device=safe_device)
362
+ else:
363
+ ckpt = torch.load(local_model, map_location=map_location, weights_only=True)
364
+ if "state_dict" in ckpt:
365
+ ckpt = ckpt["state_dict"]
366
+ if "model" in ckpt:
367
+ ckpt = ckpt["model"]["model"]
368
+ if self.cache_pretrain_model:
369
+ self.ckpt = ckpt
370
+ self.lora_dict = {}
371
+ else:
372
+ ckpt = self.ckpt
373
+
374
+ new_ckpt = OrderedDict()
375
+ for k, v in ckpt.items():
376
+ if k == "img_in.weight":
377
+ model_p = self.state_dict()[k]
378
+ if v.shape != model_p.shape:
379
+ expanded_state_dict_weight = torch.zeros_like(model_p, device=v.device)
380
+ slices = tuple(slice(0, dim) for dim in v.shape)
381
+ expanded_state_dict_weight[slices] = v
382
+ new_ckpt[k] = expanded_state_dict_weight
383
+ else:
384
+ new_ckpt[k] = v
385
+ else:
386
+ new_ckpt[k] = v
387
+
388
+
389
+ if self.lora_model is not None:
390
+ with FS.get_from(self.lora_model, wait_finish=True) as local_model:
391
+ if local_model.endswith('safetensors'):
392
+ from safetensors.torch import load_file as load_safetensors
393
+ lora_sd = load_safetensors(local_model, device=safe_device)
394
+ else:
395
+ lora_sd = torch.load(local_model, map_location=map_location, weights_only=True)
396
+ new_ckpt = self.merge_diffuser_lora(new_ckpt, lora_sd, scale=lora_scale)
397
+ if self.swift_lora_model is not None:
398
+ if not isinstance(self.swift_lora_model, list):
399
+ self.swift_lora_model = [(self.swift_lora_model, 1.0)]
400
+ for lora_model in self.swift_lora_model:
401
+ if isinstance(lora_model, str):
402
+ lora_model = (lora_model, 1.0/len(self.swift_lora_model))
403
404
+ self.logger.info(f"load swift lora model: {lora_model}")
405
+ with FS.get_from(lora_model[0], wait_finish=True) as local_model:
406
+ if local_model.endswith('safetensors'):
407
+ from safetensors.torch import load_file as load_safetensors
408
+ lora_sd = load_safetensors(local_model, device=safe_device)
409
+ else:
410
+ lora_sd = torch.load(local_model, map_location=map_location, weights_only=True)
411
+ new_ckpt = self.merge_swift_lora(new_ckpt, lora_sd, scale=lora_model[1])
412
+
413
+ if self.blackforest_lora_model is not None:
414
+ with FS.get_from(self.blackforest_lora_model, wait_finish=True) as local_model:
415
+ if local_model.endswith('safetensors'):
416
+ from safetensors.torch import load_file as load_safetensors
417
+ lora_sd = load_safetensors(local_model, device=safe_device)
418
+ else:
419
+ lora_sd = torch.load(local_model, map_location=map_location, weights_only=True)
420
+ new_ckpt = self.merge_blackforest_lora(new_ckpt, lora_sd, scale=lora_scale)
421
+
422
+ if self.comfyui_lora_model is not None:
423
+ if hasattr(self, "current_lora") and self.current_lora == self.comfyui_lora_model:
424
+ return
425
+ if hasattr(self, "lora_dict") and self.comfyui_lora_model in self.lora_dict:
426
+ new_ckpt = self.easy_lora_merge(new_ckpt, self.lora_dict[self.comfyui_lora_model], scale=lora_scale)
427
+ else:
428
+ with FS.get_from(self.comfyui_lora_model, wait_finish=True) as local_model:
429
+ if local_model.endswith('safetensors'):
430
+ from safetensors.torch import load_file as load_safetensors
431
+ lora_sd = load_safetensors(local_model, device=safe_device)
432
+ else:
433
+ lora_sd = torch.load(local_model, map_location=map_location, weights_only=True)
434
+ new_ckpt = self.merge_comfyui_lora(new_ckpt, lora_sd, scale=lora_scale)
435
+ if self.comfyui_lora_model:
436
+ self.current_lora = self.comfyui_lora_model
437
+
438
+
439
+ adapter_ckpt = {}
440
+ if self.pretrain_adapter is not None:
441
+ with FS.get_from(self.pretrain_adapter, wait_finish=True) as local_adapter:
442
+ if local_adapter.endswith('safetensors'):
443
+ from safetensors.torch import load_file as load_safetensors
444
+ adapter_ckpt = load_safetensors(local_adapter, device=safe_device)
445
+ else:
446
+ adapter_ckpt = torch.load(local_adapter, map_location=map_location, weights_only=True)
447
+ new_ckpt.update(adapter_ckpt)
448
+
449
+ missing, unexpected = self.load_state_dict(new_ckpt, strict=False, assign=True)
450
+ self.logger.info(
451
+ f'Restored from {pretrained_model} with {len(missing)} missing and {len(unexpected)} unexpected keys'
452
+ )
453
+ if len(missing) > 0:
454
+ self.logger.info(f'Missing Keys:\n {missing}')
455
+ if len(unexpected) > 0:
456
+ self.logger.info(f'\nUnexpected Keys:\n {unexpected}')
457
+
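The img_in.weight branch in load_pretrained_model() zero-pads a narrower pretrained input projection so it accepts the extra channels concatenated by the ACE+ variants; an editorial sketch with illustrative sizes (not part of the diff):

import torch

ckpt_weight = torch.randn(1024, 64)            # pretrained img_in.weight (out_features, in_features)
model_weight = torch.zeros(1024, 256)          # model expects a wider packed input

slices = tuple(slice(0, d) for d in ckpt_weight.shape)
model_weight[slices] = ckpt_weight             # original columns kept, new ones start at zero
print(model_weight.shape)                      # torch.Size([1024, 256])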
458
+ def forward(
459
+ self,
460
+ x: Tensor,
461
+ t: Tensor,
462
+ cond: dict = {},
463
+ guidance: Tensor | None = None,
464
+ gc_seg: int = 0
465
+ ) -> Tensor:
466
+ x, x_ids, txt, txt_ids, y, h, w = self.prepare_input(x, cond["context"], cond["y"])
467
+ # running on sequences img
468
+ x = self.img_in(x)
469
+ vec = self.time_in(timestep_embedding(t, 256))
470
+ if self.guidance_embed:
471
+ if guidance is None:
472
+ raise ValueError("Didn't get guidance strength for guidance distilled model.")
473
+ vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
474
+ vec = vec + self.vector_in(y)
475
+ txt = self.txt_in(txt)
476
+ ids = torch.cat((txt_ids, x_ids), dim=1)
477
+ pe = self.pe_embedder(ids)
478
+ kwargs = dict(
479
+ vec=vec,
480
+ pe=pe,
481
+ txt_length=txt.shape[1],
482
+ )
483
+ x = torch.cat((txt, x), 1)
484
+ if self.use_grad_checkpoint and gc_seg >= 0:
485
+ x = checkpoint_sequential(
486
+ functions=[partial(block, **kwargs) for block in self.double_blocks],
487
+ segments=gc_seg if gc_seg > 0 else len(self.double_blocks),
488
+ input=x,
489
+ use_reentrant=False
490
+ )
491
+ else:
492
+ for block in self.double_blocks:
493
+ x = block(x, **kwargs)
494
+
495
+ kwargs = dict(
496
+ vec=vec,
497
+ pe=pe,
498
+ )
499
+
500
+ if self.use_grad_checkpoint and gc_seg >= 0:
501
+ x = checkpoint_sequential(
502
+ functions=[partial(block, **kwargs) for block in self.single_blocks],
503
+ segments=gc_seg if gc_seg > 0 else len(self.single_blocks),
504
+ input=x,
505
+ use_reentrant=False
506
+ )
507
+ else:
508
+ for block in self.single_blocks:
509
+ x = block(x, **kwargs)
510
+ x = x[:, txt.shape[1] :, ...]
511
+ x = self.final_layer(x, vec) # (N, T, patch_size ** 2 * out_channels) 6 64 64
512
+ x = self.unpack(x, h, w)
513
+ return x
514
+
515
+ @staticmethod
516
+ def get_config_template():
517
+ return dict_to_yaml('BACKBONE',
518
+ __class__.__name__,
519
+ Flux.para_dict,
520
+ set_name=True)
521
+ @BACKBONES.register_class()
522
+ class FluxMR(Flux):
523
+ def prepare_input(self, x, cond):
524
+ if isinstance(cond['context'], list):
525
+ context, y = torch.cat(cond["context"], dim=0).to(x), torch.cat(cond["y"], dim=0).to(x)
526
+ else:
527
+ context, y = cond['context'].to(x), cond['y'].to(x)
528
+ batch_frames, batch_frames_ids = [], []
529
+ for ix, shape in zip(x, cond["x_shapes"]):
530
+ # unpack image from sequence
531
+ ix = ix[:, :shape[0] * shape[1]].view(-1, shape[0], shape[1])
532
+ c, h, w = ix.shape
533
+ ix = rearrange(ix, "c (h ph) (w pw) -> (h w) (c ph pw)", ph=2, pw=2)
534
+ ix_id = torch.zeros(h // 2, w // 2, 3)
535
+ ix_id[..., 1] = ix_id[..., 1] + torch.arange(h // 2)[:, None]
536
+ ix_id[..., 2] = ix_id[..., 2] + torch.arange(w // 2)[None, :]
537
+ ix_id = rearrange(ix_id, "h w c -> (h w) c")
538
+ batch_frames.append([ix])
539
+ batch_frames_ids.append([ix_id])
540
+
541
+ x_list, x_id_list, mask_x_list, x_seq_length = [], [], [], []
542
+ for frames, frame_ids in zip(batch_frames, batch_frames_ids):
543
+ proj_frames = []
544
+ for idx, one_frame in enumerate(frames):
545
+ one_frame = self.img_in(one_frame)
546
+ proj_frames.append(one_frame)
547
+ ix = torch.cat(proj_frames, dim=0)
548
+ if_id = torch.cat(frame_ids, dim=0)
549
+ x_list.append(ix)
550
+ x_id_list.append(if_id)
551
+ mask_x_list.append(torch.ones(ix.shape[0]).to(ix.device, non_blocking=True).bool())
552
+ x_seq_length.append(ix.shape[0])
553
+ x = pad_sequence(tuple(x_list), batch_first=True)
554
+ x_ids = pad_sequence(tuple(x_id_list), batch_first=True).to(x) # [b,pad_seq,2] pad (0.,0.) at dim2
555
+ mask_x = pad_sequence(tuple(mask_x_list), batch_first=True)
556
+
557
+ txt = self.txt_in(context)
558
+ txt_ids = torch.zeros(context.shape[0], context.shape[1], 3).to(x)
559
+ mask_txt = torch.ones(context.shape[0], context.shape[1]).to(x.device, non_blocking=True).bool()
560
+
561
+ return x, x_ids, txt, txt_ids, y, mask_x, mask_txt, x_seq_length
562
+
563
+ def unpack(self, x: Tensor, cond: dict = None, x_seq_length: list = None) -> Tensor:
564
+ x_list = []
565
+ image_shapes = cond["x_shapes"]
566
+ for u, shape, seq_length in zip(x, image_shapes, x_seq_length):
567
+ height, width = shape
568
+ h, w = math.ceil(height / 2), math.ceil(width / 2)
569
+ u = rearrange(
570
+ u[seq_length-h*w:seq_length, ...],
571
+ "(h w) (c ph pw) -> (h ph w pw) c",
572
+ h=h,
573
+ w=w,
574
+ ph=2,
575
+ pw=2,
576
+ )
577
+ x_list.append(u)
578
+ x = pad_sequence(tuple(x_list), batch_first=True).permute(0, 2, 1)
579
+ return x
580
+
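FluxMR.prepare_input() pads per-sample token sequences to a common length, and FluxMR.forward() builds the pairwise attention mask from the validity flags; an editorial sketch of those two steps (token counts arbitrary, not part of the diff):

import torch
from torch.nn.utils.rnn import pad_sequence

seqs = [torch.randn(9, 8), torch.randn(5, 8)]                  # two samples, 9 and 5 image tokens
x = pad_sequence(seqs, batch_first=True)                       # [2, 9, 8]
mask_x = pad_sequence([torch.ones(s.shape[0]).bool() for s in seqs], batch_first=True)

mask_txt = torch.ones(2, 4).bool()                             # e.g. 4 text tokens per sample
mask_aside = torch.cat((mask_txt, mask_x), dim=1)              # [2, 13]
mask = mask_aside[:, None, :] * mask_aside[:, :, None]         # [2, 13, 13] pairwise keep-mask
print(x.shape, mask.shape)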
581
+ def forward(
582
+ self,
583
+ x: Tensor,
584
+ t: Tensor,
585
+ cond: dict = {},
586
+ guidance: Tensor | None = None,
587
+ gc_seg: int = 0,
588
+ **kwargs
589
+ ) -> Tensor:
590
+ x, x_ids, txt, txt_ids, y, mask_x, mask_txt, seq_length_list = self.prepare_input(x, cond)
591
+ # running on sequences img
592
+ vec = self.time_in(timestep_embedding(t, 256))
593
+ if self.guidance_embed and guidance[-1] >= 0:
594
+ if guidance is None:
595
+ raise ValueError("Didn't get guidance strength for guidance distilled model.")
596
+ vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
597
+ vec = vec + self.vector_in(y)
598
+ ids = torch.cat((txt_ids, x_ids), dim=1)
599
+ pe = self.pe_embedder(ids)
600
+
601
+ mask_aside = torch.cat((mask_txt, mask_x), dim=1)
602
+ mask = mask_aside[:, None, :] * mask_aside[:, :, None]
603
+
604
+ kwargs = dict(
605
+ vec=vec,
606
+ pe=pe,
607
+ mask=mask,
608
+ txt_length = txt.shape[1],
609
+ )
610
+ x = torch.cat((txt, x), 1)
611
+ if self.use_grad_checkpoint and gc_seg >= 0:
612
+ x = checkpoint_sequential(
613
+ functions=[partial(block, **kwargs) for block in self.double_blocks],
614
+ segments=gc_seg if gc_seg > 0 else len(self.double_blocks),
615
+ input=x,
616
+ use_reentrant=False
617
+ )
618
+ else:
619
+ for block in self.double_blocks:
620
+ x = block(x, **kwargs)
621
+
622
+ kwargs = dict(
623
+ vec=vec,
624
+ pe=pe,
625
+ mask=mask,
626
+ )
627
+
628
+ if self.use_grad_checkpoint and gc_seg >= 0:
629
+ x = checkpoint_sequential(
630
+ functions=[partial(block, **kwargs) for block in self.single_blocks],
631
+ segments=gc_seg if gc_seg > 0 else len(self.single_blocks),
632
+ input=x,
633
+ use_reentrant=False
634
+ )
635
+ else:
636
+ for block in self.single_blocks:
637
+ x = block(x, **kwargs)
638
+ x = x[:, txt.shape[1]:, ...]
639
+ x = self.final_layer(x, vec) # (N, T, patch_size ** 2 * out_channels) 6 64 64
640
+ x = self.unpack(x, cond, seq_length_list)
641
+ return x
642
+
643
+ @staticmethod
644
+ def get_config_template():
645
+ return dict_to_yaml('MODEL',
646
+ __class__.__name__,
647
+ FluxMR.para_dict,
648
+ set_name=True)
649
+ @BACKBONES.register_class()
650
+ class FluxMRACEPlus(FluxMR):
651
+ def __init__(self, cfg, logger = None):
652
+ super().__init__(cfg, logger)
653
+ def prepare_input(self, x, cond):
654
+ context, y = cond["context"], cond["y"]
655
+ batch_frames, batch_frames_ids = [], []
656
+ for ix, shape, imask, ie, ie_mask in zip(x,
657
+ cond['x_shapes'],
658
+ cond['x_mask'],
659
+ cond['edit'],
660
+ cond['edit_mask']):
661
+ # unpack image from sequence
662
+ ix = ix[:, :shape[0] * shape[1]].view(-1, shape[0], shape[1])
663
+ imask = torch.ones_like(
664
+ ix[[0], :, :]) if imask is None else imask.squeeze(0)
665
+ if len(ie) > 0:
666
+ ie = [iie.squeeze(0) for iie in ie]
667
+ ie_mask = [
668
+ torch.ones(
669
+ (ix.shape[0] * 4, ix.shape[1],
670
+ ix.shape[2])) if iime is None else iime.squeeze(0)
671
+ for iime in ie_mask
672
+ ]
673
+ ie = torch.cat(ie, dim=-1)
674
+ ie_mask = torch.cat(ie_mask, dim=-1)
675
+ else:
676
+ ie, ie_mask = torch.zeros_like(ix).to(x), torch.ones_like(
677
+ imask).to(x),
678
+ ix = torch.cat([ix, ie, ie_mask], dim=0)
679
+ c, h, w = ix.shape
680
+ ix = rearrange(ix,
681
+ 'c (h ph) (w pw) -> (h w) (c ph pw)',
682
+ ph=2,
683
+ pw=2)
684
+ ix_id = torch.zeros(h // 2, w // 2, 3)
685
+ ix_id[..., 1] = ix_id[..., 1] + torch.arange(h // 2)[:, None]
686
+ ix_id[..., 2] = ix_id[..., 2] + torch.arange(w // 2)[None, :]
687
+ ix_id = rearrange(ix_id, 'h w c -> (h w) c')
688
+ batch_frames.append([ix])
689
+ batch_frames_ids.append([ix_id])
690
+ x_list, x_id_list, mask_x_list, x_seq_length = [], [], [], []
691
+ for frames, frame_ids in zip(batch_frames, batch_frames_ids):
692
+ proj_frames = []
693
+ for idx, one_frame in enumerate(frames):
694
+ one_frame = self.img_in(one_frame)
695
+ proj_frames.append(one_frame)
696
+ ix = torch.cat(proj_frames, dim=0)
697
+ if_id = torch.cat(frame_ids, dim=0)
698
+ x_list.append(ix)
699
+ x_id_list.append(if_id)
700
+ mask_x_list.append(torch.ones(ix.shape[0]).to(ix.device, non_blocking=True).bool())
701
+ x_seq_length.append(ix.shape[0])
702
+ # if len(x_list) < 1: import pdb;pdb.set_trace()
703
+ x = pad_sequence(tuple(x_list), batch_first=True)
704
+ x_ids = pad_sequence(tuple(x_id_list), batch_first=True).to(x) # [b,pad_seq,2] pad (0.,0.) at dim2
705
+ mask_x = pad_sequence(tuple(mask_x_list), batch_first=True)
706
+ if isinstance(context, list):
707
+ txt_list, mask_txt_list, y_list = [], [], []
708
+ for sample_id, (ctx, yy) in enumerate(zip(context, y)):
709
+ txt_list.append(self.txt_in(ctx.to(x)))
710
+ mask_txt_list.append(torch.ones(txt_list[-1].shape[0]).to(ctx.device, non_blocking=True).bool())
711
+ y_list.append(yy.to(x))
712
+ txt = pad_sequence(tuple(txt_list), batch_first=True)
713
+ txt_ids = torch.zeros(txt.shape[0], txt.shape[1], 3).to(x)
714
+ mask_txt = pad_sequence(tuple(mask_txt_list), batch_first=True)
715
+ y = torch.cat(y_list, dim=0)
716
+ assert y.ndim == 2 and txt.ndim == 3
717
+ else:
718
+ txt = self.txt_in(context)
719
+ txt_ids = torch.zeros(context.shape[0], context.shape[1], 3).to(x)
720
+ mask_txt = torch.ones(context.shape[0], context.shape[1]).to(x.device, non_blocking=True).bool()
721
+ return x, x_ids, txt, txt_ids, y, mask_x, mask_txt, x_seq_length
722
+
723
+ @staticmethod
724
+ def get_config_template():
725
+ return dict_to_yaml('MODEL',
726
+ __class__.__name__,
727
+ FluxMRACEPlus.para_dict,
728
+ set_name=True)
729
+
730
+ @BACKBONES.register_class()
731
+ class FluxMRModiACEPlus(FluxMR):
732
+ def __init__(self, cfg, logger = None):
733
+ super().__init__(cfg, logger)
734
+ def prepare_input(self, x, cond):
735
+ context, y = cond["context"], cond["y"]
736
+ batch_frames, batch_frames_ids = [], []
737
+ for ix, shape, imask, ie, im, ie_mask in zip(x,
738
+ cond['x_shapes'],
739
+ cond['x_mask'],
740
+ cond['edit'],
741
+ cond['modify'],
742
+ cond['edit_mask']):
743
+ # unpack image from sequence
744
+ ix = ix[:, :shape[0] * shape[1]].view(-1, shape[0], shape[1])
745
+ imask = torch.ones_like(
746
+ ix[[0], :, :]) if imask is None else imask.squeeze(0)
747
+ if len(ie) > 0:
748
+ ie = [iie.squeeze(0) for iie in ie]
749
+ im = [iim.squeeze(0) for iim in im]
750
+ ie_mask = [
751
+ torch.ones(
752
+ (ix.shape[0] * 4, ix.shape[1],
753
+ ix.shape[2])) if iime is None else iime.squeeze(0)
754
+ for iime in ie_mask
755
+ ]
756
+ im = torch.cat(im, dim=-1)
757
+ ie = torch.cat(ie, dim=-1)
758
+ ie_mask = torch.cat(ie_mask, dim=-1)
759
+ else:
760
+ ie, im, ie_mask = torch.zeros_like(ix).to(x), torch.zeros_like(ix).to(x), torch.ones_like(
761
+ imask).to(x),
762
+ ix = torch.cat([ix, ie, im, ie_mask], dim=0)
763
+ c, h, w = ix.shape
764
+ ix = rearrange(ix,
765
+ 'c (h ph) (w pw) -> (h w) (c ph pw)',
766
+ ph=2,
767
+ pw=2)
768
+ ix_id = torch.zeros(h // 2, w // 2, 3)
769
+ ix_id[..., 1] = ix_id[..., 1] + torch.arange(h // 2)[:, None]
770
+ ix_id[..., 2] = ix_id[..., 2] + torch.arange(w // 2)[None, :]
771
+ ix_id = rearrange(ix_id, 'h w c -> (h w) c')
772
+ batch_frames.append([ix])
773
+ batch_frames_ids.append([ix_id])
774
+ x_list, x_id_list, mask_x_list, x_seq_length = [], [], [], []
775
+ for frames, frame_ids in zip(batch_frames, batch_frames_ids):
776
+ proj_frames = []
777
+ for idx, one_frame in enumerate(frames):
778
+ one_frame = self.img_in(one_frame)
779
+ proj_frames.append(one_frame)
780
+ ix = torch.cat(proj_frames, dim=0)
781
+ if_id = torch.cat(frame_ids, dim=0)
782
+ x_list.append(ix)
783
+ x_id_list.append(if_id)
784
+ mask_x_list.append(torch.ones(ix.shape[0]).to(ix.device, non_blocking=True).bool())
785
+ x_seq_length.append(ix.shape[0])
786
+ # if len(x_list) < 1: import pdb;pdb.set_trace()
787
+ x = pad_sequence(tuple(x_list), batch_first=True)
788
+ x_ids = pad_sequence(tuple(x_id_list), batch_first=True).to(x) # [b,pad_seq,2] pad (0.,0.) at dim2
789
+ mask_x = pad_sequence(tuple(mask_x_list), batch_first=True)
790
+ if isinstance(context, list):
791
+ txt_list, mask_txt_list, y_list = [], [], []
792
+ for sample_id, (ctx, yy) in enumerate(zip(context, y)):
793
+ txt_list.append(self.txt_in(ctx.to(x)))
794
+ mask_txt_list.append(torch.ones(txt_list[-1].shape[0]).to(ctx.device, non_blocking=True).bool())
795
+ y_list.append(yy.to(x))
796
+ txt = pad_sequence(tuple(txt_list), batch_first=True)
797
+ txt_ids = torch.zeros(txt.shape[0], txt.shape[1], 3).to(x)
798
+ mask_txt = pad_sequence(tuple(mask_txt_list), batch_first=True)
799
+ y = torch.cat(y_list, dim=0)
800
+ assert y.ndim == 2 and txt.ndim == 3
801
+ else:
802
+ txt = self.txt_in(context)
803
+ txt_ids = torch.zeros(context.shape[0], context.shape[1], 3).to(x)
804
+ mask_txt = torch.ones(context.shape[0], context.shape[1]).to(x.device, non_blocking=True).bool()
805
+ return x, x_ids, txt, txt_ids, y, mask_x, mask_txt, x_seq_length
806
+
807
+ @staticmethod
808
+ def get_config_template():
809
+ return dict_to_yaml('MODEL',
810
+ __class__.__name__,
811
+ FluxMRModiACEPlus.para_dict,
812
+ set_name=True)
modules/layers.py ADDED
@@ -0,0 +1,521 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ # This file contains code that is adapted from
4
+ # https://github.com/black-forest-labs/flux.git
5
+ from __future__ import annotations
6
+
7
+ import math
8
+ from dataclasses import dataclass
9
+ from torch import Tensor, nn
10
+ import torch
11
+ from einops import rearrange, repeat
12
+ from torch import Tensor
13
+ from torch.nn.utils.rnn import pad_sequence
14
+
15
+ try:
16
+ from flash_attn import (
17
+ flash_attn_varlen_func
18
+ )
19
+ FLASHATTN_IS_AVAILABLE = True
20
+ except ImportError:
21
+ FLASHATTN_IS_AVAILABLE = False
22
+ flash_attn_varlen_func = None
23
+
24
+ def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask: Tensor | None = None, backend = 'pytorch') -> Tensor:
25
+ q, k = apply_rope(q, k, pe)
26
+ if backend == 'pytorch':
27
+ if mask is not None and mask.dtype == torch.bool:
28
+ mask = torch.zeros_like(mask).to(q).masked_fill_(mask.logical_not(), -1e20)
29
+ x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=mask)
30
+ # x = torch.nan_to_num(x, nan=0.0, posinf=1e10, neginf=-1e10)
31
+ x = rearrange(x, "B H L D -> B L (H D)")
32
+ elif backend == 'flash_attn':
33
+ # q: (B, H, L, D)
34
+ # k: (B, H, S, D) now L = S
35
+ # v: (B, H, S, D)
36
+ b, h, lq, d = q.shape
37
+ _, _, lk, _ = k.shape
38
+ q = rearrange(q, "B H L D -> B L H D")
39
+ k = rearrange(k, "B H S D -> B S H D")
40
+ v = rearrange(v, "B H S D -> B S H D")
41
+ if mask is None:
42
+ q_lens = torch.tensor([lq] * b, dtype=torch.int32).to(q.device, non_blocking=True)
43
+ k_lens = torch.tensor([lk] * b, dtype=torch.int32).to(k.device, non_blocking=True)
44
+ else:
45
+ q_lens = torch.sum(mask[:, 0, :, 0], dim=1).int()
46
+ k_lens = torch.sum(mask[:, 0, 0, :], dim=1).int()
47
+ q = torch.cat([q_v[:q_l] for q_v, q_l in zip(q, q_lens)])
48
+ k = torch.cat([k_v[:k_l] for k_v, k_l in zip(k, k_lens)])
49
+ v = torch.cat([v_v[:v_l] for v_v, v_l in zip(v, k_lens)])
50
+ cu_seqlens_q = torch.cat([q_lens.new_zeros([1]), q_lens]).cumsum(0, dtype=torch.int32)
51
+ cu_seqlens_k = torch.cat([k_lens.new_zeros([1]), k_lens]).cumsum(0, dtype=torch.int32)
52
+ max_seqlen_q = q_lens.max()
53
+ max_seqlen_k = k_lens.max()
54
+
55
+ x = flash_attn_varlen_func(
56
+ q,
57
+ k,
58
+ v,
59
+ cu_seqlens_q=cu_seqlens_q,
60
+ cu_seqlens_k=cu_seqlens_k,
61
+ max_seqlen_q=max_seqlen_q,
62
+ max_seqlen_k=max_seqlen_k
63
+ )
64
+ x_list = [x[cu_seqlens_q[i]:cu_seqlens_q[i+1]] for i in range(b)]
65
+ x = pad_sequence(tuple(x_list), batch_first=True)
66
+ x = rearrange(x, "B L H D -> B L (H D)")
67
+ else:
68
+ raise NotImplementedError
69
+ return x
70
+
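In the 'pytorch' branch above, a boolean keep-mask is converted into an additive mask before scaled_dot_product_attention; an editorial sketch of that conversion (shapes arbitrary, not part of the diff):

import torch
import torch.nn.functional as F

B, H, L, D = 1, 2, 6, 8
q, k, v = (torch.randn(B, H, L, D) for _ in range(3))
keep = torch.ones(B, H, L, L, dtype=torch.bool)
keep[..., 4:] = False                                          # ignore the last two keys

additive = torch.zeros_like(keep, dtype=q.dtype).masked_fill_(keep.logical_not(), -1e20)
out = F.scaled_dot_product_attention(q, k, v, attn_mask=additive)
print(out.shape)                                               # torch.Size([1, 2, 6, 8])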
71
+
72
+ def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
73
+ assert dim % 2 == 0
74
+ scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
75
+ omega = 1.0 / (theta**scale)
76
+ out = torch.einsum("...n,d->...nd", pos, omega)
77
+ out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)
78
+ out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2)
79
+ return out.float()
80
+
81
+
82
+ def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]:
83
+ xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
84
+ xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
85
+ xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
86
+ xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
87
+ return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
88
+
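An editorial check (not part of the diff) of the rope()/apply_rope() helpers above: each (even, odd) feature pair is rotated by a position-dependent angle, so vector norms are preserved and the q·k products depend only on relative position. The snippet assumes the two functions defined above are in scope.

import torch

B, H, L, D = 1, 1, 4, 16
pos = torch.arange(L)[None, :].float()           # token positions, shape [1, L]
freqs = rope(pos, D, 10000)[:, None]             # add a head axis, as EmbedND.forward does
q, k = torch.randn(B, H, L, D), torch.randn(B, H, L, D)
q_rot, k_rot = apply_rope(q, k, freqs)
print(torch.allclose(q.norm(dim=-1), q_rot.norm(dim=-1), atol=1e-4))   # True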
89
+ class EmbedND(nn.Module):
90
+ def __init__(self, dim: int, theta: int, axes_dim: list[int]):
91
+ super().__init__()
92
+ self.dim = dim
93
+ self.theta = theta
94
+ self.axes_dim = axes_dim
95
+
96
+ def forward(self, ids: Tensor) -> Tensor:
97
+ n_axes = ids.shape[-1]
98
+ emb = torch.cat(
99
+ [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
100
+ dim=-3,
101
+ )
102
+
103
+ return emb.unsqueeze(1)
104
+
105
+
106
+ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
107
+ """
108
+ Create sinusoidal timestep embeddings.
109
+ :param t: a 1-D Tensor of N indices, one per batch element.
110
+ These may be fractional.
111
+ :param dim: the dimension of the output.
112
+ :param max_period: controls the minimum frequency of the embeddings.
113
+ :return: an (N, D) Tensor of positional embeddings.
114
+ """
115
+ t = time_factor * t
116
+ half = dim // 2
117
+ freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(
118
+ t.device
119
+ )
120
+
121
+ args = t[:, None].float() * freqs[None]
122
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
123
+ if dim % 2:
124
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
125
+ if torch.is_floating_point(t):
126
+ embedding = embedding.to(t)
127
+ return embedding
128
+
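A short editorial sketch of timestep_embedding (assumed to be in scope from above): timesteps in [0, 1] are scaled by time_factor and mapped to sinusoidal features, which time_in/guidance_in then project to the hidden size.

import torch

t = torch.tensor([0.0, 0.25, 1.0])
emb = timestep_embedding(t, 256)
print(emb.shape)                                            # torch.Size([3, 256])
print(float(emb.min()) >= -1.0, float(emb.max()) <= 1.0)    # sinusoids stay in [-1, 1]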
129
+
130
+ class MLPEmbedder(nn.Module):
131
+ def __init__(self, in_dim: int, hidden_dim: int):
132
+ super().__init__()
133
+ self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
134
+ self.silu = nn.SiLU()
135
+ self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)
136
+
137
+ def forward(self, x: Tensor) -> Tensor:
138
+ return self.out_layer(self.silu(self.in_layer(x)))
139
+
140
+
141
+ class RMSNorm(torch.nn.Module):
142
+ def __init__(self, dim: int):
143
+ super().__init__()
144
+ self.scale = nn.Parameter(torch.ones(dim))
145
+
146
+ def forward(self, x: Tensor):
147
+ x_dtype = x.dtype
148
+ x = x.float()
149
+ rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6)
150
+ return (x * rrms).to(dtype=x_dtype) * self.scale
151
+
152
+
153
+ class QKNorm(torch.nn.Module):
154
+ def __init__(self, dim: int):
155
+ super().__init__()
156
+ self.query_norm = RMSNorm(dim)
157
+ self.key_norm = RMSNorm(dim)
158
+
159
+ def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]:
160
+ q = self.query_norm(q)
161
+ k = self.key_norm(k)
162
+ return q.to(v), k.to(v)
163
+
164
+
165
+ class SelfAttention(nn.Module):
166
+ def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False):
167
+ super().__init__()
168
+ self.num_heads = num_heads
169
+ head_dim = dim // num_heads
170
+
171
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
172
+ self.norm = QKNorm(head_dim)
173
+ self.proj = nn.Linear(dim, dim)
174
+
175
+ def forward(self, x: Tensor, pe: Tensor, mask: Tensor | None = None) -> Tensor:
176
+ qkv = self.qkv(x)
177
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
178
+ q, k = self.norm(q, k, v)
179
+ x = attention(q, k, v, pe=pe, mask=mask)
180
+ x = self.proj(x)
181
+ return x
182
+
183
+ class CrossAttention(nn.Module):
184
+ def __init__(self, dim: int, context_dim: int, num_heads: int = 8, qkv_bias: bool = False):
185
+ super().__init__()
186
+ self.num_heads = num_heads
187
+ head_dim = dim // num_heads
188
+ self.q = nn.Linear(dim, dim, bias=qkv_bias)
189
+ self.kv = nn.Linear(dim, context_dim * 2, bias=qkv_bias)
190
+ self.norm = QKNorm(head_dim)
191
+ self.proj = nn.Linear(dim, dim)
192
+
193
+ def forward(self, x: Tensor, context: Tensor, pe: Tensor, mask: Tensor | None = None) -> Tensor:
194
+ # use the defined projections: queries come from x, keys/values from context
+ q = rearrange(self.q(x), 'B L (H D) -> B H L D', H=self.num_heads)
195
+ k, v = rearrange(self.kv(context), 'B S (K H D) -> K B H S D', K=2, H=self.num_heads)
196
+ q, k = self.norm(q, k, v)
197
+ x = attention(q, k, v, pe=pe, mask=mask)
198
+ x = self.proj(x)
199
+ return x
200
+
201
+
202
+ @dataclass
203
+ class ModulationOut:
204
+ shift: Tensor
205
+ scale: Tensor
206
+ gate: Tensor
207
+
208
+
209
+ class Modulation(nn.Module):
210
+ def __init__(self, dim: int, double: bool):
211
+ super().__init__()
212
+ self.is_double = double
213
+ self.multiplier = 6 if double else 3
214
+ self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
215
+
216
+ def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
217
+ out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(self.multiplier, dim=-1)
218
+
219
+ return (
220
+ ModulationOut(*out[:3]),
221
+ ModulationOut(*out[3:]) if self.is_double else None,
222
+ )
223
+
224
+
225
+ class DoubleStreamBlock(nn.Module):
226
+ def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, backend = 'pytorch'):
227
+ super().__init__()
228
+
229
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
230
+ self.num_heads = num_heads
231
+ self.hidden_size = hidden_size
232
+ self.img_mod = Modulation(hidden_size, double=True)
233
+ self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
234
+ self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
235
+
236
+ self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
237
+ self.img_mlp = nn.Sequential(
238
+ nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
239
+ nn.GELU(approximate="tanh"),
240
+ nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
241
+ )
242
+
243
+ self.backend = backend
244
+
245
+ self.txt_mod = Modulation(hidden_size, double=True)
246
+ self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
247
+ self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
248
+
249
+ self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
250
+ self.txt_mlp = nn.Sequential(
251
+ nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
252
+ nn.GELU(approximate="tanh"),
253
+ nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
254
+ )
255
+
256
+
257
+
258
+
259
+ def forward(self, x: Tensor, vec: Tensor, pe: Tensor, mask: Tensor = None, txt_length = None):
260
+ img_mod1, img_mod2 = self.img_mod(vec)
261
+ txt_mod1, txt_mod2 = self.txt_mod(vec)
262
+
263
+ txt, img = x[:, :txt_length], x[:, txt_length:]
264
+
265
+ # prepare image for attention
266
+ img_modulated = self.img_norm1(img)
267
+ img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
268
+ img_qkv = self.img_attn.qkv(img_modulated)
269
+ img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
270
+ img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
271
+ # prepare txt for attention
272
+ txt_modulated = self.txt_norm1(txt)
273
+ txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
274
+ txt_qkv = self.txt_attn.qkv(txt_modulated)
275
+ txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
276
+ txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
277
+
278
+ # run actual attention
279
+ q = torch.cat((txt_q, img_q), dim=2)
280
+ k = torch.cat((txt_k, img_k), dim=2)
281
+ v = torch.cat((txt_v, img_v), dim=2)
282
+ if mask is not None:
283
+ mask = repeat(mask, 'B L S-> B H L S', H=self.num_heads)
284
+ attn = attention(q, k, v, pe=pe, mask = mask, backend = self.backend)
285
+ txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
286
+
287
+ # calculate the img blocks
288
+ img = img + img_mod1.gate * self.img_attn.proj(img_attn)
289
+ img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)
290
+
291
+ # calculate the txt blocks
292
+ txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
293
+ txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
294
+ x = torch.cat((txt, img), 1)
295
+ return x
296
+
297
+
298
+ class SingleStreamBlock(nn.Module):
299
+ """
300
+ A DiT block with parallel linear layers as described in
301
+ https://arxiv.org/abs/2302.05442 and adapted modulation interface.
302
+ """
303
+
304
+ def __init__(
305
+ self,
306
+ hidden_size: int,
307
+ num_heads: int,
308
+ mlp_ratio: float = 4.0,
309
+ qk_scale: float | None = None,
310
+ backend='pytorch'
311
+ ):
312
+ super().__init__()
313
+ self.hidden_dim = hidden_size
314
+ self.num_heads = num_heads
315
+ head_dim = hidden_size // num_heads
316
+ self.scale = qk_scale or head_dim**-0.5
317
+
318
+ self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
319
+ # qkv and mlp_in
320
+ self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
321
+ # proj and mlp_out
322
+ self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)
323
+
324
+ self.norm = QKNorm(head_dim)
325
+
326
+ self.hidden_size = hidden_size
327
+ self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
328
+
329
+ self.mlp_act = nn.GELU(approximate="tanh")
330
+ self.modulation = Modulation(hidden_size, double=False)
331
+ self.backend = backend
332
+
333
+ def forward(self, x: Tensor, vec: Tensor, pe: Tensor, mask: Tensor = None) -> Tensor:
334
+ mod, _ = self.modulation(vec)
335
+ x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
336
+ qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
337
+
338
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
339
+ q, k = self.norm(q, k, v)
340
+ if mask is not None:
341
+ mask = repeat(mask, 'B L S-> B H L S', H=self.num_heads)
342
+ # compute attention
343
+ attn = attention(q, k, v, pe=pe, mask = mask, backend=self.backend)
344
+ # compute activation in mlp stream, cat again and run second linear layer
345
+ output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
346
+ return x + mod.gate * output
347
+
348
+
349
+ class DoubleStreamBlockC(DoubleStreamBlock):
350
+ """
351
+ A DoubleStreamBlock variant that can drop the padded conditional part of the
352
+ sequence (abondon_cond) and fall back to the unconditional positional encoding and mask.
353
+ """
354
+
355
+ def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float,
356
+ qkv_bias: bool = False, backend='pytorch',
357
+ abondon_cond = False):
358
+ super().__init__(hidden_size, num_heads, mlp_ratio,
359
+ qkv_bias, backend)
360
+ self.abondon_cond = abondon_cond
361
+
362
+ def forward(self, x: Tensor, vec: Tensor,
363
+ pe: Tensor, mask: Tensor = None,
364
+ txt_length=None,
365
+ uncondi_length=None,
366
+ uncondi_pe = None,
367
+ mask_uncond = None):
368
+ # pad_sequence(tuple(x_list), batch_first=True)
369
+ if self.abondon_cond:
370
+ x = [ix[:u_l, :] for ix, u_l in zip(x, uncondi_length)]
371
+ x = pad_sequence(x, batch_first=True)
372
+ if not x.shape[1] == pe.shape[2]:
373
+ pe = uncondi_pe
374
+ mask = mask_uncond
375
+ # print("double stream block", x.shape, pe.shape)
376
+ x = super().forward(x, vec, pe, mask, txt_length)
377
+ return x
378
+
379
+ class SingleStreamBlockC(SingleStreamBlock):
380
+ """
381
+ A SingleStreamBlock variant that can drop the padded conditional part of the
382
+ sequence (abondon_cond) and fall back to the unconditional positional encoding and mask.
383
+ """
384
+
385
+ def __init__(self, hidden_size: int,
386
+ num_heads: int,
387
+ mlp_ratio: float = 4.0,
388
+ qk_scale: float | None = None,
389
+ backend='pytorch',
390
+ abondon_cond = False):
391
+ super().__init__(hidden_size, num_heads, mlp_ratio,
392
+ qk_scale, backend)
393
+ self.abondon_cond = abondon_cond
394
+
395
+ def forward(self, x: Tensor, vec: Tensor, pe: Tensor, mask: Tensor = None,
396
+ uncondi_length = None, uncondi_pe = None, mask_uncond = None) -> Tensor:
397
+ if self.abondon_cond:
398
+ x = [ix[:u_l, :] for ix, u_l in zip(x, uncondi_length)]
399
+ x = pad_sequence(x, batch_first=True)
400
+ if not x.shape[1] == pe.shape[2]:
401
+ pe = uncondi_pe
402
+ mask = mask_uncond
403
+ # print("single stream block", x.shape, pe.shape)
404
+ x = super().forward(x, vec, pe, mask)
405
+ return x
406
+
407
+
408
+ class DoubleStreamBlockD(DoubleStreamBlock):
409
+ """
410
+ A DoubleStreamBlock variant with an additional, separately modulated stream
411
+ for edit tokens (edit_mod / edit_attn / edit_mlp).
412
+ """
413
+
414
+ def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float,
415
+ qkv_bias: bool = False, backend='pytorch'):
416
+ super().__init__(hidden_size, num_heads, mlp_ratio,
417
+ qkv_bias, backend)
418
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
419
+ self.edit_mod = Modulation(hidden_size, double=True)
420
+ self.edit_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
421
+ self.edit_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
422
+
423
+ self.edit_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
424
+ self.edit_mlp = nn.Sequential(
425
+ nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
426
+ nn.GELU(approximate="tanh"),
427
+ nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
428
+ )
429
+
430
+ def forward(self, x: Tensor, vec: Tensor,
431
+ pe: Tensor, mask: Tensor = None,
432
+ txt_length=None,
433
+ edit_length=None):
434
+ if edit_length is not None:
435
+ txt, edit, img = x[:, :txt_length], x[:, txt_length:txt_length + edit_length], x[:, txt_length + edit_length:]
436
+ else:
437
+ txt, img = x[:, :txt_length], x[:, txt_length:]
438
+ img_mod1, img_mod2 = self.img_mod(vec)
439
+ txt_mod1, txt_mod2 = self.txt_mod(vec)
440
+ # prepare image for attention
441
+ img_modulated = self.img_norm1(img)
442
+ img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
443
+ img_qkv = self.img_attn.qkv(img_modulated)
444
+ img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
445
+ img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
446
+ # prepare txt for attention
447
+ txt_modulated = self.txt_norm1(txt)
448
+ txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
449
+ txt_qkv = self.txt_attn.qkv(txt_modulated)
450
+ txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
451
+ txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
452
+
453
+ if edit_length is not None:
454
+ edit_mod1, edit_mod2 = self.edit_mod(vec)
455
+ # prepare edit for attention
456
+ edit_modulated = self.edit_norm1(edit)
457
+ edit_modulated = (1 + edit_mod1.scale) * edit_modulated + edit_mod1.shift
458
+ edit_qkv = self.edit_attn.qkv(edit_modulated)
459
+ edit_q, edit_k, edit_v = rearrange(edit_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
460
+ edit_q, edit_k = self.edit_attn.norm(edit_q, edit_k, edit_v)
461
+ else:
462
+ edit_q, edit_k, edit_v = None, None, None
463
+
464
+
465
+ # run actual attention
466
+ q = torch.cat((txt_q,) + ((edit_q,) if edit_q is not None else ()) + (img_q,), dim=2)
467
+ k = torch.cat((txt_k,) + ((edit_k,) if edit_k is not None else ()) + (img_k,), dim=2)
468
+ v = torch.cat((txt_v,) + ((edit_v,) if edit_v is not None else ()) + (img_v,), dim=2)
469
+ if mask is not None:
470
+ mask = repeat(mask, 'B L S-> B H L S', H=self.num_heads)
471
+ attn = attention(q, k, v, pe=pe, mask=mask, backend=self.backend)
472
+ if edit_length is not None:
473
+ txt_attn, edit_attn, img_attn = attn[:, : txt_length], attn[:, txt_length:txt_length + edit_length ], attn[:, txt_length + edit_length:]
474
+ else:
475
+ txt_attn, img_attn = attn[:, : txt_length], attn[:, txt_length:]
476
+
477
+ # calculate the img blocks
478
+ img = img + img_mod1.gate * self.img_attn.proj(img_attn)
479
+ img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)
480
+
481
+ # calculate the txt blocks
482
+ txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
483
+ txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
484
+
485
+ # calculate the edit blocks
486
+ if edit_length is not None:
487
+ edit = edit + edit_mod1.gate * self.edit_attn.proj(edit_attn)
488
+ edit = edit + edit_mod2.gate * self.edit_mlp((1 + edit_mod2.scale) * self.edit_norm2(edit) + edit_mod2.shift)
489
+ x = torch.cat((txt, edit, img), 1)
490
+ else:
491
+ x = torch.cat((txt, img), 1)
492
+ return x
493
+
494
+
495
+ class LastLayer(nn.Module):
496
+ def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
497
+ super().__init__()
498
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
499
+ self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
500
+ self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True))
501
+
502
+ def forward(self, x: Tensor, vec: Tensor) -> Tensor:
503
+ shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
504
+ x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
505
+ x = self.linear(x)
506
+ return x
507
+
508
+
509
+ if __name__ == '__main__':
510
+ pe = EmbedND(dim=64, theta=10000, axes_dim=[16, 56, 56])
511
+
512
+ ix_id = torch.zeros(64 // 2, 64 // 2, 3)
513
+ ix_id[..., 1] = ix_id[..., 1] + torch.arange(64 // 2)[:, None]
514
+ ix_id[..., 2] = ix_id[..., 2] + torch.arange(64 // 2)[None, :]
515
+ ix_id = rearrange(ix_id, "h w c -> 1 (h w) c")
516
+ pos = torch.cat([ix_id, ix_id], dim = 1)
517
+ a = pe(pos)
518
+
519
+ b = torch.cat([pe(ix_id), pe(ix_id)], dim = 2)
520
+
521
+ print(a - b)