chaojiemao committed
Commit 0d206f3 · 1 Parent(s): 342aa6a

modify ace flux

ace_flux_inference.py ADDED
@@ -0,0 +1,329 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ import math
4
+ import os
5
+ import random
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from PIL import Image
10
+ import torchvision.transforms as T
11
+ from scepter.modules.model.registry import DIFFUSIONS, BACKBONES
12
+ import torchvision.transforms.functional as TF
13
+ from scepter.modules.model.utils.basic_utils import check_list_of_list
14
+ from scepter.modules.model.utils.basic_utils import \
15
+ pack_imagelist_into_tensor_v2 as pack_imagelist_into_tensor
16
+ from scepter.modules.model.utils.basic_utils import (
17
+ to_device, unpack_tensor_into_imagelist)
18
+ from scepter.modules.utils.distribute import we
19
+ from scepter.modules.utils.file_system import FS
20
+ from scepter.modules.utils.logger import get_logger
21
+ from scepter.modules.inference.diffusion_inference import DiffusionInference, get_model
22
+
23
+ def process_edit_image(images,
24
+ masks,
25
+ tasks):
26
+
27
+ if not isinstance(images, list):
28
+ images = [images]
29
+ if not isinstance(masks, list):
30
+ masks = [masks]
31
+ if not isinstance(tasks, list):
32
+ tasks = [tasks]
33
+
34
+ img_tensors = []
35
+ mask_tensors = []
36
+ for img, mask, task in zip(images, masks, tasks):
37
+ if mask is None or mask == '':
38
+ mask = Image.new('L', img.size, 0)
39
+ img = TF.center_crop(img, [512, 512])
40
+ mask = TF.center_crop(mask, [512, 512])
41
+
42
+ mask = np.asarray(mask)
43
+ mask = np.where(mask > 128, 1, 0)
44
+ mask = mask.astype(
45
+ np.float32) if np.any(mask) else np.ones_like(mask).astype(
46
+ np.float32)
47
+
48
+ img_tensor = TF.to_tensor(img).to(we.device_id)
49
+ img_tensor = TF.normalize(img_tensor,
50
+ mean=[0.5, 0.5, 0.5],
51
+ std=[0.5, 0.5, 0.5])
52
+ mask_tensor = TF.to_tensor(mask).to(we.device_id)
53
+ if task in ['inpainting', 'Try On', 'Inpainting']:
54
+ mask_indicator = mask_tensor.repeat(3, 1, 1)
55
+ img_tensor[mask_indicator == 1] = -1.0
56
+ img_tensors.append(img_tensor)
57
+ mask_tensors.append(mask_tensor)
58
+ return img_tensors, mask_tensors
59
+
60
+ class FluxACEInference(DiffusionInference):
61
+
62
+ def __init__(self, logger=None):
63
+ if logger is None:
64
+ logger = get_logger(name='scepter')
65
+ self.logger = logger
66
+ self.loaded_model = {}
67
+ self.loaded_model_name = [
68
+ 'diffusion_model', 'first_stage_model', 'cond_stage_model', 'ref_cond_stage_model'
69
+ ]
70
+
71
+ def init_from_cfg(self, cfg):
72
+ self.name = cfg.NAME
73
+ self.is_default = cfg.get('IS_DEFAULT', False)
74
+ self.use_dynamic_model = cfg.get('USE_DYNAMIC_MODEL', True)
75
+ module_paras = self.load_default(cfg.get('DEFAULT_PARAS', None))
76
+ assert cfg.have('MODEL')
77
+ self.size_factor = cfg.get('SIZE_FACTOR', 8)
78
+ self.diffusion_model = self.infer_model(
79
+ cfg.MODEL.DIFFUSION_MODEL, module_paras.get(
80
+ 'DIFFUSION_MODEL',
81
+ None)) if cfg.MODEL.have('DIFFUSION_MODEL') else None
82
+ self.first_stage_model = self.infer_model(
83
+ cfg.MODEL.FIRST_STAGE_MODEL,
84
+ module_paras.get(
85
+ 'FIRST_STAGE_MODEL',
86
+ None)) if cfg.MODEL.have('FIRST_STAGE_MODEL') else None
87
+ self.cond_stage_model = self.infer_model(
88
+ cfg.MODEL.COND_STAGE_MODEL,
89
+ module_paras.get(
90
+ 'COND_STAGE_MODEL',
91
+ None)) if cfg.MODEL.have('COND_STAGE_MODEL') else None
92
+
93
+ self.ref_cond_stage_model = self.infer_model(
94
+ cfg.MODEL.REF_COND_STAGE_MODEL,
95
+ module_paras.get(
96
+ 'REF_COND_STAGE_MODEL',
97
+ None)) if cfg.MODEL.have('REF_COND_STAGE_MODEL') else None
98
+
99
+ self.diffusion = DIFFUSIONS.build(cfg.MODEL.DIFFUSION,
100
+ logger=self.logger)
101
+ self.interpolate_func = lambda x: (F.interpolate(
102
+ x.unsqueeze(0),
103
+ scale_factor=1 / self.size_factor,
104
+ mode='nearest-exact') if x is not None else None)
105
+
106
+ self.max_seq_length = cfg.get("MAX_SEQ_LENGTH", 4096)
107
+ if not self.use_dynamic_model:
108
+ self.dynamic_load(self.first_stage_model, 'first_stage_model')
109
+ self.dynamic_load(self.cond_stage_model, 'cond_stage_model')
110
+ if self.ref_cond_stage_model is not None: self.dynamic_load(self.ref_cond_stage_model, 'ref_cond_stage_model')
111
+ with torch.device("meta"):
112
+ pretrained_model = self.diffusion_model['cfg'].PRETRAINED_MODEL
113
+ self.diffusion_model['cfg'].PRETRAINED_MODEL = None
114
+ diffusers_lora = self.diffusion_model['cfg'].get("DIFFUSERS_LORA_MODEL", None)
115
+ self.diffusion_model['cfg'].DIFFUSERS_LORA_MODEL = None
116
+ swift_lora = self.diffusion_model['cfg'].get("SWIFT_LORA_MODEL", None)
117
+ self.diffusion_model['cfg'].SWIFT_LORA_MODEL = None
118
+ pretrain_adapter = self.diffusion_model['cfg'].get("PRETRAIN_ADAPTER", None)
119
+ self.diffusion_model['cfg'].PRETRAIN_ADAPTER = None
120
+ blackforest_lora = self.diffusion_model['cfg'].get("BLACKFOREST_LORA_MODEL", None)
121
+ self.diffusion_model['cfg'].BLACKFOREST_LORA_MODEL = None
122
+ self.diffusion_model['model'] = BACKBONES.build(self.diffusion_model['cfg'], logger=self.logger).eval()
123
+ # self.dynamic_load(self.diffusion_model, 'diffusion_model')
124
+ self.diffusion_model['model'].lora_model = diffusers_lora
125
+ self.diffusion_model['model'].swift_lora_model = swift_lora
126
+ self.diffusion_model['model'].pretrain_adapter = pretrain_adapter
127
+ self.diffusion_model['model'].blackforest_lora_model = blackforest_lora
128
+ self.diffusion_model['model'].load_pretrained_model(pretrained_model)
129
+ self.diffusion_model['device'] = we.device_id
130
+
131
+ def upscale_resize(self, image, interpolation=T.InterpolationMode.BILINEAR):
132
+ c, H, W = image.shape
133
+ scale = max(1.0, math.sqrt(self.max_seq_length / ((H / 16) * (W / 16))))
134
+ rH = int(H * scale) // 16 * 16  # ensure divisible by 16
135
+ rW = int(W * scale) // 16 * 16
136
+ image = T.Resize((rH, rW), interpolation=interpolation, antialias=True)(image)
137
+ return image
138
+
139
+
140
+ @torch.no_grad()
141
+ def encode_first_stage(self, x, **kwargs):
142
+ _, dtype = self.get_function_info(self.first_stage_model, 'encode')
143
+ with torch.autocast('cuda',
144
+ enabled=dtype in ('float16', 'bfloat16'),
145
+ dtype=getattr(torch, dtype)):
146
+ def run_one_image(u):
147
+ zu = get_model(self.first_stage_model).encode(u)
148
+ if isinstance(zu, (tuple, list)):
149
+ zu = zu[0]
150
+ return zu
151
+
152
+ z = [run_one_image(u.unsqueeze(0) if u.dim() == 3 else u) for u in x]
153
+ return z
154
+
155
+
156
+ @torch.no_grad()
157
+ def decode_first_stage(self, z):
158
+ _, dtype = self.get_function_info(self.first_stage_model, 'decode')
159
+ with torch.autocast('cuda',
160
+ enabled=dtype in ('float16', 'bfloat16'),
161
+ dtype=getattr(torch, dtype)):
162
+ return [get_model(self.first_stage_model).decode(zu) for zu in z]
163
+
164
+ def noise_sample(self, num_samples, h, w, seed, device = None, dtype = torch.bfloat16):
165
+ noise = torch.randn(
166
+ num_samples,
167
+ 16,
168
+ # allow for packing
169
+ 2 * math.ceil(h / 16),
170
+ 2 * math.ceil(w / 16),
171
+ device="cpu",
172
+ dtype=dtype,
173
+ generator=torch.Generator().manual_seed(seed),
174
+ ).to(device)
175
+ return noise
176
+
177
+ @torch.no_grad()
178
+ def __call__(self,
179
+ image=None,
180
+ mask=None,
181
+ prompt='',
182
+ task=None,
183
+ negative_prompt='',
184
+ output_height=1024,
185
+ output_width=1024,
186
+ sampler='flow_euler',
187
+ sample_steps=20,
188
+ guide_scale=3.5,
189
+ seed=-1,
190
+ history_io=None,
191
+ tar_index=0,
192
+ # align=0,
193
+ **kwargs):
194
+ input_image, input_mask = image, mask
195
+ seed = seed if seed >= 0 else random.randint(0, 2**32 - 1)
196
+ if input_image is not None:
197
+ # assert isinstance(input_image, list) and isinstance(input_mask, list)
198
+ if task is None:
199
+ task = [''] * len(input_image)
200
+ if not isinstance(prompt, list):
201
+ prompt = [prompt] * len(input_image)
202
+ prompt = [
203
+ pp.replace('{image}', f'{{image{i}}}') if i > 0 else pp
204
+ for i, pp in enumerate(prompt)
205
+ ]
206
+ edit_image, edit_image_mask = process_edit_image(
207
+ input_image, input_mask, task)
208
+ image = torch.zeros(
209
+ size=[3, int(output_height),
210
+ int(output_width)])
211
+ image_mask = torch.ones(
212
+ size=[1, int(output_height),
213
+ int(output_width)])
214
+ edit_image, edit_image_mask = [edit_image], [edit_image_mask]
215
+ else:
216
+ edit_image = edit_image_mask = [[]]
217
+ image = torch.zeros(
218
+ size=[3, int(output_height),
219
+ int(output_width)])
220
+ image_mask = torch.ones(
221
+ size=[1, int(output_height),
222
+ int(output_width)])
223
+ if not isinstance(prompt, list):
224
+ prompt = [prompt]
225
+ align = 0
226
+ image, image_mask, prompt = [image], [image_mask], [prompt]
227
+ align = [align for p in prompt] if isinstance(align, int) else align
228
+
229
+ assert check_list_of_list(prompt) and check_list_of_list(
230
+ edit_image) and check_list_of_list(edit_image_mask)
231
+ # negative prompt is not used
232
+ image = to_device(image)
233
+ ctx = {}
234
+ # Get Noise Shape
235
+ self.dynamic_load(self.first_stage_model, 'first_stage_model')
236
+ x = self.encode_first_stage(image)
237
+ self.dynamic_unload(self.first_stage_model,
238
+ 'first_stage_model',
239
+ skip_loaded=not self.use_dynamic_model)
240
+
241
+ g = torch.Generator(device=we.device_id).manual_seed(seed)
242
+ noise = [
243
+ torch.randn((1, 16, i.shape[2], i.shape[3]), device=we.device_id, dtype=torch.bfloat16).normal_(generator=g)
244
+ for i in x
245
+ ]
246
+ # import pdb;pdb.set_trace()
247
+ noise, x_shapes = pack_imagelist_into_tensor(noise)
248
+ ctx['x_shapes'] = x_shapes
249
+ ctx['align'] = align
250
+
251
+ image_mask = to_device(image_mask, strict=False)
252
+ cond_mask = [self.interpolate_func(i) for i in image_mask
253
+ ] if image_mask is not None else [None] * len(image)
254
+ ctx['x_mask'] = cond_mask
255
+ # Encode Prompt
256
+ instruction_prompt = [[pp[-1]] if "{image}" in pp[-1] else ["{image} " + pp[-1]] for pp in prompt]
257
+ self.dynamic_load(self.cond_stage_model, 'cond_stage_model')
258
+ function_name, dtype = self.get_function_info(self.cond_stage_model)
259
+ cont = getattr(get_model(self.cond_stage_model), function_name)(instruction_prompt)
260
+ cont["context"] = [ct[-1] for ct in cont["context"]]
261
+ cont["y"] = [ct[-1] for ct in cont["y"]]
262
+ self.dynamic_unload(self.cond_stage_model,
263
+ 'cond_stage_model',
264
+ skip_loaded=not self.use_dynamic_model)
265
+ ctx.update(cont)
266
+
267
+ # Encode Edit Images
268
+ self.dynamic_load(self.first_stage_model, 'first_stage_model')
269
+ edit_image = [to_device(i, strict=False) for i in edit_image]
270
+ edit_image_mask = [to_device(i, strict=False) for i in edit_image_mask]
271
+ e_img, e_mask = [], []
272
+ for u, m in zip(edit_image, edit_image_mask):
273
+ if u is None:
274
+ continue
275
+ if m is None:
276
+ m = [None] * len(u)
277
+ e_img.append(self.encode_first_stage(u, **kwargs))
278
+ e_mask.append([self.interpolate_func(i) for i in m])
279
+ self.dynamic_unload(self.first_stage_model,
280
+ 'first_stage_model',
281
+ skip_loaded=not self.use_dynamic_model)
282
+ ctx['edit'] = e_img
283
+ ctx['edit_mask'] = e_mask
284
+ # Encode Ref Images
285
+ if guide_scale is not None:
286
+ guide_scale = torch.full((noise.shape[0],), guide_scale, device=noise.device, dtype=noise.dtype)
287
+ else:
288
+ guide_scale = None
289
+
290
+ # Diffusion Process
291
+ self.dynamic_load(self.diffusion_model, 'diffusion_model')
292
+ function_name, dtype = self.get_function_info(self.diffusion_model)
293
+ with torch.autocast('cuda',
294
+ enabled=dtype in ('float16', 'bfloat16'),
295
+ dtype=getattr(torch, dtype)):
296
+ latent = self.diffusion.sample(
297
+ noise=noise,
298
+ sampler=sampler,
299
+ model=get_model(self.diffusion_model),
300
+ model_kwargs={
301
+ "cond": ctx, "guidance": guide_scale, "gc_seg": -1
302
+ },
303
+ steps=sample_steps,
304
+ show_progress=True,
305
+ guide_scale=guide_scale,
306
+ return_intermediate=None,
307
+ reverse_scale=-1,
308
+ **kwargs).float()
309
+ if self.use_dynamic_model: self.dynamic_unload(self.diffusion_model,
310
+ 'diffusion_model',
311
+ skip_loaded=not self.use_dynamic_model)
312
+
313
+ # Decode to Pixel Space
314
+ self.dynamic_load(self.first_stage_model, 'first_stage_model')
315
+ samples = unpack_tensor_into_imagelist(latent, x_shapes)
316
+ x_samples = self.decode_first_stage(samples)
317
+ self.dynamic_unload(self.first_stage_model,
318
+ 'first_stage_model',
319
+ skip_loaded=not self.use_dynamic_model)
320
+ x_samples = [x.squeeze(0) for x in x_samples]
321
+
322
+ imgs = [
323
+ torch.clamp((x_i.float() + 1.0) / 2.0,
324
+ min=0.0,
325
+ max=1.0).squeeze(0).permute(1, 2, 0).cpu().numpy()
326
+ for x_i in x_samples
327
+ ]
328
+ imgs = [Image.fromarray((img * 255).astype(np.uint8)) for img in imgs]
329
+ return imgs
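
For orientation, here is a minimal usage sketch of the new FluxACEInference class. It is not part of the commit: it assumes scepter's Config loader can read the YAML below via Config(cfg_file=...), that the checkpoints referenced in the config are reachable, and that a CUDA device is available; the image path and prompt are illustrative.

    from PIL import Image
    from scepter.modules.utils.config import Config   # assumption: standard scepter Config loader

    from ace_flux_inference import FluxACEInference

    cfg = Config(cfg_file='config/models/ace_flux_dev.yaml')   # hypothetical config load
    pipe = FluxACEInference()
    pipe.init_from_cfg(cfg)

    src = Image.open('input.jpg').convert('RGB')               # illustrative input image
    images = pipe(image=[src],
                  mask=[None],                                 # empty mask -> whole image is editable
                  prompt=['{image} make the sky look stormy'],
                  task=[''],
                  sample_steps=20,
                  guide_scale=3.5,
                  seed=2024)
    images[0].save('output.png')                               # __call__ returns a list of PIL images
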
config/models/ace_flux_dev.yaml CHANGED
@@ -15,7 +15,7 @@ DEFAULT_PARAS:
  OUTPUT_HEIGHT: 1024
  OUTPUT_WIDTH: 1024
  SAMPLER: flow_euler
- SAMPLE_STEPS: 28
+ SAMPLE_STEPS: 20
  GUIDE_SCALE: 3.5
  SEED: -1
  TAR_INDEX: 0
@@ -44,24 +44,17 @@ DEFAULT_PARAS:
  INPUT: [ "SAMPLE_STEPS", "SAMPLE", "GUIDE_SCALE" ]
  COND_STAGE_MODEL:
  FUNCTION:
- - NAME: encode_list_of_list
+ - NAME: encode_list
  DTYPE: bfloat16
  INPUT: [ "PROMPT" ]
- REF_COND_STAGE_MODEL:
- FUNCTION:
- - NAME: encode_list_of_list
- DTYPE: bfloat16
- INPUT: [ "IMAGE" ]
-
  #
  MODEL:
- NAME: LatentDiffusionFluxEdit
+ NAME: LatentDiffusionACEFlux
  PARAMETERIZATION: rf
  PRETRAINED_MODEL:
  IGNORE_KEYS: [ ]
  SIZE_FACTOR: 8
  TEXT_IDENTIFIER: [ '{image}', '{image1}', '{image2}', '{image3}', '{image4}', '{image5}', '{image6}', '{image7}', '{image8}', '{image9}' ]
- IMAGE_TOKEN: '<img>'
  USE_TEXT_POS_EMBEDDINGS: True
  DIFFUSION:
  # NAME DESCRIPTION: TYPE: default: 'DiffusionFluxRF'
@@ -69,30 +62,21 @@ MODEL:
  PREDICTION_TYPE: raw
  # NOISE_SCHEDULER DESCRIPTION: TYPE: default: ''
  NOISE_SCHEDULER:
- # NAME DESCRIPTION: TYPE: default: 'FlowMatchSigmaScheduler'
- NAME: FlowMatchFluxShiftScheduler
- # SHIFT DESCRIPTION: Use timestamp shift or not, default is True. TYPE: bool default: True
- SHIFT: True
- # SIGMOID_SCALE DESCRIPTION: The scale of sigmoid function for sampling timesteps. TYPE: int default: 1
- SIGMOID_SCALE: 1
- # BASE_SHIFT DESCRIPTION: The base shift factor for the timestamp. TYPE: float default: 0.5
- BASE_SHIFT: 0.5
- # MAX_SHIFT DESCRIPTION: The max shift factor for the timestamp. TYPE: float default: 1.15
- MAX_SHIFT: 1.15
+ NAME: FlowMatchFluxShiftScheduler
+ SHIFT: True
+ SIGMOID_SCALE: 1
+ BASE_SHIFT: 0.5
+ MAX_SHIFT: 1.15
  #
  DIFFUSION_MODEL:
  # NAME DESCRIPTION: TYPE: default: 'Flux'
- NAME: FluxEdit
- PRETRAINED_MODEL: hf://scepter-studio/ACE-FLUX.1-dev@ace_flux.1_dev_preview.pth
- DIFFUSERS_LORA_MODEL:
- PRETRAIN_ADAPTER:
+ NAME: ACEFlux
+ PRETRAINED_MODEL: ms://AI-ModelScope/FLUX.1-dev@flux1-dev.safetensors
+ SWIFT_LORA_MODEL: ["ms://iic/ACE-FLUX.1-dev@ace_flux.1_dev_lora.bin"]
  # IN_CHANNELS DESCRIPTION: model's input channels. TYPE: int default: 64
  IN_CHANNELS: 64
- # OUT_CHANNELS DESCRIPTION: model's input channels. TYPE: int default: 64
- OUT_CHANNELS: 64
  # HIDDEN_SIZE DESCRIPTION: model's hidden size. TYPE: int default: 1024
  HIDDEN_SIZE: 3072
- REDUX_DIM: 1152
  # NUM_HEADS DESCRIPTION: number of heads in the transformer. TYPE: int default: 16
  NUM_HEADS: 24
  # AXES_DIM DESCRIPTION: dimensions of the axes of the positional encoding. TYPE: list default: [16, 56, 56]
@@ -113,12 +97,13 @@ MODEL:
  DEPTH: 19
  # DEPTH_SINGLE_BLOCKS DESCRIPTION: number of transformer blocks in the single stream block. TYPE: int default: 38
  DEPTH_SINGLE_BLOCKS: 38
- ATTN_BACKEND: flash_attn
+ ATTN_BACKEND: pytorch
+
  #
  FIRST_STAGE_MODEL:
  NAME: AutoencoderKLFlux
  EMBED_DIM: 16
- PRETRAINED_MODEL: hf://black-forest-labs/FLUX.1-dev@ae.safetensors
+ PRETRAINED_MODEL: ms://AI-ModelScope/FLUX.1-dev@ae.safetensors
  IGNORE_KEYS: [ ]
  BATCH_SIZE: 8
  USE_CONV: False
@@ -164,11 +149,11 @@ MODEL:
  # HF_MODEL_CLS DESCRIPTION: huggingface cls in transformers TYPE: NoneType default: None
  HF_MODEL_CLS: T5EncoderModel
  # MODEL_PATH DESCRIPTION: model folder path TYPE: NoneType default: None
- MODEL_PATH: hf://black-forest-labs/FLUX.1-dev@text_encoder_2/
+ MODEL_PATH: ms://AI-ModelScope/FLUX.1-dev@text_encoder_2/
  # HF_TOKENIZER_CLS DESCRIPTION: huggingface cls in transformers TYPE: NoneType default: None
  HF_TOKENIZER_CLS: T5Tokenizer
  # TOKENIZER_PATH DESCRIPTION: tokenizer folder path TYPE: NoneType default: None
- TOKENIZER_PATH: hf://black-forest-labs/FLUX.1-dev@tokenizer_2/
+ TOKENIZER_PATH: ms://AI-ModelScope/FLUX.1-dev@tokenizer_2/
  ADDED_IDENTIFIER: [ '<img>','{image}', '{caption}', '{mask}', '{ref_image}', '{image1}', '{image2}', '{image3}', '{image4}', '{image5}', '{image6}', '{image7}', '{image8}', '{image9}' ]
  # MAX_LENGTH DESCRIPTION: max length of input TYPE: int default: 77
  MAX_LENGTH: 512
@@ -186,11 +171,11 @@ MODEL:
  # HF_MODEL_CLS DESCRIPTION: huggingface cls in transformers TYPE: NoneType default: None
  HF_MODEL_CLS: CLIPTextModel
  # MODEL_PATH DESCRIPTION: model folder path TYPE: NoneType default: None
- MODEL_PATH: hf://black-forest-labs/FLUX.1-dev@text_encoder/
+ MODEL_PATH: ms://AI-ModelScope/FLUX.1-dev@text_encoder/
  # HF_TOKENIZER_CLS DESCRIPTION: huggingface cls in transformers TYPE: NoneType default: None
  HF_TOKENIZER_CLS: CLIPTokenizer
  # TOKENIZER_PATH DESCRIPTION: tokenizer folder path TYPE: NoneType default: None
- TOKENIZER_PATH: hf://black-forest-labs/FLUX.1-dev@tokenizer/
+ TOKENIZER_PATH: ms://AI-ModelScope/FLUX.1-dev@tokenizer/
  # MAX_LENGTH DESCRIPTION: max length of input TYPE: int default: 77
  MAX_LENGTH: 77
  # OUTPUT_KEY DESCRIPTION: output key TYPE: str default: 'last_hidden_state'
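
The main behavioral change in this config is that the diffusion model is now assembled from the base FLUX.1-dev weights plus a SWIFT LoRA (SWIFT_LORA_MODEL) instead of a single pre-merged ACE checkpoint. The sketch below, with illustrative shapes, shows the weight arithmetic that merge_swift_lora in models/flux.py (further down in this commit) performs at load time; it is not part of the commit.

    import torch

    def merge_lora_weight(base, lora_A, lora_B, scale=1.0):
        # Same arithmetic as merge_swift_lora: fold the low-rank update
        # delta = B @ A (computed here via the permuted matmul used in flux.py)
        # into the base weight, so inference runs on a single merged state dict.
        delta = torch.matmul(lora_A.permute(1, 0), lora_B.permute(1, 0)).permute(1, 0)
        return base + scale * delta

    out_f, in_f, rank = 3072, 3072, 16          # illustrative FLUX-like shapes
    base = torch.zeros(out_f, in_f)
    lora_A = torch.randn(rank, in_f)            # lora_A: [rank, in_features]
    lora_B = torch.randn(out_f, rank)           # lora_B: [out_features, rank]
    merged = merge_lora_weight(base, lora_A, lora_B)
    print(merged.shape)                         # torch.Size([3072, 3072])
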
models/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .flux import Flux, ACEFlux
2
+ from .embedder import ACETextEmbedder, T5ACEPlusClipFluxEmbedder, ACEHFEmbedder
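
A note on why this two-line file matters: importing the package runs the @BACKBONES.register_class() and @EMBEDDERS.register_class() decorators in flux.py and embedder.py, which is what lets the NAME fields in ace_flux_dev.yaml (ACEFlux, T5ACEPlusClipFluxEmbedder, ACEHFEmbedder) be resolved by the scepter registries. The snippet below is only an illustration of that side effect and assumes the repository root is on sys.path.

    # Illustrative only: the import's side effect is class registration, after which
    # BACKBONES.build / EMBEDDERS.build can construct the classes named in the YAML.
    import models  # noqa: F401

    from scepter.modules.model.registry import BACKBONES, EMBEDDERS
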
models/embedder.py ADDED
@@ -0,0 +1,383 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ import warnings
4
+ from contextlib import nullcontext
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+ import torch.utils.dlpack
9
+ import transformers
10
+ from scepter.modules.model.embedder.base_embedder import BaseEmbedder
11
+ from scepter.modules.model.registry import EMBEDDERS
12
+ from scepter.modules.model.tokenizer.tokenizer_component import (
13
+ basic_clean, canonicalize, heavy_clean, whitespace_clean)
14
+ from scepter.modules.utils.config import dict_to_yaml
15
+ from scepter.modules.utils.distribute import we
16
+ from scepter.modules.utils.file_system import FS
17
+
18
+ try:
19
+ from transformers import AutoTokenizer, T5EncoderModel
20
+ except Exception as e:
21
+ warnings.warn(
22
+ f'Failed to import transformers, please resolve this before running: {e}')
23
+
24
+
25
+ @EMBEDDERS.register_class()
26
+ class ACETextEmbedder(BaseEmbedder):
27
+ """
28
+ Uses a T5 transformer encoder for text.
29
+ """
33
+ para_dict = {
34
+ 'PRETRAINED_MODEL': {
35
+ 'value':
36
+ 'google/umt5-small',
37
+ 'description':
38
+ 'Pretrained Model for umt5, modelcard path or local path.'
39
+ },
40
+ 'TOKENIZER_PATH': {
41
+ 'value': 'google/umt5-small',
42
+ 'description':
43
+ 'Tokenizer Path for umt5, modelcard path or local path.'
44
+ },
45
+ 'FREEZE': {
46
+ 'value': True,
47
+ 'description': ''
48
+ },
49
+ 'USE_GRAD': {
50
+ 'value': False,
51
+ 'description': 'Compute grad or not.'
52
+ },
53
+ 'CLEAN': {
54
+ 'value':
55
+ 'whitespace',
56
+ 'description':
57
+ 'Set the clean strategy for the tokenizer, used when TOKENIZER_PATH is not None.'
58
+ },
59
+ 'LAYER': {
60
+ 'value': 'last',
61
+ 'description': ''
62
+ },
63
+ 'LEGACY': {
64
+ 'value':
65
+ True,
66
+ 'description':
67
+ 'Whether to use the legacy returned feature or not, default True.'
68
+ }
69
+ }
70
+
71
+ def __init__(self, cfg, logger=None):
72
+ super().__init__(cfg, logger=logger)
73
+ pretrained_path = cfg.get('PRETRAINED_MODEL', None)
74
+ self.t5_dtype = cfg.get('T5_DTYPE', 'float32')
75
+ assert pretrained_path
76
+ with FS.get_dir_to_local_dir(pretrained_path,
77
+ wait_finish=True) as local_path:
78
+ self.model = T5EncoderModel.from_pretrained(
79
+ local_path,
80
+ torch_dtype=getattr(
81
+ torch,
82
+ 'float' if self.t5_dtype == 'float32' else self.t5_dtype))
83
+ tokenizer_path = cfg.get('TOKENIZER_PATH', None)
84
+ self.length = cfg.get('LENGTH', 77)
85
+
86
+ self.use_grad = cfg.get('USE_GRAD', False)
87
+ self.clean = cfg.get('CLEAN', 'whitespace')
88
+ self.added_identifier = cfg.get('ADDED_IDENTIFIER', None)
89
+ if tokenizer_path:
90
+ self.tokenize_kargs = {'return_tensors': 'pt'}
91
+ with FS.get_dir_to_local_dir(tokenizer_path,
92
+ wait_finish=True) as local_path:
93
+ if self.added_identifier is not None and isinstance(
94
+ self.added_identifier, list):
95
+ self.tokenizer = AutoTokenizer.from_pretrained(local_path, additional_special_tokens=self.added_identifier)
96
+ else:
97
+ self.tokenizer = AutoTokenizer.from_pretrained(local_path)
98
+ if self.length is not None:
99
+ self.tokenize_kargs.update({
100
+ 'padding': 'max_length',
101
+ 'truncation': True,
102
+ 'max_length': self.length
103
+ })
104
+ self.eos_token = self.tokenizer(
105
+ self.tokenizer.eos_token)['input_ids'][0]
106
+ else:
107
+ self.tokenizer = None
108
+ self.tokenize_kargs = {}
109
+
110
+ self.use_grad = cfg.get('USE_GRAD', False)
111
+ self.clean = cfg.get('CLEAN', 'whitespace')
112
+
113
+ def freeze(self):
114
+ self.model = self.model.eval()
115
+ for param in self.parameters():
116
+ param.requires_grad = False
117
+
118
+ # encode && encode_text
119
+ def forward(self, tokens, return_mask=False, use_mask=True):
120
+ # tokenization
121
+ embedding_context = nullcontext if self.use_grad else torch.no_grad
122
+ with embedding_context():
123
+ if use_mask:
124
+ x = self.model(tokens.input_ids.to(we.device_id),
125
+ tokens.attention_mask.to(we.device_id))
126
+ else:
127
+ x = self.model(tokens.input_ids.to(we.device_id))
128
+ x = x.last_hidden_state
129
+
130
+ if return_mask:
131
+ return x.detach() + 0.0, tokens.attention_mask.to(we.device_id)
132
+ else:
133
+ return x.detach() + 0.0, None
134
+
135
+ def _clean(self, text):
136
+ if self.clean == 'whitespace':
137
+ text = whitespace_clean(basic_clean(text))
138
+ elif self.clean == 'lower':
139
+ text = whitespace_clean(basic_clean(text)).lower()
140
+ elif self.clean == 'canonicalize':
141
+ text = canonicalize(basic_clean(text))
142
+ elif self.clean == 'heavy':
143
+ text = heavy_clean(basic_clean(text))
144
+ return text
145
+
146
+ def encode(self, text, return_mask=False, use_mask=True):
147
+ if isinstance(text, str):
148
+ text = [text]
149
+ if self.clean:
150
+ text = [self._clean(u) for u in text]
151
+ assert self.tokenizer is not None
152
+ cont, mask = [], []
153
+ with torch.autocast(device_type='cuda',
154
+ enabled=self.t5_dtype in ('float16', 'bfloat16'),
155
+ dtype=getattr(torch, self.t5_dtype)):
156
+ for tt in text:
157
+ tokens = self.tokenizer([tt], **self.tokenize_kargs)
158
+ one_cont, one_mask = self(tokens,
159
+ return_mask=return_mask,
160
+ use_mask=use_mask)
161
+ cont.append(one_cont)
162
+ mask.append(one_mask)
163
+ if return_mask:
164
+ return torch.cat(cont, dim=0), torch.cat(mask, dim=0)
165
+ else:
166
+ return torch.cat(cont, dim=0)
167
+
168
+ def encode_list(self, text_list, return_mask=True):
169
+ cont_list = []
170
+ mask_list = []
171
+ for pp in text_list:
172
+ cont, cont_mask = self.encode(pp, return_mask=return_mask)
173
+ cont_list.append(cont)
174
+ mask_list.append(cont_mask)
175
+ if return_mask:
176
+ return cont_list, mask_list
177
+ else:
178
+ return cont_list
179
+
180
+ @staticmethod
181
+ def get_config_template():
182
+ return dict_to_yaml('MODELS',
183
+ __class__.__name__,
184
+ ACETextEmbedder.para_dict,
185
+ set_name=True)
186
+
187
+ @EMBEDDERS.register_class()
188
+ class ACEHFEmbedder(BaseEmbedder):
189
+ para_dict = {
190
+ "HF_MODEL_CLS": {
191
+ "value": None,
192
+ "description": "huggingface cls in transfomer"
193
+ },
194
+ "MODEL_PATH": {
195
+ "value": None,
196
+ "description": "model folder path"
197
+ },
198
+ "HF_TOKENIZER_CLS": {
199
+ "value": None,
200
+ "description": "huggingface cls in transfomer"
201
+ },
202
+
203
+ "TOKENIZER_PATH": {
204
+ "value": None,
205
+ "description": "tokenizer folder path"
206
+ },
207
+ "MAX_LENGTH": {
208
+ "value": 77,
209
+ "description": "max length of input"
210
+ },
211
+ "OUTPUT_KEY": {
212
+ "value": "last_hidden_state",
213
+ "description": "output key"
214
+ },
215
+ "D_TYPE": {
216
+ "value": "float",
217
+ "description": "dtype"
218
+ },
219
+ "BATCH_INFER": {
220
+ "value": False,
221
+ "description": "batch infer"
222
+ }
223
+ }
224
+ para_dict.update(BaseEmbedder.para_dict)
225
+ def __init__(self, cfg, logger=None):
226
+ super().__init__(cfg, logger=logger)
227
+ hf_model_cls = cfg.get('HF_MODEL_CLS', None)
228
+ model_path = cfg.get("MODEL_PATH", None)
229
+ hf_tokenizer_cls = cfg.get('HF_TOKENIZER_CLS', None)
230
+ tokenizer_path = cfg.get('TOKENIZER_PATH', None)
231
+ self.max_length = cfg.get('MAX_LENGTH', 77)
232
+ self.output_key = cfg.get("OUTPUT_KEY", "last_hidden_state")
233
+ self.d_type = cfg.get("D_TYPE", "float")
234
+ self.clean = cfg.get("CLEAN", "whitespace")
235
+ self.batch_infer = cfg.get("BATCH_INFER", False)
236
+ self.added_identifier = cfg.get('ADDED_IDENTIFIER', None)
237
+ torch_dtype = getattr(torch, self.d_type)
238
+
239
+ assert hf_model_cls is not None and hf_tokenizer_cls is not None
240
+ assert model_path is not None and tokenizer_path is not None
241
+ with FS.get_dir_to_local_dir(tokenizer_path, wait_finish=True) as local_path:
242
+ self.tokenizer = getattr(transformers, hf_tokenizer_cls).from_pretrained(local_path,
243
+ max_length = self.max_length,
244
+ torch_dtype = torch_dtype,
245
+ additional_special_tokens=self.added_identifier)
246
+
247
+ with FS.get_dir_to_local_dir(model_path, wait_finish=True) as local_path:
248
+ self.hf_module = getattr(transformers, hf_model_cls).from_pretrained(local_path, torch_dtype = torch_dtype)
249
+
250
+
251
+ self.hf_module = self.hf_module.eval().requires_grad_(False)
252
+
253
+ def forward(self, text: list[str], return_mask = False):
254
+ batch_encoding = self.tokenizer(
255
+ text,
256
+ truncation=True,
257
+ max_length=self.max_length,
258
+ return_length=False,
259
+ return_overflowing_tokens=False,
260
+ padding="max_length",
261
+ return_tensors="pt",
262
+ )
263
+
264
+ outputs = self.hf_module(
265
+ input_ids=batch_encoding["input_ids"].to(self.hf_module.device),
266
+ attention_mask=None,
267
+ output_hidden_states=False,
268
+ )
269
+ if return_mask:
270
+ return outputs[self.output_key], batch_encoding['attention_mask'].to(self.hf_module.device)
271
+ else:
272
+ return outputs[self.output_key], None
273
+
274
+ def encode(self, text, return_mask = False):
275
+ if isinstance(text, str):
276
+ text = [text]
277
+ if self.clean:
278
+ text = [self._clean(u) for u in text]
279
+ if not self.batch_infer:
280
+ cont, mask = [], []
281
+ for tt in text:
282
+ one_cont, one_mask = self([tt], return_mask=return_mask)
283
+ cont.append(one_cont)
284
+ mask.append(one_mask)
285
+ if return_mask:
286
+ return torch.cat(cont, dim=0), torch.cat(mask, dim=0)
287
+ else:
288
+ return torch.cat(cont, dim=0)
289
+ else:
290
+ ret_data = self(text, return_mask = return_mask)
291
+ if return_mask:
292
+ return ret_data
293
+ else:
294
+ return ret_data[0]
295
+
296
+ def encode_list(self, text_list, return_mask=True):
297
+ cont_list = []
298
+ mask_list = []
299
+ for pp in text_list:
300
+ cont = self.encode(pp, return_mask=return_mask)
301
+ cont_list.append(cont[0]) if return_mask else cont_list.append(cont)
302
+ mask_list.append(cont[1]) if return_mask else mask_list.append(None)
303
+ if return_mask:
304
+ return cont_list, mask_list
305
+ else:
306
+ return cont_list
307
+
308
+ def encode_list_of_list(self, text_list, return_mask=True):
309
+ cont_list = []
310
+ mask_list = []
311
+ for pp in text_list:
312
+ cont = self.encode_list(pp, return_mask=return_mask)
313
+ cont_list.append(cont[0]) if return_mask else cont_list.append(cont)
314
+ mask_list.append(cont[1]) if return_mask else mask_list.append(None)
315
+ if return_mask:
316
+ return cont_list, mask_list
317
+ else:
318
+ return cont_list
319
+
320
+ def _clean(self, text):
321
+ if self.clean == 'whitespace':
322
+ text = whitespace_clean(basic_clean(text))
323
+ elif self.clean == 'lower':
324
+ text = whitespace_clean(basic_clean(text)).lower()
325
+ elif self.clean == 'canonicalize':
326
+ text = canonicalize(basic_clean(text))
327
+ return text
328
+ @staticmethod
329
+ def get_config_template():
330
+ return dict_to_yaml('EMBEDDER',
331
+ __class__.__name__,
332
+ ACEHFEmbedder.para_dict,
333
+ set_name=True)
334
+
335
+ @EMBEDDERS.register_class()
336
+ class T5ACEPlusClipFluxEmbedder(BaseEmbedder):
337
+ """
338
+ Combines a T5 text encoder and a CLIP text encoder for FLUX conditioning.
339
+ """
340
+ para_dict = {
341
+ 'T5_MODEL': {},
342
+ 'CLIP_MODEL': {}
343
+ }
344
+
345
+ def __init__(self, cfg, logger=None):
346
+ super().__init__(cfg, logger=logger)
347
+ self.t5_model = EMBEDDERS.build(cfg.T5_MODEL, logger=logger)
348
+ self.clip_model = EMBEDDERS.build(cfg.CLIP_MODEL, logger=logger)
349
+
350
+ def encode(self, text, return_mask = False):
351
+ t5_embeds = self.t5_model.encode(text, return_mask = return_mask)
352
+ clip_embeds = self.clip_model.encode(text, return_mask = return_mask)
353
+ # change embedding strategy here
354
+ return {
355
+ 'context': t5_embeds,
356
+ 'y': clip_embeds,
357
+ }
358
+
359
+ def encode_list(self, text, return_mask = False):
360
+ t5_embeds = self.t5_model.encode_list(text, return_mask = return_mask)
361
+ clip_embeds = self.clip_model.encode_list(text, return_mask = return_mask)
362
+ # change embedding strategy here
363
+ return {
364
+ 'context': t5_embeds,
365
+ 'y': clip_embeds,
366
+ }
367
+
368
+ def encode_list_of_list(self, text, return_mask = False):
369
+ t5_embeds = self.t5_model.encode_list_of_list(text, return_mask = return_mask)
370
+ clip_embeds = self.clip_model.encode_list_of_list(text, return_mask = return_mask)
371
+ # change embedding strategy here
372
+ return {
373
+ 'context': t5_embeds,
374
+ 'y': clip_embeds,
375
+ }
376
+
377
+
378
+ @staticmethod
379
+ def get_config_template():
380
+ return dict_to_yaml('EMBEDDER',
381
+ __class__.__name__,
382
+ T5ACEPlusClipFluxEmbedder.para_dict,
383
+ set_name=True)
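
To make the encode_list_of_list contract concrete, here is a small, self-contained sketch (not from the commit) of the nested structure T5ACEPlusClipFluxEmbedder hands back to FluxACEInference, using dummy tensors with the shapes implied by the config defaults (MAX_LENGTH 512, CONTEXT_IN_DIM 4096, VEC_IN_DIM 768):

    import torch

    def fake_encode(prompt):
        # Stand-in for one T5 + CLIP encoding of a single prompt.
        return {"context": torch.zeros(1, 512, 4096), "y": torch.zeros(1, 768)}

    prompts = [["{image} make the sky blue"],                                  # sample 0
               ["{image} remove the person", "{image1} match the lighting"]]   # sample 1

    cont = {"context": [], "y": []}
    for sample_prompts in prompts:                 # list of lists, as encode_list_of_list expects
        enc = [fake_encode(p) for p in sample_prompts]
        cont["context"].append([e["context"] for e in enc])
        cont["y"].append([e["y"] for e in enc])

    # FluxACEInference then keeps only the last prompt of each sample:
    cont["context"] = [ct[-1] for ct in cont["context"]]
    cont["y"] = [ct[-1] for ct in cont["y"]]
    print(len(cont["context"]), cont["context"][0].shape)   # 2 torch.Size([1, 512, 4096])
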
models/flux.py ADDED
@@ -0,0 +1,798 @@
1
+ import math, torch
2
+ from collections import OrderedDict
3
+ from functools import partial
4
+ from einops import rearrange, repeat
5
+ from scepter.modules.model.base_model import BaseModel
6
+ from scepter.modules.model.registry import BACKBONES
7
+ from scepter.modules.utils.config import dict_to_yaml
8
+ from scepter.modules.utils.distribute import we
9
+ from scepter.modules.utils.file_system import FS
10
+ from torch import Tensor, nn
11
+ from torch.nn.utils.rnn import pad_sequence
12
+ from torch.utils.checkpoint import checkpoint_sequential
13
+
14
+ from .layers import (DoubleStreamBlock, EmbedND, LastLayer,
15
+ MLPEmbedder, SingleStreamBlock,
16
+ timestep_embedding, DoubleStreamBlockACE, SingleStreamBlockACE)
17
+
18
+ @BACKBONES.register_class()
19
+ class Flux(BaseModel):
20
+ """
21
+ Transformer backbone Diffusion model with RoPE.
22
+ """
23
+ para_dict = {
24
+ "IN_CHANNELS": {
25
+ "value": 64,
26
+ "description": "model's input channels."
27
+ },
28
+ "OUT_CHANNELS": {
29
+ "value": 64,
30
+ "description": "model's output channels."
31
+ },
32
+ "HIDDEN_SIZE": {
33
+ "value": 1024,
34
+ "description": "model's hidden size."
35
+ },
36
+ "NUM_HEADS": {
37
+ "value": 16,
38
+ "description": "number of heads in the transformer."
39
+ },
40
+ "AXES_DIM": {
41
+ "value": [16, 56, 56],
42
+ "description": "dimensions of the axes of the positional encoding."
43
+ },
44
+ "THETA": {
45
+ "value": 10_000,
46
+ "description": "theta for positional encoding."
47
+ },
48
+ "VEC_IN_DIM": {
49
+ "value": 768,
50
+ "description": "dimension of the vector input."
51
+ },
52
+ "GUIDANCE_EMBED": {
53
+ "value": False,
54
+ "description": "whether to use guidance embedding."
55
+ },
56
+ "CONTEXT_IN_DIM": {
57
+ "value": 4096,
58
+ "description": "dimension of the context input."
59
+ },
60
+ "MLP_RATIO": {
61
+ "value": 4.0,
62
+ "description": "ratio of mlp hidden size to hidden size."
63
+ },
64
+ "QKV_BIAS": {
65
+ "value": True,
66
+ "description": "whether to use bias in qkv projection."
67
+ },
68
+ "DEPTH": {
69
+ "value": 19,
70
+ "description": "number of transformer blocks."
71
+ },
72
+ "DEPTH_SINGLE_BLOCKS": {
73
+ "value": 38,
74
+ "description": "number of transformer blocks in the single stream block."
75
+ },
76
+ "USE_GRAD_CHECKPOINT": {
77
+ "value": False,
78
+ "description": "whether to use gradient checkpointing."
79
+ },
80
+ "ATTN_BACKEND": {
81
+ "value": "pytorch",
82
+ "description": "backend for the transformer blocks, 'pytorch' or 'flash_attn'."
83
+ }
84
+ }
85
+ def __init__(
86
+ self,
87
+ cfg,
88
+ logger = None
89
+ ):
90
+ super().__init__(cfg, logger=logger)
91
+ self.in_channels = cfg.IN_CHANNELS
92
+ self.out_channels = cfg.get("OUT_CHANNELS", self.in_channels)
93
+ hidden_size = cfg.get("HIDDEN_SIZE", 1024)
94
+ num_heads = cfg.get("NUM_HEADS", 16)
95
+ axes_dim = cfg.AXES_DIM
96
+ theta = cfg.THETA
97
+ vec_in_dim = cfg.VEC_IN_DIM
98
+ self.guidance_embed = cfg.GUIDANCE_EMBED
99
+ context_in_dim = cfg.CONTEXT_IN_DIM
100
+ mlp_ratio = cfg.MLP_RATIO
101
+ qkv_bias = cfg.QKV_BIAS
102
+ depth = cfg.DEPTH
103
+ depth_single_blocks = cfg.DEPTH_SINGLE_BLOCKS
104
+ self.use_grad_checkpoint = cfg.get("USE_GRAD_CHECKPOINT", False)
105
+ self.attn_backend = cfg.get("ATTN_BACKEND", "pytorch")
106
+ self.lora_model = cfg.get("DIFFUSERS_LORA_MODEL", None)
107
+ self.swift_lora_model = cfg.get("SWIFT_LORA_MODEL", None)
108
+ self.blackforest_lora_model = cfg.get("BLACKFOREST_LORA_MODEL", None)
109
+ self.pretrain_adapter = cfg.get("PRETRAIN_ADAPTER", None)
110
+
111
+ if hidden_size % num_heads != 0:
112
+ raise ValueError(
113
+ f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}"
114
+ )
115
+ pe_dim = hidden_size // num_heads
116
+ if sum(axes_dim) != pe_dim:
117
+ raise ValueError(f"Got {axes_dim} but expected positional dim {pe_dim}")
118
+ self.hidden_size = hidden_size
119
+ self.num_heads = num_heads
120
+ self.pe_embedder = EmbedND(dim=pe_dim, theta=theta, axes_dim= axes_dim)
121
+ self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
122
+ self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
123
+ self.vector_in = MLPEmbedder(vec_in_dim, self.hidden_size)
124
+ self.guidance_in = (
125
+ MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if self.guidance_embed else nn.Identity()
126
+ )
127
+ self.txt_in = nn.Linear(context_in_dim, self.hidden_size)
128
+
129
+ self.double_blocks = nn.ModuleList(
130
+ [
131
+ DoubleStreamBlock(
132
+ self.hidden_size,
133
+ self.num_heads,
134
+ mlp_ratio=mlp_ratio,
135
+ qkv_bias=qkv_bias,
136
+ backend=self.attn_backend
137
+ )
138
+ for _ in range(depth)
139
+ ]
140
+ )
141
+
142
+ self.single_blocks = nn.ModuleList(
143
+ [
144
+ SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=mlp_ratio, backend=self.attn_backend)
145
+ for _ in range(depth_single_blocks)
146
+ ]
147
+ )
148
+
149
+ self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
150
+
151
+ def prepare_input(self, x, context, y, x_shape=None):
152
+ # x.shape [6, 16, 16, 16] target is [6, 16, 768, 1360]
153
+ bs, c, h, w = x.shape
154
+ x = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
155
+ x_id = torch.zeros(h // 2, w // 2, 3)
156
+ x_id[..., 1] = x_id[..., 1] + torch.arange(h // 2)[:, None]
157
+ x_id[..., 2] = x_id[..., 2] + torch.arange(w // 2)[None, :]
158
+ x_ids = repeat(x_id, "h w c -> b (h w) c", b=bs)
159
+ txt_ids = torch.zeros(bs, context.shape[1], 3)
160
+ return x, x_ids.to(x), context.to(x), txt_ids.to(x), y.to(x), h, w
161
+
162
+ def unpack(self, x: Tensor, height: int, width: int) -> Tensor:
163
+ return rearrange(
164
+ x,
165
+ "b (h w) (c ph pw) -> b c (h ph) (w pw)",
166
+ h=math.ceil(height/2),
167
+ w=math.ceil(width/2),
168
+ ph=2,
169
+ pw=2,
170
+ )
171
+
172
+ # def merge_diffuser_lora(self, ori_sd, lora_sd, scale = 1.0):
173
+ # key_map = {
174
+ # "single_blocks.{}.linear1.weight": {"key_list": [
175
+ # ["transformer.single_transformer_blocks.{}.attn.to_q.lora_A.weight",
176
+ # "transformer.single_transformer_blocks.{}.attn.to_q.lora_B.weight"],
177
+ # ["transformer.single_transformer_blocks.{}.attn.to_k.lora_A.weight",
178
+ # "transformer.single_transformer_blocks.{}.attn.to_k.lora_B.weight"],
179
+ # ["transformer.single_transformer_blocks.{}.attn.to_v.lora_A.weight",
180
+ # "transformer.single_transformer_blocks.{}.attn.to_v.lora_B.weight"],
181
+ # ["transformer.single_transformer_blocks.{}.proj_mlp.lora_A.weight",
182
+ # "transformer.single_transformer_blocks.{}.proj_mlp.lora_B.weight"]
183
+ # ], "num": 38},
184
+ # "single_blocks.{}.modulation.lin.weight": {"key_list": [
185
+ # ["transformer.single_transformer_blocks.{}.norm.linear.lora_A.weight",
186
+ # "transformer.single_transformer_blocks.{}.norm.linear.lora_B.weight"],
187
+ # ], "num": 38},
188
+ # "single_blocks.{}.linear2.weight": {"key_list": [
189
+ # ["transformer.single_transformer_blocks.{}.proj_out.lora_A.weight",
190
+ # "transformer.single_transformer_blocks.{}.proj_out.lora_B.weight"],
191
+ # ], "num": 38},
192
+ # "double_blocks.{}.txt_attn.qkv.weight": {"key_list": [
193
+ # ["transformer.transformer_blocks.{}.attn.add_q_proj.lora_A.weight",
194
+ # "transformer.transformer_blocks.{}.attn.add_q_proj.lora_B.weight"],
195
+ # ["transformer.transformer_blocks.{}.attn.add_k_proj.lora_A.weight",
196
+ # "transformer.transformer_blocks.{}.attn.add_k_proj.lora_B.weight"],
197
+ # ["transformer.transformer_blocks.{}.attn.add_v_proj.lora_A.weight",
198
+ # "transformer.transformer_blocks.{}.attn.add_v_proj.lora_B.weight"],
199
+ # ], "num": 19},
200
+ # "double_blocks.{}.img_attn.qkv.weight": {"key_list": [
201
+ # ["transformer.transformer_blocks.{}.attn.to_q.lora_A.weight",
202
+ # "transformer.transformer_blocks.{}.attn.to_q.lora_B.weight"],
203
+ # ["transformer.transformer_blocks.{}.attn.to_k.lora_A.weight",
204
+ # "transformer.transformer_blocks.{}.attn.to_k.lora_B.weight"],
205
+ # ["transformer.transformer_blocks.{}.attn.to_v.lora_A.weight",
206
+ # "transformer.transformer_blocks.{}.attn.to_v.lora_B.weight"],
207
+ # ], "num": 19},
208
+ # "double_blocks.{}.img_attn.proj.weight": {"key_list": [
209
+ # ["transformer.transformer_blocks.{}.attn.to_out.0.lora_A.weight",
210
+ # "transformer.transformer_blocks.{}.attn.to_out.0.lora_B.weight"]
211
+ # ], "num": 19},
212
+ # "double_blocks.{}.txt_attn.proj.weight": {"key_list": [
213
+ # ["transformer.transformer_blocks.{}.attn.to_add_out.lora_A.weight",
214
+ # "transformer.transformer_blocks.{}.attn.to_add_out.lora_B.weight"]
215
+ # ], "num": 19},
216
+ # "double_blocks.{}.img_mlp.0.weight": {"key_list": [
217
+ # ["transformer.transformer_blocks.{}.ff.net.0.proj.lora_A.weight",
218
+ # "transformer.transformer_blocks.{}.ff.net.0.proj.lora_B.weight"]
219
+ # ], "num": 19},
220
+ # "double_blocks.{}.img_mlp.2.weight": {"key_list": [
221
+ # ["transformer.transformer_blocks.{}.ff.net.2.lora_A.weight",
222
+ # "transformer.transformer_blocks.{}.ff.net.2.lora_B.weight"]
223
+ # ], "num": 19},
224
+ # "double_blocks.{}.txt_mlp.0.weight": {"key_list": [
225
+ # ["transformer.transformer_blocks.{}.ff_context.net.0.proj.lora_A.weight",
226
+ # "transformer.transformer_blocks.{}.ff_context.net.0.proj.lora_B.weight"]
227
+ # ], "num": 19},
228
+ # "double_blocks.{}.txt_mlp.2.weight": {"key_list": [
229
+ # ["transformer.transformer_blocks.{}.ff_context.net.2.lora_A.weight",
230
+ # "transformer.transformer_blocks.{}.ff_context.net.2.lora_B.weight"]
231
+ # ], "num": 19},
232
+ # "double_blocks.{}.img_mod.lin.weight": {"key_list": [
233
+ # ["transformer.transformer_blocks.{}.norm1.linear.lora_A.weight",
234
+ # "transformer.transformer_blocks.{}.norm1.linear.lora_B.weight"]
235
+ # ], "num": 19},
236
+ # "double_blocks.{}.txt_mod.lin.weight": {"key_list": [
237
+ # ["transformer.transformer_blocks.{}.norm1_context.linear.lora_A.weight",
238
+ # "transformer.transformer_blocks.{}.norm1_context.linear.lora_B.weight"]
239
+ # ], "num": 19}
240
+ # }
241
+ # have_lora_keys = 0
242
+ # for k, v in key_map.items():
243
+ # key_list = v["key_list"]
244
+ # block_num = v["num"]
245
+ # for block_id in range(block_num):
246
+ # current_weight_list = []
247
+ # for k_list in key_list:
248
+ # current_weight = torch.matmul(lora_sd[k_list[0].format(block_id)].permute(1, 0),
249
+ # lora_sd[k_list[1].format(block_id)].permute(1, 0)).permute(1, 0)
250
+ # current_weight_list.append(current_weight)
251
+ # current_weight = torch.cat(current_weight_list, dim=0)
252
+ # ori_sd[k.format(block_id)] += scale*current_weight
253
+ # have_lora_keys += 1
254
+ # self.logger.info(f"merge_swift_lora loads lora'parameters {have_lora_keys}")
255
+ # return ori_sd
256
+
257
+ def merge_diffuser_lora(self, ori_sd, lora_sd, scale=1.0):
258
+ key_map = {
259
+ "single_blocks.{}.linear1.weight": {"key_list": [
260
+ ["transformer.single_transformer_blocks.{}.attn.to_q.lora_A.weight",
261
+ "transformer.single_transformer_blocks.{}.attn.to_q.lora_B.weight", [0, 3072]],
262
+ ["transformer.single_transformer_blocks.{}.attn.to_k.lora_A.weight",
263
+ "transformer.single_transformer_blocks.{}.attn.to_k.lora_B.weight", [3072, 6144]],
264
+ ["transformer.single_transformer_blocks.{}.attn.to_v.lora_A.weight",
265
+ "transformer.single_transformer_blocks.{}.attn.to_v.lora_B.weight", [6144, 9216]],
266
+ ["transformer.single_transformer_blocks.{}.proj_mlp.lora_A.weight",
267
+ "transformer.single_transformer_blocks.{}.proj_mlp.lora_B.weight", [9216, 21504]]
268
+ ], "num": 38},
269
+ "single_blocks.{}.modulation.lin.weight": {"key_list": [
270
+ ["transformer.single_transformer_blocks.{}.norm.linear.lora_A.weight",
271
+ "transformer.single_transformer_blocks.{}.norm.linear.lora_B.weight", [0, 9216]],
272
+ ], "num": 38},
273
+ "single_blocks.{}.linear2.weight": {"key_list": [
274
+ ["transformer.single_transformer_blocks.{}.proj_out.lora_A.weight",
275
+ "transformer.single_transformer_blocks.{}.proj_out.lora_B.weight", [0, 3072]],
276
+ ], "num": 38},
277
+ "double_blocks.{}.txt_attn.qkv.weight": {"key_list": [
278
+ ["transformer.transformer_blocks.{}.attn.add_q_proj.lora_A.weight",
279
+ "transformer.transformer_blocks.{}.attn.add_q_proj.lora_B.weight", [0, 3072]],
280
+ ["transformer.transformer_blocks.{}.attn.add_k_proj.lora_A.weight",
281
+ "transformer.transformer_blocks.{}.attn.add_k_proj.lora_B.weight", [3072, 6144]],
282
+ ["transformer.transformer_blocks.{}.attn.add_v_proj.lora_A.weight",
283
+ "transformer.transformer_blocks.{}.attn.add_v_proj.lora_B.weight", [6144, 9216]],
284
+ ], "num": 19},
285
+ "double_blocks.{}.img_attn.qkv.weight": {"key_list": [
286
+ ["transformer.transformer_blocks.{}.attn.to_q.lora_A.weight",
287
+ "transformer.transformer_blocks.{}.attn.to_q.lora_B.weight", [0, 3072]],
288
+ ["transformer.transformer_blocks.{}.attn.to_k.lora_A.weight",
289
+ "transformer.transformer_blocks.{}.attn.to_k.lora_B.weight", [3072, 6144]],
290
+ ["transformer.transformer_blocks.{}.attn.to_v.lora_A.weight",
291
+ "transformer.transformer_blocks.{}.attn.to_v.lora_B.weight", [6144, 9216]],
292
+ ], "num": 19},
293
+ "double_blocks.{}.img_attn.proj.weight": {"key_list": [
294
+ ["transformer.transformer_blocks.{}.attn.to_out.0.lora_A.weight",
295
+ "transformer.transformer_blocks.{}.attn.to_out.0.lora_B.weight", [0, 3072]]
296
+ ], "num": 19},
297
+ "double_blocks.{}.txt_attn.proj.weight": {"key_list": [
298
+ ["transformer.transformer_blocks.{}.attn.to_add_out.lora_A.weight",
299
+ "transformer.transformer_blocks.{}.attn.to_add_out.lora_B.weight", [0, 3072]]
300
+ ], "num": 19},
301
+ "double_blocks.{}.img_mlp.0.weight": {"key_list": [
302
+ ["transformer.transformer_blocks.{}.ff.net.0.proj.lora_A.weight",
303
+ "transformer.transformer_blocks.{}.ff.net.0.proj.lora_B.weight", [0, 12288]]
304
+ ], "num": 19},
305
+ "double_blocks.{}.img_mlp.2.weight": {"key_list": [
306
+ ["transformer.transformer_blocks.{}.ff.net.2.lora_A.weight",
307
+ "transformer.transformer_blocks.{}.ff.net.2.lora_B.weight", [0, 3072]]
308
+ ], "num": 19},
309
+ "double_blocks.{}.txt_mlp.0.weight": {"key_list": [
310
+ ["transformer.transformer_blocks.{}.ff_context.net.0.proj.lora_A.weight",
311
+ "transformer.transformer_blocks.{}.ff_context.net.0.proj.lora_B.weight", [0, 12288]]
312
+ ], "num": 19},
313
+ "double_blocks.{}.txt_mlp.2.weight": {"key_list": [
314
+ ["transformer.transformer_blocks.{}.ff_context.net.2.lora_A.weight",
315
+ "transformer.transformer_blocks.{}.ff_context.net.2.lora_B.weight", [0, 3072]]
316
+ ], "num": 19},
317
+ "double_blocks.{}.img_mod.lin.weight": {"key_list": [
318
+ ["transformer.transformer_blocks.{}.norm1.linear.lora_A.weight",
319
+ "transformer.transformer_blocks.{}.norm1.linear.lora_B.weight", [0, 18432]]
320
+ ], "num": 19},
321
+ "double_blocks.{}.txt_mod.lin.weight": {"key_list": [
322
+ ["transformer.transformer_blocks.{}.norm1_context.linear.lora_A.weight",
323
+ "transformer.transformer_blocks.{}.norm1_context.linear.lora_B.weight", [0, 18432]]
324
+ ], "num": 19}
325
+ }
326
+ cover_lora_keys = set()
327
+ cover_ori_keys = set()
328
+ for k, v in key_map.items():
329
+ key_list = v["key_list"]
330
+ block_num = v["num"]
331
+ for block_id in range(block_num):
332
+ for k_list in key_list:
333
+ if k_list[0].format(block_id) in lora_sd and k_list[1].format(block_id) in lora_sd:
334
+ cover_lora_keys.add(k_list[0].format(block_id))
335
+ cover_lora_keys.add(k_list[1].format(block_id))
336
+ current_weight = torch.matmul(lora_sd[k_list[0].format(block_id)].permute(1, 0),
337
+ lora_sd[k_list[1].format(block_id)].permute(1, 0)).permute(1, 0)
338
+ ori_sd[k.format(block_id)][k_list[2][0]:k_list[2][1], ...] += scale * current_weight
339
+ cover_ori_keys.add(k.format(block_id))
340
+ # lora_sd.pop(k_list[0].format(block_id))
341
+ # lora_sd.pop(k_list[1].format(block_id))
342
+ self.logger.info(f"merge_blackforest_lora loads lora'parameters lora-paras: \n"
343
+ f"cover-{len(cover_lora_keys)} vs total {len(lora_sd)} \n"
344
+ f"cover ori-{len(cover_ori_keys)} vs total {len(ori_sd)}")
345
+ return ori_sd
346
+
347
+ def merge_swift_lora(self, ori_sd, lora_sd, scale = 1.0):
348
+ have_lora_keys = {}
349
+ for k, v in lora_sd.items():
350
+ k = k[len("model."):] if k.startswith("model.") else k
351
+ ori_key = k.split("lora")[0] + "weight"
352
+ if ori_key not in ori_sd:
353
+ raise f"{ori_key} should in the original statedict"
354
+ if ori_key not in have_lora_keys:
355
+ have_lora_keys[ori_key] = {}
356
+ if "lora_A" in k:
357
+ have_lora_keys[ori_key]["lora_A"] = v
358
+ elif "lora_B" in k:
359
+ have_lora_keys[ori_key]["lora_B"] = v
360
+ else:
361
+ raise NotImplementedError
362
+ self.logger.info(f"merge_swift_lora loads lora'parameters {len(have_lora_keys)}")
363
+ for key, v in have_lora_keys.items():
364
+ current_weight = torch.matmul(v["lora_A"].permute(1, 0), v["lora_B"].permute(1, 0)).permute(1, 0)
365
+ ori_sd[key] += scale * current_weight
366
+ return ori_sd
367
+
368
+
369
+ def merge_blackforest_lora(self, ori_sd, lora_sd, scale = 1.0):
370
+ have_lora_keys = {}
371
+ cover_lora_keys = set()
372
+ cover_ori_keys = set()
373
+ for k, v in lora_sd.items():
374
+ if "lora" in k:
375
+ ori_key = k.split("lora")[0] + "weight"
376
+ if ori_key not in ori_sd:
377
+ raise f"{ori_key} should in the original statedict"
378
+ if ori_key not in have_lora_keys:
379
+ have_lora_keys[ori_key] = {}
380
+ if "lora_A" in k:
381
+ have_lora_keys[ori_key]["lora_A"] = v
382
+ cover_lora_keys.add(k)
383
+ cover_ori_keys.add(ori_key)
384
+ elif "lora_B" in k:
385
+ have_lora_keys[ori_key]["lora_B"] = v
386
+ cover_lora_keys.add(k)
387
+ cover_ori_keys.add(ori_key)
388
+ else:
389
+ if k in ori_sd:
390
+ ori_sd[k] = v
391
+ cover_lora_keys.add(k)
392
+ cover_ori_keys.add(k)
393
+ else:
394
+ print("unsurpport keys: ", k)
395
+ self.logger.info(f"merge_blackforest_lora loads lora'parameters lora-paras: \n"
396
+ f"cover-{len(cover_lora_keys)} vs total {len(lora_sd)} \n"
397
+ f"cover ori-{len(cover_ori_keys)} vs total {len(ori_sd)}")
398
+
399
+ for key, v in have_lora_keys.items():
400
+ current_weight = torch.matmul(v["lora_A"].permute(1, 0), v["lora_B"].permute(1, 0)).permute(1, 0)
401
+ # print(key, ori_sd[key].shape, current_weight.shape)
402
+ ori_sd[key] += scale * current_weight
403
+ return ori_sd
404
+
405
+ def load_pretrained_model(self, pretrained_model):
406
+ if next(self.parameters()).device.type == 'meta':
407
+ map_location = torch.device(we.device_id)
408
+ safe_device = we.device_id
409
+ else:
410
+ map_location = "cpu"
411
+ safe_device = "cpu"
412
+
413
+ if pretrained_model is not None:
414
+ with FS.get_from(pretrained_model, wait_finish=True) as local_model:
415
+ if local_model.endswith('safetensors'):
416
+ from safetensors.torch import load_file as load_safetensors
417
+ sd = load_safetensors(local_model, device=safe_device)
418
+ else:
419
+ sd = torch.load(local_model, map_location=map_location, weights_only=True)
420
+ if "state_dict" in sd:
421
+ sd = sd["state_dict"]
422
+ if "model" in sd:
423
+ sd = sd["model"]["model"]
424
+
425
+
426
+ new_ckpt = OrderedDict()
427
+ for k, v in sd.items():
428
+ if k in ("img_in.weight"):
429
+ model_p = self.state_dict()[k]
430
+ if v.shape != model_p.shape:
431
+ expanded_state_dict_weight = torch.zeros_like(model_p, device=v.device)
432
+ slices = tuple(slice(0, dim) for dim in v.shape)
433
+ expanded_state_dict_weight[slices] = v
434
+ new_ckpt[k] = expanded_state_dict_weight
435
+ else:
436
+ new_ckpt[k] = v
437
+ else:
438
+ new_ckpt[k] = v
439
+
440
+
441
+ if self.lora_model is not None:
442
+ with FS.get_from(self.lora_model, wait_finish=True) as local_model:
443
+ if local_model.endswith('safetensors'):
444
+ from safetensors.torch import load_file as load_safetensors
445
+ lora_sd = load_safetensors(local_model, device=safe_device)
446
+ else:
447
+ lora_sd = torch.load(local_model, map_location=map_location, weights_only=True)
448
+ new_ckpt = self.merge_diffuser_lora(new_ckpt, lora_sd)
449
+ if self.swift_lora_model is not None:
450
+ if not isinstance(self.swift_lora_model, list):
451
+ self.swift_lora_model = [self.swift_lora_model]
452
+ for lora_model in self.swift_lora_model:
453
+ self.logger.info(f"load swift lora model: {lora_model}")
454
+ with FS.get_from(lora_model, wait_finish=True) as local_model:
455
+ if local_model.endswith('safetensors'):
456
+ from safetensors.torch import load_file as load_safetensors
457
+ lora_sd = load_safetensors(local_model, device=safe_device)
458
+ else:
459
+ lora_sd = torch.load(local_model, map_location=map_location, weights_only=True)
460
+ new_ckpt = self.merge_swift_lora(new_ckpt, lora_sd)
461
+ if self.blackforest_lora_model is not None:
462
+
463
+ with FS.get_from(self.blackforest_lora_model, wait_finish=True) as local_model:
464
+ if local_model.endswith('safetensors'):
465
+ from safetensors.torch import load_file as load_safetensors
466
+ lora_sd = load_safetensors(local_model, device=safe_device)
467
+ else:
468
+ lora_sd = torch.load(local_model, map_location=map_location, weights_only=True)
469
+ new_ckpt = self.merge_blackforest_lora(new_ckpt, lora_sd)
470
+
471
+
472
+ adapter_ckpt = {}
473
+ if self.pretrain_adapter is not None:
474
+ with FS.get_from(self.pretrain_adapter, wait_finish=True) as local_adapter:
475
+ if local_adapter.endswith('safetensors'):
476
+ from safetensors.torch import load_file as load_safetensors
477
+ adapter_ckpt = load_safetensors(local_adapter, device=safe_device)
478
+ else:
479
+ adapter_ckpt = torch.load(local_adapter, map_location=map_location, weights_only=True)
480
+ new_ckpt.update(adapter_ckpt)
481
+
482
+ missing, unexpected = self.load_state_dict(new_ckpt, strict=False, assign=True)
483
+ self.logger.info(
484
+ f'Restored from {pretrained_model} with {len(missing)} missing and {len(unexpected)} unexpected keys'
485
+ )
486
+ if len(missing) > 0:
487
+ self.logger.info(f'Missing Keys:\n {missing}')
488
+ if len(unexpected) > 0:
489
+ self.logger.info(f'\nUnexpected Keys:\n {unexpected}')
490
+
491
+ def forward(
492
+ self,
493
+ x: Tensor,
494
+ t: Tensor,
495
+ cond: dict = {},
496
+ guidance: Tensor | None = None,
497
+ gc_seg: int = 0
498
+ ) -> Tensor:
499
+ x, x_ids, txt, txt_ids, y, h, w = self.prepare_input(x, cond["context"], cond["y"])
500
+ # running on sequences img
501
+ x = self.img_in(x)
502
+ vec = self.time_in(timestep_embedding(t, 256))
503
+ if self.guidance_embed:
504
+ if guidance is None:
505
+ raise ValueError("Didn't get guidance strength for guidance distilled model.")
506
+ vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
507
+ vec = vec + self.vector_in(y)
508
+ txt = self.txt_in(txt)
509
+ ids = torch.cat((txt_ids, x_ids), dim=1)
510
+ pe = self.pe_embedder(ids)
511
+ kwargs = dict(
512
+ vec=vec,
513
+ pe=pe,
514
+ txt_length=txt.shape[1],
515
+ )
516
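+ # sequence layout for the transformer: [txt tokens | img tokens], sharing the RoPE built from the concatenated ids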
+ x = torch.cat((txt, x), 1)
517
+ if self.use_grad_checkpoint and gc_seg >= 0:
518
+ x = checkpoint_sequential(
519
+ functions=[partial(block, **kwargs) for block in self.double_blocks],
520
+ segments=gc_seg if gc_seg > 0 else len(self.double_blocks),
521
+ input=x,
522
+ use_reentrant=False
523
+ )
524
+ else:
525
+ for block in self.double_blocks:
526
+ x = block(x, **kwargs)
527
+
528
+ kwargs = dict(
529
+ vec=vec,
530
+ pe=pe,
531
+ )
532
+
533
+ if self.use_grad_checkpoint and gc_seg >= 0:
534
+ x = checkpoint_sequential(
535
+ functions=[partial(block, **kwargs) for block in self.single_blocks],
536
+ segments=gc_seg if gc_seg > 0 else len(self.single_blocks),
537
+ input=x,
538
+ use_reentrant=False
539
+ )
540
+ else:
541
+ for block in self.single_blocks:
542
+ x = block(x, **kwargs)
543
+ x = x[:, txt.shape[1] :, ...]
544
+ x = self.final_layer(x, vec) # (N, T, patch_size ** 2 * out_channels) 6 64 64
545
+ x = self.unpack(x, h, w)
546
+ return x
547
+
548
+ @staticmethod
549
+ def get_config_template():
550
+ return dict_to_yaml('MODEL',
551
+ __class__.__name__,
552
+ Flux.para_dict,
553
+ set_name=True)
554
+ @BACKBONES.register_class()
555
+ class ACEFlux(Flux):
556
+ '''
557
+ Concatenates the noised target sequence with the edit (condition) sequence: cat[x_seq, edit_seq].
558
+ Positional embeddings are built per stream: pe[x_seq], pe[edit_seq] (width-shifted when unaligned).
559
+ '''
560
+
561
+ def __init__(
562
+ self,
563
+ cfg,
564
+ logger=None
565
+ ):
566
+ super().__init__(cfg, logger=logger)
567
+ self.in_channels = cfg.IN_CHANNELS
568
+ self.out_channels = cfg.get("OUT_CHANNELS", self.in_channels)
569
+ hidden_size = cfg.get("HIDDEN_SIZE", 1024)
570
+ num_heads = cfg.get("NUM_HEADS", 16)
571
+ axes_dim = cfg.AXES_DIM
572
+ theta = cfg.THETA
573
+ vec_in_dim = cfg.VEC_IN_DIM
574
+ self.guidance_embed = cfg.GUIDANCE_EMBED
575
+ context_in_dim = cfg.CONTEXT_IN_DIM
576
+ mlp_ratio = cfg.MLP_RATIO
577
+ qkv_bias = cfg.QKV_BIAS
578
+ depth = cfg.DEPTH
579
+ depth_single_blocks = cfg.DEPTH_SINGLE_BLOCKS
580
+ self.use_grad_checkpoint = cfg.get("USE_GRAD_CHECKPOINT", False)
581
+ self.attn_backend = cfg.get("ATTN_BACKEND", "pytorch")
582
+ self.lora_model = cfg.get("DIFFUSERS_LORA_MODEL", None)
583
+ self.swift_lora_model = cfg.get("SWIFT_LORA_MODEL", None)
584
+ self.blackforest_lora_model = cfg.get("BLACKFOREST_LORA_MODEL", None)
585
+ self.pretrain_adapter = cfg.get("PRETRAIN_ADAPTER", None)
586
+
587
+ if hidden_size % num_heads != 0:
588
+ raise ValueError(
589
+ f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}"
590
+ )
591
+ pe_dim = hidden_size // num_heads
592
+ if sum(axes_dim) != pe_dim:
593
+ raise ValueError(f"Got {axes_dim} but expected positional dim {pe_dim}")
594
+ self.hidden_size = hidden_size
595
+ self.num_heads = num_heads
596
+ self.pe_embedder = EmbedND(dim=pe_dim, theta=theta, axes_dim=axes_dim)
597
+ self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
598
+ self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
599
+ self.vector_in = MLPEmbedder(vec_in_dim, self.hidden_size)
600
+ self.guidance_in = (
601
+ MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if self.guidance_embed else nn.Identity()
602
+ )
603
+ self.txt_in = nn.Linear(context_in_dim, self.hidden_size)
604
+
605
+ self.double_blocks = nn.ModuleList(
606
+ [
607
+ DoubleStreamBlockACE(
608
+ self.hidden_size,
609
+ self.num_heads,
610
+ mlp_ratio=mlp_ratio,
611
+ qkv_bias=qkv_bias,
612
+ backend=self.attn_backend
613
+ )
614
+ for _ in range(depth)
615
+ ]
616
+ )
617
+
618
+ self.single_blocks = nn.ModuleList(
619
+ [
620
+ SingleStreamBlockACE(self.hidden_size, self.num_heads, mlp_ratio=mlp_ratio, backend=self.attn_backend)
621
+ for _ in range(depth_single_blocks)
622
+ ]
623
+ )
624
+
625
+ self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
626
+
627
+ def prepare_input(self, x, cond, *args, **kwargs):
628
+ context, y = cond["context"], cond["y"]
629
+ # import pdb;pdb.set_trace()
630
+ batch_shift = []
631
+ x_list, x_id_list, mask_x_list, x_seq_length = [], [], [], []
632
+ for ix, shape, is_align in zip(x, cond["x_shapes"], cond['align']):
633
+ # unpack image from sequence
634
+ ix = ix[:, :shape[0] * shape[1]].view(-1, shape[0], shape[1])
635
+ c, h, w = ix.shape
636
+ ix = rearrange(ix, "c (h ph) (w pw) -> (h w) (c ph pw)", ph=2, pw=2)
637
+ ix_id = torch.zeros(h // 2, w // 2, 3)
638
+ ix_id[..., 1] = ix_id[..., 1] + torch.arange(h // 2)[:, None]
639
+ ix_id[..., 2] = ix_id[..., 2] + torch.arange(w // 2)[None, :]
640
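+ # for unaligned edit references, shift the width coordinate of their RoPE ids by w // 2 so they do not overlap the target image positions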
+ batch_shift.append(w // 2 if is_align < 1 else 0)
641
+ ix_id = rearrange(ix_id, "h w c -> (h w) c")
642
+ ix = self.img_in(ix)
643
+ x_list.append(ix)
644
+ x_id_list.append(ix_id)
645
+ mask_x_list.append(torch.ones(ix.shape[0]).to(ix.device, non_blocking=True).bool())
646
+ x_seq_length.append(ix.shape[0])
647
+
648
+ x = pad_sequence(tuple(x_list), batch_first=True)
649
+ x_ids = pad_sequence(tuple(x_id_list), batch_first=True).to(x) # [b, pad_seq, 3], padded with zeros
650
+ mask_x = pad_sequence(tuple(mask_x_list), batch_first=True)
651
+
652
+ if 'edit' in cond and sum(len(e) for e in cond['edit']) > 0:
653
+ batch_frames, batch_frames_ids = [], []
654
+ for i, edit in enumerate(cond['edit']):
655
+ batch_frames.append([])
656
+ batch_frames_ids.append([])
657
+ for ie in edit:
658
+ ie = ie.squeeze(0)
659
+ c, h, w = ie.shape
660
+ ie = rearrange(ie, "c (h ph) (w pw) -> (h w) (c ph pw)", ph=2, pw=2)
661
+ ie_id = torch.zeros(h // 2, w // 2, 3)
662
+ ie_id[..., 1] = ie_id[..., 1] + torch.arange(h // 2)[:, None]
663
+ ie_id[..., 2] = ie_id[..., 2] + torch.arange(batch_shift[i], batch_shift[i] + w // 2)[None, :]
664
+ ie_id = rearrange(ie_id, "h w c -> (h w) c")
665
+ batch_frames[i].append(ie)
666
+ batch_frames_ids[i].append(ie_id)
667
+ edit_list, edit_id_list, edit_mask_x_list = [], [], []
668
+ for frames, frame_ids in zip(batch_frames, batch_frames_ids):
669
+ proj_frames = []
670
+ for idx, one_frame in enumerate(frames):
671
+ one_frame = self.img_in(one_frame)
672
+ proj_frames.append(one_frame)
673
+ ie = torch.cat(proj_frames, dim=0)
674
+ ie_id = torch.cat(frame_ids, dim=0)
675
+ edit_list.append(ie)
676
+ edit_id_list.append(ie_id)
677
+ edit_mask_x_list.append(torch.ones(ie.shape[0]).to(ie.device, non_blocking=True).bool())
678
+ edit = pad_sequence(tuple(edit_list), batch_first=True)
679
+ edit_ids = pad_sequence(tuple(edit_id_list), batch_first=True).to(x) # [b, pad_seq, 3], padded with zeros
680
+ edit_mask_x = pad_sequence(tuple(edit_mask_x_list), batch_first=True)
681
+ else:
682
+ edit, edit_ids, edit_mask_x = None, None, None
683
+
684
+ txt_list, mask_txt_list, y_list = [], [], []
685
+ for sample_id, (ctx, yy) in enumerate(zip(context, y)):
686
+ txt_list.append(self.txt_in(ctx.to(x)))
687
+ mask_txt_list.append(torch.ones(txt_list[-1].shape[0]).to(ctx.device, non_blocking=True).bool())
688
+ y_list.append(yy.to(x))
689
+ txt = pad_sequence(tuple(txt_list), batch_first=True)
690
+ txt_ids = torch.zeros(txt.shape[0], txt.shape[1], 3).to(x)
691
+ mask_txt = pad_sequence(tuple(mask_txt_list), batch_first=True)
692
+ y = torch.cat(y_list, dim=0)
693
+ return x, x_ids, edit, edit_ids, txt, txt_ids, y, mask_x, edit_mask_x, mask_txt, x_seq_length
694
+
695
+ def unpack(self, x: Tensor, cond: dict = None, x_seq_length: list = None) -> Tensor:
696
+ x_list = []
697
+ image_shapes = cond["x_shapes"]
698
+ for u, shape, seq_length in zip(x, image_shapes, x_seq_length):
699
+ height, width = shape
700
+ h, w = math.ceil(height / 2), math.ceil(width / 2)
701
+ u = rearrange(
702
+ u[:h * w, ...],
703
+ "(h w) (c ph pw) -> (h ph w pw) c",
704
+ h=h,
705
+ w=w,
706
+ ph=2,
707
+ pw=2,
708
+ )
709
+ x_list.append(u)
710
+ x = pad_sequence(tuple(x_list), batch_first=True).permute(0, 2, 1)
711
+ return x
712
+
713
+ def forward(
714
+ self,
715
+ x: Tensor,
716
+ t: Tensor,
717
+ cond: dict = {},
718
+ guidance: Tensor | None = None,
719
+ gc_seg: int = 0,
720
+ **kwargs
721
+ ) -> Tensor:
722
+ x, x_ids, edit, edit_ids, txt, txt_ids, y, mask_x, edit_mask_x, mask_txt, seq_length_list = self.prepare_input(x, cond)
723
+ # running on sequences img
724
+ # condition (edit) tokens are embedded with timestep 0, i.e. treated as clean, un-noised inputs
725
+ x_length = x.shape[1]
726
+ vec = self.time_in(timestep_embedding(t, 256))
727
+
728
+ if edit is not None:
729
+ edit_vec = self.time_in(timestep_embedding(t * 0, 256))
730
+ # print("edit_vec", torch.sum(edit_vec))
731
+ else:
732
+ edit_vec = None
733
+
734
+ if self.guidance_embed:
735
+ if guidance is None:
736
+ raise ValueError("Didn't get guidance strength for guidance distilled model.")
737
+ vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
738
+ if edit is not None:
739
+ edit_vec = edit_vec + self.guidance_in(timestep_embedding(guidance, 256))
740
+
741
+ vec = vec + self.vector_in(y)
742
+ if edit is not None:
743
+ edit_vec = edit_vec + self.vector_in(y)
744
+ ids = torch.cat((txt_ids, x_ids, edit_ids), dim=1)
745
+ mask_aside = torch.cat((mask_txt, mask_x, edit_mask_x), dim=1)
746
+ x = torch.cat((txt, x, edit), 1)
747
+ else:
748
+ ids = torch.cat((txt_ids, x_ids), dim=1)
749
+ mask_aside = torch.cat((mask_txt, mask_x), dim=1)
750
+ x = torch.cat((txt, x), 1)
751
+
752
+ pe = self.pe_embedder(ids)
753
+ mask = mask_aside[:, None, :] * mask_aside[:, :, None]
754
+
755
+ kwargs = dict(
756
+ vec=vec,
757
+ pe=pe,
758
+ mask=mask,
759
+ txt_length=txt.shape[1],
760
+ x_length=x_length,
761
+ edit_vec=edit_vec,
762
+
763
+ )
764
+
765
+ if self.use_grad_checkpoint and gc_seg >= 0:
766
+ x = checkpoint_sequential(
767
+ functions=[partial(block, **kwargs) for block in self.double_blocks],
768
+ segments=gc_seg if gc_seg > 0 else len(self.double_blocks),
769
+ input=x,
770
+ use_reentrant=False
771
+ )
772
+ else:
773
+ for idx, block in enumerate(self.double_blocks):
774
+ # print("double block", idx)
775
+ x = block(x, **kwargs)
776
+
777
+ if self.use_grad_checkpoint and gc_seg >= 0:
778
+ x = checkpoint_sequential(
779
+ functions=[partial(block, **kwargs) for block in self.single_blocks],
780
+ segments=gc_seg if gc_seg > 0 else len(self.single_blocks),
781
+ input=x,
782
+ use_reentrant=False
783
+ )
784
+ else:
785
+ for idx, block in enumerate(self.single_blocks):
786
+ # print("single block", idx)
787
+ x = block(x, **kwargs)
788
+ x = x[:, txt.shape[1]:txt.shape[1] + x_length, ...]
789
+ x = self.final_layer(x, vec) # (N, T, patch_size ** 2 * out_channels) 6 64 64
790
+ x = self.unpack(x, cond, seq_length_list)
791
+ return x
792
+
793
+ @staticmethod
794
+ def get_config_template():
795
+ return dict_to_yaml('MODEL',
796
+ __class__.__name__,
797
+ ACEFlux.para_dict,
798
+ set_name=True)
models/layers.py ADDED
@@ -0,0 +1,497 @@
1
+ from __future__ import annotations
2
+
3
+ import math
4
+ from dataclasses import dataclass
5
+ from torch import Tensor, nn
6
+ import torch
7
+ from einops import rearrange, repeat
8
+ from torch import Tensor
9
+ from torch.nn.utils.rnn import pad_sequence
10
+
11
+ try:
12
+ from flash_attn import (
13
+ flash_attn_varlen_func
14
+ )
15
+ FLASHATTN_IS_AVAILABLE = True
16
+ except ImportError:
17
+ FLASHATTN_IS_AVAILABLE = False
18
+ flash_attn_varlen_func = None
19
+
20
+ def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask: Tensor | None = None, backend = 'pytorch') -> Tensor:
21
+ q, k = apply_rope(q, k, pe)
22
+ if backend == 'pytorch':
23
+ if mask is not None and mask.dtype == torch.bool:
24
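+ # convert a boolean keep-mask into an additive float mask (-1e20 at masked positions) for scaled_dot_product_attention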
+ mask = torch.zeros_like(mask).to(q).masked_fill_(mask.logical_not(), -1e20)
25
+ x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=mask)
26
+ # x = torch.nan_to_num(x, nan=0.0, posinf=1e10, neginf=-1e10)
27
+ x = rearrange(x, "B H L D -> B L (H D)")
28
+ elif backend == 'flash_attn':
29
+ # q: (B, H, L, D)
30
+ # k: (B, H, S, D) now L = S
31
+ # v: (B, H, S, D)
32
+ b, h, lq, d = q.shape
33
+ _, _, lk, _ = k.shape
34
+ q = rearrange(q, "B H L D -> B L H D")
35
+ k = rearrange(k, "B H S D -> B S H D")
36
+ v = rearrange(v, "B H S D -> B S H D")
37
+ if mask is None:
38
+ q_lens = torch.tensor([lq] * b, dtype=torch.int32).to(q.device, non_blocking=True)
39
+ k_lens = torch.tensor([lk] * b, dtype=torch.int32).to(k.device, non_blocking=True)
40
+ else:
41
+ q_lens = torch.sum(mask[:, 0, :, 0], dim=1).int()
42
+ k_lens = torch.sum(mask[:, 0, 0, :], dim=1).int()
43
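+ # drop padded positions and pack the batch into the flat (total_tokens, heads, dim) layout expected by flash_attn_varlen_func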
+ q = torch.cat([q_v[:q_l] for q_v, q_l in zip(q, q_lens)])
44
+ k = torch.cat([k_v[:k_l] for k_v, k_l in zip(k, k_lens)])
45
+ v = torch.cat([v_v[:v_l] for v_v, v_l in zip(v, k_lens)])
46
+ cu_seqlens_q = torch.cat([q_lens.new_zeros([1]), q_lens]).cumsum(0, dtype=torch.int32)
47
+ cu_seqlens_k = torch.cat([k_lens.new_zeros([1]), k_lens]).cumsum(0, dtype=torch.int32)
48
+ max_seqlen_q = q_lens.max()
49
+ max_seqlen_k = k_lens.max()
50
+
51
+ x = flash_attn_varlen_func(
52
+ q,
53
+ k,
54
+ v,
55
+ cu_seqlens_q=cu_seqlens_q,
56
+ cu_seqlens_k=cu_seqlens_k,
57
+ max_seqlen_q=max_seqlen_q,
58
+ max_seqlen_k=max_seqlen_k
59
+ )
60
+ x_list = [x[cu_seqlens_q[i]:cu_seqlens_q[i+1]] for i in range(b)]
61
+ x = pad_sequence(tuple(x_list), batch_first=True)
62
+ x = rearrange(x, "B L H D -> B L (H D)")
63
+ else:
64
+ raise NotImplementedError
65
+ return x
66
+
67
+
68
+ def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
69
+ assert dim % 2 == 0
70
+ scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
71
+ omega = 1.0 / (theta**scale)
72
+ out = torch.einsum("...n,d->...nd", pos, omega)
73
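+ # build a 2x2 rotation matrix [cos -sin; sin cos] per position and frequency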
+ out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)
74
+ out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2)
75
+ return out.float()
76
+
77
+
78
+ def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]:
79
+ xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
80
+ xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
81
+ xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
82
+ xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
83
+ return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
84
+
85
+ class EmbedND(nn.Module):
86
+ def __init__(self, dim: int, theta: int, axes_dim: list[int]):
87
+ super().__init__()
88
+ self.dim = dim
89
+ self.theta = theta
90
+ self.axes_dim = axes_dim
91
+
92
+ def forward(self, ids: Tensor) -> Tensor:
93
+ n_axes = ids.shape[-1]
94
+ emb = torch.cat(
95
+ [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
96
+ dim=-3,
97
+ )
98
+
99
+ return emb.unsqueeze(1)
100
+
101
+
102
+ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
103
+ """
104
+ Create sinusoidal timestep embeddings.
105
+ :param t: a 1-D Tensor of N indices, one per batch element.
106
+ These may be fractional.
107
+ :param dim: the dimension of the output.
108
+ :param max_period: controls the minimum frequency of the embeddings.
109
+ :return: an (N, D) Tensor of positional embeddings.
110
+ """
111
+ t = time_factor * t
112
+ half = dim // 2
113
+ freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(
114
+ t.device
115
+ )
116
+
117
+ args = t[:, None].float() * freqs[None]
118
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
119
+ if dim % 2:
120
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
121
+ if torch.is_floating_point(t):
122
+ embedding = embedding.to(t)
123
+ return embedding
124
+
125
+
126
+ class MLPEmbedder(nn.Module):
127
+ def __init__(self, in_dim: int, hidden_dim: int):
128
+ super().__init__()
129
+ self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
130
+ self.silu = nn.SiLU()
131
+ self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)
132
+
133
+ def forward(self, x: Tensor) -> Tensor:
134
+ return self.out_layer(self.silu(self.in_layer(x)))
135
+
136
+
137
+ class RMSNorm(torch.nn.Module):
138
+ def __init__(self, dim: int):
139
+ super().__init__()
140
+ self.scale = nn.Parameter(torch.ones(dim))
141
+
142
+ def forward(self, x: Tensor):
143
+ x_dtype = x.dtype
144
+ x = x.float()
145
+ rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6)
146
+ return (x * rrms).to(dtype=x_dtype) * self.scale
147
+
148
+
149
+ class QKNorm(torch.nn.Module):
150
+ def __init__(self, dim: int):
151
+ super().__init__()
152
+ self.query_norm = RMSNorm(dim)
153
+ self.key_norm = RMSNorm(dim)
154
+
155
+ def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]:
156
+ q = self.query_norm(q)
157
+ k = self.key_norm(k)
158
+ return q.to(v), k.to(v)
159
+
160
+
161
+ class SelfAttention(nn.Module):
162
+ def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False):
163
+ super().__init__()
164
+ self.num_heads = num_heads
165
+ head_dim = dim // num_heads
166
+
167
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
168
+ self.norm = QKNorm(head_dim)
169
+ self.proj = nn.Linear(dim, dim)
170
+
171
+ def forward(self, x: Tensor, pe: Tensor, mask: Tensor | None = None) -> Tensor:
172
+ qkv = self.qkv(x)
173
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
174
+ q, k = self.norm(q, k, v)
175
+ x = attention(q, k, v, pe=pe, mask=mask)
176
+ x = self.proj(x)
177
+ return x
178
+
179
+ class CrossAttention(nn.Module):
180
+ def __init__(self, dim: int, context_dim: int, num_heads: int = 8, qkv_bias: bool = False):
181
+ super().__init__()
182
+ self.num_heads = num_heads
183
+ head_dim = dim // num_heads
184
+ self.q = nn.Linear(dim, dim, bias=qkv_bias)
185
+ self.kv = nn.Linear(context_dim, dim * 2, bias=qkv_bias)
186
+ self.norm = QKNorm(head_dim)
187
+ self.proj = nn.Linear(dim, dim)
188
+
189
+ def forward(self, x: Tensor, context: Tensor, pe: Tensor, mask: Tensor | None = None) -> Tensor:
190
+ q = rearrange(self.q(x), "B L (H D) -> B H L D", H=self.num_heads)
191
+ k, v = rearrange(self.kv(context), "B S (K H D) -> K B H S D", K=2, H=self.num_heads)
192
+ q, k = self.norm(q, k, v)
193
+ x = attention(q, k, v, pe=pe, mask=mask)
194
+ x = self.proj(x)
195
+ return x
196
+
197
+
198
+ @dataclass
199
+ class ModulationOut:
200
+ shift: Tensor
201
+ scale: Tensor
202
+ gate: Tensor
203
+
204
+
205
+ class Modulation(nn.Module):
206
+ def __init__(self, dim: int, double: bool):
207
+ super().__init__()
208
+ self.is_double = double
209
+ self.multiplier = 6 if double else 3
210
+ self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
211
+
212
+ def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
213
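+ # a single linear layer produces 3 (single) or 6 (double) chunks: (shift, scale, gate) for the attention and MLP branches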
+ out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(self.multiplier, dim=-1)
214
+
215
+ return (
216
+ ModulationOut(*out[:3]),
217
+ ModulationOut(*out[3:]) if self.is_double else None,
218
+ )
219
+
220
+
221
+ class DoubleStreamBlock(nn.Module):
222
+ def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, backend = 'pytorch'):
223
+ super().__init__()
224
+
225
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
226
+ self.num_heads = num_heads
227
+ self.hidden_size = hidden_size
228
+ self.img_mod = Modulation(hidden_size, double=True)
229
+ self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
230
+ self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
231
+
232
+ self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
233
+ self.img_mlp = nn.Sequential(
234
+ nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
235
+ nn.GELU(approximate="tanh"),
236
+ nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
237
+ )
238
+
239
+ self.backend = backend
240
+
241
+ self.txt_mod = Modulation(hidden_size, double=True)
242
+ self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
243
+ self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
244
+
245
+ self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
246
+ self.txt_mlp = nn.Sequential(
247
+ nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
248
+ nn.GELU(approximate="tanh"),
249
+ nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
250
+ )
251
+
252
+
253
+
254
+
255
+ def forward(self, x: Tensor, vec: Tensor, pe: Tensor, mask: Tensor = None, txt_length = None):
256
+ img_mod1, img_mod2 = self.img_mod(vec)
257
+ txt_mod1, txt_mod2 = self.txt_mod(vec)
258
+
259
+ txt, img = x[:, :txt_length], x[:, txt_length:]
260
+
261
+ # prepare image for attention
262
+ img_modulated = self.img_norm1(img)
263
+ img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
264
+ img_qkv = self.img_attn.qkv(img_modulated)
265
+ img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
266
+ img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
267
+ # prepare txt for attention
268
+ txt_modulated = self.txt_norm1(txt)
269
+ txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
270
+ txt_qkv = self.txt_attn.qkv(txt_modulated)
271
+ txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
272
+ txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
273
+
274
+ # run actual attention
275
+ q = torch.cat((txt_q, img_q), dim=2)
276
+ k = torch.cat((txt_k, img_k), dim=2)
277
+ v = torch.cat((txt_v, img_v), dim=2)
278
+ if mask is not None:
279
+ mask = repeat(mask, 'B L S-> B H L S', H=self.num_heads)
280
+ attn = attention(q, k, v, pe=pe, mask = mask, backend = self.backend)
281
+ txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
282
+
283
+ # calculate the img blocks
284
+ img = img + img_mod1.gate * self.img_attn.proj(img_attn)
285
+ img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)
286
+
287
+ # calculate the txt blocks
288
+ txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
289
+ txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
290
+ x = torch.cat((txt, img), 1)
291
+ return x
292
+
293
+
294
+ class SingleStreamBlock(nn.Module):
295
+ """
296
+ A DiT block with parallel linear layers as described in
297
+ https://arxiv.org/abs/2302.05442 and adapted modulation interface.
298
+ """
299
+
300
+ def __init__(
301
+ self,
302
+ hidden_size: int,
303
+ num_heads: int,
304
+ mlp_ratio: float = 4.0,
305
+ qk_scale: float | None = None,
306
+ backend='pytorch'
307
+ ):
308
+ super().__init__()
309
+ self.hidden_dim = hidden_size
310
+ self.num_heads = num_heads
311
+ head_dim = hidden_size // num_heads
312
+ self.scale = qk_scale or head_dim**-0.5
313
+
314
+ self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
315
+ # qkv and mlp_in
316
+ self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
317
+ # proj and mlp_out
318
+ self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)
319
+
320
+ self.norm = QKNorm(head_dim)
321
+
322
+ self.hidden_size = hidden_size
323
+ self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
324
+
325
+ self.mlp_act = nn.GELU(approximate="tanh")
326
+ self.modulation = Modulation(hidden_size, double=False)
327
+ self.backend = backend
328
+
329
+ def forward(self, x: Tensor, vec: Tensor, pe: Tensor, mask: Tensor = None) -> Tensor:
330
+ mod, _ = self.modulation(vec)
331
+ x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
332
+ qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
333
+
334
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
335
+ q, k = self.norm(q, k, v)
336
+ if mask is not None:
337
+ mask = repeat(mask, 'B L S-> B H L S', H=self.num_heads)
338
+ # compute attention
339
+ attn = attention(q, k, v, pe=pe, mask = mask, backend=self.backend)
340
+ # compute activation in mlp stream, cat again and run second linear layer
341
+ output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
342
+ return x + mod.gate * output
343
+
344
+
345
+ class DoubleStreamBlockACE(DoubleStreamBlock):
346
+ def forward(self,
347
+ x: Tensor,
348
+ vec: Tensor,
349
+ pe: Tensor,
350
+ edit_vec: Tensor | None = None,
351
+ mask: Tensor = None,
352
+ txt_length = None,
353
+ x_length = None):
354
+ img_mod1, img_mod2 = self.img_mod(vec)
355
+ txt_mod1, txt_mod2 = self.txt_mod(vec)
356
+ if edit_vec is not None:
357
+ edit_mod1, edit_mod2 = self.img_mod(edit_vec)
358
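+ # sequence layout: [txt | img | edit]; edit tokens share the image-stream weights but use their own modulation derived from edit_vec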
+ txt, img, edit = x[:, :txt_length], x[:, txt_length:txt_length+x_length], x[:, txt_length+x_length:]
359
+ else:
360
+ edit_mod1, edit_mod2 = None, None
361
+ txt, img = x[:, :txt_length], x[:, txt_length:]
362
+ edit = None
363
+
364
+
365
+ # prepare image for attention
366
+ img_modulated = self.img_norm1(img)
367
+ img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
368
+ img_qkv = self.img_attn.qkv(img_modulated)
369
+ img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
370
+ img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
371
+ # prepare txt for attention
372
+ txt_modulated = self.txt_norm1(txt)
373
+ txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
374
+ txt_qkv = self.txt_attn.qkv(txt_modulated)
375
+ txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
376
+ txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
377
+ # prepare edit for attention
378
+ if edit_vec is not None:
379
+ edit_modulated = self.img_norm1(edit)
380
+ edit_modulated = (1 + edit_mod1.scale) * edit_modulated + edit_mod1.shift
381
+ edit_qkv = self.img_attn.qkv(edit_modulated)
382
+ edit_q, edit_k, edit_v = rearrange(edit_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
383
+ edit_q, edit_k = self.img_attn.norm(edit_q, edit_k, edit_v)
384
+ q = torch.cat((txt_q, img_q, edit_q), dim=2)
385
+ k = torch.cat((txt_k, img_k, edit_k), dim=2)
386
+ v = torch.cat((txt_v, img_v, edit_v), dim=2)
387
+ else:
388
+ q = torch.cat((txt_q, img_q), dim=2)
389
+ k = torch.cat((txt_k, img_k), dim=2)
390
+ v = torch.cat((txt_v, img_v), dim=2)
391
+
392
+ # run actual attention
393
+ if mask is not None:
394
+ mask = repeat(mask, 'B L S-> B H L S', H=self.num_heads)
395
+ attn = attention(q, k, v, pe=pe, mask = mask, backend = "pytorch")
396
+ if edit_vec is not None:
397
+ txt_attn, img_attn, edit_attn = (attn[:, : txt.shape[1]],
398
+ attn[:, txt.shape[1] : txt.shape[1]+img.shape[1]],
399
+ attn[:, txt.shape[1]+img.shape[1]:])
400
+ # calculate the img blocks
401
+ img = img + img_mod1.gate * self.img_attn.proj(img_attn)
402
+ img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)
403
+
404
+ # calculate the edit blocks
405
+ edit = edit + edit_mod1.gate * self.img_attn.proj(edit_attn)
406
+ edit = edit + edit_mod2.gate * self.img_mlp((1 + edit_mod2.scale) * self.img_norm2(edit) + edit_mod2.shift)
407
+
408
+ # calculate the txt blocks
409
+ txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
410
+ txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
411
+
412
+ x = torch.cat((txt, img, edit), 1)
413
+ else:
414
+ txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
415
+ # calculate the img blocks
416
+ img = img + img_mod1.gate * self.img_attn.proj(img_attn)
417
+ img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)
418
+
419
+ # calculate the txt blocks
420
+ txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
421
+ txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
422
+ x = torch.cat((txt, img), 1)
423
+ return x
424
+
425
+
426
+ class SingleStreamBlockACE(SingleStreamBlock):
427
+ """
428
+ A DiT block with parallel linear layers as described in
429
+ https://arxiv.org/abs/2302.05442 and adapted modulation interface.
430
+ """
431
+
432
+ def forward(self, x: Tensor, vec: Tensor,
433
+ pe: Tensor, mask: Tensor = None,
434
+ edit_vec: Tensor | None = None,
435
+ txt_length=None,
436
+ x_length=None
437
+ ) -> Tensor:
438
+ mod, _ = self.modulation(vec)
439
+ if edit_vec is not None:
440
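+ # split off the edit tokens so they can be modulated with edit_vec (zero timestep) before the shared attention/MLP, then re-concatenated below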
+ x, edit = x[:, :txt_length + x_length], x[:, txt_length + x_length:]
441
+ e_mod, _ = self.modulation(edit_vec)
442
+ edit_mod = (1 + e_mod.scale) * self.pre_norm(edit) + e_mod.shift
443
+ edit_qkv, edit_mlp = torch.split(self.linear1(edit_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
444
+
445
+ x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
446
+ qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
447
+ qkv, mlp = torch.cat([qkv, edit_qkv], 1), torch.cat([mlp, edit_mlp], 1)
448
+ else:
449
+ x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
450
+ qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
451
+
452
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
453
+ q, k = self.norm(q, k, v)
454
+ if mask is not None:
455
+ mask = repeat(mask, 'B L S-> B H L S', H=self.num_heads)
456
+ # compute attention
457
+ attn = attention(q, k, v, pe=pe, mask = mask, backend="pytorch")
458
+ # compute activation in mlp stream, cat again and run second linear layer
459
+ output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
460
+
461
+ if edit_vec is not None:
462
+ x_output, edit_output = output.split([x.shape[1], edit.shape[1]], dim = 1)
463
+ x = x + mod.gate * x_output
464
+ edit = edit + e_mod.gate * edit_output
465
+ x = torch.cat((x, edit), 1)
466
+ return x
467
+ else:
468
+ return x + mod.gate * output
469
+
470
+
471
+ class LastLayer(nn.Module):
472
+ def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
473
+ super().__init__()
474
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
475
+ self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
476
+ self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True))
477
+
478
+ def forward(self, x: Tensor, vec: Tensor) -> Tensor:
479
+ shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
480
+ x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
481
+ x = self.linear(x)
482
+ return x
483
+
484
+
485
+ if __name__ == '__main__':
486
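+ # sanity check: RoPE embeddings of concatenated ids should equal the concatenation of the per-part embeddings along the sequence axis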
+ pe = EmbedND(dim=64, theta=10000, axes_dim=[16, 56, 56])
487
+
488
+ ix_id = torch.zeros(64 // 2, 64 // 2, 3)
489
+ ix_id[..., 1] = ix_id[..., 1] + torch.arange(64 // 2)[:, None]
490
+ ix_id[..., 2] = ix_id[..., 2] + torch.arange(64 // 2)[None, :]
491
+ ix_id = rearrange(ix_id, "h w c -> 1 (h w) c")
492
+ pos = torch.cat([ix_id, ix_id], dim = 1)
493
+ a = pe(pos)
494
+
495
+ b = torch.cat([pe(ix_id), pe(ix_id)], dim = 2)
496
+
497
+ print(a - b)