modify somefiles

- infer.py +0 -364
- modules/__init__.py +0 -1
- modules/model/__init__.py +0 -1
- modules/model/backbone/__init__.py +0 -3
- modules/model/backbone/ace.py +0 -373
- modules/model/backbone/layers.py +0 -386
- modules/model/backbone/pos_embed.py +0 -85
- modules/model/diffusion/__init__.py +0 -6
- modules/model/diffusion/diffusions.py +0 -206
- modules/model/diffusion/samplers.py +0 -69
- modules/model/diffusion/schedules.py +0 -30
- modules/model/embedder/__init__.py +0 -1
- modules/model/embedder/embedder.py +0 -184
- modules/model/network/__init__.py +0 -1
- modules/model/network/ldm_ace.py +0 -353
- modules/model/utils/basic_utils.py +0 -104
infer.py
DELETED
@@ -1,364 +0,0 @@
```python
# -*- coding: utf-8 -*-
# Copyright (c) Alibaba, Inc. and its affiliates.
import copy
import math
import random
import numpy as np
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms.functional as TF

from scepter.modules.model.registry import DIFFUSIONS
from scepter.modules.utils.distribute import we
from scepter.modules.utils.logger import get_logger
from scepter.modules.inference.diffusion_inference import DiffusionInference, get_model

from modules.model.utils.basic_utils import (
    check_list_of_list,
    pack_imagelist_into_tensor_v2 as pack_imagelist_into_tensor,
    to_device,
    unpack_tensor_into_imagelist
)


def process_edit_image(images, masks, tasks,
                       max_seq_len=1024, max_aspect_ratio=4, d=16, **kwargs):
    if not isinstance(images, list):
        images = [images]
    if not isinstance(masks, list):
        masks = [masks]
    if not isinstance(tasks, list):
        tasks = [tasks]

    img_tensors = []
    mask_tensors = []
    for img, mask, task in zip(images, masks, tasks):
        if mask is None or mask == '':
            mask = Image.new('L', img.size, 0)
        W, H = img.size
        if H / W > max_aspect_ratio:
            img = TF.center_crop(img, [int(max_aspect_ratio * W), W])
            mask = TF.center_crop(mask, [int(max_aspect_ratio * W), W])
        elif W / H > max_aspect_ratio:
            img = TF.center_crop(img, [H, int(max_aspect_ratio * H)])
            mask = TF.center_crop(mask, [H, int(max_aspect_ratio * H)])

        H, W = img.height, img.width
        scale = min(1.0, math.sqrt(max_seq_len / ((H / d) * (W / d))))
        rH = int(H * scale) // d * d  # ensure divisible by d
        rW = int(W * scale) // d * d

        img = TF.resize(img, (rH, rW),
                        interpolation=TF.InterpolationMode.BICUBIC)
        mask = TF.resize(mask, (rH, rW),
                         interpolation=TF.InterpolationMode.NEAREST_EXACT)

        mask = np.asarray(mask)
        mask = np.where(mask > 128, 1, 0)
        mask = mask.astype(np.float32) if np.any(mask) else np.ones_like(mask).astype(np.float32)

        img_tensor = TF.to_tensor(img).to(we.device_id)
        img_tensor = TF.normalize(img_tensor,
                                  mean=[0.5, 0.5, 0.5],
                                  std=[0.5, 0.5, 0.5])
        mask_tensor = TF.to_tensor(mask).to(we.device_id)
        if task in ['inpainting', 'Try On', 'Inpainting']:
            mask_indicator = mask_tensor.repeat(3, 1, 1)
            img_tensor[mask_indicator == 1] = -1.0
        img_tensors.append(img_tensor)
        mask_tensors.append(mask_tensor)
    return img_tensors, mask_tensors


class TextEmbedding(nn.Module):
    def __init__(self, embedding_shape):
        super().__init__()
        self.pos = nn.Parameter(data=torch.zeros(embedding_shape))


class ACEInference(DiffusionInference):
    def __init__(self, logger=None):
        if logger is None:
            logger = get_logger(name='scepter')
        self.logger = logger
        self.loaded_model = {}
        self.loaded_model_name = [
            'diffusion_model', 'first_stage_model', 'cond_stage_model'
        ]

    def init_from_cfg(self, cfg):
        self.name = cfg.NAME
        self.is_default = cfg.get('IS_DEFAULT', False)
        module_paras = self.load_default(cfg.get('DEFAULT_PARAS', None))
        assert cfg.have('MODEL')

        self.diffusion_model = self.infer_model(
            cfg.MODEL.DIFFUSION_MODEL,
            module_paras.get('DIFFUSION_MODEL', None)) if cfg.MODEL.have('DIFFUSION_MODEL') else None
        self.first_stage_model = self.infer_model(
            cfg.MODEL.FIRST_STAGE_MODEL,
            module_paras.get('FIRST_STAGE_MODEL', None)) if cfg.MODEL.have('FIRST_STAGE_MODEL') else None
        self.cond_stage_model = self.infer_model(
            cfg.MODEL.COND_STAGE_MODEL,
            module_paras.get('COND_STAGE_MODEL', None)) if cfg.MODEL.have('COND_STAGE_MODEL') else None
        self.diffusion = DIFFUSIONS.build(cfg.MODEL.DIFFUSION, logger=self.logger)

        self.interpolate_func = lambda x: (F.interpolate(
            x.unsqueeze(0),
            scale_factor=1 / self.size_factor,
            mode='nearest-exact') if x is not None else None)
        self.text_indentifers = cfg.MODEL.get('TEXT_IDENTIFIER', [])
        self.use_text_pos_embeddings = cfg.MODEL.get('USE_TEXT_POS_EMBEDDINGS', False)
        if self.use_text_pos_embeddings:
            self.text_position_embeddings = TextEmbedding(
                (10, 4096)).eval().requires_grad_(False).to(we.device_id)
        else:
            self.text_position_embeddings = None

        self.max_seq_len = cfg.MODEL.DIFFUSION_MODEL.MAX_SEQ_LEN
        self.scale_factor = cfg.get('SCALE_FACTOR', 0.18215)
        self.size_factor = cfg.get('SIZE_FACTOR', 8)
        self.decoder_bias = cfg.get('DECODER_BIAS', 0)
        self.default_n_prompt = cfg.get('DEFAULT_N_PROMPT', '')

        self.dynamic_load(self.first_stage_model, 'first_stage_model')
        self.dynamic_load(self.cond_stage_model, 'cond_stage_model')
        self.dynamic_load(self.diffusion_model, 'diffusion_model')

    @torch.no_grad()
    def encode_first_stage(self, x, **kwargs):
        _, dtype = self.get_function_info(self.first_stage_model, 'encode')
        with torch.autocast('cuda',
                            enabled=(dtype != 'float32'),
                            dtype=getattr(torch, dtype)):
            z = [
                self.scale_factor * get_model(self.first_stage_model)._encode(
                    i.unsqueeze(0).to(getattr(torch, dtype))) for i in x
            ]
        return z

    @torch.no_grad()
    def decode_first_stage(self, z):
        _, dtype = self.get_function_info(self.first_stage_model, 'decode')
        with torch.autocast('cuda',
                            enabled=(dtype != 'float32'),
                            dtype=getattr(torch, dtype)):
            x = [
                get_model(self.first_stage_model)._decode(
                    1. / self.scale_factor * i.to(getattr(torch, dtype)))
                for i in z
            ]
        return x

    @torch.no_grad()
    def __call__(self,
                 image=None,
                 mask=None,
                 prompt='',
                 task=None,
                 negative_prompt='',
                 output_height=512,
                 output_width=512,
                 sampler='ddim',
                 sample_steps=20,
                 guide_scale=4.5,
                 guide_rescale=0.5,
                 seed=-1,
                 history_io=None,
                 tar_index=0,
                 **kwargs):
        input_image, input_mask = image, mask
        g = torch.Generator(device=we.device_id)
        seed = seed if seed >= 0 else random.randint(0, 2**32 - 1)
        g.manual_seed(int(seed))

        if input_image is not None:
            assert isinstance(input_image, list) and isinstance(input_mask, list)
            if task is None:
                task = [''] * len(input_image)
            if not isinstance(prompt, list):
                prompt = [prompt] * len(input_image)
            if history_io is not None and len(history_io) > 0:
                his_image, his_maks, his_prompt, his_task = history_io[
                    'image'], history_io['mask'], history_io['prompt'], history_io['task']
                assert len(his_image) == len(his_maks) == len(his_prompt) == len(his_task)
                input_image = his_image + input_image
                input_mask = his_maks + input_mask
                task = his_task + task
                prompt = his_prompt + [prompt[-1]]
            prompt = [
                pp.replace('{image}', f'{{image{i}}}') if i > 0 else pp
                for i, pp in enumerate(prompt)
            ]

            edit_image, edit_image_mask = process_edit_image(
                input_image, input_mask, task, max_seq_len=self.max_seq_len)

            image, image_mask = edit_image[tar_index], edit_image_mask[tar_index]
            edit_image, edit_image_mask = [edit_image], [edit_image_mask]

        else:
            edit_image = edit_image_mask = [[]]
            image = torch.zeros(size=[3, int(output_height), int(output_width)])
            image_mask = torch.ones(size=[1, int(output_height), int(output_width)])
            if not isinstance(prompt, list):
                prompt = [prompt]

        image, image_mask, prompt = [image], [image_mask], [prompt]
        assert check_list_of_list(prompt) and check_list_of_list(
            edit_image) and check_list_of_list(edit_image_mask)
        # Assign Negative Prompt
        if isinstance(negative_prompt, list):
            negative_prompt = negative_prompt[0]
        assert isinstance(negative_prompt, str)

        n_prompt = copy.deepcopy(prompt)
        for nn_p_id, nn_p in enumerate(n_prompt):
            assert isinstance(nn_p, list)
            n_prompt[nn_p_id][-1] = negative_prompt

        ctx, null_ctx = {}, {}

        # Get Noise Shape
        image = to_device(image)
        x = self.encode_first_stage(image)
        noise = [
            torch.empty(*i.shape, device=we.device_id).normal_(generator=g)
            for i in x
        ]
        noise, x_shapes = pack_imagelist_into_tensor(noise)
        ctx['x_shapes'] = null_ctx['x_shapes'] = x_shapes

        image_mask = to_device(image_mask, strict=False)
        cond_mask = [self.interpolate_func(i) for i in image_mask
                     ] if image_mask is not None else [None] * len(image)
        ctx['x_mask'] = null_ctx['x_mask'] = cond_mask

        # Encode Prompt
        function_name, dtype = self.get_function_info(self.cond_stage_model)
        cont, cont_mask = getattr(get_model(self.cond_stage_model),
                                  function_name)(prompt)
        cont, cont_mask = self.cond_stage_embeddings(prompt, edit_image, cont,
                                                     cont_mask)
        null_cont, null_cont_mask = getattr(get_model(self.cond_stage_model),
                                            function_name)(n_prompt)
        null_cont, null_cont_mask = self.cond_stage_embeddings(
            prompt, edit_image, null_cont, null_cont_mask)
        ctx['crossattn'] = cont
        null_ctx['crossattn'] = null_cont

        # Encode Edit Images
        edit_image = [to_device(i, strict=False) for i in edit_image]
        edit_image_mask = [to_device(i, strict=False) for i in edit_image_mask]
        e_img, e_mask = [], []
        for u, m in zip(edit_image, edit_image_mask):
            if u is None:
                continue
            if m is None:
                m = [None] * len(u)
            e_img.append(self.encode_first_stage(u, **kwargs))
            e_mask.append([self.interpolate_func(i) for i in m])

        null_ctx['edit'] = ctx['edit'] = e_img
        null_ctx['edit_mask'] = ctx['edit_mask'] = e_mask

        # Diffusion Process
        function_name, dtype = self.get_function_info(self.diffusion_model)
        with torch.autocast('cuda',
                            enabled=dtype in ('float16', 'bfloat16'),
                            dtype=getattr(torch, dtype)):
            latent = self.diffusion.sample(
                noise=noise,
                sampler=sampler,
                model=get_model(self.diffusion_model),
                model_kwargs=[{
                    'cond': ctx,
                    'mask': cont_mask,
                    'text_position_embeddings':
                    self.text_position_embeddings.pos if hasattr(
                        self.text_position_embeddings, 'pos') else None
                }, {
                    'cond': null_ctx,
                    'mask': null_cont_mask,
                    'text_position_embeddings':
                    self.text_position_embeddings.pos if hasattr(
                        self.text_position_embeddings, 'pos') else None
                }] if guide_scale is not None and guide_scale > 1 else {
                    'cond': null_ctx,
                    'mask': cont_mask,
                    'text_position_embeddings':
                    self.text_position_embeddings.pos if hasattr(
                        self.text_position_embeddings, 'pos') else None
                },
                steps=sample_steps,
                show_progress=True,
                seed=seed,
                guide_scale=guide_scale,
                guide_rescale=guide_rescale,
                return_intermediate=None,
                **kwargs)

        # Decode to Pixel Space
        samples = unpack_tensor_into_imagelist(latent, x_shapes)
        x_samples = self.decode_first_stage(samples)

        imgs = [
            torch.clamp((x_i + 1.0) / 2.0 + self.decoder_bias / 255,
                        min=0.0,
                        max=1.0).squeeze(0).permute(1, 2, 0).cpu().numpy()
            for x_i in x_samples
        ]
        imgs = [Image.fromarray((img * 255).astype(np.uint8)) for img in imgs]
        return imgs

    def cond_stage_embeddings(self, prompt, edit_image, cont, cont_mask):
        if self.use_text_pos_embeddings and not torch.sum(
                self.text_position_embeddings.pos) > 0:
            identifier_cont, _ = getattr(get_model(self.cond_stage_model),
                                         'encode')(self.text_indentifers,
                                                   return_mask=True)
            self.text_position_embeddings.load_state_dict(
                {'pos': identifier_cont[:, 0, :]})

        cont_, cont_mask_ = [], []
        for pp, edit, c, cm in zip(prompt, edit_image, cont, cont_mask):
            if isinstance(pp, list):
                cont_.append([c[-1], *c] if len(edit) > 0 else [c[-1]])
                cont_mask_.append([cm[-1], *cm] if len(edit) > 0 else [cm[-1]])
            else:
                raise NotImplementedError

        return cont_, cont_mask_
```
modules/__init__.py
DELETED
@@ -1 +0,0 @@
```python
from . import model
```
modules/model/__init__.py
DELETED
@@ -1 +0,0 @@
```python
from . import backbone, embedder, diffusion, network
```
modules/model/backbone/__init__.py
DELETED
@@ -1,3 +0,0 @@
```python
# -*- coding: utf-8 -*-
# Copyright (c) Alibaba, Inc. and its affiliates.
from .ace import DiTACE
```
modules/model/backbone/ace.py
DELETED
@@ -1,373 +0,0 @@
```python
# -*- coding: utf-8 -*-
# Copyright (c) Alibaba, Inc. and its affiliates.
import re
from collections import OrderedDict
from functools import partial

import torch
import torch.nn as nn
from einops import rearrange
from torch.nn.utils.rnn import pad_sequence
from torch.utils.checkpoint import checkpoint_sequential

from scepter.modules.model.base_model import BaseModel
from scepter.modules.model.registry import BACKBONES
from scepter.modules.utils.config import dict_to_yaml
from scepter.modules.utils.file_system import FS

from .layers import (
    Mlp,
    TimestepEmbedder,
    PatchEmbed,
    DiTACEBlock,
    T2IFinalLayer
)
from .pos_embed import rope_params


@BACKBONES.register_class()
class DiTACE(BaseModel):

    para_dict = {
        'PATCH_SIZE': {'value': 2, 'description': ''},
        'IN_CHANNELS': {'value': 4, 'description': ''},
        'HIDDEN_SIZE': {'value': 1152, 'description': ''},
        'DEPTH': {'value': 28, 'description': ''},
        'NUM_HEADS': {'value': 16, 'description': ''},
        'MLP_RATIO': {'value': 4.0, 'description': ''},
        'PRED_SIGMA': {'value': True, 'description': ''},
        'DROP_PATH': {'value': 0., 'description': ''},
        'WINDOW_SIZE': {'value': 0, 'description': ''},
        'WINDOW_BLOCK_INDEXES': {'value': None, 'description': ''},
        'Y_CHANNELS': {'value': 4096, 'description': ''},
        'ATTENTION_BACKEND': {'value': None, 'description': ''},
        'QK_NORM': {
            'value': True,
            'description': 'Whether to use RMSNorm for query and key.',
        },
    }
    para_dict.update(BaseModel.para_dict)

    def __init__(self, cfg, logger):
        super().__init__(cfg, logger=logger)
        self.window_block_indexes = cfg.get('WINDOW_BLOCK_INDEXES', None)
        if self.window_block_indexes is None:
            self.window_block_indexes = []
        self.pred_sigma = cfg.get('PRED_SIGMA', True)
        self.in_channels = cfg.get('IN_CHANNELS', 4)
        self.out_channels = self.in_channels * 2 if self.pred_sigma else self.in_channels
        self.patch_size = cfg.get('PATCH_SIZE', 2)
        self.num_heads = cfg.get('NUM_HEADS', 16)
        self.hidden_size = cfg.get('HIDDEN_SIZE', 1152)
        self.y_channels = cfg.get('Y_CHANNELS', 4096)
        self.drop_path = cfg.get('DROP_PATH', 0.)
        self.depth = cfg.get('DEPTH', 28)
        self.mlp_ratio = cfg.get('MLP_RATIO', 4.0)
        self.use_grad_checkpoint = cfg.get('USE_GRAD_CHECKPOINT', False)
        self.attention_backend = cfg.get('ATTENTION_BACKEND', None)
        self.max_seq_len = cfg.get('MAX_SEQ_LEN', 1024)
        self.qk_norm = cfg.get('QK_NORM', False)
        self.ignore_keys = cfg.get('IGNORE_KEYS', [])
        assert (self.hidden_size % self.num_heads
                ) == 0 and (self.hidden_size // self.num_heads) % 2 == 0
        d = self.hidden_size // self.num_heads
        self.freqs = torch.cat(
            [
                rope_params(self.max_seq_len, d - 4 * (d // 6)),  # T (~1/3)
                rope_params(self.max_seq_len, 2 * (d // 6)),  # H (~1/3)
                rope_params(self.max_seq_len, 2 * (d // 6))  # W (~1/3)
            ],
            dim=1)

        # init embedder
        self.x_embedder = PatchEmbed(self.patch_size,
                                     self.in_channels + 1,
                                     self.hidden_size,
                                     bias=True,
                                     flatten=False)
        self.t_embedder = TimestepEmbedder(self.hidden_size)
        self.y_embedder = Mlp(in_features=self.y_channels,
                              hidden_features=self.hidden_size,
                              out_features=self.hidden_size,
                              act_layer=lambda: nn.GELU(approximate='tanh'),
                              drop=0)
        self.t_block = nn.Sequential(
            nn.SiLU(),
            nn.Linear(self.hidden_size, 6 * self.hidden_size, bias=True))
        # init blocks
        drop_path = [
            x.item() for x in torch.linspace(0, self.drop_path, self.depth)
        ]
        self.blocks = nn.ModuleList([
            DiTACEBlock(self.hidden_size,
                        self.num_heads,
                        mlp_ratio=self.mlp_ratio,
                        drop_path=drop_path[i],
                        window_size=self.window_size
                        if i in self.window_block_indexes else 0,
                        backend=self.attention_backend,
                        use_condition=True,
                        qk_norm=self.qk_norm) for i in range(self.depth)
        ])
        self.final_layer = T2IFinalLayer(self.hidden_size, self.patch_size,
                                         self.out_channels)
        self.initialize_weights()

    def load_pretrained_model(self, pretrained_model):
        if pretrained_model:
            with FS.get_from(pretrained_model, wait_finish=True) as local_path:
                model = torch.load(local_path, map_location='cpu')
                if 'state_dict' in model:
                    model = model['state_dict']
                new_ckpt = OrderedDict()
                for k, v in model.items():
                    if self.ignore_keys is not None:
                        if (isinstance(self.ignore_keys, str) and re.match(self.ignore_keys, k)) or \
                                (isinstance(self.ignore_keys, list) and k in self.ignore_keys):
                            continue
                    k = k.replace('.cross_attn.q_linear.', '.cross_attn.q.')
                    k = k.replace('.cross_attn.proj.', '.cross_attn.o.').replace(
                        '.attn.proj.', '.attn.o.')
                    if '.cross_attn.kv_linear.' in k:
                        k_p, v_p = torch.split(v, v.shape[0] // 2)
                        new_ckpt[k.replace('.cross_attn.kv_linear.',
                                           '.cross_attn.k.')] = k_p
                        new_ckpt[k.replace('.cross_attn.kv_linear.',
                                           '.cross_attn.v.')] = v_p
                    elif '.attn.qkv.' in k:
                        q_p, k_p, v_p = torch.split(v, v.shape[0] // 3)
                        new_ckpt[k.replace('.attn.qkv.', '.attn.q.')] = q_p
                        new_ckpt[k.replace('.attn.qkv.', '.attn.k.')] = k_p
                        new_ckpt[k.replace('.attn.qkv.', '.attn.v.')] = v_p
                    elif 'y_embedder.y_proj.' in k:
                        new_ckpt[k.replace('y_embedder.y_proj.',
                                           'y_embedder.')] = v
                    elif k in ('x_embedder.proj.weight'):
                        model_p = self.state_dict()[k]
                        if v.shape != model_p.shape:
                            model_p.zero_()
                            model_p[:, :4, :, :].copy_(v)
                            new_ckpt[k] = torch.nn.parameter.Parameter(model_p)
                        else:
                            new_ckpt[k] = v
                    elif k in ('x_embedder.proj.bias'):
                        new_ckpt[k] = v
                    else:
                        new_ckpt[k] = v
                missing, unexpected = self.load_state_dict(new_ckpt,
                                                           strict=False)
                print(
                    f'Restored from {pretrained_model} with {len(missing)} missing and {len(unexpected)} unexpected keys'
                )
                if len(missing) > 0:
                    print(f'Missing Keys:\n {missing}')
                if len(unexpected) > 0:
                    print(f'\nUnexpected Keys:\n {unexpected}')

    def forward(self,
                x,
                t=None,
                cond=dict(),
                mask=None,
                text_position_embeddings=None,
                gc_seg=-1,
                **kwargs):
        if self.freqs.device != x.device:
            self.freqs = self.freqs.to(x.device)
        if isinstance(cond, dict):
            context = cond.get('crossattn', None)
        else:
            context = cond
        if text_position_embeddings is not None:
            # default: use the text_position_embeddings in state_dict;
            # if state_dict doesn't include this key, use the arg text_position_embeddings
            proj_position_embeddings = self.y_embedder(text_position_embeddings)
        else:
            proj_position_embeddings = None

        ctx_batch, txt_lens = [], []
        if mask is not None and isinstance(mask, list):
            for ctx, ctx_mask in zip(context, mask):
                for frame_id, one_ctx in enumerate(zip(ctx, ctx_mask)):
                    u, m = one_ctx
                    t_len = m.flatten().sum()  # l
                    u = u[:t_len]
                    u = self.y_embedder(u)
                    if frame_id == 0:
                        u = u + proj_position_embeddings[
                            len(ctx) - 1] if proj_position_embeddings is not None else u
                    else:
                        u = u + proj_position_embeddings[
                            frame_id - 1] if proj_position_embeddings is not None else u
                    ctx_batch.append(u)
                    txt_lens.append(t_len)
        else:
            raise TypeError
        y = torch.cat(ctx_batch, dim=0)
        txt_lens = torch.LongTensor(txt_lens).to(x.device, non_blocking=True)

        batch_frames = []
        for u, shape, m in zip(x, cond['x_shapes'], cond['x_mask']):
            u = u[:, :shape[0] * shape[1]].view(-1, shape[0], shape[1])
            m = torch.ones_like(u[[0], :, :]) if m is None else m.squeeze(0)
            batch_frames.append([torch.cat([u, m], dim=0).unsqueeze(0)])
        if 'edit' in cond:
            for i, (edit, edit_mask) in enumerate(
                    zip(cond['edit'], cond['edit_mask'])):
                if edit is None:
                    continue
                for u, m in zip(edit, edit_mask):
                    u = u.squeeze(0)
                    m = torch.ones_like(
                        u[[0], :, :]) if m is None else m.squeeze(0)
                    batch_frames[i].append(
                        torch.cat([u, m], dim=0).unsqueeze(0))

        patch_batch, shape_batch, self_x_len, cross_x_len = [], [], [], []
        for frames in batch_frames:
            patches, patch_shapes = [], []
            self_x_len.append(0)
            for frame_id, u in enumerate(frames):
                u = self.x_embedder(u)
                h, w = u.size(2), u.size(3)
                u = rearrange(u, '1 c h w -> (h w) c')
                if frame_id == 0:
                    u = u + proj_position_embeddings[
                        len(frames) - 1] if proj_position_embeddings is not None else u
                else:
                    u = u + proj_position_embeddings[
                        frame_id - 1] if proj_position_embeddings is not None else u
                patches.append(u)
                patch_shapes.append([h, w])
                cross_x_len.append(h * w)  # b*s, 1
                self_x_len[-1] += h * w  # b, 1
            # u = torch.cat(patches, dim=0)
            patch_batch.extend(patches)
            shape_batch.append(
                torch.LongTensor(patch_shapes).to(x.device, non_blocking=True))
        # repeat t to align with x
        t = torch.cat([t[i].repeat(l) for i, l in enumerate(self_x_len)])
        self_x_len, cross_x_len = (torch.LongTensor(self_x_len).to(
            x.device, non_blocking=True), torch.LongTensor(cross_x_len).to(
                x.device, non_blocking=True))
        # x = pad_sequence(tuple(patch_batch), batch_first=True)  # b, s*max(cl), c
        x = torch.cat(patch_batch, dim=0)
        x_shapes = pad_sequence(tuple(shape_batch),
                                batch_first=True)  # b, max(len(frames)), 2
        t = self.t_embedder(t)  # (N, D)
        t0 = self.t_block(t)
        # y = self.y_embedder(context)

        kwargs = dict(y=y,
                      t=t0,
                      x_shapes=x_shapes,
                      self_x_len=self_x_len,
                      cross_x_len=cross_x_len,
                      freqs=self.freqs,
                      txt_lens=txt_lens)
        if self.use_grad_checkpoint and gc_seg >= 0:
            x = checkpoint_sequential(
                functions=[partial(block, **kwargs) for block in self.blocks],
                segments=gc_seg if gc_seg > 0 else len(self.blocks),
                input=x,
                use_reentrant=False)
        else:
            for block in self.blocks:
                x = block(x, **kwargs)
        x = self.final_layer(x, t)  # b*s*n, d
        outs, cur_length = [], 0
        p = self.patch_size
        for seq_length, shape in zip(self_x_len, shape_batch):
            x_i = x[cur_length:cur_length + seq_length]
            h, w = shape[0].tolist()
            u = x_i[:h * w].view(h, w, p, p, -1)
            u = rearrange(u, 'h w p q c -> (h p w q) c'
                          )  # dump into sequence for following tensor ops
            cur_length = cur_length + seq_length
            outs.append(u)
        x = pad_sequence(tuple(outs), batch_first=True).permute(0, 2, 1)
        if self.pred_sigma:
            return x.chunk(2, dim=1)[0]
        else:
            return x

    def initialize_weights(self):
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)
        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        w = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
        nn.init.normal_(self.t_block[1].weight, std=0.02)
        # Initialize caption embedding MLP:
        if hasattr(self, 'y_embedder'):
            nn.init.normal_(self.y_embedder.fc1.weight, std=0.02)
            nn.init.normal_(self.y_embedder.fc2.weight, std=0.02)
        # Zero-out adaLN modulation layers
        for block in self.blocks:
            nn.init.constant_(block.cross_attn.o.weight, 0)
            nn.init.constant_(block.cross_attn.o.bias, 0)
        # Zero-out output layers:
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)

    @property
    def dtype(self):
        return next(self.parameters()).dtype

    @staticmethod
    def get_config_template():
        return dict_to_yaml('BACKBONE',
                            __class__.__name__,
                            DiTACE.para_dict,
                            set_name=True)
```
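One detail worth making explicit: `DiTACE.__init__` splits the per-head channel dimension `d` into a temporal slice of width `d - 4*(d//6)` and two spatial slices of width `2*(d//6)` each. Since `rope_params` emits `dim // 2` complex frequencies per axis, the concatenated table has exactly `d // 2` columns, which is the split `rope_apply_multires` later reverses. A standalone arithmetic check (sketch only, nothing beyond what the two functions imply):

```python
def rope_split(head_dim: int):
    # Mirrors the widths passed to rope_params in DiTACE.__init__.
    d = head_dim
    t, h, w = d - 4 * (d // 6), 2 * (d // 6), 2 * (d // 6)
    assert t + h + w == d and t % 2 == h % 2 == w % 2 == 0
    # rope_params returns dim // 2 complex columns per axis, so the
    # concatenated table has d // 2 columns; rope_apply_multires re-splits
    # c = d // 2 as [c - 2*(c//3), c//3, c//3], which must line up.
    c = d // 2
    assert (t // 2, h // 2, w // 2) == (c - 2 * (c // 3), c // 3, c // 3)
    return t, h, w

print(rope_split(1152 // 16))  # (24, 24, 24) for the default config
```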
modules/model/backbone/layers.py
DELETED
@@ -1,386 +0,0 @@
```python
# -*- coding: utf-8 -*-
# Copyright (c) Alibaba, Inc. and its affiliates.
import math
import warnings
import torch
import torch.nn as nn
from .pos_embed import rope_apply_multires as rope_apply

try:
    from flash_attn import (flash_attn_varlen_func)
    FLASHATTN_IS_AVAILABLE = True
except ImportError as e:
    FLASHATTN_IS_AVAILABLE = False
    flash_attn_varlen_func = None
    warnings.warn(f'{e}')

__all__ = [
    "drop_path",
    "modulate",
    "PatchEmbed",
    "DropPath",
    "RMSNorm",
    "Mlp",
    "TimestepEmbedder",
    "DiTEditBlock",
    "MultiHeadAttentionDiTEdit",
    "T2IFinalLayer",
]


def drop_path(x, drop_prob: float = 0., training: bool = False):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
    'survival rate' as the argument.
    """
    if drop_prob == 0. or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0], ) + (1, ) * (
        x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(
        shape, dtype=x.dtype, device=x.device)
    random_tensor.floor_()  # binarize
    output = x.div(keep_prob) * random_tensor
    return output


def modulate(x, shift, scale, unsqueeze=False):
    if unsqueeze:
        return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
    else:
        return x * (1 + scale) + shift


class PatchEmbed(nn.Module):
    """ 2D Image to Patch Embedding
    """
    def __init__(
        self,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        norm_layer=None,
        flatten=True,
        bias=True,
    ):
        super().__init__()
        self.flatten = flatten
        self.proj = nn.Conv2d(in_chans,
                              embed_dim,
                              kernel_size=patch_size,
                              stride=patch_size,
                              bias=bias)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x):
        x = self.proj(x)
        if self.flatten:
            x = x.flatten(2).transpose(1, 2)  # BCHW -> BNC
        x = self.norm(x)
        return x


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    """
    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)


class RMSNorm(nn.Module):
    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.dim = dim
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        return self._norm(x.float()).type_as(x) * self.weight

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)


class Mlp(nn.Module):
    """ MLP as used in Vision Transformer, MLP-Mixer and related networks
    """
    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer=nn.GELU,
                 drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class TimestepEmbedder(nn.Module):
    """
    Embeds scalar timesteps into vector representations.
    """
    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000):
        """
        Create sinusoidal timestep embeddings.
        :param t: a 1-D Tensor of N indices, one per batch element.
                  These may be fractional.
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor of positional embeddings.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        freqs = torch.exp(
            -math.log(max_period) *
            torch.arange(start=0, end=half, dtype=torch.float32) /
            half).to(device=t.device)
        args = t[:, None].float() * freqs[None]
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        if dim % 2:
            embedding = torch.cat(
                [embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
        return embedding

    def forward(self, t):
        t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
        t_emb = self.mlp(t_freq)
        return t_emb


class DiTACEBlock(nn.Module):
    def __init__(self,
                 hidden_size,
                 num_heads,
                 mlp_ratio=4.0,
                 drop_path=0.,
                 window_size=0,
                 backend=None,
                 use_condition=True,
                 qk_norm=False,
                 **block_kwargs):
        super().__init__()
        self.hidden_size = hidden_size
        self.use_condition = use_condition
        self.norm1 = nn.LayerNorm(hidden_size,
                                  elementwise_affine=False,
                                  eps=1e-6)
        self.attn = MultiHeadAttention(hidden_size,
                                       num_heads=num_heads,
                                       qkv_bias=True,
                                       backend=backend,
                                       qk_norm=qk_norm,
                                       **block_kwargs)
        if self.use_condition:
            self.cross_attn = MultiHeadAttention(
                hidden_size,
                context_dim=hidden_size,
                num_heads=num_heads,
                qkv_bias=True,
                backend=backend,
                qk_norm=qk_norm,
                **block_kwargs)
        self.norm2 = nn.LayerNorm(hidden_size,
                                  elementwise_affine=False,
                                  eps=1e-6)
        # to be compatible with lower version pytorch
        approx_gelu = lambda: nn.GELU(approximate='tanh')
        self.mlp = Mlp(in_features=hidden_size,
                       hidden_features=int(hidden_size * mlp_ratio),
                       act_layer=approx_gelu,
                       drop=0)
        self.drop_path = DropPath(
            drop_path) if drop_path > 0. else nn.Identity()
        self.window_size = window_size
        self.scale_shift_table = nn.Parameter(
            torch.randn(6, hidden_size) / hidden_size**0.5)

    def forward(self, x, y, t, **kwargs):
        B = x.size(0)
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
            self.scale_shift_table[None] + t.reshape(B, 6, -1)).chunk(6, dim=1)
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
            shift_msa.squeeze(1), scale_msa.squeeze(1), gate_msa.squeeze(1),
            shift_mlp.squeeze(1), scale_mlp.squeeze(1), gate_mlp.squeeze(1))
        x = x + self.drop_path(gate_msa * self.attn(
            modulate(self.norm1(x), shift_msa, scale_msa, unsqueeze=False), **
            kwargs))
        if self.use_condition:
            x = x + self.cross_attn(x, context=y, **kwargs)

        x = x + self.drop_path(gate_mlp * self.mlp(
            modulate(self.norm2(x), shift_mlp, scale_mlp, unsqueeze=False)))
        return x


class MultiHeadAttention(nn.Module):
    def __init__(self,
                 dim,
                 context_dim=None,
                 num_heads=None,
                 head_dim=None,
                 attn_drop=0.0,
                 qkv_bias=False,
                 dropout=0.0,
                 backend=None,
                 qk_norm=False,
                 eps=1e-6,
                 **block_kwargs):
        super().__init__()
        # consider head_dim first, then num_heads
        num_heads = dim // head_dim if head_dim else num_heads
        head_dim = dim // num_heads
        assert num_heads * head_dim == dim
        context_dim = context_dim or dim
        self.dim = dim
        self.context_dim = context_dim
        self.num_heads = num_heads
        self.head_dim = head_dim
        self.scale = math.pow(head_dim, -0.25)
        # layers
        self.q = nn.Linear(dim, dim, bias=qkv_bias)
        self.k = nn.Linear(context_dim, dim, bias=qkv_bias)
        self.v = nn.Linear(context_dim, dim, bias=qkv_bias)
        self.o = nn.Linear(dim, dim)
        self.norm_q = RMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
        self.norm_k = RMSNorm(dim, eps=eps) if qk_norm else nn.Identity()

        self.dropout = nn.Dropout(dropout)
        self.attention_op = None
        self.attn_drop = nn.Dropout(attn_drop)
        self.backend = backend
        assert self.backend in ('flash_attn', 'xformer_attn', 'pytorch_attn',
                                None)
        if FLASHATTN_IS_AVAILABLE and self.backend in ('flash_attn', None):
            self.backend = 'flash_attn'
            self.softmax_scale = block_kwargs.get('softmax_scale', None)
            self.causal = block_kwargs.get('causal', False)
            self.window_size = block_kwargs.get('window_size', (-1, -1))
            self.deterministic = block_kwargs.get('deterministic', False)
        else:
            raise NotImplementedError

    def flash_attn(self, x, context=None, **kwargs):
        '''
        The implementation will be very slow when mask is not None,
        because we need rearange the x/context features according to mask.
        Args:
            x:
            context:
            mask:
            **kwargs:
        Returns: x
        '''
        dtype = kwargs.get('dtype', torch.float16)

        def half(x):
            return x if x.dtype in [torch.float16, torch.bfloat16
                                    ] else x.to(dtype)

        x_shapes = kwargs['x_shapes']
        freqs = kwargs['freqs']
        self_x_len = kwargs['self_x_len']
        cross_x_len = kwargs['cross_x_len']
        txt_lens = kwargs['txt_lens']
        n, d = self.num_heads, self.head_dim

        if context is None:
            # self-attn
            q = self.norm_q(self.q(x)).view(-1, n, d)
            k = self.norm_q(self.k(x)).view(-1, n, d)
            v = self.v(x).view(-1, n, d)
            q = rope_apply(q, self_x_len, x_shapes, freqs, pad=False)
            k = rope_apply(k, self_x_len, x_shapes, freqs, pad=False)
            q_lens = k_lens = self_x_len
        else:
            # cross-attn
            q = self.norm_q(self.q(x)).view(-1, n, d)
            k = self.norm_q(self.k(context)).view(-1, n, d)
            v = self.v(context).view(-1, n, d)
            q_lens = cross_x_len
            k_lens = txt_lens

        cu_seqlens_q = torch.cat([q_lens.new_zeros([1]),
                                  q_lens]).cumsum(0, dtype=torch.int32)
        cu_seqlens_k = torch.cat([k_lens.new_zeros([1]),
                                  k_lens]).cumsum(0, dtype=torch.int32)
        max_seqlen_q = q_lens.max()
        max_seqlen_k = k_lens.max()

        out_dtype = q.dtype
        q, k, v = half(q), half(k), half(v)
        x = flash_attn_varlen_func(q,
                                   k,
                                   v,
                                   cu_seqlens_q=cu_seqlens_q,
                                   cu_seqlens_k=cu_seqlens_k,
                                   max_seqlen_q=max_seqlen_q,
                                   max_seqlen_k=max_seqlen_k,
                                   dropout_p=self.attn_drop.p,
                                   softmax_scale=self.softmax_scale,
                                   causal=self.causal,
                                   window_size=self.window_size,
                                   deterministic=self.deterministic)

        x = x.type(out_dtype)
        x = x.reshape(-1, n * d)
        x = self.o(x)
        x = self.dropout(x)
        return x

    def forward(self, x, context=None, **kwargs):
        x = getattr(self, self.backend)(x, context=context, **kwargs)
        return x


class T2IFinalLayer(nn.Module):
    """
    The final layer of PixArt.
    """
    def __init__(self, hidden_size, patch_size, out_channels):
        super().__init__()
        self.norm_final = nn.LayerNorm(hidden_size,
                                       elementwise_affine=False,
                                       eps=1e-6)
        self.linear = nn.Linear(hidden_size,
                                patch_size * patch_size * out_channels,
                                bias=True)
        self.scale_shift_table = nn.Parameter(
            torch.randn(2, hidden_size) / hidden_size**0.5)
        self.out_channels = out_channels

    def forward(self, x, t):
        shift, scale = (self.scale_shift_table[None] + t[:, None]).chunk(2,
                                                                         dim=1)
        shift, scale = shift.squeeze(1), scale.squeeze(1)
        x = modulate(self.norm_final(x), shift, scale)
        x = self.linear(x)
        return x
```
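The deleted `flash_attn` path packs every sample into one `(total_tokens, heads, head_dim)` tensor and describes sequence boundaries through cumulative lengths. For readers without flash-attn installed, here is a plain-PyTorch sketch of the same contract (assumptions: no dropout, no sliding window, non-causal; `varlen_sdpa` is a name invented here, not part of the repo):

```python
import torch
import torch.nn.functional as F

def varlen_sdpa(q, k, v, q_lens, k_lens):
    """Per-sample SDPA over packed sequences; mirrors what the cu_seqlens
    arguments to flash_attn_varlen_func encode (minus dropout/windowing)."""
    cu_q = [0] + q_lens.cumsum(0).tolist()
    cu_k = [0] + k_lens.cumsum(0).tolist()
    outs = []
    for i in range(len(q_lens)):
        qi = q[cu_q[i]:cu_q[i + 1]].transpose(0, 1)  # (heads, len_q_i, d)
        ki = k[cu_k[i]:cu_k[i + 1]].transpose(0, 1)
        vi = v[cu_k[i]:cu_k[i + 1]].transpose(0, 1)
        outs.append(F.scaled_dot_product_attention(qi, ki, vi).transpose(0, 1))
    return torch.cat(outs)  # packed again: (sum(q_lens), heads, d)

q_lens, k_lens = torch.tensor([3, 5]), torch.tensor([4, 2])
q = torch.randn(8, 2, 16)
k, v = torch.randn(6, 2, 16), torch.randn(6, 2, 16)
print(varlen_sdpa(q, k, v, q_lens, k_lens).shape)  # torch.Size([8, 2, 16])
```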
modules/model/backbone/pos_embed.py
DELETED
@@ -1,85 +0,0 @@
```python
import numpy as np
from einops import rearrange

import torch
import torch.cuda.amp as amp
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

def frame_pad(x, seq_len, shapes):
    max_h, max_w = np.max(shapes, 0)
    frames = []
    cur_len = 0
    for h, w in shapes:
        frame_len = h * w
        frames.append(
            F.pad(
                x[cur_len:cur_len + frame_len].view(h, w, -1),
                (0, 0, 0, max_w - w, 0, max_h - h))  # .view(max_h * max_w, -1)
        )
        cur_len += frame_len
        if cur_len >= seq_len:
            break
    return torch.stack(frames)


def frame_unpad(x, shapes):
    max_h, max_w = np.max(shapes, 0)
    x = rearrange(x, '(b h w) n c -> b h w n c', h=max_h, w=max_w)
    frames = []
    for i, (h, w) in enumerate(shapes):
        if i >= len(x):
            break
        frames.append(rearrange(x[i, :h, :w], 'h w n c -> (h w) n c'))
    return torch.concat(frames)


@amp.autocast(enabled=False)
def rope_apply_multires(x, x_lens, x_shapes, freqs, pad=True):
    """
    x: [B*L, N, C].
    x_lens: [B].
    x_shapes: [B, F, 2].
    freqs: [M, C // 2].
    """
    n, c = x.size(1), x.size(2) // 2
    # split freqs
    freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
    # loop over samples
    output = []
    st = 0
    for i, (seq_len,
            shapes) in enumerate(zip(x_lens.tolist(), x_shapes.tolist())):
        x_i = frame_pad(x[st:st + seq_len], seq_len, shapes)  # f, h, w, c
        f, h, w = x_i.shape[:3]
        pad_seq_len = f * h * w
        # precompute multipliers
        x_i = torch.view_as_complex(
            x_i.to(torch.float64).reshape(pad_seq_len, n, -1, 2))
        freqs_i = torch.cat([
            freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
            freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
            freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
        ], dim=-1).reshape(pad_seq_len, 1, -1)
        # apply rotary embedding
        x_i = torch.view_as_real(x_i * freqs_i).flatten(2).type_as(x)
        x_i = frame_unpad(x_i, shapes)
        # append to collection
        output.append(x_i)
        st += seq_len
    return pad_sequence(output) if pad else torch.concat(output)


@amp.autocast(enabled=False)
def rope_params(max_seq_len, dim, theta=10000):
    """
    Precompute the frequency tensor for complex exponentials.
    """
    assert dim % 2 == 0
    freqs = torch.outer(
        torch.arange(max_seq_len),
        1.0 / torch.pow(theta,
                        torch.arange(0, dim, 2).to(torch.float64).div(dim)))
    freqs = torch.polar(torch.ones_like(freqs), freqs)
    return freqs
```
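A quick sketch of the `rope_params` contract as reconstructed above, nothing here beyond what the function itself implies (the import path assumes the module layout of this repo before the deletion):

```python
import torch
from modules.model.backbone.pos_embed import rope_params  # pre-deletion path

freqs = rope_params(max_seq_len=8, dim=4)  # complex tensor, shape (8, dim // 2)
assert freqs.shape == (8, 2)
# unit magnitude everywhere: the entries are pure rotations
assert torch.allclose(freqs.abs(), torch.ones(8, 2, dtype=freqs.abs().dtype))
# position 0 carries the identity rotation 1 + 0j, so origin tokens are untouched
assert torch.allclose(freqs[0], torch.ones(2, dtype=freqs.dtype))
```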
modules/model/diffusion/__init__.py
DELETED
@@ -1,6 +0,0 @@
```python
# -*- coding: utf-8 -*-
# Copyright (c) Alibaba, Inc. and its affiliates.

from .diffusions import ACEDiffusion
from .samplers import DDIMSampler
from .schedules import LinearScheduler
```
modules/model/diffusion/diffusions.py
DELETED
@@ -1,206 +0,0 @@
|
|
1 |
-
# -*- coding: utf-8 -*-
|
2 |
-
# Copyright (c) Alibaba, Inc. and its affiliates.
|
3 |
-
import math
|
4 |
-
import os
|
5 |
-
from collections import OrderedDict
|
6 |
-
|
7 |
-
import torch
|
8 |
-
from tqdm import trange
|
9 |
-
|
10 |
-
from scepter.modules.model.registry import (DIFFUSION_SAMPLERS, DIFFUSIONS,
|
11 |
-
NOISE_SCHEDULERS)
|
12 |
-
from scepter.modules.utils.config import Config, dict_to_yaml
|
13 |
-
from scepter.modules.utils.distribute import we
|
14 |
-
from scepter.modules.utils.file_system import FS
|
15 |
-
|
16 |
-
|
17 |
-
@DIFFUSIONS.register_class()
|
18 |
-
class ACEDiffusion(object):
|
19 |
-
para_dict = {
|
20 |
-
'NOISE_SCHEDULER': {},
|
21 |
-
'SAMPLER_SCHEDULER': {},
|
22 |
-
'MIN_SNR_GAMMA': {
|
23 |
-
'value': None,
|
24 |
-
'description': 'The minimum SNR gamma value for the loss function.'
|
25 |
-
},
|
26 |
-
'PREDICTION_TYPE': {
|
27 |
-
'value': 'eps',
|
28 |
-
'description':
|
29 |
-
'The type of prediction to use for the loss function.'
|
30 |
-
}
|
31 |
-
}
|
32 |
-
|
33 |
-
def __init__(self, cfg, logger=None):
|
34 |
-
super(ACEDiffusion, self).__init__()
|
35 |
-
self.logger = logger
|
36 |
-
self.cfg = cfg
|
37 |
-
self.init_params()
|
38 |
-
|
39 |
-
def init_params(self):
|
40 |
-
self.min_snr_gamma = self.cfg.get('MIN_SNR_GAMMA', None)
|
41 |
-
self.prediction_type = self.cfg.get('PREDICTION_TYPE', 'eps')
|
42 |
-
self.noise_scheduler = NOISE_SCHEDULERS.build(self.cfg.NOISE_SCHEDULER,
|
43 |
-
logger=self.logger)
|
44 |
-
self.sampler_scheduler = NOISE_SCHEDULERS.build(self.cfg.get(
|
45 |
-
'SAMPLER_SCHEDULER', self.cfg.NOISE_SCHEDULER),
|
46 |
-
logger=self.logger)
|
47 |
-
self.num_timesteps = self.noise_scheduler.num_timesteps
|
48 |
-
if self.cfg.have('WORK_DIR') and we.rank == 0:
|
49 |
-
schedule_visualization = os.path.join(self.cfg.WORK_DIR,
|
50 |
-
'noise_schedule.png')
|
51 |
-
with FS.put_to(schedule_visualization) as local_path:
|
52 |
-
self.noise_scheduler.plot_noise_sampling_map(local_path)
|
53 |
-
schedule_visualization = os.path.join(self.cfg.WORK_DIR,
|
54 |
-
'sampler_schedule.png')
|
55 |
-
with FS.put_to(schedule_visualization) as local_path:
|
56 |
-
self.sampler_scheduler.plot_noise_sampling_map(local_path)
|
57 |
-
|
58 |
-
def sample(self,
|
59 |
-
noise,
|
60 |
-
model,
|
61 |
-
model_kwargs={},
|
62 |
-
steps=20,
|
63 |
-
sampler=None,
|
64 |
-
use_dynamic_cfg=False,
|
65 |
-
guide_scale=None,
|
66 |
-
guide_rescale=None,
|
67 |
-
show_progress=False,
|
68 |
-
return_intermediate=None,
|
69 |
-
intermediate_callback=None,
|
70 |
-
**kwargs):
|
71 |
-
assert isinstance(steps, (int, torch.LongTensor))
|
72 |
-
assert return_intermediate in (None, 'x0', 'xt')
|
73 |
-
assert isinstance(sampler, (str, dict, Config))
|
74 |
-
intermediates = []
|
75 |
-
|
76 |
-
def callback_fn(x_t, t, sigma=None, alpha=None):
|
77 |
-
timestamp = t
|
78 |
-
t = t.repeat(len(x_t)).round().long().to(x_t.device)
|
79 |
-
sigma = sigma.repeat(len(x_t), *([1] * (len(sigma.shape) - 1)))
|
80 |
-
alpha = alpha.repeat(len(x_t), *([1] * (len(alpha.shape) - 1)))
|
81 |
-
|
82 |
-
if guide_scale is None or guide_scale == 1.0:
|
83 |
-
out = model(x=x_t, t=t, **model_kwargs)
|
84 |
-
else:
|
85 |
-
if use_dynamic_cfg:
|
86 |
-
guidance_scale = 1 + guide_scale * (
|
87 |
-
(1 - math.cos(math.pi * (
|
88 |
-
(steps - timestamp.item()) / steps)**5.0)) / 2)
|
89 |
-
else:
|
90 |
-
guidance_scale = guide_scale
|
91 |
-
y_out = model(x=x_t, t=t, **model_kwargs[0])
|
92 |
-
u_out = model(x=x_t, t=t, **model_kwargs[1])
|
93 |
-
out = u_out + guidance_scale * (y_out - u_out)
|
94 |
-
if guide_rescale is not None and guide_rescale > 0.0:
|
95 |
-
ratio = (
|
96 |
-
y_out.flatten(1).std(dim=1) /
|
97 |
-
(out.flatten(1).std(dim=1) + 1e-12)).view((-1, ) + (1, ) *
|
98 |
-
(y_out.ndim - 1))
|
99 |
-
out *= guide_rescale * ratio + (1 - guide_rescale) * 1.0
|
100 |
-
|
101 |
-
if self.prediction_type == 'x0':
|
102 |
-
x0 = out
|
103 |
-
elif self.prediction_type == 'eps':
|
104 |
-
x0 = (x_t - sigma * out) / alpha
|
105 |
-
elif self.prediction_type == 'v':
|
106 |
-
x0 = alpha * x_t - sigma * out
|
107 |
-
else:
|
108 |
-
raise NotImplementedError(
|
109 |
-
f'prediction_type {self.prediction_type} not implemented')
|
110 |
-
|
111 |
-
return x0
|
112 |
-
|
113 |
-
sampler_ins = self.get_sampler(sampler)
|
114 |
-
|
115 |
-
# this is ignored for schnell
|
116 |
-
sampler_output = sampler_ins.preprare_sampler(
|
117 |
-
noise,
|
118 |
-
steps=steps,
|
119 |
-
prediction_type=self.prediction_type,
|
120 |
-
scheduler_ins=self.sampler_scheduler,
|
121 |
-
callback_fn=callback_fn)
|
122 |
-
        pbar = trange(steps, disable=not show_progress)
        for _ in pbar:
            # show the sampler's status message on the progress-bar instance
            pbar.set_description(sampler_output.msg)
            sampler_output = sampler_ins.step(sampler_output)
            # keys match the return_intermediate assertion above ('x0' / 'xt')
            if return_intermediate == 'x0':
                intermediates.append(sampler_output.x_0)
            elif return_intermediate == 'xt':
                intermediates.append(sampler_output.x_t)
            if intermediate_callback is not None:
                intermediate_callback(intermediates[-1])
        return (sampler_output.x_0, intermediates
                ) if return_intermediate is not None else sampler_output.x_0

    def loss(self,
             x_0,
             model,
             model_kwargs={},
             reduction='mean',
             noise=None,
             **kwargs):
        # use noise scheduler to add noise
        if noise is None:
            noise = torch.randn_like(x_0)
        schedule_output = self.noise_scheduler.add_noise(x_0, noise, **kwargs)
        x_t, t, sigma, alpha = (schedule_output.x_t, schedule_output.t,
                                schedule_output.sigma, schedule_output.alpha)
        out = model(x=x_t, t=t, **model_kwargs)

        # mse loss
        target = {
            'eps': noise,
            'x0': x_0,
            'v': alpha * noise - sigma * x_0
        }[self.prediction_type]

        loss = (out - target).pow(2)
        if reduction == 'mean':
            loss = loss.flatten(1).mean(dim=1)

        if self.min_snr_gamma is not None:
            alphas = self.noise_scheduler.alphas.to(x_0.device)[t]
            sigmas = self.noise_scheduler.sigmas.pow(2).to(x_0.device)[t]
            snrs = (alphas / sigmas).clamp(min=1e-20)
            min_snrs = snrs.clamp(max=self.min_snr_gamma)
            weights = min_snrs / snrs
        else:
            weights = 1

        loss = loss * weights
        return loss

    def get_sampler(self, sampler):
        if isinstance(sampler, str):
            if sampler not in DIFFUSION_SAMPLERS.class_map:
                if self.logger is not None:
                    self.logger.info(
                        f'{sampler} not in the defined samplers list {DIFFUSION_SAMPLERS.class_map.keys()}'
                    )
                else:
                    print(
                        f'{sampler} not in the defined samplers list {DIFFUSION_SAMPLERS.class_map.keys()}'
                    )
                return None
            sampler_cfg = Config(cfg_dict={'NAME': sampler}, load=False)
            sampler_ins = DIFFUSION_SAMPLERS.build(sampler_cfg,
                                                   logger=self.logger)
        elif isinstance(sampler, (Config, dict, OrderedDict)):
            if isinstance(sampler, (dict, OrderedDict)):
                sampler = Config(
                    cfg_dict={k.upper(): v
                              for k, v in dict(sampler).items()},
                    load=False)
            sampler_ins = DIFFUSION_SAMPLERS.build(sampler, logger=self.logger)
        else:
            raise NotImplementedError
        return sampler_ins

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}' + ' ' + super().__repr__()

    @staticmethod
    def get_config_template():
        return dict_to_yaml('DIFFUSIONS',
                            __class__.__name__,
                            ACEDiffusion.para_dict,
                            set_name=True)
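
Note on the guidance math in `callback_fn`: after the usual classifier-free combination, `guide_rescale` pulls the per-sample standard deviation of the guided output back toward that of the conditional branch. A minimal standalone sketch with toy tensors (shapes and values are illustrative only):

import torch

guide_scale, guide_rescale = 4.5, 0.5
y_out = torch.randn(2, 4, 8, 8)  # conditional model output (toy)
u_out = torch.randn(2, 4, 8, 8)  # unconditional model output (toy)

# classifier-free guidance
out = u_out + guide_scale * (y_out - u_out)

# rescale: blend a std-matched version of the output with the raw guided one
ratio = (y_out.flatten(1).std(dim=1) /
         (out.flatten(1).std(dim=1) + 1e-12)).view(-1, 1, 1, 1)
out = out * (guide_rescale * ratio + (1 - guide_rescale) * 1.0)
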
modules/model/diffusion/samplers.py
DELETED
@@ -1,69 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright (c) Alibaba, Inc. and its affiliates.
import torch

from scepter.modules.model.registry import DIFFUSION_SAMPLERS
from scepter.modules.model.diffusion.samplers import BaseDiffusionSampler


def _i(tensor, t, x):
    """
    Index tensor using t and format the output according to x.
    """
    shape = (x.size(0), ) + (1, ) * (x.ndim - 1)
    if isinstance(t, torch.Tensor):
        t = t.to(tensor.device)
    return tensor[t].view(shape).to(x.device)


@DIFFUSION_SAMPLERS.register_class('ddim')
class DDIMSampler(BaseDiffusionSampler):
    def init_params(self):
        super().init_params()
        self.eta = self.cfg.get('ETA', 0.)
        self.discretization_type = self.cfg.get('DISCRETIZATION_TYPE',
                                                'trailing')

    def preprare_sampler(self,
                         noise,
                         steps=20,
                         scheduler_ins=None,
                         prediction_type='',
                         sigmas=None,
                         betas=None,
                         alphas=None,
                         callback_fn=None,
                         **kwargs):
        output = super().preprare_sampler(noise, steps, scheduler_ins,
                                          prediction_type, sigmas, betas,
                                          alphas, callback_fn, **kwargs)
        sigmas = output.sigmas
        sigmas = torch.cat([sigmas, sigmas.new_zeros([1])])
        # convert to variance-preserving sigmas: sigma / sqrt(1 + sigma^2)
        sigmas_vp = (sigmas**2 / (1 + sigmas**2))**0.5
        sigmas_vp[sigmas == float('inf')] = 1.
        output.add_custom_field('sigmas_vp', sigmas_vp)
        return output

    def step(self, sampler_output):
        x_t = sampler_output.x_t
        step = sampler_output.step
        t = sampler_output.ts[step]
        sigmas_vp = sampler_output.sigmas_vp.to(x_t.device)
        alpha_init = _i(sampler_output.alphas_init, step, x_t[:1])
        sigma_init = _i(sampler_output.sigmas_init, step, x_t[:1])

        x = sampler_output.callback_fn(x_t, t, sigma_init, alpha_init)
        # std of the stochastic DDIM term; zero when eta == 0
        noise_factor = self.eta * (sigmas_vp[step + 1]**2 /
                                   sigmas_vp[step]**2 *
                                   (1 - (1 - sigmas_vp[step]**2) /
                                    (1 - sigmas_vp[step + 1]**2)))**0.5
        d = (x_t - (1 - sigmas_vp[step]**2)**0.5 * x) / sigmas_vp[step]
        x = (1 - sigmas_vp[step + 1] ** 2) ** 0.5 * x + \
            (sigmas_vp[step + 1] ** 2 - noise_factor ** 2) ** 0.5 * d
        sampler_output.x_0 = x
        if sigmas_vp[step + 1] > 0:
            x += noise_factor * torch.randn_like(x)
        sampler_output.x_t = x
        sampler_output.step += 1
        sampler_output.msg = f'step {step}'
        return sampler_output
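
For intuition, a toy, self-contained sketch of the deterministic branch (eta = 0) of `step` above: EDM-style sigmas are mapped to their variance-preserving form, and each step re-mixes the predicted clean latent with the current direction `d`. Values are illustrative, not the scheduler's real schedule:

import torch

sigmas = torch.tensor([14.6, 5.0, 1.0, 0.0])     # toy noise levels
sigmas_vp = (sigmas**2 / (1 + sigmas**2))**0.5   # variance-preserving form

step = 0
x_t = torch.randn(1, 4, 8, 8) * sigmas_vp[step]  # toy current latent
x0_pred = torch.zeros_like(x_t)                  # stand-in for callback_fn's x0

# direction from the predicted clean latent toward x_t
d = (x_t - (1 - sigmas_vp[step]**2)**0.5 * x0_pred) / sigmas_vp[step]
# deterministic DDIM update (noise_factor == 0 when eta == 0)
x_next = (1 - sigmas_vp[step + 1]**2)**0.5 * x0_pred + sigmas_vp[step + 1] * d
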
modules/model/diffusion/schedules.py
DELETED
@@ -1,30 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright (c) Alibaba, Inc. and its affiliates.
import torch

from scepter.modules.model.registry import NOISE_SCHEDULERS
from scepter.modules.model.diffusion.schedules import BaseNoiseScheduler


@NOISE_SCHEDULERS.register_class()
class LinearScheduler(BaseNoiseScheduler):
    para_dict = {}

    def init_params(self):
        super().init_params()
        self.beta_min = self.cfg.get('BETA_MIN', 0.00085)
        self.beta_max = self.cfg.get('BETA_MAX', 0.012)

    def betas_to_sigmas(self, betas):
        return torch.sqrt(1 - torch.cumprod(1 - betas, dim=0))

    def get_schedule(self):
        betas = torch.linspace(self.beta_min,
                               self.beta_max,
                               self.num_timesteps,
                               dtype=torch.float32)
        sigmas = self.betas_to_sigmas(betas)
        self._sigmas = sigmas
        self._betas = betas
        self._alphas = torch.sqrt(1 - sigmas**2)
        self._timesteps = torch.arange(len(sigmas), dtype=torch.float32)
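
Two quick checks on this schedule (a sketch: `gamma = 5.0` is an assumed example value, the standard definition snr = alpha^2 / sigma^2 is used here, and `ACEDiffusion.loss` above reads the scheduler's precomputed tensors rather than recomputing them): the schedule is variance-preserving by construction, and min-SNR loss weights follow directly from it.

import torch

betas = torch.linspace(0.00085, 0.012, 1000, dtype=torch.float32)
sigmas = torch.sqrt(1 - torch.cumprod(1 - betas, dim=0))
alphas = torch.sqrt(1 - sigmas**2)
# alpha_t^2 + sigma_t^2 == 1 at every timestep
assert torch.allclose(alphas**2 + sigmas**2, torch.ones_like(sigmas))

# min-SNR weighting: min(snr, gamma) / snr per timestep
gamma = 5.0
snrs = (alphas**2 / sigmas**2).clamp(min=1e-20)
weights = snrs.clamp(max=gamma) / snrs
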
modules/model/embedder/__init__.py
DELETED
@@ -1 +0,0 @@
from .embedder import ACETextEmbedder
modules/model/embedder/embedder.py
DELETED
@@ -1,184 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright (c) Alibaba, Inc. and its affiliates.
import warnings
from contextlib import nullcontext

import torch
import torch.nn.functional as F
import torch.utils.dlpack
from scepter.modules.model.embedder.base_embedder import BaseEmbedder
from scepter.modules.model.registry import EMBEDDERS
from scepter.modules.model.tokenizer.tokenizer_component import (
    basic_clean, canonicalize, heavy_clean, whitespace_clean)
from scepter.modules.utils.config import dict_to_yaml
from scepter.modules.utils.distribute import we
from scepter.modules.utils.file_system import FS

try:
    from transformers import AutoTokenizer, T5EncoderModel
except Exception as e:
    warnings.warn(
        f'Import transformers error, please deal with this problem: {e}')


@EMBEDDERS.register_class()
class ACETextEmbedder(BaseEmbedder):
    """
    Uses the T5 transformer encoder for text.
    """
    para_dict = {
        'PRETRAINED_MODEL': {
            'value':
            'google/umt5-small',
            'description':
            'Pretrained Model for umt5, modelcard path or local path.'
        },
        'TOKENIZER_PATH': {
            'value': 'google/umt5-small',
            'description':
            'Tokenizer Path for umt5, modelcard path or local path.'
        },
        'FREEZE': {
            'value': True,
            'description': ''
        },
        'USE_GRAD': {
            'value': False,
            'description': 'Compute grad or not.'
        },
        'CLEAN': {
            'value':
            'whitespace',
            'description':
            'Set the clean strategy for the tokenizer, used when TOKENIZER_PATH is not None.'
        },
        'LAYER': {
            'value': 'last',
            'description': ''
        },
        'LEGACY': {
            'value':
            True,
            'description':
            'Whether to use the legacy returned feature or not, default True.'
        }
    }

    def __init__(self, cfg, logger=None):
        super().__init__(cfg, logger=logger)
        pretrained_path = cfg.get('PRETRAINED_MODEL', None)
        self.t5_dtype = cfg.get('T5_DTYPE', 'float32')
        assert pretrained_path
        with FS.get_dir_to_local_dir(pretrained_path,
                                     wait_finish=True) as local_path:
            self.model = T5EncoderModel.from_pretrained(
                local_path,
                torch_dtype=getattr(
                    torch,
                    'float' if self.t5_dtype == 'float32' else self.t5_dtype))
        tokenizer_path = cfg.get('TOKENIZER_PATH', None)
        self.length = cfg.get('LENGTH', 77)

        self.use_grad = cfg.get('USE_GRAD', False)
        self.clean = cfg.get('CLEAN', 'whitespace')
        self.added_identifier = cfg.get('ADDED_IDENTIFIER', None)
        if tokenizer_path:
            self.tokenize_kargs = {'return_tensors': 'pt'}
            with FS.get_dir_to_local_dir(tokenizer_path,
                                         wait_finish=True) as local_path:
                self.tokenizer = AutoTokenizer.from_pretrained(local_path)
            if self.length is not None:
                self.tokenize_kargs.update({
                    'padding': 'max_length',
                    'truncation': True,
                    'max_length': self.length
                })
            self.eos_token = self.tokenizer(
                self.tokenizer.eos_token)['input_ids'][0]
        else:
            self.tokenizer = None
            self.tokenize_kargs = {}

    def freeze(self):
        self.model = self.model.eval()
        for param in self.parameters():
            param.requires_grad = False

    # encode && encode_text
    def forward(self, tokens, return_mask=False, use_mask=True):
        # tokenization
        embedding_context = nullcontext if self.use_grad else torch.no_grad
        with embedding_context():
            if use_mask:
                x = self.model(tokens.input_ids.to(we.device_id),
                               tokens.attention_mask.to(we.device_id))
            else:
                x = self.model(tokens.input_ids.to(we.device_id))
            x = x.last_hidden_state

        if return_mask:
            return x.detach() + 0.0, tokens.attention_mask.to(we.device_id)
        else:
            return x.detach() + 0.0, None

    def _clean(self, text):
        if self.clean == 'whitespace':
            text = whitespace_clean(basic_clean(text))
        elif self.clean == 'lower':
            text = whitespace_clean(basic_clean(text)).lower()
        elif self.clean == 'canonicalize':
            text = canonicalize(basic_clean(text))
        elif self.clean == 'heavy':
            text = heavy_clean(basic_clean(text))
        return text

    def encode(self, text, return_mask=False, use_mask=True):
        if isinstance(text, str):
            text = [text]
        if self.clean:
            text = [self._clean(u) for u in text]
        assert self.tokenizer is not None
        cont, mask = [], []
        with torch.autocast(device_type='cuda',
                            enabled=self.t5_dtype in ('float16', 'bfloat16'),
                            dtype=getattr(torch, self.t5_dtype)):
            for tt in text:
                tokens = self.tokenizer([tt], **self.tokenize_kargs)
                one_cont, one_mask = self(tokens,
                                          return_mask=return_mask,
                                          use_mask=use_mask)
                cont.append(one_cont)
                mask.append(one_mask)
        if return_mask:
            return torch.cat(cont, dim=0), torch.cat(mask, dim=0)
        else:
            return torch.cat(cont, dim=0)

    def encode_list(self, text_list, return_mask=True):
        cont_list = []
        mask_list = []
        for pp in text_list:
            cont, cont_mask = self.encode(pp, return_mask=return_mask)
            cont_list.append(cont)
            mask_list.append(cont_mask)
        if return_mask:
            return cont_list, mask_list
        else:
            return cont_list

    @staticmethod
    def get_config_template():
        return dict_to_yaml('MODELS',
                            __class__.__name__,
                            ACETextEmbedder.para_dict,
                            set_name=True)
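
Stripped of the scepter registry/FS machinery, the encode path above reduces to roughly the following sketch (using the `google/umt5-small` default from `para_dict`; the cleaning helpers and autocast are omitted):

import torch
from transformers import AutoTokenizer, T5EncoderModel

tokenizer = AutoTokenizer.from_pretrained('google/umt5-small')
model = T5EncoderModel.from_pretrained('google/umt5-small').eval()

tokens = tokenizer(['make the sky blue'],
                   padding='max_length',
                   truncation=True,
                   max_length=77,
                   return_tensors='pt')
with torch.no_grad():
    out = model(tokens.input_ids, tokens.attention_mask)
cont = out.last_hidden_state   # (1, 77, hidden_size) text features
mask = tokens.attention_mask   # (1, 77) padding mask
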
modules/model/network/__init__.py
DELETED
@@ -1 +0,0 @@
from .ldm_ace import LdmACE
modules/model/network/ldm_ace.py
DELETED
@@ -1,353 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright (c) Alibaba, Inc. and its affiliates.
import copy
import random
from contextlib import nullcontext

import torch
import torch.nn.functional as F
from torch import nn

from scepter.modules.model.network.ldm import LatentDiffusion
from scepter.modules.model.registry import MODELS
from scepter.modules.utils.config import dict_to_yaml
from scepter.modules.utils.distribute import we

from ..utils.basic_utils import (
    check_list_of_list,
    pack_imagelist_into_tensor_v2 as pack_imagelist_into_tensor,
    to_device,
    unpack_tensor_into_imagelist
)


class TextEmbedding(nn.Module):
    def __init__(self, embedding_shape):
        super().__init__()
        self.pos = nn.Parameter(data=torch.zeros(embedding_shape))


@MODELS.register_class()
class LdmACE(LatentDiffusion):
    para_dict = LatentDiffusion.para_dict
    para_dict['DECODER_BIAS'] = {'value': 0, 'description': ''}

    def __init__(self, cfg, logger=None):
        super().__init__(cfg, logger=logger)
        self.interpolate_func = lambda x: (F.interpolate(
            x.unsqueeze(0),
            scale_factor=1 / self.size_factor,
            mode='nearest-exact') if x is not None else None)

        self.text_identifiers = cfg.get('TEXT_IDENTIFIER', [])
        self.use_text_pos_embeddings = cfg.get('USE_TEXT_POS_EMBEDDINGS',
                                               False)
        if self.use_text_pos_embeddings:
            self.text_position_embeddings = TextEmbedding(
                (10, 4096)).eval().requires_grad_(False)
        else:
            self.text_position_embeddings = None

        self.logger.info(self.model)

    @torch.no_grad()
    def encode_first_stage(self, x, **kwargs):
        return [
            self.scale_factor *
            self.first_stage_model._encode(i.unsqueeze(0).to(torch.float16))
            for i in x
        ]

    @torch.no_grad()
    def decode_first_stage(self, z):
        return [
            self.first_stage_model._decode(1. / self.scale_factor *
                                           i.to(torch.float16)) for i in z
        ]

    def cond_stage_embeddings(self, prompt, edit_image, cont, cont_mask):
        if self.use_text_pos_embeddings and not torch.sum(
                self.text_position_embeddings.pos) > 0:
            identifier_cont, identifier_cont_mask = getattr(
                self.cond_stage_model, 'encode')(self.text_identifiers,
                                                 return_mask=True)
            self.text_position_embeddings.load_state_dict(
                {'pos': identifier_cont[:, 0, :]})
        cont_, cont_mask_ = [], []
        for pp, edit, c, cm in zip(prompt, edit_image, cont, cont_mask):
            if isinstance(pp, list):
                cont_.append([c[-1], *c] if len(edit) > 0 else [c[-1]])
                cont_mask_.append([cm[-1], *cm] if len(edit) > 0 else [cm[-1]])
            else:
                raise NotImplementedError

        return cont_, cont_mask_

    def limit_batch_data(self, batch_data_list, log_num):
        if log_num and log_num > 0:
            batch_data_list_limited = []
            for sub_data in batch_data_list:
                if sub_data is not None:
                    sub_data = sub_data[:log_num]
                batch_data_list_limited.append(sub_data)
            return batch_data_list_limited
        else:
            return batch_data_list

    def forward_train(self,
                      edit_image=[],
                      edit_image_mask=[],
                      image=None,
                      image_mask=None,
                      noise=None,
                      prompt=[],
                      **kwargs):
        '''
        Args:
            edit_image: list of list of edit_image
            edit_image_mask: list of list of edit_image_mask
            image: target image
            image_mask: target image mask
            noise: default is None, generated automatically
            prompt: list of list of text
            **kwargs:
        Returns:
        '''
        assert check_list_of_list(prompt) and check_list_of_list(
            edit_image) and check_list_of_list(edit_image_mask)
        assert len(edit_image) == len(edit_image_mask) == len(prompt)
        assert self.cond_stage_model is not None
        gc_seg = kwargs.pop('gc_seg', [])
        gc_seg = int(gc_seg[0]) if len(gc_seg) > 0 else 0
        context = {}

        # process image
        image = to_device(image)
        x_start = self.encode_first_stage(image, **kwargs)
        x_start, x_shapes = pack_imagelist_into_tensor(x_start)  # B, C, L
        n, _, _ = x_start.shape
        t = torch.randint(0, self.num_timesteps, (n, ),
                          device=x_start.device).long()
        context['x_shapes'] = x_shapes

        # process image mask
        image_mask = to_device(image_mask, strict=False)
        context['x_mask'] = [self.interpolate_func(i) for i in image_mask
                             ] if image_mask is not None else [None] * n

        # process text
        # with torch.autocast(device_type="cuda", enabled=True, dtype=torch.bfloat16):
        prompt_ = [[pp] if isinstance(pp, str) else pp for pp in prompt]
        try:
            cont, cont_mask = getattr(self.cond_stage_model,
                                      'encode_list')(prompt_, return_mask=True)
        except Exception as e:
            print(e, prompt_)
            raise  # cont/cont_mask are undefined if encoding failed
        cont, cont_mask = self.cond_stage_embeddings(prompt, edit_image, cont,
                                                     cont_mask)
        context['crossattn'] = cont

        # process edit image & edit image mask
        edit_image = [to_device(i, strict=False) for i in edit_image]
        edit_image_mask = [to_device(i, strict=False) for i in edit_image_mask]
        e_img, e_mask = [], []
        for u, m in zip(edit_image, edit_image_mask):
            if m is None:
                m = [None] * len(u) if u is not None else [None]
            e_img.append(
                self.encode_first_stage(u, **kwargs) if u is not None else u)
            e_mask.append([
                self.interpolate_func(i) if i is not None else None for i in m
            ])
        context['edit'], context['edit_mask'] = e_img, e_mask

        # process loss
        loss = self.diffusion.loss(
            x_0=x_start,
            t=t,
            noise=noise,
            model=self.model,
            model_kwargs={
                'cond': context,
                'mask': cont_mask,
                'gc_seg': gc_seg,
                'text_position_embeddings':
                self.text_position_embeddings.pos if hasattr(
                    self.text_position_embeddings, 'pos') else None
            },
            **kwargs)
        loss = loss.mean()
        ret = {'loss': loss, 'probe_data': {'prompt': prompt}}
        return ret

    @torch.no_grad()
    def forward_test(self,
                     edit_image=[],
                     edit_image_mask=[],
                     image=None,
                     image_mask=None,
                     prompt=[],
                     n_prompt=[],
                     sampler='ddim',
                     sample_steps=20,
                     guide_scale=4.5,
                     guide_rescale=0.5,
                     log_num=-1,
                     seed=2024,
                     **kwargs):

        assert check_list_of_list(prompt) and check_list_of_list(
            edit_image) and check_list_of_list(edit_image_mask)
        assert len(edit_image) == len(edit_image_mask) == len(prompt)
        assert self.cond_stage_model is not None
        # gc_seg is unused
        kwargs.pop('gc_seg', -1)
        # prepare data
        context, null_context = {}, {}

        prompt, n_prompt, image, image_mask, edit_image, edit_image_mask = self.limit_batch_data(
            [prompt, n_prompt, image, image_mask, edit_image, edit_image_mask],
            log_num)
        g = torch.Generator(device=we.device_id)
        seed = seed if seed >= 0 else random.randint(0, 2**32 - 1)
        g.manual_seed(seed)
        n_prompt = copy.deepcopy(prompt)
        # only modify the last prompt to be zero
        for nn_p_id, nn_p in enumerate(n_prompt):
            if isinstance(nn_p, str):
                n_prompt[nn_p_id] = ['']
            elif isinstance(nn_p, list):
                n_prompt[nn_p_id][-1] = ''
            else:
                raise NotImplementedError
        # process image
        image = to_device(image)
        x = self.encode_first_stage(image, **kwargs)
        noise = [
            torch.empty(*i.shape, device=we.device_id).normal_(generator=g)
            for i in x
        ]
        noise, x_shapes = pack_imagelist_into_tensor(noise)
        context['x_shapes'] = null_context['x_shapes'] = x_shapes

        # process image mask
        image_mask = to_device(image_mask, strict=False)
        cond_mask = [self.interpolate_func(i) for i in image_mask
                     ] if image_mask is not None else [None] * len(image)
        context['x_mask'] = null_context['x_mask'] = cond_mask
        # process text
        # with torch.autocast(device_type="cuda", enabled=True, dtype=torch.bfloat16):
        prompt_ = [[pp] if isinstance(pp, str) else pp for pp in prompt]
        cont, cont_mask = getattr(self.cond_stage_model,
                                  'encode_list')(prompt_, return_mask=True)
        cont, cont_mask = self.cond_stage_embeddings(prompt, edit_image, cont,
                                                     cont_mask)
        null_cont, null_cont_mask = getattr(self.cond_stage_model,
                                            'encode_list')(n_prompt,
                                                           return_mask=True)
        null_cont, null_cont_mask = self.cond_stage_embeddings(
            prompt, edit_image, null_cont, null_cont_mask)
        context['crossattn'] = cont
        null_context['crossattn'] = null_cont

        # process edit image & edit image mask
        edit_image = [to_device(i, strict=False) for i in edit_image]
        edit_image_mask = [to_device(i, strict=False) for i in edit_image_mask]
        e_img, e_mask = [], []
        for u, m in zip(edit_image, edit_image_mask):
            if u is None:
                continue
            if m is None:
                m = [None] * len(u)
            e_img.append(self.encode_first_stage(u, **kwargs))
            e_mask.append([self.interpolate_func(i) for i in m])
        null_context['edit'] = context['edit'] = e_img
        null_context['edit_mask'] = context['edit_mask'] = e_mask

        # process sample
        model = self.model_ema if self.use_ema and self.eval_ema else self.model
        embedding_context = model.no_sync if isinstance(model, torch.distributed.fsdp.FullyShardedDataParallel) \
            else nullcontext
        with embedding_context():
            samples = self.diffusion.sample(
                sampler=sampler,
                noise=noise,
                model=model,
                model_kwargs=[{
                    'cond': context,
                    'mask': cont_mask,
                    'text_position_embeddings':
                    self.text_position_embeddings.pos if hasattr(
                        self.text_position_embeddings, 'pos') else None
                }, {
                    'cond': null_context,
                    'mask': null_cont_mask,
                    'text_position_embeddings':
                    self.text_position_embeddings.pos if hasattr(
                        self.text_position_embeddings, 'pos') else None
                }] if guide_scale is not None and guide_scale > 1 else {
                    'cond': context,
                    'mask': cont_mask,
                    'text_position_embeddings':
                    self.text_position_embeddings.pos if hasattr(
                        self.text_position_embeddings, 'pos') else None
                },
                steps=sample_steps,
                guide_scale=guide_scale,
                guide_rescale=guide_rescale,
                show_progress=True,
                **kwargs)

        samples = unpack_tensor_into_imagelist(samples, x_shapes)
        x_samples = self.decode_first_stage(samples)
        outputs = list()
        for i in range(len(prompt)):
            rec_img = torch.clamp(
                (x_samples[i] + 1.0) / 2.0 + self.decoder_bias / 255,
                min=0.0,
                max=1.0)
            rec_img = rec_img.squeeze(0)
            edit_imgs, edit_img_masks = [], []
            if edit_image is not None and edit_image[i] is not None:
                if edit_image_mask[i] is None:
                    edit_image_mask[i] = [None] * len(edit_image[i])
                for edit_img, edit_mask in zip(edit_image[i],
                                               edit_image_mask[i]):
                    edit_img = torch.clamp((edit_img + 1.0) / 2.0,
                                           min=0.0,
                                           max=1.0)
                    edit_imgs.append(edit_img.squeeze(0))
                    if edit_mask is None:
                        edit_mask = torch.ones_like(edit_img[[0], :, :])
                    edit_img_masks.append(edit_mask)
            one_tup = {
                'reconstruct_image': rec_img,
                'instruction': prompt[i],
                'edit_image': edit_imgs if len(edit_imgs) > 0 else None,
                'edit_mask': edit_img_masks if len(edit_imgs) > 0 else None
            }
            if image is not None:
                if image_mask is None:
                    image_mask = [None] * len(image)
                ori_img = torch.clamp((image[i] + 1.0) / 2.0, min=0.0, max=1.0)
                one_tup['target_image'] = ori_img.squeeze(0)
                one_tup['target_mask'] = image_mask[i] if image_mask[
                    i] is not None else torch.ones_like(ori_img[[0], :, :])
            outputs.append(one_tup)
        return outputs

    @staticmethod
    def get_config_template():
        return dict_to_yaml('MODEL',
                            __class__.__name__,
                            LdmACE.para_dict,
                            set_name=True)
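
A detail worth flagging in `forward_test`: the unconditional branch for classifier-free guidance is rebuilt from `prompt` itself, blanking only the final instruction of each multi-turn list so earlier turns still condition both branches. In miniature:

import copy

prompt = [['turn the cat around', 'make the sky blue']]
n_prompt = copy.deepcopy(prompt)
for turn in n_prompt:
    turn[-1] = ''  # keep the history, drop the final instruction

assert n_prompt == [['turn the cat around', '']]
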
modules/model/utils/basic_utils.py
DELETED
@@ -1,104 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright (c) Alibaba, Inc. and its affiliates.
from inspect import isfunction

import torch
from torch.nn.utils.rnn import pad_sequence

from scepter.modules.utils.distribute import we


def exists(x):
    return x is not None


def default(val, d):
    if exists(val):
        return val
    return d() if isfunction(d) else d


def disabled_train(self, mode=True):
    """Overwrite model.train with this function to make sure train/eval mode
    does not change anymore."""
    return self


def transfer_size(para_num):
    if para_num > 1000 * 1000 * 1000 * 1000:
        bill = para_num / (1000 * 1000 * 1000 * 1000)
        return '{:.2f}T'.format(bill)
    elif para_num > 1000 * 1000 * 1000:
        gyte = para_num / (1000 * 1000 * 1000)
        return '{:.2f}B'.format(gyte)
    elif para_num > (1000 * 1000):
        meta = para_num / (1000 * 1000)
        return '{:.2f}M'.format(meta)
    elif para_num > 1000:
        kelo = para_num / 1000
        return '{:.2f}K'.format(kelo)
    else:
        return para_num


def count_params(model):
    total_params = sum(p.numel() for p in model.parameters())
    return transfer_size(total_params)


def expand_dims_like(x, y):
    while x.dim() != y.dim():
        x = x.unsqueeze(-1)
    return x


def unpack_tensor_into_imagelist(image_tensor, shapes):
    image_list = []
    for img, shape in zip(image_tensor, shapes):
        h, w = shape[0], shape[1]
        image_list.append(img[:, :h * w].view(1, -1, h, w))

    return image_list


def find_example(tensor_list, image_list):
    for i in tensor_list:
        if isinstance(i, torch.Tensor):
            return torch.zeros_like(i)
    for i in image_list:
        if isinstance(i, torch.Tensor):
            _, c, h, w = i.size()
            return torch.zeros_like(i.view(c, h * w).transpose(1, 0))
    return None


def pack_imagelist_into_tensor_v2(image_list):
    # allow None
    example = None
    image_tensor, shapes = [], []
    for img in image_list:
        if img is None:
            example = find_example(image_tensor,
                                   image_list) if example is None else example
            image_tensor.append(example)
            shapes.append(None)
            continue
        _, c, h, w = img.size()
        image_tensor.append(img.view(c, h * w).transpose(1, 0))  # h*w, c
        shapes.append((h, w))

    image_tensor = pad_sequence(image_tensor,
                                batch_first=True).permute(0, 2, 1)  # b, c, l
    return image_tensor, shapes


def to_device(inputs, strict=True):
    if inputs is None:
        return None
    if strict:
        assert all(isinstance(i, torch.Tensor) for i in inputs)
    return [i.to(we.device_id) if i is not None else None for i in inputs]


def check_list_of_list(ll):
    return isinstance(ll, list) and all(isinstance(i, list) for i in ll)
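
A round-trip sketch for the two packing helpers above (run alongside them; shapes are illustrative): variable-resolution latents are flattened to (h*w, c) sequences, padded into one (b, c, L) tensor, and recovered exactly from the stored shapes.

import torch

imgs = [torch.randn(1, 4, 8, 8), torch.randn(1, 4, 4, 12)]
packed, shapes = pack_imagelist_into_tensor_v2(imgs)
assert packed.shape == (2, 4, 64) and shapes == [(8, 8), (4, 12)]

unpacked = unpack_tensor_into_imagelist(packed, shapes)
for rec, ori in zip(unpacked, imgs):
    assert torch.equal(rec, ori)  # padding is sliced away via the stored shape
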