# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Mostly copy-paste from timm library.
https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
"""
from copy import deepcopy
from typing import List, Optional, Tuple
import math
from functools import partial
from sympy import flatten

import torch
import torch.nn as nn
from torch import Tensor, pixel_shuffle

from einops import rearrange, repeat
from einops.layers.torch import Rearrange
from torch.nn.modules import GELU

import torch.utils.benchmark as benchmark
def benchmark_torch_function_in_microseconds(f, *args, **kwargs):
    t0 = benchmark.Timer(
        stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f}
    )
    return t0.blocked_autorange().mean * 1e6

# from vit.vision_transformer import Conv3DCrossAttentionBlock

from .utils import trunc_normal_

from pdb import set_trace as st
# import apex
# from apex.normalization import FusedRMSNorm as RMSNorm
try:
    from apex.normalization import FusedRMSNorm as RMSNorm
except:
    from dit.norm import RMSNorm

from torch.nn import LayerNorm

try:
    import xformers
    import xformers.ops
    from xformers.ops import memory_efficient_attention, unbind, fmha
    from xformers.ops import MemoryEfficientAttentionFlashAttentionOp, MemoryEfficientAttentionCutlassOp
    # from xformers.ops import RMSNorm

    XFORMERS_AVAILABLE = True
except ImportError:
    # logger.warning("xFormers not available")
    XFORMERS_AVAILABLE = False

from packaging import version
assert version.parse(torch.__version__) >= version.parse("2.0.0")
SDP_IS_AVAILABLE = True
# from torch.backends.cuda import SDPBackend, sdp_kernel
# from torch.nn.attention import sdpa_kernel, SDPBackend


class Attention(nn.Module):

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0., 
                 enable_rmsnorm=False,
                 qk_norm=False,
                 no_flash_op=False,
                 enable_rope=False,):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        # https://github.com/huggingface/pytorch-image-models/blob/5dce71010174ad6599653da4e8ba37fd5f9fa572/timm/models/vision_transformer.py#L79C1-L80C78

        self.enable_rope = enable_rope
        # st()

        if enable_rope:
            self.q_norm = RMSNorm(dim, elementwise_affine=True) if qk_norm else nn.Identity()
            self.k_norm = RMSNorm(dim, elementwise_affine=True) if qk_norm else nn.Identity()
        else:
            self.q_norm = RMSNorm(head_dim, elementwise_affine=True) if qk_norm else nn.Identity()
            self.k_norm = RMSNorm(head_dim, elementwise_affine=True) if qk_norm else nn.Identity()

        # if qk_norm:
        #     self.q_norm = LayerNorm(dim, eps=1e-5)
        #     self.k_norm = LayerNorm(dim, eps=1e-5)
        self.qk_norm = qk_norm
        self.no_flash_op = no_flash_op
        self.attn_mode = "torch"
         
        # self.backend = SDPBackend.FLASH_ATTENTION # FA implemented by torch.

    @staticmethod
    def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
        """
        Reshape frequency tensor for broadcasting it with another tensor.

        This function reshapes the frequency tensor to have the same shape as
        the target tensor 'x' for the purpose of broadcasting the frequency
        tensor during element-wise operations.

        Args:
            freqs_cis (torch.Tensor): Frequency tensor to be reshaped.
            x (torch.Tensor): Target tensor for broadcasting compatibility.

        Returns:
            torch.Tensor: Reshaped frequency tensor.

        Raises:
            AssertionError: If the frequency tensor doesn't match the expected
                shape.
            AssertionError: If the target tensor 'x' doesn't have the expected
                number of dimensions.
        """
        ndim = x.ndim
        assert 0 <= 1 < ndim
        assert freqs_cis.shape == (x.shape[1], x.shape[-1])
        shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
        return freqs_cis.view(*shape)

    @staticmethod
    def apply_rotary_emb(
        xq: torch.Tensor,
        xk: torch.Tensor,
        freqs_cis: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Apply rotary embeddings to input tensors using the given frequency
        tensor.

        This function applies rotary embeddings to the given query 'xq' and
        key 'xk' tensors using the provided frequency tensor 'freqs_cis'. The
        input tensors are reshaped as complex numbers, and the frequency tensor
        is reshaped for broadcasting compatibility. The resulting tensors
        contain rotary embeddings and are returned as real tensors.

        Args:
            xq (torch.Tensor): Query tensor to apply rotary embeddings.
            xk (torch.Tensor): Key tensor to apply rotary embeddings.
            freqs_cis (torch.Tensor): Precomputed frequency tensor for complex
                exponentials.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor
                and key tensor with rotary embeddings.
        """
        with torch.cuda.amp.autocast(enabled=False):
            xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
            xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
            freqs_cis = Attention.reshape_for_broadcast(freqs_cis, xq_)
            xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
            xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
            return xq_out.type_as(xq), xk_out.type_as(xk)

    def forward(self, x):
        # B, N, C = x.shape
        # qkv = self.qkv(x).reshape(B, N, 3, self.num_heads,
        #                           C // self.num_heads).permute(2, 0, 3, 1, 4)
        # q, k, v = qkv[0], qkv[1], qkv[2]

        # attn = (q @ k.transpose(-2, -1)) * self.scale
        # attn = attn.softmax(dim=-1)
        # attn = self.attn_drop(attn)

        # x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        # return x, attn

        # https://github.com/Stability-AI/generative-models/blob/863665548f95ff827273948766a3f732ab01bc49/sgm/modules/attention.py#L179
        B, L, C = x.shape

        qkv = self.qkv(x)
        if self.attn_mode == "torch":
            qkv = rearrange(
                qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads
            ).float()
            q, k, v = qkv[0], qkv[1], qkv[2]  # B H L D
            q, k = self.q_norm(q), self.k_norm(k)

            # with sdpa_kernel([self.backend]): # new signature
            x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
                 
            del q, k, v
            x = rearrange(x, "B H L D -> B L (H D)")

        x = self.proj(x)
        x = self.proj_drop(x)

        return x


class MemEffAttention(Attention):

    def forward(self, x: Tensor, attn_bias=None, freqs_cis=None) -> Tensor:
        if not XFORMERS_AVAILABLE:
            assert attn_bias is None, "xFormers is required for nested tensors usage"
            return super().forward(x)

        B, N, C = x.shape
        qkv = self.qkv(x)

        dtype = qkv.dtype

        if self.enable_rope:
            assert freqs_cis is not None

            qkv = qkv.reshape(B, N, 3, C)
            q, k, v = unbind(qkv, 2)
            q, k = self.q_norm(q), self.k_norm(k) # do q-k norm on the full seq instead.

            st()
            q, k = Attention.apply_rotary_emb(q, k, freqs_cis=freqs_cis)

            q = q.reshape(B, N, self.num_heads, C // self.num_heads)
            k = k.reshape(B, N, self.num_heads, C // self.num_heads)

            q, k, v = map(
                lambda t: t.reshape(b, N, self.num_heads, C // self.num_heads)
                (q, k, v),
            )
            q, k = q.to(dtype), k.to(dtype)


        else:
            qkv = qkv.reshape(B, N, 3, self.num_heads, C // self.num_heads)
            q, k, v = unbind(qkv, 2)
            q, k = self.q_norm(q), self.k_norm(k)

        # x = memory_efficient_attention(q, k, v, attn_bias=attn_bias) # if not bf16, no flash-attn here.
        # x = memory_efficient_attention(q, k, v, attn_bias=attn_bias, op=MemoryEfficientAttentionFlashAttentionOp) # force flash attention

        if self.no_flash_op: # F-A does not support large batch size? force cutlas?
            # x = memory_efficient_attention(q, k, v, attn_bias=attn_bias, op=MemoryEfficientAttentionCutlassOp) # force flash attention

            if version.parse(xformers.__version__) >= version.parse("0.0.21"):
                # NOTE: workaround for
                # https://github.com/facebookresearch/xformers/issues/845
                # def attn(max_bs, op):

                max_bs = 32768
                L = q.shape[0]
                n_batches = math.ceil(L / max_bs)
                x = list()
                for i_batch in range(n_batches):
                    batch = slice(i_batch * max_bs, (i_batch + 1) * max_bs)
                    x.append(
                        xformers.ops.memory_efficient_attention(
                            q[batch],
                            k[batch],
                            v[batch],
                            attn_bias=None,
                            # op=MemoryEfficientAttentionFlashAttentionOp,
                            # op=op,
                            op=MemoryEfficientAttentionCutlassOp,
                        )
                    )
                x = torch.cat(x, 0)
                # return x
                 
                # The cutlas implementation runs in 8396.681 microseconds
                # The Flash implementation runs in 19473.491 microseconds

                # max_bs = 32768
                # math_time = benchmark_torch_function_in_microseconds(attn, max_bs, MemoryEfficientAttentionCutlassOp)
                # print(f"The cutlas implementation runs in {math_time:.3f} microseconds")

                # max_bs = 32768 // 2 # works for flash attention
                # math_time = benchmark_torch_function_in_microseconds(attn, max_bs, MemoryEfficientAttentionFlashAttentionOp)
                # print(f"The Flash implementation runs in {math_time:.3f} microseconds")
                # st()
                # pass

        else: # will enable flash attention by default.
            # x = memory_efficient_attention(q, k, v, attn_bias=attn_bias, op=MemoryEfficientAttentionFlashAttentionOp) # force flash attention
            x = memory_efficient_attention(q, k, v, attn_bias=attn_bias) # force flash attention

        x = x.reshape([B, N, C])

        x = self.proj(x)
        x = self.proj_drop(x)
        return x

class MemEffCrossAttention(MemEffAttention):
    # for cross attention, where context serves as k and v
    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0, proj_drop=0):
        super().__init__(dim, num_heads, qkv_bias, qk_scale, attn_drop, proj_drop)
        del self.qkv
        self.q = nn.Linear(dim, dim * 1, bias=qkv_bias)
        self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)

    def forward(self, x: Tensor, context: Tensor, attn_bias=None) -> Tensor:
        if not XFORMERS_AVAILABLE:
            assert attn_bias is None, "xFormers is required for nested tensors usage"
            return super().forward(x)

        B, N, C = x.shape
        # qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)

        q = self.q(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
        kv = self.kv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)

        k, v = unbind(kv, 2)

        x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
        # x = memory_efficient_attention(q, k, v, attn_bias=attn_bias, op=MemoryEfficientAttentionFlashAttentionOp)
        x = x.reshape([B, N, C])

        x = self.proj(x)
        x = self.proj_drop(x)
        return x


# https://github.com/IBM/CrossViT/blob/main/models/crossvit.py
class CrossAttention(nn.Module):

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
        self.scale = qk_scale or head_dim**-0.5

        self.wq = nn.Linear(dim, dim, bias=qkv_bias)
        self.wk = nn.Linear(dim, dim, bias=qkv_bias)
        self.wv = nn.Linear(dim, dim, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):

        B, N, C = x.shape
        q = self.wq(x[:,
                      0:1, ...]).reshape(B, 1, self.num_heads,
                                         C // self.num_heads).permute(
                                             0, 2, 1,
                                             3)  # B1C -> B1H(C/H) -> BH1(C/H)
        k = self.wk(x).reshape(B, N,
                               self.num_heads, C // self.num_heads).permute(
                                   0, 2, 1, 3)  # BNC -> BNH(C/H) -> BHN(C/H)
        v = self.wv(x).reshape(B, N,
                               self.num_heads, C // self.num_heads).permute(
                                   0, 2, 1, 3)  # BNC -> BNH(C/H) -> BHN(C/H)

        attn = (q @ k.transpose(
            -2, -1)) * self.scale  # BH1(C/H) @ BH(C/H)N -> BH1N
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(
            B, 1, C)  # (BH1N @ BHN(C/H)) -> BH1(C/H) -> B1H(C/H) -> B1C
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class Conv3D_Aware_CrossAttention(nn.Module):

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
        self.scale = qk_scale or head_dim**-0.5

        self.wq = nn.Linear(dim, dim, bias=qkv_bias)
        self.wk = nn.Linear(dim, dim, bias=qkv_bias)
        self.wv = nn.Linear(dim, dim, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):

        B, group_size, N, C = x.shape  # B 3 N C
        p = int(N**0.5)  # patch size
        assert p**2 == N, 'check input dim, no [cls] needed here'
        assert group_size == 3, 'designed for triplane here'

        x = x.reshape(B, group_size, p, p, C)  # expand patch token dim

        # * init qkv
        # q = torch.empty(B * group_size * N,
        #                 1,
        #                 self.num_heads,
        #                 C // self.num_heads,
        #                 device=x.device).permute(0, 2, 1, 3)
        # k = torch.empty(B * group_size * N,
        #                 2 * p,
        #                 self.num_heads,
        #                 C // self.num_heads,
        #                 device=x.device).permute(0, 2, 1, 3)
        # v = torch.empty_like(k)

        q_x = torch.empty(
            B * group_size * N,
            1,
            # self.num_heads,
            # C // self.num_heads,
            C,
            device=x.device)
        k_x = torch.empty(
            B * group_size * N,
            2 * p,
            # self.num_heads,
            # C // self.num_heads,
            C,
            device=x.device)
        v_x = torch.empty_like(k_x)

        # ! refer to the following plane order
        # N, M, _ = coordinates.shape
        # xy_coords = coordinates[..., [0, 1]]
        # yz_coords = coordinates[..., [1, 2]]
        # zx_coords = coordinates[..., [2, 0]]
        # return torch.stack([xy_coords, yz_coords, zx_coords],
        #                 dim=1).reshape(N * 3, M, 2)

        index_i, index_j = torch.meshgrid(torch.arange(0, p),
                                          torch.arange(0, p),
                                          indexing='ij')  # 16*16
        index_mesh_grid = torch.stack([index_i, index_j], 0).to(
            x.device).unsqueeze(0).repeat_interleave(B,
                                                     0).reshape(B, 2, p,
                                                                p)  # B 2 p p.

        for i in range(group_size):
            q_x[B * i * N:B * (i + 1) * N] = x[:, i:i + 1].permute(
                0, 2, 3, 1, 4).reshape(B * N, 1, C)  # B 1 p p C -> B*N, 1, C

            # TODO, how to batchify gather ops?
            plane_yz = x[:, (i + 1) % group_size:(i + 1) % group_size +
                         1]  # B 1 p p C
            plane_zx = x[:, (i + 2) % group_size:(i + 2) % group_size + 1]

            assert plane_yz.shape == plane_zx.shape == (
                B, 1, p, p, C), 'check sub plane dimensions'

            pooling_plane_yz = torch.gather(
                plane_yz,
                dim=2,
                index=index_mesh_grid[:, 0:1].reshape(B, 1, N, 1, 1).expand(
                    -1, -1, -1, p,
                    C)).permute(0, 2, 1, 3, 4)  # B 1 256 16 C => B 256 1 16 C
            pooling_plane_zx = torch.gather(
                plane_zx,
                dim=3,
                index=index_mesh_grid[:, 1:2].reshape(B, 1, 1, N, 1).expand(
                    -1, -1, p, -1,
                    C)).permute(0, 3, 1, 2, 4)  # B 1 16 256 C => B 256 1 16 C

            k_x[B * i * N:B * (i + 1) *
                N] = v_x[B * i * N:B * (i + 1) * N] = torch.cat(
                    [pooling_plane_yz, pooling_plane_zx],
                    dim=2).reshape(B * N, 2 * p,
                                   C)  # B 256 2 16 C => (B*256) 2*16 C

            # q[B * i * N: B * (i+1) * N] = self.wq(q_x).reshape(B*N, 1, self.num_heads, C // self.num_heads).permute( 0, 2, 1, 3)
            # k[B * i * N: B * (i+1) * N] = self.wk(k_x).reshape(B*N, 2*p, self.num_heads, C // self.num_heads).permute( 0, 2, 1, 3)
            # v[B * i * N: B * (i+1) * N] = self.wv(v_x).reshape(B*N, 2*p, self.num_heads, C // self.num_heads).permute( 0, 2, 1, 3)

        q = self.wq(q_x).reshape(B * group_size * N, 1,
                                 self.num_heads, C // self.num_heads).permute(
                                     0, 2, 1,
                                     3)  # merge num_heads into Batch dimention
        k = self.wk(k_x).reshape(B * group_size * N, 2 * p, self.num_heads,
                                 C // self.num_heads).permute(0, 2, 1, 3)
        v = self.wv(v_x).reshape(B * group_size * N, 2 * p, self.num_heads,
                                 C // self.num_heads).permute(0, 2, 1, 3)

        attn = (q @ k.transpose(
            -2, -1)) * self.scale  # BH1(C/H) @ BH(C/H)N -> BH1N, N=2p here
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(
            B * 3 * N, 1,
            C)  # (BH1N @ BHN(C/H)) -> BH1(C/H) -> B1H(C/H) -> B1C
        x = self.proj(x)
        x = self.proj_drop(x)

        # reshape x back
        x = x.reshape(B, 3, N, C)

        return x


class xformer_Conv3D_Aware_CrossAttention(nn.Module):
    # https://github.dev/facebookresearch/dinov2
    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.):
        super().__init__()

        # https://pytorch.org/blog/accelerated-generative-diffusion-models/

        self.num_heads = num_heads
        self.wq = nn.Linear(dim, dim * 1, bias=qkv_bias)
        self.w_kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)

        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.index_mesh_grid = None

    def forward(self, x, attn_bias=None):

        B, group_size, N, C = x.shape  # B 3 N C
        p = int(N**0.5)  # patch size
        assert p**2 == N, 'check input dim, no [cls] needed here'
        assert group_size == 3, 'designed for triplane here'

        x = x.reshape(B, group_size, p, p, C)  # expand patch token dim

        q_x = torch.empty(B * group_size * N, 1, C, device=x.device)
        context = torch.empty(B * group_size * N, 2 * p, C,
                              device=x.device)  # k_x=v_x

        if self.index_mesh_grid is None:  # further accelerate
            index_i, index_j = torch.meshgrid(torch.arange(0, p),
                                              torch.arange(0, p),
                                              indexing='ij')  # 16*16
            index_mesh_grid = torch.stack([index_i, index_j], 0).to(
                x.device).unsqueeze(0).repeat_interleave(B, 0).reshape(
                    B, 2, p, p)  # B 2 p p.
            self.index_mesh_grid = index_mesh_grid[0:1]
        else:
            index_mesh_grid = self.index_mesh_grid.clone().repeat_interleave(
                B, 0)
            assert index_mesh_grid.shape == (
                B, 2, p, p), 'check index_mesh_grid dimension'

        for i in range(group_size):
            q_x[B * i * N:B * (i + 1) * N] = x[:, i:i + 1].permute(
                0, 2, 3, 1, 4).reshape(B * N, 1, C)  # B 1 p p C -> B*N, 1, C

            # TODO, how to batchify gather ops?
            plane_yz = x[:, (i + 1) % group_size:(i + 1) % group_size +
                         1]  # B 1 p p C
            plane_zx = x[:, (i + 2) % group_size:(i + 2) % group_size + 1]

            assert plane_yz.shape == plane_zx.shape == (
                B, 1, p, p, C), 'check sub plane dimensions'

            pooling_plane_yz = torch.gather(
                plane_yz,
                dim=2,
                index=index_mesh_grid[:, 0:1].reshape(B, 1, N, 1, 1).expand(
                    -1, -1, -1, p,
                    C)).permute(0, 2, 1, 3, 4)  # B 1 256 16 C => B 256 1 16 C
            pooling_plane_zx = torch.gather(
                plane_zx,
                dim=3,
                index=index_mesh_grid[:, 1:2].reshape(B, 1, 1, N, 1).expand(
                    -1, -1, p, -1,
                    C)).permute(0, 3, 1, 2, 4)  # B 1 16 256 C => B 256 1 16 C

            context[B * i * N:B * (i + 1) * N] = torch.cat(
                [pooling_plane_yz, pooling_plane_zx],
                dim=2).reshape(B * N, 2 * p,
                               C)  # B 256 2 16 C => (B*256) 2*16 C

        # B, N, C = x.shape

        q = self.wq(q_x).reshape(B * group_size * N, 1, self.num_heads,
                                 C // self.num_heads)

        kv = self.w_kv(context).reshape(B * group_size * N, 2 * p, 2,
                                        self.num_heads, C // self.num_heads)
        k, v = unbind(kv, 2)

        x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
        # x = memory_efficient_attention(q, k, v, attn_bias=attn_bias, op=MemoryEfficientAttentionFlashAttentionOp)
        x = x.transpose(1, 2).reshape([B * 3 * N, 1, C]).reshape(B, 3, N, C)

        x = self.proj(x)
        x = self.proj_drop(x)

        return x


class xformer_Conv3D_Aware_CrossAttention_xygrid(
        xformer_Conv3D_Aware_CrossAttention):
    """implementation wise clearer, but yields identical results with xformer_Conv3D_Aware_CrossAttention
    """

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.0,
                 proj_drop=0.0):
        super().__init__(dim, num_heads, qkv_bias, qk_scale, attn_drop,
                         proj_drop)

    def forward(self, x, attn_bias=None):

        B, group_size, N, C = x.shape  # B 3 N C
        p = int(N**0.5)  # patch size
        assert p**2 == N, 'check input dim, no [cls] needed here'
        assert group_size == 3, 'designed for triplane here'

        x = x.reshape(B, group_size, p, p, C)  # expand patch token dim

        q_x = torch.empty(B * group_size * N, 1, C, device=x.device)
        context = torch.empty(B * group_size * N, 2 * p, C,
                              device=x.device)  # k_x=v_x

        if self.index_mesh_grid is None:  # further accelerate
            index_u, index_v = torch.meshgrid(
                torch.arange(0, p), torch.arange(0, p),
                indexing='xy')  # ! switch to 'xy' here to match uv coordinate
            index_mesh_grid = torch.stack([index_u, index_v], 0).to(
                x.device).unsqueeze(0).repeat_interleave(B, 0).reshape(
                    B, 2, p, p)  # B 2 p p.
            self.index_mesh_grid = index_mesh_grid[0:1]
        else:
            index_mesh_grid = self.index_mesh_grid.clone().repeat_interleave(
                B, 0)
            assert index_mesh_grid.shape == (
                B, 2, p, p), 'check index_mesh_grid dimension'

        for i in range(group_size):
            q_x[B * i * N:B * (i + 1) * N] = x[:, i:i + 1].permute(
                0, 2, 3, 1, 4).reshape(B * N, 1, C)  # B 1 p p C -> B*N, 1, C

            # TODO, how to batchify gather ops?
            plane_yz = x[:, (i + 1) % group_size:(i + 1) % group_size +
                         1]  # B 1 p p C
            plane_zx = x[:, (i + 2) % group_size:(i + 2) % group_size + 1]

            assert plane_yz.shape == plane_zx.shape == (
                B, 1, p, p, C), 'check sub plane dimensions'

            pooling_plane_yz = torch.gather(
                plane_yz,
                dim=2,
                index=index_mesh_grid[:, 1:2].reshape(B, 1, N, 1, 1).expand(
                    -1, -1, -1, p,
                    C)).permute(0, 2, 1, 3, 4)  # B 1 256 16 C => B 256 1 16 C
            pooling_plane_zx = torch.gather(
                plane_zx,
                dim=3,
                index=index_mesh_grid[:, 0:1].reshape(B, 1, 1, N, 1).expand(
                    -1, -1, p, -1,
                    C)).permute(0, 3, 1, 2, 4)  # B 1 16 256 C => B 256 1 16 C

            context[B * i * N:B * (i + 1) * N] = torch.cat(
                [pooling_plane_yz, pooling_plane_zx],
                dim=2).reshape(B * N, 2 * p,
                               C)  # B 256 2 16 C => (B*256) 2*16 C

        # B, N, C = x.shape
        q = self.wq(q_x).reshape(B * group_size * N, 1, self.num_heads,
                                 C // self.num_heads)

        kv = self.w_kv(context).reshape(B * group_size * N, 2 * p, 2,
                                        self.num_heads, C // self.num_heads)
        k, v = unbind(kv, 2)

        x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
        # x = memory_efficient_attention(q, k, v, attn_bias=attn_bias, op=MemoryEfficientAttentionFlashAttentionOp)
        x = x.transpose(1, 2).reshape([B * 3 * N, 1, C]).reshape(B, 3, N, C)

        x = self.proj(x)
        x = self.proj_drop(x)

        return x


class xformer_Conv3D_Aware_CrossAttention_xygrid_withinC(
        xformer_Conv3D_Aware_CrossAttention_xygrid):

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0,
                 proj_drop=0):
        super().__init__(dim, num_heads, qkv_bias, qk_scale, attn_drop,
                         proj_drop)

    def forward(self, x, attn_bias=None):
        # ! split x: B N C into B 3 N C//3
        B, N, C = x.shape
        x = x.reshape(B, N, C // 3, 3).permute(0, 3, 1,
                                               2)  # B N C 3 -> B 3 N C
        x_out = super().forward(x, attn_bias)  # B 3 N C
        x_out = x_out.permute(0, 2, 3, 1)# B 3 N C -> B N C 3
        x_out = x_out.reshape(*x_out.shape[:2], -1) # B N C 3 -> B N C3
        return x_out.contiguous()

class self_cross_attn(nn.Module):
    def __init__(self, dino_attn, cross_attn, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.dino_attn = dino_attn
        self.cross_attn = cross_attn
    
    def forward(self, x_norm):
        y = self.dino_attn(x_norm) + x_norm
        return self.cross_attn(y) # will add x in the original code

# class RodinRollOutConv(nn.Module):
#     """implementation wise clearer, but yields identical results with xformer_Conv3D_Aware_CrossAttention
#     Use Group Conv
#     """

#     def __init__(self, in_chans, out_chans=None):
#         super().__init__()
#         # input: B 3C H W
#         if out_chans is None:
#             out_chans = in_chans

#         self.roll_out_convs = nn.Conv2d(in_chans,
#                                         out_chans,
#                                         kernel_size=3,
#                                         groups=3,
#                                         padding=1)

#     def forward(self, x):
#         return self.roll_out_convs(x)


class RodinRollOutConv3D(nn.Module):
    """implementation wise clearer, but yields identical results with xformer_Conv3D_Aware_CrossAttention
    """

    def __init__(self, in_chans, out_chans=None):
        super().__init__()
        if out_chans is None:
            out_chans = in_chans

        self.out_chans = out_chans // 3

        self.roll_out_convs = nn.Conv2d(in_chans,
                                        self.out_chans,
                                        kernel_size=3,
                                        padding=1)

    def forward(self, x):
        # todo, reshape before input?

        B, C3, p, p = x.shape  # B 3C H W
        C = C3 // 3
        group_size = C3 // C
        assert group_size == 3

        x = x.reshape(B, 3, C, p, p)

        roll_out_x = torch.empty(B, group_size * C, p, 3 * p,
                                 device=x.device)  # B, 3C, H, 3W

        for i in range(group_size):
            plane_xy = x[:, i]  # B C H W

            # TODO, simply do the average pooling?
            plane_yz_pooling = x[:, (i + 1) % group_size].mean(
                dim=-1, keepdim=True).repeat_interleave(
                    p, dim=-1)  # B C H W -> B C H 1 -> B C H W, reduce z dim
            plane_zx_pooling = x[:, (i + 2) % group_size].mean(
                dim=-2, keepdim=True).repeat_interleave(
                    p, dim=-2)  # B C H W -> B C 1 W -> B C H W, reduce z dim

            roll_out_x[..., i * p:(i + 1) * p] = torch.cat(
                [plane_xy, plane_yz_pooling, plane_zx_pooling],
                1)  # fill in the 3W dim

        x = self.roll_out_convs(roll_out_x)  # B C H 3W

        x = x.reshape(B, self.out_chans, p, 3, p)
        x = x.permute(0, 3, 1, 2, 4).reshape(B, 3 * self.out_chans, p,
                                             p)  # B 3C H W

        return x


class RodinRollOutConv3D_GroupConv(nn.Module):
    """implementation wise clearer, but yields identical results with xformer_Conv3D_Aware_CrossAttention
    """

    def __init__(self,
                 in_chans,
                 out_chans=None,
                 kernel_size=3,
                 stride=1,
                 padding=1):
        super().__init__()
        if out_chans is None:
            out_chans = in_chans

        self.roll_out_convs = nn.Conv2d(
            in_chans * 3,
            out_chans,
            kernel_size=kernel_size,
            groups=3,  # B 9C H W
            stride=stride,
            padding=padding)

    # @torch.autocast(device_type='cuda')
    def forward(self, x):
        # todo, reshape before input?

        B, C3, p, p = x.shape  # B 3C H W
        C = C3 // 3
        group_size = C3 // C
        assert group_size == 3

        x = x.reshape(B, 3, C, p, p)

        roll_out_x = torch.empty(B, group_size * C * 3, p, p,
                                 device=x.device)  # B, 3C, H, 3W

        for i in range(group_size):
            plane_xy = x[:, i]  # B C H W

            # # TODO, simply do the average pooling?
            plane_yz_pooling = x[:, (i + 1) % group_size].mean(
                dim=-1, keepdim=True).repeat_interleave(
                    p, dim=-1)  # B C H W -> B C H 1 -> B C H W, reduce z dim
            plane_zx_pooling = x[:, (i + 2) % group_size].mean(
                dim=-2, keepdim=True).repeat_interleave(
                    p, dim=-2)  # B C H W -> B C 1 W -> B C H W, reduce z dim

            roll_out_x[:, i * 3 * C:(i + 1) * 3 * C] = torch.cat(
                [plane_xy, plane_yz_pooling, plane_zx_pooling],
                1)  # fill in the 3W dim

            # ! directly cat, avoid intermediate vars
            # ? why OOM
            # roll_out_x[:, i * 3 * C:(i + 1) * 3 * C] = torch.cat(
            #     [
            #         x[:, i],
            #         x[:, (i + 1) % group_size].mean(
            #             dim=-1, keepdim=True).repeat_interleave(p, dim=-1),
            #         x[:, (i + 2) % group_size].mean(
            #             dim=-2, keepdim=True).repeat_interleave(
            #                 p, dim=-2
            #             )  # B C H W -> B C 1 W -> B C H W, reduce z dim
            #     ],
            #     1)  # fill in the 3C dim

        x = self.roll_out_convs(roll_out_x)  # B 3C H W

        return x


class RodinRollOut_GroupConv_noConv3D(nn.Module):
    """only roll out and do Conv on individual planes
    """

    def __init__(self,
                 in_chans,
                 out_chans=None,
                 kernel_size=3,
                 stride=1,
                 padding=1):
        super().__init__()
        if out_chans is None:
            out_chans = in_chans

        self.roll_out_inplane_conv = nn.Conv2d(
            in_chans,
            out_chans,
            kernel_size=kernel_size,
            groups=3,  # B 3C H W
            stride=stride,
            padding=padding)

    def forward(self, x):
        x = self.roll_out_inplane_conv(x)  # B 3C H W
        return x


# class RodinConv3D_SynthesisLayer_withact(nn.Module):
#     def __init__(self, in_chans, out_chans) -> None:
#         super().__init__()

#         self.act = nn.LeakyReLU(inplace=True)
#         self.conv = nn.Sequential(
#             RodinRollOutConv3D_GroupConv(in_chans, out_chans),
#             nn.LeakyReLU(inplace=True),
#         )

#         if in_chans != out_chans:
#             self.short_cut = RodinRollOutConv3D_GroupConv(in_chans, out_chans) # PSNR 13 first iteration.
#         else:
#             self.short_cut = None

# def forward(self, feats):

#     if self.short_cut is not None:
#         res_feats = self.short_cut(feats)
#     else:
#         res_feats = feats

#     # return res_feats + self.conv(feats)
#     feats = res_feats + self.conv(feats)
#     return self.act(feats) # as in resnet, add an act before return


class RodinConv3D_SynthesisLayer_mlp_unshuffle_as_residual(nn.Module):

    def __init__(self, in_chans, out_chans) -> None:
        super().__init__()

        self.act = nn.LeakyReLU(inplace=True)
        self.conv = nn.Sequential(
            RodinRollOutConv3D_GroupConv(in_chans, out_chans),
            nn.LeakyReLU(inplace=True),
        )

        self.out_chans = out_chans
        if in_chans != out_chans:
            # self.short_cut = RodinRollOutConv3D_GroupConv(in_chans, out_chans) # PSNR 13 first iteration.
            self.short_cut = nn.Linear(  # B 3C H W -> B 3C 4H 4W
                in_chans // 3,  # 144 / 3 = 48
                out_chans // 3 * 4 * 4,  # 32 * 16
                bias=True)  # decoder to pat

            # RodinRollOutConv3D_GroupConv(in_chans, out_chans) # PSNR 13 first iteration.
        else:
            self.short_cut = None

    def shortcut_unpatchify_triplane(self,
                                     x,
                                     p=None,
                                     unpatchify_out_chans=None):
        """separate triplane version; x shape: B (3*257) 768
        """

        assert self.short_cut is not None

        # B, L, C = x.shape
        B, C3, h, w = x.shape
        assert h == w
        L = h * w
        x = x.reshape(B, C3 // 3, 3, L).permute(0, 2, 3,
                                                1)  # (B, 3, L // 3, C)

        x = self.short_cut(x)

        p = h * 4

        x = x.reshape(shape=(B, 3, h, w, p, p, unpatchify_out_chans))
        x = torch.einsum('ndhwpqc->ndchpwq',
                         x)  # nplanes, C order in the renderer.py
        x = x.reshape(shape=(B, 3 * self.out_chans, h * p, h * p))
        return x

    def forward(self, feats):

        if self.short_cut is not None:
            res_feats = self.shortcut_unpatchify_triplane(feats)
        else:
            res_feats = feats

        # return res_feats + self.conv(feats)

        feats = res_feats + self.conv(feats)
        return self.act(feats)  # as in resnet, add an act before return


# class RodinConv3D_SynthesisLayer(nn.Module):
#     def __init__(self, in_chans, out_chans) -> None:
#         super().__init__()

#         self.act = nn.LeakyReLU(inplace=True)
#         self.conv = nn.Sequential(
#             RodinRollOutConv3D_GroupConv(in_chans, out_chans),
#             nn.LeakyReLU(inplace=True),
#         )

#         if in_chans != out_chans:
#             self.short_cut = RodinRollOutConv3D_GroupConv(in_chans, out_chans) # PSNR 13 first iteration.
#         else:
#             self.short_cut = None

#     def forward(self, feats):

#         if self.short_cut is not None:
#             res_feats = self.short_cut(feats)
#         else:
#             res_feats = feats

#         # return res_feats + self.conv(feats)

#         feats = res_feats + self.conv(feats)
#         # return self.act(feats) # as in resnet, add an act before return
#         return feats # ! old behaviour, no act


# previous worked version
class RodinConv3D_SynthesisLayer(nn.Module):

    def __init__(self, in_chans, out_chans) -> None:
        super().__init__()
        # x2 SR + 1x1 Conv Residual BLK
        # self.conv3D = RodinRollOutConv3D(in_chans, out_chans)

        self.act = nn.LeakyReLU(inplace=True)
        self.conv = nn.Sequential(
            RodinRollOutConv3D_GroupConv(in_chans, out_chans),
            nn.LeakyReLU(inplace=True),
        )

        if in_chans != out_chans:
            self.short_cut = RodinRollOutConv3D_GroupConv(in_chans, out_chans)
        else:
            self.short_cut = None

    def forward(self, feats):
        feats_out = self.conv(feats)
        if self.short_cut is not None:
            # ! failed below
            feats_out = self.short_cut(
                feats
            ) + feats_out  # ! only difference here, no act() compared with baseline
            # feats_out = self.act(self.short_cut(feats)) + feats_out # ! only difference here, no act() compared with baseline
        else:
            feats_out = feats_out + feats
        return feats_out


class RodinRollOutConv3DSR2X(nn.Module):

    def __init__(self, in_chans, **kwargs) -> None:
        super().__init__()
        self.conv3D = RodinRollOutConv3D_GroupConv(in_chans)
        # self.conv3D = RodinRollOutConv3D(in_chans)
        self.act = nn.LeakyReLU(inplace=True)
        self.input_resolution = 224

    def forward(self, x):
        # x: B 3 112*112 C
        B, C3, p, p = x.shape  # after unpachify triplane
        C = C3 // 3
        group_size = C3 // C

        assert group_size == 3
        # p = int(N**0.5)  # patch size
        # assert p**2 == N, 'check input dim, no [cls] needed here'
        assert group_size == 3, 'designed for triplane here'

        x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p,
                                          p)  # B 3 C N -> B 3C h W

        if x.shape[-1] != self.input_resolution:
            x = torch.nn.functional.interpolate(x,
                                                size=(self.input_resolution,
                                                      self.input_resolution),
                                                mode='bilinear',
                                                align_corners=False,
                                                antialias=True)

        x = x + self.conv3D(x)

        return x


class RodinRollOutConv3DSR4X_lite(nn.Module):

    def __init__(self, in_chans, input_resolutiopn=256, **kwargs) -> None:
        super().__init__()
        self.conv3D_0 = RodinRollOutConv3D_GroupConv(in_chans)
        self.conv3D_1 = RodinRollOutConv3D_GroupConv(in_chans)

        self.act = nn.LeakyReLU(inplace=True)
        self.input_resolution = input_resolutiopn

    def forward(self, x):
        # x: B 3 112*112 C
        B, C3, p, p = x.shape  # after unpachify triplane
        C = C3 // 3
        group_size = C3 // C

        assert group_size == 3
        # p = int(N**0.5)  # patch size
        # assert p**2 == N, 'check input dim, no [cls] needed here'
        assert group_size == 3, 'designed for triplane here'

        x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p,
                                          p)  # B 3 C N -> B 3C h W

        if x.shape[-1] != self.input_resolution:
            x = torch.nn.functional.interpolate(x,
                                                size=(self.input_resolution,
                                                      self.input_resolution),
                                                mode='bilinear',
                                                align_corners=False,
                                                antialias=True)

        # ! still not convering, not bug here?
        # x = x + self.conv3D_0(x)
        # x = x + self.conv3D_1(x)

        x = x + self.act(self.conv3D_0(x))
        x = x + self.act(self.conv3D_1(x))

        # TODO: which is better, bilinear + conv or PixelUnshuffle?

        return x


# class RodinConv3D2X_lite_mlp_as_residual(nn.Module):
#     """lite 4X version, with MLP unshuffle to change the dimention
#     """
#     def __init__(self, in_chans, out_chans, input_resolution=256) -> None:
#         super().__init__()

#         self.act = nn.LeakyReLU(inplace=True)

#         self.conv3D_0 = RodinRollOutConv3D_GroupConv(in_chans, out_chans)
#         self.conv3D_1 = RodinRollOutConv3D_GroupConv(out_chans, out_chans)

#         self.act = nn.LeakyReLU(inplace=True)
#         self.input_resolution = input_resolution

#         self.out_chans = out_chans
#         if in_chans != out_chans: # ! only change the dimension
#             self.short_cut = nn.Linear( # B 3C H W -> B 3C 4H 4W
#                 in_chans//3, # 144 / 3 = 48
#                 out_chans//3, # 32 * 16
#                 bias=True)  # decoder to pat
#         else:
#             self.short_cut = None

#     def shortcut_unpatchify_triplane(self, x, p=None):
#         """separate triplane version; x shape: B (3*257) 768
#         """

#         assert self.short_cut is not None

#         # B, L, C = x.shape
#         B, C3, h, w = x.shape
#         assert h == w
#         L = h*w
#         x = x.reshape(B, C3//3, 3, L).permute(0,2,3,1) # (B, 3, L // 3, C_in)

#         x = self.short_cut(x) # B 3 L//3 C_out

#         x = x.permute(0,1,3,2) # B 3 C_out L//3
#         x = x.reshape(shape=(B, self.out_chans, h, w))

#         # directly resize to the target, no unpatchify here since no 3D ViT is included here
#         if w != self.input_resolution:
#             x = torch.nn.functional.interpolate(x, # 4X SR
#                                                 size=(self.input_resolution,
#                                                       self.input_resolution),
#                                                 mode='bilinear',
#                                                 align_corners=False,
#                                                 antialias=True)

#         return x

#     def forward(self, x):

#         # x: B 3 112*112 C
#         B, C3, p, p = x.shape  # after unpachify triplane
#         C = C3 // 3

#         if self.short_cut is not None:
#             res_feats = self.shortcut_unpatchify_triplane(x)
#         else:
#             res_feats = x

#         """following forward code copied from lite4x version
#         """
#         x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p,
#                                           p)  # B 3 C N -> B 3C h W

#         if x.shape[-1] != self.input_resolution:
#             x = torch.nn.functional.interpolate(x, # 4X SR
#                                                 size=(self.input_resolution,
#                                                       self.input_resolution),
#                                                 mode='bilinear',
#                                                 align_corners=False,
#                                                 antialias=True)

#         x = res_feats + self.act(self.conv3D_0(x))
#         x = x + self.act(self.conv3D_1(x))

#         return x


class RodinConv3D4X_lite_mlp_as_residual(nn.Module):
    """lite 4X version, with MLP unshuffle to change the dimention
    """

    def __init__(self,
                 in_chans,
                 out_chans,
                 input_resolution=256,
                 interp_mode='bilinear',
                 bcg_triplane=False) -> None:
        super().__init__()

        self.interp_mode = interp_mode

        self.act = nn.LeakyReLU(inplace=True)

        self.conv3D_0 = RodinRollOutConv3D_GroupConv(in_chans, out_chans)
        self.conv3D_1 = RodinRollOutConv3D_GroupConv(out_chans, out_chans)
        self.bcg_triplane = bcg_triplane
        if bcg_triplane:
            self.conv3D_1_bg = RodinRollOutConv3D_GroupConv(
                out_chans, out_chans)

        self.act = nn.LeakyReLU(inplace=True)
        self.input_resolution = input_resolution

        self.out_chans = out_chans
        if in_chans != out_chans:  # ! only change the dimension
            self.short_cut = nn.Linear(  # B 3C H W -> B 3C 4H 4W
                in_chans // 3,  # 144 / 3 = 48
                out_chans // 3,  # 32 * 16
                bias=True)  # decoder to pat
        else:
            self.short_cut = None

    def shortcut_unpatchify_triplane(self, x, p=None):
        """separate triplane version; x shape: B (3*257) 768
        """

        assert self.short_cut is not None

        B, C3, h, w = x.shape
        assert h == w
        L = h * w
        x = x.reshape(B, C3 // 3, 3, L).permute(0, 2, 3,
                                                1)  # (B, 3, L // 3, C_in)

        x = self.short_cut(x)  # B 3 L//3 C_out

        x = x.permute(0, 1, 3, 2)  # B 3 C_out L//3
        x = x.reshape(shape=(B, self.out_chans, h, w))

        # directly resize to the target, no unpatchify here since no 3D ViT is included here
        if w != self.input_resolution:
            x = torch.nn.functional.interpolate(
                x,  # 4X SR
                size=(self.input_resolution, self.input_resolution),
                mode='bilinear',
                align_corners=False,
                antialias=True)

        return x

    def interpolate(self, feats):
        if self.interp_mode == 'bilinear':
            return torch.nn.functional.interpolate(
                feats,  # 4X SR
                size=(self.input_resolution, self.input_resolution),
                mode='bilinear',
                align_corners=False,
                antialias=True)
        else:
            return torch.nn.functional.interpolate(
                feats,  # 4X SR
                size=(self.input_resolution, self.input_resolution),
                mode='nearest',
            )

    def forward(self, x):

        # x: B 3 112*112 C
        B, C3, p, p = x.shape  # after unpachify triplane
        C = C3 // 3

        if self.short_cut is not None:
            res_feats = self.shortcut_unpatchify_triplane(x)
        else:
            res_feats = x
            if res_feats.shape[-1] != self.input_resolution:
                res_feats = self.interpolate(res_feats)
        """following forward code copied from lite4x version
        """
        x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p,
                                          p)  # B 3 C N -> B 3C h W

        if x.shape[-1] != self.input_resolution:
            x = self.interpolate(x)

        x0 = res_feats + self.act(self.conv3D_0(x))  # the base feature
        x = x0 + self.act(self.conv3D_1(x0))
        if self.bcg_triplane:
            x_bcg = x0 + self.act(self.conv3D_1_bg(x0))
            return torch.cat([x, x_bcg], 1)
        else:
            return x


class RodinConv3D4X_lite_mlp_as_residual_litev2(
        RodinConv3D4X_lite_mlp_as_residual):

    def __init__(self,
                 in_chans,
                 out_chans,
                 num_feat=128,
                 input_resolution=256,
                 interp_mode='bilinear',
                 bcg_triplane=False) -> None:
        super().__init__(in_chans, out_chans, input_resolution, interp_mode,
                         bcg_triplane)

        self.conv3D_0 = RodinRollOutConv3D_GroupConv(in_chans, in_chans)
        self.conv_before_upsample = RodinRollOut_GroupConv_noConv3D(
            in_chans, num_feat * 3)
        self.conv3D_1 = RodinRollOut_GroupConv_noConv3D(
            num_feat * 3, num_feat * 3)
        self.conv_last = RodinRollOut_GroupConv_noConv3D(
            num_feat * 3, out_chans)
        self.short_cut = None

    def forward(self, x):

        # x: B 3 112*112 C
        B, C3, p, p = x.shape  # after unpachify triplane
        C = C3 // 3

        # if self.short_cut is not None:
        #     res_feats = self.shortcut_unpatchify_triplane(x)
        # else:
        #     res_feats = x
        #     if res_feats.shape[-1] != self.input_resolution:
        #         res_feats = self.interpolate(res_feats)
        """following forward code copied from lite4x version
        """
        x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p,
                                          p)  # B 3 C N -> B 3C h W

        x = x + self.conv3D_0(x)  # the base feature
        x = self.act(self.conv_before_upsample(x))

        # if x.shape[-1] != self.input_resolution:
        x = self.conv_last(self.act(self.conv3D_1(self.interpolate(x))))

        return x


class RodinConv3D4X_lite_mlp_as_residual_lite(
        RodinConv3D4X_lite_mlp_as_residual):

    def __init__(self,
                 in_chans,
                 out_chans,
                 input_resolution=256,
                 interp_mode='bilinear') -> None:
        super().__init__(in_chans, out_chans, input_resolution, interp_mode)
        """replace the first Rodin Conv 3D with ordinary rollout conv to save memory
        """
        self.conv3D_0 = RodinRollOut_GroupConv_noConv3D(in_chans, out_chans)


class SR3D(nn.Module):
    # https://github.com/SeanChenxy/Mimic3D/blob/77d313656df3cd5536d2c4c5766db3a56208eea6/training/networks_stylegan2.py#L629
    # roll-out and apply two deconv/pixelUnshuffle layer

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)


class RodinConv3D4X_lite_mlp_as_residual_improved(nn.Module):

    def __init__(self,
                 in_chans,
                 num_feat,
                 out_chans,
                 input_resolution=256) -> None:
        super().__init__()

        assert in_chans == 4 * out_chans
        assert num_feat == 2 * out_chans
        self.input_resolution = input_resolution

        # refer to https://github.com/JingyunLiang/SwinIR/blob/6545850fbf8df298df73d81f3e8cba638787c8bd/models/network_swinir.py#L750
        self.upscale = 4

        self.conv_after_body = RodinRollOutConv3D_GroupConv(
            in_chans, in_chans, 3, 1, 1)
        self.conv_before_upsample = nn.Sequential(
            RodinRollOutConv3D_GroupConv(in_chans, num_feat, 3, 1, 1),
            nn.LeakyReLU(inplace=True))
        self.conv_up1 = RodinRollOutConv3D_GroupConv(num_feat, num_feat, 3, 1,
                                                     1)
        if self.upscale == 4:
            self.conv_up2 = RodinRollOutConv3D_GroupConv(
                num_feat, num_feat, 3, 1, 1)
        self.conv_hr = RodinRollOutConv3D_GroupConv(num_feat, num_feat, 3, 1,
                                                    1)
        self.conv_last = RodinRollOutConv3D_GroupConv(num_feat, out_chans, 3,
                                                      1, 1)

        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)

    def forward(self, x):

        # x: B 3 112*112 C
        B, C3, p, p = x.shape  # after unpachify triplane
        C = C3 // 3
        """following forward code copied from lite4x version
        """
        x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p,
                                          p)  # B 3 C N -> B 3C h W

        # ? nearest or bilinear
        x = self.conv_after_body(x) + x
        x = self.conv_before_upsample(x)
        x = self.lrelu(
            self.conv_up1(
                torch.nn.functional.interpolate(
                    x,
                    scale_factor=2,
                    mode='nearest',
                    # align_corners=False,
                    # antialias=True
                )))
        if self.upscale == 4:
            x = self.lrelu(
                self.conv_up2(
                    torch.nn.functional.interpolate(
                        x,
                        scale_factor=2,
                        mode='nearest',
                        # align_corners=False,
                        # antialias=True
                    )))
        x = self.conv_last(self.lrelu(self.conv_hr(x)))

        assert x.shape[-1] == self.input_resolution

        return x


class RodinConv3D4X_lite_improved_lint_withresidual(nn.Module):

    def __init__(self,
                 in_chans,
                 num_feat,
                 out_chans,
                 input_resolution=256) -> None:
        super().__init__()

        assert in_chans == 4 * out_chans
        assert num_feat == 2 * out_chans
        self.input_resolution = input_resolution

        # refer to https://github.com/JingyunLiang/SwinIR/blob/6545850fbf8df298df73d81f3e8cba638787c8bd/models/network_swinir.py#L750
        self.upscale = 4

        self.conv_after_body = RodinRollOutConv3D_GroupConv(
            in_chans, in_chans, 3, 1, 1)
        self.conv_before_upsample = nn.Sequential(
            RodinRollOutConv3D_GroupConv(in_chans, num_feat, 3, 1, 1),
            nn.LeakyReLU(inplace=True))
        self.conv_up1 = RodinRollOutConv3D_GroupConv(num_feat, num_feat, 3, 1,
                                                     1)
        if self.upscale == 4:
            self.conv_up2 = RodinRollOutConv3D_GroupConv(
                num_feat, num_feat, 3, 1, 1)
        self.conv_hr = RodinRollOutConv3D_GroupConv(num_feat, num_feat, 3, 1,
                                                    1)
        self.conv_last = RodinRollOutConv3D_GroupConv(num_feat, out_chans, 3,
                                                      1, 1)

        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)

    def forward(self, x):

        # x: B 3 112*112 C
        B, C3, p, p = x.shape  # after unpachify triplane
        C = C3 // 3
        """following forward code copied from lite4x version
        """
        x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p,
                                          p)  # B 3 C N -> B 3C h W

        # ? nearest or bilinear
        x = self.conv_after_body(x) + x
        x = self.conv_before_upsample(x)
        x = self.lrelu(
            self.conv_up1(
                torch.nn.functional.interpolate(
                    x,
                    scale_factor=2,
                    mode='nearest',
                    # align_corners=False,
                    # antialias=True
                )))
        if self.upscale == 4:
            x = self.lrelu(
                self.conv_up2(
                    torch.nn.functional.interpolate(
                        x,
                        scale_factor=2,
                        mode='nearest',
                        # align_corners=False,
                        # antialias=True
                    )))
        x = self.conv_last(self.lrelu(self.conv_hr(x) + x))

        assert x.shape[-1] == self.input_resolution

        return x


class RodinRollOutConv3DSR_FlexibleChannels(nn.Module):

    def __init__(self,
                 in_chans,
                 num_out_ch=96,
                 input_resolution=256,
                 **kwargs) -> None:
        super().__init__()

        self.block0 = RodinConv3D_SynthesisLayer(in_chans,
                                                 num_out_ch)  # in_chans=48
        self.block1 = RodinConv3D_SynthesisLayer(num_out_ch, num_out_ch)

        self.input_resolution = input_resolution  # 64 -> 256 SR

    def forward(self, x):
        # x: B 3 112*112 C
        B, C3, p, p = x.shape  # after unpachify triplane
        C = C3 // 3
        # group_size = C3 // C

        x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p,
                                          p)  # B 3 C N -> B 3C h W

        if x.shape[-1] != self.input_resolution:
            x = torch.nn.functional.interpolate(x,
                                                size=(self.input_resolution,
                                                      self.input_resolution),
                                                mode='bilinear',
                                                align_corners=False,
                                                antialias=True)

        x = self.block0(x)
        x = self.block1(x)

        return x


# previous worked version
class RodinRollOutConv3DSR4X(nn.Module):
    # follow PixelUnshuffleUpsample

    def __init__(self, in_chans, **kwargs) -> None:
        super().__init__()
        # self.block0 = RodinConv3D_SynthesisLayer(in_chans, 96 * 2) # TODO, match the old behaviour now.
        # self.block1 = RodinConv3D_SynthesisLayer(96 * 2, 96)

        self.block0 = RodinConv3D_SynthesisLayer(in_chans, 96)
        self.block1 = RodinConv3D_SynthesisLayer(
            96, 96)  # baseline choice, validate with no LPIPS loss here

        self.input_resolution = 64  # 64 -> 256

    def forward(self, x):
        # x: B 3 112*112 C
        B, C3, p, p = x.shape  # after unpachify triplane
        C = C3 // 3
        # group_size = C3 // C

        x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p,
                                          p)  # B 3 C N -> B 3C h W

        if x.shape[-1] != self.input_resolution:
            x = torch.nn.functional.interpolate(x,
                                                size=(self.input_resolution,
                                                      self.input_resolution),
                                                mode='bilinear',
                                                align_corners=False,
                                                antialias=True)

        x = self.block0(x)
        x = self.block1(x)

        return x


class Upsample3D(nn.Module):
    """Upsample module.
    Args:
        scale (int): Scale factor. Supported scales: 2^n and 3.
        num_feat (int): Channel number of intermediate features.
    """

    def __init__(self, scale, num_feat):
        super().__init__()

        m_convs = []
        m_pixelshuffle = []

        assert (scale & (scale - 1)) == 0, 'scale = 2^n'
        self.scale = scale

        for _ in range(int(math.log(scale, 2))):
            m_convs.append(
                RodinRollOutConv3D_GroupConv(num_feat, 4 * num_feat, 3, 1, 1))
            m_pixelshuffle.append(nn.PixelShuffle(2))

        self.m_convs = nn.ModuleList(m_convs)
        self.m_pixelshuffle = nn.ModuleList(m_pixelshuffle)

    # @torch.autocast(device_type='cuda')
    def forward(self, x):
        for scale_idx in range(int(math.log(self.scale, 2))):
            x = self.m_convs[scale_idx](x)  # B 3C H W
            # x =
            # B, C3, H, W = x.shape
            x = x.reshape(x.shape[0] * 3, x.shape[1] // 3, *x.shape[2:])
            x = self.m_pixelshuffle[scale_idx](x)
            x = x.reshape(x.shape[0] // 3, x.shape[1] * 3, *x.shape[2:])

        return x


class RodinConv3DPixelUnshuffleUpsample(nn.Module):

    def __init__(self,
                 output_dim,
                 num_feat=32 * 6,
                 num_out_ch=32 * 3,
                 sr_ratio=4,
                 *args,
                 **kwargs) -> None:
        super().__init__()

        self.conv_after_body = RodinRollOutConv3D_GroupConv(
            output_dim, output_dim, 3, 1, 1)
        self.conv_before_upsample = nn.Sequential(
            RodinRollOutConv3D_GroupConv(output_dim, num_feat, 3, 1, 1),
            nn.LeakyReLU(inplace=True))
        self.upsample = Upsample3D(sr_ratio, num_feat)  # 4 time SR
        self.conv_last = RodinRollOutConv3D_GroupConv(num_feat, num_out_ch, 3,
                                                      1, 1)

    # @torch.autocast(device_type='cuda')
    def forward(self, x, input_skip_connection=True, *args, **kwargs):
        # x = self.conv_first(x)
        if input_skip_connection:
            x = self.conv_after_body(x) + x
        else:
            x = self.conv_after_body(x)

        x = self.conv_before_upsample(x)
        x = self.upsample(x)
        x = self.conv_last(x)
        return x


class RodinConv3DPixelUnshuffleUpsample_improvedVersion(nn.Module):

    def __init__(
        self,
        output_dim,
        num_out_ch=32 * 3,
        sr_ratio=4,
        input_resolution=256,
    ) -> None:
        super().__init__()

        self.input_resolution = input_resolution

        # self.conv_first = RodinRollOutConv3D_GroupConv(output_dim, num_out_ch,
        #                                               3, 1, 1)
        self.upsample = Upsample3D(sr_ratio, output_dim)  # 4 time SR
        self.conv_last = RodinRollOutConv3D_GroupConv(output_dim, num_out_ch,
                                                      3, 1, 1)

    def forward(self, x, bilinear_upsample=True):

        B, C3, p, p = x.shape  # after unpachify triplane
        C = C3 // 3
        group_size = C3 // C

        assert group_size == 3, 'designed for triplane here'

        x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p,
                                          p)  # B 3 C N -> B 3C h W

        if bilinear_upsample and x.shape[-1] != self.input_resolution:
            x_bilinear_upsample = torch.nn.functional.interpolate(
                x,
                size=(self.input_resolution, self.input_resolution),
                mode='bilinear',
                align_corners=False,
                antialias=True)
            x = self.upsample(x) + x_bilinear_upsample
        else:
            # x_bilinear_upsample = x
            x = self.upsample(x)

        x = self.conv_last(x)

        return x


class RodinConv3DPixelUnshuffleUpsample_improvedVersion2(nn.Module):
    """removed nearest neighbour residual conenctions, add a conv layer residual conenction
    """

    def __init__(
        self,
        output_dim,
        num_out_ch=32 * 3,
        sr_ratio=4,
        input_resolution=256,
    ) -> None:
        super().__init__()

        self.input_resolution = input_resolution

        self.conv_after_body = RodinRollOutConv3D_GroupConv(
            output_dim, num_out_ch, 3, 1, 1)
        self.upsample = Upsample3D(sr_ratio, output_dim)  # 4 time SR
        self.conv_last = RodinRollOutConv3D_GroupConv(output_dim, num_out_ch,
                                                      3, 1, 1)

    def forward(self, x, input_skip_connection=True):

        B, C3, p, p = x.shape  # after unpachify triplane
        C = C3 // 3
        group_size = C3 // C

        assert group_size == 3, 'designed for triplane here'

        x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p,
                                          p)  # B 3 C N -> B 3C h W

        if input_skip_connection:
            x = self.conv_after_body(x) + x
        else:
            x = self.conv_after_body(x)

        x = self.upsample(x)
        x = self.conv_last(x)

        return x


class CLSCrossAttentionBlock(nn.Module):

    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm,
                 has_mlp=False):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = CrossAttention(dim,
                                   num_heads=num_heads,
                                   qkv_bias=qkv_bias,
                                   qk_scale=qk_scale,
                                   attn_drop=attn_drop,
                                   proj_drop=drop)
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_path = DropPath(
            drop_path) if drop_path > 0. else nn.Identity()
        self.has_mlp = has_mlp
        if has_mlp:
            self.norm2 = norm_layer(dim)
            mlp_hidden_dim = int(dim * mlp_ratio)
            self.mlp = Mlp(in_features=dim,
                           hidden_features=mlp_hidden_dim,
                           act_layer=act_layer,
                           drop=drop)

    def forward(self, x):
        x = x[:, 0:1, ...] + self.drop_path(self.attn(self.norm1(x)))
        if self.has_mlp:
            x = x + self.drop_path(self.mlp(self.norm2(x)))

        return x


class Conv3DCrossAttentionBlock(nn.Module):

    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm,
                 has_mlp=False):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Conv3D_Aware_CrossAttention(dim,
                                                num_heads=num_heads,
                                                qkv_bias=qkv_bias,
                                                qk_scale=qk_scale,
                                                attn_drop=attn_drop,
                                                proj_drop=drop)
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_path = DropPath(
            drop_path) if drop_path > 0. else nn.Identity()
        self.has_mlp = has_mlp
        if has_mlp:
            self.norm2 = norm_layer(dim)
            mlp_hidden_dim = int(dim * mlp_ratio)
            self.mlp = Mlp(in_features=dim,
                           hidden_features=mlp_hidden_dim,
                           act_layer=act_layer,
                           drop=drop)

    def forward(self, x):
        x = x + self.drop_path(self.attn(self.norm1(x)))
        if self.has_mlp:
            x = x + self.drop_path(self.mlp(self.norm2(x)))

        return x


class Conv3DCrossAttentionBlockXformerMHA(Conv3DCrossAttentionBlock):

    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio=4,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0,
                 attn_drop=0,
                 drop_path=0,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm,
                 has_mlp=False):
        super().__init__(dim, num_heads, mlp_ratio, qkv_bias, qk_scale, drop,
                         attn_drop, drop_path, act_layer, norm_layer, has_mlp)
        # self.attn = xformer_Conv3D_Aware_CrossAttention(dim,
        self.attn = xformer_Conv3D_Aware_CrossAttention_xygrid(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop)


class Conv3DCrossAttentionBlockXformerMHANested(
        Conv3DCrossAttentionBlockXformerMHA):

    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio=4,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm,
                 has_mlp=False):
        super().__init__(dim, num_heads, mlp_ratio, qkv_bias, qk_scale, drop,
                         attn_drop, drop_path, act_layer, norm_layer, has_mlp)
        """for in-place replaing the internal attn in Dino ViT.
        """

    def forward(self, x):
        Bx3, N, C = x.shape
        B, group_size = Bx3 // 3, 3
        x = x.reshape(B, group_size, N, C)  # in plane vit
        x = super().forward(x)
        return x.reshape(B * group_size, N,
                         C)  # to match the original attn size


class Conv3DCrossAttentionBlockXformerMHANested_withinC(
        Conv3DCrossAttentionBlockXformerMHANested):

    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio=4,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0,
                 attn_drop=0,
                 drop_path=0,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm,
                 has_mlp=False):
        super().__init__(dim, num_heads, mlp_ratio, qkv_bias, qk_scale, drop,
                         attn_drop, drop_path, act_layer, norm_layer, has_mlp)
        self.attn = xformer_Conv3D_Aware_CrossAttention_xygrid_withinC(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop)

    def forward(self, x):
        # basic TX attention forward function
        x = x + self.drop_path(self.attn(self.norm1(x)))
        if self.has_mlp:
            x = x + self.drop_path(self.mlp(self.norm2(x)))

        return x


class TriplaneFusionBlock(nn.Module):
    """4 ViT blocks + 1 CrossAttentionBlock
    """

    def __init__(self,
                 vit_blks,
                 num_heads,
                 embed_dim,
                 use_fusion_blk=True,
                 cross_attention_blk=CLSCrossAttentionBlock,
                 *args,
                 **kwargs) -> None:
        super().__init__(*args, **kwargs)

        self.num_branches = 3  # triplane
        self.vit_blks = vit_blks

        if use_fusion_blk:
            self.fusion = nn.ModuleList()

            # copied vit settings from https://github.dev/facebookresearch/dinov2
            nh = num_heads
            dim = embed_dim

            mlp_ratio = 4  # defined for all dino2 model
            qkv_bias = True
            norm_layer = partial(nn.LayerNorm, eps=1e-6)
            drop_path_rate = 0.3  # default setting
            attn_drop = proj_drop = 0.0
            qk_scale = None  # TODO, double check

            for d in range(self.num_branches):
                self.fusion.append(
                    cross_attention_blk(
                        dim=dim,
                        num_heads=nh,
                        mlp_ratio=mlp_ratio,
                        qkv_bias=qkv_bias,
                        qk_scale=qk_scale,
                        # drop=drop,
                        drop=proj_drop,
                        attn_drop=attn_drop,
                        drop_path=drop_path_rate,
                        norm_layer=norm_layer,  # type: ignore
                        has_mlp=False))
        else:
            self.fusion = None

    def forward(self, x):
        # modified from https://github.com/IBM/CrossViT/blob/main/models/crossvit.py#L132
        """x: B 3 N C, where N = H*W tokens
        """

        # self attention, by merging the triplane channel into B for parallel computation

        # ! move the below to the front of the first call
        B, group_size, N, C = x.shape  # has [cls] token in N
        assert group_size == 3, 'triplane'
        x = x.view(B * group_size, N, C)

        for blk in self.vit_blks:
            x = blk(x)  # B 3 N C

        if self.fusion is None:
            return x.view(B, group_size, N, C)

        # outs_b = x.view(B, group_size, N,
        #                 C).chunk(chunks=3,
        #                          dim=1)  # 3 * [B, 1, N//3, C] Tensors, for fusion

        outs_b = x.chunk(chunks=3,
                         dim=0)  # 3 * [B, N//3, C] Tensors, for fusion

        # only take the cls token out
        proj_cls_token = [x[:, 0:1] for x in outs_b]
        # cross attention
        outs = []
        for i in range(self.num_branches):
            tmp = torch.cat(
                (proj_cls_token[i], outs_b[(i + 1) % self.num_branches][:, 1:,
                                                                        ...]),
                dim=1)
            tmp = self.fusion[i](tmp)
            # reverted_proj_cls_token = self.revert_projs[i](tmp[:, 0:1, ...])
            reverted_proj_cls_token = tmp[:, 0:1, ...]
            tmp = torch.cat((reverted_proj_cls_token, outs_b[i][:, 1:, ...]),
                            dim=1)
            outs.append(tmp)
        # outs = ? needs to merge back?
        outs = torch.stack(outs, 1)  # B 3 N C
        return outs


class TriplaneFusionBlockv2(nn.Module):
    """4 ViT blocks + 1 CrossAttentionBlock
    """

    def __init__(self,
                 vit_blks,
                 num_heads,
                 embed_dim,
                 use_fusion_blk=True,
                 fusion_ca_blk=Conv3DCrossAttentionBlock,
                 *args,
                 **kwargs) -> None:
        super().__init__(*args, **kwargs)

        self.num_branches = 3  # triplane
        self.vit_blks = vit_blks

        if use_fusion_blk:
            # self.fusion = nn.ModuleList()

            # copied vit settings from https://github.dev/facebookresearch/dinov2
            nh = num_heads
            dim = embed_dim

            mlp_ratio = 4  # defined for all dino2 model
            qkv_bias = True
            norm_layer = partial(nn.LayerNorm, eps=1e-6)
            drop_path_rate = 0.3  # default setting
            attn_drop = proj_drop = 0.0
            qk_scale = None  # TODO, double check

            # for d in range(self.num_branches):
            self.fusion = fusion_ca_blk(  # one fusion is enough
                dim=dim,
                num_heads=nh,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                # drop=drop,
                drop=proj_drop,
                attn_drop=attn_drop,
                drop_path=drop_path_rate,
                norm_layer=norm_layer,  # type: ignore
                has_mlp=False)
        else:
            self.fusion = None

    def forward(self, x):
        # modified from https://github.com/IBM/CrossViT/blob/main/models/crossvit.py#L132
        """x: B 3 N C, where N = H*W tokens
        """

        # self attention, by merging the triplane channel into B for parallel computation

        # ! move the below to the front of the first call
        B, group_size, N, C = x.shape  # has [cls] token in N
        assert group_size == 3, 'triplane'
        x = x.reshape(B * group_size, N, C)

        for blk in self.vit_blks:
            x = blk(x)  # B 3 N C

        if self.fusion is None:
            return x.reshape(B, group_size, N, C)

        x = x.reshape(B, group_size, N, C)  # .chunk(chunks=3,
        #  dim=1)  # 3 * [B, N//3, C] Tensors, for fusion
        return self.fusion(x)


class TriplaneFusionBlockv3(TriplaneFusionBlockv2):

    def __init__(self,
                 vit_blks,
                 num_heads,
                 embed_dim,
                 use_fusion_blk=True,
                 fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHA,
                 *args,
                 **kwargs) -> None:
        super().__init__(vit_blks, num_heads, embed_dim, use_fusion_blk,
                         fusion_ca_blk, *args, **kwargs)


class TriplaneFusionBlockv4(TriplaneFusionBlockv3):

    def __init__(self,
                 vit_blks,
                 num_heads,
                 embed_dim,
                 use_fusion_blk=True,
                 fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHA,
                 *args,
                 **kwargs) -> None:
        super().__init__(vit_blks, num_heads, embed_dim, use_fusion_blk,
                         fusion_ca_blk, *args, **kwargs)
        """OOM? directly replace the atten here
        """

        assert len(vit_blks) == 2
        # del self.vit_blks[1].attn
        del self.vit_blks[1].attn, self.vit_blks[1].ls1, self.vit_blks[1].norm1

    def ffn_residual_func(self, tx_blk, x: Tensor) -> Tensor:
        return tx_blk.ls2(
            tx_blk.mlp(tx_blk.norm2(x))
        )  # https://github.com/facebookresearch/dinov2/blob/c3c2683a13cde94d4d99f523cf4170384b00c34c/dinov2/layers/block.py#L86C1-L87C53

    def forward(self, x):
        """x: B 3 N C, where N = H*W tokens
        """
        assert self.fusion is not None

        B, group_size, N, C = x.shape  # has [cls] token in N
        x = x.reshape(B * group_size, N, C)  # in plane vit

        # in plane self attention
        x = self.vit_blks[0](x)

        # 3D cross attention blk + ffn
        x = x + self.fusion(x.reshape(B, group_size, N, C)).reshape(
            B * group_size, N, C)
        x = x + self.ffn_residual_func(self.vit_blks[1], x)
        return x.reshape(B, group_size, N, C)


class TriplaneFusionBlockv4_nested(nn.Module):

    def __init__(self,
                 vit_blks,
                 num_heads,
                 embed_dim,
                 use_fusion_blk=True,
                 fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHANested,
                 *args,
                 **kwargs) -> None:
        super().__init__()

        self.num_branches = 3  # triplane
        self.vit_blks = vit_blks

        assert use_fusion_blk

        assert len(vit_blks) == 2

        # ! replace vit_blks[1] attn layer with 3D aware attention
        del self.vit_blks[
            1].attn  # , self.vit_blks[1].ls1, self.vit_blks[1].norm1

        # copied vit settings from https://github.dev/facebookresearch/dinov2
        nh = num_heads
        dim = embed_dim

        mlp_ratio = 4  # defined for all dino2 model
        qkv_bias = True
        norm_layer = partial(nn.LayerNorm, eps=1e-6)
        drop_path_rate = 0.3  # default setting
        attn_drop = proj_drop = 0.0
        qk_scale = None  # TODO, double check

        self.vit_blks[1].attn = fusion_ca_blk(  # one fusion is enough
            dim=dim,
            num_heads=nh,
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            # drop=drop,
            drop=proj_drop,
            attn_drop=attn_drop,
            drop_path=drop_path_rate,
            norm_layer=norm_layer,  # type: ignore
            has_mlp=False)

    def forward(self, x):
        """x: B 3 N C, where N = H*W tokens
        """

        # self attention, by merging the triplane channel into B for parallel computation

        # ! move the below to the front of the first call
        B, group_size, N, C = x.shape  # has [cls] token in N
        assert group_size == 3, 'triplane'
        x = x.reshape(B * group_size, N, C)

        for blk in self.vit_blks:
            x = blk(x)  # B 3 N C

        # TODO, avoid the reshape overhead?
        return x.reshape(B, group_size, N, C)


class TriplaneFusionBlockv4_nested_init_from_dino(nn.Module):

    def __init__(self,
                 vit_blks,
                 num_heads,
                 embed_dim,
                 use_fusion_blk=True,
                 fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHANested,
                 init_from_dino=True,
                 *args,
                 **kwargs) -> None:
        super().__init__()

        self.num_branches = 3  # triplane
        self.vit_blks = vit_blks

        assert use_fusion_blk

        assert len(vit_blks) == 2

        # copied vit settings from https://github.dev/facebookresearch/dinov2
        nh = num_heads
        dim = embed_dim

        mlp_ratio = 4  # defined for all dino2 model
        qkv_bias = True
        norm_layer = partial(nn.LayerNorm, eps=1e-6)
        drop_path_rate = 0.3  # default setting
        attn_drop = proj_drop = 0.0
        qk_scale = None  # TODO, double check

        attn_3d = fusion_ca_blk(  # one fusion is enough
            dim=dim,
            num_heads=nh,
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            # drop=drop,
            drop=proj_drop,
            attn_drop=attn_drop,
            drop_path=drop_path_rate,
            norm_layer=norm_layer,  # type: ignore
            has_mlp=False)

        # ! initialize 3dattn from dino attn
        if init_from_dino:
            merged_qkv_linear = self.vit_blks[1].attn.qkv
            attn_3d.attn.proj.load_state_dict(
                self.vit_blks[1].attn.proj.state_dict())

            # Initialize the Q, K, and V linear layers using the weights of the merged QKV linear layer
            attn_3d.attn.wq.weight.data = merged_qkv_linear.weight.data[:
                                                                        dim, :]
            attn_3d.attn.w_kv.weight.data = merged_qkv_linear.weight.data[
                dim:, :]

            # Optionally, you can initialize the biases as well (if your QKV linear layer has biases)
            if qkv_bias:
                attn_3d.attn.wq.bias.data = merged_qkv_linear.bias.data[:dim]
                attn_3d.attn.w_kv.bias.data = merged_qkv_linear.bias.data[dim:]

        del self.vit_blks[1].attn
        # ! assign
        self.vit_blks[1].attn = attn_3d

    def forward(self, x):
        """x: B 3 N C, where N = H*W tokens
        """

        # self attention, by merging the triplane channel into B for parallel computation

        # ! move the below to the front of the first call
        B, group_size, N, C = x.shape  # has [cls] token in N
        assert group_size == 3, 'triplane'
        x = x.reshape(B * group_size, N, C)

        for blk in self.vit_blks:
            x = blk(x)  # B 3 N C

        # TODO, avoid the reshape overhead?
        return x.reshape(B, group_size, N, C)


class TriplaneFusionBlockv4_nested_init_from_dino_lite(nn.Module):

    def __init__(self,
                 vit_blks,
                 num_heads,
                 embed_dim,
                 use_fusion_blk=True,
                 fusion_ca_blk=None,
                 *args,
                 **kwargs) -> None:
        super().__init__()

        self.num_branches = 3  # triplane
        self.vit_blks = vit_blks

        assert use_fusion_blk

        assert len(vit_blks) == 2

        # copied vit settings from https://github.dev/facebookresearch/dinov2
        nh = num_heads
        dim = embed_dim

        mlp_ratio = 4  # defined for all dino2 model
        qkv_bias = True
        norm_layer = partial(nn.LayerNorm, eps=1e-6)
        drop_path_rate = 0.3  # default setting
        attn_drop = proj_drop = 0.0
        qk_scale = None  # TODO, double check

        attn_3d = xformer_Conv3D_Aware_CrossAttention_xygrid_withinC(  # ! raw 3D attn layer
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=proj_drop)

        del self.vit_blks[1].attn
        # ! assign
        self.vit_blks[1].attn = attn_3d

    def forward(self, x):
        """x: B N C, where N = H*W tokens. Just raw ViT forward pass
        """

        # ! move the below to the front of the first call
        B, N, C = x.shape  # has [cls] token in N

        for blk in self.vit_blks:
            x = blk(x)  # B N C

        return x

class TriplaneFusionBlockv4_nested_init_from_dino_lite_merge(nn.Module):

    def __init__(self,
                 vit_blks,
                 num_heads,
                 embed_dim,
                 use_fusion_blk=True,
                 fusion_ca_blk=None,
                 *args,
                 **kwargs) -> None:
        super().__init__()

        self.vit_blks = vit_blks

        assert use_fusion_blk
        assert len(vit_blks) == 2

        # copied vit settings from https://github.dev/facebookresearch/dinov2
        nh = num_heads
        dim = embed_dim
        qkv_bias = True
        attn_drop = proj_drop = 0.0
        qk_scale = None  # TODO, double check

        if False: # abla
            for blk in self.vit_blks:
                attn_3d = xformer_Conv3D_Aware_CrossAttention_xygrid_withinC(  # ! raw 3D attn layer
                    dim,
                    num_heads=num_heads,
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    attn_drop=attn_drop,
                    proj_drop=proj_drop)
                blk.attn = self_cross_attn(blk.attn, attn_3d)

    def forward(self, x):
        """x: B N C, where N = H*W tokens. Just raw ViT forward pass
        """

        # ! move the below to the front of the first call
        B, N, C = x.shape  # has [cls] token in N

        for blk in self.vit_blks:
            x = blk(x)  # B N C

        return x

class TriplaneFusionBlockv4_nested_init_from_dino_lite_merge_B_3L_C(TriplaneFusionBlockv4_nested_init_from_dino_lite_merge):
    # on roll out + B 3L C
    def __init__(self, vit_blks, num_heads, embed_dim, use_fusion_blk=True, fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHANested, init_from_dino=True, *args, **kwargs) -> None:
        super().__init__(vit_blks, num_heads, embed_dim, use_fusion_blk, fusion_ca_blk, init_from_dino, *args, **kwargs)


    def forward(self, x):
        """x: B 3 N C, where N = H*W tokens
        """

        # ! move the below to the front of the first call

        # B, N, C = x.shape  # has [cls] token in N
        B, group_size, N, C = x.shape  # has [cls] token in N
        x = x.reshape(B, group_size*N, C)

        for blk in self.vit_blks:
            x = blk(x)  # B N C

        x = x.reshape(B, group_size, N, C) # outer loop tradition

        return x

class TriplaneFusionBlockv4_nested_init_from_dino_lite_merge_B_3L_C_withrollout(TriplaneFusionBlockv4_nested_init_from_dino_lite_merge):
    # roll out + B 3L C
    def __init__(self, vit_blks, num_heads, embed_dim, use_fusion_blk=True, fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHANested, init_from_dino=True, *args, **kwargs) -> None:
        super().__init__(vit_blks, num_heads, embed_dim, use_fusion_blk, fusion_ca_blk, init_from_dino, *args, **kwargs)


    def forward(self, x):
        """x: B 3 N C, where N = H*W tokens
        """

        # ! move the below to the front of the first call

        # B, N, C = x.shape  # has [cls] token in N
        B, group_size, N, C = x.shape  # has [cls] token in N
        x = x.reshape(B*group_size, N, C)
        x = self.vit_blks[0](x)

        x = x.reshape(B,group_size*N, C)
        x = self.vit_blks[1](x)

        x = x.reshape(B, group_size, N, C) # outer loop tradition

        return x


class TriplaneFusionBlockv4_nested_init_from_dino_lite_merge_add3DAttn(TriplaneFusionBlockv4_nested_init_from_dino):
    # no roll out + 3D Attention
    def __init__(self, vit_blks, num_heads, embed_dim, use_fusion_blk=True, fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHANested, init_from_dino=True, *args, **kwargs) -> None:
        super().__init__(vit_blks, num_heads, embed_dim, use_fusion_blk, fusion_ca_blk, init_from_dino, *args, **kwargs)


    def forward(self, x):
        """x: B 3 N C, where N = H*W tokens
        """

        B, group_size, N, C = x.shape  # has [cls] token in N
        x = x.reshape(B, group_size*N, C)
        x = self.vit_blks[0](x) # B 3 L C

        # ! move the below to the front of the first call
        x = x.reshape(B, group_size, N, C).reshape(B*group_size, N, C)
        x = self.vit_blks[1](x) # has 3D attention
        return x.reshape(B, group_size, N, C)

        return x


class TriplaneFusionBlockv5_ldm_addCA(nn.Module):

    def __init__(self,
                 vit_blks,
                 num_heads,
                 embed_dim,
                 use_fusion_blk=True,
                 fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHANested,
                 *args,
                 **kwargs) -> None:
        super().__init__()

        self.num_branches = 3  # triplane
        self.vit_blks = vit_blks

        assert use_fusion_blk

        assert len(vit_blks) == 2

        # ! rather than replacing, add a 3D attention block after.
        # del self.vit_blks[
        #     1].attn  # , self.vit_blks[1].ls1, self.vit_blks[1].norm1
        self.norm_for_atten_3d = deepcopy(self.vit_blks[1].norm1)

        # copied vit settings from https://github.dev/facebookresearch/dinov2
        nh = num_heads
        dim = embed_dim

        mlp_ratio = 4  # defined for all dino2 model
        qkv_bias = True
        norm_layer = partial(nn.LayerNorm, eps=1e-6)
        drop_path_rate = 0.3  # default setting
        attn_drop = proj_drop = 0.0
        qk_scale = None  # TODO, double check

        self.attn_3d = xformer_Conv3D_Aware_CrossAttention_xygrid(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=proj_drop)

    def forward(self, x):
        """x: B 3 N C, where N = H*W tokens
        """

        # self attention, by merging the triplane channel into B for parallel computation

        # ! move the below to the front of the first call
        B, group_size, N, C = x.shape  # has [cls] token in N
        assert group_size == 3, 'triplane'

        flatten_token = lambda x: x.reshape(B * group_size, N, C)
        unflatten_token = lambda x: x.reshape(B, group_size, N, C)

        x = flatten_token(x)
        x = self.vit_blks[0](x)

        x = unflatten_token(x)
        x = self.attn_3d(self.norm_for_atten_3d(x)) + x

        x = flatten_token(x)
        x = self.vit_blks[1](x)

        return unflatten_token(x)


class TriplaneFusionBlockv6_ldm_addCA_Init3DAttnfrom2D(
        TriplaneFusionBlockv5_ldm_addCA):

    def __init__(self,
                 vit_blks,
                 num_heads,
                 embed_dim,
                 use_fusion_blk=True,
                 fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHANested,
                 *args,
                 **kwargs) -> None:
        super().__init__(vit_blks, num_heads, embed_dim, use_fusion_blk,
                         fusion_ca_blk, *args, **kwargs)

    def forward(self, x):
        """x: B 3 N C, where N = H*W tokens
        """

        # self attention, by merging the triplane channel into B for parallel computation

        # ! move the below to the front of the first call
        B, group_size, N, C = x.shape  # has [cls] token in N
        assert group_size == 3, 'triplane'

        flatten_token = lambda x: x.reshape(B * group_size, N, C)
        unflatten_token = lambda x: x.reshape(B, group_size, N, C)

        x = flatten_token(x)
        x = self.vit_blks[0](x)

        x = unflatten_token(x)
        x = self.attn_3d(self.norm_for_atten_3d(x)) + x

        x = flatten_token(x)
        x = self.vit_blks[1](x)

        return unflatten_token(x)


class TriplaneFusionBlockv5_ldm_add_dualCA(nn.Module):

    def __init__(self,
                 vit_blks,
                 num_heads,
                 embed_dim,
                 use_fusion_blk=True,
                 fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHANested,
                 *args,
                 **kwargs) -> None:
        super().__init__()

        self.num_branches = 3  # triplane
        self.vit_blks = vit_blks

        assert use_fusion_blk

        assert len(vit_blks) == 2

        # ! rather than replacing, add a 3D attention block after.
        # del self.vit_blks[
        #     1].attn  # , self.vit_blks[1].ls1, self.vit_blks[1].norm1
        self.norm_for_atten_3d_0 = deepcopy(self.vit_blks[0].norm1)
        self.norm_for_atten_3d_1 = deepcopy(self.vit_blks[1].norm1)

        # copied vit settings from https://github.dev/facebookresearch/dinov2
        nh = num_heads
        dim = embed_dim

        mlp_ratio = 4  # defined for all dino2 model
        qkv_bias = True
        norm_layer = partial(nn.LayerNorm, eps=1e-6)
        drop_path_rate = 0.3  # default setting
        attn_drop = proj_drop = 0.0
        qk_scale = None  # TODO, double check

        self.attn_3d_0 = xformer_Conv3D_Aware_CrossAttention_xygrid(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=proj_drop)

        self.attn_3d_1 = deepcopy(self.attn_3d_0)

    def forward(self, x):
        """x: B 3 N C, where N = H*W tokens
        """

        # self attention, by merging the triplane channel into B for parallel computation

        # ! move the below to the front of the first call
        B, group_size, N, C = x.shape  # has [cls] token in N
        assert group_size == 3, 'triplane'

        flatten_token = lambda x: x.reshape(B * group_size, N, C)
        unflatten_token = lambda x: x.reshape(B, group_size, N, C)

        x = flatten_token(x)
        x = self.vit_blks[0](x)

        x = unflatten_token(x)
        x = self.attn_3d_0(self.norm_for_atten_3d_0(x)) + x

        x = flatten_token(x)
        x = self.vit_blks[1](x)

        x = unflatten_token(x)
        x = self.attn_3d_1(self.norm_for_atten_3d_1(x)) + x

        return unflatten_token(x)


def drop_path(x, drop_prob: float = 0., training: bool = False):
    if drop_prob == 0. or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0], ) + (1, ) * (
        x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(
        shape, dtype=x.dtype, device=x.device)
    random_tensor.floor_()  # binarize
    output = x.div(keep_prob) * random_tensor
    return output


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).
    """

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)


class Mlp(nn.Module):

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer=nn.GELU,
                 drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class Block(nn.Module):

    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm):
        super().__init__()
        self.norm1 = norm_layer(dim)
        # self.attn = Attention(dim,
        self.attn = MemEffAttention(dim,
                              num_heads=num_heads,
                              qkv_bias=qkv_bias,
                              qk_scale=qk_scale,
                              attn_drop=attn_drop,
                              proj_drop=drop)
        self.drop_path = DropPath(
            drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim,
                       hidden_features=mlp_hidden_dim,
                       act_layer=act_layer,
                       drop=drop)

    def forward(self, x, return_attention=False):
        y, attn = self.attn(self.norm1(x))
        if return_attention:
            return attn
        x = x + self.drop_path(y)
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x


class PatchEmbed(nn.Module):
    """ Image to Patch Embedding
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        num_patches = (img_size // patch_size) * (img_size // patch_size)
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = num_patches

        self.proj = nn.Conv2d(in_chans,
                              embed_dim,
                              kernel_size=patch_size,
                              stride=patch_size)

    def forward(self, x):
        B, C, H, W = x.shape
        x = self.proj(x).flatten(2).transpose(1, 2)  # B, C, L -> B, L, C
        return x


class VisionTransformer(nn.Module):
    """ Vision Transformer """

    def __init__(self,
                 img_size=[224],
                 patch_size=16,
                 in_chans=3,
                 num_classes=0,
                 embed_dim=768,
                 depth=12,
                 num_heads=12,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.,
                 norm_layer='nn.LayerNorm',
                 patch_embedding=True,
                 cls_token=True,
                 pixel_unshuffle=False,
                 **kwargs):
        super().__init__()
        self.num_features = self.embed_dim = embed_dim
        self.patch_size = patch_size

        # if norm_layer == 'nn.LayerNorm':
        norm_layer = partial(nn.LayerNorm, eps=1e-6)

        if patch_embedding:
            self.patch_embed = PatchEmbed(img_size=img_size[0],
                                          patch_size=patch_size,
                                          in_chans=in_chans,
                                          embed_dim=embed_dim)
            num_patches = self.patch_embed.num_patches
            self.img_size = self.patch_embed.img_size
        else:
            self.patch_embed = None
            self.img_size = img_size[0]
            num_patches = (img_size[0] // patch_size) * (img_size[0] //
                                                         patch_size)

        if cls_token:
            self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
            self.pos_embed = nn.Parameter(
                torch.zeros(1, num_patches + 1, embed_dim))
        else:
            self.cls_token = None
            self.pos_embed = nn.Parameter(
                torch.zeros(1, num_patches, embed_dim))

        self.pos_drop = nn.Dropout(p=drop_rate)

        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)
               ]  # stochastic depth decay rule
        self.blocks = nn.ModuleList([
            Block(dim=embed_dim,
                  num_heads=num_heads,
                  mlp_ratio=mlp_ratio,
                  qkv_bias=qkv_bias,
                  qk_scale=qk_scale,
                  drop=drop_rate,
                  attn_drop=attn_drop_rate,
                  drop_path=dpr[i],
                  norm_layer=norm_layer) for i in range(depth)
        ])
        self.norm = norm_layer(embed_dim)

        # Classifier head
        self.head = nn.Linear(
            embed_dim, num_classes) if num_classes > 0 else nn.Identity()

        trunc_normal_(self.pos_embed, std=.02)
        if cls_token:
            trunc_normal_(self.cls_token, std=.02)
        self.apply(self._init_weights)

        # if pixel_unshuffle:
        #     self.decoder_pred = nn.Linear(embed_dim,
        #                                 patch_size**2 * out_chans,
        #                                 bias=True)  # decoder to patch

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def interpolate_pos_encoding(self, x, w, h):
        npatch = x.shape[1] - 1
        N = self.pos_embed.shape[1] - 1
        if npatch == N and w == h:
            return self.pos_embed
        patch_pos_embed = self.pos_embed[:, 1:]
        dim = x.shape[-1]
        w0 = w // self.patch_size
        h0 = h // self.patch_size
        # we add a small number to avoid floating point error in the interpolation
        # see discussion at https://github.com/facebookresearch/dino/issues/8
        w0, h0 = w0 + 0.1, h0 + 0.1

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)),
                                    dim).permute(0, 3, 1, 2),
            scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)),
            mode='bicubic',
        )
        assert int(w0) == patch_pos_embed.shape[-2] and int(
            h0) == patch_pos_embed.shape[-1]
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(2, -1, dim)

        if self.cls_token is not None:
            class_pos_embed = self.pos_embed[:, 0]
            return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed),
                             dim=1)
        return patch_pos_embed

    def prepare_tokens(self, x):
        B, nc, w, h = x.shape
        x = self.patch_embed(x)  # patch linear embedding

        # add the [CLS] token to the embed patch tokens
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)

        # add positional encoding to each token
        x = x + self.interpolate_pos_encoding(x, w, h)

        return self.pos_drop(x)

    def forward(self, x):
        x = self.prepare_tokens(x)
        for blk in self.blocks:
            x = blk(x)
        x = self.norm(x)
        return x[:, 1:]  # return spatial feature maps, not the [CLS] token
        # return x[:, 0]

    def get_last_selfattention(self, x):
        x = self.prepare_tokens(x)
        for i, blk in enumerate(self.blocks):
            if i < len(self.blocks) - 1:
                x = blk(x)
            else:
                # return attention of the last block
                return blk(x, return_attention=True)

    def get_intermediate_layers(self, x, n=1):
        x = self.prepare_tokens(x)
        # we return the output tokens from the `n` last blocks
        output = []
        for i, blk in enumerate(self.blocks):
            x = blk(x)
            if len(self.blocks) - i <= n:
                output.append(self.norm(x))
        return output


def vit_tiny(patch_size=16, **kwargs):
    model = VisionTransformer(patch_size=patch_size,
                              embed_dim=192,
                              depth=12,
                              num_heads=3,
                              mlp_ratio=4,
                              qkv_bias=True,
                              norm_layer=partial(nn.LayerNorm, eps=1e-6),
                              **kwargs)
    return model


def vit_small(patch_size=16, **kwargs):
    model = VisionTransformer(
        patch_size=patch_size,
        embed_dim=384,
        depth=12,
        num_heads=6,
        mlp_ratio=4,
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),  # type: ignore
        **kwargs)
    return model


def vit_base(patch_size=16, **kwargs):
    model = VisionTransformer(patch_size=patch_size,
                              embed_dim=768,
                              depth=12,
                              num_heads=12,
                              mlp_ratio=4,
                              qkv_bias=True,
                              norm_layer=partial(nn.LayerNorm, eps=1e-6),
                              **kwargs)
    return model


vits = vit_small
vitb = vit_base