# Scraped repository page header (not part of the module):
# LN3Diff_I23D / vit / vision_transformer.py
# NIRVANALAN - "update" - 615f2be
#
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Mostly copy-paste from timm library.
https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
"""
from copy import deepcopy
import math
from functools import partial
from sympy import flatten
import torch
import torch.nn as nn
from torch import Tensor, pixel_shuffle
from einops import rearrange, repeat
from einops.layers.torch import Rearrange
from torch.nn.modules import GELU
# from vit.vision_transformer import Conv3DCrossAttentionBlock
from .utils import trunc_normal_
from pdb import set_trace as st
# import apex
try:
from apex.normalization import FusedRMSNorm as RMSNorm
except:
# from dit.norm import RMSNorm
from dit.norm import RMSNorm
# from apex.normalization import FusedLayerNorm as LayerNorm
try:
from xformers.ops import memory_efficient_attention, unbind, fmha
from xformers.ops import MemoryEfficientAttentionFlashAttentionOp
# from xformers.ops import RMSNorm
XFORMERS_AVAILABLE = True
except ImportError:
# logger.warning("xFormers not available")
XFORMERS_AVAILABLE = False
class Attention(nn.Module):
    """Multi-head self-attention over a ``(B, N, C)`` token sequence.

    Args:
        dim: Channel width ``C`` of the tokens; split evenly across heads.
        num_heads: Number of attention heads.
        qkv_bias: Whether the fused q/k/v projection carries a bias.
        qk_scale: Optional logit scale; ``head_dim ** -0.5`` is used when
            this is falsy.
        attn_drop: Dropout probability on the attention weights.
        proj_drop: Dropout probability after the output projection.
        enable_rmsnorm: Accepted for signature compatibility; not read here.
        qk_norm: When true, q and k are RMS-normalised over ``head_dim``
            before the dot product; otherwise both norms are identities.
    """

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.,
                 enable_rmsnorm=False,
                 qk_norm=False,):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        # https://github.com/huggingface/pytorch-image-models/blob/5dce71010174ad6599653da4e8ba37fd5f9fa572/timm/models/vision_transformer.py#L79C1-L80C78
        self.q_norm = RMSNorm(head_dim, elementwise_affine=True, eps=1e-5) if qk_norm else nn.Identity()  # sd-3
        self.k_norm = RMSNorm(head_dim, elementwise_affine=True, eps=1e-5) if qk_norm else nn.Identity()
        self.qk_norm = qk_norm

    def forward(self, x):
        """Return the attended tokens, same shape as ``x`` (``B, N, C``)."""
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads,
                                  C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # each B H N head_dim

        # The norms act on the trailing head_dim axis. They were previously
        # constructed but never applied on this code path, so qk_norm=True
        # had no effect here.
        q, k = self.q_norm(q), self.k_norm(k)

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
class MemEffAttention(Attention):
    """Self-attention that uses the xFormers memory-efficient kernel when present.

    Without xFormers the call is delegated to the parent implementation and
    ``attn_bias`` must be ``None``. On the xFormers path only q, k, v and
    ``attn_bias`` are handed to the kernel; ``self.scale`` and
    ``self.attn_drop`` are not used there.
    """

    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
        if XFORMERS_AVAILABLE:
            batch, tokens, channels = x.shape
            packed = self.qkv(x).reshape(batch, tokens, 3, self.num_heads,
                                         channels // self.num_heads)
            q, k, v = unbind(packed, 2)  # each: batch, tokens, heads, head_dim
            q = self.q_norm(q)
            k = self.k_norm(k)

            # if not bf16, no flash-attn here.
            out = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
            out = out.reshape([batch, tokens, channels])
            return self.proj_drop(self.proj(out))

        assert attn_bias is None, "xFormers is required for nested tensors usage"
        return super().forward(x)
class MemEffCrossAttention(MemEffAttention):
    """Cross-attention: queries come from ``x``, keys and values from ``context``.

    The fused ``qkv`` projection of the parent is replaced by a query
    projection ``q`` and a fused key/value projection ``kv``.

    Args:
        dim: Channel width shared by ``x`` and ``context``.
        num_heads: Number of attention heads.
        qkv_bias: Whether ``q`` and ``kv`` carry a bias.
        qk_scale: Optional logit scale forwarded to the parent constructor.
        attn_drop: Dropout probability on the attention weights (applied on
            the pure-PyTorch path only).
        proj_drop: Dropout probability after the output projection.
    """

    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0, proj_drop=0):
        super().__init__(dim, num_heads, qkv_bias, qk_scale, attn_drop, proj_drop)
        del self.qkv
        self.q = nn.Linear(dim, dim * 1, bias=qkv_bias)
        self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)

    def forward(self, x: Tensor, context: Tensor, attn_bias=None) -> Tensor:
        """Attend from ``x`` (``B, N, C``) to ``context`` (``B, M, C``).

        Returns a tensor shaped like ``x``.
        """
        B, N, C = x.shape
        M = context.shape[1]
        head_dim = C // self.num_heads

        # q has a single projection and kv two; the previous reshape used a
        # factor of 3 for both (which cannot match the element count) and
        # projected keys/values from x instead of context.
        q = self.q(x).reshape(B, N, self.num_heads, head_dim)
        kv = self.kv(context).reshape(B, M, 2, self.num_heads, head_dim)

        if XFORMERS_AVAILABLE:
            k, v = unbind(kv, 2)  # each B M H head_dim
            # Let xFormers pick the operator so fp32 inputs keep working; a
            # forced flash-attention op only accepts half-precision inputs.
            out = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
        else:
            # The inherited fallback reads self.qkv, which this class
            # deletes, so plain attention is computed here instead.
            assert attn_bias is None, "xFormers is required for nested tensors usage"
            k, v = kv.permute(2, 0, 3, 1, 4).unbind(0)  # each B H M head_dim
            attn = (q.transpose(1, 2) @ k.transpose(-2, -1)) * self.scale
            attn = self.attn_drop(attn.softmax(dim=-1))
            out = (attn @ v).transpose(1, 2)  # B N H head_dim

        out = out.reshape(B, N, C)
        out = self.proj(out)
        out = self.proj_drop(out)
        return out
# https://github.com/IBM/CrossViT/blob/main/models/crossvit.py
class CrossAttention(nn.Module):
    """Attention in which only the first token queries the full sequence.

    The input is ``(B, N, C)``; the output is ``(B, 1, C)``: one attended
    vector per sample, produced from token 0 as the query and all ``N``
    tokens as keys and values.
    """

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
        self.scale = qk_scale or (dim // num_heads)**-0.5

        self.wq = nn.Linear(dim, dim, bias=qkv_bias)
        self.wk = nn.Linear(dim, dim, bias=qkv_bias)
        self.wv = nn.Linear(dim, dim, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def _split_heads(self, tokens):
        """``(B, L, C)`` -> ``(B, H, L, C/H)``."""
        batch, length, channels = tokens.shape
        per_head = channels // self.num_heads
        return tokens.reshape(batch, length, self.num_heads,
                              per_head).permute(0, 2, 1, 3)

    def forward(self, x):
        B, N, C = x.shape

        first_token = x[:, 0:1, ...]  # B 1 C
        q = self._split_heads(self.wq(first_token))  # B H 1 (C/H)
        k = self._split_heads(self.wk(x))  # B H N (C/H)
        v = self._split_heads(self.wv(x))  # B H N (C/H)

        weights = torch.matmul(q, k.transpose(-2, -1)) * self.scale  # B H 1 N
        weights = self.attn_drop(weights.softmax(dim=-1))

        pooled = torch.matmul(weights, v)  # B H 1 (C/H)
        pooled = pooled.transpose(1, 2).reshape(B, 1, C)
        return self.proj_drop(self.proj(pooled))
class Conv3D_Aware_CrossAttention(nn.Module):
    """Per-token cross-attention between the three planes of a tri-plane.

    Input and output are ``(B, 3, N, C)`` with ``N = p * p`` tokens per plane
    and no extra (e.g. [cls]) token. For the token at grid position ``(i, j)``
    of plane ``g`` the context is the ``p`` tokens of row ``i`` of plane
    ``(g + 1) % 3`` followed by the ``p`` tokens of column ``j`` of plane
    ``(g + 2) % 3``; the token attends to these ``2 * p`` tokens only.

    Args:
        dim: Channel width ``C``.
        num_heads: Number of attention heads.
        qkv_bias: Whether the q/k/v projections carry a bias.
        qk_scale: Optional logit scale; ``head_dim ** -0.5`` when falsy.
        attn_drop: Dropout probability on the attention weights.
        proj_drop: Dropout probability after the output projection.
    """

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
        self.scale = qk_scale or head_dim**-0.5

        self.wq = nn.Linear(dim, dim, bias=qkv_bias)
        self.wk = nn.Linear(dim, dim, bias=qkv_bias)
        self.wv = nn.Linear(dim, dim, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    @staticmethod
    def _axis_aligned_context(planes):
        """Build the per-token context, ``(B*3*N, 2p, C)`` in (B, 3, N) order.

        ``planes`` is ``(B, 3, p, p, C)``.
        """
        B, group_size, p, _, C = planes.shape
        # Token n = i * p + j sits at row i, column j of its plane.
        token_row = torch.arange(p, device=planes.device).repeat_interleave(p)
        token_col = torch.arange(p, device=planes.device).repeat(p)

        per_plane = []
        for i in range(group_size):
            plane_yz = planes[:, (i + 1) % group_size]  # B p p C
            plane_zx = planes[:, (i + 2) % group_size]  # B p p C
            # Row i of the next plane for every token: B N p C.
            rows = plane_yz.index_select(1, token_row)
            # Column j of the plane after that: B p N C -> B N p C.
            cols = plane_zx.index_select(2, token_col).transpose(1, 2)
            per_plane.append(torch.cat([rows, cols], dim=2))  # B N 2p C

        # Stacking on dim 1 keeps the batch axis outermost, so the flattened
        # rows line up with x.reshape(B * 3 * N, ...).
        return torch.stack(per_plane, dim=1).reshape(B * group_size * p * p,
                                                     2 * p, C)

    def forward(self, x):
        """Return the attended tri-plane tokens, shape ``(B, 3, N, C)``."""
        B, group_size, N, C = x.shape  # B 3 N C

        p = int(N**0.5)  # patch size
        assert p**2 == N, 'check input dim, no [cls] needed here'
        assert group_size == 3, 'designed for triplane here'

        planes = x.reshape(B, group_size, p, p, C)  # expand patch token dim

        # Queries and context are laid out in (B, 3, N) order and inherit the
        # dtype of x. The earlier slice-filled torch.empty buffers were in
        # (3, B, N) order, which the final reshape to (B, 3, N, C) then
        # misread whenever B > 1, and they always used the default dtype.
        q_x = x.reshape(B * group_size * N, 1, C)
        kv_x = self._axis_aligned_context(planes)

        head_dim = C // self.num_heads
        q = self.wq(q_x).reshape(B * group_size * N, 1, self.num_heads,
                                 head_dim).permute(0, 2, 1, 3)
        k = self.wk(kv_x).reshape(B * group_size * N, 2 * p, self.num_heads,
                                  head_dim).permute(0, 2, 1, 3)
        v = self.wv(kv_x).reshape(B * group_size * N, 2 * p, self.num_heads,
                                  head_dim).permute(0, 2, 1, 3)

        attn = (q @ k.transpose(-2, -1)) * self.scale  # (B3N) H 1 2p
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        out = (attn @ v).transpose(1, 2).reshape(B * group_size * N, 1, C)
        out = self.proj(out)
        out = self.proj_drop(out)

        return out.reshape(B, group_size, N, C)
class xformer_Conv3D_Aware_CrossAttention(nn.Module):
    """xFormers-backed per-token cross-attention between tri-plane planes.

    Input and output are ``(B, 3, N, C)`` with ``N = p * p`` tokens per plane.
    For the token at grid position ``(i, j)`` of plane ``g`` the context is
    row ``i`` of plane ``(g + 1) % 3`` followed by column ``j`` of plane
    ``(g + 2) % 3`` (``2 * p`` tokens).

    Args:
        dim: Channel width ``C``.
        num_heads: Number of attention heads.
        qkv_bias: Whether the q and kv projections carry a bias.
        qk_scale: Accepted for signature compatibility; not stored or used.
        attn_drop: Probability for the ``attn_drop`` module; that module is
            created but not applied in ``forward``.
        proj_drop: Dropout probability after the output projection.
    """

    # https://github.dev/facebookresearch/dinov2
    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.):
        super().__init__()
        # https://pytorch.org/blog/accelerated-generative-diffusion-models/
        self.num_heads = num_heads

        self.wq = nn.Linear(dim, dim * 1, bias=qkv_bias)
        self.w_kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        # No longer read by this class's forward; retained because
        # subclasses in this file read and populate it.
        self.index_mesh_grid = None

    @staticmethod
    def _axis_aligned_context(planes):
        """Build the per-token context, ``(B*3*N, 2p, C)`` in (B, 3, N) order.

        ``planes`` is ``(B, 3, p, p, C)``.
        """
        B, group_size, p, _, C = planes.shape
        # Token n = i * p + j sits at row i, column j of its plane.
        token_row = torch.arange(p, device=planes.device).repeat_interleave(p)
        token_col = torch.arange(p, device=planes.device).repeat(p)

        per_plane = []
        for i in range(group_size):
            plane_yz = planes[:, (i + 1) % group_size]  # B p p C
            plane_zx = planes[:, (i + 2) % group_size]  # B p p C
            rows = plane_yz.index_select(1, token_row)  # B N p C
            cols = plane_zx.index_select(2, token_col).transpose(1, 2)  # B N p C
            per_plane.append(torch.cat([rows, cols], dim=2))  # B N 2p C

        # Stacking on dim 1 keeps the batch axis outermost.
        return torch.stack(per_plane, dim=1).reshape(B * group_size * p * p,
                                                     2 * p, C)

    def forward(self, x, attn_bias=None):
        """Return the attended tri-plane tokens, shape ``(B, 3, N, C)``."""
        B, group_size, N, C = x.shape  # B 3 N C

        p = int(N**0.5)  # patch size
        assert p**2 == N, 'check input dim, no [cls] needed here'
        assert group_size == 3, 'designed for triplane here'

        planes = x.reshape(B, group_size, p, p, C)  # expand patch token dim

        # Rows are in (B, 3, N) order and inherit the dtype of x. The earlier
        # slice-filled buffers were in (3, B, N) order, which the final
        # reshape to (B, 3, N, C) misread whenever B > 1. Indices are rebuilt
        # per call, so a change of p or device between calls is harmless.
        q_x = x.reshape(B * group_size * N, 1, C)
        context = self._axis_aligned_context(planes)  # k_x = v_x

        head_dim = C // self.num_heads
        q = self.wq(q_x).reshape(B * group_size * N, 1, self.num_heads,
                                 head_dim)
        kv = self.w_kv(context).reshape(B * group_size * N, 2 * p, 2,
                                        self.num_heads, head_dim)
        k, v = unbind(kv, 2)

        x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)  # (B3N) 1 H head_dim
        x = x.reshape(B, group_size, N, C)

        x = self.proj(x)
        x = self.proj_drop(x)
        return x
class xformer_Conv3D_Aware_CrossAttention_xygrid(
        xformer_Conv3D_Aware_CrossAttention):
    """implementation wise clearer, but yields identical results with xformer_Conv3D_Aware_CrossAttention

    Tokens are addressed by ``(u, v)`` = (column, row). The context of a
    token in plane ``g`` is row ``v`` of plane ``(g + 1) % 3`` followed by
    column ``u`` of plane ``(g + 2) % 3``. Input and output are
    ``(B, 3, N, C)`` with ``N = p * p``.
    """

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.0,
                 proj_drop=0.0):
        super().__init__(dim, num_heads, qkv_bias, qk_scale, attn_drop,
                         proj_drop)

    @staticmethod
    def _axis_aligned_context(planes):
        """Build the per-token context, ``(B*3*N, 2p, C)`` in (B, 3, N) order.

        ``planes`` is ``(B, 3, p, p, C)``.
        """
        B, group_size, p, _, C = planes.shape
        # Token n = v * p + u: u is the column ('x'), v the row ('y').
        token_u = torch.arange(p, device=planes.device).repeat(p)
        token_v = torch.arange(p, device=planes.device).repeat_interleave(p)

        per_plane = []
        for i in range(group_size):
            plane_yz = planes[:, (i + 1) % group_size]  # B p p C
            plane_zx = planes[:, (i + 2) % group_size]  # B p p C
            rows = plane_yz.index_select(1, token_v)  # B N p C
            cols = plane_zx.index_select(2, token_u).transpose(1, 2)  # B N p C
            per_plane.append(torch.cat([rows, cols], dim=2))  # B N 2p C

        # Stacking on dim 1 keeps the batch axis outermost.
        return torch.stack(per_plane, dim=1).reshape(B * group_size * p * p,
                                                     2 * p, C)

    def forward(self, x, attn_bias=None):
        """Return the attended tri-plane tokens, shape ``(B, 3, N, C)``."""
        B, group_size, N, C = x.shape  # B 3 N C

        p = int(N**0.5)  # patch size
        assert p**2 == N, 'check input dim, no [cls] needed here'
        assert group_size == 3, 'designed for triplane here'

        planes = x.reshape(B, group_size, p, p, C)  # expand patch token dim

        # Rows are in (B, 3, N) order and inherit the dtype of x. The earlier
        # slice-filled buffers were in (3, B, N) order, which the final
        # reshape to (B, 3, N, C) misread whenever B > 1. Indices are rebuilt
        # per call, so a change of p or device between calls is harmless.
        q_x = x.reshape(B * group_size * N, 1, C)
        context = self._axis_aligned_context(planes)  # k_x = v_x

        head_dim = C // self.num_heads
        q = self.wq(q_x).reshape(B * group_size * N, 1, self.num_heads,
                                 head_dim)
        kv = self.w_kv(context).reshape(B * group_size * N, 2 * p, 2,
                                        self.num_heads, head_dim)
        k, v = unbind(kv, 2)

        x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)  # (B3N) 1 H head_dim
        x = x.reshape(B, group_size, N, C)

        x = self.proj(x)
        x = self.proj_drop(x)
        return x
class xformer_Conv3D_Aware_CrossAttention_xygrid_withinC(
        xformer_Conv3D_Aware_CrossAttention_xygrid):
    """Tri-plane attention for inputs whose three planes live in the channel axis.

    ``forward`` takes ``(B, N, C)``, views the channel axis as ``(C // 3, 3)``
    to obtain ``(B, 3, N, C // 3)``, runs the parent attention and folds the
    plane axis back into the channels.
    """

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0,
                 proj_drop=0):
        super().__init__(dim, num_heads, qkv_bias, qk_scale, attn_drop,
                         proj_drop)

    def forward(self, x, attn_bias=None):
        batch, tokens, channels = x.shape

        # ! split x: B N C into B 3 N C//3
        per_plane = x.reshape(batch, tokens, channels // 3, 3)
        per_plane = per_plane.permute(0, 3, 1, 2)  # B N C 3 -> B 3 N C

        attended = super().forward(per_plane, attn_bias)  # B 3 N C

        merged = attended.permute(0, 2, 3, 1)  # B 3 N C -> B N C 3
        merged = merged.reshape(*merged.shape[:2], -1)  # B N C 3 -> B N C3
        return merged.contiguous()
class self_cross_attn(nn.Module):
    """Chain a residual self-attention module into a cross-attention module.

    Args:
        dino_attn: Module applied first; its output is added to its input.
        cross_attn: Module applied to that sum; its output is returned as is.
    """

    def __init__(self, dino_attn, cross_attn, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.dino_attn = dino_attn
        self.cross_attn = cross_attn

    def forward(self, x_norm):
        residual_sum = self.dino_attn(x_norm) + x_norm
        # will add x in the original code
        return self.cross_attn(residual_sum)
# class RodinRollOutConv(nn.Module):
# """implementation wise clearer, but yields identical results with xformer_Conv3D_Aware_CrossAttention
# Use Group Conv
# """
# def __init__(self, in_chans, out_chans=None):
# super().__init__()
# # input: B 3C H W
# if out_chans is None:
# out_chans = in_chans
# self.roll_out_convs = nn.Conv2d(in_chans,
# out_chans,
# kernel_size=3,
# groups=3,
# padding=1)
# def forward(self, x):
# return self.roll_out_convs(x)
class RodinRollOutConv3D(nn.Module):
    """3D-aware convolution over a tri-plane rolled out along the width axis.

    The input is ``(B, 3C, H, W)`` with ``H == W``, read as three planes of
    ``C`` channels. Each plane is concatenated channel-wise with the
    width-averaged next plane and the height-averaged plane after that (both
    broadcast back to ``H x W``). The three ``3C``-channel results are laid
    side by side along the width and passed through one 3x3 convolution.

    Args:
        in_chans: Total input channels ``3C``.
        out_chans: Total output channels (defaults to ``in_chans``); each
            output plane gets ``out_chans // 3`` channels.
    """

    def __init__(self, in_chans, out_chans=None):
        super().__init__()
        if out_chans is None:
            out_chans = in_chans

        self.out_chans = out_chans // 3
        self.roll_out_convs = nn.Conv2d(in_chans,
                                        self.out_chans,
                                        kernel_size=3,
                                        padding=1)

    def forward(self, x):
        """Map ``(B, 3C, H, W)`` to ``(B, 3 * (out_chans // 3), H, W)``."""
        # todo, reshape before input?
        B, C3, _, p = x.shape  # B 3C H W
        C = C3 // 3
        group_size = C3 // C
        assert group_size == 3

        x = x.reshape(B, 3, C, p, p)

        # Concatenating (rather than filling a torch.empty buffer) makes the
        # roll-out inherit the dtype of x, so non-fp32 modules work.
        strips = []
        for i in range(group_size):
            plane_xy = x[:, i]  # B C H W

            # TODO, simply do the average pooling?
            plane_yz_pooling = x[:, (i + 1) % group_size].mean(
                dim=-1, keepdim=True).expand_as(
                    plane_xy)  # B C H W -> B C H 1 -> B C H W, reduce z dim
            plane_zx_pooling = x[:, (i + 2) % group_size].mean(
                dim=-2, keepdim=True).expand_as(
                    plane_xy)  # B C H W -> B C 1 W -> B C H W, reduce z dim

            strips.append(
                torch.cat([plane_xy, plane_yz_pooling, plane_zx_pooling],
                          1))  # B 3C H W

        roll_out_x = torch.cat(strips, dim=-1)  # B 3C H 3W

        x = self.roll_out_convs(roll_out_x)  # B C H 3W

        # Cut the width back into the three planes and move them to channels.
        x = x.reshape(B, self.out_chans, p, 3, p)
        x = x.permute(0, 3, 1, 2, 4).reshape(B, 3 * self.out_chans, p,
                                             p)  # B 3C H W

        return x
class RodinRollOutConv3D_GroupConv(nn.Module):
    """3D-aware tri-plane convolution implemented as one grouped convolution.

    The input is ``(B, 3C, H, W)`` with ``H == W``, read as three planes of
    ``C`` channels. Each plane is concatenated channel-wise with the
    width-averaged next plane and the height-averaged plane after that (both
    broadcast back to ``H x W``). The resulting ``9C`` channels go through a
    convolution with ``groups=3``, i.e. one group per plane.

    Args:
        in_chans: Total input channels ``3C``.
        out_chans: Total output channels (defaults to ``in_chans``).
        kernel_size, stride, padding: Forwarded to ``nn.Conv2d``.
    """

    def __init__(self,
                 in_chans,
                 out_chans=None,
                 kernel_size=3,
                 stride=1,
                 padding=1):
        super().__init__()
        if out_chans is None:
            out_chans = in_chans

        self.roll_out_convs = nn.Conv2d(
            in_chans * 3,
            out_chans,
            kernel_size=kernel_size,
            groups=3,  # B 9C H W
            stride=stride,
            padding=padding)

    # @torch.autocast(device_type='cuda')
    def forward(self, x):
        """Map ``(B, 3C, H, W)`` to the grouped-convolution output."""
        # todo, reshape before input?
        B, C3, _, p = x.shape  # B 3C H W
        C = C3 // 3
        group_size = C3 // C
        assert group_size == 3

        x = x.reshape(B, 3, C, p, p)

        # The pooled planes are kept as broadcast views and everything is
        # written once by a single cat. This avoids the materialised
        # repeat_interleave copies and gives the roll-out the dtype of x
        # (a torch.empty buffer would always be the default dtype).
        parts = []
        for i in range(group_size):
            plane_xy = x[:, i]  # B C H W
            parts.append(plane_xy)
            parts.append(x[:, (i + 1) % group_size].mean(
                dim=-1, keepdim=True).expand_as(
                    plane_xy))  # B C H W -> B C H 1 -> B C H W, reduce z dim
            parts.append(x[:, (i + 2) % group_size].mean(
                dim=-2, keepdim=True).expand_as(
                    plane_xy))  # B C H W -> B C 1 W -> B C H W, reduce z dim

        roll_out_x = torch.cat(parts, 1)  # B 9C H W

        x = self.roll_out_convs(roll_out_x)  # B 3C H W

        return x
class RodinRollOut_GroupConv_noConv3D(nn.Module):
    """Grouped 2D convolution (``groups=3``) with no cross-plane pooling.

    only roll out and do Conv on individual planes

    Args:
        in_chans: Input channels of the ``(B, 3C, H, W)`` tensor.
        out_chans: Output channels; defaults to ``in_chans``.
        kernel_size, stride, padding: Forwarded to ``nn.Conv2d``.
    """

    def __init__(self,
                 in_chans,
                 out_chans=None,
                 kernel_size=3,
                 stride=1,
                 padding=1):
        super().__init__()
        target_chans = in_chans if out_chans is None else out_chans

        self.roll_out_inplane_conv = nn.Conv2d(
            in_chans,
            target_chans,
            kernel_size=kernel_size,
            groups=3,  # B 3C H W
            stride=stride,
            padding=padding)

    def forward(self, x):
        return self.roll_out_inplane_conv(x)  # B 3C H W
# class RodinConv3D_SynthesisLayer_withact(nn.Module):
# def __init__(self, in_chans, out_chans) -> None:
# super().__init__()
# self.act = nn.LeakyReLU(inplace=True)
# self.conv = nn.Sequential(
# RodinRollOutConv3D_GroupConv(in_chans, out_chans),
# nn.LeakyReLU(inplace=True),
# )
# if in_chans != out_chans:
# self.short_cut = RodinRollOutConv3D_GroupConv(in_chans, out_chans) # PSNR 13 first iteration.
# else:
# self.short_cut = None
# def forward(self, feats):
# if self.short_cut is not None:
# res_feats = self.short_cut(feats)
# else:
# res_feats = feats
# # return res_feats + self.conv(feats)
# feats = res_feats + self.conv(feats)
# return self.act(feats) # as in resnet, add an act before return
class RodinConv3D_SynthesisLayer_mlp_unshuffle_as_residual(nn.Module):
    """Residual block: grouped roll-out conv plus a linear shortcut.

    The main branch is ``RodinRollOutConv3D_GroupConv`` followed by
    LeakyReLU. When ``in_chans != out_chans`` the shortcut is an
    ``nn.Linear`` from ``in_chans // 3`` to ``out_chans // 3 * 16`` features
    whose output ``shortcut_unpatchify_triplane`` rearranges spatially;
    otherwise the input itself is the residual.

    NOTE(review): the shortcut path looks unfinished, see the notes in
    ``shortcut_unpatchify_triplane`` and ``forward``.
    """

    def __init__(self, in_chans, out_chans) -> None:
        super().__init__()

        self.act = nn.LeakyReLU(inplace=True)
        self.conv = nn.Sequential(
            RodinRollOutConv3D_GroupConv(in_chans, out_chans),
            nn.LeakyReLU(inplace=True),
        )

        self.out_chans = out_chans
        if in_chans != out_chans:
            # self.short_cut = RodinRollOutConv3D_GroupConv(in_chans, out_chans) # PSNR 13 first iteration.
            self.short_cut = nn.Linear(  # B 3C H W -> B 3C 4H 4W
                in_chans // 3,  # 144 / 3 = 48
                out_chans // 3 * 4 * 4,  # 32 * 16
                bias=True)  # decoder to pat
            # RodinRollOutConv3D_GroupConv(in_chans, out_chans) # PSNR 13 first iteration.
        else:
            self.short_cut = None

    def shortcut_unpatchify_triplane(self,
                                     x,
                                     p=None,
                                     unpatchify_out_chans=None):
        """Project ``x`` (``B, 3C, h, w`` with ``h == w``) through the linear
        shortcut and rearrange the result into a spatial map.

        The ``p`` argument is overwritten before use. Requires
        ``self.short_cut`` to be set.
        """
        assert self.short_cut is not None

        # B, L, C = x.shape
        B, C3, h, w = x.shape
        assert h == w
        L = h * w
        # The channel axis is viewed as (C3 // 3, 3) here, i.e. with the
        # plane index varying fastest.
        x = x.reshape(B, C3 // 3, 3, L).permute(0, 2, 3,
                                                1)  # (B, 3, L // 3, C)

        x = self.short_cut(x)

        # NOTE(review): p is set to h * 4 and used both as the per-token patch
        # side in the reshape below and as the spatial multiplier in the last
        # reshape, while the linear layer emits out_chans // 3 * 4 * 4
        # features per token. Confirm whether p was meant to be 4.
        p = h * 4

        # NOTE(review): unpatchify_out_chans defaults to None and forward()
        # does not pass it, so this reshape receives None as a dimension.
        x = x.reshape(shape=(B, 3, h, w, p, p, unpatchify_out_chans))
        x = torch.einsum('ndhwpqc->ndchpwq',
                         x)  # nplanes, C order in the renderer.py
        x = x.reshape(shape=(B, 3 * self.out_chans, h * p, h * p))
        return x

    def forward(self, feats):
        """Return ``act(residual + conv(feats))``."""
        if self.short_cut is not None:
            # NOTE(review): called without unpatchify_out_chans; see the
            # notes in shortcut_unpatchify_triplane.
            res_feats = self.shortcut_unpatchify_triplane(feats)
        else:
            res_feats = feats

        # return res_feats + self.conv(feats)

        feats = res_feats + self.conv(feats)
        return self.act(feats)  # as in resnet, add an act before return
# class RodinConv3D_SynthesisLayer(nn.Module):
# def __init__(self, in_chans, out_chans) -> None:
# super().__init__()
# self.act = nn.LeakyReLU(inplace=True)
# self.conv = nn.Sequential(
# RodinRollOutConv3D_GroupConv(in_chans, out_chans),
# nn.LeakyReLU(inplace=True),
# )
# if in_chans != out_chans:
# self.short_cut = RodinRollOutConv3D_GroupConv(in_chans, out_chans) # PSNR 13 first iteration.
# else:
# self.short_cut = None
# def forward(self, feats):
# if self.short_cut is not None:
# res_feats = self.short_cut(feats)
# else:
# res_feats = feats
# # return res_feats + self.conv(feats)
# feats = res_feats + self.conv(feats)
# # return self.act(feats) # as in resnet, add an act before return
# return feats # ! old behaviour, no act
# previous worked version
class RodinConv3D_SynthesisLayer(nn.Module):
    """Residual block built on ``RodinRollOutConv3D_GroupConv``.

    The main branch is a grouped roll-out convolution followed by LeakyReLU.
    The skip branch is the identity when ``in_chans == out_chans`` and a
    second grouped roll-out convolution otherwise. No activation is applied
    after the sum; ``self.act`` is created but not used in ``forward``.
    """

    def __init__(self, in_chans, out_chans) -> None:
        super().__init__()
        # x2 SR + 1x1 Conv Residual BLK
        # self.conv3D = RodinRollOutConv3D(in_chans, out_chans)

        self.act = nn.LeakyReLU(inplace=True)
        self.conv = nn.Sequential(
            RodinRollOutConv3D_GroupConv(in_chans, out_chans),
            nn.LeakyReLU(inplace=True),
        )

        self.short_cut = (RodinRollOutConv3D_GroupConv(in_chans, out_chans)
                          if in_chans != out_chans else None)

    def forward(self, feats):
        main_branch = self.conv(feats)

        if self.short_cut is None:
            return main_branch + feats

        # ! only difference here, no act() compared with baseline
        return self.short_cut(feats) + main_branch
class RodinRollOutConv3DSR2X(nn.Module):
    """Resize a tri-plane to 224 x 224 and add one grouped roll-out conv residual.

    Extra keyword arguments are accepted and ignored. ``self.act`` is created
    but not used in ``forward``.
    """

    def __init__(self, in_chans, **kwargs) -> None:
        super().__init__()
        self.conv3D = RodinRollOutConv3D_GroupConv(in_chans)
        # self.conv3D = RodinRollOutConv3D(in_chans)
        self.act = nn.LeakyReLU(inplace=True)
        self.input_resolution = 224

    def forward(self, x):
        # x: B 3C p p, after unpachify triplane
        B, C3, p, p = x.shape
        C = C3 // 3
        group_size = C3 // C
        assert group_size == 3
        assert group_size == 3, 'designed for triplane here'

        # The permute swaps the two trailing (spatial) axes before reshaping.
        x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p, p)

        target = self.input_resolution
        if x.shape[-1] != target:
            x = torch.nn.functional.interpolate(x,
                                                size=(target, target),
                                                mode='bilinear',
                                                align_corners=False,
                                                antialias=True)

        return x + self.conv3D(x)
class RodinRollOutConv3DSR4X_lite(nn.Module):
    """Resize a tri-plane and refine it with two residual grouped roll-out convs.

    Args:
        in_chans: Total tri-plane channels ``3C``.
        input_resolutiopn: Target side length for the resize (the misspelt
            name is part of the public signature and is kept as is).
        **kwargs: Accepted and ignored. NOTE(review): this means a correctly
            spelt ``input_resolution=...`` is silently dropped.
    """

    def __init__(self, in_chans, input_resolutiopn=256, **kwargs) -> None:
        super().__init__()
        self.conv3D_0 = RodinRollOutConv3D_GroupConv(in_chans)
        self.conv3D_1 = RodinRollOutConv3D_GroupConv(in_chans)

        self.act = nn.LeakyReLU(inplace=True)
        self.input_resolution = input_resolutiopn

    def forward(self, x):
        # x: B 3C p p, after unpachify triplane
        B, C3, p, p = x.shape
        C = C3 // 3
        group_size = C3 // C
        assert group_size == 3
        assert group_size == 3, 'designed for triplane here'

        # The permute swaps the two trailing (spatial) axes before reshaping.
        x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p, p)

        target = self.input_resolution
        if x.shape[-1] != target:
            x = torch.nn.functional.interpolate(x,
                                                size=(target, target),
                                                mode='bilinear',
                                                align_corners=False,
                                                antialias=True)

        # ! still not convering, not bug here?
        for refine in (self.conv3D_0, self.conv3D_1):
            x = x + self.act(refine(x))

        # TODO: which is better, bilinear + conv or PixelUnshuffle?
        return x
# class RodinConv3D2X_lite_mlp_as_residual(nn.Module):
# """lite 4X version, with MLP unshuffle to change the dimention
# """
# def __init__(self, in_chans, out_chans, input_resolution=256) -> None:
# super().__init__()
# self.act = nn.LeakyReLU(inplace=True)
# self.conv3D_0 = RodinRollOutConv3D_GroupConv(in_chans, out_chans)
# self.conv3D_1 = RodinRollOutConv3D_GroupConv(out_chans, out_chans)
# self.act = nn.LeakyReLU(inplace=True)
# self.input_resolution = input_resolution
# self.out_chans = out_chans
# if in_chans != out_chans: # ! only change the dimension
# self.short_cut = nn.Linear( # B 3C H W -> B 3C 4H 4W
# in_chans//3, # 144 / 3 = 48
# out_chans//3, # 32 * 16
# bias=True) # decoder to pat
# else:
# self.short_cut = None
# def shortcut_unpatchify_triplane(self, x, p=None):
# """separate triplane version; x shape: B (3*257) 768
# """
# assert self.short_cut is not None
# # B, L, C = x.shape
# B, C3, h, w = x.shape
# assert h == w
# L = h*w
# x = x.reshape(B, C3//3, 3, L).permute(0,2,3,1) # (B, 3, L // 3, C_in)
# x = self.short_cut(x) # B 3 L//3 C_out
# x = x.permute(0,1,3,2) # B 3 C_out L//3
# x = x.reshape(shape=(B, self.out_chans, h, w))
# # directly resize to the target, no unpatchify here since no 3D ViT is included here
# if w != self.input_resolution:
# x = torch.nn.functional.interpolate(x, # 4X SR
# size=(self.input_resolution,
# self.input_resolution),
# mode='bilinear',
# align_corners=False,
# antialias=True)
# return x
# def forward(self, x):
# # x: B 3 112*112 C
# B, C3, p, p = x.shape # after unpachify triplane
# C = C3 // 3
# if self.short_cut is not None:
# res_feats = self.shortcut_unpatchify_triplane(x)
# else:
# res_feats = x
# """following forward code copied from lite4x version
# """
# x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p,
# p) # B 3 C N -> B 3C h W
# if x.shape[-1] != self.input_resolution:
# x = torch.nn.functional.interpolate(x, # 4X SR
# size=(self.input_resolution,
# self.input_resolution),
# mode='bilinear',
# align_corners=False,
# antialias=True)
# x = res_feats + self.act(self.conv3D_0(x))
# x = x + self.act(self.conv3D_1(x))
# return x
class RodinConv3D4X_lite_mlp_as_residual(nn.Module):
    """lite 4X version, with MLP unshuffle to change the dimention

    Two grouped roll-out convolutions refine a tri-plane resized to
    ``input_resolution``. When ``in_chans != out_chans`` a per-plane linear
    layer maps the channels of the residual path; otherwise the input itself
    is the residual.

    Args:
        in_chans: Total input channels ``3C``.
        out_chans: Total output channels.
        input_resolution: Side length that features are resized to.
        interp_mode: ``'bilinear'`` for antialiased bilinear resizing in
            ``interpolate``; any other value selects nearest-neighbour.
        bcg_triplane: When true, a second head is built and ``forward``
            returns both heads concatenated along the channel axis.
    """

    def __init__(self,
                 in_chans,
                 out_chans,
                 input_resolution=256,
                 interp_mode='bilinear',
                 bcg_triplane=False) -> None:
        super().__init__()

        self.interp_mode = interp_mode
        self.bcg_triplane = bcg_triplane
        self.input_resolution = input_resolution
        self.out_chans = out_chans

        self.act = nn.LeakyReLU(inplace=True)
        self.conv3D_0 = RodinRollOutConv3D_GroupConv(in_chans, out_chans)
        self.conv3D_1 = RodinRollOutConv3D_GroupConv(out_chans, out_chans)
        if bcg_triplane:
            self.conv3D_1_bg = RodinRollOutConv3D_GroupConv(
                out_chans, out_chans)

        # ! only change the dimension
        self.short_cut = (
            nn.Linear(  # B 3C H W -> B 3C 4H 4W
                in_chans // 3,  # 144 / 3 = 48
                out_chans // 3,  # 32 * 16
                bias=True)  # decoder to pat
            if in_chans != out_chans else None)

    def shortcut_unpatchify_triplane(self, x, p=None):
        """Map the channels of ``x`` (``B, 3C, h, w`` with ``h == w``) through
        the linear shortcut and resize bilinearly to ``input_resolution``.

        ``p`` is unused. The resize here is always bilinear, independent of
        ``interp_mode``.
        """
        assert self.short_cut is not None

        B, C3, h, w = x.shape
        assert h == w
        L = h * w

        tokens = x.reshape(B, C3 // 3, 3, L).permute(0, 2, 3, 1)  # B 3 L C_in
        tokens = self.short_cut(tokens)  # B 3 L C_out
        tokens = tokens.permute(0, 1, 3, 2)  # B 3 C_out L
        feats = tokens.reshape(shape=(B, self.out_chans, h, w))

        # directly resize to the target, no unpatchify here since no 3D ViT is included here
        if w != self.input_resolution:
            feats = torch.nn.functional.interpolate(
                feats,  # 4X SR
                size=(self.input_resolution, self.input_resolution),
                mode='bilinear',
                align_corners=False,
                antialias=True)

        return feats

    def interpolate(self, feats):
        """Resize ``feats`` to ``input_resolution`` according to ``interp_mode``."""
        target = (self.input_resolution, self.input_resolution)
        if self.interp_mode != 'bilinear':
            return torch.nn.functional.interpolate(
                feats,  # 4X SR
                size=target,
                mode='nearest',
            )
        return torch.nn.functional.interpolate(
            feats,  # 4X SR
            size=target,
            mode='bilinear',
            align_corners=False,
            antialias=True)

    def forward(self, x):
        # x: B 3C p p, after unpachify triplane
        B, C3, p, p = x.shape
        C = C3 // 3

        if self.short_cut is None:
            res_feats = x
        else:
            res_feats = self.shortcut_unpatchify_triplane(x)

        if res_feats.shape[-1] != self.input_resolution:
            res_feats = self.interpolate(res_feats)

        # following forward code copied from lite4x version
        # (the permute swaps the two trailing spatial axes of the main path)
        x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p, p)

        if x.shape[-1] != self.input_resolution:
            x = self.interpolate(x)

        x0 = res_feats + self.act(self.conv3D_0(x))  # the base feature
        x = x0 + self.act(self.conv3D_1(x0))

        if not self.bcg_triplane:
            return x

        x_bcg = x0 + self.act(self.conv3D_1_bg(x0))
        return torch.cat([x, x_bcg], 1)
class RodinConv3D4X_lite_mlp_as_residual_litev2(
        RodinConv3D4X_lite_mlp_as_residual):
    """Lighter variant: one residual roll-out conv, then per-plane convs around the resize.

    Pipeline in ``forward``: residual ``conv3D_0`` -> ``conv_before_upsample``
    + LeakyReLU -> ``interpolate`` -> ``conv3D_1`` + LeakyReLU ->
    ``conv_last``. The parent's linear shortcut is disabled.

    Args:
        in_chans: Total input channels ``3C``.
        out_chans: Total output channels.
        num_feat: Per-plane width of the intermediate features
            (``num_feat * 3`` channels in total).
        input_resolution, interp_mode, bcg_triplane: Forwarded to the parent
            constructor.
    """

    def __init__(self,
                 in_chans,
                 out_chans,
                 num_feat=128,
                 input_resolution=256,
                 interp_mode='bilinear',
                 bcg_triplane=False) -> None:
        super().__init__(in_chans, out_chans, input_resolution, interp_mode,
                         bcg_triplane)

        hidden_chans = num_feat * 3

        # These replace the layers of the same name built by the parent.
        self.conv3D_0 = RodinRollOutConv3D_GroupConv(in_chans, in_chans)
        self.conv_before_upsample = RodinRollOut_GroupConv_noConv3D(
            in_chans, hidden_chans)
        self.conv3D_1 = RodinRollOut_GroupConv_noConv3D(
            hidden_chans, hidden_chans)
        self.conv_last = RodinRollOut_GroupConv_noConv3D(
            hidden_chans, out_chans)
        self.short_cut = None

    def forward(self, x):
        # x: B 3C p p, after unpachify triplane
        B, C3, p, p = x.shape
        C = C3 // 3

        # following forward code copied from lite4x version
        # (the permute swaps the two trailing spatial axes)
        x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p, p)

        x = x + self.conv3D_0(x)  # the base feature
        x = self.act(self.conv_before_upsample(x))

        upsampled = self.interpolate(x)
        upsampled = self.act(self.conv3D_1(upsampled))
        return self.conv_last(upsampled)
class RodinConv3D4X_lite_mlp_as_residual_lite(
        RodinConv3D4X_lite_mlp_as_residual):
    """Memory-saving variant of the parent head.

    The first Rodin 3D conv is swapped for an ordinary rollout group conv
    (``RodinRollOut_GroupConv_noConv3D``); everything else is inherited.
    """

    def __init__(self,
                 in_chans,
                 out_chans,
                 input_resolution=256,
                 interp_mode='bilinear') -> None:
        super().__init__(in_chans, out_chans, input_resolution, interp_mode)
        # Replace the first Rodin Conv 3D with an ordinary rollout conv to
        # save memory.
        lite_conv = RodinRollOut_GroupConv_noConv3D(in_chans, out_chans)
        self.conv3D_0 = lite_conv
class SR3D(nn.Module):
    """Placeholder super-resolution module (no layers, no ``forward``).

    Intended design, per the original notes: roll out the planes and apply
    two deconv / pixel-unshuffle layers, following Mimic3D:
    https://github.com/SeanChenxy/Mimic3D/blob/77d313656df3cd5536d2c4c5766db3a56208eea6/training/networks_stylegan2.py#L629
    """

    def __init__(self, *args, **kwargs) -> None:
        # Everything is forwarded untouched to nn.Module.
        nn.Module.__init__(self, *args, **kwargs)
class RodinConv3D4X_lite_mlp_as_residual_improved(nn.Module):
    """4x upsampling head built from rollout group convs.

    Layer layout follows the nearest+conv upsampler of SwinIR:
    https://github.com/JingyunLiang/SwinIR/blob/6545850fbf8df298df73d81f3e8cba638787c8bd/models/network_swinir.py#L750

    Requires ``in_chans == 4 * out_chans`` and ``num_feat == 2 * out_chans``.
    """

    def __init__(self,
                 in_chans,
                 num_feat,
                 out_chans,
                 input_resolution=256) -> None:
        super().__init__()
        assert in_chans == 4 * out_chans
        assert num_feat == 2 * out_chans

        self.input_resolution = input_resolution
        self.upscale = 4

        def rollout_conv(c_in, c_out):
            # 3x3 kernel, stride 1, padding 1 everywhere.
            return RodinRollOutConv3D_GroupConv(c_in, c_out, 3, 1, 1)

        self.conv_after_body = rollout_conv(in_chans, in_chans)
        self.conv_before_upsample = nn.Sequential(
            rollout_conv(in_chans, num_feat),
            nn.LeakyReLU(inplace=True),
        )
        self.conv_up1 = rollout_conv(num_feat, num_feat)
        if self.upscale == 4:
            self.conv_up2 = rollout_conv(num_feat, num_feat)
        self.conv_hr = rollout_conv(num_feat, num_feat)
        self.conv_last = rollout_conv(num_feat, out_chans)
        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)

    def forward(self, x):
        """Upsample ``x`` by nearest-neighbour x2 stages, each followed by a conv.

        ``x`` is unpacked as a 4-D tensor; the output's last dimension must
        equal ``self.input_resolution`` (asserted).
        """
        batch, rolled_chans, _, side = x.shape
        per_plane = rolled_chans // 3

        # Same layout handling as the lite4x version.
        x = x.permute(0, 1, 3, 2)
        x = x.reshape(batch, 3 * per_plane, side, side)

        x = self.conv_after_body(x) + x
        x = self.conv_before_upsample(x)

        stage_convs = [self.conv_up1]
        if self.upscale == 4:
            stage_convs.append(self.conv_up2)
        for conv in stage_convs:
            # Open question kept from the original: nearest or bilinear?
            x = torch.nn.functional.interpolate(x,
                                                scale_factor=2,
                                                mode='nearest')
            x = self.lrelu(conv(x))

        x = self.lrelu(self.conv_hr(x))
        x = self.conv_last(x)
        assert x.shape[-1] == self.input_resolution
        return x
class RodinConv3D4X_lite_improved_lint_withresidual(nn.Module):
    """4x upsampling head with an extra residual around ``conv_hr``.

    Layer layout follows the nearest+conv upsampler of SwinIR:
    https://github.com/JingyunLiang/SwinIR/blob/6545850fbf8df298df73d81f3e8cba638787c8bd/models/network_swinir.py#L750

    Requires ``in_chans == 4 * out_chans`` and ``num_feat == 2 * out_chans``.
    """

    def __init__(self,
                 in_chans,
                 num_feat,
                 out_chans,
                 input_resolution=256) -> None:
        super().__init__()
        assert in_chans == 4 * out_chans
        assert num_feat == 2 * out_chans

        self.input_resolution = input_resolution
        self.upscale = 4

        conv_cfg = (3, 1, 1)  # kernel, stride, padding for every conv
        self.conv_after_body = RodinRollOutConv3D_GroupConv(
            in_chans, in_chans, *conv_cfg)
        self.conv_before_upsample = nn.Sequential(
            RodinRollOutConv3D_GroupConv(in_chans, num_feat, *conv_cfg),
            nn.LeakyReLU(inplace=True),
        )
        self.conv_up1 = RodinRollOutConv3D_GroupConv(num_feat, num_feat,
                                                     *conv_cfg)
        if self.upscale == 4:
            self.conv_up2 = RodinRollOutConv3D_GroupConv(
                num_feat, num_feat, *conv_cfg)
        self.conv_hr = RodinRollOutConv3D_GroupConv(num_feat, num_feat,
                                                    *conv_cfg)
        self.conv_last = RodinRollOutConv3D_GroupConv(num_feat, out_chans,
                                                      *conv_cfg)
        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)

    def _upsample_stage(self, x, conv):
        """Nearest-neighbour x2 upsampling followed by ``conv`` and LeakyReLU."""
        doubled = torch.nn.functional.interpolate(x,
                                                  scale_factor=2,
                                                  mode='nearest')
        return self.lrelu(conv(doubled))

    def forward(self, x):
        """Upsample ``x``; the last dim of the result must equal ``input_resolution``.

        ``x`` is unpacked as a 4-D tensor whose last dimension is used as the
        spatial side for the reshape.
        """
        batch, rolled_chans, _, side = x.shape
        per_plane = rolled_chans // 3

        # Same layout handling as the lite4x version.
        x = x.permute(0, 1, 3, 2)
        x = x.reshape(batch, 3 * per_plane, side, side)

        x = self.conv_after_body(x) + x
        x = self.conv_before_upsample(x)
        x = self._upsample_stage(x, self.conv_up1)
        if self.upscale == 4:
            x = self._upsample_stage(x, self.conv_up2)

        # Residual connection around the high-resolution conv.
        high_res = self.conv_hr(x) + x
        x = self.conv_last(self.lrelu(high_res))
        assert x.shape[-1] == self.input_resolution
        return x
class RodinRollOutConv3DSR_FlexibleChannels(nn.Module):
    """Bilinear resize to ``input_resolution`` followed by two synthesis layers.

    Same structure as ``RodinRollOutConv3DSR4X`` but with a configurable
    output channel count and target resolution.
    """

    def __init__(self,
                 in_chans,
                 num_out_ch=96,
                 input_resolution=256,
                 **kwargs) -> None:
        super().__init__()
        self.block0 = RodinConv3D_SynthesisLayer(in_chans, num_out_ch)
        self.block1 = RodinConv3D_SynthesisLayer(num_out_ch, num_out_ch)
        self.input_resolution = input_resolution  # e.g. 64 -> 256 SR

    def forward(self, x):
        """Resize ``x`` if its last dim differs from the target, then refine.

        ``x`` is unpacked as a 4-D tensor whose last dimension is used as the
        spatial side for the reshape.
        """
        batch, rolled_chans, _, side = x.shape
        per_plane = rolled_chans // 3

        x = x.permute(0, 1, 3, 2)
        x = x.reshape(batch, 3 * per_plane, side, side)

        target = self.input_resolution
        if x.shape[-1] != target:
            x = torch.nn.functional.interpolate(x,
                                                size=(target, target),
                                                mode='bilinear',
                                                align_corners=False,
                                                antialias=True)

        return self.block1(self.block0(x))
# Previous working version.
class RodinRollOutConv3DSR4X(nn.Module):
    """Bilinear resize to a fixed resolution followed by two synthesis layers.

    Follows ``PixelUnshuffleUpsample``. Channel width (96) and target
    resolution (64) are fixed; extra keyword arguments are ignored.
    """

    def __init__(self, in_chans, **kwargs) -> None:
        super().__init__()
        width = 96
        # TODO kept from the original: a 96 * 2 wide first block was the old
        # behaviour; the narrower one below is the baseline choice (validate
        # with no LPIPS loss here).
        self.block0 = RodinConv3D_SynthesisLayer(in_chans, width)
        self.block1 = RodinConv3D_SynthesisLayer(width, width)
        self.input_resolution = 64  # 64 -> 256

    def forward(self, x):
        """Resize ``x`` if its last dim differs from the target, then refine.

        ``x`` is unpacked as a 4-D tensor whose last dimension is used as the
        spatial side for the reshape.
        """
        batch, rolled_chans, _, side = x.shape
        per_plane = rolled_chans // 3

        x = x.permute(0, 1, 3, 2)
        x = x.reshape(batch, 3 * per_plane, side, side)

        needs_resize = x.shape[-1] != self.input_resolution
        if needs_resize:
            out_size = (self.input_resolution, self.input_resolution)
            x = torch.nn.functional.interpolate(x,
                                                size=out_size,
                                                mode='bilinear',
                                                align_corners=False,
                                                antialias=True)

        x = self.block0(x)
        return self.block1(x)
class Upsample3D(nn.Module):
    """Pixel-shuffle upsampler for rolled-out tri-plane features.

    Each x2 stage applies a rollout conv that expands ``num_feat`` to
    ``4 * num_feat`` channels, folds the three planes into the batch
    dimension, runs ``nn.PixelShuffle(2)`` and unfolds the planes again.

    Args:
        scale (int): Scale factor. Must be a positive power of two
            (1, 2, 4, ...); ``scale == 1`` builds no stages.
        num_feat (int): Channel number of intermediate features.

    Raises:
        AssertionError: if ``scale`` is not a positive power of two.
    """

    def __init__(self, scale, num_feat):
        super().__init__()
        # `scale >= 1` is needed as well: 0 passes the bit trick on its own.
        assert scale >= 1 and (scale & (scale - 1)) == 0, 'scale = 2^n'
        self.scale = scale

        # Exact integer log2; avoids depending on float rounding of math.log.
        num_stages = int(scale).bit_length() - 1

        self.m_convs = nn.ModuleList([
            RodinRollOutConv3D_GroupConv(num_feat, 4 * num_feat, 3, 1, 1)
            for _ in range(num_stages)
        ])
        self.m_pixelshuffle = nn.ModuleList(
            [nn.PixelShuffle(2) for _ in range(num_stages)])

    # @torch.autocast(device_type='cuda')
    def forward(self, x):
        """Upsample ``x`` (``B, 3C, H, W``) by ``self.scale`` in H and W."""
        for conv, shuffle in zip(self.m_convs, self.m_pixelshuffle):
            x = conv(x)  # B 3C H W
            # Fold the 3 planes into the batch so each is shuffled on its own.
            x = x.reshape(x.shape[0] * 3, x.shape[1] // 3, *x.shape[2:])
            x = shuffle(x)
            # Unfold the planes back into the channel dimension.
            x = x.reshape(x.shape[0] // 3, x.shape[1] * 3, *x.shape[2:])
        return x
class RodinConv3DPixelUnshuffleUpsample(nn.Module):
    """Conv body -> widen -> pixel-shuffle upsample -> output conv.

    Extra positional / keyword constructor arguments are accepted and
    ignored.
    """

    def __init__(self,
                 output_dim,
                 num_feat=32 * 6,
                 num_out_ch=32 * 3,
                 sr_ratio=4,
                 *args,
                 **kwargs) -> None:
        super().__init__()

        self.conv_after_body = RodinRollOutConv3D_GroupConv(
            output_dim, output_dim, 3, 1, 1)
        widen = RodinRollOutConv3D_GroupConv(output_dim, num_feat, 3, 1, 1)
        self.conv_before_upsample = nn.Sequential(
            widen, nn.LeakyReLU(inplace=True))
        self.upsample = Upsample3D(sr_ratio, num_feat)  # 4x SR by default
        self.conv_last = RodinRollOutConv3D_GroupConv(num_feat, num_out_ch, 3,
                                                      1, 1)

    # @torch.autocast(device_type='cuda')
    def forward(self, x, input_skip_connection=True, *args, **kwargs):
        """Upsample ``x``; optionally add the input back after the body conv."""
        body = self.conv_after_body(x)
        if input_skip_connection:
            body = body + x

        widened = self.conv_before_upsample(body)
        upsampled = self.upsample(widened)
        return self.conv_last(upsampled)
class RodinConv3DPixelUnshuffleUpsample_improvedVersion(nn.Module):
    """Pixel-shuffle upsampler with an optional bilinear residual branch."""

    def __init__(
        self,
        output_dim,
        num_out_ch=32 * 3,
        sr_ratio=4,
        input_resolution=256,
    ) -> None:
        super().__init__()
        self.input_resolution = input_resolution

        self.upsample = Upsample3D(sr_ratio, output_dim)  # 4x SR by default
        self.conv_last = RodinRollOutConv3D_GroupConv(output_dim, num_out_ch,
                                                      3, 1, 1)

    def forward(self, x, bilinear_upsample=True):
        """Upsample ``x`` and project it to ``num_out_ch`` channels.

        When ``bilinear_upsample`` is true and the input's last dim differs
        from ``input_resolution``, a bilinearly resized copy of the input is
        added to the pixel-shuffle output.
        """
        batch, rolled_chans, _, side = x.shape
        per_plane = rolled_chans // 3
        group_size = rolled_chans // per_plane
        assert group_size == 3, 'designed for triplane here'

        x = x.permute(0, 1, 3, 2)
        x = x.reshape(batch, 3 * per_plane, side, side)

        add_bilinear = bilinear_upsample and x.shape[
            -1] != self.input_resolution
        if not add_bilinear:
            x = self.upsample(x)
        else:
            target = (self.input_resolution, self.input_resolution)
            x_bilinear_upsample = torch.nn.functional.interpolate(
                x,
                size=target,
                mode='bilinear',
                align_corners=False,
                antialias=True)
            x = self.upsample(x) + x_bilinear_upsample

        return self.conv_last(x)
class RodinConv3DPixelUnshuffleUpsample_improvedVersion2(nn.Module):
    """Pixel-shuffle upsampler with a conv-layer residual connection.

    Compared to ``RodinConv3DPixelUnshuffleUpsample_improvedVersion`` the
    nearest-neighbour / bilinear residual connection is removed and a conv
    layer with an (optional) input skip connection is added instead.

    Args:
        output_dim: channel count of the incoming rolled-out tri-plane.
        num_out_ch: channel count produced by ``conv_last``.
        sr_ratio: upsampling factor handed to ``Upsample3D``.
        input_resolution: stored on the module; not used in ``forward``.
    """

    def __init__(
        self,
        output_dim,
        num_out_ch=32 * 3,
        sr_ratio=4,
        input_resolution=256,
    ) -> None:
        super().__init__()
        self.input_resolution = input_resolution

        # The body conv must keep `output_dim` channels: its output is added
        # to the input (skip connection) and then fed to `Upsample3D`, which
        # is built for `output_dim` channels. Using `num_out_ch` here only
        # worked when the two happened to be equal.
        self.conv_after_body = RodinRollOutConv3D_GroupConv(
            output_dim, output_dim, 3, 1, 1)
        self.upsample = Upsample3D(sr_ratio, output_dim)  # 4x SR by default
        self.conv_last = RodinRollOutConv3D_GroupConv(output_dim, num_out_ch,
                                                      3, 1, 1)

    def forward(self, x, input_skip_connection=True):
        """Upsample ``x`` and project it to ``num_out_ch`` channels.

        ``x`` is unpacked as a 4-D tensor whose last dimension is used as the
        spatial side for the reshape. When ``input_skip_connection`` is true
        the input is added back after the body conv.
        """
        B, C3, _, p = x.shape  # after unpatchify triplane
        C = C3 // 3
        group_size = C3 // C
        assert group_size == 3, 'designed for triplane here'

        x = x.permute(0, 1, 3, 2).reshape(B, 3 * C, p, p)

        body = self.conv_after_body(x)
        x = body + x if input_skip_connection else body

        x = self.upsample(x)
        return self.conv_last(x)
class CLSCrossAttentionBlock(nn.Module):
    """Cross-attention block whose residual uses only the first token.

    ``forward`` adds the attention output to ``x[:, 0:1]`` and, when
    ``has_mlp`` is set, applies a residual MLP afterwards.
    """

    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm,
                 has_mlp=False):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = CrossAttention(dim,
                                   num_heads=num_heads,
                                   qkv_bias=qkv_bias,
                                   qk_scale=qk_scale,
                                   attn_drop=attn_drop,
                                   proj_drop=drop)
        # NOTE: drop path for stochastic depth; whether this beats dropout
        # here is still to be seen.
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

        self.has_mlp = has_mlp
        if has_mlp:
            self.norm2 = norm_layer(dim)
            self.mlp = Mlp(in_features=dim,
                           hidden_features=int(dim * mlp_ratio),
                           act_layer=act_layer,
                           drop=drop)

    def forward(self, x):
        """Return the updated first token (shape ``[:, 0:1, ...]`` of ``x``)."""
        first_token = x[:, 0:1, ...]
        attended = self.attn(self.norm1(x))
        x = first_token + self.drop_path(attended)
        if not self.has_mlp:
            return x
        return x + self.drop_path(self.mlp(self.norm2(x)))
class Conv3DCrossAttentionBlock(nn.Module):
    """Pre-norm residual block around ``Conv3D_Aware_CrossAttention``.

    An optional residual MLP follows the attention when ``has_mlp`` is set.
    """

    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm,
                 has_mlp=False):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Conv3D_Aware_CrossAttention(dim,
                                                num_heads=num_heads,
                                                qkv_bias=qkv_bias,
                                                qk_scale=qk_scale,
                                                attn_drop=attn_drop,
                                                proj_drop=drop)
        # NOTE: drop path for stochastic depth; whether this beats dropout
        # here is still to be seen.
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

        self.has_mlp = has_mlp
        if has_mlp:
            self.norm2 = norm_layer(dim)
            self.mlp = Mlp(in_features=dim,
                           hidden_features=int(dim * mlp_ratio),
                           act_layer=act_layer,
                           drop=drop)

    def forward(self, x):
        """Apply ``x + attn(norm1(x))`` and, optionally, the residual MLP."""
        attended = self.attn(self.norm1(x))
        x = x + self.drop_path(attended)
        if not self.has_mlp:
            return x
        return x + self.drop_path(self.mlp(self.norm2(x)))
class Conv3DCrossAttentionBlockXformerMHA(Conv3DCrossAttentionBlock):
    """``Conv3DCrossAttentionBlock`` using the xformers xy-grid attention.

    The parent is constructed first; its ``attn`` module is then replaced by
    ``xformer_Conv3D_Aware_CrossAttention_xygrid`` built from the same
    hyper-parameters.
    """

    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio=4,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0,
                 attn_drop=0,
                 drop_path=0,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm,
                 has_mlp=False):
        super().__init__(dim, num_heads, mlp_ratio, qkv_bias, qk_scale, drop,
                         attn_drop, drop_path, act_layer, norm_layer, has_mlp)

        attn_kwargs = dict(num_heads=num_heads,
                           qkv_bias=qkv_bias,
                           qk_scale=qk_scale,
                           attn_drop=attn_drop,
                           proj_drop=drop)
        # Alternative that was tried: xformer_Conv3D_Aware_CrossAttention.
        self.attn = xformer_Conv3D_Aware_CrossAttention_xygrid(
            dim, **attn_kwargs)
class Conv3DCrossAttentionBlockXformerMHANested(
        Conv3DCrossAttentionBlockXformerMHA):
    """Drop-in replacement for the internal attention of a DINO ViT block.

    Accepts tokens with the three planes folded into the batch
    (``B*3, N, C``), runs the parent block on ``B, 3, N, C`` and folds the
    planes back so the output matches the original attention's shape.
    """

    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio=4,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm,
                 has_mlp=False):
        super().__init__(dim, num_heads, mlp_ratio, qkv_bias, qk_scale, drop,
                         attn_drop, drop_path, act_layer, norm_layer, has_mlp)

    def forward(self, x):
        """Map ``(B*3, N, C)`` tokens to ``(B*3, N, C)`` via the 3D block."""
        folded_batch, num_tokens, width = x.shape
        num_planes = 3
        batch = folded_batch // num_planes

        per_plane = x.reshape(batch, num_planes, num_tokens, width)
        fused = super().forward(per_plane)
        # Fold the planes back to match the original attention's size.
        return fused.reshape(batch * num_planes, num_tokens, width)
class Conv3DCrossAttentionBlockXformerMHANested_withinC(
        Conv3DCrossAttentionBlockXformerMHANested):
    """Nested block variant using the ``_withinC`` xy-grid attention.

    ``forward`` is a plain pre-norm transformer step: no plane
    folding/unfolding is done here, in contrast to the parent class.
    """

    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio=4,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0,
                 attn_drop=0,
                 drop_path=0,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm,
                 has_mlp=False):
        super().__init__(dim, num_heads, mlp_ratio, qkv_bias, qk_scale, drop,
                         attn_drop, drop_path, act_layer, norm_layer, has_mlp)

        attn_kwargs = dict(num_heads=num_heads,
                           qkv_bias=qkv_bias,
                           qk_scale=qk_scale,
                           attn_drop=attn_drop,
                           proj_drop=drop)
        self.attn = xformer_Conv3D_Aware_CrossAttention_xygrid_withinC(
            dim, **attn_kwargs)

    def forward(self, x):
        """Basic transformer attention forward (optionally with the MLP)."""
        attended = self.attn(self.norm1(x))
        x = x + self.drop_path(attended)
        if not self.has_mlp:
            return x
        return x + self.drop_path(self.mlp(self.norm2(x)))
class TriplaneFusionBlock(nn.Module):
    """ViT blocks followed by per-plane [cls]-token cross attention.

    The given ``vit_blks`` run on each plane independently (planes folded
    into the batch). If fusion is enabled, the [cls] token of plane ``i`` is
    then updated by cross-attending to the patch tokens of plane
    ``(i + 1) % 3``.

    Modified from
    https://github.com/IBM/CrossViT/blob/main/models/crossvit.py#L132

    Args:
        vit_blks: iterable of ViT blocks applied to ``(B*3, N, C)`` tokens.
        num_heads: number of attention heads of the fusion blocks.
        embed_dim: token width of the fusion blocks.
        use_fusion_blk: build the cross-attention fusion blocks when true.
        cross_attention_blk: class used to build each fusion block.
        *args, **kwargs: forwarded to ``nn.Module.__init__``.
    """

    def __init__(self,
                 vit_blks,
                 num_heads,
                 embed_dim,
                 use_fusion_blk=True,
                 cross_attention_blk=CLSCrossAttentionBlock,
                 *args,
                 **kwargs) -> None:
        super().__init__(*args, **kwargs)

        self.num_branches = 3  # triplane
        self.vit_blks = vit_blks

        if not use_fusion_blk:
            self.fusion = None
            return

        # ViT settings copied from https://github.dev/facebookresearch/dinov2
        norm_layer = partial(nn.LayerNorm, eps=1e-6)
        self.fusion = nn.ModuleList([
            cross_attention_blk(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=4,  # defined for all dino2 models
                qkv_bias=True,
                qk_scale=None,  # TODO, double check
                drop=0.0,
                attn_drop=0.0,
                drop_path=0.3,  # default setting
                norm_layer=norm_layer,  # type: ignore
                has_mlp=False) for _ in range(self.num_branches)
        ])

    def forward(self, x):
        """Process tri-plane tokens.

        Args:
            x: ``B 3 N C`` tokens, where N includes the [cls] token at
                index 0.

        Returns:
            Tensor of shape ``B 3 N C``.
        """
        B, group_size, N, C = x.shape  # has [cls] token in N
        assert group_size == 3, 'triplane'

        # Self attention: merge the plane axis into the batch so the ViT
        # blocks process all planes in parallel. `reshape` (not `view`) also
        # accepts non-contiguous inputs, matching the sibling fusion blocks.
        x = x.reshape(B * group_size, N, C)
        for blk in self.vit_blks:
            x = blk(x)
        x = x.reshape(B, group_size, N, C)

        if self.fusion is None:
            return x

        # Split per plane along the plane axis. The folded batch is ordered
        # (sample, plane), so chunking the folded tensor along dim 0 would
        # mix planes of different samples whenever B > 1.
        planes = x.unbind(dim=1)  # 3 * [B, N, C]

        # Only the [cls] token of each plane is used as the query.
        cls_tokens = [plane[:, 0:1] for plane in planes]

        outs = []
        for i in range(self.num_branches):
            neighbour = planes[(i + 1) % self.num_branches]
            query_and_context = torch.cat(
                (cls_tokens[i], neighbour[:, 1:, ...]), dim=1)
            fused = self.fusion[i](query_and_context)
            fused_cls = fused[:, 0:1, ...]
            # Re-attach the plane's own patch tokens to its updated [cls].
            outs.append(
                torch.cat((fused_cls, planes[i][:, 1:, ...]), dim=1))

        return torch.stack(outs, 1)  # B 3 N C
class TriplaneFusionBlockv2(nn.Module):
    """ViT blocks followed by a single shared cross-plane fusion block.

    Modified from
    https://github.com/IBM/CrossViT/blob/main/models/crossvit.py#L132
    """

    def __init__(self,
                 vit_blks,
                 num_heads,
                 embed_dim,
                 use_fusion_blk=True,
                 fusion_ca_blk=Conv3DCrossAttentionBlock,
                 *args,
                 **kwargs) -> None:
        super().__init__(*args, **kwargs)

        self.num_branches = 3  # triplane
        self.vit_blks = vit_blks

        if not use_fusion_blk:
            self.fusion = None
            return

        # ViT settings copied from https://github.dev/facebookresearch/dinov2
        fusion_cfg = dict(
            dim=embed_dim,
            num_heads=num_heads,
            mlp_ratio=4,  # defined for all dino2 models
            qkv_bias=True,
            qk_scale=None,  # TODO, double check
            drop=0.0,
            attn_drop=0.0,
            drop_path=0.3,  # default setting
            norm_layer=partial(nn.LayerNorm, eps=1e-6),  # type: ignore
            has_mlp=False,
        )
        # One fusion block is enough (no per-plane copies).
        self.fusion = fusion_ca_blk(**fusion_cfg)

    def forward(self, x):
        """Process ``B 3 N C`` tokens (N includes the [cls] token)."""
        B, group_size, N, C = x.shape
        assert group_size == 3, 'triplane'

        # Self attention with the plane axis merged into the batch.
        tokens = x.reshape(B * group_size, N, C)
        for blk in self.vit_blks:
            tokens = blk(tokens)
        tokens = tokens.reshape(B, group_size, N, C)

        if self.fusion is None:
            return tokens
        return self.fusion(tokens)
class TriplaneFusionBlockv3(TriplaneFusionBlockv2):
    """``TriplaneFusionBlockv2`` defaulting to the xformers fusion block."""

    def __init__(self,
                 vit_blks,
                 num_heads,
                 embed_dim,
                 use_fusion_blk=True,
                 fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHA,
                 *args,
                 **kwargs) -> None:
        # Only the default of `fusion_ca_blk` differs from the parent.
        super().__init__(
            vit_blks,
            num_heads,
            embed_dim,
            use_fusion_blk,
            fusion_ca_blk,
            *args,
            **kwargs,
        )
class TriplaneFusionBlockv4(TriplaneFusionBlockv3):
    """Two-block variant: in-plane attention, then 3D fusion + the 2nd FFN.

    To save memory (OOM otherwise?) the attention of the second ViT block is
    replaced directly: its ``attn``, ``ls1`` and ``norm1`` are deleted and
    only its FFN branch (``norm2`` / ``mlp`` / ``ls2``) is used.
    """

    def __init__(self,
                 vit_blks,
                 num_heads,
                 embed_dim,
                 use_fusion_blk=True,
                 fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHA,
                 *args,
                 **kwargs) -> None:
        super().__init__(vit_blks, num_heads, embed_dim, use_fusion_blk,
                         fusion_ca_blk, *args, **kwargs)
        assert len(vit_blks) == 2

        # Drop the attention branch of the second block.
        second_blk = self.vit_blks[1]
        del second_blk.attn
        del second_blk.ls1
        del second_blk.norm1

    def ffn_residual_func(self, tx_blk, x: Tensor) -> Tensor:
        """FFN branch of a DINOv2 block: ``ls2(mlp(norm2(x)))``.

        See https://github.com/facebookresearch/dinov2/blob/c3c2683a13cde94d4d99f523cf4170384b00c34c/dinov2/layers/block.py#L86C1-L87C53
        """
        normed = tx_blk.norm2(x)
        return tx_blk.ls2(tx_blk.mlp(normed))

    def forward(self, x):
        """Process ``B 3 N C`` tokens (N includes the [cls] token)."""
        assert self.fusion is not None

        B, group_size, N, C = x.shape
        folded_shape = (B * group_size, N, C)
        plane_shape = (B, group_size, N, C)

        # In-plane self attention.
        x = self.vit_blks[0](x.reshape(folded_shape))

        # 3D cross attention block + FFN.
        # NOTE(review): if the fusion block's own forward already adds its
        # input (as Conv3DCrossAttentionBlock.forward does), the residual is
        # applied twice here — confirm this is intended.
        fused = self.fusion(x.reshape(plane_shape))
        x = x + fused.reshape(folded_shape)
        x = x + self.ffn_residual_func(self.vit_blks[1], x)

        return x.reshape(plane_shape)
class TriplaneFusionBlockv4_nested(nn.Module):
    """Two ViT blocks where the second block's attention is 3D-aware.

    The ``attn`` module of ``vit_blks[1]`` is deleted and replaced by a
    freshly built ``fusion_ca_blk``. Extra positional / keyword arguments
    are accepted and ignored.
    """

    def __init__(self,
                 vit_blks,
                 num_heads,
                 embed_dim,
                 use_fusion_blk=True,
                 fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHANested,
                 *args,
                 **kwargs) -> None:
        super().__init__()

        self.num_branches = 3  # triplane
        self.vit_blks = vit_blks

        assert use_fusion_blk
        assert len(vit_blks) == 2

        # Replace the attention layer of vit_blks[1] with 3D-aware attention
        # (ls1 and norm1 of that block are kept).
        del self.vit_blks[1].attn

        # ViT settings copied from https://github.dev/facebookresearch/dinov2
        fusion_cfg = dict(
            dim=embed_dim,
            num_heads=num_heads,
            mlp_ratio=4,  # defined for all dino2 models
            qkv_bias=True,
            qk_scale=None,  # TODO, double check
            drop=0.0,
            attn_drop=0.0,
            drop_path=0.3,  # default setting
            norm_layer=partial(nn.LayerNorm, eps=1e-6),  # type: ignore
            has_mlp=False,
        )
        # One fusion block is enough.
        self.vit_blks[1].attn = fusion_ca_blk(**fusion_cfg)

    def forward(self, x):
        """Process ``B 3 N C`` tokens (N includes the [cls] token)."""
        B, group_size, N, C = x.shape
        assert group_size == 3, 'triplane'

        # Merge the plane axis into the batch for parallel computation.
        tokens = x.reshape(B * group_size, N, C)
        for blk in self.vit_blks:
            tokens = blk(tokens)

        # TODO, avoid the reshape overhead?
        return tokens.reshape(B, group_size, N, C)
class TriplaneFusionBlockv4_nested_init_from_dino(nn.Module):
    """Two ViT blocks; the second block's attention becomes 3D-aware.

    A new ``fusion_ca_blk`` replaces ``vit_blks[1].attn``. With
    ``init_from_dino`` its projection and q / kv linears are initialised
    from the replaced attention's ``proj`` and merged ``qkv`` layers.

    Args:
        vit_blks: exactly two ViT blocks.
        num_heads: number of attention heads of the 3D attention.
        embed_dim: token width.
        use_fusion_blk: must be truthy (asserted).
        fusion_ca_blk: class used to build the 3D attention block.
        init_from_dino: copy the replaced attention's weights into the new
            block.
        *args, **kwargs: accepted and ignored.
    """

    def __init__(self,
                 vit_blks,
                 num_heads,
                 embed_dim,
                 use_fusion_blk=True,
                 fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHANested,
                 init_from_dino=True,
                 *args,
                 **kwargs) -> None:
        super().__init__()

        self.num_branches = 3  # triplane
        self.vit_blks = vit_blks

        assert use_fusion_blk
        assert len(vit_blks) == 2

        # ViT settings copied from https://github.dev/facebookresearch/dinov2
        dim = embed_dim
        qkv_bias = True
        norm_layer = partial(nn.LayerNorm, eps=1e-6)

        attn_3d = fusion_ca_blk(  # one fusion is enough
            dim=dim,
            num_heads=num_heads,
            mlp_ratio=4,  # defined for all dino2 models
            qkv_bias=qkv_bias,
            qk_scale=None,  # TODO, double check
            drop=0.0,
            attn_drop=0.0,
            drop_path=0.3,  # default setting
            norm_layer=norm_layer,  # type: ignore
            has_mlp=False)

        # Initialise the 3D attention from the DINO attention.
        if init_from_dino:
            dino_attn = self.vit_blks[1].attn
            merged_qkv_linear = dino_attn.qkv

            attn_3d.attn.proj.load_state_dict(dino_attn.proj.state_dict())

            # Split the merged QKV weight: the first `dim` rows go to the
            # query linear, the remaining rows to the key/value linear.
            # `.clone()` gives each parameter its own storage instead of a
            # view aliasing the replaced layer's weight.
            merged_weight = merged_qkv_linear.weight.data
            attn_3d.attn.wq.weight.data = merged_weight[:dim, :].clone()
            attn_3d.attn.w_kv.weight.data = merged_weight[dim:, :].clone()

            # Copy biases only when the source layer actually has one.
            if qkv_bias and merged_qkv_linear.bias is not None:
                merged_bias = merged_qkv_linear.bias.data
                attn_3d.attn.wq.bias.data = merged_bias[:dim].clone()
                attn_3d.attn.w_kv.bias.data = merged_bias[dim:].clone()

        del self.vit_blks[1].attn
        # Assign the 3D-aware attention in place of the removed one.
        self.vit_blks[1].attn = attn_3d

    def forward(self, x):
        """Process ``B 3 N C`` tokens (N includes the [cls] token)."""
        B, group_size, N, C = x.shape
        assert group_size == 3, 'triplane'

        # Merge the plane axis into the batch for parallel computation.
        x = x.reshape(B * group_size, N, C)
        for blk in self.vit_blks:
            x = blk(x)

        # TODO, avoid the reshape overhead?
        return x.reshape(B, group_size, N, C)
class TriplaneFusionBlockv4_nested_init_from_dino_lite(nn.Module):
    """Two ViT blocks; the second uses the raw ``_withinC`` 3D attention.

    ``fusion_ca_blk`` and any extra arguments are accepted and ignored.
    """

    def __init__(self,
                 vit_blks,
                 num_heads,
                 embed_dim,
                 use_fusion_blk=True,
                 fusion_ca_blk=None,
                 *args,
                 **kwargs) -> None:
        super().__init__()

        self.num_branches = 3  # triplane
        self.vit_blks = vit_blks

        assert use_fusion_blk
        assert len(vit_blks) == 2

        # ViT settings copied from https://github.dev/facebookresearch/dinov2
        attn_cfg = dict(
            num_heads=num_heads,
            qkv_bias=True,
            qk_scale=None,  # TODO, double check
            attn_drop=0.0,
            proj_drop=0.0,
        )
        # Raw 3D attention layer (no surrounding block).
        attn_3d = xformer_Conv3D_Aware_CrossAttention_xygrid_withinC(
            embed_dim, **attn_cfg)

        # Swap it in for the second block's attention.
        del self.vit_blks[1].attn
        self.vit_blks[1].attn = attn_3d

    def forward(self, x):
        """Plain ViT forward pass over ``B N C`` tokens (N has the [cls] token)."""
        _batch, _num_tokens, _width = x.shape  # input must be 3-D
        for blk in self.vit_blks:
            x = blk(x)
        return x
class TriplaneFusionBlockv4_nested_init_from_dino_lite_merge(nn.Module):
    """Runs the two given ViT blocks unchanged.

    The merge of self- and 3D cross-attention is an ablation that is
    currently switched off, so no block is modified. ``fusion_ca_blk`` and
    extra arguments are accepted and ignored.
    """

    def __init__(self,
                 vit_blks,
                 num_heads,
                 embed_dim,
                 use_fusion_blk=True,
                 fusion_ca_blk=None,
                 *args,
                 **kwargs) -> None:
        super().__init__()

        self.vit_blks = vit_blks

        assert use_fusion_blk
        assert len(vit_blks) == 2

        # ViT settings copied from https://github.dev/facebookresearch/dinov2
        nh = num_heads
        dim = embed_dim
        qkv_bias = True
        qk_scale = None  # TODO, double check
        attn_drop = proj_drop = 0.0

        merge_3d_attn = False  # ablation switch, disabled
        if merge_3d_attn:
            for blk in self.vit_blks:
                # Raw 3D attention layer, merged with the block's own attn.
                attn_3d = xformer_Conv3D_Aware_CrossAttention_xygrid_withinC(
                    dim,
                    num_heads=nh,
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    attn_drop=attn_drop,
                    proj_drop=proj_drop)
                blk.attn = self_cross_attn(blk.attn, attn_3d)

    def forward(self, x):
        """Plain ViT forward pass over ``B N C`` tokens (N has the [cls] token)."""
        _batch, _num_tokens, _width = x.shape  # input must be 3-D
        for blk in self.vit_blks:
            x = blk(x)
        return x
class TriplaneFusionBlockv4_nested_init_from_dino_lite_merge_B_3L_C(
        TriplaneFusionBlockv4_nested_init_from_dino_lite_merge):
    """Roll-out variant: both ViT blocks see all planes as ``B, 3L, C``."""

    def __init__(self,
                 vit_blks,
                 num_heads,
                 embed_dim,
                 use_fusion_blk=True,
                 fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHANested,
                 init_from_dino=True,
                 *args,
                 **kwargs) -> None:
        # `init_from_dino` travels on as an extra positional argument.
        super().__init__(vit_blks, num_heads, embed_dim, use_fusion_blk,
                         fusion_ca_blk, init_from_dino, *args, **kwargs)

    def forward(self, x):
        """Process ``B 3 N C`` tokens (N includes the [cls] token)."""
        B, group_size, N, C = x.shape

        # Concatenate the planes along the token axis.
        tokens = x.reshape(B, group_size * N, C)
        for blk in self.vit_blks:
            tokens = blk(tokens)

        # Back to the B 3 N C layout expected by the outer loop.
        return tokens.reshape(B, group_size, N, C)
class TriplaneFusionBlockv4_nested_init_from_dino_lite_merge_B_3L_C_withrollout(
        TriplaneFusionBlockv4_nested_init_from_dino_lite_merge):
    """First block per plane (``B*3, N, C``), second on the roll-out (``B, 3L, C``)."""

    def __init__(self,
                 vit_blks,
                 num_heads,
                 embed_dim,
                 use_fusion_blk=True,
                 fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHANested,
                 init_from_dino=True,
                 *args,
                 **kwargs) -> None:
        # `init_from_dino` travels on as an extra positional argument.
        super().__init__(vit_blks, num_heads, embed_dim, use_fusion_blk,
                         fusion_ca_blk, init_from_dino, *args, **kwargs)

    def forward(self, x):
        """Process ``B 3 N C`` tokens (N includes the [cls] token)."""
        B, group_size, N, C = x.shape

        # In-plane attention: planes folded into the batch.
        per_plane = x.reshape(B * group_size, N, C)
        per_plane = self.vit_blks[0](per_plane)

        # Cross-plane attention: planes concatenated along the token axis.
        rolled_out = per_plane.reshape(B, group_size * N, C)
        rolled_out = self.vit_blks[1](rolled_out)

        # Back to the B 3 N C layout expected by the outer loop.
        return rolled_out.reshape(B, group_size, N, C)
class TriplaneFusionBlockv4_nested_init_from_dino_lite_merge_add3DAttn(
        TriplaneFusionBlockv4_nested_init_from_dino):
    """Roll-out first block followed by a second block with 3D attention.

    ``vit_blks[0]`` sees all planes concatenated along the token axis
    (``B, 3L, C``); ``vit_blks[1]`` — whose attention was replaced by the
    parent constructor — sees the planes folded into the batch
    (``B*3, N, C``).
    """

    def __init__(self,
                 vit_blks,
                 num_heads,
                 embed_dim,
                 use_fusion_blk=True,
                 fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHANested,
                 init_from_dino=True,
                 *args,
                 **kwargs) -> None:
        super().__init__(vit_blks, num_heads, embed_dim, use_fusion_blk,
                         fusion_ca_blk, init_from_dino, *args, **kwargs)

    def forward(self, x):
        """Process ``B 3 N C`` tokens (N includes the [cls] token).

        Returns a tensor of shape ``B 3 N C``.
        """
        B, group_size, N, C = x.shape

        x = x.reshape(B, group_size * N, C)
        x = self.vit_blks[0](x)  # B 3L C

        # A single reshape is enough to fold the planes into the batch.
        x = x.reshape(B * group_size, N, C)
        x = self.vit_blks[1](x)  # has 3D attention

        return x.reshape(B, group_size, N, C)
class TriplaneFusionBlockv5_ldm_addCA(nn.Module):
    """Two ViT blocks with an additional 3D attention inserted between them.

    Rather than replacing the attention of a ViT block, a separate
    ``xformer_Conv3D_Aware_CrossAttention_xygrid`` (with its own norm, a
    deep copy of ``vit_blks[1].norm1``) is added after the first block.
    ``fusion_ca_blk`` and extra arguments are accepted and ignored.
    """

    def __init__(self,
                 vit_blks,
                 num_heads,
                 embed_dim,
                 use_fusion_blk=True,
                 fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHANested,
                 *args,
                 **kwargs) -> None:
        super().__init__()

        self.num_branches = 3  # triplane
        self.vit_blks = vit_blks

        assert use_fusion_blk
        assert len(vit_blks) == 2

        # Add a 3D attention after the first block instead of replacing one.
        self.norm_for_atten_3d = deepcopy(self.vit_blks[1].norm1)

        # ViT settings copied from https://github.dev/facebookresearch/dinov2
        attn_cfg = dict(
            num_heads=num_heads,
            qkv_bias=True,
            qk_scale=None,  # TODO, double check
            attn_drop=0.0,
            proj_drop=0.0,
        )
        self.attn_3d = xformer_Conv3D_Aware_CrossAttention_xygrid(
            embed_dim, **attn_cfg)

    def forward(self, x):
        """Process ``B 3 N C`` tokens (N includes the [cls] token)."""
        B, group_size, N, C = x.shape
        assert group_size == 3, 'triplane'

        folded_shape = (B * group_size, N, C)  # planes merged into batch
        plane_shape = (B, group_size, N, C)

        x = self.vit_blks[0](x.reshape(folded_shape))
        x = x.reshape(plane_shape)

        # Residual 3D attention across planes.
        x = self.attn_3d(self.norm_for_atten_3d(x)) + x

        x = self.vit_blks[1](x.reshape(folded_shape))
        return x.reshape(plane_shape)
class TriplaneFusionBlockv6_ldm_addCA_Init3DAttnfrom2D(
        TriplaneFusionBlockv5_ldm_addCA):
    """Same construction as the v5 block; ``forward`` is spelled out again."""

    def __init__(self,
                 vit_blks,
                 num_heads,
                 embed_dim,
                 use_fusion_blk=True,
                 fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHANested,
                 *args,
                 **kwargs) -> None:
        super().__init__(vit_blks, num_heads, embed_dim, use_fusion_blk,
                         fusion_ca_blk, *args, **kwargs)

    def forward(self, x):
        """Process ``B 3 N C`` tokens (N includes the [cls] token)."""
        B, group_size, N, C = x.shape
        assert group_size == 3, 'triplane'

        def fold(t):
            # Merge the plane axis into the batch.
            return t.reshape(B * group_size, N, C)

        def unfold(t):
            # Restore the explicit plane axis.
            return t.reshape(B, group_size, N, C)

        x = unfold(self.vit_blks[0](fold(x)))
        # Residual 3D attention across planes.
        x = self.attn_3d(self.norm_for_atten_3d(x)) + x
        return unfold(self.vit_blks[1](fold(x)))
class TriplaneFusionBlockv5_ldm_add_dualCA(nn.Module):
    """Two ViT blocks, each followed by its own residual 3D attention.

    The second 3D attention starts as a deep copy of the first one; each
    gets a norm deep-copied from the ``norm1`` of the matching ViT block.
    ``fusion_ca_blk`` and extra arguments are accepted and ignored.
    """

    def __init__(self,
                 vit_blks,
                 num_heads,
                 embed_dim,
                 use_fusion_blk=True,
                 fusion_ca_blk=Conv3DCrossAttentionBlockXformerMHANested,
                 *args,
                 **kwargs) -> None:
        super().__init__()

        self.num_branches = 3  # triplane
        self.vit_blks = vit_blks

        assert use_fusion_blk
        assert len(vit_blks) == 2

        # Add 3D attention after each block instead of replacing anything.
        self.norm_for_atten_3d_0 = deepcopy(self.vit_blks[0].norm1)
        self.norm_for_atten_3d_1 = deepcopy(self.vit_blks[1].norm1)

        # ViT settings copied from https://github.dev/facebookresearch/dinov2
        attn_cfg = dict(
            num_heads=num_heads,
            qkv_bias=True,
            qk_scale=None,  # TODO, double check
            attn_drop=0.0,
            proj_drop=0.0,
        )
        self.attn_3d_0 = xformer_Conv3D_Aware_CrossAttention_xygrid(
            embed_dim, **attn_cfg)
        self.attn_3d_1 = deepcopy(self.attn_3d_0)

    def forward(self, x):
        """Process ``B 3 N C`` tokens (N includes the [cls] token)."""
        B, group_size, N, C = x.shape
        assert group_size == 3, 'triplane'

        folded_shape = (B * group_size, N, C)  # planes merged into batch
        plane_shape = (B, group_size, N, C)

        stages = (
            (self.vit_blks[0], self.norm_for_atten_3d_0, self.attn_3d_0),
            (self.vit_blks[1], self.norm_for_atten_3d_1, self.attn_3d_1),
        )
        for vit_blk, norm_3d, attn_3d in stages:
            x = vit_blk(x.reshape(folded_shape)).reshape(plane_shape)
            x = attn_3d(norm_3d(x)) + x

        return x.reshape(plane_shape)
def drop_path(x, drop_prob: float = 0., training: bool = False):
    """Drop whole samples of ``x`` at random (stochastic depth).

    Each sample along dim 0 is kept with probability ``1 - drop_prob`` and
    rescaled by ``1 / (1 - drop_prob)``; dropped samples become zero.

    Args:
        x: input tensor; dim 0 is the sample dimension.
        drop_prob: probability of dropping a sample. ``0`` or ``None``
            disables dropping.
        training: dropping is only applied when true.

    Returns:
        ``x`` itself when nothing is dropped, otherwise a new tensor of the
        same shape.
    """
    # `not drop_prob` covers both 0 and None.
    if not drop_prob or not training:
        return x

    keep_prob = 1 - drop_prob
    # One mask value per sample, broadcast over all remaining dims, so this
    # works with tensors of any rank, not just 2D ConvNets.
    shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)

    if keep_prob <= 0.:
        # Everything is dropped. Dividing by keep_prob here would produce
        # inf * 0 = NaN, so multiply by an explicit zero mask instead.
        return x * x.new_zeros(shape)

    random_tensor = keep_prob + torch.rand(
        shape, dtype=x.dtype, device=x.device)
    random_tensor.floor_()  # binarize: 1 with prob keep_prob, else 0
    return x.div(keep_prob) * random_tensor
class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample.

    Meant for the main path of residual blocks. A ``drop_prob`` of ``None``
    (the default) or ``0`` makes the module an identity.
    """

    def __init__(self, drop_prob=None):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        # Short-circuit here so the default `drop_prob=None` never reaches
        # the arithmetic in `drop_path` (which would raise in training mode).
        if not self.drop_prob or not self.training:
            return x
        return drop_path(x, self.drop_prob, self.training)
class Mlp(nn.Module):
    """Two-layer feed-forward network with dropout after each linear layer.

    ``hidden_features`` and ``out_features`` fall back to ``in_features``
    when not given (or falsy).
    """

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer=nn.GELU,
                 drop=0.):
        super().__init__()
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features

        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)  # shared by both dropout positions

    def forward(self, x):
        """Compute ``drop(fc2(drop(act(fc1(x)))))``."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
class Block(nn.Module):
    """Pre-norm transformer block: attention and MLP, each with a residual.

    Args:
        dim: token embedding width.
        num_heads: forwarded to the attention module.
        mlp_ratio: hidden width of the MLP is ``int(dim * mlp_ratio)``.
        qkv_bias, qk_scale, attn_drop: forwarded to the attention module.
        drop: used as the attention ``proj_drop`` and as the MLP dropout.
        drop_path: stochastic-depth rate; values <= 0 install ``nn.Identity``.
        act_layer: activation factory handed to the MLP.
        norm_layer: factory called as ``norm_layer(dim)`` for both norms.
    """

    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm):
        super().__init__()

        # Attention branch.
        self.norm1 = norm_layer(dim)
        attn_kwargs = dict(num_heads=num_heads,
                           qkv_bias=qkv_bias,
                           qk_scale=qk_scale,
                           attn_drop=attn_drop,
                           proj_drop=drop)
        self.attn = MemEffAttention(dim, **attn_kwargs)

        # Stochastic depth is shared by both residual branches.
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

        # Feed-forward branch.
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim,
                       hidden_features=int(dim * mlp_ratio),
                       act_layer=act_layer,
                       drop=drop)

    def forward(self, x, return_attention=False):
        """Apply the block; with ``return_attention`` return only the second
        element of the pair produced by ``self.attn``."""
        attn_out, attn_weights = self.attn(self.norm1(x))
        if return_attention:
            return attn_weights
        x = x + self.drop_path(attn_out)
        mlp_out = self.mlp(self.norm2(x))
        return x + self.drop_path(mlp_out)
class PatchEmbed(nn.Module):
    """ Image to Patch Embedding

    A strided convolution (kernel = stride = ``patch_size``) projects each
    patch to ``embed_dim`` channels; ``forward`` returns the patches as a
    token sequence.
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        patches_per_side = img_size // patch_size
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = patches_per_side * patches_per_side
        self.proj = nn.Conv2d(in_chans,
                              embed_dim,
                              kernel_size=patch_size,
                              stride=patch_size)

    def forward(self, x):
        # Unpacking four axes rejects anything that is not a 4-D tensor.
        _batch, _chans, _height, _width = x.shape
        feature_map = self.proj(x)
        tokens = feature_map.flatten(2)  # merge the two spatial axes
        return tokens.transpose(1, 2)  # B, C, L -> B, L, C
class VisionTransformer(nn.Module):
    """ Vision Transformer

    Args:
        img_size: sequence whose first element is the (square) image side.
        patch_size: side of one patch; also used to rescale the positional
            embedding for other input resolutions.
        in_chans: input channels of the patch embedding.
        num_classes: if > 0 a linear ``head`` is created, otherwise an
            identity. ``forward`` does not apply ``head``.
        embed_dim: token width.
        depth: number of transformer blocks.
        num_heads, mlp_ratio, qkv_bias, qk_scale: forwarded to every block.
        drop_rate: dropout after adding positions and inside the blocks.
        attn_drop_rate: attention dropout inside the blocks.
        drop_path_rate: maximum stochastic-depth rate; block ``i`` gets the
            ``i``-th value of a linear ramp from 0 to this value.
        norm_layer: accepted for API compatibility but ignored; the model
            always uses ``LayerNorm(eps=1e-6)``.
        patch_embedding: when false no ``PatchEmbed`` is created and
            ``self.patch_embed`` is ``None``. ``prepare_tokens`` still calls
            it, so presumably subclasses override that path (TODO confirm).
        cls_token: when false no [CLS] token/position is created.
        pixel_unshuffle, **kwargs: accepted and unused by this class.
    """

    def __init__(self,
                 img_size=[224],
                 patch_size=16,
                 in_chans=3,
                 num_classes=0,
                 embed_dim=768,
                 depth=12,
                 num_heads=12,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.,
                 norm_layer='nn.LayerNorm',
                 patch_embedding=True,
                 cls_token=True,
                 pixel_unshuffle=False,
                 **kwargs):
        super().__init__()
        self.num_features = self.embed_dim = embed_dim
        self.patch_size = patch_size

        # The norm_layer argument is deliberately overridden (see class docstring).
        norm_layer = partial(nn.LayerNorm, eps=1e-6)

        if patch_embedding:
            self.patch_embed = PatchEmbed(img_size=img_size[0],
                                          patch_size=patch_size,
                                          in_chans=in_chans,
                                          embed_dim=embed_dim)
            num_patches = self.patch_embed.num_patches
            self.img_size = self.patch_embed.img_size
        else:
            self.patch_embed = None
            self.img_size = img_size[0]
            num_patches = (img_size[0] // patch_size) * (img_size[0] //
                                                         patch_size)

        if cls_token:
            # One extra position is reserved for the [CLS] token.
            self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
            self.pos_embed = nn.Parameter(
                torch.zeros(1, num_patches + 1, embed_dim))
        else:
            self.cls_token = None
            self.pos_embed = nn.Parameter(
                torch.zeros(1, num_patches, embed_dim))

        self.pos_drop = nn.Dropout(p=drop_rate)

        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)
               ]  # stochastic depth decay rule
        self.blocks = nn.ModuleList([
            Block(dim=embed_dim,
                  num_heads=num_heads,
                  mlp_ratio=mlp_ratio,
                  qkv_bias=qkv_bias,
                  qk_scale=qk_scale,
                  drop=drop_rate,
                  attn_drop=attn_drop_rate,
                  drop_path=dpr[i],
                  norm_layer=norm_layer) for i in range(depth)
        ])
        self.norm = norm_layer(embed_dim)

        # Classifier head
        self.head = nn.Linear(
            embed_dim, num_classes) if num_classes > 0 else nn.Identity()

        trunc_normal_(self.pos_embed, std=.02)
        if cls_token:
            trunc_normal_(self.cls_token, std=.02)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Truncated-normal Linear weights, zero biases, unit LayerNorm scale."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def interpolate_pos_encoding(self, x, w, h):
        """Return positional embeddings matching the token count of ``x``.

        Args:
            x: token tensor of shape (B, tokens, dim), including the [CLS]
                token when the model has one.
            w, h: the two spatial sizes of the image the tokens came from.

        Returns:
            ``self.pos_embed`` unchanged when the patch count already matches
            and ``w == h``; otherwise a (1, tokens, dim) tensor whose patch
            part is a bicubic resampling of the learned grid.
        """
        # Number of leading non-patch tokens (the [CLS] slot, if any).
        n_extra = 0 if self.cls_token is None else 1
        npatch = x.shape[1] - n_extra
        N = self.pos_embed.shape[1] - n_extra
        if npatch == N and w == h:
            return self.pos_embed

        patch_pos_embed = self.pos_embed[:, n_extra:]
        dim = x.shape[-1]
        w0 = w // self.patch_size
        h0 = h // self.patch_size
        # we add a small number to avoid floating point error in the interpolation
        # see discussion at https://github.com/facebookresearch/dino/issues/8
        w0, h0 = w0 + 0.1, h0 + 0.1
        side = int(math.sqrt(N))  # the learned grid is assumed square
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed.reshape(1, side, side, dim).permute(0, 3, 1, 2),
            scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)),
            mode='bicubic',
        )
        assert int(w0) == patch_pos_embed.shape[-2] and int(
            h0) == patch_pos_embed.shape[-1]
        # Back to a single (1, tokens, dim) sequence. The leading dim must be 1
        # so it can be concatenated with the class position and broadcast over
        # the batch.
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).reshape(
            1, -1, dim)

        if n_extra:
            class_pos_embed = self.pos_embed[:, 0]
            return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed),
                             dim=1)
        return patch_pos_embed

    def prepare_tokens(self, x):
        """Patch-embed an image batch, prepend [CLS] if present, add positions."""
        B, nc, w, h = x.shape
        x = self.patch_embed(x)  # patch linear embedding

        # add the [CLS] token to the embed patch tokens (only if the model has one)
        if self.cls_token is not None:
            cls_tokens = self.cls_token.expand(B, -1, -1)
            x = torch.cat((cls_tokens, x), dim=1)

        # add positional encoding to each token
        x = x + self.interpolate_pos_encoding(x, w, h)

        return self.pos_drop(x)

    def forward(self, x):
        """Run all blocks and return the normalized patch tokens (no [CLS])."""
        x = self.prepare_tokens(x)
        for blk in self.blocks:
            x = blk(x)
        x = self.norm(x)
        if self.cls_token is not None:
            return x[:, 1:]  # return spatial feature maps, not the [CLS] token
        return x

    def get_last_selfattention(self, x):
        """Return what the last block yields with ``return_attention=True``.

        Returns ``None`` when the model has no blocks.
        """
        x = self.prepare_tokens(x)
        last = len(self.blocks) - 1
        for i, blk in enumerate(self.blocks):
            if i < last:
                x = blk(x)
            else:
                # return attention of the last block
                return blk(x, return_attention=True)

    def get_intermediate_layers(self, x, n=1):
        """Return the normalized outputs of the last ``n`` blocks, in order."""
        x = self.prepare_tokens(x)
        # we return the output tokens from the `n` last blocks
        output = []
        for i, blk in enumerate(self.blocks):
            x = blk(x)
            if len(self.blocks) - i <= n:
                output.append(self.norm(x))
        return output
def vit_tiny(patch_size=16, **kwargs):
    """Build a VisionTransformer with embed_dim=192, depth=12 and 3 heads.

    Extra keyword arguments are forwarded to ``VisionTransformer``.
    """
    eps_layer_norm = partial(nn.LayerNorm, eps=1e-6)
    return VisionTransformer(patch_size=patch_size,
                             embed_dim=192,
                             depth=12,
                             num_heads=3,
                             mlp_ratio=4,
                             qkv_bias=True,
                             norm_layer=eps_layer_norm,
                             **kwargs)
def vit_small(patch_size=16, **kwargs):
    """Build a VisionTransformer with embed_dim=384, depth=12 and 6 heads.

    Extra keyword arguments are forwarded to ``VisionTransformer``.
    """
    eps_layer_norm = partial(nn.LayerNorm, eps=1e-6)
    return VisionTransformer(
        patch_size=patch_size,
        embed_dim=384,
        depth=12,
        num_heads=6,
        mlp_ratio=4,
        qkv_bias=True,
        norm_layer=eps_layer_norm,  # type: ignore
        **kwargs)
def vit_base(patch_size=16, **kwargs):
    """Build a VisionTransformer with embed_dim=768, depth=12 and 12 heads.

    Extra keyword arguments are forwarded to ``VisionTransformer``.
    """
    eps_layer_norm = partial(nn.LayerNorm, eps=1e-6)
    return VisionTransformer(patch_size=patch_size,
                             embed_dim=768,
                             depth=12,
                             num_heads=12,
                             mlp_ratio=4,
                             qkv_bias=True,
                             norm_layer=eps_layer_norm,
                             **kwargs)
# Short aliases: `vits` and `vitb` are the same function objects as
# `vit_small` and `vit_base`.
vits = vit_small
vitb = vit_base