# NOTE: "Spaces: Running on Zero" -- this appears to be a hosting-page banner captured during extraction; it is not part of the module.
# Copyright (C) 2022-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
# --------------------------------------------------------
# DPT head for ViTs
# --------------------------------------------------------
# References:
# https://github.com/isl-org/DPT
# https://github.com/EPFL-VILAB/MultiMAE/blob/main/multimae/output_adapters.py
import torch | |
import torch.nn as nn | |
import torch.nn.functional as F | |
from einops import rearrange, repeat | |
from typing import Union, Tuple, Iterable, List, Optional, Dict | |
from einops.layers.torch import Reduce, Rearrange | |
from pdb import set_trace as st | |
def pair(t):
    """Return ``t`` as a pair.

    Args:
        t: A single value, or a tuple/list that already holds the pair.

    Returns:
        ``t`` itself when it is a tuple, ``tuple(t)`` when it is a list,
        otherwise ``(t, t)``.
    """
    if isinstance(t, tuple):
        return t
    if isinstance(t, list):
        # Values loaded from YAML/JSON configs arrive as lists; duplicating
        # the list into ([h, w], [h, w]) would break later integer arithmetic.
        return tuple(t)
    return (t, t)
def make_scratch(in_shape, out_shape, groups=1, expand=False):
    """Create the container holding the four 3x3 projection convolutions.

    Args:
        in_shape: Indexable whose entries 0..3 give the input channel count
            of each stage.
        out_shape (int): Base number of output channels.
        groups (int): ``groups`` argument forwarded to every convolution.
        expand (bool): When equal to ``True`` the stages output
            ``out_shape * (1, 2, 4, 8)`` channels; otherwise every stage
            outputs ``out_shape`` channels.

    Returns:
        nn.Module: A bare module exposing ``layer1_rn`` .. ``layer4_rn`` and
        a ``layer_rn`` ModuleList that references the same four convolutions.
    """
    # Equality with True (not plain truthiness) decides the channel widths.
    if expand == True:
        multipliers = (1, 2, 4, 8)
    else:
        multipliers = (1, 1, 1, 1)

    scratch = nn.Module()
    projections = []
    for stage, factor in enumerate(multipliers):
        conv = nn.Conv2d(
            in_shape[stage],
            out_shape * factor,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=False,
            groups=groups,
        )
        # Stages are registered in order under their 1-based names.
        setattr(scratch, f"layer{stage + 1}_rn", conv)
        projections.append(conv)

    # The same conv objects are also reachable by index.
    scratch.layer_rn = nn.ModuleList(projections)
    return scratch
class ResidualConvUnit_custom(nn.Module):
    """Residual convolution module.

    Applies ``activation -> conv1 -> [bn1] -> activation -> conv2 -> [bn2]``
    and adds the result to the input.
    """

    def __init__(self, features, activation, bn):
        """Init.

        Args:
            features (int): number of input and output channels of both
                3x3 convolutions
            activation (callable): activation applied before each convolution
            bn (bool): if truthy, a BatchNorm2d follows each convolution and
                the convolutions are created without bias
        """
        super().__init__()
        self.bn = bn
        self.groups = 1
        self.conv1 = nn.Conv2d(
            features,
            features,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=not self.bn,
            groups=self.groups,
        )
        self.conv2 = nn.Conv2d(
            features,
            features,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=not self.bn,
            groups=self.groups,
        )
        # Use the same truthiness test as ``bias=not self.bn`` above so that a
        # convolution never ends up with neither a bias nor a batch norm.
        if self.bn:
            self.bn1 = nn.BatchNorm2d(features)
            self.bn2 = nn.BatchNorm2d(features)
        self.activation = activation
        self.skip_add = nn.quantized.FloatFunctional()

    def forward(self, x):
        """Forward pass.

        Args:
            x (tensor): input

        Returns:
            tensor: ``x`` plus the output of the two-convolution branch
        """
        out = self.activation(x)
        out = self.conv1(out)
        if self.bn:
            out = self.bn1(out)
        out = self.activation(out)
        out = self.conv2(out)
        if self.bn:
            out = self.bn2(out)
        # The former ``groups > 1`` branch called ``self.conv_merge``, which
        # this class never defines; ``groups`` is fixed to 1, so it was removed.
        return self.skip_add.add(out, x)
class FeatureFusionBlock_custom(nn.Module):
    """Feature fusion block.

    Optionally adds a refined skip feature to the incoming path, refines the
    sum, upsamples it and applies a 1x1 output convolution.
    """

    def __init__(
        self,
        features,
        activation,
        deconv=False,
        bn=False,
        expand=False,
        align_corners=True,
        width_ratio=1,
    ):
        """Init.

        Args:
            features (int): number of input channels
            activation (callable): activation handed to both residual units
            deconv (bool): stored on the instance; not read by this class
            bn (bool): forwarded to both residual units
            expand (bool): when equal to ``True`` the output convolution
                halves the channel count
            align_corners (bool): used by the default 2x upsampling only
            width_ratio (number): any value other than 1 switches to the
                explicit-size resizing path in ``forward``
        """
        super().__init__()
        self.width_ratio = width_ratio
        self.deconv = deconv
        self.align_corners = align_corners
        self.groups = 1
        self.expand = expand

        # Equality with True (not plain truthiness) selects the halved width.
        out_features = features // 2 if self.expand == True else features

        self.out_conv = nn.Conv2d(
            features,
            out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
            groups=1,
        )
        self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
        self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)
        self.skip_add = nn.quantized.FloatFunctional()

    def forward(self, *xs):
        """Forward pass.

        Args:
            *xs: ``xs[0]`` is the incoming path; when exactly two tensors are
                given, ``xs[1]`` is the skip feature fused into it.

        Returns:
            tensor: output
        """
        fused = xs[0]
        custom_ratio = self.width_ratio != 1

        if len(xs) == 2:
            skip = self.resConfUnit1(xs[1])
            if custom_ratio:
                # Bring the skip feature to the spatial size of the path.
                skip = F.interpolate(skip,
                                     size=(fused.shape[2], fused.shape[3]),
                                     mode='bilinear')
            fused = self.skip_add.add(fused, skip)

        fused = self.resConfUnit2(fused)

        if not custom_ratio:
            # Default path: plain 2x bilinear upsampling.
            fused = F.interpolate(fused,
                                  scale_factor=2,
                                  mode="bilinear",
                                  align_corners=self.align_corners)
        else:
            height, width = fused.shape[2], fused.shape[3]
            # Height always doubles; width either triples or is set from the
            # doubled height times width_ratio, depending on current aspect.
            if (width / height) < (2 / 3) * self.width_ratio:
                new_width = 3 * width
            else:
                new_width = int(self.width_ratio * 2 * height)
            fused = F.interpolate(fused,
                                  size=(2 * height, new_width),
                                  mode='bilinear')

        return self.out_conv(fused)
def make_fusion_block(features, use_bn, width_ratio=1):
    """Create a ``FeatureFusionBlock_custom`` with this file's fixed settings.

    Args:
        features (int): channel count of the block
        use_bn (bool): forwarded as the block's ``bn`` argument
        width_ratio (number): forwarded as the block's ``width_ratio``

    Returns:
        FeatureFusionBlock_custom: block using a non-inplace ReLU, no
        deconv, no expansion and ``align_corners=True``.
    """
    fixed_options = dict(
        deconv=False,
        bn=use_bn,
        expand=False,
        align_corners=True,
        width_ratio=width_ratio,
    )
    relu = nn.ReLU(False)
    return FeatureFusionBlock_custom(features, relu, **fixed_options)
class Interpolate(nn.Module):
    """Module wrapper around ``nn.functional.interpolate``."""

    def __init__(self, scale_factor, mode, align_corners=False):
        """Init.

        Args:
            scale_factor (float): scaling
            mode (str): interpolation mode
            align_corners (bool): forwarded unchanged to the interpolation call
        """
        super().__init__()
        self.interp = nn.functional.interpolate
        self.scale_factor = scale_factor
        self.mode = mode
        self.align_corners = align_corners

    def forward(self, x):
        """Forward pass.

        Args:
            x (tensor): input

        Returns:
            tensor: interpolated data
        """
        # Settings are read at call time, so later attribute changes apply.
        options = {
            "scale_factor": self.scale_factor,
            "mode": self.mode,
            "align_corners": self.align_corners,
        }
        return self.interp(x, **options)
class DPTOutputAdapter(nn.Module):
    """DPT output adapter.

    :param num_channels: Number of output channels
    :param stride_level: Stride level compared to the full-sized image.
        E.g. 4 for 1/4th the size of the image.
    :param patch_size: Int or tuple of the patch size over the full image size.
        Patch size for smaller inputs will be computed accordingly.
    :param main_tasks: Tasks feeding this adapter; only their count is used,
        as a multiplier on the encoder token dimension.
    :param hooks: Index of intermediate layers
    :param layer_dims: Dimension of intermediate layers
    :param feature_dim: Feature dimension
    :param last_dim: out_channels/in_channels for the last two convolutions
        of the regression-style heads
    :param use_bn: If set to True, activates batch norm
    :param dim_tokens_enc: Dimension of tokens coming from encoder. When not
        None, ``init`` is called from the constructor.
    :param head_type: One of 'regression', 'regression_voxel',
        'regression_gs' or 'semseg'
    :param output_width_ratio: Passed to every fusion block as ``width_ratio``
    """

    def __init__(
            self,
            num_channels: int = 1,
            stride_level: int = 1,
            patch_size: Union[int, Tuple[int, int]] = 32,
            main_tasks: Iterable[str] = ('rgb', ),
            hooks: List[int] = [2, 5, 8, 11],
            layer_dims: List[int] = [96, 192, 384, 768],
            feature_dim: int = 256,
            last_dim: int = 32,
            use_bn: bool = False,
            dim_tokens_enc: Optional[int] = None,
            head_type: str = 'regression',
            output_width_ratio=1,
            **kwargs):
        super().__init__()
        self.num_channels = num_channels
        self.stride_level = stride_level
        self.patch_size = pair(patch_size)
        self.main_tasks = main_tasks
        # Copy so that instances never share (or expose) the default lists.
        self.hooks = list(hooks)
        self.layer_dims = list(layer_dims)
        self.feature_dim = feature_dim
        # Until init() runs, this only serves as an "is configured" marker.
        self.dim_tokens_enc = dim_tokens_enc * len(
            self.main_tasks) if dim_tokens_enc is not None else None
        self.head_type = head_type

        # Actual patch height and width, taking into account stride of input
        self.P_H = max(1, self.patch_size[0] // stride_level)
        self.P_W = max(1, self.patch_size[1] // stride_level)

        self.scratch = make_scratch(layer_dims,
                                    feature_dim,
                                    groups=1,
                                    expand=False)

        self.scratch.refinenet1 = make_fusion_block(feature_dim, use_bn,
                                                    output_width_ratio)
        self.scratch.refinenet2 = make_fusion_block(feature_dim, use_bn,
                                                    output_width_ratio)
        self.scratch.refinenet3 = make_fusion_block(feature_dim, use_bn,
                                                    output_width_ratio)
        self.scratch.refinenet4 = make_fusion_block(feature_dim, use_bn,
                                                    output_width_ratio)

        self.head = self._make_head(feature_dim, last_dim, use_bn)

        if self.dim_tokens_enc is not None:
            self.init(dim_tokens_enc=dim_tokens_enc)

    def _make_head(self, feature_dim, last_dim, use_bn):
        """Build the output head selected by ``self.head_type``.

        :raises ValueError: if ``self.head_type`` is not a supported value
        """
        if self.head_type == 'regression':
            # The "DPTDepthModel" head
            return nn.Sequential(
                nn.Conv2d(feature_dim,
                          feature_dim // 2,
                          kernel_size=3,
                          stride=1,
                          padding=1),
                Interpolate(scale_factor=2,
                            mode="bilinear",
                            align_corners=True),
                nn.Conv2d(feature_dim // 2,
                          last_dim,
                          kernel_size=3,
                          stride=1,
                          padding=1),
                nn.ReLU(True),
                nn.Conv2d(last_dim,
                          self.num_channels,
                          kernel_size=1,
                          stride=1,
                          padding=0))

        if self.head_type == 'regression_voxel':
            downscale_factor = 8  # from 128 -> 32
            c_3d = 32 * 4  # hard coded
            # NOTE(review): PixelUnshuffle multiplies channels by
            # downscale_factor**2 and the Rearrange below expects c_3d * 32
            # channels, so this appears to require feature_dim // 2 == 64;
            # confirm against the configs that use this head.
            return nn.Sequential(
                nn.Conv2d(feature_dim,
                          feature_dim // 2,
                          kernel_size=3,
                          stride=1,
                          padding=1),
                nn.PixelUnshuffle(downscale_factor=downscale_factor),
                # Split the channel axis into (c2, z) to obtain a 5-D volume.
                Rearrange('b (c2 z) h w -> b c2 z h w', z=32, c2=c_3d),
                nn.ReLU(True),
                nn.Conv3d(c_3d,
                          last_dim,
                          kernel_size=3,
                          stride=1,
                          padding=1),
                nn.ReLU(True),
                nn.Conv3d(last_dim,
                          self.num_channels,
                          kernel_size=1,
                          stride=1,
                          padding=0))

        if self.head_type == 'regression_gs':  # avoid upsampling here
            return nn.Sequential(
                nn.Conv2d(feature_dim,
                          feature_dim // 2,
                          kernel_size=3,
                          stride=1,
                          padding=1),
                nn.ReLU(True),
                nn.Conv2d(feature_dim // 2,
                          last_dim,
                          kernel_size=3,
                          stride=1,
                          padding=1),
                nn.ReLU(True),
                nn.Conv2d(last_dim,
                          self.num_channels,
                          kernel_size=1,
                          stride=1,
                          padding=0))

        if self.head_type == 'semseg':
            # The "DPTSegmentationModel" head
            return nn.Sequential(
                nn.Conv2d(feature_dim,
                          feature_dim,
                          kernel_size=3,
                          padding=1,
                          bias=False),
                nn.BatchNorm2d(feature_dim) if use_bn else nn.Identity(),
                nn.ReLU(True),
                nn.Dropout(0.1, False),
                nn.Conv2d(feature_dim, self.num_channels, kernel_size=1),
                Interpolate(scale_factor=2,
                            mode="bilinear",
                            align_corners=True),
            )

        raise ValueError(
            'DPT head_type must be "regression", "regression_voxel", '
            f'"regression_gs" or "semseg", got {self.head_type!r}.')

    def init(self, dim_tokens_enc=768):
        """
        Initialize parts of decoder that are dependent on dimension of encoder tokens.
        Should be called when setting up MultiMAE.

        :param dim_tokens_enc: Dimension of tokens coming from encoder. An int
            is used for all four hooked layers; otherwise entries 0..3 give
            the dimension of each hooked layer.
        """
        if isinstance(dim_tokens_enc, int):
            dim_tokens_enc = 4 * [dim_tokens_enc]

        self.dim_tokens_enc = [
            dt * len(self.main_tasks) for dt in dim_tokens_enc
        ]

        # Set up activation postprocessing layers
        # Stage 1: 1x1 projection, then transposed conv with stride 4.
        self.act_1_postprocess = nn.Sequential(
            nn.Conv2d(
                in_channels=self.dim_tokens_enc[0],
                out_channels=self.layer_dims[0],
                kernel_size=1,
                stride=1,
                padding=0,
            ),
            nn.ConvTranspose2d(
                in_channels=self.layer_dims[0],
                out_channels=self.layer_dims[0],
                kernel_size=4,
                stride=4,
                padding=0,
                bias=True,
                dilation=1,
                groups=1,
            ))

        # Stage 2: 1x1 projection, then transposed conv with stride 2.
        self.act_2_postprocess = nn.Sequential(
            nn.Conv2d(
                in_channels=self.dim_tokens_enc[1],
                out_channels=self.layer_dims[1],
                kernel_size=1,
                stride=1,
                padding=0,
            ),
            nn.ConvTranspose2d(
                in_channels=self.layer_dims[1],
                out_channels=self.layer_dims[1],
                kernel_size=2,
                stride=2,
                padding=0,
                bias=True,
                dilation=1,
                groups=1,
            ))

        # Stage 3: 1x1 projection only.
        self.act_3_postprocess = nn.Sequential(
            nn.Conv2d(
                in_channels=self.dim_tokens_enc[2],
                out_channels=self.layer_dims[2],
                kernel_size=1,
                stride=1,
                padding=0,
            ))

        # Stage 4: 1x1 projection, then a 3x3 conv with stride 2.
        self.act_4_postprocess = nn.Sequential(
            nn.Conv2d(
                in_channels=self.dim_tokens_enc[3],
                out_channels=self.layer_dims[3],
                kernel_size=1,
                stride=1,
                padding=0,
            ),
            nn.Conv2d(
                in_channels=self.layer_dims[3],
                out_channels=self.layer_dims[3],
                kernel_size=3,
                stride=2,
                padding=1,
            ))

        self.act_postprocess = nn.ModuleList([
            self.act_1_postprocess, self.act_2_postprocess,
            self.act_3_postprocess, self.act_4_postprocess
        ])

    def adapt_tokens(self, encoder_tokens):
        """Return the tokens of one hooked layer.

        All tokens are kept: the input is sliced with ``[:, :]`` and passed
        through ``torch.cat`` as a single-element list.
        """
        x = []
        x.append(encoder_tokens[:, :])
        x = torch.cat(x, dim=-1)
        return x

    def forward(self, encoder_tokens: List[torch.Tensor], image_size):
        """Decode hooked encoder tokens into the head's output.

        :param encoder_tokens: Per-layer token tensors, indexed by
            ``self.hooks``; each is reshaped with the pattern
            ``'b (nh nw) c -> b c nh nw'``.
        :param image_size: ``(H, W)`` used to derive the patch grid size
        :return: Output of ``self.head`` applied to the fused features
        """
        assert self.dim_tokens_enc is not None, 'Need to call init(dim_tokens_enc) function first'
        H, W = image_size

        # Number of patches in height and width
        N_H = H // (self.stride_level * self.P_H)
        N_W = W // (self.stride_level * self.P_W)

        # Hook decoder onto 4 layers from specified ViT layers
        layers = [encoder_tokens[hook] for hook in self.hooks]

        # Extract only task-relevant tokens and ignore global tokens.
        layers = [self.adapt_tokens(tokens) for tokens in layers]

        # Reshape tokens to spatial representation
        layers = [
            rearrange(tokens, 'b (nh nw) c -> b c nh nw', nh=N_H, nw=N_W)
            for tokens in layers
        ]

        layers = [
            self.act_postprocess[idx](feat) for idx, feat in enumerate(layers)
        ]

        # Project layers to chosen feature dim
        layers = [
            self.scratch.layer_rn[idx](feat) for idx, feat in enumerate(layers)
        ]

        # Fuse layers using refinement stages
        path_4 = self.scratch.refinenet4(layers[3])
        path_3 = self.scratch.refinenet3(path_4, layers[2])
        path_2 = self.scratch.refinenet2(path_3, layers[1])
        path_1 = self.scratch.refinenet1(path_2, layers[0])

        # Output head
        out = self.head(path_1)
        return out