# MIT License

# Copyright (c) 2022 Intelligent Systems Lab Org

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# File author: Shariq Farooq Bhat

import torch
import torch.nn as nn
import numpy as np
from torchvision.transforms import Normalize
# from zoedepth.models.base_models.dpt_dinov2.dpt import DPT_DINOv2
from depth_anything.dpt import DPT_DINOv2


def denormalize(x):
    """Reverses the imagenet normalization applied to the input.

    Args:
        x (torch.Tensor - shape(N,3,H,W)): input tensor

    Returns:
        torch.Tensor - shape(N,3,H,W): Denormalized input
    """
    mean = torch.Tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1).to(x.device)
    std = torch.Tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1).to(x.device)
    return x * std + mean


def get_activation(name, bank):
    """Build a forward hook that stores a module's output in ``bank[name]``.

    Args:
        name (str): Key under which the output is stored.
        bank (dict): Mutable mapping that receives the output on every
            forward pass of the hooked module.

    Returns:
        callable: Hook with the ``(module, input, output)`` signature expected
        by ``nn.Module.register_forward_hook``.
    """
    def hook(_module, _inputs, output):
        bank[name] = output
    return hook


class Resize(object):
    """Resize sample to given size (width, height).
    """

    def __init__(
        self,
        width,
        height,
        resize_target=True,
        keep_aspect_ratio=False,
        ensure_multiple_of=1,
        resize_method="lower_bound",
    ):
        """Init.

        Args:
            width (int): desired output width
            height (int): desired output height
            resize_target (bool, optional):
                True: Resize the full sample (image, mask, target).
                False: Resize image only.
                Defaults to True.
                NOTE: accepted for interface compatibility only; this class
                neither stores nor reads it.
            keep_aspect_ratio (bool, optional):
                True: Keep the aspect ratio of the input sample.
                Output sample might not have the given width and height, and
                resize behaviour depends on the parameter 'resize_method'.
                Defaults to False.
            ensure_multiple_of (int, optional):
                Output width and height is constrained to be multiple of this parameter.
                Defaults to 1.
            resize_method (str, optional):
                "lower_bound": Output will be at least as large as the given size.
                "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
                "minimal": Scale as least as possible.  (Output size might be smaller than given size.)
                Defaults to "lower_bound".
        """
        self.__width = width
        self.__height = height

        self.__keep_aspect_ratio = keep_aspect_ratio
        self.__multiple_of = ensure_multiple_of
        self.__resize_method = resize_method

    def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
        """Snap ``x`` to a multiple of ``ensure_multiple_of``.

        ``x`` is rounded to the nearest multiple first. If that exceeds
        ``max_val`` the multiple is recomputed by rounding down; if the result
        is below ``min_val`` it is recomputed by rounding up.

        Args:
            x (float): Value to constrain.
            min_val (int, optional): Lower bound. Defaults to 0.
            max_val (int, optional): Upper bound; ``None`` disables the check.

        Returns:
            numpy integer: The constrained size.
        """
        y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)

        if max_val is not None and y > max_val:
            y = (np.floor(x / self.__multiple_of)
                 * self.__multiple_of).astype(int)

        if y < min_val:
            y = (np.ceil(x / self.__multiple_of)
                 * self.__multiple_of).astype(int)

        return y

    def get_size(self, width, height):
        """Compute the output size for an input of the given size.

        Args:
            width (int): Input width.
            height (int): Input height.

        Returns:
            tuple: ``(new_width, new_height)``.

        Raises:
            ValueError: If ``resize_method`` is not one of "lower_bound",
                "upper_bound" or "minimal".
        """
        # determine new height and width
        scale_height = self.__height / height
        scale_width = self.__width / width

        if self.__keep_aspect_ratio:
            # Use a single scale for both axes; which one depends on the method.
            if self.__resize_method == "lower_bound":
                # scale such that output size is lower bound
                if scale_width > scale_height:
                    # fit width
                    scale_height = scale_width
                else:
                    # fit height
                    scale_width = scale_height
            elif self.__resize_method == "upper_bound":
                # scale such that output size is upper bound
                if scale_width < scale_height:
                    # fit width
                    scale_height = scale_width
                else:
                    # fit height
                    scale_width = scale_height
            elif self.__resize_method == "minimal":
                # scale as little as possible
                if abs(1 - scale_width) < abs(1 - scale_height):
                    # fit width
                    scale_height = scale_width
                else:
                    # fit height
                    scale_width = scale_height
            else:
                raise ValueError(
                    f"resize_method {self.__resize_method} not implemented"
                )

        if self.__resize_method == "lower_bound":
            new_height = self.constrain_to_multiple_of(
                scale_height * height, min_val=self.__height
            )
            new_width = self.constrain_to_multiple_of(
                scale_width * width, min_val=self.__width
            )
        elif self.__resize_method == "upper_bound":
            new_height = self.constrain_to_multiple_of(
                scale_height * height, max_val=self.__height
            )
            new_width = self.constrain_to_multiple_of(
                scale_width * width, max_val=self.__width
            )
        elif self.__resize_method == "minimal":
            new_height = self.constrain_to_multiple_of(scale_height * height)
            new_width = self.constrain_to_multiple_of(scale_width * width)
        else:
            raise ValueError(
                f"resize_method {self.__resize_method} not implemented")

        return (new_width, new_height)

    def __call__(self, x):
        """Bilinearly resize ``x`` to the size given by :meth:`get_size`.

        The last two dimensions of ``x`` are read as (height, width).
        """
        width, height = self.get_size(*x.shape[-2:][::-1])
        return nn.functional.interpolate(
            x, (int(height), int(width)), mode='bilinear', align_corners=True)


class PrepForMidas(object):
    """Input preparation: optional resize followed by ImageNet normalization."""

    def __init__(self, resize_mode="minimal", keep_aspect_ratio=True, img_size=384, do_resize=True):
        """Init.

        Args:
            resize_mode (str, optional): ``resize_method`` forwarded to
                :class:`Resize`. Defaults to "minimal".
            keep_aspect_ratio (bool, optional): Forwarded to :class:`Resize`.
                Defaults to True.
            img_size (int or (H, W) pair, optional): Target resolution; an int
                is used for both height and width. Defaults to 384.
            do_resize (bool, optional): When False the resize step is replaced
                by ``nn.Identity``. Defaults to True.
        """
        if isinstance(img_size, int):
            img_size = (img_size, img_size)
        net_h, net_w = img_size
        self.normalization = Normalize(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        # NOTE(review): the multiple of 14 presumably matches the ViT patch
        # size of the "*14" checkpoints loaded in DepthAnythingCore.build;
        # confirm against the backbone.
        self.resizer = Resize(net_w, net_h, keep_aspect_ratio=keep_aspect_ratio,
                              ensure_multiple_of=14, resize_method=resize_mode) \
            if do_resize else nn.Identity()

    def __call__(self, x):
        """Resize (if enabled) and then normalize ``x``."""
        return self.normalization(self.resizer(x))


class DepthAnythingCore(nn.Module):
    def __init__(self, midas, trainable=False, fetch_features=True, layer_names=('out_conv', 'l4_rn', 'r4', 'r3', 'r2', 'r1'), freeze_bn=False, keep_aspect_ratio=True,
                 img_size=384, core_type='vits', **kwargs):
        """Midas Base model used for multi-scale feature extraction.

        Args:
            midas (torch.nn.Module): Midas model.
            trainable (bool, optional): Train midas model. Defaults to False.
            fetch_features (bool, optional): Extract multi-scale features. Defaults to True.
            layer_names (tuple, optional): Layers used for feature extraction. Order = (head output features, last layer features, ...decoder features). Defaults to ('out_conv', 'l4_rn', 'r4', 'r3', 'r2', 'r1').
            freeze_bn (bool, optional): Freeze BatchNorm. Generally results in better finetuning performance. Defaults to False.
            keep_aspect_ratio (bool, optional): Keep the aspect ratio of input images while resizing. Defaults to True.
            img_size (int, tuple, optional): Input resolution. Defaults to 384.
            core_type (str, optional): Backbone variant key read by
                ``set_output_channels`` ('vits', 'vitb' or 'vitl'). Defaults to 'vits'.
            **kwargs: Only ``do_resize`` (default True) is read; it is
                forwarded to ``PrepForMidas``. Other keys are ignored.
        """
        super().__init__()
        self.core_type = core_type
        self.core = midas
        self.output_channels = None
        # Filled by the forward hooks: layer name -> latest output.
        self.core_out = {}
        self.trainable = trainable
        self.fetch_features = fetch_features
        # midas.scratch.output_conv = nn.Identity()
        self.handles = []
        # self.layer_names = ['out_conv','l4_rn', 'r4', 'r3', 'r2', 'r1']
        self.layer_names = layer_names

        self.set_trainable(trainable)
        self.set_fetch_features(fetch_features)

        self.prep = PrepForMidas(keep_aspect_ratio=keep_aspect_ratio,
                                 img_size=img_size, do_resize=kwargs.get('do_resize', True))

        if freeze_bn:
            self.freeze_bn()

    def set_trainable(self, trainable):
        """Freeze or unfreeze all parameters; returns ``self``."""
        self.trainable = trainable
        if trainable:
            self.unfreeze()
        else:
            self.freeze()
        return self

    def set_fetch_features(self, fetch_features):
        """Enable or disable feature extraction through forward hooks.

        Enabling attaches the hooks if none are registered; disabling removes
        them. Returns ``self``.
        """
        self.fetch_features = fetch_features
        if fetch_features:
            if len(self.handles) == 0:
                self.attach_hooks(self.core)
        else:
            self.remove_hooks()
        return self

    def freeze(self):
        """Set ``requires_grad = False`` on every parameter; returns ``self``."""
        for p in self.parameters():
            p.requires_grad = False
        self.trainable = False
        return self

    def unfreeze(self):
        """Set ``requires_grad = True`` on every parameter; returns ``self``."""
        for p in self.parameters():
            p.requires_grad = True
        self.trainable = True
        return self

    def freeze_bn(self):
        """Put every ``nn.BatchNorm2d`` submodule in eval mode; returns ``self``."""
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()
        return self

    def forward(self, x, denorm=False, return_rel_depth=False):
        """Run the core model and optionally return the hooked features.

        Args:
            x (torch.Tensor): Input batch.
            denorm (bool, optional): If True, ``denormalize`` is applied to
                ``x`` before the preparation step. Defaults to False.
            return_rel_depth (bool, optional): If True (and features are
                fetched) also return the core output. Defaults to False.

        Returns:
            The core output when ``fetch_features`` is False. Otherwise the
            list of hooked features ordered like ``layer_names``, preceded by
            the core output when ``return_rel_depth`` is True.
        """
        # Input preparation never needs gradients.
        with torch.no_grad():
            if denorm:
                x = denormalize(x)
            x = self.prep(x)

        with torch.set_grad_enabled(self.trainable):
            rel_depth = self.core(x)
            if not self.fetch_features:
                return rel_depth
        out = [self.core_out[k] for k in self.layer_names]

        if return_rel_depth:
            return rel_depth, out
        return out

    def get_rel_pos_params(self):
        """Yield encoder parameters whose name contains "pos_embed"."""
        for name, p in self.core.pretrained.named_parameters():
            if "pos_embed" in name:
                yield p

    def get_enc_params_except_rel_pos(self):
        """Yield encoder parameters whose name does not contain "pos_embed"."""
        for name, p in self.core.pretrained.named_parameters():
            if "pos_embed" not in name:
                yield p

    def freeze_encoder(self, freeze_rel_pos=False):
        """Freeze the encoder (``core.pretrained``) parameters.

        Args:
            freeze_rel_pos (bool, optional): If True every encoder parameter
                is frozen; otherwise the "pos_embed" parameters are left
                untouched. Defaults to False.

        Returns:
            DepthAnythingCore: ``self``.
        """
        if freeze_rel_pos:
            for p in self.core.pretrained.parameters():
                p.requires_grad = False
        else:
            for p in self.get_enc_params_except_rel_pos():
                p.requires_grad = False
        return self

    def attach_hooks(self, midas):
        """Register forward hooks for every supported name in ``layer_names``.

        Existing hooks are removed first. Each hook writes the module output
        to ``self.core_out`` under the layer name. Returns ``self``.
        """
        if len(self.handles) > 0:
            self.remove_hooks()

        # Layer name -> how to find the module inside `depth_head.scratch`.
        # Resolved lazily so only the requested layers are accessed.
        targets = {
            "out_conv": lambda scratch: list(scratch.output_conv2.children())[1],
            "r4": lambda scratch: scratch.refinenet4,
            "r3": lambda scratch: scratch.refinenet3,
            "r2": lambda scratch: scratch.refinenet2,
            "r1": lambda scratch: scratch.refinenet1,
            "l4_rn": lambda scratch: scratch.layer4_rn,
        }
        for name, locate in targets.items():
            if name in self.layer_names:
                module = locate(midas.depth_head.scratch)
                self.handles.append(module.register_forward_hook(
                    get_activation(name, self.core_out)))

        return self

    def remove_hooks(self):
        """Remove all registered forward hooks; returns ``self``."""
        for h in self.handles:
            h.remove()
        # Forget the removed handles. Otherwise set_fetch_features(True) would
        # see a non-empty list and skip re-attaching the hooks.
        self.handles.clear()
        return self

    def __del__(self):
        # `handles` is missing if __init__ failed before assigning it.
        if "handles" in self.__dict__:
            self.remove_hooks()

    def set_output_channels(self):
        """Set ``output_channels`` from ``core_type``.

        Unknown core types leave ``output_channels`` unchanged.
        """
        width = {'vits': 64, 'vitb': 128, 'vitl': 256}.get(self.core_type)
        if width is not None:
            self.output_channels = [width] * 5

    @staticmethod
    def build(midas_model_type="dinov2_large", train_midas=False, use_pretrained_midas=True, fetch_features=False, freeze_bn=True, force_keep_ar=False, force_reload=False, **kwargs):
        """Construct a ``DepthAnythingCore`` and load its checkpoint.

        Args:
            midas_model_type (str, optional): One of 'vits', 'vitb', 'vitl'.
                NOTE: the default "dinov2_large" is not one of these and
                raises ``NotImplementedError``.
            train_midas (bool, optional): Forwarded as ``trainable``.
            use_pretrained_midas (bool, optional): Accepted for interface
                compatibility; not read here (weights are always loaded).
            fetch_features (bool, optional): Forwarded to the core.
            freeze_bn (bool, optional): Forwarded to the core.
            force_keep_ar (bool, optional): Forwarded as ``keep_aspect_ratio``.
            force_reload (bool, optional): Accepted for interface
                compatibility; not read here.
            **kwargs: ``img_size`` is parsed with ``parse_img_size`` (default
                [384, 384]); the remaining keys are forwarded to the core.

        Returns:
            DepthAnythingCore: The core with ``output_channels`` set.

        Raises:
            NotImplementedError: If ``midas_model_type`` is not supported.
        """
        if "img_size" in kwargs:
            kwargs = DepthAnythingCore.parse_img_size(kwargs)
        img_size = kwargs.pop("img_size", [384, 384])

        # NOTE(review): the checkpoint paths are absolute and machine-specific.
        variants = {
            'vits': (64, [48, 96, 192, 384],
                     '/ibex/ai/home/liz0l/codes/ZoeDepth/depth_anything_vits14.pth'),
            'vitb': (128, [96, 192, 384, 768],
                     '/ibex/ai/home/liz0l/codes/ZoeDepth/depth_anything_vitb14.pth'),
            'vitl': (256, [256, 512, 1024, 1024],
                     '/ibex/ai/home/liz0l/codes/ZoeDepth/depth_anything_vitl14.pth'),
        }
        if midas_model_type not in variants:
            raise NotImplementedError(
                f"Unsupported midas_model_type {midas_model_type!r}; "
                f"expected one of {sorted(variants)}")

        features, out_channels, checkpoint = variants[midas_model_type]
        depth_anything = DPT_DINOv2(encoder=midas_model_type, features=features,
                                    out_channels=out_channels, use_clstoken=False)
        state_dict = torch.load(checkpoint, map_location='cpu')
        depth_anything.load_state_dict(state_dict)

        kwargs.update({'keep_aspect_ratio': force_keep_ar})

        depth_anything_core = DepthAnythingCore(depth_anything, trainable=train_midas, fetch_features=fetch_features,
                                                freeze_bn=freeze_bn, img_size=img_size, core_type=midas_model_type, **kwargs)

        depth_anything_core.set_output_channels()
        return depth_anything_core

    @staticmethod
    def parse_img_size(config):
        """Normalize ``config['img_size']`` to a two-element list, in place.

        Accepted forms: a comma separated string, a single int (used for both
        entries), or a list/tuple of length 2.

        Args:
            config (dict): Mapping that contains the key ``img_size``.

        Returns:
            dict: The same ``config`` object with ``img_size`` as a list.

        Raises:
            AssertionError: If ``img_size`` is missing or malformed.
        """
        assert 'img_size' in config
        if isinstance(config['img_size'], str):
            assert "," in config['img_size'], "img_size should be a string with comma separated img_size=H,W"
            config['img_size'] = list(map(int, config['img_size'].split(",")))
            assert len(
                config['img_size']) == 2, "img_size should be a string with comma separated img_size=H,W"
        elif isinstance(config['img_size'], int):
            config['img_size'] = [config['img_size'], config['img_size']]
        else:
            assert isinstance(config['img_size'], (list, tuple)) and len(
                config['img_size']) == 2, "img_size should be a list of H,W"
            # Tuples are accepted as well and stored as a list like the other forms.
            config['img_size'] = list(config['img_size'])
        return config


nchannels2models = {
    tuple([256]*5): ["DPT_BEiT_L_384", "DPT_BEiT_L_512", "DPT_BEiT_B_384", "DPT_SwinV2_L_384", "DPT_SwinV2_B_384", "DPT_SwinV2_T_256", "DPT_Large", "DPT_Hybrid"],
    (512, 256, 128, 64, 64): ["MiDaS_small"]
}

# Model name to number of output channels
MIDAS_SETTINGS = {m: k for k, v in nchannels2models.items()
                  for m in v
                  }