# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.
# References:
# https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
from collections import OrderedDict

import torch
import torch.nn as nn

from .dinov2.hub.backbones import (
    dinov2_vitb14,
    dinov2_vitg14,
    dinov2_vitl14,
    dinov2_vits14,
)
class FrozenDinoV2ImageEmbedder(nn.Module):
    """
    Wraps the dinov2 image encoder with camera modulation.
    Not actually frozen; to freeze it, set cond_stage_trainable=False in the config.
    """

    # Supported backbone versions and their builders.
    ARCHS = {
        'dinov2_vits14': dinov2_vits14,
        'dinov2_vitb14': dinov2_vitb14,
        'dinov2_vitl14': dinov2_vitl14,
        'dinov2_vitg14': dinov2_vitg14,
    }

    def __init__(
        self,
        version='dinov2_vitb14',
        ckpt_path=None,
        lrm_mode='plain_lrm',
    ):
        super().__init__()
        self.lrm_mode = lrm_mode
        assert version in self.ARCHS, f'Unsupported dinov2 version: {version}'
        self.model = self.ARCHS[version](pretrained=False)
        if ckpt_path is not None:
            self.load_pretrained(ckpt_path)
        else:
            print('No pretrained weights loaded for dinov2 encoder ...')
    def load_pretrained(self, ckpt_path):
        print('Loading dinov2 encoder ...')
        orig_state_dict = torch.load(ckpt_path, map_location='cpu')
        try:
            # Plain backbone checkpoint: keys match the model directly.
            ret = self.model.load_state_dict(orig_state_dict, strict=False)
            print(ret)
            print('Successfully loaded orig state dict')
        except Exception:
            # Full training checkpoint: weights live under 'state_dict' with an
            # 'img_encoder.model.' prefix that must be stripped before loading.
            new_state_dict = OrderedDict()
            for k, v in orig_state_dict['state_dict'].items():
                if 'img_encoder' in k:
                    new_state_dict[k.replace('img_encoder.model.', '')] = v
            ret = self.model.load_state_dict(new_state_dict, strict=False)
            print(ret)
            print('Successfully loaded new state dict')
    def forward(self, x, *args, **kwargs):
        ret = self.model.forward_features_with_camera(x, *args, **kwargs)
        # Prepend the class token to the patch tokens: (B, 1 + N_patches, C).
        output = torch.cat([ret['x_norm_clstoken'].unsqueeze(1), ret['x_norm_patchtokens']], dim=1)
        return output
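
# Minimal usage sketch, not part of the original file. It assumes that
# forward_features_with_camera can run without extra camera arguments; in
# practice the camera conditioning is passed through *args/**kwargs, so check
# the vendored dinov2 backbone for the exact signature.
if __name__ == '__main__':
    embedder = FrozenDinoV2ImageEmbedder(version='dinov2_vitb14')
    images = torch.randn(2, 3, 224, 224)  # batch of two 224x224 RGB images
    tokens = embedder(images)
    # For ViT-B/14 at 224x224: 1 class token + 16 * 16 patch tokens, 768 dims each.
    print(tokens.shape)  # expected: torch.Size([2, 257, 768])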