Spaces:
Runtime error
Runtime error
File size: 12,553 Bytes
159f437 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 |
# Copyright (c) Facebook, Inc. and its affiliates.
import torch
import torch.nn.functional as F
from torch import nn
import math
from detectron2.modeling import META_ARCH_REGISTRY, build_backbone
from detectron2.structures import Boxes, Instances
from ..utils import load_class_freq, get_fed_loss_inds
from models.backbone import Joiner
from models.deformable_detr import DeformableDETR, SetCriterion, MLP
from models.deformable_detr import _get_clones
from models.matcher import HungarianMatcher
from models.position_encoding import PositionEmbeddingSine
from models.deformable_transformer import DeformableTransformer
from models.segmentation import sigmoid_focal_loss
from util.box_ops import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh
from util.misc import NestedTensor, accuracy
__all__ = ["DeformableDetr"]
class CustomSetCriterion(SetCriterion):
def __init__(self, num_classes, matcher, weight_dict, losses, \
focal_alpha=0.25, use_fed_loss=False):
super().__init__(num_classes, matcher, weight_dict, losses, focal_alpha)
self.use_fed_loss = use_fed_loss
if self.use_fed_loss:
self.register_buffer(
'fed_loss_weight', load_class_freq(freq_weight=0.5))
def loss_labels(self, outputs, targets, indices, num_boxes, log=True):
"""Classification loss (NLL)
targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
"""
assert 'pred_logits' in outputs
src_logits = outputs['pred_logits']
idx = self._get_src_permutation_idx(indices)
target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])
target_classes = torch.full(src_logits.shape[:2], self.num_classes,
dtype=torch.int64, device=src_logits.device)
target_classes[idx] = target_classes_o
target_classes_onehot = torch.zeros(
[src_logits.shape[0], src_logits.shape[1], src_logits.shape[2] + 1],
dtype=src_logits.dtype, layout=src_logits.layout,
device=src_logits.device)
target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1)
target_classes_onehot = target_classes_onehot[:,:,:-1] # B x N x C
if self.use_fed_loss:
inds = get_fed_loss_inds(
gt_classes=target_classes_o,
num_sample_cats=50,
weight=self.fed_loss_weight,
C=target_classes_onehot.shape[2])
loss_ce = sigmoid_focal_loss(
src_logits[:, :, inds],
target_classes_onehot[:, :, inds],
num_boxes,
alpha=self.focal_alpha,
gamma=2) * src_logits.shape[1]
else:
loss_ce = sigmoid_focal_loss(
src_logits, target_classes_onehot, num_boxes,
alpha=self.focal_alpha,
gamma=2) * src_logits.shape[1]
losses = {'loss_ce': loss_ce}
if log:
# TODO this should probably be a separate loss, not hacked in this one here
losses['class_error'] = 100 - accuracy(src_logits[idx], target_classes_o)[0]
return losses
class MaskedBackbone(nn.Module):
""" This is a thin wrapper around D2's backbone to provide padding masking"""
def __init__(self, cfg):
super().__init__()
self.backbone = build_backbone(cfg)
backbone_shape = self.backbone.output_shape()
self.feature_strides = [backbone_shape[f].stride for f in backbone_shape.keys()]
self.strides = [backbone_shape[f].stride for f in backbone_shape.keys()]
self.num_channels = [backbone_shape[x].channels for x in backbone_shape.keys()]
def forward(self, tensor_list: NestedTensor):
xs = self.backbone(tensor_list.tensors)
out = {}
for name, x in xs.items():
m = tensor_list.mask
assert m is not None
mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
out[name] = NestedTensor(x, mask)
return out
@META_ARCH_REGISTRY.register()
class DeformableDetr(nn.Module):
"""
Implement Deformable Detr
"""
def __init__(self, cfg):
super().__init__()
self.with_image_labels = cfg.WITH_IMAGE_LABELS
self.weak_weight = cfg.MODEL.DETR.WEAK_WEIGHT
self.device = torch.device(cfg.MODEL.DEVICE)
self.test_topk = cfg.TEST.DETECTIONS_PER_IMAGE
self.num_classes = cfg.MODEL.DETR.NUM_CLASSES
self.mask_on = cfg.MODEL.MASK_ON
hidden_dim = cfg.MODEL.DETR.HIDDEN_DIM
num_queries = cfg.MODEL.DETR.NUM_OBJECT_QUERIES
# Transformer parameters:
nheads = cfg.MODEL.DETR.NHEADS
dropout = cfg.MODEL.DETR.DROPOUT
dim_feedforward = cfg.MODEL.DETR.DIM_FEEDFORWARD
enc_layers = cfg.MODEL.DETR.ENC_LAYERS
dec_layers = cfg.MODEL.DETR.DEC_LAYERS
num_feature_levels = cfg.MODEL.DETR.NUM_FEATURE_LEVELS
two_stage = cfg.MODEL.DETR.TWO_STAGE
with_box_refine = cfg.MODEL.DETR.WITH_BOX_REFINE
# Loss parameters:
giou_weight = cfg.MODEL.DETR.GIOU_WEIGHT
l1_weight = cfg.MODEL.DETR.L1_WEIGHT
deep_supervision = cfg.MODEL.DETR.DEEP_SUPERVISION
cls_weight = cfg.MODEL.DETR.CLS_WEIGHT
focal_alpha = cfg.MODEL.DETR.FOCAL_ALPHA
N_steps = hidden_dim // 2
d2_backbone = MaskedBackbone(cfg)
backbone = Joiner(d2_backbone, PositionEmbeddingSine(N_steps, normalize=True))
transformer = DeformableTransformer(
d_model=hidden_dim,
nhead=nheads,
num_encoder_layers=enc_layers,
num_decoder_layers=dec_layers,
dim_feedforward=dim_feedforward,
dropout=dropout,
activation="relu",
return_intermediate_dec=True,
num_feature_levels=num_feature_levels,
dec_n_points=4,
enc_n_points=4,
two_stage=two_stage,
two_stage_num_proposals=num_queries)
self.detr = DeformableDETR(
backbone, transformer, num_classes=self.num_classes,
num_queries=num_queries,
num_feature_levels=num_feature_levels,
aux_loss=deep_supervision,
with_box_refine=with_box_refine,
two_stage=two_stage,
)
if self.mask_on:
assert 0, 'Mask is not supported yet :('
matcher = HungarianMatcher(
cost_class=cls_weight, cost_bbox=l1_weight, cost_giou=giou_weight)
weight_dict = {"loss_ce": cls_weight, "loss_bbox": l1_weight}
weight_dict["loss_giou"] = giou_weight
if deep_supervision:
aux_weight_dict = {}
for i in range(dec_layers - 1):
aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
weight_dict.update(aux_weight_dict)
print('weight_dict', weight_dict)
losses = ["labels", "boxes", "cardinality"]
if self.mask_on:
losses += ["masks"]
self.criterion = CustomSetCriterion(
self.num_classes, matcher=matcher, weight_dict=weight_dict,
focal_alpha=focal_alpha,
losses=losses,
use_fed_loss=cfg.MODEL.DETR.USE_FED_LOSS
)
pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
self.normalizer = lambda x: (x - pixel_mean) / pixel_std
def forward(self, batched_inputs):
"""
Args:
Returns:
dict[str: Tensor]:
mapping from a named loss to a tensor storing the loss. Used during training only.
"""
images = self.preprocess_image(batched_inputs)
output = self.detr(images)
if self.training:
gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
targets = self.prepare_targets(gt_instances)
loss_dict = self.criterion(output, targets)
weight_dict = self.criterion.weight_dict
for k in loss_dict.keys():
if k in weight_dict:
loss_dict[k] *= weight_dict[k]
if self.with_image_labels:
if batched_inputs[0]['ann_type'] in ['image', 'captiontag']:
loss_dict['loss_image'] = self.weak_weight * self._weak_loss(
output, batched_inputs)
else:
loss_dict['loss_image'] = images[0].new_zeros(
[1], dtype=torch.float32)[0]
# import pdb; pdb.set_trace()
return loss_dict
else:
image_sizes = output["pred_boxes"].new_tensor(
[(t["height"], t["width"]) for t in batched_inputs])
results = self.post_process(output, image_sizes)
return results
def prepare_targets(self, targets):
new_targets = []
for targets_per_image in targets:
h, w = targets_per_image.image_size
image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float, device=self.device)
gt_classes = targets_per_image.gt_classes
gt_boxes = targets_per_image.gt_boxes.tensor / image_size_xyxy
gt_boxes = box_xyxy_to_cxcywh(gt_boxes)
new_targets.append({"labels": gt_classes, "boxes": gt_boxes})
if self.mask_on and hasattr(targets_per_image, 'gt_masks'):
assert 0, 'Mask is not supported yet :('
gt_masks = targets_per_image.gt_masks
gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w)
new_targets[-1].update({'masks': gt_masks})
return new_targets
def post_process(self, outputs, target_sizes):
"""
"""
out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes']
assert len(out_logits) == len(target_sizes)
assert target_sizes.shape[1] == 2
prob = out_logits.sigmoid()
topk_values, topk_indexes = torch.topk(
prob.view(out_logits.shape[0], -1), self.test_topk, dim=1)
scores = topk_values
topk_boxes = topk_indexes // out_logits.shape[2]
labels = topk_indexes % out_logits.shape[2]
boxes = box_cxcywh_to_xyxy(out_bbox)
boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1,1,4))
# and from relative [0, 1] to absolute [0, height] coordinates
img_h, img_w = target_sizes.unbind(1)
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
boxes = boxes * scale_fct[:, None, :]
results = []
for s, l, b, size in zip(scores, labels, boxes, target_sizes):
r = Instances((size[0], size[1]))
r.pred_boxes = Boxes(b)
r.scores = s
r.pred_classes = l
results.append({'instances': r})
return results
def preprocess_image(self, batched_inputs):
"""
Normalize, pad and batch the input images.
"""
images = [self.normalizer(x["image"].to(self.device)) for x in batched_inputs]
return images
def _weak_loss(self, outputs, batched_inputs):
loss = 0
for b, x in enumerate(batched_inputs):
labels = x['pos_category_ids']
pred_logits = [outputs['pred_logits'][b]]
pred_boxes = [outputs['pred_boxes'][b]]
for xx in outputs['aux_outputs']:
pred_logits.append(xx['pred_logits'][b])
pred_boxes.append(xx['pred_boxes'][b])
pred_logits = torch.stack(pred_logits, dim=0) # L x N x C
pred_boxes = torch.stack(pred_boxes, dim=0) # L x N x 4
for label in labels:
loss += self._max_size_loss(
pred_logits, pred_boxes, label) / len(labels)
loss = loss / len(batched_inputs)
return loss
def _max_size_loss(self, logits, boxes, label):
'''
Inputs:
logits: L x N x C
boxes: L x N x 4
'''
target = logits.new_zeros((logits.shape[0], logits.shape[2]))
target[:, label] = 1.
sizes = boxes[..., 2] * boxes[..., 3] # L x N
ind = sizes.argmax(dim=1) # L
loss = F.binary_cross_entropy_with_logits(
logits[range(len(ind)), ind], target, reduction='sum')
return loss |