# regionspot/detector.py
from collections import namedtuple
from .modeling.regionspot import build_regionspot_model
import torch.cuda.amp as amp
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from einops import rearrange
import json
from detectron2.modeling import META_ARCH_REGISTRY
from .util.postprocessing import segmentation_postprocess
from detectron2.structures import Boxes, Instances
from .util.preprocessing import prepare_prompt_infer, prepare_prompt_train
__all__ = ["RegionSpot"]
@META_ARCH_REGISTRY.register()
class RegionSpot(nn.Module):
"""
Implement RegionSpot
"""
def __init__(self, cfg):
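        """
        Read the RegionSpot options from the detectron2 config, build the model,
        and (when predicted boxes are used at inference) load the cached GLIP
        detection results from disk.
        """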
super().__init__()
self.device = torch.device(cfg.MODEL.DEVICE)
self.clip_type = cfg.MODEL.CLIP_TYPE
self.inference_box_type = cfg.MODEL.BOX_TYPE
self.clip_input_size = cfg.MODEL.CLIP_INPUT_SIZE
self.clip_target_size = (self.clip_input_size, self.clip_input_size)
        self.model, _ = build_regionspot_model(clip_type=self.clip_type, is_training=cfg.MODEL.TRAINING, image_size=self.clip_input_size)
self.model.to(self.device)
if self.inference_box_type != 'GT':
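            # When not evaluating with ground-truth boxes, pre-computed GLIP
            # detections are loaded and later used as box prompts at inference.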
path = './datasets/glip_results/nms_results_glip_tiny_model_o365_goldg_cc_sbu_lvis_val.json'
with open(path, 'r') as file:
self.pred_results = json.load(file)
else:
self.pred_results = None
@torch.no_grad()
    def forward_inference(self, batched_inputs, do_postprocess=True):
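        """
        Inference path: run the model in eval mode under autocast to obtain
        per-box class logits and masks, pair them with GT or GLIP boxes, rank
        box-class pairs by score, and (optionally) rescale the resulting
        Instances to the original image resolution.
        """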
with amp.autocast(enabled=True):
with torch.no_grad():
logits_per_image, pred_mask = self.model.forward_eval(batched_inputs, multimask_output=False)
image_sizes = [x["original_size"] for x in batched_inputs]
if self.inference_box_type == 'GT':
boxes = torch.stack([x["instances"].gt_boxes.tensor for x in batched_inputs], dim=0) #n, n_box, n_token, 256
if len(boxes[0]) == 0:
boxes=torch.tensor([[[0,0, image_sizes[0][0], image_sizes[0][1]]]])
else:
boxes = torch.stack([x["pred_boxes"] for x in batched_inputs], dim=0) #n, n_box, n_token, 256
scores = torch.stack([x["scores"] for x in batched_inputs], dim=0)
box_cls = logits_per_image
box_pred = boxes
if self.inference_box_type == 'GT':
results = self.inference_gt_box(box_cls, box_pred, pred_mask, image_sizes)
else:
results = self.inference_pred_box(box_cls, box_pred, scores, pred_mask, image_sizes)
if do_postprocess:
processed_results = []
for results_per_image, input_per_image, image_size in zip(results, batched_inputs, image_sizes):
height = input_per_image.get("height", image_size[0])
width = input_per_image.get("width", image_size[1])
r = segmentation_postprocess(results_per_image, height, width)
processed_results.append({"instances": r})
return processed_results
else:
return results
    def forward_train(self, batched_inputs):
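        """
        Training path: the underlying model returns a scalar loss, which is
        wrapped in a dict as expected by detectron2 training loops.
        """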
with amp.autocast(enabled=True):
outputs = self.model.forward_train(batched_inputs)
loss = {'loss': outputs}
return loss
def forward(self, batched_inputs, do_postprocess=True):
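        """
        Entry point. Prepares box prompts for the current mode, then dispatches
        to forward_inference (eval) or forward_train (training).
        """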
if not self.training:
# Prepare Prompt.
batched_inputs = prepare_prompt_infer(batched_inputs, pred_results = self.pred_results, target_size=self.clip_target_size)
            results = self.forward_inference(batched_inputs, do_postprocess=do_postprocess)
return results
if self.training:
batched_inputs = prepare_prompt_train(batched_inputs, target_size=self.clip_target_size)
            loss_dict = self.forward_train(batched_inputs)
return loss_dict
def inference_gt_box(self, box_cls, box_pred, pred_mask, image_sizes=None):
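        """
        Build detectron2 Instances from per-box class logits when ground-truth
        boxes are used: keep the top-k (up to 900) box-class pairs per image,
        scored by their softmax probability.
        """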
device = box_cls.device # assuming all tensors are on the same device
results = []
for logits, boxes, masks, img_size in zip(box_cls, box_pred, pred_mask, image_sizes):
# Calculate probabilities and flatten them
probs = F.softmax(logits, dim=-1)
probs_flattened = probs.view(-1)
# Determine number of top predictions to consider
top_num = min(900, len(probs_flattened))
# Get top-k values and indices
topk_probs, topk_indices = torch.topk(probs_flattened, top_num)
# Decode the top-k indices to get corresponding labels and boxes
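            # probs was flattened from (num_boxes, num_classes), so the class is
            # index % num_classes and the box is index // num_classes.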
topk_labels = topk_indices % logits.shape[1]
topk_boxes_indices = topk_indices // logits.shape[1]
# Ensure boxes, masks and topk_boxes_indices are on the same device
topk_boxes_indices = topk_boxes_indices.to(device)
boxes = boxes.to(device)
masks = masks.to(device)
# Retrieve predictions using the top-k indices
boxes_for_topk = boxes[topk_boxes_indices]
masks_for_topk = masks[topk_boxes_indices]
scores_for_topk = topk_probs # Modify accordingly if you have another score tensor
# Create Instances object for top-k predictions
result = Instances(img_size)
result.pred_boxes = Boxes(boxes_for_topk)
result.scores = scores_for_topk
result.pred_classes = topk_labels
result.pred_masks = masks_for_topk # Added masks to the result
results.append(result)
return results
def inference_pred_box(self, box_cls, box_pred, box_score, masks, image_sizes=None):
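        """
        Same as inference_gt_box, but for external (GLIP) detections: the final
        score multiplies the detector's box confidence with the softmax
        classification probability of each kept box-class pair.
        """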
results = []
        for logits, box_pred_i, box_score_i, mask_i, img_size in zip(box_cls, box_pred, box_score, masks, image_sizes):
logits = logits.cuda()
box_pred_i = box_pred_i.cuda()
box_score_i = box_score_i.cuda()
# Calculate probabilities and flatten them
probs = F.softmax(logits, dim=-1)
probs_flattened = probs.view(-1)
# Determine number of top predictions to consider
top_num = min(900, len(probs_flattened))
# Get top-k values and indices
topk_probs, topk_indices = torch.topk(probs_flattened, top_num)
# Decode the top-k indices to get corresponding labels and boxes
topk_labels = topk_indices % logits.shape[1]
topk_boxes_indices = topk_indices // logits.shape[1]
# Retrieve predictions using the top-k indices
boxes = box_pred_i[topk_boxes_indices]
masks = mask_i[topk_boxes_indices]
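            # Final score: external detector confidence modulated by the
            # classification probability of this box-class pair.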
scores = box_score_i[topk_boxes_indices] * topk_probs
# Construct result for the current image
result = Instances(img_size)
result.pred_boxes = Boxes(boxes)
result.scores = scores
result.pred_classes = topk_labels
result.pred_masks = masks
results.append(result)
return results
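

# Usage sketch (assumption: the RegionSpot-specific config keys used above, e.g.
# MODEL.CLIP_TYPE, MODEL.BOX_TYPE, MODEL.CLIP_INPUT_SIZE and MODEL.TRAINING, have
# been added to the config by the project's own config helper):
#
#   from detectron2.config import get_cfg
#   from detectron2.modeling import build_model
#
#   cfg = get_cfg()                        # plus the RegionSpot config additions
#   cfg.MODEL.META_ARCHITECTURE = "RegionSpot"
#   model = build_model(cfg)               # resolved through META_ARCH_REGISTRY
#   model.eval()
#   outputs = model(batched_inputs)        # list of {"instances": Instances}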