# Multi-scale / horizontal-flip test-time augmentation for bbox detection.
import numpy as np
import torch

from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.data import transforms as T
from maskrcnn_benchmark.layers import nms, soft_nms
from maskrcnn_benchmark.structures.bounding_box import BoxList
from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist
from maskrcnn_benchmark.structures.image_list import to_image_list
def im_detect_bbox_aug(model, images, device, captions=None, positive_map_label_to_token=None):
    """
    Run detection under every configured test-time augmentation (scales from
    cfg.TEST.SCALES, plus horizontal flips when cfg.TEST.FLIP), map all
    predictions back to the original image sizes, concatenate them per image,
    and fuse the result with merge_result_from_multi_scales.
    """
    # One accumulator per input image; each collects the boxlists produced
    # under every augmentation setting.
    per_image_boxlists = [[] for _ in images]

    def accumulate(boxlists_t):
        # Resize each prediction back to its original image resolution
        # before storing it.
        for idx, boxlist_t in enumerate(boxlists_t):
            per_image_boxlists[idx].append(boxlist_t.resize(images[idx].size))

    # A per-scale keep-range is only honored when RANGES lines up 1:1 with SCALES.
    if len(cfg.TEST.RANGES) == len(cfg.TEST.SCALES):
        keep_ranges = cfg.TEST.RANGES
    else:
        keep_ranges = [None] * len(cfg.TEST.SCALES)

    max_size = cfg.TEST.MAX_SIZE
    for scale, keep_range in zip(cfg.TEST.SCALES, keep_ranges):
        flip_options = (False, True) if cfg.TEST.FLIP else (False,)
        for hflip in flip_options:
            boxlists_scl = im_detect_bbox_scale(
                model, images, scale, max_size, device,
                captions=captions,
                positive_map_label_to_token=positive_map_label_to_token,
                hflip=hflip,
            )
            if keep_range is not None:
                boxlists_scl = remove_boxes(boxlists_scl, *keep_range)
            accumulate(boxlists_scl)

    # Concatenate everything collected for each image into one BoxList.
    merged = []
    for boxlist_ts in per_image_boxlists:
        bbox = torch.cat([b.bbox for b in boxlist_ts])
        scores = torch.cat([b.get_field('scores') for b in boxlist_ts])
        labels = torch.cat([b.get_field('labels') for b in boxlist_ts])
        combined = BoxList(bbox, boxlist_ts[0].size, boxlist_ts[0].mode)
        combined.add_field('scores', scores)
        combined.add_field('labels', labels)
        merged.append(combined)
    return merge_result_from_multi_scales(merged)
def im_detect_bbox(model, images, target_scale, target_max_size, device,
                   captions=None,
                   positive_map_label_to_token=None
                   ):
    """
    Performs bbox detection on the original (unflipped) images.

    Arguments:
        model: detector; invoked as model(images) or, when captions are given,
            model(images, captions=..., positive_map=...)
        images: list of images accepted by the transform pipeline
        target_scale: shorter-side target passed to T.Resize
        target_max_size: longer-side cap passed to T.Resize
        device: device the batched image tensor is moved to
        captions: optional captions forwarded to the model
        positive_map_label_to_token: optional map forwarded as positive_map

    Returns:
        The model's raw predictions (one per image), in the resized image space.
    """
    # Fix: the original used `cfg.INPUT.FORMAT is not ''` — identity comparison
    # with a literal relies on string interning (SyntaxWarning on CPython >= 3.8);
    # use equality. Also bind a fallback so `input_format` is always defined
    # (previously a NameError when FORMAT was '' and TO_BGR255 was False).
    if cfg.INPUT.FORMAT != '':
        input_format = cfg.INPUT.FORMAT
    elif cfg.INPUT.TO_BGR255:
        input_format = 'bgr255'
    else:
        input_format = ''  # no channel conversion requested
    transform = T.Compose([
        T.Resize(target_scale, target_max_size),
        T.ToTensor(),
        T.Normalize(
            mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD, format=input_format
        )
    ])
    images = [transform(image) for image in images]
    images = to_image_list(images, cfg.DATALOADER.SIZE_DIVISIBILITY)
    if captions is None:
        return model(images.to(device))
    else:
        return model(images.to(device),
                     captions=captions,
                     positive_map=positive_map_label_to_token
                     )
def im_detect_bbox_hflip(model, images, target_scale, target_max_size, device,
                         captions=None,
                         positive_map_label_to_token=None
                         ):
    """
    Performs bbox detection on the horizontally flipped images.
    Function signature is the same as for im_detect_bbox; detections are
    transposed back so they live in the unflipped image space.
    """
    # Fix: the original used `cfg.INPUT.FORMAT is not ''` — identity comparison
    # with a literal relies on string interning (SyntaxWarning on CPython >= 3.8);
    # use equality. Also bind a fallback so `input_format` is always defined
    # (previously a NameError when FORMAT was '' and TO_BGR255 was False).
    if cfg.INPUT.FORMAT != '':
        input_format = cfg.INPUT.FORMAT
    elif cfg.INPUT.TO_BGR255:
        input_format = 'bgr255'
    else:
        input_format = ''  # no channel conversion requested
    transform = T.Compose([
        T.Resize(target_scale, target_max_size),
        T.RandomHorizontalFlip(1.0),  # probability 1.0 -> always flip
        T.ToTensor(),
        T.Normalize(
            mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD, format=input_format
        )
    ])
    images = [transform(image) for image in images]
    images = to_image_list(images, cfg.DATALOADER.SIZE_DIVISIBILITY)
    if captions is None:
        boxlists = model(images.to(device))
    else:
        boxlists = model(images.to(device),
                         captions=captions,
                         positive_map=positive_map_label_to_token
                         )
    # Invert the detections computed on the flipped image
    boxlists_inv = [boxlist.transpose(0) for boxlist in boxlists]
    return boxlists_inv
def im_detect_bbox_scale(model, images, target_scale, target_max_size, device,
                         captions=None,
                         positive_map_label_to_token=None,
                         hflip=False):
    """
    Computes bbox detections at the given scale, optionally on horizontally
    flipped inputs. Returns predictions in the scaled image space.
    """
    # Dispatch to the flipped or plain detector; both share the same signature.
    detect_fn = im_detect_bbox_hflip if hflip else im_detect_bbox
    return detect_fn(
        model, images, target_scale, target_max_size, device,
        captions=captions,
        positive_map_label_to_token=positive_map_label_to_token,
    )
def remove_boxes(boxlist_ts, min_scale, max_scale):
    """
    Filter each boxlist, keeping only boxes whose area (computed with the
    inclusive +1 pixel convention) is strictly between min_scale**2 and
    max_scale**2. The original box mode of each list is preserved.
    """
    lo, hi = min_scale * min_scale, max_scale * max_scale
    filtered = []
    for boxlist_t in boxlist_ts:
        original_mode = boxlist_t.mode
        xyxy = boxlist_t.convert("xyxy")
        keep = []
        for idx, (x1, y1, x2, y2) in enumerate(xyxy.bbox):
            area = (x2 - x1 + 1) * (y2 - y1 + 1)
            if lo < area < hi:
                keep.append(idx)
        filtered.append(xyxy[keep].convert(original_mode))
    return filtered
def merge_result_from_multi_scales(boxlists):
    """
    Fuse the concatenated multi-scale detections of each image: run per-class
    NMS (type cfg.TEST.SPECIAL_NMS at threshold cfg.TEST.TH), re-attach the
    class labels, concatenate the classes, then cap the per-image detection
    count at cfg.TEST.PRE_NMS_TOP_N (when positive) by score thresholding.
    """
    merged = []
    for boxlist in boxlists:
        scores = boxlist.get_field("scores")
        labels = boxlist.get_field("labels")
        boxes = boxlist.bbox
        # Restrict to an explicit class subset when one is configured.
        if len(cfg.TEST.SELECT_CLASSES):
            class_list = cfg.TEST.SELECT_CLASSES
        else:
            class_list = range(1, cfg.TEST.NUM_CLASSES)
        per_class = []
        for cls in class_list:
            inds = (labels == cls).nonzero().view(-1)
            cls_boxlist = BoxList(boxes[inds, :].view(-1, 4), boxlist.size, mode="xyxy")
            cls_boxlist.add_field("scores", scores[inds])
            cls_boxlist = boxlist_nms(cls_boxlist, cfg.TEST.TH, score_field="scores", nms_type=cfg.TEST.SPECIAL_NMS)
            cls_boxlist.add_field(
                "labels",
                torch.full((len(cls_boxlist),), cls, dtype=torch.int64, device=scores.device),
            )
            per_class.append(cls_boxlist)
        result = cat_boxlist(per_class)
        num_dets = len(result)
        # Limit to at most PRE_NMS_TOP_N detections over all classes by
        # keeping everything at or above the k-th smallest score.
        if num_dets > cfg.TEST.PRE_NMS_TOP_N > 0:
            all_scores = result.get_field("scores")
            image_thresh, _ = torch.kthvalue(
                all_scores.cpu(),
                num_dets - cfg.TEST.PRE_NMS_TOP_N + 1,
            )
            keep = torch.nonzero(all_scores >= image_thresh.item()).squeeze(1)
            result = result[keep]
        merged.append(result)
    return merged
def boxlist_nms(boxlist, thresh, max_proposals=-1, score_field="scores", nms_type='nms'):
    """
    Suppress overlapping boxes in `boxlist` using the strategy named by
    `nms_type` ('vote', 'soft-vote', 'soft-nms', or plain 'nms') at IoU
    threshold `thresh`. A non-positive threshold disables suppression.
    `max_proposals` (when positive) truncates the kept indices for the
    'soft-nms' and plain 'nms' paths, matching the original behavior.
    """
    if thresh <= 0:
        return boxlist
    original_mode = boxlist.mode
    boxlist = boxlist.convert("xyxy")
    boxes = boxlist.bbox
    box_scores = boxlist.get_field(score_field)
    if nms_type in ('vote', 'soft-vote'):
        vote_fn = bbox_vote if nms_type == 'vote' else soft_bbox_vote
        voted_boxes, voted_scores = vote_fn(boxes, box_scores, thresh)
        # Only overwrite when voting produced something; otherwise the
        # boxlist is returned unchanged.
        if len(voted_boxes) > 0:
            boxlist.bbox = voted_boxes
            boxlist.extra_fields['scores'] = voted_scores
    elif nms_type == 'soft-nms':
        keep, rescored = soft_nms(boxes.cpu(), box_scores.cpu(), thresh, 0.95)
        if max_proposals > 0:
            keep = keep[:max_proposals]
        boxlist = boxlist[keep]
        boxlist.extra_fields['scores'] = rescored
    else:
        keep = nms(boxes, box_scores, thresh)
        if max_proposals > 0:
            keep = keep[:max_proposals]
        boxlist = boxlist[keep]
    return boxlist.convert(original_mode)
def bbox_vote(boxes, scores, vote_thresh):
    """
    Box-voting fusion: greedily take the highest-scoring remaining box, pull
    in every remaining box with IoU >= vote_thresh, and replace the cluster
    with its score-weighted average box carrying the cluster's max score.

    Arguments:
        boxes (Tensor[N, 4]): xyxy boxes
        scores (Tensor[N]): per-box scores
        vote_thresh (float): IoU threshold for clustering

    Returns:
        (boxes, scores) as float CUDA tensors; with <= 1 input box, returns
        empty numpy arrays of shapes (0, 5) and (0, 1) (original behavior,
        preserved for callers that only check len()).
    """
    boxes = boxes.cpu().numpy()
    scores = scores.cpu().numpy().reshape(-1, 1)
    det = np.concatenate((boxes, scores), axis=1)
    if det.shape[0] <= 1:
        return np.zeros((0, 5)), np.zeros((0, 1))
    order = det[:, 4].ravel().argsort()[::-1]
    det = det[order, :]
    # Fix: the original grew `dets` via np.row_stack inside a bare
    # `try/except` (row_stack is a deprecated alias and the bare except
    # swallowed real errors). Accumulate parts in a list instead.
    parts = []
    while det.shape[0] > 0:
        # IoU of the current top-scoring box against all remaining boxes
        # (inclusive +1 pixel convention).
        area = (det[:, 2] - det[:, 0] + 1) * (det[:, 3] - det[:, 1] + 1)
        xx1 = np.maximum(det[0, 0], det[:, 0])
        yy1 = np.maximum(det[0, 1], det[:, 1])
        xx2 = np.minimum(det[0, 2], det[:, 2])
        yy2 = np.minimum(det[0, 3], det[:, 3])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        o = inter / (area[0] + area[:] - inter)
        # Pull the cluster out of the working set.
        merge_index = np.where(o >= vote_thresh)[0]
        det_accu = det[merge_index, :]
        det = np.delete(det, merge_index, 0)
        if merge_index.shape[0] <= 1:
            # Singleton cluster: keep the box as-is.
            parts.append(det_accu)
        else:
            # Score-weighted average of the cluster's coordinates; the fused
            # box keeps the cluster's maximum score.
            det_accu[:, 0:4] = det_accu[:, 0:4] * np.tile(det_accu[:, -1:], (1, 4))
            max_score = np.max(det_accu[:, 4])
            det_accu_sum = np.zeros((1, 5))
            det_accu_sum[:, 0:4] = np.sum(det_accu[:, 0:4], axis=0) / np.sum(det_accu[:, -1:])
            det_accu_sum[:, 4] = max_score
            parts.append(det_accu_sum)
    dets = np.concatenate(parts, axis=0)
    boxes = torch.from_numpy(dets[:, :4]).float().cuda()
    scores = torch.from_numpy(dets[:, 4]).float().cuda()
    return boxes, scores
def soft_bbox_vote(boxes, scores, vote_thresh):
    """
    Soft box-voting: like bbox_vote, but cluster members are additionally
    kept (soft-NMS style) with their scores decayed by (1 - IoU), provided
    the decayed score stays >= cfg.MODEL.RETINANET.INFERENCE_TH. Final
    detections are re-sorted by score, descending.

    Arguments:
        boxes (Tensor[N, 4]): xyxy boxes
        scores (Tensor[N]): per-box scores
        vote_thresh (float): IoU threshold for clustering

    Returns:
        (boxes, scores) as float CUDA tensors; with <= 1 input box, returns
        empty numpy arrays of shapes (0, 5) and (0, 1) (original behavior,
        preserved for callers that only check len()).
    """
    boxes = boxes.cpu().numpy()
    scores = scores.cpu().numpy().reshape(-1, 1)
    det = np.concatenate((boxes, scores), axis=1)
    if det.shape[0] <= 1:
        return np.zeros((0, 5)), np.zeros((0, 1))
    order = det[:, 4].ravel().argsort()[::-1]
    det = det[order, :]
    # Fix: the original grew `dets` via np.row_stack inside a bare
    # `try/except` (row_stack is a deprecated alias and the bare except
    # swallowed real errors). Accumulate parts in a list instead.
    parts = []
    while det.shape[0] > 0:
        # IoU of the current top-scoring box against all remaining boxes
        # (inclusive +1 pixel convention).
        area = (det[:, 2] - det[:, 0] + 1) * (det[:, 3] - det[:, 1] + 1)
        xx1 = np.maximum(det[0, 0], det[:, 0])
        yy1 = np.maximum(det[0, 1], det[:, 1])
        xx2 = np.minimum(det[0, 2], det[:, 2])
        yy2 = np.minimum(det[0, 3], det[:, 3])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        o = inter / (area[0] + area[:] - inter)
        # Pull the cluster out of the working set.
        merge_index = np.where(o >= vote_thresh)[0]
        det_accu = det[merge_index, :]
        det_accu_iou = o[merge_index]
        det = np.delete(det, merge_index, 0)
        if merge_index.shape[0] <= 1:
            # Singleton cluster: keep the box as-is.
            parts.append(det_accu)
        else:
            # Soft-NMS style survivors: decay each member's score by its IoU
            # with the cluster leader and keep those above the inference
            # threshold alongside the fused box.
            soft_det_accu = det_accu.copy()
            soft_det_accu[:, 4] = soft_det_accu[:, 4] * (1 - det_accu_iou)
            soft_index = np.where(soft_det_accu[:, 4] >= cfg.MODEL.RETINANET.INFERENCE_TH)[0]
            soft_det_accu = soft_det_accu[soft_index, :]
            # Score-weighted average of the cluster's coordinates; the fused
            # box keeps the cluster's maximum score.
            det_accu[:, 0:4] = det_accu[:, 0:4] * np.tile(det_accu[:, -1:], (1, 4))
            max_score = np.max(det_accu[:, 4])
            det_accu_sum = np.zeros((1, 5))
            det_accu_sum[:, 0:4] = np.sum(det_accu[:, 0:4], axis=0) / np.sum(det_accu[:, -1:])
            det_accu_sum[:, 4] = max_score
            if soft_det_accu.shape[0] > 0:
                det_accu_sum = np.vstack((det_accu_sum, soft_det_accu))
            parts.append(det_accu_sum)
    dets = np.concatenate(parts, axis=0)
    # Re-sort by score, descending (soft survivors may be out of order).
    order = dets[:, 4].ravel().argsort()[::-1]
    dets = dets[order, :]
    boxes = torch.from_numpy(dets[:, :4]).float().cuda()
    scores = torch.from_numpy(dets[:, 4]).float().cuda()
    return boxes, scores