Spaces:

Pinwheel
/

GLIP-BLIP-Object-Detection-VQA

Runtime error

App Files Files Community

GLIP-BLIP-Object-Detection-VQA/maskrcnn_benchmark/engine/predictor.py

Pinwheel

HF Demo

128757a about 2 years ago

raw

history blame

20.9 kB

	# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
	import cv2
	import torch
	import numpy as np
	from torchvision import transforms as T

	from maskrcnn_benchmark.modeling.detector import build_detection_model
	from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer
	from maskrcnn_benchmark.structures.image_list import to_image_list
	from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou
	from maskrcnn_benchmark.structures.bounding_box import BoxList
	from maskrcnn_benchmark.modeling.roi_heads.mask_head.inference import Masker
	from maskrcnn_benchmark import layers as L
	from maskrcnn_benchmark.utils import cv2_util


	import timeit

	class COCODemo(object):
	    """Run a detection model on single OpenCV images and draw the results.

	    The constructor builds the model described by ``cfg`` and loads its
	    weights.  ``inference`` / ``run_on_opencv_image`` run the model on one
	    image, keep the detections above the confidence threshold and (for the
	    latter) draw them on a copy of the image.  The ``overlay_*`` methods are
	    individual drawing helpers that can also be called directly.
	    """

	    # COCO categories for pretty print.  Index 0 is the background entry, so
	    # a label value indexes this list directly.
	    CATEGORIES = [
	        "__background",
	        "person",
	        "bicycle",
	        "car",
	        "motorcycle",
	        "airplane",
	        "bus",
	        "train",
	        "truck",
	        "boat",
	        "traffic light",
	        "fire hydrant",
	        "stop sign",
	        "parking meter",
	        "bench",
	        "bird",
	        "cat",
	        "dog",
	        "horse",
	        "sheep",
	        "cow",
	        "elephant",
	        "bear",
	        "zebra",
	        "giraffe",
	        "backpack",
	        "umbrella",
	        "handbag",
	        "tie",
	        "suitcase",
	        "frisbee",
	        "skis",
	        "snowboard",
	        "sports ball",
	        "kite",
	        "baseball bat",
	        "baseball glove",
	        "skateboard",
	        "surfboard",
	        "tennis racket",
	        "bottle",
	        "wine glass",
	        "cup",
	        "fork",
	        "knife",
	        "spoon",
	        "bowl",
	        "banana",
	        "apple",
	        "sandwich",
	        "orange",
	        "broccoli",
	        "carrot",
	        "hot dog",
	        "pizza",
	        "donut",
	        "cake",
	        "chair",
	        "couch",
	        "potted plant",
	        "bed",
	        "dining table",
	        "toilet",
	        "tv",
	        "laptop",
	        "mouse",
	        "remote",
	        "keyboard",
	        "cell phone",
	        "microwave",
	        "oven",
	        "toaster",
	        "sink",
	        "refrigerator",
	        "book",
	        "clock",
	        "vase",
	        "scissors",
	        "teddy bear",
	        "hair drier",
	        "toothbrush",
	    ]

	    def __init__(
	        self,
	        cfg,
	        confidence_threshold=0.7,
	        show_mask_heatmaps=False,
	        masks_per_dim=2,
	        min_image_size=None,
	        exclude_region=None,
	    ):
	        """Build the model from ``cfg``, load its weights and store options.

	        Arguments:
	            cfg: configuration object.  It must support ``clone()`` and
	                provide the ``MODEL``, ``INPUT``, ``DATALOADER`` and
	                ``OUTPUT_DIR`` entries read by this class.
	            confidence_threshold: either one number applied to every
	                detection, a one-element sequence (same effect), or a
	                sequence indexed by ``label - 1`` giving one threshold per
	                class.
	            show_mask_heatmaps (bool): when true, ``run_on_opencv_image``
	                returns a mask montage instead of the annotated image.
	            masks_per_dim (int): the montage is a
	                ``masks_per_dim x masks_per_dim`` grid.
	            min_image_size: size handed to ``T.Resize``; ``None`` disables
	                resizing.
	            exclude_region: boxes handed to ``BoxList``; detections whose
	                IoU with any of them exceeds 0.5 are dropped.  ``None`` (or
	                any falsy value) disables the filter.
	        """
	        self.cfg = cfg.clone()
	        self.model = build_detection_model(cfg)
	        self.model.eval()
	        self.device = torch.device(cfg.MODEL.DEVICE)
	        self.model.to(self.device)
	        self.min_image_size = min_image_size

	        save_dir = cfg.OUTPUT_DIR
	        checkpointer = DetectronCheckpointer(cfg, self.model, save_dir=save_dir)
	        _ = checkpointer.load(cfg.MODEL.WEIGHT)

	        self.transforms = self.build_transform()

	        # NOTE(review): a negative threshold is passed when heat-maps are
	        # requested, presumably so the Masker does not binarise the masks --
	        # confirm against the Masker implementation.
	        mask_threshold = -1 if show_mask_heatmaps else 0.5
	        self.masker = Masker(threshold=mask_threshold, padding=1)

	        # used to make colors for each class
	        self.palette = torch.tensor([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1])

	        self.cpu_device = torch.device("cpu")
	        self.confidence_threshold = confidence_threshold
	        self.show_mask_heatmaps = show_mask_heatmaps
	        self.masks_per_dim = masks_per_dim
	        self.exclude_region = exclude_region

	    def build_transform(self):
	        """
	        Creates the pre-processing pipeline applied to every input image:
	        to PIL, optional resize, to tensor, channel/scale adjustment and
	        mean/std normalisation taken from ``cfg.INPUT``.
	        """
	        cfg = self.cfg

	        # Images are loaded with OpenCV, so they are already in BGR order.
	        # ToTensor scales pixel values to [0, 1]; for the BGR255 format we
	        # only have to scale back by 255, otherwise we flip the channel
	        # order to obtain RGB in the [0, 1] range.
	        if cfg.INPUT.TO_BGR255:
	            to_bgr_transform = T.Lambda(lambda x: x * 255)
	        else:
	            to_bgr_transform = T.Lambda(lambda x: x[[2, 1, 0]])

	        normalize_transform = T.Normalize(
	            mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD
	        )

	        # An identity step stands in for the resize when no size was given.
	        if self.min_image_size is not None:
	            resize_transform = T.Resize(self.min_image_size)
	        else:
	            resize_transform = lambda x: x

	        return T.Compose(
	            [
	                T.ToPILImage(),
	                resize_transform,
	                T.ToTensor(),
	                to_bgr_transform,
	                normalize_transform,
	            ]
	        )

	    def inference(self, image, debug=False):
	        """
	        Runs the model on one image and returns the filtered detections.

	        Arguments:
	            image (np.ndarray): an image as returned by OpenCV
	            debug (bool): also return the model's debug information

	        Returns:
	            prediction (BoxList): the detected objects. Additional information
	                of the detection properties can be found in the fields of
	                the BoxList via `prediction.fields()`.  When ``debug`` is
	                true the return value is ``(prediction, debug_info)``.
	        """
	        predictions, debug_info = self.compute_prediction(image)
	        top_predictions = self.select_top_predictions(predictions)

	        if debug:
	            return top_predictions, debug_info
	        return top_predictions

	    def run_on_opencv_image(self, image):
	        """
	        Runs the model on one image and draws the detections on a copy of it.

	        Arguments:
	            image (np.ndarray): an image as returned by OpenCV

	        Returns:
	            ``(result, debug_info, top_predictions)`` where ``result`` is the
	            annotated copy of ``image``, ``debug_info`` comes from
	            ``compute_prediction`` and ``top_predictions`` is the filtered
	            BoxList.

	            NOTE(review): when ``show_mask_heatmaps`` is set only the mask
	            montage image is returned (no tuple).  Kept as-is because
	            callers may depend on it.
	        """
	        predictions, debug_info = self.compute_prediction(image)
	        top_predictions = self.select_top_predictions(predictions)

	        result = image.copy()
	        if self.show_mask_heatmaps:
	            return self.create_mask_montage(result, top_predictions)
	        result = self.overlay_boxes(result, top_predictions)
	        if self.cfg.MODEL.MASK_ON:
	            result = self.overlay_mask(result, top_predictions)
	        if self.cfg.MODEL.KEYPOINT_ON:
	            result = self.overlay_keypoints(result, top_predictions)
	        result = self.overlay_class_names(result, top_predictions)

	        return result, debug_info, top_predictions

	    def compute_prediction(self, original_image):
	        """
	        Pre-processes one image, runs the model and maps the result back to
	        the original image size.

	        Arguments:
	            original_image (np.ndarray): an image as returned by OpenCV

	        Returns:
	            ``(prediction, debug_info)``: ``prediction`` (BoxList) holds the
	            detected objects, resized to the original image; additional
	            information can be found in its fields via
	            `prediction.fields()`.  ``debug_info`` is the second value
	            returned by the model, extended with a ``'total_time'`` entry
	            (seconds spent in the forward pass and the copy to CPU).
	        """
	        # apply pre-processing to image
	        image = self.transforms(original_image)

	        # convert to an ImageList, padded so that it is divisible by
	        # cfg.DATALOADER.SIZE_DIVISIBILITY
	        image_list = to_image_list(image, self.cfg.DATALOADER.SIZE_DIVISIBILITY)
	        image_list = image_list.to(self.device)
	        # timeit.default_timer is the public alias of time.perf_counter.
	        tic = timeit.default_timer()

	        # compute predictions
	        with torch.no_grad():
	            predictions, debug_info = self.model(image_list)
	            predictions = [o.to(self.cpu_device) for o in predictions]
	        debug_info['total_time'] = timeit.default_timer() - tic

	        # always single image is passed at a time
	        prediction = predictions[0]

	        # reshape prediction (a BoxList) into the original image size
	        height, width = original_image.shape[:2]
	        prediction = prediction.resize((width, height))

	        if prediction.has_field("mask"):
	            # if we have masks, paste the masks in the right position
	            # in the image, as defined by the bounding boxes
	            masks = prediction.get_field("mask")
	            # always single image is passed at a time
	            masks = self.masker([masks], [prediction])[0]
	            prediction.add_field("mask", masks)

	        return prediction, debug_info

	    def select_top_predictions(self, predictions):
	        """
	        Select only predictions which have a `score` > the applicable
	        confidence threshold, optionally drop those overlapping the
	        excluded regions, and return the rest in descending order of score.

	        Arguments:
	            predictions (BoxList): the result of the computation by the model.
	                It should contain the fields `scores` and `labels`.

	        Returns:
	            prediction (BoxList): the detected objects. Additional information
	                of the detection properties can be found in the fields of
	                the BoxList via `prediction.fields()`
	        """
	        scores = predictions.get_field("scores")
	        labels = predictions.get_field("labels").tolist()

	        # Build one threshold per detection.  The kind of threshold is
	        # decided once, not once per detection; plain ints are accepted as
	        # well as floats.
	        threshold = self.confidence_threshold
	        thresh = scores.clone()
	        if isinstance(threshold, (int, float)):
	            thresh.fill_(threshold)
	        elif len(threshold) == 1:
	            thresh.fill_(threshold[0])
	        else:
	            # per-class thresholds, indexed by label - 1
	            for i, lb in enumerate(labels):
	                thresh[i] = threshold[lb - 1]
	        keep = torch.nonzero(scores > thresh).squeeze(1)
	        predictions = predictions[keep]

	        if self.exclude_region:
	            excluded = BoxList(self.exclude_region, predictions.size)
	            iou = boxlist_iou(excluded, predictions)
	            # keep detections whose IoU with every excluded box is <= 0.5
	            keep = torch.nonzero(torch.sum(iou > 0.5, dim=0) == 0).squeeze(1)
	            # NOTE(review): when *every* detection overlaps an excluded
	            # region the filter is skipped and all of them are kept.  This
	            # mirrors the existing behaviour -- confirm it is intended.
	            if len(keep) > 0:
	                predictions = predictions[keep]

	        scores = predictions.get_field("scores")
	        _, idx = scores.sort(0, descending=True)
	        return predictions[idx]

	    def compute_colors_for_labels(self, labels):
	        """
	        Simple function that adds fixed colors depending on the class.

	        Arguments:
	            labels (torch.Tensor): 1-D integer tensor of class labels.

	        Returns:
	            np.ndarray of dtype uint8 with one 3-component color per label.
	        """
	        colors = (30 * (labels[:, None] - 1) + 1) * self.palette
	        colors = (colors % 255).numpy().astype("uint8")
	        return colors

	    def overlay_boxes(self, image, predictions):
	        """
	        Adds the predicted boxes on top of the image, colored per class.

	        Arguments:
	            image (np.ndarray): an image as returned by OpenCV
	            predictions (BoxList): the result of the computation by the model.
	                It should contain the field `labels`.
	        """
	        labels = predictions.get_field("labels")
	        boxes = predictions.bbox

	        colors = self.compute_colors_for_labels(labels).tolist()

	        for box, color in zip(boxes, colors):
	            box = box.to(torch.int64)
	            top_left, bottom_right = box[:2].tolist(), box[2:].tolist()
	            image = cv2.rectangle(
	                image, tuple(top_left), tuple(bottom_right), tuple(color), 2
	            )

	        return image

	    def overlay_scores(self, image, predictions):
	        """
	        Writes each detection's score at the left edge of its box, at the
	        box's vertical middle.

	        Arguments:
	            image (np.ndarray): an image as returned by OpenCV
	            predictions (BoxList): the result of the computation by the model.
	                It should contain the field `scores`.
	        """
	        scores = predictions.get_field("scores").tolist()
	        boxes = predictions.bbox

	        for box, score in zip(boxes, scores):
	            # cv2.putText needs a tuple of plain Python ints as origin.
	            x1, y1, _, y2 = box.to(torch.int64).tolist()
	            image = cv2.putText(
	                image, '%.3f' % score,
	                (x1, (y1 + y2) // 2),
	                cv2.FONT_HERSHEY_SIMPLEX, 0.5,
	                (255, 255, 255), 1,
	            )

	        return image

	    def overlay_cboxes(self, image, predictions):
	        """
	        Draws every box and its score in one fixed color, (255, 0, 0).

	        Arguments:
	            image (np.ndarray): an image as returned by OpenCV
	            predictions (BoxList): the result of the computation by the model.
	                It should contain the field `scores`.
	        """
	        scores = predictions.get_field("scores").tolist()
	        boxes = predictions.bbox

	        for box, score in zip(boxes, scores):
	            # cv2 drawing calls need plain Python ints for coordinates.
	            x1, y1, x2, y2 = box.to(torch.int64).tolist()
	            image = cv2.rectangle(image, (x1, y1), (x2, y2), (255, 0, 0), 2)
	            image = cv2.putText(
	                image, '%.3f' % score,
	                (x1, (y1 + y2) // 2),
	                cv2.FONT_HERSHEY_SIMPLEX, 0.5,
	                (255, 0, 0), 1,
	            )
	        return image

	    def overlay_centers(self, image, predictions):
	        """
	        Draws a circle at every entry of the `centers` field.

	        Arguments:
	            image (np.ndarray): an image as returned by OpenCV
	            predictions (BoxList): the result of the computation by the model.
	                It should contain the field `centers`.
	        """
	        centers = predictions.get_field("centers")

	        for cord in centers:
	            cord = cord.to(torch.int64)
	            image = cv2.circle(
	                image, (cord[0].item(), cord[1].item()), 2, (255, 0, 0), 20
	            )

	        return image

	    def overlay_count(self, image, predictions):
	        """
	        Writes ``Count: <n>`` at position (0, 100) of the image.

	        Arguments:
	            image (np.ndarray): an image as returned by OpenCV
	            predictions (BoxList or int): either the count itself or an
	                object whose ``len()`` is the count.
	        """
	        if isinstance(predictions, int):
	            count = predictions
	        else:
	            count = len(predictions)
	        image = cv2.putText(
	            image, 'Count: %d' % count, (0, 100),
	            cv2.FONT_HERSHEY_SIMPLEX, 3, (255, 0, 0), 3,
	        )

	        return image

	    def overlay_mask(self, image, predictions):
	        """
	        Adds the instances contours for each predicted object.
	        Each label has a different color.

	        Arguments:
	            image (np.ndarray): an image as returned by OpenCV
	            predictions (BoxList): the result of the computation by the model.
	                It should contain the field `mask` and `labels`.
	        """
	        masks = predictions.get_field("mask").numpy()
	        labels = predictions.get_field("labels")

	        colors = self.compute_colors_for_labels(labels).tolist()

	        for mask, color in zip(masks, colors):
	            # first channel of the mask, with a trailing channel axis added
	            thresh = mask[0, :, :, None].astype(np.uint8)
	            contours, _ = cv2_util.findContours(
	                thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
	            )
	            image = cv2.drawContours(image, contours, -1, color, 3)

	        return image

	    def overlay_keypoints(self, image, predictions):
	        """
	        Draws the keypoint skeleton of every detection via ``vis_keypoints``.

	        Arguments:
	            image (np.ndarray): an image as returned by OpenCV
	            predictions (BoxList): the result of the computation by the model.
	                It should contain the field `keypoints`, which in turn
	                provides ``keypoints``, a `logits` field, ``NAMES`` and
	                ``CONNECTIONS``.
	        """
	        keypoints = predictions.get_field("keypoints")
	        kps = keypoints.keypoints
	        scores = keypoints.get_field("logits")
	        # per detection: (x, y, logit) for every keypoint
	        kps = torch.cat((kps[:, :, 0:2], scores[:, :, None]), dim=2).numpy()
	        for region in kps:
	            image = vis_keypoints(
	                image, region.transpose((1, 0)),
	                names=keypoints.NAMES, connections=keypoints.CONNECTIONS,
	            )
	        return image

	    def create_mask_montage(self, image, predictions):
	        """
	        Create a montage showing the probability heatmaps for each one of the
	        detected objects

	        Arguments:
	            image (np.ndarray): an image as returned by OpenCV (not used by
	                the montage itself)
	            predictions (BoxList): the result of the computation by the model.
	                It should contain the field `mask`.
	        """
	        masks = predictions.get_field("mask")
	        masks_per_dim = self.masks_per_dim
	        masks = L.interpolate(
	            masks.float(), scale_factor=1 / masks_per_dim
	        ).byte()
	        height, width = masks.shape[-2:]
	        max_masks = masks_per_dim ** 2
	        masks = masks[:max_masks]
	        # handle case where we have less detections than max_masks
	        if len(masks) < max_masks:
	            masks_padded = torch.zeros(
	                max_masks, 1, height, width, dtype=torch.uint8
	            )
	            masks_padded[: len(masks)] = masks
	            masks = masks_padded
	        masks = masks.reshape(masks_per_dim, masks_per_dim, height, width)
	        result = torch.zeros(
	            (masks_per_dim * height, masks_per_dim * width), dtype=torch.uint8
	        )
	        # copy every mask into its cell of the grid
	        for y in range(masks_per_dim):
	            start_y = y * height
	            end_y = (y + 1) * height
	            for x in range(masks_per_dim):
	                start_x = x * width
	                end_x = (x + 1) * width
	                result[start_y:end_y, start_x:end_x] = masks[y, x]
	        return cv2.applyColorMap(result.numpy(), cv2.COLORMAP_JET)

	    def overlay_class_names(self, image, predictions, names=None):
	        """
	        Adds detected class names and scores in the positions defined by the
	        top-left corner of the predicted bounding box

	        Arguments:
	            image (np.ndarray): an image as returned by OpenCV
	            predictions (BoxList): the result of the computation by the model.
	                It should contain the field `scores` and `labels`.
	            names: optional sequence of class names indexed by ``label - 1``;
	                when omitted (or empty) ``CATEGORIES`` is indexed by the
	                label itself.
	        """
	        scores = predictions.get_field("scores").tolist()
	        labels = predictions.get_field("labels").tolist()
	        if names:
	            labels = [names[i - 1] for i in labels]
	        else:
	            labels = [self.CATEGORIES[i] for i in labels]
	        boxes = predictions.bbox

	        template = "{}: {:.2f}"
	        for box, score, label in zip(boxes, scores, labels):
	            # cv2.putText needs a tuple of plain Python ints as origin.
	            x, y = box[:2].to(torch.int64).tolist()
	            s = template.format(label, score)
	            cv2.putText(
	                image, s, (x, y), cv2.FONT_HERSHEY_SIMPLEX, .5, (255, 255, 255), 1
	            )

	        return image

	def vis_keypoints(img, kps, kp_thresh=0, alpha=0.7, names=None, connections=None):
	    """Visualizes keypoints (adapted from vis_one_image).

	    Arguments:
	        img (np.ndarray): image to draw on; it is not modified, the drawing
	            happens on a copy that is blended with it.
	        kps (np.ndarray): array indexed as ``kps[row, keypoint]``.  Rows 0 and
	            1 are the x / y coordinates and row 2 is the value compared with
	            ``kp_thresh``; any further rows are not read here.
	        kp_thresh: a keypoint is drawn only when its row-2 value exceeds this.
	        alpha (float): weight of the drawing in the final blend.
	        names: sequence of keypoint names; it must contain 'nose',
	            'right_shoulder' and 'left_shoulder' ('right_hip' / 'left_hip'
	            are optional).  Required despite the ``None`` default.
	        connections: sequence of ``(index, index)`` pairs describing the
	            skeleton lines.  Required despite the ``None`` default.

	    Returns:
	        np.ndarray: ``img`` blended with the keypoint drawing.
	    """

	    dataset_keypoints = names
	    kp_lines = connections

	    # simple rainbow color map implementation
	    blue_red_ratio = 0.8

	    def gx(x):
	        return (6 - 2 * blue_red_ratio) * x + blue_red_ratio

	    # one color per skeleton line, plus two for the mid-shoulder/mid-hip lines
	    colors = [
	        [
	            256 * max(0, (3 - abs(gx(i) - 4) - abs(gx(i) - 5)) / 2),
	            256 * max(0, (3 - abs(gx(i) - 2) - abs(gx(i) - 4)) / 2),
	            256 * max(0, (3 - abs(gx(i) - 1) - abs(gx(i) - 2)) / 2),
	        ]
	        for i in np.linspace(0, 1, len(kp_lines) + 2)
	    ]

	    def to_point(x, y):
	        # cv2 drawing functions need integer pixel coordinates.
	        return int(round(float(x))), int(round(float(y)))

	    # Perform the drawing on a copy of the image, to allow for blending.
	    kp_mask = np.copy(img)

	    # Draw mid shoulder / mid hip first for better visualization.
	    right_shoulder = dataset_keypoints.index('right_shoulder')
	    left_shoulder = dataset_keypoints.index('left_shoulder')
	    mid_shoulder = (kps[:2, right_shoulder] + kps[:2, left_shoulder]) / 2.0
	    sc_mid_shoulder = np.minimum(kps[2, right_shoulder], kps[2, left_shoulder])
	    nose_idx = dataset_keypoints.index('nose')
	    if sc_mid_shoulder > kp_thresh and kps[2, nose_idx] > kp_thresh:
	        cv2.line(
	            kp_mask, to_point(*mid_shoulder), to_point(*kps[:2, nose_idx]),
	            color=colors[len(kp_lines)], thickness=2, lineType=cv2.LINE_AA)

	    if 'right_hip' in names and 'left_hip' in names:
	        right_hip = dataset_keypoints.index('right_hip')
	        left_hip = dataset_keypoints.index('left_hip')
	        mid_hip = (kps[:2, right_hip] + kps[:2, left_hip]) / 2.0
	        sc_mid_hip = np.minimum(kps[2, right_hip], kps[2, left_hip])
	        if sc_mid_shoulder > kp_thresh and sc_mid_hip > kp_thresh:
	            cv2.line(
	                kp_mask, to_point(*mid_shoulder), to_point(*mid_hip),
	                color=colors[len(kp_lines) + 1], thickness=2, lineType=cv2.LINE_AA)

	    # Draw the keypoints.
	    for line_idx, (i1, i2) in enumerate((line[0], line[1]) for line in kp_lines):
	        p1 = to_point(kps[0, i1], kps[1, i1])
	        p2 = to_point(kps[0, i2], kps[1, i2])
	        color = colors[line_idx]
	        if kps[2, i1] > kp_thresh and kps[2, i2] > kp_thresh:
	            cv2.line(
	                kp_mask, p1, p2,
	                color=color, thickness=2, lineType=cv2.LINE_AA)
	        if kps[2, i1] > kp_thresh:
	            cv2.circle(
	                kp_mask, p1,
	                radius=3, color=color, thickness=-1, lineType=cv2.LINE_AA)
	        if kps[2, i2] > kp_thresh:
	            cv2.circle(
	                kp_mask, p2,
	                radius=3, color=color, thickness=-1, lineType=cv2.LINE_AA)

	    # Blend the keypoints.
	    return cv2.addWeighted(img, 1.0 - alpha, kp_mask, alpha, 0)