Spaces:

Pinwheel
/

GLIP-BLIP-Object-Detection-VQA

Runtime error

App Files Files Community

GLIP-BLIP-Object-Detection-VQA / maskrcnn_benchmark /data /datasets /pseudo_data.py

Pinwheel

HF Demo

128757a about 2 years ago

raw

history blame

9.13 kB

	import torch
	import torch.distributed as dist
	import time
	from torchvision.ops import nms
	import random
	import numpy as np
	from PIL import Image, ImageDraw
	import pdb
	from maskrcnn_benchmark.structures.bounding_box import BoxList
	from .modulated_coco import ConvertCocoPolysToMask
	from .tsv import ODTSVDataset, TSVYamlDataset
	from .od_to_grounding import sanity_check_target_after_processing
	from copy import deepcopy

	class PseudoData(TSVYamlDataset):
	    """Caption-grounding dataset built on top of ``TSVYamlDataset``.

	    Each sample's annotation is read as a dict with ``"caption"``,
	    ``"bboxes"``, ``"scores"``, ``"tokens_positive"``, ``"img_w"`` and
	    ``"img_h"`` entries (plus ``"text"`` when boxes are diverted for VQA).
	    ``__getitem__`` optionally filters the boxes by score and NMS, wraps them
	    in a ``BoxList`` and attaches the caption and token spans as extra fields.
	    """

	    def __init__(self,
	                 yaml_file,
	                 transforms,
	                 return_tokens,
	                 return_masks,
	                 tokenizer,
	                 caption_min_box=1,
	                 replace_clean_label=False,
	                 further_screen=False,
	                 caption_conf=0.5,
	                 caption_nms=-1,
	                 pack_random_caption_number=0,
	                 inference_caption=False,
	                 sample_negative_for_grounding_data=-1,
	                 random_pack_prob=-1.0,
	                 no_random_pack_probability=0.0,
	                 safeguard_positive_caption=True,
	                 mlm_obj_for_only_positive=False,
	                 caption_format_version="v1",
	                 local_debug=False,
	                 max_query_len=256,
	                 diver_box_for_vqa=False,
	                 **kwargs
	                 ):
	        """Store the configuration and build the annotation converter.

	        Args:
	            yaml_file: Forwarded to ``TSVYamlDataset``. If it contains the
	                substring ``"qa"``, ``diver_box_for_vqa`` must be truthy.
	            transforms: Optional callable used as
	                ``img, target = transforms(img, target)``; ``None`` skips it.
	            return_tokens: Forwarded to ``ConvertCocoPolysToMask``.
	            return_masks: Forwarded to ``ConvertCocoPolysToMask``.
	            tokenizer: Forwarded to ``ConvertCocoPolysToMask``; its
	                ``mask_token`` attribute is read by :meth:`divert_boxes`.
	            caption_min_box: A sample with fewer boxes than this is replaced
	                by a randomly drawn sample.
	            replace_clean_label: Forwarded to ``TSVYamlDataset``.
	            further_screen: When truthy, boxes are filtered with
	                ``caption_conf`` and ``caption_nms`` before use.
	            caption_conf: Boxes are kept only when their score is strictly
	                greater than this value.
	            caption_nms: IoU threshold passed to ``nms``; values ``<= 0``
	                skip the NMS step.
	            inference_caption: Selects the caption-only branch of
	                ``__getitem__``.
	            caption_format_version: ``"v2"`` annotations are flattened with
	                :meth:`convert_anno_from_yiling_to_ours` first.
	            max_query_len: Forwarded to ``ConvertCocoPolysToMask``.
	            diver_box_for_vqa: When truthy, token spans that overlap the
	                answer are redirected to a mask token (see
	                :meth:`divert_boxes`).
	            pack_random_caption_number, sample_negative_for_grounding_data,
	            random_pack_prob, no_random_pack_probability,
	            safeguard_positive_caption, mlm_obj_for_only_positive,
	            local_debug: Stored on the instance; not read by any method of
	                this class.
	            **kwargs: Accepted and ignored.

	        Raises:
	            AssertionError: If ``"qa"`` is in ``yaml_file`` but
	                ``diver_box_for_vqa`` is falsy.
	        """
	        super().__init__(yaml_file, None, replace_clean_label)
	        self.yaml_file = yaml_file
	        self._transforms = transforms
	        self.max_query_len = max_query_len
	        self.prepare = ConvertCocoPolysToMask(return_masks=return_masks,
	                                              return_tokens=return_tokens,
	                                              tokenizer=tokenizer,
	                                              max_query_len=max_query_len)
	        self.diver_box_for_vqa = diver_box_for_vqa
	        if "qa" in self.yaml_file:
	            # must diver box
	            assert self.diver_box_for_vqa, \
	                "diver_box_for_vqa must be enabled for 'qa' yaml files"
	        self.tokenizer = tokenizer
	        self.caption_min_box = caption_min_box
	        self.replace_clean_label = replace_clean_label
	        self.further_screen = further_screen
	        self.pack_random_caption_number = pack_random_caption_number
	        self.caption_format_version = caption_format_version

	        self.caption_conf = caption_conf
	        self.caption_nms = caption_nms
	        self.inference_caption = inference_caption
	        self.sample_negative_for_grounding_data = sample_negative_for_grounding_data
	        self.random_pack_prob = random_pack_prob
	        self.no_random_pack_probability = no_random_pack_probability
	        self.safeguard_positive_caption = safeguard_positive_caption
	        self.mlm_obj_for_only_positive = mlm_obj_for_only_positive
	        self.local_debug = local_debug
	        # Fall back to rank 0 when no process group exists, without a bare
	        # ``except`` that would also swallow KeyboardInterrupt/SystemExit.
	        if dist.is_available() and dist.is_initialized():
	            self.rank = dist.get_rank()
	        else:
	            self.rank = 0

	    def __len__(self):
	        """Return the number of samples reported by the parent dataset."""
	        return super().__len__()

	    @staticmethod
	    def check_for_overlap(range1, range2):
	        """Return True when two ``[start, end]`` ranges share any position.

	        Both ends are treated as inclusive: ranges that merely touch
	        (``range1[1] == range2[0]``) count as overlapping.
	        """
	        return range1[0] <= range2[1] and range2[0] <= range1[1]

	    def divert_boxes(self, anno):
	        """Replace the answer part of the caption with a mask token.

	        The caption is cut after ``len(anno["text"]) + 1`` characters and the
	        tokenizer's ``mask_token`` (or the literal ``'answer'`` when the
	        tokenizer has none) is appended. Every token span that overlaps the
	        removed answer region is redirected to the span of the mask token.

	        ``anno`` is modified in place (``"caption"`` and
	        ``"tokens_positive"``).

	        Args:
	            anno: Annotation dict with ``"text"``, ``"caption"``,
	                ``"bboxes"`` and ``"tokens_positive"`` entries.
	                Presumably ``"text"`` is the question and the caption is
	                ``question + " " + answer`` -- TODO confirm against the data.

	        Returns:
	            Tuple ``(question, anno)`` where ``question`` is the new caption.
	        """
	        # first get answer start and end
	        answer_start = len(anno['text']) + 1  # +1 for the space
	        answer_span = [answer_start, len(anno["caption"])]

	        question = anno["caption"][:answer_start]  # get the question

	        # add the mask token
	        mask_start = len(question)
	        mask_token = self.tokenizer.mask_token
	        if mask_token is None:
	            mask_token = 'answer'
	        question += mask_token
	        mask_end = len(question)

	        # divert the box
	        for i in range(len(anno["bboxes"])):
	            spans = anno["tokens_positive"][i]
	            for j, span in enumerate(spans):
	                # if overlap, then divert the box to the mask token
	                if self.check_for_overlap(span, answer_span):
	                    spans[j] = [mask_start, mask_end]

	        anno["caption"] = question
	        return question, anno

	    def __getitem__(self, idx):
	        """Load sample ``idx`` and return ``(img, target, idx)``.

	        When a sample ends up with fewer than ``caption_min_box`` boxes, a
	        different, randomly chosen sample is returned instead.
	        """
	        img, anno, _, _ = super().__getitem__(idx)
	        if self.inference_caption:
	            # NOTE(review): this branch assigns neither ``target`` nor
	            # ``greenlight_span_for_masked_lm_objective``, both of which are
	            # used by the shared tail below, so it looks like it would raise
	            # NameError as written -- confirm the intended behaviour.
	            caption = None
	            if isinstance(anno, list):
	                caption = anno[0]["caption"]  # inference mode for bing
	                anno = []
	            elif len(anno) == 1:
	                caption = anno["caption"]  # inference mode for googlecc
	                anno = []
	            else:
	                caption = " ".join(anno["captions"])
	                anno = []
	        else:
	            if self.caption_format_version == "v2":
	                anno = self.convert_anno_from_yiling_to_ours(anno)

	            if self.further_screen:
	                conf = self.caption_conf
	                nms_thre = self.caption_nms

	                bboxes = torch.as_tensor(anno["bboxes"]).float()
	                scores = torch.as_tensor(anno["scores"])
	                tokens_positive = anno["tokens_positive"]

	                # Keep only boxes scoring strictly above the threshold.
	                keep = scores > conf
	                scores = scores[keep]
	                bboxes = bboxes[keep]
	                tokens_positive = [
	                    spans for index, spans in enumerate(tokens_positive)
	                    if keep[index]
	                ]

	                assert (len(tokens_positive) == len(bboxes) == len(scores))

	                if len(bboxes) < self.caption_min_box:  # Retry triggered!
	                    return self[np.random.choice(len(self))]

	                if nms_thre > 0:
	                    keep = nms(boxes=bboxes, scores=scores, iou_threshold=nms_thre)
	                    scores = scores[keep]
	                    bboxes = bboxes[keep]
	                    # ``keep`` holds box indices; convert once to plain ints
	                    # for list indexing.
	                    tokens_positive = [tokens_positive[i] for i in keep.tolist()]
	                    assert (len(tokens_positive) == len(bboxes) == len(scores))

	                # Write back
	                anno["bboxes"] = bboxes.tolist()
	                anno["scores"] = scores.tolist()
	                anno["tokens_positive"] = tokens_positive

	            boxes = torch.as_tensor(anno["bboxes"])

	            if len(boxes) < self.caption_min_box:  # Retry triggered!
	                return self[np.random.choice(len(self))]

	            target = BoxList(boxes, (anno["img_w"], anno["img_h"]), mode="xyxy")
	            # NOTE(review): if ``remove_empty=True`` drops boxes here, the
	            # positional lookup ``anno["tokens_positive"][i]`` below would no
	            # longer line up with ``target`` -- confirm against BoxList.
	            target = target.clip_to_image(remove_empty=True)

	            if self.diver_box_for_vqa:
	                # will change caption and "tokens_positive"
	                caption, anno = self.divert_boxes(anno=anno)

	            caption = anno["caption"]

	            greenlight_span_for_masked_lm_objective = [(0, len(caption))]

	            new_anno = []
	            areas = target.area()
	            for i in range(len(target)):
	                new_anno.append({
	                    "area": areas[i],
	                    "iscrowd": 0,
	                    "image_id": idx,
	                    "category_id": 1,  # following vg and others
	                    "id": None,
	                    "bbox": target.bbox[i].numpy().tolist(),
	                    "tokens_positive": anno["tokens_positive"][i],
	                })
	            anno = new_anno

	        annotations = {"image_id": idx, "annotations": anno, "caption": caption}
	        annotations["greenlight_span_for_masked_lm_objective"] = greenlight_span_for_masked_lm_objective
	        img, annotations = self.prepare(img, annotations, box_format="xyxy")

	        if self._transforms is not None:
	            img, target = self._transforms(img, target)

	        # add additional property
	        for ann in annotations:
	            target.add_field(ann, annotations[ann])

	        # This is the real image_id. Its value is not attached to the target
	        # here; the call is kept so per-sample behaviour is unchanged.
	        # Can insert additional field into target if needed
	        self.get_img_id(idx)

	        sanity_check_target_after_processing(target)

	        return img, target, idx

	    def convert_anno_from_yiling_to_ours(self, anno):
	        """Flatten per-entity box lists into one flat list of boxes.

	        On input, ``anno["bboxes"][i]`` and ``anno["scores"][i]`` are lists
	        with one element per box of entity ``i``. On output the three entries
	        ``"bboxes"``, ``"scores"`` and ``"tokens_positive"`` have one element
	        per box; every box of an entity receives that entity's (shared)
	        ``tokens_positive`` list. ``anno`` is modified in place and returned.
	        """
	        flat_bboxes = []
	        flat_tokens_positive = []
	        flat_scores = []
	        # i is the index for entity, j is the index for each box
	        for i, entity_boxes in enumerate(anno["bboxes"]):
	            for j, box in enumerate(entity_boxes):
	                flat_bboxes.append(box)
	                # Assume this box corresponds to all the token_spans for
	                # this entity
	                flat_tokens_positive.append(anno["tokens_positive"][i])
	                flat_scores.append(anno["scores"][i][j])
	        anno["bboxes"] = flat_bboxes
	        anno["tokens_positive"] = flat_tokens_positive
	        anno["scores"] = flat_scores
	        return anno

	    def get_raw_image(self, idx):
	        """Return only the image that the parent dataset yields for ``idx``."""
	        image, *_ = super().__getitem__(idx)
	        return image

	    def get_img_id(self, idx):
	        """Return the first column of the label row for ``idx``.

	        Returns ``None`` when the dataset has no ``label_tsv``.
	        """
	        line_no = self.get_line_no(idx)
	        if self.label_tsv is None:
	            return None
	        row = self.label_tsv.seek(line_no)
	        return row[0]