from typing import Dict

import cv2
import numpy as np
import torch
from skimage.filters import gaussian
from yacs.config import CfgNode

from .utils import (convert_cvimg_to_tensor,
                    expand_to_aspect_ratio,
                    generate_image_patch_cv2)

DEFAULT_MEAN = 255. * np.array([0.485, 0.456, 0.406])
DEFAULT_STD = 255. * np.array([0.229, 0.224, 0.225])


class ViTDetDataset(torch.utils.data.Dataset):

    def __init__(self,
                 cfg: CfgNode,
                 img_cv2: np.ndarray,
                 boxes: np.ndarray,
                 right: np.ndarray,
                 rescale_factor: float = 2.5,
                 train: bool = False,
                 **kwargs):
        super().__init__()
        self.cfg = cfg
        self.img_cv2 = img_cv2

        assert not train, "ViTDetDataset is only for inference"
        self.train = train
        self.img_size = cfg.MODEL.IMAGE_SIZE
        self.mean = 255. * np.array(self.cfg.MODEL.IMAGE_MEAN)
        self.std = 255. * np.array(self.cfg.MODEL.IMAGE_STD)

        # Preprocess annotations: boxes are (x1, y1, x2, y2) in pixels.
        # Centers are box midpoints; scales follow the 200-pixel convention,
        # enlarged by rescale_factor to add context around each detection.
        boxes = boxes.astype(np.float32)
        self.center = (boxes[:, 2:4] + boxes[:, 0:2]) / 2.0
        self.scale = rescale_factor * (boxes[:, 2:4] - boxes[:, 0:2]) / 200.0
        self.personid = np.arange(len(boxes), dtype=np.int32)
        self.right = right.astype(np.float32)

    def __len__(self) -> int:
        return len(self.personid)

    def __getitem__(self, idx: int) -> Dict[str, np.ndarray]:
        center = self.center[idx].copy()
        center_x = center[0]
        center_y = center[1]

        scale = self.scale[idx]
        BBOX_SHAPE = self.cfg.MODEL.get('BBOX_SHAPE', None)
        bbox_size = expand_to_aspect_ratio(scale * 200,
                                           target_aspect_ratio=BBOX_SHAPE).max()

        patch_width = patch_height = self.img_size

        # A "right" value of 0 means the detection is a left side, which is
        # handled by horizontally flipping the patch.
        right = self.right[idx].copy()
        flip = right == 0

        # Generate the image patch. When the crop is downsampled significantly,
        # blur the source image first to avoid aliasing artifacts.
        cvimg = self.img_cv2.copy()
        downsampling_factor = (bbox_size * 1.0) / patch_width / 2.0
        if downsampling_factor > 1.1:
            cvimg = gaussian(cvimg,
                             sigma=(downsampling_factor - 1) / 2,
                             channel_axis=2,
                             preserve_range=True)

        img_patch_cv, trans = generate_image_patch_cv2(cvimg,
                                                       center_x, center_y,
                                                       bbox_size, bbox_size,
                                                       patch_width, patch_height,
                                                       flip, 1.0, 0,
                                                       border_mode=cv2.BORDER_CONSTANT)
        img_patch_cv = img_patch_cv[:, :, ::-1]  # BGR -> RGB
        img_patch = convert_cvimg_to_tensor(img_patch_cv)

        # Apply per-channel normalization.
        for n_c in range(min(self.img_cv2.shape[2], 3)):
            img_patch[n_c, :, :] = (img_patch[n_c, :, :] - self.mean[n_c]) / self.std[n_c]

        item = {
            'img': img_patch,
            'personid': int(self.personid[idx]),
            'box_center': self.center[idx].copy(),
            'box_size': bbox_size,
            'img_size': 1.0 * np.array([cvimg.shape[1], cvimg.shape[0]]),
            'right': self.right[idx].copy(),
        }
        return item
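
# Usage sketch (illustrative only): how this dataset is typically consumed at
# inference time. The cfg keys mirror the defaults above; the image file, box
# values, and the downstream `model` call are hypothetical stand-ins, not part
# of this module.
#
#     cfg = CfgNode({'MODEL': {'IMAGE_SIZE': 256,
#                              'IMAGE_MEAN': [0.485, 0.456, 0.406],
#                              'IMAGE_STD': [0.229, 0.224, 0.225]}})
#     img_cv2 = cv2.imread('frame.jpg')                # HxWx3 BGR image
#     boxes = np.array([[100., 50., 300., 400.]])      # (N, 4) x1, y1, x2, y2
#     right = np.array([1.0])                          # 1 = right, 0 = left (flipped)
#
#     dataset = ViTDetDataset(cfg, img_cv2, boxes, right)
#     loader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False)
#     for batch in loader:
#         out = model(batch['img'])  # `model` is a hypothetical network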