# HaMeR / hamer/datasets/vitdet_dataset.py
# Last commit: "Initial commit" (d7a991a) by geopavlakos
from typing import Dict
import cv2
import numpy as np
from skimage.filters import gaussian
from yacs.config import CfgNode
import torch
from .utils import (convert_cvimg_to_tensor,
expand_to_aspect_ratio,
generate_image_patch_cv2)
# Per-channel mean / standard deviation scaled to the 0-255 pixel range.
# The class visible in this file does not reference these constants; it reads
# its normalization statistics from cfg.MODEL.IMAGE_MEAN / IMAGE_STD instead.
# NOTE(review): the values look like the usual ImageNet RGB statistics - confirm.
DEFAULT_MEAN = 255. * np.array([0.485, 0.456, 0.406])
DEFAULT_STD = 255. * np.array([0.229, 0.224, 0.225])
class ViTDetDataset(torch.utils.data.Dataset):
    """Inference-only dataset that yields one normalized square crop per box.

    A single image and a set of boxes are supplied up front; item ``i`` is the
    crop around box ``i``, resized to ``cfg.MODEL.IMAGE_SIZE`` and normalized
    with ``cfg.MODEL.IMAGE_MEAN`` / ``cfg.MODEL.IMAGE_STD`` (both scaled by 255).
    """

    def __init__(self,
                 cfg: CfgNode,
                 img_cv2: np.array,
                 boxes: np.array,
                 right: np.array,
                 rescale_factor=2.5,
                 train: bool = False,
                 **kwargs):
        """Store the image and precompute per-box center and scale.

        Args:
            cfg: Config; ``MODEL.IMAGE_SIZE``, ``MODEL.IMAGE_MEAN`` and
                ``MODEL.IMAGE_STD`` are read here, ``MODEL.BBOX_SHAPE``
                (optional) is read when an item is fetched.
            img_cv2: Image with a channel axis at index 2
                (presumably BGR as loaded by OpenCV - confirm with the caller).
            boxes: Array-like of shape (N, >=4). Columns 0-1 and 2-3 are the
                (x, y) coordinates of two opposite box corners; columns 2-3
                are assumed to be the larger corner so the extent is positive.
                An empty input produces an empty dataset.
            right: Array-like of length N. Entries equal to 0 make the crop
                horizontally flipped (presumably marking left hands - confirm).
            rescale_factor: Multiplier applied to the box extent before cropping.
            train: Must be falsy; this dataset only supports inference.
            **kwargs: Accepted for call-site compatibility and ignored.
        """
        super().__init__()
        self.cfg = cfg
        self.img_cv2 = img_cv2

        assert train == False, "ViTDetDataset is only for inference"
        self.train = train
        self.img_size = cfg.MODEL.IMAGE_SIZE
        self.mean = 255. * np.array(self.cfg.MODEL.IMAGE_MEAN)
        self.std = 255. * np.array(self.cfg.MODEL.IMAGE_STD)

        # np.array (rather than .astype) also accepts plain Python lists and
        # still copies, so the caller's arrays are never aliased.
        boxes = np.array(boxes, dtype=np.float32)
        if boxes.size == 0:
            # No detections: keep a 2-D shape so the column slicing below works.
            boxes = boxes.reshape(0, 4)
        corner_a = boxes[:, 0:2]
        corner_b = boxes[:, 2:4]

        self.center = (corner_b + corner_a) / 2.0
        # Scale is stored in units of 200 pixels; __getitem__ multiplies it back.
        self.scale = rescale_factor * (corner_b - corner_a) / 200.0
        self.personid = np.arange(len(boxes), dtype=np.int32)
        self.right = np.array(right, dtype=np.float32)

    def __len__(self) -> int:
        """Return the number of boxes (one item per box)."""
        return len(self.personid)

    def __getitem__(self, idx: int) -> Dict[str, np.array]:
        """Crop, resize and normalize the patch for box ``idx``.

        Returns:
            A dict with keys ``'img'`` (normalized patch), ``'personid'``
            (int index of the box), ``'box_center'``, ``'box_size'`` (side of
            the square crop in source pixels), ``'img_size'`` (width, height of
            the source image) and ``'right'``.
        """
        center = self.center[idx].copy()
        center_x, center_y = center[0], center[1]

        scale = self.scale[idx]
        bbox_shape = self.cfg.MODEL.get('BBOX_SHAPE', None)
        # Side length of the square crop: the larger side after the box has
        # been expanded to the configured aspect ratio.
        bbox_size = expand_to_aspect_ratio(scale * 200, target_aspect_ratio=bbox_shape).max()

        patch_width = patch_height = self.img_size

        right = self.right[idx].copy()
        flip = right == 0

        cvimg = self.img_cv2.copy()

        # Blur the image before a strong downsample to avoid aliasing artifacts.
        downsampling_factor = ((bbox_size * 1.0) / patch_width) / 2.0
        if downsampling_factor > 1.1:
            cvimg = gaussian(cvimg,
                             sigma=(downsampling_factor - 1) / 2,
                             channel_axis=2,
                             preserve_range=True)

        img_patch_cv, trans = generate_image_patch_cv2(cvimg,
                                                       center_x, center_y,
                                                       bbox_size, bbox_size,
                                                       patch_width, patch_height,
                                                       flip, 1.0, 0,
                                                       border_mode=cv2.BORDER_CONSTANT)
        # Reverse the channel axis (presumably BGR -> RGB for an OpenCV image).
        img_patch_cv = img_patch_cv[:, :, ::-1]
        img_patch = convert_cvimg_to_tensor(img_patch_cv)

        # Per-channel normalization, limited to the first three channels.
        for n_c in range(min(self.img_cv2.shape[2], 3)):
            img_patch[n_c, :, :] = (img_patch[n_c, :, :] - self.mean[n_c]) / self.std[n_c]

        item = {
            'img': img_patch,
            'personid': int(self.personid[idx]),
            'box_center': self.center[idx].copy(),
            'box_size': bbox_size,
            # (width, height) of the source image as floats.
            'img_size': 1.0 * np.array([cvimg.shape[1], cvimg.shape[0]]),
            'right': self.right[idx].copy(),
        }
        return item