# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from typing import Optional, Union

import cv2
import mmcv
import numpy as np
from mmcv.transforms import BaseTransform
from mmcv.transforms.utils import cache_randomness

from mmdet.registry import TRANSFORMS
from mmdet.structures.bbox import autocast_box_type
from .augment_wrappers import _MAX_LEVEL, level_to_mag


@TRANSFORMS.register_module()
class GeomTransform(BaseTransform):
    """Base class for geometric transformations. All geometric
    transformations need to inherit from this base class. ``GeomTransform``
    unifies the class attributes and class functions of geometric
    transformations (ShearX, ShearY, Rotate, TranslateX, and TranslateY),
    and records the homography matrix.

    Required Keys:

    - img
    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
    - gt_masks (BitmapMasks | PolygonMasks) (optional)
    - gt_seg_map (np.uint8) (optional)

    Modified Keys:

    - img
    - gt_bboxes
    - gt_masks
    - gt_seg_map

    Added Keys:

    - homography_matrix

    Args:
        prob (float): The probability for performing the geometric
            transformation and should be in range [0, 1]. Defaults to 1.0.
        level (int, optional): The level should be in range [0, _MAX_LEVEL].
            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
            Defaults to None.
        min_mag (float): The minimum magnitude for geometric transformation.
            Defaults to 0.0.
        max_mag (float): The maximum magnitude for geometric transformation.
            Defaults to 1.0.
        reversal_prob (float): The probability that reverses the geometric
            transformation magnitude. Should be in range [0,1].
            Defaults to 0.5.
        img_border_value (int | float | tuple): The filled values for
            image border. If float, the same fill value will be used for
            all the three channels of image. If tuple, it should be 3 elements.
            Defaults to 128.
        mask_border_value (int): The fill value used for masks. Defaults to 0.
        seg_ignore_label (int): The fill value used for segmentation map.
            Note this value must equals ``ignore_label`` in ``semantic_head``
            of the corresponding config. Defaults to 255.
        interpolation (str): Interpolation method, accepted values are
            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
            to 'bilinear'.
    """

    def __init__(self,
                 prob: float = 1.0,
                 level: Optional[int] = None,
                 min_mag: float = 0.0,
                 max_mag: float = 1.0,
                 reversal_prob: float = 0.5,
                 img_border_value: Union[int, float, tuple] = 128,
                 mask_border_value: int = 0,
                 seg_ignore_label: int = 255,
                 interpolation: str = 'bilinear') -> None:
        assert 0 <= prob <= 1.0, f'The probability of the transformation ' \
                                 f'should be in range [0,1], got {prob}.'
        assert level is None or isinstance(level, int), \
            f'The level should be None or type int, got {type(level)}.'
        assert level is None or 0 <= level <= _MAX_LEVEL, \
            f'The level should be in range [0,{_MAX_LEVEL}], got {level}.'
        assert isinstance(min_mag, float), \
            f'min_mag should be type float, got {type(min_mag)}.'
        assert isinstance(max_mag, float), \
            f'max_mag should be type float, got {type(max_mag)}.'
        assert min_mag <= max_mag, \
            f'min_mag should smaller than max_mag, ' \
            f'got min_mag={min_mag} and max_mag={max_mag}'
        # Report the type of the argument that actually failed the check.
        assert isinstance(reversal_prob, float), \
            f'reversal_prob should be type float, ' \
            f'got {type(reversal_prob)}.'
        # This is a range check, so report the range and offending value.
        assert 0 <= reversal_prob <= 1.0, \
            f'The reversal probability of the transformation magnitude ' \
            f'should be in range [0,1], got {reversal_prob}.'
        # Normalise the border value to a 3-tuple of floats so that the
        # range check below and the subclasses see one representation.
        if isinstance(img_border_value, (float, int)):
            img_border_value = tuple([float(img_border_value)] * 3)
        elif isinstance(img_border_value, tuple):
            assert len(img_border_value) == 3, \
                f'img_border_value as tuple must have 3 elements, ' \
                f'got {len(img_border_value)}.'
            img_border_value = tuple([float(val) for val in img_border_value])
        else:
            raise ValueError(
                'img_border_value must be float or tuple with 3 elements.')
        assert np.all([0 <= val <= 255 for val in img_border_value]), 'all ' \
            'elements of img_border_value should between range [0,255]. ' \
            f'got {img_border_value}.'
        self.prob = prob
        self.level = level
        self.min_mag = min_mag
        self.max_mag = max_mag
        self.reversal_prob = reversal_prob
        self.img_border_value = img_border_value
        self.mask_border_value = mask_border_value
        self.seg_ignore_label = seg_ignore_label
        self.interpolation = interpolation

    def _transform_img(self, results: dict, mag: float) -> None:
        """Transform the image.

        No-op in the base class; subclasses overwrite ``results['img']``.
        """
        pass

    def _transform_masks(self, results: dict, mag: float) -> None:
        """Transform the masks.

        No-op in the base class; subclasses overwrite ``results['gt_masks']``.
        """
        pass

    def _transform_seg(self, results: dict, mag: float) -> None:
        """Transform the segmentation map.

        No-op in the base class; subclasses overwrite
        ``results['gt_seg_map']``.
        """
        pass

    def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray:
        """Get the homography matrix for the geometric transformation.

        The base class returns the 3x3 float32 identity matrix.
        """
        return np.eye(3, dtype=np.float32)

    def _transform_bboxes(self, results: dict, mag: float) -> None:
        """Transform the bboxes.

        Projects ``results['gt_bboxes']`` in place with the homography
        matrix stored by :meth:`transform`, then clips the boxes in place
        to ``results['img_shape']``.
        """
        results['gt_bboxes'].project_(self.homography_matrix)
        results['gt_bboxes'].clip_(results['img_shape'])

    def _record_homography_matrix(self, results: dict) -> None:
        """Record the homography matrix for the geometric transformation.

        If ``results`` has no (or a ``None``) ``'homography_matrix'`` entry
        the current matrix is stored as is; otherwise the current matrix is
        left-multiplied onto the existing one.
        """
        if results.get('homography_matrix', None) is None:
            results['homography_matrix'] = self.homography_matrix
        else:
            results['homography_matrix'] = self.homography_matrix @ results[
                'homography_matrix']

    @cache_randomness
    def _random_disable(self):
        """Randomly disable the transform.

        Returns:
            bool: True when a draw from ``np.random.rand()`` is greater than
            ``self.prob``, in which case the transform is skipped.
        """
        return np.random.rand() > self.prob

    @cache_randomness
    def _get_mag(self):
        """Get the magnitude of the transform.

        Returns the value of ``level_to_mag(level, min_mag, max_mag)``,
        negated when a draw from ``np.random.rand()`` is greater than
        ``self.reversal_prob``.
        """
        mag = level_to_mag(self.level, self.min_mag, self.max_mag)
        # NOTE(review): the sign is flipped when the draw EXCEEDS
        # reversal_prob, so e.g. reversal_prob=1.0 never flips. This looks
        # inverted relative to the documented meaning of reversal_prob but
        # is kept because existing configs may rely on it -- confirm.
        return -mag if np.random.rand() > self.reversal_prob else mag

    @autocast_box_type()
    def transform(self, results: dict) -> dict:
        """Transform function for images, bounding boxes, masks and semantic
        segmentation map.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Transformed results.
        """

        if self._random_disable():
            return results
        mag = self._get_mag()
        # The matrix is kept on the instance because _transform_bboxes and
        # _record_homography_matrix read it from ``self``.
        self.homography_matrix = self._get_homography_matrix(results, mag)
        self._record_homography_matrix(results)
        self._transform_img(results, mag)
        if results.get('gt_bboxes', None) is not None:
            self._transform_bboxes(results, mag)
        if results.get('gt_masks', None) is not None:
            self._transform_masks(results, mag)
        if results.get('gt_seg_map', None) is not None:
            self._transform_seg(results, mag)
        return results

    def __repr__(self) -> str:
        """Return the class name followed by all constructor arguments."""
        repr_str = self.__class__.__name__
        repr_str += f'(prob={self.prob}, '
        repr_str += f'level={self.level}, '
        repr_str += f'min_mag={self.min_mag}, '
        repr_str += f'max_mag={self.max_mag}, '
        repr_str += f'reversal_prob={self.reversal_prob}, '
        repr_str += f'img_border_value={self.img_border_value}, '
        repr_str += f'mask_border_value={self.mask_border_value}, '
        repr_str += f'seg_ignore_label={self.seg_ignore_label}, '
        repr_str += f'interpolation={self.interpolation})'
        return repr_str


@TRANSFORMS.register_module()
class ShearX(GeomTransform):
    """Shear the images, bboxes, masks and segmentation map horizontally.

    Required Keys:

    - img
    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
    - gt_masks (BitmapMasks | PolygonMasks) (optional)
    - gt_seg_map (np.uint8) (optional)

    Modified Keys:

    - img
    - gt_bboxes
    - gt_masks
    - gt_seg_map

    Added Keys:

    - homography_matrix

    Args:
        prob (float): The probability for performing Shear and should be in
            range [0, 1]. Defaults to 1.0.
        level (int, optional): The level should be in range [0, _MAX_LEVEL].
            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
            Defaults to None.
        min_mag (float): The minimum angle for the horizontal shear.
            Defaults to 0.0.
        max_mag (float): The maximum angle for the horizontal shear.
            Defaults to 30.0.
        reversal_prob (float): The probability that reverses the horizontal
            shear magnitude. Should be in range [0,1]. Defaults to 0.5.
        img_border_value (int | float | tuple): The filled values for
            image border. If float, the same fill value will be used for
            all the three channels of image. If tuple, it should be 3 elements.
            Defaults to 128.
        mask_border_value (int): The fill value used for masks. Defaults to 0.
        seg_ignore_label (int): The fill value used for segmentation map.
            Note this value must equals ``ignore_label`` in ``semantic_head``
            of the corresponding config. Defaults to 255.
        interpolation (str): Interpolation method, accepted values are
            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
            to 'bilinear'.
    """

    def __init__(self,
                 prob: float = 1.0,
                 level: Optional[int] = None,
                 min_mag: float = 0.0,
                 max_mag: float = 30.0,
                 reversal_prob: float = 0.5,
                 img_border_value: Union[int, float, tuple] = 128,
                 mask_border_value: int = 0,
                 seg_ignore_label: int = 255,
                 interpolation: str = 'bilinear') -> None:
        assert 0. <= min_mag <= 90., \
            f'min_mag angle for ShearX should be ' \
            f'in range [0, 90], got {min_mag}.'
        assert 0. <= max_mag <= 90., \
            f'max_mag angle for ShearX should be ' \
            f'in range [0, 90], got {max_mag}.'
        super().__init__(
            prob=prob,
            level=level,
            min_mag=min_mag,
            max_mag=max_mag,
            reversal_prob=reversal_prob,
            img_border_value=img_border_value,
            mask_border_value=mask_border_value,
            seg_ignore_label=seg_ignore_label,
            interpolation=interpolation)

    @cache_randomness
    def _get_mag(self):
        """Get the magnitude of the transform.

        The value from ``level_to_mag`` is treated as an angle in degrees
        and converted to its tangent; the result is negated when a draw
        from ``np.random.rand()`` is greater than ``self.reversal_prob``.
        """
        mag = level_to_mag(self.level, self.min_mag, self.max_mag)
        mag = np.tan(mag * np.pi / 180)
        # NOTE(review): flips when the draw EXCEEDS reversal_prob; see the
        # documented meaning of reversal_prob -- confirm this is intended.
        return -mag if np.random.rand() > self.reversal_prob else mag

    def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray:
        """Get the homography matrix for ShearX."""
        return np.array([[1, mag, 0], [0, 1, 0], [0, 0, 1]], dtype=np.float32)

    def _transform_img(self, results: dict, mag: float) -> None:
        """Shear the image horizontally."""
        results['img'] = mmcv.imshear(
            results['img'],
            mag,
            direction='horizontal',
            border_value=self.img_border_value,
            interpolation=self.interpolation)

    def _transform_masks(self, results: dict, mag: float) -> None:
        """Shear the masks horizontally."""
        results['gt_masks'] = results['gt_masks'].shear(
            results['img_shape'],
            mag,
            direction='horizontal',
            border_value=self.mask_border_value,
            interpolation=self.interpolation)

    def _transform_seg(self, results: dict, mag: float) -> None:
        """Shear the segmentation map horizontally.

        Always uses 'nearest' interpolation and fills the border with
        ``self.seg_ignore_label``.
        """
        results['gt_seg_map'] = mmcv.imshear(
            results['gt_seg_map'],
            mag,
            direction='horizontal',
            border_value=self.seg_ignore_label,
            interpolation='nearest')


@TRANSFORMS.register_module()
class ShearY(GeomTransform):
    """Shear the images, bboxes, masks and segmentation map vertically.

    Required Keys:

    - img
    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
    - gt_masks (BitmapMasks | PolygonMasks) (optional)
    - gt_seg_map (np.uint8) (optional)

    Modified Keys:

    - img
    - gt_bboxes
    - gt_masks
    - gt_seg_map

    Added Keys:

    - homography_matrix

    Args:
        prob (float): The probability for performing ShearY and should be in
            range [0, 1]. Defaults to 1.0.
        level (int, optional): The level should be in range [0,_MAX_LEVEL].
            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
            Defaults to None.
        min_mag (float): The minimum angle for the vertical shear.
            Defaults to 0.0.
        max_mag (float): The maximum angle for the vertical shear.
            Defaults to 30.0.
        reversal_prob (float): The probability that reverses the vertical
            shear magnitude. Should be in range [0,1]. Defaults to 0.5.
        img_border_value (int | float | tuple): The filled values for
            image border. If float, the same fill value will be used for
            all the three channels of image. If tuple, it should be 3 elements.
            Defaults to 128.
        mask_border_value (int): The fill value used for masks. Defaults to 0.
        seg_ignore_label (int): The fill value used for segmentation map.
            Note this value must equals ``ignore_label`` in ``semantic_head``
            of the corresponding config. Defaults to 255.
        interpolation (str): Interpolation method, accepted values are
            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
            to 'bilinear'.
    """

    def __init__(self,
                 prob: float = 1.0,
                 level: Optional[int] = None,
                 min_mag: float = 0.0,
                 max_mag: float = 30.,
                 reversal_prob: float = 0.5,
                 img_border_value: Union[int, float, tuple] = 128,
                 mask_border_value: int = 0,
                 seg_ignore_label: int = 255,
                 interpolation: str = 'bilinear') -> None:
        assert 0. <= min_mag <= 90., \
            f'min_mag angle for ShearY should be ' \
            f'in range [0, 90], got {min_mag}.'
        assert 0. <= max_mag <= 90., \
            f'max_mag angle for ShearY should be ' \
            f'in range [0, 90], got {max_mag}.'
        super().__init__(
            prob=prob,
            level=level,
            min_mag=min_mag,
            max_mag=max_mag,
            reversal_prob=reversal_prob,
            img_border_value=img_border_value,
            mask_border_value=mask_border_value,
            seg_ignore_label=seg_ignore_label,
            interpolation=interpolation)

    @cache_randomness
    def _get_mag(self):
        """Get the magnitude of the transform.

        The value from ``level_to_mag`` is treated as an angle in degrees
        and converted to its tangent; the result is negated when a draw
        from ``np.random.rand()`` is greater than ``self.reversal_prob``.
        """
        mag = level_to_mag(self.level, self.min_mag, self.max_mag)
        mag = np.tan(mag * np.pi / 180)
        # NOTE(review): flips when the draw EXCEEDS reversal_prob; see the
        # documented meaning of reversal_prob -- confirm this is intended.
        return -mag if np.random.rand() > self.reversal_prob else mag

    def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray:
        """Get the homography matrix for ShearY."""
        return np.array([[1, 0, 0], [mag, 1, 0], [0, 0, 1]], dtype=np.float32)

    def _transform_img(self, results: dict, mag: float) -> None:
        """Shear the image vertically."""
        results['img'] = mmcv.imshear(
            results['img'],
            mag,
            direction='vertical',
            border_value=self.img_border_value,
            interpolation=self.interpolation)

    def _transform_masks(self, results: dict, mag: float) -> None:
        """Shear the masks vertically."""
        results['gt_masks'] = results['gt_masks'].shear(
            results['img_shape'],
            mag,
            direction='vertical',
            border_value=self.mask_border_value,
            interpolation=self.interpolation)

    def _transform_seg(self, results: dict, mag: float) -> None:
        """Shear the segmentation map vertically.

        Always uses 'nearest' interpolation and fills the border with
        ``self.seg_ignore_label``.
        """
        results['gt_seg_map'] = mmcv.imshear(
            results['gt_seg_map'],
            mag,
            direction='vertical',
            border_value=self.seg_ignore_label,
            interpolation='nearest')


@TRANSFORMS.register_module()
class Rotate(GeomTransform):
    """Rotate the images, bboxes, masks and segmentation map.

    Required Keys:

    - img
    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
    - gt_masks (BitmapMasks | PolygonMasks) (optional)
    - gt_seg_map (np.uint8) (optional)

    Modified Keys:

    - img
    - gt_bboxes
    - gt_masks
    - gt_seg_map

    Added Keys:

    - homography_matrix

    Args:
        prob (float): The probability for perform transformation and
            should be in range 0 to 1. Defaults to 1.0.
        level (int, optional): The level should be in range [0, _MAX_LEVEL].
            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
            Defaults to None.
        min_mag (float): The minimum angle for rotation.
            Defaults to 0.0.
        max_mag (float): The maximum angle for rotation.
            Defaults to 30.0.
        reversal_prob (float): The probability that reverses the rotation
            magnitude. Should be in range [0,1]. Defaults to 0.5.
        img_border_value (int | float | tuple): The filled values for
            image border. If float, the same fill value will be used for
            all the three channels of image. If tuple, it should be 3 elements.
            Defaults to 128.
        mask_border_value (int): The fill value used for masks. Defaults to 0.
        seg_ignore_label (int): The fill value used for segmentation map.
            Note this value must equals ``ignore_label`` in ``semantic_head``
            of the corresponding config. Defaults to 255.
        interpolation (str): Interpolation method, accepted values are
            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
            to 'bilinear'.
    """

    def __init__(self,
                 prob: float = 1.0,
                 level: Optional[int] = None,
                 min_mag: float = 0.0,
                 max_mag: float = 30.0,
                 reversal_prob: float = 0.5,
                 img_border_value: Union[int, float, tuple] = 128,
                 mask_border_value: int = 0,
                 seg_ignore_label: int = 255,
                 interpolation: str = 'bilinear') -> None:
        assert 0. <= min_mag <= 180., \
            f'min_mag for Rotate should be in range [0,180], got {min_mag}.'
        assert 0. <= max_mag <= 180., \
            f'max_mag for Rotate should be in range [0,180], got {max_mag}.'
        super().__init__(
            prob=prob,
            level=level,
            min_mag=min_mag,
            max_mag=max_mag,
            reversal_prob=reversal_prob,
            img_border_value=img_border_value,
            mask_border_value=mask_border_value,
            seg_ignore_label=seg_ignore_label,
            interpolation=interpolation)

    def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray:
        """Get the homography matrix for Rotate.

        Builds the 2x3 OpenCV rotation matrix about the image centre
        (scale 1.0) and appends the row ``[0, 0, 1]`` to make it 3x3.
        """
        img_shape = results['img_shape']
        center = ((img_shape[1] - 1) * 0.5, (img_shape[0] - 1) * 0.5)
        # NOTE(review): the angle is negated here while mmcv.imrotate below
        # receives ``mag`` unchanged -- presumably mmcv negates the angle
        # internally so both describe the same rotation; confirm.
        cv2_rotation_matrix = cv2.getRotationMatrix2D(center, -mag, 1.0)
        return np.concatenate(
            [cv2_rotation_matrix,
             np.array([0, 0, 1]).reshape((1, 3))]).astype(np.float32)

    def _transform_img(self, results: dict, mag: float) -> None:
        """Rotate the image."""
        results['img'] = mmcv.imrotate(
            results['img'],
            mag,
            border_value=self.img_border_value,
            interpolation=self.interpolation)

    def _transform_masks(self, results: dict, mag: float) -> None:
        """Rotate the masks."""
        results['gt_masks'] = results['gt_masks'].rotate(
            results['img_shape'],
            mag,
            border_value=self.mask_border_value,
            interpolation=self.interpolation)

    def _transform_seg(self, results: dict, mag: float) -> None:
        """Rotate the segmentation map.

        Always uses 'nearest' interpolation and fills the border with
        ``self.seg_ignore_label``.
        """
        results['gt_seg_map'] = mmcv.imrotate(
            results['gt_seg_map'],
            mag,
            border_value=self.seg_ignore_label,
            interpolation='nearest')


@TRANSFORMS.register_module()
class TranslateX(GeomTransform):
    """Translate the images, bboxes, masks and segmentation map horizontally.

    Required Keys:

    - img
    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
    - gt_masks (BitmapMasks | PolygonMasks) (optional)
    - gt_seg_map (np.uint8) (optional)

    Modified Keys:

    - img
    - gt_bboxes
    - gt_masks
    - gt_seg_map

    Added Keys:

    - homography_matrix

    Args:
        prob (float): The probability for perform transformation and
            should be in range 0 to 1. Defaults to 1.0.
        level (int, optional): The level should be in range [0, _MAX_LEVEL].
            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
            Defaults to None.
        min_mag (float): The minimum pixel's offset ratio for horizontal
            translation. Defaults to 0.0.
        max_mag (float): The maximum pixel's offset ratio for horizontal
            translation. Defaults to 0.1.
        reversal_prob (float): The probability that reverses the horizontal
            translation magnitude. Should be in range [0,1]. Defaults to 0.5.
        img_border_value (int | float | tuple): The filled values for
            image border. If float, the same fill value will be used for
            all the three channels of image. If tuple, it should be 3 elements.
            Defaults to 128.
        mask_border_value (int): The fill value used for masks. Defaults to 0.
        seg_ignore_label (int): The fill value used for segmentation map.
            Note this value must equals ``ignore_label`` in ``semantic_head``
            of the corresponding config. Defaults to 255.
        interpolation (str): Interpolation method, accepted values are
            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
            to 'bilinear'.
    """

    def __init__(self,
                 prob: float = 1.0,
                 level: Optional[int] = None,
                 min_mag: float = 0.0,
                 max_mag: float = 0.1,
                 reversal_prob: float = 0.5,
                 img_border_value: Union[int, float, tuple] = 128,
                 mask_border_value: int = 0,
                 seg_ignore_label: int = 255,
                 interpolation: str = 'bilinear') -> None:
        assert 0. <= min_mag <= 1., \
            f'min_mag ratio for TranslateX should be ' \
            f'in range [0, 1], got {min_mag}.'
        assert 0. <= max_mag <= 1., \
            f'max_mag ratio for TranslateX should be ' \
            f'in range [0, 1], got {max_mag}.'
        super().__init__(
            prob=prob,
            level=level,
            min_mag=min_mag,
            max_mag=max_mag,
            reversal_prob=reversal_prob,
            img_border_value=img_border_value,
            mask_border_value=mask_border_value,
            seg_ignore_label=seg_ignore_label,
            interpolation=interpolation)

    def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray:
        """Get the homography matrix for TranslateX.

        The ratio ``mag`` is scaled by ``results['img_shape'][1]`` and
        truncated with ``int`` to obtain the offset placed in the matrix.
        """
        mag = int(results['img_shape'][1] * mag)
        return np.array([[1, 0, mag], [0, 1, 0], [0, 0, 1]], dtype=np.float32)

    def _transform_img(self, results: dict, mag: float) -> None:
        """Translate the image horizontally."""
        # Same ratio-to-integer-offset conversion as the homography matrix.
        mag = int(results['img_shape'][1] * mag)
        results['img'] = mmcv.imtranslate(
            results['img'],
            mag,
            direction='horizontal',
            border_value=self.img_border_value,
            interpolation=self.interpolation)

    def _transform_masks(self, results: dict, mag: float) -> None:
        """Translate the masks horizontally."""
        mag = int(results['img_shape'][1] * mag)
        results['gt_masks'] = results['gt_masks'].translate(
            results['img_shape'],
            mag,
            direction='horizontal',
            border_value=self.mask_border_value,
            interpolation=self.interpolation)

    def _transform_seg(self, results: dict, mag: float) -> None:
        """Translate the segmentation map horizontally.

        Always uses 'nearest' interpolation and fills the border with
        ``self.seg_ignore_label``.
        """
        mag = int(results['img_shape'][1] * mag)
        results['gt_seg_map'] = mmcv.imtranslate(
            results['gt_seg_map'],
            mag,
            direction='horizontal',
            border_value=self.seg_ignore_label,
            interpolation='nearest')


@TRANSFORMS.register_module()
class TranslateY(GeomTransform):
    """Translate the images, bboxes, masks and segmentation map vertically.

    Required Keys:

    - img
    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
    - gt_masks (BitmapMasks | PolygonMasks) (optional)
    - gt_seg_map (np.uint8) (optional)

    Modified Keys:

    - img
    - gt_bboxes
    - gt_masks
    - gt_seg_map

    Added Keys:

    - homography_matrix

    Args:
        prob (float): The probability for perform transformation and
            should be in range 0 to 1. Defaults to 1.0.
        level (int, optional): The level should be in range [0, _MAX_LEVEL].
            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
            Defaults to None.
        min_mag (float): The minimum pixel's offset ratio for vertical
            translation. Defaults to 0.0.
        max_mag (float): The maximum pixel's offset ratio for vertical
            translation. Defaults to 0.1.
        reversal_prob (float): The probability that reverses the vertical
            translation magnitude. Should be in range [0,1]. Defaults to 0.5.
        img_border_value (int | float | tuple): The filled values for
            image border. If float, the same fill value will be used for
            all the three channels of image. If tuple, it should be 3 elements.
            Defaults to 128.
        mask_border_value (int): The fill value used for masks. Defaults to 0.
        seg_ignore_label (int): The fill value used for segmentation map.
            Note this value must equals ``ignore_label`` in ``semantic_head``
            of the corresponding config. Defaults to 255.
        interpolation (str): Interpolation method, accepted values are
            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
            to 'bilinear'.
    """

    def __init__(self,
                 prob: float = 1.0,
                 level: Optional[int] = None,
                 min_mag: float = 0.0,
                 max_mag: float = 0.1,
                 reversal_prob: float = 0.5,
                 img_border_value: Union[int, float, tuple] = 128,
                 mask_border_value: int = 0,
                 seg_ignore_label: int = 255,
                 interpolation: str = 'bilinear') -> None:
        assert 0. <= min_mag <= 1., \
            f'min_mag ratio for TranslateY should be ' \
            f'in range [0,1], got {min_mag}.'
        assert 0. <= max_mag <= 1., \
            f'max_mag ratio for TranslateY should be ' \
            f'in range [0,1], got {max_mag}.'
        super().__init__(
            prob=prob,
            level=level,
            min_mag=min_mag,
            max_mag=max_mag,
            reversal_prob=reversal_prob,
            img_border_value=img_border_value,
            mask_border_value=mask_border_value,
            seg_ignore_label=seg_ignore_label,
            interpolation=interpolation)

    def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray:
        """Get the homography matrix for TranslateY.

        The ratio ``mag`` is scaled by ``results['img_shape'][0]`` and
        truncated with ``int`` to obtain the offset placed in the matrix.
        """
        mag = int(results['img_shape'][0] * mag)
        return np.array([[1, 0, 0], [0, 1, mag], [0, 0, 1]], dtype=np.float32)

    def _transform_img(self, results: dict, mag: float) -> None:
        """Translate the image vertically."""
        # Same ratio-to-integer-offset conversion as the homography matrix.
        mag = int(results['img_shape'][0] * mag)
        results['img'] = mmcv.imtranslate(
            results['img'],
            mag,
            direction='vertical',
            border_value=self.img_border_value,
            interpolation=self.interpolation)

    def _transform_masks(self, results: dict, mag: float) -> None:
        """Translate masks vertically."""
        mag = int(results['img_shape'][0] * mag)
        results['gt_masks'] = results['gt_masks'].translate(
            results['img_shape'],
            mag,
            direction='vertical',
            border_value=self.mask_border_value,
            interpolation=self.interpolation)

    def _transform_seg(self, results: dict, mag: float) -> None:
        """Translate segmentation map vertically.

        Always uses 'nearest' interpolation and fills the border with
        ``self.seg_ignore_label``.
        """
        mag = int(results['img_shape'][0] * mag)
        results['gt_seg_map'] = mmcv.imtranslate(
            results['gt_seg_map'],
            mag,
            direction='vertical',
            border_value=self.seg_ignore_label,
            interpolation='nearest')