# Copyright (c) Facebook, Inc. and its affiliates.
import torch
from torch.nn import functional as F

from detectron2.structures import Instances, ROIMasks


# perhaps should rename to "resize_instance"
def detector_postprocess(
    results: Instances, output_height: int, output_width: int, mask_threshold: float = 0.5
):
    """
    Resize the output instances.
    The input images are often resized when entering an object detector.
    As a result, we often need the outputs of the detector in a different
    resolution from its inputs.

    This function will resize the raw outputs of an R-CNN detector
    to produce outputs according to the desired output resolution.

    Args:
        results (Instances): the raw outputs from the detector.
            `results.image_size` contains the input image resolution the detector sees.
            This object might be modified in-place.
        output_height, output_width: the desired output resolution.

    Returns:
        Instances: the resized output from the model, based on the output resolution
    """
    if isinstance(output_width, torch.Tensor):
        # These shapes might (but not necessarily) be tensors during tracing.
        # Convert integer tensors to float temporaries to ensure true
        # division is performed when computing scale_x and scale_y.
        output_width_tmp = output_width.float()
        output_height_tmp = output_height.float()
        new_size = torch.stack([output_height, output_width])
    else:
        new_size = (output_height, output_width)
        output_width_tmp = output_width
        output_height_tmp = output_height

    scale_x, scale_y = (
        output_width_tmp / results.image_size[1],
        output_height_tmp / results.image_size[0],
    )
    results = Instances(new_size, **results.get_fields())

    if results.has("pred_boxes"):
        output_boxes = results.pred_boxes
    elif results.has("proposal_boxes"):
        output_boxes = results.proposal_boxes
    else:
        output_boxes = None
    assert output_boxes is not None, "Predictions must contain boxes!"

    output_boxes.scale(scale_x, scale_y)
    output_boxes.clip(results.image_size)

    # Drop instances whose boxes collapsed to zero area after clipping.
    results = results[output_boxes.nonempty()]

    if results.has("pred_masks"):
        if isinstance(results.pred_masks, ROIMasks):
            roi_masks = results.pred_masks
        else:
            # pred_masks is a tensor of shape (N, 1, M, M)
            roi_masks = ROIMasks(results.pred_masks[:, 0, :, :])
        results.pred_masks = roi_masks.to_bitmasks(
            results.pred_boxes, output_height, output_width, mask_threshold
        ).tensor  # TODO return ROIMasks/BitMask object in the future

    if results.has("pred_keypoints"):
        results.pred_keypoints[:, :, 0] *= scale_x
        results.pred_keypoints[:, :, 1] *= scale_y

    return results
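
# Usage sketch for detector_postprocess (hypothetical sizes and values; a real
# detector would produce `raw` itself):
#
#   from detectron2.structures import Boxes
#   raw = Instances((800, 1333))  # the resolution the detector actually saw
#   raw.pred_boxes = Boxes(torch.tensor([[10.0, 10.0, 100.0, 200.0]]))
#   raw.scores = torch.tensor([0.9])
#   out = detector_postprocess(raw, 480, 640)
#   # out.pred_boxes is scaled by (640/1333, 480/800) and clipped to (480, 640).
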
def bbox_postprocess(result, input_size, img_size, output_height, output_width):
    """
    Decode box predictions into the output resolution.

    Args:
        result (Tensor): box logits of shape (N, 4); sigmoid() maps them to
            normalized [xc, yc, w, h] in [0, 1] relative to `input_size`.
        input_size (tuple): (height, width) the boxes are normalized against.
        img_size (tuple): (height, width) of the valid (unpadded) image region,
            used to clip the boxes.
        output_height, output_width: the desired output resolution.

    Returns:
        Tensor: boxes of shape (N, 4) as [x1, y1, x2, y2] in
        [0, output_width] x [0, output_height].
    """
    if result is None:
        return None
    # [xc, yc, w, h] in [0, 1] -> absolute coordinates at the input resolution.
    scale = torch.tensor(
        [input_size[1], input_size[0], input_size[1], input_size[0]]
    )[None, :].to(result.device)
    result = result.sigmoid() * scale
    # Center/size -> corner representation.
    x1, y1 = result[:, 0] - result[:, 2] / 2, result[:, 1] - result[:, 3] / 2
    x2, y2 = result[:, 0] + result[:, 2] / 2, result[:, 1] + result[:, 3] / 2

    # Clip to the valid image region, then rescale to the output resolution.
    h, w = img_size
    x1 = x1.clamp(min=0, max=w)
    y1 = y1.clamp(min=0, max=h)
    x2 = x2.clamp(min=0, max=w)
    y2 = y2.clamp(min=0, max=h)

    box = torch.stack([x1, y1, x2, y2]).permute(1, 0)
    scale = torch.tensor(
        [output_width / w, output_height / h, output_width / w, output_height / h]
    )[None, :].to(result.device)
    box = box * scale
    return box
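
# Worked sketch for bbox_postprocess (made-up sizes; zero logits sigmoid to
# 0.5, i.e. a box centered in the image covering half of each dimension):
#
#   logits = torch.zeros(1, 4)
#   boxes = bbox_postprocess(
#       logits, input_size=(800, 1333), img_size=(800, 1333),
#       output_height=480, output_width=640,
#   )
#   # boxes -> tensor([[160., 120., 480., 360.]]) in the 640x480 output frame
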
def sem_seg_postprocess(result, img_size, output_height, output_width):
    """
    Return semantic segmentation predictions in the original resolution.

    The input images are often resized when entering the semantic segmentor.
    Moreover, in some cases they are also padded inside the segmentor to be
    divisible by the maximum network stride. As a result, we often need the
    predictions of the segmentor in a different resolution from its inputs.

    Args:
        result (Tensor): semantic segmentation prediction logits. A tensor of shape
            (C, H, W), where C is the number of classes, and H, W are the height
            and width of the prediction.
        img_size (tuple): the valid (unpadded) image size that the segmentor took
            as input.
        output_height, output_width: the desired output resolution.

    Returns:
        semantic segmentation prediction (Tensor): A tensor of the shape
        (C, output_height, output_width) that contains per-pixel soft predictions.
    """
    # Crop away the padding, add a batch dimension, and resize to the target size.
    result = result[:, : img_size[0], : img_size[1]].expand(1, -1, -1, -1)
    result = F.interpolate(
        result, size=(output_height, output_width), mode="bilinear", align_corners=False
    )[0]
    return result
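
# Minimal sketch for sem_seg_postprocess (made-up shapes: 3 classes, logits
# padded to 64x64 while the valid image content occupies the top-left 60x50):
#
#   logits = torch.randn(3, 64, 64)
#   probs = sem_seg_postprocess(logits, img_size=(60, 50),
#                               output_height=480, output_width=400)
#   # probs.shape == (3, 480, 400); padding is cropped before upsampling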