Spaces:
Running
Running
# Copyright 2019 The TensorFlow Authors. All Rights Reserved. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
# ============================================================================== | |
"""Spatial transformation ops like RoIAlign, CropAndResize.""" | |
from __future__ import absolute_import | |
from __future__ import division | |
from __future__ import print_function | |
import tensorflow.compat.v1 as tf | |
def _coordinate_vector_1d(start, end, size, align_endpoints): | |
"""Generates uniformly spaced coordinate vector. | |
Args: | |
start: A float tensor of shape [batch, num_boxes] indicating start values. | |
end: A float tensor of shape [batch, num_boxes] indicating end values. | |
size: Number of points in coordinate vector. | |
align_endpoints: Whether to align first and last points exactly to | |
endpoints. | |
Returns: | |
A 3D float tensor of shape [batch, num_boxes, size] containing grid | |
coordinates. | |
""" | |
start = tf.expand_dims(start, -1) | |
end = tf.expand_dims(end, -1) | |
length = end - start | |
if align_endpoints: | |
relative_grid_spacing = tf.linspace(0.0, 1.0, size) | |
offset = 0 if size > 1 else length / 2 | |
else: | |
relative_grid_spacing = tf.linspace(0.0, 1.0, size + 1)[:-1] | |
offset = length / (2 * size) | |
relative_grid_spacing = tf.reshape(relative_grid_spacing, [1, 1, size]) | |
relative_grid_spacing = tf.cast(relative_grid_spacing, dtype=start.dtype) | |
absolute_grid = start + offset + relative_grid_spacing * length | |
return absolute_grid | |
def box_grid_coordinate_vectors(boxes, size_y, size_x, align_corners=False): | |
"""Generates coordinate vectors for a `size x size` grid in boxes. | |
Each box is subdivided uniformly into a grid consisting of size x size | |
rectangular cells. This function returns coordinate vectors describing | |
the center of each cell. | |
If `align_corners` is true, grid points are uniformly spread such that the | |
corner points on the grid exactly overlap corners of the boxes. | |
Note that output coordinates are expressed in the same coordinate frame as | |
input boxes. | |
Args: | |
boxes: A float tensor of shape [batch, num_boxes, 4] containing boxes of the | |
form [ymin, xmin, ymax, xmax]. | |
size_y: Size of the grid in y axis. | |
size_x: Size of the grid in x axis. | |
align_corners: Whether to align the corner grid points exactly with box | |
corners. | |
Returns: | |
box_grid_y: A float tensor of shape [batch, num_boxes, size_y] containing y | |
coordinates for grid points. | |
box_grid_x: A float tensor of shape [batch, num_boxes, size_x] containing x | |
coordinates for grid points. | |
""" | |
ymin, xmin, ymax, xmax = tf.unstack(boxes, axis=-1) | |
box_grid_y = _coordinate_vector_1d(ymin, ymax, size_y, align_corners) | |
box_grid_x = _coordinate_vector_1d(xmin, xmax, size_x, align_corners) | |
return box_grid_y, box_grid_x | |
def feature_grid_coordinate_vectors(box_grid_y, box_grid_x): | |
"""Returns feature grid point coordinate vectors for bilinear interpolation. | |
Box grid is specified in absolute coordinate system with origin at left top | |
(0, 0). The returned coordinate vectors contain 0-based feature point indices. | |
This function snaps each point in the box grid to nearest 4 points on the | |
feature map. | |
In this function we also follow the convention of treating feature pixels as | |
point objects with no spatial extent. | |
Args: | |
box_grid_y: A float tensor of shape [batch, num_boxes, size] containing y | |
coordinate vector of the box grid. | |
box_grid_x: A float tensor of shape [batch, num_boxes, size] containing x | |
coordinate vector of the box grid. | |
Returns: | |
feature_grid_y0: An int32 tensor of shape [batch, num_boxes, size] | |
containing y coordinate vector for the top neighbors. | |
feature_grid_x0: A int32 tensor of shape [batch, num_boxes, size] | |
containing x coordinate vector for the left neighbors. | |
feature_grid_y1: A int32 tensor of shape [batch, num_boxes, size] | |
containing y coordinate vector for the bottom neighbors. | |
feature_grid_x1: A int32 tensor of shape [batch, num_boxes, size] | |
containing x coordinate vector for the right neighbors. | |
""" | |
feature_grid_y0 = tf.floor(box_grid_y) | |
feature_grid_x0 = tf.floor(box_grid_x) | |
feature_grid_y1 = tf.floor(box_grid_y + 1) | |
feature_grid_x1 = tf.floor(box_grid_x + 1) | |
feature_grid_y0 = tf.cast(feature_grid_y0, dtype=tf.int32) | |
feature_grid_y1 = tf.cast(feature_grid_y1, dtype=tf.int32) | |
feature_grid_x0 = tf.cast(feature_grid_x0, dtype=tf.int32) | |
feature_grid_x1 = tf.cast(feature_grid_x1, dtype=tf.int32) | |
return (feature_grid_y0, feature_grid_x0, feature_grid_y1, feature_grid_x1) | |
def _valid_indicator(feature_grid_y, feature_grid_x, true_feature_shapes): | |
"""Computes a indicator vector for valid indices. | |
Computes an indicator vector which is true for points on feature map and | |
false for points off feature map. | |
Args: | |
feature_grid_y: An int32 tensor of shape [batch, num_boxes, size_y] | |
containing y coordinate vector. | |
feature_grid_x: An int32 tensor of shape [batch, num_boxes, size_x] | |
containing x coordinate vector. | |
true_feature_shapes: A int32 tensor of shape [batch, num_boxes, 2] | |
containing valid height and width of feature maps. Feature maps are | |
assumed to be aligned to the left top corner. | |
Returns: | |
indices: A 1D bool tensor indicating valid feature indices. | |
""" | |
height = tf.cast(true_feature_shapes[:, :, 0:1], dtype=feature_grid_y.dtype) | |
width = tf.cast(true_feature_shapes[:, :, 1:2], dtype=feature_grid_x.dtype) | |
valid_indicator = tf.logical_and( | |
tf.expand_dims( | |
tf.logical_and(feature_grid_y >= 0, tf.less(feature_grid_y, height)), | |
3), | |
tf.expand_dims( | |
tf.logical_and(feature_grid_x >= 0, tf.less(feature_grid_x, width)), | |
2)) | |
return tf.reshape(valid_indicator, [-1]) | |
def ravel_indices(feature_grid_y, feature_grid_x, num_levels, height, width, | |
box_levels): | |
"""Returns grid indices in a flattened feature map of shape [-1, channels]. | |
The returned 1-D array can be used to gather feature grid points from a | |
feature map that has been flattened from [batch, num_levels, max_height, | |
max_width, channels] to [batch * num_levels * max_height * max_width, | |
channels]. | |
Args: | |
feature_grid_y: An int32 tensor of shape [batch, num_boxes, size_y] | |
containing y coordinate vector. | |
feature_grid_x: An int32 tensor of shape [batch, num_boxes, size_x] | |
containing x coordinate vector. | |
num_levels: Number of feature levels. | |
height: An integer indicating the padded height of feature maps. | |
width: An integer indicating the padded width of feature maps. | |
box_levels: An int32 tensor of shape [batch, num_boxes] indicating | |
feature level assigned to each box. | |
Returns: | |
indices: A 1D int32 tensor containing feature point indices in a flattened | |
feature grid. | |
""" | |
num_boxes = tf.shape(feature_grid_y)[1] | |
batch_size = tf.shape(feature_grid_y)[0] | |
size_y = tf.shape(feature_grid_y)[2] | |
size_x = tf.shape(feature_grid_x)[2] | |
height_dim_offset = width | |
level_dim_offset = height * height_dim_offset | |
batch_dim_offset = num_levels * level_dim_offset | |
batch_dim_indices = ( | |
tf.reshape( | |
tf.range(batch_size) * batch_dim_offset, [batch_size, 1, 1, 1]) * | |
tf.ones([1, num_boxes, size_y, size_x], dtype=tf.int32)) | |
box_level_indices = ( | |
tf.reshape(box_levels * level_dim_offset, [batch_size, num_boxes, 1, 1]) * | |
tf.ones([1, 1, size_y, size_x], dtype=tf.int32)) | |
height_indices = ( | |
tf.reshape(feature_grid_y * height_dim_offset, | |
[batch_size, num_boxes, size_y, 1]) * | |
tf.ones([1, 1, 1, size_x], dtype=tf.int32)) | |
width_indices = ( | |
tf.reshape(feature_grid_x, [batch_size, num_boxes, 1, size_x]) | |
* tf.ones([1, 1, size_y, 1], dtype=tf.int32)) | |
indices = ( | |
batch_dim_indices + box_level_indices + height_indices + width_indices) | |
flattened_indices = tf.reshape(indices, [-1]) | |
return flattened_indices | |
def pad_to_max_size(features): | |
"""Pads features to max height and max width and stacks them up. | |
Args: | |
features: A list of num_levels 4D float tensors of shape [batch, height_i, | |
width_i, channels] containing feature maps. | |
Returns: | |
stacked_features: A 5D float tensor of shape [batch, num_levels, max_height, | |
max_width, channels] containing stacked features. | |
true_feature_shapes: A 2D int32 tensor of shape [num_levels, 2] containing | |
height and width of the feature maps before padding. | |
""" | |
if len(features) == 1: | |
return tf.expand_dims(features[0], | |
1), tf.expand_dims(tf.shape(features[0])[1:3], 0) | |
if all([feature.shape.is_fully_defined() for feature in features]): | |
heights = [feature.shape[1] for feature in features] | |
widths = [feature.shape[2] for feature in features] | |
max_height = max(heights) | |
max_width = max(widths) | |
else: | |
heights = [tf.shape(feature)[1] for feature in features] | |
widths = [tf.shape(feature)[2] for feature in features] | |
max_height = tf.reduce_max(heights) | |
max_width = tf.reduce_max(widths) | |
features_all = [ | |
tf.image.pad_to_bounding_box(feature, 0, 0, max_height, | |
max_width) for feature in features | |
] | |
features_all = tf.stack(features_all, axis=1) | |
true_feature_shapes = tf.stack([tf.shape(feature)[1:3] | |
for feature in features]) | |
return features_all, true_feature_shapes | |
def _gather_valid_indices(tensor, indices, padding_value=0.0): | |
"""Gather values for valid indices. | |
TODO(rathodv): We can't use ops.gather_with_padding_values due to cyclic | |
dependency. Start using it after migrating all users of spatial ops to import | |
this module directly rather than util/ops.py | |
Args: | |
tensor: A tensor to gather valid values from. | |
indices: A 1-D int32 tensor containing indices along axis 0 of `tensor`. | |
Invalid indices must be marked with -1. | |
padding_value: Value to return for invalid indices. | |
Returns: | |
A tensor sliced based on indices. For indices that are equal to -1, returns | |
rows of padding value. | |
""" | |
padded_tensor = tf.concat( | |
[ | |
padding_value * | |
tf.ones([1, tf.shape(tensor)[-1]], dtype=tensor.dtype), tensor | |
], | |
axis=0, | |
) | |
# tf.concat gradient op uses tf.where(condition) (which is not | |
# supported on TPU) when the inputs to it are tf.IndexedSlices instead of | |
# tf.Tensor. Since gradient op for tf.gather returns tf.IndexedSlices, | |
# we add a dummy op inbetween tf.concat and tf.gather to ensure tf.concat | |
# gradient function gets tf.Tensor inputs and not tf.IndexedSlices. | |
padded_tensor *= 1.0 | |
return tf.gather(padded_tensor, indices + 1) | |
def multilevel_roi_align(features, boxes, box_levels, output_size, | |
num_samples_per_cell_y=1, num_samples_per_cell_x=1, | |
align_corners=False, extrapolation_value=0.0, | |
scope=None): | |
"""Applies RoI Align op and returns feature for boxes. | |
Given multiple features maps indexed by different levels, and a set of boxes | |
where each box is mapped to a certain level, this function selectively crops | |
and resizes boxes from the corresponding feature maps. | |
We follow the RoI Align technique in https://arxiv.org/pdf/1703.06870.pdf | |
figure 3. Specifically, each box is subdivided uniformly into a grid | |
consisting of output_size[0] x output_size[1] rectangular cells. Within each | |
cell we select `num_points` points uniformly and compute feature values using | |
bilinear interpolation. Finally, we average pool the interpolated values in | |
each cell to obtain a [output_size[0], output_size[1], channels] feature. | |
If `align_corners` is true, sampling points are uniformly spread such that | |
corner points exactly overlap corners of the boxes. | |
In this function we also follow the convention of treating feature pixels as | |
point objects with no spatial extent. | |
Args: | |
features: A list of 4D float tensors of shape [batch_size, max_height, | |
max_width, channels] containing features. Note that each feature map must | |
have the same number of channels. | |
boxes: A 3D float tensor of shape [batch_size, num_boxes, 4] containing | |
boxes of the form [ymin, xmin, ymax, xmax] in normalized coordinates. | |
box_levels: A 3D int32 tensor of shape [batch_size, num_boxes] | |
representing the feature level index for each box. | |
output_size: An list of two integers [size_y, size_x] indicating the output | |
feature size for each box. | |
num_samples_per_cell_y: Number of grid points to sample along y axis in each | |
cell. | |
num_samples_per_cell_x: Number of grid points to sample along x axis in each | |
cell. | |
align_corners: Whether to align the corner grid points exactly with box | |
corners. | |
extrapolation_value: a float value to use for extrapolation. | |
scope: Scope name to use for this op. | |
Returns: | |
A 5D float tensor of shape [batch_size, num_boxes, output_size[0], | |
output_size[1], channels] representing the cropped features. | |
""" | |
with tf.name_scope(scope, 'MultiLevelRoIAlign'): | |
features, true_feature_shapes = pad_to_max_size(features) | |
batch_size = tf.shape(features)[0] | |
num_levels = features.get_shape().as_list()[1] | |
max_feature_height = tf.shape(features)[2] | |
max_feature_width = tf.shape(features)[3] | |
num_filters = features.get_shape().as_list()[4] | |
num_boxes = tf.shape(boxes)[1] | |
# Convert boxes to absolute co-ordinates. | |
true_feature_shapes = tf.cast(true_feature_shapes, dtype=boxes.dtype) | |
true_feature_shapes = tf.gather(true_feature_shapes, box_levels) | |
boxes *= tf.concat([true_feature_shapes - 1] * 2, axis=-1) | |
size_y = output_size[0] * num_samples_per_cell_y | |
size_x = output_size[1] * num_samples_per_cell_x | |
box_grid_y, box_grid_x = box_grid_coordinate_vectors( | |
boxes, size_y=size_y, size_x=size_x, align_corners=align_corners) | |
(feature_grid_y0, feature_grid_x0, feature_grid_y1, | |
feature_grid_x1) = feature_grid_coordinate_vectors(box_grid_y, box_grid_x) | |
feature_grid_y = tf.reshape( | |
tf.stack([feature_grid_y0, feature_grid_y1], axis=3), | |
[batch_size, num_boxes, -1]) | |
feature_grid_x = tf.reshape( | |
tf.stack([feature_grid_x0, feature_grid_x1], axis=3), | |
[batch_size, num_boxes, -1]) | |
feature_coordinates = ravel_indices(feature_grid_y, feature_grid_x, | |
num_levels, max_feature_height, | |
max_feature_width, box_levels) | |
valid_indices = _valid_indicator(feature_grid_y, feature_grid_x, | |
true_feature_shapes) | |
feature_coordinates = tf.where(valid_indices, feature_coordinates, | |
-1 * tf.ones_like(feature_coordinates)) | |
flattened_features = tf.reshape(features, [-1, num_filters]) | |
flattened_feature_values = _gather_valid_indices(flattened_features, | |
feature_coordinates, | |
extrapolation_value) | |
features_per_box = tf.reshape( | |
flattened_feature_values, | |
[batch_size, num_boxes, size_y * 2, size_x * 2, num_filters]) | |
# Cast tensors into dtype of features. | |
box_grid_y = tf.cast(box_grid_y, dtype=features_per_box.dtype) | |
box_grid_x = tf.cast(box_grid_x, dtype=features_per_box.dtype) | |
feature_grid_y0 = tf.cast(feature_grid_y0, dtype=features_per_box.dtype) | |
feature_grid_x0 = tf.cast(feature_grid_x0, dtype=features_per_box.dtype) | |
# RoI Align operation is a bilinear interpolation of four | |
# neighboring feature points f0, f1, f2, and f3 onto point y, x given by | |
# f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T | |
# [f10, f11]] | |
# | |
# Unrolling the matrix multiplies gives us: | |
# f(y, x) = (hy * hx) f00 + (hy * lx) f01 + (ly * hx) f10 + (lx * ly) f11 | |
# f(y, x) = w00 * f00 + w01 * f01 + w10 * f10 + w11 * f11 | |
# | |
# This can be computed by applying pointwise multiplication and sum_pool in | |
# a 2x2 window. | |
ly = box_grid_y - feature_grid_y0 | |
lx = box_grid_x - feature_grid_x0 | |
hy = 1.0 - ly | |
hx = 1.0 - lx | |
kernel_y = tf.reshape( | |
tf.stack([hy, ly], axis=3), [batch_size, num_boxes, size_y * 2, 1]) | |
kernel_x = tf.reshape( | |
tf.stack([hx, lx], axis=3), [batch_size, num_boxes, 1, size_x * 2]) | |
# Multiplier 4 is to make tf.nn.avg_pool behave like sum_pool. | |
interpolation_kernel = kernel_y * kernel_x * 4 | |
# Interpolate the gathered features with computed interpolation kernels. | |
features_per_box *= tf.expand_dims(interpolation_kernel, axis=4), | |
features_per_box = tf.reshape( | |
features_per_box, | |
[batch_size * num_boxes, size_y * 2, size_x * 2, num_filters]) | |
# This combines the two pooling operations - sum_pool to perform bilinear | |
# interpolation and avg_pool to pool the values in each bin. | |
features_per_box = tf.nn.avg_pool( | |
features_per_box, | |
[1, num_samples_per_cell_y * 2, num_samples_per_cell_x * 2, 1], | |
[1, num_samples_per_cell_y * 2, num_samples_per_cell_x * 2, 1], 'VALID') | |
features_per_box = tf.reshape( | |
features_per_box, | |
[batch_size, num_boxes, output_size[0], output_size[1], num_filters]) | |
return features_per_box | |
def native_crop_and_resize(image, boxes, crop_size, scope=None): | |
"""Same as `matmul_crop_and_resize` but uses tf.image.crop_and_resize.""" | |
def get_box_inds(proposals): | |
proposals_shape = proposals.shape.as_list() | |
if any(dim is None for dim in proposals_shape): | |
proposals_shape = tf.shape(proposals) | |
ones_mat = tf.ones(proposals_shape[:2], dtype=tf.int32) | |
multiplier = tf.expand_dims( | |
tf.range(start=0, limit=proposals_shape[0]), 1) | |
return tf.reshape(ones_mat * multiplier, [-1]) | |
with tf.name_scope(scope, 'CropAndResize'): | |
cropped_regions = tf.image.crop_and_resize( | |
image, tf.reshape(boxes, [-1] + boxes.shape.as_list()[2:]), | |
get_box_inds(boxes), crop_size) | |
final_shape = tf.concat([tf.shape(boxes)[:2], | |
tf.shape(cropped_regions)[1:]], axis=0) | |
return tf.reshape(cropped_regions, final_shape) | |
def matmul_crop_and_resize(image, boxes, crop_size, extrapolation_value=0.0, | |
scope=None): | |
"""Matrix multiplication based implementation of the crop and resize op. | |
Extracts crops from the input image tensor and bilinearly resizes them | |
(possibly with aspect ratio change) to a common output size specified by | |
crop_size. This is more general than the crop_to_bounding_box op which | |
extracts a fixed size slice from the input image and does not allow | |
resizing or aspect ratio change. | |
Returns a tensor with crops from the input image at positions defined at | |
the bounding box locations in boxes. The cropped boxes are all resized | |
(with bilinear interpolation) to a fixed size = `[crop_height, crop_width]`. | |
The result is a 5-D tensor `[batch, num_boxes, crop_height, crop_width, | |
depth]`. | |
Note that this operation is meant to replicate the behavior of the standard | |
tf.image.crop_and_resize operation but there are a few differences. | |
Specifically: | |
1) There is no `box_indices` argument --- to run this op on multiple images, | |
one must currently call this op independently on each image. | |
2) The `crop_size` parameter is assumed to be statically defined. | |
Moreover, the number of boxes must be strictly nonzero. | |
Args: | |
image: A `Tensor`. Must be one of the following types: `uint8`, `int8`, | |
`int16`, `int32`, `int64`, `half`, 'bfloat16', `float32`, `float64`. | |
A 4-D tensor of shape `[batch, image_height, image_width, depth]`. | |
Both `image_height` and `image_width` need to be positive. | |
boxes: A `Tensor` of type `float32` or 'bfloat16'. | |
A 3-D tensor of shape `[batch, num_boxes, 4]`. The boxes are specified in | |
normalized coordinates and are of the form `[y1, x1, y2, x2]`. A | |
normalized coordinate value of `y` is mapped to the image coordinate at | |
`y * (image_height - 1)`, so as the `[0, 1]` interval of normalized image | |
height is mapped to `[0, image_height - 1] in image height coordinates. | |
We do allow y1 > y2, in which case the sampled crop is an up-down flipped | |
version of the original image. The width dimension is treated similarly. | |
Normalized coordinates outside the `[0, 1]` range are allowed, in which | |
case we use `extrapolation_value` to extrapolate the input image values. | |
crop_size: A list of two integers `[crop_height, crop_width]`. All | |
cropped image patches are resized to this size. The aspect ratio of the | |
image content is not preserved. Both `crop_height` and `crop_width` need | |
to be positive. | |
extrapolation_value: a float value to use for extrapolation. | |
scope: A name for the operation (optional). | |
Returns: | |
A 5-D tensor of shape `[batch, num_boxes, crop_height, crop_width, depth]` | |
""" | |
with tf.name_scope(scope, 'MatMulCropAndResize'): | |
box_levels = tf.zeros(tf.shape(boxes)[:2], dtype=tf.int32) | |
return multilevel_roi_align([image], | |
boxes, | |
box_levels, | |
crop_size, | |
align_corners=True, | |
extrapolation_value=extrapolation_value) | |