|
""" |
|
Mask R-CNN |
|
The main Mask R-CNN model implementation.
|
|
|
Copyright (c) 2017 Matterport, Inc. |
|
Licensed under the MIT License (see LICENSE for details) |
|
Written by Waleed Abdulla |
|
""" |
|
|
|
import os |
|
import sys |
|
import glob |
|
import random |
|
import math |
|
import datetime |
|
import itertools |
|
import json |
|
import re |
|
import logging |
|
from collections import OrderedDict |
|
import numpy as np |
|
import scipy.misc |
|
import tensorflow as tf |
|
import keras |
|
import keras.backend as K |
|
import keras.layers as KL |
|
import keras.initializers as KI |
|
import keras.engine as KE |
|
import keras.models as KM |
|
|
|
import utils |
|
|
|
|
|
from distutils.version import LooseVersion |
|
assert LooseVersion(tf.__version__) >= LooseVersion("1.3") |
|
assert LooseVersion(keras.__version__) >= LooseVersion('2.0.8') |
|
|
|
|
|
|
|
|
|
|
|
|
|
def log(text, array=None): |
|
"""Prints a text message. And, optionally, if a Numpy array is provided it |
|
    prints its shape, min, and max values.
|
""" |
|
    if array is not None:

        text = text.ljust(25)

        if array.size:

            text += ("shape: {:20} min: {:10.5f} max: {:10.5f}".format(

                str(array.shape), array.min(), array.max()))

        else:

            text += ("shape: {:20} min: {:10} max: {:10}".format(

                str(array.shape), "", ""))

    print(text)
|
|
|
|
|
class BatchNorm(KL.BatchNormalization): |
|
"""Batch Normalization class. Subclasses the Keras BN class and |
|
hardcodes training=False so the BN layer doesn't update |
|
during training. |
|
|
|
    Batch normalization has a negative effect on training if batches are small,
|
so we disable it here. |
|
""" |
|
|
|
def call(self, inputs, training=None): |
|
return super(self.__class__, self).call(inputs, training=False) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def identity_block(input_tensor, kernel_size, filters, stage, block, |
|
use_bias=True): |
|
"""The identity_block is the block that has no conv layer at shortcut |
|
# Arguments |
|
input_tensor: input tensor |
|
        kernel_size: default 3, the kernel size of middle conv layer at main path

        filters: list of integers, the nb_filters of 3 conv layers at main path
|
stage: integer, current stage label, used for generating layer names |
|
block: 'a','b'..., current block label, used for generating layer names |
|
""" |
|
nb_filter1, nb_filter2, nb_filter3 = filters |
|
conv_name_base = 'res' + str(stage) + block + '_branch' |
|
bn_name_base = 'bn' + str(stage) + block + '_branch' |
|
|
|
x = KL.Conv2D(nb_filter1, (1, 1), name=conv_name_base + '2a', |
|
use_bias=use_bias)(input_tensor) |
|
x = BatchNorm(axis=3, name=bn_name_base + '2a')(x) |
|
x = KL.Activation('relu')(x) |
|
|
|
x = KL.Conv2D(nb_filter2, (kernel_size, kernel_size), padding='same', |
|
name=conv_name_base + '2b', use_bias=use_bias)(x) |
|
x = BatchNorm(axis=3, name=bn_name_base + '2b')(x) |
|
x = KL.Activation('relu')(x) |
|
|
|
x = KL.Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c', |
|
use_bias=use_bias)(x) |
|
x = BatchNorm(axis=3, name=bn_name_base + '2c')(x) |
|
|
|
x = KL.Add()([x, input_tensor]) |
|
x = KL.Activation('relu', name='res' + str(stage) + block + '_out')(x) |
|
return x |
|
|
|
|
|
def conv_block(input_tensor, kernel_size, filters, stage, block, |
|
strides=(2, 2), use_bias=True): |
|
"""conv_block is the block that has a conv layer at shortcut |
|
# Arguments |
|
input_tensor: input tensor |
|
        kernel_size: default 3, the kernel size of middle conv layer at main path

        filters: list of integers, the nb_filters of 3 conv layers at main path
|
stage: integer, current stage label, used for generating layer names |
|
block: 'a','b'..., current block label, used for generating layer names |
|
    Note that from stage 3, the first conv layer at the main path uses strides=(2, 2),

    and the shortcut uses strides=(2, 2) as well.
|
""" |
|
nb_filter1, nb_filter2, nb_filter3 = filters |
|
conv_name_base = 'res' + str(stage) + block + '_branch' |
|
bn_name_base = 'bn' + str(stage) + block + '_branch' |
|
|
|
x = KL.Conv2D(nb_filter1, (1, 1), strides=strides, |
|
name=conv_name_base + '2a', use_bias=use_bias)(input_tensor) |
|
x = BatchNorm(axis=3, name=bn_name_base + '2a')(x) |
|
x = KL.Activation('relu')(x) |
|
|
|
x = KL.Conv2D(nb_filter2, (kernel_size, kernel_size), padding='same', |
|
name=conv_name_base + '2b', use_bias=use_bias)(x) |
|
x = BatchNorm(axis=3, name=bn_name_base + '2b')(x) |
|
x = KL.Activation('relu')(x) |
|
|
|
x = KL.Conv2D(nb_filter3, (1, 1), name=conv_name_base + |
|
'2c', use_bias=use_bias)(x) |
|
x = BatchNorm(axis=3, name=bn_name_base + '2c')(x) |
|
|
|
shortcut = KL.Conv2D(nb_filter3, (1, 1), strides=strides, |
|
name=conv_name_base + '1', use_bias=use_bias)(input_tensor) |
|
shortcut = BatchNorm(axis=3, name=bn_name_base + '1')(shortcut) |
|
|
|
x = KL.Add()([x, shortcut]) |
|
x = KL.Activation('relu', name='res' + str(stage) + block + '_out')(x) |
|
return x |
|
|
|
|
|
def resnet_graph(input_image, architecture, stage5=False): |
|
assert architecture in ["resnet50", "resnet101"] |
|
|
|
x = KL.ZeroPadding2D((3, 3))(input_image) |
|
x = KL.Conv2D(64, (7, 7), strides=(2, 2), name='conv1', use_bias=True)(x) |
|
x = BatchNorm(axis=3, name='bn_conv1')(x) |
|
x = KL.Activation('relu')(x) |
|
C1 = x = KL.MaxPooling2D((3, 3), strides=(2, 2), padding="same")(x) |
|
|
|
x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1)) |
|
x = identity_block(x, 3, [64, 64, 256], stage=2, block='b') |
|
C2 = x = identity_block(x, 3, [64, 64, 256], stage=2, block='c') |
|
|
|
x = conv_block(x, 3, [128, 128, 512], stage=3, block='a') |
|
x = identity_block(x, 3, [128, 128, 512], stage=3, block='b') |
|
x = identity_block(x, 3, [128, 128, 512], stage=3, block='c') |
|
C3 = x = identity_block(x, 3, [128, 128, 512], stage=3, block='d') |
|
|
|
x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a') |
|
block_count = {"resnet50": 5, "resnet101": 22}[architecture] |
|
for i in range(block_count): |
|
x = identity_block(x, 3, [256, 256, 1024], stage=4, block=chr(98 + i)) |
|
C4 = x |
|
|
|
if stage5: |
|
x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a') |
|
x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b') |
|
C5 = x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c') |
|
else: |
|
C5 = None |
|
return [C1, C2, C3, C4, C5] |
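# Illustrative usage sketch (not part of the original file; the input size is an
# assumption for demonstration):
#   input_image = KL.Input(shape=[1024, 1024, 3])
#   C1, C2, C3, C4, C5 = resnet_graph(input_image, "resnet101", stage5=True)
#   # C2..C5 have strides 4, 8, 16, and 32 relative to the input and feed the FPN.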
|
|
|
|
|
|
|
|
|
|
|
|
|
def apply_box_deltas_graph(boxes, deltas): |
|
"""Applies the given deltas to the given boxes. |
|
boxes: [N, 4] where each row is y1, x1, y2, x2 |
|
deltas: [N, 4] where each row is [dy, dx, log(dh), log(dw)] |
|
""" |
|
|
|
height = boxes[:, 2] - boxes[:, 0] |
|
width = boxes[:, 3] - boxes[:, 1] |
|
center_y = boxes[:, 0] + 0.5 * height |
|
center_x = boxes[:, 1] + 0.5 * width |
|
|
|
center_y += deltas[:, 0] * height |
|
center_x += deltas[:, 1] * width |
|
height *= tf.exp(deltas[:, 2]) |
|
width *= tf.exp(deltas[:, 3]) |
|
|
|
y1 = center_y - 0.5 * height |
|
x1 = center_x - 0.5 * width |
|
y2 = y1 + height |
|
x2 = x1 + width |
|
result = tf.stack([y1, x1, y2, x2], axis=1, name="apply_box_deltas_out") |
|
return result |
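# Illustrative sketch with assumed values (not part of the original file): a delta
# of [0, 0, log(2), 0] keeps the box center fixed and doubles the box height.
#   boxes = tf.constant([[10., 10., 30., 30.]])       # one 20x20 box
#   deltas = tf.constant([[0., 0., np.log(2.), 0.]])
#   refined = apply_box_deltas_graph(boxes, deltas)    # -> [[0., 10., 40., 30.]]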
|
|
|
|
|
def clip_boxes_graph(boxes, window): |
|
""" |
|
boxes: [N, 4] each row is y1, x1, y2, x2 |
|
window: [4] in the form y1, x1, y2, x2 |
|
""" |
|
|
|
wy1, wx1, wy2, wx2 = tf.split(window, 4) |
|
y1, x1, y2, x2 = tf.split(boxes, 4, axis=1) |
|
|
|
y1 = tf.maximum(tf.minimum(y1, wy2), wy1) |
|
x1 = tf.maximum(tf.minimum(x1, wx2), wx1) |
|
y2 = tf.maximum(tf.minimum(y2, wy2), wy1) |
|
x2 = tf.maximum(tf.minimum(x2, wx2), wx1) |
|
clipped = tf.concat([y1, x1, y2, x2], axis=1, name="clipped_boxes") |
|
clipped.set_shape((clipped.shape[0], 4)) |
|
return clipped |
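# Illustrative sketch with assumed values (not part of the original file):
#   window = tf.constant([0., 0., 100., 100.])
#   boxes = tf.constant([[-10., 50., 120., 160.]])
#   clip_boxes_graph(boxes, window)                    # -> [[0., 50., 100., 100.]]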
|
|
|
|
|
class ProposalLayer(KE.Layer): |
|
"""Receives anchor scores and selects a subset to pass as proposals |
|
to the second stage. Filtering is done based on anchor scores and |
|
non-max suppression to remove overlaps. It also applies bounding |
|
box refinement deltas to anchors. |
|
|
|
Inputs: |
|
rpn_probs: [batch, anchors, (bg prob, fg prob)] |
|
rpn_bbox: [batch, anchors, (dy, dx, log(dh), log(dw))] |
|
|
|
Returns: |
|
Proposals in normalized coordinates [batch, rois, (y1, x1, y2, x2)] |
|
""" |
|
|
|
def __init__(self, proposal_count, nms_threshold, anchors, |
|
config=None, **kwargs): |
|
""" |
|
anchors: [N, (y1, x1, y2, x2)] anchors defined in image coordinates |
|
""" |
|
super(ProposalLayer, self).__init__(**kwargs) |
|
self.config = config |
|
self.proposal_count = proposal_count |
|
self.nms_threshold = nms_threshold |
|
self.anchors = anchors.astype(np.float32) |
|
|
|
def call(self, inputs): |
|
|
|
scores = inputs[0][:, :, 1] |
|
|
|
deltas = inputs[1] |
|
deltas = deltas * np.reshape(self.config.RPN_BBOX_STD_DEV, [1, 1, 4]) |
|
|
|
anchors = self.anchors |
|
|
|
|
|
|
|
pre_nms_limit = min(6000, self.anchors.shape[0]) |
|
ix = tf.nn.top_k(scores, pre_nms_limit, sorted=True, |
|
name="top_anchors").indices |
|
scores = utils.batch_slice([scores, ix], lambda x, y: tf.gather(x, y), |
|
self.config.IMAGES_PER_GPU) |
|
deltas = utils.batch_slice([deltas, ix], lambda x, y: tf.gather(x, y), |
|
self.config.IMAGES_PER_GPU) |
|
anchors = utils.batch_slice(ix, lambda x: tf.gather(anchors, x), |
|
self.config.IMAGES_PER_GPU, |
|
names=["pre_nms_anchors"]) |
|
|
|
|
|
|
|
boxes = utils.batch_slice([anchors, deltas], |
|
lambda x, y: apply_box_deltas_graph(x, y), |
|
self.config.IMAGES_PER_GPU, |
|
names=["refined_anchors"]) |
|
|
|
|
|
height, width = self.config.IMAGE_SHAPE[:2] |
|
window = np.array([0, 0, height, width]).astype(np.float32) |
|
boxes = utils.batch_slice(boxes, |
|
lambda x: clip_boxes_graph(x, window), |
|
self.config.IMAGES_PER_GPU, |
|
names=["refined_anchors_clipped"]) |
|
|
|
|
|
|
|
|
|
|
|
|
|
normalized_boxes = boxes / np.array([[height, width, height, width]]) |
|
|
|
|
|
def nms(normalized_boxes, scores): |
|
indices = tf.image.non_max_suppression( |
|
normalized_boxes, scores, self.proposal_count, |
|
self.nms_threshold, name="rpn_non_max_suppression") |
|
proposals = tf.gather(normalized_boxes, indices) |
|
|
|
padding = tf.maximum(self.proposal_count - tf.shape(proposals)[0], 0) |
|
proposals = tf.pad(proposals, [(0, padding), (0, 0)]) |
|
return proposals |
|
proposals = utils.batch_slice([normalized_boxes, scores], nms, |
|
self.config.IMAGES_PER_GPU) |
|
return proposals |
|
|
|
def compute_output_shape(self, input_shape): |
|
return (None, self.proposal_count, 4) |
|
|
|
|
|
|
|
|
|
|
|
|
|
def log2_graph(x): |
|
"""Implementatin of Log2. TF doesn't have a native implemenation.""" |
|
return tf.log(x) / tf.log(2.0) |
|
|
|
|
|
class PyramidROIAlign(KE.Layer): |
|
"""Implements ROI Pooling on multiple levels of the feature pyramid. |
|
|
|
Params: |
|
- pool_shape: [height, width] of the output pooled regions. Usually [7, 7] |
|
- image_shape: [height, width, channels]. Shape of input image in pixels |
|
|
|
Inputs: |
|
- boxes: [batch, num_boxes, (y1, x1, y2, x2)] in normalized |
|
coordinates. Possibly padded with zeros if not enough |
|
boxes to fill the array. |
|
- Feature maps: List of feature maps from different levels of the pyramid. |
|
Each is [batch, height, width, channels] |
|
|
|
Output: |
|
Pooled regions in the shape: [batch, num_boxes, height, width, channels]. |
|
    The width and height are those specified in pool_shape in the layer
|
constructor. |
|
""" |
|
|
|
def __init__(self, pool_shape, image_shape, **kwargs): |
|
super(PyramidROIAlign, self).__init__(**kwargs) |
|
self.pool_shape = tuple(pool_shape) |
|
self.image_shape = tuple(image_shape) |
|
|
|
def call(self, inputs): |
|
|
|
boxes = inputs[0] |
|
|
|
|
|
|
|
feature_maps = inputs[1:] |
|
|
|
|
|
y1, x1, y2, x2 = tf.split(boxes, 4, axis=2) |
|
h = y2 - y1 |
|
w = x2 - x1 |
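        # Assign each ROI to a pyramid level based on its area, following
        # equation 1 of the FPN paper: k = k0 + log2(sqrt(w*h) / 224) with
        # k0 = 4, clipped to [2, 5]. Since h and w are in normalized
        # coordinates, sqrt(image_area) rescales them to pixels.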
|
|
|
|
|
|
|
image_area = tf.cast( |
|
self.image_shape[0] * self.image_shape[1], tf.float32) |
|
roi_level = log2_graph(tf.sqrt(h * w) / (224.0 / tf.sqrt(image_area))) |
|
roi_level = tf.minimum(5, tf.maximum( |
|
2, 4 + tf.cast(tf.round(roi_level), tf.int32))) |
|
roi_level = tf.squeeze(roi_level, 2) |
|
|
|
|
|
pooled = [] |
|
box_to_level = [] |
|
for i, level in enumerate(range(2, 6)): |
|
ix = tf.where(tf.equal(roi_level, level)) |
|
level_boxes = tf.gather_nd(boxes, ix) |
|
|
|
|
|
box_indices = tf.cast(ix[:, 0], tf.int32) |
|
|
|
|
|
box_to_level.append(ix) |
|
|
|
|
|
level_boxes = tf.stop_gradient(level_boxes) |
|
box_indices = tf.stop_gradient(box_indices) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pooled.append(tf.image.crop_and_resize( |
|
feature_maps[i], level_boxes, box_indices, self.pool_shape, |
|
method="bilinear")) |
|
|
|
|
|
pooled = tf.concat(pooled, axis=0) |
|
|
|
|
|
|
|
box_to_level = tf.concat(box_to_level, axis=0) |
|
box_range = tf.expand_dims(tf.range(tf.shape(box_to_level)[0]), 1) |
|
box_to_level = tf.concat([tf.cast(box_to_level, tf.int32), box_range], |
|
axis=1) |
|
|
|
|
|
|
|
|
|
sorting_tensor = box_to_level[:, 0] * 100000 + box_to_level[:, 1] |
|
ix = tf.nn.top_k(sorting_tensor, k=tf.shape( |
|
box_to_level)[0]).indices[::-1] |
|
ix = tf.gather(box_to_level[:, 2], ix) |
|
pooled = tf.gather(pooled, ix) |
|
|
|
|
|
pooled = tf.expand_dims(pooled, 0) |
|
return pooled |
|
|
|
def compute_output_shape(self, input_shape): |
|
return input_shape[0][:2] + self.pool_shape + (input_shape[1][-1], ) |
|
|
|
|
|
|
|
|
|
|
|
|
|
def overlaps_graph(boxes1, boxes2): |
|
"""Computes IoU overlaps between two sets of boxes. |
|
boxes1, boxes2: [N, (y1, x1, y2, x2)]. |
|
""" |
|
|
|
|
|
|
|
|
|
b1 = tf.reshape(tf.tile(tf.expand_dims(boxes1, 1), |
|
[1, 1, tf.shape(boxes2)[0]]), [-1, 4]) |
|
b2 = tf.tile(boxes2, [tf.shape(boxes1)[0], 1]) |
|
|
|
b1_y1, b1_x1, b1_y2, b1_x2 = tf.split(b1, 4, axis=1) |
|
b2_y1, b2_x1, b2_y2, b2_x2 = tf.split(b2, 4, axis=1) |
|
y1 = tf.maximum(b1_y1, b2_y1) |
|
x1 = tf.maximum(b1_x1, b2_x1) |
|
y2 = tf.minimum(b1_y2, b2_y2) |
|
x2 = tf.minimum(b1_x2, b2_x2) |
|
intersection = tf.maximum(x2 - x1, 0) * tf.maximum(y2 - y1, 0) |
|
|
|
b1_area = (b1_y2 - b1_y1) * (b1_x2 - b1_x1) |
|
b2_area = (b2_y2 - b2_y1) * (b2_x2 - b2_x1) |
|
union = b1_area + b2_area - intersection |
|
|
|
iou = intersection / union |
|
overlaps = tf.reshape(iou, [tf.shape(boxes1)[0], tf.shape(boxes2)[0]]) |
|
return overlaps |
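# Illustrative sketch with assumed values (not part of the original file): two
# unit boxes that overlap over half their area have IoU = 0.5 / 1.5 = 1/3.
#   b1 = tf.constant([[0., 0., 1., 1.]])
#   b2 = tf.constant([[0., 0.5, 1., 1.5]])
#   overlaps_graph(b1, b2)                             # -> [[0.3333]]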
|
|
|
|
|
def detection_targets_graph(proposals, gt_class_ids, gt_boxes, gt_masks, config): |
|
"""Generates detection targets for one image. Subsamples proposals and |
|
generates target class IDs, bounding box deltas, and masks for each. |
|
|
|
Inputs: |
|
proposals: [N, (y1, x1, y2, x2)] in normalized coordinates. Might |
|
be zero padded if there are not enough proposals. |
|
gt_class_ids: [MAX_GT_INSTANCES] int class IDs |
|
gt_boxes: [MAX_GT_INSTANCES, (y1, x1, y2, x2)] in normalized coordinates. |
|
gt_masks: [height, width, MAX_GT_INSTANCES] of boolean type. |
|
|
|
Returns: Target ROIs and corresponding class IDs, bounding box shifts, |
|
and masks. |
|
rois: [TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in normalized coordinates |
|
class_ids: [TRAIN_ROIS_PER_IMAGE]. Integer class IDs. Zero padded. |
|
    deltas: [TRAIN_ROIS_PER_IMAGE, (dy, dx, log(dh), log(dw))]

            Bbox refinements for positive ROIs. Zero padded.

    masks: [TRAIN_ROIS_PER_IMAGE, height, width]. Masks cropped to bbox
|
boundaries and resized to neural network output size. |
|
|
|
Note: Returned arrays might be zero padded if not enough target ROIs. |
|
""" |
|
|
|
asserts = [ |
|
tf.Assert(tf.greater(tf.shape(proposals)[0], 0), [proposals], |
|
name="roi_assertion"), |
|
] |
|
with tf.control_dependencies(asserts): |
|
proposals = tf.identity(proposals) |
|
|
|
|
|
proposals, _ = trim_zeros_graph(proposals, name="trim_proposals") |
|
gt_boxes, non_zeros = trim_zeros_graph(gt_boxes, name="trim_gt_boxes") |
|
gt_class_ids = tf.boolean_mask(gt_class_ids, non_zeros, |
|
name="trim_gt_class_ids") |
|
gt_masks = tf.gather(gt_masks, tf.where(non_zeros)[:, 0], axis=2, |
|
name="trim_gt_masks") |
|
|
|
|
|
|
|
|
|
crowd_ix = tf.where(gt_class_ids < 0)[:, 0] |
|
non_crowd_ix = tf.where(gt_class_ids > 0)[:, 0] |
|
crowd_boxes = tf.gather(gt_boxes, crowd_ix) |
|
crowd_masks = tf.gather(gt_masks, crowd_ix, axis=2) |
|
gt_class_ids = tf.gather(gt_class_ids, non_crowd_ix) |
|
gt_boxes = tf.gather(gt_boxes, non_crowd_ix) |
|
gt_masks = tf.gather(gt_masks, non_crowd_ix, axis=2) |
|
|
|
|
|
overlaps = overlaps_graph(proposals, gt_boxes) |
|
|
|
|
|
crowd_overlaps = overlaps_graph(proposals, crowd_boxes) |
|
crowd_iou_max = tf.reduce_max(crowd_overlaps, axis=1) |
|
no_crowd_bool = (crowd_iou_max < 0.001) |
|
|
|
|
|
roi_iou_max = tf.reduce_max(overlaps, axis=1) |
|
|
|
positive_roi_bool = (roi_iou_max >= 0.5) |
|
positive_indices = tf.where(positive_roi_bool)[:, 0] |
|
|
|
negative_indices = tf.where(tf.logical_and(roi_iou_max < 0.5, no_crowd_bool))[:, 0] |
|
|
|
|
|
|
|
positive_count = int(config.TRAIN_ROIS_PER_IMAGE * |
|
config.ROI_POSITIVE_RATIO) |
|
positive_indices = tf.random_shuffle(positive_indices)[:positive_count] |
|
positive_count = tf.shape(positive_indices)[0] |
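    # Negative ROIs. Add enough to keep the requested positive:negative ratio,
    # i.e. roughly (1 / ROI_POSITIVE_RATIO - 1) negatives per positive ROI.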
|
|
|
r = 1.0 / config.ROI_POSITIVE_RATIO |
|
negative_count = tf.cast(r * tf.cast(positive_count, tf.float32), tf.int32) - positive_count |
|
negative_indices = tf.random_shuffle(negative_indices)[:negative_count] |
|
|
|
positive_rois = tf.gather(proposals, positive_indices) |
|
negative_rois = tf.gather(proposals, negative_indices) |
|
|
|
|
|
positive_overlaps = tf.gather(overlaps, positive_indices) |
|
roi_gt_box_assignment = tf.argmax(positive_overlaps, axis=1) |
|
roi_gt_boxes = tf.gather(gt_boxes, roi_gt_box_assignment) |
|
roi_gt_class_ids = tf.gather(gt_class_ids, roi_gt_box_assignment) |
|
|
|
|
|
deltas = utils.box_refinement_graph(positive_rois, roi_gt_boxes) |
|
deltas /= config.BBOX_STD_DEV |
|
|
|
|
|
|
|
transposed_masks = tf.expand_dims(tf.transpose(gt_masks, [2, 0, 1]), -1) |
|
|
|
roi_masks = tf.gather(transposed_masks, roi_gt_box_assignment) |
|
|
|
|
|
boxes = positive_rois |
|
if config.USE_MINI_MASK: |
|
|
|
|
|
y1, x1, y2, x2 = tf.split(positive_rois, 4, axis=1) |
|
gt_y1, gt_x1, gt_y2, gt_x2 = tf.split(roi_gt_boxes, 4, axis=1) |
|
gt_h = gt_y2 - gt_y1 |
|
gt_w = gt_x2 - gt_x1 |
|
y1 = (y1 - gt_y1) / gt_h |
|
x1 = (x1 - gt_x1) / gt_w |
|
y2 = (y2 - gt_y1) / gt_h |
|
x2 = (x2 - gt_x1) / gt_w |
|
boxes = tf.concat([y1, x1, y2, x2], 1) |
|
box_ids = tf.range(0, tf.shape(roi_masks)[0]) |
|
masks = tf.image.crop_and_resize(tf.cast(roi_masks, tf.float32), boxes, |
|
box_ids, |
|
config.MASK_SHAPE) |
|
|
|
masks = tf.squeeze(masks, axis=3) |
|
|
|
|
|
|
|
masks = tf.round(masks) |
|
|
|
|
|
|
|
rois = tf.concat([positive_rois, negative_rois], axis=0) |
|
N = tf.shape(negative_rois)[0] |
|
P = tf.maximum(config.TRAIN_ROIS_PER_IMAGE - tf.shape(rois)[0], 0) |
|
rois = tf.pad(rois, [(0, P), (0, 0)]) |
|
roi_gt_boxes = tf.pad(roi_gt_boxes, [(0, N + P), (0, 0)]) |
|
roi_gt_class_ids = tf.pad(roi_gt_class_ids, [(0, N + P)]) |
|
deltas = tf.pad(deltas, [(0, N + P), (0, 0)]) |
|
masks = tf.pad(masks, [[0, N + P], (0, 0), (0, 0)]) |
|
|
|
return rois, roi_gt_class_ids, deltas, masks |
|
|
|
|
|
class DetectionTargetLayer(KE.Layer): |
|
"""Subsamples proposals and generates target box refinement, class_ids, |
|
and masks for each. |
|
|
|
Inputs: |
|
proposals: [batch, N, (y1, x1, y2, x2)] in normalized coordinates. Might |
|
be zero padded if there are not enough proposals. |
|
gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs. |
|
gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in normalized |
|
coordinates. |
|
gt_masks: [batch, height, width, MAX_GT_INSTANCES] of boolean type |
|
|
|
Returns: Target ROIs and corresponding class IDs, bounding box shifts, |
|
and masks. |
|
rois: [batch, TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in normalized |
|
coordinates |
|
target_class_ids: [batch, TRAIN_ROIS_PER_IMAGE]. Integer class IDs. |
|
    target_deltas: [batch, TRAIN_ROIS_PER_IMAGE, (dy, dx, log(dh), log(dw))]

                   Bbox refinements for positive ROIs. Zero padded.
|
    target_mask: [batch, TRAIN_ROIS_PER_IMAGE, height, width]
|
Masks cropped to bbox boundaries and resized to neural |
|
network output size. |
|
|
|
Note: Returned arrays might be zero padded if not enough target ROIs. |
|
""" |
|
|
|
def __init__(self, config, **kwargs): |
|
super(DetectionTargetLayer, self).__init__(**kwargs) |
|
self.config = config |
|
|
|
def call(self, inputs): |
|
proposals = inputs[0] |
|
gt_class_ids = inputs[1] |
|
gt_boxes = inputs[2] |
|
gt_masks = inputs[3] |
|
|
|
|
|
|
|
names = ["rois", "target_class_ids", "target_bbox", "target_mask"] |
|
outputs = utils.batch_slice( |
|
[proposals, gt_class_ids, gt_boxes, gt_masks], |
|
lambda w, x, y, z: detection_targets_graph( |
|
w, x, y, z, self.config), |
|
self.config.IMAGES_PER_GPU, names=names) |
|
return outputs |
|
|
|
def compute_output_shape(self, input_shape): |
|
return [ |
|
(None, self.config.TRAIN_ROIS_PER_IMAGE, 4), |
|
(None, 1), |
|
(None, self.config.TRAIN_ROIS_PER_IMAGE, 4), |
|
(None, self.config.TRAIN_ROIS_PER_IMAGE, self.config.MASK_SHAPE[0], |
|
self.config.MASK_SHAPE[1]) |
|
] |
|
|
|
def compute_mask(self, inputs, mask=None): |
|
return [None, None, None, None] |
|
|
|
|
|
|
|
|
|
|
|
|
|
def clip_to_window(window, boxes): |
|
""" |
|
window: (y1, x1, y2, x2). The window in the image we want to clip to. |
|
boxes: [N, (y1, x1, y2, x2)] |
|
""" |
|
boxes[:, 0] = np.maximum(np.minimum(boxes[:, 0], window[2]), window[0]) |
|
boxes[:, 1] = np.maximum(np.minimum(boxes[:, 1], window[3]), window[1]) |
|
boxes[:, 2] = np.maximum(np.minimum(boxes[:, 2], window[2]), window[0]) |
|
boxes[:, 3] = np.maximum(np.minimum(boxes[:, 3], window[3]), window[1]) |
|
return boxes |
|
|
|
|
|
def refine_detections_graph(rois, probs, deltas, window, config): |
|
"""Refine classified proposals and filter overlaps and return final |
|
detections. |
|
|
|
Inputs: |
|
rois: [N, (y1, x1, y2, x2)] in normalized coordinates |
|
probs: [N, num_classes]. Class probabilities. |
|
deltas: [N, num_classes, (dy, dx, log(dh), log(dw))]. Class-specific |
|
bounding box deltas. |
|
window: (y1, x1, y2, x2) in image coordinates. The part of the image |
|
that contains the image excluding the padding. |
|
|
|
Returns detections shaped: [N, (y1, x1, y2, x2, class_id, score)] where |
|
coordinates are in image domain. |
|
""" |
|
|
|
class_ids = tf.argmax(probs, axis=1, output_type=tf.int32) |
|
|
|
indices = tf.stack([tf.range(probs.shape[0]), class_ids], axis=1) |
|
class_scores = tf.gather_nd(probs, indices) |
|
|
|
deltas_specific = tf.gather_nd(deltas, indices) |
|
|
|
|
|
refined_rois = apply_box_deltas_graph( |
|
rois, deltas_specific * config.BBOX_STD_DEV) |
|
|
|
|
|
height, width = config.IMAGE_SHAPE[:2] |
|
refined_rois *= tf.constant([height, width, height, width], dtype=tf.float32) |
|
|
|
refined_rois = clip_boxes_graph(refined_rois, window) |
|
|
|
refined_rois = tf.to_int32(tf.rint(refined_rois)) |
|
|
|
|
|
|
|
|
|
keep = tf.where(class_ids > 0)[:, 0] |
|
|
|
if config.DETECTION_MIN_CONFIDENCE: |
|
conf_keep = tf.where(class_scores >= config.DETECTION_MIN_CONFIDENCE)[:, 0] |
|
keep = tf.sets.set_intersection(tf.expand_dims(keep, 0), |
|
tf.expand_dims(conf_keep, 0)) |
|
keep = tf.sparse_tensor_to_dense(keep)[0] |
|
|
|
|
|
|
|
pre_nms_class_ids = tf.gather(class_ids, keep) |
|
pre_nms_scores = tf.gather(class_scores, keep) |
|
pre_nms_rois = tf.gather(refined_rois, keep) |
|
unique_pre_nms_class_ids = tf.unique(pre_nms_class_ids)[0] |
|
|
|
def nms_keep_map(class_id): |
|
"""Apply Non-Maximum Suppression on ROIs of the given class.""" |
|
|
|
ixs = tf.where(tf.equal(pre_nms_class_ids, class_id))[:, 0] |
|
|
|
class_keep = tf.image.non_max_suppression( |
|
tf.to_float(tf.gather(pre_nms_rois, ixs)), |
|
tf.gather(pre_nms_scores, ixs), |
|
max_output_size=config.DETECTION_MAX_INSTANCES, |
|
iou_threshold=config.DETECTION_NMS_THRESHOLD) |
|
|
|
class_keep = tf.gather(keep, tf.gather(ixs, class_keep)) |
|
|
|
gap = config.DETECTION_MAX_INSTANCES - tf.shape(class_keep)[0] |
|
class_keep = tf.pad(class_keep, [(0, gap)], |
|
mode='CONSTANT', constant_values=-1) |
|
|
|
class_keep.set_shape([config.DETECTION_MAX_INSTANCES]) |
|
return class_keep |
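    # Apply per-class NMS: map nms_keep_map over each unique class ID, then
    # merge the kept indices and intersect them with the current keep list.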
|
|
|
|
|
nms_keep = tf.map_fn(nms_keep_map, unique_pre_nms_class_ids, |
|
dtype=tf.int64) |
|
|
|
nms_keep = tf.reshape(nms_keep, [-1]) |
|
nms_keep = tf.gather(nms_keep, tf.where(nms_keep > -1)[:, 0]) |
|
|
|
keep = tf.sets.set_intersection(tf.expand_dims(keep, 0), |
|
tf.expand_dims(nms_keep, 0)) |
|
keep = tf.sparse_tensor_to_dense(keep)[0] |
|
|
|
roi_count = config.DETECTION_MAX_INSTANCES |
|
class_scores_keep = tf.gather(class_scores, keep) |
|
num_keep = tf.minimum(tf.shape(class_scores_keep)[0], roi_count) |
|
top_ids = tf.nn.top_k(class_scores_keep, k=num_keep, sorted=True)[1] |
|
keep = tf.gather(keep, top_ids) |
|
|
|
|
|
|
|
detections = tf.concat([ |
|
tf.to_float(tf.gather(refined_rois, keep)), |
|
tf.to_float(tf.gather(class_ids, keep))[..., tf.newaxis], |
|
tf.gather(class_scores, keep)[..., tf.newaxis] |
|
], axis=1) |
|
|
|
|
|
gap = config.DETECTION_MAX_INSTANCES - tf.shape(detections)[0] |
|
detections = tf.pad(detections, [(0, gap), (0, 0)], "CONSTANT") |
|
return detections |
|
|
|
|
|
class DetectionLayer(KE.Layer): |
|
"""Takes classified proposal boxes and their bounding box deltas and |
|
returns the final detection boxes. |
|
|
|
Returns: |
|
[batch, num_detections, (y1, x1, y2, x2, class_id, class_score)] where |
|
coordinates are in image domain |
|
""" |
|
|
|
def __init__(self, config=None, **kwargs): |
|
super(DetectionLayer, self).__init__(**kwargs) |
|
self.config = config |
|
|
|
def call(self, inputs): |
|
rois = inputs[0] |
|
mrcnn_class = inputs[1] |
|
mrcnn_bbox = inputs[2] |
|
image_meta = inputs[3] |
|
|
|
|
|
_, _, window, _ = parse_image_meta_graph(image_meta) |
|
detections_batch = utils.batch_slice( |
|
[rois, mrcnn_class, mrcnn_bbox, window], |
|
lambda x, y, w, z: refine_detections_graph(x, y, w, z, self.config), |
|
self.config.IMAGES_PER_GPU) |
|
|
|
|
|
|
|
return tf.reshape( |
|
detections_batch, |
|
[self.config.BATCH_SIZE, self.config.DETECTION_MAX_INSTANCES, 6]) |
|
|
|
def compute_output_shape(self, input_shape): |
|
return (None, self.config.DETECTION_MAX_INSTANCES, 6) |
|
|
|
|
|
|
|
|
|
def rpn_graph(feature_map, anchors_per_location, anchor_stride): |
|
"""Builds the computation graph of Region Proposal Network. |
|
|
|
feature_map: backbone features [batch, height, width, depth] |
|
anchors_per_location: number of anchors per pixel in the feature map |
|
anchor_stride: Controls the density of anchors. Typically 1 (anchors for |
|
every pixel in the feature map), or 2 (every other pixel). |
|
|
|
Returns: |
|
rpn_logits: [batch, H, W, 2] Anchor classifier logits (before softmax) |
|
rpn_probs: [batch, H, W, 2] Anchor classifier probabilities. |
|
rpn_bbox: [batch, H, W, (dy, dx, log(dh), log(dw))] Deltas to be |
|
applied to anchors. |
|
""" |
|
|
|
|
|
|
|
shared = KL.Conv2D(512, (3, 3), padding='same', activation='relu', |
|
strides=anchor_stride, |
|
name='rpn_conv_shared')(feature_map) |
|
|
|
|
|
x = KL.Conv2D(2 * anchors_per_location, (1, 1), padding='valid', |
|
activation='linear', name='rpn_class_raw')(shared) |
|
|
|
|
|
rpn_class_logits = KL.Lambda( |
|
lambda t: tf.reshape(t, [tf.shape(t)[0], -1, 2]))(x) |
|
|
|
|
|
rpn_probs = KL.Activation( |
|
"softmax", name="rpn_class_xxx")(rpn_class_logits) |
|
|
|
|
|
|
|
x = KL.Conv2D(anchors_per_location * 4, (1, 1), padding="valid", |
|
activation='linear', name='rpn_bbox_pred')(shared) |
|
|
|
|
|
rpn_bbox = KL.Lambda(lambda t: tf.reshape(t, [tf.shape(t)[0], -1, 4]))(x) |
|
|
|
return [rpn_class_logits, rpn_probs, rpn_bbox] |
|
|
|
|
|
def build_rpn_model(anchor_stride, anchors_per_location, depth): |
|
"""Builds a Keras model of the Region Proposal Network. |
|
It wraps the RPN graph so it can be used multiple times with shared |
|
weights. |
|
|
|
anchors_per_location: number of anchors per pixel in the feature map |
|
anchor_stride: Controls the density of anchors. Typically 1 (anchors for |
|
every pixel in the feature map), or 2 (every other pixel). |
|
depth: Depth of the backbone feature map. |
|
|
|
Returns a Keras Model object. The model outputs, when called, are: |
|
rpn_logits: [batch, H, W, 2] Anchor classifier logits (before softmax) |
|
    rpn_probs: [batch, H, W, 2] Anchor classifier probabilities.
|
rpn_bbox: [batch, H, W, (dy, dx, log(dh), log(dw))] Deltas to be |
|
applied to anchors. |
|
""" |
|
input_feature_map = KL.Input(shape=[None, None, depth], |
|
name="input_rpn_feature_map") |
|
outputs = rpn_graph(input_feature_map, anchors_per_location, anchor_stride) |
|
return KM.Model([input_feature_map], outputs, name="rpn_model") |
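# Illustrative usage sketch (not part of the original file; the feature map is
# hypothetical): the same RPN model is applied to every FPN level, so its
# weights are shared across levels.
#   rpn = build_rpn_model(anchor_stride=1, anchors_per_location=3, depth=256)
#   rpn_class_logits, rpn_probs, rpn_bbox = rpn([some_feature_map])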
|
|
|
|
|
|
|
|
|
|
|
|
|
def fpn_classifier_graph(rois, feature_maps, |
|
image_shape, pool_size, num_classes): |
|
"""Builds the computation graph of the feature pyramid network classifier |
|
and regressor heads. |
|
|
|
rois: [batch, num_rois, (y1, x1, y2, x2)] Proposal boxes in normalized |
|
coordinates. |
|
    feature_maps: List of feature maps from different layers of the pyramid,
|
[P2, P3, P4, P5]. Each has a different resolution. |
|
image_shape: [height, width, depth] |
|
pool_size: The width of the square feature map generated from ROI Pooling. |
|
num_classes: number of classes, which determines the depth of the results |
|
|
|
Returns: |
|
logits: [N, NUM_CLASSES] classifier logits (before softmax) |
|
probs: [N, NUM_CLASSES] classifier probabilities |
|
        bbox_deltas: [N, NUM_CLASSES, (dy, dx, log(dh), log(dw))] Deltas to apply to
|
proposal boxes |
|
""" |
|
|
|
|
|
x = PyramidROIAlign([pool_size, pool_size], image_shape, |
|
name="roi_align_classifier")([rois] + feature_maps) |
|
|
|
x = KL.TimeDistributed(KL.Conv2D(1024, (pool_size, pool_size), padding="valid"), |
|
name="mrcnn_class_conv1")(x) |
|
x = KL.TimeDistributed(BatchNorm(axis=3), name='mrcnn_class_bn1')(x) |
|
x = KL.Activation('relu')(x) |
|
x = KL.TimeDistributed(KL.Conv2D(1024, (1, 1)), |
|
name="mrcnn_class_conv2")(x) |
|
x = KL.TimeDistributed(BatchNorm(axis=3), |
|
name='mrcnn_class_bn2')(x) |
|
x = KL.Activation('relu')(x) |
|
|
|
shared = KL.Lambda(lambda x: K.squeeze(K.squeeze(x, 3), 2), |
|
name="pool_squeeze")(x) |
|
|
|
|
|
mrcnn_class_logits = KL.TimeDistributed(KL.Dense(num_classes), |
|
name='mrcnn_class_logits')(shared) |
|
mrcnn_probs = KL.TimeDistributed(KL.Activation("softmax"), |
|
name="mrcnn_class")(mrcnn_class_logits) |
|
|
|
|
|
|
|
x = KL.TimeDistributed(KL.Dense(num_classes * 4, activation='linear'), |
|
name='mrcnn_bbox_fc')(shared) |
|
|
|
s = K.int_shape(x) |
|
mrcnn_bbox = KL.Reshape((s[1], num_classes, 4), name="mrcnn_bbox")(x) |
|
|
|
return mrcnn_class_logits, mrcnn_probs, mrcnn_bbox |
|
|
|
|
|
def build_fpn_mask_graph(rois, feature_maps, |
|
image_shape, pool_size, num_classes): |
|
"""Builds the computation graph of the mask head of Feature Pyramid Network. |
|
|
|
rois: [batch, num_rois, (y1, x1, y2, x2)] Proposal boxes in normalized |
|
coordinates. |
|
    feature_maps: List of feature maps from different layers of the pyramid,
|
[P2, P3, P4, P5]. Each has a different resolution. |
|
image_shape: [height, width, depth] |
|
pool_size: The width of the square feature map generated from ROI Pooling. |
|
num_classes: number of classes, which determines the depth of the results |
|
|
|
Returns: Masks [batch, roi_count, height, width, num_classes] |
|
""" |
|
|
|
|
|
x = PyramidROIAlign([pool_size, pool_size], image_shape, |
|
name="roi_align_mask")([rois] + feature_maps) |
|
|
|
|
|
x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"), |
|
name="mrcnn_mask_conv1")(x) |
|
x = KL.TimeDistributed(BatchNorm(axis=3), |
|
name='mrcnn_mask_bn1')(x) |
|
x = KL.Activation('relu')(x) |
|
|
|
x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"), |
|
name="mrcnn_mask_conv2")(x) |
|
x = KL.TimeDistributed(BatchNorm(axis=3), |
|
name='mrcnn_mask_bn2')(x) |
|
x = KL.Activation('relu')(x) |
|
|
|
x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"), |
|
name="mrcnn_mask_conv3")(x) |
|
x = KL.TimeDistributed(BatchNorm(axis=3), |
|
name='mrcnn_mask_bn3')(x) |
|
x = KL.Activation('relu')(x) |
|
|
|
x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"), |
|
name="mrcnn_mask_conv4")(x) |
|
x = KL.TimeDistributed(BatchNorm(axis=3), |
|
name='mrcnn_mask_bn4')(x) |
|
x = KL.Activation('relu')(x) |
|
|
|
x = KL.TimeDistributed(KL.Conv2DTranspose(256, (2, 2), strides=2, activation="relu"), |
|
name="mrcnn_mask_deconv")(x) |
|
x = KL.TimeDistributed(KL.Conv2D(num_classes, (1, 1), strides=1, activation="sigmoid"), |
|
name="mrcnn_mask")(x) |
|
return x |
|
|
|
|
|
|
|
|
|
|
|
|
|
def smooth_l1_loss(y_true, y_pred): |
|
"""Implements Smooth-L1 loss. |
|
    y_true and y_pred are typically: [N, 4], but could be any shape.
|
""" |
|
diff = K.abs(y_true - y_pred) |
|
less_than_one = K.cast(K.less(diff, 1.0), "float32") |
|
loss = (less_than_one * 0.5 * diff**2) + (1 - less_than_one) * (diff - 0.5) |
|
return loss |
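# Illustrative note (not part of the original file): with d = |y_true - y_pred|,
# the loss above is 0.5 * d**2 when d < 1 and d - 0.5 otherwise, applied
# element-wise.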
|
|
|
|
|
def rpn_class_loss_graph(rpn_match, rpn_class_logits): |
|
"""RPN anchor classifier loss. |
|
|
|
rpn_match: [batch, anchors, 1]. Anchor match type. 1=positive, |
|
-1=negative, 0=neutral anchor. |
|
rpn_class_logits: [batch, anchors, 2]. RPN classifier logits for FG/BG. |
|
""" |
|
|
|
rpn_match = tf.squeeze(rpn_match, -1) |
|
|
|
anchor_class = K.cast(K.equal(rpn_match, 1), tf.int32) |
|
|
|
|
|
indices = tf.where(K.not_equal(rpn_match, 0)) |
|
|
|
rpn_class_logits = tf.gather_nd(rpn_class_logits, indices) |
|
anchor_class = tf.gather_nd(anchor_class, indices) |
|
|
|
loss = K.sparse_categorical_crossentropy(target=anchor_class, |
|
output=rpn_class_logits, |
|
from_logits=True) |
|
loss = K.switch(tf.size(loss) > 0, K.mean(loss), tf.constant(0.0)) |
|
return loss |
|
|
|
|
|
def rpn_bbox_loss_graph(config, target_bbox, rpn_match, rpn_bbox): |
|
"""Return the RPN bounding box loss graph. |
|
|
|
config: the model config object. |
|
target_bbox: [batch, max positive anchors, (dy, dx, log(dh), log(dw))]. |
|
        Uses 0 padding to fill in unused bbox deltas.
|
rpn_match: [batch, anchors, 1]. Anchor match type. 1=positive, |
|
-1=negative, 0=neutral anchor. |
|
rpn_bbox: [batch, anchors, (dy, dx, log(dh), log(dw))] |
|
""" |
|
|
|
|
|
rpn_match = K.squeeze(rpn_match, -1) |
|
indices = tf.where(K.equal(rpn_match, 1)) |
|
|
|
|
|
rpn_bbox = tf.gather_nd(rpn_bbox, indices) |
|
|
|
|
|
batch_counts = K.sum(K.cast(K.equal(rpn_match, 1), tf.int32), axis=1) |
|
target_bbox = batch_pack_graph(target_bbox, batch_counts, |
|
config.IMAGES_PER_GPU) |
|
|
|
|
|
|
|
diff = K.abs(target_bbox - rpn_bbox) |
|
less_than_one = K.cast(K.less(diff, 1.0), "float32") |
|
loss = (less_than_one * 0.5 * diff**2) + (1 - less_than_one) * (diff - 0.5) |
|
|
|
loss = K.switch(tf.size(loss) > 0, K.mean(loss), tf.constant(0.0)) |
|
return loss |
|
|
|
|
|
def mrcnn_class_loss_graph(target_class_ids, pred_class_logits, |
|
active_class_ids): |
|
"""Loss for the classifier head of Mask RCNN. |
|
|
|
target_class_ids: [batch, num_rois]. Integer class IDs. Uses zero |
|
padding to fill in the array. |
|
pred_class_logits: [batch, num_rois, num_classes] |
|
active_class_ids: [batch, num_classes]. Has a value of 1 for |
|
classes that are in the dataset of the image, and 0 |
|
for classes that are not in the dataset. |
|
""" |
|
target_class_ids = tf.cast(target_class_ids, 'int64') |
|
|
|
|
|
pred_class_ids = tf.argmax(pred_class_logits, axis=2) |
|
|
|
|
|
pred_active = tf.gather(active_class_ids[0], pred_class_ids) |
|
|
|
|
|
loss = tf.nn.sparse_softmax_cross_entropy_with_logits( |
|
labels=target_class_ids, logits=pred_class_logits) |
|
|
|
|
|
|
|
loss = loss * pred_active |
|
|
|
|
|
|
|
loss = tf.reduce_sum(loss) / tf.reduce_sum(pred_active) |
|
return loss |
|
|
|
|
|
def mrcnn_bbox_loss_graph(target_bbox, target_class_ids, pred_bbox): |
|
"""Loss for Mask R-CNN bounding box refinement. |
|
|
|
target_bbox: [batch, num_rois, (dy, dx, log(dh), log(dw))] |
|
target_class_ids: [batch, num_rois]. Integer class IDs. |
|
pred_bbox: [batch, num_rois, num_classes, (dy, dx, log(dh), log(dw))] |
|
""" |
|
|
|
target_class_ids = K.reshape(target_class_ids, (-1,)) |
|
target_bbox = K.reshape(target_bbox, (-1, 4)) |
|
pred_bbox = K.reshape(pred_bbox, (-1, K.int_shape(pred_bbox)[2], 4)) |
|
|
|
|
|
|
|
positive_roi_ix = tf.where(target_class_ids > 0)[:, 0] |
|
positive_roi_class_ids = tf.cast( |
|
tf.gather(target_class_ids, positive_roi_ix), tf.int64) |
|
indices = tf.stack([positive_roi_ix, positive_roi_class_ids], axis=1) |
|
|
|
|
|
target_bbox = tf.gather(target_bbox, positive_roi_ix) |
|
pred_bbox = tf.gather_nd(pred_bbox, indices) |
|
|
|
|
|
loss = K.switch(tf.size(target_bbox) > 0, |
|
smooth_l1_loss(y_true=target_bbox, y_pred=pred_bbox), |
|
tf.constant(0.0)) |
|
loss = K.mean(loss) |
|
loss = K.reshape(loss, [1, 1]) |
|
return loss |
|
|
|
|
|
def mrcnn_mask_loss_graph(target_masks, target_class_ids, pred_masks): |
|
"""Mask binary cross-entropy loss for the masks head. |
|
|
|
target_masks: [batch, num_rois, height, width]. |
|
A float32 tensor of values 0 or 1. Uses zero padding to fill array. |
|
target_class_ids: [batch, num_rois]. Integer class IDs. Zero padded. |
|
pred_masks: [batch, proposals, height, width, num_classes] float32 tensor |
|
with values from 0 to 1. |
|
""" |
|
|
|
target_class_ids = K.reshape(target_class_ids, (-1,)) |
|
mask_shape = tf.shape(target_masks) |
|
target_masks = K.reshape(target_masks, (-1, mask_shape[2], mask_shape[3])) |
|
pred_shape = tf.shape(pred_masks) |
|
pred_masks = K.reshape(pred_masks, |
|
(-1, pred_shape[2], pred_shape[3], pred_shape[4])) |
|
|
|
pred_masks = tf.transpose(pred_masks, [0, 3, 1, 2]) |
|
|
|
|
|
|
|
positive_ix = tf.where(target_class_ids > 0)[:, 0] |
|
positive_class_ids = tf.cast( |
|
tf.gather(target_class_ids, positive_ix), tf.int64) |
|
indices = tf.stack([positive_ix, positive_class_ids], axis=1) |
|
|
|
|
|
y_true = tf.gather(target_masks, positive_ix) |
|
y_pred = tf.gather_nd(pred_masks, indices) |
|
|
|
|
|
|
|
loss = K.switch(tf.size(y_true) > 0, |
|
K.binary_crossentropy(target=y_true, output=y_pred), |
|
tf.constant(0.0)) |
|
loss = K.mean(loss) |
|
loss = K.reshape(loss, [1, 1]) |
|
return loss |
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_image_gt(dataset, config, image_id, augment=False, |
|
use_mini_mask=False): |
|
"""Load and return ground truth data for an image (image, mask, bounding boxes). |
|
|
|
augment: If true, apply random image augmentation. Currently, only |
|
horizontal flipping is offered. |
|
use_mini_mask: If False, returns full-size masks that are the same height |
|
and width as the original image. These can be big, for example |
|
1024x1024x100 (for 100 instances). Mini masks are smaller, typically, |
|
224x224 and are generated by extracting the bounding box of the |
|
object and resizing it to MINI_MASK_SHAPE. |
|
|
|
Returns: |
|
image: [height, width, 3] |
|
shape: the original shape of the image before resizing and cropping. |
|
class_ids: [instance_count] Integer class IDs |
|
bbox: [instance_count, (y1, x1, y2, x2)] |
|
mask: [height, width, instance_count]. The height and width are those |
|
of the image unless use_mini_mask is True, in which case they are |
|
defined in MINI_MASK_SHAPE. |
|
""" |
|
|
|
image = dataset.load_image(image_id) |
|
mask, class_ids = dataset.load_mask(image_id) |
|
shape = image.shape |
|
image, window, scale, padding = utils.resize_image( |
|
image, |
|
min_dim=config.IMAGE_MIN_DIM, |
|
max_dim=config.IMAGE_MAX_DIM, |
|
padding=config.IMAGE_PADDING) |
|
mask = utils.resize_mask(mask, scale, padding) |
|
|
|
|
|
if augment: |
|
if random.randint(0, 1): |
|
image = np.fliplr(image) |
|
mask = np.fliplr(mask) |
|
|
|
|
|
|
|
_idx = np.sum(mask, axis=(0, 1)) > 0 |
|
mask = mask[:, :, _idx] |
|
class_ids = class_ids[_idx] |
|
|
|
|
|
|
|
bbox = utils.extract_bboxes(mask) |
|
|
|
|
|
|
|
|
|
active_class_ids = np.zeros([dataset.num_classes], dtype=np.int32) |
|
source_class_ids = dataset.source_class_ids[dataset.image_info[image_id]["source"]] |
|
active_class_ids[source_class_ids] = 1 |
|
|
|
|
|
if use_mini_mask: |
|
mask = utils.minimize_mask(bbox, mask, config.MINI_MASK_SHAPE) |
|
|
|
|
|
image_meta = compose_image_meta(image_id, shape, window, active_class_ids) |
|
|
|
return image, image_meta, class_ids, bbox, mask |
|
|
|
|
|
def build_detection_targets(rpn_rois, gt_class_ids, gt_boxes, gt_masks, config): |
|
"""Generate targets for training Stage 2 classifier and mask heads. |
|
This is not used in normal training. It's useful for debugging or to train |
|
the Mask RCNN heads without using the RPN head. |
|
|
|
Inputs: |
|
rpn_rois: [N, (y1, x1, y2, x2)] proposal boxes. |
|
gt_class_ids: [instance count] Integer class IDs |
|
gt_boxes: [instance count, (y1, x1, y2, x2)] |
|
    gt_masks: [height, width, instance count] Ground truth masks. Can be full
|
size or mini-masks. |
|
|
|
Returns: |
|
rois: [TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] |
|
class_ids: [TRAIN_ROIS_PER_IMAGE]. Integer class IDs. |
|
bboxes: [TRAIN_ROIS_PER_IMAGE, NUM_CLASSES, (y, x, log(h), log(w))]. Class-specific |
|
bbox refinements. |
|
    masks: [TRAIN_ROIS_PER_IMAGE, height, width, NUM_CLASSES]. Class-specific masks cropped
|
to bbox boundaries and resized to neural network output size. |
|
""" |
|
assert rpn_rois.shape[0] > 0 |
|
assert gt_class_ids.dtype == np.int32, "Expected int but got {}".format( |
|
gt_class_ids.dtype) |
|
assert gt_boxes.dtype == np.int32, "Expected int but got {}".format( |
|
gt_boxes.dtype) |
|
assert gt_masks.dtype == np.bool_, "Expected bool but got {}".format( |
|
gt_masks.dtype) |
|
|
|
|
|
|
|
|
|
|
|
instance_ids = np.where(gt_class_ids > 0)[0] |
|
assert instance_ids.shape[0] > 0, "Image must contain instances." |
|
gt_class_ids = gt_class_ids[instance_ids] |
|
gt_boxes = gt_boxes[instance_ids] |
|
gt_masks = gt_masks[:, :, instance_ids] |
|
|
|
|
|
rpn_roi_area = (rpn_rois[:, 2] - rpn_rois[:, 0]) * \ |
|
(rpn_rois[:, 3] - rpn_rois[:, 1]) |
|
gt_box_area = (gt_boxes[:, 2] - gt_boxes[:, 0]) * \ |
|
(gt_boxes[:, 3] - gt_boxes[:, 1]) |
|
|
|
|
|
overlaps = np.zeros((rpn_rois.shape[0], gt_boxes.shape[0])) |
|
for i in range(overlaps.shape[1]): |
|
gt = gt_boxes[i] |
|
overlaps[:, i] = utils.compute_iou( |
|
gt, rpn_rois, gt_box_area[i], rpn_roi_area) |
|
|
|
|
|
rpn_roi_iou_argmax = np.argmax(overlaps, axis=1) |
|
rpn_roi_iou_max = overlaps[np.arange( |
|
overlaps.shape[0]), rpn_roi_iou_argmax] |
|
|
|
rpn_roi_gt_boxes = gt_boxes[rpn_roi_iou_argmax] |
|
rpn_roi_gt_class_ids = gt_class_ids[rpn_roi_iou_argmax] |
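    # Positive ROIs are those with an IoU above 0.5 with some GT box; the rest
    # are treated as background.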
|
|
|
|
|
fg_ids = np.where(rpn_roi_iou_max > 0.5)[0] |
|
|
|
|
|
|
|
|
|
bg_ids = np.where(rpn_roi_iou_max < 0.5)[0] |
|
|
|
|
|
|
|
fg_roi_count = int(config.TRAIN_ROIS_PER_IMAGE * config.ROI_POSITIVE_RATIO) |
|
if fg_ids.shape[0] > fg_roi_count: |
|
keep_fg_ids = np.random.choice(fg_ids, fg_roi_count, replace=False) |
|
else: |
|
keep_fg_ids = fg_ids |
|
|
|
remaining = config.TRAIN_ROIS_PER_IMAGE - keep_fg_ids.shape[0] |
|
if bg_ids.shape[0] > remaining: |
|
keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False) |
|
else: |
|
keep_bg_ids = bg_ids |
|
|
|
keep = np.concatenate([keep_fg_ids, keep_bg_ids]) |
|
|
|
remaining = config.TRAIN_ROIS_PER_IMAGE - keep.shape[0] |
|
if remaining > 0: |
|
|
|
|
|
|
|
|
|
|
|
if keep.shape[0] == 0: |
|
|
|
bg_ids = np.where(rpn_roi_iou_max < 0.5)[0] |
|
assert bg_ids.shape[0] >= remaining |
|
keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False) |
|
assert keep_bg_ids.shape[0] == remaining |
|
keep = np.concatenate([keep, keep_bg_ids]) |
|
else: |
|
|
|
keep_extra_ids = np.random.choice( |
|
keep_bg_ids, remaining, replace=True) |
|
keep = np.concatenate([keep, keep_extra_ids]) |
|
assert keep.shape[0] == config.TRAIN_ROIS_PER_IMAGE, \ |
|
"keep doesn't match ROI batch size {}, {}".format( |
|
keep.shape[0], config.TRAIN_ROIS_PER_IMAGE) |
|
|
|
|
|
rpn_roi_gt_boxes[keep_bg_ids, :] = 0 |
|
rpn_roi_gt_class_ids[keep_bg_ids] = 0 |
|
|
|
|
|
rois = rpn_rois[keep] |
|
roi_gt_boxes = rpn_roi_gt_boxes[keep] |
|
roi_gt_class_ids = rpn_roi_gt_class_ids[keep] |
|
roi_gt_assignment = rpn_roi_iou_argmax[keep] |
|
|
|
|
|
bboxes = np.zeros((config.TRAIN_ROIS_PER_IMAGE, |
|
config.NUM_CLASSES, 4), dtype=np.float32) |
|
pos_ids = np.where(roi_gt_class_ids > 0)[0] |
|
bboxes[pos_ids, roi_gt_class_ids[pos_ids]] = utils.box_refinement( |
|
rois[pos_ids], roi_gt_boxes[pos_ids, :4]) |
|
|
|
bboxes /= config.BBOX_STD_DEV |
|
|
|
|
|
masks = np.zeros((config.TRAIN_ROIS_PER_IMAGE, config.MASK_SHAPE[0], config.MASK_SHAPE[1], config.NUM_CLASSES), |
|
dtype=np.float32) |
|
for i in pos_ids: |
|
class_id = roi_gt_class_ids[i] |
|
assert class_id > 0, "class id must be greater than 0" |
|
gt_id = roi_gt_assignment[i] |
|
class_mask = gt_masks[:, :, gt_id] |
|
|
|
if config.USE_MINI_MASK: |
|
|
|
placeholder = np.zeros(config.IMAGE_SHAPE[:2], dtype=bool) |
|
|
|
gt_y1, gt_x1, gt_y2, gt_x2 = gt_boxes[gt_id] |
|
gt_w = gt_x2 - gt_x1 |
|
gt_h = gt_y2 - gt_y1 |
|
|
|
placeholder[gt_y1:gt_y2, gt_x1:gt_x2] = \ |
|
np.round(scipy.misc.imresize(class_mask.astype(float), (gt_h, gt_w), |
|
interp='nearest') / 255.0).astype(bool) |
|
|
|
class_mask = placeholder |
|
|
|
|
|
y1, x1, y2, x2 = rois[i].astype(np.int32) |
|
m = class_mask[y1:y2, x1:x2] |
|
mask = scipy.misc.imresize( |
|
m.astype(float), config.MASK_SHAPE, interp='nearest') / 255.0 |
|
masks[i, :, :, class_id] = mask |
|
|
|
return rois, roi_gt_class_ids, bboxes, masks |
|
|
|
|
|
def build_rpn_targets(image_shape, anchors, gt_class_ids, gt_boxes, config): |
|
"""Given the anchors and GT boxes, compute overlaps and identify positive |
|
anchors and deltas to refine them to match their corresponding GT boxes. |
|
|
|
anchors: [num_anchors, (y1, x1, y2, x2)] |
|
gt_class_ids: [num_gt_boxes] Integer class IDs. |
|
gt_boxes: [num_gt_boxes, (y1, x1, y2, x2)] |
|
|
|
Returns: |
|
rpn_match: [N] (int32) matches between anchors and GT boxes. |
|
1 = positive anchor, -1 = negative anchor, 0 = neutral |
|
rpn_bbox: [N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas. |
|
""" |
|
|
|
rpn_match = np.zeros([anchors.shape[0]], dtype=np.int32) |
|
|
|
rpn_bbox = np.zeros((config.RPN_TRAIN_ANCHORS_PER_IMAGE, 4)) |
|
|
|
|
|
|
|
|
|
crowd_ix = np.where(gt_class_ids < 0)[0] |
|
if crowd_ix.shape[0] > 0: |
|
|
|
non_crowd_ix = np.where(gt_class_ids > 0)[0] |
|
crowd_boxes = gt_boxes[crowd_ix] |
|
gt_class_ids = gt_class_ids[non_crowd_ix] |
|
gt_boxes = gt_boxes[non_crowd_ix] |
|
|
|
crowd_overlaps = utils.compute_overlaps(anchors, crowd_boxes) |
|
crowd_iou_max = np.amax(crowd_overlaps, axis=1) |
|
no_crowd_bool = (crowd_iou_max < 0.001) |
|
else: |
|
|
|
no_crowd_bool = np.ones([anchors.shape[0]], dtype=bool) |
|
|
|
|
|
overlaps = utils.compute_overlaps(anchors, gt_boxes) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
anchor_iou_argmax = np.argmax(overlaps, axis=1) |
|
anchor_iou_max = overlaps[np.arange(overlaps.shape[0]), anchor_iou_argmax] |
|
rpn_match[(anchor_iou_max < 0.3) & (no_crowd_bool)] = -1 |
|
|
|
|
|
gt_iou_argmax = np.argmax(overlaps, axis=0) |
|
rpn_match[gt_iou_argmax] = 1 |
|
|
|
rpn_match[anchor_iou_max >= 0.7] = 1 |
|
|
|
|
|
|
|
ids = np.where(rpn_match == 1)[0] |
|
extra = len(ids) - (config.RPN_TRAIN_ANCHORS_PER_IMAGE // 2) |
|
if extra > 0: |
|
|
|
ids = np.random.choice(ids, extra, replace=False) |
|
rpn_match[ids] = 0 |
|
|
|
ids = np.where(rpn_match == -1)[0] |
|
extra = len(ids) - (config.RPN_TRAIN_ANCHORS_PER_IMAGE - |
|
np.sum(rpn_match == 1)) |
|
if extra > 0: |
|
|
|
ids = np.random.choice(ids, extra, replace=False) |
|
rpn_match[ids] = 0 |
|
|
|
|
|
|
|
ids = np.where(rpn_match == 1)[0] |
|
ix = 0 |
|
|
|
for i, a in zip(ids, anchors[ids]): |
|
|
|
gt = gt_boxes[anchor_iou_argmax[i]] |
|
|
|
|
|
|
|
gt_h = gt[2] - gt[0] |
|
gt_w = gt[3] - gt[1] |
|
gt_center_y = gt[0] + 0.5 * gt_h |
|
gt_center_x = gt[1] + 0.5 * gt_w |
|
|
|
a_h = a[2] - a[0] |
|
a_w = a[3] - a[1] |
|
a_center_y = a[0] + 0.5 * a_h |
|
a_center_x = a[1] + 0.5 * a_w |
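        # Compute the bbox refinement the RPN should learn to predict:
        # dy = (gt_cy - a_cy) / a_h,  dx = (gt_cx - a_cx) / a_w,
        # dh = log(gt_h / a_h),       dw = log(gt_w / a_w)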
|
|
|
|
|
rpn_bbox[ix] = [ |
|
(gt_center_y - a_center_y) / a_h, |
|
(gt_center_x - a_center_x) / a_w, |
|
np.log(gt_h / a_h), |
|
np.log(gt_w / a_w), |
|
] |
|
|
|
rpn_bbox[ix] /= config.RPN_BBOX_STD_DEV |
|
ix += 1 |
|
|
|
return rpn_match, rpn_bbox |
|
|
|
|
|
def generate_random_rois(image_shape, count, gt_class_ids, gt_boxes): |
|
"""Generates ROI proposals similar to what a region proposal network |
|
would generate. |
|
|
|
image_shape: [Height, Width, Depth] |
|
count: Number of ROIs to generate |
|
gt_class_ids: [N] Integer ground truth class IDs |
|
gt_boxes: [N, (y1, x1, y2, x2)] Ground truth boxes in pixels. |
|
|
|
Returns: [count, (y1, x1, y2, x2)] ROI boxes in pixels. |
|
""" |
|
|
|
rois = np.zeros((count, 4), dtype=np.int32) |
|
|
|
|
|
rois_per_box = int(0.9 * count / gt_boxes.shape[0]) |
|
for i in range(gt_boxes.shape[0]): |
|
gt_y1, gt_x1, gt_y2, gt_x2 = gt_boxes[i] |
|
h = gt_y2 - gt_y1 |
|
w = gt_x2 - gt_x1 |
|
|
|
r_y1 = max(gt_y1 - h, 0) |
|
r_y2 = min(gt_y2 + h, image_shape[0]) |
|
r_x1 = max(gt_x1 - w, 0) |
|
r_x2 = min(gt_x2 + w, image_shape[1]) |
|
|
|
|
|
|
|
|
|
while True: |
|
y1y2 = np.random.randint(r_y1, r_y2, (rois_per_box * 2, 2)) |
|
x1x2 = np.random.randint(r_x1, r_x2, (rois_per_box * 2, 2)) |
|
|
|
threshold = 1 |
|
y1y2 = y1y2[np.abs(y1y2[:, 0] - y1y2[:, 1]) >= |
|
threshold][:rois_per_box] |
|
x1x2 = x1x2[np.abs(x1x2[:, 0] - x1x2[:, 1]) >= |
|
threshold][:rois_per_box] |
|
if y1y2.shape[0] == rois_per_box and x1x2.shape[0] == rois_per_box: |
|
break |
|
|
|
|
|
|
|
x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1) |
|
y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1) |
|
box_rois = np.hstack([y1, x1, y2, x2]) |
|
rois[rois_per_box * i:rois_per_box * (i + 1)] = box_rois |
|
|
|
|
|
remaining_count = count - (rois_per_box * gt_boxes.shape[0]) |
|
|
|
|
|
|
|
while True: |
|
y1y2 = np.random.randint(0, image_shape[0], (remaining_count * 2, 2)) |
|
x1x2 = np.random.randint(0, image_shape[1], (remaining_count * 2, 2)) |
|
|
|
threshold = 1 |
|
y1y2 = y1y2[np.abs(y1y2[:, 0] - y1y2[:, 1]) >= |
|
threshold][:remaining_count] |
|
x1x2 = x1x2[np.abs(x1x2[:, 0] - x1x2[:, 1]) >= |
|
threshold][:remaining_count] |
|
if y1y2.shape[0] == remaining_count and x1x2.shape[0] == remaining_count: |
|
break |
|
|
|
|
|
|
|
x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1) |
|
y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1) |
|
global_rois = np.hstack([y1, x1, y2, x2]) |
|
rois[-remaining_count:] = global_rois |
|
return rois |
|
|
|
|
|
def data_generator(dataset, config, shuffle=True, augment=True, random_rois=0, |
|
batch_size=1, detection_targets=False): |
|
"""A generator that returns images and corresponding target class ids, |
|
bounding box deltas, and masks. |
|
|
|
dataset: The Dataset object to pick data from |
|
config: The model config object |
|
shuffle: If True, shuffles the samples before every epoch |
|
augment: If True, applies image augmentation to images (currently only |
|
horizontal flips are supported) |
|
random_rois: If > 0 then generate proposals to be used to train the |
|
network classifier and mask heads. Useful if training |
|
the Mask RCNN part without the RPN. |
|
batch_size: How many images to return in each call |
|
detection_targets: If True, generate detection targets (class IDs, bbox |
|
deltas, and masks). Typically for debugging or visualizations because |
|
        in training, detection targets are generated by DetectionTargetLayer.
|
|
|
Returns a Python generator. Upon calling next() on it, the |
|
    generator returns two lists, inputs and outputs. The contents

    of the lists differ depending on the received arguments:
|
inputs list: |
|
- images: [batch, H, W, C] |
|
- image_meta: [batch, size of image meta] |
|
- rpn_match: [batch, N] Integer (1=positive anchor, -1=negative, 0=neutral) |
|
- rpn_bbox: [batch, N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas. |
|
- gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs |
|
- gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] |
|
- gt_masks: [batch, height, width, MAX_GT_INSTANCES]. The height and width |
|
are those of the image unless use_mini_mask is True, in which |
|
case they are defined in MINI_MASK_SHAPE. |
|
|
|
outputs list: Usually empty in regular training. But if detection_targets |
|
is True then the outputs list contains target class_ids, bbox deltas, |
|
and masks. |
|
""" |
|
b = 0 |
|
image_index = -1 |
|
image_ids = np.copy(dataset.image_ids) |
|
error_count = 0 |
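    # Anchors: [anchor_count, (y1, x1, y2, x2)]. The same anchors are shared by
    # all images, so they are generated once up front.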
|
|
|
|
|
|
|
anchors = utils.generate_pyramid_anchors(config.RPN_ANCHOR_SCALES, |
|
config.RPN_ANCHOR_RATIOS, |
|
config.BACKBONE_SHAPES, |
|
config.BACKBONE_STRIDES, |
|
config.RPN_ANCHOR_STRIDE) |
|
|
|
|
|
while True: |
|
try: |
|
|
|
image_index = (image_index + 1) % len(image_ids) |
|
if shuffle and image_index == 0: |
|
np.random.shuffle(image_ids) |
|
|
|
|
|
image_id = image_ids[image_index] |
|
image, image_meta, gt_class_ids, gt_boxes, gt_masks = \ |
|
load_image_gt(dataset, config, image_id, augment=augment, |
|
use_mini_mask=config.USE_MINI_MASK) |
|
|
|
|
|
|
|
|
|
if not np.any(gt_class_ids > 0): |
|
continue |
|
|
|
|
|
rpn_match, rpn_bbox = build_rpn_targets(image.shape, anchors, |
|
gt_class_ids, gt_boxes, config) |
|
|
|
|
|
if random_rois: |
|
rpn_rois = generate_random_rois( |
|
image.shape, random_rois, gt_class_ids, gt_boxes) |
|
if detection_targets: |
|
rois, mrcnn_class_ids, mrcnn_bbox, mrcnn_mask =\ |
|
build_detection_targets( |
|
rpn_rois, gt_class_ids, gt_boxes, gt_masks, config) |
|
|
|
|
|
if b == 0: |
|
batch_image_meta = np.zeros( |
|
(batch_size,) + image_meta.shape, dtype=image_meta.dtype) |
|
batch_rpn_match = np.zeros( |
|
[batch_size, anchors.shape[0], 1], dtype=rpn_match.dtype) |
|
batch_rpn_bbox = np.zeros( |
|
[batch_size, config.RPN_TRAIN_ANCHORS_PER_IMAGE, 4], dtype=rpn_bbox.dtype) |
|
batch_images = np.zeros( |
|
(batch_size,) + image.shape, dtype=np.float32) |
|
batch_gt_class_ids = np.zeros( |
|
(batch_size, config.MAX_GT_INSTANCES), dtype=np.int32) |
|
batch_gt_boxes = np.zeros( |
|
(batch_size, config.MAX_GT_INSTANCES, 4), dtype=np.int32) |
|
if config.USE_MINI_MASK: |
|
batch_gt_masks = np.zeros((batch_size, config.MINI_MASK_SHAPE[0], config.MINI_MASK_SHAPE[1], |
|
config.MAX_GT_INSTANCES)) |
|
else: |
|
batch_gt_masks = np.zeros( |
|
(batch_size, image.shape[0], image.shape[1], config.MAX_GT_INSTANCES)) |
|
if random_rois: |
|
batch_rpn_rois = np.zeros( |
|
(batch_size, rpn_rois.shape[0], 4), dtype=rpn_rois.dtype) |
|
if detection_targets: |
|
batch_rois = np.zeros( |
|
(batch_size,) + rois.shape, dtype=rois.dtype) |
|
batch_mrcnn_class_ids = np.zeros( |
|
(batch_size,) + mrcnn_class_ids.shape, dtype=mrcnn_class_ids.dtype) |
|
batch_mrcnn_bbox = np.zeros( |
|
(batch_size,) + mrcnn_bbox.shape, dtype=mrcnn_bbox.dtype) |
|
batch_mrcnn_mask = np.zeros( |
|
(batch_size,) + mrcnn_mask.shape, dtype=mrcnn_mask.dtype) |
|
|
|
|
|
if gt_boxes.shape[0] > config.MAX_GT_INSTANCES: |
|
ids = np.random.choice( |
|
np.arange(gt_boxes.shape[0]), config.MAX_GT_INSTANCES, replace=False) |
|
gt_class_ids = gt_class_ids[ids] |
|
gt_boxes = gt_boxes[ids] |
|
gt_masks = gt_masks[:, :, ids] |
|
|
|
|
|
batch_image_meta[b] = image_meta |
|
batch_rpn_match[b] = rpn_match[:, np.newaxis] |
|
batch_rpn_bbox[b] = rpn_bbox |
|
batch_images[b] = mold_image(image.astype(np.float32), config) |
|
batch_gt_class_ids[b, :gt_class_ids.shape[0]] = gt_class_ids |
|
batch_gt_boxes[b, :gt_boxes.shape[0]] = gt_boxes |
|
batch_gt_masks[b, :, :, :gt_masks.shape[-1]] = gt_masks |
|
if random_rois: |
|
batch_rpn_rois[b] = rpn_rois |
|
if detection_targets: |
|
batch_rois[b] = rois |
|
batch_mrcnn_class_ids[b] = mrcnn_class_ids |
|
batch_mrcnn_bbox[b] = mrcnn_bbox |
|
batch_mrcnn_mask[b] = mrcnn_mask |
|
b += 1 |
|
|
|
|
|
if b >= batch_size: |
|
inputs = [batch_images, batch_image_meta, batch_rpn_match, batch_rpn_bbox, |
|
batch_gt_class_ids, batch_gt_boxes, batch_gt_masks] |
|
outputs = [] |
|
|
|
if random_rois: |
|
inputs.extend([batch_rpn_rois]) |
|
if detection_targets: |
|
inputs.extend([batch_rois]) |
|
|
|
batch_mrcnn_class_ids = np.expand_dims( |
|
batch_mrcnn_class_ids, -1) |
|
outputs.extend( |
|
[batch_mrcnn_class_ids, batch_mrcnn_bbox, batch_mrcnn_mask]) |
|
|
|
yield inputs, outputs |
|
|
|
|
|
b = 0 |
|
except (GeneratorExit, KeyboardInterrupt): |
|
raise |
|
except: |
|
|
|
logging.exception("Error processing image {}".format( |
|
dataset.image_info[image_id])) |
|
error_count += 1 |
|
if error_count > 5: |
|
raise |
|
|
|
|
|
|
|
|
|
|
|
|
|
class MaskRCNN(): |
|
"""Encapsulates the Mask RCNN model functionality. |
|
|
|
The actual Keras model is in the keras_model property. |
|
""" |
|
|
|
def __init__(self, mode, config, model_dir): |
|
""" |
|
mode: Either "training" or "inference" |
|
config: A Sub-class of the Config class |
|
model_dir: Directory to save training logs and trained weights |
|
""" |
|
assert mode in ['training', 'inference'] |
|
self.mode = mode |
|
self.config = config |
|
self.model_dir = model_dir |
|
self.set_log_dir() |
|
self.keras_model = self.build(mode=mode, config=config) |
|
|
|
def build(self, mode, config): |
|
"""Build Mask R-CNN architecture. |
|
        The input image shape is taken from config.IMAGE_SHAPE.
|
mode: Either "training" or "inference". The inputs and |
|
outputs of the model differ accordingly. |
|
""" |
|
assert mode in ['training', 'inference'] |
|
|
|
|
|
h, w = config.IMAGE_SHAPE[:2] |
|
if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6): |
|
raise Exception("Image size must be dividable by 2 at least 6 times " |
|
"to avoid fractions when downscaling and upscaling." |
|
"For example, use 256, 320, 384, 448, 512, ... etc. ") |
|
|
|
|
|
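        # Inputs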
input_image = KL.Input( |
|
shape=config.IMAGE_SHAPE.tolist(), name="input_image") |
|
input_image_meta = KL.Input(shape=[None], name="input_image_meta") |
|
if mode == "training": |
|
|
|
input_rpn_match = KL.Input( |
|
shape=[None, 1], name="input_rpn_match", dtype=tf.int32) |
|
input_rpn_bbox = KL.Input( |
|
shape=[None, 4], name="input_rpn_bbox", dtype=tf.float32) |
|
|
|
|
|
|
|
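            # Detection GT (class IDs, bounding boxes, and masks)
            # 1. GT class IDs (zero padded)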
input_gt_class_ids = KL.Input( |
|
shape=[None], name="input_gt_class_ids", dtype=tf.int32) |
|
|
|
|
|
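            # 2. GT boxes in pixels (zero padded)
            # [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates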
input_gt_boxes = KL.Input( |
|
shape=[None, 4], name="input_gt_boxes", dtype=tf.float32) |
|
|
|
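            # Normalize box coordinates to the 0-1 range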
h, w = K.shape(input_image)[1], K.shape(input_image)[2] |
|
image_scale = K.cast(K.stack([h, w, h, w], axis=0), tf.float32) |
|
gt_boxes = KL.Lambda(lambda x: x / image_scale)(input_gt_boxes) |
|
|
|
|
|
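            # 3. GT masks (zero padded)
            # [batch, height, width, MAX_GT_INSTANCES]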
if config.USE_MINI_MASK: |
|
input_gt_masks = KL.Input( |
|
shape=[config.MINI_MASK_SHAPE[0], |
|
config.MINI_MASK_SHAPE[1], None], |
|
name="input_gt_masks", dtype=bool) |
|
else: |
|
input_gt_masks = KL.Input( |
|
shape=[config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1], None], |
|
name="input_gt_masks", dtype=bool) |
|
|
|
|
|
|
|
|
|
|
|
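        # Build the shared convolutional layers (bottom-up ResNet backbone).
        # Returns the last layer of each stage.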
_, C2, C3, C4, C5 = resnet_graph(input_image, config.BACKBONE, stage5=True) |
|
|
|
|
|
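        # Top-down FPN layers with lateral connections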
P5 = KL.Conv2D(256, (1, 1), name='fpn_c5p5')(C5) |
|
P4 = KL.Add(name="fpn_p4add")([ |
|
KL.UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(P5), |
|
KL.Conv2D(256, (1, 1), name='fpn_c4p4')(C4)]) |
|
P3 = KL.Add(name="fpn_p3add")([ |
|
KL.UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(P4), |
|
KL.Conv2D(256, (1, 1), name='fpn_c3p3')(C3)]) |
|
P2 = KL.Add(name="fpn_p2add")([ |
|
KL.UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(P3), |
|
KL.Conv2D(256, (1, 1), name='fpn_c2p2')(C2)]) |
|
|
|
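        # Attach 3x3 conv to all P layers to get the final feature maps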
P2 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p2")(P2) |
|
P3 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p3")(P3) |
|
P4 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p4")(P4) |
|
P5 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p5")(P5) |
|
|
|
|
|
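        # P6 is used for the 5th anchor scale in the RPN. Generated by
        # subsampling from P5 with stride of 2.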
P6 = KL.MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(P5) |
|
|
|
|
|
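        # Note that P6 is used in the RPN, but not in the classifier heads.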
rpn_feature_maps = [P2, P3, P4, P5, P6] |
|
mrcnn_feature_maps = [P2, P3, P4, P5] |
|
|
|
|
|
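        # Generate anchors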
self.anchors = utils.generate_pyramid_anchors(config.RPN_ANCHOR_SCALES, |
|
config.RPN_ANCHOR_RATIOS, |
|
config.BACKBONE_SHAPES, |
|
config.BACKBONE_STRIDES, |
|
config.RPN_ANCHOR_STRIDE) |
|
|
|
|
|
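        # RPN model, shared across all pyramid levels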
rpn = build_rpn_model(config.RPN_ANCHOR_STRIDE, |
|
len(config.RPN_ANCHOR_RATIOS), 256) |
|
|
|
layer_outputs = [] |
|
for p in rpn_feature_maps: |
|
layer_outputs.append(rpn([p])) |
|
|
|
|
|
|
|
|
|
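        # Concatenate layer outputs
        # Convert from a list of per-level outputs to a list of outputs
        # concatenated across levels, e.g.
        # [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]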
output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"] |
|
outputs = list(zip(*layer_outputs)) |
|
outputs = [KL.Concatenate(axis=1, name=n)(list(o)) |
|
for o, n in zip(outputs, output_names)] |
|
|
|
rpn_class_logits, rpn_class, rpn_bbox = outputs |
|
|
|
|
|
|
|
|
|
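        # Generate proposals
        # Proposals are [batch, N, (y1, x1, y2, x2)] in normalized
        # coordinates and zero padded.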
proposal_count = config.POST_NMS_ROIS_TRAINING if mode == "training"\ |
|
else config.POST_NMS_ROIS_INFERENCE |
|
rpn_rois = ProposalLayer(proposal_count=proposal_count, |
|
nms_threshold=config.RPN_NMS_THRESHOLD, |
|
name="ROI", |
|
anchors=self.anchors, |
|
config=config)([rpn_class, rpn_bbox]) |
|
|
|
if mode == "training": |
|
|
|
|
|
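            # Class ID mask to mark class IDs supported by the dataset the
            # image came from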
_, _, _, active_class_ids = KL.Lambda(lambda x: parse_image_meta_graph(x), |
|
mask=[None, None, None, None])(input_image_meta) |
|
|
|
if not config.USE_RPN_ROIS: |
|
|
|
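                # Ignore predicted ROIs and use ROIs provided as an input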
input_rois = KL.Input(shape=[config.POST_NMS_ROIS_TRAINING, 4], |
|
name="input_roi", dtype=np.int32) |
|
|
|
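                # Normalize coordinates to the 0-1 range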
target_rois = KL.Lambda(lambda x: K.cast( |
|
x, tf.float32) / image_scale[:4])(input_rois) |
|
else: |
|
target_rois = rpn_rois |
|
|
|
|
|
|
|
|
|
|
|
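            # Generate detection targets
            # Subsamples proposals and generates target outputs for training.
            # Note that proposal class IDs, gt_boxes, and gt_masks are zero
            # padded. Equally, returned rois and targets are zero padded.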
rois, target_class_ids, target_bbox, target_mask =\ |
|
DetectionTargetLayer(config, name="proposal_targets")([ |
|
target_rois, input_gt_class_ids, gt_boxes, input_gt_masks]) |
|
|
|
|
|
|
|
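            # Network heads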
mrcnn_class_logits, mrcnn_class, mrcnn_bbox =\ |
|
fpn_classifier_graph(rois, mrcnn_feature_maps, config.IMAGE_SHAPE, |
|
config.POOL_SIZE, config.NUM_CLASSES) |
|
|
|
mrcnn_mask = build_fpn_mask_graph(rois, mrcnn_feature_maps, |
|
config.IMAGE_SHAPE, |
|
config.MASK_POOL_SIZE, |
|
config.NUM_CLASSES) |
|
|
|
|
|
output_rois = KL.Lambda(lambda x: x * 1, name="output_rois")(rois) |
|
|
|
|
|
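            # Losses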
rpn_class_loss = KL.Lambda(lambda x: rpn_class_loss_graph(*x), name="rpn_class_loss")( |
|
[input_rpn_match, rpn_class_logits]) |
|
rpn_bbox_loss = KL.Lambda(lambda x: rpn_bbox_loss_graph(config, *x), name="rpn_bbox_loss")( |
|
[input_rpn_bbox, input_rpn_match, rpn_bbox]) |
|
class_loss = KL.Lambda(lambda x: mrcnn_class_loss_graph(*x), name="mrcnn_class_loss")( |
|
[target_class_ids, mrcnn_class_logits, active_class_ids]) |
|
bbox_loss = KL.Lambda(lambda x: mrcnn_bbox_loss_graph(*x), name="mrcnn_bbox_loss")( |
|
[target_bbox, target_class_ids, mrcnn_bbox]) |
|
mask_loss = KL.Lambda(lambda x: mrcnn_mask_loss_graph(*x), name="mrcnn_mask_loss")( |
|
[target_mask, target_class_ids, mrcnn_mask]) |
|
|
|
|
|
inputs = [input_image, input_image_meta, |
|
input_rpn_match, input_rpn_bbox, input_gt_class_ids, input_gt_boxes, input_gt_masks] |
|
if not config.USE_RPN_ROIS: |
|
inputs.append(input_rois) |
|
outputs = [rpn_class_logits, rpn_class, rpn_bbox, |
|
mrcnn_class_logits, mrcnn_class, mrcnn_bbox, mrcnn_mask, |
|
rpn_rois, output_rois, |
|
rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss, mask_loss] |
|
model = KM.Model(inputs, outputs, name='mask_rcnn') |
|
else: |
|
|
|
|
|
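            # Network heads
            # Proposal classifier and bounding box regressor heads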
mrcnn_class_logits, mrcnn_class, mrcnn_bbox =\ |
|
fpn_classifier_graph(rpn_rois, mrcnn_feature_maps, config.IMAGE_SHAPE, |
|
config.POOL_SIZE, config.NUM_CLASSES) |
|
|
|
|
|
|
|
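            # Detections
            # Output is [batch, num_detections, (y1, x1, y2, x2, class_id, score)]
            # in image coordinates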
detections = DetectionLayer(config, name="mrcnn_detection")( |
|
[rpn_rois, mrcnn_class, mrcnn_bbox, input_image_meta]) |
|
|
|
|
|
|
|
|
|
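            # Convert boxes to normalized coordinates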
h, w = config.IMAGE_SHAPE[:2] |
|
detection_boxes = KL.Lambda( |
|
lambda x: x[..., :4] / np.array([h, w, h, w]))(detections) |
|
|
|
|
|
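            # Create masks for the detections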
mrcnn_mask = build_fpn_mask_graph(detection_boxes, mrcnn_feature_maps, |
|
config.IMAGE_SHAPE, |
|
config.MASK_POOL_SIZE, |
|
config.NUM_CLASSES) |
|
|
|
model = KM.Model([input_image, input_image_meta], |
|
[detections, mrcnn_class, mrcnn_bbox, |
|
mrcnn_mask, rpn_rois, rpn_class, rpn_bbox], |
|
name='mask_rcnn') |
|
|
|
|
|
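        # Add multi-GPU support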
if config.GPU_COUNT > 1: |
|
from parallel_model import ParallelModel |
|
model = ParallelModel(model, config.GPU_COUNT) |
|
|
|
return model |
|
|
|
def find_last(self): |
|
"""Finds the last checkpoint file of the last trained model in the |
|
model directory. |
|
Returns: |
|
log_dir: The directory where events and weights are saved |
|
checkpoint_path: the path to the last checkpoint file |
|
""" |
|
|
|
dir_names = next(os.walk(self.model_dir))[1] |
|
key = self.config.NAME.lower() |
|
dir_names = filter(lambda f: f.startswith(key), dir_names) |
|
dir_names = sorted(dir_names) |
|
if not dir_names: |
|
return None, None |
|
|
|
dir_name = os.path.join(self.model_dir, dir_names[-1]) |
|
|
|
checkpoints = next(os.walk(dir_name))[2] |
|
checkpoints = filter(lambda f: f.startswith("mask_rcnn"), checkpoints) |
|
checkpoints = sorted(checkpoints) |
|
if not checkpoints: |
|
return dir_name, None |
|
checkpoint = os.path.join(dir_name, checkpoints[-1]) |
|
return dir_name, checkpoint |
|
|
|
def load_weights(self, filepath, by_name=False, exclude=None): |
|
"""Modified version of the correspoding Keras function with |
|
the addition of multi-GPU support and the ability to exclude |
|
some layers from loading. |
|
exlude: list of layer names to excluce |
|
""" |
|
import h5py |
|
from keras.engine import topology |
|
|
|
if exclude: |
|
by_name = True |
|
|
|
if h5py is None: |
|
raise ImportError('`load_weights` requires h5py.') |
|
f = h5py.File(filepath, mode='r') |
|
if 'layer_names' not in f.attrs and 'model_weights' in f: |
|
f = f['model_weights'] |
|
|
|
|
|
|
|
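        # In multi-GPU training, we wrap the model. Get layers
        # of the inner model because they have the weights.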
keras_model = self.keras_model |
|
layers = keras_model.inner_model.layers if hasattr(keras_model, "inner_model")\ |
|
else keras_model.layers |
|
|
|
|
|
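        # Exclude some layers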
if exclude: |
|
layers = filter(lambda l: l.name not in exclude, layers) |
|
|
|
if by_name: |
|
topology.load_weights_from_hdf5_group_by_name(f, layers) |
|
else: |
|
topology.load_weights_from_hdf5_group(f, layers) |
|
if hasattr(f, 'close'): |
|
f.close() |
|
|
|
|
|
self.set_log_dir(filepath) |
|
|
|
def get_imagenet_weights(self): |
|
"""Downloads ImageNet trained weights from Keras. |
|
Returns path to weights file. |
|
""" |
|
from keras.utils.data_utils import get_file |
|
TF_WEIGHTS_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/'\ |
|
'releases/download/v0.2/'\ |
|
'resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5' |
|
weights_path = get_file('resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5', |
|
TF_WEIGHTS_PATH_NO_TOP, |
|
cache_subdir='models', |
|
md5_hash='a268eb855778b3df3c7506639542a6af') |
|
return weights_path |
|
|
|
def compile(self, learning_rate, momentum): |
|
"""Gets the model ready for training. Adds losses, regularization, and |
|
metrics. Then calls the Keras compile() function. |
|
""" |
|
|
|
optimizer = keras.optimizers.SGD(lr=learning_rate, momentum=momentum, |
|
clipnorm=5.0) |
|
|
|
|
|
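        # Add losses
        # First, clear previously set losses to avoid duplication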
self.keras_model._losses = [] |
|
self.keras_model._per_input_losses = {} |
|
loss_names = ["rpn_class_loss", "rpn_bbox_loss", |
|
"mrcnn_class_loss", "mrcnn_bbox_loss", "mrcnn_mask_loss"] |
|
for name in loss_names: |
|
layer = self.keras_model.get_layer(name) |
|
if layer.output in self.keras_model.losses: |
|
continue |
|
self.keras_model.add_loss( |
|
tf.reduce_mean(layer.output, keep_dims=True)) |
|
|
|
|
|
|
|
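        # Add L2 regularization
        # Skip gamma and beta weights of batch normalization layers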
reg_losses = [keras.regularizers.l2(self.config.WEIGHT_DECAY)(w) / tf.cast(tf.size(w), tf.float32) |
|
for w in self.keras_model.trainable_weights |
|
if 'gamma' not in w.name and 'beta' not in w.name] |
|
self.keras_model.add_loss(tf.add_n(reg_losses)) |
|
|
|
|
|
self.keras_model.compile(optimizer=optimizer, loss=[ |
|
None] * len(self.keras_model.outputs)) |
|
|
|
|
|
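        # Add metrics for losses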
for name in loss_names: |
|
if name in self.keras_model.metrics_names: |
|
continue |
|
layer = self.keras_model.get_layer(name) |
|
self.keras_model.metrics_names.append(name) |
|
self.keras_model.metrics_tensors.append(tf.reduce_mean( |
|
layer.output, keep_dims=True)) |
|
|
|
def set_trainable(self, layer_regex, keras_model=None, indent=0, verbose=1): |
|
"""Sets model layers as trainable if their names match |
|
the given regular expression. |
|
""" |
|
|
|
if verbose > 0 and keras_model is None: |
|
log("Selecting layers to train") |
|
|
|
keras_model = keras_model or self.keras_model |
|
|
|
|
|
|
|
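        # In multi-GPU training, we wrap the model. Get layers
        # of the inner model because they have the weights.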
layers = keras_model.inner_model.layers if hasattr(keras_model, "inner_model")\ |
|
else keras_model.layers |
|
|
|
for layer in layers: |
|
|
|
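            # Is the layer a model? If so, recurse into it.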
if layer.__class__.__name__ == 'Model': |
|
print("In model: ", layer.name) |
|
self.set_trainable( |
|
layer_regex, keras_model=layer, indent=indent + 4) |
|
continue |
|
|
|
if not layer.weights: |
|
continue |
|
|
|
trainable = bool(re.fullmatch(layer_regex, layer.name)) |
|
|
|
if layer.__class__.__name__ == 'TimeDistributed': |
|
layer.layer.trainable = trainable |
|
else: |
|
layer.trainable = trainable |
|
|
|
if trainable and verbose > 0: |
|
log("{}{:20} ({})".format(" " * indent, layer.name, |
|
layer.__class__.__name__)) |
|
|
|
def set_log_dir(self, model_path=None): |
|
"""Sets the model log directory and epoch counter. |
|
|
|
model_path: If None, or a format different from what this code uses |
|
then set a new log directory and start epochs from 0. Otherwise, |
|
extract the log directory and the epoch counter from the file |
|
name. |
|
""" |
|
|
|
self.epoch = 0 |
|
now = datetime.datetime.now() |
|
|
|
|
|
if model_path: |
|
|
|
|
|
|
|
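            # Continue from where we left off. Get the epoch and date from the
            # file name. A matching model path looks like, e.g.:
            # /path/to/logs/coco20171029T2315/mask_rcnn_coco_0001.h5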
regex = r".*/\w+(\d{4})(\d{2})(\d{2})T(\d{2})(\d{2})/mask\_rcnn\_\w+(\d{4})\.h5" |
|
m = re.match(regex, model_path) |
|
if m: |
|
now = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), |
|
int(m.group(4)), int(m.group(5))) |
|
self.epoch = int(m.group(6)) + 1 |
|
|
|
|
|
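        # Directory for training logs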
self.log_dir = os.path.join(self.model_dir, "{}{:%Y%m%dT%H%M}".format( |
|
self.config.NAME.lower(), now)) |
|
|
|
|
|
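        # Path to save after each epoch. Includes placeholders that get filled by Keras.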
self.checkpoint_path = os.path.join(self.log_dir, "mask_rcnn_{}_*epoch*.h5".format( |
|
self.config.NAME.lower())) |
|
self.checkpoint_path = self.checkpoint_path.replace( |
|
"*epoch*", "{epoch:04d}") |
|
|
|
def train(self, train_dataset, val_dataset, learning_rate, epochs, layers): |
|
"""Train the model. |
|
train_dataset, val_dataset: Training and validation Dataset objects. |
|
learning_rate: The learning rate to train with |
|
epochs: Number of training epochs. Note that previous training epochs |
|
              are considered to be done already, so this actually determines
              the epochs to train in total rather than in this particular
              call.
        layers: Allows selecting which layers to train. It can be:
            - A regular expression to match layer names to train
            - One of these predefined values:
              heads: The RPN, classifier and mask heads of the network
|
all: All the layers |
|
3+: Train Resnet stage 3 and up |
|
4+: Train Resnet stage 4 and up |
|
5+: Train Resnet stage 5 and up |
|
""" |
|
assert self.mode == "training", "Create model in training mode." |
|
|
|
|
|
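        # Pre-defined layer regular expressions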
layer_regex = { |
|
|
|
"heads": r"(mrcnn\_.*)|(rpn\_.*)|(fpn\_.*)", |
|
|
|
"3+": r"(res3.*)|(bn3.*)|(res4.*)|(bn4.*)|(res5.*)|(bn5.*)|(mrcnn\_.*)|(rpn\_.*)|(fpn\_.*)", |
|
"4+": r"(res4.*)|(bn4.*)|(res5.*)|(bn5.*)|(mrcnn\_.*)|(rpn\_.*)|(fpn\_.*)", |
|
"5+": r"(res5.*)|(bn5.*)|(mrcnn\_.*)|(rpn\_.*)|(fpn\_.*)", |
|
|
|
"all": ".*", |
|
} |
|
if layers in layer_regex.keys(): |
|
layers = layer_regex[layers] |
|
|
|
|
|
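        # Data generators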
train_generator = data_generator(train_dataset, self.config, shuffle=True, |
|
batch_size=self.config.BATCH_SIZE) |
|
val_generator = data_generator(val_dataset, self.config, shuffle=True, |
|
batch_size=self.config.BATCH_SIZE, |
|
augment=False) |
|
|
|
|
|
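        # Callbacks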
callbacks = [ |
|
keras.callbacks.TensorBoard(log_dir=self.log_dir, |
|
histogram_freq=0, write_graph=True, write_images=False), |
|
keras.callbacks.ModelCheckpoint(self.checkpoint_path, |
|
verbose=0, save_weights_only=True), |
|
] |
|
|
|
|
|
log("\nStarting at epoch {}. LR={}\n".format(self.epoch, learning_rate)) |
|
log("Checkpoint Path: {}".format(self.checkpoint_path)) |
|
self.set_trainable(layers) |
|
self.compile(learning_rate, self.config.LEARNING_MOMENTUM) |
|
|
|
|
|
|
|
|
|
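        # Multiprocessing generator workers are unreliable on Windows, so run
        # the generator in the main thread there (workers=0)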
        if os.name == 'nt':
|
workers = 0 |
|
else: |
|
workers = max(self.config.BATCH_SIZE // 2, 2) |
|
|
|
self.keras_model.fit_generator( |
|
train_generator, |
|
initial_epoch=self.epoch, |
|
epochs=epochs, |
|
steps_per_epoch=self.config.STEPS_PER_EPOCH, |
|
callbacks=callbacks, |
|
            validation_data=val_generator,
|
validation_steps=self.config.VALIDATION_STEPS, |
|
max_queue_size=100, |
|
workers=workers, |
|
use_multiprocessing=True, |
|
) |
|
self.epoch = max(self.epoch, epochs) |
|
|
|
def mold_inputs(self, images): |
|
"""Takes a list of images and modifies them to the format expected |
|
as an input to the neural network. |
|
        images: List of image matrices [height, width, depth]. Images can have
|
different sizes. |
|
|
|
        Returns 3 Numpy matrices:
|
molded_images: [N, h, w, 3]. Images resized and normalized. |
|
image_metas: [N, length of meta data]. Details about each image. |
|
windows: [N, (y1, x1, y2, x2)]. The portion of the image that has the |
|
original image (padding excluded). |
|
""" |
|
molded_images = [] |
|
image_metas = [] |
|
windows = [] |
|
for image in images: |
|
|
|
|
|
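            # Resize the image to fit the model's expected size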
molded_image, window, scale, padding = utils.resize_image( |
|
image, |
|
min_dim=self.config.IMAGE_MIN_DIM, |
|
max_dim=self.config.IMAGE_MAX_DIM, |
|
padding=self.config.IMAGE_PADDING) |
|
molded_image = mold_image(molded_image, self.config) |
|
|
|
image_meta = compose_image_meta( |
|
0, image.shape, window, |
|
np.zeros([self.config.NUM_CLASSES], dtype=np.int32)) |
|
|
|
molded_images.append(molded_image) |
|
windows.append(window) |
|
image_metas.append(image_meta) |
|
|
|
molded_images = np.stack(molded_images) |
|
image_metas = np.stack(image_metas) |
|
windows = np.stack(windows) |
|
return molded_images, image_metas, windows |
|
|
|
def unmold_detections(self, detections, mrcnn_mask, image_shape, window): |
|
"""Reformats the detections of one image from the format of the neural |
|
network output to a format suitable for use in the rest of the |
|
application. |
|
|
|
detections: [N, (y1, x1, y2, x2, class_id, score)] |
|
mrcnn_mask: [N, height, width, num_classes] |
|
image_shape: [height, width, depth] Original size of the image before resizing |
|
window: [y1, x1, y2, x2] Box in the image where the real image is |
|
excluding the padding. |
|
|
|
Returns: |
|
boxes: [N, (y1, x1, y2, x2)] Bounding boxes in pixels |
|
class_ids: [N] Integer class IDs for each bounding box |
|
scores: [N] Float probability scores of the class_id |
|
masks: [height, width, num_instances] Instance masks |
|
""" |
|
|
|
|
|
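        # How many detections do we have?
        # The detections array is zero padded. Find the first class_id == 0.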
zero_ix = np.where(detections[:, 4] == 0)[0] |
|
N = zero_ix[0] if zero_ix.shape[0] > 0 else detections.shape[0] |
|
|
|
|
|
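        # Extract boxes, class_ids, scores, and class-specific masks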
boxes = detections[:N, :4] |
|
class_ids = detections[:N, 4].astype(np.int32) |
|
scores = detections[:N, 5] |
|
masks = mrcnn_mask[np.arange(N), :, :, class_ids] |
|
|
|
|
|
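        # Compute the scale and shift to translate coordinates to the image domain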
h_scale = image_shape[0] / (window[2] - window[0]) |
|
w_scale = image_shape[1] / (window[3] - window[1]) |
|
scale = min(h_scale, w_scale) |
|
shift = window[:2] |
|
scales = np.array([scale, scale, scale, scale]) |
|
shifts = np.array([shift[0], shift[1], shift[0], shift[1]]) |
|
|
|
|
|
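        # Translate bounding boxes to the image domain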
boxes = np.multiply(boxes - shifts, scales).astype(np.int32) |
|
|
|
|
|
|
|
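        # Filter out detections with zero area. These can happen early in
        # training when the network weights are still mostly random.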
exclude_ix = np.where( |
|
(boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) <= 0)[0] |
|
if exclude_ix.shape[0] > 0: |
|
boxes = np.delete(boxes, exclude_ix, axis=0) |
|
class_ids = np.delete(class_ids, exclude_ix, axis=0) |
|
scores = np.delete(scores, exclude_ix, axis=0) |
|
masks = np.delete(masks, exclude_ix, axis=0) |
|
N = class_ids.shape[0] |
|
|
|
|
|
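        # Resize masks to the original image size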
full_masks = [] |
|
for i in range(N): |
|
|
|
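            # Convert the neural network mask to a full-size mask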
full_mask = utils.unmold_mask(masks[i], boxes[i], image_shape) |
|
full_masks.append(full_mask) |
|
        full_masks = np.stack(full_masks, axis=-1)\
            if full_masks else np.empty((image_shape[0], image_shape[1], 0))
|
|
|
return boxes, class_ids, scores, full_masks |
|
|
|
def detect(self, images, verbose=0): |
|
"""Runs the detection pipeline. |
|
|
|
images: List of images, potentially of different sizes. |
|
|
|
Returns a list of dicts, one dict per image. The dict contains: |
|
rois: [N, (y1, x1, y2, x2)] detection bounding boxes |
|
class_ids: [N] int class IDs |
|
scores: [N] float probability scores for the class IDs |
|
masks: [H, W, N] instance binary masks |
|
""" |
|
assert self.mode == "inference", "Create model in inference mode." |
|
assert len( |
|
images) == self.config.BATCH_SIZE, "len(images) must be equal to BATCH_SIZE" |
|
|
|
if verbose: |
|
log("Processing {} images".format(len(images))) |
|
for image in images: |
|
log("image", image) |
|
|
|
molded_images, image_metas, windows = self.mold_inputs(images) |
|
if verbose: |
|
log("molded_images", molded_images) |
|
log("image_metas", image_metas) |
|
|
|
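        # Run object detection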
detections, mrcnn_class, mrcnn_bbox, mrcnn_mask, \ |
|
rois, rpn_class, rpn_bbox =\ |
|
self.keras_model.predict([molded_images, image_metas], verbose=0) |
|
|
|
results = [] |
|
for i, image in enumerate(images): |
|
final_rois, final_class_ids, final_scores, final_masks =\ |
|
self.unmold_detections(detections[i], mrcnn_mask[i], |
|
image.shape, windows[i]) |
|
results.append({ |
|
"rois": final_rois, |
|
"class_ids": final_class_ids, |
|
"scores": final_scores, |
|
"masks": final_masks, |
|
}) |
|
return results |
|
|
|
def ancestor(self, tensor, name, checked=None): |
|
"""Finds the ancestor of a TF tensor in the computation graph. |
|
tensor: TensorFlow symbolic tensor. |
|
name: Name of ancestor tensor to find |
|
checked: For internal use. A list of tensors that were already |
|
searched to avoid loops in traversing the graph. |
|
""" |
|
checked = checked if checked is not None else [] |
|
|
|
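        # Put a limit on how deep we go to avoid very long loops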
if len(checked) > 500: |
|
return None |
|
|
|
|
|
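        # Convert the name to a regex and allow matching a number prefix
        # because Keras adds them automatically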
if isinstance(name, str): |
|
name = re.compile(name.replace("/", r"(\_\d+)*/")) |
|
|
|
parents = tensor.op.inputs |
|
for p in parents: |
|
if p in checked: |
|
continue |
|
if bool(re.fullmatch(name, p.name)): |
|
return p |
|
checked.append(p) |
|
a = self.ancestor(p, name, checked) |
|
if a is not None: |
|
return a |
|
return None |
|
|
|
def find_trainable_layer(self, layer): |
|
"""If a layer is encapsulated by another layer, this function |
|
digs through the encapsulation and returns the layer that holds |
|
the weights. |
|
""" |
|
if layer.__class__.__name__ == 'TimeDistributed': |
|
return self.find_trainable_layer(layer.layer) |
|
return layer |
|
|
|
def get_trainable_layers(self): |
|
"""Returns a list of layers that have weights.""" |
|
layers = [] |
|
|
|
for l in self.keras_model.layers: |
|
|
|
l = self.find_trainable_layer(l) |
|
|
|
if l.get_weights(): |
|
layers.append(l) |
|
return layers |
|
|
|
def run_graph(self, images, outputs): |
|
"""Runs a sub-set of the computation graph that computes the given |
|
outputs. |
|
|
|
outputs: List of tuples (name, tensor) to compute. The tensors are |
|
symbolic TensorFlow tensors and the names are for easy tracking. |
|
|
|
Returns an ordered dict of results. Keys are the names received in the |
|
input and values are Numpy arrays. |
|
""" |
|
model = self.keras_model |
|
|
|
|
|
outputs = OrderedDict(outputs) |
|
for o in outputs.values(): |
|
assert o is not None |
|
|
|
|
|
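        # Build a Keras function to run parts of the computation graph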
        inputs = model.inputs
        if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
            inputs = inputs + [K.learning_phase()]
        kf = K.function(inputs, list(outputs.values()))
|
|
|
|
|
molded_images, image_metas, windows = self.mold_inputs(images) |
|
|
|
|
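        # Run inference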
model_in = [molded_images, image_metas] |
|
if model.uses_learning_phase and not isinstance(K.learning_phase(), int): |
|
model_in.append(0.) |
|
outputs_np = kf(model_in) |
|
|
|
|
|
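        # Pack the generated Numpy arrays into an ordered dict and log the results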
outputs_np = OrderedDict([(k, v) |
|
for k, v in zip(outputs.keys(), outputs_np)]) |
|
for k, v in outputs_np.items(): |
|
log(k, v) |
|
return outputs_np |
|
|
|
|
|
|
|
|
|
|
|
|
|
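# Data formatting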
def compose_image_meta(image_id, image_shape, window, active_class_ids): |
|
"""Takes attributes of an image and puts them in one 1D array. |
|
|
|
image_id: An int ID of the image. Useful for debugging. |
|
image_shape: [height, width, channels] |
|
window: (y1, x1, y2, x2) in pixels. The area of the image where the real |
|
image is (excluding the padding) |
|
active_class_ids: List of class_ids available in the dataset from which |
|
the image came. Useful if training on images from multiple datasets |
|
where not all classes are present in all datasets. |
|
""" |
|
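    # Resulting 1D layout: [image_id (1), image_shape (3), window (4),
    #                       active_class_ids (NUM_CLASSES)]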
meta = np.array( |
|
[image_id] + |
|
list(image_shape) + |
|
list(window) + |
|
list(active_class_ids) |
|
) |
|
return meta |
|
|
|
|
|
def parse_image_meta_graph(meta): |
|
"""Parses a tensor that contains image attributes to its components. |
|
See compose_image_meta() for more details. |
|
|
|
meta: [batch, meta length] where meta length depends on NUM_CLASSES |
|
""" |
|
image_id = meta[:, 0] |
|
image_shape = meta[:, 1:4] |
|
window = meta[:, 4:8] |
|
active_class_ids = meta[:, 8:] |
|
return [image_id, image_shape, window, active_class_ids] |
|
|
|
|
|
def mold_image(images, config): |
|
"""Takes RGB images with 0-255 values and subtraces |
|
the mean pixel and converts it to float. Expects image |
|
colors in RGB order. |
|
""" |
|
return images.astype(np.float32) - config.MEAN_PIXEL |
|
|
|
|
|
def unmold_image(normalized_images, config): |
|
"""Takes a image normalized with mold() and returns the original.""" |
|
return (normalized_images + config.MEAN_PIXEL).astype(np.uint8) |
|
|
|
|
|
|
|
|
|
|
|
|
|
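# Miscellaneous graph functions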
def trim_zeros_graph(boxes, name=None): |
|
"""Often boxes are represented with matricies of shape [N, 4] and |
|
are padded with zeros. This removes zero boxes. |
|
|
|
    boxes: [N, 4] matrix of boxes.

    Returns the trimmed boxes and non_zeros, a [N] 1D boolean mask
    identifying the rows to keep.
|
""" |
|
non_zeros = tf.cast(tf.reduce_sum(tf.abs(boxes), axis=1), tf.bool) |
|
boxes = tf.boolean_mask(boxes, non_zeros, name=name) |
|
return boxes, non_zeros |
|
|
|
|
|
def batch_pack_graph(x, counts, num_rows): |
|
"""Picks different number of values from each row |
|
in x depending on the values in counts. |
|
""" |
|
outputs = [] |
|
for i in range(num_rows): |
|
outputs.append(x[i, :counts[i]]) |
|
return tf.concat(outputs, axis=0) |
|
|