|
""" |
|
Mask R-CNN |
|
The main Mask R-CNN model implementation.
|
|
|
Copyright (c) 2017 Matterport, Inc. |
|
Licensed under the MIT License (see LICENSE for details) |
|
Written by Waleed Abdulla |
|
""" |
|
|
|
import os |
|
import sys |
|
import glob |
|
import random |
|
import math |
|
import datetime |
|
import itertools |
|
import json |
|
import re |
|
import logging |
|
from collections import OrderedDict |
|
import numpy as np |
|
import scipy.misc |
|
import tensorflow as tf |
|
import keras |
|
import keras.backend as K |
|
import keras.layers as KL |
|
import keras.initializers as KI |
|
import keras.engine as KE |
|
import keras.models as KM |
|
|
|
import utils |
|
|
|
|
|
from distutils.version import LooseVersion |
|
assert LooseVersion(tf.__version__) >= LooseVersion("1.3") |
|
assert LooseVersion(keras.__version__) >= LooseVersion('2.0.8') |
|
|
|
|
|
|
|
|
|
|
|
|
|
def log(text, array=None): |
|
"""Prints a text message. And, optionally, if a Numpy array is provided it |
|
    prints its shape, min, and max values.
|
""" |
|
    if array is not None:

        text = text.ljust(25)

        if array.size:

            text += ("shape: {:20} min: {:10.5f} max: {:10.5f}".format(

                str(array.shape), array.min(), array.max()))

        else:

            text += ("shape: {:20} min: {:10} max: {:10}".format(

                str(array.shape), "", ""))

    print(text)
|
|
|
|
|
class BatchNorm(KL.BatchNormalization): |
|
"""Batch Normalization class. Subclasses the Keras BN class and |
|
hardcodes training=False so the BN layer doesn't update |
|
during training. |
|
|
|
    Batch normalization has a negative effect on training if batches are small,
|
so we disable it here. |
|
""" |
|
|
|
def call(self, inputs, training=None): |
|
return super(self.__class__, self).call(inputs, training=False) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def identity_block(input_tensor, kernel_size, filters, stage, block, |
|
use_bias=True): |
|
"""The identity_block is the block that has no conv layer at shortcut |
|
# Arguments |
|
input_tensor: input tensor |
|
        kernel_size: default 3, the kernel size of middle conv layer at main path

        filters: list of integers, the nb_filters of 3 conv layers at main path
|
stage: integer, current stage label, used for generating layer names |
|
block: 'a','b'..., current block label, used for generating layer names |
|
""" |
|
nb_filter1, nb_filter2, nb_filter3 = filters |
|
conv_name_base = 'res' + str(stage) + block + '_branch' |
|
bn_name_base = 'bn' + str(stage) + block + '_branch' |
|
|
|
x = KL.Conv2D(nb_filter1, (1, 1), name=conv_name_base + '2a', |
|
use_bias=use_bias)(input_tensor) |
|
x = BatchNorm(axis=3, name=bn_name_base + '2a')(x) |
|
x = KL.Activation('relu')(x) |
|
|
|
x = KL.Conv2D(nb_filter2, (kernel_size, kernel_size), padding='same', |
|
name=conv_name_base + '2b', use_bias=use_bias)(x) |
|
x = BatchNorm(axis=3, name=bn_name_base + '2b')(x) |
|
x = KL.Activation('relu')(x) |
|
|
|
x = KL.Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c', |
|
use_bias=use_bias)(x) |
|
x = BatchNorm(axis=3, name=bn_name_base + '2c')(x) |
|
|
|
x = KL.Add()([x, input_tensor]) |
|
x = KL.Activation('relu', name='res' + str(stage) + block + '_out')(x) |
|
return x |
|
|
|
|
|
def conv_block(input_tensor, kernel_size, filters, stage, block, |
|
strides=(2, 2), use_bias=True): |
|
"""conv_block is the block that has a conv layer at shortcut |
|
# Arguments |
|
input_tensor: input tensor |
|
        kernel_size: default 3, the kernel size of middle conv layer at main path

        filters: list of integers, the nb_filters of 3 conv layers at main path
|
stage: integer, current stage label, used for generating layer names |
|
block: 'a','b'..., current block label, used for generating layer names |
|
    Note that from stage 3, the first conv layer at the main path uses strides=(2, 2),

    and the shortcut uses strides=(2, 2) as well.
|
""" |
|
nb_filter1, nb_filter2, nb_filter3 = filters |
|
conv_name_base = 'res' + str(stage) + block + '_branch' |
|
bn_name_base = 'bn' + str(stage) + block + '_branch' |
|
|
|
x = KL.Conv2D(nb_filter1, (1, 1), strides=strides, |
|
name=conv_name_base + '2a', use_bias=use_bias)(input_tensor) |
|
x = BatchNorm(axis=3, name=bn_name_base + '2a')(x) |
|
x = KL.Activation('relu')(x) |
|
|
|
x = KL.Conv2D(nb_filter2, (kernel_size, kernel_size), padding='same', |
|
name=conv_name_base + '2b', use_bias=use_bias)(x) |
|
x = BatchNorm(axis=3, name=bn_name_base + '2b')(x) |
|
x = KL.Activation('relu')(x) |
|
|
|
x = KL.Conv2D(nb_filter3, (1, 1), name=conv_name_base + |
|
'2c', use_bias=use_bias)(x) |
|
x = BatchNorm(axis=3, name=bn_name_base + '2c')(x) |
|
|
|
shortcut = KL.Conv2D(nb_filter3, (1, 1), strides=strides, |
|
name=conv_name_base + '1', use_bias=use_bias)(input_tensor) |
|
shortcut = BatchNorm(axis=3, name=bn_name_base + '1')(shortcut) |
|
|
|
x = KL.Add()([x, shortcut]) |
|
x = KL.Activation('relu', name='res' + str(stage) + block + '_out')(x) |
|
return x |
|
|
|
|
|
def resnet_graph(input_image, architecture, stage5=False): |
|
assert architecture in ["resnet50", "resnet101"] |
|
|
|
x = KL.ZeroPadding2D((3, 3))(input_image) |
|
x = KL.Conv2D(64, (7, 7), strides=(2, 2), name='conv1', use_bias=True)(x) |
|
x = BatchNorm(axis=3, name='bn_conv1')(x) |
|
x = KL.Activation('relu')(x) |
|
C1 = x = KL.MaxPooling2D((3, 3), strides=(2, 2), padding="same")(x) |
|
|
|
x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1)) |
|
x = identity_block(x, 3, [64, 64, 256], stage=2, block='b') |
|
C2 = x = identity_block(x, 3, [64, 64, 256], stage=2, block='c') |
|
|
|
x = conv_block(x, 3, [128, 128, 512], stage=3, block='a') |
|
x = identity_block(x, 3, [128, 128, 512], stage=3, block='b') |
|
x = identity_block(x, 3, [128, 128, 512], stage=3, block='c') |
|
C3 = x = identity_block(x, 3, [128, 128, 512], stage=3, block='d') |
|
|
|
x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a') |
|
block_count = {"resnet50": 5, "resnet101": 22}[architecture] |
|
for i in range(block_count): |
|
x = identity_block(x, 3, [256, 256, 1024], stage=4, block=chr(98 + i)) |
|
C4 = x |
|
|
|
if stage5: |
|
x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a') |
|
x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b') |
|
C5 = x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c') |
|
else: |
|
C5 = None |
|
return [C1, C2, C3, C4, C5] |
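# Illustrative usage sketch (not part of the original file; the input size is an
# assumption for demonstration):
#   input_image = KL.Input(shape=[1024, 1024, 3])
#   C1, C2, C3, C4, C5 = resnet_graph(input_image, "resnet101", stage5=True)
#   # C2..C5 have strides 4, 8, 16, and 32 relative to the input and feed the FPN.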
|
|
|
|
|
|
|
|
|
|
|
|
|
def apply_box_deltas_graph(boxes, deltas): |
|
"""Applies the given deltas to the given boxes. |
|
boxes: [N, 4] where each row is y1, x1, y2, x2 |
|
deltas: [N, 4] where each row is [dy, dx, log(dh), log(dw)] |
|
""" |
|
|
|
height = boxes[:, 2] - boxes[:, 0] |
|
width = boxes[:, 3] - boxes[:, 1] |
|
center_y = boxes[:, 0] + 0.5 * height |
|
center_x = boxes[:, 1] + 0.5 * width |
|
|
|
center_y += deltas[:, 0] * height |
|
center_x += deltas[:, 1] * width |
|
height *= tf.exp(deltas[:, 2]) |
|
width *= tf.exp(deltas[:, 3]) |
|
|
|
y1 = center_y - 0.5 * height |
|
x1 = center_x - 0.5 * width |
|
y2 = y1 + height |
|
x2 = x1 + width |
|
result = tf.stack([y1, x1, y2, x2], axis=1, name="apply_box_deltas_out") |
|
return result |
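# Illustrative sketch with assumed values (not part of the original file): a delta
# of [0, 0, log(2), 0] keeps the box center fixed and doubles the box height.
#   boxes = tf.constant([[10., 10., 30., 30.]])       # one 20x20 box
#   deltas = tf.constant([[0., 0., np.log(2.), 0.]])
#   refined = apply_box_deltas_graph(boxes, deltas)    # -> [[0., 10., 40., 30.]]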
|
|
|
|
|
def clip_boxes_graph(boxes, window): |
|
""" |
|
boxes: [N, 4] each row is y1, x1, y2, x2 |
|
window: [4] in the form y1, x1, y2, x2 |
|
""" |
|
|
|
wy1, wx1, wy2, wx2 = tf.split(window, 4) |
|
y1, x1, y2, x2 = tf.split(boxes, 4, axis=1) |
|
|
|
y1 = tf.maximum(tf.minimum(y1, wy2), wy1) |
|
x1 = tf.maximum(tf.minimum(x1, wx2), wx1) |
|
y2 = tf.maximum(tf.minimum(y2, wy2), wy1) |
|
x2 = tf.maximum(tf.minimum(x2, wx2), wx1) |
|
clipped = tf.concat([y1, x1, y2, x2], axis=1, name="clipped_boxes") |
|
clipped.set_shape((clipped.shape[0], 4)) |
|
return clipped |
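# Illustrative sketch with assumed values (not part of the original file):
#   window = tf.constant([0., 0., 100., 100.])
#   boxes = tf.constant([[-10., 50., 120., 160.]])
#   clip_boxes_graph(boxes, window)                    # -> [[0., 50., 100., 100.]]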
|
|
|
|
|
class ProposalLayer(KE.Layer): |
|
"""Receives anchor scores and selects a subset to pass as proposals |
|
to the second stage. Filtering is done based on anchor scores and |
|
non-max suppression to remove overlaps. It also applies bounding |
|
box refinement deltas to anchors. |
|
|
|
Inputs: |
|
rpn_probs: [batch, anchors, (bg prob, fg prob)] |
|
rpn_bbox: [batch, anchors, (dy, dx, log(dh), log(dw))] |
|
|
|
Returns: |
|
Proposals in normalized coordinates [batch, rois, (y1, x1, y2, x2)] |
|
""" |
|
|
|
def __init__(self, proposal_count, nms_threshold, anchors, |
|
config=None, **kwargs): |
|
""" |
|
anchors: [N, (y1, x1, y2, x2)] anchors defined in image coordinates |
|
""" |
|
super(ProposalLayer, self).__init__(**kwargs) |
|
self.config = config |
|
self.proposal_count = proposal_count |
|
self.nms_threshold = nms_threshold |
|
self.anchors = anchors.astype(np.float32) |
|
|
|
def call(self, inputs): |
|
|
|
scores = inputs[0][:, :, 1] |
|
|
|
deltas = inputs[1] |
|
deltas = deltas * np.reshape(self.config.RPN_BBOX_STD_DEV, [1, 1, 4]) |
|
|
|
anchors = self.anchors |
|
|
|
|
|
|
|
pre_nms_limit = min(6000, self.anchors.shape[0]) |
|
ix = tf.nn.top_k(scores, pre_nms_limit, sorted=True, |
|
name="top_anchors").indices |
|
scores = utils.batch_slice([scores, ix], lambda x, y: tf.gather(x, y), |
|
self.config.IMAGES_PER_GPU) |
|
deltas = utils.batch_slice([deltas, ix], lambda x, y: tf.gather(x, y), |
|
self.config.IMAGES_PER_GPU) |
|
anchors = utils.batch_slice(ix, lambda x: tf.gather(anchors, x), |
|
self.config.IMAGES_PER_GPU, |
|
names=["pre_nms_anchors"]) |
|
|
|
|
|
|
|
boxes = utils.batch_slice([anchors, deltas], |
|
lambda x, y: apply_box_deltas_graph(x, y), |
|
self.config.IMAGES_PER_GPU, |
|
names=["refined_anchors"]) |
|
|
|
|
|
height, width = self.config.IMAGE_SHAPE[:2] |
|
window = np.array([0, 0, height, width]).astype(np.float32) |
|
boxes = utils.batch_slice(boxes, |
|
lambda x: clip_boxes_graph(x, window), |
|
self.config.IMAGES_PER_GPU, |
|
names=["refined_anchors_clipped"]) |
|
|
|
|
|
|
|
|
|
|
|
|
|
normalized_boxes = boxes / np.array([[height, width, height, width]]) |
|
|
|
|
|
def nms(normalized_boxes, scores): |
|
indices = tf.image.non_max_suppression( |
|
normalized_boxes, scores, self.proposal_count, |
|
self.nms_threshold, name="rpn_non_max_suppression") |
|
proposals = tf.gather(normalized_boxes, indices) |
|
|
|
padding = tf.maximum(self.proposal_count - tf.shape(proposals)[0], 0) |
|
proposals = tf.pad(proposals, [(0, padding), (0, 0)]) |
|
return proposals |
|
proposals = utils.batch_slice([normalized_boxes, scores], nms, |
|
self.config.IMAGES_PER_GPU) |
|
return proposals |
|
|
|
def compute_output_shape(self, input_shape): |
|
return (None, self.proposal_count, 4) |
|
|
|
|
|
|
|
|
|
|
|
|
|
def log2_graph(x): |
|
"""Implementatin of Log2. TF doesn't have a native implemenation.""" |
|
return tf.log(x) / tf.log(2.0) |
|
|
|
|
|
class PyramidROIAlign(KE.Layer): |
|
"""Implements ROI Pooling on multiple levels of the feature pyramid. |
|
|
|
Params: |
|
- pool_shape: [height, width] of the output pooled regions. Usually [7, 7] |
|
- image_shape: [height, width, channels]. Shape of input image in pixels |
|
|
|
Inputs: |
|
- boxes: [batch, num_boxes, (y1, x1, y2, x2)] in normalized |
|
coordinates. Possibly padded with zeros if not enough |
|
boxes to fill the array. |
|
- Feature maps: List of feature maps from different levels of the pyramid. |
|
Each is [batch, height, width, channels] |
|
|
|
Output: |
|
Pooled regions in the shape: [batch, num_boxes, height, width, channels]. |
|
    The width and height are those specified in pool_shape in the layer
|
constructor. |
|
""" |
|
|
|
def __init__(self, pool_shape, image_shape, **kwargs): |
|
super(PyramidROIAlign, self).__init__(**kwargs) |
|
self.pool_shape = tuple(pool_shape) |
|
self.image_shape = tuple(image_shape) |
|
|
|
def call(self, inputs): |
|
|
|
boxes = inputs[0] |
|
|
|
|
|
|
|
feature_maps = inputs[1:] |
|
|
|
|
|
y1, x1, y2, x2 = tf.split(boxes, 4, axis=2) |
|
h = y2 - y1 |
|
w = x2 - x1 |
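        # Assign each ROI to a pyramid level based on its area, following
        # equation 1 of the FPN paper: k = k0 + log2(sqrt(w*h) / 224) with
        # k0 = 4, clipped to [2, 5]. Since h and w are in normalized
        # coordinates, sqrt(image_area) rescales them to pixels.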
|
|
|
|
|
|
|
image_area = tf.cast( |
|
self.image_shape[0] * self.image_shape[1], tf.float32) |
|
roi_level = log2_graph(tf.sqrt(h * w) / (224.0 / tf.sqrt(image_area))) |
|
roi_level = tf.minimum(5, tf.maximum( |
|
2, 4 + tf.cast(tf.round(roi_level), tf.int32))) |
|
roi_level = tf.squeeze(roi_level, 2) |
|
|
|
|
|
pooled = [] |
|
box_to_level = [] |
|
for i, level in enumerate(range(2, 6)): |
|
ix = tf.where(tf.equal(roi_level, level)) |
|
level_boxes = tf.gather_nd(boxes, ix) |
|
|
|
|
|
box_indices = tf.cast(ix[:, 0], tf.int32) |
|
|
|
|
|
box_to_level.append(ix) |
|
|
|
|
|
level_boxes = tf.stop_gradient(level_boxes) |
|
box_indices = tf.stop_gradient(box_indices) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pooled.append(tf.image.crop_and_resize( |
|
feature_maps[i], level_boxes, box_indices, self.pool_shape, |
|
method="bilinear")) |
|
|
|
|
|
pooled = tf.concat(pooled, axis=0) |
|
|
|
|
|
|
|
box_to_level = tf.concat(box_to_level, axis=0) |
|
box_range = tf.expand_dims(tf.range(tf.shape(box_to_level)[0]), 1) |
|
box_to_level = tf.concat([tf.cast(box_to_level, tf.int32), box_range], |
|
axis=1) |
|
|
|
|
|
|
|
|
|
sorting_tensor = box_to_level[:, 0] * 100000 + box_to_level[:, 1] |
|
ix = tf.nn.top_k(sorting_tensor, k=tf.shape( |
|
box_to_level)[0]).indices[::-1] |
|
ix = tf.gather(box_to_level[:, 2], ix) |
|
pooled = tf.gather(pooled, ix) |
|
|
|
|
|
pooled = tf.expand_dims(pooled, 0) |
|
return pooled |
|
|
|
def compute_output_shape(self, input_shape): |
|
return input_shape[0][:2] + self.pool_shape + (input_shape[1][-1], ) |
|
|
|
|
|
|
|
|
|
|
|
|
|
def overlaps_graph(boxes1, boxes2): |
|
"""Computes IoU overlaps between two sets of boxes. |
|
boxes1, boxes2: [N, (y1, x1, y2, x2)]. |
|
""" |
|
|
|
|
|
|
|
|
|
b1 = tf.reshape(tf.tile(tf.expand_dims(boxes1, 1), |
|
[1, 1, tf.shape(boxes2)[0]]), [-1, 4]) |
|
b2 = tf.tile(boxes2, [tf.shape(boxes1)[0], 1]) |
|
|
|
b1_y1, b1_x1, b1_y2, b1_x2 = tf.split(b1, 4, axis=1) |
|
b2_y1, b2_x1, b2_y2, b2_x2 = tf.split(b2, 4, axis=1) |
|
y1 = tf.maximum(b1_y1, b2_y1) |
|
x1 = tf.maximum(b1_x1, b2_x1) |
|
y2 = tf.minimum(b1_y2, b2_y2) |
|
x2 = tf.minimum(b1_x2, b2_x2) |
|
intersection = tf.maximum(x2 - x1, 0) * tf.maximum(y2 - y1, 0) |
|
|
|
b1_area = (b1_y2 - b1_y1) * (b1_x2 - b1_x1) |
|
b2_area = (b2_y2 - b2_y1) * (b2_x2 - b2_x1) |
|
union = b1_area + b2_area - intersection |
|
|
|
iou = intersection / union |
|
overlaps = tf.reshape(iou, [tf.shape(boxes1)[0], tf.shape(boxes2)[0]]) |
|
return overlaps |
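# Illustrative sketch with assumed values (not part of the original file): two
# unit boxes that overlap over half their area have IoU = 0.5 / 1.5 = 1/3.
#   b1 = tf.constant([[0., 0., 1., 1.]])
#   b2 = tf.constant([[0., 0.5, 1., 1.5]])
#   overlaps_graph(b1, b2)                             # -> [[0.3333]]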
|
|
|
|
|
def detection_targets_graph(proposals, gt_class_ids, gt_boxes, gt_masks, config): |
|
"""Generates detection targets for one image. Subsamples proposals and |
|
generates target class IDs, bounding box deltas, and masks for each. |
|
|
|
Inputs: |
|
proposals: [N, (y1, x1, y2, x2)] in normalized coordinates. Might |
|
be zero padded if there are not enough proposals. |
|
gt_class_ids: [MAX_GT_INSTANCES] int class IDs |
|
gt_boxes: [MAX_GT_INSTANCES, (y1, x1, y2, x2)] in normalized coordinates. |
|
gt_masks: [height, width, MAX_GT_INSTANCES] of boolean type. |
|
|
|
Returns: Target ROIs and corresponding class IDs, bounding box shifts, |
|
and masks. |
|
rois: [TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in normalized coordinates |
|
class_ids: [TRAIN_ROIS_PER_IMAGE]. Integer class IDs. Zero padded. |
|
    deltas: [TRAIN_ROIS_PER_IMAGE, (dy, dx, log(dh), log(dw))]

            Bbox refinements for positive ROIs. Zero padded.

    masks: [TRAIN_ROIS_PER_IMAGE, height, width]. Masks cropped to bbox
|
boundaries and resized to neural network output size. |
|
|
|
Note: Returned arrays might be zero padded if not enough target ROIs. |
|
""" |
|
|
|
asserts = [ |
|
tf.Assert(tf.greater(tf.shape(proposals)[0], 0), [proposals], |
|
name="roi_assertion"), |
|
] |
|
with tf.control_dependencies(asserts): |
|
proposals = tf.identity(proposals) |
|
|
|
|
|
proposals, _ = trim_zeros_graph(proposals, name="trim_proposals") |
|
gt_boxes, non_zeros = trim_zeros_graph(gt_boxes, name="trim_gt_boxes") |
|
gt_class_ids = tf.boolean_mask(gt_class_ids, non_zeros, |
|
name="trim_gt_class_ids") |
|
gt_masks = tf.gather(gt_masks, tf.where(non_zeros)[:, 0], axis=2, |
|
name="trim_gt_masks") |
|
|
|
|
|
|
|
|
|
crowd_ix = tf.where(gt_class_ids < 0)[:, 0] |
|
non_crowd_ix = tf.where(gt_class_ids > 0)[:, 0] |
|
crowd_boxes = tf.gather(gt_boxes, crowd_ix) |
|
crowd_masks = tf.gather(gt_masks, crowd_ix, axis=2) |
|
gt_class_ids = tf.gather(gt_class_ids, non_crowd_ix) |
|
gt_boxes = tf.gather(gt_boxes, non_crowd_ix) |
|
gt_masks = tf.gather(gt_masks, non_crowd_ix, axis=2) |
|
|
|
|
|
overlaps = overlaps_graph(proposals, gt_boxes) |
|
|
|
|
|
crowd_overlaps = overlaps_graph(proposals, crowd_boxes) |
|
crowd_iou_max = tf.reduce_max(crowd_overlaps, axis=1) |
|
no_crowd_bool = (crowd_iou_max < 0.001) |
|
|
|
|
|
roi_iou_max = tf.reduce_max(overlaps, axis=1) |
|
|
|
positive_roi_bool = (roi_iou_max >= 0.5) |
|
positive_indices = tf.where(positive_roi_bool)[:, 0] |
|
|
|
negative_indices = tf.where(tf.logical_and(roi_iou_max < 0.5, no_crowd_bool))[:, 0] |
|
|
|
|
|
|
|
positive_count = int(config.TRAIN_ROIS_PER_IMAGE * |
|
config.ROI_POSITIVE_RATIO) |
|
positive_indices = tf.random_shuffle(positive_indices)[:positive_count] |
|
positive_count = tf.shape(positive_indices)[0] |
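    # Negative ROIs. Add enough to keep the requested positive:negative ratio,
    # i.e. roughly (1 / ROI_POSITIVE_RATIO - 1) negatives per positive ROI.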
|
|
|
r = 1.0 / config.ROI_POSITIVE_RATIO |
|
negative_count = tf.cast(r * tf.cast(positive_count, tf.float32), tf.int32) - positive_count |
|
negative_indices = tf.random_shuffle(negative_indices)[:negative_count] |
|
|
|
positive_rois = tf.gather(proposals, positive_indices) |
|
negative_rois = tf.gather(proposals, negative_indices) |
|
|
|
|
|
positive_overlaps = tf.gather(overlaps, positive_indices) |
|
roi_gt_box_assignment = tf.argmax(positive_overlaps, axis=1) |
|
roi_gt_boxes = tf.gather(gt_boxes, roi_gt_box_assignment) |
|
roi_gt_class_ids = tf.gather(gt_class_ids, roi_gt_box_assignment) |
|
|
|
|
|
deltas = utils.box_refinement_graph(positive_rois, roi_gt_boxes) |
|
deltas /= config.BBOX_STD_DEV |
|
|
|
|
|
|
|
transposed_masks = tf.expand_dims(tf.transpose(gt_masks, [2, 0, 1]), -1) |
|
|
|
roi_masks = tf.gather(transposed_masks, roi_gt_box_assignment) |
|
|
|
|
|
boxes = positive_rois |
|
if config.USE_MINI_MASK: |
|
|
|
|
|
y1, x1, y2, x2 = tf.split(positive_rois, 4, axis=1) |
|
gt_y1, gt_x1, gt_y2, gt_x2 = tf.split(roi_gt_boxes, 4, axis=1) |
|
gt_h = gt_y2 - gt_y1 |
|
gt_w = gt_x2 - gt_x1 |
|
y1 = (y1 - gt_y1) / gt_h |
|
x1 = (x1 - gt_x1) / gt_w |
|
y2 = (y2 - gt_y1) / gt_h |
|
x2 = (x2 - gt_x1) / gt_w |
|
boxes = tf.concat([y1, x1, y2, x2], 1) |
|
box_ids = tf.range(0, tf.shape(roi_masks)[0]) |
|
masks = tf.image.crop_and_resize(tf.cast(roi_masks, tf.float32), boxes, |
|
box_ids, |
|
config.MASK_SHAPE) |
|
|
|
masks = tf.squeeze(masks, axis=3) |
|
|
|
|
|
|
|
masks = tf.round(masks) |
|
|
|
|
|
|
|
rois = tf.concat([positive_rois, negative_rois], axis=0) |
|
N = tf.shape(negative_rois)[0] |
|
P = tf.maximum(config.TRAIN_ROIS_PER_IMAGE - tf.shape(rois)[0], 0) |
|
rois = tf.pad(rois, [(0, P), (0, 0)]) |
|
roi_gt_boxes = tf.pad(roi_gt_boxes, [(0, N + P), (0, 0)]) |
|
roi_gt_class_ids = tf.pad(roi_gt_class_ids, [(0, N + P)]) |
|
deltas = tf.pad(deltas, [(0, N + P), (0, 0)]) |
|
masks = tf.pad(masks, [[0, N + P], (0, 0), (0, 0)]) |
|
|
|
return rois, roi_gt_class_ids, deltas, masks |
|
|
|
|
|
class DetectionTargetLayer(KE.Layer): |
|
"""Subsamples proposals and generates target box refinement, class_ids, |
|
and masks for each. |
|
|
|
Inputs: |
|
proposals: [batch, N, (y1, x1, y2, x2)] in normalized coordinates. Might |
|
be zero padded if there are not enough proposals. |
|
gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs. |
|
gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in normalized |
|
coordinates. |
|
gt_masks: [batch, height, width, MAX_GT_INSTANCES] of boolean type |
|
|
|
Returns: Target ROIs and corresponding class IDs, bounding box shifts, |
|
and masks. |
|
rois: [batch, TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in normalized |
|
coordinates |
|
target_class_ids: [batch, TRAIN_ROIS_PER_IMAGE]. Integer class IDs. |
|
    target_deltas: [batch, TRAIN_ROIS_PER_IMAGE, (dy, dx, log(dh), log(dw))]

                   Bbox refinements for positive ROIs. Zero padded.
|
    target_mask: [batch, TRAIN_ROIS_PER_IMAGE, height, width]
|
Masks cropped to bbox boundaries and resized to neural |
|
network output size. |
|
|
|
Note: Returned arrays might be zero padded if not enough target ROIs. |
|
""" |
|
|
|
def __init__(self, config, **kwargs): |
|
super(DetectionTargetLayer, self).__init__(**kwargs) |
|
self.config = config |
|
|
|
def call(self, inputs): |
|
proposals = inputs[0] |
|
gt_class_ids = inputs[1] |
|
gt_boxes = inputs[2] |
|
gt_masks = inputs[3] |
|
|
|
|
|
|
|
names = ["rois", "target_class_ids", "target_bbox", "target_mask"] |
|
outputs = utils.batch_slice( |
|
[proposals, gt_class_ids, gt_boxes, gt_masks], |
|
lambda w, x, y, z: detection_targets_graph( |
|
w, x, y, z, self.config), |
|
self.config.IMAGES_PER_GPU, names=names) |
|
return outputs |
|
|
|
def compute_output_shape(self, input_shape): |
|
return [ |
|
(None, self.config.TRAIN_ROIS_PER_IMAGE, 4), |
|
(None, 1), |
|
(None, self.config.TRAIN_ROIS_PER_IMAGE, 4), |
|
(None, self.config.TRAIN_ROIS_PER_IMAGE, self.config.MASK_SHAPE[0], |
|
self.config.MASK_SHAPE[1]) |
|
] |
|
|
|
def compute_mask(self, inputs, mask=None): |
|
return [None, None, None, None] |
|
|
|
|
|
|
|
|
|
|
|
|
|
def clip_to_window(window, boxes): |
|
""" |
|
window: (y1, x1, y2, x2). The window in the image we want to clip to. |
|
boxes: [N, (y1, x1, y2, x2)] |
|
""" |
|
boxes[:, 0] = np.maximum(np.minimum(boxes[:, 0], window[2]), window[0]) |
|
boxes[:, 1] = np.maximum(np.minimum(boxes[:, 1], window[3]), window[1]) |
|
boxes[:, 2] = np.maximum(np.minimum(boxes[:, 2], window[2]), window[0]) |
|
boxes[:, 3] = np.maximum(np.minimum(boxes[:, 3], window[3]), window[1]) |
|
return boxes |
|
|
|
|
|
def refine_detections_graph(rois, probs, deltas, window, config): |
|
"""Refine classified proposals and filter overlaps and return final |
|
detections. |
|
|
|
Inputs: |
|
rois: [N, (y1, x1, y2, x2)] in normalized coordinates |
|
probs: [N, num_classes]. Class probabilities. |
|
deltas: [N, num_classes, (dy, dx, log(dh), log(dw))]. Class-specific |
|
bounding box deltas. |
|
window: (y1, x1, y2, x2) in image coordinates. The part of the image |
|
that contains the image excluding the padding. |
|
|
|
Returns detections shaped: [N, (y1, x1, y2, x2, class_id, score)] where |
|
coordinates are in image domain. |
|
""" |
|
|
|
class_ids = tf.argmax(probs, axis=1, output_type=tf.int32) |
|
|
|
indices = tf.stack([tf.range(probs.shape[0]), class_ids], axis=1) |
|
class_scores = tf.gather_nd(probs, indices) |
|
|
|
deltas_specific = tf.gather_nd(deltas, indices) |
|
|
|
|
|
refined_rois = apply_box_deltas_graph( |
|
rois, deltas_specific * config.BBOX_STD_DEV) |
|
|
|
|
|
height, width = config.IMAGE_SHAPE[:2] |
|
refined_rois *= tf.constant([height, width, height, width], dtype=tf.float32) |
|
|
|
refined_rois = clip_boxes_graph(refined_rois, window) |
|
|
|
refined_rois = tf.to_int32(tf.rint(refined_rois)) |
|
|
|
|
|
|
|
|
|
keep = tf.where(class_ids > 0)[:, 0] |
|
|
|
if config.DETECTION_MIN_CONFIDENCE: |
|
conf_keep = tf.where(class_scores >= config.DETECTION_MIN_CONFIDENCE)[:, 0] |
|
keep = tf.sets.set_intersection(tf.expand_dims(keep, 0), |
|
tf.expand_dims(conf_keep, 0)) |
|
keep = tf.sparse_tensor_to_dense(keep)[0] |
|
|
|
|
|
|
|
pre_nms_class_ids = tf.gather(class_ids, keep) |
|
pre_nms_scores = tf.gather(class_scores, keep) |
|
pre_nms_rois = tf.gather(refined_rois, keep) |
|
unique_pre_nms_class_ids = tf.unique(pre_nms_class_ids)[0] |
|
|
|
def nms_keep_map(class_id): |
|
"""Apply Non-Maximum Suppression on ROIs of the given class.""" |
|
|
|
ixs = tf.where(tf.equal(pre_nms_class_ids, class_id))[:, 0] |
|
|
|
class_keep = tf.image.non_max_suppression( |
|
tf.to_float(tf.gather(pre_nms_rois, ixs)), |
|
tf.gather(pre_nms_scores, ixs), |
|
max_output_size=config.DETECTION_MAX_INSTANCES, |
|
iou_threshold=config.DETECTION_NMS_THRESHOLD) |
|
|
|
class_keep = tf.gather(keep, tf.gather(ixs, class_keep)) |
|
|
|
gap = config.DETECTION_MAX_INSTANCES - tf.shape(class_keep)[0] |
|
class_keep = tf.pad(class_keep, [(0, gap)], |
|
mode='CONSTANT', constant_values=-1) |
|
|
|
class_keep.set_shape([config.DETECTION_MAX_INSTANCES]) |
|
return class_keep |
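    # Apply per-class NMS: map nms_keep_map over each unique class ID, then
    # merge the kept indices and intersect them with the current keep list.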
|
|
|
|
|
nms_keep = tf.map_fn(nms_keep_map, unique_pre_nms_class_ids, |
|
dtype=tf.int64) |
|
|
|
nms_keep = tf.reshape(nms_keep, [-1]) |
|
nms_keep = tf.gather(nms_keep, tf.where(nms_keep > -1)[:, 0]) |
|
|
|
keep = tf.sets.set_intersection(tf.expand_dims(keep, 0), |
|
tf.expand_dims(nms_keep, 0)) |
|
keep = tf.sparse_tensor_to_dense(keep)[0] |
|
|
|
roi_count = config.DETECTION_MAX_INSTANCES |
|
class_scores_keep = tf.gather(class_scores, keep) |
|
num_keep = tf.minimum(tf.shape(class_scores_keep)[0], roi_count) |
|
top_ids = tf.nn.top_k(class_scores_keep, k=num_keep, sorted=True)[1] |
|
keep = tf.gather(keep, top_ids) |
|
|
|
|
|
|
|
detections = tf.concat([ |
|
tf.to_float(tf.gather(refined_rois, keep)), |
|
tf.to_float(tf.gather(class_ids, keep))[..., tf.newaxis], |
|
tf.gather(class_scores, keep)[..., tf.newaxis] |
|
], axis=1) |
|
|
|
|
|
gap = config.DETECTION_MAX_INSTANCES - tf.shape(detections)[0] |
|
detections = tf.pad(detections, [(0, gap), (0, 0)], "CONSTANT") |
|
return detections |
|
|
|
|
|
class DetectionLayer(KE.Layer): |
|
"""Takes classified proposal boxes and their bounding box deltas and |
|
returns the final detection boxes. |
|
|
|
Returns: |
|
[batch, num_detections, (y1, x1, y2, x2, class_id, class_score)] where |
|
coordinates are in image domain |
|
""" |
|
|
|
def __init__(self, config=None, **kwargs): |
|
super(DetectionLayer, self).__init__(**kwargs) |
|
self.config = config |
|
|
|
def call(self, inputs): |
|
rois = inputs[0] |
|
mrcnn_class = inputs[1] |
|
mrcnn_bbox = inputs[2] |
|
image_meta = inputs[3] |
|
|
|
|
|
_, _, window, _ = parse_image_meta_graph(image_meta) |
|
detections_batch = utils.batch_slice( |
|
[rois, mrcnn_class, mrcnn_bbox, window], |
|
lambda x, y, w, z: refine_detections_graph(x, y, w, z, self.config), |
|
self.config.IMAGES_PER_GPU) |
|
|
|
|
|
|
|
return tf.reshape( |
|
detections_batch, |
|
[self.config.BATCH_SIZE, self.config.DETECTION_MAX_INSTANCES, 6]) |
|
|
|
def compute_output_shape(self, input_shape): |
|
return (None, self.config.DETECTION_MAX_INSTANCES, 6) |
|
|
|
|
|
|
|
|
|
def rpn_graph(feature_map, anchors_per_location, anchor_stride): |
|
"""Builds the computation graph of Region Proposal Network. |
|
|
|
feature_map: backbone features [batch, height, width, depth] |
|
anchors_per_location: number of anchors per pixel in the feature map |
|
anchor_stride: Controls the density of anchors. Typically 1 (anchors for |
|
every pixel in the feature map), or 2 (every other pixel). |
|
|
|
Returns: |
|
rpn_logits: [batch, H, W, 2] Anchor classifier logits (before softmax) |
|
rpn_probs: [batch, H, W, 2] Anchor classifier probabilities. |
|
rpn_bbox: [batch, H, W, (dy, dx, log(dh), log(dw))] Deltas to be |
|
applied to anchors. |
|
""" |
|
|
|
|
|
|
|
shared = KL.Conv2D(512, (3, 3), padding='same', activation='relu', |
|
strides=anchor_stride, |
|
name='rpn_conv_shared')(feature_map) |
|
|
|
|
|
x = KL.Conv2D(2 * anchors_per_location, (1, 1), padding='valid', |
|
activation='linear', name='rpn_class_raw')(shared) |
|
|
|
|
|
rpn_class_logits = KL.Lambda( |
|
lambda t: tf.reshape(t, [tf.shape(t)[0], -1, 2]))(x) |
|
|
|
|
|
rpn_probs = KL.Activation( |
|
"softmax", name="rpn_class_xxx")(rpn_class_logits) |
|
|
|
|
|
|
|
x = KL.Conv2D(anchors_per_location * 4, (1, 1), padding="valid", |
|
activation='linear', name='rpn_bbox_pred')(shared) |
|
|
|
|
|
rpn_bbox = KL.Lambda(lambda t: tf.reshape(t, [tf.shape(t)[0], -1, 4]))(x) |
|
|
|
return [rpn_class_logits, rpn_probs, rpn_bbox] |
|
|
|
|
|
def build_rpn_model(anchor_stride, anchors_per_location, depth): |
|
"""Builds a Keras model of the Region Proposal Network. |
|
It wraps the RPN graph so it can be used multiple times with shared |
|
weights. |
|
|
|
anchors_per_location: number of anchors per pixel in the feature map |
|
anchor_stride: Controls the density of anchors. Typically 1 (anchors for |
|
every pixel in the feature map), or 2 (every other pixel). |
|
depth: Depth of the backbone feature map. |
|
|
|
Returns a Keras Model object. The model outputs, when called, are: |
|
rpn_logits: [batch, H, W, 2] Anchor classifier logits (before softmax) |
|
    rpn_probs: [batch, H, W, 2] Anchor classifier probabilities.
|
rpn_bbox: [batch, H, W, (dy, dx, log(dh), log(dw))] Deltas to be |
|
applied to anchors. |
|
""" |
|
input_feature_map = KL.Input(shape=[None, None, depth], |
|
name="input_rpn_feature_map") |
|
outputs = rpn_graph(input_feature_map, anchors_per_location, anchor_stride) |
|
return KM.Model([input_feature_map], outputs, name="rpn_model") |
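# Illustrative usage sketch (not part of the original file; the feature map is
# hypothetical): the same RPN model is applied to every FPN level, so its
# weights are shared across levels.
#   rpn = build_rpn_model(anchor_stride=1, anchors_per_location=3, depth=256)
#   rpn_class_logits, rpn_probs, rpn_bbox = rpn([some_feature_map])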
|
|
|
|
|
|
|
|
|
|
|
|
|
def fpn_classifier_graph(rois, feature_maps, |
|
image_shape, pool_size, num_classes): |
|
"""Builds the computation graph of the feature pyramid network classifier |
|
and regressor heads. |
|
|
|
rois: [batch, num_rois, (y1, x1, y2, x2)] Proposal boxes in normalized |
|
coordinates. |
|
    feature_maps: List of feature maps from different layers of the pyramid,
|
[P2, P3, P4, P5]. Each has a different resolution. |
|
image_shape: [height, width, depth] |
|
pool_size: The width of the square feature map generated from ROI Pooling. |
|
num_classes: number of classes, which determines the depth of the results |
|
|
|
Returns: |
|
logits: [N, NUM_CLASSES] classifier logits (before softmax) |
|
probs: [N, NUM_CLASSES] classifier probabilities |
|
        bbox_deltas: [N, NUM_CLASSES, (dy, dx, log(dh), log(dw))] Deltas to apply to
|
proposal boxes |
|
""" |
|
|
|
|
|
x = PyramidROIAlign([pool_size, pool_size], image_shape, |
|
name="roi_align_classifier")([rois] + feature_maps) |
|
|
|
x = KL.TimeDistributed(KL.Conv2D(1024, (pool_size, pool_size), padding="valid"), |
|
name="mrcnn_class_conv1")(x) |
|
x = KL.TimeDistributed(BatchNorm(axis=3), name='mrcnn_class_bn1')(x) |
|
x = KL.Activation('relu')(x) |
|
x = KL.TimeDistributed(KL.Conv2D(1024, (1, 1)), |
|
name="mrcnn_class_conv2")(x) |
|
x = KL.TimeDistributed(BatchNorm(axis=3), |
|
name='mrcnn_class_bn2')(x) |
|
x = KL.Activation('relu')(x) |
|
|
|
shared = KL.Lambda(lambda x: K.squeeze(K.squeeze(x, 3), 2), |
|
name="pool_squeeze")(x) |
|
|
|
|
|
mrcnn_class_logits = KL.TimeDistributed(KL.Dense(num_classes), |
|
name='mrcnn_class_logits')(shared) |
|
mrcnn_probs = KL.TimeDistributed(KL.Activation("softmax"), |
|
name="mrcnn_class")(mrcnn_class_logits) |
|
|
|
|
|
|
|
x = KL.TimeDistributed(KL.Dense(num_classes * 4, activation='linear'), |
|
name='mrcnn_bbox_fc')(shared) |
|
|
|
s = K.int_shape(x) |
|
mrcnn_bbox = KL.Reshape((s[1], num_classes, 4), name="mrcnn_bbox")(x) |
|
|
|
return mrcnn_class_logits, mrcnn_probs, mrcnn_bbox |
|
|
|
|
|
def build_fpn_mask_graph(rois, feature_maps, |
|
image_shape, pool_size, num_classes): |
|
"""Builds the computation graph of the mask head of Feature Pyramid Network. |
|
|
|
rois: [batch, num_rois, (y1, x1, y2, x2)] Proposal boxes in normalized |
|
coordinates. |
|
    feature_maps: List of feature maps from different layers of the pyramid,
|
[P2, P3, P4, P5]. Each has a different resolution. |
|
image_shape: [height, width, depth] |
|
pool_size: The width of the square feature map generated from ROI Pooling. |
|
num_classes: number of classes, which determines the depth of the results |
|
|
|
Returns: Masks [batch, roi_count, height, width, num_classes] |
|
""" |
|
|
|
|
|
x = PyramidROIAlign([pool_size, pool_size], image_shape, |
|
name="roi_align_mask")([rois] + feature_maps) |
|
|
|
|
|
x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"), |
|
name="mrcnn_mask_conv1")(x) |
|
x = KL.TimeDistributed(BatchNorm(axis=3), |
|
name='mrcnn_mask_bn1')(x) |
|
x = KL.Activation('relu')(x) |
|
|
|
x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"), |
|
name="mrcnn_mask_conv2")(x) |
|
x = KL.TimeDistributed(BatchNorm(axis=3), |
|
name='mrcnn_mask_bn2')(x) |
|
x = KL.Activation('relu')(x) |
|
|
|
x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"), |
|
name="mrcnn_mask_conv3")(x) |
|
x = KL.TimeDistributed(BatchNorm(axis=3), |
|
name='mrcnn_mask_bn3')(x) |
|
x = KL.Activation('relu')(x) |
|
|
|
x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"), |
|
name="mrcnn_mask_conv4")(x) |
|
x = KL.TimeDistributed(BatchNorm(axis=3), |
|
name='mrcnn_mask_bn4')(x) |
|
x = KL.Activation('relu')(x) |
|
|
|
x = KL.TimeDistributed(KL.Conv2DTranspose(256, (2, 2), strides=2, activation="relu"), |
|
name="mrcnn_mask_deconv")(x) |
|
x = KL.TimeDistributed(KL.Conv2D(num_classes, (1, 1), strides=1, activation="sigmoid"), |
|
name="mrcnn_mask")(x) |
|
return x |
|
|
|
|
|
|
|
|
|
|
|
|
|
def smooth_l1_loss(y_true, y_pred): |
|
"""Implements Smooth-L1 loss. |
|
    y_true and y_pred are typically: [N, 4], but could be any shape.
|
""" |
|
diff = K.abs(y_true - y_pred) |
|
less_than_one = K.cast(K.less(diff, 1.0), "float32") |
|
loss = (less_than_one * 0.5 * diff**2) + (1 - less_than_one) * (diff - 0.5) |
|
return loss |
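# Illustrative note (not part of the original file): with d = |y_true - y_pred|,
# the loss above is 0.5 * d**2 when d < 1 and d - 0.5 otherwise, applied
# element-wise.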
|
|
|
|
|
def rpn_class_loss_graph(rpn_match, rpn_class_logits): |
|
"""RPN anchor classifier loss. |
|
|
|
rpn_match: [batch, anchors, 1]. Anchor match type. 1=positive, |
|
-1=negative, 0=neutral anchor. |
|
rpn_class_logits: [batch, anchors, 2]. RPN classifier logits for FG/BG. |
|
""" |
|
|
|
rpn_match = tf.squeeze(rpn_match, -1) |
|
|
|
anchor_class = K.cast(K.equal(rpn_match, 1), tf.int32) |
|
|
|
|
|
indices = tf.where(K.not_equal(rpn_match, 0)) |
|
|
|
rpn_class_logits = tf.gather_nd(rpn_class_logits, indices) |
|
anchor_class = tf.gather_nd(anchor_class, indices) |
|
|
|
loss = K.sparse_categorical_crossentropy(target=anchor_class, |
|
output=rpn_class_logits, |
|
from_logits=True) |
|
loss = K.switch(tf.size(loss) > 0, K.mean(loss), tf.constant(0.0)) |
|
return loss |
|
|
|
|
|
def rpn_bbox_loss_graph(config, target_bbox, rpn_match, rpn_bbox): |
|
"""Return the RPN bounding box loss graph. |
|
|
|
config: the model config object. |
|
target_bbox: [batch, max positive anchors, (dy, dx, log(dh), log(dw))]. |
|
        Uses 0 padding to fill in unused bbox deltas.
|
rpn_match: [batch, anchors, 1]. Anchor match type. 1=positive, |
|
-1=negative, 0=neutral anchor. |
|
rpn_bbox: [batch, anchors, (dy, dx, log(dh), log(dw))] |
|
""" |
|
|
|
|
|
rpn_match = K.squeeze(rpn_match, -1) |
|
indices = tf.where(K.equal(rpn_match, 1)) |
|
|
|
|
|
rpn_bbox = tf.gather_nd(rpn_bbox, indices) |
|
|
|
|
|
batch_counts = K.sum(K.cast(K.equal(rpn_match, 1), tf.int32), axis=1) |
|
target_bbox = batch_pack_graph(target_bbox, batch_counts, |
|
config.IMAGES_PER_GPU) |
|
|
|
|
|
|
|
diff = K.abs(target_bbox - rpn_bbox) |
|
less_than_one = K.cast(K.less(diff, 1.0), "float32") |
|
loss = (less_than_one * 0.5 * diff**2) + (1 - less_than_one) * (diff - 0.5) |
|
|
|
loss = K.switch(tf.size(loss) > 0, K.mean(loss), tf.constant(0.0)) |
|
return loss |
|
|
|
|
|
def mrcnn_class_loss_graph(target_class_ids, pred_class_logits, |
|
active_class_ids): |
|
"""Loss for the classifier head of Mask RCNN. |
|
|
|
target_class_ids: [batch, num_rois]. Integer class IDs. Uses zero |
|
padding to fill in the array. |
|
pred_class_logits: [batch, num_rois, num_classes] |
|
active_class_ids: [batch, num_classes]. Has a value of 1 for |
|
classes that are in the dataset of the image, and 0 |
|
for classes that are not in the dataset. |
|
""" |
|
target_class_ids = tf.cast(target_class_ids, 'int64') |
|
|
|
|
|
pred_class_ids = tf.argmax(pred_class_logits, axis=2) |
|
|
|
|
|
pred_active = tf.gather(active_class_ids[0], pred_class_ids) |
|
|
|
|
|
loss = tf.nn.sparse_softmax_cross_entropy_with_logits( |
|
labels=target_class_ids, logits=pred_class_logits) |
|
|
|
|
|
|
|
loss = loss * pred_active |
|
|
|
|
|
|
|
loss = tf.reduce_sum(loss) / tf.reduce_sum(pred_active) |
|
return loss |
|
|
|
|
|
def mrcnn_bbox_loss_graph(target_bbox, target_class_ids, pred_bbox): |
|
"""Loss for Mask R-CNN bounding box refinement. |
|
|
|
target_bbox: [batch, num_rois, (dy, dx, log(dh), log(dw))] |
|
target_class_ids: [batch, num_rois]. Integer class IDs. |
|
pred_bbox: [batch, num_rois, num_classes, (dy, dx, log(dh), log(dw))] |
|
""" |
|
|
|
target_class_ids = K.reshape(target_class_ids, (-1,)) |
|
target_bbox = K.reshape(target_bbox, (-1, 4)) |
|
pred_bbox = K.reshape(pred_bbox, (-1, K.int_shape(pred_bbox)[2], 4)) |
|
|
|
|
|
|
|
positive_roi_ix = tf.where(target_class_ids > 0)[:, 0] |
|
positive_roi_class_ids = tf.cast( |
|
tf.gather(target_class_ids, positive_roi_ix), tf.int64) |
|
indices = tf.stack([positive_roi_ix, positive_roi_class_ids], axis=1) |
|
|
|
|
|
target_bbox = tf.gather(target_bbox, positive_roi_ix) |
|
pred_bbox = tf.gather_nd(pred_bbox, indices) |
|
|
|
|
|
loss = K.switch(tf.size(target_bbox) > 0, |
|
smooth_l1_loss(y_true=target_bbox, y_pred=pred_bbox), |
|
tf.constant(0.0)) |
|
loss = K.mean(loss) |
|
loss = K.reshape(loss, [1, 1]) |
|
return loss |
|
|
|
|
|
def mrcnn_mask_loss_graph(target_masks, target_class_ids, pred_masks): |
|
"""Mask binary cross-entropy loss for the masks head. |
|
|
|
target_masks: [batch, num_rois, height, width]. |
|
A float32 tensor of values 0 or 1. Uses zero padding to fill array. |
|
target_class_ids: [batch, num_rois]. Integer class IDs. Zero padded. |
|
pred_masks: [batch, proposals, height, width, num_classes] float32 tensor |
|
with values from 0 to 1. |
|
""" |
|
|
|
target_class_ids = K.reshape(target_class_ids, (-1,)) |
|
mask_shape = tf.shape(target_masks) |
|
target_masks = K.reshape(target_masks, (-1, mask_shape[2], mask_shape[3])) |
|
pred_shape = tf.shape(pred_masks) |
|
pred_masks = K.reshape(pred_masks, |
|
(-1, pred_shape[2], pred_shape[3], pred_shape[4])) |
|
|
|
pred_masks = tf.transpose(pred_masks, [0, 3, 1, 2]) |
|
|
|
|
|
|
|
positive_ix = tf.where(target_class_ids > 0)[:, 0] |
|
positive_class_ids = tf.cast( |
|
tf.gather(target_class_ids, positive_ix), tf.int64) |
|
indices = tf.stack([positive_ix, positive_class_ids], axis=1) |
|
|
|
|
|
y_true = tf.gather(target_masks, positive_ix) |
|
y_pred = tf.gather_nd(pred_masks, indices) |
|
|
|
|
|
|
|
loss = K.switch(tf.size(y_true) > 0, |
|
K.binary_crossentropy(target=y_true, output=y_pred), |
|
tf.constant(0.0)) |
|
loss = K.mean(loss) |
|
loss = K.reshape(loss, [1, 1]) |
|
return loss |
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_image_gt(dataset, config, image_id, augment=False, |
|
use_mini_mask=False): |
|
"""Load and return ground truth data for an image (image, mask, bounding boxes). |
|
|
|
augment: If true, apply random image augmentation. Currently, only |
|
horizontal flipping is offered. |
|
use_mini_mask: If False, returns full-size masks that are the same height |
|
and width as the original image. These can be big, for example |
|
1024x1024x100 (for 100 instances). Mini masks are smaller, typically, |
|
224x224 and are generated by extracting the bounding box of the |
|
object and resizing it to MINI_MASK_SHAPE. |
|
|
|
Returns: |
|
image: [height, width, 3] |
|
shape: the original shape of the image before resizing and cropping. |
|
class_ids: [instance_count] Integer class IDs |
|
bbox: [instance_count, (y1, x1, y2, x2)] |
|
mask: [height, width, instance_count]. The height and width are those |
|
of the image unless use_mini_mask is True, in which case they are |
|
defined in MINI_MASK_SHAPE. |
|
""" |
|
|
|
image = dataset.load_image(image_id) |
|
mask, class_ids = dataset.load_mask(image_id) |
|
shape = image.shape |
|
image, window, scale, padding = utils.resize_image( |
|
image, |
|
min_dim=config.IMAGE_MIN_DIM, |
|
max_dim=config.IMAGE_MAX_DIM, |
|
padding=config.IMAGE_PADDING) |
|
mask = utils.resize_mask(mask, scale, padding) |
|
|
|
|
|
if augment: |
|
if random.randint(0, 1): |
|
image = np.fliplr(image) |
|
mask = np.fliplr(mask) |
|
|
|
|
|
|
|
_idx = np.sum(mask, axis=(0, 1)) > 0 |
|
mask = mask[:, :, _idx] |
|
class_ids = class_ids[_idx] |
|
|
|
|
|
|
|
bbox = utils.extract_bboxes(mask) |
|
|
|
|
|
|
|
|
|
active_class_ids = np.zeros([dataset.num_classes], dtype=np.int32) |
|
source_class_ids = dataset.source_class_ids[dataset.image_info[image_id]["source"]] |
|
active_class_ids[source_class_ids] = 1 |
|
|
|
|
|
if use_mini_mask: |
|
mask = utils.minimize_mask(bbox, mask, config.MINI_MASK_SHAPE) |
|
|
|
|
|
image_meta = compose_image_meta(image_id, shape, window, active_class_ids) |
|
|
|
return image, image_meta, class_ids, bbox, mask |
|
|
|
|
|
def build_detection_targets(rpn_rois, gt_class_ids, gt_boxes, gt_masks, config): |
|
"""Generate targets for training Stage 2 classifier and mask heads. |
|
This is not used in normal training. It's useful for debugging or to train |
|
the Mask RCNN heads without using the RPN head. |
|
|
|
Inputs: |
|
rpn_rois: [N, (y1, x1, y2, x2)] proposal boxes. |
|
gt_class_ids: [instance count] Integer class IDs |
|
gt_boxes: [instance count, (y1, x1, y2, x2)] |
|
    gt_masks: [height, width, instance count] Ground truth masks. Can be full
|
size or mini-masks. |
|
|
|
Returns: |
|
rois: [TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] |
|
class_ids: [TRAIN_ROIS_PER_IMAGE]. Integer class IDs. |
|
bboxes: [TRAIN_ROIS_PER_IMAGE, NUM_CLASSES, (y, x, log(h), log(w))]. Class-specific |
|
bbox refinements. |
|
    masks: [TRAIN_ROIS_PER_IMAGE, height, width, NUM_CLASSES]. Class-specific masks cropped
|
to bbox boundaries and resized to neural network output size. |
|
""" |
|
assert rpn_rois.shape[0] > 0 |
|
assert gt_class_ids.dtype == np.int32, "Expected int but got {}".format( |
|
gt_class_ids.dtype) |
|
assert gt_boxes.dtype == np.int32, "Expected int but got {}".format( |
|
gt_boxes.dtype) |
|
assert gt_masks.dtype == np.bool_, "Expected bool but got {}".format( |
|
gt_masks.dtype) |
|
|
|
|
|
|
|
|
|
|
|
instance_ids = np.where(gt_class_ids > 0)[0] |
|
assert instance_ids.shape[0] > 0, "Image must contain instances." |
|
gt_class_ids = gt_class_ids[instance_ids] |
|
gt_boxes = gt_boxes[instance_ids] |
|
gt_masks = gt_masks[:, :, instance_ids] |
|
|
|
|
|
rpn_roi_area = (rpn_rois[:, 2] - rpn_rois[:, 0]) * \ |
|
(rpn_rois[:, 3] - rpn_rois[:, 1]) |
|
gt_box_area = (gt_boxes[:, 2] - gt_boxes[:, 0]) * \ |
|
(gt_boxes[:, 3] - gt_boxes[:, 1]) |
|
|
|
|
|
overlaps = np.zeros((rpn_rois.shape[0], gt_boxes.shape[0])) |
|
for i in range(overlaps.shape[1]): |
|
gt = gt_boxes[i] |
|
overlaps[:, i] = utils.compute_iou( |
|
gt, rpn_rois, gt_box_area[i], rpn_roi_area) |
|
|
|
|
|
rpn_roi_iou_argmax = np.argmax(overlaps, axis=1) |
|
rpn_roi_iou_max = overlaps[np.arange( |
|
overlaps.shape[0]), rpn_roi_iou_argmax] |
|
|
|
rpn_roi_gt_boxes = gt_boxes[rpn_roi_iou_argmax] |
|
rpn_roi_gt_class_ids = gt_class_ids[rpn_roi_iou_argmax] |
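    # Positive ROIs are those with an IoU above 0.5 with some GT box; the rest
    # are treated as background.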
|
|
|
|
|
fg_ids = np.where(rpn_roi_iou_max > 0.5)[0] |
|
|
|
|
|
|
|
|
|
bg_ids = np.where(rpn_roi_iou_max < 0.5)[0] |
|
|
|
|
|
|
|
fg_roi_count = int(config.TRAIN_ROIS_PER_IMAGE * config.ROI_POSITIVE_RATIO) |
|
if fg_ids.shape[0] > fg_roi_count: |
|
keep_fg_ids = np.random.choice(fg_ids, fg_roi_count, replace=False) |
|
else: |
|
keep_fg_ids = fg_ids |
|
|
|
remaining = config.TRAIN_ROIS_PER_IMAGE - keep_fg_ids.shape[0] |
|
if bg_ids.shape[0] > remaining: |
|
keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False) |
|
else: |
|
keep_bg_ids = bg_ids |
|
|
|
keep = np.concatenate([keep_fg_ids, keep_bg_ids]) |
|
|
|
remaining = config.TRAIN_ROIS_PER_IMAGE - keep.shape[0] |
|
if remaining > 0: |
|
|
|
|
|
|
|
|
|
|
|
if keep.shape[0] == 0: |
|
|
|
bg_ids = np.where(rpn_roi_iou_max < 0.5)[0] |
|
assert bg_ids.shape[0] >= remaining |
|
keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False) |
|
assert keep_bg_ids.shape[0] == remaining |
|
keep = np.concatenate([keep, keep_bg_ids]) |
|
else: |
|
|
|
keep_extra_ids = np.random.choice( |
|
keep_bg_ids, remaining, replace=True) |
|
keep = np.concatenate([keep, keep_extra_ids]) |
|
assert keep.shape[0] == config.TRAIN_ROIS_PER_IMAGE, \ |
|
"keep doesn't match ROI batch size {}, {}".format( |
|
keep.shape[0], config.TRAIN_ROIS_PER_IMAGE) |
|
|
|
|
|
rpn_roi_gt_boxes[keep_bg_ids, :] = 0 |
|
rpn_roi_gt_class_ids[keep_bg_ids] = 0 |
|
|
|
|
|
rois = rpn_rois[keep] |
|
roi_gt_boxes = rpn_roi_gt_boxes[keep] |
|
roi_gt_class_ids = rpn_roi_gt_class_ids[keep] |
|
roi_gt_assignment = rpn_roi_iou_argmax[keep] |
|
|
|
|
|
bboxes = np.zeros((config.TRAIN_ROIS_PER_IMAGE, |
|
config.NUM_CLASSES, 4), dtype=np.float32) |
|
pos_ids = np.where(roi_gt_class_ids > 0)[0] |
|
bboxes[pos_ids, roi_gt_class_ids[pos_ids]] = utils.box_refinement( |
|
rois[pos_ids], roi_gt_boxes[pos_ids, :4]) |
|
|
|
bboxes /= config.BBOX_STD_DEV |
|
|
|
|
|
masks = np.zeros((config.TRAIN_ROIS_PER_IMAGE, config.MASK_SHAPE[0], config.MASK_SHAPE[1], config.NUM_CLASSES), |
|
dtype=np.float32) |
|
for i in pos_ids: |
|
class_id = roi_gt_class_ids[i] |
|
assert class_id > 0, "class id must be greater than 0" |
|
gt_id = roi_gt_assignment[i] |
|
class_mask = gt_masks[:, :, gt_id] |
|
|
|
if config.USE_MINI_MASK: |
|
|
|
placeholder = np.zeros(config.IMAGE_SHAPE[:2], dtype=bool) |
|
|
|
gt_y1, gt_x1, gt_y2, gt_x2 = gt_boxes[gt_id] |
|
gt_w = gt_x2 - gt_x1 |
|
gt_h = gt_y2 - gt_y1 |
|
|
|
placeholder[gt_y1:gt_y2, gt_x1:gt_x2] = \ |
|
np.round(scipy.misc.imresize(class_mask.astype(float), (gt_h, gt_w), |
|
interp='nearest') / 255.0).astype(bool) |
|
|
|
class_mask = placeholder |
|
|
|
|
|
y1, x1, y2, x2 = rois[i].astype(np.int32) |
|
m = class_mask[y1:y2, x1:x2] |
|
mask = scipy.misc.imresize( |
|
m.astype(float), config.MASK_SHAPE, interp='nearest') / 255.0 |
|
masks[i, :, :, class_id] = mask |
|
|
|
return rois, roi_gt_class_ids, bboxes, masks |
|
|
|
|
|
def build_rpn_targets(image_shape, anchors, gt_class_ids, gt_boxes, config): |
|
"""Given the anchors and GT boxes, compute overlaps and identify positive |
|
anchors and deltas to refine them to match their corresponding GT boxes. |
|
|
|
anchors: [num_anchors, (y1, x1, y2, x2)] |
|
gt_class_ids: [num_gt_boxes] Integer class IDs. |
|
gt_boxes: [num_gt_boxes, (y1, x1, y2, x2)] |
|
|
|
Returns: |
|
rpn_match: [N] (int32) matches between anchors and GT boxes. |
|
1 = positive anchor, -1 = negative anchor, 0 = neutral |
|
rpn_bbox: [N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas. |
|
""" |
|
|
|
rpn_match = np.zeros([anchors.shape[0]], dtype=np.int32) |
|
|
|
rpn_bbox = np.zeros((config.RPN_TRAIN_ANCHORS_PER_IMAGE, 4)) |
|
|
|
|
|
|
|
|
|
crowd_ix = np.where(gt_class_ids < 0)[0] |
|
if crowd_ix.shape[0] > 0: |
|
|
|
non_crowd_ix = np.where(gt_class_ids > 0)[0] |
|
crowd_boxes = gt_boxes[crowd_ix] |
|
gt_class_ids = gt_class_ids[non_crowd_ix] |
|
gt_boxes = gt_boxes[non_crowd_ix] |
|
|
|
crowd_overlaps = utils.compute_overlaps(anchors, crowd_boxes) |
|
crowd_iou_max = np.amax(crowd_overlaps, axis=1) |
|
no_crowd_bool = (crowd_iou_max < 0.001) |
|
else: |
|
|
|
no_crowd_bool = np.ones([anchors.shape[0]], dtype=bool) |
|
|
|
|
|
overlaps = utils.compute_overlaps(anchors, gt_boxes) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
anchor_iou_argmax = np.argmax(overlaps, axis=1) |
|
anchor_iou_max = overlaps[np.arange(overlaps.shape[0]), anchor_iou_argmax] |
|
rpn_match[(anchor_iou_max < 0.3) & (no_crowd_bool)] = -1 |
|
|
|
|
|
gt_iou_argmax = np.argmax(overlaps, axis=0) |
|
rpn_match[gt_iou_argmax] = 1 |
|
|
|
rpn_match[anchor_iou_max >= 0.7] = 1 |
|
|
|
|
|
|
|
ids = np.where(rpn_match == 1)[0] |
|
extra = len(ids) - (config.RPN_TRAIN_ANCHORS_PER_IMAGE // 2) |
|
if extra > 0: |
|
|
|
ids = np.random.choice(ids, extra, replace=False) |
|
rpn_match[ids] = 0 |
|
|
|
ids = np.where(rpn_match == -1)[0] |
|
extra = len(ids) - (config.RPN_TRAIN_ANCHORS_PER_IMAGE - |
|
np.sum(rpn_match == 1)) |
|
if extra > 0: |
|
|
|
ids = np.random.choice(ids, extra, replace=False) |
|
rpn_match[ids] = 0 |
|
|
|
|
|
|
|
ids = np.where(rpn_match == 1)[0] |
|
ix = 0 |
|
|
|
for i, a in zip(ids, anchors[ids]): |
|
|
|
gt = gt_boxes[anchor_iou_argmax[i]] |
|
|
|
|
|
|
|
gt_h = gt[2] - gt[0] |
|
gt_w = gt[3] - gt[1] |
|
gt_center_y = gt[0] + 0.5 * gt_h |
|
gt_center_x = gt[1] + 0.5 * gt_w |
|
|
|
a_h = a[2] - a[0] |
|
a_w = a[3] - a[1] |
|
a_center_y = a[0] + 0.5 * a_h |
|
a_center_x = a[1] + 0.5 * a_w |
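        # Compute the bbox refinement the RPN should learn to predict:
        # dy = (gt_cy - a_cy) / a_h,  dx = (gt_cx - a_cx) / a_w,
        # dh = log(gt_h / a_h),       dw = log(gt_w / a_w)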
|
|
|
|
|
rpn_bbox[ix] = [ |
|
(gt_center_y - a_center_y) / a_h, |
|
(gt_center_x - a_center_x) / a_w, |
|
np.log(gt_h / a_h), |
|
np.log(gt_w / a_w), |
|
] |
|
|
|
rpn_bbox[ix] /= config.RPN_BBOX_STD_DEV |
|
ix += 1 |
|
|
|
return rpn_match, rpn_bbox |
|
|
|
|
|
def generate_random_rois(image_shape, count, gt_class_ids, gt_boxes): |
|
"""Generates ROI proposals similar to what a region proposal network |
|
would generate. |
|
|
|
image_shape: [Height, Width, Depth] |
|
count: Number of ROIs to generate |
|
gt_class_ids: [N] Integer ground truth class IDs |
|
gt_boxes: [N, (y1, x1, y2, x2)] Ground truth boxes in pixels. |
|
|
|
Returns: [count, (y1, x1, y2, x2)] ROI boxes in pixels. |
|
""" |
|
|
|
rois = np.zeros((count, 4), dtype=np.int32) |
|
|
|
|
|
rois_per_box = int(0.9 * count / gt_boxes.shape[0]) |
|
for i in range(gt_boxes.shape[0]): |
|
gt_y1, gt_x1, gt_y2, gt_x2 = gt_boxes[i] |
|
h = gt_y2 - gt_y1 |
|
w = gt_x2 - gt_x1 |
|
|
|
r_y1 = max(gt_y1 - h, 0) |
|
r_y2 = min(gt_y2 + h, image_shape[0]) |
|
r_x1 = max(gt_x1 - w, 0) |
|
r_x2 = min(gt_x2 + w, image_shape[1]) |
|
|
|
|
|
|
|
|
|
while True: |
|
y1y2 = np.random.randint(r_y1, r_y2, (rois_per_box * 2, 2)) |
|
x1x2 = np.random.randint(r_x1, r_x2, (rois_per_box * 2, 2)) |
|
|
|
threshold = 1 |
|
y1y2 = y1y2[np.abs(y1y2[:, 0] - y1y2[:, 1]) >= |
|
threshold][:rois_per_box] |
|
x1x2 = x1x2[np.abs(x1x2[:, 0] - x1x2[:, 1]) >= |
|
threshold][:rois_per_box] |
|
if y1y2.shape[0] == rois_per_box and x1x2.shape[0] == rois_per_box: |
|
break |
|
|
|
|
|
|
|
x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1) |
|
y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1) |
|
box_rois = np.hstack([y1, x1, y2, x2]) |
|
rois[rois_per_box * i:rois_per_box * (i + 1)] = box_rois |
|
|
|
|
|
remaining_count = count - (rois_per_box * gt_boxes.shape[0]) |
|
|
|
|
|
|
|
while True: |
|
y1y2 = np.random.randint(0, image_shape[0], (remaining_count * 2, 2)) |
|
x1x2 = np.random.randint(0, image_shape[1], (remaining_count * 2, 2)) |
|
|
|
threshold = 1 |
|
y1y2 = y1y2[np.abs(y1y2[:, 0] - y1y2[:, 1]) >= |
|
threshold][:remaining_count] |
|
x1x2 = x1x2[np.abs(x1x2[:, 0] - x1x2[:, 1]) >= |
|
threshold][:remaining_count] |
|
if y1y2.shape[0] == remaining_count and x1x2.shape[0] == remaining_count: |
|
break |
|
|
|
|
|
|
|
x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1) |
|
y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1) |
|
global_rois = np.hstack([y1, x1, y2, x2]) |
|
rois[-remaining_count:] = global_rois |
|
return rois |
|
|
|
|
|
def data_generator(dataset, config, shuffle=True, augment=True, random_rois=0, |
|
batch_size=1, detection_targets=False): |
|
"""A generator that returns images and corresponding target class ids, |
|
bounding box deltas, and masks. |
|
|
|
dataset: The Dataset object to pick data from |
|
config: The model config object |
|
shuffle: If True, shuffles the samples before every epoch |
|
augment: If True, applies image augmentation to images (currently only |
|
horizontal flips are supported) |
|
random_rois: If > 0 then generate proposals to be used to train the |
|
network classifier and mask heads. Useful if training |
|
the Mask RCNN part without the RPN. |
|
batch_size: How many images to return in each call |
|
detection_targets: If True, generate detection targets (class IDs, bbox |
|
deltas, and masks). Typically for debugging or visualizations because |
|
        in training, detection targets are generated by DetectionTargetLayer.
|
|
|
Returns a Python generator. Upon calling next() on it, the |
|
    generator returns two lists, inputs and outputs. The contents

    of the lists differ depending on the received arguments:
|
inputs list: |
|
- images: [batch, H, W, C] |
|
- image_meta: [batch, size of image meta] |
|
- rpn_match: [batch, N] Integer (1=positive anchor, -1=negative, 0=neutral) |
|
- rpn_bbox: [batch, N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas. |
|
- gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs |
|
- gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] |
|
- gt_masks: [batch, height, width, MAX_GT_INSTANCES]. The height and width |
|
are those of the image unless use_mini_mask is True, in which |
|
case they are defined in MINI_MASK_SHAPE. |
|
|
|
outputs list: Usually empty in regular training. But if detection_targets |
|
is True then the outputs list contains target class_ids, bbox deltas, |
|
and masks. |
|
""" |
|
b = 0 |
|
image_index = -1 |
|
image_ids = np.copy(dataset.image_ids) |
|
error_count = 0 |
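    # Anchors: [anchor_count, (y1, x1, y2, x2)]. The same anchors are shared by
    # all images, so they are generated once up front.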
|
|
|
|
|
|
|
anchors = utils.generate_pyramid_anchors(config.RPN_ANCHOR_SCALES, |
|
config.RPN_ANCHOR_RATIOS, |
|
config.BACKBONE_SHAPES, |
|
config.BACKBONE_STRIDES, |
|
config.RPN_ANCHOR_STRIDE) |
|
|
|
|
|
while True: |
|
try: |
|
|
|
image_index = (image_index + 1) % len(image_ids) |
|
if shuffle and image_index == 0: |
|
np.random.shuffle(image_ids) |
|
|
|
|
|
image_id = image_ids[image_index] |
|
image, image_meta, gt_class_ids, gt_boxes, gt_masks = \ |
|
load_image_gt(dataset, config, image_id, augment=augment, |
|
use_mini_mask=config.USE_MINI_MASK) |
|
|
|
|
|
|
|
|
|
if not np.any(gt_class_ids > 0): |
|
continue |
|
|
|
|
|
rpn_match, rpn_bbox = build_rpn_targets(image.shape, anchors, |
|
gt_class_ids, gt_boxes, config) |
|
|
|
|
|
if random_rois: |
|
rpn_rois = generate_random_rois( |
|
image.shape, random_rois, gt_class_ids, gt_boxes) |
|
if detection_targets: |
|
rois, mrcnn_class_ids, mrcnn_bbox, mrcnn_mask =\ |
|
build_detection_targets( |
|
rpn_rois, gt_class_ids, gt_boxes, gt_masks, config) |
|
|
|
|
|
if b == 0: |
|
batch_image_meta = np.zeros( |
|
(batch_size,) + image_meta.shape, dtype=image_meta.dtype) |
|
batch_rpn_match = np.zeros( |
|
[batch_size, anchors.shape[0], 1], dtype=rpn_match.dtype) |
|
batch_rpn_bbox = np.zeros( |
|
[batch_size, config.RPN_TRAIN_ANCHORS_PER_IMAGE, 4], dtype=rpn_bbox.dtype) |
|
batch_images = np.zeros( |
|
(batch_size,) + image.shape, dtype=np.float32) |
|
batch_gt_class_ids = np.zeros( |
|
(batch_size, config.MAX_GT_INSTANCES), dtype=np.int32) |
|
batch_gt_boxes = np.zeros( |
|
(batch_size, config.MAX_GT_INSTANCES, 4), dtype=np.int32) |
|
if config.USE_MINI_MASK: |
|
batch_gt_masks = np.zeros((batch_size, config.MINI_MASK_SHAPE[0], config.MINI_MASK_SHAPE[1], |
|
config.MAX_GT_INSTANCES)) |
|
else: |
|
batch_gt_masks = np.zeros( |
|
(batch_size, image.shape[0], image.shape[1], config.MAX_GT_INSTANCES)) |
|
if random_rois: |
|
batch_rpn_rois = np.zeros( |
|
(batch_size, rpn_rois.shape[0], 4), dtype=rpn_rois.dtype) |
|
if detection_targets: |
|
batch_rois = np.zeros( |
|
(batch_size,) + rois.shape, dtype=rois.dtype) |
|
batch_mrcnn_class_ids = np.zeros( |
|
(batch_size,) + mrcnn_class_ids.shape, dtype=mrcnn_class_ids.dtype) |
|
batch_mrcnn_bbox = np.zeros( |
|
(batch_size,) + mrcnn_bbox.shape, dtype=mrcnn_bbox.dtype) |
|
batch_mrcnn_mask = np.zeros( |
|
(batch_size,) + mrcnn_mask.shape, dtype=mrcnn_mask.dtype) |
|
|
|
|
|
if gt_boxes.shape[0] > config.MAX_GT_INSTANCES: |
|
ids = np.random.choice( |
|
np.arange(gt_boxes.shape[0]), config.MAX_GT_INSTANCES, replace=False) |
|
gt_class_ids = gt_class_ids[ids] |
|
gt_boxes = gt_boxes[ids] |
|
gt_masks = gt_masks[:, :, ids] |
|
|
|
|
|
batch_image_meta[b] = image_meta |
|
batch_rpn_match[b] = rpn_match[:, np.newaxis] |
|
batch_rpn_bbox[b] = rpn_bbox |
|
batch_images[b] = mold_image(image.astype(np.float32), config) |
|
batch_gt_class_ids[b, :gt_class_ids.shape[0]] = gt_class_ids |
|
batch_gt_boxes[b, :gt_boxes.shape[0]] = gt_boxes |
|
batch_gt_masks[b, :, :, :gt_masks.shape[-1]] = gt_masks |
|
if random_rois: |
|
batch_rpn_rois[b] = rpn_rois |
|
if detection_targets: |
|
batch_rois[b] = rois |
|
batch_mrcnn_class_ids[b] = mrcnn_class_ids |
|
batch_mrcnn_bbox[b] = mrcnn_bbox |
|
batch_mrcnn_mask[b] = mrcnn_mask |
|
b += 1 |
|
|
|
|
|
if b >= batch_size: |
|
inputs = [batch_images, batch_image_meta, batch_rpn_match, batch_rpn_bbox, |
|
batch_gt_class_ids, batch_gt_boxes, batch_gt_masks] |
|
outputs = [] |
|
|
|
if random_rois: |
|
inputs.extend([batch_rpn_rois]) |
|
if detection_targets: |
|
inputs.extend([batch_rois]) |
|
|
|
batch_mrcnn_class_ids = np.expand_dims( |
|
batch_mrcnn_class_ids, -1) |
|
outputs.extend( |
|
[batch_mrcnn_class_ids, batch_mrcnn_bbox, batch_mrcnn_mask]) |
|
|
|
yield inputs, outputs |
|
|
|
|
|
b = 0 |
|
except (GeneratorExit, KeyboardInterrupt): |
|
raise |
|
except: |
|
|
|
logging.exception("Error processing image {}".format( |
|
dataset.image_info[image_id])) |
|
error_count += 1 |
|
if error_count > 5: |
|
raise |
|
|
|
|
|
|
|
|
|
|
|
|
|
class MaskRCNN(): |
|
"""Encapsulates the Mask RCNN model functionality. |
|
|
|
The actual Keras model is in the keras_model property. |
|
""" |
|
|
|
def __init__(self, mode, config, model_dir): |
|
""" |
|
mode: Either "training" or "inference" |
|
config: A Sub-class of the Config class |
|
model_dir: Directory to save training logs and trained weights |
|
""" |
|
assert mode in ['training', 'inference'] |
|
self.mode = mode |
|
self.config = config |
|
self.model_dir = model_dir |
|
self.set_log_dir() |
|
self.keras_model = self.build(mode=mode, config=config) |
|
|
|
def build(self, mode, config): |
|
"""Build Mask R-CNN architecture. |
|
        The input image shape is taken from config.IMAGE_SHAPE.
|
mode: Either "training" or "inference". The inputs and |
|
outputs of the model differ accordingly. |
|
""" |
|
assert mode in ['training', 'inference'] |
|
|
|
|
|
h, w = config.IMAGE_SHAPE[:2] |
|
if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6): |
|
raise Exception("Image size must be dividable by 2 at least 6 times " |
|
"to avoid fractions when downscaling and upscaling." |
|
"For example, use 256, 320, 384, 448, 512, ... etc. ") |
|
|
|
|
|
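        # Inputs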
input_image = KL.Input( |
|
shape=config.IMAGE_SHAPE.tolist(), name="input_image") |
|
input_image_meta = KL.Input(shape=[None], name="input_image_meta") |
|
if mode == "training": |
|
|
|
input_rpn_match = KL.Input( |
|
shape=[None, 1], name="input_rpn_match", dtype=tf.int32) |
|
input_rpn_bbox = KL.Input( |
|
shape=[None, 4], name="input_rpn_bbox", dtype=tf.float32) |
|
|
|
|
|
|
|
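            # Detection GT (class IDs, bounding boxes, and masks)
            # 1. GT class IDs (zero padded)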
input_gt_class_ids = KL.Input( |
|
shape=[None], name="input_gt_class_ids", dtype=tf.int32) |
|
|
|
|
|
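            # 2. GT boxes in pixels (zero padded)
            # [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates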
input_gt_boxes = KL.Input( |
|
shape=[None, 4], name="input_gt_boxes", dtype=tf.float32) |
|
|
|
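            # Normalize box coordinates to the 0-1 range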
h, w = K.shape(input_image)[1], K.shape(input_image)[2] |
|
image_scale = K.cast(K.stack([h, w, h, w], axis=0), tf.float32) |
|
gt_boxes = KL.Lambda(lambda x: x / image_scale)(input_gt_boxes) |
|
|
|
|
|
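            # 3. GT masks (zero padded)
            # [batch, height, width, MAX_GT_INSTANCES]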
if config.USE_MINI_MASK: |
|
input_gt_masks = KL.Input( |
|
shape=[config.MINI_MASK_SHAPE[0], |
|
config.MINI_MASK_SHAPE[1], None], |
|
name="input_gt_masks", dtype=bool) |
|
else: |
|
input_gt_masks = KL.Input( |
|
shape=[config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1], None], |
|
name="input_gt_masks", dtype=bool) |
|
|
|
|
|
|
|
|
|
|
|
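        # Build the shared convolutional layers (bottom-up ResNet backbone).
        # Returns the last layer of each stage.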
_, C2, C3, C4, C5 = resnet_graph(input_image, config.BACKBONE, stage5=True) |
|
|
|
|
|
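        # Top-down FPN layers with lateral connections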
P5 = KL.Conv2D(256, (1, 1), name='fpn_c5p5')(C5) |
|
P4 = KL.Add(name="fpn_p4add")([ |
|
KL.UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(P5), |
|
KL.Conv2D(256, (1, 1), name='fpn_c4p4')(C4)]) |
|
P3 = KL.Add(name="fpn_p3add")([ |
|
KL.UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(P4), |
|
KL.Conv2D(256, (1, 1), name='fpn_c3p3')(C3)]) |
|
P2 = KL.Add(name="fpn_p2add")([ |
|
KL.UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(P3), |
|
KL.Conv2D(256, (1, 1), name='fpn_c2p2')(C2)]) |
|
|
|
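        # Attach 3x3 conv to all P layers to get the final feature maps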
P2 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p2")(P2) |
|
P3 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p3")(P3) |
|
P4 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p4")(P4) |
|
P5 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p5")(P5) |
|
|
|
|
|
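        # P6 is used for the 5th anchor scale in the RPN. Generated by
        # subsampling from P5 with stride of 2.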
P6 = KL.MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(P5) |
|
|
|
|
|
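        # Note that P6 is used in the RPN, but not in the classifier heads.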
rpn_feature_maps = [P2, P3, P4, P5, P6] |
|
mrcnn_feature_maps = [P2, P3, P4, P5] |
|
|
|
|
|
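        # Generate anchors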
self.anchors = utils.generate_pyramid_anchors(config.RPN_ANCHOR_SCALES, |
|
config.RPN_ANCHOR_RATIOS, |
|
config.BACKBONE_SHAPES, |
|
config.BACKBONE_STRIDES, |
|
config.RPN_ANCHOR_STRIDE) |
|
|
|
|
|
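        # RPN model, shared across all pyramid levels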
rpn = build_rpn_model(config.RPN_ANCHOR_STRIDE, |
|
len(config.RPN_ANCHOR_RATIOS), 256) |
|
|
|
layer_outputs = [] |
|
for p in rpn_feature_maps: |
|
layer_outputs.append(rpn([p])) |
|
|
|
|
|
|
|
|
|
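        # Concatenate layer outputs
        # Convert from a list of per-level outputs to a list of outputs
        # concatenated across levels, e.g.
        # [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]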
output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"] |
|
outputs = list(zip(*layer_outputs)) |
|
outputs = [KL.Concatenate(axis=1, name=n)(list(o)) |
|
for o, n in zip(outputs, output_names)] |
|
|
|
rpn_class_logits, rpn_class, rpn_bbox = outputs |
|
|
|
|
|
|
|
|
|
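        # Generate proposals
        # Proposals are [batch, N, (y1, x1, y2, x2)] in normalized
        # coordinates and zero padded.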
proposal_count = config.POST_NMS_ROIS_TRAINING if mode == "training"\ |
|
else config.POST_NMS_ROIS_INFERENCE |
|
rpn_rois = ProposalLayer(proposal_count=proposal_count, |
|
nms_threshold=config.RPN_NMS_THRESHOLD, |
|
name="ROI", |
|
anchors=self.anchors, |
|
config=config)([rpn_class, rpn_bbox]) |
|
|
|
if mode == "training": |
|
|
|
|
|
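            # Class ID mask to mark class IDs supported by the dataset the
            # image came from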
_, _, _, active_class_ids = KL.Lambda(lambda x: parse_image_meta_graph(x), |
|
mask=[None, None, None, None])(input_image_meta) |
|
|
|
if not config.USE_RPN_ROIS: |
|
|
|
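                # Ignore predicted ROIs and use ROIs provided as an input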
input_rois = KL.Input(shape=[config.POST_NMS_ROIS_TRAINING, 4], |
|
name="input_roi", dtype=np.int32) |
|
|
|
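                # Normalize coordinates to the 0-1 range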
target_rois = KL.Lambda(lambda x: K.cast( |
|
x, tf.float32) / image_scale[:4])(input_rois) |
|
else: |
|
target_rois = rpn_rois |
|
|
|
|
|
|
|
|
|
|
|
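            # Generate detection targets
            # Subsamples proposals and generates target outputs for training.
            # Note that proposal class IDs, gt_boxes, and gt_masks are zero
            # padded. Equally, returned rois and targets are zero padded.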
rois, target_class_ids, target_bbox, target_mask =\ |
|
DetectionTargetLayer(config, name="proposal_targets")([ |
|
target_rois, input_gt_class_ids, gt_boxes, input_gt_masks]) |
|
|
|
|
|
|
|
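            # Network heads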
mrcnn_class_logits, mrcnn_class, mrcnn_bbox =\ |
|
fpn_classifier_graph(rois, mrcnn_feature_maps, config.IMAGE_SHAPE, |
|
config.POOL_SIZE, config.NUM_CLASSES) |
|
|
|
mrcnn_mask = build_fpn_mask_graph(rois, mrcnn_feature_maps, |
|
config.IMAGE_SHAPE, |
|
config.MASK_POOL_SIZE, |
|
config.NUM_CLASSES) |
|
|
|
|
|
output_rois = KL.Lambda(lambda x: x * 1, name="output_rois")(rois) |
|
|
|
|
|
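            # Losses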
rpn_class_loss = KL.Lambda(lambda x: rpn_class_loss_graph(*x), name="rpn_class_loss")( |
|
[input_rpn_match, rpn_class_logits]) |
|
rpn_bbox_loss = KL.Lambda(lambda x: rpn_bbox_loss_graph(config, *x), name="rpn_bbox_loss")( |
|
[input_rpn_bbox, input_rpn_match, rpn_bbox]) |
|
class_loss = KL.Lambda(lambda x: mrcnn_class_loss_graph(*x), name="mrcnn_class_loss")( |
|
[target_class_ids, mrcnn_class_logits, active_class_ids]) |
|
bbox_loss = KL.Lambda(lambda x: mrcnn_bbox_loss_graph(*x), name="mrcnn_bbox_loss")( |
|
[target_bbox, target_class_ids, mrcnn_bbox]) |
|
mask_loss = KL.Lambda(lambda x: mrcnn_mask_loss_graph(*x), name="mrcnn_mask_loss")( |
|
[target_mask, target_class_ids, mrcnn_mask]) |
|
|
|
|
|
inputs = [input_image, input_image_meta, |
|
input_rpn_match, input_rpn_bbox, input_gt_class_ids, input_gt_boxes, input_gt_masks] |
|
if not config.USE_RPN_ROIS: |
|
inputs.append(input_rois) |
|
outputs = [rpn_class_logits, rpn_class, rpn_bbox, |
|
mrcnn_class_logits, mrcnn_class, mrcnn_bbox, mrcnn_mask, |
|
rpn_rois, output_rois, |
|
rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss, mask_loss] |
|
model = KM.Model(inputs, outputs, name='mask_rcnn') |
|
else: |
|
|
|
|
|
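            # Network heads
            # Proposal classifier and bounding box regressor heads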
mrcnn_class_logits, mrcnn_class, mrcnn_bbox =\ |
|
fpn_classifier_graph(rpn_rois, mrcnn_feature_maps, config.IMAGE_SHAPE, |
|
config.POOL_SIZE, config.NUM_CLASSES) |
|
|
|
|
|
|
|
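            # Detections
            # Output is [batch, num_detections, (y1, x1, y2, x2, class_id, score)]
            # in image coordinates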
detections = DetectionLayer(config, name="mrcnn_detection")( |
|
[rpn_rois, mrcnn_class, mrcnn_bbox, input_image_meta]) |
|
|
|
|
|
|
|
|
|
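            # Convert boxes to normalized coordinates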
h, w = config.IMAGE_SHAPE[:2] |
|
detection_boxes = KL.Lambda( |
|
lambda x: x[..., :4] / np.array([h, w, h, w]))(detections) |
|
|
|
|
|
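            # Create masks for the detections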
mrcnn_mask = build_fpn_mask_graph(detection_boxes, mrcnn_feature_maps, |
|
config.IMAGE_SHAPE, |
|
config.MASK_POOL_SIZE, |
|
config.NUM_CLASSES) |
|
|
|
model = KM.Model([input_image, input_image_meta], |
|
[detections, mrcnn_class, mrcnn_bbox, |
|
mrcnn_mask, rpn_rois, rpn_class, rpn_bbox], |
|
name='mask_rcnn') |
|
|
|
|
|
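        # Add multi-GPU support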
if config.GPU_COUNT > 1: |
|
from parallel_model import ParallelModel |
|
model = ParallelModel(model, config.GPU_COUNT) |
|
|
|
return model |
|
|
|
def find_last(self): |
|
"""Finds the last checkpoint file of the last trained model in the |
|
model directory. |
|
Returns: |
|
log_dir: The directory where events and weights are saved |
|
checkpoint_path: the path to the last checkpoint file |
|
""" |
|
|
|
dir_names = next(os.walk(self.model_dir))[1] |
|
key = self.config.NAME.lower() |
|
dir_names = filter(lambda f: f.startswith(key), dir_names) |
|
dir_names = sorted(dir_names) |
|
if not dir_names: |
|
return None, None |
|
|
|
dir_name = os.path.join(self.model_dir, dir_names[-1]) |
|
|
|
checkpoints = next(os.walk(dir_name))[2] |
|
checkpoints = filter(lambda f: f.startswith("mask_rcnn"), checkpoints) |
|
checkpoints = sorted(checkpoints) |
|
if not checkpoints: |
|
return dir_name, None |
|
checkpoint = os.path.join(dir_name, checkpoints[-1]) |
|
return dir_name, checkpoint |
|
|
|
def load_weights(self, filepath, by_name=False, exclude=None): |
|
"""Modified version of the correspoding Keras function with |
|
the addition of multi-GPU support and the ability to exclude |
|
some layers from loading. |
|
exlude: list of layer names to excluce |
|
""" |
|
import h5py |
|
from keras.engine import topology |
|
|
|
if exclude: |
|
by_name = True |
|
|
|
if h5py is None: |
|
raise ImportError('`load_weights` requires h5py.') |
|
f = h5py.File(filepath, mode='r') |
|
if 'layer_names' not in f.attrs and 'model_weights' in f: |
|
f = f['model_weights'] |
|
|
|
|
|
|
|
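        # In multi-GPU training, we wrap the model. Get layers
        # of the inner model because they have the weights.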
keras_model = self.keras_model |
|
layers = keras_model.inner_model.layers if hasattr(keras_model, "inner_model")\ |
|
else keras_model.layers |
|
|
|
|
|
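        # Exclude some layers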
if exclude: |
|
layers = filter(lambda l: l.name not in exclude, layers) |
|
|
|
if by_name: |
|
topology.load_weights_from_hdf5_group_by_name(f, layers) |
|
else: |
|
topology.load_weights_from_hdf5_group(f, layers) |
|
if hasattr(f, 'close'): |
|
f.close() |
|
|
|
|
|
self.set_log_dir(filepath) |
|
|
|
def get_imagenet_weights(self): |
|
"""Downloads ImageNet trained weights from Keras. |
|
Returns path to weights file. |
|
""" |
|
from keras.utils.data_utils import get_file |
|
TF_WEIGHTS_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/'\ |
|
'releases/download/v0.2/'\ |
|
'resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5' |
|
weights_path = get_file('resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5', |
|
TF_WEIGHTS_PATH_NO_TOP, |
|
cache_subdir='models', |
|
md5_hash='a268eb855778b3df3c7506639542a6af') |
|
return weights_path |
|
|
|
def compile(self, learning_rate, momentum): |
|
"""Gets the model ready for training. Adds losses, regularization, and |
|
metrics. Then calls the Keras compile() function. |
|
""" |
|
|
|
optimizer = keras.optimizers.SGD(lr=learning_rate, momentum=momentum, |
|
clipnorm=5.0) |
|
|
|
|
|
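        # Add losses
        # First, clear previously set losses to avoid duplication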
self.keras_model._losses = [] |
|
self.keras_model._per_input_losses = {} |
|
loss_names = ["rpn_class_loss", "rpn_bbox_loss", |
|
"mrcnn_class_loss", "mrcnn_bbox_loss", "mrcnn_mask_loss"] |
|
for name in loss_names: |
|
layer = self.keras_model.get_layer(name) |
|
if layer.output in self.keras_model.losses: |
|
continue |
|
self.keras_model.add_loss( |
|
tf.reduce_mean(layer.output, keep_dims=True)) |
|
|
|
|
|
|
|
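        # Add L2 regularization
        # Skip gamma and beta weights of batch normalization layers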
reg_losses = [keras.regularizers.l2(self.config.WEIGHT_DECAY)(w) / tf.cast(tf.size(w), tf.float32) |
|
for w in self.keras_model.trainable_weights |
|
if 'gamma' not in w.name and 'beta' not in w.name] |
|
self.keras_model.add_loss(tf.add_n(reg_losses)) |
|
|
|
|
|
self.keras_model.compile(optimizer=optimizer, loss=[ |
|
None] * len(self.keras_model.outputs)) |
|
|
|
|
|
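        # Add metrics for losses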
for name in loss_names: |
|
if name in self.keras_model.metrics_names: |
|
continue |
|
layer = self.keras_model.get_layer(name) |
|
self.keras_model.metrics_names.append(name) |
|
self.keras_model.metrics_tensors.append(tf.reduce_mean( |
|
layer.output, keep_dims=True)) |
|
|
|
def set_trainable(self, layer_regex, keras_model=None, indent=0, verbose=1): |
|
"""Sets model layers as trainable if their names match |
|
the given regular expression. |
|
""" |
|
|
|
if verbose > 0 and keras_model is None: |
|
log("Selecting layers to train") |
|
|
|
keras_model = keras_model or self.keras_model |
|
|
|
|
|
|
|
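        # In multi-GPU training, we wrap the model. Get layers
        # of the inner model because they have the weights.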
layers = keras_model.inner_model.layers if hasattr(keras_model, "inner_model")\ |
|
else keras_model.layers |
|
|
|
for layer in layers: |
|
|
|
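            # Is the layer a model? If so, recurse into it.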
if layer.__class__.__name__ == 'Model': |
|
print("In model: ", layer.name) |
|
self.set_trainable( |
|
layer_regex, keras_model=layer, indent=indent + 4) |
|
continue |
|
|
|
if not layer.weights: |
|
continue |
|
|
|
trainable = bool(re.fullmatch(layer_regex, layer.name)) |
|
|
|
if layer.__class__.__name__ == 'TimeDistributed': |
|
layer.layer.trainable = trainable |
|
else: |
|
layer.trainable = trainable |
|
|
|
if trainable and verbose > 0: |
|
log("{}{:20} ({})".format(" " * indent, layer.name, |
|
layer.__class__.__name__)) |
|
|
|
def set_log_dir(self, model_path=None): |
|
"""Sets the model log directory and epoch counter. |
|
|
|
model_path: If None, or a format different from what this code uses |
|
then set a new log directory and start epochs from 0. Otherwise, |
|
extract the log directory and the epoch counter from the file |
|
name. |
|
""" |
|
|
|
self.epoch = 0 |
|
now = datetime.datetime.now() |
|
|
|
|
|
if model_path: |
|
|
|
|
|
|
|
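            # Continue from where we left off. Get the epoch and date from the
            # file name. A matching model path looks like, e.g.:
            # /path/to/logs/coco20171029T2315/mask_rcnn_coco_0001.h5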
regex = r".*/\w+(\d{4})(\d{2})(\d{2})T(\d{2})(\d{2})/mask\_rcnn\_\w+(\d{4})\.h5" |
|
m = re.match(regex, model_path) |
|
if m: |
|
now = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), |
|
int(m.group(4)), int(m.group(5))) |
|
self.epoch = int(m.group(6)) + 1 |
|
|
|
|
|
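        # Directory for training logs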
self.log_dir = os.path.join(self.model_dir, "{}{:%Y%m%dT%H%M}".format( |
|
self.config.NAME.lower(), now)) |
|
|
|
|
|
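        # Path to save after each epoch. Includes placeholders that get filled by Keras.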
self.checkpoint_path = os.path.join(self.log_dir, "mask_rcnn_{}_*epoch*.h5".format( |
|
self.config.NAME.lower())) |
|
self.checkpoint_path = self.checkpoint_path.replace( |
|
"*epoch*", "{epoch:04d}") |
|
|
|
def train(self, train_dataset, val_dataset, learning_rate, epochs, layers): |
|
"""Train the model. |
|
train_dataset, val_dataset: Training and validation Dataset objects. |
|
learning_rate: The learning rate to train with |
|
epochs: Number of training epochs. Note that previous training epochs |
|
              are considered to be done already, so this actually determines
              the epochs to train in total rather than in this particular
              call.
        layers: Allows selecting which layers to train. It can be:
            - A regular expression to match layer names to train
            - One of these predefined values:
              heads: The RPN, classifier and mask heads of the network
|
all: All the layers |
|
3+: Train Resnet stage 3 and up |
|
4+: Train Resnet stage 4 and up |
|
5+: Train Resnet stage 5 and up |
|
""" |
|
assert self.mode == "training", "Create model in training mode." |
|
|
|
|
|
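        # Pre-defined layer regular expressions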
layer_regex = { |
|
|
|
"heads": r"(mrcnn\_.*)|(rpn\_.*)|(fpn\_.*)", |
|
|
|
"3+": r"(res3.*)|(bn3.*)|(res4.*)|(bn4.*)|(res5.*)|(bn5.*)|(mrcnn\_.*)|(rpn\_.*)|(fpn\_.*)", |
|
"4+": r"(res4.*)|(bn4.*)|(res5.*)|(bn5.*)|(mrcnn\_.*)|(rpn\_.*)|(fpn\_.*)", |
|
"5+": r"(res5.*)|(bn5.*)|(mrcnn\_.*)|(rpn\_.*)|(fpn\_.*)", |
|
|
|
"all": ".*", |
|
} |
|
if layers in layer_regex.keys(): |
|
layers = layer_regex[layers] |
|
|
|
|
|
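        # Data generators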
train_generator = data_generator(train_dataset, self.config, shuffle=True, |
|
batch_size=self.config.BATCH_SIZE) |
|
val_generator = data_generator(val_dataset, self.config, shuffle=True, |
|
batch_size=self.config.BATCH_SIZE, |
|
augment=False) |
|
|
|
|
|
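        # Callbacks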
callbacks = [ |
|
keras.callbacks.TensorBoard(log_dir=self.log_dir, |
|
histogram_freq=0, write_graph=True, write_images=False), |
|
keras.callbacks.ModelCheckpoint(self.checkpoint_path, |
|
verbose=0, save_weights_only=True), |
|
] |
|
|
|
|
|
log("\nStarting at epoch {}. LR={}\n".format(self.epoch, learning_rate)) |
|
log("Checkpoint Path: {}".format(self.checkpoint_path)) |
|
self.set_trainable(layers) |
|
self.compile(learning_rate, self.config.LEARNING_MOMENTUM) |
|
|
|
|
|
|
|
|
|
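        # Multiprocessing generator workers are unreliable on Windows, so run
        # the generator in the main thread there (workers=0)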
        if os.name == 'nt':
|
workers = 0 |
|
else: |
|
workers = max(self.config.BATCH_SIZE // 2, 2) |
|
|
|
self.keras_model.fit_generator( |
|
train_generator, |
|
initial_epoch=self.epoch, |
|
epochs=epochs, |
|
steps_per_epoch=self.config.STEPS_PER_EPOCH, |
|
callbacks=callbacks, |
|
            validation_data=val_generator,
|
validation_steps=self.config.VALIDATION_STEPS, |
|
max_queue_size=100, |
|
workers=workers, |
|
use_multiprocessing=True, |
|
) |
|
self.epoch = max(self.epoch, epochs) |
|
|
|
def mold_inputs(self, images): |
|
"""Takes a list of images and modifies them to the format expected |
|
as an input to the neural network. |
|
        images: List of image matrices [height, width, depth]. Images can have
|
different sizes. |
|
|
|
        Returns 3 Numpy matrices:
|
molded_images: [N, h, w, 3]. Images resized and normalized. |
|
image_metas: [N, length of meta data]. Details about each image. |
|
windows: [N, (y1, x1, y2, x2)]. The portion of the image that has the |
|
original image (padding excluded). |
|
""" |
|
molded_images = [] |
|
image_metas = [] |
|
windows = [] |
|
for image in images: |
|
|
|
|
|
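            # Resize the image to fit the model's expected size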
molded_image, window, scale, padding = utils.resize_image( |
|
image, |
|
min_dim=self.config.IMAGE_MIN_DIM, |
|
max_dim=self.config.IMAGE_MAX_DIM, |
|
padding=self.config.IMAGE_PADDING) |
|
molded_image = mold_image(molded_image, self.config) |
|
|
|
image_meta = compose_image_meta( |
|
0, image.shape, window, |
|
np.zeros([self.config.NUM_CLASSES], dtype=np.int32)) |
|
|
|
molded_images.append(molded_image) |
|
windows.append(window) |
|
image_metas.append(image_meta) |
|
|
|
molded_images = np.stack(molded_images) |
|
image_metas = np.stack(image_metas) |
|
windows = np.stack(windows) |
|
return molded_images, image_metas, windows |
|
|
|
def unmold_detections(self, detections, mrcnn_mask, image_shape, window): |
|
"""Reformats the detections of one image from the format of the neural |
|
network output to a format suitable for use in the rest of the |
|
application. |
|
|
|
detections: [N, (y1, x1, y2, x2, class_id, score)] |
|
mrcnn_mask: [N, height, width, num_classes] |
|
image_shape: [height, width, depth] Original size of the image before resizing |
|
window: [y1, x1, y2, x2] Box in the image where the real image is |
|
excluding the padding. |
|
|
|
Returns: |
|
boxes: [N, (y1, x1, y2, x2)] Bounding boxes in pixels |
|
class_ids: [N] Integer class IDs for each bounding box |
|
scores: [N] Float probability scores of the class_id |
|
masks: [height, width, num_instances] Instance masks |
|
""" |
|
|
|
|
|
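        # How many detections do we have?
        # The detections array is zero padded. Find the first class_id == 0.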
zero_ix = np.where(detections[:, 4] == 0)[0] |
|
N = zero_ix[0] if zero_ix.shape[0] > 0 else detections.shape[0] |
|
|
|
|
|
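        # Extract boxes, class_ids, scores, and class-specific masks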
boxes = detections[:N, :4] |
|
class_ids = detections[:N, 4].astype(np.int32) |
|
scores = detections[:N, 5] |
|
masks = mrcnn_mask[np.arange(N), :, :, class_ids] |
|
|
|
|
|
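        # Compute the scale and shift to translate coordinates to the image domain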
h_scale = image_shape[0] / (window[2] - window[0]) |
|
w_scale = image_shape[1] / (window[3] - window[1]) |
|
scale = min(h_scale, w_scale) |
|
shift = window[:2] |
|
scales = np.array([scale, scale, scale, scale]) |
|
shifts = np.array([shift[0], shift[1], shift[0], shift[1]]) |
|
|
|
|
|
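        # Translate bounding boxes to the image domain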
boxes = np.multiply(boxes - shifts, scales).astype(np.int32) |
|
|
|
|
|
|
|
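        # Filter out detections with zero area. These can happen early in
        # training when the network weights are still mostly random.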
exclude_ix = np.where( |
|
(boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) <= 0)[0] |
|
if exclude_ix.shape[0] > 0: |
|
boxes = np.delete(boxes, exclude_ix, axis=0) |
|
class_ids = np.delete(class_ids, exclude_ix, axis=0) |
|
scores = np.delete(scores, exclude_ix, axis=0) |
|
masks = np.delete(masks, exclude_ix, axis=0) |
|
N = class_ids.shape[0] |
|
|
|
|
|
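        # Resize masks to the original image size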
full_masks = [] |
|
for i in range(N): |
|
|
|
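            # Convert the neural network mask to a full-size mask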
full_mask = utils.unmold_mask(masks[i], boxes[i], image_shape) |
|
full_masks.append(full_mask) |
|
        full_masks = np.stack(full_masks, axis=-1)\
            if full_masks else np.empty((image_shape[0], image_shape[1], 0))
|
|
|
return boxes, class_ids, scores, full_masks |
|
|
|
def detect(self, images, verbose=0): |
|
"""Runs the detection pipeline. |
|
|
|
images: List of images, potentially of different sizes. |
|
|
|
Returns a list of dicts, one dict per image. The dict contains: |
|
rois: [N, (y1, x1, y2, x2)] detection bounding boxes |
|
class_ids: [N] int class IDs |
|
scores: [N] float probability scores for the class IDs |
|
masks: [H, W, N] instance binary masks |
|
""" |
|
assert self.mode == "inference", "Create model in inference mode." |
|
assert len( |
|
images) == self.config.BATCH_SIZE, "len(images) must be equal to BATCH_SIZE" |
|
|
|
if verbose: |
|
log("Processing {} images".format(len(images))) |
|
for image in images: |
|
log("image", image) |
|
|
|
molded_images, image_metas, windows = self.mold_inputs(images) |
|
if verbose: |
|
log("molded_images", molded_images) |
|
log("image_metas", image_metas) |
|
|
|
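        # Run object detection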
detections, mrcnn_class, mrcnn_bbox, mrcnn_mask, \ |
|
rois, rpn_class, rpn_bbox =\ |
|
self.keras_model.predict([molded_images, image_metas], verbose=0) |
|
|
|
results = [] |
|
for i, image in enumerate(images): |
|
final_rois, final_class_ids, final_scores, final_masks =\ |
|
self.unmold_detections(detections[i], mrcnn_mask[i], |
|
image.shape, windows[i]) |
|
results.append({ |
|
"rois": final_rois, |
|
"class_ids": final_class_ids, |
|
"scores": final_scores, |
|
"masks": final_masks, |
|
}) |
|
return results |
|
|
|
def ancestor(self, tensor, name, checked=None): |
|
"""Finds the ancestor of a TF tensor in the computation graph. |
|
tensor: TensorFlow symbolic tensor. |
|
name: Name of ancestor tensor to find |
|
checked: For internal use. A list of tensors that were already |
|
searched to avoid loops in traversing the graph. |
|
""" |
|
checked = checked if checked is not None else [] |
|
|
|
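        # Put a limit on how deep we go to avoid very long loops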
if len(checked) > 500: |
|
return None |
|
|
|
|
|
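        # Convert the name to a regex and allow matching a number prefix
        # because Keras adds them automatically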
if isinstance(name, str): |
|
name = re.compile(name.replace("/", r"(\_\d+)*/")) |
|
|
|
parents = tensor.op.inputs |
|
for p in parents: |
|
if p in checked: |
|
continue |
|
if bool(re.fullmatch(name, p.name)): |
|
return p |
|
checked.append(p) |
|
a = self.ancestor(p, name, checked) |
|
if a is not None: |
|
return a |
|
return None |
|
|
|
def find_trainable_layer(self, layer): |
|
"""If a layer is encapsulated by another layer, this function |
|
digs through the encapsulation and returns the layer that holds |
|
the weights. |
|
""" |
|
if layer.__class__.__name__ == 'TimeDistributed': |
|
return self.find_trainable_layer(layer.layer) |
|
return layer |
|
|
|
def get_trainable_layers(self): |
|
"""Returns a list of layers that have weights.""" |
|
layers = [] |
|
|
|
for l in self.keras_model.layers: |
|
|
|
l = self.find_trainable_layer(l) |
|
|
|
if l.get_weights(): |
|
layers.append(l) |
|
return layers |
|
|
|
def run_graph(self, images, outputs): |
|
"""Runs a sub-set of the computation graph that computes the given |
|
outputs. |
|
|
|
outputs: List of tuples (name, tensor) to compute. The tensors are |
|
symbolic TensorFlow tensors and the names are for easy tracking. |
|
|
|
Returns an ordered dict of results. Keys are the names received in the |
|
input and values are Numpy arrays. |
|
""" |
|
model = self.keras_model |
|
|
|
|
|
outputs = OrderedDict(outputs) |
|
for o in outputs.values(): |
|
assert o is not None |
|
|
|
|
|
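        # Build a Keras function to run parts of the computation graph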
        inputs = model.inputs
        if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
            inputs = inputs + [K.learning_phase()]
        kf = K.function(inputs, list(outputs.values()))
|
|
|
|
|
molded_images, image_metas, windows = self.mold_inputs(images) |
|
|
|
|
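        # Run inference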
model_in = [molded_images, image_metas] |
|
if model.uses_learning_phase and not isinstance(K.learning_phase(), int): |
|
model_in.append(0.) |
|
outputs_np = kf(model_in) |
|
|
|
|
|
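        # Pack the generated Numpy arrays into an ordered dict and log the results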
outputs_np = OrderedDict([(k, v) |
|
for k, v in zip(outputs.keys(), outputs_np)]) |
|
for k, v in outputs_np.items(): |
|
log(k, v) |
|
return outputs_np |
|
|
|
|
|
|
|
|
|
|
|
|
|
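# Data formatting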
def compose_image_meta(image_id, image_shape, window, active_class_ids): |
|
"""Takes attributes of an image and puts them in one 1D array. |
|
|
|
image_id: An int ID of the image. Useful for debugging. |
|
image_shape: [height, width, channels] |
|
window: (y1, x1, y2, x2) in pixels. The area of the image where the real |
|
image is (excluding the padding) |
|
active_class_ids: List of class_ids available in the dataset from which |
|
the image came. Useful if training on images from multiple datasets |
|
where not all classes are present in all datasets. |
|
""" |
|
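    # Resulting 1D layout: [image_id (1), image_shape (3), window (4),
    #                       active_class_ids (NUM_CLASSES)]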
meta = np.array( |
|
[image_id] + |
|
list(image_shape) + |
|
list(window) + |
|
list(active_class_ids) |
|
) |
|
return meta |
|
|
|
|
|
def parse_image_meta_graph(meta): |
|
"""Parses a tensor that contains image attributes to its components. |
|
See compose_image_meta() for more details. |
|
|
|
meta: [batch, meta length] where meta length depends on NUM_CLASSES |
|
""" |
|
image_id = meta[:, 0] |
|
image_shape = meta[:, 1:4] |
|
window = meta[:, 4:8] |
|
active_class_ids = meta[:, 8:] |
|
return [image_id, image_shape, window, active_class_ids] |
|
|
|
|
|
def mold_image(images, config): |
|
"""Takes RGB images with 0-255 values and subtraces |
|
the mean pixel and converts it to float. Expects image |
|
colors in RGB order. |
|
""" |
|
return images.astype(np.float32) - config.MEAN_PIXEL |
|
|
|
|
|
def unmold_image(normalized_images, config): |
|
"""Takes a image normalized with mold() and returns the original.""" |
|
return (normalized_images + config.MEAN_PIXEL).astype(np.uint8) |
|
|
|
|
|
|
|
|
|
|
|
|
|
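# Miscellaneous graph functions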
def trim_zeros_graph(boxes, name=None): |
|
"""Often boxes are represented with matricies of shape [N, 4] and |
|
are padded with zeros. This removes zero boxes. |
|
|
|
    boxes: [N, 4] matrix of boxes.

    Returns the trimmed boxes and non_zeros, a [N] 1D boolean mask
    identifying the rows to keep.
|
""" |
|
non_zeros = tf.cast(tf.reduce_sum(tf.abs(boxes), axis=1), tf.bool) |
|
boxes = tf.boolean_mask(boxes, non_zeros, name=name) |
|
return boxes, non_zeros |
|
|
|
|
|
def batch_pack_graph(x, counts, num_rows): |
|
"""Picks different number of values from each row |
|
in x depending on the values in counts. |
|
""" |
|
outputs = [] |
|
for i in range(num_rows): |
|
outputs.append(x[i, :counts[i]]) |
|
return tf.concat(outputs, axis=0) |
|
|