# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

r"""Provides DeepLab model definition and helper functions. | |
DeepLab is a deep learning system for semantic image segmentation with | |
the following features: | |
(1) Atrous convolution to explicitly control the resolution at which | |
feature responses are computed within Deep Convolutional Neural Networks. | |
(2) Atrous spatial pyramid pooling (ASPP) to robustly segment objects at | |
multiple scales with filters at multiple sampling rates and effective | |
fields-of-views. | |
(3) ASPP module augmented with image-level feature and batch normalization. | |
(4) A simple yet effective decoder module to recover the object boundaries. | |
See the following papers for more details: | |
"Encoder-Decoder with Atrous Separable Convolution for Semantic Image | |
Segmentation" | |
Liang-Chieh Chen, Yukun Zhu, George Papandreou, Florian Schroff, Hartwig Adam. | |
(https://arxiv.org/abs1802.02611) | |
"Rethinking Atrous Convolution for Semantic Image Segmentation," | |
Liang-Chieh Chen, George Papandreou, Florian Schroff, Hartwig Adam | |
(https://arxiv.org/abs/1706.05587) | |
"DeepLab: Semantic Image Segmentation with Deep Convolutional Nets, | |
Atrous Convolution, and Fully Connected CRFs", | |
Liang-Chieh Chen*, George Papandreou*, Iasonas Kokkinos, Kevin Murphy, | |
Alan L Yuille (* equal contribution) | |
(https://arxiv.org/abs/1606.00915) | |
"Semantic Image Segmentation with Deep Convolutional Nets and Fully Connected | |
CRFs" | |
Liang-Chieh Chen*, George Papandreou*, Iasonas Kokkinos, Kevin Murphy, | |
Alan L. Yuille (* equal contribution) | |
(https://arxiv.org/abs/1412.7062) | |
""" | |
import collections

import tensorflow as tf

from deeplab import model
from feelvos import common
from feelvos.utils import embedding_utils
from feelvos.utils import train_utils

slim = tf.contrib.slim

get_branch_logits = model.get_branch_logits
get_extra_layer_scopes = model.get_extra_layer_scopes
multi_scale_logits_v2 = model.multi_scale_logits
refine_by_decoder = model.refine_by_decoder
scale_dimension = model.scale_dimension
split_separable_conv2d = model.split_separable_conv2d

MERGED_LOGITS_SCOPE = model.MERGED_LOGITS_SCOPE
IMAGE_POOLING_SCOPE = model.IMAGE_POOLING_SCOPE
ASPP_SCOPE = model.ASPP_SCOPE
CONCAT_PROJECTION_SCOPE = model.CONCAT_PROJECTION_SCOPE


def predict_labels(images,
                   model_options,
                   image_pyramid=None,
                   reference_labels=None,
                   k_nearest_neighbors=1,
                   embedding_dimension=None,
                   use_softmax_feedback=False,
                   initial_softmax_feedback=None,
                   embedding_seg_feature_dimension=256,
                   embedding_seg_n_layers=4,
                   embedding_seg_kernel_size=7,
                   embedding_seg_atrous_rates=None,
                   also_return_softmax_probabilities=False,
                   num_frames_per_video=None,
                   normalize_nearest_neighbor_distances=False,
                   also_attend_to_previous_frame=False,
                   use_local_previous_frame_attention=False,
                   previous_frame_attention_window_size=9,
                   use_first_frame_matching=True,
                   also_return_embeddings=False,
                   ref_embeddings=None):
  """Predicts segmentation labels.

  Args:
    images: A tensor of size [batch, height, width, channels].
    model_options: An InternalModelOptions instance to configure models.
    image_pyramid: Input image scales for multi-scale feature extraction.
    reference_labels: A tensor of size [batch, height, width, 1] of ground
      truth labels used to perform a nearest neighbor query.
    k_nearest_neighbors: Integer, the number of neighbors to use for nearest
      neighbor queries.
    embedding_dimension: Integer, the dimension used for the learned embedding.
    use_softmax_feedback: Boolean, whether to give the softmax predictions of
      the last frame as additional input to the segmentation head.
    initial_softmax_feedback: Float32 tensor, or None. Can be used to
      initialize the softmax predictions used for the feedback loop.
      Typically only useful for inference. Only has an effect if
      use_softmax_feedback is True.
    embedding_seg_feature_dimension: Integer, the dimensionality used in the
      segmentation head layers.
    embedding_seg_n_layers: Integer, the number of layers in the segmentation
      head.
    embedding_seg_kernel_size: Integer, the kernel size used in the
      segmentation head.
    embedding_seg_atrous_rates: List of integers of length
      embedding_seg_n_layers, the atrous rates to use for the segmentation
      head.
    also_return_softmax_probabilities: Boolean, if True, additionally return
      the softmax probabilities as second return value.
    num_frames_per_video: Integer, the number of frames per video.
    normalize_nearest_neighbor_distances: Boolean, whether to normalize the
      nearest neighbor distances to [0, 1] using sigmoid, scale and shift.
    also_attend_to_previous_frame: Boolean, whether to also use nearest
      neighbor attention with respect to the previous frame.
    use_local_previous_frame_attention: Boolean, whether to restrict the
      previous frame attention to a local search window. Only has an effect
      if also_attend_to_previous_frame is True.
    previous_frame_attention_window_size: Integer, the window size used for
      local previous frame attention, if use_local_previous_frame_attention
      is True.
    use_first_frame_matching: Boolean, whether to extract features by matching
      to the reference frame. This should always be True except for ablation
      experiments.
    also_return_embeddings: Boolean, whether to return the embeddings as well.
    ref_embeddings: Tuple of (first_frame_embeddings,
      previous_frame_embeddings), each of shape
      [batch, height, width, embedding_dimension], or None.

  Returns:
    A dictionary with keys specifying the output_type (e.g., semantic
    prediction) and values storing Tensors representing predictions (argmax
    over channels). Each prediction has size [batch, height, width].
    If also_return_softmax_probabilities is True, the second return value is
    the softmax probabilities.
    If also_return_embeddings is True, it will also return an embeddings
    tensor of shape [batch, height, width, embedding_dimension].

  Raises:
    ValueError: If classification_loss is not softmax, softmax_with_attention,
      or triplet.
  """
  if (model_options.classification_loss == 'triplet' and
      reference_labels is None):
    raise ValueError('Need reference_labels for triplet loss')

  if model_options.classification_loss == 'softmax_with_attention':
    if embedding_dimension is None:
      raise ValueError('Need embedding_dimension for softmax_with_attention '
                       'loss')
    if reference_labels is None:
      raise ValueError('Need reference_labels for softmax_with_attention loss')
    res = (
        multi_scale_logits_with_nearest_neighbor_matching(
            images,
            model_options=model_options,
            image_pyramid=image_pyramid,
            is_training=False,
            reference_labels=reference_labels,
            clone_batch_size=1,
            num_frames_per_video=num_frames_per_video,
            embedding_dimension=embedding_dimension,
            max_neighbors_per_object=0,
            k_nearest_neighbors=k_nearest_neighbors,
            use_softmax_feedback=use_softmax_feedback,
            initial_softmax_feedback=initial_softmax_feedback,
            embedding_seg_feature_dimension=embedding_seg_feature_dimension,
            embedding_seg_n_layers=embedding_seg_n_layers,
            embedding_seg_kernel_size=embedding_seg_kernel_size,
            embedding_seg_atrous_rates=embedding_seg_atrous_rates,
            normalize_nearest_neighbor_distances=
            normalize_nearest_neighbor_distances,
            also_attend_to_previous_frame=also_attend_to_previous_frame,
            use_local_previous_frame_attention=
            use_local_previous_frame_attention,
            previous_frame_attention_window_size=
            previous_frame_attention_window_size,
            use_first_frame_matching=use_first_frame_matching,
            also_return_embeddings=also_return_embeddings,
            ref_embeddings=ref_embeddings
        ))
    if also_return_embeddings:
      outputs_to_scales_to_logits, embeddings = res
    else:
      outputs_to_scales_to_logits = res
      embeddings = None
  else:
    outputs_to_scales_to_logits = multi_scale_logits_v2(
        images,
        model_options=model_options,
        image_pyramid=image_pyramid,
        is_training=False,
        fine_tune_batch_norm=False)

  predictions = {}
  for output in sorted(outputs_to_scales_to_logits):
    scales_to_logits = outputs_to_scales_to_logits[output]
    original_logits = scales_to_logits[MERGED_LOGITS_SCOPE]
    if isinstance(original_logits, list):
      assert len(original_logits) == 1
      original_logits = original_logits[0]
    logits = tf.image.resize_bilinear(original_logits, tf.shape(images)[1:3],
                                      align_corners=True)
    if model_options.classification_loss in ('softmax',
                                             'softmax_with_attention'):
      predictions[output] = tf.argmax(logits, 3)
    elif model_options.classification_loss == 'triplet':
      # To keep this fast, we do the nearest neighbor assignment at the
      # resolution at which the embedding is extracted and scale the result
      # up afterwards.
      embeddings = original_logits
      reference_labels_logits_size = tf.squeeze(
          tf.image.resize_nearest_neighbor(
              reference_labels[tf.newaxis],
              train_utils.resolve_shape(embeddings)[1:3],
              align_corners=True), axis=0)
      nn_labels = embedding_utils.assign_labels_by_nearest_neighbors(
          embeddings[0], embeddings[1:], reference_labels_logits_size,
          k_nearest_neighbors)
      predictions[common.OUTPUT_TYPE] = tf.image.resize_nearest_neighbor(
          nn_labels, tf.shape(images)[1:3], align_corners=True)
    else:
      raise ValueError(
          'Only support softmax, triplet, or softmax_with_attention for '
          'classification_loss.')

  if also_return_embeddings:
    assert also_return_softmax_probabilities
    return predictions, tf.nn.softmax(original_logits, axis=-1), embeddings
  elif also_return_softmax_probabilities:
    return predictions, tf.nn.softmax(original_logits, axis=-1)
  else:
    return predictions
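

# Example usage (a minimal sketch, not part of the original code): running
# inference on the frames of one video with attention to both the first
# (reference) frame and the previous frame. `model_options` is assumed to be
# configured elsewhere with classification_loss='softmax_with_attention';
# the tensor names and the embedding dimension below are illustrative.
#
#   predictions, softmax_probs = predict_labels(
#       video_frames,                         # [batch, height, width, 3]
#       model_options,
#       reference_labels=first_frame_labels,  # [batch, height, width, 1]
#       embedding_dimension=100,
#       num_frames_per_video=num_frames,      # Python int
#       use_softmax_feedback=True,
#       also_attend_to_previous_frame=True,
#       also_return_softmax_probabilities=True)
#   semantic_prediction = predictions[common.OUTPUT_TYPE]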


def multi_scale_logits_with_nearest_neighbor_matching(
    images,
    model_options,
    image_pyramid,
    clone_batch_size,
    reference_labels,
    num_frames_per_video,
    embedding_dimension,
    max_neighbors_per_object,
    weight_decay=0.0001,
    is_training=False,
    fine_tune_batch_norm=False,
    k_nearest_neighbors=1,
    use_softmax_feedback=False,
    initial_softmax_feedback=None,
    embedding_seg_feature_dimension=256,
    embedding_seg_n_layers=4,
    embedding_seg_kernel_size=7,
    embedding_seg_atrous_rates=None,
    normalize_nearest_neighbor_distances=False,
    also_attend_to_previous_frame=False,
    damage_initial_previous_frame_mask=False,
    use_local_previous_frame_attention=False,
    previous_frame_attention_window_size=9,
    use_first_frame_matching=True,
    also_return_embeddings=False,
    ref_embeddings=None):
  """Gets the logits for multi-scale inputs using nearest neighbor attention.

  Adjusted version of multi_scale_logits_v2 to support nearest neighbor
  attention and a variable number of classes for each element of the batch.

  The returned logits are all downsampled (due to max-pooling layers)
  for both training and evaluation.

  Args:
    images: A tensor of size [batch, height, width, channels].
    model_options: A ModelOptions instance to configure models.
    image_pyramid: Input image scales for multi-scale feature extraction.
    clone_batch_size: Integer, the number of videos in a batch.
    reference_labels: The segmentation labels of the reference frame on which
      attention is applied.
    num_frames_per_video: Integer, the number of frames per video.
    embedding_dimension: Integer, the dimension of the embedding.
    max_neighbors_per_object: Integer, the maximum number of candidates
      for the nearest neighbor query per object after subsampling.
      Can be 0 for no subsampling.
    weight_decay: The weight decay for model variables.
    is_training: Is training or not.
    fine_tune_batch_norm: Fine-tune the batch norm parameters or not.
    k_nearest_neighbors: Integer, the number of nearest neighbors to use.
    use_softmax_feedback: Boolean, whether to give the softmax predictions of
      the last frame as additional input to the segmentation head.
    initial_softmax_feedback: List of Float32 tensors, or None. Can be used
      to initialize the softmax predictions used for the feedback loop.
      Only has an effect if use_softmax_feedback is True.
    embedding_seg_feature_dimension: Integer, the dimensionality used in the
      segmentation head layers.
    embedding_seg_n_layers: Integer, the number of layers in the segmentation
      head.
    embedding_seg_kernel_size: Integer, the kernel size used in the
      segmentation head.
    embedding_seg_atrous_rates: List of integers of length
      embedding_seg_n_layers, the atrous rates to use for the segmentation
      head.
    normalize_nearest_neighbor_distances: Boolean, whether to normalize the
      nearest neighbor distances to [0, 1] using sigmoid, scale and shift.
    also_attend_to_previous_frame: Boolean, whether to also use nearest
      neighbor attention with respect to the previous frame.
    damage_initial_previous_frame_mask: Boolean, whether to artificially
      damage the initial previous frame mask. Only has an effect if
      also_attend_to_previous_frame is True.
    use_local_previous_frame_attention: Boolean, whether to restrict the
      previous frame attention to a local search window. Only has an effect
      if also_attend_to_previous_frame is True.
    previous_frame_attention_window_size: Integer, the window size used for
      local previous frame attention, if use_local_previous_frame_attention
      is True.
    use_first_frame_matching: Boolean, whether to extract features by matching
      to the reference frame. This should always be True except for ablation
      experiments.
    also_return_embeddings: Boolean, whether to return the embeddings as well.
    ref_embeddings: Tuple of (first_frame_embeddings,
      previous_frame_embeddings), each of shape
      [batch, height, width, embedding_dimension], or None.

  Returns:
    outputs_to_scales_to_logits: A map of maps from output_type (e.g.,
      semantic prediction) to a dictionary of multi-scale logits names to
      logits. For each output_type, the dictionary has keys which
      correspond to the scales and values which correspond to the logits.
      For example, if `scales` equals [1.0, 1.5], then the keys would
      include 'merged_logits', 'logits_1.00' and 'logits_1.50'.
    If also_return_embeddings is True, it will also return an embeddings
    tensor of shape [batch, height, width, embedding_dimension].

  Raises:
    ValueError: If model_options doesn't specify crop_size and its
      add_image_level_feature = True, since add_image_level_feature requires
      crop_size information.
  """
  # Setup default values.
  if not image_pyramid:
    image_pyramid = [1.0]
  crop_height = (
      model_options.crop_size[0]
      if model_options.crop_size else tf.shape(images)[1])
  crop_width = (
      model_options.crop_size[1]
      if model_options.crop_size else tf.shape(images)[2])

  # Compute the height, width for the output logits.
  if model_options.decoder_output_stride:
    logits_output_stride = min(model_options.decoder_output_stride)
  else:
    logits_output_stride = model_options.output_stride
  logits_height = scale_dimension(
      crop_height,
      max(1.0, max(image_pyramid)) / logits_output_stride)
  logits_width = scale_dimension(
      crop_width,
      max(1.0, max(image_pyramid)) / logits_output_stride)

  # Compute the logits for each scale in the image pyramid.
  outputs_to_scales_to_logits = {
      k: {}
      for k in model_options.outputs_to_num_classes
  }

  for image_scale in image_pyramid:
    if image_scale != 1.0:
      scaled_height = scale_dimension(crop_height, image_scale)
      scaled_width = scale_dimension(crop_width, image_scale)
      scaled_crop_size = [scaled_height, scaled_width]
      scaled_images = tf.image.resize_bilinear(
          images, scaled_crop_size, align_corners=True)
      scaled_reference_labels = tf.image.resize_nearest_neighbor(
          reference_labels, scaled_crop_size, align_corners=True)
      if model_options.crop_size is None:
        scaled_crop_size = None
      if model_options.crop_size:
        scaled_images.set_shape([None, scaled_height, scaled_width, 3])
    else:
      scaled_crop_size = model_options.crop_size
      scaled_images = images
      scaled_reference_labels = reference_labels

    updated_options = model_options._replace(crop_size=scaled_crop_size)
    res = embedding_utils.get_logits_with_matching(
        scaled_images,
        updated_options,
        weight_decay=weight_decay,
        reuse=tf.AUTO_REUSE,
        is_training=is_training,
        fine_tune_batch_norm=fine_tune_batch_norm,
        reference_labels=scaled_reference_labels,
        batch_size=clone_batch_size,
        num_frames_per_video=num_frames_per_video,
        embedding_dimension=embedding_dimension,
        max_neighbors_per_object=max_neighbors_per_object,
        k_nearest_neighbors=k_nearest_neighbors,
        use_softmax_feedback=use_softmax_feedback,
        initial_softmax_feedback=initial_softmax_feedback,
        embedding_seg_feature_dimension=embedding_seg_feature_dimension,
        embedding_seg_n_layers=embedding_seg_n_layers,
        embedding_seg_kernel_size=embedding_seg_kernel_size,
        embedding_seg_atrous_rates=embedding_seg_atrous_rates,
        normalize_nearest_neighbor_distances=
        normalize_nearest_neighbor_distances,
        also_attend_to_previous_frame=also_attend_to_previous_frame,
        damage_initial_previous_frame_mask=damage_initial_previous_frame_mask,
        use_local_previous_frame_attention=use_local_previous_frame_attention,
        previous_frame_attention_window_size=
        previous_frame_attention_window_size,
        use_first_frame_matching=use_first_frame_matching,
        also_return_embeddings=also_return_embeddings,
        ref_embeddings=ref_embeddings
    )
    if also_return_embeddings:
      outputs_to_logits, embeddings = res
    else:
      outputs_to_logits = res
      embeddings = None

    # Resize the logits to have the same dimension before merging.
    for output in sorted(outputs_to_logits):
      if isinstance(outputs_to_logits[output], collections.Sequence):
        outputs_to_logits[output] = [
            tf.image.resize_bilinear(
                x, [logits_height, logits_width], align_corners=True)
            for x in outputs_to_logits[output]]
      else:
        outputs_to_logits[output] = tf.image.resize_bilinear(
            outputs_to_logits[output], [logits_height, logits_width],
            align_corners=True)

    # Return when only one input scale.
    if len(image_pyramid) == 1:
      for output in sorted(model_options.outputs_to_num_classes):
        outputs_to_scales_to_logits[output][
            MERGED_LOGITS_SCOPE] = outputs_to_logits[output]
      if also_return_embeddings:
        return outputs_to_scales_to_logits, embeddings
      else:
        return outputs_to_scales_to_logits

    # Save logits to the output map.
    for output in sorted(model_options.outputs_to_num_classes):
      outputs_to_scales_to_logits[output][
          'logits_%.2f' % image_scale] = outputs_to_logits[output]

  # Merge the logits from all the multi-scale inputs.
  for output in sorted(model_options.outputs_to_num_classes):
    # Concatenate the multi-scale logits for each output type.
    all_logits = [
        [tf.expand_dims(l, axis=4)]
        for logits in outputs_to_scales_to_logits[output].values()
        for l in logits
    ]
    transposed = map(list, zip(*all_logits))
    all_logits = [tf.concat(t, 4) for t in transposed]
    merge_fn = (
        tf.reduce_max
        if model_options.merge_method == 'max' else tf.reduce_mean)
    outputs_to_scales_to_logits[output][MERGED_LOGITS_SCOPE] = [merge_fn(
        l, axis=4) for l in all_logits]

  if also_return_embeddings:
    return outputs_to_scales_to_logits, embeddings
  else:
    return outputs_to_scales_to_logits
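

# Shape of the result (an illustrative sketch derived from the docstring and
# merge code above, not output produced by this module): for
# image_pyramid=[1.0, 1.5] and a single 'semantic' output, the returned map
# has the form
#
#   {
#       'semantic': {
#           'logits_1.00': [<float32 Tensor [batch, h, w, channels]>],
#           'logits_1.50': [<float32 Tensor [batch, h, w, channels]>],
#           'merged_logits': [<float32 Tensor [batch, h, w, channels]>],
#       }
#   }
#
# where 'merged_logits' is the elementwise max (merge_method == 'max') or
# mean over the per-scale logits, all resized to
# [logits_height, logits_width] before merging.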