Spaces:

NCTCMumbai
/

NCTC

Running

File size: 46,115 Bytes

0b8359d

# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Utilities for the instance embedding for segmentation."""

import numpy as np
import tensorflow as tf
from deeplab import model
from deeplab.core import preprocess_utils
from feelvos.utils import mask_damaging

# Module-level shorthands used throughout this file.
slim = tf.contrib.slim
resolve_shape = preprocess_utils.resolve_shape
# Large constant that is added to the distances of reference pixels carrying
# a different object label, so that they effectively drop out of the nearest
# neighbor search for that object.
WRONG_LABEL_PADDING_DISTANCE = 1e20

# With correlation_cost local matching will be much faster. But we provide a
# slow fallback for convenience.
# NOTE: correlation_cost_op is only imported when this flag is True, so
# cross_correlate() and local_pairwise_distances() can only be called in
# that configuration.
USE_CORRELATION_COST = False
if USE_CORRELATION_COST:
  # pylint: disable=g-import-not-at-top
  from correlation_cost.python.ops import correlation_cost_op


def pairwise_distances(x, y):
  """Returns the squared l2 distance between every row of x and every row of y.

  Relies on the expansion |a - b|^2 = |a|^2 + |b|^2 - 2 * <a, b>, so that the
  cross term can be evaluated with a single matrix multiplication.

  Args:
    x: Tensor of shape [n, feature_dim].
    y: Tensor of shape [m, feature_dim].

  Returns:
    Distances tensor of shape [n, m].
  """
  x_sq_norms = tf.reduce_sum(x * x, axis=1)
  y_sq_norms = tf.reduce_sum(y * y, axis=1)
  cross_terms = tf.matmul(x, y, transpose_b=True)
  # Broadcast the norms as a column (x) and as a row (y) over the [n, m] grid.
  return (x_sq_norms[:, tf.newaxis] + y_sq_norms[tf.newaxis, :]
          - 2 * cross_terms)


def pairwise_distances2(x, y):
  """Reference implementation of the pairwise squared l2 distances.

  Materializes the full [n, m, feature_dim] tensor of differences, so it needs
  a lot of memory. Mainly useful for checking the efficient implementation.

  Args:
    x: Tensor of shape [n, feature_dim].
    y: Tensor of shape [m, feature_dim].

  Returns:
    distances of shape [n, m].
  """
  x_expanded = x[:, tf.newaxis]
  y_expanded = y[tf.newaxis, :]
  squared_diffs = tf.squared_difference(x_expanded, y_expanded)
  return tf.reduce_sum(squared_diffs, axis=-1)


def cross_correlate(x, y, max_distance=9):
  """Cross-correlates x with y inside a local window via correlation_cost.

  The result is the un-normalized sum over the feature dimension (the
  underlying op averages, which is undone here).

  Args:
    x: Float32 tensor of shape [height, width, feature_dim].
    y: Float32 tensor of shape [height, width, feature_dim].
    max_distance: Integer, the maximum distance in pixel coordinates
      per dimension which is considered to be in the search window.

  Returns:
    Float32 tensor of shape [height, width, (2 * max_distance + 1) ** 2].
  """
  with tf.name_scope('cross_correlation'):
    # The op works on batches, so wrap both inputs into a batch of size one.
    batched_x = x[tf.newaxis]
    batched_y = y[tf.newaxis]
    batched_corr = correlation_cost_op.correlation_cost(
        batched_x,
        batched_y,
        kernel_size=1,
        max_displacement=max_distance,
        stride_1=1,
        stride_2=1,
        pad=max_distance)
    mean_corr = tf.squeeze(batched_corr, axis=0)
    # correlation_cost averages over feature_dim; rescale to obtain the sum.
    n_features = resolve_shape(x)[-1]
    return mean_corr * n_features


def local_pairwise_distances(x, y, max_distance=9):
  """Computes pairwise squared l2 distances using a local search window.

  Optimized implementation using correlation_cost. Requires
  USE_CORRELATION_COST to be enabled, since it relies on cross_correlate.

  Args:
    x: Float32 tensor of shape [height, width, feature_dim].
    y: Float32 tensor of shape [height, width, feature_dim].
    max_distance: Integer, the maximum distance in pixel coordinates
      per dimension which is considered to be in the search window.

  Returns:
    Float32 distances tensor of shape
      [height, width, (2 * max_distance + 1) ** 2]. Window positions that are
      detected as lying outside of the image are set to +inf.
  """
  with tf.name_scope('local_pairwise_distances'):
    # d[i,j] = (x[i] - y[j]) * (x[i] - y[j])'
    # = sum(x[i]^2, -1) + sum(y[j]^2, -1) - 2 * x[i] * y[j]'
    corr = cross_correlate(x, y, max_distance=max_distance)
    xs = tf.reduce_sum(x * x, axis=2)[..., tf.newaxis]
    ys = tf.reduce_sum(y * y, axis=2)[..., tf.newaxis]
    ones_ys = tf.ones_like(ys)
    # Correlating an all-ones map with the squared norms of y spreads every
    # squared norm over the window offsets, matching the layout of corr.
    ys = cross_correlate(ones_ys, ys, max_distance=max_distance)
    d = xs + ys - 2 * corr
    # Boundary should be set to Inf. A window position is treated as boundary
    # when the correlation of two all-ones maps is exactly zero there
    # (presumably because it falls into the zero padding of the op).
    boundary = tf.equal(
        cross_correlate(ones_ys, ones_ys, max_distance=max_distance), 0)
    # Use the builtin float here: the np.float alias was deprecated in
    # NumPy 1.20 and removed in NumPy 1.24.
    d = tf.where(boundary, tf.fill(tf.shape(d), tf.constant(float('inf'))),
                 d)
    return d


def local_pairwise_distances2(x, y, max_distance=9):
  """Computes pairwise squared l2 distances using a local search window.

  Slow fallback for when correlation_cost is not available. It pads y with a
  large constant and then, for every offset inside the search window, builds
  one shifted crop of y and compares it against x. The offsets are enumerated
  with nested Python loops, so the graph grows with the window size.

  Args:
    x: Float32 tensor of shape [height, width, feature_dim].
    y: Float32 tensor of shape [height, width, feature_dim].
    max_distance: Integer, the maximum distance in pixel coordinates
      per dimension which is considered to be in the search window.

  Returns:
    Float32 distances tensor of shape
      [height, width, (2 * max_distance + 1) ** 2].
  """
  with tf.name_scope('local_pairwise_distances2'):
    # Positions outside of the image get a huge value so that the resulting
    # distances are very large.
    padding_val = 1e20
    border = [max_distance, max_distance]
    padded_y = tf.pad(y, [border, border, [0, 0]],
                      constant_values=padding_val)
    height, width, _ = resolve_shape(x)
    window_size = 2 * max_distance + 1
    dists = []
    for row_offset in range(window_size):
      shifted_rows = padded_y[row_offset:row_offset + height]
      # One distance map per horizontal offset, appended in row-major order.
      dists.extend(
          tf.reduce_sum(
              tf.squared_difference(
                  x, shifted_rows[:, col_offset:col_offset + width]),
              axis=2)
          for col_offset in range(window_size))
    return tf.stack(dists, axis=2)


def majority_vote(labels):
  """Picks the most frequent label in every row.

  The votes are counted through a one-hot encoding whose depth is the largest
  label plus one. Labels are therefore expected to be contiguous starting
  from 0; sparse label sets still work but waste memory and compute.

  Args:
    labels: Int tensor of shape [n, k]

  Returns:
    The majority of labels along axis 1
  """
  n_classes = tf.reduce_max(labels) + 1
  # [n, k, n_classes] one-hot votes, summed over the k candidates.
  vote_counts = tf.reduce_sum(tf.one_hot(labels, depth=n_classes), axis=1)
  return tf.argmax(vote_counts, axis=1)


def assign_labels_by_nearest_neighbors(reference_embeddings, query_embeddings,
                                       reference_labels, k=1):
  """Labels every query pixel with the label of its nearest reference pixels.

  For k == 1 the label of the single closest reference embedding is used,
  for k > 1 a majority vote over the k closest reference embeddings.

  Args:
    reference_embeddings: Tensor of shape [height, width, embedding_dim],
      the embedding vectors for the reference frame
    query_embeddings: Tensor of shape [n_query_images, height, width,
      embedding_dim], the embedding vectors for the query frames
    reference_labels: Tensor of shape [height, width, 1], the class labels of
      the reference frame
    k: Integer, the number of nearest neighbors to use

  Returns:
    The labels of the nearest neighbors as [n_query_frames, height, width, 1]
    tensor

  Raises:
    ValueError: If k < 1.
  """
  if k < 1:
    raise ValueError('k must be at least 1')
  # Rows index the query pixels, columns the reference pixels.
  dists = flattened_pairwise_distances(reference_embeddings, query_embeddings)
  flat_reference_labels = tf.reshape(reference_labels, [-1])
  if k == 1:
    nearest = tf.argmin(dists, axis=1)[..., tf.newaxis]
    nn_labels = tf.squeeze(tf.gather(flat_reference_labels, nearest), axis=1)
  else:
    # top_k picks the largest values, so negate to get the smallest distances.
    _, nearest = tf.nn.top_k(-dists, k, sorted=False)
    nn_labels = majority_vote(tf.gather(flat_reference_labels, nearest))
  reference_shape = tf.shape(reference_embeddings)
  height = reference_shape[0]
  width = reference_shape[1]
  n_query_frames = query_embeddings.shape[0]
  return tf.reshape(nn_labels, [n_query_frames, height, width, 1])


def flattened_pairwise_distances(reference_embeddings, query_embeddings):
  """Calculates pairwise distances between all query and reference vectors.

  Both inputs are flattened to a list of embedding vectors first.

  Args:
    reference_embeddings: Tensor of shape [..., embedding_dim],
      the embedding vectors for the reference frame
    query_embeddings: Tensor of shape [n_query_images, height, width,
      embedding_dim], the embedding vectors for the query frames.

  Returns:
    A distance tensor of shape [query_embeddings.size / embedding_dim,
    reference_embeddings.size / embedding_dim], i.e. one row per query vector
    and one column per reference vector.
  """
  embedding_dim = resolve_shape(query_embeddings)[-1]
  flat_shape = [-1, embedding_dim]
  flat_reference = tf.reshape(reference_embeddings, flat_shape)
  flat_query = tf.reshape(query_embeddings, flat_shape)
  return pairwise_distances(flat_query, flat_reference)


def nearest_neighbor_features_per_object(
    reference_embeddings, query_embeddings, reference_labels,
    max_neighbors_per_object, k_nearest_neighbors, gt_ids=None, n_chunks=100):
  """Calculates, per object, the distance to the nearest reference pixels.

  Each query pixel is matched against the reference pixels of every object
  separately. The reference pixels may be subsampled beforehand, and the
  queries are processed in chunks to limit the memory consumption.

  Args:
    reference_embeddings: Tensor of shape [height, width, embedding_dim],
      the embedding vectors for the reference frame.
    query_embeddings: Tensor of shape [n_query_images, height, width,
      embedding_dim], the embedding vectors for the query frames.
    reference_labels: Tensor of shape [height, width, 1], the class labels of
      the reference frame.
    max_neighbors_per_object: Integer, the maximum number of candidates
      for the nearest neighbor query per object after subsampling,
      or 0 for no subsampling.
    k_nearest_neighbors: Integer, the number of nearest neighbors to use.
    gt_ids: Int tensor of shape [n_objs] of the sorted unique ground truth
      ids in the first frame. If None, it will be derived from
      reference_labels.
    n_chunks: Integer, the number of chunks to use to save memory
      (set to 1 for no chunking).

  Returns:
    nn_features: A float32 tensor of nearest neighbor features of shape
      [n_query_images, height, width, n_objects, feature_dim].
    gt_ids: An int32 tensor of the unique sorted object ids present
      in the reference labels.
  """
  with tf.name_scope('nn_features_per_object'):
    flat_labels = tf.reshape(reference_labels, [-1])
    if gt_ids is None:
      # Derive the sorted list of object ids from the reference labels.
      unique_ids, _ = tf.unique(flat_labels)
      gt_ids = tf.contrib.framework.sort(unique_ids)
    embedding_dim = resolve_shape(reference_embeddings)[-1]
    flat_reference = tf.reshape(reference_embeddings, [-1, embedding_dim])

    flat_reference, flat_labels = subsample_reference_embeddings_and_labels(
        flat_reference, flat_labels, gt_ids, max_neighbors_per_object)
    query_shape = resolve_shape(query_embeddings)
    flat_query = tf.reshape(query_embeddings, [-1, embedding_dim])
    nn_features = _nearest_neighbor_features_per_object_in_chunks(
        flat_reference, flat_query, flat_labels, gt_ids, k_nearest_neighbors,
        n_chunks)
    nn_features_dim = resolve_shape(nn_features)[-1]
    # Restore the [n_query_images, height, width] layout of the queries.
    output_shape = tf.stack(
        query_shape[:3] + [tf.size(gt_ids), nn_features_dim])
    return tf.reshape(nn_features, output_shape), gt_ids


def _nearest_neighbor_features_per_object_in_chunks(
    reference_embeddings_flat, query_embeddings_flat, reference_labels_flat,
    ref_obj_ids, k_nearest_neighbors, n_chunks):
  """Calculates the nearest neighbor features per object chunk by chunk.

  The query vectors are split into n_chunks consecutive slices which are
  processed one after another in order to bound the peak memory use.

  Args:
    reference_embeddings_flat: Tensor of shape [n, embedding_dim],
      the embedding vectors for the reference frame.
    query_embeddings_flat: Tensor of shape [m, embedding_dim], the embedding
      vectors for the query frames.
    reference_labels_flat: Tensor of shape [n], the class labels of the
      reference frame.
    ref_obj_ids: int tensor of unique object ids in the reference labels.
    k_nearest_neighbors: Integer, the number of nearest neighbors to use.
    n_chunks: Integer, the number of chunks to use to save memory
      (set to 1 for no chunking).

  Returns:
    nn_features: A float32 tensor of nearest neighbor features of shape
      [m, n_objects, feature_dim].
  """
  n_queries = tf.cast(tf.shape(query_embeddings_flat)[0], tf.float32)
  chunk_size = tf.cast(tf.ceil(n_queries / n_chunks), tf.int32)
  # [n_objects, n] mask, True where a reference pixel does NOT belong to the
  # respective object.
  wrong_label_mask = tf.not_equal(reference_labels_flat,
                                  ref_obj_ids[:, tf.newaxis])
  use_chunking = n_chunks != 1
  chunk_features = []
  for chunk_idx in range(n_chunks):
    if use_chunking:
      begin = chunk_idx * chunk_size
      end = (chunk_idx + 1) * chunk_size
      query_chunk = query_embeddings_flat[begin:end]
    else:
      query_chunk = query_embeddings_flat
    # Chain the chunks with control dependencies; if they were allowed to run
    # in parallel, chunking would not reduce the peak memory use.
    with tf.control_dependencies(chunk_features):
      features = _nn_features_per_object_for_chunk(
          reference_embeddings_flat, query_chunk, wrong_label_mask,
          k_nearest_neighbors)
    chunk_features.append(features)
  if use_chunking:
    return tf.concat(chunk_features, axis=0)
  return chunk_features[0]


def _nn_features_per_object_for_chunk(
    reference_embeddings, query_embeddings, wrong_label_mask,
    k_nearest_neighbors):
  """Extracts features for each object using nearest neighbor attention.

  Args:
    reference_embeddings: Tensor of shape [n_chunk, embedding_dim],
      the embedding vectors for the reference frame.
    query_embeddings: Tensor of shape [m_chunk, embedding_dim], the embedding
      vectors for the query frames.
    wrong_label_mask: Boolean tensor that is broadcast against the distances
      as [1, n_objects, n_chunk]; True marks reference vectors which do not
      belong to the respective object and must not be matched.
    k_nearest_neighbors: Integer, the number of nearest neighbors to use.

  Returns:
    nn_features: A float32 tensor of nearest neighbor features of shape
      [m_chunk, n_objects, feature_dim].
  """
  dists = flattened_pairwise_distances(reference_embeddings, query_embeddings)
  per_object_dists = dists[:, tf.newaxis, :]
  # Push reference vectors of other objects far away.
  label_penalty = (tf.cast(wrong_label_mask[tf.newaxis, :, :], tf.float32) *
                   WRONG_LABEL_PADDING_DISTANCE)
  dists = per_object_dists + label_penalty
  if k_nearest_neighbors == 1:
    return tf.reduce_min(dists, axis=2, keepdims=True)

  # top_k returns the largest entries, hence the double negation to obtain
  # the k smallest distances.
  neg_knn_dists, _ = tf.nn.top_k(-dists, k=k_nearest_neighbors)
  knn_dists = -neg_knn_dists
  # An object with fewer than k reference vectors yields penalized entries;
  # replace those with the farthest real neighbor of that object.
  is_real_neighbor = tf.less(knn_dists, WRONG_LABEL_PADDING_DISTANCE)
  real_dists_or_zero = knn_dists * tf.cast(is_real_neighbor, tf.float32)
  farthest_real = tf.reduce_max(real_dists_or_zero, axis=2)
  pad_dist = tf.tile(farthest_real[..., tf.newaxis],
                     multiples=[1, 1, k_nearest_neighbors])
  knn_dists = tf.where(is_real_neighbor, knn_dists, pad_dist)
  # The feature is the mean distance over the k neighbors.
  return tf.reduce_mean(knn_dists, axis=2, keepdims=True)


def create_embedding_segmentation_features(features, feature_dimension,
                                           n_layers, kernel_size, reuse,
                                           atrous_rates=None):
  """Runs the segmentation head: a stack of split separable convolutions.

  Args:
    features: input features of shape [batch, height, width, features]
    feature_dimension: Integer, the dimensionality used in the segmentation
      head layers.
    n_layers: Integer, the number of layers in the segmentation head.
    kernel_size: Integer, the kernel size used in the segmentation head.
    reuse: reuse mode for the variable_scope.
    atrous_rates: List of integers of length n_layers, the atrous rates to use.
      If None or empty, a rate of 1 is used for every layer.

  Returns:
    Features to be used to estimate the segmentation labels of shape
      [batch, height, width, embedding_seg_feat_dim].
  """
  if not atrous_rates:
    # No rates given: fall back to ordinary (non-atrous) convolutions.
    atrous_rates = [1] * n_layers
  assert len(atrous_rates) == n_layers
  with tf.variable_scope('embedding_seg', reuse=reuse):
    for layer_idx in range(n_layers):
      layer_scope = 'split_separable_conv2d_{}'.format(layer_idx)
      features = model.split_separable_conv2d(
          features,
          feature_dimension,
          kernel_size=kernel_size,
          rate=atrous_rates[layer_idx],
          scope=layer_scope)
  return features


def add_image_summaries(images, nn_features, logits, batch_size,
                        prev_frame_nn_features=None):
  """Adds image summaries of input images, attention features and logits.

  Args:
    images: Image tensor of shape [batch, height, width, channels].
    nn_features: Nearest neighbor attention features of shape
      [batch_size, height, width, n_objects, 1].
    logits: Float32 tensor of logits.
    batch_size: Integer, the number of videos per clone per mini-batch.
    prev_frame_nn_features: Nearest neighbor attention features wrt. the
      last frame of shape [batch_size, height, width, n_objects, 1].
      Can be None.
  """
  # Group the frames by video; the first frame of every video is the
  # reference image, the remaining ones are the query images.
  frame_shape = resolve_shape(images)[1:]
  images_per_video = tf.reshape(
      images, tf.stack([batch_size, -1] + frame_shape))
  reference_images = images_per_video[:, 0]
  query_images = tf.reshape(images_per_video[:, 1:],
                            tf.stack([-1] + frame_shape))
  tf.summary.image('ref_images', reference_images, max_outputs=batch_size)
  tf.summary.image('query_images', query_images, max_outputs=10)

  predicted_ids = tf.cast(tf.argmax(logits, axis=-1), tf.uint8)
  # Multiply by 32 so that small object ids become visible gray values.
  tf.summary.image('predictions', predicted_ids[..., tf.newaxis] * 32,
                   max_outputs=10)

  # Only the first feature dimension of the first two objects is visualized.
  # NOTE(review): object index 0 is tagged 'fg' and index 1 'bg'; if index 0
  # is the background these tags may be swapped -- confirm before relying on
  # the summary names.
  summary_specs = (
      ('nn_fg_features', 'nn_fg_features_prev', slice(0, 1)),
      ('nn_bg_features', 'nn_bg_features_prev', slice(1, 2)),
  )
  for tag, prev_tag, object_slice in summary_specs:
    tf.summary.image(tag, nn_features[..., object_slice, 0],
                     max_outputs=batch_size)
    if prev_frame_nn_features is not None:
      tf.summary.image(prev_tag,
                       prev_frame_nn_features[..., object_slice, 0],
                       max_outputs=batch_size)


def get_embeddings(images, model_options, embedding_dimension):
  """Computes the embedding vectors for images in inference mode.

  Runs the backbone (and the decoder, if configured) with is_training=False
  and projects the resulting features to the embedding space.

  Args:
    images: A tensor of shape [batch, height, width, channels].
    model_options: A ModelOptions instance to configure models.
    embedding_dimension: Integer, the dimension of the embedding.

  Returns:
    embeddings: A tensor of shape [batch, height, width, embedding_dimension].
  """
  features, end_points = model.extract_features(
      images, model_options, is_training=False)

  use_decoder = model_options.decoder_output_stride is not None
  if use_decoder:
    finest_stride = min(model_options.decoder_output_stride)
    if model_options.crop_size is not None:
      height, width = model_options.crop_size
    else:
      # No fixed crop size: read the spatial size from the input at run time.
      height = tf.shape(images)[1]
      width = tf.shape(images)[2]
    features = model.refine_by_decoder(
        features,
        end_points,
        crop_size=[height, width],
        decoder_output_stride=[finest_stride],
        decoder_use_separable_conv=model_options.decoder_use_separable_conv,
        model_variant=model_options.model_variant,
        is_training=False)

  with tf.variable_scope('embedding'):
    return split_separable_conv2d_with_identity_initializer(
        features, embedding_dimension, scope='split_separable_conv2d')


def get_logits_with_matching(images,
                             model_options,
                             weight_decay=0.0001,
                             reuse=None,
                             is_training=False,
                             fine_tune_batch_norm=False,
                             reference_labels=None,
                             batch_size=None,
                             num_frames_per_video=None,
                             embedding_dimension=None,
                             max_neighbors_per_object=0,
                             k_nearest_neighbors=1,
                             use_softmax_feedback=True,
                             initial_softmax_feedback=None,
                             embedding_seg_feature_dimension=256,
                             embedding_seg_n_layers=4,
                             embedding_seg_kernel_size=7,
                             embedding_seg_atrous_rates=None,
                             normalize_nearest_neighbor_distances=True,
                             also_attend_to_previous_frame=True,
                             damage_initial_previous_frame_mask=False,
                             use_local_previous_frame_attention=True,
                             previous_frame_attention_window_size=15,
                             use_first_frame_matching=True,
                             also_return_embeddings=False,
                             ref_embeddings=None):
  """Gets the logits by atrous/image spatial pyramid pooling using attention.

  The function extracts backbone features and pixel-wise embeddings, matches
  the query frames against the reference frame (and optionally the previous
  frame) per object, and runs a segmentation head once per video and time
  step on the concatenation of backbone features, matching features and the
  previous softmax output.

  Args:
    images: A tensor of size [batch, height, width, channels].
    model_options: A ModelOptions instance to configure models. Exactly one
      entry is expected in model_options.outputs_to_num_classes.
    weight_decay: The weight decay for model variables.
    reuse: Reuse the model variables or not.
    is_training: Is training or not.
    fine_tune_batch_norm: Fine-tune the batch norm parameters or not.
    reference_labels: The segmentation labels of the reference frame on which
      attention is applied.
    batch_size: Integer, the number of videos on a batch. Has to be a Python
      integer since it is used to unroll a Python loop.
    num_frames_per_video: Integer, the number of frames per video
    embedding_dimension: Integer, the dimension of the embedding
    max_neighbors_per_object: Integer, the maximum number of candidates
      for the nearest neighbor query per object after subsampling.
      Can be 0 for no subsampling.
    k_nearest_neighbors: Integer, the number of nearest neighbors to use.
    use_softmax_feedback: Boolean, whether to give the softmax predictions of
      the last frame as additional input to the segmentation head.
    initial_softmax_feedback: List of Float32 tensors, or None. Can be used to
      initialize the softmax predictions used for the feedback loop.
      Only has an effect if use_softmax_feedback is True.
    embedding_seg_feature_dimension: Integer, the dimensionality used in the
      segmentation head layers.
    embedding_seg_n_layers: Integer, the number of layers in the segmentation
      head.
    embedding_seg_kernel_size: Integer, the kernel size used in the
      segmentation head.
    embedding_seg_atrous_rates: List of integers of length
      embedding_seg_n_layers, the atrous rates to use for the segmentation head.
    normalize_nearest_neighbor_distances: Boolean, whether to normalize the
      nearest neighbor distances to [0,1] using sigmoid, scale and shift.
    also_attend_to_previous_frame: Boolean, whether to also use nearest
      neighbor attention with respect to the previous frame.
    damage_initial_previous_frame_mask: Boolean, whether to artificially damage
      the initial previous frame mask. Only has an effect if
      also_attend_to_previous_frame is True.
    use_local_previous_frame_attention: Boolean, whether to restrict the
      previous frame attention to a local search window.
      Only has an effect, if also_attend_to_previous_frame is True.
    previous_frame_attention_window_size: Integer, the window size used for
      local previous frame attention, if use_local_previous_frame_attention
      is True.
    use_first_frame_matching: Boolean, whether to extract features by matching
      to the reference frame. This should always be true except for ablation
      experiments.
    also_return_embeddings: Boolean, whether to return the embeddings as well.
    ref_embeddings: Tuple of
      (first_frame_embeddings, previous_frame_embeddings),
      each of shape [batch, height, width, embedding_dimension], or None.
  Returns:
    outputs_to_logits: A map from output_type to a list of logits tensors,
      one list entry per video of the batch.
    If also_return_embeddings is True, it will also return an embeddings
      tensor of shape [batch, height, width, embedding_dimension].
  """
  features, end_points = model.extract_features(
      images,
      model_options,
      weight_decay=weight_decay,
      reuse=reuse,
      is_training=is_training,
      fine_tune_batch_norm=fine_tune_batch_norm)

  # NOTE(review): decoder_height and decoder_width are only defined inside
  # this branch but are used unconditionally below, so a configuration
  # without decoder_output_stride will fail with a NameError.
  if model_options.decoder_output_stride:
    decoder_output_stride = min(model_options.decoder_output_stride)
    if model_options.crop_size is None:
      height = tf.shape(images)[1]
      width = tf.shape(images)[2]
    else:
      height, width = model_options.crop_size
    decoder_height = model.scale_dimension(height, 1.0 / decoder_output_stride)
    decoder_width = model.scale_dimension(width, 1.0 / decoder_output_stride)
    features = model.refine_by_decoder(
        features,
        end_points,
        crop_size=[height, width],
        decoder_output_stride=[decoder_output_stride],
        decoder_use_separable_conv=model_options.decoder_use_separable_conv,
        model_variant=model_options.model_variant,
        weight_decay=weight_decay,
        reuse=reuse,
        is_training=is_training,
        fine_tune_batch_norm=fine_tune_batch_norm)

  with tf.variable_scope('embedding', reuse=reuse):
    embeddings = split_separable_conv2d_with_identity_initializer(
        features, embedding_dimension, scope='split_separable_conv2d')
    embeddings = tf.identity(embeddings, name='embeddings')
  # Bring the labels to the spatial resolution of the embeddings.
  scaled_reference_labels = tf.image.resize_nearest_neighbor(
      reference_labels,
      resolve_shape(embeddings, 4)[1:3],
      align_corners=True)
  h, w = decoder_height, decoder_width
  if num_frames_per_video is None:
    num_frames_per_video = tf.size(embeddings) // (
        batch_size * h * w * embedding_dimension)
  # Split the flat batch dimension into [video, frame].
  new_labels_shape = tf.stack([batch_size, -1, h, w, 1])
  reshaped_reference_labels = tf.reshape(scaled_reference_labels,
                                         new_labels_shape)
  new_embeddings_shape = tf.stack([batch_size,
                                   num_frames_per_video, h, w,
                                   embedding_dimension])
  reshaped_embeddings = tf.reshape(embeddings, new_embeddings_shape)
  all_nn_features = []
  all_ref_obj_ids = []
  # To keep things simple, we do all this separate for each sequence for now.
  for n in range(batch_size):
    embedding = reshaped_embeddings[n]
    if ref_embeddings is None:
      n_chunks = 100
      # Frame 0 is the reference frame. When previous frame attention or
      # softmax feedback is used, frame 1 serves as the initial previous
      # frame, so the queries start at frame 2.
      reference_embedding = embedding[0]
      if also_attend_to_previous_frame or use_softmax_feedback:
        queries_embedding = embedding[2:]
      else:
        queries_embedding = embedding[1:]
    else:
      if USE_CORRELATION_COST:
        n_chunks = 20
      else:
        n_chunks = 500
      # The reference embeddings are given, so every frame is a query.
      reference_embedding = ref_embeddings[0][n]
      queries_embedding = embedding
    reference_labels = reshaped_reference_labels[n][0]
    nn_features_n, ref_obj_ids = nearest_neighbor_features_per_object(
        reference_embedding, queries_embedding, reference_labels,
        max_neighbors_per_object, k_nearest_neighbors, n_chunks=n_chunks)
    if normalize_nearest_neighbor_distances:
      # Map the non-negative distances to [0, 1).
      nn_features_n = (tf.nn.sigmoid(nn_features_n) - 0.5) * 2
    all_nn_features.append(nn_features_n)
    all_ref_obj_ids.append(ref_obj_ids)

  feat_dim = resolve_shape(features)[-1]
  features = tf.reshape(features, tf.stack(
      [batch_size, num_frames_per_video, h, w, feat_dim]))
  if ref_embeddings is None:
    # Strip the features for the reference frame.
    if also_attend_to_previous_frame or use_softmax_feedback:
      features = features[:, 2:]
    else:
      features = features[:, 1:]

  # To keep things simple, we do all this separate for each sequence for now.
  outputs_to_logits = {output: [] for
                       output in model_options.outputs_to_num_classes}
  for n in range(batch_size):
    features_n = features[n]
    nn_features_n = all_nn_features[n]
    # Move the object dimension to the front so that the objects take the
    # role of the batch dimension in the segmentation head.
    nn_features_n_tr = tf.transpose(nn_features_n, [3, 0, 1, 2, 4])
    n_objs = tf.shape(nn_features_n_tr)[0]
    # Repeat features for every object.
    features_n_tiled = tf.tile(features_n[tf.newaxis],
                               multiples=[n_objs, 1, 1, 1, 1])
    prev_frame_labels = None
    if also_attend_to_previous_frame:
      prev_frame_labels = reshaped_reference_labels[n, 1]
      if is_training and damage_initial_previous_frame_mask:
        # Damage the previous frame masks.
        prev_frame_labels = mask_damaging.damage_masks(prev_frame_labels,
                                                       dilate=False)
      tf.summary.image('prev_frame_labels',
                       tf.cast(prev_frame_labels[tf.newaxis],
                               tf.uint8) * 32)
      initial_softmax_feedback_n = create_initial_softmax_from_labels(
          prev_frame_labels, reshaped_reference_labels[n][0],
          decoder_output_stride=None, reduce_labels=True)
    elif initial_softmax_feedback is not None:
      initial_softmax_feedback_n = initial_softmax_feedback[n]
    else:
      initial_softmax_feedback_n = None
    if initial_softmax_feedback_n is None:
      last_softmax = tf.zeros((n_objs, h, w, 1), dtype=tf.float32)
    else:
      last_softmax = tf.transpose(initial_softmax_feedback_n, [2, 0, 1])[
          ..., tf.newaxis]
    assert len(model_options.outputs_to_num_classes) == 1
    # dict.keys() is a non-subscriptable view on Python 3, so materialize it
    # before taking the single output name.
    output = list(model_options.outputs_to_num_classes.keys())[0]
    logits = []
    n_ref_frames = 1
    prev_frame_nn_features_n = None
    if also_attend_to_previous_frame or use_softmax_feedback:
      n_ref_frames += 1
    if ref_embeddings is not None:
      n_ref_frames = 0
    for t in range(num_frames_per_video - n_ref_frames):
      to_concat = [features_n_tiled[:, t]]
      if use_first_frame_matching:
        to_concat.append(nn_features_n_tr[:, t])
      if use_softmax_feedback:
        to_concat.append(last_softmax)
      if also_attend_to_previous_frame:
        assert normalize_nearest_neighbor_distances, (
            'previous frame attention currently only works when normalized '
            'distances are used')
        embedding = reshaped_embeddings[n]
        if ref_embeddings is None:
          last_frame_embedding = embedding[t + 1]
          query_embeddings = embedding[t + 2, tf.newaxis]
        else:
          # NOTE(review): this always reads batch element 0 of the previous
          # frame embeddings, whereas the first frame embeddings above are
          # indexed with n -- confirm that this is intended for
          # batch_size > 1.
          last_frame_embedding = ref_embeddings[1][0]
          query_embeddings = embedding
        if use_local_previous_frame_attention:
          assert query_embeddings.shape[0] == 1
          prev_frame_nn_features_n = (
              local_previous_frame_nearest_neighbor_features_per_object(
                  last_frame_embedding,
                  query_embeddings[0],
                  prev_frame_labels,
                  all_ref_obj_ids[n],
                  max_distance=previous_frame_attention_window_size)
          )
        else:
          prev_frame_nn_features_n, _ = (
              nearest_neighbor_features_per_object(
                  last_frame_embedding, query_embeddings, prev_frame_labels,
                  max_neighbors_per_object, k_nearest_neighbors,
                  gt_ids=all_ref_obj_ids[n]))
          prev_frame_nn_features_n = (tf.nn.sigmoid(
              prev_frame_nn_features_n) - 0.5) * 2
        prev_frame_nn_features_n_sq = tf.squeeze(prev_frame_nn_features_n,
                                                 axis=0)
        prev_frame_nn_features_n_tr = tf.transpose(
            prev_frame_nn_features_n_sq, [2, 0, 1, 3])
        to_concat.append(prev_frame_nn_features_n_tr)
      features_n_concat_t = tf.concat(to_concat, axis=-1)
      # The segmentation head variables are created once (first video, first
      # time step) and have to be reused afterwards, exactly like the logits
      # variables below.
      embedding_seg_features_n_t = (
          create_embedding_segmentation_features(
              features_n_concat_t, embedding_seg_feature_dimension,
              embedding_seg_n_layers, embedding_seg_kernel_size,
              reuse or n > 0 or t > 0,
              atrous_rates=embedding_seg_atrous_rates))
      # One logit per object; the objects are normalized against each other
      # with a softmax over the object axis below.
      logits_t = model.get_branch_logits(
          embedding_seg_features_n_t,
          1,
          model_options.atrous_rates,
          aspp_with_batch_norm=model_options.aspp_with_batch_norm,
          kernel_size=model_options.logits_kernel_size,
          weight_decay=weight_decay,
          reuse=reuse or n > 0 or t > 0,
          scope_suffix=output
      )
      logits.append(logits_t)
      # The prediction of this frame becomes the previous frame information
      # for the next time step.
      prev_frame_labels = tf.transpose(tf.argmax(logits_t, axis=0),
                                       [2, 0, 1])
      last_softmax = tf.nn.softmax(logits_t, axis=0)
    logits = tf.stack(logits, axis=1)
    logits_shape = tf.stack(
        [n_objs, num_frames_per_video - n_ref_frames] +
        resolve_shape(logits)[2:-1])
    logits_reshaped = tf.reshape(logits, logits_shape)
    # Move the object axis to the end: [frames, height, width, n_objs].
    logits_transposed = tf.transpose(logits_reshaped, [1, 2, 3, 0])
    outputs_to_logits[output].append(logits_transposed)

    add_image_summaries(
        images[n * num_frames_per_video: (n+1) * num_frames_per_video],
        nn_features_n,
        logits_transposed,
        batch_size=1,
        prev_frame_nn_features=prev_frame_nn_features_n)
  if also_return_embeddings:
    return outputs_to_logits, embeddings
  else:
    return outputs_to_logits


def subsample_reference_embeddings_and_labels(
    reference_embeddings_flat, reference_labels_flat, ref_obj_ids,
    max_neighbors_per_object):
  """Randomly keeps at most a fixed number of reference entries per object.

  For every id in ref_obj_ids, up to max_neighbors_per_object entries carrying
  that label are drawn at random; the embeddings and labels of all drawn
  entries are gathered and returned.

  Args:
    reference_embeddings_flat: Tensor of shape [n, embedding_dim],
      the embedding vectors for the reference frame.
    reference_labels_flat: Tensor with the class labels of the reference
      frame, one per embedding vector. Previously documented as [n, 1].
      NOTE(review): the broadcast against ref_obj_ids below only produces a
      [n_objs, n] mask when this tensor has shape [n] - confirm with callers.
    ref_obj_ids: An int32 tensor of the unique object ids present
      in the reference labels.
    max_neighbors_per_object: Integer, the maximum number of candidates
      for the nearest neighbor query per object after subsampling,
      or 0 for no subsampling.

  Returns:
    reference_embeddings_flat: Tensor of shape [n_sub, embedding_dim],
      the subsampled embedding vectors for the reference frame.
    reference_labels_flat: Tensor holding the class labels that belong to the
      subsampled embedding vectors.
  """
  if max_neighbors_per_object == 0:
    # Subsampling is disabled: hand the inputs back untouched.
    return reference_embeddings_flat, reference_labels_flat

  # Row i marks the reference entries whose label equals ref_obj_ids[i].
  labels_as_row = reference_labels_flat[tf.newaxis, :]
  obj_ids_as_column = ref_obj_ids[:, tf.newaxis]
  per_object_mask = tf.equal(labels_as_row, obj_ids_as_column)

  # map_fn slices every input along its first axis, so the limit is
  # replicated once per object.
  limit = tf.constant(max_neighbors_per_object)[tf.newaxis]
  limit_per_object = tf.tile(limit, multiples=[tf.size(ref_obj_ids)])

  # Somehow map_fn on GPU caused trouble sometimes, so let's put it on CPU
  # for now.
  with tf.device('cpu:0'):
    padded_indices = tf.map_fn(
        _create_subsampling_mask,
        (per_object_mask, limit_per_object),
        dtype=tf.int64,
        name='subsample_labels_map_fn',
        parallel_iterations=1)

  # _create_subsampling_mask pads short index lists with -1; drop the padding
  # so that only real indices are gathered.
  padding_marker = tf.constant(-1, dtype=tf.int64)
  is_real_index = tf.not_equal(padded_indices, padding_marker)
  kept_indices = tf.boolean_mask(padded_indices, is_real_index)

  subsampled_embeddings = tf.gather(reference_embeddings_flat, kept_indices)
  subsampled_labels = tf.gather(reference_labels_flat, kept_indices)
  return subsampled_embeddings, subsampled_labels


def _create_subsampling_mask(args):
  """Draws a random, fixed-length list of indices from a boolean label mask.

  Despite its name, this function does not return a boolean mask: it returns
  the positions of randomly selected true entries of label_mask, padded with
  -1 so that the result always has max_neighbors_per_object elements.

  Args:
    args: tuple of (label_mask, max_neighbors_per_object), where label_mask
      is the rank 1 boolean mask to be subsampled and max_neighbors_per_object
      is an int scalar, the maximum number of neighbors to be retained after
      subsampling.

  Returns:
    An int64 tensor of shape [max_neighbors_per_object] with the selected
    indices into label_mask, followed by -1 padding when fewer than
    max_neighbors_per_object entries of label_mask are true.
  """
  label_mask, max_neighbors_per_object = args
  # tf.where yields one row per true entry; the squeeze drops the single
  # coordinate column.
  true_positions = tf.squeeze(tf.where(label_mask), axis=1)
  # Shuffling and then truncating gives a random subset of the positions.
  kept = tf.random_shuffle(true_positions)[:max_neighbors_per_object]
  n_missing = max_neighbors_per_object - tf.size(kept)
  filler = tf.fill((n_missing,), tf.constant(-1, dtype=tf.int64))
  return tf.concat([kept, filler], axis=0)


def conv2d_identity_initializer(scale=1.0, mean=0, stddev=3e-2):
  """Creates an identity initializer for TensorFlow conv2d.

  We add a small amount of normal noise to the initialization matrix.
  Code copied from lcchen@.

  Args:
    scale: The scale coefficient for the identity weight matrix.
    mean: A 0-D Tensor or Python value of type `dtype`. The mean of the
      truncated normal distribution.
    stddev: A 0-D Tensor or Python value of type `dtype`. The standard deviation
      of the truncated normal distribution.

  Returns:
    An identity initializer function for TensorFlow conv2d.
  """
  def _initializer(shape, dtype=tf.float32, partition_info=None):
    """Returns the identity kernel scaled by `scale`, plus truncated noise.

    The kernel is zero everywhere except at the spatial center, where input
    channel i is mapped to output channel i with weight `scale` for every i
    below min(shape[2], shape[3]).

    Args:
      shape: A tuple of int32 numbers indicating the shape of the initializing
        matrix, indexed as [height, width, in_channels, out_channels].
      dtype: The data type of the initializing matrix.
      partition_info: (Optional) variable_scope._PartitionInfo object holding
        additional information about how the variable is partitioned. This
        input is not used in our case, but is required by TensorFlow.

    Returns:
      A tensor of the given shape and dtype holding the noisy identity kernel.

    Raises:
      ValueError: If len(shape) != 4, or shape[0] != shape[1], or shape[0] is
        not odd.
    """
    del partition_info
    if len(shape) != 4:
      raise ValueError('Expect shape length to be 4.')
    if shape[0] != shape[1]:
      raise ValueError('Expect shape[0] = shape[1].')
    # shape[1] equals shape[0] at this point, so one parity check covers both.
    if shape[0] % 2 != 1:
      raise ValueError('Expect shape[0] to be odd value.')
    weights = np.zeros(shape, dtype=np.float32)
    # Floor division keeps the center an integer; true division yields a float
    # on Python 3, which numpy rejects as an array index.
    center_y = shape[0] // 2
    center_x = shape[1] // 2
    min_channel = min(shape[2], shape[3])
    for i in range(min_channel):
      weights[center_y, center_x, i, i] = scale
    return tf.constant(weights, dtype=dtype) + tf.truncated_normal(
        shape, mean=mean, stddev=stddev, dtype=dtype)

  return _initializer


def split_separable_conv2d_with_identity_initializer(
    inputs,
    filters,
    kernel_size=3,
    rate=1,
    weight_decay=0.00004,
    scope=None):
  """Splits a separable conv2d into depthwise and pointwise conv2d.

  This operation differs from `tf.layers.separable_conv2d` as this operation
  applies activation function between depthwise and pointwise conv2d.
  Both convolutions are initialized with `conv2d_identity_initializer`.

  Args:
    inputs: Input tensor with shape [batch, height, width, channels].
    filters: Number of filters in the 1x1 pointwise convolution.
    kernel_size: A list of length 2: [kernel_height, kernel_width] of
      the filters. Can be an int if both values are the same.
    rate: Atrous convolution rate for the depthwise convolution.
    weight_decay: The weight decay to use for regularizing the pointwise
      convolution; the depthwise convolution is not regularized.
    scope: Optional scope for the operation. When given, the two layers are
      created under `scope + '_depthwise'` and `scope + '_pointwise'`; when
      None, slim falls back to its default scope names.

  Returns:
    Computed features after split separable conv2d.
  """
  # The advertised default scope=None must not reach the string concatenation,
  # which would raise a TypeError.
  if scope is None:
    depthwise_scope = None
    pointwise_scope = None
  else:
    depthwise_scope = scope + '_depthwise'
    pointwise_scope = scope + '_pointwise'
  initializer = conv2d_identity_initializer()
  # Passing None as num_outputs makes slim skip its own pointwise step, so
  # this call performs only the depthwise convolution.
  outputs = slim.separable_conv2d(
      inputs,
      None,
      kernel_size=kernel_size,
      depth_multiplier=1,
      rate=rate,
      weights_initializer=initializer,
      weights_regularizer=None,
      scope=depthwise_scope)
  return slim.conv2d(
      outputs,
      filters,
      1,
      weights_initializer=initializer,
      weights_regularizer=slim.l2_regularizer(weight_decay),
      scope=pointwise_scope)


def create_initial_softmax_from_labels(last_frame_labels, reference_labels,
                                       decoder_output_stride, reduce_labels):
  """Builds a one-hot "softmax" volume from the labels of the last frame.

  Both label maps are first brought to the decoder output resolution (unless
  decoder_output_stride is None). The last frame labels are then one-hot
  encoded with a depth derived from the reference labels.

  Args:
    last_frame_labels: last frame labels of shape [1, height, width, 1].
    reference_labels: reference frame labels of shape [1, height, width, 1].
    decoder_output_stride: Integer, the stride of the decoder. Can be None, in
      this case it's assumed that the last_frame_labels and reference_labels
      are already scaled to the decoder output resolution.
    reduce_labels: Boolean, whether to reduce the depth of the softmax one_hot
      encoding to the actual number of labels present in the reference frame
      (otherwise the depth will be the highest label index + 1).

  Returns:
    init_softmax: the initial softmax predictions, a float32 one-hot encoding
      of the (resized) last frame labels with the label axis last.
  """
  if decoder_output_stride is not None:
    full_height = tf.shape(last_frame_labels)[1]
    full_width = tf.shape(last_frame_labels)[2]
    inverse_stride = 1.0 / decoder_output_stride
    output_size = [model.scale_dimension(full_height, inverse_stride),
                   model.scale_dimension(full_width, inverse_stride)]
    # Nearest neighbor resizing keeps the values valid label ids.
    last_labels_small = tf.image.resize_nearest_neighbor(
        last_frame_labels, output_size, align_corners=True)
    reference_labels_small = tf.image.resize_nearest_neighbor(
        reference_labels, output_size, align_corners=True)
  else:
    # Inputs are taken to be at decoder resolution already.
    last_labels_small = last_frame_labels
    reference_labels_small = reference_labels

  if reduce_labels:
    # One channel per distinct label found in the (resized) reference frame.
    distinct_labels = tf.unique(tf.reshape(reference_labels_small, [-1])).y
    depth = tf.size(distinct_labels)
  else:
    depth = tf.reduce_max(reference_labels_small) + 1

  # tf.one_hot silently emits all-zero vectors for out-of-range indices, so
  # fail loudly if the last frame holds a label that does not fit into depth.
  labels_fit_depth = tf.assert_less(tf.reduce_max(last_labels_small), depth)
  with tf.control_dependencies([labels_fit_depth]):
    label_map = tf.squeeze(last_labels_small, axis=-1)
    init_softmax = tf.one_hot(label_map, depth=depth, dtype=tf.float32)
  return init_softmax


def local_previous_frame_nearest_neighbor_features_per_object(
    prev_frame_embedding, query_embedding, prev_frame_labels,
    gt_ids, max_distance=9):
  """Computes nearest neighbor features while only allowing local matches.

  For every query pixel and every object id, the smallest (rescaled) embedding
  distance to a previous frame pixel of that object inside a
  (2 * max_distance + 1) x (2 * max_distance + 1) window is returned. Where
  the window holds no pixel of the object, the feature is 1.

  Args:
    prev_frame_embedding: Tensor of shape [height, width, embedding_dim],
      the embedding vectors for the last frame.
    query_embedding: Tensor of shape [height, width, embedding_dim],
      the embedding vectors for the query frames.
    prev_frame_labels: Tensor of shape [height, width, 1], the class labels of
      the previous frame.
    gt_ids: Int Tensor of shape [n_objs] of the sorted unique ground truth
      ids in the first frame.
    max_distance: Integer, the maximum distance allowed for local matching.

  Returns:
    nn_features: A float32 tensor of nearest neighbor features of shape
      [1, height, width, n_objects, 1].
  """
  with tf.name_scope(
      'local_previous_frame_nearest_neighbor_features_per_object'):
    if USE_CORRELATION_COST:
      tf.logging.info('Using correlation_cost.')
      distance_fn = local_pairwise_distances
    else:
      # Slow fallback in case correlation_cost is not available.
      tf.logging.warn('correlation cost is not available, using slow fallback '
                      'implementation.')
      distance_fn = local_pairwise_distances2
    raw_dists = distance_fn(query_embedding, prev_frame_embedding,
                            max_distance=max_distance)
    # Rescale the sigmoid output from (0, 1) to (-1, 1).
    window_dists = (tf.nn.sigmoid(raw_dists) - 0.5) * 2

    height = tf.shape(prev_frame_embedding)[0]
    width = tf.shape(prev_frame_embedding)[1]
    window_size = 2 * max_distance + 1
    n_objects = tf.size(gt_ids)

    # Build, for every window offset, the per-object mask of the previous
    # frame shifted by that offset.
    if USE_CORRELATION_COST:
      # New, faster code with cross-correlation via correlation_cost.
      # Due to padding we have to add 1 to the labels.
      labels_plus_one = tf.cast(prev_frame_labels + 1, tf.float32)[tf.newaxis]
      shifted_labels = correlation_cost_op.correlation_cost(
          tf.ones((1, height, width, 1)),
          labels_plus_one,
          kernel_size=1,
          max_displacement=max_distance, stride_1=1, stride_2=1,
          pad=max_distance)
      shifted_labels = tf.squeeze(shifted_labels, axis=0)[..., tf.newaxis]
      # Subtract the 1 again and round.
      shifted_labels = tf.round(shifted_labels - 1)
      float_ids = tf.cast(gt_ids, tf.float32)
      offset_masks = tf.equal(
          shifted_labels,
          float_ids[tf.newaxis, tf.newaxis, tf.newaxis, :])
    else:
      # Slower code, without dependency to correlation_cost
      object_masks = tf.equal(prev_frame_labels,
                              gt_ids[tf.newaxis, tf.newaxis])
      border = [max_distance, max_distance]
      padded_masks = tf.pad(object_masks, [border, border, [0, 0]])
      shifted_masks = []
      # Row offsets vary slowest and column offsets fastest.
      for y_start in range(window_size):
        rows = padded_masks[y_start:y_start + height]
        shifted_masks.extend(
            rows[:, x_start:x_start + width]
            for x_start in range(window_size))
      offset_masks = tf.stack(shifted_masks, axis=2)

    # Window positions that do not belong to the object get the value 1 so
    # that they cannot win the minimum against a real match.
    no_match = tf.ones((height, width, window_size ** 2, n_objects))
    dists_per_object = tf.tile(window_dists[..., tf.newaxis],
                               multiples=(1, 1, 1, n_objects))
    masked_dists = tf.where(offset_masks, dists_per_object, no_match)
    nearest = tf.reduce_min(masked_dists, axis=2)
    return tf.reshape(nearest, (1, height, width, n_objects, 1))