syntax = "proto2";

package object_detection.protos;

import "object_detection/protos/image_resizer.proto";
import "object_detection/protos/losses.proto";

// Configuration for the CenterNet meta architecture from the "Objects as
// Points" paper [1].
// [1]: https://arxiv.org/abs/1904.07850
message CenterNet {
  // Number of classes to predict.
  optional int32 num_classes = 1;

  // Feature extractor config.
  optional CenterNetFeatureExtractor feature_extractor = 2;

  // Image resizer for preprocessing the input image.
  optional ImageResizer image_resizer = 3;

  // Parameters related to the object detection task.
  message ObjectDetection {
    // The original fields were moved to ObjectCenterParams or deleted.
    reserved 2, 5, 6, 7;

    // Weight of the task loss. The total loss of the model is the summation
    // of the task losses weighted by these weights.
    optional float task_loss_weight = 1 [default = 1.0];

    // Weight for the offset localization loss.
    optional float offset_loss_weight = 3 [default = 1.0];

    // Weight for the height/width localization loss.
    optional float scale_loss_weight = 4 [default = 0.1];

    // Localization loss configuration for the object scale and offset
    // losses.
    optional LocalizationLoss localization_loss = 8;
  }
  optional ObjectDetection object_detection_task = 4;

  // Parameters related to object center prediction. This is required for
  // both the object detection and keypoint estimation tasks.
  message ObjectCenterParams {
    // Weight for the object center loss.
    optional float object_center_loss_weight = 1 [default = 1.0];

    // Classification loss configuration for the object center loss.
    optional ClassificationLoss classification_loss = 2;

    // The initial bias value of the convolution kernel of the class heatmap
    // prediction head. -2.19 corresponds to predicting foreground with a
    // probability of 0.1. See "Focal Loss for Dense Object Detection" at
    // https://arxiv.org/abs/1708.02002.
    optional float heatmap_bias_init = 3 [default = -2.19];

    // The minimum IoU overlap that boxes need to have to not be penalized.
    optional float min_box_overlap_iou = 4 [default = 0.7];

    // Maximum number of boxes to predict.
    optional int32 max_box_predictions = 5 [default = 100];

    // If set, the loss is computed only for the labeled classes.
    optional bool use_labeled_classes = 6 [default = false];
  }
  optional ObjectCenterParams object_center_params = 5;
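  // Example: a minimal object detection configuration in the text format
  // consumed by this proto. The values are illustrative only, and the loss
  // sub-messages are sketched under the assumption that losses.proto
  // provides penalty_reduced_logistic_focal_loss and l1_localization_loss:
  //
  //   object_center_params {
  //     object_center_loss_weight: 1.0
  //     min_box_overlap_iou: 0.7
  //     max_box_predictions: 100
  //     classification_loss {
  //       penalty_reduced_logistic_focal_loss {
  //         alpha: 2.0
  //         beta: 4.0
  //       }
  //     }
  //   }
  //   object_detection_task {
  //     task_loss_weight: 1.0
  //     offset_loss_weight: 1.0
  //     scale_loss_weight: 0.1
  //     localization_loss {
  //       l1_localization_loss {
  //       }
  //     }
  //   }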
  // Path of the file that contains the label map along with the keypoint
  // information, including the keypoint indices, corresponding labels, and
  // the corresponding class. The file should be the same one as used in the
  // input pipeline. Note that a plain-text StringIntLabelMap proto is
  // expected in this file.
  // It is required only if the keypoint estimation task is specified.
  optional string keypoint_label_map_path = 6;

  // Parameters related to the keypoint estimation task.
  message KeypointEstimation {
    // Name of the task, e.g. "human pose". Note that the task name should
    // be unique to each keypoint task.
    optional string task_name = 1;

    // Weight of the task loss. The total loss of the model is the summation
    // of the task losses weighted by these weights.
    optional float task_loss_weight = 2 [default = 1.0];

    // Loss configuration for the keypoint heatmap, offset, and regression
    // losses. Note that the localization loss is used for the
    // offset/regression losses and the classification loss is used for the
    // heatmap loss.
    optional Loss loss = 3;

    // The name of the class that contains the keypoints for this task. This
    // is used to retrieve the corresponding keypoint indices from the label
    // map. Note that this corresponds to the "name" field, not
    // "display_name".
    optional string keypoint_class_name = 4;

    // The standard deviation of the Gaussian kernel used to generate the
    // keypoint heatmap, in output-image pixels. This provides the
    // flexibility of using a different Gaussian kernel size for each
    // keypoint class. If provided, these values override the keypoint
    // standard deviations; otherwise, the default value 5.0 is used.
    // TODO(yuhuic): Update the default value once we find the best value.
    map<string, float> keypoint_label_to_std = 5;

    // Loss weights corresponding to the different heads.
    optional float keypoint_regression_loss_weight = 6 [default = 1.0];
    optional float keypoint_heatmap_loss_weight = 7 [default = 1.0];
    optional float keypoint_offset_loss_weight = 8 [default = 1.0];

    // The initial bias value of the convolution kernel of the keypoint
    // heatmap prediction head. -2.19 corresponds to predicting foreground
    // with a probability of 0.1. See "Focal Loss for Dense Object Detection"
    // at https://arxiv.org/abs/1708.02002.
    optional float heatmap_bias_init = 9 [default = -2.19];

    // The heatmap score threshold for a keypoint to become a valid
    // candidate.
    optional float keypoint_candidate_score_threshold = 10 [default = 0.1];

    // The maximum number of candidates to retrieve for each keypoint.
    optional int32 num_candidates_per_keypoint = 11 [default = 100];

    // Kernel size of the max pool used to pull off peak score locations in
    // a neighborhood (independently for each keypoint type).
    optional int32 peak_max_pool_kernel_size = 12 [default = 3];

    // The default score to use for regressed keypoints that are not
    // successfully snapped to a nearby candidate.
    optional float unmatched_keypoint_score = 13 [default = 0.1];

    // The multiplier used to expand the bounding boxes (either the provided
    // boxes or those which tightly cover the regressed keypoints). Note that
    // the expanded box for an instance becomes the feasible search window
    // for all associated keypoints.
    optional float box_scale = 14 [default = 1.2];

    // The scale parameter that multiplies the largest dimension of a
    // bounding box. The resulting distance becomes a search radius for
    // candidates in the vicinity of each regressed keypoint.
    optional float candidate_search_scale = 15 [default = 0.3];

    // One of ['min_distance', 'score_distance_ratio'] indicating how to
    // select the keypoint candidate.
    optional string candidate_ranking_mode = 16 [default = "min_distance"];

    // The radius (in output pixels) around a heatmap peak within which to
    // assign the offset targets. If set to 0, the offset target is only
    // assigned to the heatmap peak (same behavior as the original paper).
    optional int32 offset_peak_radius = 17 [default = 0];

    // Indicates whether to assign offsets for each keypoint channel
    // separately. If set to false, the output offset target has the shape
    // [batch_size, out_height, out_width, 2] (same behavior as the original
    // paper). If set to true, the output offset target has the shape
    // [batch_size, out_height, out_width, 2 * num_keypoints] (recommended
    // when offset_peak_radius is not zero).
    optional bool per_keypoint_offset = 18 [default = false];
  }
  repeated KeypointEstimation keypoint_estimation_task = 7;
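  // Example: a keypoint estimation task configuration in text format. The
  // task name, class name, and keypoint label below are hypothetical and
  // must match entries in the label map referenced by
  // keypoint_label_map_path; the loss sub-messages assume the same
  // losses.proto fields as in the sketch above:
  //
  //   keypoint_estimation_task {
  //     task_name: "human_pose"
  //     task_loss_weight: 1.0
  //     keypoint_class_name: "person"
  //     keypoint_label_to_std {
  //       key: "nose"
  //       value: 4.0
  //     }
  //     loss {
  //       classification_loss {
  //         penalty_reduced_logistic_focal_loss {
  //           alpha: 2.0
  //           beta: 4.0
  //         }
  //       }
  //       localization_loss {
  //         l1_localization_loss {
  //         }
  //       }
  //     }
  //     keypoint_candidate_score_threshold: 0.1
  //     candidate_ranking_mode: "min_distance"
  //     offset_peak_radius: 2
  //     per_keypoint_offset: true
  //   }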
  // Parameters related to the mask estimation task.
  // Note: Currently, CenterNet supports a weak form of instance
  // segmentation, where semantic segmentation masks are estimated and then
  // cropped based on bounding box detections. Therefore, it is possible for
  // the same image pixel to be assigned to multiple instances.
  message MaskEstimation {
    // Weight of the task loss. The total loss of the model is the summation
    // of the task losses weighted by these weights.
    optional float task_loss_weight = 1 [default = 1.0];

    // Classification loss configuration for the segmentation loss.
    optional ClassificationLoss classification_loss = 2;

    // Each instance mask (one per detection) is cropped and resized
    // (bilinear resampling) from the predicted segmentation feature map.
    // After resampling, the masks are binarized with the provided score
    // threshold.
    optional int32 mask_height = 4 [default = 256];
    optional int32 mask_width = 5 [default = 256];
    optional float score_threshold = 6 [default = 0.5];

    // The initial bias value of the convolution kernel of the class heatmap
    // prediction head. -2.19 corresponds to predicting foreground with a
    // probability of 0.1.
    optional float heatmap_bias_init = 3 [default = -2.19];
  }
  optional MaskEstimation mask_estimation_task = 8;
}

message CenterNetFeatureExtractor {
  optional string type = 1;

  // Channel means to be subtracted from each image channel. If not
  // specified, we use a default value of 0.
  repeated float channel_means = 2;

  // Channel standard deviations. Each channel will be normalized by
  // dividing it by its standard deviation. If not specified, we use a
  // default value of 1.
  repeated float channel_stds = 3;

  // If set, will change channel order to be [blue, green, red]. This can be
  // useful to be compatible with some pre-trained feature extractors.
  optional bool bgr_ordering = 4 [default = false];
}
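// Example: a feature extractor configuration in text format for the
// CenterNet message above. The extractor type string and the per-channel
// statistics are illustrative assumptions, not prescribed values; valid
// type strings are determined by the model builder that consumes this
// proto:
//
//   feature_extractor {
//     type: "hourglass_104"
//     channel_means: [104.0, 114.0, 120.0]
//     channel_stds: [73.6, 69.9, 70.9]
//     bgr_ordering: true
//   }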