syntax = "proto2";

package object_detection.protos;

import "object_detection/protos/image_resizer.proto";
import "object_detection/protos/losses.proto";

// Configuration for the CenterNet meta architecture from the "Objects as
// Points" paper [1].
// [1]: https://arxiv.org/abs/1904.07850
message CenterNet {
  // Number of classes to predict.
  optional int32 num_classes = 1;

  // Feature extractor config.
  optional CenterNetFeatureExtractor feature_extractor = 2;

  // Image resizer for preprocessing the input image.
  optional ImageResizer image_resizer = 3;

  // Parameters related to the object detection task.
  message ObjectDetection {
    // The original fields were moved to ObjectCenterParams or deleted.
    reserved 2, 5, 6, 7;

    // Weight of the task loss. The total loss of the model is the summation
    // of the task losses weighted by these weights.
    optional float task_loss_weight = 1 [default = 1.0];

    // Weight for the offset localization loss.
    optional float offset_loss_weight = 3 [default = 1.0];

    // Weight for the height/width localization loss.
    optional float scale_loss_weight = 4 [default = 0.1];

    // Localization loss configuration for the object scale and offset
    // losses.
    optional LocalizationLoss localization_loss = 8;
  }
  optional ObjectDetection object_detection_task = 4;

  // Parameters related to object center prediction. This is required for
  // both the object detection and keypoint estimation tasks.
  message ObjectCenterParams {
    // Weight for the object center loss.
    optional float object_center_loss_weight = 1 [default = 1.0];

    // Classification loss configuration for the object center loss.
    optional ClassificationLoss classification_loss = 2;

    // The initial bias value of the convolution kernel of the class heatmap
    // prediction head. -2.19 corresponds to predicting foreground with a
    // probability of 0.1. See "Focal Loss for Dense Object Detection" at
    // https://arxiv.org/abs/1708.02002.
    optional float heatmap_bias_init = 3 [default = -2.19];

    // The minimum IoU overlap that boxes need to have to not be penalized.
    optional float min_box_overlap_iou = 4 [default = 0.7];

    // Maximum number of boxes to predict.
    optional int32 max_box_predictions = 5 [default = 100];

    // If set, the loss is computed only for the labeled classes.
    optional bool use_labeled_classes = 6 [default = false];
  }
  optional ObjectCenterParams object_center_params = 5;
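  // Example: a minimal object detection configuration in the text format
  // consumed by this proto. The values are illustrative only, and the loss
  // sub-messages are sketched under the assumption that losses.proto
  // provides penalty_reduced_logistic_focal_loss and l1_localization_loss:
  //
  //   object_center_params {
  //     object_center_loss_weight: 1.0
  //     min_box_overlap_iou: 0.7
  //     max_box_predictions: 100
  //     classification_loss {
  //       penalty_reduced_logistic_focal_loss {
  //         alpha: 2.0
  //         beta: 4.0
  //       }
  //     }
  //   }
  //   object_detection_task {
  //     task_loss_weight: 1.0
  //     offset_loss_weight: 1.0
  //     scale_loss_weight: 0.1
  //     localization_loss {
  //       l1_localization_loss {
  //       }
  //     }
  //   }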
  // Path of the file that contains the label map along with the keypoint
  // information, including the keypoint indices, corresponding labels, and
  // the corresponding class. The file should be the same one as used in the
  // input pipeline. Note that a plain-text StringIntLabelMap proto is
  // expected in this file.
  // It is required only if the keypoint estimation task is specified.
  optional string keypoint_label_map_path = 6;

  // Parameters related to the keypoint estimation task.
  message KeypointEstimation {
    // Name of the task, e.g. "human pose". Note that the task name should
    // be unique to each keypoint task.
    optional string task_name = 1;

    // Weight of the task loss. The total loss of the model is the summation
    // of the task losses weighted by these weights.
    optional float task_loss_weight = 2 [default = 1.0];

    // Loss configuration for the keypoint heatmap, offset, and regression
    // losses. Note that the localization loss is used for the
    // offset/regression losses and the classification loss is used for the
    // heatmap loss.
    optional Loss loss = 3;

    // The name of the class that contains the keypoints for this task. This
    // is used to retrieve the corresponding keypoint indices from the label
    // map. Note that this corresponds to the "name" field, not
    // "display_name".
    optional string keypoint_class_name = 4;

    // The standard deviation of the Gaussian kernel used to generate the
    // keypoint heatmap, in output-image pixels. This provides the
    // flexibility of using a different Gaussian kernel size for each
    // keypoint class. If provided, these values override the keypoint
    // standard deviations; otherwise, the default value 5.0 is used.
    // TODO(yuhuic): Update the default value once we find the best value.
    map<string, float> keypoint_label_to_std = 5;

    // Loss weights corresponding to the different heads.
    optional float keypoint_regression_loss_weight = 6 [default = 1.0];
    optional float keypoint_heatmap_loss_weight = 7 [default = 1.0];
    optional float keypoint_offset_loss_weight = 8 [default = 1.0];

    // The initial bias value of the convolution kernel of the keypoint
    // heatmap prediction head. -2.19 corresponds to predicting foreground
    // with a probability of 0.1. See "Focal Loss for Dense Object Detection"
    // at https://arxiv.org/abs/1708.02002.
    optional float heatmap_bias_init = 9 [default = -2.19];

    // The heatmap score threshold for a keypoint to become a valid
    // candidate.
    optional float keypoint_candidate_score_threshold = 10 [default = 0.1];

    // The maximum number of candidates to retrieve for each keypoint.
    optional int32 num_candidates_per_keypoint = 11 [default = 100];

    // Kernel size of the max pool used to pull off peak score locations in
    // a neighborhood (independently for each keypoint type).
    optional int32 peak_max_pool_kernel_size = 12 [default = 3];

    // The default score to use for regressed keypoints that are not
    // successfully snapped to a nearby candidate.
    optional float unmatched_keypoint_score = 13 [default = 0.1];

    // The multiplier used to expand the bounding boxes (either the provided
    // boxes or those which tightly cover the regressed keypoints). Note that
    // the expanded box for an instance becomes the feasible search window
    // for all associated keypoints.
    optional float box_scale = 14 [default = 1.2];

    // The scale parameter that multiplies the largest dimension of a
    // bounding box. The resulting distance becomes a search radius for
    // candidates in the vicinity of each regressed keypoint.
    optional float candidate_search_scale = 15 [default = 0.3];

    // One of ['min_distance', 'score_distance_ratio'] indicating how to
    // select the keypoint candidate.
    optional string candidate_ranking_mode = 16 [default = "min_distance"];

    // The radius (in output pixels) around a heatmap peak within which to
    // assign the offset targets. If set to 0, the offset target is only
    // assigned to the heatmap peak (same behavior as the original paper).
    optional int32 offset_peak_radius = 17 [default = 0];

    // Indicates whether to assign offsets for each keypoint channel
    // separately. If set to false, the output offset target has the shape
    // [batch_size, out_height, out_width, 2] (same behavior as the original
    // paper). If set to true, the output offset target has the shape
    // [batch_size, out_height, out_width, 2 * num_keypoints] (recommended
    // when offset_peak_radius is not zero).
    optional bool per_keypoint_offset = 18 [default = false];
  }
  repeated KeypointEstimation keypoint_estimation_task = 7;
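  // Example: a keypoint estimation task configuration in text format. The
  // task name, class name, and keypoint label below are hypothetical and
  // must match entries in the label map referenced by
  // keypoint_label_map_path; the loss sub-messages assume the same
  // losses.proto fields as in the sketch above:
  //
  //   keypoint_estimation_task {
  //     task_name: "human_pose"
  //     task_loss_weight: 1.0
  //     keypoint_class_name: "person"
  //     keypoint_label_to_std {
  //       key: "nose"
  //       value: 4.0
  //     }
  //     loss {
  //       classification_loss {
  //         penalty_reduced_logistic_focal_loss {
  //           alpha: 2.0
  //           beta: 4.0
  //         }
  //       }
  //       localization_loss {
  //         l1_localization_loss {
  //         }
  //       }
  //     }
  //     keypoint_candidate_score_threshold: 0.1
  //     candidate_ranking_mode: "min_distance"
  //     offset_peak_radius: 2
  //     per_keypoint_offset: true
  //   }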
  // Parameters related to the mask estimation task.
  // Note: Currently, CenterNet supports a weak form of instance
  // segmentation, where semantic segmentation masks are estimated and then
  // cropped based on bounding box detections. Therefore, it is possible for
  // the same image pixel to be assigned to multiple instances.
  message MaskEstimation {
    // Weight of the task loss. The total loss of the model is the summation
    // of the task losses weighted by these weights.
    optional float task_loss_weight = 1 [default = 1.0];

    // Classification loss configuration for the segmentation loss.
    optional ClassificationLoss classification_loss = 2;

    // Each instance mask (one per detection) is cropped and resized
    // (bilinear resampling) from the predicted segmentation feature map.
    // After resampling, the masks are binarized with the provided score
    // threshold.
    optional int32 mask_height = 4 [default = 256];
    optional int32 mask_width = 5 [default = 256];
    optional float score_threshold = 6 [default = 0.5];

    // The initial bias value of the convolution kernel of the class heatmap
    // prediction head. -2.19 corresponds to predicting foreground with a
    // probability of 0.1.
    optional float heatmap_bias_init = 3 [default = -2.19];
  }
  optional MaskEstimation mask_estimation_task = 8;
}

message CenterNetFeatureExtractor {
  optional string type = 1;

  // Channel means to be subtracted from each image channel. If not
  // specified, we use a default value of 0.
  repeated float channel_means = 2;

  // Channel standard deviations. Each channel will be normalized by
  // dividing it by its standard deviation. If not specified, we use a
  // default value of 1.
  repeated float channel_stds = 3;

  // If set, will change channel order to be [blue, green, red]. This can be
  // useful to be compatible with some pre-trained feature extractors.
  optional bool bgr_ordering = 4 [default = false];
}
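// Example: a feature extractor configuration in text format for the
// CenterNet message above. The extractor type string and the per-channel
// statistics are illustrative assumptions, not prescribed values; valid
// type strings are determined by the model builder that consumes this
// proto:
//
//   feature_extractor {
//     type: "hourglass_104"
//     channel_means: [104.0, 114.0, 120.0]
//     channel_stds: [73.6, 69.9, 70.9]
//     bgr_ordering: true
//   }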