syntax = "proto2";

package object_detection.protos;

import "object_detection/protos/anchor_generator.proto";
import "object_detection/protos/box_coder.proto";
import "object_detection/protos/box_predictor.proto";
import "object_detection/protos/hyperparams.proto";
import "object_detection/protos/image_resizer.proto";
import "object_detection/protos/losses.proto";
import "object_detection/protos/matcher.proto";
import "object_detection/protos/post_processing.proto";
import "object_detection/protos/region_similarity_calculator.proto";

// Configuration for Single Shot Detection (SSD) models.
// Next id: 27
message Ssd {
  // Number of classes to predict.
  optional int32 num_classes = 1;

  // Image resizer for preprocessing the input image.
  optional ImageResizer image_resizer = 2;

  // Feature extractor config.
  optional SsdFeatureExtractor feature_extractor = 3;

  // Box coder to encode the boxes.
  optional BoxCoder box_coder = 4;

  // Matcher to match groundtruth with anchors.
  optional Matcher matcher = 5;

  // Region similarity calculator to compute similarity of boxes.
  optional RegionSimilarityCalculator similarity_calculator = 6;

  // Whether background targets are to be encoded as an all
  // zeros vector or a one-hot vector (where background is the 0th class).
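  // For example, with a background slot plus 3 classes, a background target
  // is [0, 0, 0, 0] when this is true and [1, 0, 0, 0] when false.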
  optional bool encode_background_as_zeros = 12 [default = false];

  // Classification weight to be associated with negative anchors
  // (default: 1.0). The weight must be in [0., 1.].
  optional float negative_class_weight = 13 [default = 1.0];

  // Box predictor to attach to the features.
  optional BoxPredictor box_predictor = 7;

  // Anchor generator to compute anchors.
  optional AnchorGenerator anchor_generator = 8;

  // Post processing to apply on the predictions.
  optional PostProcessing post_processing = 9;

  // Whether to normalize the loss by the number of groundtruth boxes that
  // match the anchors.
  optional bool normalize_loss_by_num_matches = 10 [default = true];

  // Whether to normalize the localization loss by the code size of the box
  // encodings. This is applied along with other normalization factors.
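  // For example, with a typical 4-dimensional box encoding, the localization
  // loss is additionally divided by 4.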
  optional bool normalize_loc_loss_by_codesize = 14 [default = false];

  // Loss configuration for training.
  optional Loss loss = 11;

  // Whether to update batch norm parameters during training or not.
  // When training with a relatively small batch size (e.g. 1), it is
  // desirable to disable batch norm update and use pretrained batch norm
  // params.
  //
  // Note: Some feature extractors are used with canned arg_scopes
  // (e.g. resnet arg scopes). In these cases, the training behavior of batch
  // norm variables may depend on the values of both `batch_norm_trainable`
  // and `is_training`.
  //
  // When canned arg_scopes are used with feature extractors,
  // `conv_hyperparams` will apply only to the additional layers that are
  // added outside the canned arg_scope.
  optional bool freeze_batchnorm = 16 [default = false];

  // Whether to update batch norm in place during training. This is required
  // for batch norm to work correctly on TPUs. When this is false, the user
  // must add a control dependency on tf.GraphKeys.UPDATE_OPS for the
  // train/loss op in order to update the batch norm moving average
  // parameters.
  optional bool inplace_batchnorm_update = 15 [default = false];

  // Whether to add an implicit background class to one-hot encodings of
  // groundtruth labels. Set to false if training a single
  // class model or using an explicit background class.
  optional bool add_background_class = 21 [default = true];

  // Whether to use an explicit background class. Set to true if using
  // groundtruth labels with an explicit background class, as in multiclass
  // scores.
  optional bool explicit_background_class = 24 [default = false];

  // Whether to use groundtruth box confidences (rather than one-hot class
  // labels) as classification targets.
  optional bool use_confidences_as_targets = 22 [default = false];

  // Loss weight applied to implicit examples (anchors without an explicit
  // groundtruth match).
  optional float implicit_example_weight = 23 [default = 1.0];

  // Whether to also return raw detections (prior to non-max suppression)
  // from the predict step.
  optional bool return_raw_detections_during_predict = 26 [default = false];

  // Configuration proto for MaskHead.
  // Next id: 11
  message MaskHead {
    // The height and the width of the predicted mask. Only used when
    // predict_instance_masks is true.
    optional int32 mask_height = 1 [default = 15];
    optional int32 mask_width = 2 [default = 15];

    // Whether to predict class agnostic masks. Only used when
    // predict_instance_masks is true.
    optional bool masks_are_class_agnostic = 3 [default = true];

    // The depth for the first conv2d_transpose op applied to the
    // image_features in the mask prediction branch. If set to 0, the value
    // will be set automatically based on the number of channels in the image
    // features and the number of classes.
    optional int32 mask_prediction_conv_depth = 4 [default = 256];

    // The number of convolutions applied to image_features in the mask
    // prediction branch.
    optional int32 mask_prediction_num_conv_layers = 5 [default = 2];

    // Whether to apply convolutions on mask features before upsampling using
    // nearest neighbor resizing.
    // By default, mask features are resized to [`mask_height`, `mask_width`]
    // before applying convolutions and predicting masks.
    optional bool convolve_then_upsample_masks = 6 [default = false];

    // Mask loss weight.
    optional float mask_loss_weight = 7 [default = 5.0];

    // Number of boxes to be sampled at training time for computing the mask
    // loss.
    optional int32 mask_loss_sample_size = 8 [default = 16];

    // Hyperparameters for convolution ops used in the mask prediction branch.
    optional Hyperparams conv_hyperparams = 9;

    // Output size (width and height are set to be the same) of the initial
    // bilinear-interpolation-based cropping during ROI pooling. Only used
    // when a second-stage prediction head (e.g. a mask head) is enabled.
    optional int32 initial_crop_size = 10 [default = 15];
  }
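
  // An illustrative mask_head_config in text (pbtxt) format; the values are
  // example placeholders, not tuned settings:
  //
  //   mask_head_config {
  //     mask_height: 15
  //     mask_width: 15
  //     masks_are_class_agnostic: true
  //     mask_prediction_num_conv_layers: 2
  //   }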

  // Configs for mask head.
  optional MaskHead mask_head_config = 25;
}
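
// An illustrative `ssd` config in text (pbtxt) format. The values are
// placeholders for the example rather than recommended settings, and the
// feature extractor `type` must name an extractor registered in the model
// builder. Required sub-messages such as box_coder, matcher,
// anchor_generator, box_predictor, post_processing and loss are omitted for
// brevity:
//
//   ssd {
//     num_classes: 90
//     image_resizer {
//       fixed_shape_resizer { height: 300 width: 300 }
//     }
//     feature_extractor {
//       type: "ssd_mobilenet_v2"
//       depth_multiplier: 1.0
//     }
//     normalize_loss_by_num_matches: true
//   }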

// Next id: 18.
message SsdFeatureExtractor {
  reserved 6;

  // Type of SSD feature extractor.
  optional string type = 1;

  // The factor by which to alter the depth of the channels in the feature
  // extractor.
  optional float depth_multiplier = 2 [default = 1.0];

  // Minimum number of channels in the feature extractor.
  optional int32 min_depth = 3 [default = 16];

  // Hyperparameters that affect the layers of the feature extractor added on
  // top of the base feature extractor.
  optional Hyperparams conv_hyperparams = 4;

  // Normally, SSD feature extractors are constructed by reusing an existing
  // base feature extractor (that has its own hyperparams) and adding new
  // layers on top of it. `conv_hyperparams` above normally applies only to
  // the new layers, while the base feature extractor uses its own default
  // hyperparams. If this value is set to true, the base feature extractor's
  // hyperparams will be overridden with `conv_hyperparams`.
  optional bool override_base_feature_extractor_hyperparams = 9
      [default = false];

  // The nearest multiple to zero-pad the input height and width dimensions to.
  // For example, if pad_to_multiple = 2, input dimensions are zero-padded
  // until the resulting dimensions are even.
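  // For instance, with pad_to_multiple = 32, a 300x300 input is zero-padded
  // to 320x320.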
  optional int32 pad_to_multiple = 5 [default = 1];

  // Whether to use explicit padding when extracting SSD multiresolution
  // features. This will also apply to the base feature extractor if a MobileNet
  // architecture is used.
  optional bool use_explicit_padding = 7 [default = false];

  // Whether to use depthwise separable convolutions to extract the
  // additional feature maps added by SSD.
  optional bool use_depthwise = 8 [default = false];

  // Feature Pyramid Networks config.
  optional FeaturePyramidNetworks fpn = 10;

  // If true, replace the preprocess function of the feature extractor with a
  // placeholder. This should only be used if all the image preprocessing
  // steps happen outside the graph.
  optional bool replace_preprocessor_with_placeholder = 11 [default = false];

  // The number of SSD layers.
  optional int32 num_layers = 12 [default = 6];
}
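
// An illustrative `feature_extractor` config in text (pbtxt) format,
// assuming a MobileNet-V1 FPN setup; the `type` string and values are
// example placeholders:
//
//   feature_extractor {
//     type: "ssd_mobilenet_v1_fpn"
//     depth_multiplier: 1.0
//     min_depth: 16
//     pad_to_multiple: 32
//     override_base_feature_extractor_hyperparams: true
//     fpn {
//       min_level: 3
//       max_level: 7
//     }
//   }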

// Configuration for Feature Pyramid Networks.
message FeaturePyramidNetworks {
  // We recommend using multi_resolution_feature_map_generator with FPN; the
  // levels there should match the levels defined below for better
  // performance.
  // Correspondence from FPN levels to Resnet/Mobilenet V1 feature maps:
  // FPN Level        Resnet Feature Map      Mobilenet-V1 Feature Map
  //     2               Block 1                Conv2d_3_pointwise
  //     3               Block 2                Conv2d_5_pointwise
  //     4               Block 3                Conv2d_11_pointwise
  //     5               Block 4                Conv2d_13_pointwise
  //     6               Bottomup_5             bottom_up_Conv2d_14
  //     7               Bottomup_6             bottom_up_Conv2d_15
  //     8               Bottomup_7             bottom_up_Conv2d_16
  //     9               Bottomup_8             bottom_up_Conv2d_17

  // Minimum level in the feature pyramid.
  optional int32 min_level = 1 [default = 3];

  // Maximum level in the feature pyramid.
  optional int32 max_level = 2 [default = 7];

  // Channel depth for additional coarse feature layers.
  optional int32 additional_layer_depth = 3 [default = 256];
}
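
// For example, an FPN restricted to the backbone feature maps in the table
// above (no extra bottom-up layers) on Resnet would correspond to:
//
//   fpn {
//     min_level: 2
//     max_level: 5
//     additional_layer_depth: 256
//   }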