// Configuration messages for Single Shot Detection (SSD) models: the
// top-level `Ssd` config, its `SsdFeatureExtractor`, and the
// `FeaturePyramidNetworks` settings.
syntax = "proto2";

package object_detection.protos;

import "object_detection/protos/anchor_generator.proto";
import "object_detection/protos/box_coder.proto";
import "object_detection/protos/box_predictor.proto";
import "object_detection/protos/hyperparams.proto";
import "object_detection/protos/image_resizer.proto";
import "object_detection/protos/losses.proto";
import "object_detection/protos/matcher.proto";
import "object_detection/protos/post_processing.proto";
import "object_detection/protos/region_similarity_calculator.proto";

// Configuration for Single Shot Detection (SSD) models.
// Next id: 26
// NOTE(review): ids 17-20 are not used by any field in this message. If they
// belonged to removed fields they should be marked `reserved` -- confirm
// against the file history before reusing them.
message Ssd {
  // Number of classes to predict.
  optional int32 num_classes = 1;

  // Image resizer for preprocessing the input image.
  optional ImageResizer image_resizer = 2;

  // Feature extractor config.
  optional SsdFeatureExtractor feature_extractor = 3;

  // Box coder to encode the boxes.
  optional BoxCoder box_coder = 4;

  // Matcher to match groundtruth with anchors.
  optional Matcher matcher = 5;

  // Region similarity calculator to compute similarity of boxes.
  optional RegionSimilarityCalculator similarity_calculator = 6;

  // Whether background targets are to be encoded as an all
  // zeros vector or a one-hot vector (where background is the 0th class).
  optional bool encode_background_as_zeros = 12 [default = false];

  // Classification weight to be associated to negative
  // anchors (default: 1.0). The weight must be in [0., 1.].
  optional float negative_class_weight = 13 [default = 1.0];

  // Box predictor to attach to the features.
  optional BoxPredictor box_predictor = 7;

  // Anchor generator to compute anchors.
  optional AnchorGenerator anchor_generator = 8;

  // Post processing to apply on the predictions.
  optional PostProcessing post_processing = 9;

  // Whether to normalize the loss by number of groundtruth boxes that match to
  // the anchors.
  optional bool normalize_loss_by_num_matches = 10 [default = true];

  // Whether to normalize the localization loss by the code size of the box
  // encodings. This is applied along with other normalization factors.
  optional bool normalize_loc_loss_by_codesize = 14 [default = false];

  // Loss configuration for training.
  optional Loss loss = 11;

  // Whether to update batch norm parameters during training or not.
  // When training with a relatively small batch size (e.g. 1), it is
  // desirable to disable batch norm update and use pretrained batch norm
  // params.
  // NOTE(review): the field name suggests that `true` disables the update;
  // confirm against the code that consumes this config.
  //
  // Note: Some feature extractors are used with canned arg_scopes
  // (e.g. resnet arg scopes). In these cases training behavior of batch norm
  // variables may depend on both values of `batch_norm_trainable` and
  // `is_training`.
  // NOTE(review): `batch_norm_trainable` is not a field of this message;
  // presumably an earlier name for this setting -- confirm.
  //
  // When canned arg_scopes are used with feature extractors `conv_hyperparams`
  // will apply only to the additional layers that are added and are outside the
  // canned arg_scope.
  optional bool freeze_batchnorm = 16 [default = false];

  // Whether to update batch_norm inplace during training. This is required
  // for batch norm to work correctly on TPUs. When this is false, user must add
  // a control dependency on tf.GraphKeys.UPDATE_OPS for train/loss op in order
  // to update the batch norm moving average parameters.
  optional bool inplace_batchnorm_update = 15 [default = false];

  // Whether to add an implicit background class to one-hot encodings of
  // groundtruth labels. Set to false if training a single
  // class model or using an explicit background class.
  optional bool add_background_class = 21 [default = true];

  // Whether to use an explicit background class. Set to true if using
  // groundtruth labels with an explicit background class, as in multiclass
  // scores.
  optional bool explicit_background_class = 24 [default = false];

  // NOTE(review): undocumented in the original. Presumably selects
  // groundtruth confidences, rather than hard labels, as the classification
  // targets -- confirm against the code that consumes this config.
  optional bool use_confidences_as_targets = 22 [default = false];

  // NOTE(review): undocumented in the original. Presumably the weight given
  // to implicitly labelled examples -- confirm against the code that consumes
  // this config.
  optional float implicit_example_weight = 23 [default = 1.0];

  // Configuration proto for MaskHead.
  // Next id: 11
  // NOTE(review): several comments below refer to `predict_instance_masks`,
  // which is not a field of this message or of `Ssd`; presumably it is defined
  // by the box predictor config -- confirm.
  message MaskHead {
    // The height and the width of the predicted mask. Only used when
    // predict_instance_masks is true.
    optional int32 mask_height = 1 [default = 15];
    optional int32 mask_width = 2 [default = 15];

    // Whether to predict class agnostic masks. Only used when
    // predict_instance_masks is true.
    optional bool masks_are_class_agnostic = 3 [default = true];

    // The depth for the first conv2d_transpose op applied to the
    // image_features in the mask prediction branch. If set to 0, the value
    // will be set automatically based on the number of channels in the image
    // features and the number of classes.
    optional int32 mask_prediction_conv_depth = 4 [default = 256];

    // The number of convolutions applied to image_features in the mask
    // prediction branch.
    optional int32 mask_prediction_num_conv_layers = 5 [default = 2];

    // Whether to apply convolutions on mask features before upsampling using
    // nearest neighbor resizing.
    // By default, mask features are resized to [`mask_height`, `mask_width`]
    // before applying convolutions and predicting masks.
    optional bool convolve_then_upsample_masks = 6 [default = false];

    // Mask loss weight.
    optional float mask_loss_weight = 7 [default = 5.0];

    // Number of boxes to be generated at training time for computing mask loss.
    optional int32 mask_loss_sample_size = 8 [default = 16];

    // Hyperparameters for convolution ops used in the box predictor.
    // NOTE(review): this field lives in the mask head config; confirm whether
    // "box predictor" is intended here or was carried over from another file.
    optional Hyperparams conv_hyperparams = 9;

    // Output size (width and height are set to be the same) of the initial
    // bilinear interpolation based cropping during ROI pooling. Only used when
    // we have second stage prediction head enabled (e.g. mask head).
    optional int32 initial_crop_size = 10 [default = 15];
  }

  // Configs for mask head.
  optional MaskHead mask_head_config = 25;
}

// Configuration for the feature extractor used by an SSD model.
message SsdFeatureExtractor {
  // Field number 6 is reserved and must not be reused.
  reserved 6;

  // Type of ssd feature extractor.
  optional string type = 1;

  // The factor to alter the depth of the channels in the feature extractor.
  optional float depth_multiplier = 2 [default = 1.0];

  // Minimum number of the channels in the feature extractor.
  optional int32 min_depth = 3 [default = 16];

  // Hyperparameters that affect the layers of feature extractor added on top
  // of the base feature extractor.
  optional Hyperparams conv_hyperparams = 4;

  // Normally, SSD feature extractors are constructed by reusing an existing
  // base feature extractor (that has its own hyperparams) and adding new layers
  // on top of it. `conv_hyperparams` above normally applies only to the new
  // layers while base feature extractor uses its own default hyperparams. If
  // this value is set to true, the base feature extractor's hyperparams will be
  // overridden with the `conv_hyperparams`.
  optional bool override_base_feature_extractor_hyperparams = 9
      [default = false];

  // The nearest multiple to zero-pad the input height and width dimensions to.
  // For example, if pad_to_multiple = 2, input dimensions are zero-padded
  // until the resulting dimensions are even.
  optional int32 pad_to_multiple = 5 [default = 1];

  // Whether to use explicit padding when extracting SSD multiresolution
  // features. This will also apply to the base feature extractor if a MobileNet
  // architecture is used.
  optional bool use_explicit_padding = 7 [default = false];

  // Whether to use depthwise separable convolutions to extract the additional
  // feature maps added by SSD.
  optional bool use_depthwise = 8 [default = false];

  // Feature Pyramid Networks config.
  optional FeaturePyramidNetworks fpn = 10;

  // If true, replace preprocess function of feature extractor with a
  // placeholder. This should only be used if all the image preprocessing steps
  // happen outside the graph.
  optional bool replace_preprocessor_with_placeholder = 11 [default = false];
}

// Configuration for Feature Pyramid Networks.
message FeaturePyramidNetworks {
  // We recommend to use multi_resolution_feature_map_generator with FPN, and
  // the levels there must match the levels defined below for better
  // performance.
  // Correspondence from FPN levels to Resnet/Mobilenet V1 feature maps:
  //
  //   FPN Level   Resnet Feature Map   Mobilenet-V1 Feature Map
  //       2       Block 1              Conv2d_3_pointwise
  //       3       Block 2              Conv2d_5_pointwise
  //       4       Block 3              Conv2d_11_pointwise
  //       5       Block 4              Conv2d_13_pointwise
  //       6       Bottomup_5           bottom_up_Conv2d_14
  //       7       Bottomup_6           bottom_up_Conv2d_15
  //       8       Bottomup_7           bottom_up_Conv2d_16
  //       9       Bottomup_8           bottom_up_Conv2d_17

  // Minimum level in feature pyramid.
  optional int32 min_level = 1 [default = 3];

  // Maximum level in feature pyramid.
  optional int32 max_level = 2 [default = 7];

  // Channel depth for additional coarse feature layers.
  optional int32 additional_layer_depth = 3 [default = 256];
}