syntax = "proto2";

package object_detection.protos;

import "object_detection/protos/anchor_generator.proto";
import "object_detection/protos/box_coder.proto";
import "object_detection/protos/box_predictor.proto";
import "object_detection/protos/hyperparams.proto";
import "object_detection/protos/image_resizer.proto";
import "object_detection/protos/losses.proto";
import "object_detection/protos/matcher.proto";
import "object_detection/protos/post_processing.proto";
import "object_detection/protos/region_similarity_calculator.proto";

// Configuration for Single Shot Detection (SSD) models.
// Next id: 26
message Ssd {
  // Field numbers 17-20 are unassigned in this message and are skipped by the
  // "Next id" counter above. They are reserved so that they cannot be reused
  // by accident.
  // NOTE(review): presumably these belonged to retired fields; confirm against
  // the file history.
  reserved 17 to 20;

  // Number of classes to predict.
  optional int32 num_classes = 1;

  // Image resizer for preprocessing the input image.
  optional ImageResizer image_resizer = 2;

  // Feature extractor config.
  optional SsdFeatureExtractor feature_extractor = 3;

  // Box coder to encode the boxes.
  optional BoxCoder box_coder = 4;

  // Matcher to match groundtruth with anchors.
  optional Matcher matcher = 5;

  // Region similarity calculator to compute similarity of boxes.
  optional RegionSimilarityCalculator similarity_calculator = 6;

  // Whether background targets are to be encoded as an all-zeros vector
  // (true) or as a one-hot vector where background is the 0th class (false).
  optional bool encode_background_as_zeros = 12 [default = false];

  // Classification weight to be associated to negative anchors
  // (default: 1.0). The weight must be in [0., 1.].
  optional float negative_class_weight = 13 [default = 1.0];

  // Box predictor to attach to the features.
  optional BoxPredictor box_predictor = 7;

  // Anchor generator to compute anchors.
  optional AnchorGenerator anchor_generator = 8;

  // Post processing to apply on the predictions.
  optional PostProcessing post_processing = 9;

  // Whether to normalize the loss by the number of groundtruth boxes that
  // match to the anchors.
  optional bool normalize_loss_by_num_matches = 10 [default = true];

  // Whether to normalize the localization loss by the code size of the box
  // encodings. This is applied along with other normalization factors.
  optional bool normalize_loc_loss_by_codesize = 14 [default = false];

  // Loss configuration for training.
  optional Loss loss = 11;

  // Whether to freeze batch norm parameters during training, i.e. to NOT
  // update them. When training with a relatively small batch size (e.g. 1),
  // it is desirable to disable batch norm update and use pretrained batch
  // norm params.
  // NOTE(review): polarity is taken from the field name; the previous comment
  // read "whether to update". Confirm against the code that consumes this.
  //
  // Note: Some feature extractors are used with canned arg_scopes
  // (e.g. resnet arg scopes). In these cases training behavior of batch norm
  // variables may depend on both values of `batch_norm_trainable` and
  // `is_training`.
  // NOTE(review): `batch_norm_trainable` is not a field of this message;
  // presumably an earlier name for this setting - confirm.
  //
  // When canned arg_scopes are used with feature extractors,
  // `conv_hyperparams` will apply only to the additional layers that are
  // added and are outside the canned arg_scope.
  optional bool freeze_batchnorm = 16 [default = false];

  // Whether to update batch_norm inplace during training. This is required
  // for batch norm to work correctly on TPUs. When this is false, the user
  // must add a control dependency on tf.GraphKeys.UPDATE_OPS for the
  // train/loss op in order to update the batch norm moving average
  // parameters.
  optional bool inplace_batchnorm_update = 15 [default = false];

  // Whether to add an implicit background class to one-hot encodings of
  // groundtruth labels. Set to false if training a single class model or
  // using an explicit background class.
  optional bool add_background_class = 21 [default = true];

  // Whether to use an explicit background class. Set to true if using
  // groundtruth labels with an explicit background class, as in multiclass
  // scores.
  optional bool explicit_background_class = 24 [default = false];

  // NOTE(review): undocumented in the original. Presumably selects
  // groundtruth confidences, rather than hard labels, as the classification
  // targets - confirm against the model builder.
  optional bool use_confidences_as_targets = 22 [default = false];

  // NOTE(review): undocumented in the original. Presumably the weight applied
  // to implicit examples - confirm against the model builder.
  optional float implicit_example_weight = 23 [default = 1.0];

  // Configuration proto for MaskHead.
  // Next id: 11
  message MaskHead {
    // The height and the width of the predicted mask. Only used when
    // predict_instance_masks is true (that setting is not defined in this
    // message).
    optional int32 mask_height = 1 [default = 15];
    optional int32 mask_width = 2 [default = 15];

    // Whether to predict class agnostic masks. Only used when
    // predict_instance_masks is true.
    optional bool masks_are_class_agnostic = 3 [default = true];

    // The depth for the first conv2d_transpose op applied to the
    // image_features in the mask prediction branch. If set to 0, the value
    // will be set automatically based on the number of channels in the image
    // features and the number of classes.
    optional int32 mask_prediction_conv_depth = 4 [default = 256];

    // The number of convolutions applied to image_features in the mask
    // prediction branch.
    optional int32 mask_prediction_num_conv_layers = 5 [default = 2];

    // Whether to apply convolutions on mask features before upsampling using
    // nearest neighbor resizing.
    // By default, mask features are resized to [`mask_height`, `mask_width`]
    // before applying convolutions and predicting masks.
    optional bool convolve_then_upsample_masks = 6 [default = false];

    // Mask loss weight.
    optional float mask_loss_weight = 7 [default = 5.0];

    // Number of boxes to be generated at training time for computing mask
    // loss.
    optional int32 mask_loss_sample_size = 8 [default = 16];

    // Hyperparameters for convolution ops used in the box predictor.
    // NOTE(review): the wording says "box predictor" although this field
    // lives in the mask head config - confirm which ops it applies to.
    optional Hyperparams conv_hyperparams = 9;

    // Output size (width and height are set to be the same) of the initial
    // bilinear interpolation based cropping during ROI pooling. Only used
    // when we have second stage prediction head enabled (e.g. mask head).
    optional int32 initial_crop_size = 10 [default = 15];
  }

  // Configs for mask head.
  optional MaskHead mask_head_config = 25;
}

// Configuration for the feature extractor of an SSD model.
message SsdFeatureExtractor {
  reserved 6;

  // Type of ssd feature extractor.
  optional string type = 1;

  // The factor to alter the depth of the channels in the feature extractor.
  optional float depth_multiplier = 2 [default = 1.0];

  // Minimum number of the channels in the feature extractor.
  optional int32 min_depth = 3 [default = 16];

  // Hyperparameters that affect the layers of feature extractor added on top
  // of the base feature extractor.
  optional Hyperparams conv_hyperparams = 4;

  // Normally, SSD feature extractors are constructed by reusing an existing
  // base feature extractor (that has its own hyperparams) and adding new
  // layers on top of it. `conv_hyperparams` above normally applies only to
  // the new layers while the base feature extractor uses its own default
  // hyperparams. If this value is set to true, the base feature extractor's
  // hyperparams will be overridden with the `conv_hyperparams`.
  optional bool override_base_feature_extractor_hyperparams = 9
      [default = false];

  // The nearest multiple to zero-pad the input height and width dimensions
  // to. For example, if pad_to_multiple = 2, input dimensions are zero-padded
  // until the resulting dimensions are even.
  optional int32 pad_to_multiple = 5 [default = 1];

  // Whether to use explicit padding when extracting SSD multiresolution
  // features. This will also apply to the base feature extractor if a
  // MobileNet architecture is used.
  optional bool use_explicit_padding = 7 [default = false];

  // Whether to use depthwise separable convolutions to extract the
  // additional feature maps added by SSD.
  optional bool use_depthwise = 8 [default = false];

  // Feature Pyramid Networks config.
  optional FeaturePyramidNetworks fpn = 10;

  // If true, replace the preprocess function of the feature extractor with a
  // placeholder. This should only be used if all the image preprocessing
  // steps happen outside the graph.
  optional bool replace_preprocessor_with_placeholder = 11 [default = false];
}

// Configuration for Feature Pyramid Networks.
message FeaturePyramidNetworks {
  // We recommend using multi_resolution_feature_map_generator with FPN; the
  // levels there must match the levels defined below for better performance.
  //
  // Correspondence from FPN levels to Resnet/Mobilenet V1 feature maps:
  //
  //   FPN Level   Resnet Feature Map   Mobilenet-V1 Feature Map
  //   ---------   ------------------   ------------------------
  //       2       Block 1              Conv2d_3_pointwise
  //       3       Block 2              Conv2d_5_pointwise
  //       4       Block 3              Conv2d_11_pointwise
  //       5       Block 4              Conv2d_13_pointwise
  //       6       Bottomup_5           bottom_up_Conv2d_14
  //       7       Bottomup_6           bottom_up_Conv2d_15
  //       8       Bottomup_7           bottom_up_Conv2d_16
  //       9       Bottomup_8           bottom_up_Conv2d_17

  // Minimum level in the feature pyramid.
  optional int32 min_level = 1 [default = 3];

  // Maximum level in the feature pyramid.
  optional int32 max_level = 2 [default = 7];

  // Channel depth for additional coarse feature layers.
  optional int32 additional_layer_depth = 3 [default = 256];
}