syntax = "proto2";

package object_detection.protos;

import "object_detection/protos/hyperparams.proto";

// Configuration proto for box predictor. See core/box_predictor.py for details.
// Exactly one of the predictor configurations below may be set.
message BoxPredictor {
  oneof box_predictor_oneof {
    ConvolutionalBoxPredictor convolutional_box_predictor = 1;
    MaskRCNNBoxPredictor mask_rcnn_box_predictor = 2;
    RfcnBoxPredictor rfcn_box_predictor = 3;
    WeightSharedConvolutionalBoxPredictor
        weight_shared_convolutional_box_predictor = 4;
  }
}

// Configuration proto for Convolutional box predictor.
// Next id: 13
message ConvolutionalBoxPredictor {
  // Hyperparameters for convolution ops used in the box predictor.
  optional Hyperparams conv_hyperparams = 1;

  // Minimum feature depth prior to predicting box encodings and class
  // predictions.
  optional int32 min_depth = 2 [default = 0];

  // Maximum feature depth prior to predicting box encodings and class
  // predictions. If max_depth is set to 0, no additional feature map will be
  // inserted before location and class predictions.
  optional int32 max_depth = 3 [default = 0];

  // Number of the additional conv layers before the predictor.
  optional int32 num_layers_before_predictor = 4 [default = 0];

  // Whether to use dropout for class prediction.
  optional bool use_dropout = 5 [default = true];

  // Keep probability for dropout.
  optional float dropout_keep_probability = 6 [default = 0.8];

  // Size of final convolution kernel. If the spatial resolution of the feature
  // map is smaller than the kernel size, then the kernel size is set to
  // min(feature_width, feature_height).
  optional int32 kernel_size = 7 [default = 1];

  // Size of the encoding for boxes.
  optional int32 box_code_size = 8 [default = 4];

  // Whether to apply sigmoid to the output of class predictions.
  // TODO(jonathanhuang): Do we need this since we have a post processing
  // module?
  optional bool apply_sigmoid_to_scores = 9 [default = false];

  // Bias initialization for class prediction.
  // NOTE(review): presumably the same meaning as the identically named field
  // in WeightSharedConvolutionalBoxPredictor; confirm against the predictor
  // implementation.
  optional float class_prediction_bias_init = 10 [default = 0.0];

  // Whether to use depthwise separable convolution for box predictor layers.
  optional bool use_depthwise = 11 [default = false];

  // If specified, apply clipping to box encodings.
  message BoxEncodingsClipRange {
    // Lower bound of the clipping range.
    optional float min = 1;
    // Upper bound of the clipping range.
    optional float max = 2;
  }
  optional BoxEncodingsClipRange box_encodings_clip_range = 12;
}

// Configuration proto for weight shared convolutional box predictor.
// Field numbers are intentionally non-contiguous; do not renumber them.
// Next id: 18
message WeightSharedConvolutionalBoxPredictor {
  // Hyperparameters for convolution ops used in the box predictor.
  optional Hyperparams conv_hyperparams = 1;

  // Number of the additional conv layers before the predictor.
  optional int32 num_layers_before_predictor = 4 [default = 0];

  // Output depth for the convolution ops prior to predicting box encodings
  // and class predictions.
  optional int32 depth = 2 [default = 0];

  // Size of final convolution kernel. If the spatial resolution of the feature
  // map is smaller than the kernel size, then the kernel size is set to
  // min(feature_width, feature_height).
  optional int32 kernel_size = 7 [default = 3];

  // Size of the encoding for boxes.
  optional int32 box_code_size = 8 [default = 4];

  // Bias initialization for class prediction. It has been shown to stabilize
  // training when there is a large number of negative boxes. See
  // https://arxiv.org/abs/1708.02002 for details.
  optional float class_prediction_bias_init = 10 [default = 0.0];

  // Whether to use dropout for class prediction.
  optional bool use_dropout = 11 [default = false];

  // Keep probability for dropout.
  optional float dropout_keep_probability = 12 [default = 0.8];

  // Whether to share the multi-layer tower between box prediction and class
  // prediction heads.
  optional bool share_prediction_tower = 13 [default = false];

  // Whether to use depthwise separable convolution for box predictor layers.
  optional bool use_depthwise = 14 [default = false];

  // Enum to specify how to convert the detection scores at inference time.
  enum ScoreConverter {
    // Input scores equals output scores.
    IDENTITY = 0;

    // Applies a sigmoid on input scores.
    SIGMOID = 1;
  }

  // Callable elementwise score converter at inference time.
  optional ScoreConverter score_converter = 16 [default = IDENTITY];

  // If specified, apply clipping to box encodings.
  message BoxEncodingsClipRange {
    // Lower bound of the clipping range.
    optional float min = 1;
    // Upper bound of the clipping range.
    optional float max = 2;
  }
  optional BoxEncodingsClipRange box_encodings_clip_range = 17;
}

// Configuration proto for the Mask R-CNN box predictor.
// TODO(alirezafathi): Refactor the proto file to be able to configure mask rcnn
// head easily.
// Next id: 15
message MaskRCNNBoxPredictor {
  // Hyperparameters for fully connected ops used in the box predictor.
  optional Hyperparams fc_hyperparams = 1;

  // Whether to use dropout op prior to both the box and class predictions.
  optional bool use_dropout = 2 [default = false];

  // Keep probability for dropout. This is only used if use_dropout is true.
  optional float dropout_keep_probability = 3 [default = 0.5];

  // Size of the encoding for the boxes.
  optional int32 box_code_size = 4 [default = 4];

  // Hyperparameters for convolution ops used in the box predictor.
  optional Hyperparams conv_hyperparams = 5;

  // Whether to predict instance masks inside detection boxes.
  optional bool predict_instance_masks = 6 [default = false];

  // The depth for the first conv2d_transpose op applied to the
  // image_features in the mask prediction branch. If set to 0, the value
  // will be set automatically based on the number of channels in the image
  // features and the number of classes.
  optional int32 mask_prediction_conv_depth = 7 [default = 256];

  // Whether to predict keypoints inside detection boxes.
  optional bool predict_keypoints = 8 [default = false];

  // The height and the width of the predicted mask.
  optional int32 mask_height = 9 [default = 15];
  optional int32 mask_width = 10 [default = 15];

  // The number of convolutions applied to image_features in the mask prediction
  // branch.
  optional int32 mask_prediction_num_conv_layers = 11 [default = 2];

  // NOTE(review): undocumented in the original. Presumably selects whether a
  // single mask is predicted independent of class instead of one mask per
  // class; confirm against the predictor implementation.
  optional bool masks_are_class_agnostic = 12 [default = false];

  // Whether to use one box for all classes rather than a different box for each
  // class.
  optional bool share_box_across_classes = 13 [default = false];

  // Whether to apply convolutions on mask features before upsampling using
  // nearest neighbor resizing.
  // By default, mask features are resized to [`mask_height`, `mask_width`]
  // before applying convolutions and predicting masks.
  optional bool convolve_then_upsample_masks = 14 [default = false];
}

// Configuration proto for the RFCN box predictor.
// Next id: 8
message RfcnBoxPredictor {
  // Hyperparameters for convolution ops used in the box predictor.
  optional Hyperparams conv_hyperparams = 1;

  // Bin sizes for RFCN crops.
  optional int32 num_spatial_bins_height = 2 [default = 3];
  optional int32 num_spatial_bins_width = 3 [default = 3];

  // Target depth to reduce the input image features to.
  optional int32 depth = 4 [default = 1024];

  // Size of the encoding for the boxes.
  optional int32 box_code_size = 5 [default = 4];

  // Size to resize the rfcn crops to.
  optional int32 crop_height = 6 [default = 12];
  optional int32 crop_width = 7 [default = 12];
}