|
|
- # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- # ==============================================================================
- """R-FCN meta-architecture definition.
-
- R-FCN: Dai, Jifeng, et al. "R-FCN: Object Detection via Region-based
- Fully Convolutional Networks." arXiv preprint arXiv:1605.06409 (2016).
-
- The R-FCN meta architecture is similar to Faster R-CNN and only differs in the
- second stage. Hence this class inherits FasterRCNNMetaArch and overrides only
- the `_predict_second_stage` method.
-
- Similar to Faster R-CNN we allow for two modes: number_of_stages=1 and
- number_of_stages=2. In the former setting, all of the user facing methods
- (e.g., predict, postprocess, loss) can be used as if the model consisted
- only of the RPN, returning class agnostic proposals (these can be thought of as
- approximate detections with no associated class information). In the latter
- setting, proposals are computed, then passed through a second stage
- "box classifier" to yield (multi-class) detections.
-
- Implementations of R-FCN models must define a new FasterRCNNFeatureExtractor and
- override three methods: `preprocess`, `_extract_proposal_features` (the first
- stage of the model), and `_extract_box_classifier_features` (the second stage of
- the model). Optionally, the `restore_fn` method can be overridden. See tests
- for an example.
-
- See notes in the documentation of Faster R-CNN meta-architecture as they all
- apply here.
- """
- import tensorflow as tf
-
- from object_detection.core import box_predictor
- from object_detection.meta_architectures import faster_rcnn_meta_arch
- from object_detection.utils import ops
-
-
class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
  """R-FCN model expressed as a variation of the Faster R-CNN meta-arch.

  Everything except the second stage is inherited from `FasterRCNNMetaArch`.
  The second stage is replaced by overriding `_predict_second_stage`, which
  hands the box classifier features and the RPN proposals to an R-FCN box
  predictor.
  """

  def __init__(self,
               is_training,
               num_classes,
               image_resizer_fn,
               feature_extractor,
               number_of_stages,
               first_stage_anchor_generator,
               first_stage_target_assigner,
               first_stage_atrous_rate,
               first_stage_box_predictor_arg_scope_fn,
               first_stage_box_predictor_kernel_size,
               first_stage_box_predictor_depth,
               first_stage_minibatch_size,
               first_stage_sampler,
               first_stage_non_max_suppression_fn,
               first_stage_max_proposals,
               first_stage_localization_loss_weight,
               first_stage_objectness_loss_weight,
               crop_and_resize_fn,
               second_stage_target_assigner,
               second_stage_rfcn_box_predictor,
               second_stage_batch_size,
               second_stage_sampler,
               second_stage_non_max_suppression_fn,
               second_stage_score_conversion_fn,
               second_stage_localization_loss_weight,
               second_stage_classification_loss_weight,
               second_stage_classification_loss,
               hard_example_miner,
               parallel_iterations=16,
               add_summaries=True,
               clip_anchors_to_image=False,
               use_static_shapes=False,
               resize_masks=False):
    """Builds an R-FCN meta-architecture.

    All arguments other than `second_stage_rfcn_box_predictor` are forwarded
    positionally to the `FasterRCNNMetaArch` constructor; the parent slots
    that have no meaning for R-FCN are filled with placeholder values.

    Args:
      is_training: Boolean; whether to construct the training version of the
        computation graph.
      num_classes: Number of classes, *excluding* the background category.
        If groundtruth labels take values in {0, 1, .., K-1} then
        num_classes=K (not K+1), even though assigned classification targets
        can range over {0, ..., K}.
      image_resizer_fn: Callable used to resize images. It always receives a
        rank-3 tensor holding a single image and returns a rank-3 image
        tensor whose spatial dimensions may differ. See
        builders/image_resizer_builder.py.
      feature_extractor: A FasterRCNNFeatureExtractor object.
      number_of_stages: Either 1 or 2. With 1, only the Region Proposal
        Network (RPN) part of the model is constructed.
      first_stage_anchor_generator: An anchor_generator.AnchorGenerator
        object (currently only grid_anchor_generator.GridAnchorGenerator
        objects are supported).
      first_stage_target_assigner: Target assigner for the first stage of
        R-FCN (the RPN).
      first_stage_atrous_rate: Single integer atrous rate for the one
        convolution op applied to the `rpn_features_to_crop` tensor to get
        the tensor used for box prediction. Some feature extractors can
        produce feature maps at denser resolutions; the atrous rate
        compensates by giving an effectively larger receptive field.
        Typically this should be 1.
      first_stage_box_predictor_arg_scope_fn: Function producing the tf-slim
        arg_scope for the conv2d, separable_conv2d and fully_connected ops
        of the RPN box predictor.
      first_stage_box_predictor_kernel_size: Kernel size of the convolution
        op immediately preceding the RPN box predictions.
      first_stage_box_predictor_depth: Output depth of the convolution op
        immediately preceding the RPN box predictions.
      first_stage_minibatch_size: The "batch size" used when computing the
        objectness and location loss of the RPN, i.e. the number of anchors
        per image that contribute to the loss. It is called a "batch size"
        only because the Faster R-CNN paper uses that term.
      first_stage_sampler: Sampler for the boxes used to compute the RPN
        loss after the first stage.
      first_stage_non_max_suppression_fn: A
        batch_multiclass_non_max_suppression callable taking `boxes`,
        `scores` and an optional `clip_window` (all other inputs already
        bound). It returns a dictionary of tensors keyed by
        `detection_boxes`, `detection_scores`, `detection_classes` and
        `num_detections`, and performs non-max suppression on the boxes
        predicted by the RPN. See
        `post_processing.batch_multiclass_non_max_suppression` for the type
        and shape of these tensors.
      first_stage_max_proposals: Maximum number of boxes kept after running
        Non-Max Suppression (NMS) on the boxes predicted by the RPN.
      first_stage_localization_loss_weight: A float.
      first_stage_objectness_loss_weight: A float.
      crop_and_resize_fn: Differentiable resampler for cropping RPN proposal
        features.
      second_stage_target_assigner: Target assigner for the second stage of
        R-FCN. When the model has multiple prediction heads, this assigner
        generates the targets for every head (with the correct
        `unmatched_class_label`).
      second_stage_rfcn_box_predictor: R-FCN box predictor used for the
        second stage. This is the only argument stored on this class rather
        than forwarded to the parent constructor.
      second_stage_batch_size: The "batch size" used when computing the
        classification and refined location loss of the box classifier,
        i.e. the number of proposals per image that contribute to the loss.
        It is called a "batch size" only because the Faster R-CNN paper uses
        that term.
      second_stage_sampler: Sampler for the boxes used by the second stage
        box classifier.
      second_stage_non_max_suppression_fn: A
        batch_multiclass_non_max_suppression callable taking `boxes`,
        `scores`, an optional `clip_window` and an optional (kwarg) `mask`
        input (all other inputs already bound). It returns a dictionary of
        tensors keyed by `detection_boxes`, `detection_scores`,
        `detection_classes`, `num_detections` and (optionally)
        `detection_masks`. See
        `post_processing.batch_multiclass_non_max_suppression` for the type
        and shape of these tensors.
      second_stage_score_conversion_fn: Callable elementwise nonlinearity
        mapping tensors to tensors, usually used to turn logits into
        probabilities.
      second_stage_localization_loss_weight: A float.
      second_stage_classification_loss_weight: A float.
      second_stage_classification_loss: String naming the loss function;
        'softmax' and 'sigmoid' are supported.
      hard_example_miner: A losses.HardExampleMiner object (may be None).
      parallel_iterations: (Optional) Number of iterations allowed to run in
        parallel for calls to tf.map_fn.
      add_summaries: Boolean (default: True) controlling whether summary ops
        are added to the tensorflow graph.
      clip_anchors_to_image: Whether generated anchors are clipped to the
        window size without filtering out the non-overlapping anchors, which
        yields a static number of anchors. Described as unused; here it is
        only forwarded to the parent constructor.
      use_static_shapes: If True, uses implementations of ops with static
        shape guarantees.
      resize_masks: Whether masks present in the groundtruth should be
        resized in the model with `image_resizer_fn`.

    Raises:
      ValueError: If `second_stage_batch_size` > `first_stage_max_proposals`.
      ValueError: If first_stage_anchor_generator is not of type
        grid_anchor_generator.GridAnchorGenerator.
    """
    # TODO(rathodv): add_summaries and crop_and_resize_fn are currently
    # unused. Respect those directives in the future.

    # Placeholder values for parent-constructor slots that R-FCN never uses.
    unused_initial_crop_size = None
    unused_maxpool_kernel_size = None
    unused_maxpool_stride = None
    unused_fully_connected_box_predictor = None
    unused_mask_prediction_loss_weight = 1.0

    # The parent signature is not visible from here, so the arguments stay
    # strictly positional and in their original order.
    super(RFCNMetaArch, self).__init__(
        is_training,
        num_classes,
        image_resizer_fn,
        feature_extractor,
        number_of_stages,
        first_stage_anchor_generator,
        first_stage_target_assigner,
        first_stage_atrous_rate,
        first_stage_box_predictor_arg_scope_fn,
        first_stage_box_predictor_kernel_size,
        first_stage_box_predictor_depth,
        first_stage_minibatch_size,
        first_stage_sampler,
        first_stage_non_max_suppression_fn,
        first_stage_max_proposals,
        first_stage_localization_loss_weight,
        first_stage_objectness_loss_weight,
        crop_and_resize_fn,
        unused_initial_crop_size,
        unused_maxpool_kernel_size,
        unused_maxpool_stride,
        second_stage_target_assigner,
        unused_fully_connected_box_predictor,
        second_stage_batch_size,
        second_stage_sampler,
        second_stage_non_max_suppression_fn,
        second_stage_score_conversion_fn,
        second_stage_localization_loss_weight,
        second_stage_classification_loss_weight,
        second_stage_classification_loss,
        unused_mask_prediction_loss_weight,
        hard_example_miner,
        parallel_iterations,
        add_summaries,
        clip_anchors_to_image,
        use_static_shapes,
        resize_masks)

    self._rfcn_box_predictor = second_stage_rfcn_box_predictor

  def _predict_second_stage(self, rpn_box_encodings,
                            rpn_objectness_predictions_with_background,
                            rpn_features,
                            anchors,
                            image_shape,
                            true_image_shapes):
    """Computes the raw second-stage (R-FCN) prediction tensors.

    The RPN outputs are post-processed into normalized proposals, box
    classifier features are extracted from `rpn_features`, and both are
    passed to the R-FCN box predictor.

    Args:
      rpn_box_encodings: 3-D float tensor of shape
        [batch_size, num_valid_anchors, self._box_coder.code_size] holding
        the predicted boxes.
      rpn_objectness_predictions_with_background: 3-D float tensor of shape
        [batch_size, num_valid_anchors, 2] holding class predictions
        (logits) for each anchor. Background class predictions are
        *included* (at class index 0).
      rpn_features: 4-D float32 tensor of shape
        [batch_size, height, width, depth] with the image features from the
        RPN.
      anchors: 2-D float tensor of shape
        [num_anchors, self._box_coder.code_size].
      image_shape: 1-D int32 tensor of size [4] holding the image shape.
      true_image_shapes: int32 tensor of shape [batch, 3]; each row is
        [height, width, channels] and gives the shape of the true image
        inside the resized image, since resized images can be zero padded.

    Returns:
      prediction_dict: a dictionary of "raw" prediction tensors:
        1) refined_box_encodings: a 3-D tensor of shape
          [total_num_proposals, num_classes, 4] with the predicted (final)
          refined box encodings, where
          total_num_proposals=batch_size*self._max_num_proposals.
        2) class_predictions_with_background: a 2-D tensor of shape
          [total_num_proposals, num_classes + 1] with class predictions
          (logits) for each of the anchors, where
          total_num_proposals=batch_size*self._max_num_proposals.
          Background class predictions are *included* (at class index 0).
        3) num_proposals: an int32 tensor of shape [batch_size] with the
          number of proposals generated by the RPN. Because the proposals of
          every image are padded to `self.max_num_proposals`, this records
          which entries are real and which are zero padding.
        4) proposal_boxes: a float32 tensor of shape
          [batch_size, self.max_num_proposals, 4] with the decoded proposal
          bounding boxes in absolute coordinates.
        5) proposal_boxes_normalized: a float32 tensor of shape
          [batch_size, self.max_num_proposals, 4] with the decoded proposal
          bounding boxes in normalized coordinates. It can be used to
          override the boxes proposed by the RPN, which allows extracting
          box classification and prediction for externally selected areas
          of the image.
        6) box_classifier_features: a 4-D float32 tensor of shape
          [batch_size, feature_map_height, feature_map_width, depth] with
          the box classifier features.
    """
    # Repeat the trailing three entries of `image_shape` once per
    # `image_shape[0]` to obtain a 2-D shape tensor (one row per image,
    # assuming image_shape is [batch, height, width, channels]).
    single_image_shape = tf.expand_dims(image_shape[1:], 0)
    tile_multiples = [image_shape[0], 1]
    image_shape_2d = tf.tile(single_image_shape, tile_multiples)

    # Only the normalized proposals and their per-image counts are needed;
    # the remaining three outputs of the RPN post-processing are discarded.
    rpn_outputs = self._postprocess_rpn(
        rpn_box_encodings,
        rpn_objectness_predictions_with_background,
        anchors,
        image_shape_2d,
        true_image_shapes)
    proposal_boxes_normalized, _, num_proposals, _, _ = rpn_outputs

    extractor = self._feature_extractor
    box_classifier_features = extractor.extract_box_classifier_features(
        rpn_features, scope=self.second_stage_feature_extractor_scope)

    # The predictor is invoked differently depending on whether it is a
    # Keras model (called directly) or not (explicit `predict` with scope).
    predictor = self._rfcn_box_predictor
    predictor_inputs = [box_classifier_features]
    if not predictor.is_keras_model:
      box_predictions = predictor.predict(
          predictor_inputs,
          num_predictions_per_location=[1],
          scope=self.second_stage_box_predictor_scope,
          proposal_boxes=proposal_boxes_normalized)
    else:
      box_predictions = predictor(
          predictor_inputs, proposal_boxes=proposal_boxes_normalized)

    # Join the per-feature-map outputs along axis 1, then drop that axis.
    box_encoding_outputs = box_predictions[box_predictor.BOX_ENCODINGS]
    refined_box_encodings = tf.squeeze(
        tf.concat(box_encoding_outputs, axis=1), axis=1)

    class_prediction_outputs = box_predictions[
        box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
    class_predictions_with_background = tf.squeeze(
        tf.concat(class_prediction_outputs, axis=1), axis=1)

    absolute_proposal_boxes = ops.normalized_to_image_coordinates(
        proposal_boxes_normalized,
        image_shape,
        parallel_iterations=self._parallel_iterations)

    return {
        'refined_box_encodings': refined_box_encodings,
        'class_predictions_with_background':
            class_predictions_with_background,
        'num_proposals': num_proposals,
        'proposal_boxes': absolute_proposal_boxes,
        'box_classifier_features': box_classifier_features,
        'proposal_boxes_normalized': proposal_boxes_normalized,
    }
|