# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
  15. """R-FCN meta-architecture definition.
  16. R-FCN: Dai, Jifeng, et al. "R-FCN: Object Detection via Region-based
  17. Fully Convolutional Networks." arXiv preprint arXiv:1605.06409 (2016).
  18. The R-FCN meta architecture is similar to Faster R-CNN and only differs in the
  19. second stage. Hence this class inherits FasterRCNNMetaArch and overrides only
  20. the `_predict_second_stage` method.
  21. Similar to Faster R-CNN we allow for two modes: number_of_stages=1 and
  22. number_of_stages=2. In the former setting, all of the user facing methods
  23. (e.g., predict, postprocess, loss) can be used as if the model consisted
  24. only of the RPN, returning class agnostic proposals (these can be thought of as
  25. approximate detections with no associated class information). In the latter
  26. setting, proposals are computed, then passed through a second stage
  27. "box classifier" to yield (multi-class) detections.
  28. Implementations of R-FCN models must define a new FasterRCNNFeatureExtractor and
  29. override three methods: `preprocess`, `_extract_proposal_features` (the first
  30. stage of the model), and `_extract_box_classifier_features` (the second stage of
  31. the model). Optionally, the `restore_fn` method can be overridden. See tests
  32. for an example.
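
Example (a minimal, hypothetical sketch of such a feature extractor; the class
name and method bodies below are illustrative assumptions, not part of this
library):

  class MyRFCNFeatureExtractor(
      faster_rcnn_meta_arch.FasterRCNNFeatureExtractor):

    def preprocess(self, resized_inputs):
      # Map raw pixel values to the [-1, 1] range expected by the network.
      return (2.0 / 255.0) * resized_inputs - 1.0

    def _extract_proposal_features(self, preprocessed_inputs, scope):
      # First stage: return the feature map the RPN predicts from.
      ...

    def _extract_box_classifier_features(self, proposal_feature_maps, scope):
      # Second stage: return features for the position-sensitive box
      # predictor.
      ...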

See notes in the documentation of the Faster R-CNN meta-architecture, as they
all apply here.
"""

import tensorflow as tf

from object_detection.core import box_predictor
from object_detection.meta_architectures import faster_rcnn_meta_arch
from object_detection.utils import ops


class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
  """R-FCN Meta-architecture definition."""

  def __init__(self,
               is_training,
               num_classes,
               image_resizer_fn,
               feature_extractor,
               number_of_stages,
               first_stage_anchor_generator,
               first_stage_target_assigner,
               first_stage_atrous_rate,
               first_stage_box_predictor_arg_scope_fn,
               first_stage_box_predictor_kernel_size,
               first_stage_box_predictor_depth,
               first_stage_minibatch_size,
               first_stage_sampler,
               first_stage_non_max_suppression_fn,
               first_stage_max_proposals,
               first_stage_localization_loss_weight,
               first_stage_objectness_loss_weight,
               crop_and_resize_fn,
               second_stage_target_assigner,
               second_stage_rfcn_box_predictor,
               second_stage_batch_size,
               second_stage_sampler,
               second_stage_non_max_suppression_fn,
               second_stage_score_conversion_fn,
               second_stage_localization_loss_weight,
               second_stage_classification_loss_weight,
               second_stage_classification_loss,
               hard_example_miner,
               parallel_iterations=16,
               add_summaries=True,
               clip_anchors_to_image=False,
               use_static_shapes=False,
               resize_masks=False):
  76. """RFCNMetaArch Constructor.
  77. Args:
  78. is_training: A boolean indicating whether the training version of the
  79. computation graph should be constructed.
  80. num_classes: Number of classes. Note that num_classes *does not*
  81. include the background category, so if groundtruth labels take values
  82. in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the
  83. assigned classification targets can range from {0,... K}).
  84. image_resizer_fn: A callable for image resizing. This callable always
  85. takes a rank-3 image tensor (corresponding to a single image) and
  86. returns a rank-3 image tensor, possibly with new spatial dimensions.
  87. See builders/image_resizer_builder.py.
  88. feature_extractor: A FasterRCNNFeatureExtractor object.
      number_of_stages: Valid values are {1, 2}. If 1, only the Region
        Proposal Network (RPN) part of the model will be constructed.
      first_stage_anchor_generator: An anchor_generator.AnchorGenerator object
        (note that currently we only support
        grid_anchor_generator.GridAnchorGenerator objects).
      first_stage_target_assigner: Target assigner to use for first stage of
        R-FCN (RPN).
      first_stage_atrous_rate: A single integer indicating the atrous rate for
        the single convolution op which is applied to the
        `rpn_features_to_crop` tensor to obtain a tensor to be used for box
        prediction. Some feature extractors optionally allow for producing
        feature maps computed at denser resolutions. The atrous rate is used
        to compensate for the denser feature maps by using an effectively
        larger receptive field. (This should typically be set to 1.)
      first_stage_box_predictor_arg_scope_fn: A function to generate tf-slim
        arg_scope for conv2d, separable_conv2d and fully_connected ops for the
        RPN box predictor.
      first_stage_box_predictor_kernel_size: Kernel size to use for the
        convolution op just prior to RPN box predictions.
      first_stage_box_predictor_depth: Output depth for the convolution op
        just prior to RPN box predictions.
      first_stage_minibatch_size: The "batch size" to use for computing the
        objectness and location loss of the region proposal network. This
        "batch size" refers to the number of anchors selected as contributing
        to the loss function for any given image within the image batch and is
        only called "batch_size" due to terminology from the Faster R-CNN
        paper.
      first_stage_sampler: The sampler for the boxes used to calculate the RPN
        loss after the first stage.
      first_stage_non_max_suppression_fn: batch_multiclass_non_max_suppression
        callable that takes `boxes`, `scores` and optional `clip_window` (with
        all other inputs already set) and returns a dictionary containing
        tensors with keys: `detection_boxes`, `detection_scores`,
        `detection_classes`, `num_detections`. This is used to perform non max
        suppression on the boxes predicted by the Region Proposal Network
        (RPN).
        See `post_processing.batch_multiclass_non_max_suppression` for the
        type and shape of these tensors.
      first_stage_max_proposals: Maximum number of boxes to retain after
        performing Non-Max Suppression (NMS) on the boxes predicted by the
        Region Proposal Network (RPN).
      first_stage_localization_loss_weight: A float.
      first_stage_objectness_loss_weight: A float.
      crop_and_resize_fn: A differentiable resampler to use for cropping RPN
        proposal features.
      second_stage_target_assigner: Target assigner to use for second stage of
        R-FCN. If the model is configured with multiple prediction heads, this
        target assigner is used to generate targets for all heads (with the
        correct `unmatched_class_label`).
      second_stage_rfcn_box_predictor: RFCN box predictor to use for
        second stage.
      second_stage_batch_size: The batch size used for computing the
        classification and refined location loss of the box classifier. This
        "batch size" refers to the number of proposals selected as
        contributing to the loss function for any given image within the image
        batch and is only called "batch_size" due to terminology from the
        Faster R-CNN paper.
      second_stage_sampler: The sampler for the boxes used for second stage
        box classifier.
      second_stage_non_max_suppression_fn: batch_multiclass_non_max_suppression
        callable that takes `boxes`, `scores`, optional `clip_window` and
        optional (kwarg) `mask` inputs (with all other inputs already set)
        and returns a dictionary containing tensors with keys:
        `detection_boxes`, `detection_scores`, `detection_classes`,
        `num_detections`, and (optionally) `detection_masks`. See
        `post_processing.batch_multiclass_non_max_suppression` for the type
        and shape of these tensors.
      second_stage_score_conversion_fn: Callable elementwise nonlinearity
        (that takes tensors as inputs and returns tensors). This is usually
        used to convert logits to probabilities.
      second_stage_localization_loss_weight: A float.
      second_stage_classification_loss_weight: A float.
      second_stage_classification_loss: A string indicating which loss
        function to use; supports 'softmax' and 'sigmoid'.
      hard_example_miner: A losses.HardExampleMiner object (can be None).
      parallel_iterations: (Optional) The number of iterations allowed to run
        in parallel for calls to tf.map_fn.
      add_summaries: boolean (default: True) controlling whether summary ops
        should be added to the TensorFlow graph.
      clip_anchors_to_image: If True, the generated anchors are clipped to the
        image window without filtering non-overlapping anchors, which yields a
        static number of anchors. This argument is unused.
      use_static_shapes: If True, uses implementation of ops with static shape
        guarantees.
      resize_masks: Indicates whether the masks present in the groundtruth
        should be resized in the model with `image_resizer_fn`.

    Raises:
      ValueError: If `second_stage_batch_size` > `first_stage_max_proposals`.
      ValueError: If first_stage_anchor_generator is not of type
        grid_anchor_generator.GridAnchorGenerator.
    """
    # TODO(rathodv): add_summaries and crop_and_resize_fn are currently
    # unused. Respect that directive in the future.
    super(RFCNMetaArch, self).__init__(
        is_training,
        num_classes,
        image_resizer_fn,
        feature_extractor,
        number_of_stages,
        first_stage_anchor_generator,
        first_stage_target_assigner,
        first_stage_atrous_rate,
        first_stage_box_predictor_arg_scope_fn,
        first_stage_box_predictor_kernel_size,
        first_stage_box_predictor_depth,
        first_stage_minibatch_size,
        first_stage_sampler,
        first_stage_non_max_suppression_fn,
        first_stage_max_proposals,
        first_stage_localization_loss_weight,
        first_stage_objectness_loss_weight,
        crop_and_resize_fn,
        None,  # initial_crop_size is not used in R-FCN.
        None,  # maxpool_kernel_size is not used in R-FCN.
        None,  # maxpool_stride is not used in R-FCN.
        second_stage_target_assigner,
        None,  # fully_connected_box_predictor is not used in R-FCN.
        second_stage_batch_size,
        second_stage_sampler,
        second_stage_non_max_suppression_fn,
        second_stage_score_conversion_fn,
        second_stage_localization_loss_weight,
        second_stage_classification_loss_weight,
        second_stage_classification_loss,
        1.0,  # second stage mask prediction loss weight isn't used in R-FCN.
        hard_example_miner,
        parallel_iterations,
        add_summaries,
        clip_anchors_to_image,
        use_static_shapes,
        resize_masks)
    self._rfcn_box_predictor = second_stage_rfcn_box_predictor

  def _predict_second_stage(self, rpn_box_encodings,
                            rpn_objectness_predictions_with_background,
                            rpn_features,
                            anchors,
                            image_shape,
                            true_image_shapes):
  225. """Predicts the output tensors from 2nd stage of R-FCN.
  226. Args:
  227. rpn_box_encodings: 3-D float tensor of shape
  228. [batch_size, num_valid_anchors, self._box_coder.code_size] containing
  229. predicted boxes.
  230. rpn_objectness_predictions_with_background: 3-D float tensor of shape
  231. [batch_size, num_valid_anchors, 2] containing class
  232. predictions (logits) for each of the anchors. Note that this
  233. tensor *includes* background class predictions (at class index 0).
  234. rpn_features: A 4-D float32 tensor with shape
  235. [batch_size, height, width, depth] representing image features from the
  236. RPN.
  237. anchors: 2-D float tensor of shape
  238. [num_anchors, self._box_coder.code_size].
      image_shape: A 1-D int32 tensor of size [4] containing the image shape.
      true_image_shapes: int32 tensor of shape [batch, 3] where each row is
        of the form [height, width, channels] indicating the shapes
        of true images in the resized images, as resized images can be padded
        with zeros.

    Returns:
      prediction_dict: a dictionary holding "raw" prediction tensors:
        1) refined_box_encodings: a 3-D tensor with shape
          [total_num_proposals, num_classes, 4] representing predicted
          (final) refined box encodings, where
          total_num_proposals=batch_size*self._max_num_proposals.
        2) class_predictions_with_background: a 2-D tensor with shape
          [total_num_proposals, num_classes + 1] containing class
          predictions (logits) for each of the anchors, where
          total_num_proposals=batch_size*self._max_num_proposals.
          Note that this tensor *includes* background class predictions
          (at class index 0).
        3) num_proposals: An int32 tensor of shape [batch_size] representing
          the number of proposals generated by the RPN. `num_proposals`
          allows us to keep track of which entries are to be treated as zero
          paddings and which are not since we always pad the number of
          proposals to be `self.max_num_proposals` for each image.
        4) proposal_boxes: A float32 tensor of shape
          [batch_size, self.max_num_proposals, 4] representing
          decoded proposal bounding boxes (in absolute coordinates).
        5) proposal_boxes_normalized: A float32 tensor of shape
          [batch_size, self.max_num_proposals, 4] representing decoded
          proposal bounding boxes (in normalized coordinates). Can be used to
          override the boxes proposed by the RPN, thus enabling one to extract
          box classification and prediction for externally selected areas of
          the image.
        6) box_classifier_features: a 4-D float32 tensor of shape
          [batch_size, feature_map_height, feature_map_width, depth]
          representing the box classifier features.
    """
    image_shape_2d = tf.tile(tf.expand_dims(image_shape[1:], 0),
                             [image_shape[0], 1])
    proposal_boxes_normalized, _, num_proposals, _, _ = self._postprocess_rpn(
        rpn_box_encodings, rpn_objectness_predictions_with_background,
        anchors, image_shape_2d, true_image_shapes)
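    # Unlike Faster R-CNN, R-FCN extracts box classifier features from the
    # full RPN feature map; per-proposal (position-sensitive) cropping happens
    # later, inside the RFCN box predictor.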
    box_classifier_features = (
        self._feature_extractor.extract_box_classifier_features(
            rpn_features,
            scope=self.second_stage_feature_extractor_scope))
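    # The box predictor may be a Keras layer or a legacy slim-style predictor;
    # both consume the shared feature map plus the normalized proposal boxes.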
    if self._rfcn_box_predictor.is_keras_model:
      box_predictions = self._rfcn_box_predictor(
          [box_classifier_features],
          proposal_boxes=proposal_boxes_normalized)
    else:
      box_predictions = self._rfcn_box_predictor.predict(
          [box_classifier_features],
          num_predictions_per_location=[1],
          scope=self.second_stage_box_predictor_scope,
          proposal_boxes=proposal_boxes_normalized)
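    # Each prediction key maps to a list with one entry per feature map;
    # concatenate the list and drop the singleton
    # num_predictions_per_location dimension.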
    refined_box_encodings = tf.squeeze(
        tf.concat(box_predictions[box_predictor.BOX_ENCODINGS], axis=1),
        axis=1)
    class_predictions_with_background = tf.squeeze(
        tf.concat(
            box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND],
            axis=1),
        axis=1)
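    # Convert the normalized proposal boxes back to absolute image
    # coordinates for the output dictionary.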
    absolute_proposal_boxes = ops.normalized_to_image_coordinates(
        proposal_boxes_normalized, image_shape,
        parallel_iterations=self._parallel_iterations)

    prediction_dict = {
        'refined_box_encodings': refined_box_encodings,
        'class_predictions_with_background':
            class_predictions_with_background,
        'num_proposals': num_proposals,
        'proposal_boxes': absolute_proposal_boxes,
        'box_classifier_features': box_classifier_features,
        'proposal_boxes_normalized': proposal_boxes_normalized,
    }
    return prediction_dict