You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

355 lines
14 KiB

6 years ago
  1. # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ==============================================================================
  15. """Mask Head.
  16. Contains Mask prediction head classes for different meta architectures.
  17. All the mask prediction heads have a predict function that receives the
  18. `features` as the first argument and returns `mask_predictions`.
  19. """
  20. import math
  21. import tensorflow as tf
  22. from object_detection.predictors.heads import head
  23. from object_detection.utils import ops
  24. slim = tf.contrib.slim
  25. class MaskRCNNMaskHead(head.Head):
  26. """Mask RCNN mask prediction head.
  27. Please refer to Mask RCNN paper:
  28. https://arxiv.org/abs/1703.06870
  29. """
  30. def __init__(self,
  31. num_classes,
  32. conv_hyperparams_fn=None,
  33. mask_height=14,
  34. mask_width=14,
  35. mask_prediction_num_conv_layers=2,
  36. mask_prediction_conv_depth=256,
  37. masks_are_class_agnostic=False,
  38. convolve_then_upsample=False):
  39. """Constructor.
  40. Args:
  41. num_classes: number of classes. Note that num_classes *does not*
  42. include the background category, so if groundtruth labels take values
  43. in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the
  44. assigned classification targets can range from {0,... K}).
  45. conv_hyperparams_fn: A function to generate tf-slim arg_scope with
  46. hyperparameters for convolution ops.
  47. mask_height: Desired output mask height. The default value is 14.
  48. mask_width: Desired output mask width. The default value is 14.
  49. mask_prediction_num_conv_layers: Number of convolution layers applied to
  50. the image_features in mask prediction branch.
  51. mask_prediction_conv_depth: The depth for the first conv2d_transpose op
  52. applied to the image_features in the mask prediction branch. If set
  53. to 0, the depth of the convolution layers will be automatically chosen
  54. based on the number of object classes and the number of channels in the
  55. image features.
  56. masks_are_class_agnostic: Boolean determining if the mask-head is
  57. class-agnostic or not.
  58. convolve_then_upsample: Whether to apply convolutions on mask features
  59. before upsampling using nearest neighbor resizing. Otherwise, mask
  60. features are resized to [`mask_height`, `mask_width`] using bilinear
  61. resizing before applying convolutions.
  62. Raises:
  63. ValueError: conv_hyperparams_fn is None.
  64. """
  65. super(MaskRCNNMaskHead, self).__init__()
  66. self._num_classes = num_classes
  67. self._conv_hyperparams_fn = conv_hyperparams_fn
  68. self._mask_height = mask_height
  69. self._mask_width = mask_width
  70. self._mask_prediction_num_conv_layers = mask_prediction_num_conv_layers
  71. self._mask_prediction_conv_depth = mask_prediction_conv_depth
  72. self._masks_are_class_agnostic = masks_are_class_agnostic
  73. self._convolve_then_upsample = convolve_then_upsample
  74. if conv_hyperparams_fn is None:
  75. raise ValueError('conv_hyperparams_fn is None.')
  76. def _get_mask_predictor_conv_depth(self,
  77. num_feature_channels,
  78. num_classes,
  79. class_weight=3.0,
  80. feature_weight=2.0):
  81. """Computes the depth of the mask predictor convolutions.
  82. Computes the depth of the mask predictor convolutions given feature channels
  83. and number of classes by performing a weighted average of the two in
  84. log space to compute the number of convolution channels. The weights that
  85. are used for computing the weighted average do not need to sum to 1.
  86. Args:
  87. num_feature_channels: An integer containing the number of feature
  88. channels.
  89. num_classes: An integer containing the number of classes.
  90. class_weight: Class weight used in computing the weighted average.
  91. feature_weight: Feature weight used in computing the weighted average.
  92. Returns:
  93. An integer containing the number of convolution channels used by mask
  94. predictor.
  95. """
  96. num_feature_channels_log = math.log(float(num_feature_channels), 2.0)
  97. num_classes_log = math.log(float(num_classes), 2.0)
  98. weighted_num_feature_channels_log = (
  99. num_feature_channels_log * feature_weight)
  100. weighted_num_classes_log = num_classes_log * class_weight
  101. total_weight = feature_weight + class_weight
  102. num_conv_channels_log = round(
  103. (weighted_num_feature_channels_log + weighted_num_classes_log) /
  104. total_weight)
  105. return int(math.pow(2.0, num_conv_channels_log))
  106. def predict(self, features, num_predictions_per_location=1):
  107. """Performs mask prediction.
  108. Args:
  109. features: A float tensor of shape [batch_size, height, width, channels]
  110. containing features for a batch of images.
  111. num_predictions_per_location: Int containing number of predictions per
  112. location.
  113. Returns:
  114. instance_masks: A float tensor of shape
  115. [batch_size, 1, num_classes, mask_height, mask_width].
  116. Raises:
  117. ValueError: If num_predictions_per_location is not 1.
  118. """
  119. if num_predictions_per_location != 1:
  120. raise ValueError('Only num_predictions_per_location=1 is supported')
  121. num_conv_channels = self._mask_prediction_conv_depth
  122. if num_conv_channels == 0:
  123. num_feature_channels = features.get_shape().as_list()[3]
  124. num_conv_channels = self._get_mask_predictor_conv_depth(
  125. num_feature_channels, self._num_classes)
  126. with slim.arg_scope(self._conv_hyperparams_fn()):
  127. if not self._convolve_then_upsample:
  128. features = tf.image.resize_bilinear(
  129. features, [self._mask_height, self._mask_width],
  130. align_corners=True)
  131. for _ in range(self._mask_prediction_num_conv_layers - 1):
  132. features = slim.conv2d(
  133. features,
  134. num_outputs=num_conv_channels,
  135. kernel_size=[3, 3])
  136. if self._convolve_then_upsample:
  137. # Replace Transposed Convolution with a Nearest Neighbor upsampling step
  138. # followed by 3x3 convolution.
  139. height_scale = self._mask_height / features.shape[1].value
  140. width_scale = self._mask_width / features.shape[2].value
  141. features = ops.nearest_neighbor_upsampling(
  142. features, height_scale=height_scale, width_scale=width_scale)
  143. features = slim.conv2d(
  144. features,
  145. num_outputs=num_conv_channels,
  146. kernel_size=[3, 3])
  147. num_masks = 1 if self._masks_are_class_agnostic else self._num_classes
  148. mask_predictions = slim.conv2d(
  149. features,
  150. num_outputs=num_masks,
  151. activation_fn=None,
  152. normalizer_fn=None,
  153. kernel_size=[3, 3])
  154. return tf.expand_dims(
  155. tf.transpose(mask_predictions, perm=[0, 3, 1, 2]),
  156. axis=1,
  157. name='MaskPredictor')
  158. class ConvolutionalMaskHead(head.Head):
  159. """Convolutional class prediction head."""
  160. def __init__(self,
  161. is_training,
  162. num_classes,
  163. use_dropout,
  164. dropout_keep_prob,
  165. kernel_size,
  166. use_depthwise=False,
  167. mask_height=7,
  168. mask_width=7,
  169. masks_are_class_agnostic=False):
  170. """Constructor.
  171. Args:
  172. is_training: Indicates whether the BoxPredictor is in training mode.
  173. num_classes: Number of classes.
  174. use_dropout: Option to use dropout or not. Note that a single dropout
  175. op is applied here prior to both box and class predictions, which stands
  176. in contrast to the ConvolutionalBoxPredictor below.
  177. dropout_keep_prob: Keep probability for dropout.
  178. This is only used if use_dropout is True.
  179. kernel_size: Size of final convolution kernel. If the
  180. spatial resolution of the feature map is smaller than the kernel size,
  181. then the kernel size is automatically set to be
  182. min(feature_width, feature_height).
  183. use_depthwise: Whether to use depthwise convolutions for prediction
  184. steps. Default is False.
  185. mask_height: Desired output mask height. The default value is 7.
  186. mask_width: Desired output mask width. The default value is 7.
  187. masks_are_class_agnostic: Boolean determining if the mask-head is
  188. class-agnostic or not.
  189. Raises:
  190. ValueError: if min_depth > max_depth.
  191. """
  192. super(ConvolutionalMaskHead, self).__init__()
  193. self._is_training = is_training
  194. self._num_classes = num_classes
  195. self._use_dropout = use_dropout
  196. self._dropout_keep_prob = dropout_keep_prob
  197. self._kernel_size = kernel_size
  198. self._use_depthwise = use_depthwise
  199. self._mask_height = mask_height
  200. self._mask_width = mask_width
  201. self._masks_are_class_agnostic = masks_are_class_agnostic
  202. def predict(self, features, num_predictions_per_location):
  203. """Predicts boxes.
  204. Args:
  205. features: A float tensor of shape [batch_size, height, width, channels]
  206. containing image features.
  207. num_predictions_per_location: Number of box predictions to be made per
  208. spatial location.
  209. Returns:
  210. mask_predictions: A float tensors of shape
  211. [batch_size, num_anchors, num_masks, mask_height, mask_width]
  212. representing the mask predictions for the proposals.
  213. """
  214. image_feature = features
  215. # Add a slot for the background class.
  216. if self._masks_are_class_agnostic:
  217. num_masks = 1
  218. else:
  219. num_masks = self._num_classes
  220. num_mask_channels = num_masks * self._mask_height * self._mask_width
  221. net = image_feature
  222. if self._use_dropout:
  223. net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
  224. if self._use_depthwise:
  225. mask_predictions = slim.separable_conv2d(
  226. net, None, [self._kernel_size, self._kernel_size],
  227. padding='SAME', depth_multiplier=1, stride=1,
  228. rate=1, scope='MaskPredictor_depthwise')
  229. mask_predictions = slim.conv2d(
  230. mask_predictions,
  231. num_predictions_per_location * num_mask_channels,
  232. [1, 1],
  233. activation_fn=None,
  234. normalizer_fn=None,
  235. normalizer_params=None,
  236. scope='MaskPredictor')
  237. else:
  238. mask_predictions = slim.conv2d(
  239. net,
  240. num_predictions_per_location * num_mask_channels,
  241. [self._kernel_size, self._kernel_size],
  242. activation_fn=None,
  243. normalizer_fn=None,
  244. normalizer_params=None,
  245. scope='MaskPredictor')
  246. batch_size = features.get_shape().as_list()[0]
  247. if batch_size is None:
  248. batch_size = tf.shape(features)[0]
  249. mask_predictions = tf.reshape(
  250. mask_predictions,
  251. [batch_size, -1, num_masks, self._mask_height, self._mask_width])
  252. return mask_predictions
  253. # TODO(alirezafathi): See if possible to unify Weight Shared with regular
  254. # convolutional mask head.
  255. class WeightSharedConvolutionalMaskHead(head.Head):
  256. """Weight shared convolutional mask prediction head."""
  257. def __init__(self,
  258. num_classes,
  259. kernel_size=3,
  260. use_dropout=False,
  261. dropout_keep_prob=0.8,
  262. mask_height=7,
  263. mask_width=7,
  264. masks_are_class_agnostic=False):
  265. """Constructor.
  266. Args:
  267. num_classes: number of classes. Note that num_classes *does not*
  268. include the background category, so if groundtruth labels take values
  269. in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the
  270. assigned classification targets can range from {0,... K}).
  271. kernel_size: Size of final convolution kernel.
  272. use_dropout: Whether to apply dropout to class prediction head.
  273. dropout_keep_prob: Probability of keeping activiations.
  274. mask_height: Desired output mask height. The default value is 7.
  275. mask_width: Desired output mask width. The default value is 7.
  276. masks_are_class_agnostic: Boolean determining if the mask-head is
  277. class-agnostic or not.
  278. """
  279. super(WeightSharedConvolutionalMaskHead, self).__init__()
  280. self._num_classes = num_classes
  281. self._kernel_size = kernel_size
  282. self._use_dropout = use_dropout
  283. self._dropout_keep_prob = dropout_keep_prob
  284. self._mask_height = mask_height
  285. self._mask_width = mask_width
  286. self._masks_are_class_agnostic = masks_are_class_agnostic
  287. def predict(self, features, num_predictions_per_location):
  288. """Predicts boxes.
  289. Args:
  290. features: A float tensor of shape [batch_size, height, width, channels]
  291. containing image features.
  292. num_predictions_per_location: Number of box predictions to be made per
  293. spatial location.
  294. Returns:
  295. mask_predictions: A tensor of shape
  296. [batch_size, num_anchors, num_classes, mask_height, mask_width]
  297. representing the mask predictions for the proposals.
  298. """
  299. mask_predictions_net = features
  300. if self._masks_are_class_agnostic:
  301. num_masks = 1
  302. else:
  303. num_masks = self._num_classes
  304. num_mask_channels = num_masks * self._mask_height * self._mask_width
  305. if self._use_dropout:
  306. mask_predictions_net = slim.dropout(
  307. mask_predictions_net, keep_prob=self._dropout_keep_prob)
  308. mask_predictions = slim.conv2d(
  309. mask_predictions_net,
  310. num_predictions_per_location * num_mask_channels,
  311. [self._kernel_size, self._kernel_size],
  312. activation_fn=None, stride=1, padding='SAME',
  313. normalizer_fn=None,
  314. scope='MaskPredictor')
  315. batch_size = features.get_shape().as_list()[0]
  316. if batch_size is None:
  317. batch_size = tf.shape(features)[0]
  318. mask_predictions = tf.reshape(
  319. mask_predictions,
  320. [batch_size, -1, num_masks, self._mask_height, self._mask_width])
  321. return mask_predictions