# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Mask Head.

Contains Mask prediction head classes for different meta architectures.
All the mask prediction heads have a predict function that receives the
`features` as the first argument and returns `mask_predictions`.
"""
import math
import tensorflow as tf

from object_detection.predictors.heads import head
from object_detection.utils import ops

slim = tf.contrib.slim


class MaskRCNNMaskHead(head.Head):
  """Mask RCNN mask prediction head.

  Please refer to Mask RCNN paper:
  https://arxiv.org/abs/1703.06870
  """
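
  # Minimal usage sketch (illustrative only, not part of the library API):
  # `conv_hyperparams_fn` is assumed to be any callable that returns a tf-slim
  # arg_scope for the convolution ops, e.g. one produced by the hyperparams
  # builder used elsewhere in the Object Detection API.
  #
  #   mask_head = MaskRCNNMaskHead(
  #       num_classes=90, conv_hyperparams_fn=conv_hyperparams_fn)
  #   mask_predictions = mask_head.predict(
  #       roi_features, num_predictions_per_location=1)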

  def __init__(self,
               num_classes,
               conv_hyperparams_fn=None,
               mask_height=14,
               mask_width=14,
               mask_prediction_num_conv_layers=2,
               mask_prediction_conv_depth=256,
               masks_are_class_agnostic=False,
               convolve_then_upsample=False):
    """Constructor.

    Args:
      num_classes: number of classes. Note that num_classes *does not* include
        the background category, so if groundtruth labels take values in
        {0, 1, ..., K-1}, num_classes=K (and not K+1, even though the assigned
        classification targets can range over {0, ..., K}).
      conv_hyperparams_fn: A function to generate tf-slim arg_scope with
        hyperparameters for convolution ops.
      mask_height: Desired output mask height. The default value is 14.
      mask_width: Desired output mask width. The default value is 14.
      mask_prediction_num_conv_layers: Number of convolution layers applied to
        the image_features in the mask prediction branch.
      mask_prediction_conv_depth: The depth for the first conv2d_transpose op
        applied to the image_features in the mask prediction branch. If set
        to 0, the depth of the convolution layers will be automatically chosen
        based on the number of object classes and the number of channels in
        the image features.
      masks_are_class_agnostic: Boolean determining if the mask-head is
        class-agnostic or not.
      convolve_then_upsample: Whether to apply convolutions on mask features
        before upsampling using nearest neighbor resizing. Otherwise, mask
        features are resized to [`mask_height`, `mask_width`] using bilinear
        resizing before applying convolutions.

    Raises:
      ValueError: if conv_hyperparams_fn is None.
    """
    super(MaskRCNNMaskHead, self).__init__()
    self._num_classes = num_classes
    self._conv_hyperparams_fn = conv_hyperparams_fn
    self._mask_height = mask_height
    self._mask_width = mask_width
    self._mask_prediction_num_conv_layers = mask_prediction_num_conv_layers
    self._mask_prediction_conv_depth = mask_prediction_conv_depth
    self._masks_are_class_agnostic = masks_are_class_agnostic
    self._convolve_then_upsample = convolve_then_upsample
    if conv_hyperparams_fn is None:
      raise ValueError('conv_hyperparams_fn is None.')

  def _get_mask_predictor_conv_depth(self,
                                     num_feature_channels,
                                     num_classes,
                                     class_weight=3.0,
                                     feature_weight=2.0):
    """Computes the depth of the mask predictor convolutions.

    Computes the depth of the mask predictor convolutions given feature
    channels and number of classes by performing a weighted average of the two
    in log space to compute the number of convolution channels. The weights
    used for computing the weighted average do not need to sum to 1.

    Args:
      num_feature_channels: An integer containing the number of feature
        channels.
      num_classes: An integer containing the number of classes.
      class_weight: Class weight used in computing the weighted average.
      feature_weight: Feature weight used in computing the weighted average.

    Returns:
      An integer containing the number of convolution channels used by the
      mask predictor.
    """
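    # Worked example of the arithmetic below (illustrative numbers only,
    # assuming 1024 feature channels and 90 classes):
    #   log2(1024) = 10.0 and log2(90) ~= 6.49
    #   weighted average = (2.0 * 10.0 + 3.0 * 6.49) / (2.0 + 3.0) ~= 7.9
    #   round(7.9) = 8, so the predictor would use 2**8 = 256 channels.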
    num_feature_channels_log = math.log(float(num_feature_channels), 2.0)
    num_classes_log = math.log(float(num_classes), 2.0)
    weighted_num_feature_channels_log = (
        num_feature_channels_log * feature_weight)
    weighted_num_classes_log = num_classes_log * class_weight
    total_weight = feature_weight + class_weight
    num_conv_channels_log = round(
        (weighted_num_feature_channels_log + weighted_num_classes_log) /
        total_weight)
    return int(math.pow(2.0, num_conv_channels_log))

  def predict(self, features, num_predictions_per_location=1):
    """Performs mask prediction.

    Args:
      features: A float tensor of shape [batch_size, height, width, channels]
        containing features for a batch of images.
      num_predictions_per_location: Int containing number of predictions per
        location.

    Returns:
      instance_masks: A float tensor of shape
        [batch_size, 1, num_classes, mask_height, mask_width].

    Raises:
      ValueError: If num_predictions_per_location is not 1.
    """
    if num_predictions_per_location != 1:
      raise ValueError('Only num_predictions_per_location=1 is supported')
    num_conv_channels = self._mask_prediction_conv_depth
    if num_conv_channels == 0:
      num_feature_channels = features.get_shape().as_list()[3]
      num_conv_channels = self._get_mask_predictor_conv_depth(
          num_feature_channels, self._num_classes)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      if not self._convolve_then_upsample:
        features = tf.image.resize_bilinear(
            features, [self._mask_height, self._mask_width],
            align_corners=True)
      for _ in range(self._mask_prediction_num_conv_layers - 1):
        features = slim.conv2d(
            features,
            num_outputs=num_conv_channels,
            kernel_size=[3, 3])
      if self._convolve_then_upsample:
        # Replace transposed convolution with a nearest neighbor upsampling
        # step followed by a 3x3 convolution.
        height_scale = self._mask_height / features.shape[1].value
        width_scale = self._mask_width / features.shape[2].value
        features = ops.nearest_neighbor_upsampling(
            features, height_scale=height_scale, width_scale=width_scale)
        features = slim.conv2d(
            features,
            num_outputs=num_conv_channels,
            kernel_size=[3, 3])

      num_masks = 1 if self._masks_are_class_agnostic else self._num_classes
      mask_predictions = slim.conv2d(
          features,
          num_outputs=num_masks,
          activation_fn=None,
          normalizer_fn=None,
          kernel_size=[3, 3])
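      # Transpose from [batch_size, mask_height, mask_width, num_masks] to
      # [batch_size, num_masks, mask_height, mask_width], then insert a
      # singleton axis so the result matches the documented
      # [batch_size, 1, num_masks, mask_height, mask_width] layout.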
      return tf.expand_dims(
          tf.transpose(mask_predictions, perm=[0, 3, 1, 2]),
          axis=1,
          name='MaskPredictor')


class ConvolutionalMaskHead(head.Head):
  """Convolutional mask prediction head."""

  def __init__(self,
               is_training,
               num_classes,
               use_dropout,
               dropout_keep_prob,
               kernel_size,
               use_depthwise=False,
               mask_height=7,
               mask_width=7,
               masks_are_class_agnostic=False):
    """Constructor.

    Args:
      is_training: Indicates whether the BoxPredictor is in training mode.
      num_classes: Number of classes.
      use_dropout: Option to use dropout or not. Note that dropout is applied
        to the image features before the mask prediction convolutions.
      dropout_keep_prob: Keep probability for dropout.
        This is only used if use_dropout is True.
      kernel_size: Size of final convolution kernel. If the spatial resolution
        of the feature map is smaller than the kernel size, then the kernel
        size is automatically set to be min(feature_width, feature_height).
      use_depthwise: Whether to use depthwise convolutions for prediction
        steps. Default is False.
      mask_height: Desired output mask height. The default value is 7.
      mask_width: Desired output mask width. The default value is 7.
      masks_are_class_agnostic: Boolean determining if the mask-head is
        class-agnostic or not.
    """
    super(ConvolutionalMaskHead, self).__init__()
    self._is_training = is_training
    self._num_classes = num_classes
    self._use_dropout = use_dropout
    self._dropout_keep_prob = dropout_keep_prob
    self._kernel_size = kernel_size
    self._use_depthwise = use_depthwise
    self._mask_height = mask_height
    self._mask_width = mask_width
    self._masks_are_class_agnostic = masks_are_class_agnostic

  def predict(self, features, num_predictions_per_location):
    """Predicts instance masks.

    Args:
      features: A float tensor of shape [batch_size, height, width, channels]
        containing image features.
      num_predictions_per_location: Number of mask predictions to be made per
        spatial location.

    Returns:
      mask_predictions: A float tensor of shape
        [batch_size, num_anchors, num_masks, mask_height, mask_width]
        representing the mask predictions for the proposals.
    """
    image_feature = features
    if self._masks_are_class_agnostic:
      num_masks = 1
    else:
      num_masks = self._num_classes
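    # For illustration only (hypothetical numbers): with 90 classes and 7x7
    # masks, each spatial location predicts 90 * 7 * 7 = 4410 output channels
    # per anchor.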
    num_mask_channels = num_masks * self._mask_height * self._mask_width
    net = image_feature
    if self._use_dropout:
      net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
    if self._use_depthwise:
      mask_predictions = slim.separable_conv2d(
          net, None, [self._kernel_size, self._kernel_size],
          padding='SAME', depth_multiplier=1, stride=1,
          rate=1, scope='MaskPredictor_depthwise')
      mask_predictions = slim.conv2d(
          mask_predictions,
          num_predictions_per_location * num_mask_channels,
          [1, 1],
          activation_fn=None,
          normalizer_fn=None,
          normalizer_params=None,
          scope='MaskPredictor')
    else:
      mask_predictions = slim.conv2d(
          net,
          num_predictions_per_location * num_mask_channels,
          [self._kernel_size, self._kernel_size],
          activation_fn=None,
          normalizer_fn=None,
          normalizer_params=None,
          scope='MaskPredictor')
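    # The static batch size can be None when the graph is built with a
    # dynamic batch dimension; fall back to the runtime shape in that case.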
    batch_size = features.get_shape().as_list()[0]
    if batch_size is None:
      batch_size = tf.shape(features)[0]
    mask_predictions = tf.reshape(
        mask_predictions,
        [batch_size, -1, num_masks, self._mask_height, self._mask_width])
    return mask_predictions


# TODO(alirezafathi): See if possible to unify Weight Shared with regular
# convolutional mask head.
class WeightSharedConvolutionalMaskHead(head.Head):
  """Weight shared convolutional mask prediction head."""

  def __init__(self,
               num_classes,
               kernel_size=3,
               use_dropout=False,
               dropout_keep_prob=0.8,
               mask_height=7,
               mask_width=7,
               masks_are_class_agnostic=False):
    """Constructor.

    Args:
      num_classes: number of classes. Note that num_classes *does not* include
        the background category, so if groundtruth labels take values in
        {0, 1, ..., K-1}, num_classes=K (and not K+1, even though the assigned
        classification targets can range over {0, ..., K}).
      kernel_size: Size of final convolution kernel.
      use_dropout: Whether to apply dropout to the mask prediction head.
      dropout_keep_prob: Probability of keeping activations.
      mask_height: Desired output mask height. The default value is 7.
      mask_width: Desired output mask width. The default value is 7.
      masks_are_class_agnostic: Boolean determining if the mask-head is
        class-agnostic or not.
    """
    super(WeightSharedConvolutionalMaskHead, self).__init__()
    self._num_classes = num_classes
    self._kernel_size = kernel_size
    self._use_dropout = use_dropout
    self._dropout_keep_prob = dropout_keep_prob
    self._mask_height = mask_height
    self._mask_width = mask_width
    self._masks_are_class_agnostic = masks_are_class_agnostic

  def predict(self, features, num_predictions_per_location):
    """Predicts instance masks.

    Args:
      features: A float tensor of shape [batch_size, height, width, channels]
        containing image features.
      num_predictions_per_location: Number of mask predictions to be made per
        spatial location.

    Returns:
      mask_predictions: A tensor of shape
        [batch_size, num_anchors, num_classes, mask_height, mask_width]
        representing the mask predictions for the proposals.
    """
    mask_predictions_net = features
    if self._masks_are_class_agnostic:
      num_masks = 1
    else:
      num_masks = self._num_classes
    num_mask_channels = num_masks * self._mask_height * self._mask_width
    if self._use_dropout:
      mask_predictions_net = slim.dropout(
          mask_predictions_net, keep_prob=self._dropout_keep_prob)
    mask_predictions = slim.conv2d(
        mask_predictions_net,
        num_predictions_per_location * num_mask_channels,
        [self._kernel_size, self._kernel_size],
        activation_fn=None, stride=1, padding='SAME',
        normalizer_fn=None,
        scope='MaskPredictor')
    batch_size = features.get_shape().as_list()[0]
    if batch_size is None:
      batch_size = tf.shape(features)[0]
    mask_predictions = tf.reshape(
        mask_predictions,
        [batch_size, -1, num_masks, self._mask_height, self._mask_width])
    return mask_predictions