- # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- # ==============================================================================
- """SSD Meta-architecture definition.
-
- General TensorFlow implementation of convolutional Multibox/SSD detection
- models.
- """
- import abc
- import tensorflow as tf
-
- from object_detection.core import box_list
- from object_detection.core import box_list_ops
- from object_detection.core import model
- from object_detection.core import standard_fields as fields
- from object_detection.core import target_assigner
- from object_detection.utils import ops
- from object_detection.utils import shape_utils
- from object_detection.utils import visualization_utils
-
- slim = tf.contrib.slim
-
-
- class SSDFeatureExtractor(object):
- """SSD Slim Feature Extractor definition."""
-
- def __init__(self,
- is_training,
- depth_multiplier,
- min_depth,
- pad_to_multiple,
- conv_hyperparams_fn,
- reuse_weights=None,
- use_explicit_padding=False,
- use_depthwise=False,
- override_base_feature_extractor_hyperparams=False):
- """Constructor.
-
- Args:
- is_training: whether the network is in training mode.
- depth_multiplier: float depth multiplier for feature extractor.
- min_depth: minimum feature extractor depth.
- pad_to_multiple: the nearest multiple to zero pad the input height and
- width dimensions to.
- conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d
- and separable_conv2d ops in the layers that are added on top of the
- base feature extractor.
- reuse_weights: whether to reuse variables. Default is None.
- use_explicit_padding: Whether to use explicit padding when extracting
- features. Default is False.
- use_depthwise: Whether to use depthwise convolutions. Default is False.
- override_base_feature_extractor_hyperparams: Whether to override
- hyperparameters of the base feature extractor with the ones from
- `conv_hyperparams_fn`.
- """
- self._is_training = is_training
- self._depth_multiplier = depth_multiplier
- self._min_depth = min_depth
- self._pad_to_multiple = pad_to_multiple
- self._conv_hyperparams_fn = conv_hyperparams_fn
- self._reuse_weights = reuse_weights
- self._use_explicit_padding = use_explicit_padding
- self._use_depthwise = use_depthwise
- self._override_base_feature_extractor_hyperparams = (
- override_base_feature_extractor_hyperparams)
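-
- # Illustration (not part of the original code): with pad_to_multiple=32, a
- # subclass would zero-pad a 300x300 input up to 320x320 before running the
- # base network; ops.pad_to_multiple (from the utils imported above) is the
- # helper typically used for this.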
-
- @property
- def is_keras_model(self):
- return False
-
- @abc.abstractmethod
- def preprocess(self, resized_inputs):
- """Preprocesses images for feature extraction (minus image resizing).
-
- Args:
- resized_inputs: a [batch, height, width, channels] float tensor
- representing a batch of images.
-
- Returns:
- preprocessed_inputs: a [batch, height, width, channels] float tensor
- representing a batch of images.
- true_image_shapes: int32 tensor of shape [batch, 3] where each row is
- of the form [height, width, channels] indicating the shapes
- of true images in the resized images, as resized images can be padded
- with zeros.
- """
- raise NotImplementedError
-
- @abc.abstractmethod
- def extract_features(self, preprocessed_inputs):
- """Extracts features from preprocessed inputs.
-
- This function is responsible for extracting feature maps from preprocessed
- images.
-
- Args:
- preprocessed_inputs: a [batch, height, width, channels] float tensor
- representing a batch of images.
-
- Returns:
- feature_maps: a list of tensors where the ith tensor has shape
- [batch, height_i, width_i, depth_i]
- """
- raise NotImplementedError
-
- def restore_from_classification_checkpoint_fn(self, feature_extractor_scope):
- """Returns a map of variables to load from a foreign checkpoint.
-
- Args:
- feature_extractor_scope: A scope name for the feature extractor.
-
- Returns:
- A dict mapping variable names (to load from a checkpoint) to variables in
- the model graph.
- """
- variables_to_restore = {}
- for variable in tf.global_variables():
- var_name = variable.op.name
- if var_name.startswith(feature_extractor_scope + '/'):
- var_name = var_name.replace(feature_extractor_scope + '/', '')
- variables_to_restore[var_name] = variable
-
- return variables_to_restore
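-
- # A minimal sketch of the mapping this produces (names are illustrative):
- # a graph variable 'FeatureExtractor/MobilenetV1/Conv2d_0/weights' would be
- # restored from the checkpoint entry 'MobilenetV1/Conv2d_0/weights'.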
-
-
- class SSDKerasFeatureExtractor(tf.keras.Model):
- """SSD Feature Extractor definition."""
-
- def __init__(self,
- is_training,
- depth_multiplier,
- min_depth,
- pad_to_multiple,
- conv_hyperparams,
- freeze_batchnorm,
- inplace_batchnorm_update,
- use_explicit_padding=False,
- use_depthwise=False,
- override_base_feature_extractor_hyperparams=False,
- name=None):
- """Constructor.
-
- Args:
- is_training: whether the network is in training mode.
- depth_multiplier: float depth multiplier for feature extractor.
- min_depth: minimum feature extractor depth.
- pad_to_multiple: the nearest multiple to zero pad the input height and
- width dimensions to.
- conv_hyperparams: `hyperparams_builder.KerasLayerHyperparams` object
- containing convolution hyperparameters for the layers added on top of
- the base feature extractor.
- freeze_batchnorm: Whether to freeze batch norm parameters during
- training or not. When training with a small batch size (e.g. 1), it is
- desirable to freeze batch norm update and use pretrained batch norm
- params.
- inplace_batchnorm_update: Whether to update batch norm moving average
- values inplace. When this is false, the train op must add a control
- dependency on the tf.GraphKeys.UPDATE_OPS collection in order to update
- batch norm statistics.
- use_explicit_padding: Whether to use explicit padding when extracting
- features. Default is False.
- use_depthwise: Whether to use depthwise convolutions. Default is False.
- override_base_feature_extractor_hyperparams: Whether to override
- hyperparameters of the base feature extractor with the ones from
- `conv_hyperparams`.
- name: A string name scope to assign to the model. If `None`, Keras
- will auto-generate one from the class name.
- """
- super(SSDKerasFeatureExtractor, self).__init__(name=name)
-
- self._is_training = is_training
- self._depth_multiplier = depth_multiplier
- self._min_depth = min_depth
- self._pad_to_multiple = pad_to_multiple
- self._conv_hyperparams = conv_hyperparams
- self._freeze_batchnorm = freeze_batchnorm
- self._inplace_batchnorm_update = inplace_batchnorm_update
- self._use_explicit_padding = use_explicit_padding
- self._use_depthwise = use_depthwise
- self._override_base_feature_extractor_hyperparams = (
- override_base_feature_extractor_hyperparams)
-
- @property
- def is_keras_model(self):
- return True
-
- @abc.abstractmethod
- def preprocess(self, resized_inputs):
- """Preprocesses images for feature extraction (minus image resizing).
-
- Args:
- resized_inputs: a [batch, height, width, channels] float tensor
- representing a batch of images.
-
- Returns:
- preprocessed_inputs: a [batch, height, width, channels] float tensor
- representing a batch of images.
- true_image_shapes: int32 tensor of shape [batch, 3] where each row is
- of the form [height, width, channels] indicating the shapes
- of true images in the resized images, as resized images can be padded
- with zeros.
- """
- raise NotImplementedError
-
- @abc.abstractmethod
- def _extract_features(self, preprocessed_inputs):
- """Extracts features from preprocessed inputs.
-
- This function is responsible for extracting feature maps from preprocessed
- images.
-
- Args:
- preprocessed_inputs: a [batch, height, width, channels] float tensor
- representing a batch of images.
-
- Returns:
- feature_maps: a list of tensors where the ith tensor has shape
- [batch, height_i, width_i, depth_i]
- """
- raise NotImplementedError
-
- # This overrides the keras.Model `call` method with the _extract_features
- # method.
- def call(self, inputs, **kwargs):
- return self._extract_features(inputs)
-
- def restore_from_classification_checkpoint_fn(self, feature_extractor_scope):
- """Returns a map of variables to load from a foreign checkpoint.
-
- Args:
- feature_extractor_scope: A scope name for the feature extractor.
-
- Returns:
- A dict mapping variable names (to load from a checkpoint) to variables in
- the model graph.
- """
- variables_to_restore = {}
- for variable in tf.global_variables():
- var_name = variable.op.name
- if var_name.startswith(feature_extractor_scope + '/'):
- var_name = var_name.replace(feature_extractor_scope + '/', '')
- variables_to_restore[var_name] = variable
-
- return variables_to_restore
-
-
- class SSDMetaArch(model.DetectionModel):
- """SSD Meta-architecture definition."""
-
- def __init__(self,
- is_training,
- anchor_generator,
- box_predictor,
- box_coder,
- feature_extractor,
- encode_background_as_zeros,
- image_resizer_fn,
- non_max_suppression_fn,
- score_conversion_fn,
- classification_loss,
- localization_loss,
- classification_loss_weight,
- localization_loss_weight,
- normalize_loss_by_num_matches,
- hard_example_miner,
- target_assigner_instance,
- add_summaries=True,
- normalize_loc_loss_by_codesize=False,
- freeze_batchnorm=False,
- inplace_batchnorm_update=False,
- add_background_class=True,
- explicit_background_class=False,
- random_example_sampler=None,
- expected_loss_weights_fn=None,
- use_confidences_as_targets=False,
- implicit_example_weight=0.5,
- equalization_loss_config=None):
- """SSDMetaArch Constructor.
-
- TODO(rathodv,jonathanhuang): group NMS parameters + score converter into
- a class and loss parameters into a class and write config protos for
- postprocessing and losses.
-
- Args:
- is_training: A boolean indicating whether the training version of the
- computation graph should be constructed.
- anchor_generator: an anchor_generator.AnchorGenerator object.
- box_predictor: a box_predictor.BoxPredictor object.
- box_coder: a box_coder.BoxCoder object.
- feature_extractor: an SSDFeatureExtractor object.
- encode_background_as_zeros: boolean determining whether background
- targets are to be encoded as an all zeros vector or a one-hot
- vector (where background is the 0th class).
- image_resizer_fn: a callable for image resizing. This callable always
- takes a rank-3 image tensor (corresponding to a single image) and
- returns a rank-3 image tensor, possibly with new spatial dimensions, and
- a 1-D tensor of shape [3] indicating the shape of the true image within
- the resized image tensor, as the resized image tensor could be padded.
- See builders/image_resizer_builder.py.
- non_max_suppression_fn: batch_multiclass_non_max_suppression
- callable that takes `boxes`, `scores` and optional `clip_window`
- inputs (with all other inputs already set) and returns a dictionary
- holding tensors with keys: `detection_boxes`, `detection_scores`,
- `detection_classes` and `num_detections`. See `post_processing.
- batch_multiclass_non_max_suppression` for the type and shape of these
- tensors.
- score_conversion_fn: callable elementwise nonlinearity (that takes tensors
- as inputs and returns tensors). This is usually used to convert logits
- to probabilities.
- classification_loss: an object_detection.core.losses.Loss object.
- localization_loss: an object_detection.core.losses.Loss object.
- classification_loss_weight: float scalar weight for the classification
- loss.
- localization_loss_weight: float scalar weight for the localization loss.
- normalize_loss_by_num_matches: boolean indicating whether to normalize
- both losses by the number of matched anchors.
- hard_example_miner: a losses.HardExampleMiner object (can be None)
- target_assigner_instance: target_assigner.TargetAssigner instance to use.
- add_summaries: boolean (default: True) controlling whether summary ops
- should be added to tensorflow graph.
- normalize_loc_loss_by_codesize: whether to normalize localization loss
- by code size of the box encoder.
- freeze_batchnorm: Whether to freeze batch norm parameters during
- training or not. When training with a small batch size (e.g. 1), it is
- desirable to freeze batch norm update and use pretrained batch norm
- params.
- inplace_batchnorm_update: Whether to update batch norm moving average
- values inplace. When this is false, the train op must add a control
- dependency on the tf.GraphKeys.UPDATE_OPS collection in order to update
- batch norm statistics.
- add_background_class: Whether to add an implicit background class to
- one-hot encodings of groundtruth labels. Set to false if training a
- single class model or using groundtruth labels with an explicit
- background class.
- explicit_background_class: Set to true if using groundtruth labels with an
- explicit background class, as in multiclass scores.
- random_example_sampler: a BalancedPositiveNegativeSampler object that can
- perform random example sampling when computing loss. If None, random
- sampling process is skipped. Note that the random example sampler and the
- hard example miner can both be applied to the model. In that case, the
- random sampler takes effect first and the hard example miner only
- processes the randomly sampled examples.
- expected_loss_weights_fn: If not None, a function used to calculate
- loss by background/foreground weighting. Should take batch_cls_targets
- as inputs and return foreground_weights, background_weights. See
- expected_classification_loss_by_expected_sampling and
- expected_classification_loss_by_reweighting_unmatched_anchors in
- third_party/tensorflow_models/object_detection/utils/ops.py as examples.
- use_confidences_as_targets: Whether to use groundtruth_confidences field
- to assign the targets.
- implicit_example_weight: a float number that specifies the weight used
- for the implicit negative examples.
- equalization_loss_config: a namedtuple that specifies configs for
- computing equalization loss.
- """
- super(SSDMetaArch, self).__init__(num_classes=box_predictor.num_classes)
- self._is_training = is_training
- self._freeze_batchnorm = freeze_batchnorm
- self._inplace_batchnorm_update = inplace_batchnorm_update
-
- self._anchor_generator = anchor_generator
- self._box_predictor = box_predictor
-
- self._box_coder = box_coder
- self._feature_extractor = feature_extractor
- self._add_background_class = add_background_class
- self._explicit_background_class = explicit_background_class
-
- if add_background_class and explicit_background_class:
- raise ValueError("Cannot have both 'add_background_class' and"
- " 'explicit_background_class' true.")
-
- # Needed for fine-tuning from classification checkpoints whose
- # variables do not have the feature extractor scope.
- if self._feature_extractor.is_keras_model:
- # Keras feature extractors will have a name they implicitly use to scope.
- # So, all contained variables are prefixed by this name.
- # To load from classification checkpoints, this name must be filtered out.
- self._extract_features_scope = feature_extractor.name
- else:
- # Slim feature extractors get an explicit naming scope
- self._extract_features_scope = 'FeatureExtractor'
-
- if encode_background_as_zeros:
- background_class = [0]
- else:
- background_class = [1]
-
- if self._add_background_class:
- num_foreground_classes = self.num_classes
- else:
- num_foreground_classes = self.num_classes - 1
-
- self._unmatched_class_label = tf.constant(
- background_class + num_foreground_classes * [0], tf.float32)
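- # For example (derived from the code above, assuming add_background_class
- # is True and num_classes=3): this is [0., 0., 0., 0.] when
- # encode_background_as_zeros=True and [1., 0., 0., 0.] otherwise.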
-
- self._target_assigner = target_assigner_instance
-
- self._classification_loss = classification_loss
- self._localization_loss = localization_loss
- self._classification_loss_weight = classification_loss_weight
- self._localization_loss_weight = localization_loss_weight
- self._normalize_loss_by_num_matches = normalize_loss_by_num_matches
- self._normalize_loc_loss_by_codesize = normalize_loc_loss_by_codesize
- self._hard_example_miner = hard_example_miner
- self._random_example_sampler = random_example_sampler
- self._parallel_iterations = 16
-
- self._image_resizer_fn = image_resizer_fn
- self._non_max_suppression_fn = non_max_suppression_fn
- self._score_conversion_fn = score_conversion_fn
-
- self._anchors = None
- self._add_summaries = add_summaries
- self._batched_prediction_tensor_names = []
- self._expected_loss_weights_fn = expected_loss_weights_fn
- self._use_confidences_as_targets = use_confidences_as_targets
- self._implicit_example_weight = implicit_example_weight
-
- self._equalization_loss_config = equalization_loss_config
-
- @property
- def anchors(self):
- if not self._anchors:
- raise RuntimeError('anchors have not been constructed yet!')
- if not isinstance(self._anchors, box_list.BoxList):
- raise RuntimeError('anchors should be a BoxList object, but is not.')
- return self._anchors
-
- @property
- def batched_prediction_tensor_names(self):
- if not self._batched_prediction_tensor_names:
- raise RuntimeError('Must call predict() method to get batched prediction '
- 'tensor names.')
- return self._batched_prediction_tensor_names
-
- def preprocess(self, inputs):
- """Feature-extractor specific preprocessing.
-
- SSD meta architecture uses a default clip_window of [0, 0, 1, 1] during
- post-processing. When the `preprocess` method is called, clip_window gets
- updated based on `true_image_shapes` returned by `image_resizer_fn`.
-
- Args:
- inputs: a [batch, height_in, width_in, channels] float tensor representing
- a batch of images with values between 0 and 255.0.
-
- Returns:
- preprocessed_inputs: a [batch, height_out, width_out, channels] float
- tensor representing a batch of images.
- true_image_shapes: int32 tensor of shape [batch, 3] where each row is
- of the form [height, width, channels] indicating the shapes
- of true images in the resized images, as resized images can be padded
- with zeros.
-
- Raises:
- ValueError: if inputs tensor does not have type tf.float32
- """
- if inputs.dtype is not tf.float32:
- raise ValueError('`preprocess` expects a tf.float32 tensor')
- with tf.name_scope('Preprocessor'):
- # TODO(jonathanhuang): revisit whether to always use batch size as
- # the number of parallel iterations vs allow for dynamic batching.
- outputs = shape_utils.static_or_dynamic_map_fn(
- self._image_resizer_fn,
- elems=inputs,
- dtype=[tf.float32, tf.int32])
- resized_inputs = outputs[0]
- true_image_shapes = outputs[1]
-
- return (self._feature_extractor.preprocess(resized_inputs),
- true_image_shapes)
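-
- # A hedged illustration of the contract above: if image_resizer_fn pads a
- # 300x500 image into a 500x500 canvas, the corresponding row of
- # true_image_shapes would be [300, 500, 3], so downstream post-processing
- # can clip detections to the un-padded region.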
-
- def _compute_clip_window(self, preprocessed_images, true_image_shapes):
- """Computes clip window to use during post_processing.
-
- Computes a new clip window to use during post-processing based on
- `resized_image_shapes` and `true_image_shapes` only if the `preprocess`
- method has been called. Otherwise returns a default clip window of
- [0, 0, 1, 1].
-
- Args:
- preprocessed_images: the [batch, height, width, channels] image
- tensor.
- true_image_shapes: int32 tensor of shape [batch, 3] where each row is
- of the form [height, width, channels] indicating the shapes
- of true images in the resized images, as resized images can be padded
- with zeros. Or None if the clip window should cover the full image.
-
- Returns:
- a 2-D float32 tensor of shape [batch_size, 4] containing the clip
- window for each image in the batch in normalized coordinates (relative to
- the resized dimensions) where each clip window is of the form [ymin, xmin,
- ymax, xmax] or a default clip window of [0, 0, 1, 1].
-
- """
- if true_image_shapes is None:
- return tf.constant([0, 0, 1, 1], dtype=tf.float32)
-
- resized_inputs_shape = shape_utils.combined_static_and_dynamic_shape(
- preprocessed_images)
- true_heights, true_widths, _ = tf.unstack(
- tf.to_float(true_image_shapes), axis=1)
- padded_height = tf.to_float(resized_inputs_shape[1])
- padded_width = tf.to_float(resized_inputs_shape[2])
- return tf.stack(
- [
- tf.zeros_like(true_heights),
- tf.zeros_like(true_widths), true_heights / padded_height,
- true_widths / padded_width
- ],
- axis=1)
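-
- # Worked example (illustrative numbers): a 300x400 true image padded to a
- # 320x448 canvas yields a clip window of
- # [0.0, 0.0, 300/320, 400/448] = [0.0, 0.0, 0.9375, ~0.8929].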
-
- def predict(self, preprocessed_inputs, true_image_shapes):
- """Predicts unpostprocessed tensors from input tensor.
-
- This function takes a batch of input images and runs it through the forward
- pass of the network to yield unpostprocessed predictions.
-
- A side effect of calling the predict method is that self._anchors is
- populated with a box_list.BoxList of anchors. These anchors must be
- constructed before the postprocess or loss functions can be called.
-
- Args:
- preprocessed_inputs: a [batch, height, width, channels] image tensor.
- true_image_shapes: int32 tensor of shape [batch, 3] where each row is
- of the form [height, width, channels] indicating the shapes
- of true images in the resized images, as resized images can be padded
- with zeros.
-
- Returns:
- prediction_dict: a dictionary holding "raw" prediction tensors:
- 1) preprocessed_inputs: the [batch, height, width, channels] image
- tensor.
- 2) box_encodings: 3-D float tensor of shape [batch_size, num_anchors,
- box_code_dimension] containing predicted boxes.
- 3) class_predictions_with_background: 3-D float tensor of shape
- [batch_size, num_anchors, num_classes+1] containing class predictions
- (logits) for each of the anchors. Note that this tensor *includes*
- background class predictions (at class index 0).
- 4) feature_maps: a list of tensors where the ith tensor has shape
- [batch, height_i, width_i, depth_i].
- 5) anchors: 2-D float tensor of shape [num_anchors, 4] containing
- the generated anchors in normalized coordinates.
- """
- if self._inplace_batchnorm_update:
- batchnorm_updates_collections = None
- else:
- batchnorm_updates_collections = tf.GraphKeys.UPDATE_OPS
- if self._feature_extractor.is_keras_model:
- feature_maps = self._feature_extractor(preprocessed_inputs)
- else:
- with slim.arg_scope([slim.batch_norm],
- is_training=(self._is_training and
- not self._freeze_batchnorm),
- updates_collections=batchnorm_updates_collections):
- with tf.variable_scope(None, self._extract_features_scope,
- [preprocessed_inputs]):
- feature_maps = self._feature_extractor.extract_features(
- preprocessed_inputs)
-
- feature_map_spatial_dims = self._get_feature_map_spatial_dims(
- feature_maps)
- image_shape = shape_utils.combined_static_and_dynamic_shape(
- preprocessed_inputs)
- self._anchors = box_list_ops.concatenate(
- self._anchor_generator.generate(
- feature_map_spatial_dims,
- im_height=image_shape[1],
- im_width=image_shape[2]))
- if self._box_predictor.is_keras_model:
- predictor_results_dict = self._box_predictor(feature_maps)
- else:
- with slim.arg_scope([slim.batch_norm],
- is_training=(self._is_training and
- not self._freeze_batchnorm),
- updates_collections=batchnorm_updates_collections):
- predictor_results_dict = self._box_predictor.predict(
- feature_maps, self._anchor_generator.num_anchors_per_location())
- predictions_dict = {
- 'preprocessed_inputs': preprocessed_inputs,
- 'feature_maps': feature_maps,
- 'anchors': self._anchors.get()
- }
- for prediction_key, prediction_list in predictor_results_dict.items():
- prediction = tf.concat(prediction_list, axis=1)
- if (prediction_key == 'box_encodings' and prediction.shape.ndims == 4 and
- prediction.shape[2] == 1):
- prediction = tf.squeeze(prediction, axis=2)
- predictions_dict[prediction_key] = prediction
- self._batched_prediction_tensor_names = [x for x in predictions_dict
- if x != 'anchors']
- return predictions_dict
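-
- # Minimal usage sketch (assuming `detection_model` is a constructed
- # SSDMetaArch and `images` is a float32 [batch, height, width, 3] tensor;
- # variable names are illustrative):
- #   preprocessed, true_shapes = detection_model.preprocess(images)
- #   prediction_dict = detection_model.predict(preprocessed, true_shapes)
- #   detections = detection_model.postprocess(prediction_dict, true_shapes)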
-
- def _get_feature_map_spatial_dims(self, feature_maps):
- """Return list of spatial dimensions for each feature map in a list.
-
- Args:
- feature_maps: a list of tensors where the ith tensor has shape
- [batch, height_i, width_i, depth_i].
-
- Returns:
- a list of pairs (height, width) for each feature map in feature_maps
- """
- feature_map_shapes = [
- shape_utils.combined_static_and_dynamic_shape(
- feature_map) for feature_map in feature_maps
- ]
- return [(shape[1], shape[2]) for shape in feature_map_shapes]
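-
- # For instance, given feature maps shaped [b, 19, 19, 512] and
- # [b, 10, 10, 256], this returns [(19, 19), (10, 10)].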
-
- def postprocess(self, prediction_dict, true_image_shapes):
- """Converts prediction tensors to final detections.
-
- This function converts raw prediction tensors to final detection results by
- slicing off the background class, decoding box predictions and applying
- non max suppression and clipping to the image window.
-
- See base class for output format conventions. Note also that by default,
- scores are to be interpreted as logits, but if a score_conversion_fn is
- used, then scores are remapped (and may thus have a different
- interpretation).
-
- Args:
- prediction_dict: a dictionary holding prediction tensors with
- 1) preprocessed_inputs: a [batch, height, width, channels] image
- tensor.
- 2) box_encodings: 3-D float tensor of shape [batch_size, num_anchors,
- box_code_dimension] containing predicted boxes.
- 3) class_predictions_with_background: 3-D float tensor of shape
- [batch_size, num_anchors, num_classes+1] containing class predictions
- (logits) for each of the anchors. Note that this tensor *includes*
- background class predictions.
- 4) mask_predictions: (optional) a 5-D float tensor of shape
- [batch_size, num_anchors, q, mask_height, mask_width]. `q` can be
- either number of classes or 1 depending on whether a separate mask is
- predicted per class.
- true_image_shapes: int32 tensor of shape [batch, 3] where each row is
- of the form [height, width, channels] indicating the shapes
- of true images in the resized images, as resized images can be padded
- with zeros. Or None, if the clip window should cover the full image.
-
- Returns:
- detections: a dictionary containing the following fields
- detection_boxes: [batch, max_detections, 4] tensor with post-processed
- detection boxes.
- detection_scores: [batch, max_detections] tensor with scalar scores for
- post-processed detection boxes.
- detection_classes: [batch, max_detections] tensor with classes for
- post-processed detection classes.
- detection_keypoints: [batch, max_detections, num_keypoints, 2] (if
- encoded in the prediction_dict 'box_encodings')
- detection_masks: [batch_size, max_detections, mask_height, mask_width]
- (optional)
- num_detections: [batch]
- raw_detection_boxes: [batch, total_detections, 4] tensor with decoded
- detection boxes before Non-Max Suppression.
- raw_detection_scores: [batch, total_detections,
- num_classes_with_background] tensor of multi-class score logits for
- raw detection boxes.
- Raises:
- ValueError: if prediction_dict does not contain `box_encodings` or
- `class_predictions_with_background` fields.
- """
- if ('box_encodings' not in prediction_dict or
- 'class_predictions_with_background' not in prediction_dict):
- raise ValueError('prediction_dict does not contain expected entries.')
- with tf.name_scope('Postprocessor'):
- preprocessed_images = prediction_dict['preprocessed_inputs']
- box_encodings = prediction_dict['box_encodings']
- box_encodings = tf.identity(box_encodings, 'raw_box_encodings')
- class_predictions = prediction_dict['class_predictions_with_background']
- detection_boxes, detection_keypoints = self._batch_decode(box_encodings)
- detection_boxes = tf.identity(detection_boxes, 'raw_box_locations')
- detection_boxes = tf.expand_dims(detection_boxes, axis=2)
-
- detection_scores = self._score_conversion_fn(class_predictions)
- detection_scores = tf.identity(detection_scores, 'raw_box_scores')
- if self._add_background_class or self._explicit_background_class:
- detection_scores = tf.slice(detection_scores, [0, 0, 1], [-1, -1, -1])
- additional_fields = None
-
- batch_size = (
- shape_utils.combined_static_and_dynamic_shape(preprocessed_images)[0])
-
- if 'feature_maps' in prediction_dict:
- feature_map_list = []
- for feature_map in prediction_dict['feature_maps']:
- feature_map_list.append(tf.reshape(feature_map, [batch_size, -1]))
- box_features = tf.concat(feature_map_list, 1)
- box_features = tf.identity(box_features, 'raw_box_features')
-
- if detection_keypoints is not None:
- additional_fields = {
- fields.BoxListFields.keypoints: detection_keypoints}
- (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks,
- nmsed_additional_fields, num_detections) = self._non_max_suppression_fn(
- detection_boxes,
- detection_scores,
- clip_window=self._compute_clip_window(preprocessed_images,
- true_image_shapes),
- additional_fields=additional_fields,
- masks=prediction_dict.get('mask_predictions'))
- detection_dict = {
- fields.DetectionResultFields.detection_boxes:
- nmsed_boxes,
- fields.DetectionResultFields.detection_scores:
- nmsed_scores,
- fields.DetectionResultFields.detection_classes:
- nmsed_classes,
- fields.DetectionResultFields.num_detections:
- tf.to_float(num_detections),
- fields.DetectionResultFields.raw_detection_boxes:
- tf.squeeze(detection_boxes, axis=2),
- fields.DetectionResultFields.raw_detection_scores:
- class_predictions
- }
- if (nmsed_additional_fields is not None and
- fields.BoxListFields.keypoints in nmsed_additional_fields):
- detection_dict[fields.DetectionResultFields.detection_keypoints] = (
- nmsed_additional_fields[fields.BoxListFields.keypoints])
- if nmsed_masks is not None:
- detection_dict[
- fields.DetectionResultFields.detection_masks] = nmsed_masks
- return detection_dict
-
- def loss(self, prediction_dict, true_image_shapes, scope=None):
- """Compute scalar loss tensors with respect to provided groundtruth.
-
- Calling this function requires that groundtruth tensors have been
- provided via the provide_groundtruth function.
-
- Args:
- prediction_dict: a dictionary holding prediction tensors with
- 1) box_encodings: 3-D float tensor of shape [batch_size, num_anchors,
- box_code_dimension] containing predicted boxes.
- 2) class_predictions_with_background: 3-D float tensor of shape
- [batch_size, num_anchors, num_classes+1] containing class predictions
- (logits) for each of the anchors. Note that this tensor *includes*
- background class predictions.
- true_image_shapes: int32 tensor of shape [batch, 3] where each row is
- of the form [height, width, channels] indicating the shapes
- of true images in the resized images, as resized images can be padded
- with zeros.
- scope: Optional scope name.
-
- Returns:
- a dictionary mapping loss keys (`localization_loss` and
- `classification_loss`) to scalar tensors representing corresponding loss
- values.
- """
- with tf.name_scope(scope, 'Loss', prediction_dict.values()):
- keypoints = None
- if self.groundtruth_has_field(fields.BoxListFields.keypoints):
- keypoints = self.groundtruth_lists(fields.BoxListFields.keypoints)
- weights = None
- if self.groundtruth_has_field(fields.BoxListFields.weights):
- weights = self.groundtruth_lists(fields.BoxListFields.weights)
- confidences = None
- if self.groundtruth_has_field(fields.BoxListFields.confidences):
- confidences = self.groundtruth_lists(fields.BoxListFields.confidences)
- (batch_cls_targets, batch_cls_weights, batch_reg_targets,
- batch_reg_weights, match_list) = self._assign_targets(
- self.groundtruth_lists(fields.BoxListFields.boxes),
- self.groundtruth_lists(fields.BoxListFields.classes),
- keypoints, weights, confidences)
- if self._add_summaries:
- self._summarize_target_assignment(
- self.groundtruth_lists(fields.BoxListFields.boxes), match_list)
-
- if self._random_example_sampler:
- batch_cls_per_anchor_weights = tf.reduce_mean(
- batch_cls_weights, axis=-1)
- batch_sampled_indicator = tf.to_float(
- shape_utils.static_or_dynamic_map_fn(
- self._minibatch_subsample_fn,
- [batch_cls_targets, batch_cls_per_anchor_weights],
- dtype=tf.bool,
- parallel_iterations=self._parallel_iterations,
- back_prop=True))
- batch_reg_weights = tf.multiply(batch_sampled_indicator,
- batch_reg_weights)
- batch_cls_weights = tf.multiply(
- tf.expand_dims(batch_sampled_indicator, -1),
- batch_cls_weights)
-
- losses_mask = None
- if self.groundtruth_has_field(fields.InputDataFields.is_annotated):
- losses_mask = tf.stack(self.groundtruth_lists(
- fields.InputDataFields.is_annotated))
- location_losses = self._localization_loss(
- prediction_dict['box_encodings'],
- batch_reg_targets,
- ignore_nan_targets=True,
- weights=batch_reg_weights,
- losses_mask=losses_mask)
-
- cls_losses = self._classification_loss(
- prediction_dict['class_predictions_with_background'],
- batch_cls_targets,
- weights=batch_cls_weights,
- losses_mask=losses_mask)
-
- if self._expected_loss_weights_fn:
- # Need to compute losses for assigned targets against both the
- # unmatched_class_label and their assigned targets. The simplest
- # approach (though wasteful) is to calculate all losses twice.
- batch_size, num_anchors, num_classes = batch_cls_targets.get_shape()
- unmatched_targets = tf.ones([batch_size, num_anchors, 1
- ]) * self._unmatched_class_label
-
- unmatched_cls_losses = self._classification_loss(
- prediction_dict['class_predictions_with_background'],
- unmatched_targets,
- weights=batch_cls_weights,
- losses_mask=losses_mask)
-
- if cls_losses.get_shape().ndims == 3:
- batch_size, num_anchors, num_classes = cls_losses.get_shape()
- cls_losses = tf.reshape(cls_losses, [batch_size, -1])
- unmatched_cls_losses = tf.reshape(unmatched_cls_losses,
- [batch_size, -1])
- batch_cls_targets = tf.reshape(
- batch_cls_targets, [batch_size, num_anchors * num_classes, -1])
- batch_cls_targets = tf.concat(
- [1 - batch_cls_targets, batch_cls_targets], axis=-1)
-
- location_losses = tf.tile(location_losses, [1, num_classes])
-
- foreground_weights, background_weights = (
- self._expected_loss_weights_fn(batch_cls_targets))
-
- cls_losses = (
- foreground_weights * cls_losses +
- background_weights * unmatched_cls_losses)
-
- location_losses *= foreground_weights
-
- classification_loss = tf.reduce_sum(cls_losses)
- localization_loss = tf.reduce_sum(location_losses)
- elif self._hard_example_miner:
- cls_losses = ops.reduce_sum_trailing_dimensions(cls_losses, ndims=2)
- (localization_loss, classification_loss) = self._apply_hard_mining(
- location_losses, cls_losses, prediction_dict, match_list)
- if self._add_summaries:
- self._hard_example_miner.summarize()
- else:
- cls_losses = ops.reduce_sum_trailing_dimensions(cls_losses, ndims=2)
- localization_loss = tf.reduce_sum(location_losses)
- classification_loss = tf.reduce_sum(cls_losses)
-
- # Optionally normalize by number of positive matches
- normalizer = tf.constant(1.0, dtype=tf.float32)
- if self._normalize_loss_by_num_matches:
- normalizer = tf.maximum(tf.to_float(tf.reduce_sum(batch_reg_weights)),
- 1.0)
-
- localization_loss_normalizer = normalizer
- if self._normalize_loc_loss_by_codesize:
- localization_loss_normalizer *= self._box_coder.code_size
- localization_loss = tf.multiply((self._localization_loss_weight /
- localization_loss_normalizer),
- localization_loss,
- name='localization_loss')
- classification_loss = tf.multiply((self._classification_loss_weight /
- normalizer), classification_loss,
- name='classification_loss')
-
- loss_dict = {
- str(localization_loss.op.name): localization_loss,
- str(classification_loss.op.name): classification_loss
- }
-
- return loss_dict
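-
- # Normalization sketch (derived from the code above): with
- # normalize_loss_by_num_matches=True and 50 matched anchors, both weighted
- # losses are divided by 50; with normalize_loc_loss_by_codesize=True and a
- # box coder of code size 4, the localization loss is divided by another
- # factor of 4.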
-
- def _minibatch_subsample_fn(self, inputs):
- """Randomly samples anchors for one image.
-
- Args:
- inputs: a list of 2 inputs. First one is a tensor of shape [num_anchors,
- num_classes] indicating targets assigned to each anchor. Second one
- is a tensor of shape [num_anchors] indicating the class weight of each
- anchor.
-
- Returns:
- batch_sampled_indicator: bool tensor of shape [num_anchors] indicating
- whether the anchor should be selected for loss computation.
- """
- cls_targets, cls_weights = inputs
- if self._add_background_class:
- # Set background_class bits to 0 so that the positives_indicator
- # computation would not consider background class.
- background_class = tf.zeros_like(tf.slice(cls_targets, [0, 0], [-1, 1]))
- regular_class = tf.slice(cls_targets, [0, 1], [-1, -1])
- cls_targets = tf.concat([background_class, regular_class], 1)
- positives_indicator = tf.reduce_sum(cls_targets, axis=1)
- return self._random_example_sampler.subsample(
- tf.cast(cls_weights, tf.bool),
- batch_size=None,
- labels=tf.cast(positives_indicator, tf.bool))
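-
- # Illustration: with add_background_class=True, a one-hot row [1, 0, 0]
- # (background) zeroes out to a sum of 0 and is treated as a negative, while
- # [0, 1, 0] sums to 1 and is treated as a positive by the sampler.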
-
- def _summarize_anchor_classification_loss(self, class_ids, cls_losses):
- positive_indices = tf.where(tf.greater(class_ids, 0))
- positive_anchor_cls_loss = tf.squeeze(
- tf.gather(cls_losses, positive_indices), axis=1)
- visualization_utils.add_cdf_image_summary(positive_anchor_cls_loss,
- 'PositiveAnchorLossCDF')
- negative_indices = tf.where(tf.equal(class_ids, 0))
- negative_anchor_cls_loss = tf.squeeze(
- tf.gather(cls_losses, negative_indices), axis=1)
- visualization_utils.add_cdf_image_summary(negative_anchor_cls_loss,
- 'NegativeAnchorLossCDF')
-
- def _assign_targets(self,
- groundtruth_boxes_list,
- groundtruth_classes_list,
- groundtruth_keypoints_list=None,
- groundtruth_weights_list=None,
- groundtruth_confidences_list=None):
- """Assign groundtruth targets.
-
- Adds a background class to each one-hot encoding of groundtruth classes
- and uses target assigner to obtain regression and classification targets.
-
- Args:
- groundtruth_boxes_list: a list of 2-D tensors of shape [num_boxes, 4]
- containing coordinates of the groundtruth boxes.
- Groundtruth boxes are provided in [y_min, x_min, y_max, x_max]
- format and assumed to be normalized and clipped
- relative to the image window with y_min <= y_max and x_min <= x_max.
- groundtruth_classes_list: a list of 2-D one-hot (or k-hot) tensors of
- shape [num_boxes, num_classes] containing the class targets with the 0th
- index assumed to map to the first non-background class.
- groundtruth_keypoints_list: (optional) a list of 3-D tensors of shape
- [num_boxes, num_keypoints, 2]
- groundtruth_weights_list: A list of 1-D tf.float32 tensors of shape
- [num_boxes] containing weights for groundtruth boxes.
- groundtruth_confidences_list: A list of 2-D tf.float32 tensors of shape
- [num_boxes, num_classes] containing class confidences for
- groundtruth boxes.
-
- Returns:
- batch_cls_targets: a tensor with shape [batch_size, num_anchors,
- num_classes],
- batch_cls_weights: a tensor with shape [batch_size, num_anchors],
- batch_reg_targets: a tensor with shape [batch_size, num_anchors,
- box_code_dimension]
- batch_reg_weights: a tensor with shape [batch_size, num_anchors],
- match_list: a list of matcher.Match objects encoding the match between
- anchors and groundtruth boxes for each image of the batch,
- with rows of the Match objects corresponding to groundtruth boxes
- and columns corresponding to anchors.
- """
- groundtruth_boxlists = [
- box_list.BoxList(boxes) for boxes in groundtruth_boxes_list
- ]
- train_using_confidences = (self._is_training and
- self._use_confidences_as_targets)
- if self._add_background_class:
- groundtruth_classes_with_background_list = [
- tf.pad(one_hot_encoding, [[0, 0], [1, 0]], mode='CONSTANT')
- for one_hot_encoding in groundtruth_classes_list
- ]
- if train_using_confidences:
- groundtruth_confidences_with_background_list = [
- tf.pad(groundtruth_confidences, [[0, 0], [1, 0]], mode='CONSTANT')
- for groundtruth_confidences in groundtruth_confidences_list
- ]
- else:
- groundtruth_classes_with_background_list = groundtruth_classes_list
-
- if groundtruth_keypoints_list is not None:
- for boxlist, keypoints in zip(
- groundtruth_boxlists, groundtruth_keypoints_list):
- boxlist.add_field(fields.BoxListFields.keypoints, keypoints)
- if train_using_confidences:
- return target_assigner.batch_assign_confidences(
- self._target_assigner,
- self.anchors,
- groundtruth_boxlists,
- groundtruth_confidences_with_background_list,
- groundtruth_weights_list,
- self._unmatched_class_label,
- self._add_background_class,
- self._implicit_example_weight)
- else:
- return target_assigner.batch_assign_targets(
- self._target_assigner,
- self.anchors,
- groundtruth_boxlists,
- groundtruth_classes_with_background_list,
- self._unmatched_class_label,
- groundtruth_weights_list)
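-
- # Background padding example (derived from the tf.pad calls above): a k-hot
- # row [0, 1, 0] becomes [0, 0, 1, 0], with index 0 reserved for background.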
-
- def _summarize_target_assignment(self, groundtruth_boxes_list, match_list):
- """Creates tensorflow summaries for the input boxes and anchors.
-
- This function creates four summaries corresponding to the average
- number (over images in a batch) of (1) groundtruth boxes, (2) anchors
- marked as positive, (3) anchors marked as negative, and (4) anchors marked
- as ignored.
-
- Args:
- groundtruth_boxes_list: a list of 2-D tensors of shape [num_boxes, 4]
- containing corners of the groundtruth boxes.
- match_list: a list of matcher.Match objects encoding the match between
- anchors and groundtruth boxes for each image of the batch,
- with rows of the Match objects corresponding to groundtruth boxes
- and columns corresponding to anchors.
- """
- num_boxes_per_image = tf.stack(
- [tf.shape(x)[0] for x in groundtruth_boxes_list])
- pos_anchors_per_image = tf.stack(
- [match.num_matched_columns() for match in match_list])
- neg_anchors_per_image = tf.stack(
- [match.num_unmatched_columns() for match in match_list])
- ignored_anchors_per_image = tf.stack(
- [match.num_ignored_columns() for match in match_list])
- tf.summary.scalar('AvgNumGroundtruthBoxesPerImage',
- tf.reduce_mean(tf.to_float(num_boxes_per_image)),
- family='TargetAssignment')
- tf.summary.scalar('AvgNumPositiveAnchorsPerImage',
- tf.reduce_mean(tf.to_float(pos_anchors_per_image)),
- family='TargetAssignment')
- tf.summary.scalar('AvgNumNegativeAnchorsPerImage',
- tf.reduce_mean(tf.to_float(neg_anchors_per_image)),
- family='TargetAssignment')
- tf.summary.scalar('AvgNumIgnoredAnchorsPerImage',
- tf.reduce_mean(tf.to_float(ignored_anchors_per_image)),
- family='TargetAssignment')
-
- def _apply_hard_mining(self, location_losses, cls_losses, prediction_dict,
- match_list):
- """Applies hard mining to anchorwise losses.
-
- Args:
- location_losses: Float tensor of shape [batch_size, num_anchors]
- representing anchorwise location losses.
- cls_losses: Float tensor of shape [batch_size, num_anchors]
- representing anchorwise classification losses.
- prediction_dict: a dictionary holding prediction tensors with
- 1) box_encodings: 3-D float tensor of shape [batch_size, num_anchors,
- box_code_dimension] containing predicted boxes.
- 2) class_predictions_with_background: 3-D float tensor of shape
- [batch_size, num_anchors, num_classes+1] containing class predictions
- (logits) for each of the anchors. Note that this tensor *includes*
- background class predictions.
- match_list: a list of matcher.Match objects encoding the match between
- anchors and groundtruth boxes for each image of the batch,
- with rows of the Match objects corresponding to groundtruth boxes
- and columns corresponding to anchors.
-
- Returns:
- mined_location_loss: a float scalar with sum of localization losses from
- selected hard examples.
- mined_cls_loss: a float scalar with sum of classification losses from
- selected hard examples.
- """
- class_predictions = prediction_dict['class_predictions_with_background']
- if self._add_background_class:
- class_predictions = tf.slice(class_predictions, [0, 0, 1], [-1, -1, -1])
-
- decoded_boxes, _ = self._batch_decode(prediction_dict['box_encodings'])
- decoded_box_tensors_list = tf.unstack(decoded_boxes)
- class_prediction_list = tf.unstack(class_predictions)
- decoded_boxlist_list = []
- for box_location, box_score in zip(decoded_box_tensors_list,
- class_prediction_list):
- decoded_boxlist = box_list.BoxList(box_location)
- decoded_boxlist.add_field('scores', box_score)
- decoded_boxlist_list.append(decoded_boxlist)
- return self._hard_example_miner(
- location_losses=location_losses,
- cls_losses=cls_losses,
- decoded_boxlist_list=decoded_boxlist_list,
- match_list=match_list)
-
- def _batch_decode(self, box_encodings):
- """Decodes a batch of box encodings with respect to the anchors.
-
- Args:
- box_encodings: A float32 tensor of shape
- [batch_size, num_anchors, box_code_size] containing box encodings.
-
- Returns:
- decoded_boxes: A float32 tensor of shape
- [batch_size, num_anchors, 4] containing the decoded boxes.
- decoded_keypoints: A float32 tensor of shape
- [batch_size, num_anchors, num_keypoints, 2] containing the decoded
- keypoints if present in the input `box_encodings`, None otherwise.
- """
- combined_shape = shape_utils.combined_static_and_dynamic_shape(
- box_encodings)
- batch_size = combined_shape[0]
- tiled_anchor_boxes = tf.tile(
- tf.expand_dims(self.anchors.get(), 0), [batch_size, 1, 1])
- tiled_anchors_boxlist = box_list.BoxList(
- tf.reshape(tiled_anchor_boxes, [-1, 4]))
- decoded_boxes = self._box_coder.decode(
- tf.reshape(box_encodings, [-1, self._box_coder.code_size]),
- tiled_anchors_boxlist)
- decoded_keypoints = None
- if decoded_boxes.has_field(fields.BoxListFields.keypoints):
- decoded_keypoints = decoded_boxes.get_field(
- fields.BoxListFields.keypoints)
- num_keypoints = decoded_keypoints.get_shape()[1]
- decoded_keypoints = tf.reshape(
- decoded_keypoints,
- tf.stack([combined_shape[0], combined_shape[1], num_keypoints, 2]))
- decoded_boxes = tf.reshape(decoded_boxes.get(), tf.stack(
- [combined_shape[0], combined_shape[1], 4]))
- return decoded_boxes, decoded_keypoints
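-
- # Shape walkthrough (illustrative numbers): with batch_size=8 and 1917
- # anchors, the anchors are tiled to [8, 1917, 4], flattened to
- # [8 * 1917, 4] for decoding, then reshaped back to [8, 1917, 4].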
-
- def regularization_losses(self):
- """Returns a list of regularization losses for this model.
-
- Returns a list of regularization losses for this model that the estimator
- needs to use during training/optimization.
-
- Returns:
- A list of regularization loss tensors.
- """
- losses = []
- slim_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
- # Copy the slim losses to avoid modifying the collection
- if slim_losses:
- losses.extend(slim_losses)
- if self._box_predictor.is_keras_model:
- losses.extend(self._box_predictor.losses)
- if self._feature_extractor.is_keras_model:
- losses.extend(self._feature_extractor.losses)
- return losses
-
- def restore_map(self,
- fine_tune_checkpoint_type='detection',
- load_all_detection_checkpoint_vars=False):
- """Returns a map of variables to load from a foreign checkpoint.
-
- See parent class for details.
-
- Args:
- fine_tune_checkpoint_type: whether to restore from a full detection
- checkpoint (with compatible variable names) or to restore from a
- classification checkpoint for initialization prior to training.
- Valid values: `detection`, `classification`. Default 'detection'.
- load_all_detection_checkpoint_vars: whether to load all variables (when
- `fine_tune_checkpoint_type='detection'`). If False, only variables
- within the appropriate scopes are included. Default False.
-
- Returns:
- A dict mapping variable names (to load from a checkpoint) to variables in
- the model graph.
- Raises:
- ValueError: if fine_tune_checkpoint_type is neither `classification`
- nor `detection`.
- """
- if fine_tune_checkpoint_type not in ['detection', 'classification']:
- raise ValueError('Unsupported fine_tune_checkpoint_type: {}'.format(
- fine_tune_checkpoint_type))
-
- if fine_tune_checkpoint_type == 'classification':
- return self._feature_extractor.restore_from_classification_checkpoint_fn(
- self._extract_features_scope)
-
- if fine_tune_checkpoint_type == 'detection':
- variables_to_restore = {}
- for variable in tf.global_variables():
- var_name = variable.op.name
- if load_all_detection_checkpoint_vars:
- variables_to_restore[var_name] = variable
- else:
- if var_name.startswith(self._extract_features_scope):
- variables_to_restore[var_name] = variable
-
- return variables_to_restore
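-
- # Sketch of the two modes (names illustrative): 'classification' maps
- # 'FeatureExtractor/...' variables onto un-prefixed checkpoint names via the
- # feature extractor; 'detection' restores variables by their own names,
- # optionally restricted to the feature extractor scope.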
-
- def updates(self):
- """Returns a list of update operators for this model.
-
- Returns a list of update operators for this model that must be executed at
- each training step. The estimator's train op needs to have a control
- dependency on these updates.
-
- Returns:
- A list of update operators.
- """
- update_ops = []
- slim_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
- # Copy the slim ops to avoid modifying the collection
- if slim_update_ops:
- update_ops.extend(slim_update_ops)
- if self._box_predictor.is_keras_model:
- update_ops.extend(self._box_predictor.get_updates_for(None))
- update_ops.extend(self._box_predictor.get_updates_for(
- self._box_predictor.inputs))
- if self._feature_extractor.is_keras_model:
- update_ops.extend(self._feature_extractor.get_updates_for(None))
- update_ops.extend(self._feature_extractor.get_updates_for(
- self._feature_extractor.inputs))
- return update_ops