- # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- # ==============================================================================
- """Abstract detection model.
- This file defines a generic base class for detection models. Programs that are
- designed to work with arbitrary detection models should only depend on this
- class. We intend for the functions in this class to follow tensor-in/tensor-out
- design, thus all functions have tensors or lists/dictionaries holding tensors as
- inputs and outputs.
- Abstractly, detection models predict output tensors given input images
- which can be passed to a loss function at training time or passed to a
- postprocessing function at eval time. The computation graphs at a high level
- consequently look as follows:
- Training time:
- inputs (images tensor) -> preprocess -> predict -> loss -> outputs (loss tensor)
- Evaluation time:
- inputs (images tensor) -> preprocess -> predict -> postprocess
- -> outputs (boxes tensor, scores tensor, classes tensor, num_detections tensor)
- DetectionModels must thus implement four functions (1) preprocess, (2) predict,
- (3) postprocess and (4) loss. DetectionModels should make no assumptions about
- the input size or aspect ratio --- they are responsible for doing any
- resize/reshaping necessary (see docstring for the preprocess function).
- Output classes are always integers in the range [0, num_classes). Any mapping
- of these integers to semantic labels is to be handled outside of this class.
- Images are resized in the `preprocess` method. All of `preprocess`, `predict`,
- and `postprocess` should be reentrant.
- The `preprocess` method runs `image_resizer_fn` that returns resized_images and
- `true_image_shapes`. Since `image_resizer_fn` can pad the images with zeros,
- true_image_shapes indicate the slices that contain the image without padding.
- This is useful for padding images to be a fixed size for batching.
- The `postprocess` method uses the true image shapes to clip predictions that lie
- outside of images.
- By default, DetectionModels produce bounding box detections; however, we support
- a handful of auxiliary annotations associated with each bounding box, namely,
- instance masks and keypoints.
- """
- import abc
- from object_detection.core import standard_fields as fields
# ABCMeta-backed base class that works on both Python 2 and Python 3. A
# class-level `__metaclass__` attribute is ignored by Python 3, which would
# leave the `abc.abstractmethod` markers below unenforced.
_DetectionModelBase = abc.ABCMeta('_DetectionModelBase', (object,), {})


class DetectionModel(_DetectionModelBase):
  """Abstract base class for detection models.

  Concrete subclasses must implement `preprocess`, `predict`, `postprocess`,
  `loss`, `regularization_losses`, `restore_map` and `updates`. A subclass that
  leaves any of these unimplemented cannot be instantiated.

  The base class itself only stores the number of classes and the groundtruth
  lists registered through `provide_groundtruth`.
  """

  def __init__(self, num_classes):
    """Constructor.

    Args:
      num_classes: number of classes.  Note that num_classes *does not* include
        background categories that might be implicitly predicted in various
        implementations.
    """
    self._num_classes = num_classes
    # Maps a groundtruth field key to the per-image list supplied through
    # `provide_groundtruth`.
    self._groundtruth_lists = {}

  @property
  def num_classes(self):
    """The `num_classes` value passed to the constructor."""
    return self._num_classes

  def groundtruth_lists(self, field):
    """Access list of groundtruth tensors.

    Args:
      field: a string key, options are
        fields.BoxListFields.{boxes,classes,masks,keypoints} or
        fields.InputDataFields.is_annotated.

    Returns:
      a list of tensors holding groundtruth information (see also
      provide_groundtruth function below), with one entry for each image in the
      batch.

    Raises:
      RuntimeError: if the field has not been provided via provide_groundtruth.
    """
    if field not in self._groundtruth_lists:
      raise RuntimeError('Groundtruth tensor {} has not been provided'.format(
          field))
    return self._groundtruth_lists[field]

  def groundtruth_has_field(self, field):
    """Determines whether the groundtruth includes the given field.

    Args:
      field: a string key, options are
        fields.BoxListFields.{boxes,classes,masks,keypoints} or
        fields.InputDataFields.is_annotated.

    Returns:
      True if the groundtruth includes the given field, False otherwise.
    """
    return field in self._groundtruth_lists

  @abc.abstractmethod
  def preprocess(self, inputs):
    """Input preprocessing.

    To be overridden by implementations.

    This function is responsible for any scaling/shifting of input values that
    is necessary prior to running the detector on an input image.
    It is also responsible for any resizing, padding that might be necessary
    as images are assumed to arrive in arbitrary sizes.  While this function
    could conceivably be part of the predict method (below), it is often
    convenient to keep these separate --- for example, we may want to preprocess
    on one device, place onto a queue, and let another device (e.g., the GPU)
    handle prediction.

    A few important notes about the preprocess function:
    + We assume that this operation does not have any trainable variables nor
      does it affect the groundtruth annotations in any way (thus data
      augmentation operations such as random cropping should be performed
      externally).
    + There is no assumption that the batch size in this function is the same
      as the batch size in the predict function.  In fact, we recommend calling
      the preprocess function prior to calling any batching operations (which
      should happen outside of the model) and thus assuming that batch sizes
      are equal to 1 in the preprocess function.
    + There is also no explicit assumption that the output resolutions
      must be fixed across inputs --- this is to support "fully convolutional"
      settings in which input images can have different shapes/resolutions.

    Args:
      inputs: a [batch, height_in, width_in, channels] float32 tensor
        representing a batch of images with values between 0 and 255.0.

    Returns:
      preprocessed_inputs: a [batch, height_out, width_out, channels] float32
        tensor representing a batch of images.
      true_image_shapes: int32 tensor of shape [batch, 3] where each row is
        of the form [height, width, channels] indicating the shapes
        of true images in the resized images, as resized images can be padded
        with zeros.
    """
    pass

  @abc.abstractmethod
  def predict(self, preprocessed_inputs, true_image_shapes):
    """Predict prediction tensors from inputs tensor.

    Outputs of this function can be passed to loss or postprocess functions.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float32 tensor
        representing a batch of images.
      true_image_shapes: int32 tensor of shape [batch, 3] where each row is
        of the form [height, width, channels] indicating the shapes
        of true images in the resized images, as resized images can be padded
        with zeros.

    Returns:
      prediction_dict: a dictionary holding prediction tensors to be
        passed to the Loss or Postprocess functions.
    """
    pass

  @abc.abstractmethod
  def postprocess(self, prediction_dict, true_image_shapes, **params):
    """Convert predicted output tensors to final detections.

    This stage typically performs a few things such as
    * Non-Max Suppression to remove overlapping detection boxes.
    * Score conversion and background class removal.

    Outputs adhere to the following conventions:
    * Classes are integers in [0, num_classes); background classes are removed
      and the first non-background class is mapped to 0. If the model produces
      class-agnostic detections, then no output is produced for classes.
    * Boxes are to be interpreted as being in [y_min, x_min, y_max, x_max]
      format and normalized relative to the image window.
    * `num_detections` is provided for settings where detections are padded to a
      fixed number of boxes.
    * We do not specifically assume any kind of probabilistic interpretation
      of the scores --- the only important thing is their relative ordering.
      Thus implementations of the postprocess function are free to output
      logits, probabilities, calibrated probabilities, or anything else.

    Args:
      prediction_dict: a dictionary holding prediction tensors.
      true_image_shapes: int32 tensor of shape [batch, 3] where each row is
        of the form [height, width, channels] indicating the shapes
        of true images in the resized images, as resized images can be padded
        with zeros.
      **params: Additional keyword arguments for specific implementations of
        DetectionModel.

    Returns:
      detections: a dictionary containing the following fields
        detection_boxes: [batch, max_detections, 4]
        detection_scores: [batch, max_detections]
        detection_classes: [batch, max_detections]
          (If a model is producing class-agnostic detections, this field may be
          missing)
        instance_masks: [batch, max_detections, image_height, image_width]
          (optional)
        keypoints: [batch, max_detections, num_keypoints, 2] (optional)
        num_detections: [batch]

        In addition to the above fields this stage also outputs the following
        raw tensors:

        raw_detection_boxes: [batch, total_detections, 4] tensor containing
          all detection boxes from `prediction_dict` in the format
          [ymin, xmin, ymax, xmax] and normalized co-ordinates.
        raw_detection_scores: [batch, total_detections,
          num_classes_with_background] tensor of class score logits for
          raw detection boxes.
    """
    pass

  @abc.abstractmethod
  def loss(self, prediction_dict, true_image_shapes):
    """Compute scalar loss tensors with respect to provided groundtruth.

    Calling this function requires that groundtruth tensors have been
    provided via the provide_groundtruth function.

    Args:
      prediction_dict: a dictionary holding predicted tensors
      true_image_shapes: int32 tensor of shape [batch, 3] where each row is
        of the form [height, width, channels] indicating the shapes
        of true images in the resized images, as resized images can be padded
        with zeros.

    Returns:
      a dictionary mapping strings (loss names) to scalar tensors representing
        loss values.
    """
    pass

  def provide_groundtruth(self,
                          groundtruth_boxes_list,
                          groundtruth_classes_list,
                          groundtruth_masks_list=None,
                          groundtruth_keypoints_list=None,
                          groundtruth_weights_list=None,
                          groundtruth_confidences_list=None,
                          groundtruth_is_crowd_list=None,
                          is_annotated_list=None):
    """Provide groundtruth tensors.

    Boxes and classes are always stored. Each optional list is stored only when
    it is truthy (i.e. neither None nor empty); otherwise its entry is left
    untouched, so a value registered by an earlier call is not cleared.

    Args:
      groundtruth_boxes_list: a list of 2-D tf.float32 tensors of shape
        [num_boxes, 4] containing coordinates of the groundtruth boxes.
          Groundtruth boxes are provided in [y_min, x_min, y_max, x_max]
          format and assumed to be normalized and clipped
          relative to the image window with y_min <= y_max and x_min <= x_max.
      groundtruth_classes_list: a list of 2-D tf.float32 one-hot (or k-hot)
        tensors of shape [num_boxes, num_classes] containing the class targets
        with the 0th index assumed to map to the first non-background class.
      groundtruth_masks_list: a list of 3-D tf.float32 tensors of
        shape [num_boxes, height_in, width_in] containing instance
        masks with values in {0, 1}.  If None, no masks are provided.
        Mask resolution `height_in`x`width_in` must agree with the resolution
        of the input image tensor provided to the `preprocess` function.
      groundtruth_keypoints_list: a list of 3-D tf.float32 tensors of
        shape [num_boxes, num_keypoints, 2] containing keypoints.
        Keypoints are assumed to be provided in normalized coordinates and
        missing keypoints should be encoded as NaN.
      groundtruth_weights_list: A list of 1-D tf.float32 tensors of shape
        [num_boxes] containing weights for groundtruth boxes.
      groundtruth_confidences_list: A list of 2-D tf.float32 tensors of shape
        [num_boxes, num_classes] containing class confidences for groundtruth
        boxes.
      groundtruth_is_crowd_list: A list of 1-D tf.bool tensors of shape
        [num_boxes] containing is_crowd annotations
      is_annotated_list: A list of scalar tf.bool tensors indicating whether
        images have been labeled or not.
    """
    # Mandatory groundtruth: always overwritten.
    self._groundtruth_lists[fields.BoxListFields.boxes] = groundtruth_boxes_list
    self._groundtruth_lists[
        fields.BoxListFields.classes] = groundtruth_classes_list
    # Optional groundtruth: only recorded when a non-empty list is supplied.
    if groundtruth_weights_list:
      self._groundtruth_lists[
          fields.BoxListFields.weights] = groundtruth_weights_list
    if groundtruth_confidences_list:
      self._groundtruth_lists[
          fields.BoxListFields.confidences] = groundtruth_confidences_list
    if groundtruth_masks_list:
      self._groundtruth_lists[
          fields.BoxListFields.masks] = groundtruth_masks_list
    if groundtruth_keypoints_list:
      self._groundtruth_lists[
          fields.BoxListFields.keypoints] = groundtruth_keypoints_list
    if groundtruth_is_crowd_list:
      self._groundtruth_lists[
          fields.BoxListFields.is_crowd] = groundtruth_is_crowd_list
    if is_annotated_list:
      self._groundtruth_lists[
          fields.InputDataFields.is_annotated] = is_annotated_list

  @abc.abstractmethod
  def regularization_losses(self):
    """Returns a list of regularization losses for this model.

    Returns a list of regularization losses for this model that the estimator
    needs to use during training/optimization.

    Returns:
      A list of regularization loss tensors.
    """
    pass

  @abc.abstractmethod
  def restore_map(self, fine_tune_checkpoint_type='detection'):
    """Returns a map of variables to load from a foreign checkpoint.

    Returns a map of variable names to load from a checkpoint to variables in
    the model graph. This enables the model to initialize based on weights from
    another task. For example, the feature extractor variables from a
    classification model can be used to bootstrap training of an object
    detector. When loading from an object detection model, the checkpoint model
    should have the same parameters as this detection model with exception of
    the num_classes parameter.

    Args:
      fine_tune_checkpoint_type: whether to restore from a full detection
        checkpoint (with compatible variable names) or to restore from a
        classification checkpoint for initialization prior to training.
        Valid values: `detection`, `classification`. Default 'detection'.

    Returns:
      A dict mapping variable names (to load from a checkpoint) to variables in
      the model graph.
    """
    pass

  @abc.abstractmethod
  def updates(self):
    """Returns a list of update operators for this model.

    Returns a list of update operators for this model that must be executed at
    each training step. The estimator's train op needs to have a control
    dependency on these updates.

    Returns:
      A list of update operators.
    """
    pass