|
|
- # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- # ==============================================================================
- """Evaluator class for Visual Relations Detection.
-
- VRDDetectionEvaluator is a class which manages ground truth information of a
- visual relations detection (vrd) dataset, and computes frequently used detection
- metrics such as Precision, Recall, Recall@k, of the provided vrd detection
- results.
- It supports the following operations:
- 1) Adding ground truth information of images sequentially.
- 2) Adding detection results of images sequentially.
- 3) Evaluating detection metrics on already inserted detection results.
-
- Note1: groundtruth should be inserted before evaluation.
- Note2: This module operates on numpy boxes and box lists.
- """
-
- from abc import abstractmethod
- import collections
- import logging
- import numpy as np
-
- from object_detection.core import standard_fields
- from object_detection.utils import metrics
- from object_detection.utils import object_detection_evaluation
- from object_detection.utils import per_image_vrd_evaluation
-
# Structured numpy datatypes expected by the evaluators in this module:
#   vrd_box_data_type - box annotation of a visual relation; it consists of two
#     named boxes, 'subject' and 'object'. Each box is of the format
#     [y_min, x_min, y_max, x_max], each coordinate being of type float32.
#   single_box_data_type - a single named box, 'box', in the same format.
#   label_data_type - label annotation of a visual relation; it consists of
#     three numerical class labels, 'subject', 'object' and 'relation', each
#     being of type int32.
vrd_box_data_type = np.dtype([
    ('subject', np.float32, (4,)),
    ('object', np.float32, (4,)),
])
single_box_data_type = np.dtype([('box', np.float32, (4,))])
label_data_type = np.dtype([
    ('subject', np.int32),
    ('object', np.int32),
    ('relation', np.int32),
])
-
-
class VRDDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
  """A class to evaluate VRD detections.

  This class serves as a base class for VRD evaluation in two settings:
  - phrase detection
  - relation detection.

  Subclasses provide the box pre-processing hooks
  (_process_groundtruth_boxes and _process_detection_boxes); everything else
  (label filtering, metric naming) is shared.
  """

  def __init__(self, matching_iou_threshold=0.5, metric_prefix=None):
    """Constructor.

    Args:
      matching_iou_threshold: IOU threshold to use for matching groundtruth
        boxes to detection boxes.
      metric_prefix: (optional) string prefix for metric name; if None, no
        prefix is used.

    """
    super(VRDDetectionEvaluator, self).__init__([])
    self._matching_iou_threshold = matching_iou_threshold
    self._evaluation = _VRDDetectionEvaluation(
        matching_iou_threshold=self._matching_iou_threshold)
    self._image_ids = set([])
    self._metric_prefix = (metric_prefix + '_') if metric_prefix else ''
    # Per-image label sets, keyed by image id. Evaluatable labels are the
    # verified image-level labels plus the labels present in the groundtruth;
    # negative labels are verified labels absent from the groundtruth.
    self._evaluatable_labels = {}
    self._negative_labels = {}

  @abstractmethod
  def _process_groundtruth_boxes(self, groundtruth_box_tuples):
    """Pre-processes boxes before adding them to the VRDDetectionEvaluation.

    Phrase detection and Relation detection subclasses re-implement this method
    depending on the task.

    Args:
      groundtruth_box_tuples: A numpy array of structures with the shape
        [M, 1], each structure containing the same number of named bounding
        boxes. Each box is of the format [y_min, x_min, y_max, x_max] (see
        datatype vrd_box_data_type, single_box_data_type above).

    Raises:
      NotImplementedError: Always; subclasses must override this method.
    """
    raise NotImplementedError(
        '_process_groundtruth_boxes method should be implemented in subclasses '
        'of VRDDetectionEvaluator.')

  @abstractmethod
  def _process_detection_boxes(self, detections_box_tuples):
    """Pre-processes boxes before adding them to the VRDDetectionEvaluation.

    Phrase detection and Relation detection subclasses re-implement this method
    depending on the task.

    Args:
      detections_box_tuples: A numpy array of structures with the shape
        [M, 1], each structure containing the same number of named bounding
        boxes. Each box is of the format [y_min, x_min, y_max, x_max] (see
        datatype vrd_box_data_type, single_box_data_type above).

    Raises:
      NotImplementedError: Always; subclasses must override this method.
    """
    raise NotImplementedError(
        '_process_detection_boxes method should be implemented in subclasses '
        'of VRDDetectionEvaluator.')

  def add_single_ground_truth_image_info(self, image_id, groundtruth_dict):
    """Adds groundtruth for a single image to be used for evaluation.

    Args:
      image_id: A unique string/integer identifier for the image.
      groundtruth_dict: A dictionary containing -
        standard_fields.InputDataFields.groundtruth_boxes: A numpy array
          of structures with the shape [M, 1], representing M tuples, each tuple
          containing the same number of named bounding boxes.
          Each box is of the format [y_min, x_min, y_max, x_max] (see
          datatype vrd_box_data_type, single_box_data_type above).
        standard_fields.InputDataFields.groundtruth_classes: A numpy array of
          structures shape [M, 1], representing the class labels of the
          corresponding bounding boxes and possibly additional classes (see
          datatype label_data_type above).
        standard_fields.InputDataFields.groundtruth_image_classes: (optional)
          numpy array of shape [K] containing verified labels.

    Raises:
      ValueError: On adding groundtruth for an image more than once.
    """
    if image_id in self._image_ids:
      raise ValueError('Image with id {} already added.'.format(image_id))

    groundtruth_class_tuples = (
        groundtruth_dict[standard_fields.InputDataFields.groundtruth_classes])
    groundtruth_box_tuples = (
        groundtruth_dict[standard_fields.InputDataFields.groundtruth_boxes])

    self._evaluation.add_single_ground_truth_image_info(
        image_key=image_id,
        groundtruth_box_tuples=self._process_groundtruth_boxes(
            groundtruth_box_tuples),
        groundtruth_class_tuples=groundtruth_class_tuples)
    self._image_ids.update([image_id])

    # Only the class fields that have a matching named box are collected, i.e.
    # the field names are taken from the box dtype, not from the class dtype.
    groundtruth_positive_classes = np.unique(np.concatenate([
        groundtruth_class_tuples[field]
        for field in groundtruth_box_tuples.dtype.fields
    ]))
    verified_labels = groundtruth_dict.get(
        standard_fields.InputDataFields.groundtruth_image_classes,
        np.array([], dtype=int))
    self._evaluatable_labels[image_id] = np.unique(
        np.concatenate((verified_labels, groundtruth_positive_classes)))
    self._negative_labels[image_id] = np.setdiff1d(
        verified_labels, groundtruth_positive_classes)

  def add_single_detected_image_info(self, image_id, detections_dict):
    """Adds detections for a single image to be used for evaluation.

    Detections are filtered before being passed on: a detection is kept if
    all of its boxable labels are evaluatable for the image, or if at least one
    of them is a negative label for the image.

    Args:
      image_id: A unique string/integer identifier for the image.
      detections_dict: A dictionary containing -
        standard_fields.DetectionResultFields.detection_boxes: A numpy array of
          structures with shape [N, 1], representing N tuples, each tuple
          containing the same number of named bounding boxes.
          Each box is of the format [y_min, x_min, y_max, x_max] (as an example
          see datatype vrd_box_data_type, single_box_data_type above).
        standard_fields.DetectionResultFields.detection_scores: float32 numpy
          array of shape [N] containing detection scores for the boxes.
        standard_fields.DetectionResultFields.detection_classes: A numpy array
          of structures shape [N, 1], representing the class labels of the
          corresponding bounding boxes and possibly additional classes (see
          datatype label_data_type above).
    """
    if image_id not in self._image_ids:
      logging.warning('No groundtruth for the image with id %s.', image_id)
      # Groundtruth is expected to be inserted first. When it was not, the
      # image is registered with empty label sets, so the selector computed
      # below rejects every detection of this image.
      self._image_ids.update([image_id])
      self._negative_labels[image_id] = np.array([])
      self._evaluatable_labels[image_id] = np.array([])

    detection_box_tuples = detections_dict[
        standard_fields.DetectionResultFields.detection_boxes]
    detection_class_tuples = detections_dict[
        standard_fields.DetectionResultFields.detection_classes]
    detection_scores = detections_dict[
        standard_fields.DetectionResultFields.detection_scores]
    num_detections = detection_box_tuples.shape[0]

    negative_selector = np.zeros(num_detections, dtype=bool)
    selector = np.ones(num_detections, dtype=bool)
    # Only check boxable labels
    for field in detection_box_tuples.dtype.fields:
      # Verify if one of the labels is negative (this is sure FP)
      negative_selector |= np.isin(detection_class_tuples[field],
                                   self._negative_labels[image_id])
      # Verify if all labels are verified
      selector &= np.isin(detection_class_tuples[field],
                          self._evaluatable_labels[image_id])
    selector |= negative_selector

    self._evaluation.add_single_detected_image_info(
        image_key=image_id,
        detected_box_tuples=self._process_detection_boxes(
            detection_box_tuples[selector]),
        detected_scores=detection_scores[selector],
        detected_class_tuples=detection_class_tuples[selector])

  def evaluate(self, relationships=None):
    """Compute evaluation result.

    Args:
      relationships: A dictionary of numerical label-text label mapping; if
        specified, returns per-relationship AP.

    Returns:
      A dictionary of metrics with the following fields -

      summary_metrics:
        'weightedAP@<matching_iou_threshold>IOU' : weighted average precision
          at the specified IOU threshold.
        'AP@<matching_iou_threshold>IOU/<relationship>' : AP per relationship.
        'mAP@<matching_iou_threshold>IOU': mean average precision at the
          specified IOU threshold.
        'Recall@50@<matching_iou_threshold>IOU': recall@50 at the specified IOU
          threshold.
        'Recall@100@<matching_iou_threshold>IOU': recall@100 at the specified
          IOU threshold.
      if relationships is specified, returns <relationship> in AP metrics as
      readable names, otherwise the names correspond to class numbers.
      All metric names are prefixed with the metric prefix given at
      construction time, if any.
    """
    (weighted_average_precision, mean_average_precision, average_precisions, _,
     _, recall_50, recall_100, _, _) = self._evaluation.evaluate()

    prefix = self._metric_prefix
    iou = self._matching_iou_threshold
    vrd_metrics = {
        prefix + 'weightedAP@{}IOU'.format(iou): weighted_average_precision,
        prefix + 'mAP@{}IOU'.format(iou): mean_average_precision,
        prefix + 'Recall@50@{}IOU'.format(iou): recall_50,
        prefix + 'Recall@100@{}IOU'.format(iou): recall_100,
    }
    # dict.items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    for key, average_precision in average_precisions.items():
      relationship_name = relationships[key] if relationships else key
      metric_name = prefix + 'AP@{}IOU/{}'.format(iou, relationship_name)
      vrd_metrics[metric_name] = average_precision

    return vrd_metrics

  def clear(self):
    """Clears the state to prepare for a fresh evaluation."""
    self._evaluation = _VRDDetectionEvaluation(
        matching_iou_threshold=self._matching_iou_threshold)
    self._image_ids.clear()
    self._negative_labels.clear()
    self._evaluatable_labels.clear()
-
-
class VRDRelationDetectionEvaluator(VRDDetectionEvaluator):
  """Evaluator of VRD detections in the relation detection setting.

  Both groundtruth and detection boxes are expected to be of datatype
  vrd_box_data_type; both groundtruth and detection labels are expected to be
  of datatype label_data_type.
  """

  def __init__(self, matching_iou_threshold=0.5):
    """Constructor.

    Args:
      matching_iou_threshold: IOU threshold to use for matching groundtruth
        boxes to detection boxes.
    """
    super(VRDRelationDetectionEvaluator, self).__init__(
        matching_iou_threshold=matching_iou_threshold,
        metric_prefix='VRDMetric_Relationships')

  def _process_groundtruth_boxes(self, groundtruth_box_tuples):
    """Passes groundtruth boxes through without modification.

    Args:
      groundtruth_box_tuples: A numpy array of structures with the shape
        [M, 1], each structure containing the same number of named bounding
        boxes. Each box is of the format [y_min, x_min, y_max, x_max].

    Returns:
      The input array itself, unchanged.
    """
    return groundtruth_box_tuples

  def _process_detection_boxes(self, detections_box_tuples):
    """Passes detection boxes through without modification.

    Args:
      detections_box_tuples: A numpy array of structures with the shape
        [M, 1], each structure containing the same number of named bounding
        boxes. Each box is of the format [y_min, x_min, y_max, x_max] (see
        datatype vrd_box_data_type, single_box_data_type above).

    Returns:
      The input array itself, unchanged.
    """
    return detections_box_tuples
-
-
class VRDPhraseDetectionEvaluator(VRDDetectionEvaluator):
  """A class to evaluate VRD detections in phrase setting.

  Expected groundtruth box datatype is vrd_box_data_type, expected groundtruth
  labels datatype is label_data_type.
  Expected detection box datatype is single_box_data_type, expected detection
  labels datatype is label_data_type.
  """

  def __init__(self, matching_iou_threshold=0.5):
    """Constructor.

    Args:
      matching_iou_threshold: IOU threshold to use for matching groundtruth
        boxes to detection boxes.
    """
    super(VRDPhraseDetectionEvaluator, self).__init__(
        matching_iou_threshold=matching_iou_threshold,
        metric_prefix='VRDMetric_Phrases')

  @staticmethod
  def _compute_enclosing_boxes(box_tuples):
    """Collapses every tuple of named boxes into one enclosing box.

    Args:
      box_tuples: A numpy array of M structures, each structure containing the
        same number of named bounding boxes. Each box is of the format
        [y_min, x_min, y_max, x_max].

    Returns:
      A numpy array of M structures with a single float32 field 'box' of shape
      (4,); the i-th entry is the enclosing box of all named boxes of the i-th
      input structure.
    """
    # dtype.names is an ordered tuple and, unlike dtype.fields.keys(), can be
    # indexed on Python 3. The order is irrelevant for the min/max reduction.
    field_names = box_tuples.dtype.names
    first_boxes = box_tuples[field_names[0]]
    miny = first_boxes[:, 0]
    minx = first_boxes[:, 1]
    maxy = first_boxes[:, 2]
    maxx = first_boxes[:, 3]
    for field in field_names[1:]:
      boxes = box_tuples[field]
      miny = np.minimum(boxes[:, 0], miny)
      minx = np.minimum(boxes[:, 1], minx)
      maxy = np.maximum(boxes[:, 2], maxy)
      maxx = np.maximum(boxes[:, 3], maxx)

    result = np.zeros(box_tuples.shape[0], dtype=[('box', 'f4', (4,))])
    result['box'] = np.stack([miny, minx, maxy, maxx], axis=1)
    return result

  def _process_groundtruth_boxes(self, groundtruth_box_tuples):
    """Pre-processes boxes before adding them to the VRDDetectionEvaluation.

    In case of phrase evaluation task, evaluation expects exactly one bounding
    box containing all objects in the phrase. This bounding box is computed
    as an enclosing box of all groundtruth boxes of a phrase.

    Args:
      groundtruth_box_tuples: A numpy array of structures with the shape
        [M, 1], each structure containing the same number of named bounding
        boxes. Each box is of the format [y_min, x_min, y_max, x_max]. See
        vrd_box_data_type for an example of structure.

    Returns:
      result: A numpy array of structures with the shape [M, 1], each
        structure containing exactly one named bounding box. i-th output
        structure corresponds to the result of processing i-th input structure,
        where the named bounding box is computed as an enclosing bounding box
        of all bounding boxes of the i-th input structure.
    """
    return self._compute_enclosing_boxes(groundtruth_box_tuples)

  def _process_detection_boxes(self, detections_box_tuples):
    """Pre-processes boxes before adding them to the VRDDetectionEvaluation.

    In case of phrase evaluation task, evaluation expects exactly one bounding
    box containing all objects in the phrase. This bounding box is computed
    as an enclosing box of all detection boxes of a phrase.

    Args:
      detections_box_tuples: A numpy array of structures with the shape
        [M, 1], each structure containing the same number of named bounding
        boxes. Each box is of the format [y_min, x_min, y_max, x_max]. See
        vrd_box_data_type for an example of this structure.

    Returns:
      result: A numpy array of structures with the shape [M, 1], each
        structure containing exactly one named bounding box. i-th output
        structure corresponds to the result of processing i-th input structure,
        where the named bounding box is computed as an enclosing bounding box
        of all bounding boxes of the i-th input structure.
    """
    return self._compute_enclosing_boxes(detections_box_tuples)
-
-
# Result record returned by _VRDDetectionEvaluation.evaluate(); the field order
# below is the positional order of the tuple.
VRDDetectionEvalMetrics = collections.namedtuple(
    'VRDDetectionEvalMetrics',
    [
        'weighted_average_precision',
        'mean_average_precision',
        'average_precisions',
        'precisions',
        'recalls',
        'recall_50',
        'recall_100',
        'median_rank_50',
        'median_rank_100',
    ])
-
-
class _VRDDetectionEvaluation(object):
  """Performs metric computation for the VRD task. This class is internal.

  Groundtruth is stored per image; detections are converted to scores and
  true/false positive labels as soon as they are added, and accumulated until
  evaluate() is called.
  """

  def __init__(self, matching_iou_threshold=0.5):
    """Constructor.

    Args:
      matching_iou_threshold: IOU threshold to use for matching groundtruth
        boxes to detection boxes.
    """
    self._per_image_eval = per_image_vrd_evaluation.PerImageVRDEvaluation(
        matching_iou_threshold=matching_iou_threshold)

    # Groundtruth storage, keyed by image key.
    self._groundtruth_box_tuples = {}
    self._groundtruth_class_tuples = {}
    self._num_gt_instances = 0
    self._num_gt_imgs = 0
    # Maps a relation label to its number of groundtruth instances.
    self._num_gt_instances_per_relationship = {}

    self.clear_detections()

  def clear_detections(self):
    """Clears detections."""
    self._detection_keys = set()
    # The three lists below hold one numpy array per added image.
    self._scores = []
    self._relation_field_values = []
    self._tp_fp_labels = []
    self._average_precisions = {}
    self._precisions = []
    self._recalls = []

  def add_single_ground_truth_image_info(
      self, image_key, groundtruth_box_tuples, groundtruth_class_tuples):
    """Adds groundtruth for a single image to be used for evaluation.

    If groundtruth for image_key was already added, a warning is logged and
    the call is ignored.

    Args:
      image_key: A unique string/integer identifier for the image.
      groundtruth_box_tuples: A numpy array of structures with the shape
        [M, 1], representing M tuples, each tuple containing the same number
        of named bounding boxes.
        Each box is of the format [y_min, x_min, y_max, x_max].
      groundtruth_class_tuples: A numpy array of structures shape [M, 1],
        representing the class labels of the corresponding bounding boxes and
        possibly additional classes.
    """
    if image_key in self._groundtruth_box_tuples:
      logging.warning(
          'image %s has already been added to the ground truth database.',
          image_key)
      return

    self._groundtruth_box_tuples[image_key] = groundtruth_box_tuples
    self._groundtruth_class_tuples[image_key] = groundtruth_class_tuples

    self._update_groundtruth_statistics(groundtruth_class_tuples)

  def add_single_detected_image_info(self, image_key, detected_box_tuples,
                                     detected_scores, detected_class_tuples):
    """Adds detections for a single image to be used for evaluation.

    Args:
      image_key: A unique string/integer identifier for the image.
      detected_box_tuples: A numpy array of structures with shape [N, 1],
        representing N tuples, each tuple containing the same number of named
        bounding boxes.
        Each box is of the format [y_min, x_min, y_max, x_max].
      detected_scores: A float numpy array of shape [N, 1], representing
        the confidence scores of the detected N object instances.
      detected_class_tuples: A numpy array of structures shape [N, 1],
        representing the class labels of the corresponding bounding boxes and
        possibly additional classes.
    """
    self._detection_keys.add(image_key)
    if image_key in self._groundtruth_box_tuples:
      groundtruth_box_tuples = self._groundtruth_box_tuples[image_key]
      groundtruth_class_tuples = self._groundtruth_class_tuples[image_key]
    else:
      # No groundtruth for this image: evaluate against empty groundtruth.
      groundtruth_box_tuples = np.empty(
          shape=[0, 4], dtype=detected_box_tuples.dtype)
      groundtruth_class_tuples = np.array([], dtype=detected_class_tuples.dtype)

    scores, tp_fp_labels, mapping = (
        self._per_image_eval.compute_detection_tp_fp(
            detected_box_tuples=detected_box_tuples,
            detected_scores=detected_scores,
            detected_class_tuples=detected_class_tuples,
            groundtruth_box_tuples=groundtruth_box_tuples,
            groundtruth_class_tuples=groundtruth_class_tuples))

    self._scores += [scores]
    self._tp_fp_labels += [tp_fp_labels]
    # mapping indexes the input detections, so the relation labels stay
    # aligned with the returned scores and tp/fp labels.
    self._relation_field_values += [detected_class_tuples[mapping]['relation']]

  def _update_groundtruth_statistics(self, groundtruth_class_tuples):
    """Updates ground truth statistics.

    Args:
      groundtruth_class_tuples: A numpy array of structures shape [M, 1],
        representing the class labels of the corresponding bounding boxes and
        possibly additional classes.
    """
    self._num_gt_instances += groundtruth_class_tuples.shape[0]
    self._num_gt_imgs += 1
    relation_labels = groundtruth_class_tuples['relation']
    for relation_field_value in np.unique(relation_labels):
      if relation_field_value not in self._num_gt_instances_per_relationship:
        self._num_gt_instances_per_relationship[relation_field_value] = 0
      self._num_gt_instances_per_relationship[relation_field_value] += np.sum(
          relation_labels == relation_field_value)

  def evaluate(self):
    """Computes evaluation result.

    Returns:
      A named tuple (VRDDetectionEvalMetrics) with the following fields -
        weighted_average_precision: a float number corresponding to average
          precision computed over the detections of all relationships pooled
          together.
        mean_average_precision: mean of the per-relationship average
          precisions; NaN if no groundtruth relationship was added.
        average_precisions: a dictionary mapping a relation label to its
          average precision.
        precisions: an array of precisions.
        recalls: an array of recalls.
        recall_50: recall computed on 50 top-scoring samples.
        recall_100: recall computed on 100 top-scoring samples.
        median_rank_50: median rank computed on 50 top-scoring samples.
        median_rank_100: median rank computed on 100 top-scoring samples.
    """
    if self._num_gt_instances == 0:
      logging.warning('No ground truth instances')

    if not self._scores:
      scores = np.array([], dtype=float)
      tp_fp_labels = np.array([], dtype=bool)
      # Must be defined here as well: it is used below whenever groundtruth
      # relationships exist, even if no detections were added.
      relation_field_values = np.array([], dtype=int)
    else:
      scores = np.concatenate(self._scores)
      tp_fp_labels = np.concatenate(self._tp_fp_labels)
      relation_field_values = np.concatenate(self._relation_field_values)

    # dict.items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    for relation_field_value, num_gt_instances in (
        self._num_gt_instances_per_relationship.items()):
      relation_selector = relation_field_values == relation_field_value
      precisions, recalls = metrics.compute_precision_recall(
          scores[relation_selector], tp_fp_labels[relation_selector],
          num_gt_instances)
      self._average_precisions[
          relation_field_value] = metrics.compute_average_precision(
              precisions, recalls)

    # A dict view is not array-like on Python 3; materialize it first.
    self._mean_average_precision = np.mean(
        list(self._average_precisions.values()))

    self._precisions, self._recalls = metrics.compute_precision_recall(
        scores, tp_fp_labels, self._num_gt_instances)
    self._weighted_average_precision = metrics.compute_average_precision(
        self._precisions, self._recalls)

    # The @k metrics take the per-image list of tp/fp arrays, not the
    # concatenated array.
    self._recall_50 = (
        metrics.compute_recall_at_k(self._tp_fp_labels, self._num_gt_instances,
                                    50))
    self._median_rank_50 = (
        metrics.compute_median_rank_at_k(self._tp_fp_labels, 50))
    self._recall_100 = (
        metrics.compute_recall_at_k(self._tp_fp_labels, self._num_gt_instances,
                                    100))
    self._median_rank_100 = (
        metrics.compute_median_rank_at_k(self._tp_fp_labels, 100))

    return VRDDetectionEvalMetrics(
        self._weighted_average_precision, self._mean_average_precision,
        self._average_precisions, self._precisions, self._recalls,
        self._recall_50, self._recall_100, self._median_rank_50,
        self._median_rank_100)
|