Source code for kwcoco.metrics.voc_metrics

import warnings
import numpy as np
import ubelt as ub


class VOC_Metrics(ub.NiceRepr):
    """
    API to compute object detection scores using the Pascal VOC evaluation
    method. To use, add true and predicted detections for each image and then
    run the :func:`VOC_Metrics.score` function.

    Attributes:
        recs (Dict[int, List[dict]]): true boxes for each image.
            Maps image ids to a list of records within that image.
            Each record is a tlbr bbox, a difficult flag, and a class name.

        cx_to_lines (Dict[int, List]): VOC formatted predictions.
            Mapping from class index to all predictions for that category.
            Each "line" is a list of
            [<imgid>, <score>, <tl_x>, <tl_y>, <br_x>, <br_y>].
    """
    def __init__(self, classes=None):
        self.recs = {}
        self.cx_to_lines = ub.ddict(list)
        self.classes = classes
    def __nice__(self):
        info = {
            'n_true_imgs': len(self.recs),
            'n_true_anns': sum(map(len, self.recs.values())),
            'n_pred_anns': sum(map(len, self.cx_to_lines.values())),
            'n_pred_cats': len(self.cx_to_lines),
        }
        return ub.repr2(info)
    def add_truth(self, true_dets, gid):
        self.recs[gid] = []
        true_weights = true_dets.data.get('weights', None)
        if true_weights is None:
            true_weights = [1.0] * len(true_dets)
        for bbox, cx, weight in zip(true_dets.boxes.to_tlbr().data,
                                    true_dets.class_idxs,
                                    true_weights):
            self.recs[gid].append({
                'bbox': bbox,
                'difficult': weight < .5,
                'name': cx,
            })
    def add_predictions(self, pred_dets, gid):
        pred_scores = pred_dets.data.get('scores', None)
        if pred_scores is None:
            pred_scores = [1.0] * len(pred_dets)
        for bbox, cx, score in zip(pred_dets.boxes.to_tlbr().data,
                                   pred_dets.class_idxs,
                                   pred_scores):
            voc_line = [gid, score] + list(bbox)
            self.cx_to_lines[cx].append(voc_line)
    def score(self, iou_thresh=0.5, bias=1, method='voc2012'):
        """
        Compute VOC scores for every category

        Example:
            >>> from kwcoco.metrics.detect_metrics import DetectionMetrics
            >>> from kwcoco.metrics.voc_metrics import *  # NOQA
            >>> dmet = DetectionMetrics.demo(
            >>>     nimgs=1, nboxes=(0, 100), n_fp=(0, 30), n_fn=(0, 30),
            >>>     classes=2, score_noise=0.9)
            >>> self = VOC_Metrics(classes=dmet.classes)
            >>> self.add_truth(dmet.true_detections(0), 0)
            >>> self.add_predictions(dmet.pred_detections(0), 0)
            >>> voc_scores = self.score()
            >>> # xdoctest: +REQUIRES(--show)
            >>> import kwplot
            >>> kwplot.autompl()
            >>> kwplot.figure(fnum=1, doclf=True)
            >>> voc_scores['perclass'].draw()

            kwplot.figure(fnum=2)
            dmet.true_detections(0).draw(color='green', labels=None)
            dmet.pred_detections(0).draw(color='blue', labels=None)
            kwplot.autoplt().gca().set_xlim(0, 100)
            kwplot.autoplt().gca().set_ylim(0, 100)
        """
        from kwcoco.metrics.confusion_vectors import Measures
        from kwcoco.metrics.confusion_vectors import PerClass_Measures
        perclass = {}
        for cx in self.cx_to_lines.keys():
            lines = self.cx_to_lines[cx]
            classname = cx
            roc_info = _voc_eval(lines, self.recs, classname,
                                 iou_thresh=iou_thresh, bias=bias,
                                 method=method)
            roc_info['cx'] = cx
            if self.classes is not None:
                catname = self.classes[cx]
                roc_info.update({
                    'node': catname,
                })
                perclass[catname] = Measures(roc_info)
            else:
                perclass[cx] = Measures(roc_info)

        perclass = PerClass_Measures(perclass)

        perclass_aps = [d['ap'] for d in perclass.values()]
        if len(perclass_aps) == 0:
            mAP = np.nan
        else:
            mAP = np.nanmean(perclass_aps)

        voc_scores = {
            'mAP': mAP,
            'perclass': perclass,
        }
        return voc_scores
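
# A minimal usage sketch: ``_demo_voc_metrics_usage`` is a hypothetical helper
# used only for illustration. It assumes ``kwimage.Detections`` with 'tlbr'
# boxes; the image id, boxes, scores, and category names are arbitrary
# demonstration values, and a plain list of names stands in for a category
# tree.
def _demo_voc_metrics_usage():
    import kwimage
    true_dets = kwimage.Detections(
        boxes=kwimage.Boxes(np.array([[10, 10, 40, 40],
                                      [50, 50, 80, 80]]), 'tlbr'),
        class_idxs=np.array([0, 1]))
    pred_dets = kwimage.Detections(
        boxes=kwimage.Boxes(np.array([[12, 11, 39, 42],
                                      [55, 52, 79, 83]]), 'tlbr'),
        class_idxs=np.array([0, 1]),
        scores=np.array([0.9, 0.6]))
    self = VOC_Metrics(classes=['cat_a', 'cat_b'])
    # Register truth and predictions per image id, then score.
    self.add_truth(true_dets, gid=0)
    self.add_predictions(pred_dets, gid=0)
    voc_scores = self.score(iou_thresh=0.5)
    print(voc_scores['mAP'])
    return voc_scores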
def _pr_curves(y, method='voc2012'):
    """
    Compute a PR curve from a method

    Args:
        y (pd.DataFrame | DataFrameArray): output of detection_confusions

    Returns:
        Tuple[float, ndarray, ndarray]

    Example:
        >>> import pandas as pd
        >>> y1 = pd.DataFrame.from_records([
        >>>     {'pred': 0, 'score': 10.00, 'true': -1, 'weight': 1.00},
        >>>     {'pred': 0, 'score':  1.65, 'true':  0, 'weight': 1.00},
        >>>     {'pred': 0, 'score':  8.64, 'true': -1, 'weight': 1.00},
        >>>     {'pred': 0, 'score':  3.97, 'true':  0, 'weight': 1.00},
        >>>     {'pred': 0, 'score':  1.68, 'true':  0, 'weight': 1.00},
        >>>     {'pred': 0, 'score':  5.06, 'true':  0, 'weight': 1.00},
        >>>     {'pred': 0, 'score':  0.25, 'true':  0, 'weight': 1.00},
        >>>     {'pred': 0, 'score':  1.75, 'true':  0, 'weight': 1.00},
        >>>     {'pred': 0, 'score':  8.52, 'true':  0, 'weight': 1.00},
        >>>     {'pred': 0, 'score':  5.20, 'true':  0, 'weight': 1.00},
        >>> ])
        >>> import kwarray
        >>> y2 = kwarray.DataFrameArray(y1)
        >>> _pr_curves(y2)
        >>> _pr_curves(y1)
    """
    import pandas as pd
    IS_PANDAS = isinstance(y, pd.DataFrame)

    if method not in ['sklearn', 'voc2007', 'voc2012']:
        raise KeyError(method)

    # compute metrics on a per class basis
    if y is None:
        return np.nan, [], []

    # References [Manning2008] and [Everingham2010] present alternative
    # variants of AP that interpolate the precision-recall curve. Currently,
    # average_precision_score does not implement any interpolated variant
    # http://scikit-learn.org/stable/modules/model_evaluation.html
    if method in {'sklearn', 'scikit-learn'}:
        import sklearn.metrics
        # In the future, we should simply use the sklearn version
        # which gives nice easy to reproduce results.
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', message='invalid .* true_divide')
            is_correct = (y['true'] == y['pred']).astype(int)
            ap = sklearn.metrics.average_precision_score(
                y_true=is_correct, y_score=y['score'],
                sample_weight=y['weight'])
            prec, rec, thresholds = sklearn.metrics.precision_recall_curve(
                is_correct, y['score'], sample_weight=y['weight'])
        return ap, prec, rec
    elif method == 'voc2007' or method == 'voc2012':
        try:
            y = y.sort_values('score', ascending=False)
            # if True:
            #     # ignore "difficult" matches
            #     y = y[y.weight > 0]
            # npos = sum(y.true >= 0)
        except KeyError:
            npos = 0
            dets = []
        else:
            if IS_PANDAS:
                npos = y[y['true'] >= 0].weight.sum()
                dets = y[y['pred'] > -1]
            else:
                npos = y.compress(y['true'] >= 0)['weight'].sum()
                dets = y.compress(y['pred'] > -1)

        if npos > 0 and len(dets) > 0:
            tp = (dets['pred'] == dets['true'])
            fp = 1 - tp
            fp_cum = np.cumsum(fp)
            tp_cum = np.cumsum(tp)

            eps = np.finfo(np.float64).eps
            rec = 1 if npos == 0 else tp_cum / npos
            prec = tp_cum / np.maximum(tp_cum + fp_cum, eps)

            ap = _voc_ave_precision(rec, prec, method=method)
        else:
            prec, rec = None, None
            if npos == 0:
                ap = np.nan
            if len(dets) == 0:
                if npos == 0:
                    ap = np.nan
                ap = 0.0
    else:
        raise KeyError(method)

    return ap, prec, rec
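
# A minimal sketch: ``_demo_pr_curves_methods`` is a hypothetical helper used
# only for illustration. It compares the 'voc2012' and 'sklearn' styles of
# ``_pr_curves`` on a tiny hand-made confusion table with arbitrary scores;
# a true value of -1 marks an unmatched (false positive) prediction.
def _demo_pr_curves_methods():
    import pandas as pd
    y = pd.DataFrame.from_records([
        {'pred': 0, 'score': 0.90, 'true':  0, 'weight': 1.0},
        {'pred': 0, 'score': 0.80, 'true': -1, 'weight': 1.0},
        {'pred': 0, 'score': 0.70, 'true':  0, 'weight': 1.0},
        {'pred': 0, 'score': 0.30, 'true': -1, 'weight': 1.0},
    ])
    ap_voc, prec_voc, rec_voc = _pr_curves(y, method='voc2012')
    ap_skl, prec_skl, rec_skl = _pr_curves(y, method='sklearn')
    print(ap_voc, ap_skl)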
def _voc_eval(lines, recs, classname, iou_thresh=0.5, method='voc2012',
              bias=1.0):
    """
    VOC AP evaluation for a single category.

    Args:
        lines (List[list]): VOC formatted predictions. Each "line" is a list
            of [<imgid>, <score>, <tl_x>, <tl_y>, <br_x>, <br_y>].

        recs (Dict[int, List[dict]]): true boxes for each image.
            Maps image ids to a list of records within that image.
            Each record is a tlbr bbox, a difficult flag, and a class name.

        classname (str): the category to evaluate.

        method (str): code for how the AP is computed.

        bias (float): either 1.0 or 0.0.

    Returns:
        Dict: info about the evaluation containing AP.
            Contains fp, tp, prec, and rec.

    Note:
        Raw replication of the MATLAB implementation of creating assignments
        and the resulting PR-curves and AP. Based on MATLAB code [1].

    References:
        [1] http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCdevkit_18-May-2011.tar
    """
    import copy
    imagenames = [x[0] for x in lines]
    recs2 = copy.deepcopy(recs)

    # BUGFIX: need to score images with no predictions / no truth
    imagenames += list(recs.keys())

    # BUGFIX: the original code did not cast this to a set
    imagenames = sorted(set(imagenames))

    # extract gt objects for this class
    class_recs = {}
    npos = 0
    for imagename in imagenames:
        R = [obj for obj in recs2[imagename] if obj['name'] == classname]
        bbox = np.array([x['bbox'] for x in R])
        difficult = np.array([x['difficult'] for x in R]).astype(bool)
        det = [False] * len(R)
        npos = npos + sum(~difficult)
        class_recs[imagename] = {'bbox': bbox,
                                 'difficult': difficult,
                                 'det': det}

    # Unlike the original implementation our input is presplit
    splitlines = lines
    image_ids = [x[0] for x in splitlines]
    confidence = np.array([x[1] for x in splitlines])
    BB = np.array([[z for z in x[2:]] for x in splitlines])

    # sort by confidence
    sorted_ind = np.argsort(-confidence)
    # sorted_scores = np.sort(-confidence)
    BB = BB[sorted_ind, :]
    image_ids = [image_ids[x] for x in sorted_ind]

    # go down dets and mark TPs and FPs
    nd = len(image_ids)
    is_tp = np.zeros(nd)
    is_fp = np.zeros(nd)

    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', message='invalid .* true_divide')

        # For each prediction
        for d in range(nd):
            # Check if it overlaps any true box.
            R = class_recs[image_ids[d]]
            bb = BB[d, :].astype(float)
            ovmax = -np.inf
            BBGT = R['bbox'].astype(float)

            if BBGT.size > 0:
                # compute overlaps
                # intersection
                ixmin = np.maximum(BBGT[:, 0], bb[0])
                iymin = np.maximum(BBGT[:, 1], bb[1])
                ixmax = np.minimum(BBGT[:, 2], bb[2])
                iymax = np.minimum(BBGT[:, 3], bb[3])
                iw = np.maximum(ixmax - ixmin + bias, 0.)
                ih = np.maximum(iymax - iymin + bias, 0.)
                inters = iw * ih

                # union
                uni = ((bb[2] - bb[0] + bias) * (bb[3] - bb[1] + bias) +
                       (BBGT[:, 2] - BBGT[:, 0] + bias) *
                       (BBGT[:, 3] - BBGT[:, 1] + bias) - inters)

                overlaps = inters / uni
                ovmax = np.max(overlaps)
                jmax = np.argmax(overlaps)

            if ovmax > iou_thresh:
                if not R['difficult'][jmax]:
                    if not R['det'][jmax]:
                        # Mark that this true box has been used.
                        is_tp[d] = 1.
                        R['det'][jmax] = 1
                    else:
                        is_fp[d] = 1.
            else:
                is_fp[d] = 1.
    thresholds = confidence[sorted_ind]

    # compute precision recall
    fp = np.cumsum(is_fp)
    tp = np.cumsum(is_tp)
    fn = npos - tp

    rec = tp / float(npos)
    # avoid divide by zero in case the first detection matches a difficult
    # ground truth
    prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)

    ap = _voc_ave_precision(rec=rec, prec=prec, method=method)

    # number of supports is the number of real positives + unassigned preds
    realneg_total = fp[-1]  # number of unassigned predictions
    realpos_total = npos    # number of non-difficult true boxes
    nsupport = realneg_total + realpos_total

    info = {
        'fp_count': fp,
        'tp_count': tp,
        'fn_count': fn,
        'tpr': rec,   # (true positive rate) == (recall)
        'ppv': prec,  # (positive predictive value) == (precision)
        'thresholds': thresholds,
        'npos': npos,
        'nsupport': nsupport,
        'realpos_total': realpos_total,
        'realneg_total': realneg_total,
        'ap': ap,
    }
    return info
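
# A minimal sketch: ``_demo_voc_eval_single_class`` is a hypothetical helper
# used only for illustration. It calls ``_voc_eval`` directly with hand-built
# truth records and VOC-style prediction lines; the image id, boxes, scores,
# and the 'dog'/'cat' category names are arbitrary demonstration values.
def _demo_voc_eval_single_class():
    recs = {
        0: [
            {'bbox': np.array([10, 10, 40, 40]), 'difficult': False, 'name': 'dog'},
            {'bbox': np.array([50, 50, 80, 80]), 'difficult': False, 'name': 'cat'},
        ],
    }
    lines = [
        [0, 0.9, 12, 11, 39, 42],  # overlaps the 'dog' truth box
        [0, 0.4, 0, 0, 5, 5],      # spurious prediction, counted as a FP
    ]
    info = _voc_eval(lines, recs, classname='dog', iou_thresh=0.5)
    print(info['ap'], info['tp_count'], info['fp_count'])
    return info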
def _voc_ave_precision(rec, prec, method='voc2012'):
    """
    Compute AP from precision and recall.
    Based on MATLAB code in [1]_, [2]_, and [3]_.

    Args:
        rec (ndarray): recall
        prec (ndarray): precision
        method (str): either voc2012 or voc2007

    Returns:
        float: ap: average precision

    References:
        .. [1] http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCdevkit_18-May-2011.tar
        .. [2] https://github.com/rbgirshick/voc-dpm/blob/master/test/pascal_eval.m
        .. [3] https://github.com/rbgirshick/voc-dpm/blob/c0b88564bd668bcc6216bbffe96cb061613be768/utils/bootstrap/VOCevaldet_bootstrap.m
    """
    if method == 'voc2007':
        # 11 point metric
        ap = 0.
        for t in np.arange(0., 1.1, 0.1):
            if np.sum(rec >= t) == 0:
                p = 0
            else:
                p = np.max(prec[rec >= t])
            ap = ap + p / 11.
    elif method == 'voc2012':
        # correct AP calculation
        # first append sentinel values at the end
        mrec = np.concatenate(([0.], rec, [1.]))
        mpre = np.concatenate(([0.], prec, [0.]))

        # compute the precision envelope
        for i in range(mpre.size - 1, 0, -1):
            mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])

        # to calculate area under PR curve, look for points
        # where X axis (recall) changes value
        i = np.where(mrec[1:] != mrec[:-1])[0]

        # and sum (\Delta recall) * prec
        ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
    elif method == 'sklearn':
        # sklearn metric
        # Note: the voc rec, prec don't extend all the way to 1, so this AUC
        # might not be accurate.
        from sklearn.metrics import auc
        ap = auc(rec, prec)
        # ap = -np.sum(np.diff(rec[::-1]) * np.array(prec[::-1])[:-1])
    else:
        raise KeyError(method)
    return ap
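
# A minimal worked example: ``_demo_voc_ave_precision`` is a hypothetical
# helper used only for illustration. With rec = [0.5, 1.0] and
# prec = [1.0, 0.5], the 'voc2012' area under the precision envelope is
# 0.5 * 1.0 + 0.5 * 0.5 = 0.75; the 'voc2007' variant instead averages the
# interpolated precision at the 11 recall thresholds 0.0, 0.1, ..., 1.0.
def _demo_voc_ave_precision():
    rec = np.array([0.5, 1.0])
    prec = np.array([1.0, 0.5])
    ap12 = _voc_ave_precision(rec, prec, method='voc2012')
    ap07 = _voc_ave_precision(rec, prec, method='voc2007')
    print(ap12, ap07)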