Source code for kwcoco.metrics.clf_report

import warnings
import numpy as np
import ubelt as ub
import os


# TODO : should we use locale or a ASCII_ONLY environ?
# For now lets just expose this environ, even if it is not standard
# ideally we would find a standard environ like NO_COLOR to accomplish this
# https://stackoverflow.com/questions/3425294/how-to-detect-the-os-default-language-in-python
ASCII_ONLY = os.environ.get('ASCII_ONLY', '')



[docs]
def classification_report(y_true, y_pred, target_names=None,
                          sample_weight=None, verbose=False,
                          remove_unsupported=False, log=None,
                          ascii_only=False):
    r"""
    Computes a classification report which is a collection of various metrics
    commonly used to evaluate classification quality. This can handle binary
    and multiclass settings [MulticlassMCC]_.

    Note that this function does not accept probabilities or scores and must
    instead act on final decisions. See ovr_classification_report for a
    probability based report function using a one-vs-rest strategy.

    This emulates the bm(cm) Matlab script [MatlabBM]_ written by David Powers
    that is used for computing bookmaker, markedness, and various other scores
    and is based on the paper [PowersMetrics]_.

    References:
        .. [PowersMetrics] https://csem.flinders.edu.au/research/techreps/SIE07001.pdf

        .. [MatlabBM] https://www.mathworks.com/matlabcentral/fileexchange/5648-bm-cm-?requestedDomain=www.mathworks.com

        .. [MulticlassMCC] Jurman, Riccadonna, Furlanello, (2012). A Comparison of MCC and CEN Error Measures in MultiClass Prediction

    Args:
        y_true (ndarray): true labels for each item

        y_pred (ndarray): predicted labels for each item

        target_names (List | None): mapping from label to category name

        sample_weight (ndarray | None): weight for each item

        verbose (int): print if True

        log (callable | None): print or logging function

        remove_unsupported (bool): removes categories that have no support.
            Defaults to False.

        ascii_only (bool): if True dont use unicode characters.
            if the environ ASCII_ONLY is present this is forced to True and
            cannot be undone. Defaults to False.

    Example:
        >>> # xdoctest: +IGNORE_WANT
        >>> # xdoctest: +REQUIRES(module:sklearn)
        >>> # xdoctest: +REQUIRES(module:pandas)
        >>> y_true = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3]
        >>> y_pred = [1, 2, 1, 3, 1, 2, 2, 3, 2, 2, 3, 3, 2, 3, 3, 3, 1, 3]
        >>> target_names = None
        >>> sample_weight = None
        >>> report = classification_report(y_true, y_pred, verbose=0, ascii_only=1)
        >>> print(report['confusion'])
        pred  1  2  3  Σr
        real
        1     3  1  1   5
        2     0  4  1   5
        3     1  1  6   8
        Σp    4  6  8  18
        >>> print(report['metrics'])
        metric    precision  recall    fpr  markedness  bookmaker    mcc  support
        class
        1            0.7500  0.6000 0.0769      0.6071     0.5231 0.5635        5
        2            0.6667  0.8000 0.1538      0.5833     0.6462 0.6139        5
        3            0.7500  0.7500 0.2000      0.5500     0.5500 0.5500        8
        combined     0.7269  0.7222 0.1530      0.5751     0.5761 0.5758       18

    Example:
        >>> # xdoctest: +IGNORE_WANT
        >>> # xdoctest: +REQUIRES(module:sklearn)
        >>> # xdoctest: +REQUIRES(module:pandas)
        >>> from kwcoco.metrics.clf_report import *  # NOQA
        >>> y_true = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3]
        >>> y_pred = [1, 2, 1, 3, 1, 2, 2, 3, 2, 2, 3, 3, 2, 3, 3, 3, 1, 3]
        >>> target_names = None
        >>> sample_weight = None
        >>> logs = []
        >>> report = classification_report(y_true, y_pred, verbose=1, ascii_only=True, log=logs.append)
        >>> print('\n'.join(logs))

    Ignore:
        >>> size = 100
        >>> rng = np.random.RandomState(0)
        >>> p_classes = np.array([.90, .05, .05][0:2])
        >>> p_classes = p_classes / p_classes.sum()
        >>> p_wrong   = np.array([.03, .01, .02][0:2])
        >>> y_true = testdata_ytrue(p_classes, p_wrong, size, rng)
        >>> rs = []
        >>> for x in range(17):
        >>>     p_wrong += .05
        >>>     y_pred = testdata_ypred(y_true, p_wrong, rng)
        >>>     report = classification_report(y_true, y_pred, verbose='hack')
        >>>     rs.append(report)
        >>> # xdoctest: +REQUIRES(--show)
        >>> import kwplot
        >>> kwplot.autompl()
        >>> import pandas as pd
        >>> df = pd.DataFrame(rs).drop(['raw'], axis=1)
        >>> delta = df.subtract(df['target'], axis=0)
        >>> sqrd_error = np.sqrt((delta ** 2).sum(axis=0))
        >>> print('Error')
        >>> print(sqrd_error.sort_values())
        >>> ys = df.to_dict(orient='list')
        >>> kwplot.multi_plot(ydata_list=ys)
    """
    import pandas as pd
    import scipy as sp  # NOQA
    import sklearn.metrics
    from sklearn.preprocessing import LabelEncoder

    if ASCII_ONLY:
        ascii_only = True

    if verbose or log:
        if log is None:
            log = print

    if target_names is None:
        unique_labels = np.unique(np.hstack([y_true, y_pred]))
        if len(unique_labels) == 1 and (unique_labels[0] == 0 or unique_labels[0] == 1):
            target_names = np.array([False, True])
            y_true_ = y_true
            y_pred_ = y_pred
        else:
            lb = LabelEncoder()
            lb.fit(unique_labels)
            y_true_ = lb.transform(y_true)
            y_pred_ = lb.transform(y_pred)
            target_names = lb.classes_
    else:
        y_true_ = y_true
        y_pred_ = y_pred

    # Real data is on the rows,
    # Pred data is on the cols.

    cm = sklearn.metrics.confusion_matrix(
        y_true_, y_pred_, sample_weight=sample_weight,
        labels=np.arange(len(target_names)))
    confusion = cm  # NOQA

    k = len(cm)  # number of classes
    N = cm.sum()  # number of examples

    real_total = cm.sum(axis=1)
    pred_total = cm.sum(axis=0)

    # the number of "positive" cases **per class**
    n_pos = real_total  # NOQA
    # the number of times a class was predicted.
    n_neg = N - n_pos  # NOQA

    # number of true positives per class
    n_tps = np.diag(cm)
    # number of true negatives per class
    n_fps = (cm - np.diagflat(np.diag(cm))).sum(axis=0)

    import warnings
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', message='invalid .* true_divide')
        warnings.filterwarnings('ignore', message='divide by zero')
        warnings.filterwarnings('ignore', message='invalid value encountered in double_scalars')

        tprs = n_tps / real_total  # true pos rate (recall)
        tpas = n_tps / pred_total  # true pos accuracy (precision)

        unused = (real_total + pred_total) == 0

        fprs = n_fps / n_neg  # false pose rate
        fprs[unused] = np.nan

        rprob = real_total / N
        pprob = pred_total / N

        # if len(cm) == 2:
        #     [[A, B],
        #      [C, D]] = cm
        #     (A * D - B * C) / np.sqrt((A + C) * (B + D) * (A + B) * (C + D))

        # bookmaker is analogous to recall, but unbiased by class frequency
        rprob_mat = np.tile(rprob, [k, 1]).T - (1 - np.eye(k))
        bmcm = cm.T / rprob_mat
        bms = np.sum(bmcm.T, axis=0) / N

        # markedness is analogous to precision, but unbiased by class frequency
        pprob_mat = np.tile(pprob, [k, 1]).T - (1 - np.eye(k))
        mkcm = cm / pprob_mat
        mks = np.sum(mkcm.T, axis=0) / N

        mccs = np.sign(bms) * np.sqrt(np.abs(bms * mks))

        import scipy
        # https://en.wikipedia.org/wiki/F1_score
        # f1_scores = scipy.stats.hmean(np.hstack([
        #     tpas[:, None],
        #     tprs[:, None]
        # ]), axis=1)
        f1_scores = 2 * (tpas * tprs) / (tpas + tprs)
        g1_scores = scipy.stats.gmean(np.hstack([
            tpas[:, None],
            tprs[:, None]
        ]), axis=1)

    perclass_data = ub.odict([
        ('precision', tpas),
        ('recall', tprs),
        ('fpr', fprs),
        ('markedness', mks),
        ('bookmaker', bms),
        ('mcc', mccs),
        ('f1', f1_scores),
        ('g1', g1_scores),
        ('support', real_total),
    ])

    tpa = np.nansum(tpas * rprob)
    tpr = np.nansum(tprs * rprob)

    fpr = np.nansum(fprs * rprob)

    mk = np.nansum(mks * rprob)
    bm = np.nansum(bms * pprob)

    # The simple mean seems to do the best
    mccs_ = mccs[~np.isnan(mccs)]
    if len(mccs_) == 0:
        mcc_combo = np.nan
    else:
        mcc_combo = np.nanmean(mccs_)

    combined_data = ub.odict([
        ('precision', tpa),
        ('recall', tpr),
        ('fpr', fpr),
        ('markedness', mk),
        ('bookmaker', bm),
        # ('mcc', np.sign(bm) * np.sqrt(np.abs(bm * mk))),
        ('mcc', mcc_combo),
        # np.sign(bm) * np.sqrt(np.abs(bm * mk))),
        ('f1', np.nanmean(f1_scores)),
        ('g1', np.nanmean(g1_scores)),
        ('support', real_total.sum()),
    ])

    # # Not sure how to compute this. Should it agree with the sklearn impl?
    # if verbose == 'hack':
    #     verbose = False
    #     mcc_known = sklearn.metrics.matthews_corrcoef(
    #         y_true, y_pred, sample_weight=sample_weight)
    #     mcc_raw = np.sign(bm) * np.sqrt(np.abs(bm * mk))

    #     def gmean(x, w=None):
    #         if w is None:
    #             return sp.stats.gmean(x)
    #         return np.exp(np.nansum(w * np.log(x)) / np.nansum(w))

    #     def hmean(x, w=None):
    #         if w is None:
    #             return sp.stats.hmean(x)
    #         return 1 / (np.nansum(w * (1 / x)) / np.nansum(w))

    #     def amean(x, w=None):
    #         if w is None:
    #             return np.mean(x)
    #         return np.nansum(w * x) / np.nansum(w)

    #     report = {
    #         'target': mcc_known,
    #         'raw': mcc_raw,
    #     }

    #     means = {
    #         'a': amean,
    #         # 'h': hmean,
    #         'g': gmean,
    #     }
    #     weights = {
    #         'p': pprob,
    #         'r': rprob,
    #         '': None,
    #     }
    #     for mean_key, mean in means.items():
    #         for w_key, w in weights.items():
    #             # Hack of very wrong items
    #             if mean_key == 'g':
    #                 if w_key in ['r', 'p', '']:
    #                     continue
    #             if mean_key == 'g':
    #                 if w_key in ['r']:
    #                     continue
    #             m = mean(mccs, w)
    #             r_key = '{} {}'.format(mean_key, w_key)
    #             report[r_key] = m
    #             # log(r_key)
    #             # log(np.abs(m - mcc_known))

    #     return report

    index = pd.Index(target_names, name='class')

    perclass_df = pd.DataFrame(perclass_data, index=index)

    if remove_unsupported:
        perclass_df = perclass_df[perclass_df['support'] > 0]

    # combined_df = pd.DataFrame(combined_data, index=['ave/sum'])
    combined_df = pd.DataFrame(combined_data, index=['combined'])

    metric_df = pd.concat([perclass_df, combined_df])
    metric_df.index.name = 'class'
    metric_df.columns.name = 'metric'

    pred_id = ['%s' % m for m in target_names]
    real_id = ['%s' % m for m in target_names]
    confusion_df = pd.DataFrame(confusion, columns=pred_id, index=real_id)

    if ascii_only :
        sum_glyph = 'sum-'
    else:
        sum_glyph = 'Σ'

    sum_real_key = sum_glyph + 'r'
    sum_pred_key = sum_glyph + 'p'

    to_append = pd.DataFrame([confusion.sum(axis=0)], columns=pred_id, index=[sum_pred_key])
    confusion_df = pd.concat([confusion_df, to_append], axis=0)

    confusion_df[sum_real_key] = np.hstack([confusion.sum(axis=1), [0]])
    confusion_df.index.name = 'real'
    confusion_df.columns.name = 'pred'

    _residual = (confusion_df - np.floor(confusion_df)).values
    _thresh = 1e-6
    if np.all(_residual < _thresh):
        confusion_df = confusion_df.astype(int)
    confusion_df.iloc[(-1, -1)] = N
    _residual = (confusion_df - np.floor(confusion_df)).values
    if np.all(_residual < _thresh):
        confusion_df = confusion_df.astype(int)

    if verbose:
        if ascii_only:
            times_glyph = 'x'
        else:
            times_glyph = '×'

        cfsm_str = confusion_df.to_string(float_format=lambda x: '%.1f' % (x,))
        log('Confusion Matrix (real ' + times_glyph + ' pred) :\n' + ub.indent(cfsm_str))

        # ut.cprint('\nExtended Report', 'turquoise')
        float_precision = 2
        float_format = '%.' + str(float_precision) + 'f'
        ext_report = metric_df.to_string(float_format=float_format)
        log('\nEvaluation Metric Report:' + '\n' + ub.indent(ext_report))

    report = {
        'metrics': metric_df,
        'confusion': confusion_df,
    }

    # TODO: What is the difference between sklearn multiclass-MCC
    # and BM * MK MCC?

    try:
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', message='invalid value encountered in double_scalars')
            warnings.filterwarnings('ignore', message='Mean of empty slice')

            mcc = sklearn.metrics.matthews_corrcoef(
                y_true, y_pred, sample_weight=sample_weight)
            # mcc = matthews_corrcoef(y_true, y_pred, sample_weight=sample_weight)
            # These scales are chosen somewhat arbitrarily in the context of a
            # computer vision application with relatively reasonable quality data
            # https://stats.stackexchange.com/questions/118219/how-to-interpret
            mcc_significance_scales = ub.odict([
                (1.0, 'perfect'),
                (0.9, 'very strong'),
                (0.7, 'strong'),
                (0.5, 'significant'),
                (0.3, 'moderate'),
                (0.2, 'weak'),
                (0.0, 'negligible'),
            ])
            for k, v in mcc_significance_scales.items():
                if np.abs(mcc) >= k:
                    if verbose:
                        log('classifier correlation is %s' % (v,))
                    break
            if verbose:
                float_precision = 2
                log(('MCC\' = %.' + str(float_precision) + 'f') % (mcc,))
            report['mcc'] = mcc
    except ValueError:
        report['mcc'] = None
    return report




[docs]
def ovr_classification_report(mc_y_true, mc_probs, target_names=None,
                              sample_weight=None, metrics=None, verbose=0,
                              remove_unsupported=False, log=None):
    """
    One-vs-rest classification report

    Args:
        mc_y_true (ndarray):
            multiclass truth labels (integer label format). Shape [N].

        mc_probs (ndarray):
            multiclass probabilities for each class.  Shape [N x C].

        target_names (Dict[int, str] | None): mapping from int label to string name

        sample_weight (ndarray | None): weight for each item. Shape [N].

        metrics (List[str] | None): names of metrics to compute


    Example:
        >>> # xdoctest: +IGNORE_WANT
        >>> # xdoctest: +REQUIRES(module:sklearn)
        >>> # xdoctest: +REQUIRES(module:pandas)
        >>> from kwcoco.metrics.clf_report import *  # NOQA
        >>> y_true = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0]
        >>> y_probs = np.random.rand(len(y_true), max(y_true) + 1)
        >>> target_names = None
        >>> sample_weight = None
        >>> verbose = True
        >>> report = ovr_classification_report(y_true, y_probs)
        >>> print(report['ave'])
        auc     0.6541
        ap      0.6824
        kappa   0.0963
        mcc     0.1002
        brier   0.2214
        dtype: float64
        >>> print(report['ovr'])
             auc     ap  kappa    mcc  brier  support  weight
        0 0.6062 0.6161 0.0526 0.0598 0.2608        8  0.4444
        1 0.5846 0.6014 0.0000 0.0000 0.2195        5  0.2778
        2 0.8000 0.8693 0.2623 0.2652 0.1602        5  0.2778

    Ignore:
        >>> y_true = [1, 1, 1]
        >>> y_probs = np.random.rand(len(y_true), 3)
        >>> target_names = None
        >>> sample_weight = None
        >>> verbose = True
        >>> report = ovr_classification_report(y_true, y_probs)
        >>> print(report['ovr'])

    """
    import pandas as pd
    import sklearn.metrics

    if metrics is None:
        metrics = ['auc', 'ap', 'mcc', 'f1', 'brier', 'kappa']

    n_classes = mc_probs.shape[1]
    ohvec_true = np.eye(n_classes, dtype=np.uint8)[mc_y_true]

    # Preallocate common datas
    bin_probs = np.empty((len(mc_probs), 2), dtype=mc_probs.dtype)

    import kwarray
    # Map everything onto 0-1 range
    ranked_cidxs = kwarray.argmaxima(mc_probs, 2, axis=1)
    ranked_scores = np.array([a[x] for a, x in zip(mc_probs, ranked_cidxs)])  # probably better numpy way to do this

    mc_scores = kwarray.normalize(mc_probs, mode='linear')
    total_scores = mc_scores.sum(axis=1, keepdims=0)
    # max_scores = mc_scores.max(axis=1, keepdims=0)

    class_metrics = ub.odict()
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', message='Mean of empty slice')
        warnings.filterwarnings('ignore', message='invalid value encountered in true_divide')
        warnings.filterwarnings('ignore', message='invalid value encountered in double_scalars')
        warnings.filterwarnings('ignore', message='divide by zero')
        warnings.filterwarnings('ignore', message='due to no true nor predicted samples')
        warnings.filterwarnings('ignore', message='ill-defined')

        for cidx in range(n_classes):
            k_metrics = ub.odict()

            class_score = mc_scores.T[cidx]
            is_other = (ranked_cidxs != cidx)
            other_score = np.array([a[f][0] for a, f in zip(ranked_scores, is_other)])

            # HEURISTIC:
            # We need to compute a score or "probability" of other
            # is there a better way to do this?
            # other_prob = total_scores - class_prob
            # class_prob = class_score
            # other_prob = max_scores
            # other_prob = (max_scores - class_score)
            class_prob = class_score / total_scores
            other_prob = other_score / total_scores
            # other_prob = (max_scores - class_score) / total_scores

            # Consider each class a one-vs-rest problem
            # Populate the first column

            bin_probs[:, 1] = class_prob
            bin_probs[:, 0] = other_prob

            # Index of the true class
            k_true = ohvec_true.T[cidx]
            # Index of the predicted class
            k_pred = np.argmax(bin_probs, axis=1)  # NOTE: ASSUME MUTEX CLASSES

            # Probabilities for the true class for each label
            bin_truth = np.eye(2)[k_true]
            true_probs = (bin_probs * bin_truth).sum(axis=1)

            if 'auc' in metrics:
                try:
                    k_metrics['auc'] = sklearn.metrics.roc_auc_score(
                        bin_truth, bin_probs, sample_weight=sample_weight)
                except ValueError:
                    k_metrics['auc'] = np.nan

            if 'ap' in metrics:
                k_metrics['ap'] = sklearn.metrics.average_precision_score(
                    bin_truth, bin_probs, sample_weight=sample_weight,
                    # zero_division=1,
                    # np.nan
                )

            if 'kappa' in metrics:
                k_metrics['kappa'] = sklearn.metrics.cohen_kappa_score(
                    k_true, k_pred, labels=[0, 1], sample_weight=sample_weight)

            if 'mcc' in metrics:
                k_metrics['mcc'] = sklearn.metrics.matthews_corrcoef(
                    k_true, k_pred, sample_weight=sample_weight)

            if 'f1' in metrics:
                k_metrics['f1'] = sklearn.metrics.fbeta_score(
                    k_true, k_pred, beta=1.0, sample_weight=sample_weight,
                    zero_division=0,
                    # zero_division=1,
                    # zero_division=np.nan,
                )

            if 'brier' in metrics:
                # Get the probability of the real class for each example
                rprobs = np.clip(true_probs / total_scores, 0, 1)
                rwants = np.ones(len(rprobs))
                # Use custom brier implementation until sklearn is fixed.
                mse = (rwants - rprobs) ** 2
                if sample_weight is None:
                    k_metrics['brier'] = mse.mean()
                else:
                    k_metrics['brier'] = (mse * sample_weight).sum() / sample_weight.sum()
                # NOTE: There is a bug here (but bug is in sklearn 0.19.1)
                # brier = sklearn.metrics.brier_score_loss(rwants, rprobs)

            if sample_weight is None:
                k_metrics['support'] = k_true.sum()
            else:
                k_metrics['support'] = (sample_weight * k_true).sum()

            key = cidx if target_names is None else target_names[cidx]
            class_metrics[key] = k_metrics

    ovr_metrics = pd.DataFrame.from_dict(class_metrics, orient='index')

    if remove_unsupported:
        ovr_metrics = ovr_metrics[ovr_metrics['support'] > 0]

    weight = ovr_metrics.loc[:, 'support'] / ovr_metrics.loc[:, 'support'].sum()
    ovr_metrics['weight'] = weight

    # weighted = ovr_metrics.drop(columns=['support', 'weight'])
    weighted = ovr_metrics.copy()
    weighted.iloc[:] = weighted.values * weight.values[:, None]
    weighted_ave = weighted.sum(axis=0)
    weighted_ave['support'] = ovr_metrics['support'].sum()
    weighted_ave['weight'] = ovr_metrics['weight'].sum()

    report = {
        'ovr': ovr_metrics,
        'ave': weighted_ave,
    }

    if verbose or log:
        if log is None:
            log = print
        ovr_metrics = report['ovr']
        weighted_ave = report['ave']
        log('ovr_metrics')
        log(pd.concat([ovr_metrics, weighted_ave.to_frame('__accum__').T]))
    return report