import warnings
import numpy as np
import ubelt as ub
import os
# TODO: decide whether this should be driven by the locale or by a dedicated
# ASCII_ONLY environment variable. For now the (non-standard) ASCII_ONLY
# variable is exposed; ideally a widely recognized variable in the spirit of
# NO_COLOR would be used instead.
# https://stackoverflow.com/questions/3425294/how-to-detect-the-os-default-language-in-python
#
# The raw string value is kept, so ANY non-empty value (even "0") is truthy
# and forces ASCII-only output in the report functions below.
ASCII_ONLY = os.environ.get('ASCII_ONLY', '')
def classification_report(y_true, y_pred, target_names=None,
                          sample_weight=None, verbose=False,
                          remove_unsupported=False, log=None,
                          ascii_only=False):
    r"""
    Computes a classification report which is a collection of various metrics
    commonly used to evaluate classification quality. This can handle binary
    and multiclass settings [MulticlassMCC]_.

    Note that this function does not accept probabilities or scores and must
    instead act on final decisions. See ovr_classification_report for a
    probability based report function using a one-vs-rest strategy.

    This emulates the bm(cm) Matlab script [MatlabBM]_ written by David Powers
    that is used for computing bookmaker, markedness, and various other scores
    and is based on the paper [PowersMetrics]_.

    References:
        .. [PowersMetrics] https://csem.flinders.edu.au/research/techreps/SIE07001.pdf
        .. [MatlabBM] https://www.mathworks.com/matlabcentral/fileexchange/5648-bm-cm-?requestedDomain=www.mathworks.com
        .. [MulticlassMCC] Jurman, Riccadonna, Furlanello, (2012). A Comparison of MCC and CEN Error Measures in MultiClass Prediction

    Args:
        y_true (ndarray): true labels for each item

        y_pred (ndarray): predicted labels for each item

        target_names (List | None): mapping from label to category name.
            When given, ``y_true`` and ``y_pred`` are used as-is and are
            expected to be integer indices into ``target_names``. When None
            the names are inferred from the unique labels.

        sample_weight (ndarray | None): weight for each item

        verbose (int): if truthy, the report is written using ``log``

        remove_unsupported (bool): removes categories that have no support
            from the per-class rows of the metrics table. Defaults to False.

        log (callable | None): print or logging function. Defaults to
            ``print`` when verbose is truthy.

        ascii_only (bool): if True dont use unicode characters.
            if the environ ASCII_ONLY is present this is forced to True and
            cannot be undone. Defaults to False.

    Returns:
        Dict: a dictionary with the keys:
            ``metrics`` (DataFrame) - one row per class plus a ``combined``
            row; columns are precision, recall, fpr, markedness, bookmaker,
            mcc, f1, g1, and support.
            ``confusion`` (DataFrame) - confusion matrix (real on the rows,
            pred on the columns) with an extra row / column of marginal sums.
            ``mcc`` (float | None) - the sklearn Matthews correlation
            coefficient, or None if sklearn raised a ValueError.

    Example:
        >>> # xdoctest: +IGNORE_WANT
        >>> # xdoctest: +REQUIRES(module:sklearn)
        >>> # xdoctest: +REQUIRES(module:pandas)
        >>> y_true = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3]
        >>> y_pred = [1, 2, 1, 3, 1, 2, 2, 3, 2, 2, 3, 3, 2, 3, 3, 3, 1, 3]
        >>> target_names = None
        >>> sample_weight = None
        >>> report = classification_report(y_true, y_pred, verbose=0, ascii_only=1)
        >>> print(report['confusion'])
        pred   1  2  3  sum-r
        real
        1      3  1  1      5
        2      0  4  1      5
        3      1  1  6      8
        sum-p  4  6  8     18
        >>> print(report['metrics'])
        metric    precision  recall    fpr  markedness  bookmaker    mcc  support
        class
        1            0.7500  0.6000 0.0769      0.6071     0.5231 0.5635        5
        2            0.6667  0.8000 0.1538      0.5833     0.6462 0.6139        5
        3            0.7500  0.7500 0.2000      0.5500     0.5500 0.5500        8
        combined     0.7269  0.7222 0.1530      0.5751     0.5761 0.5758       18

    Example:
        >>> # xdoctest: +IGNORE_WANT
        >>> # xdoctest: +REQUIRES(module:sklearn)
        >>> # xdoctest: +REQUIRES(module:pandas)
        >>> from kwcoco.metrics.clf_report import *  # NOQA
        >>> y_true = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3]
        >>> y_pred = [1, 2, 1, 3, 1, 2, 2, 3, 2, 2, 3, 3, 2, 3, 3, 3, 1, 3]
        >>> target_names = None
        >>> sample_weight = None
        >>> logs = []
        >>> report = classification_report(y_true, y_pred, verbose=1, ascii_only=True, log=logs.append)
        >>> print('\n'.join(logs))
    """
    import warnings

    import pandas as pd
    import scipy.stats  # import the submodule explicitly; gmean lives there
    import sklearn.metrics
    from sklearn.preprocessing import LabelEncoder

    # The environment variable always wins over the keyword argument.
    if ASCII_ONLY:
        ascii_only = True

    if verbose and log is None:
        log = print

    if target_names is None:
        unique_labels = np.unique(np.hstack([y_true, y_pred]))
        if len(unique_labels) == 1 and (unique_labels[0] == 0 or unique_labels[0] == 1):
            # Only a single 0 / 1 label was observed: treat the problem as
            # binary so that both classes still appear in the report.
            target_names = np.array([False, True])
            y_true_ = y_true
            y_pred_ = y_pred
        else:
            # Map arbitrary labels onto contiguous integer indices.
            lb = LabelEncoder()
            lb.fit(unique_labels)
            y_true_ = lb.transform(y_true)
            y_pred_ = lb.transform(y_pred)
            target_names = lb.classes_
    else:
        y_true_ = y_true
        y_pred_ = y_pred

    # Real data is on the rows,
    # Pred data is on the cols.
    cm = sklearn.metrics.confusion_matrix(
        y_true_, y_pred_, sample_weight=sample_weight,
        labels=np.arange(len(target_names)))
    confusion = cm

    n_classes = len(cm)  # number of classes
    N = cm.sum()  # number of examples (or total weight)

    real_total = cm.sum(axis=1)
    pred_total = cm.sum(axis=0)

    # the number of "positive" cases **per class**
    n_pos = real_total
    # the number of "negative" cases **per class** (everything else)
    n_neg = N - n_pos
    # number of true positives per class
    n_tps = np.diag(cm)
    # number of false positives per class (off-diagonal column sums)
    n_fps = (cm - np.diagflat(n_tps)).sum(axis=0)

    # Classes without support / predictions produce 0/0 and x/0 below; the
    # resulting nan / inf values are expected. ``np.errstate`` silences the
    # floating point warnings independent of the numpy version, and the
    # message filters are retained for warnings raised via ``warnings.warn``.
    with warnings.catch_warnings(), np.errstate(divide='ignore', invalid='ignore'):
        warnings.filterwarnings('ignore', message='invalid .* true_divide')
        warnings.filterwarnings('ignore', message='divide by zero')
        warnings.filterwarnings('ignore', message='invalid value encountered in double_scalars')
        warnings.filterwarnings('ignore', message='Mean of empty slice')

        tprs = n_tps / real_total  # true pos rate (recall)
        tpas = n_tps / pred_total  # true pos accuracy (precision)

        unused = (real_total + pred_total) == 0

        fprs = n_fps / n_neg  # false pos rate
        fprs[unused] = np.nan

        rprob = real_total / N
        pprob = pred_total / N

        # For reference, the binary MCC of cm = [[A, B], [C, D]] is
        # (A * D - B * C) / np.sqrt((A + C) * (B + D) * (A + B) * (C + D))

        # bookmaker is analogous to recall, but unbiased by class frequency
        rprob_mat = np.tile(rprob, [n_classes, 1]).T - (1 - np.eye(n_classes))
        bmcm = cm.T / rprob_mat
        bms = np.sum(bmcm.T, axis=0) / N

        # markedness is analogous to precision, but unbiased by class frequency
        pprob_mat = np.tile(pprob, [n_classes, 1]).T - (1 - np.eye(n_classes))
        mkcm = cm / pprob_mat
        mks = np.sum(mkcm.T, axis=0) / N

        mccs = np.sign(bms) * np.sqrt(np.abs(bms * mks))

        # https://en.wikipedia.org/wiki/F1_score
        # f1 is the harmonic mean and g1 the geometric mean of precision
        # and recall.
        f1_scores = 2 * (tpas * tprs) / (tpas + tprs)
        g1_scores = scipy.stats.gmean(np.hstack([
            tpas[:, None],
            tprs[:, None]
        ]), axis=1)

        # Combine the per-class scores into a single summary row.
        tpa = np.nansum(tpas * rprob)
        tpr = np.nansum(tprs * rprob)
        fpr = np.nansum(fprs * rprob)
        mk = np.nansum(mks * rprob)
        bm = np.nansum(bms * pprob)

        # The simple mean seems to do the best
        # TODO: it is unclear how the per-class MCC values should be combined
        # and whether the result should agree with the sklearn implementation.
        mccs_ = mccs[~np.isnan(mccs)]
        if len(mccs_) == 0:
            mcc_combo = np.nan
        else:
            mcc_combo = np.nanmean(mccs_)

        f1_combo = np.nanmean(f1_scores)
        g1_combo = np.nanmean(g1_scores)

    perclass_data = ub.odict([
        ('precision', tpas),
        ('recall', tprs),
        ('fpr', fprs),
        ('markedness', mks),
        ('bookmaker', bms),
        ('mcc', mccs),
        ('f1', f1_scores),
        ('g1', g1_scores),
        ('support', real_total),
    ])

    combined_data = ub.odict([
        ('precision', tpa),
        ('recall', tpr),
        ('fpr', fpr),
        ('markedness', mk),
        ('bookmaker', bm),
        ('mcc', mcc_combo),
        ('f1', f1_combo),
        ('g1', g1_combo),
        ('support', real_total.sum()),
    ])

    # Build the metrics table: one row per class followed by a summary row.
    index = pd.Index(target_names, name='class')
    perclass_df = pd.DataFrame(perclass_data, index=index)
    if remove_unsupported:
        perclass_df = perclass_df[perclass_df['support'] > 0]
    combined_df = pd.DataFrame(combined_data, index=['combined'])

    metric_df = pd.concat([perclass_df, combined_df])
    metric_df.index.name = 'class'
    metric_df.columns.name = 'metric'

    # Build the confusion table with marginal sums on the last row / column.
    pred_id = ['%s' % m for m in target_names]
    real_id = ['%s' % m for m in target_names]
    confusion_df = pd.DataFrame(confusion, columns=pred_id, index=real_id)

    if ascii_only:
        sum_glyph = 'sum-'
    else:
        sum_glyph = 'Σ'
    sum_real_key = sum_glyph + 'r'
    sum_pred_key = sum_glyph + 'p'

    to_append = pd.DataFrame([confusion.sum(axis=0)], columns=pred_id, index=[sum_pred_key])
    confusion_df = pd.concat([confusion_df, to_append], axis=0)
    # The bottom-right corner is a placeholder until N is filled in below.
    confusion_df[sum_real_key] = np.hstack([confusion.sum(axis=1), [0]])
    confusion_df.index.name = 'real'
    confusion_df.columns.name = 'pred'

    # Weighted confusion matrices are float valued; show them as integers
    # whenever every entry is (numerically) integral.
    _thresh = 1e-6
    _residual = (confusion_df - np.floor(confusion_df)).values
    if np.all(_residual < _thresh):
        confusion_df = confusion_df.astype(int)
    confusion_df.iloc[(-1, -1)] = N
    _residual = (confusion_df - np.floor(confusion_df)).values
    if np.all(_residual < _thresh):
        confusion_df = confusion_df.astype(int)

    if verbose:
        if ascii_only:
            times_glyph = 'x'
        else:
            times_glyph = '×'
        cfsm_str = confusion_df.to_string(float_format=lambda x: '%.1f' % (x,))
        log('Confusion Matrix (real ' + times_glyph + ' pred) :\n' + ub.indent(cfsm_str))

        float_precision = 2
        float_format = '%.' + str(float_precision) + 'f'
        ext_report = metric_df.to_string(float_format=float_format)
        log('\nEvaluation Metric Report:' + '\n' + ub.indent(ext_report))

    report = {
        'metrics': metric_df,
        'confusion': confusion_df,
    }

    # TODO: What is the difference between sklearn multiclass-MCC
    # and BM * MK MCC?
    try:
        with warnings.catch_warnings(), np.errstate(divide='ignore', invalid='ignore'):
            warnings.filterwarnings('ignore', message='invalid value encountered in double_scalars')
            warnings.filterwarnings('ignore', message='Mean of empty slice')
            mcc = sklearn.metrics.matthews_corrcoef(
                y_true, y_pred, sample_weight=sample_weight)
    except ValueError:
        report['mcc'] = None
    else:
        # These scales are chosen somewhat arbitrarily in the context of a
        # computer vision application with relatively reasonable quality data
        # https://stats.stackexchange.com/questions/118219/how-to-interpret
        mcc_significance_scales = (
            (1.0, 'perfect'),
            (0.9, 'very strong'),
            (0.7, 'strong'),
            (0.5, 'significant'),
            (0.3, 'moderate'),
            (0.2, 'weak'),
            (0.0, 'negligible'),
        )
        # Report the first (i.e. strongest) scale the magnitude reaches.
        for scale_thresh, scale_name in mcc_significance_scales:
            if np.abs(mcc) >= scale_thresh:
                if verbose:
                    log('classifier correlation is %s' % (scale_name,))
                break
        if verbose:
            float_precision = 2
            log(('MCC\' = %.' + str(float_precision) + 'f') % (mcc,))
        report['mcc'] = mcc
    return report
def ovr_classification_report(mc_y_true, mc_probs, target_names=None,
                              sample_weight=None, metrics=None, verbose=0,
                              remove_unsupported=False, log=None):
    """
    One-vs-rest classification report

    Each class is scored as its own binary (class-vs-other) problem and the
    per-class results are then averaged, weighted by class support.

    Args:
        mc_y_true (ndarray):
            multiclass truth labels (integer label format). Shape [N].

        mc_probs (ndarray):
            multiclass probabilities for each class. Shape [N x C].

        target_names (Dict[int, str] | None): mapping from int label to string name

        sample_weight (ndarray | None): weight for each item. Shape [N].

        metrics (List[str] | None): names of metrics to compute. Recognized
            names are 'auc', 'ap', 'mcc', 'f1', 'brier', and 'kappa'.
            Defaults to all of them.

        verbose (int): if truthy, the report is written using ``log``

        remove_unsupported (bool): removes categories that have no support.
            Defaults to False.

        log (callable | None): print or logging function. If given, the
            report is logged even when verbose is falsy. Defaults to
            ``print`` when verbose is truthy.

    Returns:
        Dict: a dictionary with the keys:
            ``ovr`` (DataFrame) - one row per class; columns are the
            requested metrics plus ``support`` and ``weight``.
            ``ave`` (Series) - the support-weighted sum of each metric over
            the classes, with the total ``support`` and ``weight``.

    Example:
        >>> # xdoctest: +IGNORE_WANT
        >>> # xdoctest: +REQUIRES(module:sklearn)
        >>> # xdoctest: +REQUIRES(module:pandas)
        >>> from kwcoco.metrics.clf_report import *  # NOQA
        >>> y_true = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0]
        >>> y_probs = np.random.rand(len(y_true), max(y_true) + 1)
        >>> target_names = None
        >>> sample_weight = None
        >>> verbose = True
        >>> report = ovr_classification_report(y_true, y_probs)
        >>> print(report['ave'])
        auc     0.6541
        ap      0.6824
        kappa   0.0963
        mcc     0.1002
        brier   0.2214
        dtype: float64
        >>> print(report['ovr'])
             auc     ap  kappa    mcc  brier  support  weight
        0 0.6062 0.6161 0.0526 0.0598 0.2608        8  0.4444
        1 0.5846 0.6014 0.0000 0.0000 0.2195        5  0.2778
        2 0.8000 0.8693 0.2623 0.2652 0.1602        5  0.2778

    Ignore:
        >>> y_true = [1, 1, 1]
        >>> y_probs = np.random.rand(len(y_true), 3)
        >>> target_names = None
        >>> sample_weight = None
        >>> verbose = True
        >>> report = ovr_classification_report(y_true, y_probs)
        >>> print(report['ovr'])
    """
    import kwarray
    import pandas as pd
    import sklearn.metrics

    if metrics is None:
        metrics = ['auc', 'ap', 'mcc', 'f1', 'brier', 'kappa']

    n_classes = mc_probs.shape[1]
    # One-hot encoding of the truth; column c flags the items of class c.
    ohvec_true = np.eye(n_classes, dtype=np.uint8)[mc_y_true]

    # Preallocate the [other, class] probability buffer reused by each class
    bin_probs = np.empty((len(mc_probs), 2), dtype=mc_probs.dtype)

    # Indices and scores of the two highest scoring classes for each item.
    ranked_cidxs = kwarray.argmaxima(mc_probs, 2, axis=1)
    ranked_scores = np.take_along_axis(mc_probs, ranked_cidxs, axis=1)

    # Map everything onto 0-1 range
    mc_scores = kwarray.normalize(mc_probs, mode='linear')
    total_scores = mc_scores.sum(axis=1)

    class_metrics = ub.odict()

    # Degenerate classes produce 0/0 style warnings and ill-defined metric
    # warnings that are expected here. ``np.errstate`` silences the floating
    # point warnings independent of the numpy version, and the message
    # filters cover warnings raised via ``warnings.warn``.
    with warnings.catch_warnings(), np.errstate(divide='ignore', invalid='ignore'):
        warnings.filterwarnings('ignore', message='Mean of empty slice')
        warnings.filterwarnings('ignore', message='invalid value encountered in true_divide')
        warnings.filterwarnings('ignore', message='invalid value encountered in double_scalars')
        warnings.filterwarnings('ignore', message='divide by zero')
        warnings.filterwarnings('ignore', message='due to no true nor predicted samples')
        warnings.filterwarnings('ignore', message='ill-defined')

        for cidx in range(n_classes):
            k_metrics = ub.odict()

            class_score = mc_scores[:, cidx]

            # The "other" score is the best score among the remaining
            # classes: the top score unless the top class is this class, in
            # which case it is the runner-up.
            is_other = (ranked_cidxs != cidx)
            other_score = np.where(
                is_other[:, 0], ranked_scores[:, 0], ranked_scores[:, 1])

            # HEURISTIC:
            # We need to compute a score or "probability" of other
            # is there a better way to do this?
            # NOTE(review): other_score comes from the raw ``mc_probs`` while
            # class_score comes from the normalized ``mc_scores`` - confirm
            # that mixing the two is intended.
            class_prob = class_score / total_scores
            other_prob = other_score / total_scores

            # Consider each class a one-vs-rest problem
            bin_probs[:, 1] = class_prob
            bin_probs[:, 0] = other_prob

            # Index of the true class
            k_true = ohvec_true.T[cidx]
            # Index of the predicted class
            k_pred = np.argmax(bin_probs, axis=1)  # NOTE: ASSUME MUTEX CLASSES

            # Probabilities for the true class for each label
            bin_truth = np.eye(2)[k_true]
            true_probs = (bin_probs * bin_truth).sum(axis=1)

            if 'auc' in metrics:
                try:
                    k_metrics['auc'] = sklearn.metrics.roc_auc_score(
                        bin_truth, bin_probs, sample_weight=sample_weight)
                except ValueError:
                    # Undefined when only one of the two classes is present
                    k_metrics['auc'] = np.nan

            if 'ap' in metrics:
                k_metrics['ap'] = sklearn.metrics.average_precision_score(
                    bin_truth, bin_probs, sample_weight=sample_weight,
                )

            if 'kappa' in metrics:
                k_metrics['kappa'] = sklearn.metrics.cohen_kappa_score(
                    k_true, k_pred, labels=[0, 1], sample_weight=sample_weight)

            if 'mcc' in metrics:
                k_metrics['mcc'] = sklearn.metrics.matthews_corrcoef(
                    k_true, k_pred, sample_weight=sample_weight)

            if 'f1' in metrics:
                k_metrics['f1'] = sklearn.metrics.fbeta_score(
                    k_true, k_pred, beta=1.0, sample_weight=sample_weight,
                    zero_division=0,
                )

            if 'brier' in metrics:
                # Get the probability of the real class for each example
                # NOTE(review): true_probs was already divided by
                # total_scores above, so this normalizes a second time -
                # confirm whether that is intended.
                rprobs = np.clip(true_probs / total_scores, 0, 1)
                rwants = np.ones(len(rprobs))
                # Use custom brier implementation until sklearn is fixed.
                # NOTE: There is a bug here (but bug is in sklearn 0.19.1)
                # brier = sklearn.metrics.brier_score_loss(rwants, rprobs)
                mse = (rwants - rprobs) ** 2
                if sample_weight is None:
                    k_metrics['brier'] = mse.mean()
                else:
                    k_metrics['brier'] = (mse * sample_weight).sum() / sample_weight.sum()

            if sample_weight is None:
                k_metrics['support'] = k_true.sum()
            else:
                k_metrics['support'] = (sample_weight * k_true).sum()

            key = cidx if target_names is None else target_names[cidx]
            class_metrics[key] = k_metrics

    ovr_metrics = pd.DataFrame.from_dict(class_metrics, orient='index')
    if remove_unsupported:
        # Copy so the column assignment below acts on an independent frame.
        ovr_metrics = ovr_metrics[ovr_metrics['support'] > 0].copy()

    # Each class contributes to the average in proportion to its support.
    weight = ovr_metrics.loc[:, 'support'] / ovr_metrics.loc[:, 'support'].sum()
    ovr_metrics['weight'] = weight
    weighted = ovr_metrics.mul(weight, axis=0)
    weighted_ave = weighted.sum(axis=0)
    # Support and weight are totals, not weighted averages.
    weighted_ave['support'] = ovr_metrics['support'].sum()
    weighted_ave['weight'] = ovr_metrics['weight'].sum()

    report = {
        'ovr': ovr_metrics,
        'ave': weighted_ave,
    }

    if verbose or log:
        if log is None:
            log = print
        log('ovr_metrics')
        log(pd.concat([ovr_metrics, weighted_ave.to_frame('__accum__').T]))

    return report