diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py
index c98b0e14493c6..142f7e50f1903 100644
--- a/sklearn/metrics/__init__.py
+++ b/sklearn/metrics/__init__.py
@@ -30,6 +30,7 @@
 from .classification import recall_score
 from .classification import zero_one_loss
 from .classification import brier_score_loss
+from .classification import calibration_loss
 
 from . import cluster
 from .cluster import adjusted_mutual_info_score
@@ -120,4 +121,5 @@
     'v_measure_score',
     'zero_one_loss',
     'brier_score_loss',
+    'calibration_loss',
 ]
diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py
index e0055e3476f04..1b6b4455b93cb 100644
--- a/sklearn/metrics/classification.py
+++ b/sklearn/metrics/classification.py
@@ -1993,3 +1993,65 @@ def brier_score_loss(y_true, y_prob, sample_weight=None, pos_label=None):
     y_true = np.array(y_true == pos_label, int)
     y_true = _check_binary_probabilistic_predictions(y_true, y_prob)
     return np.average((y_true - y_prob) ** 2, weights=sample_weight)
+
+
+def calibration_loss(y_true, y_prob, bin_size=2):
+    """Compute the calibration loss over overlapping bins.
+
+    The calibration loss measures how well predicted probabilities match
+    the observed outcomes. This is the CAL measure, based on overlapping
+    binning (Caruana and Niculescu-Mizil, 2004).
+
+    Parameters
+    ----------
+    y_true : array, shape (n_samples,)
+        True targets.
+
+    y_prob : array, shape (n_samples,)
+        Probabilities of the positive class.
+
+    bin_size : int, optional (default=2)
+        Number of samples per overlapping bin.
+
+    Returns
+    -------
+    score : float
+        Calibration loss.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.metrics import calibration_loss
+    >>> y_true = np.array([0, 1, 1, 0])
+    >>> y_prob = np.array([0.1, 0.9, 0.8, 0.3])
+    >>> calibration_loss(y_true, y_prob, bin_size=1)
+    0.175
+    >>> calibration_loss(y_true, y_prob, bin_size=2)
+    0.5333333333333333
+    """
+    y_true = column_or_1d(y_true)
+    y_prob = column_or_1d(y_prob)
+    check_consistent_length(y_true, y_prob)
+
+    n_bins = len(y_true) - bin_size + 1
+    pos_loss = 0.0
+    neg_loss = 0.0
+
+    for bin_start in range(n_bins):
+        bin_end = bin_start + bin_size
+
+        # Mean absolute difference between the predicted probability of the
+        # positive class and its empirical rate within the bin.
+        actual_per_pos_class = y_true[bin_start:bin_end].sum() / bin_size
+        pos_loss += abs(y_prob[bin_start:bin_end]
+                        - actual_per_pos_class).sum()
+
+        # The same comparison from the point of view of the negative class.
+        actual_per_neg_class = (bin_size
+                                - y_true[bin_start:bin_end].sum()) / bin_size
+        neg_loss += abs((1 - y_prob[bin_start:bin_end])
+                        - actual_per_neg_class).sum()
+
+    pos_loss /= n_bins
+    neg_loss /= n_bins
+    return 0.5 * (pos_loss + neg_loss)
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index cae78e721bc8e..aca45c889815b 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -28,6 +28,7 @@
 from sklearn.utils.mocking import MockDataFrame
 
 from sklearn.metrics import accuracy_score
 from sklearn.metrics import average_precision_score
+from sklearn.metrics import calibration_loss
 from sklearn.metrics import classification_report
 from sklearn.metrics import cohen_kappa_score
@@ -1635,3 +1636,34 @@ def test_brier_score_loss():
     # calculate even if only single class in y_true (#6980)
     assert_almost_equal(brier_score_loss([0], [0.5]), 0.25)
     assert_almost_equal(brier_score_loss([1], [0.5]), 0.25)
+
+
+def test_calibration_loss():
+    # Check calibration_loss on a small binary example.
+    y_true = np.array([0, 1, 1, 0, 1, 1])
+    y_pred = np.array([0.1, 0.8, 0.9, 0.3, 1.0, 0.95])
+    calibration_loss_val = calibration_loss(y_true, y_pred, bin_size=2)
+    assert_almost_equal(calibration_loss_val, 0.47, decimal=4)
+
+
+def test_balanced_accuracy_score_unseen():
+    assert_warns_message(UserWarning, 'y_pred contains classes not in y_true',
+                         balanced_accuracy_score, [0, 0, 0], [0, 0, 1])
+
+
+@pytest.mark.parametrize('y_true,y_pred',
+                         [
+                          (['a', 'b', 'a', 'b'], ['a', 'a', 'a', 'b']),
+                          (['a', 'b', 'c', 'b'], ['a', 'a', 'a', 'b']),
+                          (['a', 'a', 'a', 'b'], ['a', 'b', 'c', 'b']),
+                         ])
+def test_balanced_accuracy_score(y_true, y_pred):
+    macro_recall = recall_score(y_true, y_pred, average='macro',
+                                labels=np.unique(y_true))
+    with ignore_warnings():
+        # Warnings are tested in test_balanced_accuracy_score_unseen
+        balanced = balanced_accuracy_score(y_true, y_pred)
+    assert balanced == pytest.approx(macro_recall)
+    adjusted = balanced_accuracy_score(y_true, y_pred, adjusted=True)
+    chance = balanced_accuracy_score(y_true, np.full_like(y_true, y_true[0]))
+    assert adjusted == (balanced - chance) / (1 - chance)
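
Review note: as a sanity check on the overlapping-bin CAL computation added in `calibration_loss`, here is a standalone sketch that mirrors the patched logic and reproduces the docstring values. `cal_sketch` is a hypothetical helper used only for illustration; it is not part of the patch and assumes plain NumPy inputs with no validation.

```python
import numpy as np


def cal_sketch(y_true, y_prob, bin_size=2):
    """Standalone sketch mirroring the overlapping-bin CAL loss in the patch."""
    y_true = np.asarray(y_true, dtype=float)
    y_prob = np.asarray(y_prob, dtype=float)
    n_bins = len(y_true) - bin_size + 1
    pos_loss = neg_loss = 0.0
    for start in range(n_bins):
        stop = start + bin_size
        # Empirical positive rate inside the sliding window of bin_size samples,
        # compared with the predicted probabilities in the same window.
        pos_rate = y_true[start:stop].sum() / bin_size
        pos_loss += np.abs(y_prob[start:stop] - pos_rate).sum()
        # Mirror-image term for the negative class.
        neg_loss += np.abs((1.0 - y_prob[start:stop]) - (1.0 - pos_rate)).sum()
    return 0.5 * (pos_loss + neg_loss) / n_bins


y_true = np.array([0, 1, 1, 0])
y_prob = np.array([0.1, 0.9, 0.8, 0.3])
print(cal_sketch(y_true, y_prob, bin_size=1))  # ~0.175, matches the docstring example
print(cal_sketch(y_true, y_prob, bin_size=2))  # ~0.5333, matches the docstring example
```

Note that for binary targets the per-bin positive and negative terms are equal (|(1 - p) - (1 - a)| = |p - a|), so averaging the two halves leaves the value unchanged; the sketch keeps both terms only to stay close to the patched implementation.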