diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst
index a45765cbd04a3..893f2d6644851 100644
--- a/doc/whats_new/v1.6.rst
+++ b/doc/whats_new/v1.6.rst
@@ -248,6 +248,11 @@ Changelog
   whether to raise an exception if a subset of the scorers in multimetric scoring
   fails or to return an error code. :pr:`28992` by :user:`Stefanie Senger `.
 
+- |MajorFeature| :func:`metrics.decision_threshold_curve` has been added to
+  compute the values of a threshold-dependent metric over a range of decision
+  thresholds of a binary classifier. :pr:`25639` by
+  :user:`Carlo Lemos `.
+
 - |Enhancement| Adds `zero_division` to :func:`cohen_kappa_score`. When there is a
   division by zero, the metric is undefined and this value is returned.
   :pr:`29210` by :user:`Marc Torrellas Socastro ` and
diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py
index 787df39a21979..25f7b62149884 100644
--- a/sklearn/metrics/__init__.py
+++ b/sklearn/metrics/__init__.py
@@ -66,6 +66,7 @@
     root_mean_squared_log_error,
 )
 from ._scorer import check_scoring, get_scorer, get_scorer_names, make_scorer
+from ._decision_threshold import decision_threshold_curve
 from .cluster import (
     adjusted_mutual_info_score,
     adjusted_rand_score,
@@ -117,6 +118,7 @@
     "d2_log_loss_score",
     "d2_pinball_score",
     "dcg_score",
+    "decision_threshold_curve",
     "davies_bouldin_score",
     "DetCurveDisplay",
     "det_curve",
diff --git a/sklearn/metrics/_decision_threshold.py b/sklearn/metrics/_decision_threshold.py
new file mode 100644
index 0000000000000..8bf61e08e0653
--- /dev/null
+++ b/sklearn/metrics/_decision_threshold.py
@@ -0,0 +1,99 @@
+"""Metric per threshold curve to assess binary classification performance.
+
+Given a grid of thresholds, one can understand the behaviour of
+threshold-dependent metrics as the decision threshold changes. In imbalanced
+scenarios or cost-sensitive learning, a 0.5 threshold may not be optimal, and
+tools like this one help visualize how performance changes with the threshold.
+"""
+
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
+
+from numbers import Integral
+
+from ..utils._param_validation import Interval, validate_params
+from ._scorer import _CurveScorer
+
+
+@validate_params(
+    {
+        "y_true": ["array-like"],
+        "y_score": ["array-like"],
+        "scoring": [callable],
+        "thresholds": [
+            Interval(Integral, 3, None, closed="left"),
+            "array-like",
+            None,
+        ],
+    },
+    prefer_skip_nested_validation=True,
+)
+def decision_threshold_curve(
+    y_true,
+    y_score,
+    scoring,
+    thresholds=100,
+):
+    """Compute a threshold-dependent metric of interest for each threshold.
+
+    Note: this implementation is restricted to the binary classification task.
+
+    Read more in the :ref:`User Guide `.
+
+    .. versionadded:: 1.6
+
+    Parameters
+    ----------
+    y_true : array-like of shape (n_samples,)
+        True targets of binary classification.
+
+    y_score : array-like of shape (n_samples,)
+        Estimated probabilities or output of a decision function.
+
+    scoring : callable
+        The objective metric to be estimated. It should be a callable object
+        created with :func:`~sklearn.metrics.make_scorer`.
+        # TODO(Carlo): Change it to also accept a plain metric callable. In that
+        # case, transform it into a scorer inside the function.
+
+    thresholds : int or array-like, default=100
+        The decision thresholds for which the score is computed. If an integer,
+        `thresholds` values are generated, uniformly spaced between the minimum
+        and maximum of `y_score`. If an array-like, its values are used directly
+        as the thresholds.
+
+    Returns
+    -------
+    metric_values : ndarray of shape (n_thresholds,)
+        The scores associated with each threshold. The value at index i is the
+        threshold-dependent metric for the predictions with score >= thresholds[i].
+        # TODO(Carlo): Check if > or >=.
+
+    thresholds : ndarray of shape (n_thresholds,)
+        Ascending score values used as thresholds.
+
+    See Also
+    --------
+    precision_recall_curve : Compute precision-recall pairs for different
+        probability thresholds.
+    det_curve : Compute error rates for different probability thresholds.
+    roc_curve : Compute Receiver operating characteristic (ROC) curve.
+
+    Examples
+    --------
+    # TODO(Carlo): change the example and fix the thresholds.
+    >>> import numpy as np
+    >>> from sklearn.metrics import accuracy_score, decision_threshold_curve
+    >>> y_true = np.array([0, 0, 1, 1])
+    >>> y_score = np.array([0.1, 0.4, 0.35, 0.8])
+    >>> accuracy_values, thresholds = decision_threshold_curve(
+    ...     y_true, y_score, accuracy_score)
+    >>> thresholds
+    array([0.1 , 0.35, 0.4 , 0.8 ])
+    >>> accuracy_values
+    array([0.75, 0.5 , 0.75, 0.5 ])
+    """
+    # TODO(Carlo): if `scoring` is a plain function, transform it into a scorer
+    # here (do we need an estimator for that?).
+    curve_scorer = _CurveScorer.from_scorer(scoring, thresholds)
+    metric_values, thresholds = curve_scorer._score_given_prediction(y_score, y_true)
+
+    return metric_values, thresholds
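For reference, the curve documented above amounts to: build a grid of thresholds spanning `y_score`, turn the scores into hard labels at each threshold, and evaluate the metric on those labels. The following sketch reproduces that with plain NumPy, assuming `>=` thresholding (the TODO in the docstring notes this is still to be confirmed); the helper name and the toy data are illustrative only.

import numpy as np
from sklearn.metrics import balanced_accuracy_score

def naive_metric_per_threshold(y_true, y_score, metric, n_thresholds=100):
    # Uniform grid between the smallest and largest predicted score.
    thresholds = np.linspace(np.min(y_score), np.max(y_score), n_thresholds)
    # Score the hard labels obtained at each candidate threshold.
    metric_values = np.array(
        [metric(y_true, (y_score >= th).astype(int)) for th in thresholds]
    )
    return metric_values, thresholds

rng = np.random.default_rng(0)
y_true = np.array([0] * 80 + [1] * 20)  # imbalanced toy problem
y_score = np.clip(rng.uniform(size=100) + 0.3 * y_true, 0, 1)
values, thresholds = naive_metric_per_threshold(
    y_true, y_score, balanced_accuracy_score, n_thresholds=10
)
print(thresholds[np.argmax(values)])  # threshold maximising balanced accuracy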
diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py
index f09b4e6d77442..15d7d9f663667 100644
--- a/sklearn/metrics/_scorer.py
+++ b/sklearn/metrics/_scorer.py
@@ -28,6 +28,7 @@
 from ..base import is_regressor
 from ..utils import Bunch
+from ..utils._encode import _unique
 from ..utils._param_validation import HasMethods, Hidden, StrOptions, validate_params
 from ..utils._response import _get_response_values
 from ..utils.metadata_routing import (
@@ -1132,11 +1133,12 @@ class _CurveScorer(_BaseScorer):
         uniformly distributed between the minimum and maximum predicted scores.
         If an array-like, it will be used as the thresholds.
 
-    response_method : str
-        The method to call on the estimator to get the response values.
+    response_method : str, default=None
+        The method to call on the estimator to get the response values. If set to
+        `None`, the scorer can only be used via `_score_given_prediction`.
     """
 
-    def __init__(self, score_func, sign, kwargs, thresholds, response_method):
+    def __init__(self, score_func, sign, kwargs, thresholds, response_method=None):
         super().__init__(
             score_func=score_func,
             sign=sign,
@@ -1146,19 +1148,68 @@ def __init__(self, score_func, sign, kwargs, thresholds, response_method):
         self._thresholds = thresholds
 
     @classmethod
-    def from_scorer(cls, scorer, response_method, thresholds):
+    def from_scorer(cls, scorer, thresholds, response_method=None):
         """Create a continuous scorer from a normal scorer."""
         instance = cls(
             score_func=scorer._score_func,
             sign=scorer._sign,
-            response_method=response_method,
             thresholds=thresholds,
+            response_method=response_method,
             kwargs=scorer._kwargs,
         )
         # transfer the metadata request
         instance._metadata_request = scorer._get_metadata_request()
         return instance
 
+    # TODO(Carlo): Add tests for this method.
+    def _score_given_prediction(
+        self, y_score, y_true, classes=None, pos_label=None, **kwargs
+    ):
+        """Calculate the scores for given prediction values and true labels.
+
+        Parameters
+        ----------
+        y_score : array-like of shape (n_samples,)
+            Predicted target scores.
+
+        y_true : array-like of shape (n_samples,)
+            Gold standard target values.
+
+        classes : array-like of shape (n_classes,), default=None
+            Class labels. If `None`, they are inferred from `y_true`.
+
+        pos_label : int, float, bool or str, default=None
+            The label of the positive class. If `None`, it is determined by the
+            scorer via `self._get_pos_label()`.
+
+        **kwargs : dict
+            Other parameters passed to the scorer.
+
+        Returns
+        -------
+        score_thresholds : ndarray of shape (thresholds,)
+            The scores associated with each threshold.
+
+        potential_thresholds : ndarray of shape (thresholds,)
+            The potential thresholds used to compute the scores.
+        """
+        if classes is None:
+            classes = _unique(y_true)
+        if pos_label is None:
+            pos_label = self._get_pos_label()
+        scoring_kwargs = {**self._kwargs, **kwargs}
+        if isinstance(self._thresholds, Integral):
+            potential_thresholds = np.linspace(
+                np.min(y_score), np.max(y_score), self._thresholds
+            )
+        else:
+            potential_thresholds = np.asarray(self._thresholds)
+        score_thresholds = [
+            self._sign
+            * self._score_func(
+                y_true,
+                _threshold_scores_to_class_labels(y_score, th, classes, pos_label),
+                **scoring_kwargs,
+            )
+            for th in potential_thresholds
+        ]
+        return np.array(score_thresholds), potential_thresholds
+
     def _score(self, method_caller, estimator, X, y_true, **kwargs):
         """Evaluate predicted target values for X relative to y_true.
 
@@ -1189,27 +1240,18 @@ def _score(self, method_caller, estimator, X, y_true, **kwargs):
         potential_thresholds : ndarray of shape (thresholds,)
             The potential thresholds used to compute the scores.
         """
-        pos_label = self._get_pos_label()
+        if self._response_method is None:
+            raise ValueError(
+                "If response_method is set to `None`, you can't use this method. "
+                "Use `_score_given_prediction` instead."
+            )
         y_score = method_caller(
-            estimator, self._response_method, X, pos_label=pos_label
+            estimator, self._response_method, X, pos_label=self._get_pos_label()
         )
+        classes = estimator.classes_
-        scoring_kwargs = {**self._kwargs, **kwargs}
-        if isinstance(self._thresholds, Integral):
-            potential_thresholds = np.linspace(
-                np.min(y_score), np.max(y_score), self._thresholds
-            )
-        else:
-            potential_thresholds = np.asarray(self._thresholds)
-        score_thresholds = [
-            self._sign
-            * self._score_func(
-                y_true,
-                _threshold_scores_to_class_labels(
-                    y_score, th, estimator.classes_, pos_label
-                ),
-                **scoring_kwargs,
-            )
-            for th in potential_thresholds
-        ]
-        return np.array(score_thresholds), potential_thresholds
+        scores, potential_thresholds = self._score_given_prediction(
+            y_score, y_true, classes, **kwargs
+        )
+
+        return scores, potential_thresholds
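The `_CurveScorer` changes above are what `decision_threshold_curve` builds on: `from_scorer` now takes `thresholds` before an optional `response_method`, and the new `_score_given_prediction` scores precomputed predictions without touching an estimator. A rough usage sketch of that internal path, assuming the patch is applied as-is (this is private API and may change):

import numpy as np
from sklearn.metrics import balanced_accuracy_score, make_scorer
from sklearn.metrics._scorer import _CurveScorer

y_true = np.array([0, 0, 1, 1, 1])
y_score = np.array([0.2, 0.6, 0.3, 0.7, 0.9])  # e.g. clf.predict_proba(X)[:, 1]

# response_method is left as None, so this curve scorer can only be used with
# precomputed scores, not with an estimator.
curve_scorer = _CurveScorer.from_scorer(
    make_scorer(balanced_accuracy_score), thresholds=5
)
scores, thresholds = curve_scorer._score_given_prediction(y_score, y_true)
print(thresholds)  # 5 values uniformly spaced between 0.2 and 0.9
print(scores)      # balanced accuracy of the hard labels at each threshold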
diff --git a/sklearn/metrics/tests/test_decision_threshold.py b/sklearn/metrics/tests/test_decision_threshold.py
new file mode 100644
index 0000000000000..950ea9e28c916
--- /dev/null
+++ b/sklearn/metrics/tests/test_decision_threshold.py
@@ -0,0 +1,133 @@
+from functools import partial
+
+import pytest
+
+from sklearn.metrics import (
+    accuracy_score,
+    f1_score,
+    fbeta_score,
+    precision_score,
+    recall_score,
+)
+
+
+# TODO(Carlo): Update tests.
+def test_grid_int_bigger_than_set_then_all():
+    # """When the `thresholds` parameter is bigger than the number of unique
+    # `y_score` values, `len(thresholds)` should be equal to `len(set(y_score))`.
+    # """
+
+    # X, y = make_classification()
+    # clf = RandomForestClassifier(n_estimators=10, random_state=42).fit(X, y)
+    # y_score = clf.predict_proba(X)[:, 1]
+
+    # _, thresholds_big_int = decision_threshold_curve(
+    #     y, y_score, accuracy_score, thresholds=len(set(y_score)) + 1000
+    # )
+
+    # assert len(thresholds_big_int) == len(set(y_score))
+    assert True
+
+
+def test_binary_clf_curve_multiclass_error():
+    # rng = check_random_state(404)
+    # y_true = rng.randint(0, 3, size=10)
+    # y_pred = rng.rand(10)
+    # msg = "In a multiclass scenario, you must pass "
+    # with pytest.raises(ValueError, match=msg):
+    #     decision_threshold_curve(y_true, y_pred, accuracy_score)
+    assert True
+
+
+@pytest.mark.parametrize(
+    "metric",
+    [
+        # make_scorer(fbeta_score, beta=3),
+        # make_scorer(fbeta_score, beta=0.5),
+        f1_score,
+        precision_score,
+        recall_score,
+        accuracy_score,
+    ],
+)
+def test_decision_threshold_curve_end_points(metric):
+    # rng = check_random_state(0)
+    # y_true = np.array([0] * 50 + [1] * 50)
+    # y_score = rng.normal(3, size=100)
+    # min_score, max_score = min(y_score), max(y_score)
+
+    # metric_values, _ = decision_threshold_curve(y_true, y_score, metric)
+
+    # assert metric_values[0] == metric(y_true, (y_score > min_score) * 1)
+    # assert metric_values[-1] == metric(y_true, (y_score > max_score) * 1)
+    assert True
+
+
+@pytest.mark.parametrize(
+    "metric",
+    [partial(fbeta_score, beta=3), precision_score, recall_score],
+)
+def test_zero_sample_weight_equals_excluding(metric):
+    # rng = check_random_state(0)
+    # y_true = np.array([0] * 50 + [1] * 50)
+    # y_score = rng.normal(3, size=100)
+
+    # sample_weight = np.array([0] * 20 + [1] * 80)
+    # scoring_kwargs = {"sample_weight": sample_weight}
+    # metric_values_sw, _ = decision_threshold_curve(
+    #     y_true, y_score, metric, scoring_kwargs=scoring_kwargs
+    # )
+
+    # y_true_exclude = y_true[sample_weight != 0]
+    # y_score_exclude = y_score[sample_weight != 0]
+    # metric_values_exclude, _ = decision_threshold_curve(
+    #     y_true_exclude, y_score_exclude, metric
+    # )
+
+    # assert_allclose(metric_values_sw, metric_values_exclude)
+    assert True
+
+
+def test_len_of_threshold_when_passing_int():
+    # y = [0] * 500 + [1] * 500
+    # y_score = list(range(1000))
+    # _, thresholds = decision_threshold_curve(
+    #     y, y_score, accuracy_score, thresholds=13
+    # )
+
+    # assert len(thresholds) == 13
+    assert True
+
+
+@pytest.mark.parametrize(
+    "metric, scoring_kwargs",
+    [
+        (f1_score, None),
+        (f1_score, {}),
+        (fbeta_score, {"beta": 4}),
+    ],
+)
+def test_scoring_kwargs(metric, scoring_kwargs):
+    # y_true = np.array([0] * 50 + [1] * 50)
+    # decision_threshold_curve(y_true, y_true, metric, scoring_kwargs=scoring_kwargs)
+    assert True
+
+
+def test_passing_the_grid():
+    # y = [0] * 500 + [1] * 500
+    # y_score = list(range(1000))
+
+    # grid_sorted = np.array(list(range(200, 300)))
+    # _, thresholds_sorted = decision_threshold_curve(
+    #     y, y_score, accuracy_score, thresholds=grid_sorted
+    # )
+
+    # assert_allclose(grid_sorted, thresholds_sorted)
+
+    # grid_not_sorted = grid_sorted[::-1]
+    # _, thresholds_not_sorted = decision_threshold_curve(
+    #     y, y_score, accuracy_score, thresholds=grid_not_sorted
+    # )
+
+    # assert_allclose(grid_sorted, thresholds_not_sorted)
+    assert True
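The tests above are still placeholders. As an illustration of how one of them could be re-enabled against the current signature, `test_len_of_threshold_when_passing_int` might look roughly as follows, assuming `scoring` still has to be built with `make_scorer` (see the TODO in `decision_threshold_curve`); this is only a sketch.

import numpy as np
from sklearn.metrics import accuracy_score, decision_threshold_curve, make_scorer

def test_len_of_threshold_when_passing_int():
    # 1000 distinct scores, so an integer grid is not capped by unique values.
    y_true = np.array([0] * 500 + [1] * 500)
    y_score = np.arange(1000, dtype=float)

    _, thresholds = decision_threshold_curve(
        y_true, y_score, make_scorer(accuracy_score), thresholds=13
    )

    assert len(thresholds) == 13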
+ # """ + + # X, y = make_classification() + # clf = RandomForestClassifier(n_estimators=10, random_state=42).fit(X, y) + # y_score = clf.predict_proba(X)[:, 1] + + # _, thresholds_big_int = decision_threshold_curve( + # y, y_score, accuracy_score, thresholds=len(set(y_score)) + 1000 + # ) + + # assert len(thresholds_big_int) == len(set(y_score)) + assert True + + +def test_binary_clf_curve_multiclass_error(): + # rng = check_random_state(404) + # y_true = rng.randint(0, 3, size=10) + # y_pred = rng.rand(10) + # msg = "In a multiclass scenario, you must pass " + # with pytest.raises(ValueError, match=msg): + # decision_threshold_curve(y_true, y_pred, accuracy_score) + assert True + + +@pytest.mark.parametrize( + "metric", + [ + # make_scorer(fbeta_score, beta=3), + # make_scorer(fbeta_score, beta=0.5), + f1_score, + precision_score, + recall_score, + accuracy_score, + ], +) +def test_decision_threshold_curve_end_points(metric): + # rng = check_random_state(0) + # y_true = np.array([0] * 50 + [1] * 50) + # y_score = rng.normal(3, size=100) + # min_pred, max_score = min(y_score), max(y_score) + + # metric_values, _ = decision_threshold_curve(y_true, y_score, metric) + + # assert metric_values[0] == metric(y_true, (y_score > min_pred) * 1) + # assert metric_values[-1] == metric(y_true, (y_score > max_score) * 1) + assert True + + +@pytest.mark.parametrize( + "metric", + [partial(fbeta_score, beta=3), precision_score, recall_score], +) +def test_zero_sample_weight_equals_excluding(metric): + # rng = check_random_state(0) + # y_true = np.array([0] * 50 + [1] * 50) + # y_score = rng.normal(3, size=100) + + # sample_weight = np.array([0] * 20 + [1] * 80) + # scoring_kwargs = {"sample_weight": sample_weight} + # metric_values_sw, _ = decision_threshold_curve( + # y_true, y_score, metric, scoring_kwargs=scoring_kwargs + # ) + + # y_true_exclude = y_true[sample_weight != 0] + # y_score_exclude = y_score[sample_weight != 0] + # metric_values_exclude, _ = decision_threshold_curve( + # y_true_exclude, y_score_exclude, metric + # ) + + # assert_allclose(metric_values_sw, metric_values_exclude) + assert True + + +def test_len_of_threshold_when_passing_int(): + # y = [0] * 500 + [1] * 500 + # y_score = list(range(1000)) + # _, thresholds = decision_threshold_curve( + # y, y_score, accuracy_score, thresholds=13 + # ) + + # assert len(thresholds) == 13 + assert True + + +@pytest.mark.parametrize( + "metric, scoring_kwargs", + [ + (f1_score, None), + (f1_score, {}), + (fbeta_score, {"beta": 4}), + ], +) +def test_scoring_kwargs(metric, scoring_kwargs): + # y_true = np.array([0] * 50 + [1] * 50) + # decision_threshold_curve(y_true, y_true, metric, scoring_kwargs=scoring_kwargs) + assert True + + +def test_passing_the_grid(): + # y = [0] * 500 + [1] * 500 + # y_score = list(range(1000)) + + # grid_sorted = np.array(list(range(200, 300))) + # _, thresholds_sorted = decision_threshold_curve( + # y, y_score, accuracy_score, thresholds=grid_sorted + # ) + + # assert_allclose(grid_sorted, thresholds_sorted) + + # grid_not_sorted = grid_sorted[::-1] + # _, thresholds_not_sorted = decision_threshold_curve( + # y, y_score, accuracy_score, thresholds=grid_not_sorted + # ) + + # assert_allclose(grid_sorted, thresholds_not_sorted) + assert True diff --git a/sklearn/model_selection/_classification_threshold.py b/sklearn/model_selection/_classification_threshold.py index bd30a98ac7cc9..8a6015a9bd46a 100644 --- a/sklearn/model_selection/_classification_threshold.py +++ b/sklearn/model_selection/_classification_threshold.py 
diff --git a/sklearn/tests/test_public_functions.py b/sklearn/tests/test_public_functions.py
index 707aa37737c1b..1ddb6421e4834 100644
--- a/sklearn/tests/test_public_functions.py
+++ b/sklearn/tests/test_public_functions.py
@@ -239,6 +239,7 @@ def _check_function_param_validation(
     "sklearn.metrics.d2_tweedie_score",
     "sklearn.metrics.davies_bouldin_score",
     "sklearn.metrics.dcg_score",
+    "sklearn.metrics.decision_threshold_curve",
    "sklearn.metrics.det_curve",
     "sklearn.metrics.explained_variance_score",
     "sklearn.metrics.f1_score",