
FEA Implementation of "threshold-dependent metric per threshold value" curve #25639

Open: wants to merge 35 commits into base: main

35 commits
172ac47
initial proposal with preliminary tests
vitaliset Feb 18, 2023
d038e11
removing check that validate_params already does
vitaliset Feb 18, 2023
322eccf
changelog and linting from CI
vitaliset Feb 18, 2023
7dbbec5
trying to resolve doc related ci
vitaliset Feb 18, 2023
2a0c6b3
duplicate label
vitaliset Feb 18, 2023
fbb9b9b
docstring example import error
vitaliset Feb 18, 2023
acb94be
docstring typo
vitaliset Feb 18, 2023
a5cd201
docstring typo
vitaliset Feb 18, 2023
253b3e2
docstring typo
vitaliset Feb 18, 2023
cb5fee1
docstring typo
vitaliset Feb 18, 2023
9e45e2e
change in doc order and typos
vitaliset Feb 18, 2023
ad901a2
removing example
vitaliset Feb 20, 2023
1a4ce1b
Merge branch 'main' into metric_threshold_curve
vitaliset May 14, 2023
9b4febb
Update import of _check_pos_label_consistency
vitaliset May 14, 2023
119db53
codecov
vitaliset May 14, 2023
347f524
Merge branch 'metric_threshold_curve' of https://github.com/vitaliset…
vitaliset May 14, 2023
be893c8
linting
vitaliset May 14, 2023
bd1e64f
correcting typo
vitaliset May 14, 2023
0318950
test typo
vitaliset May 14, 2023
efd6d72
add example again to check pytest
vitaliset May 14, 2023
1e500c0
Merge branch 'main' into metric_threshold_curve
vitaliset May 16, 2023
10ebc90
Merge remote-tracking branch 'origin/main' into pr/vitaliset/25639
glemaitre May 20, 2024
dfa66a5
fixing imports
glemaitre May 20, 2024
1fb1c13
towards glemaitre suggestions
vitaliset May 22, 2024
e7bb2a7
applying black suggestions
vitaliset Jun 8, 2024
5a8f0c5
update extra stuff for consistency
vitaliset Jun 8, 2024
4fab2a3
removing doc files for now as we need to adapt to pr 29038
vitaliset Jun 8, 2024
48a0055
Merge branch 'main' into metric_threshold_curve
vitaliset Jun 8, 2024
fbf1d2e
Merge branch 'main' into metric_threshold_curve
vitaliset Jul 25, 2024
98873e6
Merge branch 'main' into metric_threshold_curve
vitaliset Jul 30, 2024
f1dc0e8
Update _decision_threshold.py to add authors
vitaliset Jul 30, 2024
0284251
towards using _curvescorer in the new decision threshold function. mi…
vitaliset Jul 30, 2024
d46bc1a
correcting circular dependences
vitaliset Jul 30, 2024
0a06199
Merge branch 'main' into metric_threshold_curve
vitaliset Aug 23, 2024
a424c3e
trying to solve the circular imports. looks like the order of init is…
vitaliset Sep 30, 2024
5 changes: 5 additions & 0 deletions doc/whats_new/v1.6.rst
@@ -248,6 +248,11 @@ Changelog
whether to raise an exception if a subset of the scorers in multimetric scoring fails
or to return an error code. :pr:`28992` by :user:`Stefanie Senger <StefanieSenger>`.

- |MajorFeature| :func:`metrics.decision_threshold_curve` has been added to
measure how a threshold-dependent metric varies with the decision threshold
used by a binary classifier. :pr:`25639` by
:user:`Carlo Lemos <vitaliset>`.

- |Enhancement| Adds `zero_division` to :func:`cohen_kappa_score`. When there is a
division by zero, the metric is undefined and this value is returned.
:pr:`29210` by :user:`Marc Torrellas Socastro <marctorsoc>` and
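As a reading aid for the changelog entry above, here is a minimal usage sketch of the API this PR adds. It is not part of the diff: it assumes the public name `sklearn.metrics.decision_threshold_curve` and a `scoring` argument built with `make_scorer`, as the new docstring specifies, and the exact signature may still change during review.

# Sketch only: sweep 100 thresholds between min(y_score) and max(y_score) on an
# imbalanced toy problem and locate the cutoff maximizing balanced accuracy.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, make_scorer
from sklearn.metrics import decision_threshold_curve  # added by this PR

X, y = make_classification(weights=[0.9, 0.1], random_state=0)
clf = LogisticRegression().fit(X, y)
y_score = clf.predict_proba(X)[:, 1]

scores, thresholds = decision_threshold_curve(
    y, y_score, make_scorer(balanced_accuracy_score), thresholds=100
)
best_threshold = thresholds[np.argmax(scores)]

This is the imbalanced, cost-sensitive use case the module docstring below motivates: a fixed 0.5 cutoff is rarely the best choice for balanced accuracy on skewed data.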
2 changes: 2 additions & 0 deletions sklearn/metrics/__init__.py
@@ -66,6 +66,7 @@
root_mean_squared_log_error,
)
from ._scorer import check_scoring, get_scorer, get_scorer_names, make_scorer
from ._decision_threshold import decision_threshold_curve
from .cluster import (
adjusted_mutual_info_score,
adjusted_rand_score,
@@ -117,6 +118,7 @@
"d2_log_loss_score",
"d2_pinball_score",
"dcg_score",
"decision_threshold_curve",
"davies_bouldin_score",
"DetCurveDisplay",
"det_curve",
99 changes: 99 additions & 0 deletions sklearn/metrics/_decision_threshold.py
@@ -0,0 +1,99 @@
"""Metric per threshold curve to assess binary classification performance.

Given a threshold grid, one can understand the behaviour of threshold-dependent
metrics as the decision threshold changes. In imbalanced or cost-sensitive
learning scenarios, a 0.5 threshold may not be optimal, and tools like this
help visualize how performance changes with the threshold.
"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

from numbers import Integral

from ..utils._param_validation import Interval, validate_params
from ._scorer import _CurveScorer


@validate_params(
{
"y_true": ["array-like"],
"y_score": ["array-like"],
"scoring": [callable],
"thresholds": [
Interval(Integral, 3, None, closed="left"),
"array-like",
None,
],
},
prefer_skip_nested_validation=True,
)
def decision_threshold_curve(
y_true,
y_score,
scoring,
thresholds=100,
):
"""Compute the threshold-dependent metric of interest per threshold.

Note: this implementation is restricted to the binary classification task.

Read more in the :ref:`User Guide <metric_threshold_curve>`.

.. versionadded:: 1.6

Parameters
----------
y_true : array-like of shape (n_samples,)
True targets of binary classification.

y_score : array-like of shape (n_samples,)
Estimated probabilities or output of a decision function.

scoring : callable
The objective metric to be estimated. It should be a scorer object created
with :func:`~sklearn.metrics.make_scorer`.
# TODO(Carlo): also accept a plain metric function and, in that case,
# wrap it in a scorer inside this function.

thresholds : int or array-like, default=100
The decision thresholds at which to compute the score. If an integer, that
many thresholds are generated, uniformly spaced between the minimum and
maximum of `y_score`. If an array-like, its values are used directly as the
thresholds.

Returns
-------
metric_values : ndarray of shape (n_thresholds,)
The score associated with each threshold: at index i, the value of the
threshold-dependent metric for predictions with score >= thresholds[i].
# TODO(Carlo): check whether the comparison is > or >=.

thresholds : ndarray of shape (n_thresholds,)
Ascending score values used as thresholds.

See Also
--------
precision_recall_curve : Compute precision-recall pairs for different
probability thresholds.
det_curve : Compute error rates for different probability thresholds.
roc_curve : Compute Receiver operating characteristic (ROC) curve.

Examples
--------
# TODO(Carlo): change the example and fix the thresholds.
>>> import numpy as np
>>> from sklearn.metrics import accuracy_score, decision_threshold_curve
>>> y_true = np.array([0, 0, 1, 1])
>>> y_score = np.array([0.1, 0.4, 0.35, 0.8])
>>> accuracy_values, thresholds = decision_threshold_curve(
... y_true, y_score, accuracy_score)
>>> thresholds
array([0.1 , 0.35, 0.4 , 0.8 ])
>>> accuracy_values
array([0.75, 0.5 , 0.75, 0.5 ])
"""
# TODO: if `scoring` is a plain function, transform it into a scorer here (is an estimator needed?)
curve_scorer = _CurveScorer.from_scorer(scoring, thresholds)
metric_values, thresholds = curve_scorer._score_given_prediction(y_score, y_true)

return metric_values, thresholds
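The public function above delegates the sweep to the private `_CurveScorer` shown in the next file. As a reading aid, here is an equivalent plain-numpy sketch of that logic; it is not the PR's code, it takes a bare metric instead of a scorer, and it assumes the `y_score >= threshold` convention that the docstring TODO still marks as unconfirmed.

# Standalone sketch of the threshold sweep: an int produces a uniform grid over
# the score range, an array-like is used as given; one metric value per point.
import numpy as np
from sklearn.metrics import accuracy_score


def metric_per_threshold(y_true, y_score, metric, thresholds=100):
    if isinstance(thresholds, int):
        grid = np.linspace(np.min(y_score), np.max(y_score), thresholds)
    else:
        grid = np.asarray(thresholds)
    values = np.array(
        [metric(y_true, (y_score >= th).astype(int)) for th in grid]
    )
    return values, grid


y_true = np.array([0, 0, 1, 1])
y_score = np.array([0.1, 0.4, 0.35, 0.8])
values, grid = metric_per_threshold(y_true, y_score, accuracy_score, thresholds=4)
# grid holds 4 evenly spaced thresholds from 0.1 to 0.8; values holds the
# accuracy obtained when treating y_score >= each threshold as the positive class.

Written this way, it is also clear why the docstring example cannot return a 4-element array under the default `thresholds=100`, which is what the TODO next to the Examples header flags.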
94 changes: 68 additions & 26 deletions sklearn/metrics/_scorer.py
@@ -28,6 +28,7 @@

from ..base import is_regressor
from ..utils import Bunch
from ..utils._encode import _unique
from ..utils._param_validation import HasMethods, Hidden, StrOptions, validate_params
from ..utils._response import _get_response_values
from ..utils.metadata_routing import (
@@ -1132,11 +1133,12 @@ class _CurveScorer(_BaseScorer):
uniformly distributed between the minimum and maximum predicted scores. If an
array-like, it will be used as the thresholds.

response_method : str
The method to call on the estimator to get the response values.
response_method : str, default=None
The method to call on the estimator to get the response values. If set to
`None`, the scorer can only be used on precomputed predictions via
`_score_given_prediction`; calling `_score` then raises a `ValueError`.
"""

def __init__(self, score_func, sign, kwargs, thresholds, response_method):
def __init__(self, score_func, sign, kwargs, thresholds, response_method=None):
super().__init__(
score_func=score_func,
sign=sign,
@@ -1146,19 +1148,68 @@ def __init__(self, score_func, sign, kwargs, thresholds, response_method):
self._thresholds = thresholds

@classmethod
def from_scorer(cls, scorer, response_method, thresholds):
def from_scorer(cls, scorer, thresholds, response_method=None):
"""Create a continuous scorer from a normal scorer."""
instance = cls(
score_func=scorer._score_func,
sign=scorer._sign,
response_method=response_method,
thresholds=thresholds,
response_method=response_method,
kwargs=scorer._kwargs,
)
# transfer the metadata request
instance._metadata_request = scorer._get_metadata_request()
return instance

# TODO(Carlo): Create tests for this function.
def _score_given_prediction(
self, y_score, y_true, classes=None, pos_label=None, **kwargs
):
"""Calculate the scores for given prediction values and true labels.

Parameters
----------
y_score : array-like of shape (n_samples,)
Predicted target scores.

y_true : array-like of shape (n_samples,)
Gold standard target values.

classes : array-like of shape (n_classes,), default=None
The class labels. If `None`, they are inferred from `y_true`.

pos_label : int, float, bool or str, default=None
The label of the positive class.

**kwargs : dict
Other parameters passed to the scorer.

Returns
-------
score_thresholds : ndarray of shape (thresholds,)
The scores associated with each threshold.

potential_thresholds : ndarray of shape (thresholds,)
The potential thresholds used to compute the scores.
"""
if classes is None:
classes = _unique(y_true)
pos_label = self._get_pos_label()
scoring_kwargs = {**self._kwargs, **kwargs}
if isinstance(self._thresholds, Integral):
potential_thresholds = np.linspace(
np.min(y_score), np.max(y_score), self._thresholds
)
else:
potential_thresholds = np.asarray(self._thresholds)
score_thresholds = [
self._sign
* self._score_func(
y_true,
_threshold_scores_to_class_labels(y_score, th, classes, pos_label),
**scoring_kwargs,
)
for th in potential_thresholds
]
return np.array(score_thresholds), potential_thresholds

def _score(self, method_caller, estimator, X, y_true, **kwargs):
"""Evaluate predicted target values for X relative to y_true.

@@ -1189,27 +1240,18 @@ def _score(self, method_caller, estimator, X, y_true, **kwargs):
potential_thresholds : ndarray of shape (thresholds,)
The potential thresholds used to compute the scores.
"""
pos_label = self._get_pos_label()
if self._response_method is None:
raise ValueError(
"If response_method is set to `None`, you can't use this method. "
"Use `_score_given_prediction` instead."
)
y_score = method_caller(
estimator, self._response_method, X, pos_label=pos_label
estimator, self._response_method, X, pos_label=self._get_pos_label()
)
classes = estimator.classes_

scoring_kwargs = {**self._kwargs, **kwargs}
if isinstance(self._thresholds, Integral):
potential_thresholds = np.linspace(
np.min(y_score), np.max(y_score), self._thresholds
)
else:
potential_thresholds = np.asarray(self._thresholds)
score_thresholds = [
self._sign
* self._score_func(
y_true,
_threshold_scores_to_class_labels(
y_score, th, estimator.classes_, pos_label
),
**scoring_kwargs,
)
for th in potential_thresholds
]
return np.array(score_thresholds), potential_thresholds
scores, potential_thresholds = self._score_given_prediction(
y_score, y_true, classes, **kwargs
)

return scores, potential_thresholds
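To summarize the refactor in this file: the threshold loop that previously lived in `_score` now sits in `_score_given_prediction`, so it can be reused on precomputed predictions without an estimator. Below is a hedged sketch of the two intended call paths; `_CurveScorer` is private API and these names may change before merge.

# Sketch of the two call paths after this refactor (private API, subject to change).
import numpy as np
from sklearn.metrics import balanced_accuracy_score, make_scorer
from sklearn.metrics._scorer import _CurveScorer

y_true = np.array([0, 0, 1, 1])
y_score = np.array([0.2, 0.4, 0.6, 0.8])

# Path 1 (new, used by decision_threshold_curve): no estimator, response_method
# stays None, and precomputed scores go straight to _score_given_prediction.
curve_scorer = _CurveScorer.from_scorer(
    make_scorer(balanced_accuracy_score), thresholds=5
)
scores, thresholds = curve_scorer._score_given_prediction(y_score, y_true)

# Path 2 (pre-existing): estimator-based scoring through _score. With this diff
# it requires response_method to be set; otherwise _score raises a ValueError
# pointing the caller to _score_given_prediction.

Sharing the loop this way keeps `decision_threshold_curve` from duplicating the linspace-and-score logic that the estimator-based path already needs.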
133 changes: 133 additions & 0 deletions sklearn/metrics/tests/test_decision_threshold.py
@@ -0,0 +1,133 @@
from functools import partial

import pytest

from sklearn.metrics import (
accuracy_score,
f1_score,
fbeta_score,
precision_score,
recall_score,
)


# TODO(Carlo): Update tests.
def test_grid_int_bigger_than_set_then_all():
# """When `thresholds` parameter is bigger than the number of unique
# `y_score` then `len(thresholds)` should be equal to `len(set(y_score))`.
# """

# X, y = make_classification()
# clf = RandomForestClassifier(n_estimators=10, random_state=42).fit(X, y)
# y_score = clf.predict_proba(X)[:, 1]

# _, thresholds_big_int = decision_threshold_curve(
# y, y_score, accuracy_score, thresholds=len(set(y_score)) + 1000
# )

# assert len(thresholds_big_int) == len(set(y_score))
assert True


def test_binary_clf_curve_multiclass_error():
# rng = check_random_state(404)
# y_true = rng.randint(0, 3, size=10)
# y_pred = rng.rand(10)
# msg = "In a multiclass scenario, you must pass "
# with pytest.raises(ValueError, match=msg):
# decision_threshold_curve(y_true, y_pred, accuracy_score)
assert True


@pytest.mark.parametrize(
"metric",
[
# make_scorer(fbeta_score, beta=3),
# make_scorer(fbeta_score, beta=0.5),
f1_score,
precision_score,
recall_score,
accuracy_score,
],
)
def test_decision_threshold_curve_end_points(metric):
# rng = check_random_state(0)
# y_true = np.array([0] * 50 + [1] * 50)
# y_score = rng.normal(3, size=100)
# min_pred, max_score = min(y_score), max(y_score)

# metric_values, _ = decision_threshold_curve(y_true, y_score, metric)

# assert metric_values[0] == metric(y_true, (y_score > min_pred) * 1)
# assert metric_values[-1] == metric(y_true, (y_score > max_score) * 1)
assert True


@pytest.mark.parametrize(
"metric",
[partial(fbeta_score, beta=3), precision_score, recall_score],
)
def test_zero_sample_weight_equals_excluding(metric):
# rng = check_random_state(0)
# y_true = np.array([0] * 50 + [1] * 50)
# y_score = rng.normal(3, size=100)

# sample_weight = np.array([0] * 20 + [1] * 80)
# scoring_kwargs = {"sample_weight": sample_weight}
# metric_values_sw, _ = decision_threshold_curve(
# y_true, y_score, metric, scoring_kwargs=scoring_kwargs
# )

# y_true_exclude = y_true[sample_weight != 0]
# y_score_exclude = y_score[sample_weight != 0]
# metric_values_exclude, _ = decision_threshold_curve(
# y_true_exclude, y_score_exclude, metric
# )

# assert_allclose(metric_values_sw, metric_values_exclude)
assert True


def test_len_of_threshold_when_passing_int():
# y = [0] * 500 + [1] * 500
# y_score = list(range(1000))
# _, thresholds = decision_threshold_curve(
# y, y_score, accuracy_score, thresholds=13
# )

# assert len(thresholds) == 13
assert True


@pytest.mark.parametrize(
"metric, scoring_kwargs",
[
(f1_score, None),
(f1_score, {}),
(fbeta_score, {"beta": 4}),
],
)
def test_scoring_kwargs(metric, scoring_kwargs):
# y_true = np.array([0] * 50 + [1] * 50)
# decision_threshold_curve(y_true, y_true, metric, scoring_kwargs=scoring_kwargs)
assert True


def test_passing_the_grid():
# y = [0] * 500 + [1] * 500
# y_score = list(range(1000))

# grid_sorted = np.array(list(range(200, 300)))
# _, thresholds_sorted = decision_threshold_curve(
# y, y_score, accuracy_score, thresholds=grid_sorted
# )

# assert_allclose(grid_sorted, thresholds_sorted)

# grid_not_sorted = grid_sorted[::-1]
# _, thresholds_not_sorted = decision_threshold_curve(
# y, y_score, accuracy_score, thresholds=grid_not_sorted
# )

# assert_allclose(grid_sorted, thresholds_not_sorted)
assert True
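Every test in this new file is currently a commented-out body ending in `assert True`, per the `TODO(Carlo): Update tests.` note. As an illustration of the direction, here is a hedged sketch of one of them re-enabled against the public function this PR adds; it assumes `scoring` takes a `make_scorer` object, which differs from the raw-metric calls in the commented-out code and is exactly the kind of detail the TODO leaves open.

# Sketch of a re-enabled test_len_of_threshold_when_passing_int, assuming the
# make_scorer-based `scoring` argument described in the new docstring.
import numpy as np
from sklearn.metrics import accuracy_score, decision_threshold_curve, make_scorer


def test_len_of_threshold_when_passing_int():
    y_true = np.array([0] * 500 + [1] * 500)
    y_score = np.linspace(0, 1, 1000)

    _, thresholds = decision_threshold_curve(
        y_true, y_score, make_scorer(accuracy_score), thresholds=13
    )

    # An integer `thresholds` should yield exactly that many grid points.
    assert len(thresholds) == 13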