
FEA Implementation of "threshold-dependent metric per threshold value" curve #25639

Open: wants to merge 35 commits into base: main

35 commits
172ac47
initial proposal with preliminary tests
vitaliset Feb 18, 2023
d038e11
removing check that validate_params already does
vitaliset Feb 18, 2023
322eccf
changelog and linting from CI
vitaliset Feb 18, 2023
7dbbec5
trying to resolve doc related ci
vitaliset Feb 18, 2023
2a0c6b3
duplicate label
vitaliset Feb 18, 2023
fbb9b9b
docstring example import error
vitaliset Feb 18, 2023
acb94be
docstring typo
vitaliset Feb 18, 2023
a5cd201
docstring typo
vitaliset Feb 18, 2023
253b3e2
docstring typo
vitaliset Feb 18, 2023
cb5fee1
docstring typo
vitaliset Feb 18, 2023
9e45e2e
change in doc order and typos
vitaliset Feb 18, 2023
ad901a2
removing example
vitaliset Feb 20, 2023
1a4ce1b
Merge branch 'main' into metric_threshold_curve
vitaliset May 14, 2023
9b4febb
Update import of _check_pos_label_consistency
vitaliset May 14, 2023
119db53
codecov
vitaliset May 14, 2023
347f524
Merge branch 'metric_threshold_curve' of https://github.com/vitaliset…
vitaliset May 14, 2023
be893c8
linting
vitaliset May 14, 2023
bd1e64f
correcting typo
vitaliset May 14, 2023
0318950
test typo
vitaliset May 14, 2023
efd6d72
add example again to check pytest
vitaliset May 14, 2023
1e500c0
Merge branch 'main' into metric_threshold_curve
vitaliset May 16, 2023
10ebc90
Merge remote-tracking branch 'origin/main' into pr/vitaliset/25639
glemaitre May 20, 2024
dfa66a5
fixing imports
glemaitre May 20, 2024
1fb1c13
towards glemaitre suggestions
vitaliset May 22, 2024
e7bb2a7
applying black suggestions
vitaliset Jun 8, 2024
5a8f0c5
update extra stuff for consistency
vitaliset Jun 8, 2024
4fab2a3
removing doc files for now as we need to adapt to pr 29038
vitaliset Jun 8, 2024
48a0055
Merge branch 'main' into metric_threshold_curve
vitaliset Jun 8, 2024
fbf1d2e
Merge branch 'main' into metric_threshold_curve
vitaliset Jul 25, 2024
98873e6
Merge branch 'main' into metric_threshold_curve
vitaliset Jul 30, 2024
f1dc0e8
Update _decision_threshold.py to add authors
vitaliset Jul 30, 2024
0284251
towards using _curvescorer in the new decision threshold function. mi…
vitaliset Jul 30, 2024
d46bc1a
correcting circular dependences
vitaliset Jul 30, 2024
0a06199
Merge branch 'main' into metric_threshold_curve
vitaliset Aug 23, 2024
a424c3e
trying to solve the circular imports. looks like the order of init is…
vitaliset Sep 30, 2024
5 changes: 5 additions & 0 deletions doc/whats_new/v1.6.rst
@@ -248,6 +248,11 @@ Changelog
whether to raise an exception if a subset of the scorers in multimetric scoring fails
or to return an error code. :pr:`28992` by :user:`Stefanie Senger <StefanieSenger>`.

- |MajorFeature| :func:`metrics.decision_threshold_curve` has been added to
measure how a threshold-dependent metric varies with the decision threshold
used by a binary classifier. :pr:`25639` by
:user:`Carlo Lemos <vitaliset>`.

- |Enhancement| Adds `zero_division` to :func:`cohen_kappa_score`. When there is a
division by zero, the metric is undefined and this value is returned.
:pr:`29210` by :user:`Marc Torrellas Socastro <marctorsoc>` and
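As a reading aid for the changelog entry above, here is a minimal usage sketch of the API this PR adds. It is not part of the diff: it assumes the public name `sklearn.metrics.decision_threshold_curve` and a `scoring` argument built with `make_scorer`, as the new docstring specifies, and the exact signature may still change during review.

# Sketch only: sweep 100 thresholds between min(y_score) and max(y_score) on an
# imbalanced toy problem and locate the cutoff maximizing balanced accuracy.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, make_scorer
from sklearn.metrics import decision_threshold_curve  # added by this PR

X, y = make_classification(weights=[0.9, 0.1], random_state=0)
clf = LogisticRegression().fit(X, y)
y_score = clf.predict_proba(X)[:, 1]

scores, thresholds = decision_threshold_curve(
    y, y_score, make_scorer(balanced_accuracy_score), thresholds=100
)
best_threshold = thresholds[np.argmax(scores)]

This is the imbalanced, cost-sensitive use case the module docstring below motivates: a fixed 0.5 cutoff is rarely the best choice for balanced accuracy on skewed data.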
2 changes: 2 additions & 0 deletions sklearn/metrics/__init__.py
@@ -66,6 +66,7 @@
root_mean_squared_log_error,
)
from ._scorer import check_scoring, get_scorer, get_scorer_names, make_scorer
from ._decision_threshold import decision_threshold_curve
from .cluster import (
adjusted_mutual_info_score,
adjusted_rand_score,
@@ -117,6 +118,7 @@
"d2_log_loss_score",
"d2_pinball_score",
"dcg_score",
"decision_threshold_curve",
"davies_bouldin_score",
"DetCurveDisplay",
"det_curve",
99 changes: 99 additions & 0 deletions sklearn/metrics/_decision_threshold.py
@@ -0,0 +1,99 @@
"""Metric per threshold curve to assess binary classification performance.

Given a threshold grid, one can understand the behaviour of threshold-dependent
metrics as the decision threshold changes. In imbalanced or cost-sensitive
learning scenarios, a 0.5 threshold may not be optimal, and tools like this
help visualize how performance changes with the threshold.
"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

from numbers import Integral

from ..utils._param_validation import Interval, validate_params
from ._scorer import _CurveScorer


@validate_params(
{
"y_true": ["array-like"],
"y_score": ["array-like"],
"scoring": [callable],
"thresholds": [
Interval(Integral, 3, None, closed="left"),
"array-like",
None,
],
},
prefer_skip_nested_validation=True,
)
def decision_threshold_curve(
y_true,
y_score,
scoring,
thresholds=100,
):
"""Compute the threshold-dependent metric of interest per threshold.

Note: this implementation is restricted to the binary classification task.

Read more in the :ref:`User Guide <metric_threshold_curve>`.

.. versionadded:: 1.6

Parameters
----------
y_true : array-like of shape (n_samples,)
True targets of binary classification.

y_score : array-like of shape (n_samples,)
Estimated probabilities or output of a decision function.

scoring : callable
The objective metric to be estimated. It should be a scorer object created
with :func:`~sklearn.metrics.make_scorer`.
# TODO(Carlo): also accept a plain metric function and, in that case,
# wrap it in a scorer inside this function.

thresholds : int or array-like, default=100
The decision thresholds at which to compute the score. If an integer, that
many thresholds are generated, uniformly spaced between the minimum and
maximum of `y_score`. If an array-like, its values are used directly as the
thresholds.

Returns
-------
metric_values : ndarray of shape (n_thresholds,)
The score associated with each threshold: at index i, the value of the
threshold-dependent metric for predictions with score >= thresholds[i].
# TODO(Carlo): check whether the comparison is > or >=.

thresholds : ndarray of shape (n_thresholds,)
Ascending score values used as thresholds.

See Also
--------
precision_recall_curve : Compute precision-recall pairs for different
probability thresholds.
det_curve : Compute error rates for different probability thresholds.
roc_curve : Compute Receiver operating characteristic (ROC) curve.

Examples
--------
# TODO(Carlo): change the example and fix the thresholds.
>>> import numpy as np
>>> from sklearn.metrics import accuracy_score, decision_threshold_curve
>>> y_true = np.array([0, 0, 1, 1])
>>> y_score = np.array([0.1, 0.4, 0.35, 0.8])
>>> accuracy_values, thresholds = decision_threshold_curve(
... y_true, y_score, accuracy_score)
>>> thresholds
array([0.1 , 0.35, 0.4 , 0.8 ])
>>> accuracy_values
array([0.75, 0.5 , 0.75, 0.5 ])
"""
# TODO: if `scoring` is a plain function, transform it into a scorer here (is an estimator needed?)
curve_scorer = _CurveScorer.from_scorer(scoring, thresholds)
metric_values, thresholds = curve_scorer._score_given_prediction(y_score, y_true)

return metric_values, thresholds
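The public function above delegates the sweep to the private `_CurveScorer` shown in the next file. As a reading aid, here is an equivalent plain-numpy sketch of that logic; it is not the PR's code, it takes a bare metric instead of a scorer, and it assumes the `y_score >= threshold` convention that the docstring TODO still marks as unconfirmed.

# Standalone sketch of the threshold sweep: an int produces a uniform grid over
# the score range, an array-like is used as given; one metric value per point.
import numpy as np
from sklearn.metrics import accuracy_score


def metric_per_threshold(y_true, y_score, metric, thresholds=100):
    if isinstance(thresholds, int):
        grid = np.linspace(np.min(y_score), np.max(y_score), thresholds)
    else:
        grid = np.asarray(thresholds)
    values = np.array(
        [metric(y_true, (y_score >= th).astype(int)) for th in grid]
    )
    return values, grid


y_true = np.array([0, 0, 1, 1])
y_score = np.array([0.1, 0.4, 0.35, 0.8])
values, grid = metric_per_threshold(y_true, y_score, accuracy_score, thresholds=4)
# grid holds 4 evenly spaced thresholds from 0.1 to 0.8; values holds the
# accuracy obtained when treating y_score >= each threshold as the positive class.

Written this way, it is also clear why the docstring example cannot return a 4-element array under the default `thresholds=100`, which is what the TODO next to the Examples header flags.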
94 changes: 68 additions & 26 deletions sklearn/metrics/_scorer.py
@@ -28,6 +28,7 @@

from ..base import is_regressor
from ..utils import Bunch
from ..utils._encode import _unique
from ..utils._param_validation import HasMethods, Hidden, StrOptions, validate_params
from ..utils._response import _get_response_values
from ..utils.metadata_routing import (
@@ -1132,11 +1133,12 @@ class _CurveScorer(_BaseScorer):
uniformly distributed between the minimum and maximum predicted scores. If an
array-like, it will be used as the thresholds.

response_method : str
The method to call on the estimator to get the response values.
response_method : str, default=None
The method to call on the estimator to get the response values. If set to
`None`, the scorer can only be used on precomputed predictions via
`_score_given_prediction`; calling `_score` then raises a `ValueError`.
"""

def __init__(self, score_func, sign, kwargs, thresholds, response_method):
def __init__(self, score_func, sign, kwargs, thresholds, response_method=None):
super().__init__(
score_func=score_func,
sign=sign,
@@ -1146,19 +1148,68 @@ def __init__(self, score_func, sign, kwargs, thresholds, response_method):
self._thresholds = thresholds

@classmethod
def from_scorer(cls, scorer, response_method, thresholds):
def from_scorer(cls, scorer, thresholds, response_method=None):
"""Create a continuous scorer from a normal scorer."""
instance = cls(
score_func=scorer._score_func,
sign=scorer._sign,
response_method=response_method,
thresholds=thresholds,
response_method=response_method,
kwargs=scorer._kwargs,
)
# transfer the metadata request
instance._metadata_request = scorer._get_metadata_request()
return instance

# TODO(Carlo): Create tests for this function.
def _score_given_prediction(
self, y_score, y_true, classes=None, pos_label=None, **kwargs
):
"""Calculate the scores for given prediction values and true labels.

Parameters
----------
y_score : array-like of shape (n_samples,)
Predicted target scores.

y_true : array-like of shape (n_samples,)
Gold standard target values.

classes : array-like of shape (n_classes,), default=None
The class labels. If `None`, they are inferred from `y_true`.

pos_label : int, float, bool or str, default=None
The label of the positive class.

**kwargs : dict
Other parameters passed to the scorer.

Returns
-------
score_thresholds : ndarray of shape (thresholds,)
The scores associated with each threshold.

potential_thresholds : ndarray of shape (thresholds,)
The potential thresholds used to compute the scores.
"""
if classes is None:
classes = _unique(y_true)
pos_label = self._get_pos_label()
scoring_kwargs = {**self._kwargs, **kwargs}
if isinstance(self._thresholds, Integral):
potential_thresholds = np.linspace(
np.min(y_score), np.max(y_score), self._thresholds
)
else:
potential_thresholds = np.asarray(self._thresholds)
score_thresholds = [
self._sign
* self._score_func(
y_true,
_threshold_scores_to_class_labels(y_score, th, classes, pos_label),
**scoring_kwargs,
)
for th in potential_thresholds
]
return np.array(score_thresholds), potential_thresholds

def _score(self, method_caller, estimator, X, y_true, **kwargs):
"""Evaluate predicted target values for X relative to y_true.

@@ -1189,27 +1240,18 @@ def _score(self, method_caller, estimator, X, y_true, **kwargs):
potential_thresholds : ndarray of shape (thresholds,)
The potential thresholds used to compute the scores.
"""
pos_label = self._get_pos_label()
if self._response_method is None:
raise ValueError(
"If response_method is set to `None`, you can't use this method. "
"Use `_score_given_prediction` instead."
)
y_score = method_caller(
estimator, self._response_method, X, pos_label=pos_label
estimator, self._response_method, X, pos_label=self._get_pos_label()
)
classes = estimator.classes_

scoring_kwargs = {**self._kwargs, **kwargs}
if isinstance(self._thresholds, Integral):
potential_thresholds = np.linspace(
np.min(y_score), np.max(y_score), self._thresholds
)
else:
potential_thresholds = np.asarray(self._thresholds)
score_thresholds = [
self._sign
* self._score_func(
y_true,
_threshold_scores_to_class_labels(
y_score, th, estimator.classes_, pos_label
),
**scoring_kwargs,
)
for th in potential_thresholds
]
return np.array(score_thresholds), potential_thresholds
scores, potential_thresholds = self._score_given_prediction(
y_score, y_true, classes, **kwargs
)

return scores, potential_thresholds
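To summarize the refactor in this file: the threshold loop that previously lived in `_score` now sits in `_score_given_prediction`, so it can be reused on precomputed predictions without an estimator. Below is a hedged sketch of the two intended call paths; `_CurveScorer` is private API and these names may change before merge.

# Sketch of the two call paths after this refactor (private API, subject to change).
import numpy as np
from sklearn.metrics import balanced_accuracy_score, make_scorer
from sklearn.metrics._scorer import _CurveScorer

y_true = np.array([0, 0, 1, 1])
y_score = np.array([0.2, 0.4, 0.6, 0.8])

# Path 1 (new, used by decision_threshold_curve): no estimator, response_method
# stays None, and precomputed scores go straight to _score_given_prediction.
curve_scorer = _CurveScorer.from_scorer(
    make_scorer(balanced_accuracy_score), thresholds=5
)
scores, thresholds = curve_scorer._score_given_prediction(y_score, y_true)

# Path 2 (pre-existing): estimator-based scoring through _score. With this diff
# it requires response_method to be set; otherwise _score raises a ValueError
# pointing the caller to _score_given_prediction.

Sharing the loop this way keeps `decision_threshold_curve` from duplicating the linspace-and-score logic that the estimator-based path already needs.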
133 changes: 133 additions & 0 deletions sklearn/metrics/tests/test_decision_threshold.py
@@ -0,0 +1,133 @@
from functools import partial

import pytest

from sklearn.metrics import (
accuracy_score,
f1_score,
fbeta_score,
precision_score,
recall_score,
)


# TODO(Carlo): Update tests.
def test_grid_int_bigger_than_set_then_all():
# """When `thresholds` parameter is bigger than the number of unique
# `y_score` then `len(thresholds)` should be equal to `len(set(y_score))`.
# """

# X, y = make_classification()
# clf = RandomForestClassifier(n_estimators=10, random_state=42).fit(X, y)
# y_score = clf.predict_proba(X)[:, 1]

# _, thresholds_big_int = decision_threshold_curve(
# y, y_score, accuracy_score, thresholds=len(set(y_score)) + 1000
# )

# assert len(thresholds_big_int) == len(set(y_score))
assert True


def test_binary_clf_curve_multiclass_error():
# rng = check_random_state(404)
# y_true = rng.randint(0, 3, size=10)
# y_pred = rng.rand(10)
# msg = "In a multiclass scenario, you must pass "
# with pytest.raises(ValueError, match=msg):
# decision_threshold_curve(y_true, y_pred, accuracy_score)
assert True


@pytest.mark.parametrize(
"metric",
[
# make_scorer(fbeta_score, beta=3),
# make_scorer(fbeta_score, beta=0.5),
f1_score,
precision_score,
recall_score,
accuracy_score,
],
)
def test_decision_threshold_curve_end_points(metric):
# rng = check_random_state(0)
# y_true = np.array([0] * 50 + [1] * 50)
# y_score = rng.normal(3, size=100)
# min_pred, max_score = min(y_score), max(y_score)

# metric_values, _ = decision_threshold_curve(y_true, y_score, metric)

# assert metric_values[0] == metric(y_true, (y_score > min_pred) * 1)
# assert metric_values[-1] == metric(y_true, (y_score > max_score) * 1)
assert True


@pytest.mark.parametrize(
"metric",
[partial(fbeta_score, beta=3), precision_score, recall_score],
)
def test_zero_sample_weight_equals_excluding(metric):
# rng = check_random_state(0)
# y_true = np.array([0] * 50 + [1] * 50)
# y_score = rng.normal(3, size=100)

# sample_weight = np.array([0] * 20 + [1] * 80)
# scoring_kwargs = {"sample_weight": sample_weight}
# metric_values_sw, _ = decision_threshold_curve(
# y_true, y_score, metric, scoring_kwargs=scoring_kwargs
# )

# y_true_exclude = y_true[sample_weight != 0]
# y_score_exclude = y_score[sample_weight != 0]
# metric_values_exclude, _ = decision_threshold_curve(
# y_true_exclude, y_score_exclude, metric
# )

# assert_allclose(metric_values_sw, metric_values_exclude)
assert True


def test_len_of_threshold_when_passing_int():
# y = [0] * 500 + [1] * 500
# y_score = list(range(1000))
# _, thresholds = decision_threshold_curve(
# y, y_score, accuracy_score, thresholds=13
# )

# assert len(thresholds) == 13
assert True


@pytest.mark.parametrize(
"metric, scoring_kwargs",
[
(f1_score, None),
(f1_score, {}),
(fbeta_score, {"beta": 4}),
],
)
def test_scoring_kwargs(metric, scoring_kwargs):
# y_true = np.array([0] * 50 + [1] * 50)
# decision_threshold_curve(y_true, y_true, metric, scoring_kwargs=scoring_kwargs)
assert True


def test_passing_the_grid():
# y = [0] * 500 + [1] * 500
# y_score = list(range(1000))

# grid_sorted = np.array(list(range(200, 300)))
# _, thresholds_sorted = decision_threshold_curve(
# y, y_score, accuracy_score, thresholds=grid_sorted
# )

# assert_allclose(grid_sorted, thresholds_sorted)

# grid_not_sorted = grid_sorted[::-1]
# _, thresholds_not_sorted = decision_threshold_curve(
# y, y_score, accuracy_score, thresholds=grid_not_sorted
# )

# assert_allclose(grid_sorted, thresholds_not_sorted)
assert True
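Every test in this new file is currently a commented-out body ending in `assert True`, per the `TODO(Carlo): Update tests.` note. As an illustration of the direction, here is a hedged sketch of one of them re-enabled against the public function this PR adds; it assumes `scoring` takes a `make_scorer` object, which differs from the raw-metric calls in the commented-out code and is exactly the kind of detail the TODO leaves open.

# Sketch of a re-enabled test_len_of_threshold_when_passing_int, assuming the
# make_scorer-based `scoring` argument described in the new docstring.
import numpy as np
from sklearn.metrics import accuracy_score, decision_threshold_curve, make_scorer


def test_len_of_threshold_when_passing_int():
    y_true = np.array([0] * 500 + [1] * 500)
    y_score = np.linspace(0, 1, 1000)

    _, thresholds = decision_threshold_curve(
        y_true, y_score, make_scorer(accuracy_score), thresholds=13
    )

    # An integer `thresholds` should yield exactly that many grid points.
    assert len(thresholds) == 13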