diff --git a/doc/model_selection.rst b/doc/model_selection.rst index 04e41c454419e..46522073d2a5c 100644 --- a/doc/model_selection.rst +++ b/doc/model_selection.rst @@ -14,6 +14,7 @@ Model selection and evaluation modules/cross_validation modules/grid_search + modules/prediction modules/model_evaluation modules/model_persistence modules/learning_curve diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index f5a0e71e07d1c..969032f81323e 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1153,6 +1153,7 @@ Splitter Classes :toctree: generated/ :template: class.rst + model_selection.CutoffClassifier model_selection.GroupKFold model_selection.GroupShuffleSplit model_selection.KFold diff --git a/doc/modules/prediction.rst b/doc/modules/prediction.rst new file mode 100644 index 0000000000000..9065704f9f89d --- /dev/null +++ b/doc/modules/prediction.rst @@ -0,0 +1,34 @@ +.. currentmodule:: sklearn.model_selection + +.. _prediction_tuning: + +================================================ +Tuning of the decision threshold of an estimator +================================================ + +The real-valued decision functions, i.e. `decision_function` and +`predict_proba`, of machine-learning classifiers carry the inherited biases of +the fitted model; e.g, in a class imbalanced setting, a classifier +will naturally lean toward the most frequent class. In some other cases, the +generic objective function used to train a model is generally unaware of the +evaluation criteria used to evaluate the model; e.g., one might want to +penalized differently a false-positive and false-negative ---it will be less +detrimental to show an MR image without a cancer (i.e., false-positive) to a +radiologist than hidding one with a cancer (i.e, false-negtative) when +developing some computer-aided diagnosis system. + +In a binary classification scenario, the hard-prediction, i.e. `predict`, for a +classifier most commonly use the `predict_proba` and apply a decision threshold +at 0.5 to output a positive or negative label. Thus, this hard-prediction +suffers from the same drawbacks than the one raised in the above paragraph. + +Post-tuning of the decision threshold +===================================== + +:class:`CutoffClassifier` allows for post-tuning the decision threshold using +either `decision_function` or `predict_proba` and an objective metric for which +we want our threshold to be optimized for. + +Fine-tune using a single objective metric +----------------------------------------- + diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 04e10be72cd54..392ec18ea2d17 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -611,6 +611,12 @@ Changelog be removed in 0.25. :pr:`16401` by :user:`Arie Pratama Sutiono ` +- |MajorFeature| :class:`model_selection.CutoffClassifier` calibrates the + decision threshold function of a classifier by maximizing a binary + classification metric through cross-validation. + :pr:`16525` by :user:`Guillaume Lemaitre ` and + :user:`Prokopis Gryllos `. + :mod:`sklearn.multioutput` .......................... diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index aaf86a2f0576d..d5a39076e829f 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -314,6 +314,13 @@ Changelog class to be used when computing the roc auc statistics. :pr:`17651` by :user:`Clara Matos `. +- |Fix| Fix scorers that accept a pos_label parameter and compute their metrics + from values returned by `decision_function` or `predict_proba`. Previously, + they would return erroneous values when pos_label was not corresponding to + `classifier.classes_[1]`. This is especially important when training + classifiers directly with string labeled target classes. + :pr:`#18114` by :user:`Guillaume Lemaitre `. + :mod:`sklearn.model_selection` .............................. diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 0ffafc0e56c61..d06f755f61e13 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -1251,7 +1251,7 @@ def _check_set_wise_labels(y_true, y_pred, average, labels, pos_label): str(average_options)) y_type, y_true, y_pred = _check_targets(y_true, y_pred) - present_labels = unique_labels(y_true, y_pred) + present_labels = unique_labels(y_true, y_pred).tolist() if average == 'binary': if y_type == 'binary': if pos_label not in present_labels: diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 9ad57f4611e52..efd225038bf57 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -127,6 +127,48 @@ def __init__(self, score_func, sign, kwargs): self._score_func = score_func self._sign = sign + @staticmethod + def _check_pos_label(pos_label, classes): + if pos_label not in list(classes): + raise ValueError( + f"pos_label={pos_label} is not a valid label: {classes}" + ) + + def _check_decision_function(self, y_pred, classes): + """Reverse the decision function depending of pos_label.""" + pos_label = self._kwargs.get("pos_label", classes[1]) + self._check_pos_label(pos_label, classes) + if pos_label == classes[0]: + # The implicit positive class of the binary classifier + # does not match `pos_label`: we need to invert the + # predictions + y_pred *= -1 + + return y_pred + + def _select_proba(self, y_pred, classes, support_multi_class): + """Select the column of y_pred when probabilities are provided.""" + if y_pred.shape[1] == 2: + pos_label = self._kwargs.get("pos_label", classes[1]) + self._check_pos_label(pos_label, classes) + col_idx = np.flatnonzero(classes == pos_label)[0] + y_pred = y_pred[:, col_idx] + else: + err_msg = ( + f"Got predict_proba of shape {y_pred.shape}, but need " + f"classifier with two classes for {self._score_func.__name__} " + f"scoring" + ) + if support_multi_class and y_pred.shape[1] == 1: + # In _ProbaScorer, y_true can be tagged as binary while the + # y_pred is multi_class. This case is supported when label is + # provided. + raise ValueError(err_msg) + elif not support_multi_class: + raise ValueError(err_msg) + + return y_pred + def __repr__(self): kwargs_string = "".join([", %s=%s" % (str(k), str(v)) for k, v in self._kwargs.items()]) @@ -238,13 +280,9 @@ def _score(self, method_caller, clf, X, y, sample_weight=None): y_type = type_of_target(y) y_pred = method_caller(clf, "predict_proba", X) if y_type == "binary": - if y_pred.shape[1] == 2: - y_pred = y_pred[:, 1] - elif y_pred.shape[1] == 1: # not multiclass - raise ValueError('got predict_proba of shape {},' - ' but need classifier with two' - ' classes for {} scoring'.format( - y_pred.shape, self._score_func.__name__)) + y_pred = self._select_proba( + y_pred, clf.classes_, support_multi_class=True + ) if sample_weight is not None: return self._sign * self._score_func(y, y_pred, sample_weight=sample_weight, @@ -298,22 +336,21 @@ def _score(self, method_caller, clf, X, y, sample_weight=None): try: y_pred = method_caller(clf, "decision_function", X) - # For multi-output multi-class estimator if isinstance(y_pred, list): + # For multi-output multi-class estimator y_pred = np.vstack([p for p in y_pred]).T + elif y_type == "binary": + y_pred = self._check_decision_function( + y_pred, clf.classes_ + ) except (NotImplementedError, AttributeError): y_pred = method_caller(clf, "predict_proba", X) if y_type == "binary": - if y_pred.shape[1] == 2: - y_pred = y_pred[:, 1] - else: - raise ValueError('got predict_proba of shape {},' - ' but need classifier with two' - ' classes for {} scoring'.format( - y_pred.shape, - self._score_func.__name__)) + y_pred = self._select_proba( + y_pred, clf.classes_, support_multi_class=False, + ) elif isinstance(y_pred, list): y_pred = np.vstack([p[:, -1] for p in y_pred]).T diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 6677f3119dacd..e093c4107a5b0 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -4,7 +4,6 @@ from itertools import chain from itertools import permutations import warnings -import re import numpy as np from scipy import linalg @@ -1247,7 +1246,7 @@ def test_multilabel_hamming_loss(): def test_jaccard_score_validation(): y_true = np.array([0, 1, 0, 1, 1]) y_pred = np.array([0, 1, 0, 1, 1]) - err_msg = r"pos_label=2 is not a valid label: array\(\[0, 1\]\)" + err_msg = r"pos_label=2 is not a valid label: \[0, 1\]" with pytest.raises(ValueError, match=err_msg): jaccard_score(y_true, y_pred, average='binary', pos_label=2) @@ -2262,9 +2261,12 @@ def test_brier_score_loss(): # ensure to raise an error for multiclass y_true y_true = np.array([0, 1, 2, 0]) y_pred = np.array([0.8, 0.6, 0.4, 0.2]) - error_message = ("Only binary classification is supported. Labels " - "in y_true: {}".format(np.array([0, 1, 2]))) - with pytest.raises(ValueError, match=re.escape(error_message)): + error_message = ( + r"Only binary classification is supported. Labels in y_true: " + r"\[0 1 2\]" + ) + + with pytest.raises(ValueError, match=error_message): brier_score_loss(y_true, y_pred) # calculate correctly when there's only one class in y_true diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 67900b7cb77c3..52bafb160bfdb 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -1,3 +1,4 @@ +from copy import deepcopy import pickle import tempfile import shutil @@ -16,9 +17,18 @@ from sklearn.utils._testing import ignore_warnings from sklearn.base import BaseEstimator -from sklearn.metrics import (f1_score, r2_score, roc_auc_score, fbeta_score, - log_loss, precision_score, recall_score, - jaccard_score) +from sklearn.metrics import ( + average_precision_score, + brier_score_loss, + f1_score, + fbeta_score, + jaccard_score, + log_loss, + precision_score, + r2_score, + recall_score, + roc_auc_score, +) from sklearn.metrics import cluster as cluster_module from sklearn.metrics import check_scoring from sklearn.metrics._scorer import (_PredictScorer, _passthrough_scorer, @@ -618,6 +628,8 @@ def test_multimetric_scorer_calls_method_once(scorers, expected_predict_count, mock_est.predict = predict_func mock_est.predict_proba = predict_proba_func mock_est.decision_function = decision_function_func + # add the classes that would be found during fit + mock_est.classes_ = np.array([0, 1]) scorer_dict = _check_multimetric_scoring(LogisticRegression(), scorers) multi_scorer = _MultimetricScorer(**scorer_dict) @@ -747,3 +759,188 @@ def test_multiclass_roc_no_proba_scorer_errors(scorer_name): msg = "'Perceptron' object has no attribute 'predict_proba'" with pytest.raises(AttributeError, match=msg): scorer(lr, X, y) + + +@pytest.fixture +def string_labeled_classification_problem(): + """Train a classifier on binary problem with string target. + + The classifier is trained on a binary classification problem where the + minority class of interest has a string label that is intentionally not the + greatest class label using the lexicographic order. + + In addition, the dataset is imbalanced to better identify problems when + using non-symmetric performance metrics such as f1-score, average precision + and so on. + + Returns + ------- + classifier : estimator object + Trained classifier on the binary problem. + X_test : ndarray of shape (n_samples, n_features) + Data to be used as testing set in tests. + y_test : ndarray of shape (n_samples,), dtype=object + Binary target where labels are strings. + y_pred : ndarray of shape (n_samples,), dtype=object + Prediction of `classifier` when predicting for `X_test`. + y_pred_proba : ndarray of shape (n_samples, 2), dtype=np.float64 + Probabilities of `classifier` when predicting for `X_test`. + y_pred_decision : ndarray of shape (n_samples,), dtype=np.float64 + Decision function values of `classifier` when predicting on `X_test`. + """ + from sklearn.datasets import load_breast_cancer + from sklearn.utils import shuffle + + X, y = load_breast_cancer(return_X_y=True) + # create an highly imbalanced classification task + idx_positive = np.flatnonzero(y == 1) + idx_negative = np.flatnonzero(y == 0) + idx_selected = np.hstack([idx_negative, idx_positive[:25]]) + X, y = X[idx_selected], y[idx_selected] + X, y = shuffle(X, y, random_state=42) + # only use 2 features to make the problem even harder + X = X[:, :2] + y = np.array( + ["cancer" if c == 1 else "not cancer" for c in y], dtype=object + ) + X_train, X_test, y_train, y_test = train_test_split( + X, y, stratify=y, random_state=0, + ) + classifier = LogisticRegression().fit(X_train, y_train) + y_pred = classifier.predict(X_test) + y_pred_proba = classifier.predict_proba(X_test) + y_pred_decision = classifier.decision_function(X_test) + + return classifier, X_test, y_test, y_pred, y_pred_proba, y_pred_decision + + +def test_average_precision_pos_label(string_labeled_classification_problem): + # check that _ThresholdScorer will lead to the right score when passing + # `pos_label`. Currently, only `average_precision_score` is defined to + # be such a scorer. + clf, X_test, y_test, _, y_pred_proba, y_pred_decision = \ + string_labeled_classification_problem + + pos_label = "cancer" + # we need to select the positive column or reverse the decision values + y_pred_proba = y_pred_proba[:, 0] + y_pred_decision = y_pred_decision * -1 + assert clf.classes_[0] == pos_label + + # check that when calling the scoring function, probability estimates and + # decision values lead to the same results + ap_proba = average_precision_score( + y_test, y_pred_proba, pos_label=pos_label + ) + ap_decision_function = average_precision_score( + y_test, y_pred_decision, pos_label=pos_label + ) + assert ap_proba == pytest.approx(ap_decision_function) + + # create a scorer which would require to pass a `pos_label` + # check that it fails if `pos_label` is not provided + average_precision_scorer = make_scorer( + average_precision_score, needs_threshold=True, + ) + err_msg = "pos_label=1 is invalid. Set it to a label in y_true." + with pytest.raises(ValueError, match=err_msg): + average_precision_scorer(clf, X_test, y_test) + + # otherwise, the scorer should give the same results than calling the + # scoring function + average_precision_scorer = make_scorer( + average_precision_score, needs_threshold=True, pos_label=pos_label + ) + ap_scorer = average_precision_scorer(clf, X_test, y_test) + + assert ap_scorer == pytest.approx(ap_proba) + + # The above scorer call is using `clf.decision_function`. We will force + # it to use `clf.predict_proba`. + clf_without_predict_proba = deepcopy(clf) + + def _predict_proba(self, X): + raise NotImplementedError + + clf_without_predict_proba.predict_proba = partial( + _predict_proba, clf_without_predict_proba + ) + # sanity check + with pytest.raises(NotImplementedError): + clf_without_predict_proba.predict_proba(X_test) + + ap_scorer = average_precision_scorer( + clf_without_predict_proba, X_test, y_test + ) + assert ap_scorer == pytest.approx(ap_proba) + + +def test_brier_score_loss_pos_label(string_labeled_classification_problem): + # check that _ProbaScorer leads to the right score when `pos_label` is + # provided. Currently only the `brier_score_loss` is defined to be such + # a scorer. + clf, X_test, y_test, _, y_pred_proba, _ = \ + string_labeled_classification_problem + + pos_label = "cancer" + assert clf.classes_[0] == pos_label + + # brier score loss is symmetric + brier_pos_cancer = brier_score_loss( + y_test, y_pred_proba[:, 0], pos_label="cancer" + ) + brier_pos_not_cancer = brier_score_loss( + y_test, y_pred_proba[:, 1], pos_label="not cancer" + ) + assert brier_pos_cancer == pytest.approx(brier_pos_not_cancer) + + brier_scorer = make_scorer( + brier_score_loss, needs_proba=True, pos_label=pos_label, + ) + assert brier_scorer(clf, X_test, y_test) == pytest.approx(brier_pos_cancer) + + +@pytest.mark.parametrize( + "score_func", [f1_score, precision_score, recall_score, jaccard_score] +) +def test_non_symmetric_metric_pos_label( + score_func, string_labeled_classification_problem +): + # check that _PredictScorer leads to the right score when `pos_label` is + # provided. We check for all possible metric supported. + clf, X_test, y_test, y_pred, _, _ = string_labeled_classification_problem + + pos_label = "cancer" + assert clf.classes_[0] == pos_label + + score_pos_cancer = score_func(y_test, y_pred, pos_label="cancer") + score_pos_not_cancer = score_func(y_test, y_pred, pos_label="not cancer") + + assert score_pos_cancer != pytest.approx(score_pos_not_cancer) + + scorer = make_scorer(score_func, pos_label=pos_label) + assert scorer(clf, X_test, y_test) == pytest.approx(score_pos_cancer) + + +@pytest.mark.parametrize( + "scorer", + [ + make_scorer( + average_precision_score, needs_threshold=True, pos_label="xxx" + ), + make_scorer(brier_score_loss, needs_proba=True, pos_label="xxx"), + make_scorer(f1_score, pos_label="xxx") + ], + ids=["ThresholdScorer", "ProbaScorer", "PredictScorer"], +) +def test_scorer_select_proba_error(scorer): + # check that we raise the the proper error when passing an unknown + # pos_label + X, y = make_classification( + n_classes=2, n_informative=3, n_samples=20, random_state=0 + ) + lr = LogisticRegression(multi_class="multinomial").fit(X, y) + + err_msg = "is not a valid label" + with pytest.raises(ValueError, match=err_msg): + scorer(lr, X, y) diff --git a/sklearn/model_selection/__init__.py b/sklearn/model_selection/__init__.py index 82a9b9371710d..6f39c0bc1499d 100644 --- a/sklearn/model_selection/__init__.py +++ b/sklearn/model_selection/__init__.py @@ -29,7 +29,10 @@ from ._search import ParameterSampler from ._search import fit_grid_point +from ._prediction import CutoffClassifier + __all__ = ('BaseCrossValidator', + 'CutoffClassifier', 'GridSearchCV', 'TimeSeriesSplit', 'KFold', diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py new file mode 100644 index 0000000000000..7b329aee2ebd0 --- /dev/null +++ b/sklearn/model_selection/_prediction.py @@ -0,0 +1,497 @@ +from inspect import signature +import numbers + +import numpy as np +from joblib import Parallel, delayed + +from ._split import check_cv +from ._split import StratifiedShuffleSplit + +from ..base import clone +from ..base import BaseEstimator +from ..base import ClassifierMixin +from ..base import MetaEstimatorMixin +from ..metrics import balanced_accuracy_score +from ..metrics import check_scoring +from ..metrics import make_scorer +from ..metrics import roc_curve +from ..metrics._plot.base import _check_classifier_response_method +from ..metrics._scorer import _BaseScorer +from ..preprocessing import LabelEncoder +from ..utils import check_array +from ..utils import _safe_indexing +from ..utils.multiclass import check_classification_targets +from ..utils.multiclass import type_of_target +from ..utils.validation import check_is_fitted + + +class _ContinuousScorer(_BaseScorer): + def __init__(self, score_func, sign, response_method, kwargs): + super().__init__(score_func=score_func, sign=sign, kwargs=kwargs) + self.response_method = response_method + + def _score(self, method_caller, estimator, X, y_true, sample_weight=None): + """Evaluate predicted target values for X relative to y_true. + + Parameters + ---------- + method_caller : callable + Returns predictions given an estimator, method name, and other + arguments, potentially caching results. + + estimator : object + Trained estimator to use for scoring. Must have a predict_proba + method; the output of that is used to compute the score. + + X : {array-like, sparse matrix} + Test data that will be fed to estimator.predict. + + y_true : array-like + Gold standard target values for X. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + score : float + Score function applied to prediction of estimator on X. + """ + response_method = _check_classifier_response_method( + estimator=estimator, response_method=self.response_method + ) + y_score = response_method(X) + if response_method.__name__ == "decision_function": + y_score = self._check_decision_function( + y_score, estimator.classes_ + ) + else: + y_score = self._select_proba( + y_score, estimator.classes_, support_multi_class=False + ) + + # `np.unique` returned sorted array, thus no need to sort values + potential_thresholds = np.unique(y_score) + score_thresholds = [] + for th in potential_thresholds: + y_score_thresholded = estimator.classes_[ + (y_score >= th).astype(int) + ] + if sample_weight is not None: + score_thresholds.append( + self._sign + * self._score_func( + y_true, + y_score_thresholded, + sample_weight=sample_weight, + **self._kwargs, + ) + ) + else: + score_thresholds.append( + self._sign + * self._score_func( + y_true, y_score_thresholded, **self._kwargs + ) + ) + return np.array(potential_thresholds), np.array(score_thresholds) + + +class CutoffClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): + """Decision threshold calibration for binary classification. + + Estimator that calibrates the decision threshold (cutoff point) that is + used for prediction. The methods for picking cutoff points make use of + traditional binary classification evaluation statistics such as the true + positive and true negative rates or any metrics accepting true labels and + the output of a scoring function from a scikit-learn estimator. + + Parameters + ---------- + base_estimator : estimator object + The classifier, fitted or not fitted, from which we want to optimize + the decision threshold used during `predict`. + + objective_metric : {"tpr", "tnr"}, str or callable, \ + default="balanced_accuracy" + The objective metric to be optimized. Can be one of: + + * a string associated to a scoring function (see model evaluation + documentation); + * a scorer callable object / function with the signature + `metric(estimator, X, y)`; + * `"tpr"`: find the decision threshold for a true positive ratio (TPR) + of `objective_value`; + * `"tnr"`: find the decision threshold for a true negative ratio (TNR) + of `objective_value`. + + objective_value : float, default=None + The value associated with the `objective_metric` metric for which we + want to find the decision threshold when `objective_metric` is equal to + `"tpr"` or `"tnr"`. + + response_method : {"auto", "decision_function", "predict_proba"}, \ + default="auto" + Methods by the classifier `base_estimator` corresponding to the + decision function for which we want to find a threshold. It can be: + + * if `"auto"`, it will try to invoke, for each classifier, + `"predict_proba"` or `"decision_function"` in that order. + * otherwise, one of `"predict_proba"` or `"decision_function"`. + If the method is not implemented by the classifier, it will raise an + error. + + n_threshold : int, default=1000 + The number of decision threshold to use when discretizing the output + of the classifier `method`. + + cv : int, float, cross-validation generator, iterable or "prefit", \ + default=None + Determines the cross-validation splitting strategy used in + `cross_val_predict` to train classifier. Possible inputs for cv are: + + * None, to use the default 5-fold stratified K-fold cross validation; + * An integer number, to specify the number of folds in a stratified + k-fold; + * A float number, to specify a single shuffle split. The floating + number should be in (0, 1) and represent the size of the validation + set; + * An object to be used as a cross-validation generator; + * An iterable yielding train, test splits; + * "prefit", to bypass the cross-validation. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + refit : "auto" or bool, default="auto" + Whether or not to refit the classifier on the entire training set once + the decision threshold has been found. By default, `refit="auto"` is + equivalent to `refit=False` when `cv` is a float number using a single + shuffle split or `cv="prefit"` otherwise `refit=True` in all other + cases. Note that forcing `refit=False` on cross-validation having more + than a single split will raise an error. Similarly, `refit=True` in + conjunction with `cv="prefit"` will raise an error. + + random_state : int or RandomState, default=None + Controls the randomness of the training and testing indices produced + when `cv` is a single shuffle split (i.e., giving a float number). + See :term:`Glossary `. + + n_jobs : int, default=None + The number of jobs to run in parallel all `estimators` `fit`. + `None` means 1 unless in a `joblib.parallel_backend` context. -1 means + using all processors. See Glossary for more details. + + Attributes + ---------- + decision_threshold_ : float + The new decision threshold. + + classes_ : array of shape (n_classes,) + The class labels. + + Examples + -------- + First, we will load the breast cancer databases and make it highly + imbalanced. + + >>> import numpy as np + >>> from sklearn.datasets import load_breast_cancer + >>> X, y = load_breast_cancer(return_X_y=True) + >>> pos_idx = np.flatnonzero(y == 1)[:10].tolist() + >>> neg_idx = np.flatnonzero(y == 0).tolist() + >>> X, y = X[pos_idx + neg_idx, :], y[pos_idx + neg_idx] + + Then, we can split into a training and testing set and keep the + same imbalance level in both sets. + + >>> from sklearn.model_selection import train_test_split + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, stratify=y, random_state=0 + ... ) + + We can check the performance of a logistic regression model. + + >>> from sklearn.preprocessing import StandardScaler + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.pipeline import make_pipeline + >>> model = make_pipeline(StandardScaler(), LogisticRegression()) + >>> model.fit(X_train, y_train) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('logisticregression', LogisticRegression())]) + >>> from sklearn.metrics import balanced_accuracy_score + >>> y_pred = model.predict(X_test) + >>> print(f"Score: {balanced_accuracy_score(y_test, y_pred):.3f}") + Score: 0.833 + + We will try to correct the decision threshold which is impacted by the + class imbalanced. + + >>> from sklearn.model_selection import CutoffClassifier + >>> model_optimized = CutoffClassifier( + ... base_estimator=model, objective_metric=balanced_accuracy_score + ... ) + >>> model_optimized.fit(X, y) + CutoffClassifier(...) + >>> y_pred = model_optimized.predict(X_test) + >>> print(f"Score: {balanced_accuracy_score(y_test, y_pred):.3f}") + Score: 0.972 + """ + + def __init__( + self, + base_estimator, + objective_metric="balanced_accuracy", + objective_value=None, + response_method="auto", + n_threshold=1000, + cv=None, + refit="auto", + random_state=None, + n_jobs=None, + ): + self.base_estimator = base_estimator + self.objective_metric = objective_metric + self.objective_value = objective_value + self.response_method = response_method + self.n_threshold = n_threshold + self.cv = cv + self.refit = refit + self.random_state = random_state + self.n_jobs = n_jobs + + def _validate_parameters(self): + """Validate the input parameters.""" + if ( + self.objective_metric not in ("tpr", "tnr") + and self.objective_value is not None + ): + raise ValueError( + f"When 'objective_metric' is a scoring function, " + f"'objective_value' should be None. Got " + f"{self.objective_value} instead." + ) + + if ( + not isinstance(self.n_threshold, numbers.Integral) + or self.n_threshold < 0 + ): + raise ValueError( + f"'n_threshold' should be a strictly positive integer. " + f"Got {self.n_threshold} instead." + ) + + @staticmethod + def _validate_data(X, y): + y = check_array(y, ensure_2d=False, dtype=None) + check_classification_targets(y) + y_type = type_of_target(y) + if y_type != 'binary': + raise ValueError(f'Expected target of binary type. Got {y_type}.') + return X, y + + def _check_cv_refit(self, cv, refit, y, random_state): + if isinstance(cv, numbers.Real) and 0 < cv < 1: + cv = StratifiedShuffleSplit( + n_splits=1, test_size=cv, random_state=random_state + ) + refit = False if refit == "auto" else refit + elif cv == "prefit": + if refit is True: + raise ValueError("When cv='prefit', refit cannot be True.") + refit = False + else: + cv = check_cv(cv, y=y, classifier=True) + if refit is False: + raise ValueError( + "When cv has several folds, refit cannot be False" + ) + refit = True + return cv, refit + + @staticmethod + def _fit_and_score( + estimator, X, y, train_idx, val_idx, scorer, score_method + ): + if train_idx is not None: + X_train = _safe_indexing(X, train_idx) + X_val = _safe_indexing(X, val_idx) + y_train = _safe_indexing(y, train_idx) + y_val = _safe_indexing(y, val_idx) + + estimator.fit(X_train, y_train) + else: + X_val, y_val = X, y + + if score_method in ("tnr", "tpr"): + fpr, tpr, potential_thresholds = scorer(estimator, X_val, y_val) + score_thresholds = tpr + if score_method == "tnr": + score_thresholds = (1 - fpr)[::-1] + potential_thresholds = potential_thresholds[::-1] + else: + potential_thresholds, score_thresholds = scorer( + estimator, X_val, y_val + ) + + return potential_thresholds, score_thresholds + + @staticmethod + def _find_decision_threshold( + thresholds, scores, n_thresholds, objective_score + ): + min_threshold = np.min([th.min() for th in thresholds]) + max_threshold = np.max([th.max() for th in thresholds]) + ascending = thresholds[0].argmin() == 0 + start = min_threshold if ascending else max_threshold + stop = max_threshold if ascending else min_threshold + thresholds_interpolated = np.linspace(start, stop, num=n_thresholds) + mean_score = np.mean( + [np.interp(thresholds_interpolated, + thresholds[fold_idx], scores[fold_idx]) + for fold_idx in range(len(scores))], + axis=0 + ) + if objective_score == "highest": + threshold_idx = mean_score.argmax() + else: + threshold_idx = np.searchsorted(mean_score, objective_score) + return thresholds_interpolated[threshold_idx] + + def fit(self, X, y): + """Find the decision threshold. + + Parameters + ---------- + X : {array-like, sparse matrix, dataframe} of shape \ + (n_samples, n_features) + The training data. + + y : array-like of shape (n_samples,) + Target values. It should be a binary target. + + Returns + ------- + self : object + Returns an instance of self. + """ + X, y = self._validate_data(X, y) + + cv, refit = self._check_cv_refit( + self.cv, self.refit, y, self.random_state + ) + + # Start by fitting the final estimator + if refit: + self._estimator = clone(self.base_estimator).fit(X, y) + elif cv == "prefit": + check_is_fitted(self.base_estimator, attributes=["classes_"]) + self._estimator = self.base_estimator + else: # single shuffle split CV + train_idx, _ = next(cv.split(X, y)) + X_train = _safe_indexing(X, train_idx) + y_train = _safe_indexing(y, train_idx) + self._estimator = clone(self.base_estimator).fit(X_train, y_train) + + self.classes_ = self._estimator.classes_ + if len(self.classes_) == 1: + raise ValueError( + f"This classifier needs samples from 2 classes in the data " + f"to be trained but the data contains only the class: " + f"{self.classes_.item(0)}" + ) + + # delayed the parameters check until we have a fitted base estimator + # with known classes + self._validate_parameters() + + if cv == "prefit" or not refit: + model = self._estimator + splits = ([None, range(len(X))],) + else: + model = clone(self.base_estimator) + splits = cv.split(X, y) + + if self.objective_metric in ("tpr", "tnr"): + scoring = make_scorer(roc_curve, needs_threshold=True) + else: + scoring = check_scoring( + estimator=model, scoring=self.objective_metric + ) + if isinstance(scoring, _BaseScorer): + scoring = _ContinuousScorer( + score_func=scoring._score_func, + sign=scoring._sign, + response_method=self.response_method, + kwargs=scoring._kwargs, + ) + self._scorer = check_scoring(estimator=model, scoring=scoring) + + thresholds, scores = zip( + *Parallel(n_jobs=self.n_jobs)( + delayed(self._fit_and_score)( + model, + X, + y, + train_idx, + val_idx, + self._scorer, + self.objective_metric, + ) + for train_idx, val_idx in splits + ) + ) + + if self.objective_metric in ("tnr", "tpr"): + objective_value = self.objective_value + else: + objective_value = "highest" + self.decision_threshold_ = self._find_decision_threshold( + thresholds, scores, self.n_threshold, objective_value + ) + + return self + + def predict(self, X): + """Predict using the calibrated decision threshold + + Parameters + ---------- + X : {array-like, sparse matrix, dataframe} of shape \ + (n_samples, n_features) + The data matrix. + + Returns + ------- + C : ndarray of shape (n_samples,) + The predicted class. + """ + check_is_fitted(self) + + response_method = _check_classifier_response_method( + estimator=self._estimator, response_method=self.response_method + ) + + y_score = response_method(X) + if response_method.__name__ == "decision_function": + y_score = self._scorer._check_decision_function( + y_score, self.classes_ + ) + else: + y_score = self._scorer._select_proba( + y_score, self.classes_, support_multi_class=False + ) + y_class_indices = (y_score >= self.decision_threshold_).astype(int) + + return self.classes_[y_class_indices] + + def _more_tags(self): + return { + "binary_only": True, + "_xfail_test": { + "check_classifiers_classes": + "requires non default 'pos_label='two'' parameter", + "check_fit2d_1feature": + "requires non default 'pos_label=2' parameter", + } + } diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py new file mode 100644 index 0000000000000..cf0b975d10f1f --- /dev/null +++ b/sklearn/model_selection/tests/test_prediction.py @@ -0,0 +1,234 @@ +import numpy as np +import pytest + +from sklearn.base import BaseEstimator +from sklearn.datasets import load_breast_cancer +from sklearn.datasets import load_iris +from sklearn.datasets import make_classification +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import balanced_accuracy_score +from sklearn.metrics import f1_score +from sklearn.metrics import fbeta_score +from sklearn.metrics import make_scorer +from sklearn.model_selection import StratifiedShuffleSplit +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.svm import SVC +from sklearn.utils._testing import assert_array_equal +from sklearn.utils._testing import assert_allclose + +from sklearn.model_selection import CutoffClassifier + + +class MockNoPredictorClassifier(BaseEstimator): + """Classifier which does not predict.""" + def fit(self, X, y): + self.classes_ = np.array([0, 1]) + return self + + +# @pytest.mark.parametrize( +# "Estimator, params, err_type, err_msg", +# [ +# (LogisticRegression, {"method": "xxx"}, ValueError, +# "'method' should be one of"), +# (MockNoPredictorClassifier, {"method": "auto"}, TypeError, +# "'base_estimator' must implement one of the"), +# (SVC, {"method": "predict_proba"}, TypeError, +# "'base_estimator' does not implement predict_proba"), +# (LogisticRegression, +# {"objective_metric": "accuracy", "objective_value": 0.5}, ValueError, +# "When 'objective_metric' is a scoring function"), +# (LogisticRegression, {"cv": 1.5}, ValueError, "Got 1.5"), +# (LogisticRegression, {"refit": False}, ValueError, +# "When cv has several folds, refit cannot be False"), +# (LogisticRegression, {"cv": "prefit", "refit": True}, ValueError, +# "When cv='prefit', refit cannot be True."), +# (LogisticRegression, {"n_threshold": -10}, ValueError, +# "'n_threshold' should be a strictly positive integer."), +# (LogisticRegression, {"n_threshold": 10.5}, ValueError, +# "'n_threshold' should be a strictly positive integer."), +# ] +# ) +# def test_cutoffclassifier_valid_params_error(Estimator, params, err_type, +# err_msg): +# # check that the proper errors are raised with wrong parameters +# X, y = make_classification(n_samples=200, n_features=6, random_state=42, +# n_classes=2) +# with pytest.raises(err_type, match=err_msg): +# clf = CutoffClassifier(base_estimator=Estimator(), **params) +# clf.fit(X, y) + + +def test_cutoffclassifier_not_binary(): + # check that we only accept binary target + X, y = load_iris(return_X_y=True) + with pytest.raises(ValueError, match="Expected target of binary type."): + CutoffClassifier( + base_estimator=make_pipeline( + StandardScaler(), LogisticRegression() + ) + ).fit(X, y) + + +def test_cutoffclassifier_xxx(): + # check that an objective value of 0 give opposite predictions in with + # tpr and tnr + X, y = load_breast_cancer(return_X_y=True) + # replaces y by some strings + classes = np.array(["healthy", "cancer"], dtype=object) + y = classes[y] + clf = CutoffClassifier( + base_estimator=make_pipeline(StandardScaler(), LogisticRegression()), + ) + y_pred_tpr = clf.fit(X, y).predict(X) + + +def test_cutoffclassifier_limit_tpr_tnr(): + # check that an objective value of 0 give opposite predictions in with + # tpr and tnr + X, y = load_breast_cancer(return_X_y=True) + clf = CutoffClassifier( + base_estimator=make_pipeline(StandardScaler(), LogisticRegression()), + objective_metric="tpr", + objective_value=0, + ) + y_pred_tpr = clf.fit(X, y).predict(X) + clf.set_params(objective_metric="tnr") + y_pred_tnr = (~clf.fit(X, y).predict(X).astype(bool)).astype(int) + assert np.mean(y_pred_tnr == y_pred_tpr) > 0.98 + + +@pytest.mark.parametrize( + "response_method", ["auto", "decision_function", "predict_proba"] +) +def test_cutoffclassifier_with_objective_value(response_method): + # check that we can optimize a given metric as a callable + X, y = load_breast_cancer(return_X_y=True) + # remove feature to degrade performances + X = X[:, :5] + + # make the problem completely imbalanced such that the balanced accuracy + # is low + indices_pos = np.flatnonzero(y == 1) + indices_pos = indices_pos[:indices_pos.size // 50] + indices_neg = np.flatnonzero(y == 0) + + X = np.vstack([X[indices_neg], X[indices_pos]]) + y = np.hstack([y[indices_neg], y[indices_pos]]) + + lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y) + model = CutoffClassifier( + base_estimator=lr, + objective_metric="balanced_accuracy", + response_method=response_method, + ) + score_optimized = balanced_accuracy_score(y, model.fit(X, y).predict(X)) + score_baseline = balanced_accuracy_score(y, lr.predict(X)) + assert score_optimized > score_baseline + assert_array_equal(model.classes_, [0, 1]) + + +def test_cutoffclassifier_metric_with_parameter(): + # check that we can pass a metric with a parameter + # in addition check that f_beta with beta=1 is equivalent to f1 + X, y = load_breast_cancer(return_X_y=True) + lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y) + model_fbeta = CutoffClassifier( + base_estimator=lr, objective_metric=make_scorer(fbeta_score, beta=1), + ).fit(X, y) + model_f1 = CutoffClassifier( + base_estimator=lr, objective_metric=make_scorer(f1_score), + ).fit(X, y) + + assert (model_fbeta.decision_threshold_ == + pytest.approx(model_f1.decision_threshold_)) + + +def test_cutoffclassifier_pretrained_estimator(): + # check that passing a pre-trained estimator is equivalent to training it + # in the meta-estimator + X, y = load_breast_cancer(return_X_y=True) + + random_state = 0 + val_size = 0.2 + + cv = StratifiedShuffleSplit( + n_splits=1, test_size=val_size, random_state=random_state + ) + train_idx, val_idx = next(cv.split(X, y)) + X_train, X_val = X[train_idx], X[val_idx] + y_train, y_val = y[train_idx], y[val_idx] + + lr_prefit = make_pipeline(StandardScaler(), LogisticRegression()) + lr_prefit.fit(X_train, y_train) + lr = make_pipeline(StandardScaler(), LogisticRegression()) + + model_prefit = CutoffClassifier(base_estimator=lr_prefit, cv="prefit") + model = CutoffClassifier(base_estimator=lr, cv=val_size, random_state=0) + + model_prefit.fit(X_val, y_val) + model.fit(X, y) + + # FIXME: we should find the same decision threshold + # assert (model_prefit.decision_threshold_ == + # pytest.approx(model.decision_threshold_)) + + # The model coefficient of the 2 models should be close because they are + # fitted on the same training data + assert_allclose( + model_prefit._estimator[-1].coef_, model._estimator[-1].coef_ + ) + + # check that we did not make any clone/copy of the pretrained estimator + # when this is not required + assert model_prefit._estimator is lr_prefit + assert model._estimator is not lr + + +@pytest.mark.parametrize( + "response_method", ["auto", "decision_function", "predict_proba"] +) +@pytest.mark.parametrize( + "metric", + [make_scorer(balanced_accuracy_score), + make_scorer(f1_score, pos_label="cancer")] +) +@pytest.mark.parametrize("dtype", [None, object]) +def test_cutoffclassifier_with_string_targets(response_method, dtype, metric): + # check that targets represented by str are properly managed + # check with several metrics to be sure that `pos_label` is properly + # dispatched + X, y = load_breast_cancer(return_X_y=True) + # replaces y by some strings + classes = np.array(["healthy", "cancer"]) + if dtype is not None: + classes = classes.astype(dtype) + y = classes[y] + model = CutoffClassifier( + base_estimator=make_pipeline(StandardScaler(), LogisticRegression()), + objective_metric=metric, + response_method=response_method, + ).fit(X, y) + assert_array_equal(np.sort(model.classes_), np.sort(classes)) + y_pred = model.predict(X[[0], :]) + assert y_pred.item(0) in classes + + +@pytest.mark.parametrize( + "params", + [ + {"cv": 5, "refit": True}, + {"cv": 0.2, "refit": False}, + {"cv": 0.2, "refit": True}, + {"cv": "prefit"}, + ] +) +def test_tmp_fit(params): + X, y = load_breast_cancer(return_X_y=True) + + estimator = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y) + CutoffClassifier( + base_estimator=estimator, + **params + ).fit(X, y) diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index a48af83b15a7a..815c268efb93c 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -179,8 +179,9 @@ def test_fit_docstring_attributes(name, Estimator): attributes = doc['Attributes'] IGNORED = {'ClassifierChain', 'ColumnTransformer', 'CountVectorizer', - 'DictVectorizer', 'FeatureUnion', 'GaussianRandomProjection', - 'GridSearchCV', 'MultiOutputClassifier', 'MultiOutputRegressor', + 'CutoffClassifier', 'DictVectorizer', 'FeatureUnion', + 'GaussianRandomProjection', 'GridSearchCV', + 'MultiOutputClassifier', 'MultiOutputRegressor', 'NoSampleWeightWrapper', 'OneVsOneClassifier', 'OutputCodeClassifier', 'Pipeline', 'RFE', 'RFECV', 'RandomizedSearchCV', 'RegressorChain', diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 71a84537aabbf..186d20513a20f 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2553,7 +2553,7 @@ def check_classifier_data_not_an_array(name, estimator_orig, strict_mode=True): X = np.array([[3, 0], [0, 1], [0, 2], [1, 1], [1, 2], [2, 1], [0, 3], [1, 0], [2, 0], [4, 4], [2, 3], [3, 2]]) X = _pairwise_estimator_convert_X(X, estimator_orig) - y = np.array([1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2]) + y = np.array([1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0]) y = _enforce_estimator_tags_y(estimator_orig, y) for obj_type in ["NotAnArray", "PandasDataframe"]: check_estimators_data_not_an_array(name, estimator_orig, X, y,