From d3705d697351d38d2d20270a033159f858e8705a Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Sun, 23 Feb 2020 16:18:19 +0100
Subject: [PATCH 01/44] FEA add CutoffCalibration estimator

---
 doc/modules/calibration.rst       | 129 ++++++++++-
 doc/modules/classes.rst           |   6 +-
 sklearn/calibration.py            | 364 ++++++++++++++++++++++++++++++
 sklearn/tests/test_calibration.py | 229 ++++++++++++++++++-
 4 files changed, 714 insertions(+), 14 deletions(-)

diff --git a/doc/modules/calibration.rst b/doc/modules/calibration.rst
index 19df08ea3b1fe..daaf8c9a9545f 100644
--- a/doc/modules/calibration.rst
+++ b/doc/modules/calibration.rst
@@ -1,11 +1,15 @@
 .. _calibration:
 
-=======================
-Probability calibration
-=======================
+======================
+Prediction calibration
+======================
 
 .. currentmodule:: sklearn.calibration
 
+.. _probability_calibration:
+
+Probability calibration
+=======================
 
 When performing classification you often want not only to predict the class
 label, but also obtain a probability of the respective label. This probability
@@ -156,3 +160,122 @@ well a classifier is calibrated.
 
     .. [4] Transforming Classifier Scores into Accurate Multiclass
            Probability Estimates, B. Zadrozny & C. Elkan, (KDD 2002)
+
+.. _decision_threshold_calibration:
+
+Decision Threshold calibration
+==============================
+
+.. currentmodule:: sklearn.calibration
+
+Often Machine Learning classifiers base their
+predictions on real-valued decision functions or probability estimates that
+carry the inherited biases of their models. Additionally when using a machine
+learning model the evaluation criteria can differ from the optimisation
+objectives used by the model during training.
+
+When predicting between two classes it is commonly advised that an appropriate
+decision threshold is estimated based on some cutoff criteria rather than
+arbitrarily using the midpoint of the space of possible values. Estimating a
+decision threshold for a specific use case can help to increase the overall
+accuracy of the model and provide better handling for sensitive classes.
+
+.. currentmodule:: sklearn.calibration
+
+:class:`CutoffClassifier` can be used as a wrapper around a model for binary
+classification to help obtain a more appropriate decision threshold and use it
+for predicting new samples.
+
+Usage
+-----
+
+To use the :class:`CutoffClassifier` you need to provide an estimator that has
+a ``decision_function`` or a ``predict_proba`` method. The ``method``
+parameter controls whether the first will be preferred over the second if both
+are available.
+
+The wrapped estimator can be pre-trained, in which case ``cv = 'prefit'``, or
+not. If the classifier is not trained then a cross-validation loop specified by
+the parameter ``cv`` can be used to obtain a decision threshold by averaging
+all decision thresholds calculated on the hold-out parts of each cross
+validation iteration. Finally the model is trained on all the provided data.
+When using ``cv = 'prefit'`` you need to make sure to use a hold-out part of
+your data for calibration.
+
+The strategies, controlled by the parameter ``strategy``, for finding
+appropriate decision thresholds are based either on precision recall estimates
+or true positive and true negative rates. Specifically:
+
+.. currentmodule:: sklearn.metrics
+
+* ``f_beta``
+   selects a decision threshold that maximizes the :func:`fbeta_score`. The
+   value of beta is specified by the parameter ``beta``. The ``beta`` parameter
+   determines the weight of precision. When ``beta = 1`` both precision recall
+   get the same weight therefore the maximization target in this case is the
+   :func:`f1_score`. if ``beta < 1`` more weight is given to precision whereas
+   if ``beta > 1`` more weight is given to recall.
+
+* ``roc``
+   selects the decision threshold for the point on the :func:`roc_curve` that
+   is closest to the ideal corner (0, 1)
+
+* ``max_tpr``
+   selects the decision threshold for the point that yields the highest true
+   positive rate while maintaining a minimum true negative rate, specified by
+   the parameter ``threshold``
+
+* ``max_tnr``
+   selects the decision threshold for the point that yields the highest true
+   negative rate while maintaining a minimum true positive rate, specified by
+   the parameter ``threshold``
+
+Here is a simple usage example::
+
+   >>> from sklearn.calibration import CutoffClassifier
+   >>> from sklearn.datasets import load_breast_cancer
+   >>> from sklearn.naive_bayes import GaussianNB
+   >>> from sklearn.metrics import precision_score
+   >>> from sklearn.model_selection import train_test_split
+
+   >>> X, y = load_breast_cancer(return_X_y=True)
+   >>> X_train, X_test, y_train, y_test = train_test_split(
+   ...     X, y, train_size=0.6, random_state=42)
+   >>> clf = CutoffClassifier(GaussianNB(), strategy='f_beta', beta=0.6,
+   ...                        cv=3).fit(X_train, y_train)
+   >>> y_pred = clf.predict(X_test)
+   >>> precision_score(y_test, y_pred)                   # doctest: +ELLIPSIS
+   0.959...
+
+.. topic:: Examples:
+
+ * :ref:`sphx_glr_auto_examples_calibration_plot_decision_threshold_calibration.py`: Decision
+   threshold calibration on the breast cancer dataset
+
+.. currentmodule:: sklearn.calibration
+
+The following image shows the results of using the :class:`CutoffClassifier`
+for finding a decision threshold for a :class:`LogisticRegression` classifier
+and an :class:`AdaBoostClassifier` for two use cases.
+
+.. figure:: ../auto_examples/calibration/images/sphx_glr_plot_decision_threshold_calibration_001.png
+   :target: ../auto_examples/calibration/plot_decision_threshold_calibration.html
+   :align: center
+
+In the first case we want to increase the overall accuracy of the classifier on
+the breast cancer dataset. In the second case we want to find a decision
+threshold that yields maximum true positive rate while maintaining a minimum
+value for the true negative rate.
+
+.. topic:: References:
+
+    * Receiver-operating characteristic (ROC) plots: a fundamental
+      evaluation tool in clinical medicine, MH Zweig, G Campbell -
+      Clinical chemistry, 1993
+
+Notes
+-----
+
+Calibrating the decision threshold of a classifier does not guarantee increased
+performance. The generalisation ability of the obtained decision threshold has
+to be evaluated.
diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index 752b41151fca0..2f1d8bba7e653 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -53,8 +53,8 @@ Functions
 
 .. _calibration_ref:
 
-:mod:`sklearn.calibration`: Probability Calibration
-===================================================
+:mod:`sklearn.calibration`: Prediction Calibration
+==================================================
 
 .. automodule:: sklearn.calibration
    :no-members:
@@ -69,7 +69,7 @@ Functions
    :template: class.rst
 
    calibration.CalibratedClassifierCV
-
+   calibration.CutoffClassifier
 
 .. autosummary::
    :toctree: generated/
diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index ff9c4b3e75c44..8692e628bbb8a 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -20,13 +20,17 @@
 
 from .base import (BaseEstimator, ClassifierMixin, RegressorMixin, clone,
                    MetaEstimatorMixin)
+from .base import clone
 from .preprocessing import label_binarize, LabelBinarizer
 from .utils import check_X_y, check_array, indexable, column_or_1d
 from .utils.validation import check_is_fitted, check_consistent_length
 from .utils.validation import _check_sample_weight
 from .isotonic import IsotonicRegression
 from .svm import LinearSVC
+from .metrics import precision_recall_curve
+from .metrics import roc_curve
 from .model_selection import check_cv
+from .utils.multiclass import type_of_target
 from .utils.validation import _deprecate_positional_args
 
 
@@ -597,3 +601,363 @@ def calibration_curve(y_true, y_prob, normalize=False, n_bins=5,
     prob_pred = bin_sums[nonzero] / bin_total[nonzero]
 
     return prob_true, prob_pred
+
+
+class CutoffClassifier(BaseEstimator, ClassifierMixin, MetaEstimatorMixin):
+    """Decision threshold calibration for binary classification
+
+    Meta estimator that calibrates the decision threshold (cutoff point)
+    that is used for prediction. The methods for picking cutoff points make use
+    of traditional binary classification evaluation statistics such as the
+    true positive and true negative rates and F-scores.
+
+    If cv="prefit" the base estimator is assumed to be fitted and all data will
+    be used for the selection of the cutoff point. Otherwise the decision
+    threshold is calculated as the average of the thresholds resulting from the
+    cross-validation loop.
+
+    Parameters
+    ----------
+    base_estimator : obj
+        The binary classifier whose decision threshold will be adapted
+        according to the acquired cutoff point. The estimator must have a
+        decision_function or a predict_proba
+
+    strategy : str, optional (default='roc')
+        The strategy to use for choosing the cutoff point
+
+        'roc'
+            selects the point on the roc curve that is closest to the ideal
+            corner (0, 1)
+
+        'f_beta'
+            selects a decision threshold that maximizes the f_beta score
+
+        'max_tpr'
+            selects the point that yields the highest true positive rate with
+            true negative rate at least equal to the value of the parameter
+            threshold
+
+        'max_tnr'
+            selects the point that yields the highest true negative rate with
+            true positive rate at least equal to the value of the parameter
+            threshold
+
+    method : str or None, optional (default=None)
+        The method to be used for acquiring the score
+
+        'decision_function'
+            base_estimator.decision_function will be used for scoring
+
+        'predict_proba'
+            base_estimator.predict_proba will be used for scoring
+
+        None
+            base_estimator.decision_function will be used first and if not
+            available base_estimator.predict_proba
+
+    beta : float in [0, 1], optional (default=None)
+        beta value to be used in case strategy == 'f_beta'
+
+    threshold : float in [0, 1] or None, (default=None)
+        In case strategy is 'max_tpr' or 'max_tnr' this parameter must be set
+        to specify the threshold for the true negative rate or true positive
+        rate respectively that needs to be achieved
+
+    pos_label : object, optional (default=1)
+        Object representing the positive label
+
+    cv : int, cross-validation generator, iterable or 'prefit', optional
+        (default=3). Determines the cross-validation splitting strategy.
+        If cv='prefit' the base estimator is assumed to be fitted and all data
+        will be used for the calibration of the probability threshold
+
+    Attributes
+    ----------
+    decision_threshold_ : float
+        Decision threshold for the positive class. Determines the output of
+        predict
+
+    std_ : float
+        Standard deviation of the obtained decision thresholds for when the
+        provided base estimator is not pre-trained and the decision_threshold_
+        is computed as the mean of the decision threshold of each
+        cross-validation iteration. If the base estimator is pre-trained then
+        std_ = None
+
+    classes_ : array, shape (n_classes)
+        The class labels.
+
+    References
+    ----------
+    .. [1] Receiver-operating characteristic (ROC) plots: a fundamental
+           evaluation tool in clinical medicine, MH Zweig, G Campbell -
+           Clinical chemistry, 1993
+
+    """
+    def __init__(self, base_estimator, strategy='roc', method=None, beta=None,
+                 threshold=None, pos_label=1, cv=3):
+        self.base_estimator = base_estimator
+        self.strategy = strategy
+        self.method = method
+        self.beta = beta
+        self.threshold = threshold
+        self.pos_label = pos_label
+        self.cv = cv
+
+    def fit(self, X, y):
+        """Fit model
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            Training data
+
+        y : array-like, shape (n_samples,)
+            Target values. There must be two 2 distinct values
+
+        Returns
+        -------
+        self : object
+            Instance of self
+        """
+        if (not hasattr(self.base_estimator, 'decision_function') and
+                not hasattr(self.base_estimator, 'predict_proba')):
+            raise TypeError('The base_estimator needs to implement either a '
+                            'decision_function or a predict_proba method')
+
+        if self.strategy not in ('roc', 'f_beta', 'max_tpr', 'max_tnr'):
+            raise ValueError('strategy can either be "roc" or "max_tpr" or '
+                             '"max_tnr. Got {} instead'.format(self.strategy))
+
+        if self.method not in (None, 'decision_function', 'predict_proba'):
+            raise ValueError('method param can either be "decision_function" '
+                             'or "predict_proba" or None. '
+                             'Got {} instead'.format(self.method))
+
+        if self.strategy == 'max_tpr' or self.strategy == 'max_tnr':
+            if (not self.threshold or not
+                    isinstance(self.threshold, (int, float))
+                    or not self.threshold >= 0 or not self.threshold <= 1):
+                raise ValueError('parameter threshold must be a number in'
+                                 '[0, 1]. '
+                                 'Got {} instead'.format(self.threshold))
+
+        if self.strategy == 'f_beta':
+            if not self.beta or not isinstance(self.beta, (int, float)):
+                raise ValueError('parameter beta must be a real number.'
+                                 'Got {} instead'.format(type(self.beta)))
+
+        X, y = check_X_y(X, y)
+
+        y_type = type_of_target(y)
+        if y_type != 'binary':
+            raise ValueError('Expected target of binary type. Got {}'.format(
+                y_type))
+
+        self.label_encoder_ = LabelEncoder().fit(y)
+        self.classes_ = self.label_encoder_.classes_
+
+        y = self.label_encoder_.transform(y)
+        self.pos_label = self.label_encoder_.transform([self.pos_label])[0]
+
+        if self.cv == 'prefit':
+            self.decision_threshold_ = _CutoffClassifier(
+                self.base_estimator, self.strategy, self.method, self.beta,
+                self.threshold, self.pos_label
+            ).fit(X, y).decision_threshold_
+            self.std_ = None
+        else:
+            cv = check_cv(self.cv, y, classifier=True)
+            decision_thresholds = []
+
+            for train, test in cv.split(X, y):
+                estimator = clone(self.base_estimator).fit(X[train], y[train])
+                decision_thresholds.append(
+                    _CutoffClassifier(estimator, self.strategy, self.method,
+                                      self.beta, self.threshold,
+                                      self.pos_label).fit(
+                        X[test], y[test]
+                    ).decision_threshold_
+                )
+            self.decision_threshold_ = np.mean(decision_thresholds)
+            self.std_ = np.std(decision_thresholds)
+
+            self.base_estimator.fit(X, y)
+        return self
+
+    def predict(self, X):
+        """Predict using the calibrated decision threshold
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            The samples
+
+        Returns
+        -------
+        C : array, shape (n_samples,)
+            The predicted class
+        """
+        X = check_array(X)
+        check_is_fitted(
+            self, ["label_encoder_", "decision_threshold_", "std_", "classes_"]
+        )
+
+        y_score = _get_binary_score(self.base_estimator, X, self.method,
+                                    self.pos_label)
+        return self.label_encoder_.inverse_transform(
+            (y_score > self.decision_threshold_).astype(int)
+        )
+
+
+class _CutoffClassifier(object):
+    """Cutoff point selection.
+
+    It assumes that base_estimator has already been fit, and uses the input set
+    of the fit function to select a cutoff point. Note that this class should
+    not be used as an estimator directly. Use the CutoffClassifier with
+    cv="prefit" instead.
+
+    Parameters
+    ----------
+    base_estimator : obj
+        The binary classifier whose decision threshold will be adapted
+        according to the acquired cutoff point. The estimator must have a
+        decision_function or a predict_proba
+
+    strategy : 'roc' or 'f_beta' or 'max_tpr' or 'max_tnr'
+        The method to use for choosing the cutoff point
+
+    method : str or None, optional (default=None)
+        The method to be used for acquiring the score. Can either be
+        "decision_function" or "predict_proba" or None. If None then
+        decision_function will be used first and if not available
+        predict_proba
+
+    beta : float in [0, 1]
+        beta value to be used in case strategy == 'f_beta'
+
+    threshold : float in [0, 1]
+        minimum required value for the true negative rate (specificity) in case
+        strategy 'max_tpr' is used or for the true positive rate (sensitivity)
+        in case method 'max_tnr' is used
+
+    pos_label : object
+        Label considered as positive during the roc_curve construction
+
+    Attributes
+    ----------
+    decision_threshold_ : float
+        Acquired decision threshold for the positive class
+    """
+    def __init__(self, base_estimator, strategy, method, beta, threshold,
+                 pos_label):
+        self.base_estimator = base_estimator
+        self.strategy = strategy
+        self.method = method
+        self.beta = beta
+        self.threshold = threshold
+        self.pos_label = pos_label
+
+    def fit(self, X, y):
+        """Select a decision threshold for the fitted model's positive class
+        using one of the available methods
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            Training data
+
+        y : array-like, shape (n_samples,)
+            Target values
+
+        Returns
+        -------
+        self : object
+            Instance of self
+        """
+        y_score = _get_binary_score(self.base_estimator, X, self.method,
+                                    self.pos_label)
+        if self.strategy == 'f_beta':
+            precision, recall, thresholds = precision_recall_curve(
+                y, y_score, pos_label=self.pos_label
+            )
+            f_beta = ((1 + self.beta**2) * (precision * recall) /
+                      (self.beta**2 * precision + recall))
+            self.decision_threshold_ = thresholds[np.argmax(f_beta)]
+            return self
+
+        fpr, tpr, thresholds = roc_curve(y, y_score, pos_label=self.pos_label)
+
+        if self.strategy == 'roc':
+            # we find the threshold of the point (fpr, tpr) with the smallest
+            # euclidean distance from the "ideal" corner (0, 1)
+            self.decision_threshold_ = thresholds[
+                np.argmin(fpr ** 2 + (tpr - 1) ** 2)
+            ]
+        elif self.strategy == 'max_tpr':
+            indices = np.where(1 - fpr >= self.threshold)[0]
+            max_tpr_index = np.argmax(tpr[indices])
+            self.decision_threshold_ = thresholds[indices[max_tpr_index]]
+        else:
+            indices = np.where(tpr >= self.threshold)[0]
+            max_tnr_index = np.argmax(1 - fpr[indices])
+            self.decision_threshold_ = thresholds[indices[max_tnr_index]]
+        return self
+
+
+def _get_binary_score(clf, X, method=None, pos_label=1):
+    """Binary classification score for the positive label (0 or 1)
+
+    Returns the score that a binary classifier outputs for the positive label
+    acquired either from decision_function or predict_proba
+
+    Parameters
+    ----------
+    clf : object
+        Classifier object to be used for acquiring the scores. Needs to have
+        a decision_function or a predict_proba method
+
+    X : array-like, shape (n_samples, n_features)
+        The samples
+
+    pos_label : int, optional (default=1)
+        The positive label. Can either be 0 or 1
+
+    method : str or None, optional (default=None)
+        The method to be used for acquiring the score. Can either be
+        "decision_function" or "predict_proba" or None. If None then
+        decision_function will be used first and if not available
+        predict_proba
+
+    Returns
+    -------
+    y_score : array-like, shape (n_samples,)
+        The return value of the provided classifier's decision_function or
+        predict_proba depending on the method used.
+    """
+    if len(clf.classes_) != 2:
+        raise ValueError('Expected binary classifier. Found {} classes'.format(
+            len(clf.classes_)
+        ))
+
+    if method not in (None, 'decision_function', 'predict_proba'):
+        raise ValueError('scoring param can either be "decision_function" '
+                         'or "predict_proba" or None. '
+                         'Got {} instead'.format(method))
+
+    if not method:
+        try:
+            y_score = clf.decision_function(X)
+            if pos_label == clf.classes_[0]:
+                y_score = -y_score
+        except (NotImplementedError, AttributeError):
+            y_score = clf.predict_proba(X)[:, pos_label]
+    elif method == 'decision_function':
+        y_score = clf.decision_function(X)
+        if pos_label == clf.classes_[0]:
+            y_score = - y_score
+    else:
+        y_score = clf.predict_proba(X)[:, pos_label]
+    return y_score
diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py
index f131eab4c1680..e7970ebb76adc 100644
--- a/sklearn/tests/test_calibration.py
+++ b/sklearn/tests/test_calibration.py
@@ -8,18 +8,28 @@
 from sklearn.base import BaseEstimator
 from sklearn.model_selection import LeaveOneOut
 
-from sklearn.utils._testing import (assert_array_almost_equal,
-                                   assert_almost_equal,
-                                   assert_array_equal,
-                                   assert_raises, ignore_warnings)
 from sklearn.datasets import make_classification, make_blobs
-from sklearn.naive_bayes import MultinomialNB
 from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
-from sklearn.svm import LinearSVC
-from sklearn.pipeline import Pipeline
 from sklearn.impute import SimpleImputer
-from sklearn.metrics import brier_score_loss, log_loss
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import brier_score_loss
+from sklearn.metrics import confusion_matrix
+from sklearn.metrics import f1_score
+from sklearn.metrics import log_loss
+from sklearn.metrics import recall_score
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.pipeline import Pipeline
+from sklearn.svm import LinearSVC
+from sklearn.utils._testing import assert_array_almost_equal
+from sklearn.utils._testing import assert_almost_equal
+from sklearn.utils._testing import assert_array_equal
+from sklearn.utils._testing import assert_raises
+from sklearn.utils._testing import ignore_warnings
+
 from sklearn.calibration import CalibratedClassifierCV
+from sklearn.calibration import CutoffClassifier
+from sklearn.calibration import _get_binary_score
 from sklearn.calibration import _sigmoid_calibration, _SigmoidCalibration
 from sklearn.calibration import calibration_curve
 
@@ -341,3 +351,206 @@ def decision_function(self, X):
     calibrated_clf = CalibratedClassifierCV(MockTensorClassifier())
     # we should be able to fit this classifier with no error
     calibrated_clf.fit(X, y)
+
+
+def test_cutoff_prefit():
+    calibration_samples = 200
+    X, y = make_classification(n_samples=1000, n_features=6, random_state=42,
+                               n_classes=2)
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y,
+                                                        test_size=0.4,
+                                                        random_state=42)
+    lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)
+
+    clf_roc = CutoffClassifier(lr, strategy='roc', cv='prefit').fit(
+        X_test[:calibration_samples], y_test[:calibration_samples]
+    )
+
+    y_pred = lr.predict(X_test[calibration_samples:])
+    y_pred_roc = clf_roc.predict(X_test[calibration_samples:])
+
+    tn, fp, fn, tp = confusion_matrix(
+        y_test[calibration_samples:], y_pred).ravel()
+    tn_roc, fp_roc, fn_roc, tp_roc = confusion_matrix(
+        y_test[calibration_samples:], y_pred_roc).ravel()
+
+    tpr = tp / (tp + fn)
+    tnr = tn / (tn + fp)
+
+    tpr_roc = tp_roc / (tp_roc + fn_roc)
+    tnr_roc = tn_roc / (tn_roc + fp_roc)
+
+    # check that the sum of tpr and tnr has improved
+    assert tpr_roc + tnr_roc > tpr + tnr
+
+    clf_f1 = CutoffClassifier(
+        lr, strategy='f_beta', method='predict_proba', beta=1,
+        cv='prefit').fit(
+        X_test[:calibration_samples], y_test[:calibration_samples]
+    )
+
+    y_pred_f1 = clf_f1.predict(X_test[calibration_samples:])
+    assert (f1_score(y_test[calibration_samples:], y_pred_f1) >
+            f1_score(y_test[calibration_samples:], y_pred))
+
+    clf_fbeta = CutoffClassifier(
+        lr, strategy='f_beta', method='predict_proba', beta=2,
+        cv='prefit').fit(
+        X_test[:calibration_samples], y_test[:calibration_samples]
+    )
+
+    y_pred_fbeta = clf_fbeta.predict(X_test[calibration_samples:])
+    assert (recall_score(y_test[calibration_samples:], y_pred_fbeta) >
+            recall_score(y_test[calibration_samples:], y_pred))
+
+    clf_max_tpr = CutoffClassifier(
+        lr, strategy='max_tpr', threshold=0.7, cv='prefit'
+    ).fit(X_test[:calibration_samples], y_test[:calibration_samples])
+
+    y_pred_max_tpr = clf_max_tpr.predict(X_test[calibration_samples:])
+
+    tn_max_tpr, fp_max_tpr, fn_max_tpr, tp_max_tpr = confusion_matrix(
+        y_test[calibration_samples:], y_pred_max_tpr).ravel()
+
+    tpr_max_tpr = tp_max_tpr / (tp_max_tpr + fn_max_tpr)
+    tnr_max_tpr = tn_max_tpr / (tn_max_tpr + fp_max_tpr)
+
+    # check that the tpr increases with tnr >= min_val_tnr
+    assert tpr_max_tpr > tpr
+    assert tpr_max_tpr > tpr_roc
+    assert tnr_max_tpr >= 0.7
+
+    clf_max_tnr = CutoffClassifier(
+        lr, strategy='max_tnr', threshold=0.7, cv='prefit'
+    ).fit(X_test[:calibration_samples], y_test[:calibration_samples])
+
+    y_pred_clf = clf_max_tnr.predict(X_test[calibration_samples:])
+
+    tn_clf, fp_clf, fn_clf, tp_clf = confusion_matrix(
+        y_test[calibration_samples:], y_pred_clf).ravel()
+
+    tnr_clf_max_tnr = tn_clf / (tn_clf + fp_clf)
+    tpr_clf_max_tnr = tp_clf / (tp_clf + fn_clf)
+
+    # check that the tnr increases with tpr >= min_val_tpr
+    assert tnr_clf_max_tnr > tnr
+    assert tnr_clf_max_tnr > tnr_roc
+    assert tpr_clf_max_tnr >= 0.7
+
+    # check error cases
+    clf_bad_base_estimator = CutoffClassifier([])
+    with pytest.raises(TypeError):
+        clf_bad_base_estimator.fit(X_train, y_train)
+
+    X_non_binary, y_non_binary = make_classification(
+        n_samples=20, n_features=6, random_state=42, n_classes=4,
+        n_informative=4
+    )
+    with pytest.raises(ValueError):
+        clf_roc.fit(X_non_binary, y_non_binary)
+
+    clf_foo = CutoffClassifier(lr, strategy='f_beta', beta='foo')
+    with pytest.raises(ValueError):
+        clf_foo.fit(X_train, y_train)
+
+    clf_foo = CutoffClassifier(lr, strategy='foo')
+    with pytest.raises(ValueError):
+        clf_foo.fit(X_train, y_train)
+
+    for method in ['max_tpr', 'max_tnr']:
+        clf_missing_info = CutoffClassifier(lr, strategy=method)
+        with pytest.raises(ValueError):
+            clf_missing_info.fit(X_train, y_train)
+
+
+def test_cutoff_cv():
+    X, y = make_classification(n_samples=1000, n_features=6, random_state=42,
+                               n_classes=2)
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y,
+                                                        test_size=0.4,
+                                                        random_state=42)
+    lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)
+    clf_roc = CutoffClassifier(LogisticRegression(solver='liblinear'),
+                               strategy='roc',
+                               cv=3).fit(
+        X_train, y_train
+    )
+
+    assert clf_roc.decision_threshold_ != 0
+    assert clf_roc.std_ is not None and clf_roc.std_ != 0
+
+    y_pred = lr.predict(X_test)
+    y_pred_roc = clf_roc.predict(X_test)
+
+    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
+    tn_roc, fp_roc, fn_roc, tp_roc = confusion_matrix(
+        y_test, y_pred_roc
+    ).ravel()
+
+    tpr = tp / (tp + fn)
+    tnr = tn / (tn + fp)
+
+    tpr_roc = tp_roc / (tp_roc + fn_roc)
+    tnr_roc = tn_roc / (tn_roc + fp_roc)
+
+    # check that the sum of tpr + tnr has improved
+    assert tpr_roc + tnr_roc > tpr + tnr
+
+
+def test_get_binary_score():
+    X, y = make_classification(n_samples=200, n_features=6, random_state=42,
+                               n_classes=2)
+
+    X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.4,
+                                                   random_state=42)
+    lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)
+    y_pred_proba = lr.predict_proba(X_test)
+    y_pred_score = lr.decision_function(X_test)
+
+    assert_array_equal(
+        y_pred_score, _get_binary_score(
+            lr, X_test, method='decision_function', pos_label=1)
+    )
+
+    assert_array_equal(
+        - y_pred_score, _get_binary_score(
+            lr, X_test, method='decision_function', pos_label=0)
+    )
+
+    assert_array_equal(
+        y_pred_proba[:, 1], _get_binary_score(
+            lr, X_test, method='predict_proba', pos_label=1)
+    )
+
+    assert_array_equal(
+        y_pred_proba[:, 0], _get_binary_score(
+            lr, X_test, method='predict_proba', pos_label=0)
+    )
+
+    assert_array_equal(
+        y_pred_score,
+        _get_binary_score(lr, X_test, method=None, pos_label=1)
+    )
+
+    with pytest.raises(ValueError):
+        _get_binary_score(lr, X_test, method='foo')
+
+    # classifier that does not have a decision_function
+    rf = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)
+    y_pred_proba_rf = rf.predict_proba(X_test)
+    assert_array_equal(
+        y_pred_proba_rf[:, 1],
+        _get_binary_score(rf, X_test, method=None, pos_label=1)
+    )
+
+    X_non_binary, y_non_binary = make_classification(
+        n_samples=20, n_features=6, random_state=42, n_classes=4,
+        n_informative=4
+    )
+
+    rf_non_bin = RandomForestClassifier(n_estimators=10).fit(X_non_binary,
+                                                             y_non_binary)
+    with pytest.raises(ValueError):
+        _get_binary_score(rf_non_bin, X_non_binary)

From b99218b5227a6d1bab1f2ddd03fe5002f78909e7 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Sun, 23 Feb 2020 16:38:46 +0100
Subject: [PATCH 02/44] add example

---
 .../plot_decision_threshold_calibration.py    | 167 ++++++++++++++++++
 1 file changed, 167 insertions(+)
 create mode 100644 examples/calibration/plot_decision_threshold_calibration.py

diff --git a/examples/calibration/plot_decision_threshold_calibration.py b/examples/calibration/plot_decision_threshold_calibration.py
new file mode 100644
index 0000000000000..e14e680380e17
--- /dev/null
+++ b/examples/calibration/plot_decision_threshold_calibration.py
@@ -0,0 +1,167 @@
+"""
+======================================================================
+Decision threshold (cutoff point) calibration on breast cancer dataset
+======================================================================
+
+Machine learning classifiers often base their predictions on real-valued
+decision functions that don't always have accuracy as their objective. Moreover
+the learning objective of a model can differ from the user's needs hence using
+an arbitrary decision threshold as defined by the model can be not ideal.
+
+The CutoffClassifier can be used to calibrate the decision threshold of a model
+in order to increase the classifier's trustworthiness. Optimization objectives
+during the decision threshold calibration can be the true positive and / or
+the true negative rate as well as the f beta score.
+
+In this example the decision threshold calibration is applied on two
+classifiers trained on the breast cancer dataset. The goal in the first case is
+to maximize the f1 score of the classifiers whereas in the second the goal is
+to maximize the true positive rate while maintaining a minimum true negative
+rate.
+
+As you can see after calibration the f1 score of the LogisticRegression
+classifiers has increased slightly whereas the accuracy of the
+AdaBoostClassifier classifier has stayed the same.
+
+For the second goal as seen after calibration both classifiers achieve better
+true positive rate while their respective true negative rates have decreased
+slightly or remained stable.
+"""
+
+# Author: Prokopios Gryllos <prokopis.gryllos@sentiance.com>
+#
+# License: BSD 3 clause
+
+from __future__ import division
+
+import numpy as np
+
+from sklearn.ensemble import AdaBoostClassifier
+from sklearn.metrics import confusion_matrix, f1_score
+from sklearn.calibration import CutoffClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.datasets import load_breast_cancer
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+
+
+print(__doc__)
+
+# percentage of the training set that will be used for calibration
+calibration_samples_percentage = 0.2
+
+X, y = load_breast_cancer(return_X_y=True)
+
+X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.6,
+                                                    random_state=42)
+
+calibration_samples = int(len(X_train) * calibration_samples_percentage)
+
+lr = LogisticRegression().fit(
+    X_train[:-calibration_samples], y_train[:-calibration_samples])
+
+y_pred_lr = lr.predict(X_test)
+tn_lr, fp_lr, fn_lr, tp_lr = confusion_matrix(y_test, y_pred_lr).ravel()
+tpr_lr = tp_lr / (tp_lr + fn_lr)
+tnr_lr = tn_lr / (tn_lr + fp_lr)
+f_one_lr = f1_score(y_test, y_pred_lr)
+
+ada = AdaBoostClassifier().fit(
+    X_train[:-calibration_samples], y_train[:-calibration_samples])
+
+y_pred_ada = ada.predict(X_test)
+tn_ada, fp_ada, fn_ada, tp_ada = confusion_matrix(y_test, y_pred_ada).ravel()
+tpr_ada = tp_ada / (tp_ada + fn_ada)
+tnr_ada = tn_ada / (tn_ada + fp_ada)
+f_one_ada = f1_score(y_test, y_pred_ada)
+
+# objective 1: we want to calibrate the decision threshold in order to achieve
+# better f1 score
+lr_f_beta = CutoffClassifier(
+    lr, strategy='f_beta', method='predict_proba', beta=1, cv='prefit').fit(
+    X_train[calibration_samples:], y_train[calibration_samples:])
+
+y_pred_lr_f_beta = lr_f_beta.predict(X_test)
+f_one_lr_f_beta = f1_score(y_test, y_pred_lr_f_beta)
+
+ada_f_beta = CutoffClassifier(
+    ada, strategy='f_beta', method='predict_proba', beta=1, cv='prefit'
+).fit(X_train[calibration_samples:], y_train[calibration_samples:])
+
+y_pred_ada_f_beta = ada_f_beta.predict(X_test)
+f_one_ada_f_beta = f1_score(y_test, y_pred_ada_f_beta)
+
+# objective 2: we want to maximize the true positive rate while the true
+# negative rate is at least 0.7
+lr_max_tpr = CutoffClassifier(
+    lr, strategy='max_tpr', method='predict_proba', threshold=0.7, cv='prefit'
+).fit(X_train[calibration_samples:], y_train[calibration_samples:])
+
+y_pred_lr_max_tpr = lr_max_tpr.predict(X_test)
+tn_lr_max_tpr, fp_lr_max_tpr, fn_lr_max_tpr, tp_lr_max_tpr = \
+    confusion_matrix(y_test, y_pred_lr_max_tpr).ravel()
+tpr_lr_max_tpr = tp_lr_max_tpr / (tp_lr_max_tpr + fn_lr_max_tpr)
+tnr_lr_max_tpr = tn_lr_max_tpr / (tn_lr_max_tpr + fp_lr_max_tpr)
+
+ada_max_tpr = CutoffClassifier(
+    ada, strategy='max_tpr', method='predict_proba', threshold=0.7, cv='prefit'
+).fit(X_train[calibration_samples:], y_train[calibration_samples:])
+
+y_pred_ada_max_tpr = ada_max_tpr.predict(X_test)
+tn_ada_max_tpr, fp_ada_max_tpr, fn_ada_max_tpr, tp_ada_max_tpr = \
+    confusion_matrix(y_test, y_pred_ada_max_tpr).ravel()
+tpr_ada_max_tpr = tp_ada_max_tpr / (tp_ada_max_tpr + fn_ada_max_tpr)
+tnr_ada_max_tpr = tn_ada_max_tpr / (tn_ada_max_tpr + fp_ada_max_tpr)
+
+print('Calibrated threshold')
+print('Logistic Regression classifier: {}'.format(
+    lr_max_tpr.decision_threshold_))
+print('AdaBoost classifier: {}'.format(ada_max_tpr.decision_threshold_))
+print('before calibration')
+print('Logistic Regression classifier: tpr = {}, tnr = {}, f1 = {}'.format(
+    tpr_lr, tnr_lr, f_one_lr))
+print('AdaBoost classifier: tpr = {}, tpn = {}, f1 = {}'.format(
+    tpr_ada, tnr_ada, f_one_ada))
+
+print('true positive and true negative rates after calibration')
+print('Logistic Regression classifier: tpr = {}, tnr = {}, f1 = {}'.format(
+    tpr_lr_max_tpr, tnr_lr_max_tpr, f_one_lr_f_beta))
+print('AdaBoost classifier: tpr = {}, tnr = {}, f1 = {}'.format(
+    tpr_ada_max_tpr, tnr_ada_max_tpr, f_one_ada_f_beta))
+
+#########
+# plots #
+#########
+bar_width = 0.2
+
+plt.subplot(2, 1, 1)
+index = np.asarray([1, 2])
+plt.bar(index, [f_one_lr, f_one_ada], bar_width, color='r',
+        label='Before calibration')
+
+plt.bar(index + bar_width, [f_one_lr_f_beta, f_one_ada_f_beta], bar_width,
+        color='b', label='After calibration')
+
+plt.xticks(index + bar_width / 2, ('f1 logistic', 'f1 adaboost'))
+
+plt.ylabel('scores')
+plt.title('f1 score')
+plt.legend(bbox_to_anchor=(.5, -.2), loc='center', borderaxespad=0.)
+
+plt.subplot(2, 1, 2)
+index = np.asarray([1, 2, 3, 4])
+plt.bar(index, [tpr_lr, tnr_lr, tpr_ada, tnr_ada],
+        bar_width, color='r', label='Before calibration')
+
+plt.bar(index + bar_width,
+        [tpr_lr_max_tpr, tnr_lr_max_tpr, tpr_ada_max_tpr, tnr_ada_max_tpr],
+        bar_width, color='b', label='After calibration')
+
+plt.xticks(
+    index + bar_width / 2,
+    ('tpr logistic', 'tnr logistic', 'tpr adaboost', 'tnr adaboost'))
+plt.ylabel('scores')
+plt.title('true positive & true negative rate')
+
+plt.subplots_adjust(hspace=0.6)
+plt.show()

From ce664273d82e965c07d80c7227c659d8aebb29ce Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Sun, 23 Feb 2020 16:53:26 +0100
Subject: [PATCH 03/44] PEP8

---
 sklearn/calibration.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index 8692e628bbb8a..d7f798b1ef62c 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -20,7 +20,6 @@
 
 from .base import (BaseEstimator, ClassifierMixin, RegressorMixin, clone,
                    MetaEstimatorMixin)
-from .base import clone
 from .preprocessing import label_binarize, LabelBinarizer
 from .utils import check_X_y, check_array, indexable, column_or_1d
 from .utils.validation import check_is_fitted, check_consistent_length

From 3ee3b5a4607abb9db2fa01e876c3fd55923ef5d7 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Sun, 23 Feb 2020 17:13:11 +0100
Subject: [PATCH 04/44] add whats new entry

---
 doc/whats_new/v0.23.rst |  9 +++++++++
 sklearn/calibration.py  | 10 +++++-----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst
index 9ae776c4c7b9b..6925414835f63 100644
--- a/doc/whats_new/v0.23.rst
+++ b/doc/whats_new/v0.23.rst
@@ -45,6 +45,15 @@ Changelog
     :pr:`123456` by :user:`Joe Bloggs <joeongithub>`.
     where 123456 is the *pull request* number, not the issue number.
 
+:mod:`sklearn.calibration`
+..........................
+
+- |MajorFeature| :class:`calibration.CutoffClassifier` calibrates the decision
+  threshold function of a classifier by maximizing a binary classification
+  metric through cross-validation.
+  :pr:`16525` by :user:`Prokopis Gryllos <PGryllos>`
+  and :user:`Guillaume Lemaitre <glemaitre>`.
+
 :mod:`sklearn.cluster`
 ......................
 
diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index d7f798b1ef62c..9fc09a0100c7c 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -603,12 +603,12 @@ def calibration_curve(y_true, y_prob, normalize=False, n_bins=5,
 
 
 class CutoffClassifier(BaseEstimator, ClassifierMixin, MetaEstimatorMixin):
-    """Decision threshold calibration for binary classification
+    """Decision threshold calibration for binary classification.
 
-    Meta estimator that calibrates the decision threshold (cutoff point)
-    that is used for prediction. The methods for picking cutoff points make use
-    of traditional binary classification evaluation statistics such as the
-    true positive and true negative rates and F-scores.
+    Estimator that calibrates the decision threshold (cutoff point) that is
+    used for prediction. The methods for picking cutoff points make use of
+    traditional binary classification evaluation statistics such as the true
+    positive and true negative rates and F-scores.
 
     If cv="prefit" the base estimator is assumed to be fitted and all data will
     be used for the selection of the cutoff point. Otherwise the decision

From c5a51eb497d96580a8e51d5a1a08b41899004d3b Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Sun, 23 Feb 2020 17:52:57 +0100
Subject: [PATCH 05/44] mark as only working with binary data

---
 sklearn/calibration.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index 9fc09a0100c7c..bc3a2fb7e9081 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -810,7 +810,7 @@ def predict(self, X):
         )
 
 
-class _CutoffClassifier(object):
+class _CutoffClassifier:
     """Cutoff point selection.
 
     It assumes that base_estimator has already been fit, and uses the input set
@@ -905,6 +905,9 @@ def fit(self, X, y):
             self.decision_threshold_ = thresholds[indices[max_tnr_index]]
         return self
 
+    def _more_tags(self):
+        return {"binary_only": True}
+
 
 def _get_binary_score(clf, X, method=None, pos_label=1):
     """Binary classification score for the positive label (0 or 1)

From 0aab70ca3c9d1f97f59e4a4dcf2cb0c22af97acb Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Sun, 23 Feb 2020 19:22:44 +0100
Subject: [PATCH 06/44] common tests fixes

---
 sklearn/calibration.py | 86 +++++++++++++++++++++++++++---------------
 1 file changed, 55 insertions(+), 31 deletions(-)

diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index bc3a2fb7e9081..b4d90df9aca97 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -8,6 +8,7 @@
 # License: BSD 3 clause
 
 import warnings
+from copy import deepcopy
 from inspect import signature
 
 from math import log
@@ -18,21 +19,32 @@
 from scipy.optimize import fmin_bfgs
 from .preprocessing import LabelEncoder
 
-from .base import (BaseEstimator, ClassifierMixin, RegressorMixin, clone,
-                   MetaEstimatorMixin)
-from .preprocessing import label_binarize, LabelBinarizer
-from .utils import check_X_y, check_array, indexable, column_or_1d
-from .utils.validation import check_is_fitted, check_consistent_length
-from .utils.validation import _check_sample_weight
+from .base import BaseEstimator
+from .base import ClassifierMixin
+from .base import MetaEstimatorMixin
+from .base import RegressorMixin
+from .base import clone
 from .isotonic import IsotonicRegression
-from .svm import LinearSVC
 from .metrics import precision_recall_curve
 from .metrics import roc_curve
 from .model_selection import check_cv
+from .preprocessing import label_binarize
+from .preprocessing import LabelBinarizer
+from .svm import LinearSVC
+from .utils import check_X_y
+from .utils import check_array
+from .utils import column_or_1d
+from .utils import indexable
+from .utils import _safe_indexing
+from .utils.multiclass import check_classification_targets
 from .utils.multiclass import type_of_target
+from .utils.validation import check_is_fitted
+from .utils.validation import check_consistent_length
+from .utils.validation import _check_sample_weight
 from .utils.validation import _deprecate_positional_args
 
 
+
 class CalibratedClassifierCV(BaseEstimator, ClassifierMixin,
                              MetaEstimatorMixin):
     """Probability calibration with isotonic regression or logistic regression.
@@ -602,7 +614,7 @@ def calibration_curve(y_true, y_prob, normalize=False, n_bins=5,
     return prob_true, prob_pred
 
 
-class CutoffClassifier(BaseEstimator, ClassifierMixin, MetaEstimatorMixin):
+class CutoffClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
     """Decision threshold calibration for binary classification.
 
     Estimator that calibrates the decision threshold (cutoff point) that is
@@ -747,23 +759,32 @@ def fit(self, X, y):
                 raise ValueError('parameter beta must be a real number.'
                                  'Got {} instead'.format(type(self.beta)))
 
-        X, y = check_X_y(X, y)
+        self._base_estimator = (deepcopy(self.base_estimator)
+                                if self.cv != "prefit"
+                                else self.base_estimator)
 
+        X = check_array(
+            X, accept_sparse=['csc', 'csr'], force_all_finite=False,
+            allow_nd=True,
+        )
+        y = check_array(y, ensure_2d=False, dtype=None)
+        # FIXME: check_classification_targets should return the type of target
+        # as well
+        check_classification_targets(y)
         y_type = type_of_target(y)
         if y_type != 'binary':
-            raise ValueError('Expected target of binary type. Got {}'.format(
-                y_type))
+            raise ValueError(f'Expected target of binary type. Got {y_type}.')
 
         self.label_encoder_ = LabelEncoder().fit(y)
         self.classes_ = self.label_encoder_.classes_
 
         y = self.label_encoder_.transform(y)
-        self.pos_label = self.label_encoder_.transform([self.pos_label])[0]
+        self._pos_label = self.label_encoder_.transform([self.pos_label])[0]
 
         if self.cv == 'prefit':
             self.decision_threshold_ = _CutoffClassifier(
-                self.base_estimator, self.strategy, self.method, self.beta,
-                self.threshold, self.pos_label
+                self._base_estimator, self.strategy, self.method, self.beta,
+                self.threshold, self._pos_label
             ).fit(X, y).decision_threshold_
             self.std_ = None
         else:
@@ -771,18 +792,22 @@ def fit(self, X, y):
             decision_thresholds = []
 
             for train, test in cv.split(X, y):
-                estimator = clone(self.base_estimator).fit(X[train], y[train])
+                estimator = clone(self._base_estimator).fit(
+                    _safe_indexing(X, train), _safe_indexing(y, train)
+                )
                 decision_thresholds.append(
-                    _CutoffClassifier(estimator, self.strategy, self.method,
-                                      self.beta, self.threshold,
-                                      self.pos_label).fit(
-                        X[test], y[test]
+                    _CutoffClassifier(
+                        estimator, self.strategy, self.method,
+                        self.beta, self.threshold,
+                        self.pos_label
+                    ).fit(
+                        _safe_indexing(X, test), _safe_indexing(y, test)
                     ).decision_threshold_
                 )
             self.decision_threshold_ = np.mean(decision_thresholds)
             self.std_ = np.std(decision_thresholds)
 
-            self.base_estimator.fit(X, y)
+            self._base_estimator.fit(X, y)
         return self
 
     def predict(self, X):
@@ -798,17 +823,18 @@ def predict(self, X):
         C : array, shape (n_samples,)
             The predicted class
         """
-        X = check_array(X)
-        check_is_fitted(
-            self, ["label_encoder_", "decision_threshold_", "std_", "classes_"]
-        )
+        check_is_fitted(self)
 
-        y_score = _get_binary_score(self.base_estimator, X, self.method,
-                                    self.pos_label)
+        y_score = _get_binary_score(
+            self._base_estimator, X, self.method, self.pos_label
+        )
         return self.label_encoder_.inverse_transform(
             (y_score > self.decision_threshold_).astype(int)
         )
 
+    def _more_tags(self):
+        return {"binary_only": True}
+
 
 class _CutoffClassifier:
     """Cutoff point selection.
@@ -876,8 +902,9 @@ def fit(self, X, y):
         self : object
             Instance of self
         """
-        y_score = _get_binary_score(self.base_estimator, X, self.method,
-                                    self.pos_label)
+        y_score = _get_binary_score(
+            self.base_estimator, X, self.method, self.pos_label
+        )
         if self.strategy == 'f_beta':
             precision, recall, thresholds = precision_recall_curve(
                 y, y_score, pos_label=self.pos_label
@@ -905,9 +932,6 @@ def fit(self, X, y):
             self.decision_threshold_ = thresholds[indices[max_tnr_index]]
         return self
 
-    def _more_tags(self):
-        return {"binary_only": True}
-
 
 def _get_binary_score(clf, X, method=None, pos_label=1):
     """Binary classification score for the positive label (0 or 1)

From 6e12f8ad0cede4b7d57341258684ff6c8072d8dd Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Mon, 24 Feb 2020 08:19:12 +0100
Subject: [PATCH 07/44] iter

---
 sklearn/calibration.py            | 426 +++++++++++++-----------------
 sklearn/tests/test_calibration.py | 159 ++++++-----
 2 files changed, 277 insertions(+), 308 deletions(-)

diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index b4d90df9aca97..46f4af9b97e1d 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -7,9 +7,10 @@
 #
 # License: BSD 3 clause
 
-import warnings
 from copy import deepcopy
 from inspect import signature
+import numbers
+import warnings
 
 from math import log
 import numpy as np
@@ -24,6 +25,7 @@
 from .base import MetaEstimatorMixin
 from .base import RegressorMixin
 from .base import clone
+from .exceptions import NotFittedError
 from .isotonic import IsotonicRegression
 from .metrics import precision_recall_curve
 from .metrics import roc_curve
@@ -44,7 +46,6 @@
 from .utils.validation import _deprecate_positional_args
 
 
-
 class CalibratedClassifierCV(BaseEstimator, ClassifierMixin,
                              MetaEstimatorMixin):
     """Probability calibration with isotonic regression or logistic regression.
@@ -629,75 +630,67 @@ class CutoffClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
 
     Parameters
     ----------
-    base_estimator : obj
+    base_estimator : estimator instance
         The binary classifier whose decision threshold will be adapted
-        according to the acquired cutoff point. The estimator must have a
-        decision_function or a predict_proba
+        according to the acquired cutoff point. The estimator must implement
+        `decision_function` or `predict_proba` function.
 
-    strategy : str, optional (default='roc')
-        The strategy to use for choosing the cutoff point
+    strategy : {"roc", "f_beta", "max_tpr", "max_tnr", "constant"}, \
+            default="roc" The strategy to use for choosing the cutoff point.
 
-        'roc'
-            selects the point on the roc curve that is closest to the ideal
-            corner (0, 1)
+        - "roc" selects the point on the ROC curve that is closest to the ideal
+          corner (0, 1).
 
-        'f_beta'
-            selects a decision threshold that maximizes the f_beta score
+        - "f_beta" selects a decision threshold that maximizes the `f_beta`
+          score.
 
-        'max_tpr'
-            selects the point that yields the highest true positive rate with
-            true negative rate at least equal to the value of the parameter
-            threshold
+        - "max_tpr" selects the point that yields the highest true positive
+          rate (TPR) with true negative rate (TNR) at least equal to the value
+          of the parameter threshold.
 
-        'max_tnr'
-            selects the point that yields the highest true negative rate with
-            true positive rate at least equal to the value of the parameter
-            threshold
+        - "max_tnr" selects the point that yields the highest true negative
+          rate (TNR) with true positive rate (TPR) at least equal to the value
+          of the parameter threshold.
 
-    method : str or None, optional (default=None)
-        The method to be used for acquiring the score
-
-        'decision_function'
-            base_estimator.decision_function will be used for scoring
+        - "constant" will use the threshold specified by the parameter
+          `decision_threshold`.
 
-        'predict_proba'
-            base_estimator.predict_proba will be used for scoring
+    method : {"auto", "decision_function", "predict_proba"}, default="auto" The
+        method to be used to get the predictions. If `"auto"` (default), the
+        base estimator will try to invoke `decision_function` or
+        `predict_proba`, in that order.
 
-        None
-            base_estimator.decision_function will be used first and if not
-            available base_estimator.predict_proba
+    beta : float in [0, 1], optional (default=None) beta value to be used in
+        case strategy == 'f_beta'
 
-    beta : float in [0, 1], optional (default=None)
-        beta value to be used in case strategy == 'f_beta'
+    threshold : float in [0, 1] or None, (default=None) In case strategy is
+        'max_tpr' or 'max_tnr' this parameter must be set to specify the
+        threshold for the true negative rate or true positive rate respectively
+        that needs to be achieved
 
-    threshold : float in [0, 1] or None, (default=None)
-        In case strategy is 'max_tpr' or 'max_tnr' this parameter must be set
-        to specify the threshold for the true negative rate or true positive
-        rate respectively that needs to be achieved
+    decision_threshold : float, default=0.5
+        When `strategy="constant"`, decision threshold used as cutoff point.
 
-    pos_label : object, optional (default=1)
-        Object representing the positive label
+    pos_label : object, optional (default=1) Object representing the positive
+        label
 
     cv : int, cross-validation generator, iterable or 'prefit', optional
-        (default=3). Determines the cross-validation splitting strategy.
-        If cv='prefit' the base estimator is assumed to be fitted and all data
+        (default=3). Determines the cross-validation splitting strategy. If
+        cv='prefit' the base estimator is assumed to be fitted and all data
         will be used for the calibration of the probability threshold
 
     Attributes
     ----------
-    decision_threshold_ : float
-        Decision threshold for the positive class. Determines the output of
-        predict
+    decision_threshold_ : float Decision threshold for the positive class.
+        Determines the output of predict
 
-    std_ : float
-        Standard deviation of the obtained decision thresholds for when the
-        provided base estimator is not pre-trained and the decision_threshold_
-        is computed as the mean of the decision threshold of each
-        cross-validation iteration. If the base estimator is pre-trained then
-        std_ = None
+    std_ : float Standard deviation of the obtained decision thresholds for
+        when the provided base estimator is not pre-trained and the
+        decision_threshold_ is computed as the mean of the decision threshold
+        of each cross-validation iteration. If the base estimator is
+        pre-trained then std_ = None
 
-    classes_ : array, shape (n_classes)
-        The class labels.
+    classes_ : array, shape (n_classes) The class labels.
 
     References
     ----------
@@ -706,16 +699,89 @@ class CutoffClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
            Clinical chemistry, 1993
 
     """
-    def __init__(self, base_estimator, strategy='roc', method=None, beta=None,
-                 threshold=None, pos_label=1, cv=3):
+    def __init__(self, base_estimator, strategy="roc", method="auto",
+                 beta=None, threshold=None, decision_threshold=0.5,
+                 pos_label=1, cv=3):
         self.base_estimator = base_estimator
         self.strategy = strategy
         self.method = method
         self.beta = beta
         self.threshold = threshold
+        self.decision_threshold = decision_threshold
         self.pos_label = pos_label
         self.cv = cv
 
+    def _validate_parameters(self):
+        """Validate the input parameters."""
+        supported_methods = ("decision_function", "predict_proba")
+        if self.method == "auto":
+            has_methods = [
+                hasattr(self.base_estimator, m) for m in supported_methods
+            ]
+            if not any(has_methods):
+                raise TypeError(
+                    f"'base_estimator' must implement one of the "
+                    f"{', '.join(supported_methods)} methods."
+                )
+            self._method = next(
+                (m for m, i in zip(supported_methods, has_methods) if i), None
+            )
+        else:
+            if self.method not in supported_methods:
+                raise ValueError(
+                    f"'method' should be one of {', '.join(supported_methods)}"
+                    f". Got {self.method} instead."
+                )
+            elif not hasattr(self.base_estimator, self.method):
+                raise TypeError(
+                    f"'base_estimator' does not implement {self.method}."
+                )
+            self._method = self.method
+
+        strategies = ("roc", "f_beta", "max_tpr", "max_tnr", "constant")
+        if self.strategy not in strategies:
+            raise ValueError(
+                f"'strategy' must be of {', '.join(strategies)}. "
+                f"Got {self.strategy} instead."
+            )
+        elif self.strategy in ("max_tpr", "max_tnr"):
+            if not isinstance(self.threshold, numbers.Real):
+                raise TypeError(
+                    "When strategy is max_tpr or max_tnr, threshold should be "
+                    f"a real in [0, 1]. Got {type(self.threshold)} instead."
+                )
+            elif not (0 < self.threshold < 1):
+                raise ValueError(
+                    f"threshold should be in the range [0, 1]. "
+                    f"Got {self.threshold} instead."
+                )
+        elif self.strategy == "f_beta":
+            if not isinstance(self.beta, numbers.Real):
+                raise TypeError(
+                    "When strategy is f_beta, beta should be a real. "
+                    f"Got {type(self.beta)} instead."
+                )
+        elif self.strategy == "constant":
+            if (self.method == "predict_proba" and
+                    not (0 < self.decision_threshold < 1)):
+                raise ValueError(
+                    f"decision_threshold should be in the range [0, 1] when "
+                    f"using 'predict_proba'. Got {self.decision_threshold} "
+                    "instead."
+                )
+
+    def _validate_data(self, X, y):
+        X = check_array(
+            X, accept_sparse=['csc', 'csr'], force_all_finite=False,
+            allow_nd=True,
+        )
+        y = check_array(y, ensure_2d=False, dtype=None)
+        check_classification_targets(y)
+        y_type = type_of_target(y)
+        if y_type != 'binary':
+            raise ValueError(f'Expected target of binary type. Got {y_type}.')
+        return X, y
+
     def fit(self, X, y):
         """Fit model
 
@@ -732,60 +798,25 @@ def fit(self, X, y):
         self : object
             Instance of self
         """
-        if (not hasattr(self.base_estimator, 'decision_function') and
-                not hasattr(self.base_estimator, 'predict_proba')):
-            raise TypeError('The base_estimator needs to implement either a '
-                            'decision_function or a predict_proba method')
-
-        if self.strategy not in ('roc', 'f_beta', 'max_tpr', 'max_tnr'):
-            raise ValueError('strategy can either be "roc" or "max_tpr" or '
-                             '"max_tnr. Got {} instead'.format(self.strategy))
-
-        if self.method not in (None, 'decision_function', 'predict_proba'):
-            raise ValueError('method param can either be "decision_function" '
-                             'or "predict_proba" or None. '
-                             'Got {} instead'.format(self.method))
-
-        if self.strategy == 'max_tpr' or self.strategy == 'max_tnr':
-            if (not self.threshold or not
-                    isinstance(self.threshold, (int, float))
-                    or not self.threshold >= 0 or not self.threshold <= 1):
-                raise ValueError('parameter threshold must be a number in'
-                                 '[0, 1]. '
-                                 'Got {} instead'.format(self.threshold))
-
-        if self.strategy == 'f_beta':
-            if not self.beta or not isinstance(self.beta, (int, float)):
-                raise ValueError('parameter beta must be a real number.'
-                                 'Got {} instead'.format(type(self.beta)))
-
-        self._base_estimator = (deepcopy(self.base_estimator)
-                                if self.cv != "prefit"
-                                else self.base_estimator)
-
-        X = check_array(
-            X, accept_sparse=['csc', 'csr'], force_all_finite=False,
-            allow_nd=True,
-        )
-        y = check_array(y, ensure_2d=False, dtype=None)
-        # FIXME: check_classification_targets should return the type of target
-        # as well
-        check_classification_targets(y)
-        y_type = type_of_target(y)
-        if y_type != 'binary':
-            raise ValueError(f'Expected target of binary type. Got {y_type}.')
-
-        self.label_encoder_ = LabelEncoder().fit(y)
-        self.classes_ = self.label_encoder_.classes_
+        self._validate_parameters()
+        X, y = self._validate_data(X, y)
 
-        y = self.label_encoder_.transform(y)
-        self._pos_label = self.label_encoder_.transform([self.pos_label])[0]
+        self._label_encoder = LabelEncoder().fit(y)
+        self.classes_ = self._label_encoder.classes_
 
-        if self.cv == 'prefit':
-            self.decision_threshold_ = _CutoffClassifier(
-                self._base_estimator, self.strategy, self.method, self.beta,
-                self.threshold, self._pos_label
-            ).fit(X, y).decision_threshold_
+        try:
+            check_is_fitted(self.base_estimator)
+            self._base_estimator = deepcopy(self.base_estimator)
+        except NotFittedError:
+            self._base_estimator = clone(self.base_estimator).fit(X, y)
+
+        if self.strategy == "constant":
+            self.decision_threshold_ = self.decision_threshold
+        elif self.cv == 'prefit':
+            self.decision_threshold_ = _find_optimal_decision_threshold(
+                self._base_estimator, X, y, self.strategy, self._method,
+                self.beta, self.threshold, self.pos_label
+            )
             self.std_ = None
         else:
             cv = check_cv(self.cv, y, classifier=True)
@@ -796,18 +827,15 @@ def fit(self, X, y):
                     _safe_indexing(X, train), _safe_indexing(y, train)
                 )
                 decision_thresholds.append(
-                    _CutoffClassifier(
-                        estimator, self.strategy, self.method,
-                        self.beta, self.threshold,
+                    _find_optimal_decision_threshold(
+                        estimator,
+                        _safe_indexing(X, test), _safe_indexing(y, test),
+                        self.strategy, self._method, self.beta, self.threshold,
                         self.pos_label
-                    ).fit(
-                        _safe_indexing(X, test), _safe_indexing(y, test)
-                    ).decision_threshold_
+                    )
                 )
             self.decision_threshold_ = np.mean(decision_thresholds)
             self.std_ = np.std(decision_thresholds)
-
-            self._base_estimator.fit(X, y)
         return self
 
     def predict(self, X):
@@ -826,9 +854,9 @@ def predict(self, X):
         check_is_fitted(self)
 
         y_score = _get_binary_score(
-            self._base_estimator, X, self.method, self.pos_label
+            self._base_estimator, X, self._method, self.pos_label
         )
-        return self.label_encoder_.inverse_transform(
+        return self._label_encoder.inverse_transform(
             (y_score > self.decision_threshold_).astype(int)
         )
 
@@ -836,104 +864,35 @@ def _more_tags(self):
         return {"binary_only": True}
 
 
-class _CutoffClassifier:
-    """Cutoff point selection.
-
-    It assumes that base_estimator has already been fit, and uses the input set
-    of the fit function to select a cutoff point. Note that this class should
-    not be used as an estimator directly. Use the CutoffClassifier with
-    cv="prefit" instead.
-
-    Parameters
-    ----------
-    base_estimator : obj
-        The binary classifier whose decision threshold will be adapted
-        according to the acquired cutoff point. The estimator must have a
-        decision_function or a predict_proba
-
-    strategy : 'roc' or 'f_beta' or 'max_tpr' or 'max_tnr'
-        The method to use for choosing the cutoff point
-
-    method : str or None, optional (default=None)
-        The method to be used for acquiring the score. Can either be
-        "decision_function" or "predict_proba" or None. If None then
-        decision_function will be used first and if not available
-        predict_proba
-
-    beta : float in [0, 1]
-        beta value to be used in case strategy == 'f_beta'
-
-    threshold : float in [0, 1]
-        minimum required value for the true negative rate (specificity) in case
-        strategy 'max_tpr' is used or for the true positive rate (sensitivity)
-        in case method 'max_tnr' is used
-
-    pos_label : object
-        Label considered as positive during the roc_curve construction
-
-    Attributes
-    ----------
-    decision_threshold_ : float
-        Acquired decision threshold for the positive class
-    """
-    def __init__(self, base_estimator, strategy, method, beta, threshold,
-                 pos_label):
-        self.base_estimator = base_estimator
-        self.strategy = strategy
-        self.method = method
-        self.beta = beta
-        self.threshold = threshold
-        self.pos_label = pos_label
-
-    def fit(self, X, y):
-        """Select a decision threshold for the fitted model's positive class
-        using one of the available methods
-
-        Parameters
-        ----------
-        X : array-like, shape (n_samples, n_features)
-            Training data
-
-        y : array-like, shape (n_samples,)
-            Target values
-
-        Returns
-        -------
-        self : object
-            Instance of self
-        """
-        y_score = _get_binary_score(
-            self.base_estimator, X, self.method, self.pos_label
+def _find_optimal_decision_threshold(estimator, X, y, strategy, method, beta,
+                                     threshold, pos_label):
+    y_score = _get_binary_score(
+        estimator, X, method=method, pos_label=pos_label
+    )
+    if strategy == 'f_beta':
+        precision, recall, thresholds = precision_recall_curve(
+            y, y_score, pos_label=pos_label
         )
-        if self.strategy == 'f_beta':
-            precision, recall, thresholds = precision_recall_curve(
-                y, y_score, pos_label=self.pos_label
-            )
-            f_beta = ((1 + self.beta**2) * (precision * recall) /
-                      (self.beta**2 * precision + recall))
-            self.decision_threshold_ = thresholds[np.argmax(f_beta)]
-            return self
-
-        fpr, tpr, thresholds = roc_curve(y, y_score, pos_label=self.pos_label)
-
-        if self.strategy == 'roc':
-            # we find the threshold of the point (fpr, tpr) with the smallest
-            # euclidean distance from the "ideal" corner (0, 1)
-            self.decision_threshold_ = thresholds[
-                np.argmin(fpr ** 2 + (tpr - 1) ** 2)
-            ]
-        elif self.strategy == 'max_tpr':
-            indices = np.where(1 - fpr >= self.threshold)[0]
-            max_tpr_index = np.argmax(tpr[indices])
-            self.decision_threshold_ = thresholds[indices[max_tpr_index]]
-        else:
-            indices = np.where(tpr >= self.threshold)[0]
-            max_tnr_index = np.argmax(1 - fpr[indices])
-            self.decision_threshold_ = thresholds[indices[max_tnr_index]]
-        return self
-
-
-def _get_binary_score(clf, X, method=None, pos_label=1):
+        f_beta = ((1 + beta ** 2) * (precision * recall) /
+                  (beta ** 2 * precision + recall))
+        return thresholds[np.argmax(f_beta)]
+
+    fpr, tpr, thresholds = roc_curve(y, y_score, pos_label=pos_label)
+
+    if strategy == 'roc':
+        # we find the threshold of the point (fpr, tpr) with the smallest
+        # euclidean distance from the "ideal" corner (0, 1)
+        return thresholds[np.argmin(fpr ** 2 + (tpr - 1) ** 2)]
+    elif strategy == 'max_tpr':
+        indices = np.where(1 - fpr >= threshold)[0]
+        max_tpr_index = np.argmax(tpr[indices])
+        return thresholds[indices[max_tpr_index]]
+    indices = np.where(tpr >= threshold)[0]
+    max_tnr_index = np.argmax(1 - fpr[indices])
+    return thresholds[indices[max_tnr_index]]
+
+
+def _get_binary_score(estimator, X, method, pos_label):
     """Binary classification score for the positive label (0 or 1)
 
     Returns the score that a binary classifier outputs for the positive label
@@ -941,15 +900,14 @@ def _get_binary_score(clf, X, method=None, pos_label=1):
 
     Parameters
     ----------
-    clf : object
-        Classifier object to be used for acquiring the scores. Needs to have
-        a decision_function or a predict_proba method
+    estimator : estimator object
+        Fitted estimator to get prediction from.
 
-    X : array-like, shape (n_samples, n_features)
-        The samples
+    X : {array-like, sparse matrix} of shape (n_samples, n_features)
+        The data matrix.
 
-    pos_label : int, optional (default=1)
-        The positive label. Can either be 0 or 1
+    pos_label : int or str
+        The positive label.
 
     method : str or None, optional (default=None)
         The method to be used for acquiring the score. Can either be
@@ -963,27 +921,11 @@ def _get_binary_score(clf, X, method=None, pos_label=1):
         The return value of the provided classifier's decision_function or
         predict_proba depending on the method used.
     """
-    if len(clf.classes_) != 2:
-        raise ValueError('Expected binary classifier. Found {} classes'.format(
-            len(clf.classes_)
-        ))
-
-    if method not in (None, 'decision_function', 'predict_proba'):
-        raise ValueError('scoring param can either be "decision_function" '
-                         'or "predict_proba" or None. '
-                         'Got {} instead'.format(method))
-
-    if not method:
-        try:
-            y_score = clf.decision_function(X)
-            if pos_label == clf.classes_[0]:
-                y_score = -y_score
-        except (NotImplementedError, AttributeError):
-            y_score = clf.predict_proba(X)[:, pos_label]
-    elif method == 'decision_function':
-        y_score = clf.decision_function(X)
-        if pos_label == clf.classes_[0]:
-            y_score = - y_score
-    else:
-        y_score = clf.predict_proba(X)[:, pos_label]
+    # FIXME: what if estimator was fitted on encoded label??
+    y_score = getattr(estimator, method)(X)
+    if y_score.ndim == 2:
+        # probabilities
+        y_score = y_score[:, pos_label]
+    elif pos_label == estimator.classes_[0]:
+        y_score = -y_score
     return y_score
diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py
index e7970ebb76adc..b017547f595e3 100644
--- a/sklearn/tests/test_calibration.py
+++ b/sklearn/tests/test_calibration.py
@@ -450,19 +450,6 @@ def test_cutoff_prefit():
     with pytest.raises(ValueError):
         clf_roc.fit(X_non_binary, y_non_binary)
 
-    clf_foo = CutoffClassifier(lr, strategy='f_beta', beta='foo')
-    with pytest.raises(ValueError):
-        clf_foo.fit(X_train, y_train)
-
-    clf_foo = CutoffClassifier(lr, strategy='foo')
-    with pytest.raises(ValueError):
-        clf_foo.fit(X_train, y_train)
-
-    for method in ['max_tpr', 'max_tnr']:
-        clf_missing_info = CutoffClassifier(lr, strategy=method)
-        with pytest.raises(ValueError):
-            clf_missing_info.fit(X_train, y_train)
-
 
 def test_cutoff_cv():
     X, y = make_classification(n_samples=1000, n_features=6, random_state=42,
@@ -499,58 +486,98 @@ def test_cutoff_cv():
     assert tpr_roc + tnr_roc > tpr + tnr
 
 
-def test_get_binary_score():
+# def test_get_binary_score():
+#     X, y = make_classification(n_samples=200, n_features=6, random_state=42,
+#                                n_classes=2)
+
+#     X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.4,
+#                                                    random_state=42)
+#     lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)
+#     y_pred_proba = lr.predict_proba(X_test)
+#     y_pred_score = lr.decision_function(X_test)
+
+#     assert_array_equal(
+#         y_pred_score, _get_binary_score(
+#             lr, X_test, method='decision_function', pos_label=1)
+#     )
+
+#     assert_array_equal(
+#         - y_pred_score, _get_binary_score(
+#             lr, X_test, method='decision_function', pos_label=0)
+#     )
+
+#     assert_array_equal(
+#         y_pred_proba[:, 1], _get_binary_score(
+#             lr, X_test, method='predict_proba', pos_label=1)
+#     )
+
+#     assert_array_equal(
+#         y_pred_proba[:, 0], _get_binary_score(
+#             lr, X_test, method='predict_proba', pos_label=0)
+#     )
+
+#     assert_array_equal(
+#         y_pred_score,
+#         _get_binary_score(lr, X_test, method=None, pos_label=1)
+#     )
+
+#     with pytest.raises(ValueError):
+#         _get_binary_score(lr, X_test, method='foo')
+
+#     # classifier that does not have a decision_function
+#     rf = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)
+#     y_pred_proba_rf = rf.predict_proba(X_test)
+#     assert_array_equal(
+#         y_pred_proba_rf[:, 1],
+#         _get_binary_score(rf, X_test, method=None, pos_label=1)
+#     )
+
+#     X_non_binary, y_non_binary = make_classification(
+#         n_samples=20, n_features=6, random_state=42, n_classes=4,
+#         n_informative=4
+#     )
+
+#     rf_non_bin = RandomForestClassifier(n_estimators=10).fit(X_non_binary,
+#                                                              y_non_binary)
+#     with pytest.raises(ValueError):
+#         _get_binary_score(rf_non_bin, X_non_binary)
+
+
+class MockNoPredictorClassifier:
+    pass
+
+
+@pytest.mark.parametrize(
+    "Estimator, params, err_type, err_msg",
+    [
+        (LogisticRegression, {"method": "xxx"}, ValueError,
+         "'method' should be one of"),
+        (MockNoPredictorClassifier, {"method": "auto"}, TypeError,
+         "'base_estimator' must implement one of the"),
+        (RandomForestRegressor, {"method": "predict_proba"}, TypeError,
+         "'base_estimator' does not implement predict_proba"),
+        (LogisticRegression, {"strategy": "xxx"}, ValueError,
+         "'strategy' must be of"),
+        (LogisticRegression, {"strategy": "max_tpr"}, TypeError,
+         "When strategy is max_tpr or max_tnr"),
+        (LogisticRegression, {"strategy": "f_beta"}, TypeError,
+         "When strategy is f_beta"),
+        (LogisticRegression, {"strategy": "max_tnr"}, TypeError,
+         "When strategy is max_tpr or max_tnr"),
+        (LogisticRegression, {"strategy": "max_tpr", "threshold": 10},
+         ValueError, r"threshold should be in the range \[0, 1\]"),
+        (LogisticRegression, {"strategy": "max_tnr", "threshold": 10},
+         ValueError, r"threshold should be in the range \[0, 1\]"),
+        (LogisticRegression,
+         {"strategy": "constant", "method": "predict_proba",
+          "decision_threshold": 10},
+         ValueError, r"decision_threshold should be in the range \[0, 1\]")
+    ]
+)
+def test_cutoffclassifier_valid_params_error(Estimator, params, err_type,
+                                             err_msg):
     X, y = make_classification(n_samples=200, n_features=6, random_state=42,
                                n_classes=2)
-
-    X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.4,
-                                                   random_state=42)
-    lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)
-    y_pred_proba = lr.predict_proba(X_test)
-    y_pred_score = lr.decision_function(X_test)
-
-    assert_array_equal(
-        y_pred_score, _get_binary_score(
-            lr, X_test, method='decision_function', pos_label=1)
-    )
-
-    assert_array_equal(
-        - y_pred_score, _get_binary_score(
-            lr, X_test, method='decision_function', pos_label=0)
-    )
-
-    assert_array_equal(
-        y_pred_proba[:, 1], _get_binary_score(
-            lr, X_test, method='predict_proba', pos_label=1)
-    )
-
-    assert_array_equal(
-        y_pred_proba[:, 0], _get_binary_score(
-            lr, X_test, method='predict_proba', pos_label=0)
-    )
-
-    assert_array_equal(
-        y_pred_score,
-        _get_binary_score(lr, X_test, method=None, pos_label=1)
-    )
-
-    with pytest.raises(ValueError):
-        _get_binary_score(lr, X_test, method='foo')
-
-    # classifier that does not have a decision_function
-    rf = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)
-    y_pred_proba_rf = rf.predict_proba(X_test)
-    assert_array_equal(
-        y_pred_proba_rf[:, 1],
-        _get_binary_score(rf, X_test, method=None, pos_label=1)
-    )
-
-    X_non_binary, y_non_binary = make_classification(
-        n_samples=20, n_features=6, random_state=42, n_classes=4,
-        n_informative=4
-    )
-
-    rf_non_bin = RandomForestClassifier(n_estimators=10).fit(X_non_binary,
-                                                             y_non_binary)
-    with pytest.raises(ValueError):
-        _get_binary_score(rf_non_bin, X_non_binary)
+    with pytest.raises(err_type, match=err_msg):
+        clf = CutoffClassifier(base_estimator=Estimator(), **params)
+        clf.fit(X, y)

From 420df8dfa30c3408bd15c578bef60ad0c0ce09af Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Mon, 24 Feb 2020 19:21:15 +0100
Subject: [PATCH 08/44] xxx

---
 sklearn/calibration.py            | 237 ++++++++----------------
 sklearn/tests/test_calibration.py | 292 +++++++++++++++---------------
 2 files changed, 226 insertions(+), 303 deletions(-)

diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index 46f4af9b97e1d..7177d1629c25a 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -30,6 +30,7 @@
 from .metrics import precision_recall_curve
 from .metrics import roc_curve
 from .model_selection import check_cv
+from .model_selection import cross_val_predict
 from .preprocessing import label_binarize
 from .preprocessing import LabelBinarizer
 from .svm import LinearSVC
@@ -631,66 +632,27 @@ class CutoffClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
     Parameters
     ----------
     base_estimator : estimator instance
-        The binary classifier whose decision threshold will be adapted
-        according to the acquired cutoff point. The estimator must implement
-        `decision_function` or `predict_proba` function.
+        doc
 
-    strategy : {"roc", "f_beta", "max_tpr", "max_tnr", "constant"}, \
-            default="roc" The strategy to use for choosing the cutoff point.
+    objective_name : {...}, default=...
+        doc
 
-        - "roc" selects the point on the ROC curve that is closest to the ideal
-          corner (0, 1).
+    object_value : float, default=None
+        doc
 
-        - "f_beta" selects a decision threshold that maximizes the `f_beta`
-          score.
+    method : {"auto", "decision_function", "predict_proba"}, default="auto"
+        doc
 
-        - "max_tpr" selects the point that yields the highest true positive
-          rate (TPR) with true negative rate (TNR) at least equal to the value
-          of the parameter threshold.
-
-        - "max_tnr" selects the point that yields the highest true negative
-          rate (TNR) with true positive rate (TPR) at least equal to the value
-          of the parameter threshold.
-
-        - "constant" will use the threshold specified by the parameter
-          `decision_threshold`.
-
-    method : {"auto", "decision_function", "predict_proba"}, default="auto" The
-        method to be used to get the predictions. If `"auto"` (default), the
-        base estimator will try to invoke `decision_function` or
-        `predict_proba`, in that order.
-
-    beta : float in [0, 1], optional (default=None) beta value to be used in
-        case strategy == 'f_beta'
-
-    threshold : float in [0, 1] or None, (default=None) In case strategy is
-        'max_tpr' or 'max_tnr' this parameter must be set to specify the
-        threshold for the true negative rate or true positive rate respectively
-        that needs to be achieved
-
-    decision_threshold : float, default=0.5
-        When `strategy="constant"`, decision threshold used as cutoff point.
-
-    pos_label : object, optional (default=1) Object representing the positive
-        label
-
-    cv : int, cross-validation generator, iterable or 'prefit', optional
-        (default=3). Determines the cross-validation splitting strategy. If
-        cv='prefit' the base estimator is assumed to be fitted and all data
-        will be used for the calibration of the probability threshold
+    pos_label : int or str, default=None
+        doc
 
     Attributes
     ----------
-    decision_threshold_ : float Decision threshold for the positive class.
-        Determines the output of predict
+    decision_threshold_ : float
+        The new decision threshold.
 
-    std_ : float Standard deviation of the obtained decision thresholds for
-        when the provided base estimator is not pre-trained and the
-        decision_threshold_ is computed as the mean of the decision threshold
-        of each cross-validation iteration. If the base estimator is
-        pre-trained then std_ = None
-
-    classes_ : array, shape (n_classes) The class labels.
+    classes_ : array of shape (n_classes,)
+        The class labels.
 
     References
     ----------
@@ -699,17 +661,14 @@ class CutoffClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
            Clinical chemistry, 1993
 
     """
-    def __init__(self, base_estimator, strategy="roc", method="auto",
-                 beta=None, threshold=None, decision_threshold=0.5,
-                 pos_label=1, cv=3):
+
+    def __init__(self, base_estimator, objective_name, objective_value=None,
+                 method="auto", pos_label=None):
         self.base_estimator = base_estimator
-        self.strategy = strategy
+        self.objective_name = objective_name
+        self.objective_value = objective_value
         self.method = method
-        self.beta = beta
-        self.threshold = threshold
-        self.decision_threshold = decision_threshold
         self.pos_label = pos_label
-        self.cv = cv
 
     def _validate_parameters(self):
         """Validate the input parameters."""
@@ -738,43 +697,7 @@ def _validate_parameters(self):
                 )
             self._method = self.method
 
-        strategies = ("roc", "f_beta", "max_tpr", "max_tnr", "constant")
-        if self.strategy not in strategies:
-            raise ValueError(
-                f"'strategy' must be of {', '.join(strategies)}. "
-                f"Got {self.strategy} instead."
-            )
-        elif self.strategy in ("max_tpr", "max_tnr"):
-            if not isinstance(self.threshold, numbers.Real):
-                raise TypeError(
-                    "When strategy is max_tpr or max_tnr, threshold should be "
-                    f"a real in [0, 1]. Got {type(self.threshold)} instead."
-                )
-            elif not (0 < self.threshold < 1):
-                raise ValueError(
-                    f"threshold should be in the range [0, 1]. "
-                    f"Got {self.threshold} instead."
-                )
-        elif self.strategy == "f_beta":
-            if not isinstance(self.beta, numbers.Real):
-                raise TypeError(
-                    "When strategy is f_beta, beta should be a real. "
-                    f"Got {type(self.beta)} instead."
-                )
-        elif self.strategy == "constant":
-            if (self.method == "predict_proba" and
-                    not (0 < self.decision_threshold < 1)):
-                raise ValueError(
-                    f"decision_threshold should be in the range [0, 1] when "
-                    f"using 'predict_proba'. Got {self.decision_threshold} "
-                    "instead."
-                )
-
     def _validate_data(self, X, y):
-        X = check_array(
-            X, accept_sparse=['csc', 'csr'], force_all_finite=False,
-            allow_nd=True,
-        )
         y = check_array(y, ensure_2d=False, dtype=None)
         check_classification_targets(y)
         y_type = type_of_target(y)
@@ -801,41 +724,31 @@ def fit(self, X, y):
         self._validate_parameters()
         X, y = self._validate_data(X, y)
 
-        self._label_encoder = LabelEncoder().fit(y)
-        self.classes_ = self._label_encoder.classes_
-
         try:
-            check_is_fitted(self.base_estimator)
-            self._base_estimator = deepcopy(self.base_estimator)
+            check_is_fitted(self.base_estimator, attributes=["n_classes_"])
+            self._estimator = self.base_estimator
         except NotFittedError:
-            self._base_estimator = clone(self.base_estimator).fit(X, y)
-
-        if self.strategy == "constant":
-            self.decision_threshold_ = self.decision_threshold
-        elif self.cv == 'prefit':
-            self.decision_threshold_ = _find_optimal_decision_threshold(
-                self._base_estimator, X, y, self.strategy, self._method,
-                self.beta, self.threshold, self.pos_label
-            )
-            self.std_ = None
-        else:
-            cv = check_cv(self.cv, y, classifier=True)
-            decision_thresholds = []
+            self._estimator = clone(self.base_estimator).fit(X, y)
+        self.classes_ = self._estimator.classes_
 
-            for train, test in cv.split(X, y):
-                estimator = clone(self._base_estimator).fit(
-                    _safe_indexing(X, train), _safe_indexing(y, train)
-                )
-                decision_thresholds.append(
-                    _find_optimal_decision_threshold(
-                        estimator,
-                        _safe_indexing(X, test), _safe_indexing(y, test),
-                        self.strategy, self._method, self.beta, self.threshold,
-                        self.pos_label
-                    )
+        y_score = getattr(self._estimator, self._method)(X)
+        if self.objective_name in ("precision", "recall"):
+            precision, recall, threshold = precision_recall_curve(
+                y, y_score, pos_label=self.pos_label
+            )
+            if self.objective_name == "precision":
+                # precision is ordered in increasing order
+                indices = np.flatnonzero(precision >= self.objective_value)
+                self.decision_threshold_ = \
+                    threshold[indices[np.argmax(recall[indices])]]
+            else:
+                # recall is ordered in descending order
+                higher_bound_idx = recall.size - np.searchsorted(
+                    recall[::-1], self.objective_value
                 )
-            self.decision_threshold_ = np.mean(decision_thresholds)
-            self.std_ = np.std(decision_thresholds)
+                max_precision_idx = np.argmax(precision[:higher_bound_idx])
+                self.decision_threshold_ = \
+                    threshold[:higher_bound_idx][max_precision_idx]
         return self
 
     def predict(self, X):
@@ -853,46 +766,46 @@ def predict(self, X):
         """
         check_is_fitted(self)
 
-        y_score = _get_binary_score(
-            self._base_estimator, X, self._method, self.pos_label
-        )
-        return self._label_encoder.inverse_transform(
-            (y_score > self.decision_threshold_).astype(int)
-        )
+        # y_score = _get_binary_score(
+        #     self._base_estimator, X, self._method, self.pos_label
+        # )
+        # return self._label_encoder.inverse_transform(
+        #     (y_score > self.decision_threshold_).astype(int)
+        # )
 
     def _more_tags(self):
         return {"binary_only": True}
 
 
-def _find_optimal_decision_threshold(estimator, X, y, strategy, method, beta,
-                                     threshold, pos_label):
-    y_score = _get_binary_score(
-        estimator, X, method=method, pos_label=pos_label
-    )
-    if strategy == 'f_beta':
-        precision, recall, thresholds = precision_recall_curve(
-            y, y_score, pos_label=pos_label
-        )
-        f_beta = ((1 + beta ** 2) * (precision * recall) /
-                  (beta ** 2 * precision + recall))
-        return thresholds[np.argmax(f_beta)]
-
-    fpr, tpr, thresholds = roc_curve(y, y_score, pos_label=pos_label)
-
-    if strategy == 'roc':
-        # we find the threshold of the point (fpr, tpr) with the smallest
-        # euclidean distance from the "ideal" corner (0, 1)
-        return thresholds[np.argmin(fpr ** 2 + (tpr - 1) ** 2)]
-    elif strategy == 'max_tpr':
-        indices = np.where(1 - fpr >= threshold)[0]
-        max_tpr_index = np.argmax(tpr[indices])
-        return thresholds[indices[max_tpr_index]]
-    indices = np.where(tpr >= threshold)[0]
-    max_tnr_index = np.argmax(1 - fpr[indices])
-    return thresholds[indices[max_tnr_index]]
-
-
-def _get_binary_score(estimator, X, method, pos_label):
+# def _find_optimal_decision_threshold(estimator, X, y, strategy, method, beta,
+#                                      threshold, pos_label):
+#     y_score = _get_binary_score(
+#         estimator, X, method=method, pos_label=pos_label
+#     )
+#     if strategy == 'f_beta':
+#         precision, recall, thresholds = precision_recall_curve(
+#             y, y_score, pos_label=pos_label
+#         )
+#         f_beta = ((1 + beta ** 2) * (precision * recall) /
+#                   (beta ** 2 * precision + recall))
+#         return thresholds[np.argmax(f_beta)]
+
+#     fpr, tpr, thresholds = roc_curve(y, y_score, pos_label=pos_label)
+
+#     if strategy == 'roc':
+#         # we find the threshold of the point (fpr, tpr) with the smallest
+#         # euclidean distance from the "ideal" corner (0, 1)
+#         return thresholds[np.argmin(fpr ** 2 + (tpr - 1) ** 2)]
+#     elif strategy == 'max_tpr':
+#         indices = np.where(1 - fpr >= threshold)[0]
+#         max_tpr_index = np.argmax(tpr[indices])
+#         return thresholds[indices[max_tpr_index]]
+#     indices = np.where(tpr >= threshold)[0]
+#     max_tnr_index = np.argmax(1 - fpr[indices])
+#     return thresholds[indices[max_tnr_index]]
+
+
+def _get_prediction(estimator, X, method, pos_label):
     """Binary classification score for the positive label (0 or 1)
 
     Returns the score that a binary classifier outputs for the positive label
diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py
index b017547f595e3..7563a13ae6da6 100644
--- a/sklearn/tests/test_calibration.py
+++ b/sklearn/tests/test_calibration.py
@@ -29,7 +29,6 @@
 
 from sklearn.calibration import CalibratedClassifierCV
 from sklearn.calibration import CutoffClassifier
-from sklearn.calibration import _get_binary_score
 from sklearn.calibration import _sigmoid_calibration, _SigmoidCalibration
 from sklearn.calibration import calibration_curve
 
@@ -353,137 +352,137 @@ def decision_function(self, X):
     calibrated_clf.fit(X, y)
 
 
-def test_cutoff_prefit():
-    calibration_samples = 200
-    X, y = make_classification(n_samples=1000, n_features=6, random_state=42,
-                               n_classes=2)
+# def test_cutoff_prefit():
+#     calibration_samples = 200
+#     X, y = make_classification(n_samples=1000, n_features=6, random_state=42,
+#                                n_classes=2)
 
-    X_train, X_test, y_train, y_test = train_test_split(X, y,
-                                                        test_size=0.4,
-                                                        random_state=42)
-    lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)
+#     X_train, X_test, y_train, y_test = train_test_split(X, y,
+#                                                         test_size=0.4,
+#                                                         random_state=42)
+#     lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)
 
-    clf_roc = CutoffClassifier(lr, strategy='roc', cv='prefit').fit(
-        X_test[:calibration_samples], y_test[:calibration_samples]
-    )
+#     clf_roc = CutoffClassifier(lr, strategy='roc', cv='prefit').fit(
+#         X_test[:calibration_samples], y_test[:calibration_samples]
+#     )
 
-    y_pred = lr.predict(X_test[calibration_samples:])
-    y_pred_roc = clf_roc.predict(X_test[calibration_samples:])
+#     y_pred = lr.predict(X_test[calibration_samples:])
+#     y_pred_roc = clf_roc.predict(X_test[calibration_samples:])
 
-    tn, fp, fn, tp = confusion_matrix(
-        y_test[calibration_samples:], y_pred).ravel()
-    tn_roc, fp_roc, fn_roc, tp_roc = confusion_matrix(
-        y_test[calibration_samples:], y_pred_roc).ravel()
+#     tn, fp, fn, tp = confusion_matrix(
+#         y_test[calibration_samples:], y_pred).ravel()
+#     tn_roc, fp_roc, fn_roc, tp_roc = confusion_matrix(
+#         y_test[calibration_samples:], y_pred_roc).ravel()
 
-    tpr = tp / (tp + fn)
-    tnr = tn / (tn + fp)
+#     tpr = tp / (tp + fn)
+#     tnr = tn / (tn + fp)
 
-    tpr_roc = tp_roc / (tp_roc + fn_roc)
-    tnr_roc = tn_roc / (tn_roc + fp_roc)
+#     tpr_roc = tp_roc / (tp_roc + fn_roc)
+#     tnr_roc = tn_roc / (tn_roc + fp_roc)
 
-    # check that the sum of tpr and tnr has improved
-    assert tpr_roc + tnr_roc > tpr + tnr
+#     # check that the sum of tpr and tnr has improved
+#     assert tpr_roc + tnr_roc > tpr + tnr
 
-    clf_f1 = CutoffClassifier(
-        lr, strategy='f_beta', method='predict_proba', beta=1,
-        cv='prefit').fit(
-        X_test[:calibration_samples], y_test[:calibration_samples]
-    )
+#     clf_f1 = CutoffClassifier(
+#         lr, strategy='f_beta', method='predict_proba', beta=1,
+#         cv='prefit').fit(
+#         X_test[:calibration_samples], y_test[:calibration_samples]
+#     )
 
-    y_pred_f1 = clf_f1.predict(X_test[calibration_samples:])
-    assert (f1_score(y_test[calibration_samples:], y_pred_f1) >
-            f1_score(y_test[calibration_samples:], y_pred))
+#     y_pred_f1 = clf_f1.predict(X_test[calibration_samples:])
+#     assert (f1_score(y_test[calibration_samples:], y_pred_f1) >
+#             f1_score(y_test[calibration_samples:], y_pred))
 
-    clf_fbeta = CutoffClassifier(
-        lr, strategy='f_beta', method='predict_proba', beta=2,
-        cv='prefit').fit(
-        X_test[:calibration_samples], y_test[:calibration_samples]
-    )
+#     clf_fbeta = CutoffClassifier(
+#         lr, strategy='f_beta', method='predict_proba', beta=2,
+#         cv='prefit').fit(
+#         X_test[:calibration_samples], y_test[:calibration_samples]
+#     )
 
-    y_pred_fbeta = clf_fbeta.predict(X_test[calibration_samples:])
-    assert (recall_score(y_test[calibration_samples:], y_pred_fbeta) >
-            recall_score(y_test[calibration_samples:], y_pred))
+#     y_pred_fbeta = clf_fbeta.predict(X_test[calibration_samples:])
+#     assert (recall_score(y_test[calibration_samples:], y_pred_fbeta) >
+#             recall_score(y_test[calibration_samples:], y_pred))
 
-    clf_max_tpr = CutoffClassifier(
-        lr, strategy='max_tpr', threshold=0.7, cv='prefit'
-    ).fit(X_test[:calibration_samples], y_test[:calibration_samples])
+#     clf_max_tpr = CutoffClassifier(
+#         lr, strategy='max_tpr', threshold=0.7, cv='prefit'
+#     ).fit(X_test[:calibration_samples], y_test[:calibration_samples])
 
-    y_pred_max_tpr = clf_max_tpr.predict(X_test[calibration_samples:])
+#     y_pred_max_tpr = clf_max_tpr.predict(X_test[calibration_samples:])
 
-    tn_max_tpr, fp_max_tpr, fn_max_tpr, tp_max_tpr = confusion_matrix(
-        y_test[calibration_samples:], y_pred_max_tpr).ravel()
+#     tn_max_tpr, fp_max_tpr, fn_max_tpr, tp_max_tpr = confusion_matrix(
+#         y_test[calibration_samples:], y_pred_max_tpr).ravel()
 
-    tpr_max_tpr = tp_max_tpr / (tp_max_tpr + fn_max_tpr)
-    tnr_max_tpr = tn_max_tpr / (tn_max_tpr + fp_max_tpr)
+#     tpr_max_tpr = tp_max_tpr / (tp_max_tpr + fn_max_tpr)
+#     tnr_max_tpr = tn_max_tpr / (tn_max_tpr + fp_max_tpr)
 
-    # check that the tpr increases with tnr >= min_val_tnr
-    assert tpr_max_tpr > tpr
-    assert tpr_max_tpr > tpr_roc
-    assert tnr_max_tpr >= 0.7
+#     # check that the tpr increases with tnr >= min_val_tnr
+#     assert tpr_max_tpr > tpr
+#     assert tpr_max_tpr > tpr_roc
+#     assert tnr_max_tpr >= 0.7
 
-    clf_max_tnr = CutoffClassifier(
-        lr, strategy='max_tnr', threshold=0.7, cv='prefit'
-    ).fit(X_test[:calibration_samples], y_test[:calibration_samples])
+#     clf_max_tnr = CutoffClassifier(
+#         lr, strategy='max_tnr', threshold=0.7, cv='prefit'
+#     ).fit(X_test[:calibration_samples], y_test[:calibration_samples])
 
-    y_pred_clf = clf_max_tnr.predict(X_test[calibration_samples:])
+#     y_pred_clf = clf_max_tnr.predict(X_test[calibration_samples:])
 
-    tn_clf, fp_clf, fn_clf, tp_clf = confusion_matrix(
-        y_test[calibration_samples:], y_pred_clf).ravel()
+#     tn_clf, fp_clf, fn_clf, tp_clf = confusion_matrix(
+#         y_test[calibration_samples:], y_pred_clf).ravel()
 
-    tnr_clf_max_tnr = tn_clf / (tn_clf + fp_clf)
-    tpr_clf_max_tnr = tp_clf / (tp_clf + fn_clf)
+#     tnr_clf_max_tnr = tn_clf / (tn_clf + fp_clf)
+#     tpr_clf_max_tnr = tp_clf / (tp_clf + fn_clf)
 
-    # check that the tnr increases with tpr >= min_val_tpr
-    assert tnr_clf_max_tnr > tnr
-    assert tnr_clf_max_tnr > tnr_roc
-    assert tpr_clf_max_tnr >= 0.7
+#     # check that the tnr increases with tpr >= min_val_tpr
+#     assert tnr_clf_max_tnr > tnr
+#     assert tnr_clf_max_tnr > tnr_roc
+#     assert tpr_clf_max_tnr >= 0.7
 
-    # check error cases
-    clf_bad_base_estimator = CutoffClassifier([])
-    with pytest.raises(TypeError):
-        clf_bad_base_estimator.fit(X_train, y_train)
+#     # check error cases
+#     clf_bad_base_estimator = CutoffClassifier([])
+#     with pytest.raises(TypeError):
+#         clf_bad_base_estimator.fit(X_train, y_train)
 
-    X_non_binary, y_non_binary = make_classification(
-        n_samples=20, n_features=6, random_state=42, n_classes=4,
-        n_informative=4
-    )
-    with pytest.raises(ValueError):
-        clf_roc.fit(X_non_binary, y_non_binary)
-
-
-def test_cutoff_cv():
-    X, y = make_classification(n_samples=1000, n_features=6, random_state=42,
-                               n_classes=2)
-
-    X_train, X_test, y_train, y_test = train_test_split(X, y,
-                                                        test_size=0.4,
-                                                        random_state=42)
-    lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)
-    clf_roc = CutoffClassifier(LogisticRegression(solver='liblinear'),
-                               strategy='roc',
-                               cv=3).fit(
-        X_train, y_train
-    )
+#     X_non_binary, y_non_binary = make_classification(
+#         n_samples=20, n_features=6, random_state=42, n_classes=4,
+#         n_informative=4
+#     )
+#     with pytest.raises(ValueError):
+#         clf_roc.fit(X_non_binary, y_non_binary)
 
-    assert clf_roc.decision_threshold_ != 0
-    assert clf_roc.std_ is not None and clf_roc.std_ != 0
 
-    y_pred = lr.predict(X_test)
-    y_pred_roc = clf_roc.predict(X_test)
+# def test_cutoff_cv():
+#     X, y = make_classification(n_samples=1000, n_features=6, random_state=42,
+#                                n_classes=2)
+
+#     X_train, X_test, y_train, y_test = train_test_split(X, y,
+#                                                         test_size=0.4,
+#                                                         random_state=42)
+#     lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)
+#     clf_roc = CutoffClassifier(LogisticRegression(solver='liblinear'),
+#                                strategy='roc',
+#                                cv=3).fit(
+#         X_train, y_train
+#     )
+
+#     assert clf_roc.decision_threshold_ != 0
+#     assert clf_roc.std_ is not None and clf_roc.std_ != 0
 
-    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
-    tn_roc, fp_roc, fn_roc, tp_roc = confusion_matrix(
-        y_test, y_pred_roc
-    ).ravel()
+#     y_pred = lr.predict(X_test)
+#     y_pred_roc = clf_roc.predict(X_test)
 
-    tpr = tp / (tp + fn)
-    tnr = tn / (tn + fp)
+#     tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
+#     tn_roc, fp_roc, fn_roc, tp_roc = confusion_matrix(
+#         y_test, y_pred_roc
+#     ).ravel()
 
-    tpr_roc = tp_roc / (tp_roc + fn_roc)
-    tnr_roc = tn_roc / (tn_roc + fp_roc)
+#     tpr = tp / (tp + fn)
+#     tnr = tn / (tn + fp)
 
-    # check that the sum of tpr + tnr has improved
-    assert tpr_roc + tnr_roc > tpr + tnr
+#     tpr_roc = tp_roc / (tp_roc + fn_roc)
+#     tnr_roc = tn_roc / (tn_roc + fp_roc)
+
+#     # check that the sum of tpr + tnr has improved
+#     assert tpr_roc + tnr_roc > tpr + tnr
 
 
 # def test_get_binary_score():
@@ -543,41 +542,52 @@ def test_cutoff_cv():
 #         _get_binary_score(rf_non_bin, X_non_binary)
 
 
-class MockNoPredictorClassifier:
-    pass
-
-
-@pytest.mark.parametrize(
-    "Estimator, params, err_type, err_msg",
-    [
-        (LogisticRegression, {"method": "xxx"}, ValueError,
-         "'method' should be one of"),
-        (MockNoPredictorClassifier, {"method": "auto"}, TypeError,
-         "'base_estimator' must implement one of the"),
-        (RandomForestRegressor, {"method": "predict_proba"}, TypeError,
-         "'base_estimator' does not implement predict_proba"),
-        (LogisticRegression, {"strategy": "xxx"}, ValueError,
-         "'strategy' must be of"),
-        (LogisticRegression, {"strategy": "max_tpr"}, TypeError,
-         "When strategy is max_tpr or max_tnr"),
-        (LogisticRegression, {"strategy": "f_beta"}, TypeError,
-         "When strategy is f_beta"),
-        (LogisticRegression, {"strategy": "max_tnr"}, TypeError,
-         "When strategy is max_tpr or max_tnr"),
-        (LogisticRegression, {"strategy": "max_tpr", "threshold": 10},
-         ValueError, r"threshold should be in the range \[0, 1\]"),
-        (LogisticRegression, {"strategy": "max_tnr", "threshold": 10},
-         ValueError, r"threshold should be in the range \[0, 1\]"),
-        (LogisticRegression,
-         {"strategy": "constant", "method": "predict_proba",
-          "decision_threshold": 10},
-         ValueError, r"decision_threshold should be in the range \[0, 1\]")
-    ]
-)
-def test_cutoffclassifier_valid_params_error(Estimator, params, err_type,
-                                             err_msg):
-    X, y = make_classification(n_samples=200, n_features=6, random_state=42,
-                               n_classes=2)
-    with pytest.raises(err_type, match=err_msg):
-        clf = CutoffClassifier(base_estimator=Estimator(), **params)
-        clf.fit(X, y)
+# class MockNoPredictorClassifier:
+#     pass
+
+
+# @pytest.mark.parametrize(
+#     "Estimator, params, err_type, err_msg",
+#     [
+#         (LogisticRegression, {"method": "xxx"}, ValueError,
+#          "'method' should be one of"),
+#         (MockNoPredictorClassifier, {"method": "auto"}, TypeError,
+#          "'base_estimator' must implement one of the"),
+#         (RandomForestRegressor, {"method": "predict_proba"}, TypeError,
+#          "'base_estimator' does not implement predict_proba"),
+#         (LogisticRegression, {"strategy": "xxx"}, ValueError,
+#          "'strategy' must be of"),
+#         (LogisticRegression, {"strategy": "max_tpr"}, TypeError,
+#          "When strategy is max_tpr or max_tnr"),
+#         (LogisticRegression, {"strategy": "f_beta"}, TypeError,
+#          "When strategy is f_beta"),
+#         (LogisticRegression, {"strategy": "max_tnr"}, TypeError,
+#          "When strategy is max_tpr or max_tnr"),
+#         (LogisticRegression, {"strategy": "max_tpr", "threshold": 10},
+#          ValueError, r"threshold should be in the range \[0, 1\]"),
+#         (LogisticRegression, {"strategy": "max_tnr", "threshold": 10},
+#          ValueError, r"threshold should be in the range \[0, 1\]"),
+#         (LogisticRegression,
+#          {"strategy": "constant", "method": "predict_proba",
+#           "decision_threshold": 10},
+#          ValueError, r"decision_threshold should be in the range \[0, 1\]")
+#     ]
+# )
+# def test_cutoffclassifier_valid_params_error(Estimator, params, err_type,
+#                                              err_msg):
+#     X, y = make_classification(n_samples=200, n_features=6, random_state=42,
+#                                n_classes=2)
+#     with pytest.raises(err_type, match=err_msg):
+#         clf = CutoffClassifier(base_estimator=Estimator(), **params)
+#         clf.fit(X, y)
+
+
+def test_cutoff_classifier():
+    from sklearn.datasets import load_breast_cancer
+    X, y = load_breast_cancer(return_X_y=True)
+    clf = LogisticRegression().fit(X, y)
+    clf_optimized = CutoffClassifier(
+        base_estimator=clf, objective_name="precision", objective_value=0.9
+    )
+    clf_optimized.fit(X, y)
+    print(clf_optimized.decision_threshold_)

From 6b36f68536b00d2ab4a6ae5346f096a15bf35352 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Tue, 25 Feb 2020 00:15:25 +0100
Subject: [PATCH 09/44] iter

---
 sklearn/calibration.py            | 213 ++++++++++----------
 sklearn/tests/test_calibration.py | 309 +++++++-----------------------
 2 files changed, 169 insertions(+), 353 deletions(-)

diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index 7177d1629c25a..c37803ff16abb 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -27,6 +27,7 @@
 from .base import clone
 from .exceptions import NotFittedError
 from .isotonic import IsotonicRegression
+from .metrics import balanced_accuracy_score
 from .metrics import precision_recall_curve
 from .metrics import roc_curve
 from .model_selection import check_cv
@@ -624,27 +625,40 @@ class CutoffClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
     traditional binary classification evaluation statistics such as the true
     positive and true negative rates and F-scores.
 
-    If cv="prefit" the base estimator is assumed to be fitted and all data will
-    be used for the selection of the cutoff point. Otherwise the decision
-    threshold is calculated as the average of the thresholds resulting from the
-    cross-validation loop.
-
     Parameters
     ----------
-    base_estimator : estimator instance
-        doc
+    base_estimator : estimator object
+        The classifier, fitted or not fitted, from which we want to optimize
+        the decision threshold used during `predict`.
+
+    objective_metric : {"tpr", "tnr"} or callable, \
+            default=balanced_accuracy_score
+        The objective metric to be optimized:
 
-    objective_name : {...}, default=...
-        doc
+        * `"tpr"`: Find the decision threshold for a true positive ratio (TPR)
+          of `objective_value`.
+        * `"tnr"`: Find the decision threshold for a true negative ratio (TNR)
+          of `objective_value`.
+        * one of the scikit-learn scoring metric.
 
-    object_value : float, default=None
-        doc
+    objective_value : float, default=None
+        The value associated with the `objective_metric` metric for which we
+        want to find the decision threshold.
 
     method : {"auto", "decision_function", "predict_proba"}, default="auto"
-        doc
+        Methods by the classifier `base_estimator` corresponding to the
+        decision function for which we want to find a threshold. It can be:
+
+        * if `"auto"`, it will try to invoke, for each estimator,
+          `"decision_function` or `"predict_proba"` in that order.
+        * otherwise, one of `"predict_proba"` or `"decision_function"`.
+          If the method is not implemented by the estimator, it will raise an
+          error.
 
     pos_label : int or str, default=None
-        doc
+        The label of the positive class. When `pos_label=None`, if `y_true` is
+        in `{-1, 1}` or `{0, 1}`, `pos_label` is set to 1, otherwise an error
+        will be raised.
 
     Attributes
     ----------
@@ -659,13 +673,18 @@ class CutoffClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
     .. [1] Receiver-operating characteristic (ROC) plots: a fundamental
            evaluation tool in clinical medicine, MH Zweig, G Campbell -
            Clinical chemistry, 1993
-
     """
 
-    def __init__(self, base_estimator, objective_name, objective_value=None,
-                 method="auto", pos_label=None):
+    def __init__(
+        self,
+        base_estimator,
+        objective_metric=balanced_accuracy_score,
+        objective_value=None,
+        method="auto",
+        pos_label=None
+    ):
         self.base_estimator = base_estimator
-        self.objective_name = objective_name
+        self.objective_metric = objective_metric
         self.objective_value = objective_value
         self.method = method
         self.pos_label = pos_label
@@ -696,8 +715,16 @@ def _validate_parameters(self):
                     f"'base_estimator' does not implement {self.method}."
                 )
             self._method = self.method
+        if (self.objective_metric not in ("tnr", "tpr") and
+                self.objective_value is not None):
+            raise ValueError(
+                f"When 'objective_metric' is a predefined scoring function, "
+                f"'objective_value' should be None. Got {self.objective_value}"
+                f" instead."
+            )
 
-    def _validate_data(self, X, y):
+    @staticmethod
+    def _validate_data(X, y):
         y = check_array(y, ensure_2d=False, dtype=None)
         check_classification_targets(y)
         y_type = type_of_target(y)
@@ -705,21 +732,41 @@ def _validate_data(self, X, y):
             raise ValueError(f'Expected target of binary type. Got {y_type}.')
         return X, y
 
+    @staticmethod
+    def _get_pos_label_score(y_score, classes, pos_label):
+        """Get score of the positive class."""
+        if y_score.ndim == 2:
+            pos_label_encoded = np.flatnonzero(classes == pos_label).item(0)
+            y_score = y_score[:, pos_label_encoded]
+        return y_score
+
+    def _optimize_scorer(self, estimator, y_true, y_score, scorer, classes,
+                         pos_label):
+        # `np.unique` is already sorting the value, no need to call
+        #  `thresholds.sort()`
+        thresholds = np.unique(
+            self._get_pos_label_score(y_score, classes, pos_label)
+        )
+        scores = [scorer(y_true, (y_score >= th).astype(int))
+                  for th in thresholds]
+        return thresholds[np.argmax(scores)]
+
     def fit(self, X, y):
-        """Fit model
+        """Find the decision threshold.
 
         Parameters
         ----------
-        X : array-like, shape (n_samples, n_features)
-            Training data
+        X : {array-like, sparse matrix, dataframe} of shape \
+                (n_samples, n_features)
+            The training data.
 
-        y : array-like, shape (n_samples,)
-            Target values. There must be two 2 distinct values
+        y : array-like of shape (n_samples,)
+            Target values. It should be a binary target.
 
         Returns
         -------
         self : object
-            Instance of self
+            Returns an instance of self.
         """
         self._validate_parameters()
         X, y = self._validate_data(X, y)
@@ -732,23 +779,22 @@ def fit(self, X, y):
         self.classes_ = self._estimator.classes_
 
         y_score = getattr(self._estimator, self._method)(X)
-        if self.objective_name in ("precision", "recall"):
-            precision, recall, threshold = precision_recall_curve(
+        if self.objective_metric in ("tpr", "tnr"):
+            fpr, tpr, thresholds = roc_curve(
                 y, y_score, pos_label=self.pos_label
             )
-            if self.objective_name == "precision":
-                # precision is ordered in increasing order
-                indices = np.flatnonzero(precision >= self.objective_value)
-                self.decision_threshold_ = \
-                    threshold[indices[np.argmax(recall[indices])]]
-            else:
-                # recall is ordered in descending order
-                higher_bound_idx = recall.size - np.searchsorted(
-                    recall[::-1], self.objective_value
-                )
-                max_precision_idx = np.argmax(precision[:higher_bound_idx])
-                self.decision_threshold_ = \
-                    threshold[:higher_bound_idx][max_precision_idx]
+            if self.objective_metric == "tnr":
+                tnr, thresholds = (1 - fpr)[::-1], thresholds[::-1]
+
+            threshold_idx = np.searchsorted(
+                eval(self.objective_metric), self.objective_value
+            )
+            self.decision_threshold_ = thresholds[threshold_idx]
+        else:
+            self.decision_threshold_ = self._optimize_scorer(
+                self._estimator, y, y_score, self.objective_metric,
+                self.classes_, self.pos_label,
+            )
         return self
 
     def predict(self, X):
@@ -756,89 +802,24 @@ def predict(self, X):
 
         Parameters
         ----------
-        X : array-like, shape (n_samples, n_features)
-            The samples
+        X : {array-like, sparse matrix, dataframe} of shape \
+                (n_samples, n_features)
+            The data matrix.
 
         Returns
         -------
-        C : array, shape (n_samples,)
-            The predicted class
+        C : ndarray of shape (n_samples,)
+            The predicted class.
         """
         check_is_fitted(self)
 
-        # y_score = _get_binary_score(
-        #     self._base_estimator, X, self._method, self.pos_label
-        # )
-        # return self._label_encoder.inverse_transform(
-        #     (y_score > self.decision_threshold_).astype(int)
-        # )
+        decision_function = getattr(self._estimator, self._method)
+        y_score = self._get_pos_label_score(
+            decision_function(X), self.classes_, self.pos_label
+        )
+        y_class_indices = (y_score >= self.decision_threshold_).astype(int)
+
+        return self._estimator.classes_[y_class_indices]
 
     def _more_tags(self):
         return {"binary_only": True}
-
-
-# def _find_optimal_decision_threshold(estimator, X, y, strategy, method, beta,
-#                                      threshold, pos_label):
-#     y_score = _get_binary_score(
-#         estimator, X, method=method, pos_label=pos_label
-#     )
-#     if strategy == 'f_beta':
-#         precision, recall, thresholds = precision_recall_curve(
-#             y, y_score, pos_label=pos_label
-#         )
-#         f_beta = ((1 + beta ** 2) * (precision * recall) /
-#                   (beta ** 2 * precision + recall))
-#         return thresholds[np.argmax(f_beta)]
-
-#     fpr, tpr, thresholds = roc_curve(y, y_score, pos_label=pos_label)
-
-#     if strategy == 'roc':
-#         # we find the threshold of the point (fpr, tpr) with the smallest
-#         # euclidean distance from the "ideal" corner (0, 1)
-#         return thresholds[np.argmin(fpr ** 2 + (tpr - 1) ** 2)]
-#     elif strategy == 'max_tpr':
-#         indices = np.where(1 - fpr >= threshold)[0]
-#         max_tpr_index = np.argmax(tpr[indices])
-#         return thresholds[indices[max_tpr_index]]
-#     indices = np.where(tpr >= threshold)[0]
-#     max_tnr_index = np.argmax(1 - fpr[indices])
-#     return thresholds[indices[max_tnr_index]]
-
-
-def _get_prediction(estimator, X, method, pos_label):
-    """Binary classification score for the positive label (0 or 1)
-
-    Returns the score that a binary classifier outputs for the positive label
-    acquired either from decision_function or predict_proba
-
-    Parameters
-    ----------
-    estimator : estimator object
-        Fitted estimator to get prediction from.
-
-    X : {array-like, sparse matrix} of shape (n_samples, n_features)
-        The data matrix.
-
-    pos_label : int or str
-        The positive label.
-
-    method : str or None, optional (default=None)
-        The method to be used for acquiring the score. Can either be
-        "decision_function" or "predict_proba" or None. If None then
-        decision_function will be used first and if not available
-        predict_proba
-
-    Returns
-    -------
-    y_score : array-like, shape (n_samples,)
-        The return value of the provided classifier's decision_function or
-        predict_proba depending on the method used.
-    """
-    # FIXME: what if estimator was fitted on encoded label??
-    y_score = getattr(estimator, method)(X)
-    if y_score.ndim == 2:
-        # probabilities
-        y_score = y_score[:, pos_label]
-    elif pos_label == estimator.classes_[0]:
-        y_score = -y_score
-    return y_score
diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py
index 7563a13ae6da6..d75b39c8cd703 100644
--- a/sklearn/tests/test_calibration.py
+++ b/sklearn/tests/test_calibration.py
@@ -6,13 +6,17 @@
 from scipy import sparse
 
 from sklearn.base import BaseEstimator
-from sklearn.model_selection import LeaveOneOut
-
-from sklearn.datasets import make_classification, make_blobs
-from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+from sklearn.datasets import load_breast_cancer
+from sklearn.datasets import load_iris
+from sklearn.datasets import make_blobs
+from sklearn.datasets import make_classification
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import RandomForestRegressor
 from sklearn.impute import SimpleImputer
 from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import LeaveOneOut
 from sklearn.model_selection import train_test_split
+from sklearn.metrics import balanced_accuracy_score
 from sklearn.metrics import brier_score_loss
 from sklearn.metrics import confusion_matrix
 from sklearn.metrics import f1_score
@@ -20,7 +24,10 @@
 from sklearn.metrics import recall_score
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.pipeline import Pipeline
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler
 from sklearn.svm import LinearSVC
+from sklearn.utils._testing import assert_allclose
 from sklearn.utils._testing import assert_array_almost_equal
 from sklearn.utils._testing import assert_almost_equal
 from sklearn.utils._testing import assert_array_equal
@@ -352,242 +359,70 @@ def decision_function(self, X):
     calibrated_clf.fit(X, y)
 
 
-# def test_cutoff_prefit():
-#     calibration_samples = 200
-#     X, y = make_classification(n_samples=1000, n_features=6, random_state=42,
-#                                n_classes=2)
-
-#     X_train, X_test, y_train, y_test = train_test_split(X, y,
-#                                                         test_size=0.4,
-#                                                         random_state=42)
-#     lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)
-
-#     clf_roc = CutoffClassifier(lr, strategy='roc', cv='prefit').fit(
-#         X_test[:calibration_samples], y_test[:calibration_samples]
-#     )
-
-#     y_pred = lr.predict(X_test[calibration_samples:])
-#     y_pred_roc = clf_roc.predict(X_test[calibration_samples:])
-
-#     tn, fp, fn, tp = confusion_matrix(
-#         y_test[calibration_samples:], y_pred).ravel()
-#     tn_roc, fp_roc, fn_roc, tp_roc = confusion_matrix(
-#         y_test[calibration_samples:], y_pred_roc).ravel()
-
-#     tpr = tp / (tp + fn)
-#     tnr = tn / (tn + fp)
-
-#     tpr_roc = tp_roc / (tp_roc + fn_roc)
-#     tnr_roc = tn_roc / (tn_roc + fp_roc)
-
-#     # check that the sum of tpr and tnr has improved
-#     assert tpr_roc + tnr_roc > tpr + tnr
-
-#     clf_f1 = CutoffClassifier(
-#         lr, strategy='f_beta', method='predict_proba', beta=1,
-#         cv='prefit').fit(
-#         X_test[:calibration_samples], y_test[:calibration_samples]
-#     )
-
-#     y_pred_f1 = clf_f1.predict(X_test[calibration_samples:])
-#     assert (f1_score(y_test[calibration_samples:], y_pred_f1) >
-#             f1_score(y_test[calibration_samples:], y_pred))
-
-#     clf_fbeta = CutoffClassifier(
-#         lr, strategy='f_beta', method='predict_proba', beta=2,
-#         cv='prefit').fit(
-#         X_test[:calibration_samples], y_test[:calibration_samples]
-#     )
-
-#     y_pred_fbeta = clf_fbeta.predict(X_test[calibration_samples:])
-#     assert (recall_score(y_test[calibration_samples:], y_pred_fbeta) >
-#             recall_score(y_test[calibration_samples:], y_pred))
-
-#     clf_max_tpr = CutoffClassifier(
-#         lr, strategy='max_tpr', threshold=0.7, cv='prefit'
-#     ).fit(X_test[:calibration_samples], y_test[:calibration_samples])
-
-#     y_pred_max_tpr = clf_max_tpr.predict(X_test[calibration_samples:])
-
-#     tn_max_tpr, fp_max_tpr, fn_max_tpr, tp_max_tpr = confusion_matrix(
-#         y_test[calibration_samples:], y_pred_max_tpr).ravel()
-
-#     tpr_max_tpr = tp_max_tpr / (tp_max_tpr + fn_max_tpr)
-#     tnr_max_tpr = tn_max_tpr / (tn_max_tpr + fp_max_tpr)
-
-#     # check that the tpr increases with tnr >= min_val_tnr
-#     assert tpr_max_tpr > tpr
-#     assert tpr_max_tpr > tpr_roc
-#     assert tnr_max_tpr >= 0.7
-
-#     clf_max_tnr = CutoffClassifier(
-#         lr, strategy='max_tnr', threshold=0.7, cv='prefit'
-#     ).fit(X_test[:calibration_samples], y_test[:calibration_samples])
-
-#     y_pred_clf = clf_max_tnr.predict(X_test[calibration_samples:])
-
-#     tn_clf, fp_clf, fn_clf, tp_clf = confusion_matrix(
-#         y_test[calibration_samples:], y_pred_clf).ravel()
-
-#     tnr_clf_max_tnr = tn_clf / (tn_clf + fp_clf)
-#     tpr_clf_max_tnr = tp_clf / (tp_clf + fn_clf)
+class MockNoPredictorClassifier:
+    pass
+
+
+@pytest.mark.parametrize(
+    "Estimator, params, err_type, err_msg",
+    [
+        (LogisticRegression, {"method": "xxx"}, ValueError,
+         "'method' should be one of"),
+        (MockNoPredictorClassifier, {"method": "auto"}, TypeError,
+         "'base_estimator' must implement one of the"),
+        (RandomForestRegressor, {"method": "predict_proba"}, TypeError,
+         "'base_estimator' does not implement predict_proba"),
+        (LogisticRegression,
+         {"objective_metric": "accuracy", "objective_value": 0.5}, ValueError,
+         "When 'objective_metric' is a predefined scoring function")
+    ]
+)
+def test_cutoffclassifier_valid_params_error(Estimator, params, err_type,
+                                             err_msg):
+    X, y = make_classification(n_samples=200, n_features=6, random_state=42,
+                               n_classes=2)
+    with pytest.raises(err_type, match=err_msg):
+        clf = CutoffClassifier(base_estimator=Estimator(), **params)
+        clf.fit(X, y)
+
+
+def test_cutoffclassifier_not_binary():
+    X, y = load_iris(return_X_y=True)
+    with pytest.raises(ValueError, match="Expected target of binary type."):
+        clf = CutoffClassifier(base_estimator=LogisticRegression()).fit(X, y)
+
+
+def test_cutoffclassifier_limit_tpr_tnr():
+    X, y = load_breast_cancer(return_X_y=True)
+    clf = CutoffClassifier(
+        base_estimator=make_pipeline(StandardScaler(), LogisticRegression()),
+        objective_metric="tpr",
+        objective_value=0,
+    )
+    y_pred_tpr = clf.fit(X, y).predict(X)
+    clf.set_params(objective_metric="tnr")
+    y_pred_tnr = (~clf.fit(X, y).predict(X).astype(bool)).astype(int)
+    assert_array_equal(y_pred_tnr, y_pred_tpr)
 
-#     # check that the tnr increases with tpr >= min_val_tpr
-#     assert tnr_clf_max_tnr > tnr
-#     assert tnr_clf_max_tnr > tnr_roc
-#     assert tpr_clf_max_tnr >= 0.7
 
-#     # check error cases
-#     clf_bad_base_estimator = CutoffClassifier([])
-#     with pytest.raises(TypeError):
-#         clf_bad_base_estimator.fit(X_train, y_train)
+def test_cutoffclassifier_with_objective_value():
+    X, y = load_breast_cancer(return_X_y=True)
+    # remove feature to degrade performances
+    X = X[:, :5]
 
-#     X_non_binary, y_non_binary = make_classification(
-#         n_samples=20, n_features=6, random_state=42, n_classes=4,
-#         n_informative=4
-#     )
-#     with pytest.raises(ValueError):
-#         clf_roc.fit(X_non_binary, y_non_binary)
+    # make the problem completely imbalanced such that the balanced accuracy
+    # is low
+    indices_pos = np.flatnonzero(y == 1)
+    indices_pos = indices_pos[:indices_pos.size // 50]
+    indices_neg = np.flatnonzero(y == 0)
 
+    X = np.vstack([X[indices_neg], X[indices_pos]])
+    y = np.hstack([y[indices_neg], y[indices_pos]])
 
-# def test_cutoff_cv():
-#     X, y = make_classification(n_samples=1000, n_features=6, random_state=42,
-#                                n_classes=2)
-
-#     X_train, X_test, y_train, y_test = train_test_split(X, y,
-#                                                         test_size=0.4,
-#                                                         random_state=42)
-#     lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)
-#     clf_roc = CutoffClassifier(LogisticRegression(solver='liblinear'),
-#                                strategy='roc',
-#                                cv=3).fit(
-#         X_train, y_train
-#     )
-
-#     assert clf_roc.decision_threshold_ != 0
-#     assert clf_roc.std_ is not None and clf_roc.std_ != 0
-
-#     y_pred = lr.predict(X_test)
-#     y_pred_roc = clf_roc.predict(X_test)
-
-#     tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
-#     tn_roc, fp_roc, fn_roc, tp_roc = confusion_matrix(
-#         y_test, y_pred_roc
-#     ).ravel()
-
-#     tpr = tp / (tp + fn)
-#     tnr = tn / (tn + fp)
-
-#     tpr_roc = tp_roc / (tp_roc + fn_roc)
-#     tnr_roc = tn_roc / (tn_roc + fp_roc)
-
-#     # check that the sum of tpr + tnr has improved
-#     assert tpr_roc + tnr_roc > tpr + tnr
-
-
-# def test_get_binary_score():
-#     X, y = make_classification(n_samples=200, n_features=6, random_state=42,
-#                                n_classes=2)
-
-#     X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.4,
-#                                                    random_state=42)
-#     lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)
-#     y_pred_proba = lr.predict_proba(X_test)
-#     y_pred_score = lr.decision_function(X_test)
-
-#     assert_array_equal(
-#         y_pred_score, _get_binary_score(
-#             lr, X_test, method='decision_function', pos_label=1)
-#     )
-
-#     assert_array_equal(
-#         - y_pred_score, _get_binary_score(
-#             lr, X_test, method='decision_function', pos_label=0)
-#     )
-
-#     assert_array_equal(
-#         y_pred_proba[:, 1], _get_binary_score(
-#             lr, X_test, method='predict_proba', pos_label=1)
-#     )
-
-#     assert_array_equal(
-#         y_pred_proba[:, 0], _get_binary_score(
-#             lr, X_test, method='predict_proba', pos_label=0)
-#     )
-
-#     assert_array_equal(
-#         y_pred_score,
-#         _get_binary_score(lr, X_test, method=None, pos_label=1)
-#     )
-
-#     with pytest.raises(ValueError):
-#         _get_binary_score(lr, X_test, method='foo')
-
-#     # classifier that does not have a decision_function
-#     rf = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)
-#     y_pred_proba_rf = rf.predict_proba(X_test)
-#     assert_array_equal(
-#         y_pred_proba_rf[:, 1],
-#         _get_binary_score(rf, X_test, method=None, pos_label=1)
-#     )
-
-#     X_non_binary, y_non_binary = make_classification(
-#         n_samples=20, n_features=6, random_state=42, n_classes=4,
-#         n_informative=4
-#     )
-
-#     rf_non_bin = RandomForestClassifier(n_estimators=10).fit(X_non_binary,
-#                                                              y_non_binary)
-#     with pytest.raises(ValueError):
-#         _get_binary_score(rf_non_bin, X_non_binary)
-
-
-# class MockNoPredictorClassifier:
-#     pass
-
-
-# @pytest.mark.parametrize(
-#     "Estimator, params, err_type, err_msg",
-#     [
-#         (LogisticRegression, {"method": "xxx"}, ValueError,
-#          "'method' should be one of"),
-#         (MockNoPredictorClassifier, {"method": "auto"}, TypeError,
-#          "'base_estimator' must implement one of the"),
-#         (RandomForestRegressor, {"method": "predict_proba"}, TypeError,
-#          "'base_estimator' does not implement predict_proba"),
-#         (LogisticRegression, {"strategy": "xxx"}, ValueError,
-#          "'strategy' must be of"),
-#         (LogisticRegression, {"strategy": "max_tpr"}, TypeError,
-#          "When strategy is max_tpr or max_tnr"),
-#         (LogisticRegression, {"strategy": "f_beta"}, TypeError,
-#          "When strategy is f_beta"),
-#         (LogisticRegression, {"strategy": "max_tnr"}, TypeError,
-#          "When strategy is max_tpr or max_tnr"),
-#         (LogisticRegression, {"strategy": "max_tpr", "threshold": 10},
-#          ValueError, r"threshold should be in the range \[0, 1\]"),
-#         (LogisticRegression, {"strategy": "max_tnr", "threshold": 10},
-#          ValueError, r"threshold should be in the range \[0, 1\]"),
-#         (LogisticRegression,
-#          {"strategy": "constant", "method": "predict_proba",
-#           "decision_threshold": 10},
-#          ValueError, r"decision_threshold should be in the range \[0, 1\]")
-#     ]
-# )
-# def test_cutoffclassifier_valid_params_error(Estimator, params, err_type,
-#                                              err_msg):
-#     X, y = make_classification(n_samples=200, n_features=6, random_state=42,
-#                                n_classes=2)
-#     with pytest.raises(err_type, match=err_msg):
-#         clf = CutoffClassifier(base_estimator=Estimator(), **params)
-#         clf.fit(X, y)
-
-
-def test_cutoff_classifier():
-    from sklearn.datasets import load_breast_cancer
-    X, y = load_breast_cancer(return_X_y=True)
-    clf = LogisticRegression().fit(X, y)
-    clf_optimized = CutoffClassifier(
-        base_estimator=clf, objective_name="precision", objective_value=0.9
+    lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y)
+    model = CutoffClassifier(
+        base_estimator=lr, objective_metric=balanced_accuracy_score
     )
-    clf_optimized.fit(X, y)
-    print(clf_optimized.decision_threshold_)
+    score_optimized = balanced_accuracy_score(y, model.fit(X, y).predict(X))
+    score_baseline = balanced_accuracy_score(y, lr.predict(X))
+    assert score_optimized > score_baseline

From 00da5f7441ea1c4e674938a220cdbd8ceb8c628f Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Tue, 25 Feb 2020 00:20:27 +0100
Subject: [PATCH 10/44] pep8

---
 sklearn/calibration.py            | 7 +------
 sklearn/tests/test_calibration.py | 7 +------
 2 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index c37803ff16abb..24b484594d7cd 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -7,9 +7,7 @@
 #
 # License: BSD 3 clause
 
-from copy import deepcopy
 from inspect import signature
-import numbers
 import warnings
 
 from math import log
@@ -28,10 +26,8 @@
 from .exceptions import NotFittedError
 from .isotonic import IsotonicRegression
 from .metrics import balanced_accuracy_score
-from .metrics import precision_recall_curve
 from .metrics import roc_curve
 from .model_selection import check_cv
-from .model_selection import cross_val_predict
 from .preprocessing import label_binarize
 from .preprocessing import LabelBinarizer
 from .svm import LinearSVC
@@ -39,7 +35,6 @@
 from .utils import check_array
 from .utils import column_or_1d
 from .utils import indexable
-from .utils import _safe_indexing
 from .utils.multiclass import check_classification_targets
 from .utils.multiclass import type_of_target
 from .utils.validation import check_is_fitted
@@ -784,7 +779,7 @@ def fit(self, X, y):
                 y, y_score, pos_label=self.pos_label
             )
             if self.objective_metric == "tnr":
-                tnr, thresholds = (1 - fpr)[::-1], thresholds[::-1]
+                tnr, thresholds = (1 - fpr)[::-1], thresholds[::-1]  # noqa
 
             threshold_idx = np.searchsorted(
                 eval(self.objective_metric), self.objective_value
diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py
index d75b39c8cd703..db79486b1b139 100644
--- a/sklearn/tests/test_calibration.py
+++ b/sklearn/tests/test_calibration.py
@@ -15,19 +15,14 @@
 from sklearn.impute import SimpleImputer
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import LeaveOneOut
-from sklearn.model_selection import train_test_split
 from sklearn.metrics import balanced_accuracy_score
 from sklearn.metrics import brier_score_loss
-from sklearn.metrics import confusion_matrix
-from sklearn.metrics import f1_score
 from sklearn.metrics import log_loss
-from sklearn.metrics import recall_score
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.pipeline import Pipeline
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler
 from sklearn.svm import LinearSVC
-from sklearn.utils._testing import assert_allclose
 from sklearn.utils._testing import assert_array_almost_equal
 from sklearn.utils._testing import assert_almost_equal
 from sklearn.utils._testing import assert_array_equal
@@ -389,7 +384,7 @@ def test_cutoffclassifier_valid_params_error(Estimator, params, err_type,
 def test_cutoffclassifier_not_binary():
     X, y = load_iris(return_X_y=True)
     with pytest.raises(ValueError, match="Expected target of binary type."):
-        clf = CutoffClassifier(base_estimator=LogisticRegression()).fit(X, y)
+        CutoffClassifier(base_estimator=LogisticRegression()).fit(X, y)
 
 
 def test_cutoffclassifier_limit_tpr_tnr():

From e8837e09b5b5b0c553d10479d94b252c937ca6de Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Tue, 25 Feb 2020 18:35:27 +0100
Subject: [PATCH 11/44] iter

---
 sklearn/calibration.py            | 112 ++++++++++++++++++++++--------
 sklearn/tests/test_calibration.py |  28 +++++++-
 sklearn/utils/estimator_checks.py |   2 +-
 3 files changed, 110 insertions(+), 32 deletions(-)

diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index 24b484594d7cd..37104df0f5d31 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -618,7 +618,8 @@ class CutoffClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
     Estimator that calibrates the decision threshold (cutoff point) that is
     used for prediction. The methods for picking cutoff points make use of
     traditional binary classification evaluation statistics such as the true
-    positive and true negative rates and F-scores.
+    positive and true negative rates or any metrics accepting true labels and
+    the output of a scoring functions from a scikit-learn estimator.
 
     Parameters
     ----------
@@ -636,6 +637,9 @@ class CutoffClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
           of `objective_value`.
         * one of the scikit-learn scoring metric.
 
+    objective_metric_params : dict, default=None
+        Some extra parameters to pass to `objective_metric`.
+
     objective_value : float, default=None
         The value associated with the `objective_metric` metric for which we
         want to find the decision threshold.
@@ -662,24 +666,20 @@ class CutoffClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
 
     classes_ : array of shape (n_classes,)
         The class labels.
-
-    References
-    ----------
-    .. [1] Receiver-operating characteristic (ROC) plots: a fundamental
-           evaluation tool in clinical medicine, MH Zweig, G Campbell -
-           Clinical chemistry, 1993
     """
 
     def __init__(
         self,
         base_estimator,
         objective_metric=balanced_accuracy_score,
+        objective_metric_params=None,
         objective_value=None,
         method="auto",
         pos_label=None
     ):
         self.base_estimator = base_estimator
         self.objective_metric = objective_metric
+        self.objective_metric_params = objective_metric_params
         self.objective_value = objective_value
         self.method = method
         self.pos_label = pos_label
@@ -718,6 +718,29 @@ def _validate_parameters(self):
                 f" instead."
             )
 
+        # ensure binary classification if pos_label is not specified
+        # classes.dtype.kind in ('O', 'U', 'S') is required to avoid
+        # triggering a FutureWarning by calling np.array_equal(a, b)
+        # when elements in the two arrays are not comparable.
+        classes = self.classes_
+        if (self.pos_label is None and (
+                classes.dtype.kind in ('O', 'U', 'S') or
+                not (np.array_equal(classes, [0, 1]) or
+                     np.array_equal(classes, [-1, 1]) or
+                     np.array_equal(classes, [0]) or
+                     np.array_equal(classes, [-1]) or
+                     np.array_equal(classes, [1])))):
+            classes_repr = ", ".join(repr(c) for c in classes)
+            raise ValueError(
+                f"'y_true' takes value in {classes_repr} and 'pos_label' is "
+                f"not specified: either make 'y_true' take value in "
+                "{{0, 1}} or {{-1, 1}} or pass pos_label explicitly."
+            )
+        elif self.pos_label is None:
+            self._pos_label = 1.
+        else:
+            self._pos_label = self.pos_label
+
     @staticmethod
     def _validate_data(X, y):
         y = check_array(y, ensure_2d=False, dtype=None)
@@ -735,17 +758,6 @@ def _get_pos_label_score(y_score, classes, pos_label):
             y_score = y_score[:, pos_label_encoded]
         return y_score
 
-    def _optimize_scorer(self, estimator, y_true, y_score, scorer, classes,
-                         pos_label):
-        # `np.unique` is already sorting the value, no need to call
-        #  `thresholds.sort()`
-        thresholds = np.unique(
-            self._get_pos_label_score(y_score, classes, pos_label)
-        )
-        scores = [scorer(y_true, (y_score >= th).astype(int))
-                  for th in thresholds]
-        return thresholds[np.argmax(scores)]
-
     def fit(self, X, y):
         """Find the decision threshold.
 
@@ -763,7 +775,6 @@ def fit(self, X, y):
         self : object
             Returns an instance of self.
         """
-        self._validate_parameters()
         X, y = self._validate_data(X, y)
 
         try:
@@ -772,24 +783,61 @@ def fit(self, X, y):
         except NotFittedError:
             self._estimator = clone(self.base_estimator).fit(X, y)
         self.classes_ = self._estimator.classes_
+        if len(self.classes_) == 1:
+            raise ValueError(
+                f"This classifier needs samples from 2 classes in the data "
+                f"to be trained but the data contains only the class: "
+                f"{self.classes_.item(0)}"
+            )
+
+        # delayed the parameters check until we have a fitted base estimator
+        # with known classes
+        self._validate_parameters()
+
+        # warm start a label encoder using the fitted estimator
+        label_encoder = LabelEncoder()
+        label_encoder.classes_ = self.classes_
+
+        y_encoded = label_encoder.transform(y)
+        self._pos_label_encoded = np.flatnonzero(
+            self.classes_ == self._pos_label
+        ).item(0)
 
         y_score = getattr(self._estimator, self._method)(X)
         if self.objective_metric in ("tpr", "tnr"):
             fpr, tpr, thresholds = roc_curve(
-                y, y_score, pos_label=self.pos_label
+                y_encoded, y_score, pos_label=self._pos_label_encoded
             )
+            metric = tpr
             if self.objective_metric == "tnr":
-                tnr, thresholds = (1 - fpr)[::-1], thresholds[::-1]  # noqa
+                tnr, thresholds = (1 - fpr)[::-1], thresholds[::-1]
+                metric = tnr
 
             threshold_idx = np.searchsorted(
-                eval(self.objective_metric), self.objective_value
+                metric, self.objective_value
             )
             self.decision_threshold_ = thresholds[threshold_idx]
         else:
-            self.decision_threshold_ = self._optimize_scorer(
-                self._estimator, y, y_score, self.objective_metric,
-                self.classes_, self.pos_label,
+            # `np.unique` is already sorting the value, no need to call
+            #  `thresholds.sort()`
+            thresholds = np.unique(
+                self._get_pos_label_score(
+                    y_score, self.classes_, self.pos_label
+                )
             )
+            params = ({} if self.objective_metric_params is None
+                      else self.objective_metric_params)
+            metric_signature = signature(self.objective_metric)
+            if "pos_label" in metric_signature.parameters:
+                params["pos_label"] = self._pos_label_encoded
+            scores = [
+                self.objective_metric(
+                    y_encoded, (y_score >= th).astype(int), **params
+                )
+                for th in thresholds
+            ]
+            self.decision_threshold_ = thresholds[np.argmax(scores)]
+
         return self
 
     def predict(self, X):
@@ -810,11 +858,19 @@ def predict(self, X):
 
         decision_function = getattr(self._estimator, self._method)
         y_score = self._get_pos_label_score(
-            decision_function(X), self.classes_, self.pos_label
+            decision_function(X), self.classes_, self._pos_label
         )
         y_class_indices = (y_score >= self.decision_threshold_).astype(int)
 
-        return self._estimator.classes_[y_class_indices]
+        return self.classes_[y_class_indices]
 
     def _more_tags(self):
-        return {"binary_only": True}
+        return {
+            "binary_only": True,
+            "_xfail_test": {
+                "check_classifiers_classes":
+                "requires non default 'pos_label='two'' parameter",
+                "check_fit2d_1feature":
+                "requires non default 'pos_label=2' parameter",
+            }
+        }
diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py
index db79486b1b139..629030aa0343c 100644
--- a/sklearn/tests/test_calibration.py
+++ b/sklearn/tests/test_calibration.py
@@ -17,12 +17,15 @@
 from sklearn.model_selection import LeaveOneOut
 from sklearn.metrics import balanced_accuracy_score
 from sklearn.metrics import brier_score_loss
+from sklearn.metrics import f1_score
+from sklearn.metrics import fbeta_score
 from sklearn.metrics import log_loss
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.pipeline import Pipeline
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler
 from sklearn.svm import LinearSVC
+from sklearn.svm import SVC
 from sklearn.utils._testing import assert_array_almost_equal
 from sklearn.utils._testing import assert_almost_equal
 from sklearn.utils._testing import assert_array_equal
@@ -354,8 +357,10 @@ def decision_function(self, X):
     calibrated_clf.fit(X, y)
 
 
-class MockNoPredictorClassifier:
-    pass
+class MockNoPredictorClassifier(BaseEstimator):
+    def fit(self, X, y):
+        self.classes_ = np.array([0, 1])
+        return self
 
 
 @pytest.mark.parametrize(
@@ -365,7 +370,7 @@ class MockNoPredictorClassifier:
          "'method' should be one of"),
         (MockNoPredictorClassifier, {"method": "auto"}, TypeError,
          "'base_estimator' must implement one of the"),
-        (RandomForestRegressor, {"method": "predict_proba"}, TypeError,
+        (SVC, {"method": "predict_proba"}, TypeError,
          "'base_estimator' does not implement predict_proba"),
         (LogisticRegression,
          {"objective_metric": "accuracy", "objective_value": 0.5}, ValueError,
@@ -421,3 +426,20 @@ def test_cutoffclassifier_with_objective_value():
     score_optimized = balanced_accuracy_score(y, model.fit(X, y).predict(X))
     score_baseline = balanced_accuracy_score(y, lr.predict(X))
     assert score_optimized > score_baseline
+
+
+def test_cutoffclassifier_metric_with_parameter():
+    X, y = load_breast_cancer(return_X_y=True)
+    lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y)
+    model_fbeta = CutoffClassifier(
+        base_estimator=lr, objective_metric=fbeta_score,
+        objective_metric_params={"beta": 1}
+    )
+    model_f1 = CutoffClassifier(
+        base_estimator=lr, objective_metric=f1_score,
+    )
+    model_f1.fit(X, y)
+    model_fbeta.fit(X, y)
+
+    assert (model_fbeta.decision_threshold_ ==
+            pytest.approx(model_f1.decision_threshold_))
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 1e86f68d4ca3c..b5cdbf7074fd7 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -2465,7 +2465,7 @@ def check_classifier_data_not_an_array(name, estimator_orig):
     X = np.array([[3, 0], [0, 1], [0, 2], [1, 1], [1, 2], [2, 1],
                   [0, 3], [1, 0], [2, 0], [4, 4], [2, 3], [3, 2]])
     X = _pairwise_estimator_convert_X(X, estimator_orig)
-    y = [1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2]
+    y = [1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0]
     y = _enforce_estimator_tags_y(estimator_orig, y)
     for obj_type in ["NotAnArray", "PandasDataframe"]:
         check_estimators_data_not_an_array(name, estimator_orig, X, y,

From 1a03fad9e9a9caf0f81d5ed9873bfcdaa1fdba8a Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Tue, 25 Feb 2020 19:46:49 +0100
Subject: [PATCH 12/44] move to model_selection

---
 doc/modules/classes.rst             |   2 +-
 setup.cfg                           |   2 +-
 sklearn/calibration.py              | 268 ----------------------------
 sklearn/model_selection/__init__.py |   3 +
 sklearn/tests/test_calibration.py   |  98 ----------
 5 files changed, 5 insertions(+), 368 deletions(-)

diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index 2f1d8bba7e653..c112a29b53e0c 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -69,7 +69,6 @@ Functions
    :template: class.rst
 
    calibration.CalibratedClassifierCV
-   calibration.CutoffClassifier
 
 .. autosummary::
    :toctree: generated/
@@ -1137,6 +1136,7 @@ Splitter Classes
    :toctree: generated/
    :template: class.rst
 
+   model_selection.CutoffClassifier
    model_selection.GroupKFold
    model_selection.GroupShuffleSplit
    model_selection.KFold
diff --git a/setup.cfg b/setup.cfg
index f086993b26a29..95e4417b816e1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -12,7 +12,7 @@ addopts =
     --ignore examples
     --ignore maint_tools
     --doctest-modules
-    --disable-pytest-warnings
+    # --disable-pytest-warnings
     -rxXs
 
 filterwarnings =
diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index 37104df0f5d31..b56ff8b4f9437 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -23,9 +23,7 @@
 from .base import MetaEstimatorMixin
 from .base import RegressorMixin
 from .base import clone
-from .exceptions import NotFittedError
 from .isotonic import IsotonicRegression
-from .metrics import balanced_accuracy_score
 from .metrics import roc_curve
 from .model_selection import check_cv
 from .preprocessing import label_binarize
@@ -35,8 +33,6 @@
 from .utils import check_array
 from .utils import column_or_1d
 from .utils import indexable
-from .utils.multiclass import check_classification_targets
-from .utils.multiclass import type_of_target
 from .utils.validation import check_is_fitted
 from .utils.validation import check_consistent_length
 from .utils.validation import _check_sample_weight
@@ -610,267 +606,3 @@ def calibration_curve(y_true, y_prob, normalize=False, n_bins=5,
     prob_pred = bin_sums[nonzero] / bin_total[nonzero]
 
     return prob_true, prob_pred
-
-
-class CutoffClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
-    """Decision threshold calibration for binary classification.
-
-    Estimator that calibrates the decision threshold (cutoff point) that is
-    used for prediction. The methods for picking cutoff points make use of
-    traditional binary classification evaluation statistics such as the true
-    positive and true negative rates or any metrics accepting true labels and
-    the output of a scoring functions from a scikit-learn estimator.
-
-    Parameters
-    ----------
-    base_estimator : estimator object
-        The classifier, fitted or not fitted, from which we want to optimize
-        the decision threshold used during `predict`.
-
-    objective_metric : {"tpr", "tnr"} or callable, \
-            default=balanced_accuracy_score
-        The objective metric to be optimized:
-
-        * `"tpr"`: Find the decision threshold for a true positive ratio (TPR)
-          of `objective_value`.
-        * `"tnr"`: Find the decision threshold for a true negative ratio (TNR)
-          of `objective_value`.
-        * one of the scikit-learn scoring metric.
-
-    objective_metric_params : dict, default=None
-        Some extra parameters to pass to `objective_metric`.
-
-    objective_value : float, default=None
-        The value associated with the `objective_metric` metric for which we
-        want to find the decision threshold.
-
-    method : {"auto", "decision_function", "predict_proba"}, default="auto"
-        Methods by the classifier `base_estimator` corresponding to the
-        decision function for which we want to find a threshold. It can be:
-
-        * if `"auto"`, it will try to invoke, for each estimator,
-          `"decision_function` or `"predict_proba"` in that order.
-        * otherwise, one of `"predict_proba"` or `"decision_function"`.
-          If the method is not implemented by the estimator, it will raise an
-          error.
-
-    pos_label : int or str, default=None
-        The label of the positive class. When `pos_label=None`, if `y_true` is
-        in `{-1, 1}` or `{0, 1}`, `pos_label` is set to 1, otherwise an error
-        will be raised.
-
-    Attributes
-    ----------
-    decision_threshold_ : float
-        The new decision threshold.
-
-    classes_ : array of shape (n_classes,)
-        The class labels.
-    """
-
-    def __init__(
-        self,
-        base_estimator,
-        objective_metric=balanced_accuracy_score,
-        objective_metric_params=None,
-        objective_value=None,
-        method="auto",
-        pos_label=None
-    ):
-        self.base_estimator = base_estimator
-        self.objective_metric = objective_metric
-        self.objective_metric_params = objective_metric_params
-        self.objective_value = objective_value
-        self.method = method
-        self.pos_label = pos_label
-
-    def _validate_parameters(self):
-        """Validate the input parameters."""
-        supported_methods = ("decision_function", "predict_proba")
-        if self.method == "auto":
-            has_methods = [
-                hasattr(self.base_estimator, m) for m in supported_methods
-            ]
-            if not any(has_methods):
-                raise TypeError(
-                    f"'base_estimator' must implement one of the "
-                    f"{', '.join(supported_methods)} methods."
-                )
-            self._method = next(
-                (m for m, i in zip(supported_methods, has_methods) if i), None
-            )
-        else:
-            if self.method not in supported_methods:
-                raise ValueError(
-                    f"'method' should be one of {', '.join(supported_methods)}"
-                    f". Got {self.method} instead."
-                )
-            elif not hasattr(self.base_estimator, self.method):
-                raise TypeError(
-                    f"'base_estimator' does not implement {self.method}."
-                )
-            self._method = self.method
-        if (self.objective_metric not in ("tnr", "tpr") and
-                self.objective_value is not None):
-            raise ValueError(
-                f"When 'objective_metric' is a predefined scoring function, "
-                f"'objective_value' should be None. Got {self.objective_value}"
-                f" instead."
-            )
-
-        # ensure binary classification if pos_label is not specified
-        # classes.dtype.kind in ('O', 'U', 'S') is required to avoid
-        # triggering a FutureWarning by calling np.array_equal(a, b)
-        # when elements in the two arrays are not comparable.
-        classes = self.classes_
-        if (self.pos_label is None and (
-                classes.dtype.kind in ('O', 'U', 'S') or
-                not (np.array_equal(classes, [0, 1]) or
-                     np.array_equal(classes, [-1, 1]) or
-                     np.array_equal(classes, [0]) or
-                     np.array_equal(classes, [-1]) or
-                     np.array_equal(classes, [1])))):
-            classes_repr = ", ".join(repr(c) for c in classes)
-            raise ValueError(
-                f"'y_true' takes value in {classes_repr} and 'pos_label' is "
-                f"not specified: either make 'y_true' take value in "
-                "{{0, 1}} or {{-1, 1}} or pass pos_label explicitly."
-            )
-        elif self.pos_label is None:
-            self._pos_label = 1.
-        else:
-            self._pos_label = self.pos_label
-
-    @staticmethod
-    def _validate_data(X, y):
-        y = check_array(y, ensure_2d=False, dtype=None)
-        check_classification_targets(y)
-        y_type = type_of_target(y)
-        if y_type != 'binary':
-            raise ValueError(f'Expected target of binary type. Got {y_type}.')
-        return X, y
-
-    @staticmethod
-    def _get_pos_label_score(y_score, classes, pos_label):
-        """Get score of the positive class."""
-        if y_score.ndim == 2:
-            pos_label_encoded = np.flatnonzero(classes == pos_label).item(0)
-            y_score = y_score[:, pos_label_encoded]
-        return y_score
-
-    def fit(self, X, y):
-        """Find the decision threshold.
-
-        Parameters
-        ----------
-        X : {array-like, sparse matrix, dataframe} of shape \
-                (n_samples, n_features)
-            The training data.
-
-        y : array-like of shape (n_samples,)
-            Target values. It should be a binary target.
-
-        Returns
-        -------
-        self : object
-            Returns an instance of self.
-        """
-        X, y = self._validate_data(X, y)
-
-        try:
-            check_is_fitted(self.base_estimator, attributes=["n_classes_"])
-            self._estimator = self.base_estimator
-        except NotFittedError:
-            self._estimator = clone(self.base_estimator).fit(X, y)
-        self.classes_ = self._estimator.classes_
-        if len(self.classes_) == 1:
-            raise ValueError(
-                f"This classifier needs samples from 2 classes in the data "
-                f"to be trained but the data contains only the class: "
-                f"{self.classes_.item(0)}"
-            )
-
-        # delayed the parameters check until we have a fitted base estimator
-        # with known classes
-        self._validate_parameters()
-
-        # warm start a label encoder using the fitted estimator
-        label_encoder = LabelEncoder()
-        label_encoder.classes_ = self.classes_
-
-        y_encoded = label_encoder.transform(y)
-        self._pos_label_encoded = np.flatnonzero(
-            self.classes_ == self._pos_label
-        ).item(0)
-
-        y_score = getattr(self._estimator, self._method)(X)
-        if self.objective_metric in ("tpr", "tnr"):
-            fpr, tpr, thresholds = roc_curve(
-                y_encoded, y_score, pos_label=self._pos_label_encoded
-            )
-            metric = tpr
-            if self.objective_metric == "tnr":
-                tnr, thresholds = (1 - fpr)[::-1], thresholds[::-1]
-                metric = tnr
-
-            threshold_idx = np.searchsorted(
-                metric, self.objective_value
-            )
-            self.decision_threshold_ = thresholds[threshold_idx]
-        else:
-            # `np.unique` is already sorting the value, no need to call
-            #  `thresholds.sort()`
-            thresholds = np.unique(
-                self._get_pos_label_score(
-                    y_score, self.classes_, self.pos_label
-                )
-            )
-            params = ({} if self.objective_metric_params is None
-                      else self.objective_metric_params)
-            metric_signature = signature(self.objective_metric)
-            if "pos_label" in metric_signature.parameters:
-                params["pos_label"] = self._pos_label_encoded
-            scores = [
-                self.objective_metric(
-                    y_encoded, (y_score >= th).astype(int), **params
-                )
-                for th in thresholds
-            ]
-            self.decision_threshold_ = thresholds[np.argmax(scores)]
-
-        return self
-
-    def predict(self, X):
-        """Predict using the calibrated decision threshold
-
-        Parameters
-        ----------
-        X : {array-like, sparse matrix, dataframe} of shape \
-                (n_samples, n_features)
-            The data matrix.
-
-        Returns
-        -------
-        C : ndarray of shape (n_samples,)
-            The predicted class.
-        """
-        check_is_fitted(self)
-
-        decision_function = getattr(self._estimator, self._method)
-        y_score = self._get_pos_label_score(
-            decision_function(X), self.classes_, self._pos_label
-        )
-        y_class_indices = (y_score >= self.decision_threshold_).astype(int)
-
-        return self.classes_[y_class_indices]
-
-    def _more_tags(self):
-        return {
-            "binary_only": True,
-            "_xfail_test": {
-                "check_classifiers_classes":
-                "requires non default 'pos_label='two'' parameter",
-                "check_fit2d_1feature":
-                "requires non default 'pos_label=2' parameter",
-            }
-        }
diff --git a/sklearn/model_selection/__init__.py b/sklearn/model_selection/__init__.py
index 82a9b9371710d..6f39c0bc1499d 100644
--- a/sklearn/model_selection/__init__.py
+++ b/sklearn/model_selection/__init__.py
@@ -29,7 +29,10 @@
 from ._search import ParameterSampler
 from ._search import fit_grid_point
 
+from ._prediction import CutoffClassifier
+
 __all__ = ('BaseCrossValidator',
+           'CutoffClassifier',
            'GridSearchCV',
            'TimeSeriesSplit',
            'KFold',
diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py
index 629030aa0343c..eb0ad7d800643 100644
--- a/sklearn/tests/test_calibration.py
+++ b/sklearn/tests/test_calibration.py
@@ -6,26 +6,17 @@
 from scipy import sparse
 
 from sklearn.base import BaseEstimator
-from sklearn.datasets import load_breast_cancer
-from sklearn.datasets import load_iris
 from sklearn.datasets import make_blobs
 from sklearn.datasets import make_classification
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.impute import SimpleImputer
-from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import LeaveOneOut
-from sklearn.metrics import balanced_accuracy_score
 from sklearn.metrics import brier_score_loss
-from sklearn.metrics import f1_score
-from sklearn.metrics import fbeta_score
 from sklearn.metrics import log_loss
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.pipeline import Pipeline
-from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import StandardScaler
 from sklearn.svm import LinearSVC
-from sklearn.svm import SVC
 from sklearn.utils._testing import assert_array_almost_equal
 from sklearn.utils._testing import assert_almost_equal
 from sklearn.utils._testing import assert_array_equal
@@ -33,7 +24,6 @@
 from sklearn.utils._testing import ignore_warnings
 
 from sklearn.calibration import CalibratedClassifierCV
-from sklearn.calibration import CutoffClassifier
 from sklearn.calibration import _sigmoid_calibration, _SigmoidCalibration
 from sklearn.calibration import calibration_curve
 
@@ -355,91 +345,3 @@ def decision_function(self, X):
     calibrated_clf = CalibratedClassifierCV(MockTensorClassifier())
     # we should be able to fit this classifier with no error
     calibrated_clf.fit(X, y)
-
-
-class MockNoPredictorClassifier(BaseEstimator):
-    def fit(self, X, y):
-        self.classes_ = np.array([0, 1])
-        return self
-
-
-@pytest.mark.parametrize(
-    "Estimator, params, err_type, err_msg",
-    [
-        (LogisticRegression, {"method": "xxx"}, ValueError,
-         "'method' should be one of"),
-        (MockNoPredictorClassifier, {"method": "auto"}, TypeError,
-         "'base_estimator' must implement one of the"),
-        (SVC, {"method": "predict_proba"}, TypeError,
-         "'base_estimator' does not implement predict_proba"),
-        (LogisticRegression,
-         {"objective_metric": "accuracy", "objective_value": 0.5}, ValueError,
-         "When 'objective_metric' is a predefined scoring function")
-    ]
-)
-def test_cutoffclassifier_valid_params_error(Estimator, params, err_type,
-                                             err_msg):
-    X, y = make_classification(n_samples=200, n_features=6, random_state=42,
-                               n_classes=2)
-    with pytest.raises(err_type, match=err_msg):
-        clf = CutoffClassifier(base_estimator=Estimator(), **params)
-        clf.fit(X, y)
-
-
-def test_cutoffclassifier_not_binary():
-    X, y = load_iris(return_X_y=True)
-    with pytest.raises(ValueError, match="Expected target of binary type."):
-        CutoffClassifier(base_estimator=LogisticRegression()).fit(X, y)
-
-
-def test_cutoffclassifier_limit_tpr_tnr():
-    X, y = load_breast_cancer(return_X_y=True)
-    clf = CutoffClassifier(
-        base_estimator=make_pipeline(StandardScaler(), LogisticRegression()),
-        objective_metric="tpr",
-        objective_value=0,
-    )
-    y_pred_tpr = clf.fit(X, y).predict(X)
-    clf.set_params(objective_metric="tnr")
-    y_pred_tnr = (~clf.fit(X, y).predict(X).astype(bool)).astype(int)
-    assert_array_equal(y_pred_tnr, y_pred_tpr)
-
-
-def test_cutoffclassifier_with_objective_value():
-    X, y = load_breast_cancer(return_X_y=True)
-    # remove feature to degrade performances
-    X = X[:, :5]
-
-    # make the problem completely imbalanced such that the balanced accuracy
-    # is low
-    indices_pos = np.flatnonzero(y == 1)
-    indices_pos = indices_pos[:indices_pos.size // 50]
-    indices_neg = np.flatnonzero(y == 0)
-
-    X = np.vstack([X[indices_neg], X[indices_pos]])
-    y = np.hstack([y[indices_neg], y[indices_pos]])
-
-    lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y)
-    model = CutoffClassifier(
-        base_estimator=lr, objective_metric=balanced_accuracy_score
-    )
-    score_optimized = balanced_accuracy_score(y, model.fit(X, y).predict(X))
-    score_baseline = balanced_accuracy_score(y, lr.predict(X))
-    assert score_optimized > score_baseline
-
-
-def test_cutoffclassifier_metric_with_parameter():
-    X, y = load_breast_cancer(return_X_y=True)
-    lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y)
-    model_fbeta = CutoffClassifier(
-        base_estimator=lr, objective_metric=fbeta_score,
-        objective_metric_params={"beta": 1}
-    )
-    model_f1 = CutoffClassifier(
-        base_estimator=lr, objective_metric=f1_score,
-    )
-    model_f1.fit(X, y)
-    model_fbeta.fit(X, y)
-
-    assert (model_fbeta.decision_threshold_ ==
-            pytest.approx(model_f1.decision_threshold_))

From 63b285a9199dcd2243dde91d550b7b9f90d8ad97 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Tue, 25 Feb 2020 19:47:14 +0100
Subject: [PATCH 13/44] add the missing files

---
 sklearn/model_selection/_prediction.py        | 280 ++++++++++++++++++
 .../model_selection/tests/test_prediction.py  | 165 +++++++++++
 2 files changed, 445 insertions(+)
 create mode 100644 sklearn/model_selection/_prediction.py
 create mode 100644 sklearn/model_selection/tests/test_prediction.py

diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py
new file mode 100644
index 0000000000000..044f1b7d98f39
--- /dev/null
+++ b/sklearn/model_selection/_prediction.py
@@ -0,0 +1,280 @@
+from inspect import signature
+
+import numpy as np
+
+from ..base import clone
+from ..base import BaseEstimator
+from ..base import ClassifierMixin
+from ..base import MetaEstimatorMixin
+from ..exceptions import NotFittedError
+from ..metrics import balanced_accuracy_score
+from ..metrics import roc_curve
+from ..preprocessing import LabelEncoder
+from ..utils import check_array
+from ..utils.multiclass import check_classification_targets
+from ..utils.multiclass import type_of_target
+from ..utils.validation import check_is_fitted
+
+
+class CutoffClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
+    """Decision threshold calibration for binary classification.
+
+    Estimator that calibrates the decision threshold (cutoff point) that is
+    used for prediction. The methods for picking cutoff points make use of
+    traditional binary classification evaluation statistics such as the true
+    positive and true negative rates or any metrics accepting true labels and
+    the output of a scoring function from a scikit-learn estimator.
+
+    Parameters
+    ----------
+    base_estimator : estimator object
+        The classifier, fitted or not fitted, from which we want to optimize
+        the decision threshold used during `predict`.
+
+    objective_metric : {"tpr", "tnr"} or callable, \
+            default=balanced_accuracy_score
+        The objective metric to be optimized. Can be on of:
+
+        * `"tpr"`: Find the decision threshold for a true positive ratio (TPR)
+          of `objective_value`.
+        * `"tnr"`: Find the decision threshold for a true negative ratio (TNR)
+          of `objective_value`.
+        * a callable with the signature `metric(y_true, y_score, **kwargs)`.
+
+    objective_metric_params : dict, default=None
+        Some extra parameters to pass to `objective_metric`.
+
+    objective_value : float, default=None
+        The value associated with the `objective_metric` metric for which we
+        want to find the decision threshold. Only apply when `objective_metric`
+        is `"tpr"` or `"tnr"`
+
+    method : {"auto", "decision_function", "predict_proba"}, default="auto"
+        Methods by the classifier `base_estimator` corresponding to the
+        decision function for which we want to find a threshold. It can be:
+
+        * if `"auto"`, it will try to invoke, for each estimator,
+          `"decision_function` or `"predict_proba"` in that order.
+        * otherwise, one of `"predict_proba"` or `"decision_function"`.
+          If the method is not implemented by the estimator, it will raise an
+          error.
+
+    pos_label : int or str, default=None
+        The label of the positive class. When `pos_label=None`, if `y_true` is
+        in `{-1, 1}` or `{0, 1}`, `pos_label` is set to 1, otherwise an error
+        will be raised.
+
+    Attributes
+    ----------
+    decision_threshold_ : float
+        The new decision threshold.
+
+    classes_ : array of shape (n_classes,)
+        The class labels.
+    """
+
+    def __init__(
+        self,
+        base_estimator,
+        objective_metric=balanced_accuracy_score,
+        objective_metric_params=None,
+        objective_value=None,
+        method="auto",
+        pos_label=None
+    ):
+        self.base_estimator = base_estimator
+        self.objective_metric = objective_metric
+        self.objective_metric_params = objective_metric_params
+        self.objective_value = objective_value
+        self.method = method
+        self.pos_label = pos_label
+
+    def _validate_parameters(self):
+        """Validate the input parameters."""
+        supported_methods = ("decision_function", "predict_proba")
+        if self.method == "auto":
+            has_methods = [
+                hasattr(self.base_estimator, m) for m in supported_methods
+            ]
+            if not any(has_methods):
+                raise TypeError(
+                    f"'base_estimator' must implement one of the "
+                    f"{', '.join(supported_methods)} methods."
+                )
+            self._method = next(
+                (m for m, i in zip(supported_methods, has_methods) if i), None
+            )
+        else:
+            if self.method not in supported_methods:
+                raise ValueError(
+                    f"'method' should be one of {', '.join(supported_methods)}"
+                    f". Got {self.method} instead."
+                )
+            elif not hasattr(self.base_estimator, self.method):
+                raise TypeError(
+                    f"'base_estimator' does not implement {self.method}."
+                )
+            self._method = self.method
+        if (self.objective_metric not in ("tnr", "tpr") and
+                self.objective_value is not None):
+            raise ValueError(
+                f"When 'objective_metric' is a scoring function, "
+                f"'objective_value' should be None. Got {self.objective_value}"
+                f" instead."
+            )
+
+        # ensure binary classification if `pos_label` is not specified
+        # `classes.dtype.kind` in ('O', 'U', 'S') is required to avoid
+        # triggering a FutureWarning by calling np.array_equal(a, b)
+        # when elements in the two arrays are not comparable.
+        if (self.pos_label is None and (
+                self.classes_.dtype.kind in ('O', 'U', 'S') or
+                not (np.array_equal(self.classes_, [0, 1]) or
+                     np.array_equal(self.classes_, [-1, 1]) or
+                     np.array_equal(self.classes_, [0]) or
+                     np.array_equal(self.classes_, [-1]) or
+                     np.array_equal(self.classes_, [1])))):
+            classes_repr = ", ".join(repr(c) for c in self.classes_)
+            raise ValueError(
+                f"'y_true' takes value in {classes_repr} and 'pos_label' is "
+                f"not specified: either make 'y_true' take value in "
+                "{{0, 1}} or {{-1, 1}} or pass pos_label explicitly."
+            )
+        elif self.pos_label is None:
+            self._pos_label = 1
+        else:
+            self._pos_label = self.pos_label
+
+    @staticmethod
+    def _validate_data(X, y):
+        y = check_array(y, ensure_2d=False, dtype=None)
+        check_classification_targets(y)
+        y_type = type_of_target(y)
+        if y_type != 'binary':
+            raise ValueError(f'Expected target of binary type. Got {y_type}.')
+        return X, y
+
+    @staticmethod
+    def _get_pos_label_score(y_score, classes, pos_label):
+        """Get score of the positive class."""
+        if y_score.ndim == 2:
+            pos_label_encoded = np.flatnonzero(classes == pos_label).item(0)
+            y_score = y_score[:, pos_label_encoded]
+        return y_score
+
+    def fit(self, X, y):
+        """Find the decision threshold.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix, dataframe} of shape \
+                (n_samples, n_features)
+            The training data.
+
+        y : array-like of shape (n_samples,)
+            Target values. It should be a binary target.
+
+        Returns
+        -------
+        self : object
+            Returns an instance of self.
+        """
+        X, y = self._validate_data(X, y)
+
+        try:
+            check_is_fitted(self.base_estimator, attributes=["classes_"])
+            self._estimator = self.base_estimator
+        except NotFittedError:
+            self._estimator = clone(self.base_estimator).fit(X, y)
+        self.classes_ = self._estimator.classes_
+        if len(self.classes_) == 1:
+            raise ValueError(
+                f"This classifier needs samples from 2 classes in the data "
+                f"to be trained but the data contains only the class: "
+                f"{self.classes_.item(0)}"
+            )
+
+        # delayed the parameters check until we have a fitted base estimator
+        # with known classes
+        self._validate_parameters()
+
+        # warm start a label encoder using the fitted estimator
+        label_encoder = LabelEncoder()
+        label_encoder.classes_ = self.classes_
+
+        y_encoded = label_encoder.transform(y)
+        self._pos_label_encoded = np.flatnonzero(
+            self.classes_ == self._pos_label
+        ).item(0)
+
+        y_score = getattr(self._estimator, self._method)(X)
+        if self.objective_metric in ("tpr", "tnr"):
+            fpr, tpr, thresholds = roc_curve(
+                y_encoded, y_score, pos_label=self._pos_label_encoded
+            )
+            metric = tpr
+            if self.objective_metric == "tnr":
+                tnr, thresholds = (1 - fpr)[::-1], thresholds[::-1]
+                metric = tnr
+
+            threshold_idx = np.searchsorted(
+                metric, self.objective_value
+            )
+            self.decision_threshold_ = thresholds[threshold_idx]
+        else:
+            # `np.unique` is already sorting the value, no need to call
+            #  `thresholds.sort()`
+            thresholds = np.unique(
+                self._get_pos_label_score(
+                    y_score, self.classes_, self.pos_label
+                )
+            )
+            params = ({} if self.objective_metric_params is None
+                      else self.objective_metric_params)
+            metric_signature = signature(self.objective_metric)
+            if "pos_label" in metric_signature.parameters:
+                params["pos_label"] = self._pos_label_encoded
+            scores = [
+                self.objective_metric(
+                    y_encoded, (y_score >= th).astype(int), **params
+                )
+                for th in thresholds
+            ]
+            self.decision_threshold_ = thresholds[np.argmax(scores)]
+
+        return self
+
+    def predict(self, X):
+        """Predict using the calibrated decision threshold
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix, dataframe} of shape \
+                (n_samples, n_features)
+            The data matrix.
+
+        Returns
+        -------
+        C : ndarray of shape (n_samples,)
+            The predicted class.
+        """
+        check_is_fitted(self)
+
+        decision_function = getattr(self._estimator, self._method)
+        y_score = self._get_pos_label_score(
+            decision_function(X), self.classes_, self._pos_label
+        )
+        y_class_indices = (y_score >= self.decision_threshold_).astype(int)
+
+        return self.classes_[y_class_indices]
+
+    def _more_tags(self):
+        return {
+            "binary_only": True,
+            "_xfail_test": {
+                "check_classifiers_classes":
+                "requires non default 'pos_label='two'' parameter",
+                "check_fit2d_1feature":
+                "requires non default 'pos_label=2' parameter",
+            }
+        }
diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py
new file mode 100644
index 0000000000000..722325cb0e81a
--- /dev/null
+++ b/sklearn/model_selection/tests/test_prediction.py
@@ -0,0 +1,165 @@
+import numpy as np
+import pytest
+
+from sklearn.base import BaseEstimator
+from sklearn.datasets import load_breast_cancer
+from sklearn.datasets import load_iris
+from sklearn.datasets import make_classification
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import balanced_accuracy_score
+from sklearn.metrics import f1_score
+from sklearn.metrics import fbeta_score
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler
+from sklearn.svm import SVC
+from sklearn.utils._testing import assert_array_equal
+
+from sklearn.model_selection import CutoffClassifier
+
+
+class MockNoPredictorClassifier(BaseEstimator):
+    """Classifier which does not predict."""
+    def fit(self, X, y):
+        self.classes_ = np.array([0, 1])
+        return self
+
+
+@pytest.mark.parametrize(
+    "Estimator, params, err_type, err_msg",
+    [
+        (LogisticRegression, {"method": "xxx"}, ValueError,
+         "'method' should be one of"),
+        (MockNoPredictorClassifier, {"method": "auto"}, TypeError,
+         "'base_estimator' must implement one of the"),
+        (SVC, {"method": "predict_proba"}, TypeError,
+         "'base_estimator' does not implement predict_proba"),
+        (LogisticRegression,
+         {"objective_metric": "accuracy", "objective_value": 0.5}, ValueError,
+         "When 'objective_metric' is a scoring function")
+    ]
+)
+def test_cutoffclassifier_valid_params_error(Estimator, params, err_type,
+                                             err_msg):
+    # check that the proper errors are raised with wrong parameters
+    X, y = make_classification(n_samples=200, n_features=6, random_state=42,
+                               n_classes=2)
+    with pytest.raises(err_type, match=err_msg):
+        clf = CutoffClassifier(base_estimator=Estimator(), **params)
+        clf.fit(X, y)
+
+
+def test_cutoffclassifier_error_pos_label():
+    # check that we raise when the classes are not in {0, 1} or {-1, 1}
+    X, y = load_breast_cancer(return_X_y=True)
+    y += 1
+    err_msg = "'y_true' takes value in 1, 2 and 'pos_label' is not specified"
+    with pytest.raises(ValueError, match=err_msg):
+        CutoffClassifier(
+            base_estimator=make_pipeline(StandardScaler(),
+                                         LogisticRegression())
+        ).fit(X, y)
+
+
+def test_cutoffclassifier_not_binary():
+    # check that we only accept binary target
+    X, y = load_iris(return_X_y=True)
+    with pytest.raises(ValueError, match="Expected target of binary type."):
+        CutoffClassifier(
+            base_estimator=make_pipeline(StandardScaler(),
+                                         LogisticRegression())
+        ).fit(X, y)
+
+
+def test_cutoffclassifier_limit_tpr_tnr():
+    # check that an objective value of 0 give opposite predictions in with
+    # tpr and tnr
+    X, y = load_breast_cancer(return_X_y=True)
+    clf = CutoffClassifier(
+        base_estimator=make_pipeline(StandardScaler(), LogisticRegression()),
+        objective_metric="tpr",
+        objective_value=0,
+    )
+    y_pred_tpr = clf.fit(X, y).predict(X)
+    clf.set_params(objective_metric="tnr")
+    y_pred_tnr = (~clf.fit(X, y).predict(X).astype(bool)).astype(int)
+    assert_array_equal(y_pred_tnr, y_pred_tpr)
+
+
+def test_cutoffclassifier_with_objective_value():
+    # check that we can optimize a given metric as a callable
+    X, y = load_breast_cancer(return_X_y=True)
+    # remove feature to degrade performances
+    X = X[:, :5]
+
+    # make the problem completely imbalanced such that the balanced accuracy
+    # is low
+    indices_pos = np.flatnonzero(y == 1)
+    indices_pos = indices_pos[:indices_pos.size // 50]
+    indices_neg = np.flatnonzero(y == 0)
+
+    X = np.vstack([X[indices_neg], X[indices_pos]])
+    y = np.hstack([y[indices_neg], y[indices_pos]])
+
+    lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y)
+    model = CutoffClassifier(
+        base_estimator=lr, objective_metric=balanced_accuracy_score
+    )
+    score_optimized = balanced_accuracy_score(y, model.fit(X, y).predict(X))
+    score_baseline = balanced_accuracy_score(y, lr.predict(X))
+    assert score_optimized > score_baseline
+    assert_array_equal(model.classes_, [0, 1])
+
+
+def test_cutoffclassifier_metric_with_parameter():
+    # check that we can pass a metric with a parameter
+    # in addition check that f_beta with beta=1 is equivalent to f1
+    X, y = load_breast_cancer(return_X_y=True)
+    lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y)
+    model_fbeta = CutoffClassifier(
+        base_estimator=lr, objective_metric=fbeta_score,
+        objective_metric_params={"beta": 1}
+    ).fit(X, y)
+    model_f1 = CutoffClassifier(
+        base_estimator=lr, objective_metric=f1_score,
+    ).fit(X, y)
+
+    assert (model_fbeta.decision_threshold_ ==
+            pytest.approx(model_f1.decision_threshold_))
+
+
+def test_cutoffclassifier_pretrained_estimator():
+    # check that passing a pre-trained estimator is equivalent to training it
+    # in the meta-estimator
+    X, y = load_breast_cancer(return_X_y=True)
+    lr_prefit = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y)
+    lr = make_pipeline(StandardScaler(), LogisticRegression())
+    model_prefit = CutoffClassifier(base_estimator=lr_prefit).fit(X, y)
+    model = CutoffClassifier(base_estimator=lr).fit(X, y)
+
+    assert (model_prefit.decision_threshold_ ==
+            pytest.approx(model.decision_threshold_))
+
+    # check that we did not make any clone/copy of the pretrained estimator
+    assert model_prefit._estimator is lr_prefit
+
+
+@pytest.mark.parametrize("metric", [balanced_accuracy_score, f1_score])
+@pytest.mark.parametrize("dtype", [None, object])
+def test_cutoffclassifier_with_string_targets(dtype, metric):
+    # check that targets represented by str are properly managed
+    # check with several metrics to be sure that `pos_label` is properly
+    # dispatched
+    X, y = load_breast_cancer(return_X_y=True)
+    # replaces y by some strings
+    classes = np.array(["healthy", "cancer"])
+    if dtype is not None:
+        classes = classes.astype(dtype)
+    y = classes[y]
+    model = CutoffClassifier(
+        base_estimator=make_pipeline(StandardScaler(), LogisticRegression()),
+        objective_metric=metric,
+        pos_label="cancer",
+    ).fit(X, y)
+    assert_array_equal(np.sort(model.classes_), np.sort(classes))
+    y_pred = model.predict(X[[0], :])
+    assert y_pred.item(0) in classes

From 65e8329cf8c2b8a1a3f49dc6e55e430591c9fb75 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Tue, 25 Feb 2020 19:51:02 +0100
Subject: [PATCH 14/44] Remove current documentation

---
 doc/modules/calibration.rst                   | 130 +-------------
 doc/whats_new/v0.23.rst                       |  15 +-
 .../plot_decision_threshold_calibration.py    | 167 ------------------
 3 files changed, 9 insertions(+), 303 deletions(-)
 delete mode 100644 examples/calibration/plot_decision_threshold_calibration.py

diff --git a/doc/modules/calibration.rst b/doc/modules/calibration.rst
index daaf8c9a9545f..16e63c37e7fa3 100644
--- a/doc/modules/calibration.rst
+++ b/doc/modules/calibration.rst
@@ -1,16 +1,11 @@
 .. _calibration:
 
-======================
-Prediction calibration
-======================
-
-.. currentmodule:: sklearn.calibration
-
-.. _probability_calibration:
-
+=======================
 Probability calibration
 =======================
 
+.. currentmodule:: sklearn.calibration
+
 When performing classification you often want not only to predict the class
 label, but also obtain a probability of the respective label. This probability
 gives you some kind of confidence on the prediction. Some models can give you
@@ -160,122 +155,3 @@ well a classifier is calibrated.
 
     .. [4] Transforming Classifier Scores into Accurate Multiclass
            Probability Estimates, B. Zadrozny & C. Elkan, (KDD 2002)
-
-.. _decision_threshold_calibration:
-
-Decision Threshold calibration
-==============================
-
-.. currentmodule:: sklearn.calibration
-
-Often Machine Learning classifiers base their
-predictions on real-valued decision functions or probability estimates that
-carry the inherited biases of their models. Additionally when using a machine
-learning model the evaluation criteria can differ from the optimisation
-objectives used by the model during training.
-
-When predicting between two classes it is commonly advised that an appropriate
-decision threshold is estimated based on some cutoff criteria rather than
-arbitrarily using the midpoint of the space of possible values. Estimating a
-decision threshold for a specific use case can help to increase the overall
-accuracy of the model and provide better handling for sensitive classes.
-
-.. currentmodule:: sklearn.calibration
-
-:class:`CutoffClassifier` can be used as a wrapper around a model for binary
-classification to help obtain a more appropriate decision threshold and use it
-for predicting new samples.
-
-Usage
------
-
-To use the :class:`CutoffClassifier` you need to provide an estimator that has
-a ``decision_function`` or a ``predict_proba`` method. The ``method``
-parameter controls whether the first will be preferred over the second if both
-are available.
-
-The wrapped estimator can be pre-trained, in which case ``cv = 'prefit'``, or
-not. If the classifier is not trained then a cross-validation loop specified by
-the parameter ``cv`` can be used to obtain a decision threshold by averaging
-all decision thresholds calculated on the hold-out parts of each cross
-validation iteration. Finally the model is trained on all the provided data.
-When using ``cv = 'prefit'`` you need to make sure to use a hold-out part of
-your data for calibration.
-
-The strategies, controlled by the parameter ``strategy``, for finding
-appropriate decision thresholds are based either on precision recall estimates
-or true positive and true negative rates. Specifically:
-
-.. currentmodule:: sklearn.metrics
-
-* ``f_beta``
-   selects a decision threshold that maximizes the :func:`fbeta_score`. The
-   value of beta is specified by the parameter ``beta``. The ``beta`` parameter
-   determines the weight of precision. When ``beta = 1`` both precision recall
-   get the same weight therefore the maximization target in this case is the
-   :func:`f1_score`. if ``beta < 1`` more weight is given to precision whereas
-   if ``beta > 1`` more weight is given to recall.
-
-* ``roc``
-   selects the decision threshold for the point on the :func:`roc_curve` that
-   is closest to the ideal corner (0, 1)
-
-* ``max_tpr``
-   selects the decision threshold for the point that yields the highest true
-   positive rate while maintaining a minimum true negative rate, specified by
-   the parameter ``threshold``
-
-* ``max_tnr``
-   selects the decision threshold for the point that yields the highest true
-   negative rate while maintaining a minimum true positive rate, specified by
-   the parameter ``threshold``
-
-Here is a simple usage example::
-
-   >>> from sklearn.calibration import CutoffClassifier
-   >>> from sklearn.datasets import load_breast_cancer
-   >>> from sklearn.naive_bayes import GaussianNB
-   >>> from sklearn.metrics import precision_score
-   >>> from sklearn.model_selection import train_test_split
-
-   >>> X, y = load_breast_cancer(return_X_y=True)
-   >>> X_train, X_test, y_train, y_test = train_test_split(
-   ...     X, y, train_size=0.6, random_state=42)
-   >>> clf = CutoffClassifier(GaussianNB(), strategy='f_beta', beta=0.6,
-   ...                        cv=3).fit(X_train, y_train)
-   >>> y_pred = clf.predict(X_test)
-   >>> precision_score(y_test, y_pred)                   # doctest: +ELLIPSIS
-   0.959...
-
-.. topic:: Examples:
-
- * :ref:`sphx_glr_auto_examples_calibration_plot_decision_threshold_calibration.py`: Decision
-   threshold calibration on the breast cancer dataset
-
-.. currentmodule:: sklearn.calibration
-
-The following image shows the results of using the :class:`CutoffClassifier`
-for finding a decision threshold for a :class:`LogisticRegression` classifier
-and an :class:`AdaBoostClassifier` for two use cases.
-
-.. figure:: ../auto_examples/calibration/images/sphx_glr_plot_decision_threshold_calibration_001.png
-   :target: ../auto_examples/calibration/plot_decision_threshold_calibration.html
-   :align: center
-
-In the first case we want to increase the overall accuracy of the classifier on
-the breast cancer dataset. In the second case we want to find a decision
-threshold that yields maximum true positive rate while maintaining a minimum
-value for the true negative rate.
-
-.. topic:: References:
-
-    * Receiver-operating characteristic (ROC) plots: a fundamental
-      evaluation tool in clinical medicine, MH Zweig, G Campbell -
-      Clinical chemistry, 1993
-
-Notes
------
-
-Calibrating the decision threshold of a classifier does not guarantee increased
-performance. The generalisation ability of the obtained decision threshold has
-to be evaluated.
diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst
index 6925414835f63..6a75117d3c6d6 100644
--- a/doc/whats_new/v0.23.rst
+++ b/doc/whats_new/v0.23.rst
@@ -45,15 +45,6 @@ Changelog
     :pr:`123456` by :user:`Joe Bloggs <joeongithub>`.
     where 123456 is the *pull request* number, not the issue number.
 
-:mod:`sklearn.calibration`
-..........................
-
-- |MajorFeature| :class:`calibration.CutoffClassifier` calibrates the decision
-  threshold function of a classifier by maximizing a binary classification
-  metric through cross-validation.
-  :pr:`16525` by :user:`Prokopis Gryllos <PGryllos>`
-  and :user:`Guillaume Lemaitre <glemaitre>`.
-
 :mod:`sklearn.cluster`
 ......................
 
@@ -284,6 +275,12 @@ Changelog
   be removed in 0.25. :pr:`16401` by
   :user:`Arie Pratama Sutiono <ariepratama>`
 
+- |MajorFeature| :class:`model_selection.CutoffClassifier` calibrates the
+  decision threshold function of a classifier by maximizing a binary
+  classification metric through cross-validation.
+  :pr:`16525` by :user:`Guillaume Lemaitre <glemaitre>` and
+  :user:`Prokopis Gryllos <PGryllos>`.
+
 :mod:`sklearn.multioutput`
 ..........................
 
diff --git a/examples/calibration/plot_decision_threshold_calibration.py b/examples/calibration/plot_decision_threshold_calibration.py
deleted file mode 100644
index e14e680380e17..0000000000000
--- a/examples/calibration/plot_decision_threshold_calibration.py
+++ /dev/null
@@ -1,167 +0,0 @@
-"""
-======================================================================
-Decision threshold (cutoff point) calibration on breast cancer dataset
-======================================================================
-
-Machine learning classifiers often base their predictions on real-valued
-decision functions that don't always have accuracy as their objective. Moreover
-the learning objective of a model can differ from the user's needs hence using
-an arbitrary decision threshold as defined by the model can be not ideal.
-
-The CutoffClassifier can be used to calibrate the decision threshold of a model
-in order to increase the classifier's trustworthiness. Optimization objectives
-during the decision threshold calibration can be the true positive and / or
-the true negative rate as well as the f beta score.
-
-In this example the decision threshold calibration is applied on two
-classifiers trained on the breast cancer dataset. The goal in the first case is
-to maximize the f1 score of the classifiers whereas in the second the goal is
-to maximize the true positive rate while maintaining a minimum true negative
-rate.
-
-As you can see after calibration the f1 score of the LogisticRegression
-classifiers has increased slightly whereas the accuracy of the
-AdaBoostClassifier classifier has stayed the same.
-
-For the second goal as seen after calibration both classifiers achieve better
-true positive rate while their respective true negative rates have decreased
-slightly or remained stable.
-"""
-
-# Author: Prokopios Gryllos <prokopis.gryllos@sentiance.com>
-#
-# License: BSD 3 clause
-
-from __future__ import division
-
-import numpy as np
-
-from sklearn.ensemble import AdaBoostClassifier
-from sklearn.metrics import confusion_matrix, f1_score
-from sklearn.calibration import CutoffClassifier
-from sklearn.linear_model import LogisticRegression
-from sklearn.datasets import load_breast_cancer
-import matplotlib.pyplot as plt
-from sklearn.model_selection import train_test_split
-
-
-print(__doc__)
-
-# percentage of the training set that will be used for calibration
-calibration_samples_percentage = 0.2
-
-X, y = load_breast_cancer(return_X_y=True)
-
-X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.6,
-                                                    random_state=42)
-
-calibration_samples = int(len(X_train) * calibration_samples_percentage)
-
-lr = LogisticRegression().fit(
-    X_train[:-calibration_samples], y_train[:-calibration_samples])
-
-y_pred_lr = lr.predict(X_test)
-tn_lr, fp_lr, fn_lr, tp_lr = confusion_matrix(y_test, y_pred_lr).ravel()
-tpr_lr = tp_lr / (tp_lr + fn_lr)
-tnr_lr = tn_lr / (tn_lr + fp_lr)
-f_one_lr = f1_score(y_test, y_pred_lr)
-
-ada = AdaBoostClassifier().fit(
-    X_train[:-calibration_samples], y_train[:-calibration_samples])
-
-y_pred_ada = ada.predict(X_test)
-tn_ada, fp_ada, fn_ada, tp_ada = confusion_matrix(y_test, y_pred_ada).ravel()
-tpr_ada = tp_ada / (tp_ada + fn_ada)
-tnr_ada = tn_ada / (tn_ada + fp_ada)
-f_one_ada = f1_score(y_test, y_pred_ada)
-
-# objective 1: we want to calibrate the decision threshold in order to achieve
-# better f1 score
-lr_f_beta = CutoffClassifier(
-    lr, strategy='f_beta', method='predict_proba', beta=1, cv='prefit').fit(
-    X_train[calibration_samples:], y_train[calibration_samples:])
-
-y_pred_lr_f_beta = lr_f_beta.predict(X_test)
-f_one_lr_f_beta = f1_score(y_test, y_pred_lr_f_beta)
-
-ada_f_beta = CutoffClassifier(
-    ada, strategy='f_beta', method='predict_proba', beta=1, cv='prefit'
-).fit(X_train[calibration_samples:], y_train[calibration_samples:])
-
-y_pred_ada_f_beta = ada_f_beta.predict(X_test)
-f_one_ada_f_beta = f1_score(y_test, y_pred_ada_f_beta)
-
-# objective 2: we want to maximize the true positive rate while the true
-# negative rate is at least 0.7
-lr_max_tpr = CutoffClassifier(
-    lr, strategy='max_tpr', method='predict_proba', threshold=0.7, cv='prefit'
-).fit(X_train[calibration_samples:], y_train[calibration_samples:])
-
-y_pred_lr_max_tpr = lr_max_tpr.predict(X_test)
-tn_lr_max_tpr, fp_lr_max_tpr, fn_lr_max_tpr, tp_lr_max_tpr = \
-    confusion_matrix(y_test, y_pred_lr_max_tpr).ravel()
-tpr_lr_max_tpr = tp_lr_max_tpr / (tp_lr_max_tpr + fn_lr_max_tpr)
-tnr_lr_max_tpr = tn_lr_max_tpr / (tn_lr_max_tpr + fp_lr_max_tpr)
-
-ada_max_tpr = CutoffClassifier(
-    ada, strategy='max_tpr', method='predict_proba', threshold=0.7, cv='prefit'
-).fit(X_train[calibration_samples:], y_train[calibration_samples:])
-
-y_pred_ada_max_tpr = ada_max_tpr.predict(X_test)
-tn_ada_max_tpr, fp_ada_max_tpr, fn_ada_max_tpr, tp_ada_max_tpr = \
-    confusion_matrix(y_test, y_pred_ada_max_tpr).ravel()
-tpr_ada_max_tpr = tp_ada_max_tpr / (tp_ada_max_tpr + fn_ada_max_tpr)
-tnr_ada_max_tpr = tn_ada_max_tpr / (tn_ada_max_tpr + fp_ada_max_tpr)
-
-print('Calibrated threshold')
-print('Logistic Regression classifier: {}'.format(
-    lr_max_tpr.decision_threshold_))
-print('AdaBoost classifier: {}'.format(ada_max_tpr.decision_threshold_))
-print('before calibration')
-print('Logistic Regression classifier: tpr = {}, tnr = {}, f1 = {}'.format(
-    tpr_lr, tnr_lr, f_one_lr))
-print('AdaBoost classifier: tpr = {}, tpn = {}, f1 = {}'.format(
-    tpr_ada, tnr_ada, f_one_ada))
-
-print('true positive and true negative rates after calibration')
-print('Logistic Regression classifier: tpr = {}, tnr = {}, f1 = {}'.format(
-    tpr_lr_max_tpr, tnr_lr_max_tpr, f_one_lr_f_beta))
-print('AdaBoost classifier: tpr = {}, tnr = {}, f1 = {}'.format(
-    tpr_ada_max_tpr, tnr_ada_max_tpr, f_one_ada_f_beta))
-
-#########
-# plots #
-#########
-bar_width = 0.2
-
-plt.subplot(2, 1, 1)
-index = np.asarray([1, 2])
-plt.bar(index, [f_one_lr, f_one_ada], bar_width, color='r',
-        label='Before calibration')
-
-plt.bar(index + bar_width, [f_one_lr_f_beta, f_one_ada_f_beta], bar_width,
-        color='b', label='After calibration')
-
-plt.xticks(index + bar_width / 2, ('f1 logistic', 'f1 adaboost'))
-
-plt.ylabel('scores')
-plt.title('f1 score')
-plt.legend(bbox_to_anchor=(.5, -.2), loc='center', borderaxespad=0.)
-
-plt.subplot(2, 1, 2)
-index = np.asarray([1, 2, 3, 4])
-plt.bar(index, [tpr_lr, tnr_lr, tpr_ada, tnr_ada],
-        bar_width, color='r', label='Before calibration')
-
-plt.bar(index + bar_width,
-        [tpr_lr_max_tpr, tnr_lr_max_tpr, tpr_ada_max_tpr, tnr_ada_max_tpr],
-        bar_width, color='b', label='After calibration')
-
-plt.xticks(
-    index + bar_width / 2,
-    ('tpr logistic', 'tnr logistic', 'tpr adaboost', 'tnr adaboost'))
-plt.ylabel('scores')
-plt.title('true positive & true negative rate')
-
-plt.subplots_adjust(hspace=0.6)
-plt.show()

From d924684efab667b3f0da2493d50b15f54b100d6b Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Tue, 25 Feb 2020 20:09:57 +0100
Subject: [PATCH 15/44] DOC add docstring examples

---
 sklearn/model_selection/_prediction.py | 47 ++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py
index 044f1b7d98f39..07529776f1b7d 100644
--- a/sklearn/model_selection/_prediction.py
+++ b/sklearn/model_selection/_prediction.py
@@ -71,6 +71,53 @@ class CutoffClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
 
     classes_ : array of shape (n_classes,)
         The class labels.
+
+    Examples
+    --------
+    First, we will load the breast cancer databases and make it highly
+    imbalanced.
+
+    >>> import numpy as np
+    >>> from sklearn.datasets import load_breast_cancer
+    >>> X, y = load_breast_cancer(return_X_y=True)
+    >>> pos_idx = np.flatnonzero(y == 1)[:10].tolist()
+    >>> neg_idx = np.flatnonzero(y == 0).tolist()
+    >>> X, y = X[pos_idx + neg_idx, :], y[pos_idx + neg_idx]
+
+    Then, we can split into a training and testing set and keep the
+    same imbalance level in both sets.
+
+    >>> from sklearn.model_selection import train_test_split
+    >>> X_train, X_test, y_train, y_test = train_test_split(
+    ...     X, y, stratify=y, random_state=0
+    ... )
+
+    We can check the performance of a logistic regression model.
+
+    >>> from sklearn.preprocessing import StandardScaler
+    >>> from sklearn.linear_model import LogisticRegression
+    >>> from sklearn.pipeline import make_pipeline
+    >>> model = make_pipeline(StandardScaler(), LogisticRegression())
+    >>> model.fit(X_train, y_train)
+    Pipeline(steps=[('standardscaler', StandardScaler()),
+                    ('logisticregression', LogisticRegression())])
+    >>> from sklearn.metrics import balanced_accuracy_score
+    >>> y_pred = model.predict(X_test)
+    >>> print(f"Score: {balanced_accuracy_score(y_test, y_pred):.3f}")
+    Score: 0.833
+
+    We will try to correct the decision threshold which is impacted by the
+    class imbalanced.
+
+    >>> from sklearn.model_selection import CutoffClassifier
+    >>> model_optimized = CutoffClassifier(
+    ...     base_estimator=model, objective_metric=balanced_accuracy_score
+    ... )
+    >>> model_optimized.fit(X, y)
+    CutoffClassifier(...)
+    >>> y_pred = model_optimized.predict(X_test)
+    >>> print(f"Score: {balanced_accuracy_score(y_test, y_pred):.3f}")
+    Score: 0.962
     """
 
     def __init__(

From 237c9190ec965a27693306cd43ca6507e8298a11 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Tue, 25 Feb 2020 20:40:26 +0100
Subject: [PATCH 16/44] PEP8

---
 sklearn/calibration.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index b56ff8b4f9437..42bc22a6a67cc 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -24,7 +24,6 @@
 from .base import RegressorMixin
 from .base import clone
 from .isotonic import IsotonicRegression
-from .metrics import roc_curve
 from .model_selection import check_cv
 from .preprocessing import label_binarize
 from .preprocessing import LabelBinarizer

From 4210537569953c0073a922cd6a5653c057238055 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Tue, 25 Feb 2020 21:15:38 +0100
Subject: [PATCH 17/44] fix for predict proba

---
 doc/modules/calibration.rst                   |  1 +
 doc/modules/classes.rst                       |  4 ++--
 setup.cfg                                     |  2 +-
 sklearn/model_selection/_prediction.py        | 22 +++++--------------
 .../model_selection/tests/test_prediction.py  | 17 +++++++++++---
 5 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/doc/modules/calibration.rst b/doc/modules/calibration.rst
index 16e63c37e7fa3..19df08ea3b1fe 100644
--- a/doc/modules/calibration.rst
+++ b/doc/modules/calibration.rst
@@ -6,6 +6,7 @@ Probability calibration
 
 .. currentmodule:: sklearn.calibration
 
+
 When performing classification you often want not only to predict the class
 label, but also obtain a probability of the respective label. This probability
 gives you some kind of confidence on the prediction. Some models can give you
diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index c112a29b53e0c..8f4db25e64e7b 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -53,8 +53,8 @@ Functions
 
 .. _calibration_ref:
 
-:mod:`sklearn.calibration`: Prediction Calibration
-==================================================
+:mod:`sklearn.calibration`: Probability Calibration
+===================================================
 
 .. automodule:: sklearn.calibration
    :no-members:
diff --git a/setup.cfg b/setup.cfg
index 95e4417b816e1..f086993b26a29 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -12,7 +12,7 @@ addopts =
     --ignore examples
     --ignore maint_tools
     --doctest-modules
-    # --disable-pytest-warnings
+    --disable-pytest-warnings
     -rxXs
 
 filterwarnings =
diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py
index 07529776f1b7d..c78a4e50e133f 100644
--- a/sklearn/model_selection/_prediction.py
+++ b/sklearn/model_selection/_prediction.py
@@ -201,14 +201,6 @@ def _validate_data(X, y):
             raise ValueError(f'Expected target of binary type. Got {y_type}.')
         return X, y
 
-    @staticmethod
-    def _get_pos_label_score(y_score, classes, pos_label):
-        """Get score of the positive class."""
-        if y_score.ndim == 2:
-            pos_label_encoded = np.flatnonzero(classes == pos_label).item(0)
-            y_score = y_score[:, pos_label_encoded]
-        return y_score
-
     def fit(self, X, y):
         """Find the decision threshold.
 
@@ -271,11 +263,9 @@ def fit(self, X, y):
         else:
             # `np.unique` is already sorting the value, no need to call
             #  `thresholds.sort()`
-            thresholds = np.unique(
-                self._get_pos_label_score(
-                    y_score, self.classes_, self.pos_label
-                )
-            )
+            if y_score.ndim == 2:
+                y_score = y_score[:, self._pos_label_encoded]
+            thresholds = np.unique(y_score)
             params = ({} if self.objective_metric_params is None
                       else self.objective_metric_params)
             metric_signature = signature(self.objective_metric)
@@ -308,9 +298,9 @@ def predict(self, X):
         check_is_fitted(self)
 
         decision_function = getattr(self._estimator, self._method)
-        y_score = self._get_pos_label_score(
-            decision_function(X), self.classes_, self._pos_label
-        )
+        y_score = decision_function(X)
+        if y_score.ndim == 2:
+            y_score = y_score[:, self._pos_label_encoded]
         y_class_indices = (y_score >= self.decision_threshold_).astype(int)
 
         return self.classes_[y_class_indices]
diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py
index 722325cb0e81a..205ed5ab4da98 100644
--- a/sklearn/model_selection/tests/test_prediction.py
+++ b/sklearn/model_selection/tests/test_prediction.py
@@ -85,7 +85,11 @@ def test_cutoffclassifier_limit_tpr_tnr():
     assert_array_equal(y_pred_tnr, y_pred_tpr)
 
 
-def test_cutoffclassifier_with_objective_value():
+@pytest.mark.parametrize(
+    "method",
+    ["auto", "decision_function", "predict_proba"]
+)
+def test_cutoffclassifier_with_objective_value(method):
     # check that we can optimize a given metric as a callable
     X, y = load_breast_cancer(return_X_y=True)
     # remove feature to degrade performances
@@ -102,7 +106,9 @@ def test_cutoffclassifier_with_objective_value():
 
     lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y)
     model = CutoffClassifier(
-        base_estimator=lr, objective_metric=balanced_accuracy_score
+        base_estimator=lr,
+        objective_metric=balanced_accuracy_score,
+        method=method,
     )
     score_optimized = balanced_accuracy_score(y, model.fit(X, y).predict(X))
     score_baseline = balanced_accuracy_score(y, lr.predict(X))
@@ -143,9 +149,13 @@ def test_cutoffclassifier_pretrained_estimator():
     assert model_prefit._estimator is lr_prefit
 
 
+@pytest.mark.parametrize(
+    "method",
+    ["auto", "decision_function", "predict_proba"]
+)
 @pytest.mark.parametrize("metric", [balanced_accuracy_score, f1_score])
 @pytest.mark.parametrize("dtype", [None, object])
-def test_cutoffclassifier_with_string_targets(dtype, metric):
+def test_cutoffclassifier_with_string_targets(method, dtype, metric):
     # check that targets represented by str are properly managed
     # check with several metrics to be sure that `pos_label` is properly
     # dispatched
@@ -159,6 +169,7 @@ def test_cutoffclassifier_with_string_targets(dtype, metric):
         base_estimator=make_pipeline(StandardScaler(), LogisticRegression()),
         objective_metric=metric,
         pos_label="cancer",
+        method=method,
     ).fit(X, y)
     assert_array_equal(np.sort(model.classes_), np.sort(classes))
     y_pred = model.predict(X[[0], :])

From 255dfe89274664cd42059858d6e4370d3833bcb2 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Tue, 25 Feb 2020 23:43:37 +0100
Subject: [PATCH 18/44] add support for cost-sensitive

---
 setup.cfg                                     |  2 +-
 sklearn/model_selection/_prediction.py        | 60 +++++++++++++------
 .../model_selection/tests/test_prediction.py  | 21 ++++++-
 3 files changed, 62 insertions(+), 21 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index f086993b26a29..95e4417b816e1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -12,7 +12,7 @@ addopts =
     --ignore examples
     --ignore maint_tools
     --doctest-modules
-    --disable-pytest-warnings
+    # --disable-pytest-warnings
     -rxXs
 
 filterwarnings =
diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py
index c78a4e50e133f..013d7ab54e6a5 100644
--- a/sklearn/model_selection/_prediction.py
+++ b/sklearn/model_selection/_prediction.py
@@ -8,6 +8,7 @@
 from ..base import MetaEstimatorMixin
 from ..exceptions import NotFittedError
 from ..metrics import balanced_accuracy_score
+from ..metrics import confusion_matrix
 from ..metrics import roc_curve
 from ..preprocessing import LabelEncoder
 from ..utils import check_array
@@ -31,14 +32,16 @@ class CutoffClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
         The classifier, fitted or not fitted, from which we want to optimize
         the decision threshold used during `predict`.
 
-    objective_metric : {"tpr", "tnr"} or callable, \
+    objective_metric : {"tpr", "tnr"}, ndarray of shape (2, 2) or callable, \
             default=balanced_accuracy_score
         The objective metric to be optimized. Can be on of:
 
-        * `"tpr"`: Find the decision threshold for a true positive ratio (TPR)
+        * `"tpr"`: find the decision threshold for a true positive ratio (TPR)
           of `objective_value`.
-        * `"tnr"`: Find the decision threshold for a true negative ratio (TNR)
+        * `"tnr"`: find the decision threshold for a true negative ratio (TNR)
           of `objective_value`.
+        * `"cost_matrix"`: find the decision threshold which minimize the total
+           cost using the cost matrix given in `objective_value`.
         * a callable with the signature `metric(y_true, y_score, **kwargs)`.
 
     objective_metric_params : dict, default=None
@@ -46,8 +49,12 @@ class CutoffClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
 
     objective_value : float, default=None
         The value associated with the `objective_metric` metric for which we
-        want to find the decision threshold. Only apply when `objective_metric`
-        is `"tpr"` or `"tnr"`
+        want to find the decision threshold. When
+        `objective_metric='cost_matrix'`, this parameter should be a 2x2 cost
+        matrix with the same organization than a
+        :func:`sklearn.metrics.confusion_matrix`: the count of true negatives
+        is :math:`C_{0,0}`, false negatives is :math:`C_{1,0}`, true positives
+        is :math:`C_{1,1}` and false positives is :math:`C_{0,1}`.
 
     method : {"auto", "decision_function", "predict_proba"}, default="auto"
         Methods by the classifier `base_estimator` corresponding to the
@@ -162,13 +169,19 @@ def _validate_parameters(self):
                     f"'base_estimator' does not implement {self.method}."
                 )
             self._method = self.method
-        if (self.objective_metric not in ("tnr", "tpr") and
+        if (self.objective_metric not in ("tpr", "tnr", "cost_matrix") and
                 self.objective_value is not None):
             raise ValueError(
                 f"When 'objective_metric' is a scoring function, "
-                f"'objective_value' should be None. Got {self.objective_value}"
-                f" instead."
+                f"'objective_value' should be None. Got "
+                f"{self.objective_metric} instead."
             )
+        elif self.objective_metric == "cost_matrix":
+            if self.objective_value.shape != (2, 2):
+                raise ValueError(
+                    f"When 'objective_metric' is a cost matrix, it must be of "
+                    f"shape (2, 2). Got {self.objective_value.shape} instead."
+                )
 
         # ensure binary classification if `pos_label` is not specified
         # `classes.dtype.kind` in ('O', 'U', 'S') is required to avoid
@@ -201,6 +214,11 @@ def _validate_data(X, y):
             raise ValueError(f'Expected target of binary type. Got {y_type}.')
         return X, y
 
+    @staticmethod
+    def cost_sensitive_score(y_true, y_pred, cost_matrix):
+        cm = confusion_matrix(y_true, y_pred) * cost_matrix
+        return np.diag(cm) / cm.sum()
+
     def fit(self, X, y):
         """Find the decision threshold.
 
@@ -247,7 +265,7 @@ def fit(self, X, y):
         ).item(0)
 
         y_score = getattr(self._estimator, self._method)(X)
-        if self.objective_metric in ("tpr", "tnr"):
+        if self.objective_metric in ("tnr", "tpr"):
             fpr, tpr, thresholds = roc_curve(
                 y_encoded, y_score, pos_label=self._pos_label_encoded
             )
@@ -261,22 +279,26 @@ def fit(self, X, y):
             )
             self.decision_threshold_ = thresholds[threshold_idx]
         else:
-            # `np.unique` is already sorting the value, no need to call
-            #  `thresholds.sort()`
             if y_score.ndim == 2:
                 y_score = y_score[:, self._pos_label_encoded]
+            # `np.unique` is already sorting the value, no need to call
+            #  `thresholds.sort()`
             thresholds = np.unique(y_score)
+            scores = []
             params = ({} if self.objective_metric_params is None
                       else self.objective_metric_params)
-            metric_signature = signature(self.objective_metric)
-            if "pos_label" in metric_signature.parameters:
-                params["pos_label"] = self._pos_label_encoded
-            scores = [
-                self.objective_metric(
-                    y_encoded, (y_score >= th).astype(int), **params
+            if callable(self.objective_metric):
+                metric_func = self.objective_metric
+                metric_signature = signature(metric_func)
+                if "pos_label" in metric_signature.parameters:
+                    params["pos_label"] = self._pos_label_encoded
+            else:
+                metric_func = self.cost_sensitive_score
+                params["cost_matrix"] = self.objective_value
+            for th in thresholds:
+                scores.append(metric_func(
+                    y_encoded, (y_score >= th).astype(int), **params)
                 )
-                for th in thresholds
-            ]
             self.decision_threshold_ = thresholds[np.argmax(scores)]
 
         return self
diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py
index 205ed5ab4da98..fc26d02c5a844 100644
--- a/sklearn/model_selection/tests/test_prediction.py
+++ b/sklearn/model_selection/tests/test_prediction.py
@@ -35,7 +35,11 @@ def fit(self, X, y):
          "'base_estimator' does not implement predict_proba"),
         (LogisticRegression,
          {"objective_metric": "accuracy", "objective_value": 0.5}, ValueError,
-         "When 'objective_metric' is a scoring function")
+         "When 'objective_metric' is a scoring function"),
+        (LogisticRegression,
+         {"objective_metric": "cost_matrix",
+          "objective_value": np.array([[1], [2]])}, ValueError,
+         "When 'objective_metric' is a cost matrix"),
     ]
 )
 def test_cutoffclassifier_valid_params_error(Estimator, params, err_type,
@@ -174,3 +178,18 @@ def test_cutoffclassifier_with_string_targets(method, dtype, metric):
     assert_array_equal(np.sort(model.classes_), np.sort(classes))
     y_pred = model.predict(X[[0], :])
     assert y_pred.item(0) in classes
+
+
+def test_cutoffclassifier_cost_matrix():
+    X, y = load_breast_cancer(return_X_y=True)
+    cost_matrix = np.array([[0, 0],
+                            [0, 1]])
+    clf = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y)
+    model = CutoffClassifier(
+        base_estimator=clf,
+        objective_metric="cost_matrix",
+        objective_value=cost_matrix,
+    )
+    model.fit(X, y)
+    assert clf.predict(X).sum() < model.predict(X).sum()
+    assert model.predict(X).sum() > (y.size * 0.9)

From f3a372d89fa5f178a0329cbb77ccac254dbb7726 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Tue, 25 Feb 2020 23:47:50 +0100
Subject: [PATCH 19/44] revert calibration changes

---
 doc/modules/classes.rst           |  1 +
 setup.cfg                         |  2 +-
 sklearn/calibration.py            | 24 ++++++++----------------
 sklearn/tests/test_calibration.py | 24 ++++++++++--------------
 4 files changed, 20 insertions(+), 31 deletions(-)

diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index 8f4db25e64e7b..bee19565552fb 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -70,6 +70,7 @@ Functions
 
    calibration.CalibratedClassifierCV
 
+
 .. autosummary::
    :toctree: generated/
    :template: function.rst
diff --git a/setup.cfg b/setup.cfg
index 95e4417b816e1..f086993b26a29 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -12,7 +12,7 @@ addopts =
     --ignore examples
     --ignore maint_tools
     --doctest-modules
-    # --disable-pytest-warnings
+    --disable-pytest-warnings
     -rxXs
 
 filterwarnings =
diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index 42bc22a6a67cc..ff9c4b3e75c44 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -7,8 +7,8 @@
 #
 # License: BSD 3 clause
 
-from inspect import signature
 import warnings
+from inspect import signature
 
 from math import log
 import numpy as np
@@ -18,23 +18,15 @@
 from scipy.optimize import fmin_bfgs
 from .preprocessing import LabelEncoder
 
-from .base import BaseEstimator
-from .base import ClassifierMixin
-from .base import MetaEstimatorMixin
-from .base import RegressorMixin
-from .base import clone
+from .base import (BaseEstimator, ClassifierMixin, RegressorMixin, clone,
+                   MetaEstimatorMixin)
+from .preprocessing import label_binarize, LabelBinarizer
+from .utils import check_X_y, check_array, indexable, column_or_1d
+from .utils.validation import check_is_fitted, check_consistent_length
+from .utils.validation import _check_sample_weight
 from .isotonic import IsotonicRegression
-from .model_selection import check_cv
-from .preprocessing import label_binarize
-from .preprocessing import LabelBinarizer
 from .svm import LinearSVC
-from .utils import check_X_y
-from .utils import check_array
-from .utils import column_or_1d
-from .utils import indexable
-from .utils.validation import check_is_fitted
-from .utils.validation import check_consistent_length
-from .utils.validation import _check_sample_weight
+from .model_selection import check_cv
 from .utils.validation import _deprecate_positional_args
 
 
diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py
index eb0ad7d800643..f131eab4c1680 100644
--- a/sklearn/tests/test_calibration.py
+++ b/sklearn/tests/test_calibration.py
@@ -6,23 +6,19 @@
 from scipy import sparse
 
 from sklearn.base import BaseEstimator
-from sklearn.datasets import make_blobs
-from sklearn.datasets import make_classification
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.ensemble import RandomForestRegressor
-from sklearn.impute import SimpleImputer
 from sklearn.model_selection import LeaveOneOut
-from sklearn.metrics import brier_score_loss
-from sklearn.metrics import log_loss
+
+from sklearn.utils._testing import (assert_array_almost_equal,
+                                   assert_almost_equal,
+                                   assert_array_equal,
+                                   assert_raises, ignore_warnings)
+from sklearn.datasets import make_classification, make_blobs
 from sklearn.naive_bayes import MultinomialNB
-from sklearn.pipeline import Pipeline
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
 from sklearn.svm import LinearSVC
-from sklearn.utils._testing import assert_array_almost_equal
-from sklearn.utils._testing import assert_almost_equal
-from sklearn.utils._testing import assert_array_equal
-from sklearn.utils._testing import assert_raises
-from sklearn.utils._testing import ignore_warnings
-
+from sklearn.pipeline import Pipeline
+from sklearn.impute import SimpleImputer
+from sklearn.metrics import brier_score_loss, log_loss
 from sklearn.calibration import CalibratedClassifierCV
 from sklearn.calibration import _sigmoid_calibration, _SigmoidCalibration
 from sklearn.calibration import calibration_curve

From f998625b084d4fbf288a504d0730bfe5641c4895 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Wed, 26 Feb 2020 14:01:26 +0100
Subject: [PATCH 20/44] start doc

---
 doc/model_selection.rst                |  1 +
 doc/modules/prediction.rst             | 34 ++++++++++++++++++++++++++
 sklearn/model_selection/_prediction.py | 10 ++++----
 3 files changed, 40 insertions(+), 5 deletions(-)
 create mode 100644 doc/modules/prediction.rst

diff --git a/doc/model_selection.rst b/doc/model_selection.rst
index 7b540072c15e5..fef4a71477734 100644
--- a/doc/model_selection.rst
+++ b/doc/model_selection.rst
@@ -10,6 +10,7 @@ Model selection and evaluation
 
     modules/cross_validation
     modules/grid_search
+    modules/prediction
     modules/model_evaluation
     modules/model_persistence
     modules/learning_curve
diff --git a/doc/modules/prediction.rst b/doc/modules/prediction.rst
new file mode 100644
index 0000000000000..545dea2a380cb
--- /dev/null
+++ b/doc/modules/prediction.rst
@@ -0,0 +1,34 @@
+.. currentmodule:: sklearn.model_selection
+
+.. _prediction_tuning:
+
+================================================
+Tuning of the decision threshold of an estimator
+================================================
+
+The real-valued decision functions, i.e. `decision_function` and
+`predict_proba`, of machine-learning classifiers carry the inherited biases of
+the fitted model; e.g, in a class imbalanced setting, a classifier
+will naturally lean toward the most frequent class. In some other cases, the
+generic objective function used to train a model is generally unaware of the
+evaluation criteria used to evaluate the model; e.g., one might want to
+penalized differently a false-positive and false-negative ---it will be less
+detrimental to show an image without a cancer (i.e., false-positive) to a
+radiologist than hidding one with a cancer (i.e, false-negtative).
+
+In a binary classification scenario, the hard-prediction, i.e. `predict`, for a
+classifier most commonly use the `predict_proba` and apply a decision threshold
+at 0.5 to output a positive or negative label. Thus, this hard-prediction
+suffers from the same drawbacks than the one raised in the above paragraph.
+
+Post-tuning of the decision threshold
+=====================================
+
+:class:`CutoffClassifier` allows for post-tuning the decision threshold using
+either `decision_function` or `predict_proba` and an objective metric for which
+we want our threshold to be optimized for.
+
+Fine-tune using a single objective metric
+-----------------------------------------
+
+:class:`CutoffClassifier` accepts
\ No newline at end of file
diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py
index 013d7ab54e6a5..0d72685d1573e 100644
--- a/sklearn/model_selection/_prediction.py
+++ b/sklearn/model_selection/_prediction.py
@@ -32,17 +32,17 @@ class CutoffClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
         The classifier, fitted or not fitted, from which we want to optimize
         the decision threshold used during `predict`.
 
-    objective_metric : {"tpr", "tnr"}, ndarray of shape (2, 2) or callable, \
+    objective_metric : callable, {"tpr", "tnr"} or ndarray of shape (2, 2), \
             default=balanced_accuracy_score
-        The objective metric to be optimized. Can be on of:
+        The objective metric to be optimized. Can be one of:
 
+        * a callable with the signature `metric(y_true, y_score, **kwargs)`;
         * `"tpr"`: find the decision threshold for a true positive ratio (TPR)
-          of `objective_value`.
+          of `objective_value`;
         * `"tnr"`: find the decision threshold for a true negative ratio (TNR)
-          of `objective_value`.
+          of `objective_value`;
         * `"cost_matrix"`: find the decision threshold which minimize the total
            cost using the cost matrix given in `objective_value`.
-        * a callable with the signature `metric(y_true, y_score, **kwargs)`.
 
     objective_metric_params : dict, default=None
         Some extra parameters to pass to `objective_metric`.

From ca4c50a95768211dcb663750bd281ebb7a4e67b6 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Wed, 11 Mar 2020 09:24:28 +0100
Subject: [PATCH 21/44] iter

---
 doc/modules/prediction.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/modules/prediction.rst b/doc/modules/prediction.rst
index 545dea2a380cb..9065704f9f89d 100644
--- a/doc/modules/prediction.rst
+++ b/doc/modules/prediction.rst
@@ -13,8 +13,9 @@ will naturally lean toward the most frequent class. In some other cases, the
 generic objective function used to train a model is generally unaware of the
 evaluation criteria used to evaluate the model; e.g., one might want to
 penalized differently a false-positive and false-negative ---it will be less
-detrimental to show an image without a cancer (i.e., false-positive) to a
-radiologist than hidding one with a cancer (i.e, false-negtative).
+detrimental to show an MR image without a cancer (i.e., false-positive) to a
+radiologist than hidding one with a cancer (i.e, false-negtative) when
+developing some computer-aided diagnosis system.
 
 In a binary classification scenario, the hard-prediction, i.e. `predict`, for a
 classifier most commonly use the `predict_proba` and apply a decision threshold
@@ -31,4 +32,3 @@ we want our threshold to be optimized for.
 Fine-tune using a single objective metric
 -----------------------------------------
 
-:class:`CutoffClassifier` accepts
\ No newline at end of file

From c0acab09a37de7d4bce1503000c2e5bb15721093 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 19 Mar 2020 17:49:00 +0100
Subject: [PATCH 22/44] skip test

---
 sklearn/tests/test_docstring_parameters.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py
index 55af69ca6c10e..4884e4fa84020 100644
--- a/sklearn/tests/test_docstring_parameters.py
+++ b/sklearn/tests/test_docstring_parameters.py
@@ -177,8 +177,9 @@ def test_fit_docstring_attributes(name, Estimator):
     attributes = doc['Attributes']
 
     IGNORED = {'ClassifierChain', 'ColumnTransformer', 'CountVectorizer',
-               'DictVectorizer', 'FeatureUnion', 'GaussianRandomProjection',
-               'GridSearchCV', 'MultiOutputClassifier', 'MultiOutputRegressor',
+               'CutoffClassifier', 'DictVectorizer', 'FeatureUnion',
+               'GaussianRandomProjection', 'GridSearchCV',
+               'MultiOutputClassifier', 'MultiOutputRegressor',
                'NoSampleWeightWrapper', 'OneVsOneClassifier',
                'OneVsRestClassifier', 'OutputCodeClassifier', 'Pipeline',
                'RFE', 'RFECV', 'RandomizedSearchCV', 'RegressorChain',

From 3edc421af4f8ec82cee880687a67a0ffc2f61346 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Fri, 20 Mar 2020 22:55:04 +0100
Subject: [PATCH 23/44] iter

---
 sklearn/model_selection/_prediction.py        | 277 ++++++++++++++----
 .../model_selection/tests/test_prediction.py  |  78 +++--
 2 files changed, 275 insertions(+), 80 deletions(-)

diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py
index 0d72685d1573e..4782288b93876 100644
--- a/sklearn/model_selection/_prediction.py
+++ b/sklearn/model_selection/_prediction.py
@@ -1,6 +1,11 @@
 from inspect import signature
+import numbers
 
 import numpy as np
+from joblib import Parallel, delayed
+
+from ._split import check_cv
+from ._split import StratifiedShuffleSplit
 
 from ..base import clone
 from ..base import BaseEstimator
@@ -12,6 +17,7 @@
 from ..metrics import roc_curve
 from ..preprocessing import LabelEncoder
 from ..utils import check_array
+from ..utils import _safe_indexing
 from ..utils.multiclass import check_classification_targets
 from ..utils.multiclass import type_of_target
 from ..utils.validation import check_is_fitted
@@ -32,7 +38,7 @@ class CutoffClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
         The classifier, fitted or not fitted, from which we want to optimize
         the decision threshold used during `predict`.
 
-    objective_metric : callable, {"tpr", "tnr"} or ndarray of shape (2, 2), \
+    objective_metric : callable or {"tpr", "tnr"} \
             default=balanced_accuracy_score
         The objective metric to be optimized. Can be one of:
 
@@ -40,37 +46,72 @@ class CutoffClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
         * `"tpr"`: find the decision threshold for a true positive ratio (TPR)
           of `objective_value`;
         * `"tnr"`: find the decision threshold for a true negative ratio (TNR)
-          of `objective_value`;
-        * `"cost_matrix"`: find the decision threshold which minimize the total
-           cost using the cost matrix given in `objective_value`.
+          of `objective_value`.
 
     objective_metric_params : dict, default=None
         Some extra parameters to pass to `objective_metric`.
 
     objective_value : float, default=None
         The value associated with the `objective_metric` metric for which we
-        want to find the decision threshold. When
-        `objective_metric='cost_matrix'`, this parameter should be a 2x2 cost
-        matrix with the same organization than a
-        :func:`sklearn.metrics.confusion_matrix`: the count of true negatives
-        is :math:`C_{0,0}`, false negatives is :math:`C_{1,0}`, true positives
-        is :math:`C_{1,1}` and false positives is :math:`C_{0,1}`.
+        want to find the decision threshold when `objective_metric` is equal to
+        `"tpr"` or `"tnr"`.
 
     method : {"auto", "decision_function", "predict_proba"}, default="auto"
         Methods by the classifier `base_estimator` corresponding to the
         decision function for which we want to find a threshold. It can be:
 
-        * if `"auto"`, it will try to invoke, for each estimator,
+        * if `"auto"`, it will try to invoke, for each classifier,
           `"decision_function` or `"predict_proba"` in that order.
         * otherwise, one of `"predict_proba"` or `"decision_function"`.
-          If the method is not implemented by the estimator, it will raise an
+          If the method is not implemented by the classifier, it will raise an
           error.
 
+    n_threshold : int, default=1000
+        The number of decision threshold to use when discretizing the output
+        of the classifier `method`.
+
     pos_label : int or str, default=None
         The label of the positive class. When `pos_label=None`, if `y_true` is
         in `{-1, 1}` or `{0, 1}`, `pos_label` is set to 1, otherwise an error
         will be raised.
 
+    cv : int, float, cross-validation generator, iterable or "prefit", \
+            default=None
+        Determines the cross-validation splitting strategy used in
+        `cross_val_predict` to train classifier. Possible inputs for cv are:
+
+        * None, to use the default 5-fold stratified K-fold cross validation;
+        * An integer number, to specify the number of folds in a stratified
+          k-fold;
+        * A float number, to specify a single shuffle split. The floating
+          number should be in (0, 1) and represent the size of the validation
+          set;
+        * An object to be used as a cross-validation generator;
+        * An iterable yielding train, test splits;
+        * "prefit", to bypass the cross-validation.
+
+        Refer :ref:`User Guide <cross_validation>` for the various
+        cross-validation strategies that can be used here.
+
+    refit : "auto" or bool, default="auto"
+        Whether or not to refit the classifier on the entire training set once
+        the decision threshold has been found. By default, `refit="auto"` is
+        equivalent to `refit=False` when `cv` is a float number using a single
+        shuffle split or `cv="prefit"` otherwise `refit=True` in all other
+        cases. Note that forcing `refit=False` on cross-validation having more
+        than a single split will raise an error. Similarly, `refit=True` in
+        conjunction with `cv="prefit"` will raise an error.
+
+    random_state : int or RandomState, default=None
+        Controls the randomness of the training and testing indices produced
+        when `cv` is a single shuffle split (i.e., giving a float number).
+        See :term:`Glossary <random_state>`.
+
+    n_jobs : int, default=None
+        The number of jobs to run in parallel all `estimators` `fit`.
+        `None` means 1 unless in a `joblib.parallel_backend` context. -1 means
+        using all processors. See Glossary for more details.
+
     Attributes
     ----------
     decision_threshold_ : float
@@ -134,14 +175,24 @@ def __init__(
         objective_metric_params=None,
         objective_value=None,
         method="auto",
-        pos_label=None
+        n_threshold=1000,
+        pos_label=None,
+        cv=None,
+        refit="auto",
+        random_state=None,
+        n_jobs=None,
     ):
         self.base_estimator = base_estimator
         self.objective_metric = objective_metric
         self.objective_metric_params = objective_metric_params
         self.objective_value = objective_value
         self.method = method
+        self.n_threshold = n_threshold
         self.pos_label = pos_label
+        self.cv = cv
+        self.refit = refit
+        self.random_state = random_state
+        self.n_jobs = n_jobs
 
     def _validate_parameters(self):
         """Validate the input parameters."""
@@ -169,19 +220,13 @@ def _validate_parameters(self):
                     f"'base_estimator' does not implement {self.method}."
                 )
             self._method = self.method
-        if (self.objective_metric not in ("tpr", "tnr", "cost_matrix") and
+        if (self.objective_metric not in ("tpr", "tnr") and
                 self.objective_value is not None):
             raise ValueError(
                 f"When 'objective_metric' is a scoring function, "
                 f"'objective_value' should be None. Got "
                 f"{self.objective_metric} instead."
             )
-        elif self.objective_metric == "cost_matrix":
-            if self.objective_value.shape != (2, 2):
-                raise ValueError(
-                    f"When 'objective_metric' is a cost matrix, it must be of "
-                    f"shape (2, 2). Got {self.objective_value.shape} instead."
-                )
 
         # ensure binary classification if `pos_label` is not specified
         # `classes.dtype.kind` in ('O', 'U', 'S') is required to avoid
@@ -205,6 +250,13 @@ def _validate_parameters(self):
         else:
             self._pos_label = self.pos_label
 
+        if (not isinstance(self.n_threshold, numbers.Integral) or
+                self.n_threshold < 0):
+            raise ValueError(
+                f"'n_threshold' should be a strictly positive integer. "
+                f"Got {self.n_threshold} instead."
+            )
+
     @staticmethod
     def _validate_data(X, y):
         y = check_array(y, ensure_2d=False, dtype=None)
@@ -214,10 +266,118 @@ def _validate_data(X, y):
             raise ValueError(f'Expected target of binary type. Got {y_type}.')
         return X, y
 
+    def _check_cv_refit(self, cv, refit, y, random_state):
+        if isinstance(cv, numbers.Real) and 0 < cv < 1:
+            cv = StratifiedShuffleSplit(
+                n_splits=1, test_size=cv, random_state=random_state
+            )
+            refit = False if refit == "auto" else refit
+        elif cv == "prefit":
+            if refit is True:
+                raise ValueError("When cv='prefit', refit cannot be True.")
+            refit = False
+        else:
+            cv = check_cv(cv, y=y, classifier=True)
+            if refit is False:
+                raise ValueError(
+                    "When cv has several folds, refit cannot be False"
+                )
+            refit = True
+        return cv, refit
+
     @staticmethod
-    def cost_sensitive_score(y_true, y_pred, cost_matrix):
-        cm = confusion_matrix(y_true, y_pred) * cost_matrix
-        return np.diag(cm) / cm.sum()
+    def _fit_and_score(estimator, X, y, train_idx, val_idx, predict_method,
+                       score_method, score_params, pos_label_encoded):
+        if train_idx is not None:
+            X_train = _safe_indexing(X, train_idx)
+            X_val = _safe_indexing(X, val_idx)
+            y_train = _safe_indexing(y, train_idx)
+            y_val = _safe_indexing(y, val_idx)
+
+            estimator.fit(X_train, y_train)
+        else:
+            X_val, y_val = X, y
+
+        y_score = getattr(estimator, predict_method)(X_val)
+        if y_score.ndim == 2:
+            y_score = y_score[:, pos_label_encoded]
+
+        if score_method in ("tnr", "tpr"):
+            fpr, tpr, potential_thresholds = roc_curve(
+                y_val, y_score, pos_label=pos_label_encoded
+            )
+            score_thresholds = tpr
+            if score_method == "tnr":
+                score_thresholds = (1 - fpr)[::-1]
+                potential_thresholds = potential_thresholds[::-1]
+        else:
+            params = {} if score_params is None else score_params
+            if "pos_label" in signature(score_method).parameters:
+                params["pos_label"] = pos_label_encoded
+            # `np.unique` is already sorting the value, no need to call
+            #  `potential_thresholds.sort()`
+            potential_thresholds = np.unique(y_score)
+            score_thresholds = np.array([
+                score_method(y_val, (y_score >= th).astype(int), **params)
+                for th in potential_thresholds
+            ])
+
+        return potential_thresholds, score_thresholds
+
+    @staticmethod
+    def _find_decision_threshold(thresholds, scores, n_thresholds,
+                                 objective_score):
+        min_threshold = np.min([th.min() for th in thresholds])
+        max_threshold = np.max([th.max() for th in thresholds])
+        ascending = thresholds[0].argmin() == 0
+        start = min_threshold if ascending else max_threshold
+        stop = max_threshold if ascending else min_threshold
+        thresholds_interpolated = np.linspace(start, stop, num=n_thresholds)
+        mean_score = np.mean(
+            [np.interp(thresholds_interpolated,
+                       thresholds[fold_idx], scores[fold_idx])
+             for fold_idx in range(len(scores))],
+            axis=0
+        )
+        if objective_score == "highest":
+            threshold_idx = mean_score.argmax()
+        else:
+            threshold_idx = np.searchsorted(mean_score, objective_score)
+        return thresholds_interpolated[threshold_idx]
+
+    # @staticmethod
+    # def _find_best_threshold(thresholds, scores, n_thresholds):
+    #     min_threshold = np.min([th.min() for th in thresholds])
+    #     max_threshold = np.max([th.max() for th in thresholds])
+    #     thresholds_interpolated = np.linspace(
+    #         min_threshold, max_threshold, num=n_thresholds
+    #     )
+    #     scores_interpolated = np.array(
+    #         [np.interp(thresholds_interpolated, thresholds[fold_idx],
+    #                    scores[fold_idx])
+    #          for fold_idx in range(len(scores))]
+    #     )
+    #     return thresholds_interpolated[
+    #         np.mean(scores_interpolated, axis=0).argmax()
+    #     ]
+
+    # @staticmethod
+    # def _find_closest_threshold(thresholds, scores, n_thresholds,
+    #                             objective_score):
+    #     min_threshold = np.min([th.min() for th in thresholds])
+    #     max_threshold = np.max([th.max() for th in thresholds])
+    #     ascending = thresholds[0].argmin() == 0
+    #     start = min_threshold if ascending else max_threshold
+    #     stop = max_threshold if ascending else min_threshold
+    #     thresholds_interpolated = np.linspace(start, stop, num=n_thresholds)
+    #     mean_score = np.mean(
+    #         [np.interp(thresholds_interpolated,
+    #                    thresholds[fold_idx], scores[fold_idx])
+    #          for fold_idx in range(len(scores))],
+    #         axis=0
+    #     )
+    #     threshold_idx = np.searchsorted(mean_score, objective_score)
+    #     return thresholds_interpolated[threshold_idx]
 
     def fit(self, X, y):
         """Find the decision threshold.
@@ -238,11 +398,22 @@ def fit(self, X, y):
         """
         X, y = self._validate_data(X, y)
 
-        try:
+        cv, refit = self._check_cv_refit(
+            self.cv, self.refit, y, self.random_state
+        )
+
+        # Start by fitting the final estimator
+        if refit:
+            self._estimator = clone(self.base_estimator).fit(X, y)
+        elif cv == "prefit":
             check_is_fitted(self.base_estimator, attributes=["classes_"])
             self._estimator = self.base_estimator
-        except NotFittedError:
-            self._estimator = clone(self.base_estimator).fit(X, y)
+        else:  # single shuffle split CV
+            train_idx, _ = next(cv.split(X, y))
+            X_train = _safe_indexing(X, train_idx)
+            y_train = _safe_indexing(y, train_idx)
+            self._estimator = clone(self.base_estimator).fit(X_train, y_train)
+
         self.classes_ = self._estimator.classes_
         if len(self.classes_) == 1:
             raise ValueError(
@@ -264,42 +435,30 @@ def fit(self, X, y):
             self.classes_ == self._pos_label
         ).item(0)
 
-        y_score = getattr(self._estimator, self._method)(X)
-        if self.objective_metric in ("tnr", "tpr"):
-            fpr, tpr, thresholds = roc_curve(
-                y_encoded, y_score, pos_label=self._pos_label_encoded
+        if cv == "prefit" or not refit:
+            model = self._estimator
+            splits = ([None, range(len(X))],)
+        else:
+            model = clone(self.base_estimator)
+            splits = cv.split(X, y)
+
+        thresholds, scores = zip(*Parallel(n_jobs=self.n_jobs)(
+            delayed(self._fit_and_score)(
+                model, X, y_encoded, train_idx, val_idx,
+                self._method,
+                self.objective_metric, self.objective_metric_params,
+                self._pos_label_encoded
             )
-            metric = tpr
-            if self.objective_metric == "tnr":
-                tnr, thresholds = (1 - fpr)[::-1], thresholds[::-1]
-                metric = tnr
+            for train_idx, val_idx in splits
+        ))
 
-            threshold_idx = np.searchsorted(
-                metric, self.objective_value
-            )
-            self.decision_threshold_ = thresholds[threshold_idx]
+        if self.objective_metric in ("tnr", "tpr"):
+            objective_value = self.objective_value
         else:
-            if y_score.ndim == 2:
-                y_score = y_score[:, self._pos_label_encoded]
-            # `np.unique` is already sorting the value, no need to call
-            #  `thresholds.sort()`
-            thresholds = np.unique(y_score)
-            scores = []
-            params = ({} if self.objective_metric_params is None
-                      else self.objective_metric_params)
-            if callable(self.objective_metric):
-                metric_func = self.objective_metric
-                metric_signature = signature(metric_func)
-                if "pos_label" in metric_signature.parameters:
-                    params["pos_label"] = self._pos_label_encoded
-            else:
-                metric_func = self.cost_sensitive_score
-                params["cost_matrix"] = self.objective_value
-            for th in thresholds:
-                scores.append(metric_func(
-                    y_encoded, (y_score >= th).astype(int), **params)
-                )
-            self.decision_threshold_ = thresholds[np.argmax(scores)]
+            objective_value = "highest"
+        self.decision_threshold_ = self._find_decision_threshold(
+            thresholds, scores, self.n_threshold, objective_value
+        )
 
         return self
 
diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py
index fc26d02c5a844..e1b7fb540e01c 100644
--- a/sklearn/model_selection/tests/test_prediction.py
+++ b/sklearn/model_selection/tests/test_prediction.py
@@ -9,10 +9,12 @@
 from sklearn.metrics import balanced_accuracy_score
 from sklearn.metrics import f1_score
 from sklearn.metrics import fbeta_score
+from sklearn.model_selection import StratifiedShuffleSplit
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler
 from sklearn.svm import SVC
 from sklearn.utils._testing import assert_array_equal
+from sklearn.utils._testing import assert_allclose
 
 from sklearn.model_selection import CutoffClassifier
 
@@ -36,10 +38,15 @@ def fit(self, X, y):
         (LogisticRegression,
          {"objective_metric": "accuracy", "objective_value": 0.5}, ValueError,
          "When 'objective_metric' is a scoring function"),
-        (LogisticRegression,
-         {"objective_metric": "cost_matrix",
-          "objective_value": np.array([[1], [2]])}, ValueError,
-         "When 'objective_metric' is a cost matrix"),
+        (LogisticRegression, {"cv": 1.5}, ValueError, "Got 1.5"),
+        (LogisticRegression, {"refit": False}, ValueError,
+         "When cv has several folds, refit cannot be False"),
+        (LogisticRegression, {"cv": "prefit", "refit": True}, ValueError,
+         "When cv='prefit', refit cannot be True."),
+        (LogisticRegression, {"n_threshold": -10}, ValueError,
+         "'n_threshold' should be a strictly positive integer."),
+        (LogisticRegression, {"n_threshold": 10.5}, ValueError,
+         "'n_threshold' should be a strictly positive integer."),
     ]
 )
 def test_cutoffclassifier_valid_params_error(Estimator, params, err_type,
@@ -86,7 +93,7 @@ def test_cutoffclassifier_limit_tpr_tnr():
     y_pred_tpr = clf.fit(X, y).predict(X)
     clf.set_params(objective_metric="tnr")
     y_pred_tnr = (~clf.fit(X, y).predict(X).astype(bool)).astype(int)
-    assert_array_equal(y_pred_tnr, y_pred_tpr)
+    assert np.mean(y_pred_tnr == y_pred_tpr) > 0.98
 
 
 @pytest.mark.parametrize(
@@ -141,16 +148,41 @@ def test_cutoffclassifier_pretrained_estimator():
     # check that passing a pre-trained estimator is equivalent to training it
     # in the meta-estimator
     X, y = load_breast_cancer(return_X_y=True)
-    lr_prefit = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y)
+
+    random_state = 0
+    val_size = 0.2
+
+    cv = StratifiedShuffleSplit(
+        n_splits=1, test_size=val_size, random_state=random_state
+    )
+    train_idx, val_idx = next(cv.split(X, y))
+    X_train, X_val = X[train_idx], X[val_idx]
+    y_train, y_val = y[train_idx], y[val_idx]
+
+    lr_prefit = make_pipeline(StandardScaler(), LogisticRegression())
+    lr_prefit.fit(X_train, y_train)
     lr = make_pipeline(StandardScaler(), LogisticRegression())
-    model_prefit = CutoffClassifier(base_estimator=lr_prefit).fit(X, y)
-    model = CutoffClassifier(base_estimator=lr).fit(X, y)
 
-    assert (model_prefit.decision_threshold_ ==
-            pytest.approx(model.decision_threshold_))
+    model_prefit = CutoffClassifier(base_estimator=lr_prefit, cv="prefit")
+    model = CutoffClassifier(base_estimator=lr, cv=val_size, random_state=0)
+
+    model_prefit.fit(X_val, y_val)
+    model.fit(X, y)
+
+    # FIXME: we should find the same decision threshold
+    # assert (model_prefit.decision_threshold_ ==
+    #         pytest.approx(model.decision_threshold_))
+
+    # The model coefficient of the 2 models should be close because they are
+    # fitted on the same training data
+    assert_allclose(
+        model_prefit._estimator[-1].coef_, model._estimator[-1].coef_
+    )
 
     # check that we did not make any clone/copy of the pretrained estimator
+    # when this is not required
     assert model_prefit._estimator is lr_prefit
+    assert model._estimator is not lr
 
 
 @pytest.mark.parametrize(
@@ -180,16 +212,20 @@ def test_cutoffclassifier_with_string_targets(method, dtype, metric):
     assert y_pred.item(0) in classes
 
 
-def test_cutoffclassifier_cost_matrix():
+@pytest.mark.parametrize(
+    "params",
+    [
+        {"cv": 5, "refit": True},
+        {"cv": 0.2, "refit": False},
+        {"cv": 0.2, "refit": True},
+        {"cv": "prefit"},
+    ]
+)
+def test_tmp_fit(params):
     X, y = load_breast_cancer(return_X_y=True)
-    cost_matrix = np.array([[0, 0],
-                            [0, 1]])
-    clf = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y)
+
+    estimator = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y)
     model = CutoffClassifier(
-        base_estimator=clf,
-        objective_metric="cost_matrix",
-        objective_value=cost_matrix,
-    )
-    model.fit(X, y)
-    assert clf.predict(X).sum() < model.predict(X).sum()
-    assert model.predict(X).sum() > (y.size * 0.9)
+        base_estimator=estimator,
+        **params
+    ).fit(X, y)

From 982918a0e84c3c3bf3cd0139dd7cda68108297a5 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Fri, 20 Mar 2020 23:02:59 +0100
Subject: [PATCH 24/44] remove unsued code

---
 sklearn/model_selection/_prediction.py | 34 --------------------------
 1 file changed, 34 deletions(-)

diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py
index 4782288b93876..ac1335b065a03 100644
--- a/sklearn/model_selection/_prediction.py
+++ b/sklearn/model_selection/_prediction.py
@@ -345,40 +345,6 @@ def _find_decision_threshold(thresholds, scores, n_thresholds,
             threshold_idx = np.searchsorted(mean_score, objective_score)
         return thresholds_interpolated[threshold_idx]
 
-    # @staticmethod
-    # def _find_best_threshold(thresholds, scores, n_thresholds):
-    #     min_threshold = np.min([th.min() for th in thresholds])
-    #     max_threshold = np.max([th.max() for th in thresholds])
-    #     thresholds_interpolated = np.linspace(
-    #         min_threshold, max_threshold, num=n_thresholds
-    #     )
-    #     scores_interpolated = np.array(
-    #         [np.interp(thresholds_interpolated, thresholds[fold_idx],
-    #                    scores[fold_idx])
-    #          for fold_idx in range(len(scores))]
-    #     )
-    #     return thresholds_interpolated[
-    #         np.mean(scores_interpolated, axis=0).argmax()
-    #     ]
-
-    # @staticmethod
-    # def _find_closest_threshold(thresholds, scores, n_thresholds,
-    #                             objective_score):
-    #     min_threshold = np.min([th.min() for th in thresholds])
-    #     max_threshold = np.max([th.max() for th in thresholds])
-    #     ascending = thresholds[0].argmin() == 0
-    #     start = min_threshold if ascending else max_threshold
-    #     stop = max_threshold if ascending else min_threshold
-    #     thresholds_interpolated = np.linspace(start, stop, num=n_thresholds)
-    #     mean_score = np.mean(
-    #         [np.interp(thresholds_interpolated,
-    #                    thresholds[fold_idx], scores[fold_idx])
-    #          for fold_idx in range(len(scores))],
-    #         axis=0
-    #     )
-    #     threshold_idx = np.searchsorted(mean_score, objective_score)
-    #     return thresholds_interpolated[threshold_idx]
-
     def fit(self, X, y):
         """Find the decision threshold.
 

From a42613154e79d61b201d585acdf5566b91c6c057 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Fri, 20 Mar 2020 23:04:39 +0100
Subject: [PATCH 25/44] docstring

---
 sklearn/model_selection/_prediction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py
index ac1335b065a03..a53ad06ccf3d4 100644
--- a/sklearn/model_selection/_prediction.py
+++ b/sklearn/model_selection/_prediction.py
@@ -165,7 +165,7 @@ class imbalanced.
     CutoffClassifier(...)
     >>> y_pred = model_optimized.predict(X_test)
     >>> print(f"Score: {balanced_accuracy_score(y_test, y_pred):.3f}")
-    Score: 0.962
+    Score: 0.972
     """
 
     def __init__(

From 470a09c65fd4280c0b51308dcc9759de241224ca Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Fri, 20 Mar 2020 23:07:06 +0100
Subject: [PATCH 26/44] pep9

---
 sklearn/model_selection/_prediction.py           | 2 --
 sklearn/model_selection/tests/test_prediction.py | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py
index a53ad06ccf3d4..6082a5bb4c9ec 100644
--- a/sklearn/model_selection/_prediction.py
+++ b/sklearn/model_selection/_prediction.py
@@ -11,9 +11,7 @@
 from ..base import BaseEstimator
 from ..base import ClassifierMixin
 from ..base import MetaEstimatorMixin
-from ..exceptions import NotFittedError
 from ..metrics import balanced_accuracy_score
-from ..metrics import confusion_matrix
 from ..metrics import roc_curve
 from ..preprocessing import LabelEncoder
 from ..utils import check_array
diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py
index e1b7fb540e01c..773028572b143 100644
--- a/sklearn/model_selection/tests/test_prediction.py
+++ b/sklearn/model_selection/tests/test_prediction.py
@@ -225,7 +225,7 @@ def test_tmp_fit(params):
     X, y = load_breast_cancer(return_X_y=True)
 
     estimator = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y)
-    model = CutoffClassifier(
+    CutoffClassifier(
         base_estimator=estimator,
         **params
     ).fit(X, y)

From a56558932d78c3dca172525867070ce44be7be77 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Fri, 7 Aug 2020 09:42:50 +0200
Subject: [PATCH 27/44] TST wip

---
 sklearn/metrics/_scorer.py                  | 20 +++++--
 sklearn/metrics/tests/test_score_objects.py | 58 +++++++++++++++++++++
 2 files changed, 75 insertions(+), 3 deletions(-)

diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py
index 9ad57f4611e52..5d32cc788cd72 100644
--- a/sklearn/metrics/_scorer.py
+++ b/sklearn/metrics/_scorer.py
@@ -239,7 +239,9 @@ def _score(self, method_caller, clf, X, y, sample_weight=None):
         y_pred = method_caller(clf, "predict_proba", X)
         if y_type == "binary":
             if y_pred.shape[1] == 2:
-                y_pred = y_pred[:, 1]
+                pos_label = self._kwargs.get("pos_label", clf.classes_[1])
+                col_idx = np.flatnonzero(clf.classes_ == pos_label)[0]
+                y_pred = y_pred[:, col_idx]
             elif y_pred.shape[1] == 1:  # not multiclass
                 raise ValueError('got predict_proba of shape {},'
                                  ' but need classifier with two'
@@ -298,16 +300,28 @@ def _score(self, method_caller, clf, X, y, sample_weight=None):
             try:
                 y_pred = method_caller(clf, "decision_function", X)
 
-                # For multi-output multi-class estimator
                 if isinstance(y_pred, list):
+                    # For multi-output multi-class estimator
                     y_pred = np.vstack([p for p in y_pred]).T
+                elif (
+                    y_type == "binary"
+                    and "pos_label" in self._kwargs
+                    and self._kwargs["pos_label"] == clf.classes_[0]
+                ):
+                    # The positive class is not the `pos_label` seen by the
+                    # classifier and we need to inverse the predictions
+                    y_pred *= -1
 
             except (NotImplementedError, AttributeError):
                 y_pred = method_caller(clf, "predict_proba", X)
 
                 if y_type == "binary":
                     if y_pred.shape[1] == 2:
-                        y_pred = y_pred[:, 1]
+                        pos_label = self._kwargs.get(
+                            "pos_label", clf.classes_[1]
+                        )
+                        col_idx = np.flatnonzero(clf.classes_ == pos_label)[0]
+                        y_pred = y_pred[:, col_idx]
                     else:
                         raise ValueError('got predict_proba of shape {},'
                                          ' but need classifier with two'
diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py
index 67900b7cb77c3..2ee064ecd9b12 100644
--- a/sklearn/metrics/tests/test_score_objects.py
+++ b/sklearn/metrics/tests/test_score_objects.py
@@ -618,6 +618,8 @@ def test_multimetric_scorer_calls_method_once(scorers, expected_predict_count,
     mock_est.predict = predict_func
     mock_est.predict_proba = predict_proba_func
     mock_est.decision_function = decision_function_func
+    # add the classes that would be found during fit
+    mock_est.classes_ = np.array([0, 1])
 
     scorer_dict = _check_multimetric_scoring(LogisticRegression(), scorers)
     multi_scorer = _MultimetricScorer(**scorer_dict)
@@ -747,3 +749,59 @@ def test_multiclass_roc_no_proba_scorer_errors(scorer_name):
     msg = "'Perceptron' object has no attribute 'predict_proba'"
     with pytest.raises(AttributeError, match=msg):
         scorer(lr, X, y)
+
+
+def _make_imbalanced_string_dataset():
+    from sklearn.datasets import load_breast_cancer
+    from sklearn.utils import shuffle
+
+    X, y = load_breast_cancer(return_X_y=True)
+    # create an highly imbalanced
+    idx_positive = np.flatnonzero(y == 1)
+    idx_negative = np.flatnonzero(y == 0)
+    idx_selected = np.hstack([idx_negative, idx_positive[:25]])
+    X, y = X[idx_selected], y[idx_selected]
+    X, y = shuffle(X, y, random_state=42)
+    # only use 2 features to make the problem even harder
+    X = X[:, :2]
+    y = np.array(
+        ["cancer" if c == 1 else "not cancer" for c in y], dtype=object
+    )
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, stratify=y, random_state=0,
+    )
+    return X_train, X_test, y_train, y_test
+
+
+def test_xxx():
+    from sklearn.metrics import average_precision_score
+    X_train, X_test, y_train, y_test = _make_imbalanced_string_dataset()
+
+    classifier = LogisticRegression().fit(X_train, y_train)
+    y_proba = classifier.predict_proba(X_test)
+    y_decision_function = classifier.decision_function(X_test)
+
+    pos_label = "cancer"
+    y_proba = y_proba[:, 0]
+    y_decision_function *= -1
+
+    assert classifier.classes_[0] == pos_label
+
+    ap_proba = average_precision_score(y_test, y_proba, pos_label=pos_label)
+    ap_decision_function = average_precision_score(
+        y_test, y_decision_function, pos_label=pos_label
+    )
+    assert ap_proba == pytest.approx(ap_decision_function)
+
+    average_precision_scorer = make_scorer(
+        average_precision_score, needs_threshold=True,
+    )
+    with pytest.raises(ValueError):
+        average_precision_scorer(classifier, X_test, y_test)
+
+    average_precision_scorer = make_scorer(
+        average_precision_score, needs_threshold=True, pos_label=pos_label
+    )
+    ap_scorer = average_precision_scorer(classifier, X_test, y_test)
+
+    assert ap_scorer == pytest.approx(ap_proba)

From 6c0db49123b74ae32998d8ccb8c40aba59653073 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Fri, 7 Aug 2020 10:43:45 +0200
Subject: [PATCH 28/44] TST wip

---
 sklearn/metrics/tests/test_score_objects.py | 79 +++++++++++++++------
 1 file changed, 58 insertions(+), 21 deletions(-)

diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py
index 2ee064ecd9b12..4f146c34fe606 100644
--- a/sklearn/metrics/tests/test_score_objects.py
+++ b/sklearn/metrics/tests/test_score_objects.py
@@ -16,9 +16,18 @@
 from sklearn.utils._testing import ignore_warnings
 
 from sklearn.base import BaseEstimator
-from sklearn.metrics import (f1_score, r2_score, roc_auc_score, fbeta_score,
-                             log_loss, precision_score, recall_score,
-                             jaccard_score)
+from sklearn.metrics import (
+    average_precision_score,
+    brier_score_loss,
+    f1_score,
+    fbeta_score,
+    jaccard_score,
+    log_loss,
+    precision_score,
+    r2_score,
+    recall_score,
+    roc_auc_score,
+)
 from sklearn.metrics import cluster as cluster_module
 from sklearn.metrics import check_scoring
 from sklearn.metrics._scorer import (_PredictScorer, _passthrough_scorer,
@@ -751,7 +760,8 @@ def test_multiclass_roc_no_proba_scorer_errors(scorer_name):
         scorer(lr, X, y)
 
 
-def _make_imbalanced_string_dataset():
+@pytest.fixture
+def fitted_clf_predictions():
     from sklearn.datasets import load_breast_cancer
     from sklearn.utils import shuffle
 
@@ -770,38 +780,65 @@ def _make_imbalanced_string_dataset():
     X_train, X_test, y_train, y_test = train_test_split(
         X, y, stratify=y, random_state=0,
     )
-    return X_train, X_test, y_train, y_test
+    classifier = LogisticRegression().fit(X_train, y_train)
+    y_pred_proba = classifier.predict_proba(X_test)
+    y_pred_decision = classifier.decision_function(X_test)
 
+    return classifier, X_test, y_test, y_pred_proba, y_pred_decision
 
-def test_xxx():
-    from sklearn.metrics import average_precision_score
-    X_train, X_test, y_train, y_test = _make_imbalanced_string_dataset()
 
-    classifier = LogisticRegression().fit(X_train, y_train)
-    y_proba = classifier.predict_proba(X_test)
-    y_decision_function = classifier.decision_function(X_test)
+def test_average_precision_pos_label(fitted_clf_predictions):
+    clf, X_test, y_test, y_pred_proba, y_pred_decision = fitted_clf_predictions
 
     pos_label = "cancer"
-    y_proba = y_proba[:, 0]
-    y_decision_function *= -1
-
-    assert classifier.classes_[0] == pos_label
-
-    ap_proba = average_precision_score(y_test, y_proba, pos_label=pos_label)
+    # we need to select the positive column or reverse the decision values
+    y_pred_proba = y_pred_proba[:, 0]
+    y_pred_decision = y_pred_decision * -1
+    assert clf.classes_[0] == pos_label
+
+    # check that when calling the scoring function, probability estimates and
+    # decision values lead to the same results
+    ap_proba = average_precision_score(
+        y_test, y_pred_proba, pos_label=pos_label
+    )
     ap_decision_function = average_precision_score(
-        y_test, y_decision_function, pos_label=pos_label
+        y_test, y_pred_decision, pos_label=pos_label
     )
     assert ap_proba == pytest.approx(ap_decision_function)
 
+    # create a scorer which would require to pass a `pos_label`
+    # check that it fails if `pos_label` is not provided
     average_precision_scorer = make_scorer(
         average_precision_score, needs_threshold=True,
     )
-    with pytest.raises(ValueError):
-        average_precision_scorer(classifier, X_test, y_test)
+    err_msg = "pos_label=1 is invalid. Set it to a label in y_true."
+    with pytest.raises(ValueError, match=err_msg):
+        average_precision_scorer(clf, X_test, y_test)
 
+    # otherwise, the scorer should give the same results than calling the
+    # scoring function
     average_precision_scorer = make_scorer(
         average_precision_score, needs_threshold=True, pos_label=pos_label
     )
-    ap_scorer = average_precision_scorer(classifier, X_test, y_test)
+    ap_scorer = average_precision_scorer(clf, X_test, y_test)
 
     assert ap_scorer == pytest.approx(ap_proba)
+
+
+def test_brier_score_loss_pos_label(fitted_clf_predictions):
+    clf, X_test, y_test, y_pred_proba, y_pred_decision = fitted_clf_predictions
+
+    pos_label = "cancer"
+    # we need to select the positive column or reverse the decision values
+    y_pred_proba = y_pred_proba[:, 0]
+    y_pred_decision = y_pred_decision * -1
+    assert clf.classes_[0] == pos_label
+
+    print(brier_score_loss(y_test, y_pred_proba, pos_label=pos_label))
+    brier_scorer = make_scorer(
+        brier_score_loss,
+        needs_proba=True,
+        greater_is_better=False,
+        pos_label=pos_label,
+    )
+    print(brier_scorer(clf, X_test, y_test))

From c41e999205f9b6f1e8ece6d0fcf6baa0cde52361 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Fri, 7 Aug 2020 11:01:36 +0200
Subject: [PATCH 29/44] TST wip

---
 sklearn/metrics/tests/test_score_objects.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py
index 4f146c34fe606..040a64c298482 100644
--- a/sklearn/metrics/tests/test_score_objects.py
+++ b/sklearn/metrics/tests/test_score_objects.py
@@ -829,16 +829,18 @@ def test_brier_score_loss_pos_label(fitted_clf_predictions):
     clf, X_test, y_test, y_pred_proba, y_pred_decision = fitted_clf_predictions
 
     pos_label = "cancer"
-    # we need to select the positive column or reverse the decision values
-    y_pred_proba = y_pred_proba[:, 0]
-    y_pred_decision = y_pred_decision * -1
     assert clf.classes_[0] == pos_label
 
-    print(brier_score_loss(y_test, y_pred_proba, pos_label=pos_label))
+    # brier score loss is symmetric
+    brier_pos_cancer = brier_score_loss(
+        y_test, y_pred_proba[:, 0], pos_label="cancer"
+    )
+    brier_pos_not_cancer = brier_score_loss(
+        y_test, y_pred_proba[:, 1], pos_label="not cancer"
+    )
+    assert brier_pos_cancer == brier_pos_not_cancer
+
     brier_scorer = make_scorer(
-        brier_score_loss,
-        needs_proba=True,
-        greater_is_better=False,
-        pos_label=pos_label,
+        brier_score_loss, needs_proba=True, pos_label=pos_label,
     )
-    print(brier_scorer(clf, X_test, y_test))
+    assert brier_scorer(clf, X_test, y_test) == pytest.approx(brier_pos_cancer)

From f53f8336ae3200de7a9d0d007ed1745356744f5d Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Fri, 7 Aug 2020 11:05:56 +0200
Subject: [PATCH 30/44] TST PEP8 + comments

---
 sklearn/metrics/tests/test_score_objects.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py
index 040a64c298482..a1a5eaeac271c 100644
--- a/sklearn/metrics/tests/test_score_objects.py
+++ b/sklearn/metrics/tests/test_score_objects.py
@@ -788,6 +788,9 @@ def fitted_clf_predictions():
 
 
 def test_average_precision_pos_label(fitted_clf_predictions):
+    # check that _ThresholdScorer will lead to the right score when passing
+    # `pos_label`. Currently, only `average_precision_score` is defined to
+    # be such a scorer.
     clf, X_test, y_test, y_pred_proba, y_pred_decision = fitted_clf_predictions
 
     pos_label = "cancer"
@@ -826,7 +829,10 @@ def test_average_precision_pos_label(fitted_clf_predictions):
 
 
 def test_brier_score_loss_pos_label(fitted_clf_predictions):
-    clf, X_test, y_test, y_pred_proba, y_pred_decision = fitted_clf_predictions
+    # check that _ProbaScorer leads to the right score when `pos_label` is
+    # provided. Currently only the `brier_score_loss` is defined to be such
+    # a scorer.
+    clf, X_test, y_test, y_pred_proba, _ = fitted_clf_predictions
 
     pos_label = "cancer"
     assert clf.classes_[0] == pos_label

From dd4e9fe00538f97487329c540aac1bb47d9ee280 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Fri, 7 Aug 2020 11:22:54 +0200
Subject: [PATCH 31/44] TST force to use predict_proba as well

---
 sklearn/metrics/tests/test_score_objects.py | 22 ++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py
index a1a5eaeac271c..135e5bd329458 100644
--- a/sklearn/metrics/tests/test_score_objects.py
+++ b/sklearn/metrics/tests/test_score_objects.py
@@ -1,10 +1,11 @@
+from copy import deepcopy
 import pickle
 import tempfile
 import shutil
 import os
 import numbers
 from unittest.mock import Mock
-from functools import partial
+from functools import partial, partialmethod
 
 import numpy as np
 import pytest
@@ -827,6 +828,25 @@ def test_average_precision_pos_label(fitted_clf_predictions):
 
     assert ap_scorer == pytest.approx(ap_proba)
 
+    # The above scorer call is using `clf.decision_function`. We will force
+    # it to use `clf.predict_proba`.
+    clf_without_predict_proba = deepcopy(clf)
+
+    def _predict_proba(self, X):
+        raise NotImplementedError
+
+    clf_without_predict_proba.predict_proba = partial(
+        _predict_proba, clf_without_predict_proba
+    )
+    # sanity check
+    with pytest.raises(NotImplementedError):
+        clf_without_predict_proba.predict_proba(X_test)
+
+    ap_scorer = average_precision_scorer(
+        clf_without_predict_proba, X_test, y_test
+    )
+    assert ap_scorer == pytest.approx(ap_proba)
+
 
 def test_brier_score_loss_pos_label(fitted_clf_predictions):
     # check that _ProbaScorer leads to the right score when `pos_label` is

From e32cfa757b40673d0dc6b7e7eea44b8c1f39b880 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Fri, 7 Aug 2020 11:26:37 +0200
Subject: [PATCH 32/44] DOC add whats + PEP8

---
 doc/whats_new/v0.24.rst                     | 5 +++++
 sklearn/metrics/tests/test_score_objects.py | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index 347b30bff5685..a6b05b4d646b5 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -268,6 +268,11 @@ Changelog
   class to be used when computing the roc auc statistics.
   :pr:`17651` by :user:`Clara Matos <claramatos>`.
 
+- |Fix| Fix a bug which was not selected the appropriate probability estimates
+  or reversing the decision values if `pos_label` was provided and it was not
+  corresponding to `classifier.classes_[1]`.
+  :pr:`#18114` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 :mod:`sklearn.model_selection`
 ..............................
 
diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py
index 135e5bd329458..a7d978a084ff6 100644
--- a/sklearn/metrics/tests/test_score_objects.py
+++ b/sklearn/metrics/tests/test_score_objects.py
@@ -5,7 +5,7 @@
 import os
 import numbers
 from unittest.mock import Mock
-from functools import partial, partialmethod
+from functools import partial
 
 import numpy as np
 import pytest

From 07915e99e8218abf1bbc14194bdd6046036b3637 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Fri, 7 Aug 2020 17:01:02 +0200
Subject: [PATCH 33/44] TST add some tolerance since the average of squared in
 diff ordered

---
 sklearn/metrics/tests/test_score_objects.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py
index a7d978a084ff6..cda860c5610be 100644
--- a/sklearn/metrics/tests/test_score_objects.py
+++ b/sklearn/metrics/tests/test_score_objects.py
@@ -864,7 +864,7 @@ def test_brier_score_loss_pos_label(fitted_clf_predictions):
     brier_pos_not_cancer = brier_score_loss(
         y_test, y_pred_proba[:, 1], pos_label="not cancer"
     )
-    assert brier_pos_cancer == brier_pos_not_cancer
+    assert brier_pos_cancer == pytest.approx(brier_pos_not_cancer)
 
     brier_scorer = make_scorer(
         brier_score_loss, needs_proba=True, pos_label=pos_label,

From fc1c4227b1978bf11efc0cb41780f59e4525af6b Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Mon, 10 Aug 2020 10:42:19 +0200
Subject: [PATCH 34/44] STY add better error message and refactor code

---
 sklearn/metrics/_scorer.py                  | 74 +++++++++++++--------
 sklearn/metrics/tests/test_score_objects.py | 21 ++++++
 2 files changed, 67 insertions(+), 28 deletions(-)

diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py
index 5d32cc788cd72..c8c0e4d4a11f5 100644
--- a/sklearn/metrics/_scorer.py
+++ b/sklearn/metrics/_scorer.py
@@ -127,6 +127,38 @@ def __init__(self, score_func, sign, kwargs):
         self._score_func = score_func
         self._sign = sign
 
+    @staticmethod
+    def _check_pos_label(pos_label, classes):
+        if pos_label not in classes:
+            raise ValueError(
+                f"pos_label should be present in the target when the "
+                f"classifier was trained. Got pos_label={pos_label} while the "
+                f"possible classes are {classes}."
+            )
+
+    def _select_proba(self, y_pred, classes, support_multi_class):
+        """Select the column of y_pred when probabilities are provided."""
+        if y_pred.shape[1] == 2:
+            pos_label = self._kwargs.get("pos_label", classes[1])
+            self._check_pos_label(pos_label, classes)
+            col_idx = np.flatnonzero(classes == pos_label)[0]
+            y_pred = y_pred[:, col_idx]
+        else:
+            err_msg = (
+                f"Got predict_proba of shape {y_pred.shape}, but need "
+                f"classifier with two classes for {self._score_func.__name__} "
+                f"scoring"
+            )
+            if support_multi_class and y_pred.shape[1] == 1:
+                # In _ProbaScorer, y_true can be tagged as binary while the
+                # y_pred is multi_class. This case is supported when label is
+                # provided.
+                raise ValueError(err_msg)
+            else:
+                raise ValueError(err_msg)
+
+        return y_pred
+
     def __repr__(self):
         kwargs_string = "".join([", %s=%s" % (str(k), str(v))
                                  for k, v in self._kwargs.items()])
@@ -239,14 +271,9 @@ def _score(self, method_caller, clf, X, y, sample_weight=None):
         y_pred = method_caller(clf, "predict_proba", X)
         if y_type == "binary":
             if y_pred.shape[1] == 2:
-                pos_label = self._kwargs.get("pos_label", clf.classes_[1])
-                col_idx = np.flatnonzero(clf.classes_ == pos_label)[0]
-                y_pred = y_pred[:, col_idx]
-            elif y_pred.shape[1] == 1:  # not multiclass
-                raise ValueError('got predict_proba of shape {},'
-                                 ' but need classifier with two'
-                                 ' classes for {} scoring'.format(
-                                     y_pred.shape, self._score_func.__name__))
+                self._select_proba(
+                    y_pred, clf.classes_, support_multi_class=True
+                )
         if sample_weight is not None:
             return self._sign * self._score_func(y, y_pred,
                                                  sample_weight=sample_weight,
@@ -303,31 +330,22 @@ def _score(self, method_caller, clf, X, y, sample_weight=None):
                 if isinstance(y_pred, list):
                     # For multi-output multi-class estimator
                     y_pred = np.vstack([p for p in y_pred]).T
-                elif (
-                    y_type == "binary"
-                    and "pos_label" in self._kwargs
-                    and self._kwargs["pos_label"] == clf.classes_[0]
-                ):
-                    # The positive class is not the `pos_label` seen by the
-                    # classifier and we need to inverse the predictions
-                    y_pred *= -1
+                elif y_type == "binary" and "pos_label" in self._kwargs:
+                    self._check_pos_label(
+                        self._kwargs["pos_label"], clf.classes
+                    )
+                    if self._kwargs["pos_label"] == clf.classes_[0]:
+                        # The positive class is not the `pos_label` seen by the
+                        # classifier and we need to inverse the predictions
+                        y_pred *= -1
 
             except (NotImplementedError, AttributeError):
                 y_pred = method_caller(clf, "predict_proba", X)
 
                 if y_type == "binary":
-                    if y_pred.shape[1] == 2:
-                        pos_label = self._kwargs.get(
-                            "pos_label", clf.classes_[1]
-                        )
-                        col_idx = np.flatnonzero(clf.classes_ == pos_label)[0]
-                        y_pred = y_pred[:, col_idx]
-                    else:
-                        raise ValueError('got predict_proba of shape {},'
-                                         ' but need classifier with two'
-                                         ' classes for {} scoring'.format(
-                                             y_pred.shape,
-                                             self._score_func.__name__))
+                    y_pred = self._select_proba(
+                        y_pred, clf.classes_, support_multi_class=False,
+                    )
                 elif isinstance(y_pred, list):
                     y_pred = np.vstack([p[:, -1] for p in y_pred]).T
 
diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py
index cda860c5610be..9521dcdf3a064 100644
--- a/sklearn/metrics/tests/test_score_objects.py
+++ b/sklearn/metrics/tests/test_score_objects.py
@@ -870,3 +870,24 @@ def test_brier_score_loss_pos_label(fitted_clf_predictions):
         brier_score_loss, needs_proba=True, pos_label=pos_label,
     )
     assert brier_scorer(clf, X_test, y_test) == pytest.approx(brier_pos_cancer)
+
+
+@pytest.mark.parametrize(
+    "scorer",
+    [
+        make_scorer(
+            average_precision_score, needs_threshold=True, pos_label="xxx"
+        ),
+        make_scorer(brier_score_loss, needs_proba=True, pos_label="xxx"),
+    ],
+    ids=["ThresholdScorer", "ProbaScorer"],
+)
+def test_scorer_select_proba_error(scorer):
+    X, y = make_classification(
+        n_classes=2, n_informative=3, n_samples=20, random_state=0
+    )
+    lr = LogisticRegression(multi_class="multinomial").fit(X, y)
+
+    err_msg = "pos_label should be present in the target"
+    with pytest.raises(ValueError, match=err_msg):
+        scorer(lr, X, y)

From aa5cd1655042ba2d4425b08bc571e2517a277840 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Mon, 10 Aug 2020 11:11:34 +0200
Subject: [PATCH 35/44] fix

---
 sklearn/metrics/_scorer.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py
index c8c0e4d4a11f5..fc16db7bfbbe7 100644
--- a/sklearn/metrics/_scorer.py
+++ b/sklearn/metrics/_scorer.py
@@ -154,7 +154,7 @@ def _select_proba(self, y_pred, classes, support_multi_class):
                 # y_pred is multi_class. This case is supported when label is
                 # provided.
                 raise ValueError(err_msg)
-            else:
+            elif not support_multi_class:
                 raise ValueError(err_msg)
 
         return y_pred
@@ -270,10 +270,9 @@ def _score(self, method_caller, clf, X, y, sample_weight=None):
         y_type = type_of_target(y)
         y_pred = method_caller(clf, "predict_proba", X)
         if y_type == "binary":
-            if y_pred.shape[1] == 2:
-                self._select_proba(
-                    y_pred, clf.classes_, support_multi_class=True
-                )
+            y_pred = self._select_proba(
+                y_pred, clf.classes_, support_multi_class=True
+            )
         if sample_weight is not None:
             return self._sign * self._score_func(y, y_pred,
                                                  sample_weight=sample_weight,
@@ -332,7 +331,7 @@ def _score(self, method_caller, clf, X, y, sample_weight=None):
                     y_pred = np.vstack([p for p in y_pred]).T
                 elif y_type == "binary" and "pos_label" in self._kwargs:
                     self._check_pos_label(
-                        self._kwargs["pos_label"], clf.classes
+                        self._kwargs["pos_label"], clf.classes_
                     )
                     if self._kwargs["pos_label"] == clf.classes_[0]:
                         # The positive class is not the `pos_label` seen by the

From a669ecfb95c1bd9d4f258818dfc3b208d510d82b Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Mon, 10 Aug 2020 11:38:12 +0200
Subject: [PATCH 36/44] fix

---
 sklearn/metrics/_scorer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py
index fc16db7bfbbe7..dbd9d73326308 100644
--- a/sklearn/metrics/_scorer.py
+++ b/sklearn/metrics/_scorer.py
@@ -129,7 +129,7 @@ def __init__(self, score_func, sign, kwargs):
 
     @staticmethod
     def _check_pos_label(pos_label, classes):
-        if pos_label not in classes:
+        if pos_label not in list(classes):
             raise ValueError(
                 f"pos_label should be present in the target when the "
                 f"classifier was trained. Got pos_label={pos_label} while the "

From a477e7bfa66636fb215be7b72da8bde7b72689c8 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Tue, 11 Aug 2020 14:12:50 +0200
Subject: [PATCH 37/44] Update sklearn/metrics/tests/test_score_objects.py

Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
---
 sklearn/metrics/tests/test_score_objects.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py
index 9521dcdf3a064..a996af52dd257 100644
--- a/sklearn/metrics/tests/test_score_objects.py
+++ b/sklearn/metrics/tests/test_score_objects.py
@@ -767,7 +767,7 @@ def fitted_clf_predictions():
     from sklearn.utils import shuffle
 
     X, y = load_breast_cancer(return_X_y=True)
-    # create an highly imbalanced
+    # create an highly imbalanced classification task
     idx_positive = np.flatnonzero(y == 1)
     idx_negative = np.flatnonzero(y == 0)
     idx_selected = np.hstack([idx_negative, idx_positive[:25]])

From 09b47bbaeebfc9ae441613956ee225a408bc2888 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Tue, 11 Aug 2020 14:29:34 +0200
Subject: [PATCH 38/44] add test for PredictScorer

---
 sklearn/metrics/tests/test_score_objects.py | 30 ++++++++++++++++++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py
index a996af52dd257..615445eead601 100644
--- a/sklearn/metrics/tests/test_score_objects.py
+++ b/sklearn/metrics/tests/test_score_objects.py
@@ -782,17 +782,19 @@ def fitted_clf_predictions():
         X, y, stratify=y, random_state=0,
     )
     classifier = LogisticRegression().fit(X_train, y_train)
+    y_pred = classifier.predict(X_test)
     y_pred_proba = classifier.predict_proba(X_test)
     y_pred_decision = classifier.decision_function(X_test)
 
-    return classifier, X_test, y_test, y_pred_proba, y_pred_decision
+    return classifier, X_test, y_test, y_pred, y_pred_proba, y_pred_decision
 
 
 def test_average_precision_pos_label(fitted_clf_predictions):
     # check that _ThresholdScorer will lead to the right score when passing
     # `pos_label`. Currently, only `average_precision_score` is defined to
     # be such a scorer.
-    clf, X_test, y_test, y_pred_proba, y_pred_decision = fitted_clf_predictions
+    clf, X_test, y_test, _, y_pred_proba, y_pred_decision = \
+        fitted_clf_predictions
 
     pos_label = "cancer"
     # we need to select the positive column or reverse the decision values
@@ -852,7 +854,7 @@ def test_brier_score_loss_pos_label(fitted_clf_predictions):
     # check that _ProbaScorer leads to the right score when `pos_label` is
     # provided. Currently only the `brier_score_loss` is defined to be such
     # a scorer.
-    clf, X_test, y_test, y_pred_proba, _ = fitted_clf_predictions
+    clf, X_test, y_test, _, y_pred_proba, _ = fitted_clf_predictions
 
     pos_label = "cancer"
     assert clf.classes_[0] == pos_label
@@ -872,6 +874,26 @@ def test_brier_score_loss_pos_label(fitted_clf_predictions):
     assert brier_scorer(clf, X_test, y_test) == pytest.approx(brier_pos_cancer)
 
 
+@pytest.mark.parametrize(
+    "score_func", [f1_score, precision_score, recall_score, jaccard_score]
+)
+def test_non_symmetric_metric_pos_label(score_func, fitted_clf_predictions):
+    # check that _PredictScorer leads to the right score when `pos_label` is
+    # provided. We check for all possible metric supported.
+    clf, X_test, y_test, y_pred, _, _ = fitted_clf_predictions
+
+    pos_label = "cancer"
+    assert clf.classes_[0] == pos_label
+
+    score_pos_cancer = score_func(y_test, y_pred, pos_label="cancer")
+    score_pos_not_cancer = score_func(y_test, y_pred, pos_label="not cancer")
+
+    assert score_pos_cancer != pytest.approx(score_pos_not_cancer)
+
+    scorer = make_scorer(score_func, pos_label=pos_label)
+    assert scorer(clf, X_test, y_test) == pytest.approx(score_pos_cancer)
+
+
 @pytest.mark.parametrize(
     "scorer",
     [
@@ -883,6 +905,8 @@ def test_brier_score_loss_pos_label(fitted_clf_predictions):
     ids=["ThresholdScorer", "ProbaScorer"],
 )
 def test_scorer_select_proba_error(scorer):
+    # check that we raise the the proper error when passing an unknown
+    # pos_label
     X, y = make_classification(
         n_classes=2, n_informative=3, n_samples=20, random_state=0
     )

From 42e7f00a34ba3ee76d3d04021a88640431c4ba87 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Tue, 18 Aug 2020 10:55:21 +0200
Subject: [PATCH 39/44] apply olivier suggestions

---
 doc/whats_new/v0.24.rst                     |  8 ++--
 sklearn/metrics/_scorer.py                  |  9 ++--
 sklearn/metrics/tests/test_score_objects.py | 47 +++++++++++++++++----
 3 files changed, 47 insertions(+), 17 deletions(-)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index a6b05b4d646b5..37b3b4fd1cad9 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -268,9 +268,11 @@ Changelog
   class to be used when computing the roc auc statistics.
   :pr:`17651` by :user:`Clara Matos <claramatos>`.
 
-- |Fix| Fix a bug which was not selected the appropriate probability estimates
-  or reversing the decision values if `pos_label` was provided and it was not
-  corresponding to `classifier.classes_[1]`.
+- |Fix| Fix scorers that accept a pos_label parameter and compute their metrics
+  from values returned by `decision_function` or `predict_proba`. Previously,
+  they would return erroneous values when pos_label was not corresponding to
+  `classifier.classes_[1]`. This is especially important when training
+  classifiers directly with string labeled target classes.
   :pr:`#18114` by :user:`Guillaume Lemaitre <glemaitre>`.
 
 :mod:`sklearn.model_selection`
diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py
index dbd9d73326308..796444f612a6d 100644
--- a/sklearn/metrics/_scorer.py
+++ b/sklearn/metrics/_scorer.py
@@ -131,9 +131,7 @@ def __init__(self, score_func, sign, kwargs):
     def _check_pos_label(pos_label, classes):
         if pos_label not in list(classes):
             raise ValueError(
-                f"pos_label should be present in the target when the "
-                f"classifier was trained. Got pos_label={pos_label} while the "
-                f"possible classes are {classes}."
+                f"pos_label={pos_label} is not a valid label: {classes}"
             )
 
     def _select_proba(self, y_pred, classes, support_multi_class):
@@ -334,8 +332,9 @@ def _score(self, method_caller, clf, X, y, sample_weight=None):
                         self._kwargs["pos_label"], clf.classes_
                     )
                     if self._kwargs["pos_label"] == clf.classes_[0]:
-                        # The positive class is not the `pos_label` seen by the
-                        # classifier and we need to inverse the predictions
+                        # The implicit positive class of the binary classifier
+                        # does not match `pos_label`: we need to invert the
+                        # predictions
                         y_pred *= -1
 
             except (NotImplementedError, AttributeError):
diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py
index 615445eead601..52bafb160bfdb 100644
--- a/sklearn/metrics/tests/test_score_objects.py
+++ b/sklearn/metrics/tests/test_score_objects.py
@@ -762,7 +762,32 @@ def test_multiclass_roc_no_proba_scorer_errors(scorer_name):
 
 
 @pytest.fixture
-def fitted_clf_predictions():
+def string_labeled_classification_problem():
+    """Train a classifier on binary problem with string target.
+
+    The classifier is trained on a binary classification problem where the
+    minority class of interest has a string label that is intentionally not the
+    greatest class label using the lexicographic order.
+
+    In addition, the dataset is imbalanced to better identify problems when
+    using non-symmetric performance metrics such as f1-score, average precision
+    and so on.
+
+    Returns
+    -------
+    classifier : estimator object
+        Trained classifier on the binary problem.
+    X_test : ndarray of shape (n_samples, n_features)
+        Data to be used as testing set in tests.
+    y_test : ndarray of shape (n_samples,), dtype=object
+        Binary target where labels are strings.
+    y_pred : ndarray of shape (n_samples,), dtype=object
+        Prediction of `classifier` when predicting for `X_test`.
+    y_pred_proba : ndarray of shape (n_samples, 2), dtype=np.float64
+        Probabilities of `classifier` when predicting for `X_test`.
+    y_pred_decision : ndarray of shape (n_samples,), dtype=np.float64
+        Decision function values of `classifier` when predicting on `X_test`.
+    """
     from sklearn.datasets import load_breast_cancer
     from sklearn.utils import shuffle
 
@@ -789,12 +814,12 @@ def fitted_clf_predictions():
     return classifier, X_test, y_test, y_pred, y_pred_proba, y_pred_decision
 
 
-def test_average_precision_pos_label(fitted_clf_predictions):
+def test_average_precision_pos_label(string_labeled_classification_problem):
     # check that _ThresholdScorer will lead to the right score when passing
     # `pos_label`. Currently, only `average_precision_score` is defined to
     # be such a scorer.
     clf, X_test, y_test, _, y_pred_proba, y_pred_decision = \
-        fitted_clf_predictions
+        string_labeled_classification_problem
 
     pos_label = "cancer"
     # we need to select the positive column or reverse the decision values
@@ -850,11 +875,12 @@ def _predict_proba(self, X):
     assert ap_scorer == pytest.approx(ap_proba)
 
 
-def test_brier_score_loss_pos_label(fitted_clf_predictions):
+def test_brier_score_loss_pos_label(string_labeled_classification_problem):
     # check that _ProbaScorer leads to the right score when `pos_label` is
     # provided. Currently only the `brier_score_loss` is defined to be such
     # a scorer.
-    clf, X_test, y_test, _, y_pred_proba, _ = fitted_clf_predictions
+    clf, X_test, y_test, _, y_pred_proba, _ = \
+        string_labeled_classification_problem
 
     pos_label = "cancer"
     assert clf.classes_[0] == pos_label
@@ -877,10 +903,12 @@ def test_brier_score_loss_pos_label(fitted_clf_predictions):
 @pytest.mark.parametrize(
     "score_func", [f1_score, precision_score, recall_score, jaccard_score]
 )
-def test_non_symmetric_metric_pos_label(score_func, fitted_clf_predictions):
+def test_non_symmetric_metric_pos_label(
+    score_func, string_labeled_classification_problem
+):
     # check that _PredictScorer leads to the right score when `pos_label` is
     # provided. We check for all possible metric supported.
-    clf, X_test, y_test, y_pred, _, _ = fitted_clf_predictions
+    clf, X_test, y_test, y_pred, _, _ = string_labeled_classification_problem
 
     pos_label = "cancer"
     assert clf.classes_[0] == pos_label
@@ -901,8 +929,9 @@ def test_non_symmetric_metric_pos_label(score_func, fitted_clf_predictions):
             average_precision_score, needs_threshold=True, pos_label="xxx"
         ),
         make_scorer(brier_score_loss, needs_proba=True, pos_label="xxx"),
+        make_scorer(f1_score, pos_label="xxx")
     ],
-    ids=["ThresholdScorer", "ProbaScorer"],
+    ids=["ThresholdScorer", "ProbaScorer", "PredictScorer"],
 )
 def test_scorer_select_proba_error(scorer):
     # check that we raise the the proper error when passing an unknown
@@ -912,6 +941,6 @@ def test_scorer_select_proba_error(scorer):
     )
     lr = LogisticRegression(multi_class="multinomial").fit(X, y)
 
-    err_msg = "pos_label should be present in the target"
+    err_msg = "is not a valid label"
     with pytest.raises(ValueError, match=err_msg):
         scorer(lr, X, y)

From e9d787308e97789f97fd9e4ff092545c0b6dbc55 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Tue, 18 Aug 2020 11:26:57 +0200
Subject: [PATCH 40/44] use list

---
 sklearn/metrics/_classification.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
index a7bad09ed98d0..5db6a5798abd0 100644
--- a/sklearn/metrics/_classification.py
+++ b/sklearn/metrics/_classification.py
@@ -1252,7 +1252,7 @@ def _check_set_wise_labels(y_true, y_pred, average, labels, pos_label):
                          str(average_options))
 
     y_type, y_true, y_pred = _check_targets(y_true, y_pred)
-    present_labels = unique_labels(y_true, y_pred)
+    present_labels = unique_labels(y_true, y_pred).tolist()
     if average == 'binary':
         if y_type == 'binary':
             if pos_label not in present_labels:

From 70257689cc84eeb61a5a39e7d8ce21443c7aacd3 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Wed, 19 Aug 2020 14:31:27 +0200
Subject: [PATCH 41/44] fix

---
 sklearn/metrics/tests/test_classification.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index 6677f3119dacd..3b512e10a271d 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -4,7 +4,6 @@
 from itertools import chain
 from itertools import permutations
 import warnings
-import re
 
 import numpy as np
 from scipy import linalg
@@ -1247,7 +1246,7 @@ def test_multilabel_hamming_loss():
 def test_jaccard_score_validation():
     y_true = np.array([0, 1, 0, 1, 1])
     y_pred = np.array([0, 1, 0, 1, 1])
-    err_msg = r"pos_label=2 is not a valid label: array\(\[0, 1\]\)"
+    err_msg = r"pos_label=2 is not a valid label: \[0, 1\]"
     with pytest.raises(ValueError, match=err_msg):
         jaccard_score(y_true, y_pred, average='binary', pos_label=2)
 
@@ -2262,9 +2261,12 @@ def test_brier_score_loss():
     # ensure to raise an error for multiclass y_true
     y_true = np.array([0, 1, 2, 0])
     y_pred = np.array([0.8, 0.6, 0.4, 0.2])
-    error_message = ("Only binary classification is supported. Labels "
-                     "in y_true: {}".format(np.array([0, 1, 2])))
-    with pytest.raises(ValueError, match=re.escape(error_message)):
+    error_message = (
+        "Only binary classification is supported. The type of the target is "
+        "multiclass"
+    )
+
+    with pytest.raises(ValueError, match=error_message):
         brier_score_loss(y_true, y_pred)
 
     # calculate correctly when there's only one class in y_true

From 536753f29e137d1742edca5f2902ae7ca24a4686 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Wed, 19 Aug 2020 15:09:58 +0200
Subject: [PATCH 42/44] fix

---
 sklearn/metrics/tests/test_classification.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index 3b512e10a271d..69fd423df21d8 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -2262,8 +2262,8 @@ def test_brier_score_loss():
     y_true = np.array([0, 1, 2, 0])
     y_pred = np.array([0.8, 0.6, 0.4, 0.2])
     error_message = (
-        "Only binary classification is supported. The type of the target is "
-        "multiclass"
+        "Only binary classification is supported. Labels in y_true: "
+        "\[0 1 2\]"
     )
 
     with pytest.raises(ValueError, match=error_message):

From 6a12a1ff396e7891763bfd9f1f50fd784d24900f Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Wed, 19 Aug 2020 15:11:36 +0200
Subject: [PATCH 43/44] PEP8

---
 sklearn/metrics/tests/test_classification.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index 69fd423df21d8..e093c4107a5b0 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -2262,8 +2262,8 @@ def test_brier_score_loss():
     y_true = np.array([0, 1, 2, 0])
     y_pred = np.array([0.8, 0.6, 0.4, 0.2])
     error_message = (
-        "Only binary classification is supported. Labels in y_true: "
-        "\[0 1 2\]"
+        r"Only binary classification is supported. Labels in y_true: "
+        r"\[0 1 2\]"
     )
 
     with pytest.raises(ValueError, match=error_message):

From 07132326eb96b8c5b61f197671fd8145db3e004a Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Wed, 19 Aug 2020 19:56:12 +0200
Subject: [PATCH 44/44] finally passing?

---
 sklearn/metrics/_scorer.py                    |  23 +-
 sklearn/model_selection/_prediction.py        | 261 ++++++++++--------
 .../model_selection/tests/test_prediction.py  | 121 ++++----
 3 files changed, 225 insertions(+), 180 deletions(-)

diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py
index 796444f612a6d..efd225038bf57 100644
--- a/sklearn/metrics/_scorer.py
+++ b/sklearn/metrics/_scorer.py
@@ -134,6 +134,18 @@ def _check_pos_label(pos_label, classes):
                 f"pos_label={pos_label} is not a valid label: {classes}"
             )
 
+    def _check_decision_function(self, y_pred, classes):
+        """Reverse the decision function depending of pos_label."""
+        pos_label = self._kwargs.get("pos_label", classes[1])
+        self._check_pos_label(pos_label, classes)
+        if pos_label == classes[0]:
+            # The implicit positive class of the binary classifier
+            # does not match `pos_label`: we need to invert the
+            # predictions
+            y_pred *= -1
+
+        return y_pred
+
     def _select_proba(self, y_pred, classes, support_multi_class):
         """Select the column of y_pred when probabilities are provided."""
         if y_pred.shape[1] == 2:
@@ -327,15 +339,10 @@ def _score(self, method_caller, clf, X, y, sample_weight=None):
                 if isinstance(y_pred, list):
                     # For multi-output multi-class estimator
                     y_pred = np.vstack([p for p in y_pred]).T
-                elif y_type == "binary" and "pos_label" in self._kwargs:
-                    self._check_pos_label(
-                        self._kwargs["pos_label"], clf.classes_
+                elif y_type == "binary":
+                    y_pred = self._check_decision_function(
+                        y_pred, clf.classes_
                     )
-                    if self._kwargs["pos_label"] == clf.classes_[0]:
-                        # The implicit positive class of the binary classifier
-                        # does not match `pos_label`: we need to invert the
-                        # predictions
-                        y_pred *= -1
 
             except (NotImplementedError, AttributeError):
                 y_pred = method_caller(clf, "predict_proba", X)
diff --git a/sklearn/model_selection/_prediction.py b/sklearn/model_selection/_prediction.py
index 6082a5bb4c9ec..7b329aee2ebd0 100644
--- a/sklearn/model_selection/_prediction.py
+++ b/sklearn/model_selection/_prediction.py
@@ -12,7 +12,11 @@
 from ..base import ClassifierMixin
 from ..base import MetaEstimatorMixin
 from ..metrics import balanced_accuracy_score
+from ..metrics import check_scoring
+from ..metrics import make_scorer
 from ..metrics import roc_curve
+from ..metrics._plot.base import _check_classifier_response_method
+from ..metrics._scorer import _BaseScorer
 from ..preprocessing import LabelEncoder
 from ..utils import check_array
 from ..utils import _safe_indexing
@@ -21,6 +25,78 @@
 from ..utils.validation import check_is_fitted
 
 
+class _ContinuousScorer(_BaseScorer):
+    def __init__(self, score_func, sign, response_method, kwargs):
+        super().__init__(score_func=score_func, sign=sign, kwargs=kwargs)
+        self.response_method = response_method
+
+    def _score(self, method_caller, estimator, X, y_true, sample_weight=None):
+        """Evaluate predicted target values for X relative to y_true.
+
+        Parameters
+        ----------
+        method_caller : callable
+            Returns predictions given an estimator, method name, and other
+            arguments, potentially caching results.
+
+        estimator : object
+            Trained estimator to use for scoring. Must have a predict_proba
+            method; the output of that is used to compute the score.
+
+        X : {array-like, sparse matrix}
+            Test data that will be fed to estimator.predict.
+
+        y_true : array-like
+            Gold standard target values for X.
+
+        sample_weight : array-like of shape (n_samples,), default=None
+            Sample weights.
+
+        Returns
+        -------
+        score : float
+            Score function applied to prediction of estimator on X.
+        """
+        response_method = _check_classifier_response_method(
+            estimator=estimator, response_method=self.response_method
+        )
+        y_score = response_method(X)
+        if response_method.__name__ == "decision_function":
+            y_score = self._check_decision_function(
+                y_score, estimator.classes_
+            )
+        else:
+            y_score = self._select_proba(
+                y_score, estimator.classes_, support_multi_class=False
+            )
+
+        # `np.unique` returned sorted array, thus no need to sort values
+        potential_thresholds = np.unique(y_score)
+        score_thresholds = []
+        for th in potential_thresholds:
+            y_score_thresholded = estimator.classes_[
+                (y_score >= th).astype(int)
+            ]
+            if sample_weight is not None:
+                score_thresholds.append(
+                    self._sign
+                    * self._score_func(
+                        y_true,
+                        y_score_thresholded,
+                        sample_weight=sample_weight,
+                        **self._kwargs,
+                    )
+                )
+            else:
+                score_thresholds.append(
+                    self._sign
+                    * self._score_func(
+                        y_true, y_score_thresholded, **self._kwargs
+                    )
+                )
+        return np.array(potential_thresholds), np.array(score_thresholds)
+
+
 class CutoffClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
     """Decision threshold calibration for binary classification.
 
@@ -36,30 +112,31 @@ class CutoffClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
         The classifier, fitted or not fitted, from which we want to optimize
         the decision threshold used during `predict`.
 
-    objective_metric : callable or {"tpr", "tnr"} \
-            default=balanced_accuracy_score
+    objective_metric : {"tpr", "tnr"}, str or callable,  \
+            default="balanced_accuracy"
         The objective metric to be optimized. Can be one of:
 
-        * a callable with the signature `metric(y_true, y_score, **kwargs)`;
+        * a string associated to a scoring function (see model evaluation
+          documentation);
+        * a scorer callable object / function with the signature
+          `metric(estimator, X, y)`;
         * `"tpr"`: find the decision threshold for a true positive ratio (TPR)
           of `objective_value`;
         * `"tnr"`: find the decision threshold for a true negative ratio (TNR)
           of `objective_value`.
 
-    objective_metric_params : dict, default=None
-        Some extra parameters to pass to `objective_metric`.
-
     objective_value : float, default=None
         The value associated with the `objective_metric` metric for which we
         want to find the decision threshold when `objective_metric` is equal to
         `"tpr"` or `"tnr"`.
 
-    method : {"auto", "decision_function", "predict_proba"}, default="auto"
+    response_method : {"auto", "decision_function", "predict_proba"}, \
+            default="auto"
         Methods by the classifier `base_estimator` corresponding to the
         decision function for which we want to find a threshold. It can be:
 
         * if `"auto"`, it will try to invoke, for each classifier,
-          `"decision_function` or `"predict_proba"` in that order.
+          `"predict_proba"` or `"decision_function"` in that order.
         * otherwise, one of `"predict_proba"` or `"decision_function"`.
           If the method is not implemented by the classifier, it will raise an
           error.
@@ -68,11 +145,6 @@ class CutoffClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
         The number of decision threshold to use when discretizing the output
         of the classifier `method`.
 
-    pos_label : int or str, default=None
-        The label of the positive class. When `pos_label=None`, if `y_true` is
-        in `{-1, 1}` or `{0, 1}`, `pos_label` is set to 1, otherwise an error
-        will be raised.
-
     cv : int, float, cross-validation generator, iterable or "prefit", \
             default=None
         Determines the cross-validation splitting strategy used in
@@ -169,12 +241,10 @@ class imbalanced.
     def __init__(
         self,
         base_estimator,
-        objective_metric=balanced_accuracy_score,
-        objective_metric_params=None,
+        objective_metric="balanced_accuracy",
         objective_value=None,
-        method="auto",
+        response_method="auto",
         n_threshold=1000,
-        pos_label=None,
         cv=None,
         refit="auto",
         random_state=None,
@@ -182,11 +252,9 @@ def __init__(
     ):
         self.base_estimator = base_estimator
         self.objective_metric = objective_metric
-        self.objective_metric_params = objective_metric_params
         self.objective_value = objective_value
-        self.method = method
+        self.response_method = response_method
         self.n_threshold = n_threshold
-        self.pos_label = pos_label
         self.cv = cv
         self.refit = refit
         self.random_state = random_state
@@ -194,62 +262,20 @@ def __init__(
 
     def _validate_parameters(self):
         """Validate the input parameters."""
-        supported_methods = ("decision_function", "predict_proba")
-        if self.method == "auto":
-            has_methods = [
-                hasattr(self.base_estimator, m) for m in supported_methods
-            ]
-            if not any(has_methods):
-                raise TypeError(
-                    f"'base_estimator' must implement one of the "
-                    f"{', '.join(supported_methods)} methods."
-                )
-            self._method = next(
-                (m for m, i in zip(supported_methods, has_methods) if i), None
-            )
-        else:
-            if self.method not in supported_methods:
-                raise ValueError(
-                    f"'method' should be one of {', '.join(supported_methods)}"
-                    f". Got {self.method} instead."
-                )
-            elif not hasattr(self.base_estimator, self.method):
-                raise TypeError(
-                    f"'base_estimator' does not implement {self.method}."
-                )
-            self._method = self.method
-        if (self.objective_metric not in ("tpr", "tnr") and
-                self.objective_value is not None):
+        if (
+            self.objective_metric not in ("tpr", "tnr")
+            and self.objective_value is not None
+        ):
             raise ValueError(
                 f"When 'objective_metric' is a scoring function, "
                 f"'objective_value' should be None. Got "
-                f"{self.objective_metric} instead."
+                f"{self.objective_value} instead."
             )
 
-        # ensure binary classification if `pos_label` is not specified
-        # `classes.dtype.kind` in ('O', 'U', 'S') is required to avoid
-        # triggering a FutureWarning by calling np.array_equal(a, b)
-        # when elements in the two arrays are not comparable.
-        if (self.pos_label is None and (
-                self.classes_.dtype.kind in ('O', 'U', 'S') or
-                not (np.array_equal(self.classes_, [0, 1]) or
-                     np.array_equal(self.classes_, [-1, 1]) or
-                     np.array_equal(self.classes_, [0]) or
-                     np.array_equal(self.classes_, [-1]) or
-                     np.array_equal(self.classes_, [1])))):
-            classes_repr = ", ".join(repr(c) for c in self.classes_)
-            raise ValueError(
-                f"'y_true' takes value in {classes_repr} and 'pos_label' is "
-                f"not specified: either make 'y_true' take value in "
-                "{{0, 1}} or {{-1, 1}} or pass pos_label explicitly."
-            )
-        elif self.pos_label is None:
-            self._pos_label = 1
-        else:
-            self._pos_label = self.pos_label
-
-        if (not isinstance(self.n_threshold, numbers.Integral) or
-                self.n_threshold < 0):
+        if (
+            not isinstance(self.n_threshold, numbers.Integral)
+            or self.n_threshold < 0
+        ):
             raise ValueError(
                 f"'n_threshold' should be a strictly positive integer. "
                 f"Got {self.n_threshold} instead."
@@ -284,8 +310,9 @@ def _check_cv_refit(self, cv, refit, y, random_state):
         return cv, refit
 
     @staticmethod
-    def _fit_and_score(estimator, X, y, train_idx, val_idx, predict_method,
-                       score_method, score_params, pos_label_encoded):
+    def _fit_and_score(
+        estimator, X, y, train_idx, val_idx, scorer, score_method
+    ):
         if train_idx is not None:
             X_train = _safe_indexing(X, train_idx)
             X_val = _safe_indexing(X, val_idx)
@@ -296,35 +323,23 @@ def _fit_and_score(estimator, X, y, train_idx, val_idx, predict_method,
         else:
             X_val, y_val = X, y
 
-        y_score = getattr(estimator, predict_method)(X_val)
-        if y_score.ndim == 2:
-            y_score = y_score[:, pos_label_encoded]
-
         if score_method in ("tnr", "tpr"):
-            fpr, tpr, potential_thresholds = roc_curve(
-                y_val, y_score, pos_label=pos_label_encoded
-            )
+            fpr, tpr, potential_thresholds = scorer(estimator, X_val, y_val)
             score_thresholds = tpr
             if score_method == "tnr":
                 score_thresholds = (1 - fpr)[::-1]
                 potential_thresholds = potential_thresholds[::-1]
         else:
-            params = {} if score_params is None else score_params
-            if "pos_label" in signature(score_method).parameters:
-                params["pos_label"] = pos_label_encoded
-            # `np.unique` is already sorting the value, no need to call
-            #  `potential_thresholds.sort()`
-            potential_thresholds = np.unique(y_score)
-            score_thresholds = np.array([
-                score_method(y_val, (y_score >= th).astype(int), **params)
-                for th in potential_thresholds
-            ])
+            potential_thresholds, score_thresholds = scorer(
+                estimator, X_val, y_val
+            )
 
         return potential_thresholds, score_thresholds
 
     @staticmethod
-    def _find_decision_threshold(thresholds, scores, n_thresholds,
-                                 objective_score):
+    def _find_decision_threshold(
+        thresholds, scores, n_thresholds, objective_score
+    ):
         min_threshold = np.min([th.min() for th in thresholds])
         max_threshold = np.max([th.max() for th in thresholds])
         ascending = thresholds[0].argmin() == 0
@@ -390,15 +405,6 @@ def fit(self, X, y):
         # with known classes
         self._validate_parameters()
 
-        # warm start a label encoder using the fitted estimator
-        label_encoder = LabelEncoder()
-        label_encoder.classes_ = self.classes_
-
-        y_encoded = label_encoder.transform(y)
-        self._pos_label_encoded = np.flatnonzero(
-            self.classes_ == self._pos_label
-        ).item(0)
-
         if cv == "prefit" or not refit:
             model = self._estimator
             splits = ([None, range(len(X))],)
@@ -406,15 +412,35 @@ def fit(self, X, y):
             model = clone(self.base_estimator)
             splits = cv.split(X, y)
 
-        thresholds, scores = zip(*Parallel(n_jobs=self.n_jobs)(
-            delayed(self._fit_and_score)(
-                model, X, y_encoded, train_idx, val_idx,
-                self._method,
-                self.objective_metric, self.objective_metric_params,
-                self._pos_label_encoded
+        if self.objective_metric in ("tpr", "tnr"):
+            scoring = make_scorer(roc_curve, needs_threshold=True)
+        else:
+            scoring = check_scoring(
+                estimator=model, scoring=self.objective_metric
+            )
+            if isinstance(scoring, _BaseScorer):
+                scoring = _ContinuousScorer(
+                    score_func=scoring._score_func,
+                    sign=scoring._sign,
+                    response_method=self.response_method,
+                    kwargs=scoring._kwargs,
+                )
+        self._scorer = check_scoring(estimator=model, scoring=scoring)
+
+        thresholds, scores = zip(
+            *Parallel(n_jobs=self.n_jobs)(
+                delayed(self._fit_and_score)(
+                    model,
+                    X,
+                    y,
+                    train_idx,
+                    val_idx,
+                    self._scorer,
+                    self.objective_metric,
+                )
+                for train_idx, val_idx in splits
             )
-            for train_idx, val_idx in splits
-        ))
+        )
 
         if self.objective_metric in ("tnr", "tpr"):
             objective_value = self.objective_value
@@ -442,10 +468,19 @@ def predict(self, X):
         """
         check_is_fitted(self)
 
-        decision_function = getattr(self._estimator, self._method)
-        y_score = decision_function(X)
-        if y_score.ndim == 2:
-            y_score = y_score[:, self._pos_label_encoded]
+        response_method = _check_classifier_response_method(
+            estimator=self._estimator, response_method=self.response_method
+        )
+
+        y_score = response_method(X)
+        if response_method.__name__ == "decision_function":
+            y_score = self._scorer._check_decision_function(
+                y_score, self.classes_
+            )
+        else:
+            y_score = self._scorer._select_proba(
+                y_score, self.classes_, support_multi_class=False
+            )
         y_class_indices = (y_score >= self.decision_threshold_).astype(int)
 
         return self.classes_[y_class_indices]
diff --git a/sklearn/model_selection/tests/test_prediction.py b/sklearn/model_selection/tests/test_prediction.py
index 773028572b143..cf0b975d10f1f 100644
--- a/sklearn/model_selection/tests/test_prediction.py
+++ b/sklearn/model_selection/tests/test_prediction.py
@@ -9,6 +9,7 @@
 from sklearn.metrics import balanced_accuracy_score
 from sklearn.metrics import f1_score
 from sklearn.metrics import fbeta_score
+from sklearn.metrics import make_scorer
 from sklearn.model_selection import StratifiedShuffleSplit
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler
@@ -26,49 +27,37 @@ def fit(self, X, y):
         return self
 
 
-@pytest.mark.parametrize(
-    "Estimator, params, err_type, err_msg",
-    [
-        (LogisticRegression, {"method": "xxx"}, ValueError,
-         "'method' should be one of"),
-        (MockNoPredictorClassifier, {"method": "auto"}, TypeError,
-         "'base_estimator' must implement one of the"),
-        (SVC, {"method": "predict_proba"}, TypeError,
-         "'base_estimator' does not implement predict_proba"),
-        (LogisticRegression,
-         {"objective_metric": "accuracy", "objective_value": 0.5}, ValueError,
-         "When 'objective_metric' is a scoring function"),
-        (LogisticRegression, {"cv": 1.5}, ValueError, "Got 1.5"),
-        (LogisticRegression, {"refit": False}, ValueError,
-         "When cv has several folds, refit cannot be False"),
-        (LogisticRegression, {"cv": "prefit", "refit": True}, ValueError,
-         "When cv='prefit', refit cannot be True."),
-        (LogisticRegression, {"n_threshold": -10}, ValueError,
-         "'n_threshold' should be a strictly positive integer."),
-        (LogisticRegression, {"n_threshold": 10.5}, ValueError,
-         "'n_threshold' should be a strictly positive integer."),
-    ]
-)
-def test_cutoffclassifier_valid_params_error(Estimator, params, err_type,
-                                             err_msg):
-    # check that the proper errors are raised with wrong parameters
-    X, y = make_classification(n_samples=200, n_features=6, random_state=42,
-                               n_classes=2)
-    with pytest.raises(err_type, match=err_msg):
-        clf = CutoffClassifier(base_estimator=Estimator(), **params)
-        clf.fit(X, y)
-
-
-def test_cutoffclassifier_error_pos_label():
-    # check that we raise when the classes are not in {0, 1} or {-1, 1}
-    X, y = load_breast_cancer(return_X_y=True)
-    y += 1
-    err_msg = "'y_true' takes value in 1, 2 and 'pos_label' is not specified"
-    with pytest.raises(ValueError, match=err_msg):
-        CutoffClassifier(
-            base_estimator=make_pipeline(StandardScaler(),
-                                         LogisticRegression())
-        ).fit(X, y)
+# @pytest.mark.parametrize(
+#     "Estimator, params, err_type, err_msg",
+#     [
+#         (LogisticRegression, {"method": "xxx"}, ValueError,
+#          "'method' should be one of"),
+#         (MockNoPredictorClassifier, {"method": "auto"}, TypeError,
+#          "'base_estimator' must implement one of the"),
+#         (SVC, {"method": "predict_proba"}, TypeError,
+#          "'base_estimator' does not implement predict_proba"),
+#         (LogisticRegression,
+#          {"objective_metric": "accuracy", "objective_value": 0.5}, ValueError,
+#          "When 'objective_metric' is a scoring function"),
+#         (LogisticRegression, {"cv": 1.5}, ValueError, "Got 1.5"),
+#         (LogisticRegression, {"refit": False}, ValueError,
+#          "When cv has several folds, refit cannot be False"),
+#         (LogisticRegression, {"cv": "prefit", "refit": True}, ValueError,
+#          "When cv='prefit', refit cannot be True."),
+#         (LogisticRegression, {"n_threshold": -10}, ValueError,
+#          "'n_threshold' should be a strictly positive integer."),
+#         (LogisticRegression, {"n_threshold": 10.5}, ValueError,
+#          "'n_threshold' should be a strictly positive integer."),
+#     ]
+# )
+# def test_cutoffclassifier_valid_params_error(Estimator, params, err_type,
+#                                              err_msg):
+#     # check that the proper errors are raised with wrong parameters
+#     X, y = make_classification(n_samples=200, n_features=6, random_state=42,
+#                                n_classes=2)
+#     with pytest.raises(err_type, match=err_msg):
+#         clf = CutoffClassifier(base_estimator=Estimator(), **params)
+#         clf.fit(X, y)
 
 
 def test_cutoffclassifier_not_binary():
@@ -76,11 +65,25 @@ def test_cutoffclassifier_not_binary():
     X, y = load_iris(return_X_y=True)
     with pytest.raises(ValueError, match="Expected target of binary type."):
         CutoffClassifier(
-            base_estimator=make_pipeline(StandardScaler(),
-                                         LogisticRegression())
+            base_estimator=make_pipeline(
+                StandardScaler(), LogisticRegression()
+            )
         ).fit(X, y)
 
 
+def test_cutoffclassifier_xxx():
+    # check that an objective value of 0 give opposite predictions in with
+    # tpr and tnr
+    X, y = load_breast_cancer(return_X_y=True)
+    # replaces y by some strings
+    classes = np.array(["healthy", "cancer"], dtype=object)
+    y = classes[y]
+    clf = CutoffClassifier(
+        base_estimator=make_pipeline(StandardScaler(), LogisticRegression()),
+    )
+    y_pred_tpr = clf.fit(X, y).predict(X)
+
+
 def test_cutoffclassifier_limit_tpr_tnr():
     # check that an objective value of 0 give opposite predictions in with
     # tpr and tnr
@@ -97,10 +100,9 @@ def test_cutoffclassifier_limit_tpr_tnr():
 
 
 @pytest.mark.parametrize(
-    "method",
-    ["auto", "decision_function", "predict_proba"]
+    "response_method", ["auto", "decision_function", "predict_proba"]
 )
-def test_cutoffclassifier_with_objective_value(method):
+def test_cutoffclassifier_with_objective_value(response_method):
     # check that we can optimize a given metric as a callable
     X, y = load_breast_cancer(return_X_y=True)
     # remove feature to degrade performances
@@ -118,8 +120,8 @@ def test_cutoffclassifier_with_objective_value(method):
     lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y)
     model = CutoffClassifier(
         base_estimator=lr,
-        objective_metric=balanced_accuracy_score,
-        method=method,
+        objective_metric="balanced_accuracy",
+        response_method=response_method,
     )
     score_optimized = balanced_accuracy_score(y, model.fit(X, y).predict(X))
     score_baseline = balanced_accuracy_score(y, lr.predict(X))
@@ -133,11 +135,10 @@ def test_cutoffclassifier_metric_with_parameter():
     X, y = load_breast_cancer(return_X_y=True)
     lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y)
     model_fbeta = CutoffClassifier(
-        base_estimator=lr, objective_metric=fbeta_score,
-        objective_metric_params={"beta": 1}
+        base_estimator=lr, objective_metric=make_scorer(fbeta_score, beta=1),
     ).fit(X, y)
     model_f1 = CutoffClassifier(
-        base_estimator=lr, objective_metric=f1_score,
+        base_estimator=lr, objective_metric=make_scorer(f1_score),
     ).fit(X, y)
 
     assert (model_fbeta.decision_threshold_ ==
@@ -186,12 +187,15 @@ def test_cutoffclassifier_pretrained_estimator():
 
 
 @pytest.mark.parametrize(
-    "method",
-    ["auto", "decision_function", "predict_proba"]
+    "response_method", ["auto", "decision_function", "predict_proba"]
+)
+@pytest.mark.parametrize(
+    "metric",
+    [make_scorer(balanced_accuracy_score),
+     make_scorer(f1_score, pos_label="cancer")]
 )
-@pytest.mark.parametrize("metric", [balanced_accuracy_score, f1_score])
 @pytest.mark.parametrize("dtype", [None, object])
-def test_cutoffclassifier_with_string_targets(method, dtype, metric):
+def test_cutoffclassifier_with_string_targets(response_method, dtype, metric):
     # check that targets represented by str are properly managed
     # check with several metrics to be sure that `pos_label` is properly
     # dispatched
@@ -204,8 +208,7 @@ def test_cutoffclassifier_with_string_targets(method, dtype, metric):
     model = CutoffClassifier(
         base_estimator=make_pipeline(StandardScaler(), LogisticRegression()),
         objective_metric=metric,
-        pos_label="cancer",
-        method=method,
+        response_method=response_method,
     ).fit(X, y)
     assert_array_equal(np.sort(model.classes_), np.sort(classes))
     y_pred = model.predict(X[[0], :])