From 9d090da0baf4bc5776896f9f111d1daebc4ee849 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 3 Oct 2019 11:13:08 -0400 Subject: [PATCH 01/31] WIP --- sklearn/model_selection/_validation.py | 13 ++++++- sklearn/model_selection/tests/test_search.py | 36 +++++++++++++++++++ .../model_selection/tests/test_validation.py | 15 ++++++++ 3 files changed, 63 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index f3464205a993d..4a51f26d3a1dc 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -394,7 +394,7 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score=False, return_parameters=False, return_n_test_samples=False, return_times=False, return_estimator=False, - error_score=np.nan): + error_score=np.nan, check_scorer_key=None): """Fit estimator and compute scores for a given dataset split. Parameters @@ -455,6 +455,10 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, return_estimator : boolean, optional, default: False Whether to return the fitted estimator. + check_scorer_key : str or None, default=None + If a string and scorer returns a dictionary, the keys will be check + to contain `check_scorer_key`. + Returns ------- train_scores : dict of scorer name -> float, optional @@ -538,6 +542,13 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, score_time = time.time() - start_time - fit_time if return_train_score: train_scores = _score(estimator, X_train, y_train, scorer) + + # check scorer keys + if (check_scorer_key is not None and isinstance(test_scores, dict) + and check_scorer_key not in test_scores): + raise ValueError("dict returned by scorer must contain {}".format( + check_scorer_key)) + if verbose > 2: if isinstance(test_scores, dict): for scorer_name in sorted(test_scores): diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index f3301606e997e..4b74bed3fee8b 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1779,3 +1779,39 @@ def get_n_splits(self, *args, **kw): 'inconsistent results. 
Expected \\d+ ' 'splits, got \\d+'): ridge.fit(X[:train_size], y[:train_size]) + + +def test_callable_multimetric_same_as_list_of_strings(): + def custom_scorer(est, X, y): + y_pred = est.predict(X) + return {'recall': recall_score(y, y_pred), + 'accuracy': accuracy_score(y, y_pred)} + + X, y = make_classification(n_samples=40, n_features=4, + random_state=42) + est = LinearSVC(random_state=42) + search_callable = GridSearchCV(est, {'C': [0.1, 1]}, + scoring=custom_scorer, refit='recall') + search_str = GridSearchCV(est, {'C': [0.1, 1]}, + scoring=['recall', 'accuracy'], refit='recall') + + search_callable.fit(X, y) + search_str.fit(X, y) + + assert search_callable.best_score_ == pytest.approx(search_str.best_score_) + assert search_callable.best_index_ == search_str.best_index_ + + +def test_callable_multimetric_error_on_invalid_key(): + def bad_scorer(est, X, y): + return {'bad_name': 1} + + X, y = make_classification(n_samples=40, n_features=4, + random_state=42) + clf = GridSearchCV(LinearSVC(random_state=42), {'C': [0.1, 1]}, + scoring=bad_scorer, refit='good_name') + + msg = ('For multi-metric scoring, the parameter refit must be set to a ' + 'scorer key or a callable to refit') + with pytest.raises(ValueError, match=msg): + clf.fit(X, y) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 4d681f24403ee..d1be6ae7b45e0 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -1717,3 +1717,18 @@ def two_params_scorer(estimator, X_test): fit_and_score_args = [None, None, None, two_params_scorer] assert_raise_message(ValueError, error_message, _score, *fit_and_score_args) + + +def test_errors_when_key_not_in_scorer_dict(): + def scorer(est, X, y): + return {"my_key": 1} + + X, y = make_classification(n_samples=30, random_state=0) + train, test = next(ShuffleSplit().split(X)) + clf = SVC(kernel="linear", random_state=0) + + fit_and_score_args = [clf, X, y, scorer, train, test, 10, None, None] + + msg = "dict returned by scorer must contain not_my_key" + with pytest.raises(ValueError, match=msg): + _fit_and_score(*fit_and_score_args, check_scorer_key='not_my_key') From 315c335c9c68aa461dc16bbae59562a77e5a1634 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 3 Oct 2019 13:33:20 -0400 Subject: [PATCH 02/31] ENH Increase compability --- sklearn/metrics/scorer.py | 116 +++++++--------- sklearn/metrics/tests/test_score_objects.py | 31 +---- sklearn/model_selection/_search.py | 48 +++---- sklearn/model_selection/_validation.py | 124 +++++++++--------- sklearn/model_selection/tests/test_search.py | 36 ----- .../model_selection/tests/test_validation.py | 15 --- 6 files changed, 146 insertions(+), 224 deletions(-) diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index 25b826ff91f75..6c80894a0ee13 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -431,7 +431,7 @@ def check_scoring(estimator, scoring=None, allow_none=False): " None. %r was passed" % scoring) -def _check_multimetric_scoring(estimator, scoring=None): +def _check_multimetric_scoring(estimator, scoring): """Check the scoring parameter in cases when multiple metrics are allowed Parameters @@ -439,10 +439,7 @@ def _check_multimetric_scoring(estimator, scoring=None): estimator : sklearn estimator instance The estimator for which the scoring will be applied. 
- scoring : string, callable, list/tuple, dict or None, default: None - A single string (see :ref:`scoring_parameter`) or a callable - (see :ref:`scoring`) to evaluate the predictions on the test set. - + scoring : list/tuple or dict For evaluating multiple metrics, either give a list of (unique) strings or a dict with names as keys and callables as values. @@ -452,7 +449,6 @@ def _check_multimetric_scoring(estimator, scoring=None): See :ref:`multimetric_grid_search` for an example. - If None the estimator's score method is used. The return value in that case will be ``{'score': }``. If the estimator's score method is not available, a ``TypeError`` is raised. @@ -461,69 +457,59 @@ def _check_multimetric_scoring(estimator, scoring=None): ------- scorers_dict : dict A dict mapping each scorer name to its validated scorer. - - is_multimetric : bool - True if scorer is a list/tuple or dict of callables - False if scorer is None/str/callable """ - if callable(scoring) or scoring is None or isinstance(scoring, - str): - scorers = {"score": check_scoring(estimator, scoring=scoring)} - return scorers, False - else: - err_msg_generic = ("scoring should either be a single string or " - "callable for single metric evaluation or a " - "list/tuple of strings or a dict of scorer name " - "mapped to the callable for multiple metric " - "evaluation. Got %s of type %s" - % (repr(scoring), type(scoring))) - - if isinstance(scoring, (list, tuple, set)): - err_msg = ("The list/tuple elements must be unique " - "strings of predefined scorers. ") - invalid = False - try: - keys = set(scoring) - except TypeError: - invalid = True - if invalid: - raise ValueError(err_msg) - - if len(keys) != len(scoring): - raise ValueError(err_msg + "Duplicate elements were found in" - " the given list. %r" % repr(scoring)) - elif len(keys) > 0: - if not all(isinstance(k, str) for k in keys): - if any(callable(k) for k in keys): - raise ValueError(err_msg + - "One or more of the elements were " - "callables. Use a dict of score name " - "mapped to the scorer callable. " - "Got %r" % repr(scoring)) - else: - raise ValueError(err_msg + - "Non-string types were found in " - "the given list. Got %r" - % repr(scoring)) - scorers = {scorer: check_scoring(estimator, scoring=scorer) - for scorer in scoring} - else: - raise ValueError(err_msg + - "Empty list was given. %r" % repr(scoring)) - - elif isinstance(scoring, dict): + err_msg_generic = ("scoring should either be a single string or " + "callable for single metric evaluation or a " + "list/tuple of strings or a dict of scorer name " + "mapped to the callable for multiple metric " + "evaluation. Got %s of type %s" + % (repr(scoring), type(scoring))) + + if isinstance(scoring, (list, tuple, set)): + err_msg = ("The list/tuple elements must be unique " + "strings of predefined scorers. ") + invalid = False + try: keys = set(scoring) + except TypeError: + invalid = True + if invalid: + raise ValueError(err_msg) + + if len(keys) != len(scoring): + raise ValueError(err_msg + "Duplicate elements were found in" + " the given list. %r" % repr(scoring)) + elif len(keys) > 0: if not all(isinstance(k, str) for k in keys): - raise ValueError("Non-string types were found in the keys of " - "the given dict. scoring=%r" % repr(scoring)) - if len(keys) == 0: - raise ValueError("An empty dict was passed. 
%r" - % repr(scoring)) - scorers = {key: check_scoring(estimator, scoring=scorer) - for key, scorer in scoring.items()} + if any(callable(k) for k in keys): + raise ValueError(err_msg + + "One or more of the elements were " + "callables. Use a dict of score name " + "mapped to the scorer callable. " + "Got %r" % repr(scoring)) + else: + raise ValueError(err_msg + + "Non-string types were found in " + "the given list. Got %r" + % repr(scoring)) + scorers = {scorer: check_scoring(estimator, scoring=scorer) + for scorer in scoring} else: - raise ValueError(err_msg_generic) - return scorers, True + raise ValueError(err_msg + + "Empty list was given. %r" % repr(scoring)) + + elif isinstance(scoring, dict): + keys = set(scoring) + if not all(isinstance(k, str) for k in keys): + raise ValueError("Non-string types were found in the keys of " + "the given dict. scoring=%r" % repr(scoring)) + if len(keys) == 0: + raise ValueError("An empty dict was passed. %r" % repr(scoring)) + scorers = {key: check_scoring(estimator, scoring=scorer) + for key, scorer in scoring.items()} + else: + raise ValueError(err_msg_generic) + return scorers def make_scorer(score_func, greater_is_better=True, needs_proba=False, diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index cfabed6d2c4ac..8287e3d5c6445 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -202,30 +202,10 @@ def check_scoring_validator_for_single_metric_usecases(scoring_validator): assert scorer is None -def check_multimetric_scoring_single_metric_wrapper(*args, **kwargs): - # This wraps the _check_multimetric_scoring to take in - # single metric scoring parameter so we can run the tests - # that we will run for check_scoring, for check_multimetric_scoring - # too for single-metric usecases - - scorers, is_multi = _check_multimetric_scoring(*args, **kwargs) - # For all single metric use cases, it should register as not multimetric - assert not is_multi - if args[0] is not None: - assert scorers is not None - names, scorers = zip(*scorers.items()) - assert len(scorers) == 1 - assert names[0] == 'score' - scorers = scorers[0] - return scorers - - def test_check_scoring_and_check_multimetric_scoring(): check_scoring_validator_for_single_metric_usecases(check_scoring) # To make sure the check_scoring is correctly applied to the constituent # scorers - check_scoring_validator_for_single_metric_usecases( - check_multimetric_scoring_single_metric_wrapper) # For multiple metric use cases # Make sure it works for the valid cases @@ -237,8 +217,7 @@ def test_check_scoring_and_check_multimetric_scoring(): estimator = LinearSVC(random_state=0) estimator.fit([[1], [2], [3]], [1, 1, 0]) - scorers, is_multi = _check_multimetric_scoring(estimator, scoring) - assert is_multi + scorers = _check_multimetric_scoring(estimator, scoring) assert isinstance(scorers, dict) assert sorted(scorers.keys()) == sorted(list(scoring)) assert all([isinstance(scorer, _PredictScorer) @@ -589,7 +568,7 @@ def test_multimetric_scorer_calls_method_once(scorers, expected_predict_count, mock_est.predict_proba = predict_proba_func mock_est.decision_function = decision_function_func - scorer_dict, _ = _check_multimetric_scoring(LogisticRegression(), scorers) + scorer_dict = _check_multimetric_scoring(LogisticRegression(), scorers) multi_scorer = _MultimetricScorer(**scorer_dict) results = multi_scorer(mock_est, X, y) @@ -616,7 +595,7 @@ def predict_proba(self, X): clf.fit(X, y) scorers = 
['roc_auc', 'neg_log_loss'] - scorer_dict, _ = _check_multimetric_scoring(clf, scorers) + scorer_dict = _check_multimetric_scoring(clf, scorers) scorer = _MultimetricScorer(**scorer_dict) scorer(clf, X, y) @@ -639,7 +618,7 @@ def predict(self, X): clf.fit(X, y) scorers = {'neg_mse': 'neg_mean_squared_error', 'r2': 'roc_auc'} - scorer_dict, _ = _check_multimetric_scoring(clf, scorers) + scorer_dict = _check_multimetric_scoring(clf, scorers) scorer = _MultimetricScorer(**scorer_dict) scorer(clf, X, y) @@ -657,7 +636,7 @@ def test_multimetric_scorer_sanity_check(): clf = DecisionTreeClassifier() clf.fit(X, y) - scorer_dict, _ = _check_multimetric_scoring(clf, scorers) + scorer_dict = _check_multimetric_scoring(clf, scorers) multi_scorer = _MultimetricScorer(**scorer_dict) result = multi_scorer(clf, X, y) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 80e78e6b7f913..e29919ea2b37e 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -27,7 +27,7 @@ from ..base import MetaEstimatorMixin from ._split import check_cv from ._validation import _fit_and_score -from ._validation import _aggregate_score_dicts +from ._validation import _aggregate_list_of_dicts from ..exceptions import NotFittedError from joblib import Parallel, delayed from ..utils import check_random_state @@ -627,27 +627,29 @@ def fit(self, X, y=None, groups=None, **fit_params): estimator = self.estimator cv = check_cv(self.cv, y, classifier=is_classifier(estimator)) - scorers, self.multimetric_ = _check_multimetric_scoring( - self.estimator, scoring=self.scoring) - - if self.multimetric_: - if self.refit is not False and ( - not isinstance(self.refit, str) or - # This will work for both dict / list (tuple) - self.refit not in scorers) and not callable(self.refit): - raise ValueError("For multi-metric scoring, the parameter " - "refit must be set to a scorer key or a " - "callable to refit an estimator with the " - "best parameter setting on the whole " - "data and make the best_* attributes " - "available for that metric. If this is " - "not needed, refit should be set to " - "False explicitly. %r was passed." - % self.refit) - else: - refit_metric = self.refit - else: + if (callable(self.scoring) or self.scoring is None + or isinstance(self.scoring, str)): + self.multimetric_ = False + scorers = {"score": check_scoring(self.estimator, self.scoring)} refit_metric = 'score' + else: + self.multimetric_ = True + scorers = _check_multimetric_scoring(self.estimator, self.scoring) + refit_metric = self.refit + + if self.multimetric_ and self.refit is not False and ( + not isinstance(self.refit, str) or + # This will work for both dict / list (tuple) + self.refit not in scorers) and not callable(self.refit): + raise ValueError("For multi-metric scoring, the parameter " + "refit must be set to a scorer key or a " + "callable to refit an estimator with the " + "best parameter setting on the whole " + "data and make the best_* attributes " + "available for that metric. If this is " + "not needed, refit should be set to " + "False explicitly. %r was passed." 
+ % self.refit) X, y, groups = indexable(X, y, groups) n_splits = cv.get_n_splits(X, y, groups) @@ -761,9 +763,9 @@ def _format_results(self, candidate_params, scorers, n_splits, out): # test_score_dicts and train_score dicts are lists of dictionaries and # we make them into dict of lists - test_scores = _aggregate_score_dicts(test_score_dicts) + test_scores = _aggregate_list_of_dicts(test_score_dicts) if self.return_train_score: - train_scores = _aggregate_score_dicts(train_score_dicts) + train_scores = _aggregate_list_of_dicts(train_score_dicts) results = {} diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 4a51f26d3a1dc..99f9f580e155b 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -220,13 +220,17 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, X, y, groups = indexable(X, y, groups) cv = check_cv(cv, y, classifier=is_classifier(estimator)) - scorers, _ = _check_multimetric_scoring(estimator, scoring=scoring) + + if callable(scoring) or scoring is None or isinstance(scoring, str): + scorers = {"score": check_scoring(estimator, scoring)} + else: + scorers = _check_multimetric_scoring(estimator, scoring) # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) - scores = parallel( + results = parallel( delayed(_fit_and_score)( clone(estimator), X, y, scorers, train, test, verbose, None, fit_params, return_train_score=return_train_score, @@ -234,18 +238,16 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, error_score=error_score) for train, test in cv.split(X, y, groups)) - zipped_scores = list(zip(*scores)) + results = _aggregate_list_of_dicts(results, constructor=list) if return_train_score: - train_scores = zipped_scores.pop(0) - train_scores = _aggregate_score_dicts(train_scores) + train_scores = _aggregate_list_of_dicts(results["train_scores"]) if return_estimator: - fitted_estimators = zipped_scores.pop() - test_scores, fit_times, score_times = zipped_scores - test_scores = _aggregate_score_dicts(test_scores) + fitted_estimators = results["estimator"] + test_scores = _aggregate_list_of_dicts(results["test_scores"]) ret = {} - ret['fit_time'] = np.array(fit_times) - ret['score_time'] = np.array(score_times) + ret['fit_time'] = np.array(results["fit_time"]) + ret['score_time'] = np.array(results["score_time"]) if return_estimator: ret['estimator'] = fitted_estimators @@ -394,7 +396,7 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score=False, return_parameters=False, return_n_test_samples=False, return_times=False, return_estimator=False, - error_score=np.nan, check_scorer_key=None): + error_score=np.nan): """Fit estimator and compute scores for a given dataset split. Parameters @@ -455,33 +457,30 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, return_estimator : boolean, optional, default: False Whether to return the fitted estimator. - check_scorer_key : str or None, default=None - If a string and scorer returns a dictionary, the keys will be check - to contain `check_scorer_key`. - Returns ------- - train_scores : dict of scorer name -> float, optional - Score on training set (for all the scorers), - returned only if `return_train_score` is `True`. 
+ result: dict with the following attributes + train_scores : dict of scorer name -> float, optional + Score on training set (for all the scorers), + returned only if `return_train_score` is `True`. - test_scores : dict of scorer name -> float, optional - Score on testing set (for all the scorers). + test_scores : dict of scorer name -> float, optional + Score on testing set (for all the scorers). - n_test_samples : int - Number of test samples. + n_test_samples : int + Number of test samples. - fit_time : float - Time spent for fitting in seconds. + fit_time : float + Time spent for fitting in seconds. - score_time : float - Time spent for scoring in seconds. + score_time : float + Time spent for scoring in seconds. - parameters : dict or None, optional - The parameters that have been evaluated. + parameters : dict or None, optional + The parameters that have been evaluated. - estimator : estimator object - The fitted estimator + estimator : estimator object + The fitted estimator """ if verbose > 1: if parameters is None: @@ -496,7 +495,6 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, fit_params = {k: _index_param_value(X, v, train) for k, v in fit_params.items()} - train_scores = {} if parameters is not None: estimator.set_params(**parameters) @@ -543,12 +541,6 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, if return_train_score: train_scores = _score(estimator, X_train, y_train, scorer) - # check scorer keys - if (check_scorer_key is not None and isinstance(test_scores, dict) - and check_scorer_key not in test_scores): - raise ValueError("dict returned by scorer must contain {}".format( - check_scorer_key)) - if verbose > 2: if isinstance(test_scores, dict): for scorer_name in sorted(test_scores): @@ -567,17 +559,19 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, total_time = score_time + fit_time print(_message_with_time('CV', msg, total_time)) - ret = [train_scores, test_scores] if return_train_score else [test_scores] - + result = {"test_scores": test_scores} + if return_train_score: + result["train_scores"] = train_scores if return_n_test_samples: - ret.append(_num_samples(X_test)) + result["n_test_samples"] = _num_samples(X_test) if return_times: - ret.extend([fit_time, score_time]) + result["fit_time"] = fit_time + result["score_time"] = score_time if return_parameters: - ret.append(parameters) + result["parameters"] = parameters if return_estimator: - ret.append(estimator) - return ret + result["estimator"] = estimator + return result def _score(estimator, X_test, y_test, scorer): @@ -1258,23 +1252,32 @@ def learning_curve(estimator, X, y, groups=None, out = parallel(delayed(_incremental_fit_estimator)( clone(estimator), X, y, classes, train, test, train_sizes_abs, scorer, verbose, return_times) for train, test in cv_iter) + out = np.asarray(out).transpose((2, 1, 0)) else: train_test_proportions = [] for train, test in cv_iter: for n_train_samples in train_sizes_abs: train_test_proportions.append((train[:n_train_samples], test)) - out = parallel(delayed(_fit_and_score)( + results = parallel(delayed(_fit_and_score)( clone(estimator), X, y, scorer, train, test, verbose, parameters=None, fit_params=None, return_train_score=True, error_score=error_score, return_times=return_times) for train, test in train_test_proportions) - out = np.array(out) - n_cv_folds = out.shape[0] // n_unique_ticks - dim = 4 if return_times else 2 - out = out.reshape(n_cv_folds, n_unique_ticks, dim) + results = _aggregate_list_of_dicts(results, 
constructor=list) + train_scores = (np.array(results["train_scores"]) + .reshape(-1, n_unique_ticks).T) - out = np.asarray(out).transpose((2, 1, 0)) + test_scores = (np.array(results["test_scores"]) + .reshape(-1, n_unique_ticks).T) + out = [train_scores, test_scores] + + if return_times: + fit_times = (np.array(results["fit_time"]) + .reshape(-1, n_unique_ticks).T) + score_times = (np.array(results["score_time"]) + .reshape(-1, n_unique_ticks).T) + out.extend([fit_times, score_times]) ret = train_sizes_abs, out[0], out[1] @@ -1479,21 +1482,24 @@ def validation_curve(estimator, X, y, param_name, param_range, groups=None, parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, verbose=verbose) - out = parallel(delayed(_fit_and_score)( + results = parallel(delayed(_fit_and_score)( clone(estimator), X, y, scorer, train, test, verbose, parameters={param_name: v}, fit_params=None, return_train_score=True, error_score=error_score) # NOTE do not change order of iteration to allow one time cv splitters for train, test in cv.split(X, y, groups) for v in param_range) - out = np.asarray(out) n_params = len(param_range) - n_cv_folds = out.shape[0] // n_params - out = out.reshape(n_cv_folds, n_params, 2).transpose((2, 1, 0)) - return out[0], out[1] + results = _aggregate_list_of_dicts(results) + train_scores = (np.asarray(results["train_scores"]) + .reshape(-1, n_params).T) + test_scores = (np.asarray(results["test_scores"]) + .reshape(-1, n_params).T) + + return train_scores, test_scores -def _aggregate_score_dicts(scores): +def _aggregate_list_of_dicts(scores, constructor=np.asarray): """Aggregate the list of dict to dict of np ndarray The aggregated output of _fit_and_score will be a list of dict @@ -1511,10 +1517,10 @@ def _aggregate_score_dicts(scores): ------- >>> scores = [{'a': 1, 'b':10}, {'a': 2, 'b':2}, {'a': 3, 'b':3}, - ... {'a': 10, 'b': 10}] # doctest: +SKIP - >>> _aggregate_score_dicts(scores) # doctest: +SKIP + ... {'a': 10, 'b': 10}] + >>> _aggregate_list_of_dicts(scores) {'a': array([1, 2, 3, 10]), 'b': array([10, 2, 3, 10])} """ - return {key: np.asarray([score[key] for score in scores]) + return {key: constructor([score[key] for score in scores]) for key in scores[0]} diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 4b74bed3fee8b..f3301606e997e 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1779,39 +1779,3 @@ def get_n_splits(self, *args, **kw): 'inconsistent results. 
Expected \\d+ ' 'splits, got \\d+'): ridge.fit(X[:train_size], y[:train_size]) - - -def test_callable_multimetric_same_as_list_of_strings(): - def custom_scorer(est, X, y): - y_pred = est.predict(X) - return {'recall': recall_score(y, y_pred), - 'accuracy': accuracy_score(y, y_pred)} - - X, y = make_classification(n_samples=40, n_features=4, - random_state=42) - est = LinearSVC(random_state=42) - search_callable = GridSearchCV(est, {'C': [0.1, 1]}, - scoring=custom_scorer, refit='recall') - search_str = GridSearchCV(est, {'C': [0.1, 1]}, - scoring=['recall', 'accuracy'], refit='recall') - - search_callable.fit(X, y) - search_str.fit(X, y) - - assert search_callable.best_score_ == pytest.approx(search_str.best_score_) - assert search_callable.best_index_ == search_str.best_index_ - - -def test_callable_multimetric_error_on_invalid_key(): - def bad_scorer(est, X, y): - return {'bad_name': 1} - - X, y = make_classification(n_samples=40, n_features=4, - random_state=42) - clf = GridSearchCV(LinearSVC(random_state=42), {'C': [0.1, 1]}, - scoring=bad_scorer, refit='good_name') - - msg = ('For multi-metric scoring, the parameter refit must be set to a ' - 'scorer key or a callable to refit') - with pytest.raises(ValueError, match=msg): - clf.fit(X, y) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index d1be6ae7b45e0..4d681f24403ee 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -1717,18 +1717,3 @@ def two_params_scorer(estimator, X_test): fit_and_score_args = [None, None, None, two_params_scorer] assert_raise_message(ValueError, error_message, _score, *fit_and_score_args) - - -def test_errors_when_key_not_in_scorer_dict(): - def scorer(est, X, y): - return {"my_key": 1} - - X, y = make_classification(n_samples=30, random_state=0) - train, test = next(ShuffleSplit().split(X)) - clf = SVC(kernel="linear", random_state=0) - - fit_and_score_args = [clf, X, y, scorer, train, test, 10, None, None] - - msg = "dict returned by scorer must contain not_my_key" - with pytest.raises(ValueError, match=msg): - _fit_and_score(*fit_and_score_args, check_scorer_key='not_my_key') From 702cf1bfb79ae86d93fb58e12a995cfade79ba40 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 3 Oct 2019 13:52:16 -0400 Subject: [PATCH 03/31] ENH Refactories _fit_and_score --- sklearn/model_selection/_search.py | 26 +++++++++---------- sklearn/model_selection/_validation.py | 24 +++++++++-------- .../model_selection/tests/test_validation.py | 2 +- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index e29919ea2b37e..42094c544f9ae 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -368,13 +368,12 @@ def fit_grid_point(X, y, estimator, parameters, train, test, scorer, # NOTE we are not using the return value as the scorer by itself should be # validated before. 
We use check_scoring only to reject multimetric scorer check_scoring(estimator, scorer) - scores, n_samples_test = _fit_and_score(estimator, X, y, - scorer, train, - test, verbose, parameters, - fit_params=fit_params, - return_n_test_samples=True, - error_score=error_score) - return scores, parameters, n_samples_test + results = _fit_and_score(estimator, X, y, scorer, train, + test, verbose, parameters, + fit_params=fit_params, + return_n_test_samples=True, + error_score=error_score) + return results["test_scores"], parameters, results["n_test_samples"] def _check_param_grid(param_grid): @@ -752,19 +751,18 @@ def evaluate_candidates(candidate_params): def _format_results(self, candidate_params, scorers, n_splits, out): n_candidates = len(candidate_params) + results = _aggregate_list_of_dicts(out, constructor=list) - # if one choose to see train score, "out" will contain train score info - if self.return_train_score: - (train_score_dicts, test_score_dicts, test_sample_counts, fit_time, - score_time) = zip(*out) - else: - (test_score_dicts, test_sample_counts, fit_time, - score_time) = zip(*out) + test_score_dicts = results["test_scores"] + test_sample_counts = results["n_test_samples"] + fit_time = results["fit_time"] + score_time = results["score_time"] # test_score_dicts and train_score dicts are lists of dictionaries and # we make them into dict of lists test_scores = _aggregate_list_of_dicts(test_score_dicts) if self.return_train_score: + train_score_dicts = results["train_scores"] train_scores = _aggregate_list_of_dicts(train_score_dicts) results = {} diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 99f9f580e155b..77df9aae6ce52 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -1499,8 +1499,8 @@ def validation_curve(estimator, X, y, param_name, param_range, groups=None, return train_scores, test_scores -def _aggregate_list_of_dicts(scores, constructor=np.asarray): - """Aggregate the list of dict to dict of np ndarray +def _aggregate_list_of_dicts(elements, constructor=np.asarray): + """Aggregate the list of dicts The aggregated output of _fit_and_score will be a list of dict of form [{'prec': 0.1, 'acc':1.0}, {'prec': 0.1, 'acc':1.0}, ...] @@ -1509,18 +1509,20 @@ def _aggregate_list_of_dicts(scores, constructor=np.asarray): Parameters ---------- - scores : list of dict - List of dicts of the scores for all scorers. This is a flat list, + elements : list of dict + List of dicts of the elements for all scorers. This is a flat list, assumed originally to be of row major order. + constructor : function, default=np.asarray + Used to combine elements of dictionaries in list + Example ------- - >>> scores = [{'a': 1, 'b':10}, {'a': 2, 'b':2}, {'a': 3, 'b':3}, - ... {'a': 10, 'b': 10}] - >>> _aggregate_list_of_dicts(scores) - {'a': array([1, 2, 3, 10]), - 'b': array([10, 2, 3, 10])} + >>> elements = [{'a': 1, 'b':10}, {'a': 2, 'b':2}, {'a': 3, 'b':3}, + ... 
{'a': 10, 'b': 10}] # doctest: +SKIP + >>> _aggregate_list_of_dicts(elements) # doctest: +SKIP + {'a': array([1, 2, 3, 10]), 'b': array([10, 2, 3, 10])} """ - return {key: constructor([score[key] for score in scores]) - for key in scores[0]} + return {key: constructor([elm[key] for elm in elements]) + for key in elements[0]} diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 4d681f24403ee..aa8c12132b09b 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -1680,7 +1680,7 @@ def test_fit_and_score_working(): 'return_parameters': True} result = _fit_and_score(*fit_and_score_args, **fit_and_score_kwargs) - assert result[-1] == fit_and_score_kwargs['parameters'] + assert result['parameters'] == fit_and_score_kwargs['parameters'] def three_params_scorer(i, j, k): From a7d2efbdbb55091a6e6aae2cdf84fdd8d49e9e6b Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 3 Oct 2019 21:45:18 -0400 Subject: [PATCH 04/31] RFC Moves support into a function --- doc/modules/model_evaluation.rst | 20 ++- sklearn/model_selection/_search.py | 90 +++++++++----- sklearn/model_selection/_validation.py | 75 ++++++++++-- sklearn/model_selection/tests/test_search.py | 114 ++++++++++++++++++ sklearn/model_selection/tests/test_split.py | 2 +- .../model_selection/tests/test_validation.py | 29 ++++- 6 files changed, 274 insertions(+), 56 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 7f0553c30a3e3..a34c6ae867fff 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -248,7 +248,7 @@ Using multiple metric evaluation Scikit-learn also permits evaluation of multiple metrics in ``GridSearchCV``, ``RandomizedSearchCV`` and ``cross_validate``. -There are two ways to specify multiple scoring metrics for the ``scoring`` +There are three ways to specify multiple scoring metrics for the ``scoring`` parameter: - As an iterable of string metrics:: @@ -263,22 +263,20 @@ parameter: Note that the dict values can either be scorer functions or one of the predefined metric strings. -Currently only those scorer functions that return a single score can be passed -inside the dict. Scorer functions that return multiple values are not -permitted and will require a wrapper to return a single metric:: +- As a callable that returns a dictionary of scores:: >>> from sklearn.model_selection import cross_validate >>> from sklearn.metrics import confusion_matrix >>> # A sample toy binary classification dataset >>> X, y = datasets.make_classification(n_classes=2, random_state=0) >>> svm = LinearSVC(random_state=0) - >>> def tn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 0] - >>> def fp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 1] - >>> def fn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 0] - >>> def tp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 1] - >>> scoring = {'tp': make_scorer(tp), 'tn': make_scorer(tn), - ... 'fp': make_scorer(fp), 'fn': make_scorer(fn)} - >>> cv_results = cross_validate(svm.fit(X, y), X, y, cv=5, scoring=scoring) + >>> def confusion_matrix_scorer(clf, X, y): + ... y_pred = clf.predict(X) + ... cm = confusion_matrix(y, y_pred) + ... return {'tn': cm[0, 0], 'fp': cm[0, 1], + ... 'fn': cm[1, 0], 'tp': cm[1, 1]} + >>> cv_results = cross_validate(svm.fit(X, y), X, y, cv=5, + ... 
scoring=confusion_matrix_scorer) >>> # Getting the test set true positive scores >>> print(cv_results['test_tp']) [10 9 8 7 8] diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 42094c544f9ae..751bdf27dbc6c 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -28,6 +28,7 @@ from ._split import check_cv from ._validation import _fit_and_score from ._validation import _aggregate_list_of_dicts +from ._validation import _check_fit_and_score_results from ..exceptions import NotFittedError from joblib import Parallel, delayed from ..utils import check_random_state @@ -445,8 +446,18 @@ def score(self, X, y=None): raise ValueError("No score function explicitly defined, " "and the estimator doesn't provide one %s" % self.best_estimator_) - score = self.scorer_[self.refit] if self.multimetric_ else self.scorer_ - return score(self.best_estimator_, X, y) + if isinstance(self.scorer_, dict): + if self.multimetric_: + scorer = self.scorer_[self.refit] + else: + scorer = self.scorer_ + return scorer(self.best_estimator_, X, y) + + # callable + score = self.scorer_(self.best_estimator_, X, y) + if self.multimetric_: + score = score[self.refit] + return score def _check_is_fitted(self, method_name): if not self.refit: @@ -626,29 +637,31 @@ def fit(self, X, y=None, groups=None, **fit_params): estimator = self.estimator cv = check_cv(self.cv, y, classifier=is_classifier(estimator)) - if (callable(self.scoring) or self.scoring is None - or isinstance(self.scoring, str)): - self.multimetric_ = False - scorers = {"score": check_scoring(self.estimator, self.scoring)} - refit_metric = 'score' + multimetric_refit_msg = ("For multi-metric scoring, the parameter " + "refit must be set to a scorer key or a " + "callable to refit an estimator with the " + "best parameter setting on the whole " + "data and make the best_* attributes " + "available for that metric. If this is " + "not needed, refit should be set to " + "False explicitly. %r was passed." + % self.refit) + + refit_metric = "score" + scoring_callable = callable(self.scoring) + if scoring_callable: + scorers = self.scoring + elif (self.scoring is None or isinstance(self.scoring, str)): + scorers = check_scoring(self.estimator, self.scoring) else: - self.multimetric_ = True scorers = _check_multimetric_scoring(self.estimator, self.scoring) refit_metric = self.refit - if self.multimetric_ and self.refit is not False and ( - not isinstance(self.refit, str) or - # This will work for both dict / list (tuple) - self.refit not in scorers) and not callable(self.refit): - raise ValueError("For multi-metric scoring, the parameter " - "refit must be set to a scorer key or a " - "callable to refit an estimator with the " - "best parameter setting on the whole " - "data and make the best_* attributes " - "available for that metric. If this is " - "not needed, refit should be set to " - "False explicitly. %r was passed." 
- % self.refit) + if self.refit is not False and ( + not isinstance(self.refit, str) or + # This will work for both dict / list (tuple) + self.refit not in scorers) and not callable(self.refit): + raise ValueError(multimetric_refit_msg) X, y, groups = indexable(X, y, groups) n_splits = cv.get_n_splits(X, y, groups) @@ -664,6 +677,7 @@ def fit(self, X, y=None, groups=None, **fit_params): return_n_test_samples=True, return_times=True, return_parameters=False, + return_fit_failed=True, error_score=self.error_score, verbose=self.verbose) results = {} @@ -705,11 +719,26 @@ def evaluate_candidates(candidate_params): nonlocal results results = self._format_results( - all_candidate_params, scorers, n_splits, all_out) + all_candidate_params, n_splits, all_out) return results self._run_search(evaluate_candidates) + for out in all_out: + if not out["fit_failed"]: + successful_score = out['test_scores'] + break + + self.multimetric_ = isinstance(successful_score, dict) + + # scorer is callable, check refit_metric now + if scoring_callable and self.multimetric_: + if (self.refit is not False and not callable(self.refit) + and (not isinstance(self.refit, str) + or self.refit not in successful_score)): + raise ValueError(multimetric_refit_msg) + refit_metric = self.refit + # For multi-metric evaluation, store the best_index_, best_params_ and # best_score_ iff refit is one of the scorer names # In single metric evaluation, refit_metric is "score" @@ -742,28 +771,27 @@ def evaluate_candidates(candidate_params): self.refit_time_ = refit_end_time - refit_start_time # Store the only scorer not as a dict for single metric evaluation - self.scorer_ = scorers if self.multimetric_ else scorers['score'] + self.scorer_ = scorers self.cv_results_ = results self.n_splits_ = n_splits return self - def _format_results(self, candidate_params, scorers, n_splits, out): + def _format_results(self, candidate_params, n_splits, out): n_candidates = len(candidate_params) results = _aggregate_list_of_dicts(out, constructor=list) - test_score_dicts = results["test_scores"] test_sample_counts = results["n_test_samples"] fit_time = results["fit_time"] score_time = results["score_time"] - # test_score_dicts and train_score dicts are lists of dictionaries and - # we make them into dict of lists - test_scores = _aggregate_list_of_dicts(test_score_dicts) + info_dict = _check_fit_and_score_results(results, self.error_score) + score_names = info_dict["score_names"] + test_scores = info_dict["test_scores"] + if self.return_train_score: - train_score_dicts = results["train_scores"] - train_scores = _aggregate_list_of_dicts(train_score_dicts) + train_scores = info_dict["train_scores"] results = {} @@ -824,7 +852,7 @@ def _store(key_name, array, weights=None, splits=False, rank=False): else: iid = False - for scorer_name in scorers.keys(): + for scorer_name in score_names: # Computed the (weighted) mean and std for test scores alone _store('test_%s' % scorer_name, test_scores[scorer_name], splits=True, rank=True, diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 77df9aae6ce52..a733cde888747 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -27,7 +27,7 @@ from ..utils.metaestimators import _safe_split from ..metrics.scorer import (check_scoring, _check_multimetric_scoring, _MultimetricScorer) -from ..exceptions import FitFailedWarning +from ..exceptions import FitFailedWarning, NotFittedError from ._split import check_cv from ..preprocessing 
import LabelEncoder @@ -221,8 +221,10 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, cv = check_cv(cv, y, classifier=is_classifier(estimator)) - if callable(scoring) or scoring is None or isinstance(scoring, str): - scorers = {"score": check_scoring(estimator, scoring)} + if callable(scoring): + scorers = scoring + elif scoring is None or isinstance(scoring, str): + scorers = check_scoring(estimator, scoring) else: scorers = _check_multimetric_scoring(estimator, scoring) @@ -235,15 +237,17 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, clone(estimator), X, y, scorers, train, test, verbose, None, fit_params, return_train_score=return_train_score, return_times=True, return_estimator=return_estimator, - error_score=error_score) + error_score=error_score, return_fit_failed=True) for train, test in cv.split(X, y, groups)) results = _aggregate_list_of_dicts(results, constructor=list) - if return_train_score: - train_scores = _aggregate_list_of_dicts(results["train_scores"]) + + info_dict = _check_fit_and_score_results(results, error_score) + score_names = info_dict["score_names"] + test_scores = info_dict["test_scores"] + if return_estimator: fitted_estimators = results["estimator"] - test_scores = _aggregate_list_of_dicts(results["test_scores"]) ret = {} ret['fit_time'] = np.array(results["fit_time"]) @@ -252,15 +256,52 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, if return_estimator: ret['estimator'] = fitted_estimators - for name in scorers: + for name in score_names: ret['test_%s' % name] = np.array(test_scores[name]) if return_train_score: + train_scores = info_dict["train_scores"] key = 'train_%s' % name ret[key] = np.array(train_scores[name]) return ret +def _check_fit_and_score_results(results, error_score): + """Checks _fit_and_score results. Handles scoring as a callable and + normalizes scores into a list of dictionaries. 
+ """ + fit_failed = results["fit_failed"] + test_score_dicts = results["test_scores"] + + if all(fit_failed): + raise NotFittedError("All estimators failed to fit") + + successful_score = test_score_dicts[fit_failed.index(False)] + if any(fit_failed) and isinstance(successful_score, dict): + for i in np.flatnonzero(fit_failed): + # error_score is a number + test_score_dicts[i] = {name: error_score + for name in successful_score} + + output = {} + # converts single metrics into a list of dictionaries + if not isinstance(successful_score, dict): + test_score_dicts = [{"score": elm} for elm in test_score_dicts] + output["score_names"] = ["score"] + else: + output["score_names"] = list(successful_score.keys()) + + output["test_scores"] = _aggregate_list_of_dicts(test_score_dicts) + + if "train_scores" in results: + train_score_dicts = results["train_scores"] + if not isinstance(successful_score, dict): + train_score_dicts = [{"score": elm} for elm in train_score_dicts] + output["train_scores"] = _aggregate_list_of_dicts(train_score_dicts) + + return output + + def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', error_score=np.nan): @@ -396,7 +437,7 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score=False, return_parameters=False, return_n_test_samples=False, return_times=False, return_estimator=False, - error_score=np.nan): + error_score=np.nan, return_fit_failed=False): """Fit estimator and compute scores for a given dataset split. Parameters @@ -457,6 +498,10 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, return_estimator : boolean, optional, default: False Whether to return the fitted estimator. + return_fit_failed : bool, default=False + Whether to return if estimatored failed to fit, when error_score is + numeric. + Returns ------- result: dict with the following attributes @@ -481,6 +526,9 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, estimator : estimator object The fitted estimator + + fit_failed : bool + The estimator failed to fit. """ if verbose > 1: if parameters is None: @@ -503,6 +551,7 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, X_train, y_train = _safe_split(estimator, X, y, train) X_test, y_test = _safe_split(estimator, X, y, test, train) + result = {} try: if y_train is None: estimator.fit(X_train, **fit_params) @@ -533,8 +582,12 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, raise ValueError("error_score must be the string 'raise' or a" " numeric value. 
(Hint: if using 'raise', please" " make sure that it has been spelled correctly.)") - + if return_fit_failed: + result["fit_failed"] = True else: + if return_fit_failed: + result["fit_failed"] = False + fit_time = time.time() - start_time test_scores = _score(estimator, X_test, y_test, scorer) score_time = time.time() - start_time - fit_time @@ -559,7 +612,7 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, total_time = score_time + fit_time print(_message_with_time('CV', msg, total_time)) - result = {"test_scores": test_scores} + result["test_scores"] = test_scores if return_train_score: result["train_scores"] = train_scores if return_n_test_samples: diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index f3301606e997e..8ff8a1287127b 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -61,6 +61,7 @@ from sklearn.metrics import accuracy_score from sklearn.metrics import make_scorer from sklearn.metrics import roc_auc_score +from sklearn.metrics import confusion_matrix from sklearn.impute import SimpleImputer from sklearn.pipeline import Pipeline from sklearn.linear_model import Ridge, SGDClassifier @@ -1779,3 +1780,116 @@ def get_n_splits(self, *args, **kw): 'inconsistent results. Expected \\d+ ' 'splits, got \\d+'): ridge.fit(X[:train_size], y[:train_size]) + + +def test_callable_multimetric_confusion_matrix(): + def custom_scorer(clf, X, y): + y_pred = clf.predict(X) + cm = confusion_matrix(y, y_pred) + return {'tn': cm[0, 0], 'fp': cm[0, 1], 'fn': cm[1, 0], 'tp': cm[1, 1]} + + X, y = make_classification(n_samples=40, n_features=4, + random_state=42) + est = LinearSVC(random_state=42) + search = GridSearchCV(est, {'C': [0.1, 1]}, scoring=custom_scorer, + refit='fp') + + search.fit(X, y) + + score_names = ['tn', 'fp', 'fn', 'tp'] + for name in score_names: + assert "mean_test_{}".format(name) in search.cv_results_ + + y_pred = search.predict(X) + cm = confusion_matrix(y, y_pred) + assert search.score(X, y) == pytest.approx(cm[0, 1]) + + +def test_callable_multimetric_same_as_list_of_strings(): + def custom_scorer(est, X, y): + y_pred = est.predict(X) + return {'recall': recall_score(y, y_pred), + 'accuracy': accuracy_score(y, y_pred)} + + X, y = make_classification(n_samples=40, n_features=4, + random_state=42) + est = LinearSVC(random_state=42) + search_callable = GridSearchCV(est, {'C': [0.1, 1]}, + scoring=custom_scorer, refit='recall') + search_str = GridSearchCV(est, {'C': [0.1, 1]}, + scoring=['recall', 'accuracy'], refit='recall') + + search_callable.fit(X, y) + search_str.fit(X, y) + + assert search_callable.best_score_ == pytest.approx(search_str.best_score_) + assert search_callable.best_index_ == search_str.best_index_ + assert search_callable.score(X, y) == pytest.approx(search_str.score(X, y)) + + +def test_callable_single_metric_same_as_single_string(): + def custom_scorer(est, X, y): + y_pred = est.predict(X) + return recall_score(y, y_pred) + + X, y = make_classification(n_samples=40, n_features=4, + random_state=42) + est = LinearSVC(random_state=42) + search_callable = GridSearchCV(est, {'C': [0.1, 1]}, + scoring=custom_scorer, refit=True) + search_str = GridSearchCV(est, {'C': [0.1, 1]}, + scoring='recall', refit='recall') + + search_callable.fit(X, y) + search_str.fit(X, y) + + assert search_callable.best_score_ == pytest.approx(search_str.best_score_) + assert search_callable.best_index_ == search_str.best_index_ + assert 
search_callable.score(X, y) == pytest.approx(search_str.score(X, y)) + + +def test_callable_multimetric_error_on_invalid_key(): + def bad_scorer(est, X, y): + return {'bad_name': 1} + + X, y = make_classification(n_samples=40, n_features=4, + random_state=42) + clf = GridSearchCV(LinearSVC(random_state=42), {'C': [0.1, 1]}, + scoring=bad_scorer, refit='good_name') + + msg = ('For multi-metric scoring, the parameter refit must be set to a ' + 'scorer key or a callable to refit') + with pytest.raises(ValueError, match=msg): + clf.fit(X, y) + + +def test_callable_multimetric_error_failing_clf(): + def custom_scorer(est, X, y): + return {'acc': 1} + + X, y = make_classification(n_samples=20, n_features=10, random_state=0) + + clf = FailingClassifier() + gs = GridSearchCV(clf, [{'parameter': [0, 1, 2]}], scoring=custom_scorer, + refit=False, error_score=0.1) + + with pytest.warns(FitFailedWarning, match='Estimator fit failed'): + gs.fit(X, y) + + assert_allclose(gs.cv_results_['mean_test_acc'], [1, 1, 0.1]) + + +def test_callable_multimetric_clf_all_fails(): + def custom_scorer(est, X, y): + return {'acc': 1} + X, y = make_classification(n_samples=20, n_features=10, random_state=0) + + clf = FailingClassifier() + + gs = GridSearchCV(clf, [{'parameter': [2, 2, 2]}], scoring=custom_scorer, + refit=False, error_score=0.1) + + with pytest.warns(FitFailedWarning, match='Estimator fit failed'), \ + pytest.raises(NotFittedError, + match="All estimators failed to fit"): + gs.fit(X, y) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 12891c6004f90..09d1bea05fd44 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1517,7 +1517,7 @@ def test_nested_cv(): StratifiedShuffleSplit(n_splits=3, random_state=0)] for inner_cv, outer_cv in combinations_with_replacement(cvs, 2): - gs = GridSearchCV(Ridge(solver="eigen"), param_grid={'alpha': [1, .1]}, + gs = GridSearchCV(Ridge(), param_grid={'alpha': [1, .1]}, cv=inner_cv, error_score='raise') cross_val_score(gs, X=X, y=y, groups=groups, cv=outer_cv, fit_params={'groups': groups}) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index aa8c12132b09b..9448f6b2b0740 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -52,13 +52,14 @@ from sklearn.metrics import precision_recall_fscore_support from sklearn.metrics import precision_score from sklearn.metrics import r2_score +from sklearn.metrics import mean_squared_error from sklearn.metrics.scorer import check_scoring from sklearn.linear_model import Ridge, LogisticRegression, SGDClassifier from sklearn.linear_model import PassiveAggressiveClassifier, RidgeClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.neighbors import KNeighborsClassifier -from sklearn.svm import SVC +from sklearn.svm import SVC, LinearSVC from sklearn.cluster import KMeans from sklearn.impute import SimpleImputer @@ -443,9 +444,16 @@ def check_cross_validate_multi_metric(clf, X, y, scores): # Test multimetric evaluation when scoring is a list / dict (train_mse_scores, test_mse_scores, train_r2_scores, test_r2_scores, fitted_estimators) = scores + + def custom_scorer(clf, X, y): + y_pred = clf.predict(X) + return {'r2': r2_score(y, y_pred), + 'neg_mean_squared_error': -mean_squared_error(y, y_pred)} + all_scoring = (('r2', 'neg_mean_squared_error'), {'r2': 
make_scorer(r2_score), - 'neg_mean_squared_error': 'neg_mean_squared_error'}) + 'neg_mean_squared_error': 'neg_mean_squared_error'}, + custom_scorer) keys_sans_train = {'test_r2', 'test_neg_mean_squared_error', 'fit_time', 'score_time'} @@ -1717,3 +1725,20 @@ def two_params_scorer(estimator, X_test): fit_and_score_args = [None, None, None, two_params_scorer] assert_raise_message(ValueError, error_message, _score, *fit_and_score_args) + + +def test_callable_multimetric_confusion_matrix_cross_validate(): + def custom_scorer(clf, X, y): + y_pred = clf.predict(X) + cm = confusion_matrix(y, y_pred) + return {'tn': cm[0, 0], 'fp': cm[0, 1], 'fn': cm[1, 0], 'tp': cm[1, 1]} + + X, y = make_classification(n_samples=40, n_features=4, + random_state=42) + est = LinearSVC(random_state=42) + est.fit(X, y) + cv_results = cross_validate(est, X, y, cv=5, scoring=custom_scorer) + + score_names = ['tn', 'fp', 'fn', 'tp'] + for name in score_names: + assert "test_{}".format(name) in cv_results From c77afd756bff382385b2b44990bd8826ed85bb59 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 3 Oct 2019 22:00:51 -0400 Subject: [PATCH 05/31] BUG Fix old numpy bug --- sklearn/model_selection/_validation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index a733cde888747..75a99a97909a9 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -278,8 +278,9 @@ def _check_fit_and_score_results(results, error_score): successful_score = test_score_dicts[fit_failed.index(False)] if any(fit_failed) and isinstance(successful_score, dict): - for i in np.flatnonzero(fit_failed): - # error_score is a number + for i, failed in enumerate(fit_failed): + if not failed: + continue test_score_dicts[i] = {name: error_score for name in successful_score} From 5ab8693fcbecbe33d9b30605793c1d68cf203bc2 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 3 Oct 2019 22:17:11 -0400 Subject: [PATCH 06/31] TST Removes tests for error on multimetric --- sklearn/metrics/scorer.py | 2 +- sklearn/model_selection/tests/test_validation.py | 14 ++------------ 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index 6c80894a0ee13..f99ab0bc1e149 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -459,7 +459,7 @@ def _check_multimetric_scoring(estimator, scoring): A dict mapping each scorer name to its validated scorer. """ err_msg_generic = ("scoring should either be a single string or " - "callable for single metric evaluation or a " + "callable or a " "list/tuple of strings or a dict of scorer name " "mapped to the callable for multiple metric " "evaluation. 
Got %s of type %s" diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 9448f6b2b0740..df185918bb2da 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -315,8 +315,8 @@ def test_cross_validate_invalid_scoring_param(): cross_validate, estimator, X, y, scoring=[[make_scorer(precision_score)]]) - error_message_regexp = (".*should either be.*string or callable.*for " - "single.*.*dict.*for multi.*") + error_message_regexp = (".*should either be.*string or callable.*" + ".*.*dict.*for multi.*") # Empty dict should raise invalid scoring error assert_raises_regex(ValueError, "An empty dict", @@ -340,16 +340,6 @@ def test_cross_validate_invalid_scoring_param(): cross_validate, estimator, X, y, scoring={"foo": multiclass_scorer}) - multivalued_scorer = make_scorer(confusion_matrix) - - # Multiclass Scorers that return multiple values are not supported yet - assert_raises_regex(ValueError, "scoring must return a number, got", - cross_validate, SVC(), X, y, - scoring=multivalued_scorer) - assert_raises_regex(ValueError, "scoring must return a number, got", - cross_validate, SVC(), X, y, - scoring={"foo": multivalued_scorer}) - assert_raises_regex(ValueError, "'mse' is not a valid scoring value.", cross_validate, SVC(), X, y, scoring="mse") From e8f8c9fd767090542de0163a661f06fd740e3fd1 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 4 Dec 2019 10:47:34 -0500 Subject: [PATCH 07/31] DOC Indent --- doc/modules/model_evaluation.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 60341e4d2e78b..0cdda4ec1e09b 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -260,8 +260,8 @@ parameter: >>> scoring = {'accuracy': make_scorer(accuracy_score), ... 'prec': 'precision'} -Note that the dict values can either be scorer functions or one of the -predefined metric strings. + Note that the dict values can either be scorer functions or one of the + predefined metric strings. - As a callable that returns a dictionary of scores:: From 5f50a323e56c2ee7d6f9a3ff53ad896497465a78 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 4 Dec 2019 12:36:44 -0500 Subject: [PATCH 08/31] CLN Refactors multimetric check --- sklearn/model_selection/_search.py | 47 ++++++++++++-------------- sklearn/model_selection/_validation.py | 17 +++------- 2 files changed, 26 insertions(+), 38 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index d43c066a3920c..d37edb7f014f1 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -610,6 +610,23 @@ def _run_search(self, evaluate_candidates): """ raise NotImplementedError("_run_search not implemented.") + def _check_multimetric_scores_refit(self, scores_dict): + """Check score contains the string in refit""" + multimetric_refit_msg = ("For multi-metric scoring, the parameter " + "refit must be set to a scorer key or a " + "callable to refit an estimator with the " + "best parameter setting on the whole " + "data and make the best_* attributes " + "available for that metric. If this is " + "not needed, refit should be set to " + "False explicitly. %r was passed." 
+ % self.refit) + if self.refit is not False and ( + not isinstance(self.refit, str) or + # This will work for both dict / list (tuple) + self.refit not in scores_dict) and not callable(self.refit): + raise ValueError(multimetric_refit_msg) + def fit(self, X, y=None, groups=None, **fit_params): """Run fit with all sets of parameters. @@ -635,32 +652,16 @@ def fit(self, X, y=None, groups=None, **fit_params): estimator = self.estimator cv = check_cv(self.cv, y, classifier=is_classifier(estimator)) - multimetric_refit_msg = ("For multi-metric scoring, the parameter " - "refit must be set to a scorer key or a " - "callable to refit an estimator with the " - "best parameter setting on the whole " - "data and make the best_* attributes " - "available for that metric. If this is " - "not needed, refit should be set to " - "False explicitly. %r was passed." - % self.refit) - refit_metric = "score" - scoring_callable = callable(self.scoring) - if scoring_callable: + if callable(self.scoring): scorers = self.scoring - elif (self.scoring is None or isinstance(self.scoring, str)): + elif self.scoring is None or isinstance(self.scoring, str): scorers = check_scoring(self.estimator, self.scoring) else: scorers = _check_multimetric_scoring(self.estimator, self.scoring) + self._check_multimetric_scores_refit(scorers) refit_metric = self.refit - if self.refit is not False and ( - not isinstance(self.refit, str) or - # This will work for both dict / list (tuple) - self.refit not in scorers) and not callable(self.refit): - raise ValueError(multimetric_refit_msg) - X, y, groups = indexable(X, y, groups) # make sure fit_params are sliceable fit_params_values = indexable(*fit_params.values()) @@ -679,7 +680,6 @@ def fit(self, X, y=None, groups=None, **fit_params): return_n_test_samples=True, return_times=True, return_parameters=False, - return_fit_failed=True, error_score=self.error_score, verbose=self.verbose) results = {} @@ -734,11 +734,8 @@ def evaluate_candidates(candidate_params): self.multimetric_ = isinstance(successful_score, dict) # scorer is callable, check refit_metric now - if scoring_callable and self.multimetric_: - if (self.refit is not False and not callable(self.refit) - and (not isinstance(self.refit, str) - or self.refit not in successful_score)): - raise ValueError(multimetric_refit_msg) + if callable(self.scoring) and self.multimetric_: + self._check_multimetric_scores_refit(successful_score) refit_metric = self.refit # For multi-metric evaluation, store the best_index_, best_params_ and diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 3c65d0e8f2ce4..0c2aea24551d8 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -237,7 +237,7 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, clone(estimator), X, y, scorers, train, test, verbose, None, fit_params, return_train_score=return_train_score, return_times=True, return_estimator=return_estimator, - error_score=error_score, return_fit_failed=True) + error_score=error_score) for train, test in cv.split(X, y, groups)) results = _aggregate_list_of_dicts(results, constructor=list) @@ -267,9 +267,6 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, def _check_fit_and_score_results(results, error_score): - """Checks _fit_and_score results. Handles scoring as a callable and - normalizes scores into a list of dictionaries. 
- """ fit_failed = results["fit_failed"] test_score_dicts = results["test_scores"] @@ -438,7 +435,7 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score=False, return_parameters=False, return_n_test_samples=False, return_times=False, return_estimator=False, - error_score=np.nan, return_fit_failed=False): + error_score=np.nan): """Fit estimator and compute scores for a given dataset split. Parameters @@ -499,10 +496,6 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, return_estimator : boolean, optional, default: False Whether to return the fitted estimator. - return_fit_failed : bool, default=False - Whether to return if estimatored failed to fit, when error_score is - numeric. - Returns ------- result: dict with the following attributes @@ -590,11 +583,9 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, raise ValueError("error_score must be the string 'raise' or a" " numeric value. (Hint: if using 'raise', please" " make sure that it has been spelled correctly.)") - if return_fit_failed: - result["fit_failed"] = True + result["fit_failed"] = True else: - if return_fit_failed: - result["fit_failed"] = False + result["fit_failed"] = False fit_time = time.time() - start_time test_scores = _score(estimator, X_test, y_test, scorer) From 57c390a962a558ca52e70b0a5b86c94bac1ca1e1 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 3 Feb 2020 20:25:31 -0500 Subject: [PATCH 09/31] CLN Address comments --- sklearn/model_selection/_search.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 157cc976de359..457af285e1e5f 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -610,7 +610,7 @@ def _run_search(self, evaluate_candidates): """ raise NotImplementedError("_run_search not implemented.") - def _check_multimetric_scores_refit(self, scores_dict): + def _check_refit_for_multimetric(self, scores_dict): """Check score contains the string in refit""" multimetric_refit_msg = ("For multi-metric scoring, the parameter " "refit must be set to a scorer key or a " @@ -660,7 +660,7 @@ def fit(self, X, y=None, groups=None, **fit_params): scorers = check_scoring(self.estimator, self.scoring) else: scorers = _check_multimetric_scoring(self.estimator, self.scoring) - self._check_multimetric_scores_refit(scorers) + self._check_refit_for_multimetric(scorers) refit_metric = self.refit X, y, groups = indexable(X, y, groups) @@ -734,7 +734,7 @@ def evaluate_candidates(candidate_params): # scorer is callable, check refit_metric now if callable(self.scoring) and self.multimetric_: - self._check_multimetric_scores_refit(successful_score) + self._check_refit_for_multimetric(successful_score) refit_metric = self.refit # For multi-metric evaluation, store the best_index_, best_params_ and From 1b28907d52a3620b4d8c2d7b92326bac45546da4 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 5 Feb 2020 09:48:09 -0500 Subject: [PATCH 10/31] CLN Simplifies checking --- sklearn/model_selection/_search.py | 3 +-- sklearn/model_selection/_validation.py | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 457af285e1e5f..39a2178ce5f55 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -787,7 +787,6 @@ def _format_results(self, candidate_params, 
n_splits, out): score_time = results["score_time"] info_dict = _check_fit_and_score_results(results, self.error_score) - score_names = info_dict["score_names"] test_scores = info_dict["test_scores"] if self.return_train_score: @@ -852,7 +851,7 @@ def _store(key_name, array, weights=None, splits=False, rank=False): else: iid = False - for scorer_name in score_names: + for scorer_name in test_scores: # Computed the (weighted) mean and std for test scores alone _store('test_%s' % scorer_name, test_scores[scorer_name], splits=True, rank=True, diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 0cda7a7852710..ef87ced331df1 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -245,7 +245,6 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, results = _aggregate_list_of_dicts(results, constructor=list) info_dict = _check_fit_and_score_results(results, error_score) - score_names = info_dict["score_names"] test_scores = info_dict["test_scores"] if return_estimator: @@ -258,7 +257,7 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, if return_estimator: ret['estimator'] = fitted_estimators - for name in score_names: + for name in test_scores: ret['test_%s' % name] = np.array(test_scores[name]) if return_train_score: train_scores = info_dict["train_scores"] @@ -272,14 +271,18 @@ def _check_fit_and_score_results(results, error_score): fit_failed = results["fit_failed"] test_score_dicts = results["test_scores"] - if all(fit_failed): + failed_indices = [] + for i, failed in enumerate(fit_failed): + if failed: + failed_indices.append(i) + else: + successful_score = test_score_dicts[i] + + if len(failed_indices) == len(fit_failed): raise NotFittedError("All estimators failed to fit") - successful_score = test_score_dicts[fit_failed.index(False)] - if any(fit_failed) and isinstance(successful_score, dict): - for i, failed in enumerate(fit_failed): - if not failed: - continue + if failed_indices and isinstance(successful_score, dict): + for i in failed_indices: test_score_dicts[i] = {name: error_score for name in successful_score} @@ -287,9 +290,6 @@ def _check_fit_and_score_results(results, error_score): # converts single metrics into a list of dictionaries if not isinstance(successful_score, dict): test_score_dicts = [{"score": elm} for elm in test_score_dicts] - output["score_names"] = ["score"] - else: - output["score_names"] = list(successful_score.keys()) output["test_scores"] = _aggregate_list_of_dicts(test_score_dicts) From 2cf9ba8169b0635e8821b4d4140c2044393f893a Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 5 Feb 2020 10:56:35 -0500 Subject: [PATCH 11/31] CLN Simplifies aggregation --- sklearn/model_selection/_search.py | 2 +- sklearn/model_selection/_validation.py | 35 +++++++++++--------------- 2 files changed, 16 insertions(+), 21 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 39a2178ce5f55..7a6bba46ec659 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -780,7 +780,7 @@ def evaluate_candidates(candidate_params): def _format_results(self, candidate_params, n_splits, out): n_candidates = len(candidate_params) - results = _aggregate_list_of_dicts(out, constructor=list) + results = _aggregate_list_of_dicts(out) test_sample_counts = results["n_test_samples"] fit_time = results["fit_time"] diff --git a/sklearn/model_selection/_validation.py 
b/sklearn/model_selection/_validation.py index ef87ced331df1..8c474175868c2 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -242,7 +242,7 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, error_score=error_score) for train, test in cv.split(X, y, groups)) - results = _aggregate_list_of_dicts(results, constructor=list) + results = _aggregate_list_of_dicts(results) info_dict = _check_fit_and_score_results(results, error_score) test_scores = info_dict["test_scores"] @@ -251,23 +251,26 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, fitted_estimators = results["estimator"] ret = {} - ret['fit_time'] = np.array(results["fit_time"]) - ret['score_time'] = np.array(results["score_time"]) + ret['fit_time'] = results["fit_time"] + ret['score_time'] = results["score_time"] if return_estimator: ret['estimator'] = fitted_estimators for name in test_scores: - ret['test_%s' % name] = np.array(test_scores[name]) + ret['test_%s' % name] = test_scores[name] if return_train_score: train_scores = info_dict["train_scores"] key = 'train_%s' % name - ret[key] = np.array(train_scores[name]) + ret[key] = train_scores[name] return ret def _check_fit_and_score_results(results, error_score): + """Aggregate scores in results into a single dictionary of scores. Results + that failed are set to error_score + """ fit_failed = results["fit_failed"] test_score_dicts = results["test_scores"] @@ -1310,19 +1313,14 @@ def learning_curve(estimator, X, y, groups=None, parameters=None, fit_params=None, return_train_score=True, error_score=error_score, return_times=return_times) for train, test in train_test_proportions) - results = _aggregate_list_of_dicts(results, constructor=list) - train_scores = (np.array(results["train_scores"]) - .reshape(-1, n_unique_ticks).T) - - test_scores = (np.array(results["test_scores"]) - .reshape(-1, n_unique_ticks).T) + results = _aggregate_list_of_dicts(results) + train_scores = results["train_scores"].reshape(-1, n_unique_ticks).T + test_scores = results["test_scores"].reshape(-1, n_unique_ticks).T out = [train_scores, test_scores] if return_times: - fit_times = (np.array(results["fit_time"]) - .reshape(-1, n_unique_ticks).T) - score_times = (np.array(results["score_time"]) - .reshape(-1, n_unique_ticks).T) + fit_times = results["fit_time"].reshape(-1, n_unique_ticks).T + score_times = results["score_time"].reshape(-1, n_unique_ticks).T out.extend([fit_times, score_times]) ret = train_sizes_abs, out[0], out[1] @@ -1545,7 +1543,7 @@ def validation_curve(estimator, X, y, param_name, param_range, groups=None, return train_scores, test_scores -def _aggregate_list_of_dicts(elements, constructor=np.asarray): +def _aggregate_list_of_dicts(elements): """Aggregate the list of dicts The aggregated output of _fit_and_score will be a list of dict @@ -1559,9 +1557,6 @@ def _aggregate_list_of_dicts(elements, constructor=np.asarray): List of dicts of the elements for all scorers. This is a flat list, assumed originally to be of row major order. 
- constructor : function, default=np.asarray - Used to combine elements of dictionaries in list - Example ------- @@ -1570,5 +1565,5 @@ def _aggregate_list_of_dicts(elements, constructor=np.asarray): >>> _aggregate_list_of_dicts(elements) # doctest: +SKIP {'a': array([1, 2, 3, 10]), 'b': array([10, 2, 3, 10])} """ - return {key: constructor([elm[key] for elm in elements]) + return {key: np.asarray([elm[key] for elm in elements]) for key in elements[0]} From f336d64a01d77b5d51d441519e06f0e6d593afab Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 5 Feb 2020 11:48:07 -0500 Subject: [PATCH 12/31] CLN Less code the better --- sklearn/model_selection/_validation.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 8c474175868c2..368494ae9a078 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -1535,10 +1535,8 @@ def validation_curve(estimator, X, y, param_name, param_range, groups=None, n_params = len(param_range) results = _aggregate_list_of_dicts(results) - train_scores = (np.asarray(results["train_scores"]) - .reshape(-1, n_params).T) - test_scores = (np.asarray(results["test_scores"]) - .reshape(-1, n_params).T) + train_scores = results["train_scores"].reshape(-1, n_params).T + test_scores = results["test_scores"].reshape(-1, n_params).T return train_scores, test_scores @@ -1560,10 +1558,13 @@ def _aggregate_list_of_dicts(elements): Example ------- - >>> elements = [{'a': 1, 'b':10}, {'a': 2, 'b':2}, {'a': 3, 'b':3}, - ... {'a': 10, 'b': 10}] # doctest: +SKIP - >>> _aggregate_list_of_dicts(elements) # doctest: +SKIP - {'a': array([1, 2, 3, 10]), 'b': array([10, 2, 3, 10])} + >>> elements = [{'a': 1, 'b': 10}, {'a': 2, 'b': 2}, {'a': 3, 'b': 3}, + ... 
{'a': 10, 'b': 10}] + >>> output = _aggregate_list_of_dicts(elements) + >>> output['a'] + array([ 1, 2, 3, 10]) + >>> output['b'] + array([10, 2, 3, 10]) """ return {key: np.asarray([elm[key] for elm in elements]) for key in elements[0]} From a86eaf04ab9f030e0f46ff90431087b880e30d09 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 5 Feb 2020 12:02:32 -0500 Subject: [PATCH 13/31] CLN Moves definition closer to usage --- sklearn/model_selection/_search.py | 18 ++++++++---------- sklearn/model_selection/_validation.py | 11 +++++------ 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 7a6bba46ec659..698fc4e854348 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -780,17 +780,13 @@ def evaluate_candidates(candidate_params): def _format_results(self, candidate_params, n_splits, out): n_candidates = len(candidate_params) - results = _aggregate_list_of_dicts(out) + agg_out = _aggregate_list_of_dicts(out) - test_sample_counts = results["n_test_samples"] - fit_time = results["fit_time"] - score_time = results["score_time"] + test_sample_counts = agg_out["n_test_samples"] + fit_time = agg_out["fit_time"] + score_time = agg_out["score_time"] - info_dict = _check_fit_and_score_results(results, self.error_score) - test_scores = info_dict["test_scores"] - - if self.return_train_score: - train_scores = info_dict["train_scores"] + score_results = _check_fit_and_score_results(agg_out, self.error_score) results = {} @@ -851,13 +847,15 @@ def _store(key_name, array, weights=None, splits=False, rank=False): else: iid = False + test_scores = score_results["test_scores"] for scorer_name in test_scores: # Computed the (weighted) mean and std for test scores alone _store('test_%s' % scorer_name, test_scores[scorer_name], splits=True, rank=True, weights=test_sample_counts if iid else None) if self.return_train_score: - _store('train_%s' % scorer_name, train_scores[scorer_name], + _store('train_%s' % scorer_name, + score_results["train_scores"][scorer_name], splits=True) return results diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 368494ae9a078..06c837dbf603f 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -244,9 +244,6 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, results = _aggregate_list_of_dicts(results) - info_dict = _check_fit_and_score_results(results, error_score) - test_scores = info_dict["test_scores"] - if return_estimator: fitted_estimators = results["estimator"] @@ -257,19 +254,21 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, if return_estimator: ret['estimator'] = fitted_estimators + score_results = _check_fit_and_score_results(results, error_score) + test_scores = score_results["test_scores"] for name in test_scores: ret['test_%s' % name] = test_scores[name] if return_train_score: - train_scores = info_dict["train_scores"] key = 'train_%s' % name - ret[key] = train_scores[name] + ret[key] = score_results["train_scores"][name] return ret def _check_fit_and_score_results(results, error_score): """Aggregate scores in results into a single dictionary of scores. Results - that failed are set to error_score + that failed are set to error_score. `results` are the aggregated output + of `_fit_and_score`. 
""" fit_failed = results["fit_failed"] test_score_dicts = results["test_scores"] From b1782aed857a8d8d4115b89dc8fc06daec96947b Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 6 Feb 2020 17:16:42 -0500 Subject: [PATCH 14/31] CLN Update error handling --- sklearn/model_selection/_search.py | 49 +++++++++++--------- sklearn/model_selection/_validation.py | 63 +++++++++++++------------- 2 files changed, 60 insertions(+), 52 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 698fc4e854348..ea5ee4db7ff33 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -656,12 +656,15 @@ def fit(self, X, y=None, groups=None, **fit_params): refit_metric = "score" if callable(self.scoring): scorers = self.scoring + check_fit_and_score_results = True elif self.scoring is None or isinstance(self.scoring, str): scorers = check_scoring(self.estimator, self.scoring) + check_fit_and_score_results = False else: scorers = _check_multimetric_scoring(self.estimator, self.scoring) self._check_refit_for_multimetric(scorers) refit_metric = self.refit + check_fit_and_score_results = False X, y, groups = indexable(X, y, groups) fit_params = _check_fit_params(X, fit_params) @@ -715,6 +718,8 @@ def evaluate_candidates(candidate_params): .format(n_splits, len(out) // n_candidates)) + if check_fit_and_score_results: + _check_fit_and_score_results(out, self.error_score) all_candidate_params.extend(candidate_params) all_out.extend(out) @@ -725,16 +730,12 @@ def evaluate_candidates(candidate_params): self._run_search(evaluate_candidates) - for out in all_out: - if not out["fit_failed"]: - successful_score = out['test_scores'] - break - - self.multimetric_ = isinstance(successful_score, dict) + sample_score = all_out[0]['test_scores'] + self.multimetric_ = isinstance(sample_score, dict) # scorer is callable, check refit_metric now if callable(self.scoring) and self.multimetric_: - self._check_refit_for_multimetric(successful_score) + self._check_refit_for_multimetric(sample_score) refit_metric = self.refit # For multi-metric evaluation, store the best_index_, best_params_ and @@ -780,13 +781,7 @@ def evaluate_candidates(candidate_params): def _format_results(self, candidate_params, n_splits, out): n_candidates = len(candidate_params) - agg_out = _aggregate_list_of_dicts(out) - - test_sample_counts = agg_out["n_test_samples"] - fit_time = agg_out["fit_time"] - score_time = agg_out["score_time"] - - score_results = _check_fit_and_score_results(agg_out, self.error_score) + out = _aggregate_list_of_dicts(out) results = {} @@ -814,8 +809,8 @@ def _store(key_name, array, weights=None, splits=False, rank=False): results["rank_%s" % key_name] = np.asarray( rankdata(-array_means, method='min'), dtype=np.int32) - _store('fit_time', fit_time) - _store('score_time', score_time) + _store('fit_time', out["fit_time"]) + _store('score_time', out["score_time"]) # Use one MaskedArray and mask all the places where the param is not # applicable for that candidate. 
Use defaultdict as each candidate may # not contain all the params @@ -835,7 +830,7 @@ def _store(key_name, array, weights=None, splits=False, rank=False): results['params'] = candidate_params # NOTE test_sample counts (weights) remain the same for all candidates - test_sample_counts = np.array(test_sample_counts[:n_splits], + test_sample_counts = np.array(out["n_test_samples"][:n_splits], dtype=np.int) if self.iid != 'deprecated': @@ -847,15 +842,27 @@ def _store(key_name, array, weights=None, splits=False, rank=False): else: iid = False - test_scores = score_results["test_scores"] - for scorer_name in test_scores: + test_scores = out["test_scores"] + if isinstance(test_scores[0], dict): + test_scores_dict = _aggregate_list_of_dicts(test_scores) + else: + test_scores_dict = {"score": test_scores} + + if self.return_train_score: + train_scores = out["train_scores"] + if isinstance(test_scores[0], dict): + train_scores_dict = _aggregate_list_of_dicts(train_scores) + else: + train_scores_dict = {"score": train_scores} + + for scorer_name in test_scores_dict: # Computed the (weighted) mean and std for test scores alone - _store('test_%s' % scorer_name, test_scores[scorer_name], + _store('test_%s' % scorer_name, test_scores_dict[scorer_name], splits=True, rank=True, weights=test_sample_counts if iid else None) if self.return_train_score: _store('train_%s' % scorer_name, - score_results["train_scores"][scorer_name], + train_scores_dict[scorer_name], splits=True) return results diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 06c837dbf603f..c330a3d3acf15 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -225,10 +225,13 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, if callable(scoring): scorers = scoring + check_fit_and_score_results = True elif scoring is None or isinstance(scoring, str): scorers = check_scoring(estimator, scoring) + check_fit_and_score_results = False else: scorers = _check_multimetric_scoring(estimator, scoring) + check_fit_and_score_results = False # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. 
@@ -242,6 +245,8 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, error_score=error_score) for train, test in cv.split(X, y, groups)) + if check_fit_and_score_results: + _check_fit_and_score_results(results, error_score) results = _aggregate_list_of_dicts(results) if return_estimator: @@ -254,13 +259,24 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, if return_estimator: ret['estimator'] = fitted_estimators - score_results = _check_fit_and_score_results(results, error_score) - test_scores = score_results["test_scores"] - for name in test_scores: - ret['test_%s' % name] = test_scores[name] + test_scores = results["test_scores"] + if isinstance(test_scores[0], dict): + test_scores_dict = _aggregate_list_of_dicts(test_scores) + else: + test_scores_dict = {"score": test_scores} + + if return_train_score: + train_scores = results["train_scores"] + if isinstance(test_scores[0], dict): + train_scores_dict = _aggregate_list_of_dicts(train_scores) + else: + train_scores_dict = {"score": train_scores} + + for name in test_scores_dict: + ret['test_%s' % name] = test_scores_dict[name] if return_train_score: key = 'train_%s' % name - ret[key] = score_results["train_scores"][name] + ret[key] = train_scores_dict[name] return ret @@ -270,38 +286,23 @@ def _check_fit_and_score_results(results, error_score): that failed are set to error_score. `results` are the aggregated output of `_fit_and_score`. """ - fit_failed = results["fit_failed"] - test_score_dicts = results["test_scores"] - + successful_score = None failed_indices = [] - for i, failed in enumerate(fit_failed): - if failed: + for i, result in enumerate(results): + if result["fit_failed"]: failed_indices.append(i) - else: - successful_score = test_score_dicts[i] + elif successful_score is None: + successful_score = result["test_scores"] - if len(failed_indices) == len(fit_failed): + if successful_score is None: raise NotFittedError("All estimators failed to fit") - if failed_indices and isinstance(successful_score, dict): + if isinstance(successful_score, dict): + formatted_erorr = {name: error_score for name in successful_score} for i in failed_indices: - test_score_dicts[i] = {name: error_score - for name in successful_score} - - output = {} - # converts single metrics into a list of dictionaries - if not isinstance(successful_score, dict): - test_score_dicts = [{"score": elm} for elm in test_score_dicts] - - output["test_scores"] = _aggregate_list_of_dicts(test_score_dicts) - - if "train_scores" in results: - train_score_dicts = results["train_scores"] - if not isinstance(successful_score, dict): - train_score_dicts = [{"score": elm} for elm in train_score_dicts] - output["train_scores"] = _aggregate_list_of_dicts(train_score_dicts) - - return output + results[i]["test_scores"] = formatted_erorr.copy() + if "train_scores" in results[i]: + results[i]["train_scores"] = formatted_erorr.copy() def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None, From c5f9b42006baa34b1aaf8a4429421e7b6a31498f Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Sun, 24 May 2020 17:06:20 -0400 Subject: [PATCH 15/31] REV Less diffs --- sklearn/metrics/_scorer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 08b97fd3cba8b..fa6f8181aeb3f 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -431,7 +431,10 @@ def _check_multimetric_scoring(estimator, scoring): estimator : sklearn estimator instance The estimator for which the scoring will be applied. - scoring : list/tuple or dict + scoring : string, callable, list/tuple, dict or None, default: None + A single string (see :ref:`scoring_parameter`) or a callable + (see :ref:`scoring`) to evaluate the predictions on the test set. + For evaluating multiple metrics, either give a list of (unique) strings or a dict with names as keys and callables as values. @@ -441,6 +444,7 @@ def _check_multimetric_scoring(estimator, scoring): See :ref:`multimetric_grid_search` for an example. + If None the estimator's score method is used. The return value in that case will be ``{'score': }``. If the estimator's score method is not available, a ``TypeError`` is raised. From 0e79b59d79a5625e123f09196829d5c2cfba5301 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sun, 24 May 2020 17:14:04 -0400 Subject: [PATCH 16/31] CLN Address comments --- sklearn/model_selection/_search.py | 15 +++++---- sklearn/model_selection/_validation.py | 42 +++++++++++--------------- 2 files changed, 26 insertions(+), 31 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 7b2a1999ffd50..fa0c3997e1b6a 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -29,7 +29,7 @@ from ._split import check_cv from ._validation import _fit_and_score from ._validation import _aggregate_list_of_dicts -from ._validation import _check_fit_and_score_results +from ._validation import _handle_error_score from ..exceptions import NotFittedError from joblib import Parallel, delayed from ..utils import check_random_state @@ -678,17 +678,20 @@ def fit(self, X, y=None, *, groups=None, **fit_params): cv = check_cv(self.cv, y, classifier=is_classifier(estimator)) refit_metric = "score" + + # If scoring is callable, then error scores must be handled after + # scoring is called. 
if callable(self.scoring): scorers = self.scoring - check_fit_and_score_results = True + should_handle_error_scores = True elif self.scoring is None or isinstance(self.scoring, str): scorers = check_scoring(self.estimator, self.scoring) - check_fit_and_score_results = False + should_handle_error_scores = False else: scorers = _check_multimetric_scoring(self.estimator, self.scoring) self._check_refit_for_multimetric(scorers) refit_metric = self.refit - check_fit_and_score_results = False + should_handle_error_scores = False X, y, groups = indexable(X, y, groups) fit_params = _check_fit_params(X, fit_params) @@ -742,8 +745,8 @@ def evaluate_candidates(candidate_params): .format(n_splits, len(out) // n_candidates)) - if check_fit_and_score_results: - _check_fit_and_score_results(out, self.error_score) + if should_handle_error_scores: + _handle_error_score(out, self.error_score) all_candidate_params.extend(candidate_params) all_out.extend(out) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index ef357f60d1321..b527126788d38 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -234,15 +234,17 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, cv = check_cv(cv, y, classifier=is_classifier(estimator)) + # If scoring is callable, then error scores must be handled after + # scoring is called. if callable(scoring): scorers = scoring - check_fit_and_score_results = True + should_handle_error_scores = True elif scoring is None or isinstance(scoring, str): scorers = check_scoring(estimator, scoring) - check_fit_and_score_results = False + should_handle_error_scores = False else: scorers = _check_multimetric_scoring(estimator, scoring) - check_fit_and_score_results = False + should_handle_error_scores = False # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. @@ -256,8 +258,8 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, error_score=error_score) for train, test in cv.split(X, y, groups)) - if check_fit_and_score_results: - _check_fit_and_score_results(results, error_score) + if should_handle_error_scores: + _handle_error_score(results, error_score) results = _aggregate_list_of_dicts(results) if return_estimator: @@ -292,28 +294,25 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, return ret -def _check_fit_and_score_results(results, error_score): - """Aggregate scores in results into a single dictionary of scores. Results - that failed are set to error_score. `results` are the aggregated output - of `_fit_and_score`. 
- """ - successful_score = None +def _handle_error_score(results, error_score): + """Handle error in results by replacing them with `error_score`.""" + score_names = None failed_indices = [] for i, result in enumerate(results): if result["fit_failed"]: failed_indices.append(i) - elif successful_score is None: - successful_score = result["test_scores"] + elif score_names is None: + score_names = result["test_scores"].keys() - if successful_score is None: + if score_names is None: raise NotFittedError("All estimators failed to fit") - if isinstance(successful_score, dict): - formatted_erorr = {name: error_score for name in successful_score} + if score_names: + formatted_error = {name: error_score for name in score_names} for i in failed_indices: - results[i]["test_scores"] = formatted_erorr.copy() + results[i]["test_scores"] = formatted_error.copy() if "train_scores" in results[i]: - results[i]["train_scores"] = formatted_erorr.copy() + results[i]["train_scores"] = formatted_error.copy() @_deprecate_positional_args @@ -522,25 +521,18 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, train_scores : dict of scorer name -> float Score on training set (for all the scorers), returned only if `return_train_score` is `True`. - test_scores : dict of scorer name -> float Score on testing set (for all the scorers). - n_test_samples : int Number of test samples. - fit_time : float Time spent for fitting in seconds. - score_time : float Time spent for scoring in seconds. - parameters : dict or None The parameters that have been evaluated. - estimator : estimator object The fitted estimator - fit_failed : bool The estimator failed to fit. """ From 49e8c0399f7995123646bd8dbc7b62df3ef4bd88 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sun, 24 May 2020 17:17:36 -0400 Subject: [PATCH 17/31] REV --- sklearn/metrics/_scorer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index fa6f8181aeb3f..2cf1e8f231901 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -423,7 +423,7 @@ def check_scoring(estimator, scoring=None, *, allow_none=False): " None. %r was passed" % scoring) -def _check_multimetric_scoring(estimator, scoring): +def _check_multimetric_scoring(estimator, scoring=None): """Check the scoring parameter in cases when multiple metrics are allowed Parameters From 4f6ecd7c6bce136486b866f12404f4147b152901 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sun, 24 May 2020 17:27:06 -0400 Subject: [PATCH 18/31] STY Flake --- sklearn/model_selection/_search.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index fa0c3997e1b6a..0f3793526265e 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -856,10 +856,6 @@ def _store(key_name, array, weights=None, splits=False, rank=False): # Store a list of param dicts at the key 'params' results['params'] = candidate_params - # NOTE test_sample counts (weights) remain the same for all candidates - test_sample_counts = np.array(out["n_test_samples"][:n_splits], - dtype=np.int) - test_scores = out["test_scores"] if isinstance(test_scores[0], dict): test_scores_dict = _aggregate_list_of_dicts(test_scores) From 4fa5eb6099517dd858e3ae8abd7bcd3a531aaed8 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Sun, 24 May 2020 18:13:53 -0400 Subject: [PATCH 19/31] ENH Fix error --- sklearn/model_selection/_validation.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index b527126788d38..ef10e80ffbe2b 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -296,19 +296,19 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, def _handle_error_score(results, error_score): """Handle error in results by replacing them with `error_score`.""" - score_names = None + successful_score = None failed_indices = [] for i, result in enumerate(results): if result["fit_failed"]: failed_indices.append(i) - elif score_names is None: - score_names = result["test_scores"].keys() + elif successful_score is None: + successful_score = result["test_scores"] - if score_names is None: + if successful_score is None: raise NotFittedError("All estimators failed to fit") - if score_names: - formatted_error = {name: error_score for name in score_names} + if isinstance(successful_score, dict): + formatted_error = {name: error_score for name in successful_score} for i in failed_indices: results[i]["test_scores"] = formatted_error.copy() if "train_scores" in results[i]: From 97b1db2262586c6f7b9075401b25776156d94d36 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sun, 24 May 2020 22:05:56 -0400 Subject: [PATCH 20/31] REV Less diffs --- sklearn/model_selection/_search.py | 8 +++--- sklearn/model_selection/_validation.py | 34 ++++++++++++-------------- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 0f3793526265e..0ffb5021c4294 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -28,7 +28,7 @@ from ..base import MetaEstimatorMixin from ._split import check_cv from ._validation import _fit_and_score -from ._validation import _aggregate_list_of_dicts +from ._validation import _aggregate_score_dicts from ._validation import _handle_error_score from ..exceptions import NotFittedError from joblib import Parallel, delayed @@ -808,7 +808,7 @@ def evaluate_candidates(candidate_params): def _format_results(self, candidate_params, n_splits, out): n_candidates = len(candidate_params) - out = _aggregate_list_of_dicts(out) + out = _aggregate_score_dicts(out) results = {} @@ -858,14 +858,14 @@ def _store(key_name, array, weights=None, splits=False, rank=False): test_scores = out["test_scores"] if isinstance(test_scores[0], dict): - test_scores_dict = _aggregate_list_of_dicts(test_scores) + test_scores_dict = _aggregate_score_dicts(test_scores) else: test_scores_dict = {"score": test_scores} if self.return_train_score: train_scores = out["train_scores"] if isinstance(test_scores[0], dict): - train_scores_dict = _aggregate_list_of_dicts(train_scores) + train_scores_dict = _aggregate_score_dicts(train_scores) else: train_scores_dict = {"score": train_scores} diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index ef10e80ffbe2b..d2c404e94d536 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -260,7 +260,7 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, if should_handle_error_scores: _handle_error_score(results, error_score) - results = _aggregate_list_of_dicts(results) + results = 
_aggregate_score_dicts(results) if return_estimator: fitted_estimators = results["estimator"] @@ -274,14 +274,14 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, test_scores = results["test_scores"] if isinstance(test_scores[0], dict): - test_scores_dict = _aggregate_list_of_dicts(test_scores) + test_scores_dict = _aggregate_score_dicts(test_scores) else: test_scores_dict = {"score": test_scores} if return_train_score: train_scores = results["train_scores"] if isinstance(test_scores[0], dict): - train_scores_dict = _aggregate_list_of_dicts(train_scores) + train_scores_dict = _aggregate_score_dicts(train_scores) else: train_scores_dict = {"score": train_scores} @@ -1327,7 +1327,7 @@ def learning_curve(estimator, X, y, *, groups=None, parameters=None, fit_params=None, return_train_score=True, error_score=error_score, return_times=return_times) for train, test in train_test_proportions) - results = _aggregate_list_of_dicts(results) + results = _aggregate_score_dicts(results) train_scores = results["train_scores"].reshape(-1, n_unique_ticks).T test_scores = results["test_scores"].reshape(-1, n_unique_ticks).T out = [train_scores, test_scores] @@ -1551,15 +1551,15 @@ def validation_curve(estimator, X, y, *, param_name, param_range, groups=None, for train, test in cv.split(X, y, groups) for v in param_range) n_params = len(param_range) - results = _aggregate_list_of_dicts(results) + results = _aggregate_score_dicts(results) train_scores = results["train_scores"].reshape(-1, n_params).T test_scores = results["test_scores"].reshape(-1, n_params).T return train_scores, test_scores -def _aggregate_list_of_dicts(elements): - """Aggregate the list of dicts +def _aggregate_score_dicts(scores): + """Aggregate the list of dict to dict of np ndarray The aggregated output of _fit_and_score will be a list of dict of form [{'prec': 0.1, 'acc':1.0}, {'prec': 0.1, 'acc':1.0}, ...] @@ -1568,20 +1568,18 @@ def _aggregate_list_of_dicts(elements): Parameters ---------- - elements : list of dict - List of dicts of the elements for all scorers. This is a flat list, + scores : list of dict + List of dicts of the scores for all scorers. This is a flat list, assumed originally to be of row major order. Example ------- - >>> elements = [{'a': 1, 'b': 10}, {'a': 2, 'b': 2}, {'a': 3, 'b': 3}, - ... {'a': 10, 'b': 10}] - >>> output = _aggregate_list_of_dicts(elements) - >>> output['a'] - array([ 1, 2, 3, 10]) - >>> output['b'] - array([10, 2, 3, 10]) + >>> scores = [{'a': 1, 'b':10}, {'a': 2, 'b':2}, {'a': 3, 'b':3}, + ... {'a': 10, 'b': 10}] # doctest: +SKIP + >>> _aggregate_score_dicts(scores) # doctest: +SKIP + {'a': array([1, 2, 3, 10]), + 'b': array([10, 2, 3, 10])} """ - return {key: np.asarray([elm[key] for elm in elements]) - for key in elements[0]} + return {key: np.asarray([score[key] for score in scores]) + for key in scores[0]} From 286bb86d6e2587ff34ddc959a30eaec879a8ba1c Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 25 May 2020 00:01:44 -0400 Subject: [PATCH 21/31] DOC Adds comments --- sklearn/model_selection/_search.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 0ffb5021c4294..34068b4645eee 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -757,6 +757,8 @@ def evaluate_candidates(candidate_params): self._run_search(evaluate_candidates) + # multimetric is determined here based on test_scores. 
This is + # to support callable self.scoring sample_score = all_out[0]['test_scores'] self.multimetric_ = isinstance(sample_score, dict) From 5da1571921fa2f9df1a4dfb17ebd9c68e80527e3 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 8 Jul 2020 15:58:18 -0400 Subject: [PATCH 22/31] CLN Removes some state --- sklearn/model_selection/_search.py | 40 +++++++++--------------- sklearn/model_selection/_validation.py | 42 ++++++++++++++------------ 2 files changed, 37 insertions(+), 45 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 8a113d9432188..0eeabbd25a1ef 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -29,7 +29,8 @@ from ._split import check_cv from ._validation import _fit_and_score from ._validation import _aggregate_score_dicts -from ._validation import _handle_error_score +from ._validation import _insert_error_scores +from ._validation import _normalize_score_results from ..exceptions import NotFittedError from joblib import Parallel, delayed from ..utils import check_random_state @@ -700,19 +701,14 @@ def fit(self, X, y=None, *, groups=None, **fit_params): refit_metric = "score" - # If scoring is callable, then error scores must be handled after - # scoring is called. if callable(self.scoring): scorers = self.scoring - should_handle_error_scores = True elif self.scoring is None or isinstance(self.scoring, str): scorers = check_scoring(self.estimator, self.scoring) - should_handle_error_scores = False else: scorers = _check_multimetric_scoring(self.estimator, self.scoring) self._check_refit_for_multimetric(scorers) refit_metric = self.refit - should_handle_error_scores = False X, y, groups = indexable(X, y, groups) fit_params = _check_fit_params(X, fit_params) @@ -773,8 +769,11 @@ def evaluate_candidates(candidate_params): .format(n_splits, len(out) // n_candidates)) - if should_handle_error_scores: - _handle_error_score(out, self.error_score) + # For callabe self.scoring, the return type is only know after + # calling. If the return type is a dictionary, the error scores + # can now be inserted with the correct key. + if callable(self.scoring): + _insert_error_scores(out, self.error_score) all_candidate_params.extend(candidate_params) all_out.extend(out) @@ -785,14 +784,14 @@ def evaluate_candidates(candidate_params): self._run_search(evaluate_candidates) - # multimetric is determined here based on test_scores. 
This is - # to support callable self.scoring - sample_score = all_out[0]['test_scores'] - self.multimetric_ = isinstance(sample_score, dict) + # multimetric is determined here because in the case of a callable + # self.scoring the return type is only known after calling + first_test_score = all_out[0]['test_scores'] + self.multimetric_ = isinstance(first_test_score, dict) - # scorer is callable, check refit_metric now + # check refit_metric now for a callabe scorer that is multimetric if callable(self.scoring) and self.multimetric_: - self._check_refit_for_multimetric(sample_score) + self._check_refit_for_multimetric(first_test_score) refit_metric = self.refit # For multi-metric evaluation, store the best_index_, best_params_ and @@ -886,18 +885,9 @@ def _store(key_name, array, weights=None, splits=False, rank=False): # Store a list of param dicts at the key 'params' results['params'] = candidate_params - test_scores = out["test_scores"] - if isinstance(test_scores[0], dict): - test_scores_dict = _aggregate_score_dicts(test_scores) - else: - test_scores_dict = {"score": test_scores} - + test_scores_dict = _normalize_score_results(out["test_scores"]) if self.return_train_score: - train_scores = out["train_scores"] - if isinstance(test_scores[0], dict): - train_scores_dict = _aggregate_score_dicts(train_scores) - else: - train_scores_dict = {"score": train_scores} + train_scores_dict = _normalize_score_results(out["train_scores"]) for scorer_name in test_scores_dict: # Computed the (weighted) mean and std for test scores alone diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 287365bf864ed..e75dfbe532976 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -234,17 +234,12 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, cv = check_cv(cv, y, classifier=is_classifier(estimator)) - # If scoring is callable, then error scores must be handled after - # scoring is called. if callable(scoring): scorers = scoring - should_handle_error_scores = True elif scoring is None or isinstance(scoring, str): scorers = check_scoring(estimator, scoring) - should_handle_error_scores = False else: scorers = _check_multimetric_scoring(estimator, scoring) - should_handle_error_scores = False # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. @@ -258,8 +253,12 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, error_score=error_score) for train, test in cv.split(X, y, groups)) - if should_handle_error_scores: - _handle_error_score(results, error_score) + # For callabe scoring, the return type is only know after calling. If the + # return type is a dictionary, the error scores can now be inserted with + # the correct key. 
+ if callable(scoring): + _insert_error_scores(results, error_score) + results = _aggregate_score_dicts(results) if return_estimator: @@ -272,18 +271,9 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, if return_estimator: ret['estimator'] = fitted_estimators - test_scores = results["test_scores"] - if isinstance(test_scores[0], dict): - test_scores_dict = _aggregate_score_dicts(test_scores) - else: - test_scores_dict = {"score": test_scores} - + test_scores_dict = _normalize_score_results(results["test_scores"]) if return_train_score: - train_scores = results["train_scores"] - if isinstance(test_scores[0], dict): - train_scores_dict = _aggregate_score_dicts(train_scores) - else: - train_scores_dict = {"score": train_scores} + train_scores_dict = _normalize_score_results(results["train_scores"]) for name in test_scores_dict: ret['test_%s' % name] = test_scores_dict[name] @@ -294,8 +284,11 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, return ret -def _handle_error_score(results, error_score): - """Handle error in results by replacing them with `error_score`.""" +def _insert_error_scores(results, error_score): + """Insert error in results by replacing them with `error_score`. + + This only applies to dictionaries scores because `_fit_and_score` will + handle the single metric case.""" successful_score = None failed_indices = [] for i, result in enumerate(results): @@ -315,6 +308,15 @@ def _handle_error_score(results, error_score): results[i]["train_scores"] = formatted_error.copy() +def _normalize_score_results(scores, scaler_score_key='score'): + """Creates a scoring dictionary based on the type of `scores`""" + if isinstance(scores[0], dict): + # multimetric scoring + return _aggregate_score_dicts(scores) + # scaler + return {scaler_score_key: scores} + + @_deprecate_positional_args def cross_val_score(estimator, X, y=None, *, groups=None, scoring=None, cv=None, n_jobs=None, verbose=0, fit_params=None, From e541de34c739bd74941c05e2332b46c4c2a13f10 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 9 Jul 2020 10:17:21 -0400 Subject: [PATCH 23/31] CLN Address comments --- sklearn/metrics/_scorer.py | 13 ++----------- sklearn/model_selection/_validation.py | 12 ++++++------ 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 16455c7dd8c50..89bf37ffe0711 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -423,7 +423,7 @@ def check_scoring(estimator, scoring=None, *, allow_none=False): " None. %r was passed" % scoring) -def _check_multimetric_scoring(estimator, scoring=None): +def _check_multimetric_scoring(estimator, scoring): """Check the scoring parameter in cases when multiple metrics are allowed Parameters @@ -431,24 +431,15 @@ def _check_multimetric_scoring(estimator, scoring=None): estimator : sklearn estimator instance The estimator for which the scoring will be applied. - scoring : str, callable, list, tuple or dict, default=None + scoring : list, tuple or dict A single string (see :ref:`scoring_parameter`) or a callable (see :ref:`scoring`) to evaluate the predictions on the test set. For evaluating multiple metrics, either give a list of (unique) strings or a dict with names as keys and callables as values. - NOTE that when using custom scorers, each scorer should return a single - value. Metric functions returning a list/array of values can be wrapped - into multiple scorers that return one value each. 
- See :ref:`multimetric_grid_search` for an example. - If None the estimator's score method is used. - The return value in that case will be ``{'score': }``. - If the estimator's score method is not available, a ``TypeError`` - is raised. - Returns ------- scorers_dict : dict diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index e75dfbe532976..b31bdfeb1c714 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -289,19 +289,19 @@ def _insert_error_scores(results, error_score): This only applies to dictionaries scores because `_fit_and_score` will handle the single metric case.""" - successful_score = None + score_names = None failed_indices = [] for i, result in enumerate(results): if result["fit_failed"]: failed_indices.append(i) - elif successful_score is None: - successful_score = result["test_scores"] + elif score_names is None: + score_names = result["test_scores"].keys() - if successful_score is None: + if score_names is None: raise NotFittedError("All estimators failed to fit") - if isinstance(successful_score, dict): - formatted_error = {name: error_score for name in successful_score} + if isinstance(score_names, dict): + formatted_error = {name: error_score for name in score_names} for i in failed_indices: results[i]["test_scores"] = formatted_error.copy() if "train_scores" in results[i]: From 657ef893a7ac2d7f42025cecaee7925b39652a85 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 9 Jul 2020 11:08:30 -0400 Subject: [PATCH 24/31] BUG Fix score --- sklearn/model_selection/_validation.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index ac30159ed3b5c..eac1082a97e4f 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -284,21 +284,21 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, def _insert_error_scores(results, error_score): """Insert error in results by replacing them with `error_score`. - This only applies to dictionaries scores because `_fit_and_score` will + This only applies to multimetric scores because `_fit_and_score` will handle the single metric case.""" - score_names = None + successful_score = None failed_indices = [] for i, result in enumerate(results): if result["fit_failed"]: failed_indices.append(i) - elif score_names is None: - score_names = result["test_scores"].keys() + elif successful_score is None: + successful_score = result["test_scores"] - if score_names is None: + if successful_score is None: raise NotFittedError("All estimators failed to fit") - if isinstance(score_names, dict): - formatted_error = {name: error_score for name in score_names} + if isinstance(successful_score, dict): + formatted_error = {name: error_score for name in successful_score} for i in failed_indices: results[i]["test_scores"] = formatted_error.copy() if "train_scores" in results[i]: From b0cdc570eb103294c66fcbe660411f3fbdce3720 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Thu, 9 Jul 2020 13:36:36 -0400 Subject: [PATCH 25/31] CLN Adds to glossary --- doc/glossary.rst | 8 ++++---- sklearn/metrics/_scorer.py | 10 ++++------ sklearn/model_selection/tests/test_validation.py | 4 ++-- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/doc/glossary.rst b/doc/glossary.rst index 86cb3c06f5634..42e746c38b9ec 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -1583,10 +1583,10 @@ functions or non-estimator constructors. in the User Guide. Where multiple metrics can be evaluated, ``scoring`` may be given - either as a list of unique strings or a dictionary with names as keys - and callables as values. Note that this does *not* specify which score - function is to be maximized, and another parameter such as ``refit`` - maybe used for this purpose. + either as a list of unique strings, a dictionary with names as keys and + callables as values or a callable that returns a dictionary. Note that + this does *not* specify which score function is to be maximized, and + another parameter such as ``refit`` maybe used for this purpose. The ``scoring`` parameter is validated and interpreted using diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 89bf37ffe0711..0852955c72cad 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -445,12 +445,10 @@ def _check_multimetric_scoring(estimator, scoring): scorers_dict : dict A dict mapping each scorer name to its validated scorer. """ - err_msg_generic = ("scoring should either be a single string or " - "callable or a " - "list/tuple of strings or a dict of scorer name " - "mapped to the callable for multiple metric " - "evaluation. Got %s of type %s" - % (repr(scoring), type(scoring))) + err_msg_generic = ( + f"scoring is invalid (got {scoring!r}). Refer to the " + "scoring glossary for details: " + "https://scikit-learn.org/stable/glossary.html#term-scoring ") if isinstance(scoring, (list, tuple, set)): err_msg = ("The list/tuple elements must be unique " diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 8c42354e87344..6e1faa1088075 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -318,8 +318,8 @@ def test_cross_validate_invalid_scoring_param(): cross_validate, estimator, X, y, scoring=[[make_scorer(precision_score)]]) - error_message_regexp = (".*should either be.*string or callable.*" - ".*.*dict.*for multi.*") + error_message_regexp = (".*scoring is invalid.*Refer to the scoring " + "glossary for details:.*") # Empty dict should raise invalid scoring error assert_raises_regex(ValueError, "An empty dict", From 714372f66f59035e86ba5435c7c7e3da46d0c386 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 9 Jul 2020 15:16:13 -0400 Subject: [PATCH 26/31] CLN Uses f-strings --- sklearn/metrics/_scorer.py | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 0852955c72cad..b824b9b0cbcb8 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -448,7 +448,7 @@ def _check_multimetric_scoring(estimator, scoring): err_msg_generic = ( f"scoring is invalid (got {scoring!r}). 
Refer to the " "scoring glossary for details: " - "https://scikit-learn.org/stable/glossary.html#term-scoring ") + "https://scikit-learn.org/stable/glossary.html#term-scoring") if isinstance(scoring, (list, tuple, set)): err_msg = ("The list/tuple elements must be unique " @@ -462,34 +462,30 @@ def _check_multimetric_scoring(estimator, scoring): raise ValueError(err_msg) if len(keys) != len(scoring): - raise ValueError(err_msg + "Duplicate elements were found in" - " the given list. %r" % repr(scoring)) + raise ValueError(f"{err_msg} Duplicate elements were found in" + f" the given list. {scoring!r}") elif len(keys) > 0: if not all(isinstance(k, str) for k in keys): if any(callable(k) for k in keys): - raise ValueError(err_msg + - "One or more of the elements were " - "callables. Use a dict of score name " - "mapped to the scorer callable. " - "Got %r" % repr(scoring)) + raise ValueError(f"{err_msg} One or more of the elements " + "were callables. Use a dict of score " + "name mapped to the scorer callable. " + f"Got {scoring!r}") else: - raise ValueError(err_msg + - "Non-string types were found in " - "the given list. Got %r" - % repr(scoring)) + raise ValueError(f"{err_msg} Non-string types were found " + f"in the given list. Got {scoring!r}") scorers = {scorer: check_scoring(estimator, scoring=scorer) for scorer in scoring} else: - raise ValueError(err_msg + - "Empty list was given. %r" % repr(scoring)) + raise ValueError(f"{err_msg} Empty list was given. {scoring!r}") elif isinstance(scoring, dict): keys = set(scoring) if not all(isinstance(k, str) for k in keys): raise ValueError("Non-string types were found in the keys of " - "the given dict. scoring=%r" % repr(scoring)) + f"the given dict. scoring={scoring!r}") if len(keys) == 0: - raise ValueError("An empty dict was passed. %r" % repr(scoring)) + raise ValueError(f"An empty dict was passed. 
{scoring!r}") scorers = {key: check_scoring(estimator, scoring=scorer) for key, scorer in scoring.items()} else: From 346f8e3fdbeaa0893f426f8c81126a63b1ec7b64 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 10 Jul 2020 20:14:57 +0200 Subject: [PATCH 27/31] ENH create a generator of applicable metrics depending on the target y --- sklearn/metrics/__init__.py | 2 + sklearn/metrics/_scorer.py | 137 +++++++++++++++++++- sklearn/metrics/tests/test_score_objects.py | 58 ++++++++- 3 files changed, 195 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index be28005631963..36a54b88a50cc 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -74,6 +74,7 @@ from ._scorer import check_scoring from ._scorer import make_scorer from ._scorer import SCORERS +from ._scorer import get_applicable_scorers from ._scorer import get_scorer from ._plot.roc_curve import plot_roc_curve @@ -109,6 +110,7 @@ 'f1_score', 'fbeta_score', 'fowlkes_mallows_score', + 'get_applicable_scorers', 'get_scorer', 'hamming_loss', 'hinge_loss', diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index b824b9b0cbcb8..a52a3b88edaa0 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -18,9 +18,12 @@ # Arnaud Joly # License: Simplified BSD +from collections import Counter +from collections import namedtuple from collections.abc import Iterable +from copy import deepcopy +from inspect import signature from functools import partial -from collections import Counter import numpy as np @@ -686,3 +689,135 @@ def make_scorer(score_func, *, greater_is_better=True, needs_proba=False, qualified_name = '{0}_{1}'.format(name, average) SCORERS[qualified_name] = make_scorer(metric, pos_label=None, average=average) + +ScorerProperty = namedtuple( + "ScorerProperty", ["scorer", "target_type_supported"], +) + +SCORERS_PROPERTY = dict( + explained_variance=ScorerProperty( + scorer=explained_variance_scorer, + target_type_supported=("continuous", "continuous-multioutput"), + ), + r2=ScorerProperty( + scorer=r2_scorer, + target_type_supported=("continuous", "continuous-multioutput"), + ), + max_error=ScorerProperty( + scorer=max_error_scorer, + target_type_supported=("continuous",), + ), + neg_median_absolute_error=ScorerProperty( + scorer=neg_median_absolute_error_scorer, + target_type_supported=("continuous", "continuous-multioutput"), + ), + neg_mean_absolute_error=ScorerProperty( + scorer=neg_mean_absolute_error_scorer, + target_type_supported=("continuous", "continuous-multioutput"), + ), + neg_mean_absolute_percentage_error=ScorerProperty( + scorer=neg_mean_absolute_percentage_error_scorer, + target_type_supported=("continuous", "continuous-multioutput"), + ), + neg_mean_squared_error=ScorerProperty( + scorer=neg_mean_squared_error_scorer, + target_type_supported=("continuous", "continuous-multioutput"), + ), + neg_mean_squared_log_error=ScorerProperty( + scorer=neg_mean_squared_log_error_scorer, + target_type_supported=("continuous", "continuous-multioutput"), + ), + neg_root_mean_squared_error=ScorerProperty( + scorer=neg_root_mean_squared_error_scorer, + target_type_supported=("continuous", "continuous-multioutput"), + ), + neg_mean_poisson_deviance=ScorerProperty( + scorer=neg_mean_poisson_deviance_scorer, + target_type_supported=("continuous",), + ), + neg_mean_gamma_deviance=ScorerProperty( + scorer=neg_mean_gamma_deviance_scorer, + target_type_supported=("continuous",), + ), + accuracy=ScorerProperty( + 
scorer=accuracy_scorer, + target_type_supported=("binary", "multiclass", "multilabel-indicator"), + ), + roc_auc=ScorerProperty( + scorer=roc_auc_scorer, + target_type_supported=("binary", "multiclass", "multilabel-indicator"), + ), + balanced_accuracy=ScorerProperty( + scorer=balanced_accuracy_scorer, + target_type_supported=("binary", "multiclass"), + ), + precision=ScorerProperty( + scorer=make_scorer(precision_score), + target_type_supported=("binary", "multilabel-indicator"), + ), + recall=ScorerProperty( + scorer=make_scorer(recall_score), + target_type_supported=("binary", "multilabel-indicator"), + ), + f1=ScorerProperty( + scorer=make_scorer(f1_score), + target_type_supported=("binary", "multilabel-indicator"), + ), + jaccard=ScorerProperty( + scorer=make_scorer(jaccard_score), + target_type_supported=("binary", "multilabel-indicator"), + ), + average_precision=ScorerProperty( + scorer=average_precision_scorer, + target_type_supported=("binary", "multilabel-indicator"), + ), + neg_log_loss=ScorerProperty( + scorer=neg_log_loss_scorer, + target_type_supported=("binary", "multiclass", "multilabel-indicator"), + ), + neg_brier_score=ScorerProperty( + scorer=neg_brier_score_scorer, + target_type_supported=("binary", "multiclass"), + ), +) + + +def get_applicable_scorers(y, **scorers_params): + """Utility providing scorers to be used on `y`. + + This utility creates a dictionary containing the scorers which can be used + on `y`. The dictionary returned can be used directly in a + :class:`~sklearn.model_selection.GridSearchCV`. + + Additional parameters taken by the different metrics can be passed as + keyword argument. + + Parameters + ---------- + y : array-like + The target used to infer the metrics which can be used. + + **scorers_params + Additional parameters to be passed to the scorers when present in their + signature. + + Returns + ------- + scorers : dict + A dictionary containing the scorer name as key and a scorer callable as + value. 
+ """ + target_type = type_of_target(y) + + scorers = {} + for scorer_name, scorer_property in SCORERS_PROPERTY.items(): + if target_type in scorer_property.target_type_supported: + scorers[scorer_name] = deepcopy(scorer_property.scorer) + scorer_sig = signature(scorers[scorer_name]._score_func) + for param_name, param_value in scorers_params.items(): + if param_name in scorer_sig.parameters: + scorers[scorer_name]._kwargs[param_name] = param_value + + if not scorers: + raise ValueError("No compatible scorer with the target 'y' was found.") + return scorers diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 484edd3e751ca..f105db0af9a4f 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -25,7 +25,12 @@ _MultimetricScorer, _check_multimetric_scoring) from sklearn.metrics import accuracy_score -from sklearn.metrics import make_scorer, get_scorer, SCORERS +from sklearn.metrics import ( + get_applicable_scorers, + get_scorer, + make_scorer, + SCORERS +) from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import LinearSVC from sklearn.pipeline import make_pipeline @@ -729,3 +734,54 @@ def test_multiclass_roc_no_proba_scorer_errors(scorer_name): msg = "'Perceptron' object has no attribute 'predict_proba'" with pytest.raises(AttributeError, match=msg): scorer(lr, X, y) + + +@pytest.mark.parametrize( + "Estimator, X, y", + [(LogisticRegression, *make_classification(n_classes=2)), + (LogisticRegression, *make_classification( + n_classes=3, n_clusters_per_class=1 + )), + (LogisticRegression, *make_multilabel_classification()), + (Ridge, *make_regression(n_targets=1)), + (Ridge, *make_regression(n_targets=2))], + ids=[ + "binary-classification", + "multiclass-classification", + "multilabel-classification", + "regression", + "multioutput-regression", + ] +) +def _generate_scorer(Estimator, X, y): + # smoke test to check that we can compute the score on the expected + # dataset + scorers = get_applicable_scorers(y) + estimator = Estimator().fit(X, y) + for scorer_name in scorers: + yield estimator, X, y, scorers[scorer_name] + + +def _parametrize_scorers_from_target(estimator_data_ids): + check_scorers, check_scorers_ids = zip(*[ + ((Estimator, X, y, scorer), f"{scorer_name}-{problem_id}") + for problem_id, Estimator, X, y in estimator_data_ids + for scorer_name, scorer in get_applicable_scorers(y).items() + ]) + + return pytest.mark.parametrize( + "Estimator, X, y, scorer", check_scorers, ids=check_scorers_ids, + ) + + +@_parametrize_scorers_from_target( + [("binary", LogisticRegression, *make_classification(n_classes=2)), + ("multiclass", LogisticRegression, + *make_classification(n_classes=3, n_clusters_per_class=1)), + ("multilabel", LogisticRegression, *make_multilabel_classification()), + ("continuous", Ridge, *make_regression(n_targets=1)), + ("continuous-multioutput", Ridge, *make_regression(n_targets=2))] +) +def test_get_applicable_scorers_smoke_test(Estimator, X, y, scorer): + estimator = Estimator().fit(X, y) + scorer(estimator, X, y) From 6e8be2add35636b514f76e8af1fa763d26229b5f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 10 Jul 2020 23:53:26 +0200 Subject: [PATCH 28/31] iter --- sklearn/metrics/_scorer.py | 47 ++++++++++++++------- sklearn/metrics/tests/test_score_objects.py | 7 +-- 2 files changed, 36 insertions(+), 18 deletions(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index a52a3b88edaa0..d98f1c57639d9 100644 
--- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -745,23 +745,27 @@ def make_scorer(score_func, *, greater_is_better=True, needs_proba=False, ), roc_auc=ScorerProperty( scorer=roc_auc_scorer, - target_type_supported=("binary", "multiclass", "multilabel-indicator"), + target_type_supported=("binary", "multilabel-indicator"), ), - balanced_accuracy=ScorerProperty( - scorer=balanced_accuracy_scorer, - target_type_supported=("binary", "multiclass"), + roc_auc_ovr=ScorerProperty( + scorer=roc_auc_ovr_scorer, + target_type_supported=("multiclass"), ), - precision=ScorerProperty( - scorer=make_scorer(precision_score), - target_type_supported=("binary", "multilabel-indicator"), + roc_auc_ovo=ScorerProperty( + scorer=roc_auc_ovo_scorer, + target_type_supported=("multiclass"), ), - recall=ScorerProperty( - scorer=make_scorer(recall_score), - target_type_supported=("binary", "multilabel-indicator"), + roc_auc_ovr_weighted=ScorerProperty( + scorer=roc_auc_ovr_weighted_scorer, + target_type_supported=("multiclass"), ), - f1=ScorerProperty( - scorer=make_scorer(f1_score), - target_type_supported=("binary", "multilabel-indicator"), + roc_auc_ovo_weighted=ScorerProperty( + scorer=roc_auc_ovo_weighted_scorer, + target_type_supported=("multiclass"), + ), + balanced_accuracy=ScorerProperty( + scorer=balanced_accuracy_scorer, + target_type_supported=("binary", "multiclass"), ), jaccard=ScorerProperty( scorer=make_scorer(jaccard_score), @@ -773,14 +777,27 @@ def make_scorer(score_func, *, greater_is_better=True, needs_proba=False, ), neg_log_loss=ScorerProperty( scorer=neg_log_loss_scorer, - target_type_supported=("binary", "multiclass", "multilabel-indicator"), + target_type_supported=("binary", "multiclass"), ), neg_brier_score=ScorerProperty( scorer=neg_brier_score_scorer, - target_type_supported=("binary", "multiclass"), + target_type_supported=("binary"), ), ) +for name, metric in [('precision', precision_score), + ('recall', recall_score), ('f1', f1_score), + ('jaccard', jaccard_score)]: + SCORERS_PROPERTY[name] = ScorerProperty( + scorer=make_scorer(metric, average='binary'), + target_type_supported=("binary",), + ) + for average in ['macro', 'micro', 'samples', 'weighted']: + qualified_name = f'{name}_{average}' + SCORERS_PROPERTY[qualified_name] = ScorerProperty( + scorer=make_scorer(metric, pos_label=None, average=average), + target_type_supported=("multilabel-indicator"), + ) def get_applicable_scorers(y, **scorers_params): """Utility providing scorers to be used on `y`. 
diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index f105db0af9a4f..c082ecf7f3aba 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -764,7 +764,8 @@ def _generate_scorer(Estimator, X, y): def _parametrize_scorers_from_target(estimator_data_ids): check_scorers, check_scorers_ids = zip(*[ - ((Estimator, X, y, scorer), f"{scorer_name}-{problem_id}") + ((Estimator, X, np.abs(y) - np.min(y), scorer), + f"{scorer_name}-{problem_id}") for problem_id, Estimator, X, y in estimator_data_ids for scorer_name, scorer in get_applicable_scorers(y).items() ]) @@ -778,9 +779,9 @@ def _parametrize_scorers_from_target(estimator_data_ids): [("binary", LogisticRegression, *make_classification(n_classes=2)), ("multiclass", LogisticRegression, *make_classification(n_classes=3, n_clusters_per_class=1)), - ("multilabel", LogisticRegression, *make_multilabel_classification()), + ("multilabel", DecisionTreeClassifier, *make_multilabel_classification()), ("continuous", Ridge, *make_regression(n_targets=1)), - ("continuous-multioutput", Ridge, *make_regression(n_targets=2))] + ("continuous-multioutput", Ridge, *make_regression(n_targets=2))] ) def test_get_applicable_scorers_smoke_test(Estimator, X, y, scorer): estimator = Estimator().fit(X, y) From 5a3bab16c50f9a3e5bd7bcc7e6ef14225f56fa4f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 11 Jul 2020 00:00:44 +0200 Subject: [PATCH 29/31] iter --- sklearn/metrics/tests/test_score_objects.py | 29 +++------------------ 1 file changed, 3 insertions(+), 26 deletions(-) diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index c082ecf7f3aba..b4429981aa731 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -736,32 +736,6 @@ def test_multiclass_roc_no_proba_scorer_errors(scorer_name): scorer(lr, X, y) -@pytest.mark.parametrize( - "Estimator, X, y", - [(LogisticRegression, *make_classification(n_classes=2)), - (LogisticRegression, *make_classification( - n_classes=3, n_clusters_per_class=1 - )), - (LogisticRegression, *make_multilabel_classification()), - (Ridge, *make_regression(n_targets=1)), - (Ridge, *make_regression(n_targets=2))], - ids=[ - "binary-classification", - "multiclass-classification", - "multilabel-classification", - "regression", - "multioutput-regression", - ] -) -def _generate_scorer(Estimator, X, y): - # smoke test to check that we can compute the score on the expected - # dataset - scorers = get_applicable_scorers(y) - estimator = Estimator().fit(X, y) - for scorer_name in scorers: - yield estimator, X, y, scorers[scorer_name] - - def _parametrize_scorers_from_target(estimator_data_ids): check_scorers, check_scorers_ids = zip(*[ ((Estimator, X, np.abs(y) - np.min(y), scorer), @@ -775,6 +749,9 @@ def _parametrize_scorers_from_target(estimator_data_ids): ) +@pytest.mark.filterwarnings( + "ignore::sklearn.exceptions.UndefinedMetricWarning" +) @_parametrize_scorers_from_target( [("binary", LogisticRegression, *make_classification(n_classes=2)), ("multiclass", LogisticRegression, From 8732aa4ce4c16a6147dbcae6858b7e4eee04d977 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 11 Jul 2020 00:37:44 +0200 Subject: [PATCH 30/31] iter --- sklearn/metrics/tests/test_score_objects.py | 58 +++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/sklearn/metrics/tests/test_score_objects.py 
b/sklearn/metrics/tests/test_score_objects.py index b4429981aa731..22d1f2d971f93 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -25,6 +25,7 @@ _MultimetricScorer, _check_multimetric_scoring) from sklearn.metrics import accuracy_score +from sklearn.metrics import average_precision_score from sklearn.metrics import ( get_applicable_scorers, get_scorer, @@ -40,6 +41,7 @@ from sklearn.datasets import make_blobs from sklearn.datasets import make_classification, make_regression from sklearn.datasets import make_multilabel_classification +from sklearn.datasets import load_breast_cancer from sklearn.datasets import load_diabetes from sklearn.model_selection import train_test_split, cross_val_score from sklearn.model_selection import GridSearchCV @@ -761,5 +763,61 @@ def _parametrize_scorers_from_target(estimator_data_ids): ("continuous-multioutput", Ridge, *make_regression(n_targets=2))] ) def test_get_applicable_scorers_smoke_test(Estimator, X, y, scorer): + # smoke test to check that we can use the score on the registered problem estimator = Estimator().fit(X, y) scorer(estimator, X, y) + + +@pytest.mark.filterwarnings( + "ignore::sklearn.exceptions.UndefinedMetricWarning" +) +@pytest.mark.parametrize( + "Estimator, X, y", + [(LogisticRegression, *make_classification(n_classes=2)), + (LogisticRegression, + *make_classification(n_classes=3, n_clusters_per_class=1)), + (DecisionTreeClassifier, *make_multilabel_classification()), + (Ridge, *make_regression(n_targets=1)), + (Ridge, *make_regression(n_targets=2))] +) +def test_get_applicable_scorers_with_grid_search_smoke_test(Estimator, X, y): + # smoke test to check that scorers can be used directly inside a + # grid-search + if issubclass(Estimator, LogisticRegression): + param_grid = {"C": [0.1, 1]} + elif issubclass(Estimator, DecisionTreeClassifier): + param_grid = {"max_depth": [3, 5]} + elif issubclass(Estimator, Ridge): + y = np.abs(y) - np.min(y) + param_grid = {"alpha": [1, 10]} + + scorers = get_applicable_scorers(y) + estimator = GridSearchCV( + Estimator(), param_grid=param_grid, scoring=scorers, n_jobs=-1, + refit=list(scorers.keys())[0], + ) + estimator.fit(X, y) + + +def test_get_applicable_scorers_passing_scoring_params(): + # check that we can pass scoring parameters when getting the score + breast_cancer = load_breast_cancer() + X = breast_cancer.data + y = breast_cancer.target_names[breast_cancer.target].astype("object") + + scorers = get_applicable_scorers(y, pos_label="malignant") + average_precision_scorer = scorers["average_precision"] + assert "pos_label" in average_precision_scorer._kwargs + assert average_precision_scorer._kwargs["pos_label"] == "malignant" + + estimator = GridSearchCV( + DecisionTreeClassifier(), param_grid={"max_depth": [3, 5]}, + scoring=average_precision_scorer, + ) + estimator.fit(X, y) + + # check that if we don't provide any pos_label, the grid-search will raise + # an error + with pytest.raises(ValueError, match="pos_label=1 is invalid"): + estimator.set_params(scoring=make_scorer(average_precision_score)) + estimator.fit(X, y) From 43668af8a23bfd64450e2e23eead4a44d7bc6d50 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 11 Jul 2020 00:43:41 +0200 Subject: [PATCH 31/31] PEP8 --- sklearn/metrics/_scorer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index d98f1c57639d9..4f56f88ad3d23 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -799,6 
+799,7 @@ def make_scorer(score_func, *, greater_is_better=True, needs_proba=False,
         target_type_supported=("multilabel-indicator"),
     )
 
+
 def get_applicable_scorers(y, **scorers_params):
     """Utility providing scorers to be used on `y`.
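
For reference, a minimal usage sketch of the get_applicable_scorers helper added in this branch. It assumes this branch of scikit-learn is installed (the helper is not part of a released version); the dataset, estimator, parameter grid and the choice of refit="accuracy" are illustrative only, based on the signature and behaviour shown in the diff above (extra keyword arguments such as pos_label are forwarded only to scorers whose score function accepts them).

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import get_applicable_scorers
    from sklearn.model_selection import GridSearchCV

    X, y = make_classification(n_classes=2, random_state=0)

    # Build the dict of scorers compatible with a binary target; pos_label is
    # forwarded to the scorers (precision, recall, f1, ...) that accept it.
    scorers = get_applicable_scorers(y, pos_label=1)

    search = GridSearchCV(
        LogisticRegression(max_iter=1000),
        param_grid={"C": [0.1, 1]},
        scoring=scorers,
        refit="accuracy",  # any key present in `scorers`
    )
    search.fit(X, y)

    print(sorted(scorers))        # e.g. ['accuracy', 'average_precision', ...]
    print(search.best_params_)

The returned dictionary can be passed directly as the scoring parameter of GridSearchCV or cross_validate, which is the use case exercised by the smoke tests in test_score_objects.py above.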