From 9d090da0baf4bc5776896f9f111d1daebc4ee849 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 3 Oct 2019 11:13:08 -0400 Subject: [PATCH 01/31] WIP --- sklearn/model_selection/_validation.py | 13 ++++++- sklearn/model_selection/tests/test_search.py | 36 +++++++++++++++++++ .../model_selection/tests/test_validation.py | 15 ++++++++ 3 files changed, 63 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index f3464205a993d..4a51f26d3a1dc 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -394,7 +394,7 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score=False, return_parameters=False, return_n_test_samples=False, return_times=False, return_estimator=False, - error_score=np.nan): + error_score=np.nan, check_scorer_key=None): """Fit estimator and compute scores for a given dataset split. Parameters @@ -455,6 +455,10 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, return_estimator : boolean, optional, default: False Whether to return the fitted estimator. + check_scorer_key : str or None, default=None + If a string and scorer returns a dictionary, the keys will be check + to contain `check_scorer_key`. + Returns ------- train_scores : dict of scorer name -> float, optional @@ -538,6 +542,13 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, score_time = time.time() - start_time - fit_time if return_train_score: train_scores = _score(estimator, X_train, y_train, scorer) + + # check scorer keys + if (check_scorer_key is not None and isinstance(test_scores, dict) + and check_scorer_key not in test_scores): + raise ValueError("dict returned by scorer must contain {}".format( + check_scorer_key)) + if verbose > 2: if isinstance(test_scores, dict): for scorer_name in sorted(test_scores): diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index f3301606e997e..4b74bed3fee8b 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1779,3 +1779,39 @@ def get_n_splits(self, *args, **kw): 'inconsistent results. 
Expected \\d+ ' 'splits, got \\d+'): ridge.fit(X[:train_size], y[:train_size]) + + +def test_callable_multimetric_same_as_list_of_strings(): + def custom_scorer(est, X, y): + y_pred = est.predict(X) + return {'recall': recall_score(y, y_pred), + 'accuracy': accuracy_score(y, y_pred)} + + X, y = make_classification(n_samples=40, n_features=4, + random_state=42) + est = LinearSVC(random_state=42) + search_callable = GridSearchCV(est, {'C': [0.1, 1]}, + scoring=custom_scorer, refit='recall') + search_str = GridSearchCV(est, {'C': [0.1, 1]}, + scoring=['recall', 'accuracy'], refit='recall') + + search_callable.fit(X, y) + search_str.fit(X, y) + + assert search_callable.best_score_ == pytest.approx(search_str.best_score_) + assert search_callable.best_index_ == search_str.best_index_ + + +def test_callable_multimetric_error_on_invalid_key(): + def bad_scorer(est, X, y): + return {'bad_name': 1} + + X, y = make_classification(n_samples=40, n_features=4, + random_state=42) + clf = GridSearchCV(LinearSVC(random_state=42), {'C': [0.1, 1]}, + scoring=bad_scorer, refit='good_name') + + msg = ('For multi-metric scoring, the parameter refit must be set to a ' + 'scorer key or a callable to refit') + with pytest.raises(ValueError, match=msg): + clf.fit(X, y) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 4d681f24403ee..d1be6ae7b45e0 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -1717,3 +1717,18 @@ def two_params_scorer(estimator, X_test): fit_and_score_args = [None, None, None, two_params_scorer] assert_raise_message(ValueError, error_message, _score, *fit_and_score_args) + + +def test_errors_when_key_not_in_scorer_dict(): + def scorer(est, X, y): + return {"my_key": 1} + + X, y = make_classification(n_samples=30, random_state=0) + train, test = next(ShuffleSplit().split(X)) + clf = SVC(kernel="linear", random_state=0) + + fit_and_score_args = [clf, X, y, scorer, train, test, 10, None, None] + + msg = "dict returned by scorer must contain not_my_key" + with pytest.raises(ValueError, match=msg): + _fit_and_score(*fit_and_score_args, check_scorer_key='not_my_key') From 315c335c9c68aa461dc16bbae59562a77e5a1634 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 3 Oct 2019 13:33:20 -0400 Subject: [PATCH 02/31] ENH Increase compability --- sklearn/metrics/scorer.py | 116 +++++++--------- sklearn/metrics/tests/test_score_objects.py | 31 +---- sklearn/model_selection/_search.py | 48 +++---- sklearn/model_selection/_validation.py | 124 +++++++++--------- sklearn/model_selection/tests/test_search.py | 36 ----- .../model_selection/tests/test_validation.py | 15 --- 6 files changed, 146 insertions(+), 224 deletions(-) diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index 25b826ff91f75..6c80894a0ee13 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -431,7 +431,7 @@ def check_scoring(estimator, scoring=None, allow_none=False): " None. %r was passed" % scoring) -def _check_multimetric_scoring(estimator, scoring=None): +def _check_multimetric_scoring(estimator, scoring): """Check the scoring parameter in cases when multiple metrics are allowed Parameters @@ -439,10 +439,7 @@ def _check_multimetric_scoring(estimator, scoring=None): estimator : sklearn estimator instance The estimator for which the scoring will be applied. 
- scoring : string, callable, list/tuple, dict or None, default: None - A single string (see :ref:`scoring_parameter`) or a callable - (see :ref:`scoring`) to evaluate the predictions on the test set. - + scoring : list/tuple or dict For evaluating multiple metrics, either give a list of (unique) strings or a dict with names as keys and callables as values. @@ -452,7 +449,6 @@ def _check_multimetric_scoring(estimator, scoring=None): See :ref:`multimetric_grid_search` for an example. - If None the estimator's score method is used. The return value in that case will be ``{'score': }``. If the estimator's score method is not available, a ``TypeError`` is raised. @@ -461,69 +457,59 @@ def _check_multimetric_scoring(estimator, scoring=None): ------- scorers_dict : dict A dict mapping each scorer name to its validated scorer. - - is_multimetric : bool - True if scorer is a list/tuple or dict of callables - False if scorer is None/str/callable """ - if callable(scoring) or scoring is None or isinstance(scoring, - str): - scorers = {"score": check_scoring(estimator, scoring=scoring)} - return scorers, False - else: - err_msg_generic = ("scoring should either be a single string or " - "callable for single metric evaluation or a " - "list/tuple of strings or a dict of scorer name " - "mapped to the callable for multiple metric " - "evaluation. Got %s of type %s" - % (repr(scoring), type(scoring))) - - if isinstance(scoring, (list, tuple, set)): - err_msg = ("The list/tuple elements must be unique " - "strings of predefined scorers. ") - invalid = False - try: - keys = set(scoring) - except TypeError: - invalid = True - if invalid: - raise ValueError(err_msg) - - if len(keys) != len(scoring): - raise ValueError(err_msg + "Duplicate elements were found in" - " the given list. %r" % repr(scoring)) - elif len(keys) > 0: - if not all(isinstance(k, str) for k in keys): - if any(callable(k) for k in keys): - raise ValueError(err_msg + - "One or more of the elements were " - "callables. Use a dict of score name " - "mapped to the scorer callable. " - "Got %r" % repr(scoring)) - else: - raise ValueError(err_msg + - "Non-string types were found in " - "the given list. Got %r" - % repr(scoring)) - scorers = {scorer: check_scoring(estimator, scoring=scorer) - for scorer in scoring} - else: - raise ValueError(err_msg + - "Empty list was given. %r" % repr(scoring)) - - elif isinstance(scoring, dict): + err_msg_generic = ("scoring should either be a single string or " + "callable for single metric evaluation or a " + "list/tuple of strings or a dict of scorer name " + "mapped to the callable for multiple metric " + "evaluation. Got %s of type %s" + % (repr(scoring), type(scoring))) + + if isinstance(scoring, (list, tuple, set)): + err_msg = ("The list/tuple elements must be unique " + "strings of predefined scorers. ") + invalid = False + try: keys = set(scoring) + except TypeError: + invalid = True + if invalid: + raise ValueError(err_msg) + + if len(keys) != len(scoring): + raise ValueError(err_msg + "Duplicate elements were found in" + " the given list. %r" % repr(scoring)) + elif len(keys) > 0: if not all(isinstance(k, str) for k in keys): - raise ValueError("Non-string types were found in the keys of " - "the given dict. scoring=%r" % repr(scoring)) - if len(keys) == 0: - raise ValueError("An empty dict was passed. 
%r" - % repr(scoring)) - scorers = {key: check_scoring(estimator, scoring=scorer) - for key, scorer in scoring.items()} + if any(callable(k) for k in keys): + raise ValueError(err_msg + + "One or more of the elements were " + "callables. Use a dict of score name " + "mapped to the scorer callable. " + "Got %r" % repr(scoring)) + else: + raise ValueError(err_msg + + "Non-string types were found in " + "the given list. Got %r" + % repr(scoring)) + scorers = {scorer: check_scoring(estimator, scoring=scorer) + for scorer in scoring} else: - raise ValueError(err_msg_generic) - return scorers, True + raise ValueError(err_msg + + "Empty list was given. %r" % repr(scoring)) + + elif isinstance(scoring, dict): + keys = set(scoring) + if not all(isinstance(k, str) for k in keys): + raise ValueError("Non-string types were found in the keys of " + "the given dict. scoring=%r" % repr(scoring)) + if len(keys) == 0: + raise ValueError("An empty dict was passed. %r" % repr(scoring)) + scorers = {key: check_scoring(estimator, scoring=scorer) + for key, scorer in scoring.items()} + else: + raise ValueError(err_msg_generic) + return scorers def make_scorer(score_func, greater_is_better=True, needs_proba=False, diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index cfabed6d2c4ac..8287e3d5c6445 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -202,30 +202,10 @@ def check_scoring_validator_for_single_metric_usecases(scoring_validator): assert scorer is None -def check_multimetric_scoring_single_metric_wrapper(*args, **kwargs): - # This wraps the _check_multimetric_scoring to take in - # single metric scoring parameter so we can run the tests - # that we will run for check_scoring, for check_multimetric_scoring - # too for single-metric usecases - - scorers, is_multi = _check_multimetric_scoring(*args, **kwargs) - # For all single metric use cases, it should register as not multimetric - assert not is_multi - if args[0] is not None: - assert scorers is not None - names, scorers = zip(*scorers.items()) - assert len(scorers) == 1 - assert names[0] == 'score' - scorers = scorers[0] - return scorers - - def test_check_scoring_and_check_multimetric_scoring(): check_scoring_validator_for_single_metric_usecases(check_scoring) # To make sure the check_scoring is correctly applied to the constituent # scorers - check_scoring_validator_for_single_metric_usecases( - check_multimetric_scoring_single_metric_wrapper) # For multiple metric use cases # Make sure it works for the valid cases @@ -237,8 +217,7 @@ def test_check_scoring_and_check_multimetric_scoring(): estimator = LinearSVC(random_state=0) estimator.fit([[1], [2], [3]], [1, 1, 0]) - scorers, is_multi = _check_multimetric_scoring(estimator, scoring) - assert is_multi + scorers = _check_multimetric_scoring(estimator, scoring) assert isinstance(scorers, dict) assert sorted(scorers.keys()) == sorted(list(scoring)) assert all([isinstance(scorer, _PredictScorer) @@ -589,7 +568,7 @@ def test_multimetric_scorer_calls_method_once(scorers, expected_predict_count, mock_est.predict_proba = predict_proba_func mock_est.decision_function = decision_function_func - scorer_dict, _ = _check_multimetric_scoring(LogisticRegression(), scorers) + scorer_dict = _check_multimetric_scoring(LogisticRegression(), scorers) multi_scorer = _MultimetricScorer(**scorer_dict) results = multi_scorer(mock_est, X, y) @@ -616,7 +595,7 @@ def predict_proba(self, X): clf.fit(X, y) scorers = 
['roc_auc', 'neg_log_loss'] - scorer_dict, _ = _check_multimetric_scoring(clf, scorers) + scorer_dict = _check_multimetric_scoring(clf, scorers) scorer = _MultimetricScorer(**scorer_dict) scorer(clf, X, y) @@ -639,7 +618,7 @@ def predict(self, X): clf.fit(X, y) scorers = {'neg_mse': 'neg_mean_squared_error', 'r2': 'roc_auc'} - scorer_dict, _ = _check_multimetric_scoring(clf, scorers) + scorer_dict = _check_multimetric_scoring(clf, scorers) scorer = _MultimetricScorer(**scorer_dict) scorer(clf, X, y) @@ -657,7 +636,7 @@ def test_multimetric_scorer_sanity_check(): clf = DecisionTreeClassifier() clf.fit(X, y) - scorer_dict, _ = _check_multimetric_scoring(clf, scorers) + scorer_dict = _check_multimetric_scoring(clf, scorers) multi_scorer = _MultimetricScorer(**scorer_dict) result = multi_scorer(clf, X, y) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 80e78e6b7f913..e29919ea2b37e 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -27,7 +27,7 @@ from ..base import MetaEstimatorMixin from ._split import check_cv from ._validation import _fit_and_score -from ._validation import _aggregate_score_dicts +from ._validation import _aggregate_list_of_dicts from ..exceptions import NotFittedError from joblib import Parallel, delayed from ..utils import check_random_state @@ -627,27 +627,29 @@ def fit(self, X, y=None, groups=None, **fit_params): estimator = self.estimator cv = check_cv(self.cv, y, classifier=is_classifier(estimator)) - scorers, self.multimetric_ = _check_multimetric_scoring( - self.estimator, scoring=self.scoring) - - if self.multimetric_: - if self.refit is not False and ( - not isinstance(self.refit, str) or - # This will work for both dict / list (tuple) - self.refit not in scorers) and not callable(self.refit): - raise ValueError("For multi-metric scoring, the parameter " - "refit must be set to a scorer key or a " - "callable to refit an estimator with the " - "best parameter setting on the whole " - "data and make the best_* attributes " - "available for that metric. If this is " - "not needed, refit should be set to " - "False explicitly. %r was passed." - % self.refit) - else: - refit_metric = self.refit - else: + if (callable(self.scoring) or self.scoring is None + or isinstance(self.scoring, str)): + self.multimetric_ = False + scorers = {"score": check_scoring(self.estimator, self.scoring)} refit_metric = 'score' + else: + self.multimetric_ = True + scorers = _check_multimetric_scoring(self.estimator, self.scoring) + refit_metric = self.refit + + if self.multimetric_ and self.refit is not False and ( + not isinstance(self.refit, str) or + # This will work for both dict / list (tuple) + self.refit not in scorers) and not callable(self.refit): + raise ValueError("For multi-metric scoring, the parameter " + "refit must be set to a scorer key or a " + "callable to refit an estimator with the " + "best parameter setting on the whole " + "data and make the best_* attributes " + "available for that metric. If this is " + "not needed, refit should be set to " + "False explicitly. %r was passed." 
+ % self.refit) X, y, groups = indexable(X, y, groups) n_splits = cv.get_n_splits(X, y, groups) @@ -761,9 +763,9 @@ def _format_results(self, candidate_params, scorers, n_splits, out): # test_score_dicts and train_score dicts are lists of dictionaries and # we make them into dict of lists - test_scores = _aggregate_score_dicts(test_score_dicts) + test_scores = _aggregate_list_of_dicts(test_score_dicts) if self.return_train_score: - train_scores = _aggregate_score_dicts(train_score_dicts) + train_scores = _aggregate_list_of_dicts(train_score_dicts) results = {} diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 4a51f26d3a1dc..99f9f580e155b 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -220,13 +220,17 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, X, y, groups = indexable(X, y, groups) cv = check_cv(cv, y, classifier=is_classifier(estimator)) - scorers, _ = _check_multimetric_scoring(estimator, scoring=scoring) + + if callable(scoring) or scoring is None or isinstance(scoring, str): + scorers = {"score": check_scoring(estimator, scoring)} + else: + scorers = _check_multimetric_scoring(estimator, scoring) # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) - scores = parallel( + results = parallel( delayed(_fit_and_score)( clone(estimator), X, y, scorers, train, test, verbose, None, fit_params, return_train_score=return_train_score, @@ -234,18 +238,16 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, error_score=error_score) for train, test in cv.split(X, y, groups)) - zipped_scores = list(zip(*scores)) + results = _aggregate_list_of_dicts(results, constructor=list) if return_train_score: - train_scores = zipped_scores.pop(0) - train_scores = _aggregate_score_dicts(train_scores) + train_scores = _aggregate_list_of_dicts(results["train_scores"]) if return_estimator: - fitted_estimators = zipped_scores.pop() - test_scores, fit_times, score_times = zipped_scores - test_scores = _aggregate_score_dicts(test_scores) + fitted_estimators = results["estimator"] + test_scores = _aggregate_list_of_dicts(results["test_scores"]) ret = {} - ret['fit_time'] = np.array(fit_times) - ret['score_time'] = np.array(score_times) + ret['fit_time'] = np.array(results["fit_time"]) + ret['score_time'] = np.array(results["score_time"]) if return_estimator: ret['estimator'] = fitted_estimators @@ -394,7 +396,7 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score=False, return_parameters=False, return_n_test_samples=False, return_times=False, return_estimator=False, - error_score=np.nan, check_scorer_key=None): + error_score=np.nan): """Fit estimator and compute scores for a given dataset split. Parameters @@ -455,33 +457,30 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, return_estimator : boolean, optional, default: False Whether to return the fitted estimator. - check_scorer_key : str or None, default=None - If a string and scorer returns a dictionary, the keys will be check - to contain `check_scorer_key`. - Returns ------- - train_scores : dict of scorer name -> float, optional - Score on training set (for all the scorers), - returned only if `return_train_score` is `True`. 
+ result: dict with the following attributes + train_scores : dict of scorer name -> float, optional + Score on training set (for all the scorers), + returned only if `return_train_score` is `True`. - test_scores : dict of scorer name -> float, optional - Score on testing set (for all the scorers). + test_scores : dict of scorer name -> float, optional + Score on testing set (for all the scorers). - n_test_samples : int - Number of test samples. + n_test_samples : int + Number of test samples. - fit_time : float - Time spent for fitting in seconds. + fit_time : float + Time spent for fitting in seconds. - score_time : float - Time spent for scoring in seconds. + score_time : float + Time spent for scoring in seconds. - parameters : dict or None, optional - The parameters that have been evaluated. + parameters : dict or None, optional + The parameters that have been evaluated. - estimator : estimator object - The fitted estimator + estimator : estimator object + The fitted estimator """ if verbose > 1: if parameters is None: @@ -496,7 +495,6 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, fit_params = {k: _index_param_value(X, v, train) for k, v in fit_params.items()} - train_scores = {} if parameters is not None: estimator.set_params(**parameters) @@ -543,12 +541,6 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, if return_train_score: train_scores = _score(estimator, X_train, y_train, scorer) - # check scorer keys - if (check_scorer_key is not None and isinstance(test_scores, dict) - and check_scorer_key not in test_scores): - raise ValueError("dict returned by scorer must contain {}".format( - check_scorer_key)) - if verbose > 2: if isinstance(test_scores, dict): for scorer_name in sorted(test_scores): @@ -567,17 +559,19 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, total_time = score_time + fit_time print(_message_with_time('CV', msg, total_time)) - ret = [train_scores, test_scores] if return_train_score else [test_scores] - + result = {"test_scores": test_scores} + if return_train_score: + result["train_scores"] = train_scores if return_n_test_samples: - ret.append(_num_samples(X_test)) + result["n_test_samples"] = _num_samples(X_test) if return_times: - ret.extend([fit_time, score_time]) + result["fit_time"] = fit_time + result["score_time"] = score_time if return_parameters: - ret.append(parameters) + result["parameters"] = parameters if return_estimator: - ret.append(estimator) - return ret + result["estimator"] = estimator + return result def _score(estimator, X_test, y_test, scorer): @@ -1258,23 +1252,32 @@ def learning_curve(estimator, X, y, groups=None, out = parallel(delayed(_incremental_fit_estimator)( clone(estimator), X, y, classes, train, test, train_sizes_abs, scorer, verbose, return_times) for train, test in cv_iter) + out = np.asarray(out).transpose((2, 1, 0)) else: train_test_proportions = [] for train, test in cv_iter: for n_train_samples in train_sizes_abs: train_test_proportions.append((train[:n_train_samples], test)) - out = parallel(delayed(_fit_and_score)( + results = parallel(delayed(_fit_and_score)( clone(estimator), X, y, scorer, train, test, verbose, parameters=None, fit_params=None, return_train_score=True, error_score=error_score, return_times=return_times) for train, test in train_test_proportions) - out = np.array(out) - n_cv_folds = out.shape[0] // n_unique_ticks - dim = 4 if return_times else 2 - out = out.reshape(n_cv_folds, n_unique_ticks, dim) + results = _aggregate_list_of_dicts(results, 
constructor=list) + train_scores = (np.array(results["train_scores"]) + .reshape(-1, n_unique_ticks).T) - out = np.asarray(out).transpose((2, 1, 0)) + test_scores = (np.array(results["test_scores"]) + .reshape(-1, n_unique_ticks).T) + out = [train_scores, test_scores] + + if return_times: + fit_times = (np.array(results["fit_time"]) + .reshape(-1, n_unique_ticks).T) + score_times = (np.array(results["score_time"]) + .reshape(-1, n_unique_ticks).T) + out.extend([fit_times, score_times]) ret = train_sizes_abs, out[0], out[1] @@ -1479,21 +1482,24 @@ def validation_curve(estimator, X, y, param_name, param_range, groups=None, parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, verbose=verbose) - out = parallel(delayed(_fit_and_score)( + results = parallel(delayed(_fit_and_score)( clone(estimator), X, y, scorer, train, test, verbose, parameters={param_name: v}, fit_params=None, return_train_score=True, error_score=error_score) # NOTE do not change order of iteration to allow one time cv splitters for train, test in cv.split(X, y, groups) for v in param_range) - out = np.asarray(out) n_params = len(param_range) - n_cv_folds = out.shape[0] // n_params - out = out.reshape(n_cv_folds, n_params, 2).transpose((2, 1, 0)) - return out[0], out[1] + results = _aggregate_list_of_dicts(results) + train_scores = (np.asarray(results["train_scores"]) + .reshape(-1, n_params).T) + test_scores = (np.asarray(results["test_scores"]) + .reshape(-1, n_params).T) + + return train_scores, test_scores -def _aggregate_score_dicts(scores): +def _aggregate_list_of_dicts(scores, constructor=np.asarray): """Aggregate the list of dict to dict of np ndarray The aggregated output of _fit_and_score will be a list of dict @@ -1511,10 +1517,10 @@ def _aggregate_score_dicts(scores): ------- >>> scores = [{'a': 1, 'b':10}, {'a': 2, 'b':2}, {'a': 3, 'b':3}, - ... {'a': 10, 'b': 10}] # doctest: +SKIP - >>> _aggregate_score_dicts(scores) # doctest: +SKIP + ... {'a': 10, 'b': 10}] + >>> _aggregate_list_of_dicts(scores) {'a': array([1, 2, 3, 10]), 'b': array([10, 2, 3, 10])} """ - return {key: np.asarray([score[key] for score in scores]) + return {key: constructor([score[key] for score in scores]) for key in scores[0]} diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 4b74bed3fee8b..f3301606e997e 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1779,39 +1779,3 @@ def get_n_splits(self, *args, **kw): 'inconsistent results. 
Expected \\d+ ' 'splits, got \\d+'): ridge.fit(X[:train_size], y[:train_size]) - - -def test_callable_multimetric_same_as_list_of_strings(): - def custom_scorer(est, X, y): - y_pred = est.predict(X) - return {'recall': recall_score(y, y_pred), - 'accuracy': accuracy_score(y, y_pred)} - - X, y = make_classification(n_samples=40, n_features=4, - random_state=42) - est = LinearSVC(random_state=42) - search_callable = GridSearchCV(est, {'C': [0.1, 1]}, - scoring=custom_scorer, refit='recall') - search_str = GridSearchCV(est, {'C': [0.1, 1]}, - scoring=['recall', 'accuracy'], refit='recall') - - search_callable.fit(X, y) - search_str.fit(X, y) - - assert search_callable.best_score_ == pytest.approx(search_str.best_score_) - assert search_callable.best_index_ == search_str.best_index_ - - -def test_callable_multimetric_error_on_invalid_key(): - def bad_scorer(est, X, y): - return {'bad_name': 1} - - X, y = make_classification(n_samples=40, n_features=4, - random_state=42) - clf = GridSearchCV(LinearSVC(random_state=42), {'C': [0.1, 1]}, - scoring=bad_scorer, refit='good_name') - - msg = ('For multi-metric scoring, the parameter refit must be set to a ' - 'scorer key or a callable to refit') - with pytest.raises(ValueError, match=msg): - clf.fit(X, y) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index d1be6ae7b45e0..4d681f24403ee 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -1717,18 +1717,3 @@ def two_params_scorer(estimator, X_test): fit_and_score_args = [None, None, None, two_params_scorer] assert_raise_message(ValueError, error_message, _score, *fit_and_score_args) - - -def test_errors_when_key_not_in_scorer_dict(): - def scorer(est, X, y): - return {"my_key": 1} - - X, y = make_classification(n_samples=30, random_state=0) - train, test = next(ShuffleSplit().split(X)) - clf = SVC(kernel="linear", random_state=0) - - fit_and_score_args = [clf, X, y, scorer, train, test, 10, None, None] - - msg = "dict returned by scorer must contain not_my_key" - with pytest.raises(ValueError, match=msg): - _fit_and_score(*fit_and_score_args, check_scorer_key='not_my_key') From 702cf1bfb79ae86d93fb58e12a995cfade79ba40 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 3 Oct 2019 13:52:16 -0400 Subject: [PATCH 03/31] ENH Refactories _fit_and_score --- sklearn/model_selection/_search.py | 26 +++++++++---------- sklearn/model_selection/_validation.py | 24 +++++++++-------- .../model_selection/tests/test_validation.py | 2 +- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index e29919ea2b37e..42094c544f9ae 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -368,13 +368,12 @@ def fit_grid_point(X, y, estimator, parameters, train, test, scorer, # NOTE we are not using the return value as the scorer by itself should be # validated before. 
We use check_scoring only to reject multimetric scorer check_scoring(estimator, scorer) - scores, n_samples_test = _fit_and_score(estimator, X, y, - scorer, train, - test, verbose, parameters, - fit_params=fit_params, - return_n_test_samples=True, - error_score=error_score) - return scores, parameters, n_samples_test + results = _fit_and_score(estimator, X, y, scorer, train, + test, verbose, parameters, + fit_params=fit_params, + return_n_test_samples=True, + error_score=error_score) + return results["test_scores"], parameters, results["n_test_samples"] def _check_param_grid(param_grid): @@ -752,19 +751,18 @@ def evaluate_candidates(candidate_params): def _format_results(self, candidate_params, scorers, n_splits, out): n_candidates = len(candidate_params) + results = _aggregate_list_of_dicts(out, constructor=list) - # if one choose to see train score, "out" will contain train score info - if self.return_train_score: - (train_score_dicts, test_score_dicts, test_sample_counts, fit_time, - score_time) = zip(*out) - else: - (test_score_dicts, test_sample_counts, fit_time, - score_time) = zip(*out) + test_score_dicts = results["test_scores"] + test_sample_counts = results["n_test_samples"] + fit_time = results["fit_time"] + score_time = results["score_time"] # test_score_dicts and train_score dicts are lists of dictionaries and # we make them into dict of lists test_scores = _aggregate_list_of_dicts(test_score_dicts) if self.return_train_score: + train_score_dicts = results["train_scores"] train_scores = _aggregate_list_of_dicts(train_score_dicts) results = {} diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 99f9f580e155b..77df9aae6ce52 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -1499,8 +1499,8 @@ def validation_curve(estimator, X, y, param_name, param_range, groups=None, return train_scores, test_scores -def _aggregate_list_of_dicts(scores, constructor=np.asarray): - """Aggregate the list of dict to dict of np ndarray +def _aggregate_list_of_dicts(elements, constructor=np.asarray): + """Aggregate the list of dicts The aggregated output of _fit_and_score will be a list of dict of form [{'prec': 0.1, 'acc':1.0}, {'prec': 0.1, 'acc':1.0}, ...] @@ -1509,18 +1509,20 @@ def _aggregate_list_of_dicts(scores, constructor=np.asarray): Parameters ---------- - scores : list of dict - List of dicts of the scores for all scorers. This is a flat list, + elements : list of dict + List of dicts of the elements for all scorers. This is a flat list, assumed originally to be of row major order. + constructor : function, default=np.asarray + Used to combine elements of dictionaries in list + Example ------- - >>> scores = [{'a': 1, 'b':10}, {'a': 2, 'b':2}, {'a': 3, 'b':3}, - ... {'a': 10, 'b': 10}] - >>> _aggregate_list_of_dicts(scores) - {'a': array([1, 2, 3, 10]), - 'b': array([10, 2, 3, 10])} + >>> elements = [{'a': 1, 'b':10}, {'a': 2, 'b':2}, {'a': 3, 'b':3}, + ... 
{'a': 10, 'b': 10}] # doctest: +SKIP + >>> _aggregate_list_of_dicts(elements) # doctest: +SKIP + {'a': array([1, 2, 3, 10]), 'b': array([10, 2, 3, 10])} """ - return {key: constructor([score[key] for score in scores]) - for key in scores[0]} + return {key: constructor([elm[key] for elm in elements]) + for key in elements[0]} diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 4d681f24403ee..aa8c12132b09b 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -1680,7 +1680,7 @@ def test_fit_and_score_working(): 'return_parameters': True} result = _fit_and_score(*fit_and_score_args, **fit_and_score_kwargs) - assert result[-1] == fit_and_score_kwargs['parameters'] + assert result['parameters'] == fit_and_score_kwargs['parameters'] def three_params_scorer(i, j, k): From a7d2efbdbb55091a6e6aae2cdf84fdd8d49e9e6b Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 3 Oct 2019 21:45:18 -0400 Subject: [PATCH 04/31] RFC Moves support into a function --- doc/modules/model_evaluation.rst | 20 ++- sklearn/model_selection/_search.py | 90 +++++++++----- sklearn/model_selection/_validation.py | 75 ++++++++++-- sklearn/model_selection/tests/test_search.py | 114 ++++++++++++++++++ sklearn/model_selection/tests/test_split.py | 2 +- .../model_selection/tests/test_validation.py | 29 ++++- 6 files changed, 274 insertions(+), 56 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 7f0553c30a3e3..a34c6ae867fff 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -248,7 +248,7 @@ Using multiple metric evaluation Scikit-learn also permits evaluation of multiple metrics in ``GridSearchCV``, ``RandomizedSearchCV`` and ``cross_validate``. -There are two ways to specify multiple scoring metrics for the ``scoring`` +There are three ways to specify multiple scoring metrics for the ``scoring`` parameter: - As an iterable of string metrics:: @@ -263,22 +263,20 @@ parameter: Note that the dict values can either be scorer functions or one of the predefined metric strings. -Currently only those scorer functions that return a single score can be passed -inside the dict. Scorer functions that return multiple values are not -permitted and will require a wrapper to return a single metric:: +- As a callable that returns a dictionary of scores:: >>> from sklearn.model_selection import cross_validate >>> from sklearn.metrics import confusion_matrix >>> # A sample toy binary classification dataset >>> X, y = datasets.make_classification(n_classes=2, random_state=0) >>> svm = LinearSVC(random_state=0) - >>> def tn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 0] - >>> def fp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 1] - >>> def fn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 0] - >>> def tp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 1] - >>> scoring = {'tp': make_scorer(tp), 'tn': make_scorer(tn), - ... 'fp': make_scorer(fp), 'fn': make_scorer(fn)} - >>> cv_results = cross_validate(svm.fit(X, y), X, y, cv=5, scoring=scoring) + >>> def confusion_matrix_scorer(clf, X, y): + ... y_pred = clf.predict(X) + ... cm = confusion_matrix(y, y_pred) + ... return {'tn': cm[0, 0], 'fp': cm[0, 1], + ... 'fn': cm[1, 0], 'tp': cm[1, 1]} + >>> cv_results = cross_validate(svm.fit(X, y), X, y, cv=5, + ... 
scoring=confusion_matrix_scorer) >>> # Getting the test set true positive scores >>> print(cv_results['test_tp']) [10 9 8 7 8] diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 42094c544f9ae..751bdf27dbc6c 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -28,6 +28,7 @@ from ._split import check_cv from ._validation import _fit_and_score from ._validation import _aggregate_list_of_dicts +from ._validation import _check_fit_and_score_results from ..exceptions import NotFittedError from joblib import Parallel, delayed from ..utils import check_random_state @@ -445,8 +446,18 @@ def score(self, X, y=None): raise ValueError("No score function explicitly defined, " "and the estimator doesn't provide one %s" % self.best_estimator_) - score = self.scorer_[self.refit] if self.multimetric_ else self.scorer_ - return score(self.best_estimator_, X, y) + if isinstance(self.scorer_, dict): + if self.multimetric_: + scorer = self.scorer_[self.refit] + else: + scorer = self.scorer_ + return scorer(self.best_estimator_, X, y) + + # callable + score = self.scorer_(self.best_estimator_, X, y) + if self.multimetric_: + score = score[self.refit] + return score def _check_is_fitted(self, method_name): if not self.refit: @@ -626,29 +637,31 @@ def fit(self, X, y=None, groups=None, **fit_params): estimator = self.estimator cv = check_cv(self.cv, y, classifier=is_classifier(estimator)) - if (callable(self.scoring) or self.scoring is None - or isinstance(self.scoring, str)): - self.multimetric_ = False - scorers = {"score": check_scoring(self.estimator, self.scoring)} - refit_metric = 'score' + multimetric_refit_msg = ("For multi-metric scoring, the parameter " + "refit must be set to a scorer key or a " + "callable to refit an estimator with the " + "best parameter setting on the whole " + "data and make the best_* attributes " + "available for that metric. If this is " + "not needed, refit should be set to " + "False explicitly. %r was passed." + % self.refit) + + refit_metric = "score" + scoring_callable = callable(self.scoring) + if scoring_callable: + scorers = self.scoring + elif (self.scoring is None or isinstance(self.scoring, str)): + scorers = check_scoring(self.estimator, self.scoring) else: - self.multimetric_ = True scorers = _check_multimetric_scoring(self.estimator, self.scoring) refit_metric = self.refit - if self.multimetric_ and self.refit is not False and ( - not isinstance(self.refit, str) or - # This will work for both dict / list (tuple) - self.refit not in scorers) and not callable(self.refit): - raise ValueError("For multi-metric scoring, the parameter " - "refit must be set to a scorer key or a " - "callable to refit an estimator with the " - "best parameter setting on the whole " - "data and make the best_* attributes " - "available for that metric. If this is " - "not needed, refit should be set to " - "False explicitly. %r was passed." 
- % self.refit) + if self.refit is not False and ( + not isinstance(self.refit, str) or + # This will work for both dict / list (tuple) + self.refit not in scorers) and not callable(self.refit): + raise ValueError(multimetric_refit_msg) X, y, groups = indexable(X, y, groups) n_splits = cv.get_n_splits(X, y, groups) @@ -664,6 +677,7 @@ def fit(self, X, y=None, groups=None, **fit_params): return_n_test_samples=True, return_times=True, return_parameters=False, + return_fit_failed=True, error_score=self.error_score, verbose=self.verbose) results = {} @@ -705,11 +719,26 @@ def evaluate_candidates(candidate_params): nonlocal results results = self._format_results( - all_candidate_params, scorers, n_splits, all_out) + all_candidate_params, n_splits, all_out) return results self._run_search(evaluate_candidates) + for out in all_out: + if not out["fit_failed"]: + successful_score = out['test_scores'] + break + + self.multimetric_ = isinstance(successful_score, dict) + + # scorer is callable, check refit_metric now + if scoring_callable and self.multimetric_: + if (self.refit is not False and not callable(self.refit) + and (not isinstance(self.refit, str) + or self.refit not in successful_score)): + raise ValueError(multimetric_refit_msg) + refit_metric = self.refit + # For multi-metric evaluation, store the best_index_, best_params_ and # best_score_ iff refit is one of the scorer names # In single metric evaluation, refit_metric is "score" @@ -742,28 +771,27 @@ def evaluate_candidates(candidate_params): self.refit_time_ = refit_end_time - refit_start_time # Store the only scorer not as a dict for single metric evaluation - self.scorer_ = scorers if self.multimetric_ else scorers['score'] + self.scorer_ = scorers self.cv_results_ = results self.n_splits_ = n_splits return self - def _format_results(self, candidate_params, scorers, n_splits, out): + def _format_results(self, candidate_params, n_splits, out): n_candidates = len(candidate_params) results = _aggregate_list_of_dicts(out, constructor=list) - test_score_dicts = results["test_scores"] test_sample_counts = results["n_test_samples"] fit_time = results["fit_time"] score_time = results["score_time"] - # test_score_dicts and train_score dicts are lists of dictionaries and - # we make them into dict of lists - test_scores = _aggregate_list_of_dicts(test_score_dicts) + info_dict = _check_fit_and_score_results(results, self.error_score) + score_names = info_dict["score_names"] + test_scores = info_dict["test_scores"] + if self.return_train_score: - train_score_dicts = results["train_scores"] - train_scores = _aggregate_list_of_dicts(train_score_dicts) + train_scores = info_dict["train_scores"] results = {} @@ -824,7 +852,7 @@ def _store(key_name, array, weights=None, splits=False, rank=False): else: iid = False - for scorer_name in scorers.keys(): + for scorer_name in score_names: # Computed the (weighted) mean and std for test scores alone _store('test_%s' % scorer_name, test_scores[scorer_name], splits=True, rank=True, diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 77df9aae6ce52..a733cde888747 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -27,7 +27,7 @@ from ..utils.metaestimators import _safe_split from ..metrics.scorer import (check_scoring, _check_multimetric_scoring, _MultimetricScorer) -from ..exceptions import FitFailedWarning +from ..exceptions import FitFailedWarning, NotFittedError from ._split import check_cv from ..preprocessing 
import LabelEncoder @@ -221,8 +221,10 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, cv = check_cv(cv, y, classifier=is_classifier(estimator)) - if callable(scoring) or scoring is None or isinstance(scoring, str): - scorers = {"score": check_scoring(estimator, scoring)} + if callable(scoring): + scorers = scoring + elif scoring is None or isinstance(scoring, str): + scorers = check_scoring(estimator, scoring) else: scorers = _check_multimetric_scoring(estimator, scoring) @@ -235,15 +237,17 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, clone(estimator), X, y, scorers, train, test, verbose, None, fit_params, return_train_score=return_train_score, return_times=True, return_estimator=return_estimator, - error_score=error_score) + error_score=error_score, return_fit_failed=True) for train, test in cv.split(X, y, groups)) results = _aggregate_list_of_dicts(results, constructor=list) - if return_train_score: - train_scores = _aggregate_list_of_dicts(results["train_scores"]) + + info_dict = _check_fit_and_score_results(results, error_score) + score_names = info_dict["score_names"] + test_scores = info_dict["test_scores"] + if return_estimator: fitted_estimators = results["estimator"] - test_scores = _aggregate_list_of_dicts(results["test_scores"]) ret = {} ret['fit_time'] = np.array(results["fit_time"]) @@ -252,15 +256,52 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, if return_estimator: ret['estimator'] = fitted_estimators - for name in scorers: + for name in score_names: ret['test_%s' % name] = np.array(test_scores[name]) if return_train_score: + train_scores = info_dict["train_scores"] key = 'train_%s' % name ret[key] = np.array(train_scores[name]) return ret +def _check_fit_and_score_results(results, error_score): + """Checks _fit_and_score results. Handles scoring as a callable and + normalizes scores into a list of dictionaries. 
+ """ + fit_failed = results["fit_failed"] + test_score_dicts = results["test_scores"] + + if all(fit_failed): + raise NotFittedError("All estimators failed to fit") + + successful_score = test_score_dicts[fit_failed.index(False)] + if any(fit_failed) and isinstance(successful_score, dict): + for i in np.flatnonzero(fit_failed): + # error_score is a number + test_score_dicts[i] = {name: error_score + for name in successful_score} + + output = {} + # converts single metrics into a list of dictionaries + if not isinstance(successful_score, dict): + test_score_dicts = [{"score": elm} for elm in test_score_dicts] + output["score_names"] = ["score"] + else: + output["score_names"] = list(successful_score.keys()) + + output["test_scores"] = _aggregate_list_of_dicts(test_score_dicts) + + if "train_scores" in results: + train_score_dicts = results["train_scores"] + if not isinstance(successful_score, dict): + train_score_dicts = [{"score": elm} for elm in train_score_dicts] + output["train_scores"] = _aggregate_list_of_dicts(train_score_dicts) + + return output + + def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', error_score=np.nan): @@ -396,7 +437,7 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score=False, return_parameters=False, return_n_test_samples=False, return_times=False, return_estimator=False, - error_score=np.nan): + error_score=np.nan, return_fit_failed=False): """Fit estimator and compute scores for a given dataset split. Parameters @@ -457,6 +498,10 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, return_estimator : boolean, optional, default: False Whether to return the fitted estimator. + return_fit_failed : bool, default=False + Whether to return if estimatored failed to fit, when error_score is + numeric. + Returns ------- result: dict with the following attributes @@ -481,6 +526,9 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, estimator : estimator object The fitted estimator + + fit_failed : bool + The estimator failed to fit. """ if verbose > 1: if parameters is None: @@ -503,6 +551,7 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, X_train, y_train = _safe_split(estimator, X, y, train) X_test, y_test = _safe_split(estimator, X, y, test, train) + result = {} try: if y_train is None: estimator.fit(X_train, **fit_params) @@ -533,8 +582,12 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, raise ValueError("error_score must be the string 'raise' or a" " numeric value. 
(Hint: if using 'raise', please" " make sure that it has been spelled correctly.)") - + if return_fit_failed: + result["fit_failed"] = True else: + if return_fit_failed: + result["fit_failed"] = False + fit_time = time.time() - start_time test_scores = _score(estimator, X_test, y_test, scorer) score_time = time.time() - start_time - fit_time @@ -559,7 +612,7 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, total_time = score_time + fit_time print(_message_with_time('CV', msg, total_time)) - result = {"test_scores": test_scores} + result["test_scores"] = test_scores if return_train_score: result["train_scores"] = train_scores if return_n_test_samples: diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index f3301606e997e..8ff8a1287127b 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -61,6 +61,7 @@ from sklearn.metrics import accuracy_score from sklearn.metrics import make_scorer from sklearn.metrics import roc_auc_score +from sklearn.metrics import confusion_matrix from sklearn.impute import SimpleImputer from sklearn.pipeline import Pipeline from sklearn.linear_model import Ridge, SGDClassifier @@ -1779,3 +1780,116 @@ def get_n_splits(self, *args, **kw): 'inconsistent results. Expected \\d+ ' 'splits, got \\d+'): ridge.fit(X[:train_size], y[:train_size]) + + +def test_callable_multimetric_confusion_matrix(): + def custom_scorer(clf, X, y): + y_pred = clf.predict(X) + cm = confusion_matrix(y, y_pred) + return {'tn': cm[0, 0], 'fp': cm[0, 1], 'fn': cm[1, 0], 'tp': cm[1, 1]} + + X, y = make_classification(n_samples=40, n_features=4, + random_state=42) + est = LinearSVC(random_state=42) + search = GridSearchCV(est, {'C': [0.1, 1]}, scoring=custom_scorer, + refit='fp') + + search.fit(X, y) + + score_names = ['tn', 'fp', 'fn', 'tp'] + for name in score_names: + assert "mean_test_{}".format(name) in search.cv_results_ + + y_pred = search.predict(X) + cm = confusion_matrix(y, y_pred) + assert search.score(X, y) == pytest.approx(cm[0, 1]) + + +def test_callable_multimetric_same_as_list_of_strings(): + def custom_scorer(est, X, y): + y_pred = est.predict(X) + return {'recall': recall_score(y, y_pred), + 'accuracy': accuracy_score(y, y_pred)} + + X, y = make_classification(n_samples=40, n_features=4, + random_state=42) + est = LinearSVC(random_state=42) + search_callable = GridSearchCV(est, {'C': [0.1, 1]}, + scoring=custom_scorer, refit='recall') + search_str = GridSearchCV(est, {'C': [0.1, 1]}, + scoring=['recall', 'accuracy'], refit='recall') + + search_callable.fit(X, y) + search_str.fit(X, y) + + assert search_callable.best_score_ == pytest.approx(search_str.best_score_) + assert search_callable.best_index_ == search_str.best_index_ + assert search_callable.score(X, y) == pytest.approx(search_str.score(X, y)) + + +def test_callable_single_metric_same_as_single_string(): + def custom_scorer(est, X, y): + y_pred = est.predict(X) + return recall_score(y, y_pred) + + X, y = make_classification(n_samples=40, n_features=4, + random_state=42) + est = LinearSVC(random_state=42) + search_callable = GridSearchCV(est, {'C': [0.1, 1]}, + scoring=custom_scorer, refit=True) + search_str = GridSearchCV(est, {'C': [0.1, 1]}, + scoring='recall', refit='recall') + + search_callable.fit(X, y) + search_str.fit(X, y) + + assert search_callable.best_score_ == pytest.approx(search_str.best_score_) + assert search_callable.best_index_ == search_str.best_index_ + assert 
search_callable.score(X, y) == pytest.approx(search_str.score(X, y)) + + +def test_callable_multimetric_error_on_invalid_key(): + def bad_scorer(est, X, y): + return {'bad_name': 1} + + X, y = make_classification(n_samples=40, n_features=4, + random_state=42) + clf = GridSearchCV(LinearSVC(random_state=42), {'C': [0.1, 1]}, + scoring=bad_scorer, refit='good_name') + + msg = ('For multi-metric scoring, the parameter refit must be set to a ' + 'scorer key or a callable to refit') + with pytest.raises(ValueError, match=msg): + clf.fit(X, y) + + +def test_callable_multimetric_error_failing_clf(): + def custom_scorer(est, X, y): + return {'acc': 1} + + X, y = make_classification(n_samples=20, n_features=10, random_state=0) + + clf = FailingClassifier() + gs = GridSearchCV(clf, [{'parameter': [0, 1, 2]}], scoring=custom_scorer, + refit=False, error_score=0.1) + + with pytest.warns(FitFailedWarning, match='Estimator fit failed'): + gs.fit(X, y) + + assert_allclose(gs.cv_results_['mean_test_acc'], [1, 1, 0.1]) + + +def test_callable_multimetric_clf_all_fails(): + def custom_scorer(est, X, y): + return {'acc': 1} + X, y = make_classification(n_samples=20, n_features=10, random_state=0) + + clf = FailingClassifier() + + gs = GridSearchCV(clf, [{'parameter': [2, 2, 2]}], scoring=custom_scorer, + refit=False, error_score=0.1) + + with pytest.warns(FitFailedWarning, match='Estimator fit failed'), \ + pytest.raises(NotFittedError, + match="All estimators failed to fit"): + gs.fit(X, y) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 12891c6004f90..09d1bea05fd44 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1517,7 +1517,7 @@ def test_nested_cv(): StratifiedShuffleSplit(n_splits=3, random_state=0)] for inner_cv, outer_cv in combinations_with_replacement(cvs, 2): - gs = GridSearchCV(Ridge(solver="eigen"), param_grid={'alpha': [1, .1]}, + gs = GridSearchCV(Ridge(), param_grid={'alpha': [1, .1]}, cv=inner_cv, error_score='raise') cross_val_score(gs, X=X, y=y, groups=groups, cv=outer_cv, fit_params={'groups': groups}) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index aa8c12132b09b..9448f6b2b0740 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -52,13 +52,14 @@ from sklearn.metrics import precision_recall_fscore_support from sklearn.metrics import precision_score from sklearn.metrics import r2_score +from sklearn.metrics import mean_squared_error from sklearn.metrics.scorer import check_scoring from sklearn.linear_model import Ridge, LogisticRegression, SGDClassifier from sklearn.linear_model import PassiveAggressiveClassifier, RidgeClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.neighbors import KNeighborsClassifier -from sklearn.svm import SVC +from sklearn.svm import SVC, LinearSVC from sklearn.cluster import KMeans from sklearn.impute import SimpleImputer @@ -443,9 +444,16 @@ def check_cross_validate_multi_metric(clf, X, y, scores): # Test multimetric evaluation when scoring is a list / dict (train_mse_scores, test_mse_scores, train_r2_scores, test_r2_scores, fitted_estimators) = scores + + def custom_scorer(clf, X, y): + y_pred = clf.predict(X) + return {'r2': r2_score(y, y_pred), + 'neg_mean_squared_error': -mean_squared_error(y, y_pred)} + all_scoring = (('r2', 'neg_mean_squared_error'), {'r2': 
make_scorer(r2_score), - 'neg_mean_squared_error': 'neg_mean_squared_error'}) + 'neg_mean_squared_error': 'neg_mean_squared_error'}, + custom_scorer) keys_sans_train = {'test_r2', 'test_neg_mean_squared_error', 'fit_time', 'score_time'} @@ -1717,3 +1725,20 @@ def two_params_scorer(estimator, X_test): fit_and_score_args = [None, None, None, two_params_scorer] assert_raise_message(ValueError, error_message, _score, *fit_and_score_args) + + +def test_callable_multimetric_confusion_matrix_cross_validate(): + def custom_scorer(clf, X, y): + y_pred = clf.predict(X) + cm = confusion_matrix(y, y_pred) + return {'tn': cm[0, 0], 'fp': cm[0, 1], 'fn': cm[1, 0], 'tp': cm[1, 1]} + + X, y = make_classification(n_samples=40, n_features=4, + random_state=42) + est = LinearSVC(random_state=42) + est.fit(X, y) + cv_results = cross_validate(est, X, y, cv=5, scoring=custom_scorer) + + score_names = ['tn', 'fp', 'fn', 'tp'] + for name in score_names: + assert "test_{}".format(name) in cv_results From c77afd756bff382385b2b44990bd8826ed85bb59 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 3 Oct 2019 22:00:51 -0400 Subject: [PATCH 05/31] BUG Fix old numpy bug --- sklearn/model_selection/_validation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index a733cde888747..75a99a97909a9 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -278,8 +278,9 @@ def _check_fit_and_score_results(results, error_score): successful_score = test_score_dicts[fit_failed.index(False)] if any(fit_failed) and isinstance(successful_score, dict): - for i in np.flatnonzero(fit_failed): - # error_score is a number + for i, failed in enumerate(fit_failed): + if not failed: + continue test_score_dicts[i] = {name: error_score for name in successful_score} From 5ab8693fcbecbe33d9b30605793c1d68cf203bc2 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 3 Oct 2019 22:17:11 -0400 Subject: [PATCH 06/31] TST Removes tests for error on multimetric --- sklearn/metrics/scorer.py | 2 +- sklearn/model_selection/tests/test_validation.py | 14 ++------------ 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index 6c80894a0ee13..f99ab0bc1e149 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -459,7 +459,7 @@ def _check_multimetric_scoring(estimator, scoring): A dict mapping each scorer name to its validated scorer. """ err_msg_generic = ("scoring should either be a single string or " - "callable for single metric evaluation or a " + "callable or a " "list/tuple of strings or a dict of scorer name " "mapped to the callable for multiple metric " "evaluation. 
Got %s of type %s" diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 9448f6b2b0740..df185918bb2da 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -315,8 +315,8 @@ def test_cross_validate_invalid_scoring_param(): cross_validate, estimator, X, y, scoring=[[make_scorer(precision_score)]]) - error_message_regexp = (".*should either be.*string or callable.*for " - "single.*.*dict.*for multi.*") + error_message_regexp = (".*should either be.*string or callable.*" + ".*.*dict.*for multi.*") # Empty dict should raise invalid scoring error assert_raises_regex(ValueError, "An empty dict", @@ -340,16 +340,6 @@ def test_cross_validate_invalid_scoring_param(): cross_validate, estimator, X, y, scoring={"foo": multiclass_scorer}) - multivalued_scorer = make_scorer(confusion_matrix) - - # Multiclass Scorers that return multiple values are not supported yet - assert_raises_regex(ValueError, "scoring must return a number, got", - cross_validate, SVC(), X, y, - scoring=multivalued_scorer) - assert_raises_regex(ValueError, "scoring must return a number, got", - cross_validate, SVC(), X, y, - scoring={"foo": multivalued_scorer}) - assert_raises_regex(ValueError, "'mse' is not a valid scoring value.", cross_validate, SVC(), X, y, scoring="mse") From e8f8c9fd767090542de0163a661f06fd740e3fd1 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 4 Dec 2019 10:47:34 -0500 Subject: [PATCH 07/31] DOC Indent --- doc/modules/model_evaluation.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 60341e4d2e78b..0cdda4ec1e09b 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -260,8 +260,8 @@ parameter: >>> scoring = {'accuracy': make_scorer(accuracy_score), ... 'prec': 'precision'} -Note that the dict values can either be scorer functions or one of the -predefined metric strings. + Note that the dict values can either be scorer functions or one of the + predefined metric strings. - As a callable that returns a dictionary of scores:: From 5f50a323e56c2ee7d6f9a3ff53ad896497465a78 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 4 Dec 2019 12:36:44 -0500 Subject: [PATCH 08/31] CLN Refactors multimetric check --- sklearn/model_selection/_search.py | 47 ++++++++++++-------------- sklearn/model_selection/_validation.py | 17 +++------- 2 files changed, 26 insertions(+), 38 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index d43c066a3920c..d37edb7f014f1 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -610,6 +610,23 @@ def _run_search(self, evaluate_candidates): """ raise NotImplementedError("_run_search not implemented.") + def _check_multimetric_scores_refit(self, scores_dict): + """Check score contains the string in refit""" + multimetric_refit_msg = ("For multi-metric scoring, the parameter " + "refit must be set to a scorer key or a " + "callable to refit an estimator with the " + "best parameter setting on the whole " + "data and make the best_* attributes " + "available for that metric. If this is " + "not needed, refit should be set to " + "False explicitly. %r was passed." 
+ % self.refit) + if self.refit is not False and ( + not isinstance(self.refit, str) or + # This will work for both dict / list (tuple) + self.refit not in scores_dict) and not callable(self.refit): + raise ValueError(multimetric_refit_msg) + def fit(self, X, y=None, groups=None, **fit_params): """Run fit with all sets of parameters. @@ -635,32 +652,16 @@ def fit(self, X, y=None, groups=None, **fit_params): estimator = self.estimator cv = check_cv(self.cv, y, classifier=is_classifier(estimator)) - multimetric_refit_msg = ("For multi-metric scoring, the parameter " - "refit must be set to a scorer key or a " - "callable to refit an estimator with the " - "best parameter setting on the whole " - "data and make the best_* attributes " - "available for that metric. If this is " - "not needed, refit should be set to " - "False explicitly. %r was passed." - % self.refit) - refit_metric = "score" - scoring_callable = callable(self.scoring) - if scoring_callable: + if callable(self.scoring): scorers = self.scoring - elif (self.scoring is None or isinstance(self.scoring, str)): + elif self.scoring is None or isinstance(self.scoring, str): scorers = check_scoring(self.estimator, self.scoring) else: scorers = _check_multimetric_scoring(self.estimator, self.scoring) + self._check_multimetric_scores_refit(scorers) refit_metric = self.refit - if self.refit is not False and ( - not isinstance(self.refit, str) or - # This will work for both dict / list (tuple) - self.refit not in scorers) and not callable(self.refit): - raise ValueError(multimetric_refit_msg) - X, y, groups = indexable(X, y, groups) # make sure fit_params are sliceable fit_params_values = indexable(*fit_params.values()) @@ -679,7 +680,6 @@ def fit(self, X, y=None, groups=None, **fit_params): return_n_test_samples=True, return_times=True, return_parameters=False, - return_fit_failed=True, error_score=self.error_score, verbose=self.verbose) results = {} @@ -734,11 +734,8 @@ def evaluate_candidates(candidate_params): self.multimetric_ = isinstance(successful_score, dict) # scorer is callable, check refit_metric now - if scoring_callable and self.multimetric_: - if (self.refit is not False and not callable(self.refit) - and (not isinstance(self.refit, str) - or self.refit not in successful_score)): - raise ValueError(multimetric_refit_msg) + if callable(self.scoring) and self.multimetric_: + self._check_multimetric_scores_refit(successful_score) refit_metric = self.refit # For multi-metric evaluation, store the best_index_, best_params_ and diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 3c65d0e8f2ce4..0c2aea24551d8 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -237,7 +237,7 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, clone(estimator), X, y, scorers, train, test, verbose, None, fit_params, return_train_score=return_train_score, return_times=True, return_estimator=return_estimator, - error_score=error_score, return_fit_failed=True) + error_score=error_score) for train, test in cv.split(X, y, groups)) results = _aggregate_list_of_dicts(results, constructor=list) @@ -267,9 +267,6 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, def _check_fit_and_score_results(results, error_score): - """Checks _fit_and_score results. Handles scoring as a callable and - normalizes scores into a list of dictionaries. 
- """ fit_failed = results["fit_failed"] test_score_dicts = results["test_scores"] @@ -438,7 +435,7 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score=False, return_parameters=False, return_n_test_samples=False, return_times=False, return_estimator=False, - error_score=np.nan, return_fit_failed=False): + error_score=np.nan): """Fit estimator and compute scores for a given dataset split. Parameters @@ -499,10 +496,6 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, return_estimator : boolean, optional, default: False Whether to return the fitted estimator. - return_fit_failed : bool, default=False - Whether to return if estimatored failed to fit, when error_score is - numeric. - Returns ------- result: dict with the following attributes @@ -590,11 +583,9 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, raise ValueError("error_score must be the string 'raise' or a" " numeric value. (Hint: if using 'raise', please" " make sure that it has been spelled correctly.)") - if return_fit_failed: - result["fit_failed"] = True + result["fit_failed"] = True else: - if return_fit_failed: - result["fit_failed"] = False + result["fit_failed"] = False fit_time = time.time() - start_time test_scores = _score(estimator, X_test, y_test, scorer) From 57c390a962a558ca52e70b0a5b86c94bac1ca1e1 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 3 Feb 2020 20:25:31 -0500 Subject: [PATCH 09/31] CLN Address comments --- sklearn/model_selection/_search.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 157cc976de359..457af285e1e5f 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -610,7 +610,7 @@ def _run_search(self, evaluate_candidates): """ raise NotImplementedError("_run_search not implemented.") - def _check_multimetric_scores_refit(self, scores_dict): + def _check_refit_for_multimetric(self, scores_dict): """Check score contains the string in refit""" multimetric_refit_msg = ("For multi-metric scoring, the parameter " "refit must be set to a scorer key or a " @@ -660,7 +660,7 @@ def fit(self, X, y=None, groups=None, **fit_params): scorers = check_scoring(self.estimator, self.scoring) else: scorers = _check_multimetric_scoring(self.estimator, self.scoring) - self._check_multimetric_scores_refit(scorers) + self._check_refit_for_multimetric(scorers) refit_metric = self.refit X, y, groups = indexable(X, y, groups) @@ -734,7 +734,7 @@ def evaluate_candidates(candidate_params): # scorer is callable, check refit_metric now if callable(self.scoring) and self.multimetric_: - self._check_multimetric_scores_refit(successful_score) + self._check_refit_for_multimetric(successful_score) refit_metric = self.refit # For multi-metric evaluation, store the best_index_, best_params_ and From 1b28907d52a3620b4d8c2d7b92326bac45546da4 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 5 Feb 2020 09:48:09 -0500 Subject: [PATCH 10/31] CLN Simplifies checking --- sklearn/model_selection/_search.py | 3 +-- sklearn/model_selection/_validation.py | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 457af285e1e5f..39a2178ce5f55 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -787,7 +787,6 @@ def _format_results(self, candidate_params, 
n_splits, out): score_time = results["score_time"] info_dict = _check_fit_and_score_results(results, self.error_score) - score_names = info_dict["score_names"] test_scores = info_dict["test_scores"] if self.return_train_score: @@ -852,7 +851,7 @@ def _store(key_name, array, weights=None, splits=False, rank=False): else: iid = False - for scorer_name in score_names: + for scorer_name in test_scores: # Computed the (weighted) mean and std for test scores alone _store('test_%s' % scorer_name, test_scores[scorer_name], splits=True, rank=True, diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 0cda7a7852710..ef87ced331df1 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -245,7 +245,6 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, results = _aggregate_list_of_dicts(results, constructor=list) info_dict = _check_fit_and_score_results(results, error_score) - score_names = info_dict["score_names"] test_scores = info_dict["test_scores"] if return_estimator: @@ -258,7 +257,7 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, if return_estimator: ret['estimator'] = fitted_estimators - for name in score_names: + for name in test_scores: ret['test_%s' % name] = np.array(test_scores[name]) if return_train_score: train_scores = info_dict["train_scores"] @@ -272,14 +271,18 @@ def _check_fit_and_score_results(results, error_score): fit_failed = results["fit_failed"] test_score_dicts = results["test_scores"] - if all(fit_failed): + failed_indices = [] + for i, failed in enumerate(fit_failed): + if failed: + failed_indices.append(i) + else: + successful_score = test_score_dicts[i] + + if len(failed_indices) == len(fit_failed): raise NotFittedError("All estimators failed to fit") - successful_score = test_score_dicts[fit_failed.index(False)] - if any(fit_failed) and isinstance(successful_score, dict): - for i, failed in enumerate(fit_failed): - if not failed: - continue + if failed_indices and isinstance(successful_score, dict): + for i in failed_indices: test_score_dicts[i] = {name: error_score for name in successful_score} @@ -287,9 +290,6 @@ def _check_fit_and_score_results(results, error_score): # converts single metrics into a list of dictionaries if not isinstance(successful_score, dict): test_score_dicts = [{"score": elm} for elm in test_score_dicts] - output["score_names"] = ["score"] - else: - output["score_names"] = list(successful_score.keys()) output["test_scores"] = _aggregate_list_of_dicts(test_score_dicts) From 2cf9ba8169b0635e8821b4d4140c2044393f893a Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 5 Feb 2020 10:56:35 -0500 Subject: [PATCH 11/31] CLN Simplifies aggregation --- sklearn/model_selection/_search.py | 2 +- sklearn/model_selection/_validation.py | 35 +++++++++++--------------- 2 files changed, 16 insertions(+), 21 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 39a2178ce5f55..7a6bba46ec659 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -780,7 +780,7 @@ def evaluate_candidates(candidate_params): def _format_results(self, candidate_params, n_splits, out): n_candidates = len(candidate_params) - results = _aggregate_list_of_dicts(out, constructor=list) + results = _aggregate_list_of_dicts(out) test_sample_counts = results["n_test_samples"] fit_time = results["fit_time"] diff --git a/sklearn/model_selection/_validation.py 
b/sklearn/model_selection/_validation.py index ef87ced331df1..8c474175868c2 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -242,7 +242,7 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, error_score=error_score) for train, test in cv.split(X, y, groups)) - results = _aggregate_list_of_dicts(results, constructor=list) + results = _aggregate_list_of_dicts(results) info_dict = _check_fit_and_score_results(results, error_score) test_scores = info_dict["test_scores"] @@ -251,23 +251,26 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, fitted_estimators = results["estimator"] ret = {} - ret['fit_time'] = np.array(results["fit_time"]) - ret['score_time'] = np.array(results["score_time"]) + ret['fit_time'] = results["fit_time"] + ret['score_time'] = results["score_time"] if return_estimator: ret['estimator'] = fitted_estimators for name in test_scores: - ret['test_%s' % name] = np.array(test_scores[name]) + ret['test_%s' % name] = test_scores[name] if return_train_score: train_scores = info_dict["train_scores"] key = 'train_%s' % name - ret[key] = np.array(train_scores[name]) + ret[key] = train_scores[name] return ret def _check_fit_and_score_results(results, error_score): + """Aggregate scores in results into a single dictionary of scores. Results + that failed are set to error_score + """ fit_failed = results["fit_failed"] test_score_dicts = results["test_scores"] @@ -1310,19 +1313,14 @@ def learning_curve(estimator, X, y, groups=None, parameters=None, fit_params=None, return_train_score=True, error_score=error_score, return_times=return_times) for train, test in train_test_proportions) - results = _aggregate_list_of_dicts(results, constructor=list) - train_scores = (np.array(results["train_scores"]) - .reshape(-1, n_unique_ticks).T) - - test_scores = (np.array(results["test_scores"]) - .reshape(-1, n_unique_ticks).T) + results = _aggregate_list_of_dicts(results) + train_scores = results["train_scores"].reshape(-1, n_unique_ticks).T + test_scores = results["test_scores"].reshape(-1, n_unique_ticks).T out = [train_scores, test_scores] if return_times: - fit_times = (np.array(results["fit_time"]) - .reshape(-1, n_unique_ticks).T) - score_times = (np.array(results["score_time"]) - .reshape(-1, n_unique_ticks).T) + fit_times = results["fit_time"].reshape(-1, n_unique_ticks).T + score_times = results["score_time"].reshape(-1, n_unique_ticks).T out.extend([fit_times, score_times]) ret = train_sizes_abs, out[0], out[1] @@ -1545,7 +1543,7 @@ def validation_curve(estimator, X, y, param_name, param_range, groups=None, return train_scores, test_scores -def _aggregate_list_of_dicts(elements, constructor=np.asarray): +def _aggregate_list_of_dicts(elements): """Aggregate the list of dicts The aggregated output of _fit_and_score will be a list of dict @@ -1559,9 +1557,6 @@ def _aggregate_list_of_dicts(elements, constructor=np.asarray): List of dicts of the elements for all scorers. This is a flat list, assumed originally to be of row major order. 
- constructor : function, default=np.asarray - Used to combine elements of dictionaries in list - Example ------- @@ -1570,5 +1565,5 @@ def _aggregate_list_of_dicts(elements, constructor=np.asarray): >>> _aggregate_list_of_dicts(elements) # doctest: +SKIP {'a': array([1, 2, 3, 10]), 'b': array([10, 2, 3, 10])} """ - return {key: constructor([elm[key] for elm in elements]) + return {key: np.asarray([elm[key] for elm in elements]) for key in elements[0]} From f336d64a01d77b5d51d441519e06f0e6d593afab Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 5 Feb 2020 11:48:07 -0500 Subject: [PATCH 12/31] CLN Less code the better --- sklearn/model_selection/_validation.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 8c474175868c2..368494ae9a078 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -1535,10 +1535,8 @@ def validation_curve(estimator, X, y, param_name, param_range, groups=None, n_params = len(param_range) results = _aggregate_list_of_dicts(results) - train_scores = (np.asarray(results["train_scores"]) - .reshape(-1, n_params).T) - test_scores = (np.asarray(results["test_scores"]) - .reshape(-1, n_params).T) + train_scores = results["train_scores"].reshape(-1, n_params).T + test_scores = results["test_scores"].reshape(-1, n_params).T return train_scores, test_scores @@ -1560,10 +1558,13 @@ def _aggregate_list_of_dicts(elements): Example ------- - >>> elements = [{'a': 1, 'b':10}, {'a': 2, 'b':2}, {'a': 3, 'b':3}, - ... {'a': 10, 'b': 10}] # doctest: +SKIP - >>> _aggregate_list_of_dicts(elements) # doctest: +SKIP - {'a': array([1, 2, 3, 10]), 'b': array([10, 2, 3, 10])} + >>> elements = [{'a': 1, 'b': 10}, {'a': 2, 'b': 2}, {'a': 3, 'b': 3}, + ... 
{'a': 10, 'b': 10}] + >>> output = _aggregate_list_of_dicts(elements) + >>> output['a'] + array([ 1, 2, 3, 10]) + >>> output['b'] + array([10, 2, 3, 10]) """ return {key: np.asarray([elm[key] for elm in elements]) for key in elements[0]} From a86eaf04ab9f030e0f46ff90431087b880e30d09 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 5 Feb 2020 12:02:32 -0500 Subject: [PATCH 13/31] CLN Moves definition closer to usage --- sklearn/model_selection/_search.py | 18 ++++++++---------- sklearn/model_selection/_validation.py | 11 +++++------ 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 7a6bba46ec659..698fc4e854348 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -780,17 +780,13 @@ def evaluate_candidates(candidate_params): def _format_results(self, candidate_params, n_splits, out): n_candidates = len(candidate_params) - results = _aggregate_list_of_dicts(out) + agg_out = _aggregate_list_of_dicts(out) - test_sample_counts = results["n_test_samples"] - fit_time = results["fit_time"] - score_time = results["score_time"] + test_sample_counts = agg_out["n_test_samples"] + fit_time = agg_out["fit_time"] + score_time = agg_out["score_time"] - info_dict = _check_fit_and_score_results(results, self.error_score) - test_scores = info_dict["test_scores"] - - if self.return_train_score: - train_scores = info_dict["train_scores"] + score_results = _check_fit_and_score_results(agg_out, self.error_score) results = {} @@ -851,13 +847,15 @@ def _store(key_name, array, weights=None, splits=False, rank=False): else: iid = False + test_scores = score_results["test_scores"] for scorer_name in test_scores: # Computed the (weighted) mean and std for test scores alone _store('test_%s' % scorer_name, test_scores[scorer_name], splits=True, rank=True, weights=test_sample_counts if iid else None) if self.return_train_score: - _store('train_%s' % scorer_name, train_scores[scorer_name], + _store('train_%s' % scorer_name, + score_results["train_scores"][scorer_name], splits=True) return results diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 368494ae9a078..06c837dbf603f 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -244,9 +244,6 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, results = _aggregate_list_of_dicts(results) - info_dict = _check_fit_and_score_results(results, error_score) - test_scores = info_dict["test_scores"] - if return_estimator: fitted_estimators = results["estimator"] @@ -257,19 +254,21 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, if return_estimator: ret['estimator'] = fitted_estimators + score_results = _check_fit_and_score_results(results, error_score) + test_scores = score_results["test_scores"] for name in test_scores: ret['test_%s' % name] = test_scores[name] if return_train_score: - train_scores = info_dict["train_scores"] key = 'train_%s' % name - ret[key] = train_scores[name] + ret[key] = score_results["train_scores"][name] return ret def _check_fit_and_score_results(results, error_score): """Aggregate scores in results into a single dictionary of scores. Results - that failed are set to error_score + that failed are set to error_score. `results` are the aggregated output + of `_fit_and_score`. 
""" fit_failed = results["fit_failed"] test_score_dicts = results["test_scores"] From b1782aed857a8d8d4115b89dc8fc06daec96947b Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 6 Feb 2020 17:16:42 -0500 Subject: [PATCH 14/31] CLN Update error handling --- sklearn/model_selection/_search.py | 49 +++++++++++--------- sklearn/model_selection/_validation.py | 63 +++++++++++++------------- 2 files changed, 60 insertions(+), 52 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 698fc4e854348..ea5ee4db7ff33 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -656,12 +656,15 @@ def fit(self, X, y=None, groups=None, **fit_params): refit_metric = "score" if callable(self.scoring): scorers = self.scoring + check_fit_and_score_results = True elif self.scoring is None or isinstance(self.scoring, str): scorers = check_scoring(self.estimator, self.scoring) + check_fit_and_score_results = False else: scorers = _check_multimetric_scoring(self.estimator, self.scoring) self._check_refit_for_multimetric(scorers) refit_metric = self.refit + check_fit_and_score_results = False X, y, groups = indexable(X, y, groups) fit_params = _check_fit_params(X, fit_params) @@ -715,6 +718,8 @@ def evaluate_candidates(candidate_params): .format(n_splits, len(out) // n_candidates)) + if check_fit_and_score_results: + _check_fit_and_score_results(out, self.error_score) all_candidate_params.extend(candidate_params) all_out.extend(out) @@ -725,16 +730,12 @@ def evaluate_candidates(candidate_params): self._run_search(evaluate_candidates) - for out in all_out: - if not out["fit_failed"]: - successful_score = out['test_scores'] - break - - self.multimetric_ = isinstance(successful_score, dict) + sample_score = all_out[0]['test_scores'] + self.multimetric_ = isinstance(sample_score, dict) # scorer is callable, check refit_metric now if callable(self.scoring) and self.multimetric_: - self._check_refit_for_multimetric(successful_score) + self._check_refit_for_multimetric(sample_score) refit_metric = self.refit # For multi-metric evaluation, store the best_index_, best_params_ and @@ -780,13 +781,7 @@ def evaluate_candidates(candidate_params): def _format_results(self, candidate_params, n_splits, out): n_candidates = len(candidate_params) - agg_out = _aggregate_list_of_dicts(out) - - test_sample_counts = agg_out["n_test_samples"] - fit_time = agg_out["fit_time"] - score_time = agg_out["score_time"] - - score_results = _check_fit_and_score_results(agg_out, self.error_score) + out = _aggregate_list_of_dicts(out) results = {} @@ -814,8 +809,8 @@ def _store(key_name, array, weights=None, splits=False, rank=False): results["rank_%s" % key_name] = np.asarray( rankdata(-array_means, method='min'), dtype=np.int32) - _store('fit_time', fit_time) - _store('score_time', score_time) + _store('fit_time', out["fit_time"]) + _store('score_time', out["score_time"]) # Use one MaskedArray and mask all the places where the param is not # applicable for that candidate. 
Use defaultdict as each candidate may # not contain all the params @@ -835,7 +830,7 @@ def _store(key_name, array, weights=None, splits=False, rank=False): results['params'] = candidate_params # NOTE test_sample counts (weights) remain the same for all candidates - test_sample_counts = np.array(test_sample_counts[:n_splits], + test_sample_counts = np.array(out["n_test_samples"][:n_splits], dtype=np.int) if self.iid != 'deprecated': @@ -847,15 +842,27 @@ def _store(key_name, array, weights=None, splits=False, rank=False): else: iid = False - test_scores = score_results["test_scores"] - for scorer_name in test_scores: + test_scores = out["test_scores"] + if isinstance(test_scores[0], dict): + test_scores_dict = _aggregate_list_of_dicts(test_scores) + else: + test_scores_dict = {"score": test_scores} + + if self.return_train_score: + train_scores = out["train_scores"] + if isinstance(test_scores[0], dict): + train_scores_dict = _aggregate_list_of_dicts(train_scores) + else: + train_scores_dict = {"score": train_scores} + + for scorer_name in test_scores_dict: # Computed the (weighted) mean and std for test scores alone - _store('test_%s' % scorer_name, test_scores[scorer_name], + _store('test_%s' % scorer_name, test_scores_dict[scorer_name], splits=True, rank=True, weights=test_sample_counts if iid else None) if self.return_train_score: _store('train_%s' % scorer_name, - score_results["train_scores"][scorer_name], + train_scores_dict[scorer_name], splits=True) return results diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 06c837dbf603f..c330a3d3acf15 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -225,10 +225,13 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, if callable(scoring): scorers = scoring + check_fit_and_score_results = True elif scoring is None or isinstance(scoring, str): scorers = check_scoring(estimator, scoring) + check_fit_and_score_results = False else: scorers = _check_multimetric_scoring(estimator, scoring) + check_fit_and_score_results = False # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. 
@@ -242,6 +245,8 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, error_score=error_score) for train, test in cv.split(X, y, groups)) + if check_fit_and_score_results: + _check_fit_and_score_results(results, error_score) results = _aggregate_list_of_dicts(results) if return_estimator: @@ -254,13 +259,24 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, if return_estimator: ret['estimator'] = fitted_estimators - score_results = _check_fit_and_score_results(results, error_score) - test_scores = score_results["test_scores"] - for name in test_scores: - ret['test_%s' % name] = test_scores[name] + test_scores = results["test_scores"] + if isinstance(test_scores[0], dict): + test_scores_dict = _aggregate_list_of_dicts(test_scores) + else: + test_scores_dict = {"score": test_scores} + + if return_train_score: + train_scores = results["train_scores"] + if isinstance(test_scores[0], dict): + train_scores_dict = _aggregate_list_of_dicts(train_scores) + else: + train_scores_dict = {"score": train_scores} + + for name in test_scores_dict: + ret['test_%s' % name] = test_scores_dict[name] if return_train_score: key = 'train_%s' % name - ret[key] = score_results["train_scores"][name] + ret[key] = train_scores_dict[name] return ret @@ -270,38 +286,23 @@ def _check_fit_and_score_results(results, error_score): that failed are set to error_score. `results` are the aggregated output of `_fit_and_score`. """ - fit_failed = results["fit_failed"] - test_score_dicts = results["test_scores"] - + successful_score = None failed_indices = [] - for i, failed in enumerate(fit_failed): - if failed: + for i, result in enumerate(results): + if result["fit_failed"]: failed_indices.append(i) - else: - successful_score = test_score_dicts[i] + elif successful_score is None: + successful_score = result["test_scores"] - if len(failed_indices) == len(fit_failed): + if successful_score is None: raise NotFittedError("All estimators failed to fit") - if failed_indices and isinstance(successful_score, dict): + if isinstance(successful_score, dict): + formatted_erorr = {name: error_score for name in successful_score} for i in failed_indices: - test_score_dicts[i] = {name: error_score - for name in successful_score} - - output = {} - # converts single metrics into a list of dictionaries - if not isinstance(successful_score, dict): - test_score_dicts = [{"score": elm} for elm in test_score_dicts] - - output["test_scores"] = _aggregate_list_of_dicts(test_score_dicts) - - if "train_scores" in results: - train_score_dicts = results["train_scores"] - if not isinstance(successful_score, dict): - train_score_dicts = [{"score": elm} for elm in train_score_dicts] - output["train_scores"] = _aggregate_list_of_dicts(train_score_dicts) - - return output + results[i]["test_scores"] = formatted_erorr.copy() + if "train_scores" in results[i]: + results[i]["train_scores"] = formatted_erorr.copy() def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None, From c5f9b42006baa34b1aaf8a4429421e7b6a31498f Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Sun, 24 May 2020 17:06:20 -0400 Subject: [PATCH 15/31] REV Less diffs --- sklearn/metrics/_scorer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 08b97fd3cba8b..fa6f8181aeb3f 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -431,7 +431,10 @@ def _check_multimetric_scoring(estimator, scoring): estimator : sklearn estimator instance The estimator for which the scoring will be applied. - scoring : list/tuple or dict + scoring : string, callable, list/tuple, dict or None, default: None + A single string (see :ref:`scoring_parameter`) or a callable + (see :ref:`scoring`) to evaluate the predictions on the test set. + For evaluating multiple metrics, either give a list of (unique) strings or a dict with names as keys and callables as values. @@ -441,6 +444,7 @@ def _check_multimetric_scoring(estimator, scoring): See :ref:`multimetric_grid_search` for an example. + If None the estimator's score method is used. The return value in that case will be ``{'score': }``. If the estimator's score method is not available, a ``TypeError`` is raised. From 0e79b59d79a5625e123f09196829d5c2cfba5301 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sun, 24 May 2020 17:14:04 -0400 Subject: [PATCH 16/31] CLN Address comments --- sklearn/model_selection/_search.py | 15 +++++---- sklearn/model_selection/_validation.py | 42 +++++++++++--------------- 2 files changed, 26 insertions(+), 31 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 7b2a1999ffd50..fa0c3997e1b6a 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -29,7 +29,7 @@ from ._split import check_cv from ._validation import _fit_and_score from ._validation import _aggregate_list_of_dicts -from ._validation import _check_fit_and_score_results +from ._validation import _handle_error_score from ..exceptions import NotFittedError from joblib import Parallel, delayed from ..utils import check_random_state @@ -678,17 +678,20 @@ def fit(self, X, y=None, *, groups=None, **fit_params): cv = check_cv(self.cv, y, classifier=is_classifier(estimator)) refit_metric = "score" + + # If scoring is callable, then error scores must be handled after + # scoring is called. 
if callable(self.scoring): scorers = self.scoring - check_fit_and_score_results = True + should_handle_error_scores = True elif self.scoring is None or isinstance(self.scoring, str): scorers = check_scoring(self.estimator, self.scoring) - check_fit_and_score_results = False + should_handle_error_scores = False else: scorers = _check_multimetric_scoring(self.estimator, self.scoring) self._check_refit_for_multimetric(scorers) refit_metric = self.refit - check_fit_and_score_results = False + should_handle_error_scores = False X, y, groups = indexable(X, y, groups) fit_params = _check_fit_params(X, fit_params) @@ -742,8 +745,8 @@ def evaluate_candidates(candidate_params): .format(n_splits, len(out) // n_candidates)) - if check_fit_and_score_results: - _check_fit_and_score_results(out, self.error_score) + if should_handle_error_scores: + _handle_error_score(out, self.error_score) all_candidate_params.extend(candidate_params) all_out.extend(out) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index ef357f60d1321..b527126788d38 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -234,15 +234,17 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, cv = check_cv(cv, y, classifier=is_classifier(estimator)) + # If scoring is callable, then error scores must be handled after + # scoring is called. if callable(scoring): scorers = scoring - check_fit_and_score_results = True + should_handle_error_scores = True elif scoring is None or isinstance(scoring, str): scorers = check_scoring(estimator, scoring) - check_fit_and_score_results = False + should_handle_error_scores = False else: scorers = _check_multimetric_scoring(estimator, scoring) - check_fit_and_score_results = False + should_handle_error_scores = False # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. @@ -256,8 +258,8 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, error_score=error_score) for train, test in cv.split(X, y, groups)) - if check_fit_and_score_results: - _check_fit_and_score_results(results, error_score) + if should_handle_error_scores: + _handle_error_score(results, error_score) results = _aggregate_list_of_dicts(results) if return_estimator: @@ -292,28 +294,25 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, return ret -def _check_fit_and_score_results(results, error_score): - """Aggregate scores in results into a single dictionary of scores. Results - that failed are set to error_score. `results` are the aggregated output - of `_fit_and_score`. 
- """ - successful_score = None +def _handle_error_score(results, error_score): + """Handle error in results by replacing them with `error_score`.""" + score_names = None failed_indices = [] for i, result in enumerate(results): if result["fit_failed"]: failed_indices.append(i) - elif successful_score is None: - successful_score = result["test_scores"] + elif score_names is None: + score_names = result["test_scores"].keys() - if successful_score is None: + if score_names is None: raise NotFittedError("All estimators failed to fit") - if isinstance(successful_score, dict): - formatted_erorr = {name: error_score for name in successful_score} + if score_names: + formatted_error = {name: error_score for name in score_names} for i in failed_indices: - results[i]["test_scores"] = formatted_erorr.copy() + results[i]["test_scores"] = formatted_error.copy() if "train_scores" in results[i]: - results[i]["train_scores"] = formatted_erorr.copy() + results[i]["train_scores"] = formatted_error.copy() @_deprecate_positional_args @@ -522,25 +521,18 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, train_scores : dict of scorer name -> float Score on training set (for all the scorers), returned only if `return_train_score` is `True`. - test_scores : dict of scorer name -> float Score on testing set (for all the scorers). - n_test_samples : int Number of test samples. - fit_time : float Time spent for fitting in seconds. - score_time : float Time spent for scoring in seconds. - parameters : dict or None The parameters that have been evaluated. - estimator : estimator object The fitted estimator - fit_failed : bool The estimator failed to fit. """ From 49e8c0399f7995123646bd8dbc7b62df3ef4bd88 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sun, 24 May 2020 17:17:36 -0400 Subject: [PATCH 17/31] REV --- sklearn/metrics/_scorer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index fa6f8181aeb3f..2cf1e8f231901 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -423,7 +423,7 @@ def check_scoring(estimator, scoring=None, *, allow_none=False): " None. %r was passed" % scoring) -def _check_multimetric_scoring(estimator, scoring): +def _check_multimetric_scoring(estimator, scoring=None): """Check the scoring parameter in cases when multiple metrics are allowed Parameters From 4f6ecd7c6bce136486b866f12404f4147b152901 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sun, 24 May 2020 17:27:06 -0400 Subject: [PATCH 18/31] STY Flake --- sklearn/model_selection/_search.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index fa0c3997e1b6a..0f3793526265e 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -856,10 +856,6 @@ def _store(key_name, array, weights=None, splits=False, rank=False): # Store a list of param dicts at the key 'params' results['params'] = candidate_params - # NOTE test_sample counts (weights) remain the same for all candidates - test_sample_counts = np.array(out["n_test_samples"][:n_splits], - dtype=np.int) - test_scores = out["test_scores"] if isinstance(test_scores[0], dict): test_scores_dict = _aggregate_list_of_dicts(test_scores) From 4fa5eb6099517dd858e3ae8abd7bcd3a531aaed8 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Sun, 24 May 2020 18:13:53 -0400 Subject: [PATCH 19/31] ENH Fix error --- sklearn/model_selection/_validation.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index b527126788d38..ef10e80ffbe2b 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -296,19 +296,19 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, def _handle_error_score(results, error_score): """Handle error in results by replacing them with `error_score`.""" - score_names = None + successful_score = None failed_indices = [] for i, result in enumerate(results): if result["fit_failed"]: failed_indices.append(i) - elif score_names is None: - score_names = result["test_scores"].keys() + elif successful_score is None: + successful_score = result["test_scores"] - if score_names is None: + if successful_score is None: raise NotFittedError("All estimators failed to fit") - if score_names: - formatted_error = {name: error_score for name in score_names} + if isinstance(successful_score, dict): + formatted_error = {name: error_score for name in successful_score} for i in failed_indices: results[i]["test_scores"] = formatted_error.copy() if "train_scores" in results[i]: From 97b1db2262586c6f7b9075401b25776156d94d36 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sun, 24 May 2020 22:05:56 -0400 Subject: [PATCH 20/31] REV Less diffs --- sklearn/model_selection/_search.py | 8 +++--- sklearn/model_selection/_validation.py | 34 ++++++++++++-------------- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 0f3793526265e..0ffb5021c4294 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -28,7 +28,7 @@ from ..base import MetaEstimatorMixin from ._split import check_cv from ._validation import _fit_and_score -from ._validation import _aggregate_list_of_dicts +from ._validation import _aggregate_score_dicts from ._validation import _handle_error_score from ..exceptions import NotFittedError from joblib import Parallel, delayed @@ -808,7 +808,7 @@ def evaluate_candidates(candidate_params): def _format_results(self, candidate_params, n_splits, out): n_candidates = len(candidate_params) - out = _aggregate_list_of_dicts(out) + out = _aggregate_score_dicts(out) results = {} @@ -858,14 +858,14 @@ def _store(key_name, array, weights=None, splits=False, rank=False): test_scores = out["test_scores"] if isinstance(test_scores[0], dict): - test_scores_dict = _aggregate_list_of_dicts(test_scores) + test_scores_dict = _aggregate_score_dicts(test_scores) else: test_scores_dict = {"score": test_scores} if self.return_train_score: train_scores = out["train_scores"] if isinstance(test_scores[0], dict): - train_scores_dict = _aggregate_list_of_dicts(train_scores) + train_scores_dict = _aggregate_score_dicts(train_scores) else: train_scores_dict = {"score": train_scores} diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index ef10e80ffbe2b..d2c404e94d536 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -260,7 +260,7 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, if should_handle_error_scores: _handle_error_score(results, error_score) - results = _aggregate_list_of_dicts(results) + results = 
_aggregate_score_dicts(results) if return_estimator: fitted_estimators = results["estimator"] @@ -274,14 +274,14 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, test_scores = results["test_scores"] if isinstance(test_scores[0], dict): - test_scores_dict = _aggregate_list_of_dicts(test_scores) + test_scores_dict = _aggregate_score_dicts(test_scores) else: test_scores_dict = {"score": test_scores} if return_train_score: train_scores = results["train_scores"] if isinstance(test_scores[0], dict): - train_scores_dict = _aggregate_list_of_dicts(train_scores) + train_scores_dict = _aggregate_score_dicts(train_scores) else: train_scores_dict = {"score": train_scores} @@ -1327,7 +1327,7 @@ def learning_curve(estimator, X, y, *, groups=None, parameters=None, fit_params=None, return_train_score=True, error_score=error_score, return_times=return_times) for train, test in train_test_proportions) - results = _aggregate_list_of_dicts(results) + results = _aggregate_score_dicts(results) train_scores = results["train_scores"].reshape(-1, n_unique_ticks).T test_scores = results["test_scores"].reshape(-1, n_unique_ticks).T out = [train_scores, test_scores] @@ -1551,15 +1551,15 @@ def validation_curve(estimator, X, y, *, param_name, param_range, groups=None, for train, test in cv.split(X, y, groups) for v in param_range) n_params = len(param_range) - results = _aggregate_list_of_dicts(results) + results = _aggregate_score_dicts(results) train_scores = results["train_scores"].reshape(-1, n_params).T test_scores = results["test_scores"].reshape(-1, n_params).T return train_scores, test_scores -def _aggregate_list_of_dicts(elements): - """Aggregate the list of dicts +def _aggregate_score_dicts(scores): + """Aggregate the list of dict to dict of np ndarray The aggregated output of _fit_and_score will be a list of dict of form [{'prec': 0.1, 'acc':1.0}, {'prec': 0.1, 'acc':1.0}, ...] @@ -1568,20 +1568,18 @@ def _aggregate_list_of_dicts(elements): Parameters ---------- - elements : list of dict - List of dicts of the elements for all scorers. This is a flat list, + scores : list of dict + List of dicts of the scores for all scorers. This is a flat list, assumed originally to be of row major order. Example ------- - >>> elements = [{'a': 1, 'b': 10}, {'a': 2, 'b': 2}, {'a': 3, 'b': 3}, - ... {'a': 10, 'b': 10}] - >>> output = _aggregate_list_of_dicts(elements) - >>> output['a'] - array([ 1, 2, 3, 10]) - >>> output['b'] - array([10, 2, 3, 10]) + >>> scores = [{'a': 1, 'b':10}, {'a': 2, 'b':2}, {'a': 3, 'b':3}, + ... {'a': 10, 'b': 10}] # doctest: +SKIP + >>> _aggregate_score_dicts(scores) # doctest: +SKIP + {'a': array([1, 2, 3, 10]), + 'b': array([10, 2, 3, 10])} """ - return {key: np.asarray([elm[key] for elm in elements]) - for key in elements[0]} + return {key: np.asarray([score[key] for score in scores]) + for key in scores[0]} From 286bb86d6e2587ff34ddc959a30eaec879a8ba1c Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 25 May 2020 00:01:44 -0400 Subject: [PATCH 21/31] DOC Adds comments --- sklearn/model_selection/_search.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 0ffb5021c4294..34068b4645eee 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -757,6 +757,8 @@ def evaluate_candidates(candidate_params): self._run_search(evaluate_candidates) + # multimetric is determined here based on test_scores. 
This is + # to support callable self.scoring sample_score = all_out[0]['test_scores'] self.multimetric_ = isinstance(sample_score, dict) From 5da1571921fa2f9df1a4dfb17ebd9c68e80527e3 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 8 Jul 2020 15:58:18 -0400 Subject: [PATCH 22/31] CLN Removes some state --- sklearn/model_selection/_search.py | 40 +++++++++--------------- sklearn/model_selection/_validation.py | 42 ++++++++++++++------------ 2 files changed, 37 insertions(+), 45 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 8a113d9432188..0eeabbd25a1ef 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -29,7 +29,8 @@ from ._split import check_cv from ._validation import _fit_and_score from ._validation import _aggregate_score_dicts -from ._validation import _handle_error_score +from ._validation import _insert_error_scores +from ._validation import _normalize_score_results from ..exceptions import NotFittedError from joblib import Parallel, delayed from ..utils import check_random_state @@ -700,19 +701,14 @@ def fit(self, X, y=None, *, groups=None, **fit_params): refit_metric = "score" - # If scoring is callable, then error scores must be handled after - # scoring is called. if callable(self.scoring): scorers = self.scoring - should_handle_error_scores = True elif self.scoring is None or isinstance(self.scoring, str): scorers = check_scoring(self.estimator, self.scoring) - should_handle_error_scores = False else: scorers = _check_multimetric_scoring(self.estimator, self.scoring) self._check_refit_for_multimetric(scorers) refit_metric = self.refit - should_handle_error_scores = False X, y, groups = indexable(X, y, groups) fit_params = _check_fit_params(X, fit_params) @@ -773,8 +769,11 @@ def evaluate_candidates(candidate_params): .format(n_splits, len(out) // n_candidates)) - if should_handle_error_scores: - _handle_error_score(out, self.error_score) + # For callabe self.scoring, the return type is only know after + # calling. If the return type is a dictionary, the error scores + # can now be inserted with the correct key. + if callable(self.scoring): + _insert_error_scores(out, self.error_score) all_candidate_params.extend(candidate_params) all_out.extend(out) @@ -785,14 +784,14 @@ def evaluate_candidates(candidate_params): self._run_search(evaluate_candidates) - # multimetric is determined here based on test_scores. 
This is - # to support callable self.scoring - sample_score = all_out[0]['test_scores'] - self.multimetric_ = isinstance(sample_score, dict) + # multimetric is determined here because in the case of a callable + # self.scoring the return type is only known after calling + first_test_score = all_out[0]['test_scores'] + self.multimetric_ = isinstance(first_test_score, dict) - # scorer is callable, check refit_metric now + # check refit_metric now for a callabe scorer that is multimetric if callable(self.scoring) and self.multimetric_: - self._check_refit_for_multimetric(sample_score) + self._check_refit_for_multimetric(first_test_score) refit_metric = self.refit # For multi-metric evaluation, store the best_index_, best_params_ and @@ -886,18 +885,9 @@ def _store(key_name, array, weights=None, splits=False, rank=False): # Store a list of param dicts at the key 'params' results['params'] = candidate_params - test_scores = out["test_scores"] - if isinstance(test_scores[0], dict): - test_scores_dict = _aggregate_score_dicts(test_scores) - else: - test_scores_dict = {"score": test_scores} - + test_scores_dict = _normalize_score_results(out["test_scores"]) if self.return_train_score: - train_scores = out["train_scores"] - if isinstance(test_scores[0], dict): - train_scores_dict = _aggregate_score_dicts(train_scores) - else: - train_scores_dict = {"score": train_scores} + train_scores_dict = _normalize_score_results(out["train_scores"]) for scorer_name in test_scores_dict: # Computed the (weighted) mean and std for test scores alone diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 287365bf864ed..e75dfbe532976 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -234,17 +234,12 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, cv = check_cv(cv, y, classifier=is_classifier(estimator)) - # If scoring is callable, then error scores must be handled after - # scoring is called. if callable(scoring): scorers = scoring - should_handle_error_scores = True elif scoring is None or isinstance(scoring, str): scorers = check_scoring(estimator, scoring) - should_handle_error_scores = False else: scorers = _check_multimetric_scoring(estimator, scoring) - should_handle_error_scores = False # We clone the estimator to make sure that all the folds are # independent, and that it is pickle-able. @@ -258,8 +253,12 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, error_score=error_score) for train, test in cv.split(X, y, groups)) - if should_handle_error_scores: - _handle_error_score(results, error_score) + # For callabe scoring, the return type is only know after calling. If the + # return type is a dictionary, the error scores can now be inserted with + # the correct key. 
+ if callable(scoring): + _insert_error_scores(results, error_score) + results = _aggregate_score_dicts(results) if return_estimator: @@ -272,18 +271,9 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, if return_estimator: ret['estimator'] = fitted_estimators - test_scores = results["test_scores"] - if isinstance(test_scores[0], dict): - test_scores_dict = _aggregate_score_dicts(test_scores) - else: - test_scores_dict = {"score": test_scores} - + test_scores_dict = _normalize_score_results(results["test_scores"]) if return_train_score: - train_scores = results["train_scores"] - if isinstance(test_scores[0], dict): - train_scores_dict = _aggregate_score_dicts(train_scores) - else: - train_scores_dict = {"score": train_scores} + train_scores_dict = _normalize_score_results(results["train_scores"]) for name in test_scores_dict: ret['test_%s' % name] = test_scores_dict[name] @@ -294,8 +284,11 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, return ret -def _handle_error_score(results, error_score): - """Handle error in results by replacing them with `error_score`.""" +def _insert_error_scores(results, error_score): + """Insert error in results by replacing them with `error_score`. + + This only applies to dictionaries scores because `_fit_and_score` will + handle the single metric case.""" successful_score = None failed_indices = [] for i, result in enumerate(results): @@ -315,6 +308,15 @@ def _handle_error_score(results, error_score): results[i]["train_scores"] = formatted_error.copy() +def _normalize_score_results(scores, scaler_score_key='score'): + """Creates a scoring dictionary based on the type of `scores`""" + if isinstance(scores[0], dict): + # multimetric scoring + return _aggregate_score_dicts(scores) + # scaler + return {scaler_score_key: scores} + + @_deprecate_positional_args def cross_val_score(estimator, X, y=None, *, groups=None, scoring=None, cv=None, n_jobs=None, verbose=0, fit_params=None, From e541de34c739bd74941c05e2332b46c4c2a13f10 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 9 Jul 2020 10:17:21 -0400 Subject: [PATCH 23/31] CLN Address comments --- sklearn/metrics/_scorer.py | 13 ++----------- sklearn/model_selection/_validation.py | 12 ++++++------ 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 16455c7dd8c50..89bf37ffe0711 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -423,7 +423,7 @@ def check_scoring(estimator, scoring=None, *, allow_none=False): " None. %r was passed" % scoring) -def _check_multimetric_scoring(estimator, scoring=None): +def _check_multimetric_scoring(estimator, scoring): """Check the scoring parameter in cases when multiple metrics are allowed Parameters @@ -431,24 +431,15 @@ def _check_multimetric_scoring(estimator, scoring=None): estimator : sklearn estimator instance The estimator for which the scoring will be applied. - scoring : str, callable, list, tuple or dict, default=None + scoring : list, tuple or dict A single string (see :ref:`scoring_parameter`) or a callable (see :ref:`scoring`) to evaluate the predictions on the test set. For evaluating multiple metrics, either give a list of (unique) strings or a dict with names as keys and callables as values. - NOTE that when using custom scorers, each scorer should return a single - value. Metric functions returning a list/array of values can be wrapped - into multiple scorers that return one value each. 
- See :ref:`multimetric_grid_search` for an example. - If None the estimator's score method is used. - The return value in that case will be ``{'score': }``. - If the estimator's score method is not available, a ``TypeError`` - is raised. - Returns ------- scorers_dict : dict diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index e75dfbe532976..b31bdfeb1c714 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -289,19 +289,19 @@ def _insert_error_scores(results, error_score): This only applies to dictionaries scores because `_fit_and_score` will handle the single metric case.""" - successful_score = None + score_names = None failed_indices = [] for i, result in enumerate(results): if result["fit_failed"]: failed_indices.append(i) - elif successful_score is None: - successful_score = result["test_scores"] + elif score_names is None: + score_names = result["test_scores"].keys() - if successful_score is None: + if score_names is None: raise NotFittedError("All estimators failed to fit") - if isinstance(successful_score, dict): - formatted_error = {name: error_score for name in successful_score} + if isinstance(score_names, dict): + formatted_error = {name: error_score for name in score_names} for i in failed_indices: results[i]["test_scores"] = formatted_error.copy() if "train_scores" in results[i]: From 657ef893a7ac2d7f42025cecaee7925b39652a85 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 9 Jul 2020 11:08:30 -0400 Subject: [PATCH 24/31] BUG Fix score --- sklearn/model_selection/_validation.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index ac30159ed3b5c..eac1082a97e4f 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -284,21 +284,21 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, def _insert_error_scores(results, error_score): """Insert error in results by replacing them with `error_score`. - This only applies to dictionaries scores because `_fit_and_score` will + This only applies to multimetric scores because `_fit_and_score` will handle the single metric case.""" - score_names = None + successful_score = None failed_indices = [] for i, result in enumerate(results): if result["fit_failed"]: failed_indices.append(i) - elif score_names is None: - score_names = result["test_scores"].keys() + elif successful_score is None: + successful_score = result["test_scores"] - if score_names is None: + if successful_score is None: raise NotFittedError("All estimators failed to fit") - if isinstance(score_names, dict): - formatted_error = {name: error_score for name in score_names} + if isinstance(successful_score, dict): + formatted_error = {name: error_score for name in successful_score} for i in failed_indices: results[i]["test_scores"] = formatted_error.copy() if "train_scores" in results[i]: From b0cdc570eb103294c66fcbe660411f3fbdce3720 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Thu, 9 Jul 2020 13:36:36 -0400 Subject: [PATCH 25/31] CLN Adds to glossary --- doc/glossary.rst | 8 ++++---- sklearn/metrics/_scorer.py | 10 ++++------ sklearn/model_selection/tests/test_validation.py | 4 ++-- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/doc/glossary.rst b/doc/glossary.rst index 86cb3c06f5634..42e746c38b9ec 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -1583,10 +1583,10 @@ functions or non-estimator constructors. in the User Guide. Where multiple metrics can be evaluated, ``scoring`` may be given - either as a list of unique strings or a dictionary with names as keys - and callables as values. Note that this does *not* specify which score - function is to be maximized, and another parameter such as ``refit`` - maybe used for this purpose. + either as a list of unique strings, a dictionary with names as keys and + callables as values or a callable that returns a dictionary. Note that + this does *not* specify which score function is to be maximized, and + another parameter such as ``refit`` maybe used for this purpose. The ``scoring`` parameter is validated and interpreted using diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 89bf37ffe0711..0852955c72cad 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -445,12 +445,10 @@ def _check_multimetric_scoring(estimator, scoring): scorers_dict : dict A dict mapping each scorer name to its validated scorer. """ - err_msg_generic = ("scoring should either be a single string or " - "callable or a " - "list/tuple of strings or a dict of scorer name " - "mapped to the callable for multiple metric " - "evaluation. Got %s of type %s" - % (repr(scoring), type(scoring))) + err_msg_generic = ( + f"scoring is invalid (got {scoring!r}). Refer to the " + "scoring glossary for details: " + "https://scikit-learn.org/stable/glossary.html#term-scoring ") if isinstance(scoring, (list, tuple, set)): err_msg = ("The list/tuple elements must be unique " diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 8c42354e87344..6e1faa1088075 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -318,8 +318,8 @@ def test_cross_validate_invalid_scoring_param(): cross_validate, estimator, X, y, scoring=[[make_scorer(precision_score)]]) - error_message_regexp = (".*should either be.*string or callable.*" - ".*.*dict.*for multi.*") + error_message_regexp = (".*scoring is invalid.*Refer to the scoring " + "glossary for details:.*") # Empty dict should raise invalid scoring error assert_raises_regex(ValueError, "An empty dict", From 714372f66f59035e86ba5435c7c7e3da46d0c386 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 9 Jul 2020 15:16:13 -0400 Subject: [PATCH 26/31] CLN Uses f-strings --- sklearn/metrics/_scorer.py | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 0852955c72cad..b824b9b0cbcb8 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -448,7 +448,7 @@ def _check_multimetric_scoring(estimator, scoring): err_msg_generic = ( f"scoring is invalid (got {scoring!r}). 
Refer to the " "scoring glossary for details: " - "https://scikit-learn.org/stable/glossary.html#term-scoring ") + "https://scikit-learn.org/stable/glossary.html#term-scoring") if isinstance(scoring, (list, tuple, set)): err_msg = ("The list/tuple elements must be unique " @@ -462,34 +462,30 @@ def _check_multimetric_scoring(estimator, scoring): raise ValueError(err_msg) if len(keys) != len(scoring): - raise ValueError(err_msg + "Duplicate elements were found in" - " the given list. %r" % repr(scoring)) + raise ValueError(f"{err_msg} Duplicate elements were found in" + f" the given list. {scoring!r}") elif len(keys) > 0: if not all(isinstance(k, str) for k in keys): if any(callable(k) for k in keys): - raise ValueError(err_msg + - "One or more of the elements were " - "callables. Use a dict of score name " - "mapped to the scorer callable. " - "Got %r" % repr(scoring)) + raise ValueError(f"{err_msg} One or more of the elements " + "were callables. Use a dict of score " + "name mapped to the scorer callable. " + f"Got {scoring!r}") else: - raise ValueError(err_msg + - "Non-string types were found in " - "the given list. Got %r" - % repr(scoring)) + raise ValueError(f"{err_msg} Non-string types were found " + f"in the given list. Got {scoring!r}") scorers = {scorer: check_scoring(estimator, scoring=scorer) for scorer in scoring} else: - raise ValueError(err_msg + - "Empty list was given. %r" % repr(scoring)) + raise ValueError(f"{err_msg} Empty list was given. {scoring!r}") elif isinstance(scoring, dict): keys = set(scoring) if not all(isinstance(k, str) for k in keys): raise ValueError("Non-string types were found in the keys of " - "the given dict. scoring=%r" % repr(scoring)) + f"the given dict. scoring={scoring!r}") if len(keys) == 0: - raise ValueError("An empty dict was passed. %r" % repr(scoring)) + raise ValueError(f"An empty dict was passed. 
{scoring!r}") scorers = {key: check_scoring(estimator, scoring=scorer) for key, scorer in scoring.items()} else: From 346f8e3fdbeaa0893f426f8c81126a63b1ec7b64 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 10 Jul 2020 20:14:57 +0200 Subject: [PATCH 27/31] ENH create a generator of applicable metrics depending on the target y --- sklearn/metrics/__init__.py | 2 + sklearn/metrics/_scorer.py | 137 +++++++++++++++++++- sklearn/metrics/tests/test_score_objects.py | 58 ++++++++- 3 files changed, 195 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index be28005631963..36a54b88a50cc 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -74,6 +74,7 @@ from ._scorer import check_scoring from ._scorer import make_scorer from ._scorer import SCORERS +from ._scorer import get_applicable_scorers from ._scorer import get_scorer from ._plot.roc_curve import plot_roc_curve @@ -109,6 +110,7 @@ 'f1_score', 'fbeta_score', 'fowlkes_mallows_score', + 'get_applicable_scorers', 'get_scorer', 'hamming_loss', 'hinge_loss', diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index b824b9b0cbcb8..a52a3b88edaa0 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -18,9 +18,12 @@ # Arnaud Joly # License: Simplified BSD +from collections import Counter +from collections import namedtuple from collections.abc import Iterable +from copy import deepcopy +from inspect import signature from functools import partial -from collections import Counter import numpy as np @@ -686,3 +689,135 @@ def make_scorer(score_func, *, greater_is_better=True, needs_proba=False, qualified_name = '{0}_{1}'.format(name, average) SCORERS[qualified_name] = make_scorer(metric, pos_label=None, average=average) + +ScorerProperty = namedtuple( + "ScorerProperty", ["scorer", "target_type_supported"], +) + +SCORERS_PROPERTY = dict( + explained_variance=ScorerProperty( + scorer=explained_variance_scorer, + target_type_supported=("continuous", "continuous-multioutput"), + ), + r2=ScorerProperty( + scorer=r2_scorer, + target_type_supported=("continuous", "continuous-multioutput"), + ), + max_error=ScorerProperty( + scorer=max_error_scorer, + target_type_supported=("continuous",), + ), + neg_median_absolute_error=ScorerProperty( + scorer=neg_median_absolute_error_scorer, + target_type_supported=("continuous", "continuous-multioutput"), + ), + neg_mean_absolute_error=ScorerProperty( + scorer=neg_mean_absolute_error_scorer, + target_type_supported=("continuous", "continuous-multioutput"), + ), + neg_mean_absolute_percentage_error=ScorerProperty( + scorer=neg_mean_absolute_percentage_error_scorer, + target_type_supported=("continuous", "continuous-multioutput"), + ), + neg_mean_squared_error=ScorerProperty( + scorer=neg_mean_squared_error_scorer, + target_type_supported=("continuous", "continuous-multioutput"), + ), + neg_mean_squared_log_error=ScorerProperty( + scorer=neg_mean_squared_log_error_scorer, + target_type_supported=("continuous", "continuous-multioutput"), + ), + neg_root_mean_squared_error=ScorerProperty( + scorer=neg_root_mean_squared_error_scorer, + target_type_supported=("continuous", "continuous-multioutput"), + ), + neg_mean_poisson_deviance=ScorerProperty( + scorer=neg_mean_poisson_deviance_scorer, + target_type_supported=("continuous",), + ), + neg_mean_gamma_deviance=ScorerProperty( + scorer=neg_mean_gamma_deviance_scorer, + target_type_supported=("continuous",), + ), + accuracy=ScorerProperty( + 
scorer=accuracy_scorer, + target_type_supported=("binary", "multiclass", "multilabel-indicator"), + ), + roc_auc=ScorerProperty( + scorer=roc_auc_scorer, + target_type_supported=("binary", "multiclass", "multilabel-indicator"), + ), + balanced_accuracy=ScorerProperty( + scorer=balanced_accuracy_scorer, + target_type_supported=("binary", "multiclass"), + ), + precision=ScorerProperty( + scorer=make_scorer(precision_score), + target_type_supported=("binary", "multilabel-indicator"), + ), + recall=ScorerProperty( + scorer=make_scorer(recall_score), + target_type_supported=("binary", "multilabel-indicator"), + ), + f1=ScorerProperty( + scorer=make_scorer(f1_score), + target_type_supported=("binary", "multilabel-indicator"), + ), + jaccard=ScorerProperty( + scorer=make_scorer(jaccard_score), + target_type_supported=("binary", "multilabel-indicator"), + ), + average_precision=ScorerProperty( + scorer=average_precision_scorer, + target_type_supported=("binary", "multilabel-indicator"), + ), + neg_log_loss=ScorerProperty( + scorer=neg_log_loss_scorer, + target_type_supported=("binary", "multiclass", "multilabel-indicator"), + ), + neg_brier_score=ScorerProperty( + scorer=neg_brier_score_scorer, + target_type_supported=("binary", "multiclass"), + ), +) + + +def get_applicable_scorers(y, **scorers_params): + """Utility providing scorers to be used on `y`. + + This utility creates a dictionary containing the scorers which can be used + on `y`. The dictionary returned can be used directly in a + :class:`~sklearn.model_selection.GridSearchCV`. + + Additional parameters taken by the different metrics can be passed as + keyword argument. + + Parameters + ---------- + y : array-like + The target used to infer the metrics which can be used. + + **scorers_params + Additional parameters to be passed to the scorers when present in their + signature. + + Returns + ------- + scorers : dict + A dictionary containing the scorer name as key and a scorer callable as + value. 
+ """ + target_type = type_of_target(y) + + scorers = {} + for scorer_name, scorer_property in SCORERS_PROPERTY.items(): + if target_type in scorer_property.target_type_supported: + scorers[scorer_name] = deepcopy(scorer_property.scorer) + scorer_sig = signature(scorers[scorer_name]._score_func) + for param_name, param_value in scorers_params.items(): + if param_name in scorer_sig.parameters: + scorers[scorer_name]._kwargs[param_name] = param_value + + if not scorers: + raise ValueError("No compatible scorer with the target 'y' was found.") + return scorers diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 484edd3e751ca..f105db0af9a4f 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -25,7 +25,12 @@ _MultimetricScorer, _check_multimetric_scoring) from sklearn.metrics import accuracy_score -from sklearn.metrics import make_scorer, get_scorer, SCORERS +from sklearn.metrics import ( + get_applicable_scorers, + get_scorer, + make_scorer, + SCORERS +) from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import LinearSVC from sklearn.pipeline import make_pipeline @@ -729,3 +734,54 @@ def test_multiclass_roc_no_proba_scorer_errors(scorer_name): msg = "'Perceptron' object has no attribute 'predict_proba'" with pytest.raises(AttributeError, match=msg): scorer(lr, X, y) + + +@pytest.mark.parametrize( + "Estimator, X, y", + [(LogisticRegression, *make_classification(n_classes=2)), + (LogisticRegression, *make_classification( + n_classes=3, n_clusters_per_class=1 + )), + (LogisticRegression, *make_multilabel_classification()), + (Ridge, *make_regression(n_targets=1)), + (Ridge, *make_regression(n_targets=2))], + ids=[ + "binary-classification", + "multiclass-classification", + "multilabel-classification", + "regression", + "multioutput-regression", + ] +) +def _generate_scorer(Estimator, X, y): + # smoke test to check that we can compute the score on the expected + # dataset + scorers = get_applicable_scorers(y) + estimator = Estimator().fit(X, y) + for scorer_name in scorers: + yield estimator, X, y, scorers[scorer_name] + + +def _parametrize_scorers_from_target(estimator_data_ids): + check_scorers, check_scorers_ids = zip(*[ + ((Estimator, X, y, scorer), f"{scorer_name}-{problem_id}") + for problem_id, Estimator, X, y in estimator_data_ids + for scorer_name, scorer in get_applicable_scorers(y).items() + ]) + + return pytest.mark.parametrize( + "Estimator, X, y, scorer", check_scorers, ids=check_scorers_ids, + ) + + +@_parametrize_scorers_from_target( + [("binary", LogisticRegression, *make_classification(n_classes=2)), + ("multiclass", LogisticRegression, + *make_classification(n_classes=3, n_clusters_per_class=1)), + ("multilabel", LogisticRegression, *make_multilabel_classification()), + ("continuous", Ridge, *make_regression(n_targets=1)), + ("continuous-multioutput", Ridge, *make_regression(n_targets=2))] +) +def test_get_applicable_scorers_smoke_test(Estimator, X, y, scorer): + estimator = Estimator().fit(X, y) + scorer(estimator, X, y) From 6e8be2add35636b514f76e8af1fa763d26229b5f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 10 Jul 2020 23:53:26 +0200 Subject: [PATCH 28/31] iter --- sklearn/metrics/_scorer.py | 47 ++++++++++++++------- sklearn/metrics/tests/test_score_objects.py | 7 +-- 2 files changed, 36 insertions(+), 18 deletions(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index a52a3b88edaa0..d98f1c57639d9 100644 
--- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -745,23 +745,27 @@ def make_scorer(score_func, *, greater_is_better=True, needs_proba=False, ), roc_auc=ScorerProperty( scorer=roc_auc_scorer, - target_type_supported=("binary", "multiclass", "multilabel-indicator"), + target_type_supported=("binary", "multilabel-indicator"), ), - balanced_accuracy=ScorerProperty( - scorer=balanced_accuracy_scorer, - target_type_supported=("binary", "multiclass"), + roc_auc_ovr=ScorerProperty( + scorer=roc_auc_ovr_scorer, + target_type_supported=("multiclass"), ), - precision=ScorerProperty( - scorer=make_scorer(precision_score), - target_type_supported=("binary", "multilabel-indicator"), + roc_auc_ovo=ScorerProperty( + scorer=roc_auc_ovo_scorer, + target_type_supported=("multiclass"), ), - recall=ScorerProperty( - scorer=make_scorer(recall_score), - target_type_supported=("binary", "multilabel-indicator"), + roc_auc_ovr_weighted=ScorerProperty( + scorer=roc_auc_ovr_weighted_scorer, + target_type_supported=("multiclass"), ), - f1=ScorerProperty( - scorer=make_scorer(f1_score), - target_type_supported=("binary", "multilabel-indicator"), + roc_auc_ovo_weighted=ScorerProperty( + scorer=roc_auc_ovo_weighted_scorer, + target_type_supported=("multiclass"), + ), + balanced_accuracy=ScorerProperty( + scorer=balanced_accuracy_scorer, + target_type_supported=("binary", "multiclass"), ), jaccard=ScorerProperty( scorer=make_scorer(jaccard_score), @@ -773,14 +777,27 @@ def make_scorer(score_func, *, greater_is_better=True, needs_proba=False, ), neg_log_loss=ScorerProperty( scorer=neg_log_loss_scorer, - target_type_supported=("binary", "multiclass", "multilabel-indicator"), + target_type_supported=("binary", "multiclass"), ), neg_brier_score=ScorerProperty( scorer=neg_brier_score_scorer, - target_type_supported=("binary", "multiclass"), + target_type_supported=("binary"), ), ) +for name, metric in [('precision', precision_score), + ('recall', recall_score), ('f1', f1_score), + ('jaccard', jaccard_score)]: + SCORERS_PROPERTY[name] = ScorerProperty( + scorer=make_scorer(metric, average='binary'), + target_type_supported=("binary",), + ) + for average in ['macro', 'micro', 'samples', 'weighted']: + qualified_name = f'{name}_{average}' + SCORERS_PROPERTY[qualified_name] = ScorerProperty( + scorer=make_scorer(metric, pos_label=None, average=average), + target_type_supported=("multilabel-indicator"), + ) def get_applicable_scorers(y, **scorers_params): """Utility providing scorers to be used on `y`. 
diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index f105db0af9a4f..c082ecf7f3aba 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -764,7 +764,8 @@ def _generate_scorer(Estimator, X, y): def _parametrize_scorers_from_target(estimator_data_ids): check_scorers, check_scorers_ids = zip(*[ - ((Estimator, X, y, scorer), f"{scorer_name}-{problem_id}") + ((Estimator, X, np.abs(y) - np.min(y), scorer), + f"{scorer_name}-{problem_id}") for problem_id, Estimator, X, y in estimator_data_ids for scorer_name, scorer in get_applicable_scorers(y).items() ]) @@ -778,9 +779,9 @@ def _parametrize_scorers_from_target(estimator_data_ids): [("binary", LogisticRegression, *make_classification(n_classes=2)), ("multiclass", LogisticRegression, *make_classification(n_classes=3, n_clusters_per_class=1)), - ("multilabel", LogisticRegression, *make_multilabel_classification()), + ("multilabel", DecisionTreeClassifier, *make_multilabel_classification()), ("continuous", Ridge, *make_regression(n_targets=1)), - ("continuous-multioutput", Ridge, *make_regression(n_targets=2))] + ("continuous-multioutput", Ridge, *make_regression(n_targets=2))] ) def test_get_applicable_scorers_smoke_test(Estimator, X, y, scorer): estimator = Estimator().fit(X, y) From 5a3bab16c50f9a3e5bd7bcc7e6ef14225f56fa4f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 11 Jul 2020 00:00:44 +0200 Subject: [PATCH 29/31] iter --- sklearn/metrics/tests/test_score_objects.py | 29 +++------------------ 1 file changed, 3 insertions(+), 26 deletions(-) diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index c082ecf7f3aba..b4429981aa731 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -736,32 +736,6 @@ def test_multiclass_roc_no_proba_scorer_errors(scorer_name): scorer(lr, X, y) -@pytest.mark.parametrize( - "Estimator, X, y", - [(LogisticRegression, *make_classification(n_classes=2)), - (LogisticRegression, *make_classification( - n_classes=3, n_clusters_per_class=1 - )), - (LogisticRegression, *make_multilabel_classification()), - (Ridge, *make_regression(n_targets=1)), - (Ridge, *make_regression(n_targets=2))], - ids=[ - "binary-classification", - "multiclass-classification", - "multilabel-classification", - "regression", - "multioutput-regression", - ] -) -def _generate_scorer(Estimator, X, y): - # smoke test to check that we can compute the score on the expected - # dataset - scorers = get_applicable_scorers(y) - estimator = Estimator().fit(X, y) - for scorer_name in scorers: - yield estimator, X, y, scorers[scorer_name] - - def _parametrize_scorers_from_target(estimator_data_ids): check_scorers, check_scorers_ids = zip(*[ ((Estimator, X, np.abs(y) - np.min(y), scorer), @@ -775,6 +749,9 @@ def _parametrize_scorers_from_target(estimator_data_ids): ) +@pytest.mark.filterwarnings( + "ignore::sklearn.exceptions.UndefinedMetricWarning" +) @_parametrize_scorers_from_target( [("binary", LogisticRegression, *make_classification(n_classes=2)), ("multiclass", LogisticRegression, From 8732aa4ce4c16a6147dbcae6858b7e4eee04d977 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 11 Jul 2020 00:37:44 +0200 Subject: [PATCH 30/31] iter --- sklearn/metrics/tests/test_score_objects.py | 58 +++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/sklearn/metrics/tests/test_score_objects.py 
b/sklearn/metrics/tests/test_score_objects.py index b4429981aa731..22d1f2d971f93 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -25,6 +25,7 @@ _MultimetricScorer, _check_multimetric_scoring) from sklearn.metrics import accuracy_score +from sklearn.metrics import average_precision_score from sklearn.metrics import ( get_applicable_scorers, get_scorer, @@ -40,6 +41,7 @@ from sklearn.datasets import make_blobs from sklearn.datasets import make_classification, make_regression from sklearn.datasets import make_multilabel_classification +from sklearn.datasets import load_breast_cancer from sklearn.datasets import load_diabetes from sklearn.model_selection import train_test_split, cross_val_score from sklearn.model_selection import GridSearchCV @@ -761,5 +763,61 @@ def _parametrize_scorers_from_target(estimator_data_ids): ("continuous-multioutput", Ridge, *make_regression(n_targets=2))] ) def test_get_applicable_scorers_smoke_test(Estimator, X, y, scorer): + # smoke test to check that we can use the score on the registered problem estimator = Estimator().fit(X, y) scorer(estimator, X, y) + + +@pytest.mark.filterwarnings( + "ignore::sklearn.exceptions.UndefinedMetricWarning" +) +@pytest.mark.parametrize( + "Estimator, X, y", + [(LogisticRegression, *make_classification(n_classes=2)), + (LogisticRegression, + *make_classification(n_classes=3, n_clusters_per_class=1)), + (DecisionTreeClassifier, *make_multilabel_classification()), + (Ridge, *make_regression(n_targets=1)), + (Ridge, *make_regression(n_targets=2))] +) +def test_get_applicable_scorers_with_grid_search_smoke_test(Estimator, X, y): + # smoke test to check that scorers can be used directly inside a + # grid-search + if issubclass(Estimator, LogisticRegression): + param_grid = {"C": [0.1, 1]} + elif issubclass(Estimator, DecisionTreeClassifier): + param_grid = {"max_depth": [3, 5]} + elif issubclass(Estimator, Ridge): + y = np.abs(y) - np.min(y) + param_grid = {"alpha": [1, 10]} + + scorers = get_applicable_scorers(y) + estimator = GridSearchCV( + Estimator(), param_grid=param_grid, scoring=scorers, n_jobs=-1, + refit=list(scorers.keys())[0], + ) + estimator.fit(X, y) + + +def test_get_applicable_scorers_passing_scoring_params(): + # check that we can pass scoring parameters when getting the score + breast_cancer = load_breast_cancer() + X = breast_cancer.data + y = breast_cancer.target_names[breast_cancer.target].astype("object") + + scorers = get_applicable_scorers(y, pos_label="malignant") + average_precision_scorer = scorers["average_precision"] + assert "pos_label" in average_precision_scorer._kwargs + assert average_precision_scorer._kwargs["pos_label"] == "malignant" + + estimator = GridSearchCV( + DecisionTreeClassifier(), param_grid={"max_depth": [3, 5]}, + scoring=average_precision_scorer, + ) + estimator.fit(X, y) + + # check that if we don't provide any pos_label, the grid-search will raise + # an error + with pytest.raises(ValueError, match="pos_label=1 is invalid"): + estimator.set_params(scoring=make_scorer(average_precision_score)) + estimator.fit(X, y) From 43668af8a23bfd64450e2e23eead4a44d7bc6d50 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 11 Jul 2020 00:43:41 +0200 Subject: [PATCH 31/31] PEP8 --- sklearn/metrics/_scorer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index d98f1c57639d9..4f56f88ad3d23 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -799,6 
+799,7 @@ def make_scorer(score_func, *, greater_is_better=True, needs_proba=False,
         target_type_supported=("multilabel-indicator"),
     )
 
+
 def get_applicable_scorers(y, **scorers_params):
     """Utility providing scorers to be used on `y`.
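
For reference, a minimal usage sketch of the get_applicable_scorers helper added in this branch. It assumes this branch of scikit-learn is installed (the helper is not part of a released version); the dataset, estimator, parameter grid and the choice of refit="accuracy" are illustrative only, based on the signature and behaviour shown in the diff above (extra keyword arguments such as pos_label are forwarded only to scorers whose score function accepts them).

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import get_applicable_scorers
    from sklearn.model_selection import GridSearchCV

    X, y = make_classification(n_classes=2, random_state=0)

    # Build the dict of scorers compatible with a binary target; pos_label is
    # forwarded to the scorers (precision, recall, f1, ...) that accept it.
    scorers = get_applicable_scorers(y, pos_label=1)

    search = GridSearchCV(
        LogisticRegression(max_iter=1000),
        param_grid={"C": [0.1, 1]},
        scoring=scorers,
        refit="accuracy",  # any key present in `scorers`
    )
    search.fit(X, y)

    print(sorted(scorers))        # e.g. ['accuracy', 'average_precision', ...]
    print(search.best_params_)

The returned dictionary can be passed directly as the scoring parameter of GridSearchCV or cross_validate, which is the use case exercised by the smoke tests in test_score_objects.py above.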