From af3056b4c2fe12fecb7039f66845de29b705adbb Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 29 Sep 2022 15:33:05 +0200 Subject: [PATCH 01/15] MAINT use nanmin to replace nan by finite values in ranking of SearchCV --- sklearn/model_selection/_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 37b26eb1c72d3..11799fa0c47f7 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -968,7 +968,7 @@ def _store(key_name, array, weights=None, splits=False, rank=False): # when input is nan, scipy >= 1.10 rankdata returns nan. To # keep previous behaviour nans are set to be smaller than the # minimum value in the array before ranking - min_array_means = min(array_means) - 1 + min_array_means = np.nanmin(array_means) - 1 array_means = np.nan_to_num(array_means, copy=True, nan=min_array_means) rank_result = rankdata(-array_means, method="min") rank_result = np.asarray(rank_result, dtype=np.int32) From c7363b631a9b9f6721409d264a4878210cd94b95 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 29 Sep 2022 15:40:00 +0200 Subject: [PATCH 02/15] handle the case with only nan values --- sklearn/model_selection/_search.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 11799fa0c47f7..81ef331458f68 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -969,6 +969,9 @@ def _store(key_name, array, weights=None, splits=False, rank=False): # keep previous behaviour nans are set to be smaller than the # minimum value in the array before ranking min_array_means = np.nanmin(array_means) - 1 + if np.isnan(min_array_means): + # all values in array_means are nan. 
Set min_array_means to 0 + min_array_means = 0 array_means = np.nan_to_num(array_means, copy=True, nan=min_array_means) rank_result = rankdata(-array_means, method="min") rank_result = np.asarray(rank_result, dtype=np.int32) From 4f094b2410e862808cef8eaaa4326039cb62bed4 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 29 Sep 2022 15:57:40 +0200 Subject: [PATCH 03/15] [scipy-dev] trigger CI dev builds From 4d2784e72818079e3fa5048bece7aa1092192c3f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 29 Sep 2022 16:31:30 +0200 Subject: [PATCH 04/15] [scipy-dev] trigger CI dev builds From 88376417ab45ffab03176ee4602b284e724bbe46 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 29 Sep 2022 20:48:07 +0200 Subject: [PATCH 05/15] catch early the case of all failed fit/scoring routines --- sklearn/model_selection/_search.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 81ef331458f68..525327d539bbd 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -965,16 +965,20 @@ def _store(key_name, array, weights=None, splits=False, rank=False): results["std_%s" % key_name] = array_stds if rank: - # when input is nan, scipy >= 1.10 rankdata returns nan. To - # keep previous behaviour nans are set to be smaller than the - # minimum value in the array before ranking - min_array_means = np.nanmin(array_means) - 1 - if np.isnan(min_array_means): - # all values in array_means are nan. Set min_array_means to 0 - min_array_means = 0 - array_means = np.nan_to_num(array_means, copy=True, nan=min_array_means) - rank_result = rankdata(-array_means, method="min") - rank_result = np.asarray(rank_result, dtype=np.int32) + # When the fit/scoring fails `array_means` contains NaNs, we + # will exclude them from the ranking process and consider them + # as ties as worst performers. 
+ if np.isnan(array_means).all(): + # All fit/scoring routines failed. + rank_result = np.ones_like(array_means, dtype=np.int32) + else: + min_array_means = np.nanmin(array_means) - 1 + array_means = np.nan_to_num( + array_means, copy=True, nan=min_array_means + ) + rank_result = rankdata(-array_means, method="min").astype( + np.int32, copy=False + ) results["rank_%s" % key_name] = rank_result _store("fit_time", out["fit_time"]) From 349dab64410be6e5e0e2c614d7a64120b35ed638 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 29 Sep 2022 20:48:31 +0200 Subject: [PATCH 06/15] DOC add a changelog entry to document the change of behaviour of nan scores --- doc/whats_new/v1.2.rst | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 877c6c26dc8c1..1830449c81ef1 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -42,7 +42,13 @@ random sampling procedures. :func:`linear_model._sgd_fast._plain_sgd` which is used by :class:`linear_model.SGDRegressor` and :class:`linear_model.SGDClassifier`. The old condition did not disambiguate between training and validation set and had an effect of overscaling the error tolerance. - This has been fixed in :pr:`23798` by :user:`Harsh Agrawal ` + This has been fixed in :pr:`23798` by :user:`Harsh Agrawal `. + +- |Fix| :class:`model_selection.GridSearchCV` and + :class:`model_selection.RandomizedSearchCV` will rank as wort equally + performers the fit/scoring routines that fail and return `np.nan` scores. + Previous failing routines were ranked arbitrarily differently without ties. + :pr:`24543` by :user:`Guillaume Lemaitre `. Changes impacting all modules ----------------------------- @@ -398,6 +404,12 @@ Changelog nan score is correctly set to the maximum possible rank, rather than `np.iinfo(np.int32).min`. :pr:`24141` by :user:`Loïc Estève `. 
+- |Fix| :class:`model_selection.GridSearchCV` and + :class:`model_selection.RandomizedSearchCV` will rank as wort equally + performers the fit/scoring routines that fail and return `np.nan` scores. + Previous failing routines were ranked arbitrarily differently without ties. + :pr:`24543` by :user:`Guillaume Lemaitre `. + :mod:`sklearn.multioutput` .......................... From 03c36182e08344706867bc2dbaad76e64ae35ba6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 13 Oct 2022 14:48:26 +0200 Subject: [PATCH 07/15] Update doc/whats_new/v1.2.rst Co-authored-by: Tim Head --- doc/whats_new/v1.2.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 1830449c81ef1..cd36174ce64e5 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -45,9 +45,9 @@ random sampling procedures. This has been fixed in :pr:`23798` by :user:`Harsh Agrawal `. - |Fix| :class:`model_selection.GridSearchCV` and - :class:`model_selection.RandomizedSearchCV` will rank as wort equally - performers the fit/scoring routines that fail and return `np.nan` scores. - Previous failing routines were ranked arbitrarily differently without ties. + :class:`model_selection.RandomizedSearchCV` will rank fit/scoring routines + that raise an exception as equal worst. Previously they were ranked as worst performers + in an arbitrary order. :pr:`24543` by :user:`Guillaume Lemaitre `. 
Changes impacting all modules From 2861a7bacc949415e900949cad8b1ff523ac7f29 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 13 Oct 2022 14:49:03 +0200 Subject: [PATCH 08/15] Update sklearn/model_selection/_search.py Co-authored-by: Olivier Grisel --- sklearn/model_selection/_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 525327d539bbd..3d067b2767f41 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -967,7 +967,7 @@ def _store(key_name, array, weights=None, splits=False, rank=False): if rank: # When the fit/scoring fails `array_means` contains NaNs, we # will exclude them from the ranking process and consider them - # as ties as worst performers. + # as tied with the worst performers. if np.isnan(array_means).all(): # All fit/scoring routines failed. rank_result = np.ones_like(array_means, dtype=np.int32) From 78a51fa90d0d5825a20197dcc99242fb7203e11b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 13 Oct 2022 14:52:35 +0200 Subject: [PATCH 09/15] thomas comment --- sklearn/model_selection/_search.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 3d067b2767f41..6ccbae2abc611 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -973,9 +973,7 @@ def _store(key_name, array, weights=None, splits=False, rank=False): rank_result = np.ones_like(array_means, dtype=np.int32) else: min_array_means = np.nanmin(array_means) - 1 - array_means = np.nan_to_num( - array_means, copy=True, nan=min_array_means - ) + array_means = np.nan_to_num(array_means, nan=min_array_means) rank_result = rankdata(-array_means, method="min").astype( np.int32, copy=False ) From 17035bd689463e6e9599ac3318872348c75cbd10 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 13 Oct 2022 
16:42:11 +0200 Subject: [PATCH 10/15] TST add a check for new behaviour --- sklearn/model_selection/tests/test_search.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index b86dfbd77846f..194a5d7ea3ca1 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1981,10 +1981,10 @@ def get_n_splits(self, *args, **kw): @pytest.mark.parametrize( "SearchCV, specialized_params", [ - (GridSearchCV, {"param_grid": {"max_depth": [2, 3]}}), + (GridSearchCV, {"param_grid": {"max_depth": [2, 3, 5, 8]}}), ( RandomizedSearchCV, - {"param_distributions": {"max_depth": [2, 3]}, "n_iter": 2}, + {"param_distributions": {"max_depth": [2, 3, 5, 8]}, "n_iter": 4}, ), ], ) @@ -2025,6 +2025,13 @@ def __call__(self, estimator, X, y): for msg, dataset in zip(warn_msg, set_with_warning): assert f"One or more of the {dataset} scores are non-finite" in str(msg.message) + # all non-finite scores should be equally ranked last + last_rank = grid.cv_results_["rank_test_score"].max() + non_finite_mask = np.isnan(grid.cv_results_["mean_test_score"]) + assert_array_equal(grid.cv_results_["rank_test_score"][non_finite_mask], last_rank) + # all finite scores should be better ranked than the non-finite scores + assert np.all(grid.cv_results_["rank_test_score"][~non_finite_mask] < last_rank) + def test_callable_multimetric_confusion_matrix(): # Test callable with many metrics inserts the correct names and metrics From 6390f8f47a6d2a31a9ca7d9c4d8106a669144c3f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 13 Oct 2022 16:42:58 +0200 Subject: [PATCH 11/15] Update doc/whats_new/v1.2.rst Co-authored-by: Olivier Grisel --- doc/whats_new/v1.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index cd36174ce64e5..2bac0d6314a94 100644 --- 
a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -405,7 +405,7 @@ Changelog `np.iinfo(np.int32).min`. :pr:`24141` by :user:`Loïc Estève `. - |Fix| :class:`model_selection.GridSearchCV` and - :class:`model_selection.RandomizedSearchCV` will rank as wort equally + :class:`model_selection.RandomizedSearchCV` now rank as wort equally performers the fit/scoring routines that fail and return `np.nan` scores. Previous failing routines were ranked arbitrarily differently without ties. :pr:`24543` by :user:`Guillaume Lemaitre `. From 412e16e2d68e6866d35a45098129b081ea3b1c41 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 13 Oct 2022 16:43:11 +0200 Subject: [PATCH 12/15] Update doc/whats_new/v1.2.rst Co-authored-by: Olivier Grisel --- doc/whats_new/v1.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 2bac0d6314a94..03458d95066d8 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -45,7 +45,7 @@ random sampling procedures. This has been fixed in :pr:`23798` by :user:`Harsh Agrawal `. - |Fix| :class:`model_selection.GridSearchCV` and - :class:`model_selection.RandomizedSearchCV` will rank fit/scoring routines + :class:`model_selection.RandomizedSearchCV` now rank fit/scoring routines that raise an exception as equal worst. Previously they were ranked as worst performers in an arbitrary order. :pr:`24543` by :user:`Guillaume Lemaitre `. 
From ddae3c3bd118da00cff804e4f083f12d234c305c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 13 Oct 2022 16:43:45 +0200 Subject: [PATCH 13/15] [scipy-dev] trigger nightly builds CIs From 8e33037baad4688c932a55196c14db53ff7bcf70 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 13 Oct 2022 18:04:34 +0200 Subject: [PATCH 14/15] Update doc/whats_new/v1.2.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Loïc Estève --- doc/whats_new/v1.2.rst | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 03458d95066d8..a5602e861c766 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -44,10 +44,9 @@ random sampling procedures. training and validation set and had an effect of overscaling the error tolerance. This has been fixed in :pr:`23798` by :user:`Harsh Agrawal `. -- |Fix| :class:`model_selection.GridSearchCV` and - :class:`model_selection.RandomizedSearchCV` now rank fit/scoring routines - that raise an exception as equal worst. Previously they were ranked as worst performers - in an arbitrary order. +- |Fix| For :class:`model_selection.GridSearchCV` and + :class:`model_selection.RandomizedSearchCV` ranks corresponding to nan + scores will all be set to the maximum possible rank. :pr:`24543` by :user:`Guillaume Lemaitre `. 
Changes impacting all modules From 73d4ce05a4ed117961378d360fb032b7588fc241 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 13 Oct 2022 18:04:45 +0200 Subject: [PATCH 15/15] Update doc/whats_new/v1.2.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Loïc Estève --- doc/whats_new/v1.2.rst | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index a5602e861c766..4c650e2ea84ce 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -403,10 +403,9 @@ Changelog nan score is correctly set to the maximum possible rank, rather than `np.iinfo(np.int32).min`. :pr:`24141` by :user:`Loïc Estève `. -- |Fix| :class:`model_selection.GridSearchCV` and - :class:`model_selection.RandomizedSearchCV` now rank as wort equally - performers the fit/scoring routines that fail and return `np.nan` scores. - Previous failing routines were ranked arbitrarily differently without ties. +- |Fix| For :class:`model_selection.GridSearchCV` and + :class:`model_selection.RandomizedSearchCV` ranks corresponding to nan + scores will all be set to the maximum possible rank. :pr:`24543` by :user:`Guillaume Lemaitre `. :mod:`sklearn.multioutput`