Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit c0b3385

Browse files
glemaitre, betatim, ogrisel, lesteve
authored
MAINT use nanmin to replace nan by finite values in ranking of SearchCV (#24543)
Co-authored-by: Tim Head <[email protected]> Co-authored-by: Olivier Grisel <[email protected]> Co-authored-by: Loïc Estève <[email protected]>
1 parent e01035d commit c0b3385

File tree

3 files changed

+32
-10
lines changed

3 files changed

+32
-10
lines changed

doc/whats_new/v1.2.rst

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,12 @@ random sampling procedures.
4747
:func:`linear_model._sgd_fast._plain_sgd` which is used by :class:`linear_model.SGDRegressor`
4848
and :class:`linear_model.SGDClassifier`. The old condition did not disambiguate between
4949
training and validation set and had an effect of overscaling the error tolerance.
50-
This has been fixed in :pr:`23798` by :user:`Harsh Agrawal <Harsh14901>`
50+
This has been fixed in :pr:`23798` by :user:`Harsh Agrawal <Harsh14901>`.
51+
52+
- |Fix| For :class:`model_selection.GridSearchCV` and
53+
:class:`model_selection.RandomizedSearchCV` ranks corresponding to nan
54+
scores will all be set to the maximum possible rank.
55+
:pr:`24543` by :user:`Guillaume Lemaitre <glemaitre>`.
5156

5257
Changes impacting all modules
5358
-----------------------------
@@ -430,6 +435,11 @@ Changelog
430435
nan score is correctly set to the maximum possible rank, rather than
431436
`np.iinfo(np.int32).min`. :pr:`24141` by :user:`Loïc Estève <lesteve>`.
432437

438+
- |Fix| For :class:`model_selection.GridSearchCV` and
439+
:class:`model_selection.RandomizedSearchCV` ranks corresponding to nan
440+
scores will all be set to the maximum possible rank.
441+
:pr:`24543` by :user:`Guillaume Lemaitre <glemaitre>`.
442+
433443
:mod:`sklearn.multioutput`
434444
..........................
435445

sklearn/model_selection/_search.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -965,13 +965,18 @@ def _store(key_name, array, weights=None, splits=False, rank=False):
965965
results["std_%s" % key_name] = array_stds
966966

967967
if rank:
968-
# when input is nan, scipy >= 1.10 rankdata returns nan. To
969-
# keep previous behaviour nans are set to be smaller than the
970-
# minimum value in the array before ranking
971-
min_array_means = min(array_means) - 1
972-
array_means = np.nan_to_num(array_means, copy=True, nan=min_array_means)
973-
rank_result = rankdata(-array_means, method="min")
974-
rank_result = np.asarray(rank_result, dtype=np.int32)
968+
# When the fit/scoring fails `array_means` contains NaNs, we
969+
# will exclude them from the ranking process and consider them
970+
# as tied with the worst performers.
971+
if np.isnan(array_means).all():
972+
# All fit/scoring routines failed.
973+
rank_result = np.ones_like(array_means, dtype=np.int32)
974+
else:
975+
min_array_means = np.nanmin(array_means) - 1
976+
array_means = np.nan_to_num(array_means, nan=min_array_means)
977+
rank_result = rankdata(-array_means, method="min").astype(
978+
np.int32, copy=False
979+
)
975980
results["rank_%s" % key_name] = rank_result
976981

977982
_store("fit_time", out["fit_time"])

sklearn/model_selection/tests/test_search.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1981,10 +1981,10 @@ def get_n_splits(self, *args, **kw):
19811981
@pytest.mark.parametrize(
19821982
"SearchCV, specialized_params",
19831983
[
1984-
(GridSearchCV, {"param_grid": {"max_depth": [2, 3]}}),
1984+
(GridSearchCV, {"param_grid": {"max_depth": [2, 3, 5, 8]}}),
19851985
(
19861986
RandomizedSearchCV,
1987-
{"param_distributions": {"max_depth": [2, 3]}, "n_iter": 2},
1987+
{"param_distributions": {"max_depth": [2, 3, 5, 8]}, "n_iter": 4},
19881988
),
19891989
],
19901990
)
@@ -2025,6 +2025,13 @@ def __call__(self, estimator, X, y):
20252025
for msg, dataset in zip(warn_msg, set_with_warning):
20262026
assert f"One or more of the {dataset} scores are non-finite" in str(msg.message)
20272027

2028+
# all non-finite scores should be equally ranked last
2029+
last_rank = grid.cv_results_["rank_test_score"].max()
2030+
non_finite_mask = np.isnan(grid.cv_results_["mean_test_score"])
2031+
assert_array_equal(grid.cv_results_["rank_test_score"][non_finite_mask], last_rank)
2032+
# all finite scores should be better ranked than the non-finite scores
2033+
assert np.all(grid.cv_results_["rank_test_score"][~non_finite_mask] < last_rank)
2034+
20282035

20292036
def test_callable_multimetric_confusion_matrix():
20302037
# Test callable with many metrics inserts the correct names and metrics

0 commit comments

Comments
 (0)