MAINT use nanmin to replace nan by finite values in ranking of SearchCV (#24543)

glemaitre · betatim · ogrisel · web-flow · commit c0b3385a1ebd · 2022-10-13T18:06:03.000+02:00
Co-authored-by: Tim Head <betatim@gmail.com>
Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
Co-authored-by: Loïc Estève <loic.esteve@ymail.com>
diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst
@@ -47,7 +47,12 @@ random sampling procedures.
   :func:`linear_model._sgd_fast._plain_sgd` which is used by :class:`linear_model.SGDRegressor`
   and :class:`linear_model.SGDClassifier`. The old condition did not disambiguate between
   training and validation set and had an effect of overscaling the error tolerance.
-  This has been fixed in :pr:`23798` by :user:`Harsh Agrawal <Harsh14901>`
+  This has been fixed in :pr:`23798` by :user:`Harsh Agrawal <Harsh14901>`.
+
+- |Fix| For :class:`model_selection.GridSearchCV` and
+  :class:`model_selection.RandomizedSearchCV`, ranks corresponding to nan
+  scores will all be set to the maximum possible rank.
+  :pr:`24543` by :user:`Guillaume Lemaitre <glemaitre>`.
 
 Changes impacting all modules
 -----------------------------
@@ -430,6 +435,11 @@ Changelog
   nan score is correctly set to the maximum possible rank, rather than
   `np.iinfo(np.int32).min`. :pr:`24141` by :user:`Loïc Estève <lesteve>`.
 
+- |Fix| For :class:`model_selection.GridSearchCV` and
+  :class:`model_selection.RandomizedSearchCV`, ranks corresponding to nan
+  scores will all be set to the maximum possible rank.
+  :pr:`24543` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 :mod:`sklearn.multioutput`
 ..........................
 
diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py
@@ -965,13 +965,18 @@ def _store(key_name, array, weights=None, splits=False, rank=False):
             results["std_%s" % key_name] = array_stds
 
             if rank:
-                # when input is nan, scipy >= 1.10 rankdata returns nan. To
-                # keep previous behaviour nans are set to be smaller than the
-                # minimum value in the array before ranking
-                min_array_means = min(array_means) - 1
-                array_means = np.nan_to_num(array_means, copy=True, nan=min_array_means)
-                rank_result = rankdata(-array_means, method="min")
-                rank_result = np.asarray(rank_result, dtype=np.int32)
+                # When the fit/scoring fails `array_means` contains NaNs, we
+                # will exclude them from the ranking process and consider them
+                # as tied with the worst performers.
+                if np.isnan(array_means).all():
+                    # All fit/scoring routines failed.
+                    rank_result = np.ones_like(array_means, dtype=np.int32)
+                else:
+                    min_array_means = np.nanmin(array_means) - 1
+                    array_means = np.nan_to_num(array_means, nan=min_array_means)
+                    rank_result = rankdata(-array_means, method="min").astype(
+                        np.int32, copy=False
+                    )
                 results["rank_%s" % key_name] = rank_result
 
         _store("fit_time", out["fit_time"])
diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
@@ -1981,10 +1981,10 @@ def get_n_splits(self, *args, **kw):
 @pytest.mark.parametrize(
     "SearchCV, specialized_params",
     [
-        (GridSearchCV, {"param_grid": {"max_depth": [2, 3]}}),
+        (GridSearchCV, {"param_grid": {"max_depth": [2, 3, 5, 8]}}),
         (
             RandomizedSearchCV,
-            {"param_distributions": {"max_depth": [2, 3]}, "n_iter": 2},
+            {"param_distributions": {"max_depth": [2, 3, 5, 8]}, "n_iter": 4},
         ),
     ],
 )
@@ -2025,6 +2025,13 @@ def __call__(self, estimator, X, y):
     for msg, dataset in zip(warn_msg, set_with_warning):
         assert f"One or more of the {dataset} scores are non-finite" in str(msg.message)
 
+    # all non-finite scores should be equally ranked last
+    last_rank = grid.cv_results_["rank_test_score"].max()
+    non_finite_mask = np.isnan(grid.cv_results_["mean_test_score"])
+    assert_array_equal(grid.cv_results_["rank_test_score"][non_finite_mask], last_rank)
+    # all finite scores should be better ranked than the non-finite scores
+    assert np.all(grid.cv_results_["rank_test_score"][~non_finite_mask] < last_rank)
+
 
 def test_callable_multimetric_confusion_matrix():
     # Test callable with many metrics inserts the correct names and metrics