BUG: use appropriate dtype in cv_results as opposed to always using object (#28352)

MarcoGorelli · web-flow · commit fd28ffdc6f74 · 2024-03-03T17:55:14.000-05:00
diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst
@@ -217,6 +217,9 @@ Changelog
 
 - |Enhancement| :term:`CV splitters <CV splitter>` that ignores the group parameter now
   raises a warning when groups are passed in to :term:`split`. :pr:`28210` by
+- |Fix| the ``cv_results_`` attribute (of :class:`model_selection.GridSearchCV`) now
+  returns masked arrays of the appropriate NumPy dtype, as opposed to always returning
+  dtype ``object``. :pr:`28352` by :user:`Marco Gorelli<MarcoGorelli>`.
 
 :mod:`sklearn.multioutput`
 ..........................
diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py
@@ -1073,27 +1073,27 @@ def _store(key_name, array, weights=None, splits=False, rank=False):
 
         _store("fit_time", out["fit_time"])
         _store("score_time", out["score_time"])
-        # Use one MaskedArray and mask all the places where the param is not
-        # applicable for that candidate. Use defaultdict as each candidate may
-        # not contain all the params
-        param_results = defaultdict(
-            partial(
-                MaskedArray,
-                np.empty(
-                    n_candidates,
-                ),
-                mask=True,
-                dtype=object,
-            )
-        )
+        param_results = defaultdict(dict)
         for cand_idx, params in enumerate(candidate_params):
             for name, value in params.items():
-                # An all masked empty array gets created for the key
-                # `"param_%s" % name` at the first occurrence of `name`.
-                # Setting the value at an index also unmasks that index
                 param_results["param_%s" % name][cand_idx] = value
+        for key, param_result in param_results.items():
+            param_list = list(param_result.values())
+            try:
+                arr_dtype = np.result_type(*param_list)
+            except TypeError:
+                arr_dtype = object
+            if len(param_list) == n_candidates:
+                results[key] = MaskedArray(param_list, mask=False, dtype=arr_dtype)
+            else:
+                # Use one MaskedArray and mask all the places where the param is not
+                # applicable for that candidate (which may not contain all the params).
+                ma = MaskedArray(np.empty(n_candidates), mask=True, dtype=arr_dtype)
+                for index, value in param_result.items():
+                    # Setting the value at an index unmasks that index
+                    ma[index] = value
+                results[key] = ma
 
-        results.update(param_results)
         # Store a list of param dicts at the key 'params'
         results["params"] = candidate_params
 
diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
@@ -898,11 +898,15 @@ def test_param_sampler():
     assert [x for x in sampler] == [x for x in sampler]
 
 
-def check_cv_results_array_types(search, param_keys, score_keys):
+def check_cv_results_array_types(
+    search, param_keys, score_keys, expected_cv_results_kinds
+):
     # Check if the search `cv_results`'s array are of correct types
     cv_results = search.cv_results_
     assert all(isinstance(cv_results[param], np.ma.MaskedArray) for param in param_keys)
-    assert all(cv_results[key].dtype == object for key in param_keys)
+    assert {
+        key: cv_results[key].dtype.kind for key in param_keys
+    } == expected_cv_results_kinds
     assert not any(isinstance(cv_results[key], np.ma.MaskedArray) for key in score_keys)
     assert all(
         cv_results[key].dtype == np.float64
@@ -975,7 +979,15 @@ def test_grid_search_cv_results():
         if "time" not in k and k != "rank_test_score"
     )
     # Check cv_results structure
-    check_cv_results_array_types(search, param_keys, score_keys)
+    expected_cv_results_kinds = {
+        "param_C": "i",
+        "param_degree": "i",
+        "param_gamma": "f",
+        "param_kernel": "O",
+    }
+    check_cv_results_array_types(
+        search, param_keys, score_keys, expected_cv_results_kinds
+    )
     check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates)
     # Check masking
     cv_results = search.cv_results_
@@ -1044,7 +1056,15 @@ def test_random_search_cv_results():
     search.fit(X, y)
     cv_results = search.cv_results_
     # Check results structure
-    check_cv_results_array_types(search, param_keys, score_keys)
+    expected_cv_results_kinds = {
+        "param_C": "f",
+        "param_degree": "i",
+        "param_gamma": "f",
+        "param_kernel": "O",
+    }
+    check_cv_results_array_types(
+        search, param_keys, score_keys, expected_cv_results_kinds
+    )
     check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates)
     assert all(
         (
@@ -1378,7 +1398,9 @@ def test_search_cv_results_none_param():
             est_parameters,
             cv=cv,
         ).fit(X, y)
-        assert_array_equal(grid_search.cv_results_["param_random_state"], [0, None])
+        assert_array_equal(
+            grid_search.cv_results_["param_random_state"], [0, float("nan")]
+        )
 
 
 @ignore_warnings()
diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py
@@ -826,7 +826,15 @@ def test_halving_random_search_list_of_dicts():
     cv_results = search.cv_results_
     # Check results structure
     check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates, extra_keys)
-    check_cv_results_array_types(search, param_keys, score_keys)
+    expected_cv_results_kinds = {
+        "param_C": "f",
+        "param_degree": "i",
+        "param_gamma": "f",
+        "param_kernel": "O",
+    }
+    check_cv_results_array_types(
+        search, param_keys, score_keys, expected_cv_results_kinds
+    )
 
     assert all(
         (