BUG: use appropriate dtype in cv_results as opposed to always using object

MarcoGorelli · MarcoGorelli · commit 41658f3f6319 · 2024-02-02T11:01:13.000Z
diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst
@@ -117,6 +117,9 @@ Changelog
 
 - |Enhancement| :term:`CV splitters <CV splitter>` that ignores the group parameter now
   raises a warning when groups are passed in to :term:`split`. :pr:`28210` by
+- |Fix| The ``cv_results_`` attribute of :class:`model_selection.GridSearchCV` now
+  stores ``param_*`` entries as masked arrays of the appropriate NumPy dtype rather than
+  always using dtype ``object``. :pr:`28352` by :user:`Marco Gorelli<MarcoGorelli>`.
 
 :mod:`sklearn.utils`
 ....................
diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py
@@ -1081,25 +1081,22 @@ def _store(key_name, array, weights=None, splits=False, rank=False):
 
         _store("fit_time", out["fit_time"])
         _store("score_time", out["score_time"])
-        # Use one MaskedArray and mask all the places where the param is not
-        # applicable for that candidate. Use defaultdict as each candidate may
-        # not contain all the params
-        param_results = defaultdict(
-            partial(
-                MaskedArray,
-                np.empty(
-                    n_candidates,
-                ),
-                mask=True,
-                dtype=object,
-            )
-        )
+        param_results = defaultdict(dict)
         for cand_idx, params in enumerate(candidate_params):
             for name, value in params.items():
-                # An all masked empty array gets created for the key
-                # `"param_%s" % name` at the first occurrence of `name`.
-                # Setting the value at an index also unmasks that index
                 param_results["param_%s" % name][cand_idx] = value
+        for key in param_results:
+            arr = np.array(list(param_results[key].values()))
+            if len(arr) == n_candidates:
+                param_results[key] = MaskedArray(arr, mask=False)
+            else:
+                # Use one MaskedArray and mask all the places where the param is not
+                # applicable for that candidate (which may not contain all the params).
+                ma = MaskedArray(np.empty(n_candidates), mask=True, dtype=arr.dtype)
+                for index, value in param_results[key].items():
+                    # Setting the value at an index unmasks that index
+                    ma[index] = value
+                param_results[key] = ma
 
         results.update(param_results)
         # Store a list of param dicts at the key 'params'
diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
@@ -898,11 +898,11 @@ def test_param_sampler():
     assert [x for x in sampler] == [x for x in sampler]
 
 
-def check_cv_results_array_types(search, param_keys, score_keys):
+def check_cv_results_array_types(search, param_keys, score_keys, expected_dtypes):
     # Check if the search `cv_results`'s array are of correct types
     cv_results = search.cv_results_
     assert all(isinstance(cv_results[param], np.ma.MaskedArray) for param in param_keys)
-    assert all(cv_results[key].dtype == object for key in param_keys)
+    assert {key: cv_results[key].dtype for key in param_keys} == expected_dtypes
     assert not any(isinstance(cv_results[key], np.ma.MaskedArray) for key in score_keys)
     assert all(
         cv_results[key].dtype == np.float64
@@ -975,7 +975,13 @@ def test_grid_search_cv_results():
         if "time" not in k and k != "rank_test_score"
     )
     # Check cv_results structure
-    check_cv_results_array_types(search, param_keys, score_keys)
+    expected_dtypes = {
+        "param_C": "int64",
+        "param_degree": "int64",
+        "param_gamma": "float64",
+        "param_kernel": "<U4",
+    }
+    check_cv_results_array_types(search, param_keys, score_keys, expected_dtypes)
     check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates)
     # Check masking
     cv_results = search.cv_results_
@@ -1044,7 +1050,13 @@ def test_random_search_cv_results():
     search.fit(X, y)
     cv_results = search.cv_results_
     # Check results structure
-    check_cv_results_array_types(search, param_keys, score_keys)
+    expected_dtypes = {
+        "param_C": "float64",
+        "param_degree": "int64",
+        "param_gamma": "float64",
+        "param_kernel": "<U4",
+    }
+    check_cv_results_array_types(search, param_keys, score_keys, expected_dtypes)
     check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates)
     assert all(
         (
diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py
@@ -826,7 +826,13 @@ def test_halving_random_search_list_of_dicts():
     cv_results = search.cv_results_
     # Check results structure
     check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates, extra_keys)
-    check_cv_results_array_types(search, param_keys, score_keys)
+    expected_dtypes = {
+        "param_C": "float64",
+        "param_degree": "int64",
+        "param_gamma": "float64",
+        "param_kernel": "<U4",
+    }
+    check_cv_results_array_types(search, param_keys, score_keys, expected_dtypes)
 
     assert all(
         (