diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index fc3c2337fc4e1..f7a631a06c1b1 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -217,6 +217,9 @@ Changelog - |Enhancement| :term:`CV splitters <CV splitter>` that ignores the group parameter now raises a warning when groups are passed in to :term:`split`. :pr:`28210` by +- |Fix| the ``cv_results_`` attribute (of :class:`model_selection.GridSearchCV`) now + returns masked arrays of the appropriate NumPy dtype, as opposed to always returning + dtype ``object``. :pr:`28352` by :user:`Marco Gorelli <MarcoGorelli>`. :mod:`sklearn.multioutput` .......................... diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 3401b328a38cd..cc351921b9463 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -1073,27 +1073,27 @@ def _store(key_name, array, weights=None, splits=False, rank=False): _store("fit_time", out["fit_time"]) _store("score_time", out["score_time"]) - # Use one MaskedArray and mask all the places where the param is not - # applicable for that candidate. Use defaultdict as each candidate may - # not contain all the params - param_results = defaultdict( - partial( - MaskedArray, - np.empty( - n_candidates, - ), - mask=True, - dtype=object, - ) - ) + param_results = defaultdict(dict) for cand_idx, params in enumerate(candidate_params): for name, value in params.items(): - # An all masked empty array gets created for the key - # `"param_%s" % name` at the first occurrence of `name`. 
- # Setting the value at an index also unmasks that index param_results["param_%s" % name][cand_idx] = value + for key, param_result in param_results.items(): + param_list = list(param_result.values()) + try: + arr_dtype = np.result_type(*param_list) + except TypeError: + arr_dtype = object + if len(param_list) == n_candidates: + results[key] = MaskedArray(param_list, mask=False, dtype=arr_dtype) + else: + # Use one MaskedArray and mask all the places where the param is not + # applicable for that candidate (which may not contain all the params). + ma = MaskedArray(np.empty(n_candidates), mask=True, dtype=arr_dtype) + for index, value in param_result.items(): + # Setting the value at an index unmasks that index + ma[index] = value + results[key] = ma - results.update(param_results) # Store a list of param dicts at the key 'params' results["params"] = candidate_params diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index c0db76c5c6ef6..f5e91948e0fbe 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -898,11 +898,15 @@ def test_param_sampler(): assert [x for x in sampler] == [x for x in sampler] -def check_cv_results_array_types(search, param_keys, score_keys): +def check_cv_results_array_types( + search, param_keys, score_keys, expected_cv_results_kinds +): # Check if the search `cv_results`'s array are of correct types cv_results = search.cv_results_ assert all(isinstance(cv_results[param], np.ma.MaskedArray) for param in param_keys) - assert all(cv_results[key].dtype == object for key in param_keys) + assert { + key: cv_results[key].dtype.kind for key in param_keys + } == expected_cv_results_kinds assert not any(isinstance(cv_results[key], np.ma.MaskedArray) for key in score_keys) assert all( cv_results[key].dtype == np.float64 @@ -975,7 +979,15 @@ def test_grid_search_cv_results(): if "time" not in k and k != "rank_test_score" ) # Check cv_results 
structure - check_cv_results_array_types(search, param_keys, score_keys) + expected_cv_results_kinds = { + "param_C": "i", + "param_degree": "i", + "param_gamma": "f", + "param_kernel": "O", + } + check_cv_results_array_types( + search, param_keys, score_keys, expected_cv_results_kinds + ) check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates) # Check masking cv_results = search.cv_results_ @@ -1044,7 +1056,15 @@ def test_random_search_cv_results(): search.fit(X, y) cv_results = search.cv_results_ # Check results structure - check_cv_results_array_types(search, param_keys, score_keys) + expected_cv_results_kinds = { + "param_C": "f", + "param_degree": "i", + "param_gamma": "f", + "param_kernel": "O", + } + check_cv_results_array_types( + search, param_keys, score_keys, expected_cv_results_kinds + ) check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates) assert all( ( @@ -1378,7 +1398,9 @@ def test_search_cv_results_none_param(): est_parameters, cv=cv, ).fit(X, y) - assert_array_equal(grid_search.cv_results_["param_random_state"], [0, None]) + assert_array_equal( + grid_search.cv_results_["param_random_state"], [0, float("nan")] + ) @ignore_warnings() diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index 6c89f89afa684..b7047c7537871 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -826,7 +826,15 @@ def test_halving_random_search_list_of_dicts(): cv_results = search.cv_results_ # Check results structure check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates, extra_keys) - check_cv_results_array_types(search, param_keys, score_keys) + expected_cv_results_kinds = { + "param_C": "f", + "param_degree": "i", + "param_gamma": "f", + "param_kernel": "O", + } + check_cv_results_array_types( + search, param_keys, score_keys, expected_cv_results_kinds + ) assert 
all( (