Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

BUG: use appropriate dtype in cv_results as opposed to always using object #28352

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Mar 3, 2024
3 changes: 3 additions & 0 deletions doc/whats_new/v1.5.rst
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,9 @@ Changelog

- |Enhancement| :term:`CV splitters <CV splitter>` that ignore the group parameter now
raise a warning when groups are passed in to :term:`split`. :pr:`28210` by
- |Fix| the ``cv_results_`` attribute (of :class:`model_selection.GridSearchCV`) now
returns masked arrays of the appropriate NumPy dtype, as opposed to always returning
dtype ``object``. :pr:`28352` by :user:`Marco Gorelli<MarcoGorelli>`.

:mod:`sklearn.multioutput`
..........................
Expand Down
34 changes: 17 additions & 17 deletions sklearn/model_selection/_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -1073,27 +1073,27 @@ def _store(key_name, array, weights=None, splits=False, rank=False):

_store("fit_time", out["fit_time"])
_store("score_time", out["score_time"])
# Use one MaskedArray and mask all the places where the param is not
# applicable for that candidate. Use defaultdict as each candidate may
# not contain all the params
param_results = defaultdict(
partial(
MaskedArray,
np.empty(
n_candidates,
),
mask=True,
dtype=object,
)
)
param_results = defaultdict(dict)
for cand_idx, params in enumerate(candidate_params):
for name, value in params.items():
# An all masked empty array gets created for the key
# `"param_%s" % name` at the first occurrence of `name`.
# Setting the value at an index also unmasks that index
param_results["param_%s" % name][cand_idx] = value
for key, param_result in param_results.items():
param_list = list(param_result.values())
try:
arr_dtype = np.result_type(*param_list)
except TypeError:
arr_dtype = object
if len(param_list) == n_candidates:
results[key] = MaskedArray(param_list, mask=False, dtype=arr_dtype)
else:
# Use one MaskedArray and mask all the places where the param is not
# applicable for that candidate (which may not contain all the params).
ma = MaskedArray(np.empty(n_candidates), mask=True, dtype=arr_dtype)
for index, value in param_result.items():
# Setting the value at an index unmasks that index
ma[index] = value
results[key] = ma

results.update(param_results)
# Store a list of param dicts at the key 'params'
results["params"] = candidate_params

Expand Down
32 changes: 27 additions & 5 deletions sklearn/model_selection/tests/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -898,11 +898,15 @@ def test_param_sampler():
assert [x for x in sampler] == [x for x in sampler]


def check_cv_results_array_types(search, param_keys, score_keys):
def check_cv_results_array_types(
search, param_keys, score_keys, expected_cv_results_kinds
):
# Check if the search `cv_results`'s array are of correct types
cv_results = search.cv_results_
assert all(isinstance(cv_results[param], np.ma.MaskedArray) for param in param_keys)
assert all(cv_results[key].dtype == object for key in param_keys)
assert {
key: cv_results[key].dtype.kind for key in param_keys
} == expected_cv_results_kinds
assert not any(isinstance(cv_results[key], np.ma.MaskedArray) for key in score_keys)
assert all(
cv_results[key].dtype == np.float64
Expand Down Expand Up @@ -975,7 +979,15 @@ def test_grid_search_cv_results():
if "time" not in k and k != "rank_test_score"
)
# Check cv_results structure
check_cv_results_array_types(search, param_keys, score_keys)
expected_cv_results_kinds = {
"param_C": "i",
"param_degree": "i",
"param_gamma": "f",
"param_kernel": "O",
}
check_cv_results_array_types(
search, param_keys, score_keys, expected_cv_results_kinds
)
check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates)
# Check masking
cv_results = search.cv_results_
Expand Down Expand Up @@ -1044,7 +1056,15 @@ def test_random_search_cv_results():
search.fit(X, y)
cv_results = search.cv_results_
# Check results structure
check_cv_results_array_types(search, param_keys, score_keys)
expected_cv_results_kinds = {
"param_C": "f",
"param_degree": "i",
"param_gamma": "f",
"param_kernel": "O",
}
check_cv_results_array_types(
search, param_keys, score_keys, expected_cv_results_kinds
)
check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates)
assert all(
(
Expand Down Expand Up @@ -1378,7 +1398,9 @@ def test_search_cv_results_none_param():
est_parameters,
cv=cv,
).fit(X, y)
assert_array_equal(grid_search.cv_results_["param_random_state"], [0, None])
assert_array_equal(
grid_search.cv_results_["param_random_state"], [0, float("nan")]
)


@ignore_warnings()
Expand Down
10 changes: 9 additions & 1 deletion sklearn/model_selection/tests/test_successive_halving.py
Original file line number Diff line number Diff line change
Expand Up @@ -826,7 +826,15 @@ def test_halving_random_search_list_of_dicts():
cv_results = search.cv_results_
# Check results structure
check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates, extra_keys)
check_cv_results_array_types(search, param_keys, score_keys)
expected_cv_results_kinds = {
"param_C": "f",
"param_degree": "i",
"param_gamma": "f",
"param_kernel": "O",
}
check_cv_results_array_types(
search, param_keys, score_keys, expected_cv_results_kinds
)

assert all(
(
Expand Down