From 06cc76ad2bdf9ff8a62912dcc54bbe6da18038a6 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 4 Jun 2024 14:44:22 +0100 Subject: [PATCH 01/12] fix regression in gridsearchcv when parameter grids have estimators as values --- doc/whats_new/v1.5.rst | 4 +++ sklearn/model_selection/_search.py | 4 ++- sklearn/model_selection/tests/test_search.py | 35 ++++++++++++++++---- 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index 60b8dadc97373..5c0d3d76419e3 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -38,6 +38,10 @@ Changelog grids that have heterogeneous parameter values. :pr:`29078` by :user:`Loïc Estève <lesteve>`. +- |Fix| Fix a regression in :class:`model_selection.GridSearchCV` for parameter + grids that have estimators as parameter values. + :pr:`29179` by :user:`Marco Gorelli <MarcoGorelli>`. + .. _changes_1_5: diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index edf492b84877a..a8069fffcb6c3 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -1089,8 +1089,10 @@ def _store(key_name, array, weights=None, splits=False, rank=False): for key, param_result in param_results.items(): param_list = list(param_result.values()) try: - arr_dtype = np.result_type(*param_list) + arr_dtype = np.array(param_list).dtype except (TypeError, ValueError): + arr_dtype = np.dtype(object) + if arr_dtype.kind == "U": arr_dtype = object if len(param_list) == n_candidates and arr_dtype != object: # Exclude `object` else the numpy constructor might infer a list of diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index cb4af646aee39..6b70783f8e168 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -11,12 +11,14 @@ from types import GeneratorType import numpy as np +import pandas as pd 
import pytest from scipy.stats import bernoulli, expon, uniform from sklearn import config_context from sklearn.base import BaseEstimator, ClassifierMixin, is_classifier from sklearn.cluster import KMeans +from sklearn.compose import ColumnTransformer from sklearn.datasets import ( make_blobs, make_classification, @@ -64,7 +66,7 @@ from sklearn.naive_bayes import ComplementNB from sklearn.neighbors import KernelDensity, KNeighborsClassifier, LocalOutlierFactor from sklearn.pipeline import Pipeline -from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler from sklearn.svm import SVC, LinearSVC from sklearn.tests.metadata_routing_common import ( ConsumingScorer, @@ -1403,9 +1405,7 @@ def test_search_cv_results_none_param(): est_parameters, cv=cv, ).fit(X, y) - assert_array_equal( - grid_search.cv_results_["param_random_state"], [0, float("nan")] - ) + assert_array_equal(grid_search.cv_results_["param_random_state"], [0, None]) @ignore_warnings() @@ -2521,8 +2521,7 @@ def test_search_with_2d_array(): data_target = [0, 0, 1, 0, 1] random_search.fit(data_train, data_target) result = random_search.cv_results_["param_vect__ngram_range"] - expected_data = np.empty(3, dtype=object) - expected_data[:] = [(1, 2), (1, 2), (1, 1)] + expected_data = np.array([[1, 2], [1, 2], [1, 1]]) np.testing.assert_array_equal(result.data, expected_data) @@ -2686,3 +2685,27 @@ def score(self, X, y): grid_search.fit(X, y) for param in param_grid: assert grid_search.cv_results_[f"param_{param}"].dtype == object + + +def test_search_with_estimators_(): + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["a", "b", "c"]}) + X = df.drop("b", axis=1) + y = df["a"] + enc = ColumnTransformer( + [("enc", OneHotEncoder(), ["c"])], + ) + pipe = Pipeline( + [ + ("enc", enc), + ("regressor", LinearRegression()), + ] + ) + grid_params = { + "enc__enc": [ + OneHotEncoder(), + OrdinalEncoder(), + ] + } + grid_search = 
GridSearchCV(pipe, grid_params, cv=2) + grid_search.fit(X, y) + assert grid_search.cv_results_["param_enc__enc"].dtype == object From 4905ef388e156ee82a5a9d957aabf011d9c0bf0a Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 4 Jun 2024 15:56:31 +0100 Subject: [PATCH 02/12] preserve list of tuples --- sklearn/model_selection/_search.py | 10 +++++++--- sklearn/model_selection/tests/test_search.py | 3 ++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index a8069fffcb6c3..d9ca738eb7822 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -1089,11 +1089,15 @@ def _store(key_name, array, weights=None, splits=False, rank=False): for key, param_result in param_results.items(): param_list = list(param_result.values()) try: - arr_dtype = np.array(param_list).dtype + arr_dtype = np.result_type(*param_list) except (TypeError, ValueError): arr_dtype = np.dtype(object) - if arr_dtype.kind == "U": - arr_dtype = object + else: + if np.array(param_list).dtype == object: + # `np.result_type` might get thrown off by `.dtype` properties + # (which some estimators have), so we check against the `dtype` + # resulting from constructing an array. + arr_dtype = object if len(param_list) == n_candidates and arr_dtype != object: # Exclude `object` else the numpy constructor might infer a list of # tuples to be a 2d array. 
diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 6b70783f8e168..fba61e9d5b34f 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -2521,7 +2521,8 @@ def test_search_with_2d_array(): data_target = [0, 0, 1, 0, 1] random_search.fit(data_train, data_target) result = random_search.cv_results_["param_vect__ngram_range"] - expected_data = np.array([[1, 2], [1, 2], [1, 1]]) + expected_data = np.empty(3, dtype=object) + expected_data[:] = [(1, 2), (1, 2), (1, 1)] np.testing.assert_array_equal(result.data, expected_data) From 176ead1d4e1fd06efbb15724d764b31208c17b18 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 4 Jun 2024 15:58:58 +0100 Subject: [PATCH 03/12] consistency, link to gh issue --- sklearn/model_selection/_search.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index d9ca738eb7822..e4d55a94054a5 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -1097,7 +1097,8 @@ def _store(key_name, array, weights=None, splits=False, rank=False): # `np.result_type` might get thrown off by `.dtype` properties # (which some estimators have), so we check against the `dtype` # resulting from constructing an array. - arr_dtype = object + # https://github.com/scikit-learn/scikit-learn/issues/29157 + arr_dtype = np.dtype(object) if len(param_list) == n_candidates and arr_dtype != object: # Exclude `object` else the numpy constructor might infer a list of # tuples to be a 2d array. 
From c318da7e6b31eec75c2fd1b1c38ba49daea4348f Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 4 Jun 2024 16:07:37 +0100 Subject: [PATCH 04/12] dont allocate array --- sklearn/model_selection/_search.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index e4d55a94054a5..92055494ea8c4 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -10,6 +10,7 @@ # Raghav RV # License: BSD 3 clause +import functools import numbers import operator import time @@ -1093,10 +1094,17 @@ def _store(key_name, array, weights=None, splits=False, rank=False): except (TypeError, ValueError): arr_dtype = np.dtype(object) else: - if np.array(param_list).dtype == object: + if ( + functools.reduce( + lambda x, y: np.promote_types(x, y), + (np.min_scalar_type(x) for x in param_list), + ) + == object + ): # `np.result_type` might get thrown off by `.dtype` properties - # (which some estimators have), so we check against the `dtype` - # resulting from constructing an array. + # (which some estimators have). + # If finding the result dtype this way would give object, + # then we use object. 
# https://github.com/scikit-learn/scikit-learn/issues/29157 arr_dtype = np.dtype(object) if len(param_list) == n_candidates and arr_dtype != object: From 8e73ce498edf1fafa749ccb986ecaa8ca96a3783 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 4 Jun 2024 18:07:13 +0100 Subject: [PATCH 05/12] catch deprecation warning from numpy --- sklearn/model_selection/tests/test_search.py | 28 +++++++++++++++----- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index fba61e9d5b34f..da1f016e7bf73 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -2688,12 +2688,19 @@ def score(self, X, y): assert grid_search.cv_results_[f"param_{param}"].dtype == object -def test_search_with_estimators_(): - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["a", "b", "c"]}) - X = df.drop("b", axis=1) - y = df["a"] +def test_search_with_estimators(): + df = pd.DataFrame( + { + "numeric_1": [1, 2, 3, 4, 5], + "object_1": ["a", "a", "a", "a", "a"], + "target": [1.0, 4.1, 2.0, 3.0, 1.0], + } + ) + X = df.drop("target", axis=1) + y = df["target"] enc = ColumnTransformer( - [("enc", OneHotEncoder(), ["c"])], + [("enc", OneHotEncoder(sparse_output=False), ["object_1"])], + remainder="passthrough", ) pipe = Pipeline( [ @@ -2703,10 +2710,17 @@ def test_search_with_estimators_(): ) grid_params = { "enc__enc": [ - OneHotEncoder(), + OneHotEncoder(sparse_output=False), OrdinalEncoder(), ] } grid_search = GridSearchCV(pipe, grid_params, cv=2) - grid_search.fit(X, y) + with pytest.warns( + DeprecationWarning, + match=( + "in the future the `.dtype` attribute of a given datatype object must be " + "a valid dtype instance", + ), + ): + grid_search.fit(X, y) assert grid_search.cv_results_["param_enc__enc"].dtype == object From 81b6da7367c3190ffad431df596311d84c20be70 Mon Sep 17 00:00:00 
2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 4 Jun 2024 18:38:22 +0100 Subject: [PATCH 06/12] fixup --- sklearn/model_selection/tests/test_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index da1f016e7bf73..addaa8b131903 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -2719,7 +2719,7 @@ def test_search_with_estimators(): DeprecationWarning, match=( "in the future the `.dtype` attribute of a given datatype object must be " - "a valid dtype instance", + "a valid dtype instance" ), ): grid_search.fit(X, y) From 1975302d3b93d5700987a1c142cefc98300bc1ee Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 4 Jun 2024 19:11:33 +0100 Subject: [PATCH 07/12] importorskip pandas --- sklearn/model_selection/tests/test_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index addaa8b131903..c43ec67b11313 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -11,7 +11,6 @@ from types import GeneratorType import numpy as np -import pandas as pd import pytest from scipy.stats import bernoulli, expon, uniform @@ -2689,6 +2688,7 @@ def score(self, X, y): def test_search_with_estimators(): + pd = pytest.importorskip("pandas") df = pd.DataFrame( { "numeric_1": [1, 2, 3, 4, 5], From 77462ae44ab3998a3177e58e1761c4f7d02eac5f Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 4 Jun 2024 21:51:17 +0100 Subject: [PATCH 08/12] filter the warning --- sklearn/model_selection/tests/test_search.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git 
a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index c43ec67b11313..168cc91551af2 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -2687,6 +2687,10 @@ def score(self, X, y): assert grid_search.cv_results_[f"param_{param}"].dtype == object +@pytest.mark.filterwarnings( + "ignore:in the future the `.dtype` attribute of a given datatype object must " + "be a valid dtype instance:DeprecationWarning" +) def test_search_with_estimators(): pd = pytest.importorskip("pandas") df = pd.DataFrame( @@ -2715,12 +2719,5 @@ def test_search_with_estimators(): ] } grid_search = GridSearchCV(pipe, grid_params, cv=2) - with pytest.warns( - DeprecationWarning, - match=( - "in the future the `.dtype` attribute of a given datatype object must be " - "a valid dtype instance" - ), - ): - grid_search.fit(X, y) + grid_search.fit(X, y) assert grid_search.cv_results_["param_enc__enc"].dtype == object From 70ed083f71ab269dbcbdb9cf3e5a197fdb93ce44 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 4 Jun 2024 22:54:36 +0100 Subject: [PATCH 09/12] it gets simpler! 
--- sklearn/model_selection/_search.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 92055494ea8c4..d42e4894321eb 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -10,7 +10,6 @@ # Raghav RV # License: BSD 3 clause -import functools import numbers import operator import time @@ -1094,13 +1093,7 @@ def _store(key_name, array, weights=None, splits=False, rank=False): except (TypeError, ValueError): arr_dtype = np.dtype(object) else: - if ( - functools.reduce( - lambda x, y: np.promote_types(x, y), - (np.min_scalar_type(x) for x in param_list), - ) - == object - ): + if any(np.min_scalar_type(x) == object for x in param_list): # `np.result_type` might get thrown off by `.dtype` properties # (which some estimators have). # If finding the result dtype this way would give object, From 8a65fffa9b7619e846c023a9509c2943c801a7fb Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Wed, 5 Jun 2024 07:22:56 +0100 Subject: [PATCH 10/12] Update sklearn/model_selection/tests/test_search.py --- sklearn/model_selection/tests/test_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 168cc91551af2..79a9a28f7c250 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -2691,7 +2691,7 @@ def score(self, X, y): "ignore:in the future the `.dtype` attribute of a given datatype object must " "be a valid dtype instance:DeprecationWarning" ) -def test_search_with_estimators(): +def test_search_with_estimators_issue_29157(): pd = pytest.importorskip("pandas") df = pd.DataFrame( { From b5f944fdd8a4e22f29a9bd5ae2cff7f36fe5024e Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 5 Jun 2024 13:25:36 +0100 Subject: [PATCH 
11/12] silence warning --- sklearn/model_selection/_search.py | 8 +++++++- sklearn/model_selection/tests/test_search.py | 4 ---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index d42e4894321eb..fb2739c9d1a76 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -1089,7 +1089,13 @@ def _store(key_name, array, weights=None, splits=False, rank=False): for key, param_result in param_results.items(): param_list = list(param_result.values()) try: - arr_dtype = np.result_type(*param_list) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message="in the future the `.dtype` attribute", + category=DeprecationWarning, + ) + arr_dtype = np.result_type(*param_list) except (TypeError, ValueError): arr_dtype = np.dtype(object) else: diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 168cc91551af2..7368f9bf22945 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -2687,10 +2687,6 @@ def score(self, X, y): assert grid_search.cv_results_[f"param_{param}"].dtype == object -@pytest.mark.filterwarnings( - "ignore:in the future the `.dtype` attribute of a given datatype object must " - "be a valid dtype instance:DeprecationWarning" -) def test_search_with_estimators(): pd = pytest.importorskip("pandas") df = pd.DataFrame( From 58689a0ca1fd8fd5e2298dfc8657257881a0d4db Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 5 Jun 2024 18:38:26 +0100 Subject: [PATCH 12/12] add comment + docstring --- sklearn/model_selection/_search.py | 1 + sklearn/model_selection/tests/test_search.py | 1 + 2 files changed, 2 insertions(+) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index fb2739c9d1a76..fdc6abf195a67 100644 --- 
a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -1095,6 +1095,7 @@ def _store(key_name, array, weights=None, splits=False, rank=False): message="in the future the `.dtype` attribute", category=DeprecationWarning, ) + # Warning raised by NumPy 1.20+ arr_dtype = np.result_type(*param_list) except (TypeError, ValueError): arr_dtype = np.dtype(object) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 2a71772c8552a..7beb0d73bd993 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -2688,6 +2688,7 @@ def score(self, X, y): def test_search_with_estimators_issue_29157(): + """Check cv_results_ for estimators with a `dtype` parameter, e.g. OneHotEncoder.""" pd = pytest.importorskip("pandas") df = pd.DataFrame( {