From 06cc76ad2bdf9ff8a62912dcc54bbe6da18038a6 Mon Sep 17 00:00:00 2001
From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Tue, 4 Jun 2024 14:44:22 +0100
Subject: [PATCH 01/12] fix regression in GridSearchCV when parameter grids
 have estimators as values

---
 doc/whats_new/v1.5.rst                       |  4 +++
 sklearn/model_selection/_search.py           |  4 ++-
 sklearn/model_selection/tests/test_search.py | 35 ++++++++++++++++----
 3 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst
index 60b8dadc97373..5c0d3d76419e3 100644
--- a/doc/whats_new/v1.5.rst
+++ b/doc/whats_new/v1.5.rst
@@ -38,6 +38,10 @@ Changelog
   grids that have heterogeneous parameter values.
   :pr:`29078` by :user:`Loïc Estève <lesteve>`.
 
+- |Fix| Fix a regression in :class:`model_selection.GridSearchCV` for parameter
+  grids that have estimators as parameter values.
+  :pr:`29179` by :user:`Marco Gorelli <MarcoGorelli>`.
+
 
 .. _changes_1_5:
 
diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py
index edf492b84877a..a8069fffcb6c3 100644
--- a/sklearn/model_selection/_search.py
+++ b/sklearn/model_selection/_search.py
@@ -1089,8 +1089,10 @@ def _store(key_name, array, weights=None, splits=False, rank=False):
         for key, param_result in param_results.items():
             param_list = list(param_result.values())
             try:
-                arr_dtype = np.result_type(*param_list)
+                arr_dtype = np.array(param_list).dtype
             except (TypeError, ValueError):
+                arr_dtype = np.dtype(object)
+            if arr_dtype.kind == "U":
                 arr_dtype = object
             if len(param_list) == n_candidates and arr_dtype != object:
                 # Exclude `object` else the numpy constructor might infer a list of
diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
index cb4af646aee39..6b70783f8e168 100644
--- a/sklearn/model_selection/tests/test_search.py
+++ b/sklearn/model_selection/tests/test_search.py
@@ -11,12 +11,14 @@
 from types import GeneratorType
 
 import numpy as np
+import pandas as pd
 import pytest
 from scipy.stats import bernoulli, expon, uniform
 
 from sklearn import config_context
 from sklearn.base import BaseEstimator, ClassifierMixin, is_classifier
 from sklearn.cluster import KMeans
+from sklearn.compose import ColumnTransformer
 from sklearn.datasets import (
     make_blobs,
     make_classification,
@@ -64,7 +66,7 @@
 from sklearn.naive_bayes import ComplementNB
 from sklearn.neighbors import KernelDensity, KNeighborsClassifier, LocalOutlierFactor
 from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
 from sklearn.svm import SVC, LinearSVC
 from sklearn.tests.metadata_routing_common import (
     ConsumingScorer,
@@ -1403,9 +1405,7 @@ def test_search_cv_results_none_param():
             est_parameters,
             cv=cv,
         ).fit(X, y)
-        assert_array_equal(
-            grid_search.cv_results_["param_random_state"], [0, float("nan")]
-        )
+        assert_array_equal(grid_search.cv_results_["param_random_state"], [0, None])
 
 
 @ignore_warnings()
@@ -2521,8 +2521,7 @@ def test_search_with_2d_array():
     data_target = [0, 0, 1, 0, 1]
     random_search.fit(data_train, data_target)
     result = random_search.cv_results_["param_vect__ngram_range"]
-    expected_data = np.empty(3, dtype=object)
-    expected_data[:] = [(1, 2), (1, 2), (1, 1)]
+    expected_data = np.array([[1, 2], [1, 2], [1, 1]])
     np.testing.assert_array_equal(result.data, expected_data)
 
 
@@ -2686,3 +2685,27 @@ def score(self, X, y):
     grid_search.fit(X, y)
     for param in param_grid:
         assert grid_search.cv_results_[f"param_{param}"].dtype == object
+
+
+def test_search_with_estimators_():
+    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["a", "b", "c"]})
+    X = df.drop("b", axis=1)
+    y = df["a"]
+    enc = ColumnTransformer(
+        [("enc", OneHotEncoder(), ["c"])],
+    )
+    pipe = Pipeline(
+        [
+            ("enc", enc),
+            ("regressor", LinearRegression()),
+        ]
+    )
+    grid_params = {
+        "enc__enc": [
+            OneHotEncoder(),
+            OrdinalEncoder(),
+        ]
+    }
+    grid_search = GridSearchCV(pipe, grid_params, cv=2)
+    grid_search.fit(X, y)
+    assert grid_search.cv_results_["param_enc__enc"].dtype == object

From 4905ef388e156ee82a5a9d957aabf011d9c0bf0a Mon Sep 17 00:00:00 2001
From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Tue, 4 Jun 2024 15:56:31 +0100
Subject: [PATCH 02/12] preserve list of tuples

---
 sklearn/model_selection/_search.py           | 10 +++++++---
 sklearn/model_selection/tests/test_search.py |  3 ++-
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py
index a8069fffcb6c3..d9ca738eb7822 100644
--- a/sklearn/model_selection/_search.py
+++ b/sklearn/model_selection/_search.py
@@ -1089,11 +1089,15 @@ def _store(key_name, array, weights=None, splits=False, rank=False):
         for key, param_result in param_results.items():
             param_list = list(param_result.values())
             try:
-                arr_dtype = np.array(param_list).dtype
+                arr_dtype = np.result_type(*param_list)
             except (TypeError, ValueError):
                 arr_dtype = np.dtype(object)
-            if arr_dtype.kind == "U":
-                arr_dtype = object
+            else:
+                if np.array(param_list).dtype == object:
+                    # `np.result_type` might get thrown off by `.dtype` properties
+                    # (which some estimators have), so we check against the `dtype`
+                    # resulting from constructing an array.
+                    arr_dtype = object
             if len(param_list) == n_candidates and arr_dtype != object:
                 # Exclude `object` else the numpy constructor might infer a list of
                 # tuples to be a 2d array.
diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
index 6b70783f8e168..fba61e9d5b34f 100644
--- a/sklearn/model_selection/tests/test_search.py
+++ b/sklearn/model_selection/tests/test_search.py
@@ -2521,7 +2521,8 @@ def test_search_with_2d_array():
     data_target = [0, 0, 1, 0, 1]
     random_search.fit(data_train, data_target)
     result = random_search.cv_results_["param_vect__ngram_range"]
-    expected_data = np.array([[1, 2], [1, 2], [1, 1]])
+    expected_data = np.empty(3, dtype=object)
+    expected_data[:] = [(1, 2), (1, 2), (1, 1)]
     np.testing.assert_array_equal(result.data, expected_data)
 
 

From 176ead1d4e1fd06efbb15724d764b31208c17b18 Mon Sep 17 00:00:00 2001
From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Tue, 4 Jun 2024 15:58:58 +0100
Subject: [PATCH 03/12] consistency, link to gh issue

---
 sklearn/model_selection/_search.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py
index d9ca738eb7822..e4d55a94054a5 100644
--- a/sklearn/model_selection/_search.py
+++ b/sklearn/model_selection/_search.py
@@ -1097,7 +1097,8 @@ def _store(key_name, array, weights=None, splits=False, rank=False):
                     # `np.result_type` might get thrown off by `.dtype` properties
                     # (which some estimators have), so we check against the `dtype`
                     # resulting from constructing an array.
-                    arr_dtype = object
+                    # https://github.com/scikit-learn/scikit-learn/issues/29157
+                    arr_dtype = np.dtype(object)
             if len(param_list) == n_candidates and arr_dtype != object:
                 # Exclude `object` else the numpy constructor might infer a list of
                 # tuples to be a 2d array.

From c318da7e6b31eec75c2fd1b1c38ba49daea4348f Mon Sep 17 00:00:00 2001
From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Tue, 4 Jun 2024 16:07:37 +0100
Subject: [PATCH 04/12] don't allocate array

---
 sklearn/model_selection/_search.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py
index e4d55a94054a5..92055494ea8c4 100644
--- a/sklearn/model_selection/_search.py
+++ b/sklearn/model_selection/_search.py
@@ -10,6 +10,7 @@
 #         Raghav RV <rvraghav93@gmail.com>
 # License: BSD 3 clause
 
+import functools
 import numbers
 import operator
 import time
@@ -1093,10 +1094,17 @@ def _store(key_name, array, weights=None, splits=False, rank=False):
             except (TypeError, ValueError):
                 arr_dtype = np.dtype(object)
             else:
-                if np.array(param_list).dtype == object:
+                if (
+                    functools.reduce(
+                        lambda x, y: np.promote_types(x, y),
+                        (np.min_scalar_type(x) for x in param_list),
+                    )
+                    == object
+                ):
                     # `np.result_type` might get thrown off by `.dtype` properties
-                    # (which some estimators have), so we check against the `dtype`
-                    # resulting from constructing an array.
+                    # (which some estimators have).
+                    # If finding the result dtype this way would give object,
+                    # then we use object.
                     # https://github.com/scikit-learn/scikit-learn/issues/29157
                     arr_dtype = np.dtype(object)
             if len(param_list) == n_candidates and arr_dtype != object:

From 8e73ce498edf1fafa749ccb986ecaa8ca96a3783 Mon Sep 17 00:00:00 2001
From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Tue, 4 Jun 2024 18:07:13 +0100
Subject: [PATCH 05/12] catch deprecation warning from numpy

---
 sklearn/model_selection/tests/test_search.py | 28 +++++++++++++++-----
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
index fba61e9d5b34f..da1f016e7bf73 100644
--- a/sklearn/model_selection/tests/test_search.py
+++ b/sklearn/model_selection/tests/test_search.py
@@ -2688,12 +2688,19 @@ def score(self, X, y):
         assert grid_search.cv_results_[f"param_{param}"].dtype == object
 
 
-def test_search_with_estimators_():
-    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["a", "b", "c"]})
-    X = df.drop("b", axis=1)
-    y = df["a"]
+def test_search_with_estimators():
+    df = pd.DataFrame(
+        {
+            "numeric_1": [1, 2, 3, 4, 5],
+            "object_1": ["a", "a", "a", "a", "a"],
+            "target": [1.0, 4.1, 2.0, 3.0, 1.0],
+        }
+    )
+    X = df.drop("target", axis=1)
+    y = df["target"]
     enc = ColumnTransformer(
-        [("enc", OneHotEncoder(), ["c"])],
+        [("enc", OneHotEncoder(sparse_output=False), ["object_1"])],
+        remainder="passthrough",
     )
     pipe = Pipeline(
         [
@@ -2703,10 +2710,17 @@ def test_search_with_estimators_():
     )
     grid_params = {
         "enc__enc": [
-            OneHotEncoder(),
+            OneHotEncoder(sparse_output=False),
             OrdinalEncoder(),
         ]
     }
     grid_search = GridSearchCV(pipe, grid_params, cv=2)
-    grid_search.fit(X, y)
+    with pytest.warns(
+        DeprecationWarning,
+        match=(
+            "in the future the `.dtype` attribute of a given datatype object must be "
+            "a valid dtype instance",
+        ),
+    ):
+        grid_search.fit(X, y)
     assert grid_search.cv_results_["param_enc__enc"].dtype == object

From 81b6da7367c3190ffad431df596311d84c20be70 Mon Sep 17 00:00:00 2001
From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Tue, 4 Jun 2024 18:38:22 +0100
Subject: [PATCH 06/12] fixup

---
 sklearn/model_selection/tests/test_search.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
index da1f016e7bf73..addaa8b131903 100644
--- a/sklearn/model_selection/tests/test_search.py
+++ b/sklearn/model_selection/tests/test_search.py
@@ -2719,7 +2719,7 @@ def test_search_with_estimators():
         DeprecationWarning,
         match=(
             "in the future the `.dtype` attribute of a given datatype object must be "
-            "a valid dtype instance",
+            "a valid dtype instance"
         ),
     ):
         grid_search.fit(X, y)

From 1975302d3b93d5700987a1c142cefc98300bc1ee Mon Sep 17 00:00:00 2001
From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Tue, 4 Jun 2024 19:11:33 +0100
Subject: [PATCH 07/12] importorskip pandas

---
 sklearn/model_selection/tests/test_search.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
index addaa8b131903..c43ec67b11313 100644
--- a/sklearn/model_selection/tests/test_search.py
+++ b/sklearn/model_selection/tests/test_search.py
@@ -11,7 +11,6 @@
 from types import GeneratorType
 
 import numpy as np
-import pandas as pd
 import pytest
 from scipy.stats import bernoulli, expon, uniform
 
@@ -2689,6 +2688,7 @@ def score(self, X, y):
 
 
 def test_search_with_estimators():
+    pd = pytest.importorskip("pandas")
     df = pd.DataFrame(
         {
             "numeric_1": [1, 2, 3, 4, 5],

From 77462ae44ab3998a3177e58e1761c4f7d02eac5f Mon Sep 17 00:00:00 2001
From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Tue, 4 Jun 2024 21:51:17 +0100
Subject: [PATCH 08/12] filter the warning

---
 sklearn/model_selection/tests/test_search.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
index c43ec67b11313..168cc91551af2 100644
--- a/sklearn/model_selection/tests/test_search.py
+++ b/sklearn/model_selection/tests/test_search.py
@@ -2687,6 +2687,10 @@ def score(self, X, y):
         assert grid_search.cv_results_[f"param_{param}"].dtype == object
 
 
+@pytest.mark.filterwarnings(
+    "ignore:in the future the `.dtype` attribute of a given datatype object must "
+    "be a valid dtype instance:DeprecationWarning"
+)
 def test_search_with_estimators():
     pd = pytest.importorskip("pandas")
     df = pd.DataFrame(
@@ -2715,12 +2719,5 @@ def test_search_with_estimators():
         ]
     }
     grid_search = GridSearchCV(pipe, grid_params, cv=2)
-    with pytest.warns(
-        DeprecationWarning,
-        match=(
-            "in the future the `.dtype` attribute of a given datatype object must be "
-            "a valid dtype instance"
-        ),
-    ):
-        grid_search.fit(X, y)
+    grid_search.fit(X, y)
     assert grid_search.cv_results_["param_enc__enc"].dtype == object

From 70ed083f71ab269dbcbdb9cf3e5a197fdb93ce44 Mon Sep 17 00:00:00 2001
From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Tue, 4 Jun 2024 22:54:36 +0100
Subject: [PATCH 09/12] it gets simpler!

---
 sklearn/model_selection/_search.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py
index 92055494ea8c4..d42e4894321eb 100644
--- a/sklearn/model_selection/_search.py
+++ b/sklearn/model_selection/_search.py
@@ -10,7 +10,6 @@
 #         Raghav RV <rvraghav93@gmail.com>
 # License: BSD 3 clause
 
-import functools
 import numbers
 import operator
 import time
@@ -1094,13 +1093,7 @@ def _store(key_name, array, weights=None, splits=False, rank=False):
             except (TypeError, ValueError):
                 arr_dtype = np.dtype(object)
             else:
-                if (
-                    functools.reduce(
-                        lambda x, y: np.promote_types(x, y),
-                        (np.min_scalar_type(x) for x in param_list),
-                    )
-                    == object
-                ):
+                if any(np.min_scalar_type(x) == object for x in param_list):
                     # `np.result_type` might get thrown off by `.dtype` properties
                     # (which some estimators have).
                     # If finding the result dtype this way would give object,

From 8a65fffa9b7619e846c023a9509c2943c801a7fb Mon Sep 17 00:00:00 2001
From: Marco Edward Gorelli <marcogorelli@protonmail.com>
Date: Wed, 5 Jun 2024 07:22:56 +0100
Subject: [PATCH 10/12] Update sklearn/model_selection/tests/test_search.py

---
 sklearn/model_selection/tests/test_search.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
index 168cc91551af2..79a9a28f7c250 100644
--- a/sklearn/model_selection/tests/test_search.py
+++ b/sklearn/model_selection/tests/test_search.py
@@ -2691,7 +2691,7 @@ def score(self, X, y):
     "ignore:in the future the `.dtype` attribute of a given datatype object must "
     "be a valid dtype instance:DeprecationWarning"
 )
-def test_search_with_estimators():
+def test_search_with_estimators_issue_29157():
     pd = pytest.importorskip("pandas")
     df = pd.DataFrame(
         {

From b5f944fdd8a4e22f29a9bd5ae2cff7f36fe5024e Mon Sep 17 00:00:00 2001
From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Wed, 5 Jun 2024 13:25:36 +0100
Subject: [PATCH 11/12] silence warning

---
 sklearn/model_selection/_search.py           | 8 +++++++-
 sklearn/model_selection/tests/test_search.py | 4 ----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py
index d42e4894321eb..fb2739c9d1a76 100644
--- a/sklearn/model_selection/_search.py
+++ b/sklearn/model_selection/_search.py
@@ -1089,7 +1089,13 @@ def _store(key_name, array, weights=None, splits=False, rank=False):
         for key, param_result in param_results.items():
             param_list = list(param_result.values())
             try:
-                arr_dtype = np.result_type(*param_list)
+                with warnings.catch_warnings():
+                    warnings.filterwarnings(
+                        "ignore",
+                        message="in the future the `.dtype` attribute",
+                        category=DeprecationWarning,
+                    )
+                    arr_dtype = np.result_type(*param_list)
             except (TypeError, ValueError):
                 arr_dtype = np.dtype(object)
             else:
diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
index 168cc91551af2..7368f9bf22945 100644
--- a/sklearn/model_selection/tests/test_search.py
+++ b/sklearn/model_selection/tests/test_search.py
@@ -2687,10 +2687,6 @@ def score(self, X, y):
         assert grid_search.cv_results_[f"param_{param}"].dtype == object
 
 
-@pytest.mark.filterwarnings(
-    "ignore:in the future the `.dtype` attribute of a given datatype object must "
-    "be a valid dtype instance:DeprecationWarning"
-)
 def test_search_with_estimators():
     pd = pytest.importorskip("pandas")
     df = pd.DataFrame(

From 58689a0ca1fd8fd5e2298dfc8657257881a0d4db Mon Sep 17 00:00:00 2001
From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Wed, 5 Jun 2024 18:38:26 +0100
Subject: [PATCH 12/12] add comment + docstring

---
 sklearn/model_selection/_search.py           | 1 +
 sklearn/model_selection/tests/test_search.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py
index fb2739c9d1a76..fdc6abf195a67 100644
--- a/sklearn/model_selection/_search.py
+++ b/sklearn/model_selection/_search.py
@@ -1095,6 +1095,7 @@ def _store(key_name, array, weights=None, splits=False, rank=False):
                         message="in the future the `.dtype` attribute",
                         category=DeprecationWarning,
                     )
+                    # Warning raised by NumPy 1.20+
                     arr_dtype = np.result_type(*param_list)
             except (TypeError, ValueError):
                 arr_dtype = np.dtype(object)
diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
index 2a71772c8552a..7beb0d73bd993 100644
--- a/sklearn/model_selection/tests/test_search.py
+++ b/sklearn/model_selection/tests/test_search.py
@@ -2688,6 +2688,7 @@ def score(self, X, y):
 
 
 def test_search_with_estimators_issue_29157():
+    """Check cv_results_ for estimators with a `dtype` parameter, e.g. OneHotEncoder."""
     pd = pytest.importorskip("pandas")
     df = pd.DataFrame(
         {