Fix cv_results_ in GridSearchCV when parameter values are arrays of varying sizes

MarcoGorelli · MarcoGorelli · commit 08b6b2743610 · 2024-06-20T14:16:13.000+01:00
diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst
@@ -48,6 +48,10 @@ Changes impacting many modules
   grids that have estimators as parameter values.
   :pr:`29179` by :user:`Marco Gorelli<MarcoGorelli>`.
 
+- |Fix| Fix a regression in :class:`model_selection.GridSearchCV` for parameter
+  grids that have arrays of different sizes as parameter values.
+  :pr:`29314` by :user:`Marco Gorelli<MarcoGorelli>`.
+
 :mod:`sklearn.metrics`
 ..............................
 
diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py
@@ -1086,36 +1086,31 @@ def _store(key_name, array, weights=None, splits=False, rank=False):
         for key, param_result in param_results.items():
             param_list = list(param_result.values())
             try:
-                with warnings.catch_warnings():
-                    warnings.filterwarnings(
-                        "ignore",
-                        message="in the future the `.dtype` attribute",
-                        category=DeprecationWarning,
-                    )
-                    # Warning raised by NumPy 1.20+
-                    arr_dtype = np.result_type(*param_list)
+                arr = np.array(param_list)
+                arr_dtype = arr.dtype
             except (TypeError, ValueError):
                 arr_dtype = np.dtype(object)
             else:
-                if any(np.min_scalar_type(x) == object for x in param_list):
-                    # `np.result_type` might get thrown off by `.dtype` properties
-                    # (which some estimators have).
-                    # If finding the result dtype this way would give object,
-                    # then we use object.
-                    # https://github.com/scikit-learn/scikit-learn/issues/29157
+                if arr_dtype.kind == "U" or arr.ndim > 1:
                     arr_dtype = np.dtype(object)
-            if len(param_list) == n_candidates and arr_dtype != object:
-                # Exclude `object` else the numpy constructor might infer a list of
-                # tuples to be a 2d array.
-                results[key] = MaskedArray(param_list, mask=False, dtype=arr_dtype)
-            else:
-                # Use one MaskedArray and mask all the places where the param is not
-                # applicable for that candidate (which may not contain all the params).
-                ma = MaskedArray(np.empty(n_candidates), mask=True, dtype=arr_dtype)
-                for index, value in param_result.items():
-                    # Setting the value at an index unmasks that index
-                    ma[index] = value
-                results[key] = ma
+
+            if len(param_list) == n_candidates:
+                try:
+                    ma = MaskedArray(param_list, mask=False, dtype=arr_dtype)
+                except ValueError:
+                    pass
+                else:
+                    if ma.ndim == 1:
+                        results[key] = ma
+                        continue
+
+            # Use one MaskedArray and mask all the places where the param is not
+            # applicable for that candidate (which may not contain all the params).
+            ma = MaskedArray(np.empty(n_candidates), mask=True, dtype=arr_dtype)
+            for index, value in param_result.items():
+                # Setting the value at an index unmasks that index
+                ma[index] = value
+            results[key] = ma
 
         # Store a list of param dicts at the key 'params'
         results["params"] = candidate_params
diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
@@ -65,8 +65,13 @@
 from sklearn.model_selection.tests.common import OneTimeSplitter
 from sklearn.naive_bayes import ComplementNB
 from sklearn.neighbors import KernelDensity, KNeighborsClassifier, LocalOutlierFactor
-from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
+from sklearn.pipeline import Pipeline, make_pipeline
+from sklearn.preprocessing import (
+    OneHotEncoder,
+    OrdinalEncoder,
+    SplineTransformer,
+    StandardScaler,
+)
 from sklearn.svm import SVC, LinearSVC
 from sklearn.tests.metadata_routing_common import (
     ConsumingScorer,
@@ -2724,6 +2729,39 @@ def test_search_with_estimators_issue_29157():
     assert grid_search.cv_results_["param_enc__enc"].dtype == object
 
 
+def test_cv_results_multi_size_array_29277():
+    # Non-regression test: arrays of different sizes used as grid values must
+    # end up in `cv_results_` as a 1d masked array of object dtype.
+    rng = np.random.RandomState(0)  # seeded so that the noise is reproducible
+    x = np.linspace(-np.pi * 2, np.pi * 5, 1000)
+    in_period = (0 < x) & (x < np.pi * 2)
+    x_train = x[in_period].reshape((-1, 1))
+    y_train = np.sin(x[in_period])
+    y_train_noise = y_train + rng.normal(size=y_train.shape, scale=0.5)
+
+    spline_reg_pipe = make_pipeline(
+        SplineTransformer(extrapolation="periodic"),
+        LinearRegression(fit_intercept=False),
+    )
+
+    # Three candidates whose knot arrays have 10, 15 and 20 rows respectively.
+    spline_reg_pipe_cv = GridSearchCV(
+        estimator=spline_reg_pipe,
+        param_grid={
+            "splinetransformer__knots": [
+                np.linspace(0, np.pi * 2, n_knots).reshape((-1, 1))
+                for n_knots in range(10, 21, 5)
+            ],
+        },
+    )
+    spline_reg_pipe_cv.fit(X=x_train, y=y_train_noise)
+
+    param_knots = spline_reg_pipe_cv.cv_results_["param_splinetransformer__knots"]
+    # One object entry per candidate; the ragged arrays must not add dimensions.
+    assert param_knots.dtype == object
+    assert param_knots.shape == (3,)
+
+
 @pytest.mark.parametrize(
     "array_namespace, device, dtype", yield_namespace_device_dtype_combinations()
 )