From 212f5fa1919aab74a688272f151333a07d993b55 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Wed, 7 Dec 2022 17:44:15 +0100
Subject: [PATCH 1/6] MAINT validate parameters of Pipeline

---
 sklearn/base.py                | 38 ++++++++++++++++++++--------------
 sklearn/pipeline.py            | 14 +++++++++++--
 sklearn/tests/test_pipeline.py |  8 +++++++
 3 files changed, 43 insertions(+), 17 deletions(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index db82353662c0d..f86efab71e74d 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -109,7 +109,29 @@ def clone(estimator, *, safe=True):
     return new_object
 
 
-class BaseEstimator:
+class _ParamValidationMixin:
+    """Mixin class for all estimators in scikit-learn that validate their parameters.
+
+    This mixin gives access to the `_validate_params` method, which is called
+    at `fit` at the beginning of the `fit` method.
+    """
+
+    def _validate_params(self):
+        """Validate types and values of constructor parameters
+
+        The expected type and values must be defined in the `_parameter_constraints`
+        class attribute, which is a dictionary `param_name: list of constraints`. See
+        the docstring of `validate_parameter_constraints` for a description of the
+        accepted constraints.
+        """
+        validate_parameter_constraints(
+            self._parameter_constraints,
+            self.get_params(deep=False),
+            caller_name=self.__class__.__name__,
+        )
+
+
+class BaseEstimator(_ParamValidationMixin):
     """Base class for all estimators in scikit-learn.
 
     Notes
@@ -559,20 +581,6 @@ def _validate_data(
 
         return out
 
-    def _validate_params(self):
-        """Validate types and values of constructor parameters
-
-        The expected type and values must be defined in the `_parameter_constraints`
-        class attribute, which is a dictionary `param_name: list of constraints`. See
-        the docstring of `validate_parameter_constraints` for a description of the
-        accepted constraints.
-        """
-        validate_parameter_constraints(
-            self._parameter_constraints,
-            self.get_params(deep=False),
-            caller_name=self.__class__.__name__,
-        )
-
     @property
     def _repr_html_(self):
         """HTML representation of estimator.
diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py
index cef3288b85439..b723ffe3bd2e6 100644
--- a/sklearn/pipeline.py
+++ b/sklearn/pipeline.py
@@ -16,7 +16,7 @@
 from scipy import sparse
 from joblib import Parallel
 
-from .base import clone, TransformerMixin
+from .base import clone, TransformerMixin, _ParamValidationMixin
 from .preprocessing import FunctionTransformer
 from .utils._estimator_html_repr import _VisualBlock
 from .utils.metaestimators import available_if
@@ -28,6 +28,7 @@
 from .utils.validation import check_memory
 from .utils.validation import check_is_fitted
 from .utils import check_pandas_support
+from .utils._param_validation import HasMethods
 from .utils._set_output import _safe_set_output, _get_output_config
 from .utils.fixes import delayed
 from .exceptions import NotFittedError
@@ -50,7 +51,7 @@ def check(self):
     return check
 
 
-class Pipeline(_BaseComposition):
+class Pipeline(_BaseComposition, _ParamValidationMixin):
     """
     Pipeline of transforms with a final estimator.
 
@@ -143,6 +144,12 @@ class Pipeline(_BaseComposition):
     # BaseEstimator interface
     _required_parameters = ["steps"]
 
+    _parameter_constraints: dict = {
+        "steps": "no_validation",  # validated in `_validate_steps`
+        "memory": [None, str, HasMethods(["cache"])],
+        "verbose": ["boolean"],
+    }
+
     def __init__(self, steps, *, memory=None, verbose=False):
         self.steps = steps
         self.memory = memory
@@ -398,6 +405,7 @@ def fit(self, X, y=None, **fit_params):
         self : object
             Pipeline with fitted steps.
         """
+        self._validate_params()
         fit_params_steps = self._check_fit_params(**fit_params)
         Xt = self._fit(X, y, **fit_params_steps)
         with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
@@ -434,6 +442,7 @@ def fit_transform(self, X, y=None, **fit_params):
         Xt : ndarray of shape (n_samples, n_transformed_features)
             Transformed samples.
         """
+        self._validate_params()
         fit_params_steps = self._check_fit_params(**fit_params)
         Xt = self._fit(X, y, **fit_params_steps)
 
@@ -510,6 +519,7 @@ def fit_predict(self, X, y=None, **fit_params):
         y_pred : ndarray
             Result of calling `fit_predict` on the final estimator.
         """
+        self._validate_params()
         fit_params_steps = self._check_fit_params(**fit_params)
         Xt = self._fit(X, y, **fit_params_steps)
 
diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py
index eab7d8027b3cd..0b96c81508ab6 100644
--- a/sklearn/tests/test_pipeline.py
+++ b/sklearn/tests/test_pipeline.py
@@ -12,6 +12,7 @@
 from scipy import sparse
 import joblib
 
+from sklearn.utils.estimator_checks import check_param_validation
 from sklearn.utils._testing import (
     assert_allclose,
     assert_array_equal,
@@ -1687,3 +1688,10 @@ def test_feature_union_getitem_error(key):
     msg = "Only string keys are supported"
     with pytest.raises(KeyError, match=msg):
         union[key]
+
+
+def test_pipeline_param_validation():
+    model = Pipeline(
+        [("scaler", StandardScaler()), ("classifier", LogisticRegression())]
+    )
+    check_param_validation("Pipeline", model)

From 8dead9532afea325baacd2231cedd7dd5308cdaf Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Wed, 7 Dec 2022 18:07:31 +0100
Subject: [PATCH 2/6] revert _ParamValidationMixin

---
 sklearn/base.py     | 38 +++++++++++++++-----------------------
 sklearn/pipeline.py |  4 ++--
 2 files changed, 17 insertions(+), 25 deletions(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index f86efab71e74d..db82353662c0d 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -109,29 +109,7 @@ def clone(estimator, *, safe=True):
     return new_object
 
 
-class _ParamValidationMixin:
-    """Mixin class for all estimators in scikit-learn that validate their parameters.
-
-    This mixin gives access to the `_validate_params` method, which is called
-    at `fit` at the beginning of the `fit` method.
-    """
-
-    def _validate_params(self):
-        """Validate types and values of constructor parameters
-
-        The expected type and values must be defined in the `_parameter_constraints`
-        class attribute, which is a dictionary `param_name: list of constraints`. See
-        the docstring of `validate_parameter_constraints` for a description of the
-        accepted constraints.
-        """
-        validate_parameter_constraints(
-            self._parameter_constraints,
-            self.get_params(deep=False),
-            caller_name=self.__class__.__name__,
-        )
-
-
-class BaseEstimator(_ParamValidationMixin):
+class BaseEstimator:
     """Base class for all estimators in scikit-learn.
 
     Notes
@@ -581,6 +559,20 @@ def _validate_data(
 
         return out
 
+    def _validate_params(self):
+        """Validate types and values of constructor parameters
+
+        The expected type and values must be defined in the `_parameter_constraints`
+        class attribute, which is a dictionary `param_name: list of constraints`. See
+        the docstring of `validate_parameter_constraints` for a description of the
+        accepted constraints.
+        """
+        validate_parameter_constraints(
+            self._parameter_constraints,
+            self.get_params(deep=False),
+            caller_name=self.__class__.__name__,
+        )
+
     @property
     def _repr_html_(self):
         """HTML representation of estimator.
diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py
index b723ffe3bd2e6..9e5c5ea711ca6 100644
--- a/sklearn/pipeline.py
+++ b/sklearn/pipeline.py
@@ -16,7 +16,7 @@
 from scipy import sparse
 from joblib import Parallel
 
-from .base import clone, TransformerMixin, _ParamValidationMixin
+from .base import clone, TransformerMixin
 from .preprocessing import FunctionTransformer
 from .utils._estimator_html_repr import _VisualBlock
 from .utils.metaestimators import available_if
@@ -51,7 +51,7 @@ def check(self):
     return check
 
 
-class Pipeline(_BaseComposition, _ParamValidationMixin):
+class Pipeline(_BaseComposition):
     """
     Pipeline of transforms with a final estimator.
 

From b7db3e93409e4945a388f214eedd6857ec9faf40 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Wed, 7 Dec 2022 18:17:16 +0100
Subject: [PATCH 3/6] TST remove redundant test

---
 sklearn/tests/test_pipeline.py | 40 ----------------------------------
 1 file changed, 40 deletions(-)

diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py
index 0b96c81508ab6..67ad65b74b1eb 100644
--- a/sklearn/tests/test_pipeline.py
+++ b/sklearn/tests/test_pipeline.py
@@ -1186,46 +1186,6 @@ def test_set_params_nested_pipeline():
     estimator.set_params(a__steps=[("b", LogisticRegression())], a__b__C=5)
 
 
-def test_pipeline_wrong_memory():
-    # Test that an error is raised when memory is not a string or a Memory
-    # instance
-    X = iris.data
-    y = iris.target
-    # Define memory as an integer
-    memory = 1
-    cached_pipe = Pipeline([("transf", DummyTransf()), ("svc", SVC())], memory=memory)
-
-    msg = re.escape(
-        "'memory' should be None, a string or have the same interface "
-        "as joblib.Memory. Got memory='1' instead."
-    )
-    with pytest.raises(ValueError, match=msg):
-        cached_pipe.fit(X, y)
-
-
-class DummyMemory:
-    def cache(self, func):
-        return func
-
-
-class WrongDummyMemory:
-    pass
-
-
-def test_pipeline_with_cache_attribute():
-    X = np.array([[1, 2]])
-    pipe = Pipeline([("transf", Transf()), ("clf", Mult())], memory=DummyMemory())
-    pipe.fit(X, y=None)
-    dummy = WrongDummyMemory()
-    pipe = Pipeline([("transf", Transf()), ("clf", Mult())], memory=dummy)
-    msg = re.escape(
-        "'memory' should be None, a string or have the same interface "
-        f"as joblib.Memory. Got memory='{dummy}' instead."
-    )
-    with pytest.raises(ValueError, match=msg):
-        pipe.fit(X)
-
-
 def test_pipeline_memory():
     X = iris.data
     y = iris.target

From 1d15e441c4f0781fdcbe98f54d21ebea528e0c0e Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 8 Dec 2022 11:33:07 +0100
Subject: [PATCH 4/6] TST add docstring to test_pipeline_param_validation

---
 sklearn/tests/test_pipeline.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py
index 67ad65b74b1eb..3c04889d95faa 100644
--- a/sklearn/tests/test_pipeline.py
+++ b/sklearn/tests/test_pipeline.py
@@ -1651,6 +1651,7 @@ def test_feature_union_getitem_error(key):
 
 
 def test_pipeline_param_validation():
+    """Run the param validation for `Pipeline`."""
     model = Pipeline(
         [("scaler", StandardScaler()), ("classifier", LogisticRegression())]
     )

From fada95f433089cbd0755116d4572b0aae182828c Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Fri, 9 Dec 2022 12:07:48 +0100
Subject: [PATCH 5/6] move test to common tests

---
 sklearn/pipeline.py            | 25 ++++++++++++++++++-------
 sklearn/tests/test_common.py   | 16 ++++++++++++++--
 sklearn/tests/test_pipeline.py |  9 ---------
 3 files changed, 32 insertions(+), 18 deletions(-)

diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py
index 9e5c5ea711ca6..5236c4499a728 100644
--- a/sklearn/pipeline.py
+++ b/sklearn/pipeline.py
@@ -28,7 +28,7 @@
 from .utils.validation import check_memory
 from .utils.validation import check_is_fitted
 from .utils import check_pandas_support
-from .utils._param_validation import HasMethods
+from .utils._param_validation import HasMethods, Hidden
 from .utils._set_output import _safe_set_output, _get_output_config
 from .utils.fixes import delayed
 from .exceptions import NotFittedError
@@ -41,7 +41,7 @@
 def _final_estimator_has(attr):
     """Check that final_estimator has `attr`.
 
-    Used together with `avaliable_if` in `Pipeline`."""
+    Used together with `available_if` in `Pipeline`."""
 
     def check(self):
         # raise original `AttributeError` if `attr` does not exist
@@ -145,7 +145,7 @@ class Pipeline(_BaseComposition):
     _required_parameters = ["steps"]
 
     _parameter_constraints: dict = {
-        "steps": "no_validation",  # validated in `_validate_steps`
+        "steps": [list, Hidden(tuple)],
         "memory": [None, str, HasMethods(["cache"])],
         "verbose": ["boolean"],
     }
@@ -314,8 +314,15 @@ def named_steps(self):
 
     @property
     def _final_estimator(self):
-        estimator = self.steps[-1][1]
-        return "passthrough" if estimator is None else estimator
+        try:
+            estimator = self.steps[-1][1]
+            return "passthrough" if estimator is None else estimator
+        except (ValueError, AttributeError, TypeError):
+            # This happens when a method guarded by `available_if` is accessed
+            # before `fit` has validated `steps`. We return `None` and an
+            # `InvalidParameterError` will be raised right after by the
+            # parameter validation.
+            return None
 
     def _log_message(self, step_idx):
         if not self.verbose:
@@ -738,8 +745,12 @@ def classes_(self):
         return self.steps[-1][1].classes_
 
     def _more_tags(self):
-        # check if first estimator expects pairwise input
-        return {"pairwise": _safe_tags(self.steps[0][1], "pairwise")}
+        try:
+            return {"pairwise": _safe_tags(self.steps[0][1], "pairwise")}
+        except (ValueError, AttributeError, TypeError):
+            # This happens when `steps` is not a list of (name, estimator)
+            # tuples and `fit` has not been called yet to validate the steps.
+            return {}
 
     def get_feature_names_out(self, input_features=None):
         """Get output feature names for transformation.
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 1781c4bde3134..9de45eed7e541 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -58,7 +58,7 @@
 from sklearn.model_selection import RandomizedSearchCV
 from sklearn.model_selection import HalvingGridSearchCV
 from sklearn.model_selection import HalvingRandomSearchCV
-from sklearn.pipeline import make_pipeline
+from sklearn.pipeline import make_pipeline, Pipeline
 
 from sklearn.utils import IS_PYPY
 from sklearn.utils._tags import _DEFAULT_TAGS, _safe_tags
@@ -273,6 +273,16 @@ def test_class_support_removed():
         parametrize_with_checks([LogisticRegression])
 
 
+def _generate_pipeline():
+    """Yield a scaler + final estimator `Pipeline` for each final estimator."""
+    for final_estimator in (Ridge(), LogisticRegression()):
+        steps = [
+            ("scaler", StandardScaler()),
+            ("final_estimator", final_estimator),
+        ]
+        yield Pipeline(steps=steps)
+
+
 def _generate_search_cv_instances():
     for SearchCV, (Estimator, param_grid) in product(
         [
@@ -458,7 +468,9 @@ def test_estimators_do_not_raise_errors_in_init_or_set_params(Estimator):
 
 
 @pytest.mark.parametrize(
-    "estimator", _tested_estimators(), ids=_get_check_estimator_ids
+    "estimator",
+    list(_tested_estimators()) + list(_generate_pipeline()),
+    ids=_get_check_estimator_ids,
 )
 def test_check_param_validation(estimator):
     name = estimator.__class__.__name__
diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py
index 3c04889d95faa..342dc12b966c9 100644
--- a/sklearn/tests/test_pipeline.py
+++ b/sklearn/tests/test_pipeline.py
@@ -12,7 +12,6 @@
 from scipy import sparse
 import joblib
 
-from sklearn.utils.estimator_checks import check_param_validation
 from sklearn.utils._testing import (
     assert_allclose,
     assert_array_equal,
@@ -1648,11 +1647,3 @@ def test_feature_union_getitem_error(key):
     msg = "Only string keys are supported"
     with pytest.raises(KeyError, match=msg):
         union[key]
-
-
-def test_pipeline_param_validation():
-    """Run the param validation for `Pipeline`."""
-    model = Pipeline(
-        [("scaler", StandardScaler()), ("classifier", LogisticRegression())]
-    )
-    check_param_validation("Pipeline", model)

From a5c8577c2fa52f135663e4953c737b839a0f134c Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Fri, 9 Dec 2022 13:51:27 +0100
Subject: [PATCH 6/6] TST chain estimator generators in test_check_param_validation

---
 sklearn/tests/test_common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index 9de45eed7e541..4f53e78471f4e 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -469,7 +469,7 @@ def test_estimators_do_not_raise_errors_in_init_or_set_params(Estimator):
 
 @pytest.mark.parametrize(
     "estimator",
-    list(_tested_estimators()) + list(_generate_pipeline()),
+    chain(_tested_estimators(), _generate_pipeline()),
     ids=_get_check_estimator_ids,
 )
 def test_check_param_validation(estimator):