diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index cef3288b85439..5236c4499a728 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -28,6 +28,7 @@ from .utils.validation import check_memory from .utils.validation import check_is_fitted from .utils import check_pandas_support +from .utils._param_validation import HasMethods, Hidden from .utils._set_output import _safe_set_output, _get_output_config from .utils.fixes import delayed from .exceptions import NotFittedError @@ -40,7 +41,7 @@ def _final_estimator_has(attr): """Check that final_estimator has `attr`. - Used together with `avaliable_if` in `Pipeline`.""" + Used together with `available_if` in `Pipeline`.""" def check(self): # raise original `AttributeError` if `attr` does not exist @@ -143,6 +144,12 @@ class Pipeline(_BaseComposition): # BaseEstimator interface _required_parameters = ["steps"] + _parameter_constraints: dict = { + "steps": [list, Hidden(tuple)], + "memory": [None, str, HasMethods(["cache"])], + "verbose": ["boolean"], + } + def __init__(self, steps, *, memory=None, verbose=False): self.steps = steps self.memory = memory @@ -307,8 +314,15 @@ def named_steps(self): @property def _final_estimator(self): - estimator = self.steps[-1][1] - return "passthrough" if estimator is None else estimator + try: + estimator = self.steps[-1][1] + return "passthrough" if estimator is None else estimator + except (ValueError, AttributeError, TypeError): + # This condition happens when a call to a method is first calling + # `available_if` and `fit` did not validate `steps` yet. We + # return `None` and an `InvalidParameterError` will be raised + # right after. + return None def _log_message(self, step_idx): if not self.verbose: @@ -398,6 +412,7 @@ def fit(self, X, y=None, **fit_params): self : object Pipeline with fitted steps. 
""" + self._validate_params() fit_params_steps = self._check_fit_params(**fit_params) Xt = self._fit(X, y, **fit_params_steps) with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): @@ -434,6 +449,7 @@ def fit_transform(self, X, y=None, **fit_params): Xt : ndarray of shape (n_samples, n_transformed_features) Transformed samples. """ + self._validate_params() fit_params_steps = self._check_fit_params(**fit_params) Xt = self._fit(X, y, **fit_params_steps) @@ -510,6 +526,7 @@ def fit_predict(self, X, y=None, **fit_params): y_pred : ndarray Result of calling `fit_predict` on the final estimator. """ + self._validate_params() fit_params_steps = self._check_fit_params(**fit_params) Xt = self._fit(X, y, **fit_params_steps) @@ -728,8 +745,12 @@ def classes_(self): return self.steps[-1][1].classes_ def _more_tags(self): - # check if first estimator expects pairwise input - return {"pairwise": _safe_tags(self.steps[0][1], "pairwise")} + try: + return {"pairwise": _safe_tags(self.steps[0][1], "pairwise")} + except (ValueError, AttributeError, TypeError): + # This happens when the `steps` is not a list of (name, estimator) + # tuples and `fit` is not called yet to validate the steps. + return {} def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. 
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 1781c4bde3134..4f53e78471f4e 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -58,7 +58,7 @@ from sklearn.model_selection import RandomizedSearchCV from sklearn.model_selection import HalvingGridSearchCV from sklearn.model_selection import HalvingRandomSearchCV -from sklearn.pipeline import make_pipeline +from sklearn.pipeline import make_pipeline, Pipeline from sklearn.utils import IS_PYPY from sklearn.utils._tags import _DEFAULT_TAGS, _safe_tags @@ -273,6 +273,16 @@ def test_class_support_removed(): parametrize_with_checks([LogisticRegression]) +def _generate_pipeline(): + for final_estimator in [Ridge(), LogisticRegression()]: + yield Pipeline( + steps=[ + ("scaler", StandardScaler()), + ("final_estimator", final_estimator), + ] + ) + + def _generate_search_cv_instances(): for SearchCV, (Estimator, param_grid) in product( [ @@ -458,7 +468,9 @@ def test_estimators_do_not_raise_errors_in_init_or_set_params(Estimator): @pytest.mark.parametrize( - "estimator", _tested_estimators(), ids=_get_check_estimator_ids + "estimator", + chain(_tested_estimators(), _generate_pipeline()), + ids=_get_check_estimator_ids, ) def test_check_param_validation(estimator): name = estimator.__class__.__name__ diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index eab7d8027b3cd..342dc12b966c9 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1185,46 +1185,6 @@ def test_set_params_nested_pipeline(): estimator.set_params(a__steps=[("b", LogisticRegression())], a__b__C=5) -def test_pipeline_wrong_memory(): - # Test that an error is raised when memory is not a string or a Memory - # instance - X = iris.data - y = iris.target - # Define memory as an integer - memory = 1 - cached_pipe = Pipeline([("transf", DummyTransf()), ("svc", SVC())], memory=memory) - - msg = re.escape( - "'memory' should be None, a string or have the 
same interface " - "as joblib.Memory. Got memory='1' instead." - ) - with pytest.raises(ValueError, match=msg): - cached_pipe.fit(X, y) - - -class DummyMemory: - def cache(self, func): - return func - - -class WrongDummyMemory: - pass - - -def test_pipeline_with_cache_attribute(): - X = np.array([[1, 2]]) - pipe = Pipeline([("transf", Transf()), ("clf", Mult())], memory=DummyMemory()) - pipe.fit(X, y=None) - dummy = WrongDummyMemory() - pipe = Pipeline([("transf", Transf()), ("clf", Mult())], memory=dummy) - msg = re.escape( - "'memory' should be None, a string or have the same interface " - f"as joblib.Memory. Got memory='{dummy}' instead." - ) - with pytest.raises(ValueError, match=msg): - pipe.fit(X) - - def test_pipeline_memory(): X = iris.data y = iris.target