From 212f5fa1919aab74a688272f151333a07d993b55 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 7 Dec 2022 17:44:15 +0100 Subject: [PATCH 1/6] MAINT validate parameters of Pipeline --- sklearn/base.py | 38 ++++++++++++++++++++-------------- sklearn/pipeline.py | 14 +++++++++++-- sklearn/tests/test_pipeline.py | 8 +++++++ 3 files changed, 43 insertions(+), 17 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index db82353662c0d..f86efab71e74d 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -109,7 +109,29 @@ def clone(estimator, *, safe=True): return new_object -class BaseEstimator: +class _ParamValidationMixin: + """Mixin class for all estimators in scikit-learn that validate their parameters. + + This mixin gives access to the `_validate_params` method, which is called + at `fit` at the beginning of the `fit` method. + """ + + def _validate_params(self): + """Validate types and values of constructor parameters + + The expected type and values must be defined in the `_parameter_constraints` + class attribute, which is a dictionary `param_name: list of constraints`. See + the docstring of `validate_parameter_constraints` for a description of the + accepted constraints. + """ + validate_parameter_constraints( + self._parameter_constraints, + self.get_params(deep=False), + caller_name=self.__class__.__name__, + ) + + +class BaseEstimator(_ParamValidationMixin): """Base class for all estimators in scikit-learn. Notes @@ -559,20 +581,6 @@ def _validate_data( return out - def _validate_params(self): - """Validate types and values of constructor parameters - - The expected type and values must be defined in the `_parameter_constraints` - class attribute, which is a dictionary `param_name: list of constraints`. See - the docstring of `validate_parameter_constraints` for a description of the - accepted constraints. - """ - validate_parameter_constraints( - self._parameter_constraints, - self.get_params(deep=False), - caller_name=self.__class__.__name__, - ) - @property def _repr_html_(self): """HTML representation of estimator. diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index cef3288b85439..b723ffe3bd2e6 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -16,7 +16,7 @@ from scipy import sparse from joblib import Parallel -from .base import clone, TransformerMixin +from .base import clone, TransformerMixin, _ParamValidationMixin from .preprocessing import FunctionTransformer from .utils._estimator_html_repr import _VisualBlock from .utils.metaestimators import available_if @@ -28,6 +28,7 @@ from .utils.validation import check_memory from .utils.validation import check_is_fitted from .utils import check_pandas_support +from .utils._param_validation import HasMethods from .utils._set_output import _safe_set_output, _get_output_config from .utils.fixes import delayed from .exceptions import NotFittedError @@ -50,7 +51,7 @@ def check(self): return check -class Pipeline(_BaseComposition): +class Pipeline(_BaseComposition, _ParamValidationMixin): """ Pipeline of transforms with a final estimator. @@ -143,6 +144,12 @@ class Pipeline(_BaseComposition): # BaseEstimator interface _required_parameters = ["steps"] + _parameter_constraints: dict = { + "steps": "no_validation", # validated in `_validate_steps` + "memory": [None, str, HasMethods(["cache"])], + "verbose": ["boolean"], + } + def __init__(self, steps, *, memory=None, verbose=False): self.steps = steps self.memory = memory @@ -398,6 +405,7 @@ def fit(self, X, y=None, **fit_params): self : object Pipeline with fitted steps. """ + self._validate_params() fit_params_steps = self._check_fit_params(**fit_params) Xt = self._fit(X, y, **fit_params_steps) with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): @@ -434,6 +442,7 @@ def fit_transform(self, X, y=None, **fit_params): Xt : ndarray of shape (n_samples, n_transformed_features) Transformed samples. """ + self._validate_params() fit_params_steps = self._check_fit_params(**fit_params) Xt = self._fit(X, y, **fit_params_steps) @@ -510,6 +519,7 @@ def fit_predict(self, X, y=None, **fit_params): y_pred : ndarray Result of calling `fit_predict` on the final estimator. """ + self._validate_params() fit_params_steps = self._check_fit_params(**fit_params) Xt = self._fit(X, y, **fit_params_steps) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index eab7d8027b3cd..0b96c81508ab6 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -12,6 +12,7 @@ from scipy import sparse import joblib +from sklearn.utils.estimator_checks import check_param_validation from sklearn.utils._testing import ( assert_allclose, assert_array_equal, @@ -1687,3 +1688,10 @@ def test_feature_union_getitem_error(key): msg = "Only string keys are supported" with pytest.raises(KeyError, match=msg): union[key] + + +def test_pipeline_param_validation(): + model = Pipeline( + [("scaler", StandardScaler()), ("classifier", LogisticRegression())] + ) + check_param_validation("Pipeline", model) From 8dead9532afea325baacd2231cedd7dd5308cdaf Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 7 Dec 2022 18:07:31 +0100 Subject: [PATCH 2/6] revert _ParamValidationMixin --- sklearn/base.py | 38 +++++++++++++++----------------------- sklearn/pipeline.py | 4 ++-- 2 files changed, 17 insertions(+), 25 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index f86efab71e74d..db82353662c0d 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -109,29 +109,7 @@ def clone(estimator, *, safe=True): return new_object -class _ParamValidationMixin: - """Mixin class for all estimators in scikit-learn that validate their parameters. - - This mixin gives access to the `_validate_params` method, which is called - at `fit` at the beginning of the `fit` method. - """ - - def _validate_params(self): - """Validate types and values of constructor parameters - - The expected type and values must be defined in the `_parameter_constraints` - class attribute, which is a dictionary `param_name: list of constraints`. See - the docstring of `validate_parameter_constraints` for a description of the - accepted constraints. - """ - validate_parameter_constraints( - self._parameter_constraints, - self.get_params(deep=False), - caller_name=self.__class__.__name__, - ) - - -class BaseEstimator(_ParamValidationMixin): +class BaseEstimator: """Base class for all estimators in scikit-learn. Notes @@ -581,6 +559,20 @@ def _validate_data( return out + def _validate_params(self): + """Validate types and values of constructor parameters + + The expected type and values must be defined in the `_parameter_constraints` + class attribute, which is a dictionary `param_name: list of constraints`. See + the docstring of `validate_parameter_constraints` for a description of the + accepted constraints. + """ + validate_parameter_constraints( + self._parameter_constraints, + self.get_params(deep=False), + caller_name=self.__class__.__name__, + ) + @property def _repr_html_(self): """HTML representation of estimator. diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index b723ffe3bd2e6..9e5c5ea711ca6 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -16,7 +16,7 @@ from scipy import sparse from joblib import Parallel -from .base import clone, TransformerMixin, _ParamValidationMixin +from .base import clone, TransformerMixin from .preprocessing import FunctionTransformer from .utils._estimator_html_repr import _VisualBlock from .utils.metaestimators import available_if @@ -51,7 +51,7 @@ def check(self): return check -class Pipeline(_BaseComposition, _ParamValidationMixin): +class Pipeline(_BaseComposition): """ Pipeline of transforms with a final estimator. From b7db3e93409e4945a388f214eedd6857ec9faf40 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 7 Dec 2022 18:17:16 +0100 Subject: [PATCH 3/6] TST remove redundant test --- sklearn/tests/test_pipeline.py | 40 ---------------------------------- 1 file changed, 40 deletions(-) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 0b96c81508ab6..67ad65b74b1eb 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1186,46 +1186,6 @@ def test_set_params_nested_pipeline(): estimator.set_params(a__steps=[("b", LogisticRegression())], a__b__C=5) -def test_pipeline_wrong_memory(): - # Test that an error is raised when memory is not a string or a Memory - # instance - X = iris.data - y = iris.target - # Define memory as an integer - memory = 1 - cached_pipe = Pipeline([("transf", DummyTransf()), ("svc", SVC())], memory=memory) - - msg = re.escape( - "'memory' should be None, a string or have the same interface " - "as joblib.Memory. Got memory='1' instead." - ) - with pytest.raises(ValueError, match=msg): - cached_pipe.fit(X, y) - - -class DummyMemory: - def cache(self, func): - return func - - -class WrongDummyMemory: - pass - - -def test_pipeline_with_cache_attribute(): - X = np.array([[1, 2]]) - pipe = Pipeline([("transf", Transf()), ("clf", Mult())], memory=DummyMemory()) - pipe.fit(X, y=None) - dummy = WrongDummyMemory() - pipe = Pipeline([("transf", Transf()), ("clf", Mult())], memory=dummy) - msg = re.escape( - "'memory' should be None, a string or have the same interface " - f"as joblib.Memory. Got memory='{dummy}' instead." - ) - with pytest.raises(ValueError, match=msg): - pipe.fit(X) - - def test_pipeline_memory(): X = iris.data y = iris.target From 1d15e441c4f0781fdcbe98f54d21ebea528e0c0e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 8 Dec 2022 11:33:07 +0100 Subject: [PATCH 4/6] Update test_pipeline.py --- sklearn/tests/test_pipeline.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 67ad65b74b1eb..3c04889d95faa 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1651,6 +1651,7 @@ def test_feature_union_getitem_error(key): def test_pipeline_param_validation(): + """Run the param validation for `Pipeline`.""" model = Pipeline( [("scaler", StandardScaler()), ("classifier", LogisticRegression())] ) From fada95f433089cbd0755116d4572b0aae182828c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 9 Dec 2022 12:07:48 +0100 Subject: [PATCH 5/6] move test to common tests --- sklearn/pipeline.py | 25 ++++++++++++++++++------- sklearn/tests/test_common.py | 16 ++++++++++++++-- sklearn/tests/test_pipeline.py | 9 --------- 3 files changed, 32 insertions(+), 18 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 9e5c5ea711ca6..5236c4499a728 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -28,7 +28,7 @@ from .utils.validation import check_memory from .utils.validation import check_is_fitted from .utils import check_pandas_support -from .utils._param_validation import HasMethods +from .utils._param_validation import HasMethods, Hidden from .utils._set_output import _safe_set_output, _get_output_config from .utils.fixes import delayed from .exceptions import NotFittedError @@ -41,7 +41,7 @@ def _final_estimator_has(attr): """Check that final_estimator has `attr`. - Used together with `avaliable_if` in `Pipeline`.""" + Used together with `available_if` in `Pipeline`.""" def check(self): # raise original `AttributeError` if `attr` does not exist @@ -145,7 +145,7 @@ class Pipeline(_BaseComposition): _required_parameters = ["steps"] _parameter_constraints: dict = { - "steps": "no_validation", # validated in `_validate_steps` + "steps": [list, Hidden(tuple)], "memory": [None, str, HasMethods(["cache"])], "verbose": ["boolean"], } @@ -314,8 +314,15 @@ def named_steps(self): @property def _final_estimator(self): - estimator = self.steps[-1][1] - return "passthrough" if estimator is None else estimator + try: + estimator = self.steps[-1][1] + return "passthrough" if estimator is None else estimator + except (ValueError, AttributeError, TypeError): + # This condition happens when a call to a method is first calling + # `_available_if` and `fit` did not validate `steps` yet. We + # return `None` and an `InvalidParameterError` will be raised + # right after. + return None def _log_message(self, step_idx): if not self.verbose: @@ -738,8 +745,12 @@ def classes_(self): return self.steps[-1][1].classes_ def _more_tags(self): - # check if first estimator expects pairwise input - return {"pairwise": _safe_tags(self.steps[0][1], "pairwise")} + try: + return {"pairwise": _safe_tags(self.steps[0][1], "pairwise")} + except (ValueError, AttributeError, TypeError): + # This happens when the `steps` is not a list of (name, estimator) + # tuples and `fit` is not called yet to validate the steps. + return {} def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 1781c4bde3134..9de45eed7e541 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -58,7 +58,7 @@ from sklearn.model_selection import RandomizedSearchCV from sklearn.model_selection import HalvingGridSearchCV from sklearn.model_selection import HalvingRandomSearchCV -from sklearn.pipeline import make_pipeline +from sklearn.pipeline import make_pipeline, Pipeline from sklearn.utils import IS_PYPY from sklearn.utils._tags import _DEFAULT_TAGS, _safe_tags @@ -273,6 +273,16 @@ def test_class_support_removed(): parametrize_with_checks([LogisticRegression]) +def _generate_pipeline(): + for final_estimator in [Ridge(), LogisticRegression()]: + yield Pipeline( + steps=[ + ("scaler", StandardScaler()), + ("final_estimator", final_estimator), + ] + ) + + def _generate_search_cv_instances(): for SearchCV, (Estimator, param_grid) in product( [ @@ -458,7 +468,9 @@ def test_estimators_do_not_raise_errors_in_init_or_set_params(Estimator): @pytest.mark.parametrize( - "estimator", _tested_estimators(), ids=_get_check_estimator_ids + "estimator", + list(_tested_estimators()) + list(_generate_pipeline()), + ids=_get_check_estimator_ids, ) def test_check_param_validation(estimator): name = estimator.__class__.__name__ diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 3c04889d95faa..342dc12b966c9 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -12,7 +12,6 @@ from scipy import sparse import joblib -from sklearn.utils.estimator_checks import check_param_validation from sklearn.utils._testing import ( assert_allclose, assert_array_equal, @@ -1648,11 +1647,3 @@ def test_feature_union_getitem_error(key): msg = "Only string keys are supported" with pytest.raises(KeyError, match=msg): union[key] - - -def test_pipeline_param_validation(): - """Run the param validation for `Pipeline`.""" - model = Pipeline( - [("scaler", StandardScaler()), ("classifier", LogisticRegression())] - ) - check_param_validation("Pipeline", model) From a5c8577c2fa52f135663e4953c737b839a0f134c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 9 Dec 2022 13:51:27 +0100 Subject: [PATCH 6/6] iter --- sklearn/tests/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 9de45eed7e541..4f53e78471f4e 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -469,7 +469,7 @@ def test_estimators_do_not_raise_errors_in_init_or_set_params(Estimator): @pytest.mark.parametrize( "estimator", - list(_tested_estimators()) + list(_generate_pipeline()), + chain(_tested_estimators(), _generate_pipeline()), ids=_get_check_estimator_ids, ) def test_check_param_validation(estimator):