From 868d0ff4fa9593fd9e9c65429a7bcb678dfa2be7 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 15 Apr 2024 11:25:05 +0200 Subject: [PATCH 01/22] FEAT allow metadata to be transformed in Pipeline --- sklearn/pipeline.py | 45 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 1b17599068d7a..4816b69a10e12 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -95,6 +95,17 @@ class Pipeline(_BaseComposition): must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators <combining_estimators>` for more details. + transform_input : list of str, default=None + This enables transforming some input arguments to ``fit`` (other than ``X``) + to be transformed by the steps of the pipeline up to the step which requires + them. Requirement is defined via :ref:`metadata routing <metadata_routing>`. + This can be used to pass a validation set through the pipeline for instance. + + See the example TBD for more details. + + You can only set this if metadata routing is enabled, which you + can enable using ``sklearn.set_config(enable_metadata_routing=True)``. + memory : str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. 
By default, no @@ -162,12 +173,14 @@ class Pipeline(_BaseComposition): _parameter_constraints: dict = { "steps": [list, Hidden(tuple)], + "transform_input": [list, None], "memory": [None, str, HasMethods(["cache"])], "verbose": ["boolean"], } - def __init__(self, steps, *, memory=None, verbose=False): + def __init__(self, steps, *, transform_input=None, memory=None, verbose=False): self.steps = steps + self.transform_input = transform_input self.memory = memory self.verbose = verbose @@ -409,7 +422,7 @@ def _fit(self, X, y=None, routed_params=None): cloned_transformer, X, y, - None, + weight=None, message_clsname="Pipeline", message=self._log_message(step_idx), params=routed_params[name], @@ -1288,7 +1301,14 @@ def _transform_one(transformer, X, y, weight, params): def _fit_transform_one( - transformer, X, y, weight, message_clsname="", message=None, params=None + transformer, + X, + y, + weight, + message_clsname="", + message=None, + params=None, + to_transform=None, ): """ Fits ``transformer`` to ``X`` and ``y``. The transformed result is returned @@ -1296,8 +1316,20 @@ def _fit_transform_one( be multiplied by ``weight``. ``params`` needs to be of the form ``process_routing()["step_name"]``. + + ``to_transform`` is a dict of {arg: value} for input parameters to be + transformed along ``X``. """ params = params or {} + to_transform = to_transform or {} + if weight is not None and to_transform: + # This should never happen! "to_transform" is used in Pipeline, while + # weight is used in ColumnTransformer and/or FeatureUnion. + raise ValueError( + "Cannot apply weight and transform parameters simultaneously. 
" + "Got weight={}, to_transform={}".format(weight, to_transform) + ) + with _print_elapsed_time(message_clsname, message): if hasattr(transformer, "fit_transform"): res = transformer.fit_transform(X, y, **params.get("fit_transform", {})) @@ -1305,10 +1337,13 @@ def _fit_transform_one( res = transformer.fit(X, y, **params.get("fit", {})).transform( X, **params.get("transform", {}) ) + transformed = dict() + for param, value in to_transform.items(): + transformed[param] = transformer.transform(value) if weight is None: - return res, transformer - return res * weight, transformer + return res, transformed, transformer + return res * weight, transformed, transformer def _fit_one(transformer, X, y, weight, message_clsname="", message=None, params=None): From 94c8bd96349c8a6f1611223c739715eb67f2158d Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 26 Apr 2024 17:28:31 +0200 Subject: [PATCH 02/22] add tests --- sklearn/pipeline.py | 83 +++++++++++++++++++++++++--------- sklearn/tests/test_pipeline.py | 60 +++++++++++++++++++++++- 2 files changed, 120 insertions(+), 23 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 487074a666d23..9b54ddbe40ba0 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -393,6 +393,32 @@ def _check_method_params(self, method, props, **kwargs): fit_params_steps[step]["fit_predict"][param] = pval return fit_params_steps + def _get_step_params(self, *, step_idx, params): + """Get params (metadata) for step `name`. + + This transforms the metadata up to this step if required, which is + indicated by the `transform_input` parameter. + + If a param in `params` is included in the `transform_input` list, it + will be transformed. 
+ """ + if self.transform_input is None or params is None or step_idx == 0: + return params + + step_params = dict() + transformed = dict() # used to transform each param once + for method, method_params in params.items(): + step_params[method] = Bunch() + for param_name, param_value in method_params.items(): + if param_name in self.transform_input: + # transform the parameter + if param_name not in transformed: + transformed[param_name] = self[:step_idx].transform(param_value) + step_params[method][param_name] = transformed[param_name] + else: + step_params[method][param_name] = param_value + return step_params + # Estimator interface def _fit(self, X, y=None, routed_params=None): @@ -418,6 +444,10 @@ def _fit(self, X, y=None, routed_params=None): else: cloned_transformer = clone(transformer) # Fit or load from cache the current transformer + step_params = self._get_step_params( + step_idx=step_idx, params=routed_params[name] + ) + X, fitted_transformer = fit_transform_one_cached( cloned_transformer, X, @@ -425,7 +455,7 @@ def _fit(self, X, y=None, routed_params=None): weight=None, message_clsname="Pipeline", message=self._log_message(step_idx), - params=routed_params[name], + params=step_params, ) # Replace the transformer of the step with the fitted # transformer. This is necessary when loading the transformer @@ -480,11 +510,20 @@ def fit(self, X, y=None, **params): self : object Pipeline with fitted steps. """ + if not _routing_enabled() and self.transform_input is not None: + raise ValueError( + "The `transform_input` parameter can only be set if metadata " + "routing is enabled. You can enable metadata routing using " + "`sklearn.set_config(enable_metadata_routing=True)`." 
+ ) + routed_params = self._check_method_params(method="fit", props=params) Xt = self._fit(X, y, routed_params) with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): if self._final_estimator != "passthrough": - last_step_params = routed_params[self.steps[-1][0]] + last_step_params = self._get_step_params( + step_idx=len(self) - 1, params=routed_params[self.steps[-1][0]] + ) self._final_estimator.fit(Xt, y, **last_step_params["fit"]) return self @@ -1223,7 +1262,7 @@ def _name_estimators(estimators): return list(zip(names, estimators)) -def make_pipeline(*steps, memory=None, verbose=False): +def make_pipeline(*steps, memory=None, transform_input=None, verbose=False): """Construct a :class:`Pipeline` from the given estimators. This is a shorthand for the :class:`Pipeline` constructor; it does not @@ -1245,6 +1284,17 @@ def make_pipeline(*steps, memory=None, verbose=False): or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. + transform_input : list of str, default=None + This enables transforming some input arguments to ``fit`` (other than ``X``) + to be transformed by the steps of the pipeline up to the step which requires + them. Requirement is defined via :ref:`metadata routing <metadata_routing>`. + This can be used to pass a validation set through the pipeline for instance. + + See the example TBD for more details. + + You can only set this if metadata routing is enabled, which you + can enable using ``sklearn.set_config(enable_metadata_routing=True)``. + verbose : bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed. 
@@ -1268,7 +1318,12 @@ def make_pipeline(*steps, memory=None, verbose=False): Pipeline(steps=[('standardscaler', StandardScaler()), ('gaussiannb', GaussianNB())]) """ - return Pipeline(_name_estimators(steps), memory=memory, verbose=verbose) + return Pipeline( + _name_estimators(steps), + transform_input=transform_input, + memory=memory, + verbose=verbose, + ) def _transform_one(transformer, X, y, weight, columns=None, params=None): @@ -1315,7 +1370,6 @@ def _fit_transform_one( message_clsname="", message=None, params=None, - to_transform=None, ): """ Fits ``transformer`` to ``X`` and ``y``. The transformed result is returned @@ -1323,23 +1377,11 @@ def _fit_transform_one( be multiplied by ``weight``. ``params`` needs to be of the form ``process_routing()["step_name"]``. - - ``to_transform`` is a dict of {arg: value} for input parameters to be - transformed along ``X``. """ if columns is not None: X = _safe_indexing(X, columns, axis=1) params = params or {} - to_transform = to_transform or {} - if weight is not None and to_transform: - # This should never happen! "to_transform" is used in Pipeline, while - # weight is used in ColumnTransformer and/or FeatureUnion. - raise ValueError( - "Cannot apply weight and transform parameters simultaneously. 
" - "Got weight={}, to_transform={}".format(weight, to_transform) - ) - with _print_elapsed_time(message_clsname, message): if hasattr(transformer, "fit_transform"): res = transformer.fit_transform(X, y, **params.get("fit_transform", {})) @@ -1347,13 +1389,10 @@ def _fit_transform_one( res = transformer.fit(X, y, **params.get("fit", {})).transform( X, **params.get("transform", {}) ) - transformed = dict() - for param, value in to_transform.items(): - transformed[param] = transformer.transform(value) if weight is None: - return res, transformed, transformer - return res * weight, transformed, transformer + return res, transformer + return res * weight, transformer def _fit_one(transformer, X, y, weight, message_clsname="", message=None, params=None): diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 1d4cfb3dd6e2b..59583a59a3d0d 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -334,7 +334,8 @@ def test_pipeline_raise_set_params_error(): # expected error message error_msg = re.escape( "Invalid parameter 'fake' for estimator Pipeline(steps=[('cls'," - " LinearRegression())]). Valid parameters are: ['memory', 'steps', 'verbose']." + " LinearRegression())]). Valid parameters are: ['memory', 'steps'," + " 'transform_input', 'verbose']." 
) with pytest.raises(ValueError, match=error_msg): pipe.set_params(fake="nope") @@ -759,6 +760,7 @@ def make(): "memory": None, "m2__mult": 2, "last__mult": 5, + "transform_input": None, "verbose": False, } @@ -1792,6 +1794,62 @@ def test_feature_union_feature_names_in_(): assert not hasattr(union, "feature_names_in_") +# transform_input tests +# ===================== + + +class IncTransformer(BaseEstimator, TransformerMixin): + """Transformer that increments the input by 1.""" + + def __init__(self, expected_fit_param=None, metadata_expected=True): + self.expected_fit_param = expected_fit_param + self.metadata_expected = metadata_expected + + def fit(self, X, y=None, expected_fit_param=None): + if self.metadata_expected: + assert_array_equal(expected_fit_param, self.expected_fit_param) + return self + + def transform(self, X): + return X + 1 + + +@pytest.mark.usefixtures("enable_slep006") +def test_transform_input_pipeline(): + """Test that with transform_input, data is correctly transformed for each step.""" + X = np.array([[1, 2], [3, 4]]) + y = np.array([0, 1]) + expected_fit_param = np.array([[1, 2]]) + pipe = make_pipeline( + IncTransformer(expected_fit_param=expected_fit_param).set_fit_request( + expected_fit_param=True + ), + IncTransformer().set_fit_request(expected_fit_param=False), + IncTransformer(expected_fit_param=expected_fit_param + 2).set_fit_request( + expected_fit_param=True + ), + IncTransformer(expected_fit_param=expected_fit_param + 3).set_fit_request( + expected_fit_param=True + ), + transform_input=["expected_fit_param"], + ) + + pipe.fit(X, y, expected_fit_param=expected_fit_param) + + +def test_transform_input_no_slep6(): + """Make sure the right error is raised if slep6 is not enabled.""" + X = np.array([[1, 2], [3, 4]]) + y = np.array([0, 1]) + msg = "The `transform_input` parameter can only be set if metadata" + with pytest.raises(ValueError, match=msg): + make_pipeline(DummyTransf(), transform_input=["blah"]).fit(X, y) + + +# end of 
transform_input tests +# ============================= + + # Test that metadata is routed correctly for pipelines and FeatureUnion # ===================================================================== From 818da329a21d43b3099266127c2b403433b9ad36 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 26 Apr 2024 17:34:08 +0200 Subject: [PATCH 03/22] add fit_transform --- sklearn/pipeline.py | 4 +++- sklearn/tests/test_pipeline.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 9b54ddbe40ba0..ac564cf76a7d0 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -590,7 +590,9 @@ def fit_transform(self, X, y=None, **params): with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): if last_step == "passthrough": return Xt - last_step_params = routed_params[self.steps[-1][0]] + last_step_params = self._get_step_params( + step_idx=len(self) - 1, params=routed_params[self.steps[-1][0]] + ) if hasattr(last_step, "fit_transform"): return last_step.fit_transform( Xt, y, **last_step_params["fit_transform"] diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 59583a59a3d0d..ae8d0c2f0c3ef 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1835,6 +1835,7 @@ def test_transform_input_pipeline(): ) pipe.fit(X, y, expected_fit_param=expected_fit_param) + pipe.fit_transform(X, y, expected_fit_param=expected_fit_param) def test_transform_input_no_slep6(): From 067946cb3515050d4ac1c5792e4243eec82d6ba7 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 29 Apr 2024 13:05:49 +0200 Subject: [PATCH 04/22] fix pprint test --- sklearn/utils/tests/test_pprint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_pprint.py b/sklearn/utils/tests/test_pprint.py index ec48c4a012574..4192f388a9f63 100644 --- a/sklearn/utils/tests/test_pprint.py +++ b/sklearn/utils/tests/test_pprint.py @@ -303,7 
+303,7 @@ def test_pipeline(print_changed_only_false): penalty='l2', random_state=None, solver='warn', tol=0.0001, verbose=0, warm_start=False))], - verbose=False)""" + transform_input=None, verbose=False)""" expected = expected[1:] # remove first \n assert pipeline.__repr__() == expected From 85c10a426aad17bc40a5cc900a24901a20b6f216 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 7 May 2024 15:00:22 +0200 Subject: [PATCH 05/22] add changelog --- doc/whats_new/v1.6.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index b90394c75b6ff..7ef641e38f99e 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -40,3 +40,10 @@ Thanks to everyone who has contributed to the maintenance and improvement of the project since version 1.5, including: TODO: update at the time of the release. + +:mod:`pipeline` +--------------- + +- |Feature| :class:`pipeline.Pipeline` can now transform metadata up to the step + requiring the metadata, which can be set using the `transform_input` parameter. + :pr:`28901` by `Adrin Jalali`_. 
From ad269ea889ec3d49c57826d252eead22cd40c08e Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 8 May 2024 11:26:20 +0200 Subject: [PATCH 06/22] much more extensive tests --- sklearn/pipeline.py | 76 ++++++++++++----- sklearn/tests/metadata_routing_common.py | 51 +++++++----- sklearn/tests/test_pipeline.py | 102 ++++++++++++++++------- 3 files changed, 156 insertions(+), 73 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 7b36ec84896c3..4407fbffcfd6c 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -35,6 +35,7 @@ MethodMapping, _raise_for_params, _routing_enabled, + get_routing_for_object, process_routing, ) from .utils.metaestimators import _BaseComposition, available_if @@ -394,35 +395,66 @@ def _check_method_params(self, method, props, **kwargs): fit_params_steps[step]["fit_predict"][param] = pval return fit_params_steps - def _get_step_params(self, *, step_idx, params): + def _get_step_params(self, *, step_idx, step_params, all_params): """Get params (metadata) for step `name`. This transforms the metadata up to this step if required, which is indicated by the `transform_input` parameter. - If a param in `params` is included in the `transform_input` list, it + If a param in `step_params` is included in the `transform_input` list, it will be transformed. - """ - if self.transform_input is None or params is None or step_idx == 0: - return params - step_params = dict() - transformed = dict() # used to transform each param once - for method, method_params in params.items(): - step_params[method] = Bunch() + `all_params` are the metadata passed by the user. Used to call `transform` + on the pipeline itself. + """ + if ( + self.transform_input is None + or not all_params + or not step_params + or step_idx == 0 + ): + # we only need to process step_params if transform_input is set + # and metadata is given by the user. 
+ return step_params + + sub_pipeline = self[:step_idx] + sub_metadata_routing = get_routing_for_object(sub_pipeline) + # here we get the metadata required by sub_pipeline.transform + transform_params = { + key: value + for key, value in all_params.items() + if key + in sub_metadata_routing.consumes( + method="transform", params=all_params.keys() + ) + } + transformed_params = dict() + transformed_cache = dict() # used to transform each param once + for method, method_params in step_params.items(): + transformed_params[method] = Bunch() for param_name, param_value in method_params.items(): if param_name in self.transform_input: # transform the parameter - if param_name not in transformed: - transformed[param_name] = self[:step_idx].transform(param_value) - step_params[method][param_name] = transformed[param_name] + if param_name not in transformed_cache: + transformed_cache[param_name] = sub_pipeline.transform( + param_value, **transform_params + ) + transformed_params[method][param_name] = transformed_cache[ + param_name + ] else: - step_params[method][param_name] = param_value - return step_params + transformed_params[method][param_name] = param_value + return transformed_params # Estimator interface - def _fit(self, X, y=None, routed_params=None): + def _fit(self, X, y=None, routed_params=None, raw_params=None): + """Fit the pipeline except the last step. + + routed_params is the output of `process_routing` + raw_params is the parameters passed by the user, used when `transform_input` + is set by the user, to transform metadata using a sub-pipeline. 
+ """ # shallow copy of steps - this should really be steps_ self.steps = list(self.steps) self._validate_steps() @@ -446,7 +478,9 @@ def _fit(self, X, y=None, routed_params=None): cloned_transformer = clone(transformer) # Fit or load from cache the current transformer step_params = self._get_step_params( - step_idx=step_idx, params=routed_params[name] + step_idx=step_idx, + step_params=routed_params[name], + all_params=raw_params, ) X, fitted_transformer = fit_transform_one_cached( @@ -519,11 +553,13 @@ def fit(self, X, y=None, **params): ) routed_params = self._check_method_params(method="fit", props=params) - Xt = self._fit(X, y, routed_params) + Xt = self._fit(X, y, routed_params, raw_params=params) with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): if self._final_estimator != "passthrough": last_step_params = self._get_step_params( - step_idx=len(self) - 1, params=routed_params[self.steps[-1][0]] + step_idx=len(self) - 1, + step_params=routed_params[self.steps[-1][0]], + all_params=params, ) self._final_estimator.fit(Xt, y, **last_step_params["fit"]) @@ -592,7 +628,9 @@ def fit_transform(self, X, y=None, **params): if last_step == "passthrough": return Xt last_step_params = self._get_step_params( - step_idx=len(self) - 1, params=routed_params[self.steps[-1][0]] + step_idx=len(self) - 1, + step_params=routed_params[self.steps[-1][0]], + all_params=params, ) if hasattr(last_step, "fit_transform"): return last_step.fit_transform( diff --git a/sklearn/tests/metadata_routing_common.py b/sklearn/tests/metadata_routing_common.py index 889524bc05ddb..4e1abb6ce1d59 100644 --- a/sklearn/tests/metadata_routing_common.py +++ b/sklearn/tests/metadata_routing_common.py @@ -35,13 +35,15 @@ def record_metadata(obj, method, record_default=True, **kwargs): """ if not hasattr(obj, "_records"): obj._records = {} + if method not in obj._records: + obj._records[method] = [] if not record_default: kwargs = { key: val for key, val in kwargs.items() if not 
isinstance(val, str) or (val != "default") } - obj._records[method] = kwargs + obj._records[method].append(kwargs) def check_recorded_metadata(obj, method, split_params=tuple(), **kwargs): @@ -59,21 +61,24 @@ def check_recorded_metadata(obj, method, split_params=tuple(), **kwargs): **kwargs : dict passed metadata """ - records = getattr(obj, "_records", dict()).get(method, dict()) - assert set(kwargs.keys()) == set( - records.keys() - ), f"Expected {kwargs.keys()} vs {records.keys()}" - for key, value in kwargs.items(): - recorded_value = records[key] - # The following condition is used to check for any specified parameters - # being a subset of the original values - if key in split_params and recorded_value is not None: - assert np.isin(recorded_value, value).all() - else: - if isinstance(recorded_value, np.ndarray): - assert_array_equal(recorded_value, value) + all_records = getattr(obj, "_records", dict()).get(method, dict()) + for record in all_records: + assert set(kwargs.keys()) == set( + record.keys() + ), f"Expected {kwargs.keys()} vs {record.keys()}" + for key, value in kwargs.items(): + recorded_value = record[key] + # The following condition is used to check for any specified parameters + # being a subset of the original values + if key in split_params and recorded_value is not None: + assert np.isin(recorded_value, value).all() else: - assert recorded_value is value, f"Expected {recorded_value} vs {value}" + if isinstance(recorded_value, np.ndarray): + assert_array_equal(recorded_value, value) + else: + assert ( + recorded_value is value + ), f"Expected {recorded_value} vs {value}. 
Method: {method}" record_metadata_not_default = partial(record_metadata, record_default=False) @@ -306,7 +311,7 @@ class ConsumingTransformer(TransformerMixin, BaseEstimator): def __init__(self, registry=None): self.registry = registry - def fit(self, X, y=None, sample_weight=None, metadata=None): + def fit(self, X, y=None, sample_weight="default", metadata="default"): if self.registry is not None: self.registry.append(self) @@ -315,18 +320,18 @@ def fit(self, X, y=None, sample_weight=None, metadata=None): ) return self - def transform(self, X, sample_weight=None, metadata=None): - record_metadata( + def transform(self, X, sample_weight="default", metadata="default"): + record_metadata_not_default( self, "transform", sample_weight=sample_weight, metadata=metadata ) - return X + return X + 1 - def fit_transform(self, X, y, sample_weight=None, metadata=None): + def fit_transform(self, X, y, sample_weight="default", metadata="default"): # implementing ``fit_transform`` is necessary since # ``TransformerMixin.fit_transform`` doesn't route any metadata to # ``transform``, while here we want ``transform`` to receive # ``sample_weight`` and ``metadata``. 
- record_metadata( + record_metadata_not_default( self, "fit_transform", sample_weight=sample_weight, metadata=metadata ) return self.fit(X, y, sample_weight=sample_weight, metadata=metadata).transform( @@ -334,10 +339,10 @@ def fit_transform(self, X, y, sample_weight=None, metadata=None): ) def inverse_transform(self, X, sample_weight=None, metadata=None): - record_metadata( + record_metadata_not_default( self, "inverse_transform", sample_weight=sample_weight, metadata=metadata ) - return X + return X - 1 class ConsumingNoFitTransformTransformer(BaseEstimator): diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 9dc493031dc5a..5dd3f8b579ecc 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -335,7 +335,8 @@ def test_pipeline_raise_set_params_error(): # expected error message error_msg = re.escape( "Invalid parameter 'fake' for estimator Pipeline(steps=[('cls'," - " LinearRegression())]). Valid parameters are: ['memory', 'steps', 'verbose']." + " LinearRegression())]). Valid parameters are: ['memory', 'steps'," + " 'transform_input', 'verbose']." 
) with pytest.raises(ValueError, match=error_msg): pipe.set_params(fake="nope") @@ -760,6 +761,7 @@ def make(): "memory": None, "m2__mult": 2, "last__mult": 5, + "transform_input": None, "verbose": False, } @@ -1817,44 +1819,82 @@ def test_pipeline_inverse_transform_Xt_deprecation(): # ===================== -class IncTransformer(BaseEstimator, TransformerMixin): - """Transformer that increments the input by 1.""" - - def __init__(self, expected_fit_param=None, metadata_expected=True): - self.expected_fit_param = expected_fit_param - self.metadata_expected = metadata_expected +@pytest.mark.usefixtures("enable_slep006") +@pytest.mark.parametrize("method", ["fit", "fit_transform"]) +def test_transform_input_pipeline(method): + """Test that with transform_input, data is correctly transformed for each step.""" - def fit(self, X, y=None, expected_fit_param=None): - if self.metadata_expected: - assert_array_equal(expected_fit_param, self.expected_fit_param) - return self + def get_transformer(registry, sample_weight, metadata): + """Get a transformer with requests set.""" + return ( + ConsumingTransformer(registry=registry) + .set_fit_request(sample_weight=sample_weight, metadata=metadata) + .set_transform_request(sample_weight=sample_weight, metadata=metadata) + ) - def transform(self, X): - return X + 1 + def get_pipeline(): + """Get a pipeline and corresponding registries. + The pipeline has 4 steps, with different request values set to test different + cases. One is aliased. 
+ """ + registry_1, registry_2, registry_3, registry_4 = ( + _Registry(), + _Registry(), + _Registry(), + _Registry(), + ) + pipe = make_pipeline( + get_transformer(registry_1, sample_weight=True, metadata=True), + get_transformer(registry_2, sample_weight=False, metadata=False), + get_transformer(registry_3, sample_weight=True, metadata=True), + get_transformer(registry_4, sample_weight="other_weights", metadata=True), + transform_input=["sample_weight"], + ) + return pipe, registry_1, registry_2, registry_3, registry_4 + + def check_metadata(registry, methods, **metadata): + """Check that the right metadata was recorded for the given methods.""" + assert registry + for estimator in registry: + for method in methods: + check_recorded_metadata( + estimator, + method=method, + **metadata, + ) -@pytest.mark.usefixtures("enable_slep006") -def test_transform_input_pipeline(): - """Test that with transform_input, data is correctly transformed for each step.""" X = np.array([[1, 2], [3, 4]]) y = np.array([0, 1]) - expected_fit_param = np.array([[1, 2]]) - pipe = make_pipeline( - IncTransformer(expected_fit_param=expected_fit_param).set_fit_request( - expected_fit_param=True - ), - IncTransformer().set_fit_request(expected_fit_param=False), - IncTransformer(expected_fit_param=expected_fit_param + 2).set_fit_request( - expected_fit_param=True - ), - IncTransformer(expected_fit_param=expected_fit_param + 3).set_fit_request( - expected_fit_param=True - ), - transform_input=["expected_fit_param"], + sample_weight = np.array([[1, 2]]) + other_weights = np.array([[30, 40]]) + metadata = np.array([[100, 200]]) + + pipe, registry_1, registry_2, registry_3, registry_4 = get_pipeline() + pipe.fit( + X, + y, + sample_weight=sample_weight, + other_weights=other_weights, + metadata=metadata, ) - pipe.fit(X, y, expected_fit_param=expected_fit_param) - pipe.fit_transform(X, y, expected_fit_param=expected_fit_param) + check_metadata( + registry_1, ["fit", "transform"], 
sample_weight=sample_weight, metadata=metadata + ) + check_metadata(registry_2, ["fit", "transform"]) + check_metadata( + registry_3, + ["fit", "transform"], + sample_weight=sample_weight + 2, + metadata=metadata, + ) + check_metadata( + registry_4, + method.split("_"), # ["fit", "transform"] if "fit_transform", ["fit"] otherwise + sample_weight=other_weights + 3, + metadata=metadata, + ) def test_transform_input_no_slep6(): From 52685143a55487ed68b6e961b9ae8b624ab83f99 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 24 May 2024 20:25:31 +0200 Subject: [PATCH 07/22] more fixes --- sklearn/compose/tests/test_column_transformer.py | 2 +- sklearn/tests/test_metaestimators_metadata_routing.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index d0f2274272230..9c1705c9c7c6d 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -2631,7 +2631,7 @@ def test_metadata_routing_for_column_transformer(method): ) if method == "transform": - trs.fit(X, y) + trs.fit(X, y, sample_weight=sample_weight, metadata=metadata) trs.transform(X, sample_weight=sample_weight, metadata=metadata) else: getattr(trs, method)(X, y, sample_weight=sample_weight, metadata=metadata) diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index aa6af5bd09aac..3d4a0ddc7d68d 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -668,9 +668,10 @@ def test_setting_request_on_sub_estimator_removes_error(metaestimator): ) if "fit" not in method_name: # fit before calling method + print(method_mapping) set_requests( estimator, - method_mapping=metaestimator.get("method_mapping", {}), + method_mapping=method_mapping, methods=["fit"], metadata_name=key, ) @@ -760,7 +761,7 @@ 
def test_metadata_is_routed_correctly_to_scorer(metaestimator): method = getattr(instance, method_name) method_kwargs = {"sample_weight": sample_weight} if "fit" not in method_name: - instance.fit(X, y) + instance.fit(X, y, **method_kwargs) method(X, y, **method_kwargs) assert registry From 052b13d4aced708204b5e59bef56846ead2d014c Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Sun, 26 May 2024 14:21:41 +0200 Subject: [PATCH 08/22] WIP tests improvements --- sklearn/tests/metadata_routing_common.py | 64 +++++++++++-------- sklearn/tests/test_metadata_routing.py | 33 +++++++--- .../test_metaestimators_metadata_routing.py | 36 +++++------ 3 files changed, 77 insertions(+), 56 deletions(-) diff --git a/sklearn/tests/metadata_routing_common.py b/sklearn/tests/metadata_routing_common.py index 4a363de8e8e4f..64747510f6b6a 100644 --- a/sklearn/tests/metadata_routing_common.py +++ b/sklearn/tests/metadata_routing_common.py @@ -1,3 +1,4 @@ +import inspect from functools import partial import numpy as np @@ -25,7 +26,7 @@ from sklearn.utils.multiclass import _check_partial_fit_first_call -def record_metadata(obj, method, record_default=True, **kwargs): +def record_metadata(obj, record_default=True, **kwargs): """Utility function to store passed metadata to a method. If record_default is False, kwargs whose values are "default" are skipped. @@ -33,20 +34,25 @@ def record_metadata(obj, method, record_default=True, **kwargs): are skipped. 
""" + stack = inspect.stack() + method = stack[1].function + parent = stack[2].function if not hasattr(obj, "_records"): obj._records = {} if method not in obj._records: - obj._records[method] = [] + obj._records[method] = {} + if parent not in obj._records[method]: + obj._records[method][parent] = [] if not record_default: kwargs = { key: val for key, val in kwargs.items() if not isinstance(val, str) or (val != "default") } - obj._records[method].append(kwargs) + obj._records[method][parent].append(kwargs) -def check_recorded_metadata(obj, method, split_params=tuple(), **kwargs): +def check_recorded_metadata(obj, method, parent, split_params=tuple(), **kwargs): """Check whether the expected metadata is passed to the object's method. Parameters @@ -55,13 +61,17 @@ def check_recorded_metadata(obj, method, split_params=tuple(), **kwargs): sub-estimator to check routed params for method : str sub-estimator's method where metadata is routed to + parent : str + the parent method which should have called `method` or caller split_params : tuple, default=empty specifies any parameters which are to be checked as being a subset of the original values **kwargs : dict passed metadata """ - all_records = getattr(obj, "_records", dict()).get(method, dict()) + all_records = ( + getattr(obj, "_records", dict()).get(method, dict()).get(parent, list()) + ) for record in all_records: assert set(kwargs.keys()) == set( record.keys() @@ -156,7 +166,7 @@ def partial_fit(self, X, y, sample_weight="default", metadata="default"): self.registry.append(self) record_metadata_not_default( - self, "partial_fit", sample_weight=sample_weight, metadata=metadata + self, sample_weight=sample_weight, metadata=metadata ) return self @@ -165,19 +175,19 @@ def fit(self, X, y, sample_weight="default", metadata="default"): self.registry.append(self) record_metadata_not_default( - self, "fit", sample_weight=sample_weight, metadata=metadata + self, sample_weight=sample_weight, metadata=metadata ) return self 
def predict(self, X, y=None, sample_weight="default", metadata="default"): record_metadata_not_default( - self, "predict", sample_weight=sample_weight, metadata=metadata + self, sample_weight=sample_weight, metadata=metadata ) return np.zeros(shape=(len(X),)) def score(self, X, y, sample_weight="default", metadata="default"): record_metadata_not_default( - self, "score", sample_weight=sample_weight, metadata=metadata + self, sample_weight=sample_weight, metadata=metadata ) return 1 @@ -245,7 +255,7 @@ def partial_fit( self.registry.append(self) record_metadata_not_default( - self, "partial_fit", sample_weight=sample_weight, metadata=metadata + self, sample_weight=sample_weight, metadata=metadata ) _check_partial_fit_first_call(self, classes) return self @@ -255,7 +265,7 @@ def fit(self, X, y, sample_weight="default", metadata="default"): self.registry.append(self) record_metadata_not_default( - self, "fit", sample_weight=sample_weight, metadata=metadata + self, sample_weight=sample_weight, metadata=metadata ) self.classes_ = np.unique(y) @@ -263,7 +273,7 @@ def fit(self, X, y, sample_weight="default", metadata="default"): def predict(self, X, sample_weight="default", metadata="default"): record_metadata_not_default( - self, "predict", sample_weight=sample_weight, metadata=metadata + self, sample_weight=sample_weight, metadata=metadata ) y_score = np.empty(shape=(len(X),), dtype="int8") y_score[len(X) // 2 :] = 0 @@ -272,7 +282,7 @@ def predict(self, X, sample_weight="default", metadata="default"): def predict_proba(self, X, sample_weight="default", metadata="default"): record_metadata_not_default( - self, "predict_proba", sample_weight=sample_weight, metadata=metadata + self, sample_weight=sample_weight, metadata=metadata ) y_proba = np.empty(shape=(len(X), 2)) y_proba[: len(X) // 2, :] = np.asarray([1.0, 0.0]) @@ -284,13 +294,13 @@ def predict_log_proba(self, X, sample_weight="default", metadata="default"): # uncomment when needed # record_metadata_not_default( - 
# self, "predict_log_proba", sample_weight=sample_weight, metadata=metadata + # self, sample_weight=sample_weight, metadata=metadata # ) # return np.zeros(shape=(len(X), 2)) def decision_function(self, X, sample_weight="default", metadata="default"): record_metadata_not_default( - self, "predict_proba", sample_weight=sample_weight, metadata=metadata + self, sample_weight=sample_weight, metadata=metadata ) y_score = np.empty(shape=(len(X),)) y_score[len(X) // 2 :] = 0 @@ -300,7 +310,7 @@ def decision_function(self, X, sample_weight="default", metadata="default"): # uncomment when needed # def score(self, X, y, sample_weight="default", metadata="default"): # record_metadata_not_default( - # self, "score", sample_weight=sample_weight, metadata=metadata + # self, sample_weight=sample_weight, metadata=metadata # ) # return 1 @@ -325,13 +335,13 @@ def fit(self, X, y=None, sample_weight="default", metadata="default"): self.registry.append(self) record_metadata_not_default( - self, "fit", sample_weight=sample_weight, metadata=metadata + self, sample_weight=sample_weight, metadata=metadata ) return self def transform(self, X, sample_weight="default", metadata="default"): record_metadata_not_default( - self, "transform", sample_weight=sample_weight, metadata=metadata + self, sample_weight=sample_weight, metadata=metadata ) return X + 1 @@ -341,7 +351,7 @@ def fit_transform(self, X, y, sample_weight="default", metadata="default"): # ``transform``, while here we want ``transform`` to receive # ``sample_weight`` and ``metadata``. 
record_metadata_not_default( - self, "fit_transform", sample_weight=sample_weight, metadata=metadata + self, sample_weight=sample_weight, metadata=metadata ) return self.fit(X, y, sample_weight=sample_weight, metadata=metadata).transform( X, sample_weight=sample_weight, metadata=metadata @@ -349,7 +359,7 @@ def fit_transform(self, X, y, sample_weight="default", metadata="default"): def inverse_transform(self, X, sample_weight=None, metadata=None): record_metadata_not_default( - self, "inverse_transform", sample_weight=sample_weight, metadata=metadata + self, sample_weight=sample_weight, metadata=metadata ) return X - 1 @@ -366,14 +376,12 @@ def fit(self, X, y=None, sample_weight=None, metadata=None): if self.registry is not None: self.registry.append(self) - record_metadata(self, "fit", sample_weight=sample_weight, metadata=metadata) + record_metadata(self, sample_weight=sample_weight, metadata=metadata) return self def transform(self, X, sample_weight=None, metadata=None): - record_metadata( - self, "transform", sample_weight=sample_weight, metadata=metadata - ) + record_metadata(self, sample_weight=sample_weight, metadata=metadata) return X @@ -388,7 +396,7 @@ def _score(self, method_caller, clf, X, y, **kwargs): if self.registry is not None: self.registry.append(self) - record_metadata_not_default(self, "score", **kwargs) + record_metadata_not_default(self, **kwargs) sample_weight = kwargs.get("sample_weight", None) return super()._score(method_caller, clf, X, y, sample_weight=sample_weight) @@ -402,7 +410,7 @@ def split(self, X, y=None, groups="default", metadata="default"): if self.registry is not None: self.registry.append(self) - record_metadata_not_default(self, "split", groups=groups, metadata=metadata) + record_metadata_not_default(self, groups=groups, metadata=metadata) split_index = len(X) // 2 train_indices = list(range(0, split_index)) @@ -450,7 +458,7 @@ def fit(self, X, y, sample_weight=None, **fit_params): if self.registry is not None: 
self.registry.append(self) - record_metadata(self, "fit", sample_weight=sample_weight) + record_metadata(self, sample_weight=sample_weight) params = process_routing(self, "fit", sample_weight=sample_weight, **fit_params) self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) return self @@ -484,7 +492,7 @@ def fit(self, X, y, sample_weight=None, **kwargs): if self.registry is not None: self.registry.append(self) - record_metadata(self, "fit", sample_weight=sample_weight) + record_metadata(self, sample_weight=sample_weight) params = process_routing(self, "fit", sample_weight=sample_weight, **kwargs) self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) return self diff --git a/sklearn/tests/test_metadata_routing.py b/sklearn/tests/test_metadata_routing.py index 109c730bf0718..e2a5351192a42 100644 --- a/sklearn/tests/test_metadata_routing.py +++ b/sklearn/tests/test_metadata_routing.py @@ -327,14 +327,16 @@ def test_simple_metadata_routing(): # and passing metadata to the consumer directly is fine regardless of its # metadata_request values. 
clf.fit(X, y, sample_weight=my_weights) - check_recorded_metadata(clf.estimator_, "fit") + check_recorded_metadata(clf.estimator_, method="fit", parent="fit") # Requesting a metadata will make the meta-estimator forward it correctly clf = WeightedMetaClassifier( estimator=ConsumingClassifier().set_fit_request(sample_weight=True) ) clf.fit(X, y, sample_weight=my_weights) - check_recorded_metadata(clf.estimator_, "fit", sample_weight=my_weights) + check_recorded_metadata( + clf.estimator_, method="fit", parent="fit", sample_weight=my_weights + ) # And requesting it with an alias clf = WeightedMetaClassifier( @@ -343,7 +345,9 @@ def test_simple_metadata_routing(): ) ) clf.fit(X, y, alternative_weight=my_weights) - check_recorded_metadata(clf.estimator_, "fit", sample_weight=my_weights) + check_recorded_metadata( + clf.estimator_, method="fit", parent="fit", sample_weight=my_weights + ) def test_nested_routing(): @@ -367,17 +371,30 @@ def test_nested_routing(): X, y, metadata=my_groups, sample_weight=w1, outer_weights=w2, inner_weights=w3 ) check_recorded_metadata( - pipeline.steps_[0].transformer_, "fit", metadata=my_groups, sample_weight=None + pipeline.steps_[0].transformer_, + method="fit", + parent="fit", + metadata=my_groups, + ) + check_recorded_metadata( + pipeline.steps_[0].transformer_, + method="transform", + parent="fit", + sample_weight=w1, + ) + check_recorded_metadata( + pipeline.steps_[1], method="fit", parent="fit", sample_weight=w2 ) check_recorded_metadata( - pipeline.steps_[0].transformer_, "transform", sample_weight=w1, metadata=None + pipeline.steps_[1].estimator_, method="fit", parent="fit", sample_weight=w3 ) - check_recorded_metadata(pipeline.steps_[1], "fit", sample_weight=w2) - check_recorded_metadata(pipeline.steps_[1].estimator_, "fit", sample_weight=w3) pipeline.predict(X, sample_weight=w3) check_recorded_metadata( - pipeline.steps_[0].transformer_, "transform", sample_weight=w3, metadata=None + pipeline.steps_[0].transformer_, + 
method="transform", + parent="fit", + sample_weight=w3, ) diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index 5b69791ad3013..db559311817f6 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -685,14 +685,7 @@ def test_setting_request_on_sub_estimator_removes_error(metaestimator): ) if "fit" not in method_name: # fit before calling method - print(method_mapping) - set_requests( - estimator, - method_mapping=method_mapping, - methods=["fit"], - metadata_name=key, - ) - instance.fit(X, y, **method_kwargs, **extra_method_args) + instance.fit(X, y) try: # `fit` and `partial_fit` accept y, others don't. method(X, y, **method_kwargs, **extra_method_args) @@ -702,17 +695,17 @@ def test_setting_request_on_sub_estimator_removes_error(metaestimator): # sanity check that registry is not empty, or else the test passes # trivially assert registry - if preserves_metadata is True: - for estimator in registry: - check_recorded_metadata(estimator, method_name, **method_kwargs) - elif preserves_metadata == "subset": - for estimator in registry: - check_recorded_metadata( - estimator, - method_name, - split_params=method_kwargs.keys(), - **method_kwargs, - ) + split_params = ( + method_kwargs.keys() if preserves_metadata == "subset" else () + ) + for estimator in registry: + check_recorded_metadata( + estimator, + method=method_name, + parent=method_name, + split_params=split_params, + **method_kwargs, + ) @pytest.mark.parametrize("metaestimator", METAESTIMATORS, ids=METAESTIMATOR_IDS) @@ -786,6 +779,7 @@ def test_metadata_is_routed_correctly_to_scorer(metaestimator): check_recorded_metadata( obj=_scorer, method="score", + parent=method_name, split_params=("sample_weight",), **method_kwargs, ) @@ -820,4 +814,6 @@ def test_metadata_is_routed_correctly_to_splitter(metaestimator): method(X_, y_, **method_kwargs) assert registry for 
_splitter in registry: - check_recorded_metadata(obj=_splitter, method="split", **method_kwargs) + check_recorded_metadata( + obj=_splitter, method="split", parent=method_name, **method_kwargs + ) From 278dc7007df669732b42c3cd9567e33b85038b77 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Sun, 26 May 2024 19:38:07 +0200 Subject: [PATCH 09/22] TST fix pipeline tests --- .../compose/tests/test_column_transformer.py | 6 +++- sklearn/ensemble/tests/test_stacking.py | 12 +++++-- sklearn/ensemble/tests/test_voting.py | 2 +- sklearn/model_selection/tests/test_search.py | 1 + .../model_selection/tests/test_validation.py | 4 +++ sklearn/tests/test_pipeline.py | 31 +++++++++++++++---- 6 files changed, 46 insertions(+), 10 deletions(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 9c1705c9c7c6d..33a886bfb7e80 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -2639,7 +2639,11 @@ def test_metadata_routing_for_column_transformer(method): assert len(registry) for _trs in registry: check_recorded_metadata( - obj=_trs, method=method, sample_weight=sample_weight, metadata=metadata + obj=_trs, + method=method, + parent=method, + sample_weight=sample_weight, + metadata=metadata, ) diff --git a/sklearn/ensemble/tests/test_stacking.py b/sklearn/ensemble/tests/test_stacking.py index 1c038cd469216..507f238d12148 100644 --- a/sklearn/ensemble/tests/test_stacking.py +++ b/sklearn/ensemble/tests/test_stacking.py @@ -973,13 +973,21 @@ def test_metadata_routing_for_stacking_estimators(Estimator, Child, prop, prop_v assert len(registry) for sub_est in registry: check_recorded_metadata( - obj=sub_est, method="fit", split_params=(prop), **{prop: prop_value} + obj=sub_est, + method="fit", + parent="fit", + split_params=(prop), + **{prop: prop_value}, ) # access final_estimator: registry = est.final_estimator_.registry assert len(registry) 
check_recorded_metadata( - obj=registry[-1], method="predict", split_params=(prop), **{prop: prop_value} + obj=registry[-1], + method="predict", + parent="predict", + split_params=(prop), + **{prop: prop_value}, ) diff --git a/sklearn/ensemble/tests/test_voting.py b/sklearn/ensemble/tests/test_voting.py index 4b2c365752b72..3800925fa17d0 100644 --- a/sklearn/ensemble/tests/test_voting.py +++ b/sklearn/ensemble/tests/test_voting.py @@ -759,7 +759,7 @@ def test_metadata_routing_for_voting_estimators(Estimator, Child, prop): registry = estimator[1].registry assert len(registry) for sub_est in registry: - check_recorded_metadata(obj=sub_est, method="fit", **kwargs) + check_recorded_metadata(obj=sub_est, method="fit", parent="fit", **kwargs) @pytest.mark.usefixtures("enable_slep006") diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index cb4af646aee39..5ef60c7bad577 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -2612,6 +2612,7 @@ def test_multi_metric_search_forwards_metadata(SearchCV, param_search): check_recorded_metadata( obj=_scorer, method="score", + parent="_score", split_params=("sample_weight", "metadata"), sample_weight=score_weights, metadata=score_metadata, diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 679c0052e3956..ffaf069f19c1d 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -2600,6 +2600,7 @@ def test_validation_functions_routing(func): check_recorded_metadata( obj=_scorer, method="score", + parent=func.__name__, split_params=("sample_weight", "metadata"), sample_weight=score_weights, metadata=score_metadata, @@ -2610,6 +2611,7 @@ def test_validation_functions_routing(func): check_recorded_metadata( obj=_splitter, method="split", + parent=func.__name__, groups=split_groups, 
metadata=split_metadata, ) @@ -2619,6 +2621,7 @@ def test_validation_functions_routing(func): check_recorded_metadata( obj=_estimator, method="fit", + parent=func.__name__, split_params=("sample_weight", "metadata"), sample_weight=fit_sample_weight, metadata=fit_metadata, @@ -2656,6 +2659,7 @@ def test_learning_curve_exploit_incremental_learning_routing(): check_recorded_metadata( obj=_estimator, method="partial_fit", + parent="learning_curve", split_params=("sample_weight", "metadata"), sample_weight=fit_sample_weight, metadata=fit_metadata, diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 5dd3f8b579ecc..9614323f78bb0 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1861,6 +1861,7 @@ def check_metadata(registry, methods, **metadata): check_recorded_metadata( estimator, method=method, + parent=method, **metadata, ) @@ -1925,38 +1926,47 @@ def fit(self, X, y, sample_weight=None, prop=None): def fit_transform(self, X, y, sample_weight=None, prop=None): assert sample_weight is not None assert prop is not None + return X + 1 def fit_predict(self, X, y, sample_weight=None, prop=None): assert sample_weight is not None assert prop is not None + return np.ones(len(X)) def predict(self, X, sample_weight=None, prop=None): assert sample_weight is not None assert prop is not None + return np.ones(len(X)) def predict_proba(self, X, sample_weight=None, prop=None): assert sample_weight is not None assert prop is not None + return np.ones(len(X)) def predict_log_proba(self, X, sample_weight=None, prop=None): assert sample_weight is not None assert prop is not None + return np.zeros(len(X)) def decision_function(self, X, sample_weight=None, prop=None): assert sample_weight is not None assert prop is not None + return np.ones(len(X)) def score(self, X, y, sample_weight=None, prop=None): assert sample_weight is not None assert prop is not None + return 1 def transform(self, X, sample_weight=None, prop=None): assert 
sample_weight is not None assert prop is not None + return X + 1 def inverse_transform(self, X, sample_weight=None, prop=None): assert sample_weight is not None assert prop is not None + return X - 1 @pytest.mark.usefixtures("enable_slep006") @@ -1980,7 +1990,7 @@ def set_request(est, method, **kwarg): getattr(est, f"set_{method}_request")(**kwarg) return est - X, y = [[1]], [1] + X, y = np.array([[1]]), np.array([1]) sample_weight, prop, metadata = [1], "a", "b" # test that metadata is routed correctly for pipelines when requested @@ -1996,10 +2006,10 @@ def set_request(est, method, **kwarg): pipeline = Pipeline([("trs", trs), ("estimator", est)]) if "fit" not in method: - pipeline = pipeline.fit( - [[1]], [1], sample_weight=sample_weight, prop=prop, metadata=metadata - ) + pipeline = pipeline.fit(X, y, sample_weight=sample_weight, prop=prop) + if method == "inverse_transform": + print("ha") try: getattr(pipeline, method)( X, y, sample_weight=sample_weight, prop=prop, metadata=metadata @@ -2013,10 +2023,18 @@ def set_request(est, method, **kwarg): # Make sure the transformer has received the metadata # For the transformer, always only `fit` and `transform` are called. 
check_recorded_metadata( - obj=trs, method="fit", sample_weight=sample_weight, metadata=metadata + obj=trs, + method="fit", + parent="fit", + sample_weight=sample_weight, + metadata=metadata, ) check_recorded_metadata( - obj=trs, method="transform", sample_weight=sample_weight, metadata=metadata + obj=trs, + method="transform", + parent="transform", + sample_weight=sample_weight, + metadata=metadata, ) @@ -2171,6 +2189,7 @@ def test_feature_union_metadata_routing(transformer): check_recorded_metadata( obj=sub_trans, method="fit", + parent="fit", **kwargs, ) From 75dbf5d98b9e5ed69b5f4f3702c9a503e4cad18f Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 2 Sep 2024 11:13:59 +0200 Subject: [PATCH 10/22] Christian's comments --- doc/whats_new/v1.6.rst | 4 ++-- sklearn/pipeline.py | 32 +++++++++++++++++++++++++++----- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index 337d0028e750f..c3cfe6bbd0219 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -80,7 +80,7 @@ Changelog - |Enhancement| Added a function :func:`base.is_clusterer` which determines whether a given estimator is of category clusterer. :pr:`28936` by :user:`Christian Veenhuis `. - + :mod:`sklearn.metrics` ...................... @@ -96,6 +96,6 @@ TODO: update at the time of the release. :mod:`pipeline` --------------- -- |Feature| :class:`pipeline.Pipeline` can now transform metadata up to the step +- |MajorFeature| :class:`pipeline.Pipeline` can now transform metadata up to the step requiring the metadata, which can be set using the `transform_input` parameter. :pr:`28901` by `Adrin Jalali`_. diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 70c00a7b04428..dda77d24a0945 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -95,10 +95,13 @@ class Pipeline(_BaseComposition): :ref:`Combining Estimators ` for more details. 
transform_input : list of str, default=None + The names of the :term:`metadata` parameters that should be transformed by the + pipeline before passing it to the step consuming it. + This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. - This can be used to pass a validation set through the pipeline for instance. + For instance, this can be used to pass a validation set through the pipeline. See the example TBD for more details. @@ -398,11 +401,30 @@ def _get_step_params(self, *, step_idx, step_params, all_params): This transforms the metadata up to this step if required, which is indicated by the `transform_input` parameter. - If a param in `step_params` is included in the `transform_input` list, it - will be transformed. + If a param in `step_params` is included in the `transform_input` list, + it will be transformed. + + Parameters + ---------- + step_idx : int + Index of the step in the pipeline. - `all_params` are the metadata passed by the user. Used to call `transform` - on the pipeline itself. + step_params : dict + Parameters specific to the step. These are routed parameters, e.g. + `routed_params[name]`. If a parameter name here is included in the + `pipeline.transform_input`, then it will be transformed. Note that + these parameters are *after* routing, so the aliases are already + resolved. + + all_params : dict + All parameters passed by the user. Here this is used to call + `transform` on the slice of the pipeline itself. + + Returns + ------- + dict + Parameters to be passed to the step. The ones which should be + transformed are transformed. 
""" if ( self.transform_input is None From 52e06421f59ec3aca3d0d6778ed30510d814fb0d Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Sat, 7 Sep 2024 16:48:25 +0200 Subject: [PATCH 11/22] remove erronous arg passing --- sklearn/tests/test_metaestimators_metadata_routing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index 85ea34217a1d1..614c8669592b4 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -829,7 +829,7 @@ def test_metadata_is_routed_correctly_to_scorer(metaestimator): method = getattr(instance, method_name) method_kwargs = {"sample_weight": sample_weight} if "fit" not in method_name: - instance.fit(X, y, **method_kwargs) + instance.fit(X, y) method(X, y, **method_kwargs) assert registry From cdaf20fa5d30013b8af2bf720cb7eed18cdea5fd Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Sat, 7 Sep 2024 17:25:46 +0200 Subject: [PATCH 12/22] support tupples to be transformed --- sklearn/pipeline.py | 27 ++++++++++++++++---- sklearn/tests/test_pipeline.py | 46 +++++++++++++++++++++++++++++++++- 2 files changed, 67 insertions(+), 6 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 8847732cc8197..053f144c7a504 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -443,17 +443,34 @@ def _get_step_params(self, *, step_idx, step_params, all_params): method="transform", params=all_params.keys() ) } - transformed_params = dict() + transformed_params = dict() # this is to be returned transformed_cache = dict() # used to transform each param once + # `step_params` is the output of `process_routing`, so it has a dict for each + # method (e.g. fit, transform, predict), which are the args to be passed to + # those methods. We need to transforme the parameters which are in the + # `transform_input`, before returning these dicts. 
for method, method_params in step_params.items(): transformed_params[method] = Bunch() for param_name, param_value in method_params.items(): + # An example of `(param_name, param_value)` is + # `('sample_weight', array([0.5, 0.5, ...]))` if param_name in self.transform_input: - # transform the parameter + # This parameter now needs to be transformed by the sub_pipeline, to + # this step. We cache these computations to avoid repeating them. if param_name not in transformed_cache: - transformed_cache[param_name] = sub_pipeline.transform( - param_value, **transform_params - ) + # If the parameter is a tuple, transform each element of the + # tuple. This is needed to support the pattern present in + # `lightgbm` and `xgboost` where users can pass multiple + # validation sets. + if isinstance(param_value, tuple): + transformed_cache[param_name] = tuple( + sub_pipeline.transform(element, **transform_params) + for element in param_value + ) + else: + transformed_cache[param_name] = sub_pipeline.transform( + param_value, **transform_params + ) transformed_params[method][param_name] = transformed_cache[ param_name ] diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index af7361795bc45..61775a2763b13 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -13,7 +13,13 @@ import numpy as np import pytest -from sklearn.base import BaseEstimator, TransformerMixin, clone, is_classifier +from sklearn.base import ( + BaseEstimator, + ClassifierMixin, + TransformerMixin, + clone, + is_classifier, +) from sklearn.cluster import KMeans from sklearn.datasets import load_iris from sklearn.decomposition import PCA, TruncatedSVD @@ -1907,6 +1913,44 @@ def test_transform_input_no_slep6(): make_pipeline(DummyTransf(), transform_input=["blah"]).fit(X, y) +@pytest.mark.usefixtures("enable_slep006") +def test_transform_tuple_input(): + """Test that if metadata is a tuple of arrays, both arrays are transformed.""" + + class 
Estimator(ClassifierMixin, BaseEstimator): + def fit(self, X, y, X_val=None, y_val=None): + assert isinstance(X_val, tuple) + assert isinstance(y_val, tuple) + # Here we make sure that each X_val is transformed by the transformer + assert_array_equal(X_val[0], np.array([[2, 3]])) + assert_array_equal(y_val[0], np.array([0, 1])) + assert_array_equal(X_val[1], np.array([[11, 12]])) + assert_array_equal(y_val[1], np.array([1, 2])) + return self + + class Transformer(TransformerMixin, BaseEstimator): + def fit(self, X, y): + return self + + def transform(self, X): + return X + 1 + + X = np.array([[1, 2]]) + y = np.array([0, 1]) + X_val0 = np.array([[1, 2]]) + y_val0 = np.array([0, 1]) + X_val1 = np.array([[10, 11]]) + y_val1 = np.array([1, 2]) + pipe = Pipeline( + [ + ("transformer", Transformer()), + ("estimator", Estimator().set_fit_request(X_val=True, y_val=True)), + ], + transform_input=["X_val"], + ) + pipe.fit(X, y, X_val=(X_val0, X_val1), y_val=(y_val0, y_val1)) + + # end of transform_input tests # ============================= From 9dbb6b7d80395ed023bd924bcea1476b20407866 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 16 Oct 2024 17:51:54 +0200 Subject: [PATCH 13/22] rename method --- sklearn/pipeline.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index ab33865336dcf..f0f92edbf0088 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -389,7 +389,7 @@ def _check_method_params(self, method, props, **kwargs): fit_params_steps[step]["fit_predict"][param] = pval return fit_params_steps - def _get_step_params(self, *, step_idx, step_params, all_params): + def _get_metadate_for_step(self, *, step_idx, step_params, all_params): """Get params (metadata) for step `name`. 
This transforms the metadata up to this step if required, which is @@ -507,7 +507,7 @@ def _fit(self, X, y=None, routed_params=None, raw_params=None): else: cloned_transformer = clone(transformer) # Fit or load from cache the current transformer - step_params = self._get_step_params( + step_params = self._get_metadate_for_step( step_idx=step_idx, step_params=routed_params[name], all_params=raw_params, @@ -582,7 +582,7 @@ def fit(self, X, y=None, **params): Xt = self._fit(X, y, routed_params, raw_params=params) with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): if self._final_estimator != "passthrough": - last_step_params = self._get_step_params( + last_step_params = self._get_metadate_for_step( step_idx=len(self) - 1, step_params=routed_params[self.steps[-1][0]], all_params=params, @@ -649,7 +649,7 @@ def fit_transform(self, X, y=None, **params): with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): if last_step == "passthrough": return Xt - last_step_params = self._get_step_params( + last_step_params = self._get_metadate_for_step( step_idx=len(self) - 1, step_params=routed_params[self.steps[-1][0]], all_params=params, From 08e74151685f5fdea3fafc34aeadf7ad93c15edb Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 18 Oct 2024 08:45:07 +0200 Subject: [PATCH 14/22] address comments --- sklearn/pipeline.py | 64 +++++++++++++++++++++++++--------- sklearn/tests/test_pipeline.py | 33 ++++++++++++++++++ 2 files changed, 80 insertions(+), 17 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index f0f92edbf0088..e54ca9ae3bfca 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -51,6 +51,46 @@ def check(self): return check +def _cached_transform( + sub_pipeline, *, cache, param_name, param_value, transform_params +): + """Transform a parameter value using a sub-pipeline and cache the result. + + Parameters + ---------- + sub_pipeline : Pipeline + The sub-pipeline to be used for transformation. 
+ cache : dict + The cache dictionary to store the transformed values. + param_name : str + The name of the parameter to be transformed. + param_value : object + The value of the parameter to be transformed. + transform_params : dict + The metadata to be used for transformation. This is passed to the + `transform` method of the sub-pipeline. + + Returns + ------- + transformed_value : object + The transformed value of the parameter. + """ + if param_name not in cache: + # If the parameter is a tuple, transform each element of the + # tuple. This is needed to support the pattern present in + # `lightgbm` and `xgboost` where users can pass multiple + # validation sets. + if isinstance(param_value, tuple): + cache[param_name] = tuple( + sub_pipeline.transform(element, **transform_params) + for element in param_value + ) + else: + cache[param_name] = sub_pipeline.transform(param_value, **transform_params) + + return cache[param_name] + + +class Pipeline(_BaseComposition): """ A sequence of data transformers with an optional final predictor. @@ -455,23 +495,13 @@ def _get_metadate_for_step(self, *, step_idx, step_params, all_params): if param_name in self.transform_input: # This parameter now needs to be transformed by the sub_pipeline, to # this step. We cache these computations to avoid repeating them. - if param_name not in transformed_cache: - # If the parameter is a tuple, transform each element of the - # tuple. This is needed to support the pattern present in - # `lightgbm` and `xgboost` where users can pass multiple - # validation sets.
- if isinstance(param_value, tuple): - transformed_cache[param_name] = tuple( - sub_pipeline.transform(element, **transform_params) - for element in param_value - ) - else: - transformed_cache[param_name] = sub_pipeline.transform( - param_value, **transform_params - ) - transformed_params[method][param_name] = transformed_cache[ - param_name - ] + transformed_params[method][param_name] = _cached_transform( + sub_pipeline, + cache=transformed_cache, + param_name=param_name, + param_value=param_value, + transform_params=transform_params, + ) else: transformed_params[method][param_name] = param_value return transformed_params diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 719320eac8bcd..a308c9fc77770 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1905,6 +1905,39 @@ def check_metadata(registry, methods, **metadata): ) +@config_context(enable_metadata_routing=True) +def test_transform_input_explicit_value_check(): + """Test that the right transformed values are passed to `fit`.""" + + class Transformer(TransformerMixin, BaseEstimator): + def fit(self, X, y): + return self + + def transform(self, X): + return X + 1 + + class Estimator(ClassifierMixin, BaseEstimator): + def fit(self, X, y, X_val=None, y_val=None): + assert_array_equal(X, np.array([[1, 2]])) + assert_array_equal(y, np.array([0, 1])) + assert_array_equal(X_val, np.array([[2, 3]])) + assert_array_equal(y_val, np.array([0, 1])) + return self + + X = np.array([[0, 1]]) + y = np.array([0, 1]) + X_val = np.array([[1, 2]]) + y_val = np.array([0, 1]) + pipe = Pipeline( + [ + ("transformer", Transformer()), + ("estimator", Estimator().set_fit_request(X_val=True, y_val=True)), + ], + transform_input=["X_val"], + ) + pipe.fit(X, y, X_val=X_val, y_val=y_val) + + def test_transform_input_no_slep6(): """Make sure the right error is raised if slep6 is not enabled.""" X = np.array([[1, 2], [3, 4]]) From 486c116f969772b1ec7e5fab883ea432baa1a2dc Mon 
Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 18 Oct 2024 08:47:59 +0200 Subject: [PATCH 15/22] changelog --- .../sklearn.pipeline/28901.major-feature.rst | 3 +++ doc/whats_new/v1.6.rst | 7 ------- 2 files changed, 3 insertions(+), 7 deletions(-) create mode 100644 doc/whats_new/upcoming_changes/sklearn.pipeline/28901.major-feature.rst diff --git a/doc/whats_new/upcoming_changes/sklearn.pipeline/28901.major-feature.rst b/doc/whats_new/upcoming_changes/sklearn.pipeline/28901.major-feature.rst new file mode 100644 index 0000000000000..1eae1aa526fdc --- /dev/null +++ b/doc/whats_new/upcoming_changes/sklearn.pipeline/28901.major-feature.rst @@ -0,0 +1,3 @@ +- :class:`pipeline.Pipeline` can now transform metadata up to the step requiring the + metadata, which can be set using the `transform_input` parameter. + By `Adrin Jalali`_. diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index f2ca72ba00063..6930dca21cf1f 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -23,10 +23,3 @@ Thanks to everyone who has contributed to the maintenance and improvement of the project since version 1.5, including: TODO: update at the time of the release. - -:mod:`pipeline` ---------------- - -- |MajorFeature| :class:`pipeline.Pipeline` can now transform metadata up to the step - requiring the metadata, which can be set using the `transform_input` parameter. - :pr:`28901` by `Adrin Jalali`_. From 06ed90b05d5e8fc0c552d02dc3dacad762decc32 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 21 Oct 2024 09:43:21 +0200 Subject: [PATCH 16/22] remove TBD --- sklearn/pipeline.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 175e8b4429062..a3bf0cd313085 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -168,8 +168,6 @@ class Pipeline(_BaseComposition): them. Requirement is defined via :ref:`metadata routing <metadata_routing>`. For instance, this can be used to pass a validation set through the pipeline.
- See the example TBD for more details. - You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. @@ -1452,8 +1450,6 @@ def make_pipeline(*steps, memory=None, transform_input=None, verbose=False): them. Requirement is defined via :ref:`metadata routing <metadata_routing>`. This can be used to pass a validation set through the pipeline for instance. - See the example TBD for more details. - You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. From 179dc88f1a2601f8f5e7198803bcf3bd79e7f428 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 22 Oct 2024 08:54:41 +0200 Subject: [PATCH 17/22] fix tests --- sklearn/tests/metadata_routing_common.py | 1 + sklearn/tests/test_pipeline.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/sklearn/tests/metadata_routing_common.py b/sklearn/tests/metadata_routing_common.py index 174164daada8c..98503652df6f0 100644 --- a/sklearn/tests/metadata_routing_common.py +++ b/sklearn/tests/metadata_routing_common.py @@ -347,6 +347,7 @@ def fit(self, X, y=None, sample_weight="default", metadata="default"): record_metadata_not_default( self, sample_weight=sample_weight, metadata=metadata ) + self.fitted_ = True return self def transform(self, X, sample_weight="default", metadata="default"): diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 6e6aa1def79d5..1400330cb1337 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1918,6 +1918,7 @@ def test_transform_input_explicit_value_check(): class Transformer(TransformerMixin, BaseEstimator): def fit(self, X, y): + self.fitted_ = True return self def transform(self, X): @@ -1967,10 +1968,12 @@ def fit(self, X, y, X_val=None, y_val=None): assert_array_equal(y_val[0], np.array([0, 1])) assert_array_equal(X_val[1], np.array([[11, 12]])) assert_array_equal(y_val[1], np.array([1, 2])) +
self.fitted_ = True return self class Transformer(TransformerMixin, BaseEstimator): def fit(self, X, y): + self.fitted_ = True return self def transform(self, X): From 18530b909ebb0905cee4ce3e39b707d2801f9b15 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 31 Oct 2024 12:03:10 +0100 Subject: [PATCH 18/22] remove debug message --- sklearn/tests/test_pipeline.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index c0a59309fa2d0..b92222027a0e8 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -2160,8 +2160,6 @@ def set_request(est, method, **kwarg): if "fit" not in method: pipeline = pipeline.fit(X, y, sample_weight=sample_weight, prop=prop) - if method == "inverse_transform": - print("ha") try: getattr(pipeline, method)( X, y, sample_weight=sample_weight, prop=prop, metadata=metadata From f3ecedd5740e5f5c942707053e34846dabdadac8 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Mon, 4 Nov 2024 09:05:17 +0100 Subject: [PATCH 19/22] Update sklearn/pipeline.py Co-authored-by: Jiaming Yuan --- sklearn/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index a3bf0cd313085..e75c9547b4033 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -516,7 +516,7 @@ def _get_metadate_for_step(self, *, step_idx, step_params, all_params): transformed_cache = dict() # used to transform each param once # `step_params` is the output of `process_routing`, so it has a dict for each # method (e.g. fit, transform, predict), which are the args to be passed to - # those methods. We need to transforme the parameters which are in the + # those methods. We need to transform the parameters which are in the # `transform_input`, before returning these dicts. 
for method, method_params in step_params.items(): transformed_params[method] = Bunch() From df36b3386e26bf00d8588979cd4e8f32bececc4c Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 7 Nov 2024 20:46:09 +0100 Subject: [PATCH 20/22] ... --- .../upcoming_changes/sklearn.pipeline/28901.major-feature.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/upcoming_changes/sklearn.pipeline/28901.major-feature.rst b/doc/whats_new/upcoming_changes/sklearn.pipeline/28901.major-feature.rst index 1eae1aa526fdc..60703872d3980 100644 --- a/doc/whats_new/upcoming_changes/sklearn.pipeline/28901.major-feature.rst +++ b/doc/whats_new/upcoming_changes/sklearn.pipeline/28901.major-feature.rst @@ -1,3 +1,3 @@ - :class:`pipeline.Pipeline` can now transform metadata up to the step requiring the metadata, which can be set using the `transform_input` parameter. - By `Adrin Jalali`_. + By `Adrin Jalali`_ From 31a847a6c7e2436e1917854381ee27f053e379b2 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Fri, 8 Nov 2024 08:50:40 +0100 Subject: [PATCH 21/22] Apply suggestions from code review Co-authored-by: Guillaume Lemaitre --- sklearn/pipeline.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 03b495c46d7fe..6cb635698abce 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -171,6 +171,8 @@ class Pipeline(_BaseComposition): You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. + + .. versionadded:: 1.6 memory : str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. 
The last step @@ -462,7 +464,7 @@ def _check_method_params(self, method, props, **kwargs): fit_params_steps[step]["fit_predict"][param] = pval return fit_params_steps - def _get_metadate_for_step(self, *, step_idx, step_params, all_params): + def _get_metadata_for_step(self, *, step_idx, step_params, all_params): """Get params (metadata) for step `name`. This transforms the metadata up to this step if required, which is @@ -570,7 +572,7 @@ def _fit(self, X, y=None, routed_params=None, raw_params=None): else: cloned_transformer = clone(transformer) # Fit or load from cache the current transformer - step_params = self._get_metadate_for_step( + step_params = self._get_metadata_for_step( step_idx=step_idx, step_params=routed_params[name], all_params=raw_params, @@ -645,7 +647,7 @@ def fit(self, X, y=None, **params): Xt = self._fit(X, y, routed_params, raw_params=params) with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): if self._final_estimator != "passthrough": - last_step_params = self._get_metadate_for_step( + last_step_params = self._get_metadata_for_step( step_idx=len(self) - 1, step_params=routed_params[self.steps[-1][0]], all_params=params, @@ -712,7 +714,7 @@ def fit_transform(self, X, y=None, **params): with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): if last_step == "passthrough": return Xt - last_step_params = self._get_metadate_for_step( + last_step_params = self._get_metadata_for_step( step_idx=len(self) - 1, step_params=routed_params[self.steps[-1][0]], all_params=params, @@ -1461,6 +1463,8 @@ def make_pipeline(*steps, memory=None, transform_input=None, verbose=False): You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. + + .. 
versionadded:: 1.6 verbose : bool, default=False If True, the time elapsed while fitting each step will be printed as it From 496d5f2b30212cfc6057a81f6573045c266790e2 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 8 Nov 2024 09:00:56 +0100 Subject: [PATCH 22/22] lint --- sklearn/pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 6cb635698abce..62d69b644d130 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -171,7 +171,7 @@ class Pipeline(_BaseComposition): You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. - + .. versionadded:: 1.6 memory : str or object with the joblib.Memory interface, default=None @@ -1463,7 +1463,7 @@ def make_pipeline(*steps, memory=None, transform_input=None, verbose=False): You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. - + .. versionadded:: 1.6 verbose : bool, default=False