From 868d0ff4fa9593fd9e9c65429a7bcb678dfa2be7 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 15 Apr 2024 11:25:05 +0200 Subject: [PATCH 01/18] FEAT allow metadata to be transformed in Pipeline --- sklearn/pipeline.py | 45 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 1b17599068d7a..4816b69a10e12 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -95,6 +95,17 @@ class Pipeline(_BaseComposition): must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details. + transform_input : list of str, default=None + This enables transforming some input arguments to ``fit`` (other than ``X``) + to be transformed by the steps of the pipeline up to the step which requires + them. Requirement is defined via :ref:`metadata routing `. + This can be used to pass a validation set through the pipeline for instance. + + See the example TBD for more details. + + You can only set this if metadata routing is enabled, which you + can enable using ``sklearn.set_config(enable_metadata_routing=True)``. + memory : str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no @@ -162,12 +173,14 @@ class Pipeline(_BaseComposition): _parameter_constraints: dict = { "steps": [list, Hidden(tuple)], + "transform_input": [list, None], "memory": [None, str, HasMethods(["cache"])], "verbose": ["boolean"], } - def __init__(self, steps, *, memory=None, verbose=False): + def __init__(self, steps, *, transform_input=None, memory=None, verbose=False): self.steps = steps + self.transform_input = transform_input self.memory = memory self.verbose = verbose @@ -409,7 +422,7 @@ def _fit(self, X, y=None, routed_params=None): cloned_transformer, X, y, - None, + weight=None, message_clsname="Pipeline", message=self._log_message(step_idx), params=routed_params[name], @@ -1288,7 +1301,14 @@ def _transform_one(transformer, X, y, weight, params): def _fit_transform_one( - transformer, X, y, weight, message_clsname="", message=None, params=None + transformer, + X, + y, + weight, + message_clsname="", + message=None, + params=None, + to_transform=None, ): """ Fits ``transformer`` to ``X`` and ``y``. The transformed result is returned @@ -1296,8 +1316,20 @@ def _fit_transform_one( be multiplied by ``weight``. ``params`` needs to be of the form ``process_routing()["step_name"]``. + + ``to_transform`` is a dict of {arg: value} for input parameters to be + transformed along ``X``. """ params = params or {} + to_transform = to_transform or {} + if weight is not None and to_transform: + # This should never happen! "to_transform" is used in Pipeline, while + # weight is used in ColumnTransformer and/or FeatureUnion. + raise ValueError( + "Cannot apply weight and transform parameters simultaneously. " + "Got weight={}, to_transform={}".format(weight, to_transform) + ) + with _print_elapsed_time(message_clsname, message): if hasattr(transformer, "fit_transform"): res = transformer.fit_transform(X, y, **params.get("fit_transform", {})) @@ -1305,10 +1337,13 @@ def _fit_transform_one( res = transformer.fit(X, y, **params.get("fit", {})).transform( X, **params.get("transform", {}) ) + transformed = dict() + for param, value in to_transform.items(): + transformed[param] = transformer.transform(value) if weight is None: - return res, transformer - return res * weight, transformer + return res, transformed, transformer + return res * weight, transformed, transformer def _fit_one(transformer, X, y, weight, message_clsname="", message=None, params=None): From 94c8bd96349c8a6f1611223c739715eb67f2158d Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 26 Apr 2024 17:28:31 +0200 Subject: [PATCH 02/18] add tests --- sklearn/pipeline.py | 83 +++++++++++++++++++++++++--------- sklearn/tests/test_pipeline.py | 60 +++++++++++++++++++++++- 2 files changed, 120 insertions(+), 23 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 487074a666d23..9b54ddbe40ba0 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -393,6 +393,32 @@ def _check_method_params(self, method, props, **kwargs): fit_params_steps[step]["fit_predict"][param] = pval return fit_params_steps + def _get_step_params(self, *, step_idx, params): + """Get params (metadata) for step `name`. + + This transforms the metadata up to this step if required, which is + indicated by the `transform_input` parameter. + + If a param in `params` is included in the `transform_input` list, it + will be transformed. + """ + if self.transform_input is None or params is None or step_idx == 0: + return params + + step_params = dict() + transformed = dict() # used to transform each param once + for method, method_params in params.items(): + step_params[method] = Bunch() + for param_name, param_value in method_params.items(): + if param_name in self.transform_input: + # transform the parameter + if param_name not in transformed: + transformed[param_name] = self[:step_idx].transform(param_value) + step_params[method][param_name] = transformed[param_name] + else: + step_params[method][param_name] = param_value + return step_params + # Estimator interface def _fit(self, X, y=None, routed_params=None): @@ -418,6 +444,10 @@ def _fit(self, X, y=None, routed_params=None): else: cloned_transformer = clone(transformer) # Fit or load from cache the current transformer + step_params = self._get_step_params( + step_idx=step_idx, params=routed_params[name] + ) + X, fitted_transformer = fit_transform_one_cached( cloned_transformer, X, @@ -425,7 +455,7 @@ def _fit(self, X, y=None, routed_params=None): weight=None, message_clsname="Pipeline", message=self._log_message(step_idx), - params=routed_params[name], + params=step_params, ) # Replace the transformer of the step with the fitted # transformer. This is necessary when loading the transformer @@ -480,11 +510,20 @@ def fit(self, X, y=None, **params): self : object Pipeline with fitted steps. """ + if not _routing_enabled() and self.transform_input is not None: + raise ValueError( + "The `transform_input` parameter can only be set if metadata " + "routing is enabled. You can enable metadata routing using " + "`sklearn.set_config(enable_metadata_routing=True)`." + ) + routed_params = self._check_method_params(method="fit", props=params) Xt = self._fit(X, y, routed_params) with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): if self._final_estimator != "passthrough": - last_step_params = routed_params[self.steps[-1][0]] + last_step_params = self._get_step_params( + step_idx=len(self) - 1, params=routed_params[self.steps[-1][0]] + ) self._final_estimator.fit(Xt, y, **last_step_params["fit"]) return self @@ -1223,7 +1262,7 @@ def _name_estimators(estimators): return list(zip(names, estimators)) -def make_pipeline(*steps, memory=None, verbose=False): +def make_pipeline(*steps, memory=None, transform_input=None, verbose=False): """Construct a :class:`Pipeline` from the given estimators. This is a shorthand for the :class:`Pipeline` constructor; it does not @@ -1245,6 +1284,17 @@ def make_pipeline(*steps, memory=None, verbose=False): or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. + transform_input : list of str, default=None + This enables transforming some input arguments to ``fit`` (other than ``X``) + to be transformed by the steps of the pipeline up to the step which requires + them. Requirement is defined via :ref:`metadata routing `. + This can be used to pass a validation set through the pipeline for instance. + + See the example TBD for more details. + + You can only set this if metadata routing is enabled, which you + can enable using ``sklearn.set_config(enable_metadata_routing=True)``. + verbose : bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed. @@ -1268,7 +1318,12 @@ def make_pipeline(*steps, memory=None, verbose=False): Pipeline(steps=[('standardscaler', StandardScaler()), ('gaussiannb', GaussianNB())]) """ - return Pipeline(_name_estimators(steps), memory=memory, verbose=verbose) + return Pipeline( + _name_estimators(steps), + transform_input=transform_input, + memory=memory, + verbose=verbose, + ) def _transform_one(transformer, X, y, weight, columns=None, params=None): @@ -1315,7 +1370,6 @@ def _fit_transform_one( message_clsname="", message=None, params=None, - to_transform=None, ): """ Fits ``transformer`` to ``X`` and ``y``. The transformed result is returned @@ -1323,23 +1377,11 @@ def _fit_transform_one( be multiplied by ``weight``. ``params`` needs to be of the form ``process_routing()["step_name"]``. - - ``to_transform`` is a dict of {arg: value} for input parameters to be - transformed along ``X``. """ if columns is not None: X = _safe_indexing(X, columns, axis=1) params = params or {} - to_transform = to_transform or {} - if weight is not None and to_transform: - # This should never happen! "to_transform" is used in Pipeline, while - # weight is used in ColumnTransformer and/or FeatureUnion. - raise ValueError( - "Cannot apply weight and transform parameters simultaneously. " - "Got weight={}, to_transform={}".format(weight, to_transform) - ) - with _print_elapsed_time(message_clsname, message): if hasattr(transformer, "fit_transform"): res = transformer.fit_transform(X, y, **params.get("fit_transform", {})) @@ -1347,13 +1389,10 @@ def _fit_transform_one( res = transformer.fit(X, y, **params.get("fit", {})).transform( X, **params.get("transform", {}) ) - transformed = dict() - for param, value in to_transform.items(): - transformed[param] = transformer.transform(value) if weight is None: - return res, transformed, transformer - return res * weight, transformed, transformer + return res, transformer + return res * weight, transformer def _fit_one(transformer, X, y, weight, message_clsname="", message=None, params=None): diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 1d4cfb3dd6e2b..59583a59a3d0d 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -334,7 +334,8 @@ def test_pipeline_raise_set_params_error(): # expected error message error_msg = re.escape( "Invalid parameter 'fake' for estimator Pipeline(steps=[('cls'," - " LinearRegression())]). Valid parameters are: ['memory', 'steps', 'verbose']." + " LinearRegression())]). Valid parameters are: ['memory', 'steps'," + " 'transform_input', 'verbose']." ) with pytest.raises(ValueError, match=error_msg): pipe.set_params(fake="nope") @@ -759,6 +760,7 @@ def make(): "memory": None, "m2__mult": 2, "last__mult": 5, + "transform_input": None, "verbose": False, } @@ -1792,6 +1794,62 @@ def test_feature_union_feature_names_in_(): assert not hasattr(union, "feature_names_in_") +# transform_input tests +# ===================== + + +class IncTransformer(BaseEstimator, TransformerMixin): + """Transformer that increments the input by 1.""" + + def __init__(self, expected_fit_param=None, metadata_expected=True): + self.expected_fit_param = expected_fit_param + self.metadata_expected = metadata_expected + + def fit(self, X, y=None, expected_fit_param=None): + if self.metadata_expected: + assert_array_equal(expected_fit_param, self.expected_fit_param) + return self + + def transform(self, X): + return X + 1 + + +@pytest.mark.usefixtures("enable_slep006") +def test_transform_input_pipeline(): + """Test that with transform_input, data is correctly transformed for each step.""" + X = np.array([[1, 2], [3, 4]]) + y = np.array([0, 1]) + expected_fit_param = np.array([[1, 2]]) + pipe = make_pipeline( + IncTransformer(expected_fit_param=expected_fit_param).set_fit_request( + expected_fit_param=True + ), + IncTransformer().set_fit_request(expected_fit_param=False), + IncTransformer(expected_fit_param=expected_fit_param + 2).set_fit_request( + expected_fit_param=True + ), + IncTransformer(expected_fit_param=expected_fit_param + 3).set_fit_request( + expected_fit_param=True + ), + transform_input=["expected_fit_param"], + ) + + pipe.fit(X, y, expected_fit_param=expected_fit_param) + + +def test_transform_input_no_slep6(): + """Make sure the right error is raised if slep6 is not enabled.""" + X = np.array([[1, 2], [3, 4]]) + y = np.array([0, 1]) + msg = "The `transform_input` parameter can only be set if metadata" + with pytest.raises(ValueError, match=msg): + make_pipeline(DummyTransf(), transform_input=["blah"]).fit(X, y) + + +# end of transform_input tests +# ============================= + + # Test that metadata is routed correctly for pipelines and FeatureUnion # ===================================================================== From 818da329a21d43b3099266127c2b403433b9ad36 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 26 Apr 2024 17:34:08 +0200 Subject: [PATCH 03/18] add fit_transform --- sklearn/pipeline.py | 4 +++- sklearn/tests/test_pipeline.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 9b54ddbe40ba0..ac564cf76a7d0 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -590,7 +590,9 @@ def fit_transform(self, X, y=None, **params): with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): if last_step == "passthrough": return Xt - last_step_params = routed_params[self.steps[-1][0]] + last_step_params = self._get_step_params( + step_idx=len(self) - 1, params=routed_params[self.steps[-1][0]] + ) if hasattr(last_step, "fit_transform"): return last_step.fit_transform( Xt, y, **last_step_params["fit_transform"] diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 59583a59a3d0d..ae8d0c2f0c3ef 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1835,6 +1835,7 @@ def test_transform_input_pipeline(): ) pipe.fit(X, y, expected_fit_param=expected_fit_param) + pipe.fit_transform(X, y, expected_fit_param=expected_fit_param) def test_transform_input_no_slep6(): From 067946cb3515050d4ac1c5792e4243eec82d6ba7 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 29 Apr 2024 13:05:49 +0200 Subject: [PATCH 04/18] fix pprint test --- sklearn/utils/tests/test_pprint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_pprint.py b/sklearn/utils/tests/test_pprint.py index ec48c4a012574..4192f388a9f63 100644 --- a/sklearn/utils/tests/test_pprint.py +++ b/sklearn/utils/tests/test_pprint.py @@ -303,7 +303,7 @@ def test_pipeline(print_changed_only_false): penalty='l2', random_state=None, solver='warn', tol=0.0001, verbose=0, warm_start=False))], - verbose=False)""" + transform_input=None, verbose=False)""" expected = expected[1:] # remove first \n assert pipeline.__repr__() == expected From 85c10a426aad17bc40a5cc900a24901a20b6f216 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 7 May 2024 15:00:22 +0200 Subject: [PATCH 05/18] add changelog --- doc/whats_new/v1.6.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index b90394c75b6ff..7ef641e38f99e 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -40,3 +40,10 @@ Thanks to everyone who has contributed to the maintenance and improvement of the project since version 1.5, including: TODO: update at the time of the release. + +:mod:`pipeline` +--------------- + +- |Feature| :class:`pipeline.Pipeline` can now transform metadata up to the step + requiring the metadata, which can be set using the `transform_input` parameter. + :pr:`28901` by `Adrin Jalali`_. From ad269ea889ec3d49c57826d252eead22cd40c08e Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 8 May 2024 11:26:20 +0200 Subject: [PATCH 06/18] much more extensive tests --- sklearn/pipeline.py | 76 ++++++++++++----- sklearn/tests/metadata_routing_common.py | 51 +++++++----- sklearn/tests/test_pipeline.py | 102 ++++++++++++++++------- 3 files changed, 156 insertions(+), 73 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 7b36ec84896c3..4407fbffcfd6c 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -35,6 +35,7 @@ MethodMapping, _raise_for_params, _routing_enabled, + get_routing_for_object, process_routing, ) from .utils.metaestimators import _BaseComposition, available_if @@ -394,35 +395,66 @@ def _check_method_params(self, method, props, **kwargs): fit_params_steps[step]["fit_predict"][param] = pval return fit_params_steps - def _get_step_params(self, *, step_idx, params): + def _get_step_params(self, *, step_idx, step_params, all_params): """Get params (metadata) for step `name`. This transforms the metadata up to this step if required, which is indicated by the `transform_input` parameter. - If a param in `params` is included in the `transform_input` list, it + If a param in `step_params` is included in the `transform_input` list, it will be transformed. - """ - if self.transform_input is None or params is None or step_idx == 0: - return params - step_params = dict() - transformed = dict() # used to transform each param once - for method, method_params in params.items(): - step_params[method] = Bunch() + `all_params` are the metadata passed by the user. Used to call `transform` + on the pipeline itself. + """ + if ( + self.transform_input is None + or not all_params + or not step_params + or step_idx == 0 + ): + # we only need to process step_params if transform_input is set + # and metadata is given by the user. + return step_params + + sub_pipeline = self[:step_idx] + sub_metadata_routing = get_routing_for_object(sub_pipeline) + # here we get the metadata required by sub_pipeline.transform + transform_params = { + key: value + for key, value in all_params.items() + if key + in sub_metadata_routing.consumes( + method="transform", params=all_params.keys() + ) + } + transformed_params = dict() + transformed_cache = dict() # used to transform each param once + for method, method_params in step_params.items(): + transformed_params[method] = Bunch() for param_name, param_value in method_params.items(): if param_name in self.transform_input: # transform the parameter - if param_name not in transformed: - transformed[param_name] = self[:step_idx].transform(param_value) - step_params[method][param_name] = transformed[param_name] + if param_name not in transformed_cache: + transformed_cache[param_name] = sub_pipeline.transform( + param_value, **transform_params + ) + transformed_params[method][param_name] = transformed_cache[ + param_name + ] else: - step_params[method][param_name] = param_value - return step_params + transformed_params[method][param_name] = param_value + return transformed_params # Estimator interface - def _fit(self, X, y=None, routed_params=None): + def _fit(self, X, y=None, routed_params=None, raw_params=None): + """Fit the pipeline except the last step. + + routed_params is the output of `process_routing` + raw_params is the parameters passed by the user, used when `transform_input` + is set by the user, to transform metadata using a sub-pipeline. + """ # shallow copy of steps - this should really be steps_ self.steps = list(self.steps) self._validate_steps() @@ -446,7 +478,9 @@ def _fit(self, X, y=None, routed_params=None): cloned_transformer = clone(transformer) # Fit or load from cache the current transformer step_params = self._get_step_params( - step_idx=step_idx, params=routed_params[name] + step_idx=step_idx, + step_params=routed_params[name], + all_params=raw_params, ) X, fitted_transformer = fit_transform_one_cached( @@ -519,11 +553,13 @@ def fit(self, X, y=None, **params): ) routed_params = self._check_method_params(method="fit", props=params) - Xt = self._fit(X, y, routed_params) + Xt = self._fit(X, y, routed_params, raw_params=params) with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): if self._final_estimator != "passthrough": last_step_params = self._get_step_params( - step_idx=len(self) - 1, params=routed_params[self.steps[-1][0]] + step_idx=len(self) - 1, + step_params=routed_params[self.steps[-1][0]], + all_params=params, ) self._final_estimator.fit(Xt, y, **last_step_params["fit"]) @@ -592,7 +628,9 @@ def fit_transform(self, X, y=None, **params): if last_step == "passthrough": return Xt last_step_params = self._get_step_params( - step_idx=len(self) - 1, params=routed_params[self.steps[-1][0]] + step_idx=len(self) - 1, + step_params=routed_params[self.steps[-1][0]], + all_params=params, ) if hasattr(last_step, "fit_transform"): return last_step.fit_transform( diff --git a/sklearn/tests/metadata_routing_common.py b/sklearn/tests/metadata_routing_common.py index 889524bc05ddb..4e1abb6ce1d59 100644 --- a/sklearn/tests/metadata_routing_common.py +++ b/sklearn/tests/metadata_routing_common.py @@ -35,13 +35,15 @@ def record_metadata(obj, method, record_default=True, **kwargs): """ if not hasattr(obj, "_records"): obj._records = {} + if method not in obj._records: + obj._records[method] = [] if not record_default: kwargs = { key: val for key, val in kwargs.items() if not isinstance(val, str) or (val != "default") } - obj._records[method] = kwargs + obj._records[method].append(kwargs) def check_recorded_metadata(obj, method, split_params=tuple(), **kwargs): @@ -59,21 +61,24 @@ def check_recorded_metadata(obj, method, split_params=tuple(), **kwargs): **kwargs : dict passed metadata """ - records = getattr(obj, "_records", dict()).get(method, dict()) - assert set(kwargs.keys()) == set( - records.keys() - ), f"Expected {kwargs.keys()} vs {records.keys()}" - for key, value in kwargs.items(): - recorded_value = records[key] - # The following condition is used to check for any specified parameters - # being a subset of the original values - if key in split_params and recorded_value is not None: - assert np.isin(recorded_value, value).all() - else: - if isinstance(recorded_value, np.ndarray): - assert_array_equal(recorded_value, value) + all_records = getattr(obj, "_records", dict()).get(method, dict()) + for record in all_records: + assert set(kwargs.keys()) == set( + record.keys() + ), f"Expected {kwargs.keys()} vs {record.keys()}" + for key, value in kwargs.items(): + recorded_value = record[key] + # The following condition is used to check for any specified parameters + # being a subset of the original values + if key in split_params and recorded_value is not None: + assert np.isin(recorded_value, value).all() else: - assert recorded_value is value, f"Expected {recorded_value} vs {value}" + if isinstance(recorded_value, np.ndarray): + assert_array_equal(recorded_value, value) + else: + assert ( + recorded_value is value + ), f"Expected {recorded_value} vs {value}. Method: {method}" record_metadata_not_default = partial(record_metadata, record_default=False) @@ -306,7 +311,7 @@ class ConsumingTransformer(TransformerMixin, BaseEstimator): def __init__(self, registry=None): self.registry = registry - def fit(self, X, y=None, sample_weight=None, metadata=None): + def fit(self, X, y=None, sample_weight="default", metadata="default"): if self.registry is not None: self.registry.append(self) @@ -315,18 +320,18 @@ def fit(self, X, y=None, sample_weight=None, metadata=None): ) return self - def transform(self, X, sample_weight=None, metadata=None): - record_metadata( + def transform(self, X, sample_weight="default", metadata="default"): + record_metadata_not_default( self, "transform", sample_weight=sample_weight, metadata=metadata ) - return X + return X + 1 - def fit_transform(self, X, y, sample_weight=None, metadata=None): + def fit_transform(self, X, y, sample_weight="default", metadata="default"): # implementing ``fit_transform`` is necessary since # ``TransformerMixin.fit_transform`` doesn't route any metadata to # ``transform``, while here we want ``transform`` to receive # ``sample_weight`` and ``metadata``. - record_metadata( + record_metadata_not_default( self, "fit_transform", sample_weight=sample_weight, metadata=metadata ) return self.fit(X, y, sample_weight=sample_weight, metadata=metadata).transform( @@ -334,10 +339,10 @@ def fit_transform(self, X, y, sample_weight=None, metadata=None): ) def inverse_transform(self, X, sample_weight=None, metadata=None): - record_metadata( + record_metadata_not_default( self, "inverse_transform", sample_weight=sample_weight, metadata=metadata ) - return X + return X - 1 class ConsumingNoFitTransformTransformer(BaseEstimator): diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 9dc493031dc5a..5dd3f8b579ecc 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -335,7 +335,8 @@ def test_pipeline_raise_set_params_error(): # expected error message error_msg = re.escape( "Invalid parameter 'fake' for estimator Pipeline(steps=[('cls'," - " LinearRegression())]). Valid parameters are: ['memory', 'steps', 'verbose']." + " LinearRegression())]). Valid parameters are: ['memory', 'steps'," + " 'transform_input', 'verbose']." ) with pytest.raises(ValueError, match=error_msg): pipe.set_params(fake="nope") @@ -760,6 +761,7 @@ def make(): "memory": None, "m2__mult": 2, "last__mult": 5, + "transform_input": None, "verbose": False, } @@ -1817,44 +1819,82 @@ def test_pipeline_inverse_transform_Xt_deprecation(): # ===================== -class IncTransformer(BaseEstimator, TransformerMixin): - """Transformer that increments the input by 1.""" - - def __init__(self, expected_fit_param=None, metadata_expected=True): - self.expected_fit_param = expected_fit_param - self.metadata_expected = metadata_expected +@pytest.mark.usefixtures("enable_slep006") +@pytest.mark.parametrize("method", ["fit", "fit_transform"]) +def test_transform_input_pipeline(method): + """Test that with transform_input, data is correctly transformed for each step.""" - def fit(self, X, y=None, expected_fit_param=None): - if self.metadata_expected: - assert_array_equal(expected_fit_param, self.expected_fit_param) - return self + def get_transformer(registry, sample_weight, metadata): + """Get a transformer with requests set.""" + return ( + ConsumingTransformer(registry=registry) + .set_fit_request(sample_weight=sample_weight, metadata=metadata) + .set_transform_request(sample_weight=sample_weight, metadata=metadata) + ) - def transform(self, X): - return X + 1 + def get_pipeline(): + """Get a pipeline and corresponding registries. + The pipeline has 4 steps, with different request values set to test different + cases. One is aliased. + """ + registry_1, registry_2, registry_3, registry_4 = ( + _Registry(), + _Registry(), + _Registry(), + _Registry(), + ) + pipe = make_pipeline( + get_transformer(registry_1, sample_weight=True, metadata=True), + get_transformer(registry_2, sample_weight=False, metadata=False), + get_transformer(registry_3, sample_weight=True, metadata=True), + get_transformer(registry_4, sample_weight="other_weights", metadata=True), + transform_input=["sample_weight"], + ) + return pipe, registry_1, registry_2, registry_3, registry_4 + + def check_metadata(registry, methods, **metadata): + """Check that the right metadata was recorded for the given methods.""" + assert registry + for estimator in registry: + for method in methods: + check_recorded_metadata( + estimator, + method=method, + **metadata, + ) -@pytest.mark.usefixtures("enable_slep006") -def test_transform_input_pipeline(): - """Test that with transform_input, data is correctly transformed for each step.""" X = np.array([[1, 2], [3, 4]]) y = np.array([0, 1]) - expected_fit_param = np.array([[1, 2]]) - pipe = make_pipeline( - IncTransformer(expected_fit_param=expected_fit_param).set_fit_request( - expected_fit_param=True - ), - IncTransformer().set_fit_request(expected_fit_param=False), - IncTransformer(expected_fit_param=expected_fit_param + 2).set_fit_request( - expected_fit_param=True - ), - IncTransformer(expected_fit_param=expected_fit_param + 3).set_fit_request( - expected_fit_param=True - ), - transform_input=["expected_fit_param"], + sample_weight = np.array([[1, 2]]) + other_weights = np.array([[30, 40]]) + metadata = np.array([[100, 200]]) + + pipe, registry_1, registry_2, registry_3, registry_4 = get_pipeline() + pipe.fit( + X, + y, + sample_weight=sample_weight, + other_weights=other_weights, + metadata=metadata, ) - pipe.fit(X, y, expected_fit_param=expected_fit_param) - pipe.fit_transform(X, y, expected_fit_param=expected_fit_param) + check_metadata( + registry_1, ["fit", "transform"], sample_weight=sample_weight, metadata=metadata + ) + check_metadata(registry_2, ["fit", "transform"]) + check_metadata( + registry_3, + ["fit", "transform"], + sample_weight=sample_weight + 2, + metadata=metadata, + ) + check_metadata( + registry_4, + method.split("_"), # ["fit", "transform"] if "fit_transform", ["fit"] otherwise + sample_weight=other_weights + 3, + metadata=metadata, + ) def test_transform_input_no_slep6(): From 52685143a55487ed68b6e961b9ae8b624ab83f99 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 24 May 2024 20:25:31 +0200 Subject: [PATCH 07/18] more fixes --- sklearn/compose/tests/test_column_transformer.py | 2 +- sklearn/tests/test_metaestimators_metadata_routing.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index d0f2274272230..9c1705c9c7c6d 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -2631,7 +2631,7 @@ def test_metadata_routing_for_column_transformer(method): ) if method == "transform": - trs.fit(X, y) + trs.fit(X, y, sample_weight=sample_weight, metadata=metadata) trs.transform(X, sample_weight=sample_weight, metadata=metadata) else: getattr(trs, method)(X, y, sample_weight=sample_weight, metadata=metadata) diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index aa6af5bd09aac..3d4a0ddc7d68d 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -668,9 +668,10 @@ def test_setting_request_on_sub_estimator_removes_error(metaestimator): ) if "fit" not in method_name: # fit before calling method + print(method_mapping) set_requests( estimator, - method_mapping=metaestimator.get("method_mapping", {}), + method_mapping=method_mapping, methods=["fit"], metadata_name=key, ) @@ -760,7 +761,7 @@ def test_metadata_is_routed_correctly_to_scorer(metaestimator): method = getattr(instance, method_name) method_kwargs = {"sample_weight": sample_weight} if "fit" not in method_name: - instance.fit(X, y) + instance.fit(X, y, **method_kwargs) method(X, y, **method_kwargs) assert registry From 052b13d4aced708204b5e59bef56846ead2d014c Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Sun, 26 May 2024 14:21:41 +0200 Subject: [PATCH 08/18] WIP tests improvements --- sklearn/tests/metadata_routing_common.py | 64 +++++++++++-------- sklearn/tests/test_metadata_routing.py | 33 +++++++--- .../test_metaestimators_metadata_routing.py | 36 +++++------ 3 files changed, 77 insertions(+), 56 deletions(-) diff --git a/sklearn/tests/metadata_routing_common.py b/sklearn/tests/metadata_routing_common.py index 4a363de8e8e4f..64747510f6b6a 100644 --- a/sklearn/tests/metadata_routing_common.py +++ b/sklearn/tests/metadata_routing_common.py @@ -1,3 +1,4 @@ +import inspect from functools import partial import numpy as np @@ -25,7 +26,7 @@ from sklearn.utils.multiclass import _check_partial_fit_first_call -def record_metadata(obj, method, record_default=True, **kwargs): +def record_metadata(obj, record_default=True, **kwargs): """Utility function to store passed metadata to a method. If record_default is False, kwargs whose values are "default" are skipped. @@ -33,20 +34,25 @@ def record_metadata(obj, method, record_default=True, **kwargs): are skipped. """ + stack = inspect.stack() + method = stack[1].function + parent = stack[2].function if not hasattr(obj, "_records"): obj._records = {} if method not in obj._records: - obj._records[method] = [] + obj._records[method] = {} + if parent not in obj._records[method]: + obj._records[method][parent] = [] if not record_default: kwargs = { key: val for key, val in kwargs.items() if not isinstance(val, str) or (val != "default") } - obj._records[method].append(kwargs) + obj._records[method][parent].append(kwargs) -def check_recorded_metadata(obj, method, split_params=tuple(), **kwargs): +def check_recorded_metadata(obj, method, parent, split_params=tuple(), **kwargs): """Check whether the expected metadata is passed to the object's method. Parameters @@ -55,13 +61,17 @@ def check_recorded_metadata(obj, method, split_params=tuple(), **kwargs): sub-estimator to check routed params for method : str sub-estimator's method where metadata is routed to + parent : str + the parent method which should have called `method` or caller split_params : tuple, default=empty specifies any parameters which are to be checked as being a subset of the original values **kwargs : dict passed metadata """ - all_records = getattr(obj, "_records", dict()).get(method, dict()) + all_records = ( + getattr(obj, "_records", dict()).get(method, dict()).get(parent, list()) + ) for record in all_records: assert set(kwargs.keys()) == set( record.keys() @@ -156,7 +166,7 @@ def partial_fit(self, X, y, sample_weight="default", metadata="default"): self.registry.append(self) record_metadata_not_default( - self, "partial_fit", sample_weight=sample_weight, metadata=metadata + self, sample_weight=sample_weight, metadata=metadata ) return self @@ -165,19 +175,19 @@ def fit(self, X, y, sample_weight="default", metadata="default"): self.registry.append(self) record_metadata_not_default( - self, "fit", sample_weight=sample_weight, metadata=metadata + self, sample_weight=sample_weight, metadata=metadata ) return self def predict(self, X, y=None, sample_weight="default", metadata="default"): record_metadata_not_default( - self, "predict", sample_weight=sample_weight, metadata=metadata + self, sample_weight=sample_weight, metadata=metadata ) return np.zeros(shape=(len(X),)) def score(self, X, y, sample_weight="default", metadata="default"): record_metadata_not_default( - self, "score", sample_weight=sample_weight, metadata=metadata + self, sample_weight=sample_weight, metadata=metadata ) return 1 @@ -245,7 +255,7 @@ def partial_fit( self.registry.append(self) record_metadata_not_default( - self, "partial_fit", sample_weight=sample_weight, metadata=metadata + self, sample_weight=sample_weight, metadata=metadata ) _check_partial_fit_first_call(self, classes) return self @@ -255,7 +265,7 @@ def fit(self, X, y, sample_weight="default", metadata="default"): self.registry.append(self) record_metadata_not_default( - self, "fit", sample_weight=sample_weight, metadata=metadata + self, sample_weight=sample_weight, metadata=metadata ) self.classes_ = np.unique(y) @@ -263,7 +273,7 @@ def fit(self, X, y, sample_weight="default", metadata="default"): def predict(self, X, sample_weight="default", metadata="default"): record_metadata_not_default( - self, "predict", sample_weight=sample_weight, metadata=metadata + self, sample_weight=sample_weight, metadata=metadata ) y_score = np.empty(shape=(len(X),), dtype="int8") y_score[len(X) // 2 :] = 0 @@ -272,7 +282,7 @@ def predict(self, X, sample_weight="default", metadata="default"): def predict_proba(self, X, sample_weight="default", metadata="default"): record_metadata_not_default( - self, "predict_proba", sample_weight=sample_weight, metadata=metadata + self, sample_weight=sample_weight, metadata=metadata ) y_proba = np.empty(shape=(len(X), 2)) y_proba[: len(X) // 2, :] = np.asarray([1.0, 0.0]) @@ -284,13 +294,13 @@ def predict_log_proba(self, X, sample_weight="default", metadata="default"): # uncomment when needed # record_metadata_not_default( - # self, "predict_log_proba", sample_weight=sample_weight, metadata=metadata + # self, sample_weight=sample_weight, metadata=metadata # ) # return np.zeros(shape=(len(X), 2)) def decision_function(self, X, sample_weight="default", metadata="default"): record_metadata_not_default( - self, "predict_proba", sample_weight=sample_weight, metadata=metadata + self, sample_weight=sample_weight, metadata=metadata ) y_score = np.empty(shape=(len(X),)) y_score[len(X) // 2 :] = 0 @@ -300,7 +310,7 @@ def decision_function(self, X, sample_weight="default", metadata="default"): # uncomment when needed # def score(self, X, y, sample_weight="default", metadata="default"): # record_metadata_not_default( - # self, "score", sample_weight=sample_weight, metadata=metadata + # self, sample_weight=sample_weight, metadata=metadata # ) # return 1 @@ -325,13 +335,13 @@ def fit(self, X, y=None, sample_weight="default", metadata="default"): self.registry.append(self) record_metadata_not_default( - self, "fit", sample_weight=sample_weight, metadata=metadata + self, sample_weight=sample_weight, metadata=metadata ) return self def transform(self, X, sample_weight="default", metadata="default"): record_metadata_not_default( - self, "transform", sample_weight=sample_weight, metadata=metadata + self, sample_weight=sample_weight, metadata=metadata ) return X + 1 @@ -341,7 +351,7 @@ def fit_transform(self, X, y, sample_weight="default", metadata="default"): # ``transform``, while here we want ``transform`` to receive # ``sample_weight`` and ``metadata``. record_metadata_not_default( - self, "fit_transform", sample_weight=sample_weight, metadata=metadata + self, sample_weight=sample_weight, metadata=metadata ) return self.fit(X, y, sample_weight=sample_weight, metadata=metadata).transform( X, sample_weight=sample_weight, metadata=metadata @@ -349,7 +359,7 @@ def fit_transform(self, X, y, sample_weight="default", metadata="default"): def inverse_transform(self, X, sample_weight=None, metadata=None): record_metadata_not_default( - self, "inverse_transform", sample_weight=sample_weight, metadata=metadata + self, sample_weight=sample_weight, metadata=metadata ) return X - 1 @@ -366,14 +376,12 @@ def fit(self, X, y=None, sample_weight=None, metadata=None): if self.registry is not None: self.registry.append(self) - record_metadata(self, "fit", sample_weight=sample_weight, metadata=metadata) + record_metadata(self, sample_weight=sample_weight, metadata=metadata) return self def transform(self, X, sample_weight=None, metadata=None): - record_metadata( - self, "transform", sample_weight=sample_weight, metadata=metadata - ) + record_metadata(self, sample_weight=sample_weight, metadata=metadata) return X @@ -388,7 +396,7 @@ def _score(self, method_caller, clf, X, y, **kwargs): if self.registry is not None: self.registry.append(self) - record_metadata_not_default(self, "score", **kwargs) + record_metadata_not_default(self, **kwargs) sample_weight = kwargs.get("sample_weight", None) return super()._score(method_caller, clf, X, y, sample_weight=sample_weight) @@ -402,7 +410,7 @@ def split(self, X, y=None, groups="default", metadata="default"): if self.registry is not None: self.registry.append(self) - record_metadata_not_default(self, "split", groups=groups, metadata=metadata) + record_metadata_not_default(self, groups=groups, metadata=metadata) split_index = len(X) // 2 train_indices = list(range(0, split_index)) @@ -450,7 +458,7 @@ def fit(self, X, y, sample_weight=None, **fit_params): if self.registry is not None: self.registry.append(self) - record_metadata(self, "fit", sample_weight=sample_weight) + record_metadata(self, sample_weight=sample_weight) params = process_routing(self, "fit", sample_weight=sample_weight, **fit_params) self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) return self @@ -484,7 +492,7 @@ def fit(self, X, y, sample_weight=None, **kwargs): if self.registry is not None: self.registry.append(self) - record_metadata(self, "fit", sample_weight=sample_weight) + record_metadata(self, sample_weight=sample_weight) params = process_routing(self, "fit", sample_weight=sample_weight, **kwargs) self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) return self diff --git a/sklearn/tests/test_metadata_routing.py b/sklearn/tests/test_metadata_routing.py index 109c730bf0718..e2a5351192a42 100644 --- a/sklearn/tests/test_metadata_routing.py +++ b/sklearn/tests/test_metadata_routing.py @@ -327,14 +327,16 @@ def test_simple_metadata_routing(): # and passing metadata to the consumer directly is fine regardless of its # metadata_request values. clf.fit(X, y, sample_weight=my_weights) - check_recorded_metadata(clf.estimator_, "fit") + check_recorded_metadata(clf.estimator_, method="fit", parent="fit") # Requesting a metadata will make the meta-estimator forward it correctly clf = WeightedMetaClassifier( estimator=ConsumingClassifier().set_fit_request(sample_weight=True) ) clf.fit(X, y, sample_weight=my_weights) - check_recorded_metadata(clf.estimator_, "fit", sample_weight=my_weights) + check_recorded_metadata( + clf.estimator_, method="fit", parent="fit", sample_weight=my_weights + ) # And requesting it with an alias clf = WeightedMetaClassifier( @@ -343,7 +345,9 @@ def test_simple_metadata_routing(): ) ) clf.fit(X, y, alternative_weight=my_weights) - check_recorded_metadata(clf.estimator_, "fit", sample_weight=my_weights) + check_recorded_metadata( + clf.estimator_, method="fit", parent="fit", sample_weight=my_weights + ) def test_nested_routing(): @@ -367,17 +371,30 @@ def test_nested_routing(): X, y, metadata=my_groups, sample_weight=w1, outer_weights=w2, inner_weights=w3 ) check_recorded_metadata( - pipeline.steps_[0].transformer_, "fit", metadata=my_groups, sample_weight=None + pipeline.steps_[0].transformer_, + method="fit", + parent="fit", + metadata=my_groups, + ) + check_recorded_metadata( + pipeline.steps_[0].transformer_, + method="transform", + parent="fit", + sample_weight=w1, + ) + check_recorded_metadata( + pipeline.steps_[1], method="fit", parent="fit", sample_weight=w2 ) check_recorded_metadata( - pipeline.steps_[0].transformer_, "transform", sample_weight=w1, metadata=None + pipeline.steps_[1].estimator_, method="fit", parent="fit", sample_weight=w3 ) - check_recorded_metadata(pipeline.steps_[1], "fit", sample_weight=w2) - check_recorded_metadata(pipeline.steps_[1].estimator_, "fit", sample_weight=w3) pipeline.predict(X, sample_weight=w3) check_recorded_metadata( - pipeline.steps_[0].transformer_, "transform", sample_weight=w3, metadata=None + pipeline.steps_[0].transformer_, + method="transform", + parent="fit", + sample_weight=w3, ) diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index 5b69791ad3013..db559311817f6 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -685,14 +685,7 @@ def test_setting_request_on_sub_estimator_removes_error(metaestimator): ) if "fit" not in method_name: # fit before calling method - print(method_mapping) - set_requests( - estimator, - method_mapping=method_mapping, - methods=["fit"], - metadata_name=key, - ) - instance.fit(X, y, **method_kwargs, **extra_method_args) + instance.fit(X, y) try: # `fit` and `partial_fit` accept y, others don't. method(X, y, **method_kwargs, **extra_method_args) @@ -702,17 +695,17 @@ def test_setting_request_on_sub_estimator_removes_error(metaestimator): # sanity check that registry is not empty, or else the test passes # trivially assert registry - if preserves_metadata is True: - for estimator in registry: - check_recorded_metadata(estimator, method_name, **method_kwargs) - elif preserves_metadata == "subset": - for estimator in registry: - check_recorded_metadata( - estimator, - method_name, - split_params=method_kwargs.keys(), - **method_kwargs, - ) + split_params = ( + method_kwargs.keys() if preserves_metadata == "subset" else () + ) + for estimator in registry: + check_recorded_metadata( + estimator, + method=method_name, + parent=method_name, + split_params=split_params, + **method_kwargs, + ) @pytest.mark.parametrize("metaestimator", METAESTIMATORS, ids=METAESTIMATOR_IDS) @@ -786,6 +779,7 @@ def test_metadata_is_routed_correctly_to_scorer(metaestimator): check_recorded_metadata( obj=_scorer, method="score", + parent=method_name, split_params=("sample_weight",), **method_kwargs, ) @@ -820,4 +814,6 @@ def test_metadata_is_routed_correctly_to_splitter(metaestimator): method(X_, y_, **method_kwargs) assert registry for _splitter in registry: - check_recorded_metadata(obj=_splitter, method="split", **method_kwargs) + check_recorded_metadata( + obj=_splitter, method="split", parent=method_name, **method_kwargs + ) From 278dc7007df669732b42c3cd9567e33b85038b77 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Sun, 26 May 2024 19:38:07 +0200 Subject: [PATCH 09/18] TST fix pipeline tests --- .../compose/tests/test_column_transformer.py | 6 +++- sklearn/ensemble/tests/test_stacking.py | 12 +++++-- sklearn/ensemble/tests/test_voting.py | 2 +- sklearn/model_selection/tests/test_search.py | 1 + .../model_selection/tests/test_validation.py | 4 +++ sklearn/tests/test_pipeline.py | 31 +++++++++++++++---- 6 files changed, 46 insertions(+), 10 deletions(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 9c1705c9c7c6d..33a886bfb7e80 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -2639,7 +2639,11 @@ def test_metadata_routing_for_column_transformer(method): assert len(registry) for _trs in registry: check_recorded_metadata( - obj=_trs, method=method, sample_weight=sample_weight, metadata=metadata + obj=_trs, + method=method, + parent=method, + sample_weight=sample_weight, + metadata=metadata, ) diff --git a/sklearn/ensemble/tests/test_stacking.py b/sklearn/ensemble/tests/test_stacking.py index 1c038cd469216..507f238d12148 100644 --- a/sklearn/ensemble/tests/test_stacking.py +++ b/sklearn/ensemble/tests/test_stacking.py @@ -973,13 +973,21 @@ def test_metadata_routing_for_stacking_estimators(Estimator, Child, prop, prop_v assert len(registry) for sub_est in registry: check_recorded_metadata( - obj=sub_est, method="fit", split_params=(prop), **{prop: prop_value} + obj=sub_est, + method="fit", + parent="fit", + split_params=(prop), + **{prop: prop_value}, ) # access final_estimator: registry = est.final_estimator_.registry assert len(registry) check_recorded_metadata( - obj=registry[-1], method="predict", split_params=(prop), **{prop: prop_value} + obj=registry[-1], + method="predict", + parent="predict", + split_params=(prop), + **{prop: prop_value}, ) diff --git a/sklearn/ensemble/tests/test_voting.py b/sklearn/ensemble/tests/test_voting.py index 4b2c365752b72..3800925fa17d0 100644 --- a/sklearn/ensemble/tests/test_voting.py +++ b/sklearn/ensemble/tests/test_voting.py @@ -759,7 +759,7 @@ def test_metadata_routing_for_voting_estimators(Estimator, Child, prop): registry = estimator[1].registry assert len(registry) for sub_est in registry: - check_recorded_metadata(obj=sub_est, method="fit", **kwargs) + check_recorded_metadata(obj=sub_est, method="fit", parent="fit", **kwargs) @pytest.mark.usefixtures("enable_slep006") diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index cb4af646aee39..5ef60c7bad577 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -2612,6 +2612,7 @@ def test_multi_metric_search_forwards_metadata(SearchCV, param_search): check_recorded_metadata( obj=_scorer, method="score", + parent="_score", split_params=("sample_weight", "metadata"), sample_weight=score_weights, metadata=score_metadata, diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 679c0052e3956..ffaf069f19c1d 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -2600,6 +2600,7 @@ def test_validation_functions_routing(func): check_recorded_metadata( obj=_scorer, method="score", + parent=func.__name__, split_params=("sample_weight", "metadata"), sample_weight=score_weights, metadata=score_metadata, @@ -2610,6 +2611,7 @@ def test_validation_functions_routing(func): check_recorded_metadata( obj=_splitter, method="split", + parent=func.__name__, groups=split_groups, metadata=split_metadata, ) @@ -2619,6 +2621,7 @@ def test_validation_functions_routing(func): check_recorded_metadata( obj=_estimator, method="fit", + parent=func.__name__, split_params=("sample_weight", "metadata"), sample_weight=fit_sample_weight, metadata=fit_metadata, @@ -2656,6 +2659,7 @@ def test_learning_curve_exploit_incremental_learning_routing(): check_recorded_metadata( obj=_estimator, method="partial_fit", + parent="learning_curve", split_params=("sample_weight", "metadata"), sample_weight=fit_sample_weight, metadata=fit_metadata, diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 5dd3f8b579ecc..9614323f78bb0 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1861,6 +1861,7 @@ def check_metadata(registry, methods, **metadata): check_recorded_metadata( estimator, method=method, + parent=method, **metadata, ) @@ -1925,38 +1926,47 @@ def fit(self, X, y, sample_weight=None, prop=None): def fit_transform(self, X, y, sample_weight=None, prop=None): assert sample_weight is not None assert prop is not None + return X + 1 def fit_predict(self, X, y, sample_weight=None, prop=None): assert sample_weight is not None assert prop is not None + return np.ones(len(X)) def predict(self, X, sample_weight=None, prop=None): assert sample_weight is not None assert prop is not None + return np.ones(len(X)) def predict_proba(self, X, sample_weight=None, prop=None): assert sample_weight is not None assert prop is not None + return np.ones(len(X)) def predict_log_proba(self, X, sample_weight=None, prop=None): assert sample_weight is not None assert prop is not None + return np.zeros(len(X)) def decision_function(self, X, sample_weight=None, prop=None): assert sample_weight is not None assert prop is not None + return np.ones(len(X)) def score(self, X, y, sample_weight=None, prop=None): assert sample_weight is not None assert prop is not None + return 1 def transform(self, X, sample_weight=None, prop=None): assert sample_weight is not None assert prop is not None + return X + 1 def inverse_transform(self, X, sample_weight=None, prop=None): assert sample_weight is not None assert prop is not None + return X - 1 @pytest.mark.usefixtures("enable_slep006") @@ -1980,7 +1990,7 @@ def set_request(est, method, **kwarg): getattr(est, f"set_{method}_request")(**kwarg) return est - X, y = [[1]], [1] + X, y = np.array([[1]]), np.array([1]) sample_weight, prop, metadata = [1], "a", "b" # test that metadata is routed correctly for pipelines when requested @@ -1996,10 +2006,10 @@ def set_request(est, method, **kwarg): pipeline = Pipeline([("trs", trs), ("estimator", est)]) if "fit" not in method: - pipeline = pipeline.fit( - [[1]], [1], sample_weight=sample_weight, prop=prop, metadata=metadata - ) + pipeline = pipeline.fit(X, y, sample_weight=sample_weight, prop=prop) + if method == "inverse_transform": + print("ha") try: getattr(pipeline, method)( X, y, sample_weight=sample_weight, prop=prop, metadata=metadata @@ -2013,10 +2023,18 @@ def set_request(est, method, **kwarg): # Make sure the transformer has received the metadata # For the transformer, always only `fit` and `transform` are called. check_recorded_metadata( - obj=trs, method="fit", sample_weight=sample_weight, metadata=metadata + obj=trs, + method="fit", + parent="fit", + sample_weight=sample_weight, + metadata=metadata, ) check_recorded_metadata( - obj=trs, method="transform", sample_weight=sample_weight, metadata=metadata + obj=trs, + method="transform", + parent="transform", + sample_weight=sample_weight, + metadata=metadata, ) @@ -2171,6 +2189,7 @@ def test_feature_union_metadata_routing(transformer): check_recorded_metadata( obj=sub_trans, method="fit", + parent="fit", **kwargs, ) From 0716f514c05025a42252a515f188501ad6c58c3f Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 10 Jun 2024 14:45:08 +0200 Subject: [PATCH 10/18] revert pipeline's transform_input --- doc/whats_new/v1.6.rst | 67 ++++++++++++--- sklearn/pipeline.py | 126 +++-------------------------- sklearn/tests/test_pipeline.py | 96 ---------------------- sklearn/utils/tests/test_pprint.py | 2 +- 4 files changed, 66 insertions(+), 225 deletions(-) diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index 337d0028e750f..da89deeab9d38 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -32,17 +32,24 @@ See :ref:`array_api` for more details. **Functions:** -- :func:`sklearn.metrics.mean_tweedie_deviance` now supports Array API compatible - inputs. - :pr:`28106` by :user:`Thomas Li `; +- :func:`sklearn.metrics.d2_tweedie_score` :pr:`29207` by :user:`Emily Chen `; - :func:`sklearn.metrics.mean_absolute_error` :pr:`27736` by :user:`Edoardo Abati `; +- :func:`sklearn.metrics.mean_squared_error` :pr:`29142` by :user:`Yaroslav Korobko `; +- :func:`sklearn.metrics.mean_tweedie_deviance` :pr:`28106` by :user:`Thomas Li `; - :func:`sklearn.metrics.pairwise.cosine_similarity` :pr:`29014` by :user:`Edoardo Abati `. +- :func:`sklearn.metrics.pairwise.paired_cosine_distances` :pr:`29112` by :user:`Edoardo Abati `. **Classes:** - :class:`preprocessing.LabelEncoder` now supports Array API compatible inputs. :pr:`27381` by :user:`Omar Salman `. +- :class:`model_selection.GridSearchCV`, + :class:`model_selection.RandomizedSearchCV`, + :class:`model_selection.HalvingGridSearchCV` and + :class:`model_selection.HalvingRandomSearchCV` now support Array API + compatible inputs when their base estimators do. :pr:`27096` by :user:`Tim + Head ` and :user:`Olivier Grisel `. Metadata Routing ---------------- @@ -60,6 +67,14 @@ more details. ``**fit_params`` to the underlying estimators via their `fit` methods. :pr:`28701` by :user:`Stefanie Senger `. +Dropping official support for PyPy +---------------------------------- + +Due to limited maintainer resources and small number of users, official PyPy +support has been dropped. Some parts of scikit-learn may still work but PyPy is +not tested anymore in the scikit-learn Continuous Integration. +:pr:`29128` by :user:`Loïc Estève `. + Changelog --------- @@ -80,7 +95,29 @@ Changelog - |Enhancement| Added a function :func:`base.is_clusterer` which determines whether a given estimator is of category clusterer. :pr:`28936` by :user:`Christian Veenhuis `. - + +:mod:`sklearn.discriminant_analysis` +.................................... + +- |Fix| :class:`discriminant_analysis.QuadraticDiscriminantAnalysis` + will now cause `LinAlgWarning` in case of collinear variables. These errors + can be silenced using the `reg_param` attribute. + :pr:`19731` by :user:`Alihan Zihna `. + +:mod:`sklearn.impute` +..................... + +- |Fix| :class:`impute.KNNImputer` excludes samples with nan distances when + computing the mean value for uniform weights. + :pr:`29135` by :user:`Xuefeng Xu `. + +:mod:`sklearn.linear_model` +........................... + +- |API| Deprecates `copy_X` in :class:`linear_model.TheilSenRegressor` as the parameter + has no effect. `copy_X` will be removed in 1.8. + :pr:`29105` by :user:`Adam Li `. + :mod:`sklearn.metrics` ...................... @@ -88,14 +125,22 @@ Changelog whether to raise an exception if a subset of the scorers in multimetric scoring fails or to return an error code. :pr:`28992` by :user:`Stefanie Senger `. +:mod:`sklearn.model_selection` +.............................. + +- |Enhancement| Add the parameter `prefit` to + :class:`model_selection.FixedThresholdClassifier` allowing the use of a pre-fitted + estimator without re-fitting it. + :pr:`29067` by :user:`Guillaume Lemaitre `. + +:mod:`sklearn.neighbors` +........................ + +- |Fix| :class:`neighbors.LocalOutlierFactor` raises a warning in the `fit` method + when duplicate values in the training data lead to inaccurate outlier detection. + :pr:`28773` by :user:`Henrique Caroço `. + Thanks to everyone who has contributed to the maintenance and improvement of the project since version 1.5, including: TODO: update at the time of the release. - -:mod:`pipeline` ---------------- - -- |Feature| :class:`pipeline.Pipeline` can now transform metadata up to the step - requiring the metadata, which can be set using the `transform_input` parameter. - :pr:`28901` by `Adrin Jalali`_. diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 70c00a7b04428..4d8e268163940 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -32,7 +32,6 @@ MethodMapping, _raise_for_params, _routing_enabled, - get_routing_for_object, process_routing, ) from .utils.metaestimators import _BaseComposition, available_if @@ -94,17 +93,6 @@ class Pipeline(_BaseComposition): must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details. - transform_input : list of str, default=None - This enables transforming some input arguments to ``fit`` (other than ``X``) - to be transformed by the steps of the pipeline up to the step which requires - them. Requirement is defined via :ref:`metadata routing `. - This can be used to pass a validation set through the pipeline for instance. - - See the example TBD for more details. - - You can only set this if metadata routing is enabled, which you - can enable using ``sklearn.set_config(enable_metadata_routing=True)``. - memory : str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no @@ -172,14 +160,12 @@ class Pipeline(_BaseComposition): _parameter_constraints: dict = { "steps": [list, Hidden(tuple)], - "transform_input": [list, None], "memory": [None, str, HasMethods(["cache"])], "verbose": ["boolean"], } - def __init__(self, steps, *, transform_input=None, memory=None, verbose=False): + def __init__(self, steps, *, memory=None, verbose=False): self.steps = steps - self.transform_input = transform_input self.memory = memory self.verbose = verbose @@ -392,66 +378,9 @@ def _check_method_params(self, method, props, **kwargs): fit_params_steps[step]["fit_predict"][param] = pval return fit_params_steps - def _get_step_params(self, *, step_idx, step_params, all_params): - """Get params (metadata) for step `name`. - - This transforms the metadata up to this step if required, which is - indicated by the `transform_input` parameter. - - If a param in `step_params` is included in the `transform_input` list, it - will be transformed. - - `all_params` are the metadata passed by the user. Used to call `transform` - on the pipeline itself. - """ - if ( - self.transform_input is None - or not all_params - or not step_params - or step_idx == 0 - ): - # we only need to process step_params if transform_input is set - # and metadata is given by the user. - return step_params - - sub_pipeline = self[:step_idx] - sub_metadata_routing = get_routing_for_object(sub_pipeline) - # here we get the metadata required by sub_pipeline.transform - transform_params = { - key: value - for key, value in all_params.items() - if key - in sub_metadata_routing.consumes( - method="transform", params=all_params.keys() - ) - } - transformed_params = dict() - transformed_cache = dict() # used to transform each param once - for method, method_params in step_params.items(): - transformed_params[method] = Bunch() - for param_name, param_value in method_params.items(): - if param_name in self.transform_input: - # transform the parameter - if param_name not in transformed_cache: - transformed_cache[param_name] = sub_pipeline.transform( - param_value, **transform_params - ) - transformed_params[method][param_name] = transformed_cache[ - param_name - ] - else: - transformed_params[method][param_name] = param_value - return transformed_params - # Estimator interface - def _fit(self, X, y=None, routed_params=None, raw_params=None): - """Fit the pipeline except the last step. - - routed_params is the output of `process_routing` - raw_params is the parameters passed by the user, used when `transform_input` - is set by the user, to transform metadata using a sub-pipeline. - """ + def _fit(self, X, y=None, routed_params=None): # shallow copy of steps - this should really be steps_ self.steps = list(self.steps) self._validate_steps() @@ -474,20 +403,14 @@ def _fit(self, X, y=None, routed_params=None, raw_params=None): else: cloned_transformer = clone(transformer) # Fit or load from cache the current transformer - step_params = self._get_step_params( - step_idx=step_idx, - step_params=routed_params[name], - all_params=raw_params, - ) - X, fitted_transformer = fit_transform_one_cached( cloned_transformer, X, y, - weight=None, + None, message_clsname="Pipeline", message=self._log_message(step_idx), - params=step_params, + params=routed_params[name], ) # Replace the transformer of the step with the fitted # transformer. This is necessary when loading the transformer @@ -542,22 +465,11 @@ def fit(self, X, y=None, **params): self : object Pipeline with fitted steps. """ - if not _routing_enabled() and self.transform_input is not None: - raise ValueError( - "The `transform_input` parameter can only be set if metadata " - "routing is enabled. You can enable metadata routing using " - "`sklearn.set_config(enable_metadata_routing=True)`." - ) - routed_params = self._check_method_params(method="fit", props=params) - Xt = self._fit(X, y, routed_params, raw_params=params) + Xt = self._fit(X, y, routed_params) with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): if self._final_estimator != "passthrough": - last_step_params = self._get_step_params( - step_idx=len(self) - 1, - step_params=routed_params[self.steps[-1][0]], - all_params=params, - ) + last_step_params = routed_params[self.steps[-1][0]] self._final_estimator.fit(Xt, y, **last_step_params["fit"]) return self @@ -624,11 +536,7 @@ def fit_transform(self, X, y=None, **params): with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): if last_step == "passthrough": return Xt - last_step_params = self._get_step_params( - step_idx=len(self) - 1, - step_params=routed_params[self.steps[-1][0]], - all_params=params, - ) + last_step_params = routed_params[self.steps[-1][0]] if hasattr(last_step, "fit_transform"): return last_step.fit_transform( Xt, y, **last_step_params["fit_transform"] @@ -1309,7 +1217,7 @@ def _name_estimators(estimators): return list(zip(names, estimators)) -def make_pipeline(*steps, memory=None, transform_input=None, verbose=False): +def make_pipeline(*steps, memory=None, verbose=False): """Construct a :class:`Pipeline` from the given estimators. This is a shorthand for the :class:`Pipeline` constructor; it does not @@ -1331,17 +1239,6 @@ def make_pipeline(*steps, memory=None, transform_input=None, verbose=False): or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. - transform_input : list of str, default=None - This enables transforming some input arguments to ``fit`` (other than ``X``) - to be transformed by the steps of the pipeline up to the step which requires - them. Requirement is defined via :ref:`metadata routing `. - This can be used to pass a validation set through the pipeline for instance. - - See the example TBD for more details. - - You can only set this if metadata routing is enabled, which you - can enable using ``sklearn.set_config(enable_metadata_routing=True)``. - verbose : bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed. @@ -1365,12 +1262,7 @@ def make_pipeline(*steps, memory=None, transform_input=None, verbose=False): Pipeline(steps=[('standardscaler', StandardScaler()), ('gaussiannb', GaussianNB())]) """ - return Pipeline( - _name_estimators(steps), - transform_input=transform_input, - memory=memory, - verbose=verbose, - ) + return Pipeline(_name_estimators(steps), memory=memory, verbose=verbose) def _transform_one(transformer, X, y, weight, columns=None, params=None): diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 9614323f78bb0..c1ecf65cc65ed 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1815,102 +1815,6 @@ def test_pipeline_inverse_transform_Xt_deprecation(): pipe.inverse_transform(Xt=X) -# transform_input tests -# ===================== - - -@pytest.mark.usefixtures("enable_slep006") -@pytest.mark.parametrize("method", ["fit", "fit_transform"]) -def test_transform_input_pipeline(method): - """Test that with transform_input, data is correctly transformed for each step.""" - - def get_transformer(registry, sample_weight, metadata): - """Get a transformer with requests set.""" - return ( - ConsumingTransformer(registry=registry) - .set_fit_request(sample_weight=sample_weight, metadata=metadata) - .set_transform_request(sample_weight=sample_weight, metadata=metadata) - ) - - def get_pipeline(): - """Get a pipeline and corresponding registries. - - The pipeline has 4 steps, with different request values set to test different - cases. One is aliased. - """ - registry_1, registry_2, registry_3, registry_4 = ( - _Registry(), - _Registry(), - _Registry(), - _Registry(), - ) - pipe = make_pipeline( - get_transformer(registry_1, sample_weight=True, metadata=True), - get_transformer(registry_2, sample_weight=False, metadata=False), - get_transformer(registry_3, sample_weight=True, metadata=True), - get_transformer(registry_4, sample_weight="other_weights", metadata=True), - transform_input=["sample_weight"], - ) - return pipe, registry_1, registry_2, registry_3, registry_4 - - def check_metadata(registry, methods, **metadata): - """Check that the right metadata was recorded for the given methods.""" - assert registry - for estimator in registry: - for method in methods: - check_recorded_metadata( - estimator, - method=method, - parent=method, - **metadata, - ) - - X = np.array([[1, 2], [3, 4]]) - y = np.array([0, 1]) - sample_weight = np.array([[1, 2]]) - other_weights = np.array([[30, 40]]) - metadata = np.array([[100, 200]]) - - pipe, registry_1, registry_2, registry_3, registry_4 = get_pipeline() - pipe.fit( - X, - y, - sample_weight=sample_weight, - other_weights=other_weights, - metadata=metadata, - ) - - check_metadata( - registry_1, ["fit", "transform"], sample_weight=sample_weight, metadata=metadata - ) - check_metadata(registry_2, ["fit", "transform"]) - check_metadata( - registry_3, - ["fit", "transform"], - sample_weight=sample_weight + 2, - metadata=metadata, - ) - check_metadata( - registry_4, - method.split("_"), # ["fit", "transform"] if "fit_transform", ["fit"] otherwise - sample_weight=other_weights + 3, - metadata=metadata, - ) - - -def test_transform_input_no_slep6(): - """Make sure the right error is raised if slep6 is not enabled.""" - X = np.array([[1, 2], [3, 4]]) - y = np.array([0, 1]) - msg = "The `transform_input` parameter can only be set if metadata" - with pytest.raises(ValueError, match=msg): - make_pipeline(DummyTransf(), transform_input=["blah"]).fit(X, y) - - -# end of transform_input tests -# ============================= - - # Test that metadata is routed correctly for pipelines and FeatureUnion # ===================================================================== diff --git a/sklearn/utils/tests/test_pprint.py b/sklearn/utils/tests/test_pprint.py index 4192f388a9f63..ec48c4a012574 100644 --- a/sklearn/utils/tests/test_pprint.py +++ b/sklearn/utils/tests/test_pprint.py @@ -303,7 +303,7 @@ def test_pipeline(print_changed_only_false): penalty='l2', random_state=None, solver='warn', tol=0.0001, verbose=0, warm_start=False))], - transform_input=None, verbose=False)""" + verbose=False)""" expected = expected[1:] # remove first \n assert pipeline.__repr__() == expected From d0724e1c05386419e214adbb60db39b6983ad82a Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 10 Jun 2024 14:45:37 +0200 Subject: [PATCH 11/18] TST remove print --- sklearn/tests/test_pipeline.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index c1ecf65cc65ed..372e0a8e7524c 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1912,8 +1912,6 @@ def set_request(est, method, **kwarg): if "fit" not in method: pipeline = pipeline.fit(X, y, sample_weight=sample_weight, prop=prop) - if method == "inverse_transform": - print("ha") try: getattr(pipeline, method)( X, y, sample_weight=sample_weight, prop=prop, metadata=metadata From b563cbbff14697ad0f6d670add71d93d6b3f7c48 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 10 Jun 2024 15:05:21 +0200 Subject: [PATCH 12/18] TST fix tests --- sklearn/tests/test_pipeline.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 372e0a8e7524c..273aa4e9d36e4 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -336,7 +336,7 @@ def test_pipeline_raise_set_params_error(): error_msg = re.escape( "Invalid parameter 'fake' for estimator Pipeline(steps=[('cls'," " LinearRegression())]). Valid parameters are: ['memory', 'steps'," - " 'transform_input', 'verbose']." + " 'verbose']." ) with pytest.raises(ValueError, match=error_msg): pipe.set_params(fake="nope") @@ -761,7 +761,6 @@ def make(): "memory": None, "m2__mult": 2, "last__mult": 5, - "transform_input": None, "verbose": False, } From b926dbc126fe6fc9b46bfad3a1fe5a7f68374103 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Mon, 10 Jun 2024 16:29:26 +0200 Subject: [PATCH 13/18] Update sklearn/tests/metadata_routing_common.py Co-authored-by: Adam Li --- sklearn/tests/metadata_routing_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/metadata_routing_common.py b/sklearn/tests/metadata_routing_common.py index 64747510f6b6a..e3ada9aaebf3a 100644 --- a/sklearn/tests/metadata_routing_common.py +++ b/sklearn/tests/metadata_routing_common.py @@ -62,7 +62,7 @@ def check_recorded_metadata(obj, method, parent, split_params=tuple(), **kwargs) method : str sub-estimator's method where metadata is routed to parent : str - the parent method which should have called `method` or caller + the parent method which should have called `method`, or the 'caller' split_params : tuple, default=empty specifies any parameters which are to be checked as being a subset of the original values From 9d44784c025539bc82c0117af9601fd812628a38 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Mon, 10 Jun 2024 16:29:32 +0200 Subject: [PATCH 14/18] Update sklearn/tests/metadata_routing_common.py Co-authored-by: Adam Li --- sklearn/tests/metadata_routing_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/metadata_routing_common.py b/sklearn/tests/metadata_routing_common.py index e3ada9aaebf3a..5e2c646b7d9ec 100644 --- a/sklearn/tests/metadata_routing_common.py +++ b/sklearn/tests/metadata_routing_common.py @@ -27,7 +27,7 @@ def record_metadata(obj, record_default=True, **kwargs): - """Utility function to store passed metadata to a method. + """Utility function to store passed metadata to a method of obj. If record_default is False, kwargs whose values are "default" are skipped. This is so that checks on keyword arguments whose default was not changed From ded64ce1c94fff5e8e118d7a46ece05b277c146e Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Mon, 10 Jun 2024 16:29:37 +0200 Subject: [PATCH 15/18] Update sklearn/tests/metadata_routing_common.py Co-authored-by: Adam Li --- sklearn/tests/metadata_routing_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/metadata_routing_common.py b/sklearn/tests/metadata_routing_common.py index 5e2c646b7d9ec..28b110f052376 100644 --- a/sklearn/tests/metadata_routing_common.py +++ b/sklearn/tests/metadata_routing_common.py @@ -60,7 +60,7 @@ def check_recorded_metadata(obj, method, parent, split_params=tuple(), **kwargs) obj : estimator object sub-estimator to check routed params for method : str - sub-estimator's method where metadata is routed to + sub-estimator's method where metadata is routed to, or the 'callee' parent : str the parent method which should have called `method`, or the 'caller' split_params : tuple, default=empty From 4482191d6c21baa8c89e181c83b2b8df2b43c778 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Tue, 11 Jun 2024 14:22:50 +0200 Subject: [PATCH 16/18] Update sklearn/tests/metadata_routing_common.py Co-authored-by: Guillaume Lemaitre --- sklearn/tests/metadata_routing_common.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sklearn/tests/metadata_routing_common.py b/sklearn/tests/metadata_routing_common.py index 28b110f052376..01b41e897011f 100644 --- a/sklearn/tests/metadata_routing_common.py +++ b/sklearn/tests/metadata_routing_common.py @@ -38,11 +38,7 @@ def record_metadata(obj, record_default=True, **kwargs): method = stack[1].function parent = stack[2].function if not hasattr(obj, "_records"): - obj._records = {} - if method not in obj._records: - obj._records[method] = {} - if parent not in obj._records[method]: - obj._records[method][parent] = [] + obj._records = defaultdict(lambda: defaultdict(list)) if not record_default: kwargs = { key: val From ee64d03a0c8b1aecd59122fb42dcb05f71645c06 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 11 Jun 2024 14:31:00 +0200 Subject: [PATCH 17/18] Guillaume's comments --- sklearn/tests/metadata_routing_common.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/sklearn/tests/metadata_routing_common.py b/sklearn/tests/metadata_routing_common.py index 01b41e897011f..0af522f9f9342 100644 --- a/sklearn/tests/metadata_routing_common.py +++ b/sklearn/tests/metadata_routing_common.py @@ -1,4 +1,5 @@ import inspect +from collections import defaultdict from functools import partial import numpy as np @@ -35,8 +36,8 @@ def record_metadata(obj, record_default=True, **kwargs): """ stack = inspect.stack() - method = stack[1].function - parent = stack[2].function + callee = stack[1].function + caller = stack[2].function if not hasattr(obj, "_records"): obj._records = defaultdict(lambda: defaultdict(list)) if not record_default: @@ -45,7 +46,7 @@ def record_metadata(obj, record_default=True, **kwargs): for key, val in kwargs.items() if not isinstance(val, str) or (val != "default") } - obj._records[method][parent].append(kwargs) + obj._records[callee][caller].append(kwargs) def check_recorded_metadata(obj, method, parent, split_params=tuple(), **kwargs): @@ -56,9 +57,11 @@ def check_recorded_metadata(obj, method, parent, split_params=tuple(), **kwargs) obj : estimator object sub-estimator to check routed params for method : str - sub-estimator's method where metadata is routed to, or the 'callee' + sub-estimator's method where metadata is routed to, or otherwise in + the context of metadata routing referred to as 'callee' parent : str - the parent method which should have called `method`, or the 'caller' + the parent method which should have called `method`, or otherwise in + the context of metadata routing referred to as 'caller' split_params : tuple, default=empty specifies any parameters which are to be checked as being a subset of the original values @@ -69,6 +72,8 @@ def check_recorded_metadata(obj, method, parent, split_params=tuple(), **kwargs) getattr(obj, "_records", dict()).get(method, dict()).get(parent, list()) ) for record in all_records: + # first check that the names of the metadata passed are the same as + # expected. The names are stored as keys in `record`. assert set(kwargs.keys()) == set( record.keys() ), f"Expected {kwargs.keys()} vs {record.keys()}" From a1023b32a540cb0a9cde7a57a24ce71babfd1b4b Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 14 Jun 2024 11:22:18 +0200 Subject: [PATCH 18/18] Address Omar's comment --- .../tests/test_metaestimators_metadata_routing.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index 22ef4b9319d64..cf2bb130267a3 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -764,21 +764,27 @@ def test_metadata_is_routed_correctly_to_scorer(metaestimator): cls = metaestimator["metaestimator"] routing_methods = metaestimator["scorer_routing_methods"] + method_mapping = metaestimator.get("method_mapping", {}) for method_name in routing_methods: kwargs, (estimator, _), (scorer, registry), (cv, _) = get_init_args( metaestimator, sub_estimator_consumes=True ) - if estimator: - estimator.set_fit_request(sample_weight=True, metadata=True) scorer.set_score_request(sample_weight=True) if cv: cv.set_split_request(groups=True, metadata=True) + if estimator is not None: + set_requests( + estimator, + method_mapping=method_mapping, + methods=[method_name], + metadata_name="sample_weight", + ) instance = cls(**kwargs) method = getattr(instance, method_name) method_kwargs = {"sample_weight": sample_weight} if "fit" not in method_name: - instance.fit(X, y, **method_kwargs) + instance.fit(X, y) method(X, y, **method_kwargs) assert registry