From 5286d4741b6b8cfb64eb6bfd3ebdcba7a3fb9362 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Thu, 25 Jan 2024 15:46:19 +0100 Subject: [PATCH 01/34] unfinished metadata implementation for ransacregressor --- doc/metadata_routing.rst | 2 +- sklearn/linear_model/_ransac.py | 165 +++++++++++++++--- sklearn/tests/metadata_routing_common.py | 18 +- .../test_metaestimators_metadata_routing.py | 58 ++++-- 4 files changed, 195 insertions(+), 48 deletions(-) diff --git a/doc/metadata_routing.rst b/doc/metadata_routing.rst index 96dba6ae1467b..3ab7efe5d99b9 100644 --- a/doc/metadata_routing.rst +++ b/doc/metadata_routing.rst @@ -260,6 +260,7 @@ Meta-estimators and functions supporting metadata routing: - :class:`sklearn.linear_model.LogisticRegressionCV` - :class:`sklearn.linear_model.MultiTaskElasticNetCV` - :class:`sklearn.linear_model.MultiTaskLassoCV` +- :class:`sklearn.linear_model.RANSACRegressor` - :class:`sklearn.model_selection.GridSearchCV` - :class:`sklearn.model_selection.HalvingGridSearchCV` - :class:`sklearn.model_selection.HalvingRandomSearchCV` @@ -293,7 +294,6 @@ Meta-estimators and tools not supporting metadata routing yet: - :class:`sklearn.feature_selection.RFECV` - :class:`sklearn.feature_selection.SequentialFeatureSelector` - :class:`sklearn.impute.IterativeImputer` -- :class:`sklearn.linear_model.RANSACRegressor` - :class:`sklearn.linear_model.RidgeClassifierCV` - :class:`sklearn.linear_model.RidgeCV` - :class:`sklearn.model_selection.learning_curve` diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index b2c25607f91c0..0c516728109ea 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -17,6 +17,7 @@ ) from ..exceptions import ConvergenceWarning from ..utils import check_consistent_length, check_random_state +from ..utils._bunch import Bunch from ..utils._param_validation import ( HasMethods, Interval, @@ -25,11 +26,18 @@ StrOptions, ) from ..utils.metadata_routing import ( - _raise_for_unsupported_routing, - _RoutingNotSupportedMixin, + MetadataRouter, + MethodMapping, + _routing_enabled, + process_routing, ) from ..utils.random import sample_without_replacement -from ..utils.validation import _check_sample_weight, check_is_fitted, has_fit_parameter +from ..utils.validation import ( + _check_sample_weight, + _deprecate_positional_args, + check_is_fitted, + has_fit_parameter, +) from ._base import LinearRegression _EPSILON = np.spacing(1) @@ -70,7 +78,6 @@ def _dynamic_max_trials(n_inliers, n_samples, min_samples, probability): class RANSACRegressor( - _RoutingNotSupportedMixin, MetaEstimatorMixin, RegressorMixin, MultiOutputMixin, @@ -306,7 +313,8 @@ def __init__( # RansacRegressor.estimator is not validated yet prefer_skip_nested_validation=False ) - def fit(self, X, y, sample_weight=None): + @_deprecate_positional_args(version="1.7") + def fit(self, X, y, sample_weight=None, **fit_params): """Fit estimator using RANSAC algorithm. Parameters @@ -324,6 +332,17 @@ def fit(self, X, y, sample_weight=None): .. versionadded:: 0.18 + **fit_params : dict + Parameters routed to the `fit` method of the sub-estimator via the + metadata routing API. + + .. versionadded:: 1.5 + + Only available if + `sklearn.set_config(enable_metadata_routing=True)` is set. See + :ref:`Metadata Routing User Guide ` for more + details. + Returns ------- self : object @@ -336,7 +355,6 @@ def fit(self, X, y, sample_weight=None): `is_data_valid` and `is_model_valid` return False for all `max_trials` randomly chosen sub-samples. 
""" - _raise_for_unsupported_routing(self, "fit", sample_weight=sample_weight) # Need to validate separately here. We can't pass multi_output=True # because that would allow y to be csr. Delay expensive finiteness # check to the estimator's own input validation. @@ -404,12 +422,21 @@ def fit(self, X, y, sample_weight=None): estimator_name = type(estimator).__name__ if sample_weight is not None and not estimator_fit_has_sample_weight: raise ValueError( - "%s does not support sample_weight. Samples" + "%s does not support sample_weight. Sample" " weights are only used for the calibration" " itself." % estimator_name ) - if sample_weight is not None: - sample_weight = _check_sample_weight(sample_weight, X) + + if _routing_enabled(): + routed_params = process_routing(self, "fit", **fit_params) + else: + routed_params = Bunch() + # following line makes pytest sklearn/linear_model/tests/test_ransac.py fail + # on collection + routed_params[estimator] = Bunch(fit={}) + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + routed_params.estimator.fit = sample_weight n_inliers_best = 1 score_best = -np.inf @@ -451,13 +478,16 @@ def fit(self, X, y, sample_weight=None): self.n_skips_invalid_data_ += 1 continue + # cut `fit_params` down to their temporary lengthes according to + # `min_samples` param + fit_params_cut_to_min_samples = {} + for key in routed_params.estimator.fit: + # only apply on sample_wise metadata + if len(fit_params[key]) == len(X): + fit_params_cut_to_min_samples[key] = fit_params[key][subset_idxs] + # fit model for current random sample set - if sample_weight is None: - estimator.fit(X_subset, y_subset) - else: - estimator.fit( - X_subset, y_subset, sample_weight=sample_weight[subset_idxs] - ) + estimator.fit(X_subset, y_subset, **fit_params_cut_to_min_samples) # check if estimated model is valid if self.is_model_valid is not None and not self.is_model_valid( @@ -467,7 +497,7 @@ def fit(self, X, y, sample_weight=None): continue # residuals of all data for current random sample model - y_pred = estimator.predict(X) + y_pred = estimator.predict(X, **routed_params.estimator.predict) residuals_subset = loss_function(y, y_pred) # classify data into inliers and outliers @@ -484,8 +514,21 @@ def fit(self, X, y, sample_weight=None): X_inlier_subset = X[inlier_idxs_subset] y_inlier_subset = y[inlier_idxs_subset] + # cut `fit_params` down to `inlier_idxs_subset` + score_params_cut_to_inlier_idxs_subset = {} + for key in routed_params.estimator.score: + # only apply on sample_wise metadata + if len(fit_params[key]) == len(X): + score_params_cut_to_inlier_idxs_subset[key] = fit_params[key][ + inlier_idxs_subset + ] + # score of inlier data set - score_subset = estimator.score(X_inlier_subset, y_inlier_subset) + score_subset = estimator.score( + X_inlier_subset, + y_inlier_subset, + **score_params_cut_to_inlier_idxs_subset, + ) # same number of inliers but worse score -> skip current random # sample @@ -549,20 +592,23 @@ def fit(self, X, y, sample_weight=None): ) # estimate final model using all inliers - if sample_weight is None: - estimator.fit(X_inlier_best, y_inlier_best) - else: - estimator.fit( - X_inlier_best, - y_inlier_best, - sample_weight=sample_weight[inlier_best_idxs_subset], - ) + fit_params_cut_to_best_idxs_subset = {} + for key in routed_params.estimator.fit: + # only apply on sample_wise metadata + if len(fit_params[key]) == len(X): + fit_params_cut_to_best_idxs_subset[key] = fit_params[key][ + inlier_best_idxs_subset + ] + + estimator.fit( 
+ X_inlier_best, y_inlier_best, **fit_params_cut_to_best_idxs_subset + ) self.estimator_ = estimator self.inlier_mask_ = inlier_mask_best return self - def predict(self, X): + def predict(self, X, **params): """Predict using the estimated model. This is a wrapper for `estimator_.predict(X)`. @@ -572,6 +618,17 @@ def predict(self, X): X : {array-like or sparse matrix} of shape (n_samples, n_features) Input data. + **params : dict + Parameters routed to the `predict` method of the sub-estimator via + the metadata routing API. + + .. versionadded:: 1.5 + + Only available if + `sklearn.set_config(enable_metadata_routing=True)` is set. See + :ref:`Metadata Routing User Guide ` for more + details. + Returns ------- y : array, shape = [n_samples] or [n_samples, n_targets] @@ -584,9 +641,17 @@ def predict(self, X): accept_sparse=True, reset=False, ) - return self.estimator_.predict(X) - def score(self, X, y): + if _routing_enabled(): + predict_params = process_routing(self, "predict", **params).estimator[ + "predict" + ] + else: + predict_params = dict() + + return self.estimator_.predict(X, **predict_params) + + def score(self, X, y, **params): """Return the score of the prediction. This is a wrapper for `estimator_.score(X, y)`. @@ -599,6 +664,17 @@ def score(self, X, y): y : array-like of shape (n_samples,) or (n_samples, n_targets) Target values. + **params : dict + Parameters routed to the `score` method of the sub-estimator via + the metadata routing API. + + .. versionadded:: 1.5 + + Only available if + `sklearn.set_config(enable_metadata_routing=True)` is set. See + :ref:`Metadata Routing User Guide ` for more + details. + Returns ------- z : float @@ -611,7 +687,38 @@ def score(self, X, y): accept_sparse=True, reset=False, ) - return self.estimator_.score(X, y) + + if _routing_enabled(): + score_params = process_routing(self, "score", **params).estimator["score"] + else: + score_params = dict() + + return self.estimator_.score(X, y, **score_params) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.5 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. 
+ """ + router = MetadataRouter(owner=self.__class__.__name__).add( + estimator=self.estimator, + method_mapping=MethodMapping() + .add(caller="fit", callee="fit") + .add(caller="fit", callee="score") + .add(caller="fit", callee="predict") + .add(caller="score", callee="score") + .add(caller="predict", callee="predict"), + ) + return router def _more_tags(self): return { diff --git a/sklearn/tests/metadata_routing_common.py b/sklearn/tests/metadata_routing_common.py index 3d7d0ab24f1cc..3e945cf4a3f5e 100644 --- a/sklearn/tests/metadata_routing_common.py +++ b/sklearn/tests/metadata_routing_common.py @@ -155,14 +155,18 @@ def fit(self, X, y, sample_weight="default", metadata="default"): ) return self - def predict(self, X, sample_weight="default", metadata="default"): - pass # pragma: no cover + def predict(self, X, y=None, sample_weight="default", metadata="default"): + record_metadata_not_default( + self, "predict", sample_weight=sample_weight, metadata=metadata + ) + return np.zeros(shape=(len(X),)) - # when needed, uncomment the implementation - # record_metadata_not_default( - # self, "predict", sample_weight=sample_weight, metadata=metadata - # ) - # return np.zeros(shape=(len(X),)) + def score(self, X, y, sample_weight="default", metadata="default"): + self.predict(X) + record_metadata_not_default( + self, "score", sample_weight=sample_weight, metadata=metadata + ) + return 1 class NonConsumingClassifier(ClassifierMixin, BaseEstimator): diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index be8106f9b1dd0..428812a6afba0 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -118,7 +118,7 @@ def enable_slep006(): "X": X, "y": y, "estimator_routing_methods": ["fit"], - "preserves_metadata": False, + "preserves_metadata": "subset", }, { "metaestimator": ClassifierChain, @@ -287,6 +287,16 @@ def enable_slep006(): "cv_name": "cv", "cv_routing_methods": ["fit"], }, + { + "metaestimator": RANSACRegressor, + "estimator_name": "estimator", + "estimator": ConsumingRegressor, + "init_args": {"min_samples": 0.5}, + "X": X, + "y": y, + "preserves_metadata": False, + "estimator_routing_methods": ["fit", "predict", "score"], + }, ] """List containing all metaestimators to be tested and their settings @@ -330,7 +340,6 @@ def enable_slep006(): FeatureUnion([]), GraphicalLassoCV(), IterativeImputer(), - RANSACRegressor(), RFE(ConsumingClassifier()), RFECV(ConsumingClassifier()), RidgeCV(), @@ -394,6 +403,14 @@ def get_init_args(metaestimator_info): ) +def set_request(estimator, method_name): + # e.g. 
call set_fit_request on estimator + set_request_for_method = getattr(estimator, f"set_{method_name}_request") + set_request_for_method(sample_weight=True, metadata=True) + if is_classifier(estimator) and method_name == "partial_fit": + set_request_for_method(classes=True) + + @pytest.mark.parametrize("estimator", UNSUPPORTED_ESTIMATORS) def test_unsupported_estimators_get_metadata_routing(estimator): """Test that get_metadata_routing is not implemented on meta-estimators for @@ -470,7 +487,18 @@ def test_error_on_missing_requests_for_sub_estimator(metaestimator): instance = cls(**kwargs) with pytest.raises(UnsetMetadataPassedError, match=re.escape(msg)): method = getattr(instance, method_name) - method(X, y, **method_kwargs) + if method_name in ["predict", "score"]: + # fit before calling method + set_request(estimator, "fit") + fit_method = getattr(instance, "fit") + fit_method(X, y, **method_kwargs) + # then call method + if method_name == "predict": + method(X, **method_kwargs) + else: # method_name == "score" + method(X, y, **method_kwargs) + else: + method(X, y, **method_kwargs) @pytest.mark.parametrize("metaestimator", METAESTIMATORS, ids=METAESTIMATOR_IDS) @@ -482,13 +510,6 @@ def test_setting_request_on_sub_estimator_removes_error(metaestimator): # sub-estimator, e.g. MyMetaEstimator(estimator=MySubEstimator()) return - def set_request(estimator, method_name): - # e.g. call set_fit_request on estimator - set_request_for_method = getattr(estimator, f"set_{method_name}_request") - set_request_for_method(sample_weight=True, metadata=True) - if is_classifier(estimator) and method_name == "partial_fit": - set_request_for_method(classes=True) - cls = metaestimator["metaestimator"] X = metaestimator["X"] y = metaestimator["y"] @@ -507,13 +528,28 @@ def set_request(estimator, method_name): set_request(scorer, "score") if cv: cv.set_split_request(groups=True, metadata=True) + set_request(estimator, method_name) + instance = cls(**kwargs) method = getattr(instance, method_name) extra_method_args = metaestimator.get("method_args", {}).get( method_name, {} ) - method(X, y, **method_kwargs, **extra_method_args) + + if method_name in ["predict", "score"]: + # fit before calling method + set_request(estimator, "fit") + fit_method = getattr(instance, "fit") + fit_method(X, y, **method_kwargs, **extra_method_args) + # then call method + if method_name == "predict": + method(X, **method_kwargs, **extra_method_args) + else: # method_name == "score" + method(X, y, **method_kwargs, **extra_method_args) + else: + method(X, y, **method_kwargs, **extra_method_args) + # sanity check that registry is not empty, or else the test passes # trivially assert registry From 35ec69b3fd516e9f71ffab3a00c1ef522644f75f Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Thu, 25 Jan 2024 16:33:37 +0100 Subject: [PATCH 02/34] fixes --- doc/whats_new/v1.5.rst | 13 +++++++++++++ sklearn/linear_model/_ransac.py | 16 +++++++++------- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index 851e86668b0d4..70d6df371d9e7 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -11,6 +11,19 @@ Version 1.5.0 .. include:: changelog_legend.inc +Metadata Routing +---------------- + +The following models now support metadata routing in one or more or their +methods. Refer to the :ref:`Metadata Routing User Guide ` for +more details. 
+ +- |Feature| :class:`linear_model.RANSACRegressor` now supports metadata routing + in its ``fit``, ``score`` and ``predict`` methods and route metadata to its + underlying estimator's' ``fit``, ``score`` and ``predict`` methods. + :pr:`28261` by :user:`Stefanie Senger `. + + Changelog --------- diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 0c516728109ea..ff58f5e72bd9c 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -433,7 +433,7 @@ def fit(self, X, y, sample_weight=None, **fit_params): routed_params = Bunch() # following line makes pytest sklearn/linear_model/tests/test_ransac.py fail # on collection - routed_params[estimator] = Bunch(fit={}) + routed_params.estimator = Bunch(fit={}) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X) routed_params.estimator.fit = sample_weight @@ -483,8 +483,10 @@ def fit(self, X, y, sample_weight=None, **fit_params): fit_params_cut_to_min_samples = {} for key in routed_params.estimator.fit: # only apply on sample_wise metadata - if len(fit_params[key]) == len(X): - fit_params_cut_to_min_samples[key] = fit_params[key][subset_idxs] + if len(routed_params.estimator.fit[key]) == len(X): + fit_params_cut_to_min_samples[key] = routed_params.estimator.fit[ + key + ][subset_idxs] # fit model for current random sample set estimator.fit(X_subset, y_subset, **fit_params_cut_to_min_samples) @@ -518,10 +520,10 @@ def fit(self, X, y, sample_weight=None, **fit_params): score_params_cut_to_inlier_idxs_subset = {} for key in routed_params.estimator.score: # only apply on sample_wise metadata - if len(fit_params[key]) == len(X): - score_params_cut_to_inlier_idxs_subset[key] = fit_params[key][ - inlier_idxs_subset - ] + if len(routed_params.estimator.score[key]) == len(X): + score_params_cut_to_inlier_idxs_subset[key] = ( + routed_params.estimator.score[key][inlier_idxs_subset] + ) # score of inlier data set score_subset = estimator.score( From 23fd9bf9f8c821fec0705075298830e365bf1155 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Thu, 25 Jan 2024 17:00:23 +0100 Subject: [PATCH 03/34] not quite but almost working fix --- sklearn/linear_model/_ransac.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index ff58f5e72bd9c..7d5d8d582e552 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -431,12 +431,15 @@ def fit(self, X, y, sample_weight=None, **fit_params): routed_params = process_routing(self, "fit", **fit_params) else: routed_params = Bunch() - # following line makes pytest sklearn/linear_model/tests/test_ransac.py fail - # on collection - routed_params.estimator = Bunch(fit={}) + # routed_params.estimator = {Bunch(fit={}), Bunch(predict={}), + # Bunch(score={})} + routed_params.estimator = {"fit": {}, "score": {}, "predict": {}} + # results in {'estimator': {'fit': {}, 'score': {}, 'predict': {}}} if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X) routed_params.estimator.fit = sample_weight + routed_params.estimator.predict = sample_weight + routed_params.estimator.score = sample_weight n_inliers_best = 1 score_best = -np.inf From eaaab3989ba50d0610b6bf1c8dd3bb1de3f71256 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Thu, 25 Jan 2024 17:26:36 +0100 Subject: [PATCH 04/34] fix for legacy build of routed_params --- sklearn/linear_model/_ransac.py | 11 ++++------- 1 file changed, 4 
insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 7d5d8d582e552..a0a1e21af31aa 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -431,15 +431,12 @@ def fit(self, X, y, sample_weight=None, **fit_params): routed_params = process_routing(self, "fit", **fit_params) else: routed_params = Bunch() - # routed_params.estimator = {Bunch(fit={}), Bunch(predict={}), - # Bunch(score={})} - routed_params.estimator = {"fit": {}, "score": {}, "predict": {}} - # results in {'estimator': {'fit': {}, 'score': {}, 'predict': {}}} + routed_params.estimator = Bunch(fit={}, predict={}, score={}) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X) - routed_params.estimator.fit = sample_weight - routed_params.estimator.predict = sample_weight - routed_params.estimator.score = sample_weight + routed_params.estimator.fit = {"sample_weight": sample_weight} + routed_params.estimator.predict = {"sample_weight": sample_weight} + routed_params.estimator.score = {"sample_weight": sample_weight} n_inliers_best = 1 score_best = -np.inf From 807fcc545f1ed975c87897679559120639736d01 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Thu, 25 Jan 2024 19:19:16 +0100 Subject: [PATCH 05/34] fix --- sklearn/linear_model/_ransac.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index a0a1e21af31aa..d4875a68dedad 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -478,8 +478,7 @@ def fit(self, X, y, sample_weight=None, **fit_params): self.n_skips_invalid_data_ += 1 continue - # cut `fit_params` down to their temporary lengthes according to - # `min_samples` param + # cut `fit_params` down to `subset_idxs` fit_params_cut_to_min_samples = {} for key in routed_params.estimator.fit: # only apply on sample_wise metadata @@ -597,10 +596,10 @@ def fit(self, X, y, sample_weight=None, **fit_params): fit_params_cut_to_best_idxs_subset = {} for key in routed_params.estimator.fit: # only apply on sample_wise metadata - if len(fit_params[key]) == len(X): - fit_params_cut_to_best_idxs_subset[key] = fit_params[key][ - inlier_best_idxs_subset - ] + if len(routed_params.estimator.fit[key]) == len(X): + fit_params_cut_to_best_idxs_subset[key] = routed_params.estimator.fit[ + key + ][inlier_best_idxs_subset] estimator.fit( X_inlier_best, y_inlier_best, **fit_params_cut_to_best_idxs_subset From a9116cc0d040942c1bd9c6188e43a01c8857a1fe Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Thu, 25 Jan 2024 20:43:29 +0100 Subject: [PATCH 06/34] ConsumingRegressor for LinearRegression because the latter doesnt consume sample_weight --- sklearn/linear_model/tests/test_ransac.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py index b442f6b207e70..f8d3ef051fbb5 100644 --- a/sklearn/linear_model/tests/test_ransac.py +++ b/sklearn/linear_model/tests/test_ransac.py @@ -11,6 +11,7 @@ Ridge, ) from sklearn.linear_model._ransac import _dynamic_max_trials +from sklearn.tests.metadata_routing_common import ConsumingRegressor from sklearn.utils import check_random_state from sklearn.utils._testing import assert_allclose from sklearn.utils.fixes import COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS @@ -458,7 +459,9 @@ def test_ransac_dynamic_max_trials(): def test_ransac_fit_sample_weight(): - 
ransac_estimator = RANSACRegressor(random_state=0) + ransac_estimator = RANSACRegressor( + random_state=0, estimator=ConsumingRegressor(), min_samples=0.5 + ) n_samples = y.shape[0] weights = np.ones(n_samples) ransac_estimator.fit(X, y, weights) @@ -517,10 +520,12 @@ def test_ransac_final_model_fit_sample_weight(): rng = check_random_state(42) sample_weight = rng.randint(1, 4, size=y.shape[0]) sample_weight = sample_weight / sample_weight.sum() - ransac = RANSACRegressor(estimator=LinearRegression(), random_state=0) + ransac = RANSACRegressor( + estimator=ConsumingRegressor(), min_samples=0.5, random_state=0 + ) ransac.fit(X, y, sample_weight=sample_weight) - final_model = LinearRegression() + final_model = ConsumingRegressor() mask_samples = ransac.inlier_mask_ final_model.fit( X[mask_samples], y[mask_samples], sample_weight=sample_weight[mask_samples] From 6be1762aacd1186fadce9268428ee665f39ab33f Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Fri, 26 Jan 2024 14:17:20 +0100 Subject: [PATCH 07/34] adjust test to set multiple metadata requests and meaningful errormessage for users --- sklearn/linear_model/_ransac.py | 21 ++++++++++++++----- .../test_metaestimators_metadata_routing.py | 17 ++++++++++++++- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index d4875a68dedad..599b0d5b74964 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -15,7 +15,7 @@ _fit_context, clone, ) -from ..exceptions import ConvergenceWarning +from ..exceptions import ConvergenceWarning, UnsetMetadataPassedError from ..utils import check_consistent_length, check_random_state from ..utils._bunch import Bunch from ..utils._param_validation import ( @@ -428,7 +428,18 @@ def fit(self, X, y, sample_weight=None, **fit_params): ) if _routing_enabled(): - routed_params = process_routing(self, "fit", **fit_params) + try: + routed_params = process_routing(self, "fit", **fit_params) + except UnsetMetadataPassedError as e: + raise UnsetMetadataPassedError( + message=( + f"{e}, which is used internally by `RANSACRegressor.fit()`." + f"Call `{estimator.__class__.__name__}.set_{{method}}_request(" + "{metadata}=True)` for each metadata." 
+ ), + unrequested_params=e.unrequested_params, + routed_params=e.routed_params, + ) else: routed_params = Bunch() routed_params.estimator = Bunch(fit={}, predict={}, score={}) @@ -479,16 +490,16 @@ def fit(self, X, y, sample_weight=None, **fit_params): continue # cut `fit_params` down to `subset_idxs` - fit_params_cut_to_min_samples = {} + fit_params_cut_to_subset_idxs = {} for key in routed_params.estimator.fit: # only apply on sample_wise metadata if len(routed_params.estimator.fit[key]) == len(X): - fit_params_cut_to_min_samples[key] = routed_params.estimator.fit[ + fit_params_cut_to_subset_idxs[key] = routed_params.estimator.fit[ key ][subset_idxs] # fit model for current random sample set - estimator.fit(X_subset, y_subset, **fit_params_cut_to_min_samples) + estimator.fit(X_subset, y_subset, **fit_params_cut_to_subset_idxs) # check if estimated model is valid if self.is_model_valid is not None and not self.is_model_valid( diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index 428812a6afba0..c7a31bd1144d2 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -296,6 +296,7 @@ def enable_slep006(): "y": y, "preserves_metadata": False, "estimator_routing_methods": ["fit", "predict", "score"], + "requests_set_together": {"fit": ["predict", "score"]}, }, ] """List containing all metaestimators to be tested and their settings @@ -327,6 +328,9 @@ def enable_slep006(): to the splitter - method_args: a dict of dicts, defining extra arguments needed to be passed to methods, such as passing `classes` to `partial_fit`. +- requests_set_together: a dict that defines which set_{method}_requests need + to be set together with the key; used in case a router routes to different + methods from the sub-estimator. 
""" # IDs used by pytest to get meaningful verbose messages when running the tests @@ -411,6 +415,12 @@ def set_request(estimator, method_name): set_request_for_method(classes=True) +def set_multiple_requests(estimator, requests_set_together, method_name): + if method_name in requests_set_together: + for additional_method in requests_set_together[method_name]: + set_request(estimator, additional_method) + + @pytest.mark.parametrize("estimator", UNSUPPORTED_ESTIMATORS) def test_unsupported_estimators_get_metadata_routing(estimator): """Test that get_metadata_routing is not implemented on meta-estimators for @@ -514,6 +524,7 @@ def test_setting_request_on_sub_estimator_removes_error(metaestimator): X = metaestimator["X"] y = metaestimator["y"] routing_methods = metaestimator["estimator_routing_methods"] + requests_set_together = metaestimator.get("requests_set_together", {}) preserves_metadata = metaestimator.get("preserves_metadata", True) for method_name in routing_methods: @@ -529,17 +540,21 @@ def test_setting_request_on_sub_estimator_removes_error(metaestimator): if cv: cv.set_split_request(groups=True, metadata=True) + # `set_{method}_request({metadata}==True)` on the underlying objects set_request(estimator, method_name) + if requests_set_together: + set_multiple_requests(estimator, requests_set_together, method_name) instance = cls(**kwargs) method = getattr(instance, method_name) extra_method_args = metaestimator.get("method_args", {}).get( method_name, {} ) - if method_name in ["predict", "score"]: # fit before calling method set_request(estimator, "fit") + if requests_set_together: + set_multiple_requests(estimator, requests_set_together, "fit") fit_method = getattr(instance, "fit") fit_method(X, y, **method_kwargs, **extra_method_args) # then call method From 8e492a4c07c06df3c33203c69f51c4bd6e0849af Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Sun, 28 Jan 2024 21:34:07 +0100 Subject: [PATCH 08/34] fix missing requests test for multiple routings --- sklearn/linear_model/_ransac.py | 11 +++++++++- .../test_metaestimators_metadata_routing.py | 21 +++++++++---------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 599b0d5b74964..6bf2744e9e5de 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -28,6 +28,7 @@ from ..utils.metadata_routing import ( MetadataRouter, MethodMapping, + _raise_for_params, _routing_enabled, process_routing, ) @@ -427,6 +428,11 @@ def fit(self, X, y, sample_weight=None, **fit_params): " itself." 
% estimator_name ) + _raise_for_params(fit_params, self, "fit") + + if sample_weight is not None: + fit_params["sample_weight"] = sample_weight + if _routing_enabled(): try: routed_params = process_routing(self, "fit", **fit_params) @@ -654,6 +660,8 @@ def predict(self, X, **params): reset=False, ) + _raise_for_params(params, self, "predict") + if _routing_enabled(): predict_params = process_routing(self, "predict", **params).estimator[ "predict" @@ -700,6 +708,7 @@ def score(self, X, y, **params): reset=False, ) + _raise_for_params(params, self, "score") if _routing_enabled(): score_params = process_routing(self, "score", **params).estimator["score"] else: @@ -725,8 +734,8 @@ def get_metadata_routing(self): estimator=self.estimator, method_mapping=MethodMapping() .add(caller="fit", callee="fit") - .add(caller="fit", callee="score") .add(caller="fit", callee="predict") + .add(caller="fit", callee="score") .add(caller="score", callee="score") .add(caller="predict", callee="predict"), ) diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index c7a31bd1144d2..8fdc9b7dc9c1d 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -498,15 +498,17 @@ def test_error_on_missing_requests_for_sub_estimator(metaestimator): with pytest.raises(UnsetMetadataPassedError, match=re.escape(msg)): method = getattr(instance, method_name) if method_name in ["predict", "score"]: - # fit before calling method + # set request on fit and on the method not tested here set_request(estimator, "fit") + if method_name == "predict": + set_request(estimator, "score") + if method_name == "score": + set_request(estimator, "predict") + # fit before calling method fit_method = getattr(instance, "fit") fit_method(X, y, **method_kwargs) - # then call method - if method_name == "predict": - method(X, **method_kwargs) - else: # method_name == "score" - method(X, y, **method_kwargs) + if method_name == "predict": + method(X, **method_kwargs) else: method(X, y, **method_kwargs) @@ -557,11 +559,8 @@ def test_setting_request_on_sub_estimator_removes_error(metaestimator): set_multiple_requests(estimator, requests_set_together, "fit") fit_method = getattr(instance, "fit") fit_method(X, y, **method_kwargs, **extra_method_args) - # then call method - if method_name == "predict": - method(X, **method_kwargs, **extra_method_args) - else: # method_name == "score" - method(X, y, **method_kwargs, **extra_method_args) + if method_name == "predict": + method(X, **method_kwargs, **extra_method_args) else: method(X, y, **method_kwargs, **extra_method_args) From 8fc3bd3f692eecfab011ecbef2d6e5f06eb934a0 Mon Sep 17 00:00:00 2001 From: Stefanie Senger <91849487+StefanieSenger@users.noreply.github.com> Date: Tue, 30 Jan 2024 10:20:47 +0100 Subject: [PATCH 09/34] Apply suggestions from code review Co-authored-by: Adrin Jalali --- sklearn/linear_model/_ransac.py | 4 ++-- sklearn/tests/test_metaestimators_metadata_routing.py | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 6bf2744e9e5de..6ac584e228764 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -315,7 +315,7 @@ def __init__( prefer_skip_nested_validation=False ) @_deprecate_positional_args(version="1.7") - def fit(self, X, y, sample_weight=None, **fit_params): + def fit(self, X, y, *, sample_weight=None, **fit_params): 
"""Fit estimator using RANSAC algorithm. Parameters @@ -445,7 +445,7 @@ def fit(self, X, y, sample_weight=None, **fit_params): ), unrequested_params=e.unrequested_params, routed_params=e.routed_params, - ) + ) from e else: routed_params = Bunch() routed_params.estimator = Bunch(fit={}, predict={}, score={}) diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index 8fdc9b7dc9c1d..d8a69ef38ad40 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -407,12 +407,13 @@ def get_init_args(metaestimator_info): ) -def set_request(estimator, method_name): +def set_request(estimator, method_name, metadata_names, sub_methods): # e.g. call set_fit_request on estimator - set_request_for_method = getattr(estimator, f"set_{method_name}_request") - set_request_for_method(sample_weight=True, metadata=True) - if is_classifier(estimator) and method_name == "partial_fit": - set_request_for_method(classes=True) + for method in sum_methods: + set_request_for_method = getattr(estimator, f"set_{method}_request") + set_request_for_method(**{metadata: True for metadata in metadata_names}) + if is_classifier(estimator) and method_name == "partial_fit": + set_request_for_method(classes=True) def set_multiple_requests(estimator, requests_set_together, method_name): From 1308bcc4ff3145e72520ec5ac316cca8a43b6f52 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Tue, 30 Jan 2024 10:23:55 +0100 Subject: [PATCH 10/34] improvements after review --- sklearn/linear_model/_ransac.py | 62 +++++++------------ .../test_metaestimators_metadata_routing.py | 6 +- 2 files changed, 27 insertions(+), 41 deletions(-) diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 6bf2744e9e5de..90eb2bb53e50a 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -34,6 +34,7 @@ ) from ..utils.random import sample_without_replacement from ..utils.validation import ( + _check_method_params, _check_sample_weight, _deprecate_positional_args, check_is_fitted, @@ -339,10 +340,10 @@ def fit(self, X, y, sample_weight=None, **fit_params): .. versionadded:: 1.5 - Only available if - `sklearn.set_config(enable_metadata_routing=True)` is set. See - :ref:`Metadata Routing User Guide ` for more - details. + Only available if + `sklearn.set_config(enable_metadata_routing=True)` is set. See + :ref:`Metadata Routing User Guide ` for more + details. Returns ------- @@ -359,6 +360,7 @@ def fit(self, X, y, sample_weight=None, **fit_params): # Need to validate separately here. We can't pass multi_output=True # because that would allow y to be csr. Delay expensive finiteness # check to the estimator's own input validation. + _raise_for_params(fit_params, self, "fit") check_X_params = dict(accept_sparse="csr", force_all_finite=False) check_y_params = dict(ensure_2d=False) X, y = self._validate_data( @@ -428,8 +430,6 @@ def fit(self, X, y, sample_weight=None, **fit_params): " itself." 
% estimator_name ) - _raise_for_params(fit_params, self, "fit") - if sample_weight is not None: fit_params["sample_weight"] = sample_weight @@ -496,13 +496,9 @@ def fit(self, X, y, sample_weight=None, **fit_params): continue # cut `fit_params` down to `subset_idxs` - fit_params_cut_to_subset_idxs = {} - for key in routed_params.estimator.fit: - # only apply on sample_wise metadata - if len(routed_params.estimator.fit[key]) == len(X): - fit_params_cut_to_subset_idxs[key] = routed_params.estimator.fit[ - key - ][subset_idxs] + fit_params_cut_to_subset_idxs = _check_method_params( + X, params=routed_params.estimator.fit, indices=subset_idxs + ) # fit model for current random sample set estimator.fit(X_subset, y_subset, **fit_params_cut_to_subset_idxs) @@ -533,13 +529,9 @@ def fit(self, X, y, sample_weight=None, **fit_params): y_inlier_subset = y[inlier_idxs_subset] # cut `fit_params` down to `inlier_idxs_subset` - score_params_cut_to_inlier_idxs_subset = {} - for key in routed_params.estimator.score: - # only apply on sample_wise metadata - if len(routed_params.estimator.score[key]) == len(X): - score_params_cut_to_inlier_idxs_subset[key] = ( - routed_params.estimator.score[key][inlier_idxs_subset] - ) + score_params_cut_to_inlier_idxs_subset = _check_method_params( + X, params=routed_params.estimator.score, indices=inlier_idxs_subset + ) # score of inlier data set score_subset = estimator.score( @@ -610,13 +602,9 @@ def fit(self, X, y, sample_weight=None, **fit_params): ) # estimate final model using all inliers - fit_params_cut_to_best_idxs_subset = {} - for key in routed_params.estimator.fit: - # only apply on sample_wise metadata - if len(routed_params.estimator.fit[key]) == len(X): - fit_params_cut_to_best_idxs_subset[key] = routed_params.estimator.fit[ - key - ][inlier_best_idxs_subset] + fit_params_cut_to_best_idxs_subset = _check_method_params( + X, params=routed_params.estimator.fit, indices=inlier_best_idxs_subset + ) estimator.fit( X_inlier_best, y_inlier_best, **fit_params_cut_to_best_idxs_subset @@ -642,10 +630,10 @@ def predict(self, X, **params): .. versionadded:: 1.5 - Only available if - `sklearn.set_config(enable_metadata_routing=True)` is set. See - :ref:`Metadata Routing User Guide ` for more - details. + Only available if + `sklearn.set_config(enable_metadata_routing=True)` is set. See + :ref:`Metadata Routing User Guide ` for more + details. Returns ------- @@ -666,8 +654,6 @@ def predict(self, X, **params): predict_params = process_routing(self, "predict", **params).estimator[ "predict" ] - else: - predict_params = dict() return self.estimator_.predict(X, **predict_params) @@ -690,10 +676,10 @@ def score(self, X, y, **params): .. versionadded:: 1.5 - Only available if - `sklearn.set_config(enable_metadata_routing=True)` is set. See - :ref:`Metadata Routing User Guide ` for more - details. + Only available if + `sklearn.set_config(enable_metadata_routing=True)` is set. See + :ref:`Metadata Routing User Guide ` for more + details. 
Returns ------- @@ -711,8 +697,6 @@ def score(self, X, y, **params): _raise_for_params(params, self, "score") if _routing_enabled(): score_params = process_routing(self, "score", **params).estimator["score"] - else: - score_params = dict() return self.estimator_.score(X, y, **score_params) diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index 8fdc9b7dc9c1d..2d306ab20350c 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -497,9 +497,11 @@ def test_error_on_missing_requests_for_sub_estimator(metaestimator): instance = cls(**kwargs) with pytest.raises(UnsetMetadataPassedError, match=re.escape(msg)): method = getattr(instance, method_name) - if method_name in ["predict", "score"]: - # set request on fit and on the method not tested here + if "fit" not in method_name: + # set request on fit set_request(estimator, "fit") + # make sure error message corresponding to `method_name` + # is used for test if method_name == "predict": set_request(estimator, "score") if method_name == "score": From 19358384cbcf1675ae757457ca8cd75eabd0d5c3 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Tue, 30 Jan 2024 22:08:58 +0100 Subject: [PATCH 11/34] more improvements triggered by review --- sklearn/tests/metadata_routing_common.py | 7 ++++ .../test_metaestimators_metadata_routing.py | 38 +++++++++---------- sklearn/utils/_metadata_requests.py | 4 +- 3 files changed, 28 insertions(+), 21 deletions(-) diff --git a/sklearn/tests/metadata_routing_common.py b/sklearn/tests/metadata_routing_common.py index 3e945cf4a3f5e..d4ca33761a08e 100644 --- a/sklearn/tests/metadata_routing_common.py +++ b/sklearn/tests/metadata_routing_common.py @@ -258,6 +258,13 @@ def decision_function(self, X, sample_weight="default", metadata="default"): ) return np.zeros(shape=(len(X),)) + def score(self, X, y, sample_weight="default", metadata="default"): + self.predict(X) + record_metadata_not_default( + self, "score", sample_weight=sample_weight, metadata=metadata + ) + return 1 + class ConsumingTransformer(TransformerMixin, BaseEstimator): """A transformer which accepts metadata on fit and transform. diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index af43dba00db39..78826039be237 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -407,21 +407,15 @@ def get_init_args(metaestimator_info): ) -def set_request(estimator, method_name, metadata_names, sub_methods): - # e.g. 
call set_fit_request on estimator - for method in sum_methods: +def set_requests(estimator, methods, metadata_name): + """Call `set_fit_request` on a list of methods from the sub-estimator.""" + for method in methods: set_request_for_method = getattr(estimator, f"set_{method}_request") - set_request_for_method(**{metadata: True for metadata in metadata_names}) - if is_classifier(estimator) and method_name == "partial_fit": + set_request_for_method(**{metadata_name: True}) + if is_classifier(estimator) and method == "partial_fit": set_request_for_method(classes=True) -def set_multiple_requests(estimator, requests_set_together, method_name): - if method_name in requests_set_together: - for additional_method in requests_set_together[method_name]: - set_request(estimator, additional_method) - - @pytest.mark.parametrize("estimator", UNSUPPORTED_ESTIMATORS) def test_unsupported_estimators_get_metadata_routing(estimator): """Test that get_metadata_routing is not implemented on meta-estimators for @@ -500,13 +494,13 @@ def test_error_on_missing_requests_for_sub_estimator(metaestimator): method = getattr(instance, method_name) if "fit" not in method_name: # set request on fit - set_request(estimator, "fit") + set_requests(estimator, methods=["fit"], metadata_name=key) # make sure error message corresponding to `method_name` # is used for test if method_name == "predict": - set_request(estimator, "score") + set_requests(estimator, methods=["score"], metadata_name=key) if method_name == "score": - set_request(estimator, "predict") + set_requests(estimator, methods=["predict"], metadata_name=key) # fit before calling method fit_method = getattr(instance, "fit") fit_method(X, y, **method_kwargs) @@ -541,14 +535,16 @@ def test_setting_request_on_sub_estimator_removes_error(metaestimator): metaestimator ) if scorer: - set_request(scorer, "score") + set_requests(scorer, methods=["score"], metadata_name=key) if cv: cv.set_split_request(groups=True, metadata=True) # `set_{method}_request({metadata}==True)` on the underlying objects - set_request(estimator, method_name) + set_requests(estimator, methods=[method_name], metadata_name=key) if requests_set_together: - set_multiple_requests(estimator, requests_set_together, method_name) + set_requests( + estimator, methods=requests_set_together["fit"], metadata_name=key + ) instance = cls(**kwargs) method = getattr(instance, method_name) @@ -557,9 +553,13 @@ def test_setting_request_on_sub_estimator_removes_error(metaestimator): ) if method_name in ["predict", "score"]: # fit before calling method - set_request(estimator, "fit") + set_requests(estimator, methods=["fit"], metadata_name=key) if requests_set_together: - set_multiple_requests(estimator, requests_set_together, "fit") + set_requests( + estimator, + methods=requests_set_together["fit"], + metadata_name=key, + ) fit_method = getattr(instance, "fit") fit_method(X, y, **method_kwargs, **extra_method_args) if method_name == "predict": diff --git a/sklearn/utils/_metadata_requests.py b/sklearn/utils/_metadata_requests.py index 26fa907fee72a..a502ba65c53b7 100644 --- a/sklearn/utils/_metadata_requests.py +++ b/sklearn/utils/_metadata_requests.py @@ -1247,8 +1247,8 @@ def func(**kw): if self.validate_keys and (set(kw) - set(self.keys)): raise TypeError( - f"Unexpected args: {set(kw) - set(self.keys)}. Accepted arguments" - f" are: {set(self.keys)}" + f"Unexpected args: {set(kw) - set(self.keys)} in {self.name}. 
" + f"Accepted arguments are: {set(self.keys)}" ) requests = instance._get_metadata_request() From 928d99ca3b81f3692bb4c218a79c41912f7b6a6f Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Tue, 30 Jan 2024 22:28:08 +0100 Subject: [PATCH 12/34] route metadata from score to predict --- sklearn/tests/metadata_routing_common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tests/metadata_routing_common.py b/sklearn/tests/metadata_routing_common.py index d4ca33761a08e..22bdc4fb3b745 100644 --- a/sklearn/tests/metadata_routing_common.py +++ b/sklearn/tests/metadata_routing_common.py @@ -162,7 +162,7 @@ def predict(self, X, y=None, sample_weight="default", metadata="default"): return np.zeros(shape=(len(X),)) def score(self, X, y, sample_weight="default", metadata="default"): - self.predict(X) + self.predict(X, sample_weight="default", metadata="default") record_metadata_not_default( self, "score", sample_weight=sample_weight, metadata=metadata ) @@ -259,7 +259,7 @@ def decision_function(self, X, sample_weight="default", metadata="default"): return np.zeros(shape=(len(X),)) def score(self, X, y, sample_weight="default", metadata="default"): - self.predict(X) + self.predict(X, sample_weight="default", metadata="default") record_metadata_not_default( self, "score", sample_weight=sample_weight, metadata=metadata ) From 37070ecf4dce2241443196159b8696d65688ffff Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Wed, 31 Jan 2024 12:15:14 +0100 Subject: [PATCH 13/34] minor things --- sklearn/linear_model/_base.py | 1 + sklearn/linear_model/_ransac.py | 4 ++++ sklearn/linear_model/tests/test_ransac.py | 11 +++-------- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index f07e974542a5b..ae4e16bf3752a 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -267,6 +267,7 @@ def _decision_function(self, X): check_is_fitted(self) X = self._validate_data(X, accept_sparse=["csr", "csc", "coo"], reset=False) + return safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_ def predict(self, X): diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index c3a96956b24fd..e33fce839701a 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -654,6 +654,8 @@ def predict(self, X, **params): predict_params = process_routing(self, "predict", **params).estimator[ "predict" ] + else: + predict_params = {} return self.estimator_.predict(X, **predict_params) @@ -697,6 +699,8 @@ def score(self, X, y, **params): _raise_for_params(params, self, "score") if _routing_enabled(): score_params = process_routing(self, "score", **params).estimator["score"] + else: + score_params = {} return self.estimator_.score(X, y, **score_params) diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py index f8d3ef051fbb5..fde800c143472 100644 --- a/sklearn/linear_model/tests/test_ransac.py +++ b/sklearn/linear_model/tests/test_ransac.py @@ -11,7 +11,6 @@ Ridge, ) from sklearn.linear_model._ransac import _dynamic_max_trials -from sklearn.tests.metadata_routing_common import ConsumingRegressor from sklearn.utils import check_random_state from sklearn.utils._testing import assert_allclose from sklearn.utils.fixes import COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS @@ -459,9 +458,7 @@ def test_ransac_dynamic_max_trials(): def test_ransac_fit_sample_weight(): - ransac_estimator = RANSACRegressor( - 
random_state=0, estimator=ConsumingRegressor(), min_samples=0.5 - ) + ransac_estimator = RANSACRegressor(random_state=0) n_samples = y.shape[0] weights = np.ones(n_samples) ransac_estimator.fit(X, y, weights) @@ -520,12 +517,10 @@ def test_ransac_final_model_fit_sample_weight(): rng = check_random_state(42) sample_weight = rng.randint(1, 4, size=y.shape[0]) sample_weight = sample_weight / sample_weight.sum() - ransac = RANSACRegressor( - estimator=ConsumingRegressor(), min_samples=0.5, random_state=0 - ) + ransac = RANSACRegressor(random_state=0) ransac.fit(X, y, sample_weight=sample_weight) - final_model = ConsumingRegressor() + final_model = LinearRegression() mask_samples = ransac.inlier_mask_ final_model.fit( X[mask_samples], y[mask_samples], sample_weight=sample_weight[mask_samples] From 9a96eb86815b71d75529d1c9357b0a3daf558853 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Thu, 1 Feb 2024 13:40:42 +0100 Subject: [PATCH 14/34] prepared ConsumingRegressor for test in test_ransac --- doc/modules/linear_model.rst | 4 ++-- sklearn/linear_model/_base.py | 1 - sklearn/linear_model/tests/test_ransac.py | 7 +++++-- sklearn/tests/metadata_routing_common.py | 2 ++ 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index e538dde2ed6d5..76f91a30ea859 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -1534,10 +1534,10 @@ Each iteration performs the following steps: 1. Select ``min_samples`` random samples from the original data and check whether the set of data is valid (see ``is_data_valid``). -2. Fit a model to the random subset (``base_estimator.fit``) and check +2. Fit a model to the random subset (``estimator.fit``) and check whether the estimated model is valid (see ``is_model_valid``). 3. Classify all data as inliers or outliers by calculating the residuals - to the estimated model (``base_estimator.predict(X) - y``) - all data + to the estimated model (``estimator.predict(X) - y``) - all data samples with absolute residuals smaller than or equal to the ``residual_threshold`` are considered as inliers. 4. 
Save fitted model as best model if number of inlier samples is diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index ae4e16bf3752a..f07e974542a5b 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -267,7 +267,6 @@ def _decision_function(self, X): check_is_fitted(self) X = self._validate_data(X, accept_sparse=["csr", "csc", "coo"], reset=False) - return safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_ def predict(self, X): diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py index fde800c143472..979fb347b1d22 100644 --- a/sklearn/linear_model/tests/test_ransac.py +++ b/sklearn/linear_model/tests/test_ransac.py @@ -11,6 +11,7 @@ Ridge, ) from sklearn.linear_model._ransac import _dynamic_max_trials +from sklearn.tests.metadata_routing_common import ConsumingRegressor from sklearn.utils import check_random_state from sklearn.utils._testing import assert_allclose from sklearn.utils.fixes import COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS @@ -517,10 +518,12 @@ def test_ransac_final_model_fit_sample_weight(): rng = check_random_state(42) sample_weight = rng.randint(1, 4, size=y.shape[0]) sample_weight = sample_weight / sample_weight.sum() - ransac = RANSACRegressor(random_state=0) + ransac = RANSACRegressor( + estimator=ConsumingRegressor(), min_samples=0.5, random_state=0 + ) ransac.fit(X, y, sample_weight=sample_weight) - final_model = LinearRegression() + final_model = ConsumingRegressor() mask_samples = ransac.inlier_mask_ final_model.fit( X[mask_samples], y[mask_samples], sample_weight=sample_weight[mask_samples] diff --git a/sklearn/tests/metadata_routing_common.py b/sklearn/tests/metadata_routing_common.py index 22bdc4fb3b745..39d78797bcd2b 100644 --- a/sklearn/tests/metadata_routing_common.py +++ b/sklearn/tests/metadata_routing_common.py @@ -153,6 +153,8 @@ def fit(self, X, y, sample_weight="default", metadata="default"): record_metadata_not_default( self, "fit", sample_weight=sample_weight, metadata=metadata ) + n_features = X.shape[1] + self.coef_ = np.ones(n_features) return self def predict(self, X, y=None, sample_weight="default", metadata="default"): From d308d949112bfb92f8aa43744fb616ca1e7767d3 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Thu, 1 Feb 2024 13:57:13 +0100 Subject: [PATCH 15/34] fit_params should not be routed to sub_estimator.predict --- sklearn/linear_model/_ransac.py | 2 +- sklearn/linear_model/tests/test_ransac.py | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index e33fce839701a..46b618d7326e4 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -511,7 +511,7 @@ def fit(self, X, y, *, sample_weight=None, **fit_params): continue # residuals of all data for current random sample model - y_pred = estimator.predict(X, **routed_params.estimator.predict) + y_pred = estimator.predict(X) residuals_subset = loss_function(y, y_pred) # classify data into inliers and outliers diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py index 979fb347b1d22..fde800c143472 100644 --- a/sklearn/linear_model/tests/test_ransac.py +++ b/sklearn/linear_model/tests/test_ransac.py @@ -11,7 +11,6 @@ Ridge, ) from sklearn.linear_model._ransac import _dynamic_max_trials -from sklearn.tests.metadata_routing_common import ConsumingRegressor from sklearn.utils import check_random_state from 
sklearn.utils._testing import assert_allclose from sklearn.utils.fixes import COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS @@ -518,12 +517,10 @@ def test_ransac_final_model_fit_sample_weight(): rng = check_random_state(42) sample_weight = rng.randint(1, 4, size=y.shape[0]) sample_weight = sample_weight / sample_weight.sum() - ransac = RANSACRegressor( - estimator=ConsumingRegressor(), min_samples=0.5, random_state=0 - ) + ransac = RANSACRegressor(random_state=0) ransac.fit(X, y, sample_weight=sample_weight) - final_model = ConsumingRegressor() + final_model = LinearRegression() mask_samples = ransac.inlier_mask_ final_model.fit( X[mask_samples], y[mask_samples], sample_weight=sample_weight[mask_samples] From 5bea176daf60320fc343405a7a7b3a2a5a7421f6 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Mon, 5 Feb 2024 16:24:19 +0100 Subject: [PATCH 16/34] excluded estimator.predict from tests --- sklearn/linear_model/_ransac.py | 1 - sklearn/tests/metadata_routing_common.py | 4 --- .../test_metaestimators_metadata_routing.py | 34 +++++++++---------- 3 files changed, 17 insertions(+), 22 deletions(-) diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 46b618d7326e4..62297995e61ee 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -722,7 +722,6 @@ def get_metadata_routing(self): estimator=self.estimator, method_mapping=MethodMapping() .add(caller="fit", callee="fit") - .add(caller="fit", callee="predict") .add(caller="fit", callee="score") .add(caller="score", callee="score") .add(caller="predict", callee="predict"), diff --git a/sklearn/tests/metadata_routing_common.py b/sklearn/tests/metadata_routing_common.py index 39d78797bcd2b..69f3956a39c10 100644 --- a/sklearn/tests/metadata_routing_common.py +++ b/sklearn/tests/metadata_routing_common.py @@ -153,8 +153,6 @@ def fit(self, X, y, sample_weight="default", metadata="default"): record_metadata_not_default( self, "fit", sample_weight=sample_weight, metadata=metadata ) - n_features = X.shape[1] - self.coef_ = np.ones(n_features) return self def predict(self, X, y=None, sample_weight="default", metadata="default"): @@ -164,7 +162,6 @@ def predict(self, X, y=None, sample_weight="default", metadata="default"): return np.zeros(shape=(len(X),)) def score(self, X, y, sample_weight="default", metadata="default"): - self.predict(X, sample_weight="default", metadata="default") record_metadata_not_default( self, "score", sample_weight=sample_weight, metadata=metadata ) @@ -261,7 +258,6 @@ def decision_function(self, X, sample_weight="default", metadata="default"): return np.zeros(shape=(len(X),)) def score(self, X, y, sample_weight="default", metadata="default"): - self.predict(X, sample_weight="default", metadata="default") record_metadata_not_default( self, "score", sample_weight=sample_weight, metadata=metadata ) diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index 78826039be237..69ca42dc481f9 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -296,7 +296,7 @@ def enable_slep006(): "y": y, "preserves_metadata": False, "estimator_routing_methods": ["fit", "predict", "score"], - "requests_set_together": {"fit": ["predict", "score"]}, + "requests_set_together": {"fit": ["score"]}, }, ] """List containing all metaestimators to be tested and their settings @@ -330,7 +330,10 @@ def enable_slep006(): methods, such as passing `classes` 
to `partial_fit`. - requests_set_together: a dict that defines which set_{method}_requests need to be set together with the key; used in case a router routes to different - methods from the sub-estimator. + methods from the sub-estimator from withing the same meta-estimator's method. + For instance, {"fit": ["score"]} would signal that + `estimator.set_fit_request` premises `estimator.set_score_request` to be set + as well. """ # IDs used by pytest to get meaningful verbose messages when running the tests @@ -497,17 +500,14 @@ def test_error_on_missing_requests_for_sub_estimator(metaestimator): set_requests(estimator, methods=["fit"], metadata_name=key) # make sure error message corresponding to `method_name` # is used for test - if method_name == "predict": + if method_name != "score": set_requests(estimator, methods=["score"], metadata_name=key) - if method_name == "score": - set_requests(estimator, methods=["predict"], metadata_name=key) - # fit before calling method - fit_method = getattr(instance, "fit") - fit_method(X, y, **method_kwargs) - if method_name == "predict": - method(X, **method_kwargs) - else: + instance.fit(X, y, **method_kwargs) + try: + # `fit` and `partial_fit` accept y, others don't. method(X, y, **method_kwargs) + except TypeError: + method(X, **method_kwargs) @pytest.mark.parametrize("metaestimator", METAESTIMATORS, ids=METAESTIMATOR_IDS) @@ -551,7 +551,7 @@ def test_setting_request_on_sub_estimator_removes_error(metaestimator): extra_method_args = metaestimator.get("method_args", {}).get( method_name, {} ) - if method_name in ["predict", "score"]: + if "fit" not in method_name: # fit before calling method set_requests(estimator, methods=["fit"], metadata_name=key) if requests_set_together: @@ -560,12 +560,12 @@ def test_setting_request_on_sub_estimator_removes_error(metaestimator): methods=requests_set_together["fit"], metadata_name=key, ) - fit_method = getattr(instance, "fit") - fit_method(X, y, **method_kwargs, **extra_method_args) - if method_name == "predict": - method(X, **method_kwargs, **extra_method_args) - else: + instance.fit(X, y, **method_kwargs, **extra_method_args) + try: + # `fit` and `partial_fit` accept y, others don't. 
method(X, y, **method_kwargs, **extra_method_args) + except TypeError: + method(X, **method_kwargs, **extra_method_args) # sanity check that registry is not empty, or else the test passes # trivially From f66761d4212539fd7e5aa2de734d4e55a4a39cce Mon Sep 17 00:00:00 2001 From: Stefanie Senger <91849487+StefanieSenger@users.noreply.github.com> Date: Mon, 5 Feb 2024 16:32:50 +0100 Subject: [PATCH 17/34] Update sklearn/linear_model/_ransac.py Co-authored-by: Adrin Jalali --- sklearn/linear_model/_ransac.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 62297995e61ee..e40211fe72454 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -452,8 +452,6 @@ def fit(self, X, y, *, sample_weight=None, **fit_params): if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X) routed_params.estimator.fit = {"sample_weight": sample_weight} - routed_params.estimator.predict = {"sample_weight": sample_weight} - routed_params.estimator.score = {"sample_weight": sample_weight} n_inliers_best = 1 score_best = -np.inf From 7d8c156cd92336adf632baef5553769a01dee0d3 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Mon, 5 Feb 2024 16:47:01 +0100 Subject: [PATCH 18/34] catch FutureWarning --- sklearn/linear_model/tests/test_ransac.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py index fde800c143472..de7105bec5155 100644 --- a/sklearn/linear_model/tests/test_ransac.py +++ b/sklearn/linear_model/tests/test_ransac.py @@ -1,3 +1,5 @@ +import warnings + import numpy as np import pytest from numpy.testing import assert_array_almost_equal, assert_array_equal @@ -461,7 +463,13 @@ def test_ransac_fit_sample_weight(): ransac_estimator = RANSACRegressor(random_state=0) n_samples = y.shape[0] weights = np.ones(n_samples) - ransac_estimator.fit(X, y, weights) + with warnings.catch_warnings(record=True): + warnings.filterwarnings( + "always", + category=FutureWarning, + message="Pass sample_weight=", + ) + ransac_estimator.fit(X, y, weights) # sanity check assert ransac_estimator.inlier_mask_.shape[0] == n_samples From 341ded55948640c5acbc0f82e3f67473d8361451 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Mon, 5 Feb 2024 17:15:27 +0100 Subject: [PATCH 19/34] adjusted tests to ConsumingRegressor now uses own score method --- sklearn/tests/test_metadata_routing.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/tests/test_metadata_routing.py b/sklearn/tests/test_metadata_routing.py index 0019212632466..6b65a216a0bb2 100644 --- a/sklearn/tests/test_metadata_routing.py +++ b/sklearn/tests/test_metadata_routing.py @@ -640,7 +640,7 @@ class ConsumingRegressorWarn(ConsumingRegressor): " 'predict'}], 'router': {'fit': {'sample_weight': None, 'metadata':" " None}, 'partial_fit': {'sample_weight': None, 'metadata': None}," " 'predict': {'sample_weight': None, 'metadata': None}, 'score':" - " {'sample_weight': None}}}}" + " {'sample_weight': None, 'metadata': None}}}}" ), ), ], @@ -754,7 +754,8 @@ def test_metadata_routing_add(): == "{'est': {'mapping': [{'callee': 'fit', 'caller': 'fit'}], 'router': {'fit':" " {'sample_weight': 'weights', 'metadata': None}, 'partial_fit':" " {'sample_weight': None, 'metadata': None}, 'predict': {'sample_weight':" - " None, 'metadata': None}, 'score': {'sample_weight': None}}}}" + " None, 'metadata': None}, 'score': 
{'sample_weight': None, 'metadata':" + " None}}}}" ) # adding one with an instance of MethodMapping @@ -767,7 +768,8 @@ def test_metadata_routing_add(): == "{'est': {'mapping': [{'callee': 'score', 'caller': 'fit'}], 'router':" " {'fit': {'sample_weight': None, 'metadata': None}, 'partial_fit':" " {'sample_weight': None, 'metadata': None}, 'predict': {'sample_weight':" - " None, 'metadata': None}, 'score': {'sample_weight': True}}}}" + " None, 'metadata': None}, 'score': {'sample_weight': True, 'metadata':" + " None}}}}" ) From bcd9ab5fd189edc7f72a6aac9f63ad23787c41ff Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Tue, 13 Feb 2024 11:49:06 +0100 Subject: [PATCH 20/34] add comment for deprecation of positional argument --- sklearn/linear_model/_ransac.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index e40211fe72454..931fb5e45ec47 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -315,6 +315,9 @@ def __init__( # RansacRegressor.estimator is not validated yet prefer_skip_nested_validation=False ) + # TODO(1.7): remove `sample_weight` from the signature after deprecation + # cycle; for backwards compatibility: pop it from `fit_params` before the + # `_raise_for_params` check and reinsert it after the check @_deprecate_positional_args(version="1.7") def fit(self, X, y, *, sample_weight=None, **fit_params): """Fit estimator using RANSAC algorithm. From 3145bc4a06d6a818d837f78bad3e7ef2ffbb800d Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Thu, 15 Feb 2024 13:43:37 +0100 Subject: [PATCH 21/34] repair test and revert merge conflict errors --- .../tests/test_metaestimators_metadata_routing.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index 46b10c85b4df0..5f551a8dbedc1 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -299,6 +299,8 @@ def enable_slep006(): "preserves_metadata": False, "estimator_routing_methods": ["fit", "predict", "score"], "requests_set_together": {"fit": ["score"]}, + }, + { "metaestimator": IterativeImputer, "estimator_name": "estimator", "estimator": ConsumingRegressor, @@ -355,8 +357,6 @@ def enable_slep006(): BaggingRegressor(), FeatureUnion([]), GraphicalLassoCV(), - IterativeImputer(), - RANSACRegressor(), RFE(ConsumingClassifier()), RFECV(ConsumingClassifier()), RidgeCV(), @@ -641,8 +641,14 @@ def set_request(estimator, method_name): set_request(estimator, method_name) method = getattr(instance, method_name) extra_method_args = metaestimator.get("method_args", {}).get(method_name, {}) - # This following line should pass w/o raising a routing error. - method(X, y, **extra_method_args) + if "fit" not in method_name: + instance.fit(X, y, **extra_method_args) + # The following should pass w/o raising a routing error. + try: + # `fit` and `partial_fit` accept y, others don't. 
+ method(X, y, **extra_method_args) + except TypeError: + method(X, **extra_method_args) @pytest.mark.parametrize("metaestimator", METAESTIMATORS, ids=METAESTIMATOR_IDS) From 423f6dd4126199863c99630cc259acafcd5dcba2 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Fri, 16 Feb 2024 09:02:11 +0100 Subject: [PATCH 22/34] bypass FutureWarnings --- sklearn/linear_model/tests/test_ransac.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py index de7105bec5155..fbeaff2f440ed 100644 --- a/sklearn/linear_model/tests/test_ransac.py +++ b/sklearn/linear_model/tests/test_ransac.py @@ -1,5 +1,3 @@ -import warnings - import numpy as np import pytest from numpy.testing import assert_array_almost_equal, assert_array_equal @@ -463,13 +461,7 @@ def test_ransac_fit_sample_weight(): ransac_estimator = RANSACRegressor(random_state=0) n_samples = y.shape[0] weights = np.ones(n_samples) - with warnings.catch_warnings(record=True): - warnings.filterwarnings( - "always", - category=FutureWarning, - message="Pass sample_weight=", - ) - ransac_estimator.fit(X, y, weights) + ransac_estimator.fit(X, y, sample_weight=weights) # sanity check assert ransac_estimator.inlier_mask_.shape[0] == n_samples @@ -506,7 +498,7 @@ def test_ransac_fit_sample_weight(): sample_weight = np.append(sample_weight, outlier_weight) X_ = np.append(X_, outlier_X, axis=0) y_ = np.append(y_, outlier_y) - ransac_estimator.fit(X_, y_, sample_weight) + ransac_estimator.fit(X_, y_, sample_weight=sample_weight) assert_allclose(ransac_estimator.estimator_.coef_, ref_coef_) From 2edff90af4851684ed7d9c948b04c0ca3b949ae5 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Fri, 16 Feb 2024 10:02:56 +0100 Subject: [PATCH 23/34] bypass FutureWarning --- sklearn/linear_model/tests/test_ransac.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py index fbeaff2f440ed..7b2bc66160ef3 100644 --- a/sklearn/linear_model/tests/test_ransac.py +++ b/sklearn/linear_model/tests/test_ransac.py @@ -509,7 +509,7 @@ def test_ransac_fit_sample_weight(): err_msg = f"{estimator.__class__.__name__} does not support sample_weight." 
with pytest.raises(ValueError, match=err_msg): - ransac_estimator.fit(X, y, weights) + ransac_estimator.fit(X, y, sample_weight=weights) def test_ransac_final_model_fit_sample_weight(): From f0bcdd4561b70215604b34b519b9ccfa9f777243 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Fri, 16 Feb 2024 14:08:39 +0100 Subject: [PATCH 24/34] correct passing of ConsumingRegressor --- sklearn/tests/metadata_routing_common.py | 11 ++++++----- sklearn/tests/test_metaestimators_metadata_routing.py | 4 ++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/sklearn/tests/metadata_routing_common.py b/sklearn/tests/metadata_routing_common.py index dd5b37e0e28a6..4147c393534da 100644 --- a/sklearn/tests/metadata_routing_common.py +++ b/sklearn/tests/metadata_routing_common.py @@ -273,11 +273,12 @@ def decision_function(self, X, sample_weight="default", metadata="default"): ) return np.zeros(shape=(len(X),)) - def score(self, X, y, sample_weight="default", metadata="default"): - record_metadata_not_default( - self, "score", sample_weight=sample_weight, metadata=metadata - ) - return 1 + # def score(self, X, y, sample_weight="default", metadata="default"): + # uncomment when needed + # record_metadata_not_default( + # self, "score", sample_weight=sample_weight, metadata=metadata + # ) + # return 1 class ConsumingTransformer(TransformerMixin, BaseEstimator): diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index 5f551a8dbedc1..40a423121cdeb 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -292,7 +292,7 @@ def enable_slep006(): { "metaestimator": RANSACRegressor, "estimator_name": "estimator", - "estimator": ConsumingRegressor, + "estimator": "regressor", "init_args": {"min_samples": 0.5}, "X": X, "y": y, @@ -303,7 +303,7 @@ def enable_slep006(): { "metaestimator": IterativeImputer, "estimator_name": "estimator", - "estimator": ConsumingRegressor, + "estimator": "regressor", "init_args": {"skip_complete": False}, "X": X, "y": y, From 2c73de44571b02e795dc43106976c80c3c529fb6 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Fri, 16 Feb 2024 15:56:24 +0100 Subject: [PATCH 25/34] raise to prevent silent bug --- sklearn/tests/test_metaestimators_metadata_routing.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index 40a423121cdeb..e3f7f4c6ba6f4 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -411,13 +411,17 @@ def get_init_args(metaestimator_info, sub_estimator_consumes): if sub_estimator_consumes: if sub_estimator_type == "regressor": estimator = ConsumingRegressor(estimator_registry) - else: + elif sub_estimator_type == "classifier": estimator = ConsumingClassifier(estimator_registry) + else: + raise ValueError("Unpermitted `sub_estimator_type`.") else: if sub_estimator_type == "regressor": estimator = NonConsumingRegressor() - else: + elif sub_estimator_type == "classifier": estimator = NonConsumingClassifier() + else: + raise ValueError("Unpermitted `sub_estimator_type`.") kwargs[estimator_name] = estimator if "scorer_name" in metaestimator_info: scorer_name = metaestimator_info["scorer_name"] From fa1d712f1c074bfe85a829fb8553e70bff5564a2 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Tue, 20 Feb 2024 10:28:26 +0100 
Subject: [PATCH 26/34] changes after review --- sklearn/linear_model/_ransac.py | 10 +++++----- sklearn/tests/test_metaestimators_metadata_routing.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 931fb5e45ec47..de62d01dbe149 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -442,7 +442,7 @@ def fit(self, X, y, *, sample_weight=None, **fit_params): except UnsetMetadataPassedError as e: raise UnsetMetadataPassedError( message=( - f"{e}, which is used internally by `RANSACRegressor.fit()`." + f"{e}, which is used internally by `RANSACRegressor.fit()`. " f"Call `{estimator.__class__.__name__}.set_{{method}}_request(" "{metadata}=True)` for each metadata." ), @@ -497,12 +497,12 @@ def fit(self, X, y, *, sample_weight=None, **fit_params): continue # cut `fit_params` down to `subset_idxs` - fit_params_cut_to_subset_idxs = _check_method_params( + fit_params_subset = _check_method_params( X, params=routed_params.estimator.fit, indices=subset_idxs ) # fit model for current random sample set - estimator.fit(X_subset, y_subset, **fit_params_cut_to_subset_idxs) + estimator.fit(X_subset, y_subset, **fit_params_subset) # check if estimated model is valid if self.is_model_valid is not None and not self.is_model_valid( @@ -530,7 +530,7 @@ def fit(self, X, y, *, sample_weight=None, **fit_params): y_inlier_subset = y[inlier_idxs_subset] # cut `fit_params` down to `inlier_idxs_subset` - score_params_cut_to_inlier_idxs_subset = _check_method_params( + score_params_inlier_subset = _check_method_params( X, params=routed_params.estimator.score, indices=inlier_idxs_subset ) @@ -538,7 +538,7 @@ def fit(self, X, y, *, sample_weight=None, **fit_params): score_subset = estimator.score( X_inlier_subset, y_inlier_subset, - **score_params_cut_to_inlier_idxs_subset, + **score_params_inlier_subset, ) # same number of inliers but worse score -> skip current random diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index e3f7f4c6ba6f4..960fcf997d97d 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -296,7 +296,7 @@ def enable_slep006(): "init_args": {"min_samples": 0.5}, "X": X, "y": y, - "preserves_metadata": False, + "preserves_metadata": "subset", "estimator_routing_methods": ["fit", "predict", "score"], "requests_set_together": {"fit": ["score"]}, }, From 4f45a6b5c973df746c722f7bc4f96daf9838b568 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Tue, 20 Feb 2024 10:59:39 +0100 Subject: [PATCH 27/34] typo --- sklearn/tests/test_metaestimators_metadata_routing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index 960fcf997d97d..cee9870483f0e 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -341,7 +341,7 @@ def enable_slep006(): methods, such as passing `classes` to `partial_fit`. - requests_set_together: a dict that defines which set_{method}_requests need to be set together with the key; used in case a router routes to different - methods from the sub-estimator from withing the same meta-estimator's method. + methods from the sub-estimator from within the same meta-estimator's method. 
For instance, {"fit": ["score"]} would signal that `estimator.set_fit_request` premises `estimator.set_score_request` to be set as well. From d3ac747f119aa4636d677068cc8883e4702653ec Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 20 Feb 2024 15:55:04 +0100 Subject: [PATCH 28/34] Add method_mapping to the common test definitions of the metaestimators --- .../test_metaestimators_metadata_routing.py | 101 ++++++++++++------ 1 file changed, 68 insertions(+), 33 deletions(-) diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index cee9870483f0e..f0e6489a8698a 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -298,7 +298,7 @@ def enable_slep006(): "y": y, "preserves_metadata": "subset", "estimator_routing_methods": ["fit", "predict", "score"], - "requests_set_together": {"fit": ["score"]}, + "method_mapping": {"fit": ["fit", "score"]}, }, { "metaestimator": IterativeImputer, @@ -339,12 +339,9 @@ def enable_slep006(): to the splitter - method_args: a dict of dicts, defining extra arguments needed to be passed to methods, such as passing `classes` to `partial_fit`. -- requests_set_together: a dict that defines which set_{method}_requests need - to be set together with the key; used in case a router routes to different - methods from the sub-estimator from within the same meta-estimator's method. - For instance, {"fit": ["score"]} would signal that - `estimator.set_fit_request` premises `estimator.set_score_request` to be set - as well. +- method_mapping: a dict of the form `{caller: [callee1, ...]}` which signals + which `.set_{method}_request` methods should be called to set request values. + If not present, a one-to-one mapping is assumed. """ # IDs used by pytest to get meaningful verbose messages when running the tests @@ -442,13 +439,36 @@ def get_init_args(metaestimator_info, sub_estimator_consumes): ) -def set_requests(estimator, methods, metadata_name): - """Call `set_fit_request` on a list of methods from the sub-estimator.""" - for method in methods: - set_request_for_method = getattr(estimator, f"set_{method}_request") - set_request_for_method(**{metadata_name: True}) - if is_classifier(estimator) and method == "partial_fit": - set_request_for_method(classes=True) +def set_requests(estimator, *, method_mapping, methods, metadata_name, value=True): + """Call `set_{method}_request` on a list of methods from the sub-estimator. + + Parameters + ---------- + estimator : BaseEstimator + The estimator for which `set_{method}_request` methods are called. + + method_mapping : dict + The method mapping in the form of `{caller: [callee, ...]}`. + If a "caller" is not present in the method mapping, a one-to-one mapping is + assumed. + + methods : list of str + The list of methods as "caller"s for which the request for the child should + be set. + + metadata_name : str + The name of the metadata to be routed, usually either `"metadata"` or + `"sample_weight"` in our tests. 
+ + value : None, bool, or str + The request value to be set, by default it's `True` + """ + for caller in methods: + for callee in method_mapping.get(caller, [caller]): + set_request_for_method = getattr(estimator, f"set_{callee}_request") + set_request_for_method(**{metadata_name: value}) + if is_classifier(estimator) and callee == "partial_fit": + set_request_for_method(classes=True) @pytest.mark.parametrize("estimator", UNSUPPORTED_ESTIMATORS) @@ -531,13 +551,26 @@ def test_error_on_missing_requests_for_sub_estimator(metaestimator): method = getattr(instance, method_name) if "fit" not in method_name: # set request on fit - set_requests(estimator, methods=["fit"], metadata_name=key) - # make sure error message corresponding to `method_name` - # is used for test - if method_name != "score": - set_requests(estimator, methods=["score"], metadata_name=key) + set_requests( + estimator, + method_mapping=metaestimator.get("method_mapping", {}), + methods=["fit"], + metadata_name=key, + ) instance.fit(X, y, **method_kwargs) try: + # making sure the requests are unset, in case they were set as a + # side effect of setting them for fit. For instance, if method + # mapping for fit is: `"fit": ["fit", "score"]`, that would mean + # calling `.score` here would not raise, because we have already + # set request value for child estimator's `score`. + set_requests( + estimator, + method_mapping=metaestimator.get("method_mapping", {}), + methods=["fit"], + metadata_name=key, + value=None, + ) # `fit` and `partial_fit` accept y, others don't. method(X, y, **method_kwargs) except TypeError: @@ -557,7 +590,7 @@ def test_setting_request_on_sub_estimator_removes_error(metaestimator): X = metaestimator["X"] y = metaestimator["y"] routing_methods = metaestimator["estimator_routing_methods"] - requests_set_together = metaestimator.get("requests_set_together", {}) + method_mapping = metaestimator.get("method_mapping", {}) preserves_metadata = metaestimator.get("preserves_metadata", True) for method_name in routing_methods: @@ -569,16 +602,19 @@ def test_setting_request_on_sub_estimator_removes_error(metaestimator): metaestimator, sub_estimator_consumes=True ) if scorer: - set_requests(scorer, methods=["score"], metadata_name=key) + set_requests( + scorer, method_mapping={}, methods=["score"], metadata_name=key + ) if cv: cv.set_split_request(groups=True, metadata=True) # `set_{method}_request({metadata}==True)` on the underlying objects - set_requests(estimator, methods=[method_name], metadata_name=key) - if requests_set_together: - set_requests( - estimator, methods=requests_set_together["fit"], metadata_name=key - ) + set_requests( + estimator, + method_mapping=method_mapping, + methods=[method_name], + metadata_name=key, + ) instance = cls(**kwargs) method = getattr(instance, method_name) @@ -587,13 +623,12 @@ def test_setting_request_on_sub_estimator_removes_error(metaestimator): ) if "fit" not in method_name: # fit before calling method - set_requests(estimator, methods=["fit"], metadata_name=key) - if requests_set_together: - set_requests( - estimator, - methods=requests_set_together["fit"], - metadata_name=key, - ) + set_requests( + estimator, + method_mapping=metaestimator.get("method_mapping", {}), + methods=["fit"], + metadata_name=key, + ) instance.fit(X, y, **method_kwargs, **extra_method_args) try: # `fit` and `partial_fit` accept y, others don't. 
From 508cc8560d93511a80a1e4e98befb7e23d0370fe Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Thu, 22 Feb 2024 14:35:15 +0100 Subject: [PATCH 29/34] without re-raising error message after review --- sklearn/linear_model/_ransac.py | 15 ++---------- .../test_metaestimators_metadata_routing.py | 24 +++++++++---------- 2 files changed, 14 insertions(+), 25 deletions(-) diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index de62d01dbe149..0811b2b3b0a21 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -15,7 +15,7 @@ _fit_context, clone, ) -from ..exceptions import ConvergenceWarning, UnsetMetadataPassedError +from ..exceptions import ConvergenceWarning from ..utils import check_consistent_length, check_random_state from ..utils._bunch import Bunch from ..utils._param_validation import ( @@ -437,18 +437,7 @@ def fit(self, X, y, *, sample_weight=None, **fit_params): fit_params["sample_weight"] = sample_weight if _routing_enabled(): - try: - routed_params = process_routing(self, "fit", **fit_params) - except UnsetMetadataPassedError as e: - raise UnsetMetadataPassedError( - message=( - f"{e}, which is used internally by `RANSACRegressor.fit()`. " - f"Call `{estimator.__class__.__name__}.set_{{method}}_request(" - "{metadata}=True)` for each metadata." - ), - unrequested_params=e.unrequested_params, - routed_params=e.routed_params, - ) from e + routed_params = process_routing(self, "fit", **fit_params) else: routed_params = Bunch() routed_params.estimator = Bunch(fit={}, predict={}, score={}) diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index f0e6489a8698a..6618b86e3be83 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -558,19 +558,19 @@ def test_error_on_missing_requests_for_sub_estimator(metaestimator): metadata_name=key, ) instance.fit(X, y, **method_kwargs) + # making sure the requests are unset, in case they were set as a + # side effect of setting them for fit. For instance, if method + # mapping for fit is: `"fit": ["fit", "score"]`, that would mean + # calling `.score` here would not raise, because we have already + # set request value for child estimator's `score`. + set_requests( + estimator, + method_mapping=metaestimator.get("method_mapping", {}), + methods=["fit"], + metadata_name=key, + value=None, + ) try: - # making sure the requests are unset, in case they were set as a - # side effect of setting them for fit. For instance, if method - # mapping for fit is: `"fit": ["fit", "score"]`, that would mean - # calling `.score` here would not raise, because we have already - # set request value for child estimator's `score`. - set_requests( - estimator, - method_mapping=metaestimator.get("method_mapping", {}), - methods=["fit"], - metadata_name=key, - value=None, - ) # `fit` and `partial_fit` accept y, others don't. 
method(X, y, **method_kwargs) except TypeError: From 9d1e31059d6fd5fd41d1067a9dac1cff7178cda6 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Thu, 22 Feb 2024 15:02:10 +0100 Subject: [PATCH 30/34] fixed test for Bagging* --- sklearn/tests/test_metaestimators_metadata_routing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index 68cea66ac5112..74cceeed4f4bf 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -312,7 +312,7 @@ def enable_slep006(): { "metaestimator": BaggingClassifier, "estimator_name": "estimator", - "estimator": ConsumingClassifier, + "estimator": "classifier", "X": X, "y": y, "preserves_metadata": False, @@ -321,7 +321,7 @@ def enable_slep006(): { "metaestimator": BaggingRegressor, "estimator_name": "estimator", - "estimator": ConsumingRegressor, + "estimator": "regressor", "X": X, "y": y, "preserves_metadata": False, From 2b1c4e87d3473fb3fda8eb740f0eff7803398995 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Fri, 23 Feb 2024 15:03:49 +0100 Subject: [PATCH 31/34] shorter variable name --- sklearn/linear_model/_ransac.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 0811b2b3b0a21..b6bf7b082fc5e 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -592,13 +592,11 @@ def fit(self, X, y, *, sample_weight=None, **fit_params): ) # estimate final model using all inliers - fit_params_cut_to_best_idxs_subset = _check_method_params( + fit_params_best_idxs_subset = _check_method_params( X, params=routed_params.estimator.fit, indices=inlier_best_idxs_subset ) - estimator.fit( - X_inlier_best, y_inlier_best, **fit_params_cut_to_best_idxs_subset - ) + estimator.fit(X_inlier_best, y_inlier_best, **fit_params_best_idxs_subset) self.estimator_ = estimator self.inlier_mask_ = inlier_mask_best From cc7a054eff001384a185767902d7b340c86ae040 Mon Sep 17 00:00:00 2001 From: Stefanie Senger <91849487+StefanieSenger@users.noreply.github.com> Date: Mon, 26 Feb 2024 12:08:20 +0100 Subject: [PATCH 32/34] Update sklearn/tests/metadata_routing_common.py Co-authored-by: Omar Salman --- sklearn/tests/metadata_routing_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/metadata_routing_common.py b/sklearn/tests/metadata_routing_common.py index f7eb7852cd1b5..9c9d12b2f1f91 100644 --- a/sklearn/tests/metadata_routing_common.py +++ b/sklearn/tests/metadata_routing_common.py @@ -281,8 +281,8 @@ def decision_function(self, X, sample_weight="default", metadata="default"): ) return np.zeros(shape=(len(X),)) - # def score(self, X, y, sample_weight="default", metadata="default"): # uncomment when needed + # def score(self, X, y, sample_weight="default", metadata="default"): # record_metadata_not_default( # self, "score", sample_weight=sample_weight, metadata=metadata # ) From 175c496132336296b3ac6467a5013b46a2a5d256 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Mon, 26 Feb 2024 17:32:41 +0500 Subject: [PATCH 33/34] Update sklearn/tests/test_metaestimators_metadata_routing.py Co-authored-by: Adrin Jalali --- sklearn/tests/test_metaestimators_metadata_routing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py 
b/sklearn/tests/test_metaestimators_metadata_routing.py index a9ebb873ce757..afb6bf16de43c 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -423,7 +423,7 @@ def get_init_args(metaestimator_info, sub_estimator_consumes): elif sub_estimator_type == "classifier": estimator = ConsumingClassifier(estimator_registry) else: - raise ValueError("Unpermitted `sub_estimator_type`.") + raise ValueError("Unpermitted `sub_estimator_type`.") # pragma: nocover else: if sub_estimator_type == "regressor": estimator = NonConsumingRegressor() From 3af94552c99557e376d55fe6d70f29eb44834727 Mon Sep 17 00:00:00 2001 From: Omar Salman Date: Mon, 26 Feb 2024 17:32:50 +0500 Subject: [PATCH 34/34] Update sklearn/tests/test_metaestimators_metadata_routing.py Co-authored-by: Adrin Jalali --- sklearn/tests/test_metaestimators_metadata_routing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index afb6bf16de43c..bc0d4a649b0b7 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -430,7 +430,7 @@ def get_init_args(metaestimator_info, sub_estimator_consumes): elif sub_estimator_type == "classifier": estimator = NonConsumingClassifier() else: - raise ValueError("Unpermitted `sub_estimator_type`.") + raise ValueError("Unpermitted `sub_estimator_type`.") # pragma: nocover kwargs[estimator_name] = estimator if "scorer_name" in metaestimator_info: scorer_name = metaestimator_info["scorer_name"]
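For reference, a minimal usage sketch of the behaviour this series enables once merged. It is illustrative only: it assumes the merged API keeps the `fit` -> `fit`/`score` method mapping declared for the sub-estimator in these patches, and the choice of `LinearRegression` as sub-estimator, the toy data, and the request values are arbitrary, not taken from the PR.

    import numpy as np
    import sklearn
    from sklearn.linear_model import LinearRegression, RANSACRegressor

    # Metadata routing is opt-in; without this flag the legacy sample_weight
    # handling in RANSACRegressor.fit is used instead.
    sklearn.set_config(enable_metadata_routing=True)

    rng = np.random.RandomState(0)
    X = rng.normal(size=(100, 2))
    y = X @ np.array([1.0, 2.0]) + 3.0
    sample_weight = rng.uniform(0.5, 1.5, size=len(X))

    # RANSACRegressor.fit routes metadata to both `fit` and `score` of the
    # sub-estimator, so both requests have to be set; otherwise an
    # UnsetMetadataPassedError is raised when routing is enabled.
    estimator = (
        LinearRegression()
        .set_fit_request(sample_weight=True)
        .set_score_request(sample_weight=True)
    )

    ransac = RANSACRegressor(estimator=estimator, min_samples=0.5, random_state=0)
    ransac.fit(X, y, sample_weight=sample_weight)

Inside `fit`, the routed parameters are sliced with `_check_method_params` (first to the random `min_samples` subset, then to the inlier indices, and finally to the best inlier subset), which keeps sample-aligned metadata such as `sample_weight` consistent with the data each sub-estimator call sees.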