scikit-learn · adrinjalali · Jan 9, 2020 · Jan 10, 2020 · Jan 10, 2020 · Jan 13, 2020
diff --git a/doc/metadata_routing.rst b/doc/metadata_routing.rst
@@ -0,0 +1,187 @@
+
+.. _metadata_routing:
+
+Metadata Routing
+================
+
+This guide demonstrates how metadata such as ``sample_weight`` can be routed
+and passed along to estimators, scorers, and CV splitters through
+meta-estimators such as ``Pipeline`` and ``GridSearchCV``. In order to pass
+metadata to a method such as ``fit`` or ``score``, the object accepting the
+metadata must *request* it. For estimators and splitters this is done via
+``*_requests`` methods, e.g. ``fit_requests(...)``, and for scorers this is
+done via the ``score_requests`` method. For grouped splitters such as
+``GroupKFold`` a ``groups`` parameter is requested by default. This is best
+demonstrated by the following examples.
+
+Usage Examples
+**************
+Here we present a few examples to show different common use cases. The examples
+in this section require the following imports and data::
+
+  >>> import numpy as np
+  >>> from sklearn.metrics import make_scorer, accuracy_score
+  >>> from sklearn.linear_model import LogisticRegressionCV
+  >>> from sklearn.linear_model import LogisticRegression
+  >>> from sklearn.model_selection import cross_validate
+  >>> from sklearn.model_selection import GridSearchCV
+  >>> from sklearn.model_selection import GroupKFold
+  >>> from sklearn.feature_selection import SelectKBest
+  >>> from sklearn.pipeline import make_pipeline
+  >>> n_samples, n_features = 100, 4
+  >>> X = np.random.rand(n_samples, n_features)
+  >>> y = np.random.randint(0, 2, size=n_samples)
+  >>> my_groups = np.random.randint(0, 10, size=n_samples)
+  >>> my_weights = np.random.rand(n_samples)
+  >>> my_other_weights = np.random.rand(n_samples)
+
+Weighted scoring and fitting
+----------------------------
+
+Here ``GroupKFold`` requests ``groups`` by default. However, we need to
+explicitly request weights in ``make_scorer`` and for ``LogisticRegressionCV``.
+Both of these *consumers* understand the meaning of the key
+``"sample_weight"``::
+
+  >>> weighted_acc = make_scorer(accuracy_score).score_requests(
+  ...     sample_weight=True
+  ... )
+  >>> lr = LogisticRegressionCV(
+  ...     cv=GroupKFold(), scoring=weighted_acc,
+  ... ).fit_requests(sample_weight=True)
+  >>> cv_results = cross_validate(
+  ...     lr,
+  ...     X,
+  ...     y,
+  ...     cv=GroupKFold(),
+  ...     props={"sample_weight": my_weights, "groups": my_groups},
+  ...     scoring=weighted_acc,
+  ... )
+
+Error handling: if ``props={'sample_weigh': my_weights, ...}`` were passed
+(note the typo), ``cross_validate`` would raise an error, since
+``'sample_weigh'`` was not requested by any of its children.
+
+Weighted scoring and unweighted fitting
+---------------------------------------
+
+Since ``LogisticRegressionCV``, like all scikit-learn estimators, requires that
+weights explicitly be requested, we need to explicitly say that
+``sample_weight`` is not used for it, so that ``cross_validate`` doesn't pass
+it along::
+
+  >>> weighted_acc = make_scorer(accuracy_score).score_requests(
+  ...     sample_weight=True
+  ... )
+  >>> lr = LogisticRegressionCV(
+  ...     cv=GroupKFold(), scoring=weighted_acc,
+  ... ).fit_requests(sample_weight=False)
+  >>> cv_results = cross_validate(
+  ...     lr,
+  ...     X,
+  ...     y,
+  ...     cv=GroupKFold(),
+  ...     props={"sample_weight": my_weights, "groups": my_groups},
+  ...     scoring=weighted_acc,
+  ... )
+
+Unweighted feature selection
+----------------------------
+
+Unlike ``LogisticRegressionCV``, ``SelectKBest`` doesn't accept weights and
+therefore ``"sample_weight"`` is not routed to it::
+
+  >>> weighted_acc = make_scorer(accuracy_score).score_requests(
+  ...     sample_weight=True
+  ... )
+  >>> lr = LogisticRegressionCV(
+  ...     cv=GroupKFold(), scoring=weighted_acc,
+  ... ).fit_requests(sample_weight=True)
+  >>> sel = SelectKBest(k=2)
+  >>> pipe = make_pipeline(sel, lr)
+  >>> cv_results = cross_validate(
+  ...     pipe,
+  ...     X,
+  ...     y,
+  ...     cv=GroupKFold(),
+  ...     props={"sample_weight": my_weights, "groups": my_groups},
+  ...     scoring=weighted_acc,
+  ... )
+
+Different scoring and fitting weights
+-------------------------------------
+
+Despite ``make_scorer`` and ``LogisticRegressionCV`` both expecting a key
+``sample_weight``, we can use aliases to pass different weights to different
+consumers. In this example, we pass ``scoring_weight`` to the scorer, and
+``fitting_weight`` to ``LogisticRegressionCV``::
+
+  >>> weighted_acc = make_scorer(accuracy_score).score_requests(
+  ...    sample_weight="scoring_weight"
+  ... )
+  >>> lr = LogisticRegressionCV(
+  ...     cv=GroupKFold(), scoring=weighted_acc,
+  ... ).fit_requests(sample_weight="fitting_weight")
+  >>> cv_results = cross_validate(
+  ...     lr,
+  ...     X,
+  ...     y,
+  ...     cv=GroupKFold(),
+  ...     props={
+  ...         "scoring_weight": my_weights,
+  ...         "fitting_weight": my_other_weights,
+  ...         "groups": my_groups,
+  ...     },
+  ...     scoring=weighted_acc,
+  ... )
+
+API Interface
+*************
+
+A *consumer* is an object (estimator, meta-estimator, scorer, splitter) which
+accepts and uses some metadata in at least one of its methods (``fit``,
+``predict``, ``inverse_transform``, ``transform``, ``score``, ``split``).
+Meta-estimators which only forward the metadata to other objects (the child
+estimator, scorers, or splitters) and don't use the metadata themselves are not
+consumers. (Meta)Estimators which route metadata to other objects are routers.
+A (meta)estimator can be a consumer and a router at the same time.
+(Meta)Estimators and splitters expose a ``*_requests`` method for each method
+which accepts at least one metadata. For instance, if an estimator supports
+``sample_weight`` in ``fit`` and ``score``, it exposes
+``estimator.fit_requests(sample_weight=value)`` and
+``estimator.score_requests(sample_weight=value)``. Here ``value`` can be:
+
+- ``RequestType.REQUESTED`` or ``True``: method requests a ``sample_weight``.
+  This means if the metadata is provided, it will be used, otherwise no error
+  is raised.
+- ``RequestType.UNREQUESTED`` or ``False``: method does not request a
+  ``sample_weight``.
+- ``RequestType.ERROR_IF_PASSED`` or ``None``: router will raise an error if
+  ``sample_weight`` is passed. This is in almost all cases the default value
+  when an object is instantiated and ensures the user sets the metadata
+  requests explicitly when a metadata is passed.
+- ``"param_name"``: if this estimator is used in a meta-estimator, the
+  meta-estimator should forward ``"param_name"`` as ``sample_weight`` to this
+  estimator. This means the mapping between the metadata required by the
+  object, e.g. ``sample_weight``, and what is provided by the user, e.g.
+  ``my_weights``, is done at the router level, and not by the object, e.g.
+  estimator, itself.
+
+For the scorers, this is done the same way, using the ``score_requests`` method.
+
+If a metadata, e.g. ``sample_weight``, is passed by the user, the metadata
+request for all objects which potentially can accept ``sample_weight`` should
+be set by the user, otherwise an error is raised by the router object. For
+example, the following code would raise, since it hasn't been explicitly set
+whether ``sample_weight`` should be passed to the estimator's scorer or not::
+
+    >>> param_grid = {"C": [0.1, 1]}
+    >>> lr = LogisticRegression().fit_requests(sample_weight=True)
+    >>> try:
+    ...     GridSearchCV(
+    ...         estimator=lr, param_grid=param_grid
+    ...     ).fit(X, y, sample_weight=my_weights)
+    ... except ValueError as e:
+    ...     print(e)
+    sample_weight is passed but is not explicitly set as requested or not. In
+    method: score
diff --git a/doc/user_guide.rst b/doc/user_guide.rst
@@ -30,3 +30,4 @@ User Guide
    computing.rst
    modules/model_persistence.rst
    common_pitfalls.rst
+   metadata_routing.rst
diff --git a/sklearn/base.py b/sklearn/base.py
@@ -25,6 +25,7 @@
 from .utils.validation import _num_features
 from .utils.validation import _check_feature_names_in
 from .utils._estimator_html_repr import estimator_html_repr
+from .utils.metadata_requests import _MetadataRequester
 from .utils.validation import _get_feature_names
 
 
@@ -79,7 +80,13 @@ def clone(estimator, *, safe=True):
     new_object_params = estimator.get_params(deep=False)
     for name, param in new_object_params.items():
         new_object_params[name] = clone(param, safe=False)
+
     new_object = klass(**new_object_params)
+    try:
+        new_object._metadata_request = copy.deepcopy(estimator._metadata_request)
+    except AttributeError:
+        pass
+
     params_set = new_object.get_params(deep=False)
 
     # quick sanity check of the parameters of the clone
@@ -144,7 +151,7 @@ def _pprint(params, offset=0, printer=repr):
     return lines
 
 
-class BaseEstimator:
+class BaseEstimator(_MetadataRequester):
     """Base class for all estimators in scikit-learn.
 
     Notes

diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py
@@ -307,3 +307,19 @@ def n_features_in_(self):
             ) from nfe
 
         return self.regressor_.n_features_in_
+
+    def get_metadata_request(self):
+        """Get requested data properties.
+
+        This method mirrors the given regressor's metadata request.
+
+        .. versionadded:: 1.1
+
+        Returns
+        -------
+        request : dict
+            A dict of dict of str->value. The key to the first dict is the name
+            of the method, and the key to the second dict is the name of the
+            argument requested by the method.
+        """
+        return self.regressor.get_metadata_request()
diff --git a/sklearn/compose/tests/test_target.py b/sklearn/compose/tests/test_target.py
@@ -346,6 +346,8 @@ def test_transform_target_regressor_count_fit(check_inverse):
 
 
 class DummyRegressorWithExtraFitParams(DummyRegressor):
+    _metadata_request__check_input = {"fit": "check_input"}
+
     def fit(self, X, y, sample_weight=None, check_input=True):
         # on the test below we force this to false, we make sure this is
         # actually passed to the regressor
@@ -356,7 +358,10 @@ def fit(self, X, y, sample_weight=None, check_input=True):
 def test_transform_target_regressor_pass_fit_parameters():
     X, y = friedman
     regr = TransformedTargetRegressor(
-        regressor=DummyRegressorWithExtraFitParams(), transformer=DummyTransformer()
+        regressor=DummyRegressorWithExtraFitParams().fit_requests(
+            sample_weight=True, check_input=True
+        ),
+        transformer=DummyTransformer(),
     )
 
     regr.fit(X, y, check_input=False)
@@ -367,12 +372,13 @@ def test_transform_target_regressor_route_pipeline():
     X, y = friedman
 
     regr = TransformedTargetRegressor(
-        regressor=DummyRegressorWithExtraFitParams(), transformer=DummyTransformer()
+        regressor=DummyRegressorWithExtraFitParams().fit_requests(check_input=True),
+        transformer=DummyTransformer(),
     )
     estimators = [("normalize", StandardScaler()), ("est", regr)]
 
     pip = Pipeline(estimators)
-    pip.fit(X, y, **{"est__check_input": False})
+    pip.fit(X, y, **{"check_input": False})
 
     assert regr.transformer_.fit_counter == 1