Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit b1ce4c1

Browse files
StefanieSenger, adrinjalali, glemaitre
authored
ENH Add metadata routing for FeatureUnion (#28205)
Co-authored-by: Adrin Jalali <[email protected]>
Co-authored-by: Guillaume Lemaitre <[email protected]>
1 parent e5b0488 commit b1ce4c1

File tree

6 files changed

+261
-32
lines changed

6 files changed

+261
-32
lines changed

doc/metadata_routing.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,7 @@ Meta-estimators and functions supporting metadata routing:
305305
- :class:`sklearn.multioutput.MultiOutputRegressor`
306306
- :class:`sklearn.linear_model.OrthogonalMatchingPursuitCV`
307307
- :class:`sklearn.multioutput.RegressorChain`
308+
- :class:`sklearn.pipeline.FeatureUnion`
308309
- :class:`sklearn.pipeline.Pipeline`
309310

310311
Meta-estimators and tools not supporting metadata routing yet:
@@ -323,5 +324,4 @@ Meta-estimators and tools not supporting metadata routing yet:
323324
- :class:`sklearn.model_selection.learning_curve`
324325
- :class:`sklearn.model_selection.permutation_test_score`
325326
- :class:`sklearn.model_selection.validation_curve`
326-
- :class:`sklearn.pipeline.FeatureUnion`
327327
- :class:`sklearn.semi_supervised.SelfTrainingClassifier`

doc/whats_new/v1.5.rst

+6
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,12 @@ more details.
7171
``**fit_params`` to the underlying estimators via their `fit` methods.
7272
:pr:`27584` by :user:`Stefanie Senger <StefanieSenger>`.
7373

74+
- |Feature| :class:`pipeline.FeatureUnion` now supports metadata routing in its
75+
``fit`` and ``fit_transform`` methods and routes metadata to the underlying
76+
transformers' ``fit`` and ``fit_transform``. :pr:`28205` by :user:`Stefanie
77+
Senger <StefanieSenger>`.
78+
79+
7480
Changelog
7581
---------
7682

sklearn/pipeline.py

+103-21
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,7 @@
3131
MetadataRouter,
3232
MethodMapping,
3333
_raise_for_params,
34-
_raise_for_unsupported_routing,
3534
_routing_enabled,
36-
_RoutingNotSupportedMixin,
3735
process_routing,
3836
)
3937
from .utils.metaestimators import _BaseComposition, available_if
@@ -1319,7 +1317,7 @@ def _fit_one(transformer, X, y, weight, message_clsname="", message=None, params
13191317
return transformer.fit(X, y, **params["fit"])
13201318

13211319

1322-
class FeatureUnion(_RoutingNotSupportedMixin, TransformerMixin, _BaseComposition):
1320+
class FeatureUnion(TransformerMixin, _BaseComposition):
13231321
"""Concatenates results of multiple transformer objects.
13241322
13251323
This estimator applies a list of transformer objects in parallel to the
@@ -1644,23 +1642,42 @@ def fit(self, X, y=None, **fit_params):
16441642
Targets for supervised learning.
16451643
16461644
**fit_params : dict, default=None
1647-
Parameters to pass to the fit method of the estimator.
1645+
- If `enable_metadata_routing=False` (default):
1646+
Parameters directly passed to the `fit` methods of the
1647+
sub-transformers.
1648+
1649+
- If `enable_metadata_routing=True`:
1650+
Parameters safely routed to the `fit` methods of the
1651+
sub-transformers. See :ref:`Metadata Routing User Guide
1652+
<metadata_routing>` for more details.
1653+
1654+
.. versionchanged:: 1.5
1655+
`**fit_params` can be routed via metadata routing API.
16481656
16491657
Returns
16501658
-------
16511659
self : object
16521660
FeatureUnion class instance.
16531661
"""
1654-
_raise_for_unsupported_routing(self, "fit", **fit_params)
1655-
transformers = self._parallel_func(X, y, fit_params, _fit_one)
1662+
if _routing_enabled():
1663+
routed_params = process_routing(self, "fit", **fit_params)
1664+
else:
1665+
# TODO(SLEP6): remove when metadata routing cannot be disabled.
1666+
routed_params = Bunch()
1667+
for name, _ in self.transformer_list:
1668+
routed_params[name] = Bunch(fit={})
1669+
routed_params[name].fit = fit_params
1670+
1671+
transformers = self._parallel_func(X, y, _fit_one, routed_params)
1672+
16561673
if not transformers:
16571674
# All transformers are None
16581675
return self
16591676

16601677
self._update_transformer_list(transformers)
16611678
return self
16621679

1663-
def fit_transform(self, X, y=None, **fit_params):
1680+
def fit_transform(self, X, y=None, **params):
16641681
"""Fit all transformers, transform the data and concatenate results.
16651682
16661683
Parameters
@@ -1671,8 +1688,18 @@ def fit_transform(self, X, y=None, **fit_params):
16711688
y : array-like of shape (n_samples, n_outputs), default=None
16721689
Targets for supervised learning.
16731690
1674-
**fit_params : dict, default=None
1675-
Parameters to pass to the fit method of the estimator.
1691+
**params : dict, default=None
1692+
- If `enable_metadata_routing=False` (default):
1693+
Parameters directly passed to the `fit` methods of the
1694+
sub-transformers.
1695+
1696+
- If `enable_metadata_routing=True`:
1697+
Parameters safely routed to the `fit` methods of the
1698+
sub-transformers. See :ref:`Metadata Routing User Guide
1699+
<metadata_routing>` for more details.
1700+
1701+
.. versionchanged:: 1.5
1702+
`**params` can now be routed via metadata routing API.
16761703
16771704
Returns
16781705
-------
@@ -1681,7 +1708,21 @@ def fit_transform(self, X, y=None, **fit_params):
16811708
The `hstack` of results of transformers. `sum_n_components` is the
16821709
sum of `n_components` (output dimension) over transformers.
16831710
"""
1684-
results = self._parallel_func(X, y, fit_params, _fit_transform_one)
1711+
if _routing_enabled():
1712+
routed_params = process_routing(self, "fit_transform", **params)
1713+
else:
1714+
# TODO(SLEP6): remove when metadata routing cannot be disabled.
1715+
routed_params = Bunch()
1716+
for name, obj in self.transformer_list:
1717+
if hasattr(obj, "fit_transform"):
1718+
routed_params[name] = Bunch(fit_transform={})
1719+
routed_params[name].fit_transform = params
1720+
else:
1721+
routed_params[name] = Bunch(fit={}, transform={})
1722+
1723+
routed_params[name].fit = params
1724+
1725+
results = self._parallel_func(X, y, _fit_transform_one, routed_params)
16851726
if not results:
16861727
# All transformers are None
16871728
return np.zeros((X.shape[0], 0))
@@ -1696,15 +1737,13 @@ def _log_message(self, name, idx, total):
16961737
return None
16971738
return "(step %d of %d) Processing %s" % (idx, total, name)
16981739

1699-
def _parallel_func(self, X, y, fit_params, func):
1740+
def _parallel_func(self, X, y, func, routed_params):
17001741
"""Runs func in parallel on X and y"""
17011742
self.transformer_list = list(self.transformer_list)
17021743
self._validate_transformers()
17031744
self._validate_transformer_weights()
17041745
transformers = list(self._iter())
17051746

1706-
params = Bunch(fit=fit_params, fit_transform=fit_params)
1707-
17081747
return Parallel(n_jobs=self.n_jobs)(
17091748
delayed(func)(
17101749
transformer,
@@ -1713,31 +1752,45 @@ def _parallel_func(self, X, y, fit_params, func):
17131752
weight,
17141753
message_clsname="FeatureUnion",
17151754
message=self._log_message(name, idx, len(transformers)),
1716-
params=params,
1755+
params=routed_params[name],
17171756
)
17181757
for idx, (name, transformer, weight) in enumerate(transformers, 1)
17191758
)
17201759

1721-
def transform(self, X):
1760+
def transform(self, X, **params):
17221761
"""Transform X separately by each transformer, concatenate results.
17231762
17241763
Parameters
17251764
----------
17261765
X : iterable or array-like, depending on transformers
17271766
Input data to be transformed.
17281767
1768+
**params : dict, default=None
1769+
1770+
Parameters routed to the `transform` method of the sub-transformers via the
1771+
metadata routing API. See :ref:`Metadata Routing User Guide
1772+
<metadata_routing>` for more details.
1773+
1774+
.. versionadded:: 1.5
1775+
17291776
Returns
17301777
-------
1731-
X_t : array-like or sparse matrix of \
1732-
shape (n_samples, sum_n_components)
1778+
X_t : array-like or sparse matrix of shape (n_samples, sum_n_components)
17331779
The `hstack` of results of transformers. `sum_n_components` is the
17341780
sum of `n_components` (output dimension) over transformers.
17351781
"""
1736-
# TODO(SLEP6): accept **params here in `transform` and route it to the
1737-
# underlying estimators.
1738-
params = Bunch(transform={})
1782+
_raise_for_params(params, self, "transform")
1783+
1784+
if _routing_enabled():
1785+
routed_params = process_routing(self, "transform", **params)
1786+
else:
1787+
# TODO(SLEP6): remove when metadata routing cannot be disabled.
1788+
routed_params = Bunch()
1789+
for name, _ in self.transformer_list:
1790+
routed_params[name] = Bunch(transform={})
1791+
17391792
Xs = Parallel(n_jobs=self.n_jobs)(
1740-
delayed(_transform_one)(trans, X, None, weight, params)
1793+
delayed(_transform_one)(trans, X, None, weight, routed_params[name])
17411794
for name, trans, weight in self._iter()
17421795
)
17431796
if not Xs:
@@ -1793,6 +1846,35 @@ def __getitem__(self, name):
17931846
raise KeyError("Only string keys are supported")
17941847
return self.named_transformers[name]
17951848

1849+
def get_metadata_routing(self):
1850+
"""Get metadata routing of this object.
1851+
1852+
Please check :ref:`User Guide <metadata_routing>` on how the routing
1853+
mechanism works.
1854+
1855+
.. versionadded:: 1.5
1856+
1857+
Returns
1858+
-------
1859+
routing : MetadataRouter
1860+
A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating
1861+
routing information.
1862+
"""
1863+
router = MetadataRouter(owner=self.__class__.__name__)
1864+
1865+
for name, transformer in self.transformer_list:
1866+
router.add(
1867+
**{name: transformer},
1868+
method_mapping=MethodMapping()
1869+
.add(caller="fit", callee="fit")
1870+
.add(caller="fit_transform", callee="fit_transform")
1871+
.add(caller="fit_transform", callee="fit")
1872+
.add(caller="fit_transform", callee="transform")
1873+
.add(caller="transform", callee="transform"),
1874+
)
1875+
1876+
return router
1877+
17961878

17971879
def make_union(*transformers, n_jobs=None, verbose=False):
17981880
"""Construct a :class:`FeatureUnion` from the given transformers.

sklearn/tests/metadata_routing_common.py

+26-2
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,9 @@ def check_recorded_metadata(obj, method, split_params=tuple(), **kwargs):
5454
sub-estimator's method where metadata is routed to
5555
split_params : tuple, default=empty
5656
specifies any parameters which are to be checked as being a subset
57-
of the original values.
58-
**kwargs : metadata to check
57+
of the original values
58+
**kwargs : dict
59+
passed metadata
5960
"""
6061
records = getattr(obj, "_records", dict()).get(method, dict())
6162
assert set(kwargs.keys()) == set(
@@ -338,6 +339,29 @@ def inverse_transform(self, X, sample_weight=None, metadata=None):
338339
return X
339340

340341

342+
class ConsumingNoFitTransformTransformer(BaseEstimator):
343+
"""A metadata consuming transformer that doesn't inherit from
344+
TransformerMixin, and thus doesn't implement `fit_transform`. Note that
345+
TransformerMixin's `fit_transform` doesn't route metadata to `transform`."""
346+
347+
def __init__(self, registry=None):
348+
self.registry = registry
349+
350+
def fit(self, X, y=None, sample_weight=None, metadata=None):
351+
if self.registry is not None:
352+
self.registry.append(self)
353+
354+
record_metadata(self, "fit", sample_weight=sample_weight, metadata=metadata)
355+
356+
return self
357+
358+
def transform(self, X, sample_weight=None, metadata=None):
359+
record_metadata(
360+
self, "transform", sample_weight=sample_weight, metadata=metadata
361+
)
362+
return X
363+
364+
341365
class ConsumingScorer(_Scorer):
342366
def __init__(self, registry=None):
343367
super().__init__(

sklearn/tests/test_metaestimators_metadata_routing.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,6 @@
5959
MultiOutputRegressor,
6060
RegressorChain,
6161
)
62-
from sklearn.pipeline import FeatureUnion
6362
from sklearn.semi_supervised import SelfTrainingClassifier
6463
from sklearn.tests.metadata_routing_common import (
6564
ConsumingClassifier,
@@ -362,7 +361,7 @@ def enable_slep006():
362361
363362
The keys are as follows:
364363
365-
- metaestimator: The metaestmator to be tested
364+
- metaestimator: The metaestimator to be tested
366365
- estimator_name: The name of the argument for the sub-estimator
367366
- estimator: The sub-estimator type, either "regressor" or "classifier"
368367
- init_args: The arguments to be passed to the metaestimator's constructor
@@ -398,7 +397,6 @@ def enable_slep006():
398397
UNSUPPORTED_ESTIMATORS = [
399398
AdaBoostClassifier(),
400399
AdaBoostRegressor(),
401-
FeatureUnion([]),
402400
GraphicalLassoCV(),
403401
RFE(ConsumingClassifier()),
404402
RFECV(ConsumingClassifier()),

0 commit comments

Comments
 (0)