scikit-learn · adrinjalali · Jan 31, 2024 · Jan 25, 2024 · Jan 25, 2024 · Jan 26, 2024
diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst
@@ -44,6 +44,11 @@ Changes impacting all modules
 
   :pr:`28235` by :user:`Julien Jerphanion <jjerphan>`.
 
+- |Fix| Fixes a bug for all scikit-learn transformers when using `set_output` with
+  `transform` set to `pandas` or `polars`. The bug could lead to wrong naming of the
+  columns of the returned dataframe.
+  :pr:`28262` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 Changelog
 ---------
 
@@ -68,6 +73,14 @@ Changelog
 - |Enhancement| Pandas and Polars dataframe are validated directly without ducktyping
   checks. :pr:`28195` by `Thomas Fan`_.
 
+:mod:`sklearn.compose`
+......................
+
+- |Fix| :class:`compose.ColumnTransformer` now transforms into a polars dataframe when
+  `verbose_feature_names_out=True` and the transformers internally use the same columns
+  several times. Previously, it would raise an error due to duplicated column names.
+  :pr:`28262` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 :mod:`sklearn.preprocessing`
 ............................
 

diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
@@ -941,7 +941,7 @@ def fit_transform(self, X, y=None, **params):
         self._validate_output(Xs)
         self._record_output_indices(Xs)
 
-        return self._hstack(list(Xs))
+        return self._hstack(list(Xs), n_samples=n_samples)
 
     def transform(self, X, **params):
         """Transform X separately by each transformer, concatenate results.
@@ -1024,9 +1024,9 @@ def transform(self, X, **params):
             # All transformers are None
             return np.zeros((n_samples, 0))
 
-        return self._hstack(list(Xs))
+        return self._hstack(list(Xs), n_samples=n_samples)
 
-    def _hstack(self, Xs):
+    def _hstack(self, Xs, *, n_samples):
         """Stacks Xs horizontally.
 
         This allows subclasses to control the stacking behavior, while reusing
@@ -1035,6 +1035,10 @@ def _hstack(self, Xs):
         Parameters
         ----------
         Xs : list of {array-like, sparse matrix, dataframe}
+            The container to concatenate.
+        n_samples : int
+            The number of samples in the input data, used to check the consistency
+            of the transformation.
         """
         if self.sparse_output_:
             try:
@@ -1056,24 +1060,8 @@ def _hstack(self, Xs):
             Xs = [f.toarray() if sparse.issparse(f) else f for f in Xs]
             adapter = _get_container_adapter("transform", self)
             if adapter and all(adapter.is_supported_container(X) for X in Xs):
-                output = adapter.hstack(Xs)
-
-                output_samples = output.shape[0]
-                if any(_num_samples(X) != output_samples for X in Xs):
-                    raise ValueError(
-                        "Concatenating DataFrames from the transformer's output lead to"
-                        " an inconsistent number of samples. The output may have Pandas"
-                        " Indexes that do not match."
-                    )
-
-                # If all transformers define `get_feature_names_out`, then transform
-                # will adjust the column names to be consistent with
-                # verbose_feature_names_out. Here we prefix the feature names if
-                # verbose_feature_names_out=True.
-
-                if not self.verbose_feature_names_out:
-                    return output
-
+                # rename before stacking to avoid erroring on temporarily duplicated
+                # columns
                 transformer_names = [
                     t[0]
                     for t in self._iter(
@@ -1083,13 +1071,69 @@ def _hstack(self, Xs):
                         skip_empty_columns=True,
                     )
                 ]
-                # Selection of columns might be empty.
-                # Hence feature names are filtered for non-emptiness.
                 feature_names_outs = [X.columns for X in Xs if X.shape[1] != 0]
-                names_out = self._add_prefix_for_feature_names_out(
-                    list(zip(transformer_names, feature_names_outs))
-                )
-                return adapter.rename_columns(output, names_out)
+                if self.verbose_feature_names_out:
+                    # `_add_prefix_for_feature_names_out` takes care of raising
+                    # an error if there are duplicated columns.
+                    feature_names_outs = self._add_prefix_for_feature_names_out(
+                        list(zip(transformer_names, feature_names_outs))
+                    )
+                else:
+                    # check for duplicated columns and raise if any
+                    feature_names_outs = list(chain.from_iterable(feature_names_outs))
+                    feature_names_count = Counter(feature_names_outs)
+                    if any(count > 1 for count in feature_names_count.values()):
+                        duplicated_feature_names = sorted(
+                            name
+                            for name, count in feature_names_count.items()
+                            if count > 1
+                        )
+                        err_msg = (
+                            "Duplicated feature names found before concatenating the"
+                            " outputs of the transformers:"
+                            f" {duplicated_feature_names}.\n"
+                        )
+                        for transformer_name, X in zip(transformer_names, Xs):
+                            if X.shape[1] == 0:
+                                continue
+                            dup_cols_in_transformer = sorted(
+                                set(X.columns).intersection(duplicated_feature_names)
+                            )
+                            if len(dup_cols_in_transformer):
+                                err_msg += (
+                                    f"Transformer {transformer_name} has conflicting "
+                                    f"columns names: {dup_cols_in_transformer}.\n"
+                                )
+                        raise ValueError(
+                            err_msg
+                            + "Either make sure that the transformers named above "
+                            "do not generate columns with conflicting names or set "
+                            "verbose_feature_names_out=True to automatically "
+                            "prefix to the output feature names with the name "
+                            "of the transformer to prevent any conflicting "
+                            "names."
+                        )
+
+                names_idx = 0
+                for X in Xs:
+                    if X.shape[1] == 0:
+                        continue
+                    names_out = feature_names_outs[names_idx : names_idx + X.shape[1]]
+                    adapter.rename_columns(X, names_out)
+                    names_idx += X.shape[1]
+
+                output = adapter.hstack(Xs)
+                output_samples = output.shape[0]
+                if output_samples != n_samples:
+                    raise ValueError(
+                        "Concatenating DataFrames from the transformer's output lead to"
+                        " an inconsistent number of samples. The output may have Pandas"
+                        " Indexes that do not match, or that transformers are returning"
+                        " number of samples which are not the same as the number input"
+                        " samples."
+                    )
+
+                return output
 
             return np.hstack(Xs)
 

diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py
@@ -2354,6 +2354,69 @@ def test_column_transformer__getitem__():
         ct["does_not_exist"]
 
 
+@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
+def test_column_transformer_column_renaming(dataframe_lib):
+    """Check that we properly rename columns when using `ColumnTransformer` and
+    selected columns are redundant between transformers.
+
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/28260
+    """
+    lib = pytest.importorskip(dataframe_lib)
+
+    df = lib.DataFrame({"x1": [1, 2, 3], "x2": [10, 20, 30], "x3": [100, 200, 300]})
+
+    transformer = ColumnTransformer(
+        transformers=[
+            ("A", "passthrough", ["x1", "x2", "x3"]),
+            ("B", FunctionTransformer(), ["x1", "x2"]),
+            ("C", StandardScaler(), ["x1", "x3"]),
+            # special case of empty transformer
+            ("D", FunctionTransformer(lambda x: x[[]]), ["x1", "x2", "x3"]),
+        ],
+        verbose_feature_names_out=True,
+    ).set_output(transform=dataframe_lib)
+    df_trans = transformer.fit_transform(df)
+    assert list(df_trans.columns) == [
+        "A__x1",
+        "A__x2",
+        "A__x3",
+        "B__x1",
+        "B__x2",
+        "C__x1",
+        "C__x3",
+    ]
+
+
+@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
+def test_column_transformer_error_with_duplicated_columns(dataframe_lib):
+    """Check that we raise an error when using `ColumnTransformer` and
+    the column names are duplicated between transformers."""
+    lib = pytest.importorskip(dataframe_lib)
+
+    df = lib.DataFrame({"x1": [1, 2, 3], "x2": [10, 20, 30], "x3": [100, 200, 300]})
+
+    transformer = ColumnTransformer(
+        transformers=[
+            ("A", "passthrough", ["x1", "x2", "x3"]),
+            ("B", FunctionTransformer(), ["x1", "x2"]),
+            ("C", StandardScaler(), ["x1", "x3"]),
+            # special case of empty transformer
+            ("D", FunctionTransformer(lambda x: x[[]]), ["x1", "x2", "x3"]),
+        ],
+        verbose_feature_names_out=False,
+    ).set_output(transform=dataframe_lib)
+    err_msg = re.escape(
+        "Duplicated feature names found before concatenating the outputs of the "
+        "transformers: ['x1', 'x2', 'x3'].\n"
+        "Transformer A has conflicting columns names: ['x1', 'x2', 'x3'].\n"
+        "Transformer B has conflicting columns names: ['x1', 'x2'].\n"
+        "Transformer C has conflicting columns names: ['x1', 'x3'].\n"
+    )
+    with pytest.raises(ValueError, match=err_msg):
+        transformer.fit_transform(df)
+
+
 # Metadata Routing Tests
 # ======================
 

diff --git a/sklearn/utils/_set_output.py b/sklearn/utils/_set_output.py
@@ -122,7 +122,10 @@ def is_supported_container(self, X):
         return isinstance(X, pd.DataFrame)
 
     def rename_columns(self, X, columns):
-        return X.rename(columns=dict(zip(X.columns, columns)))
+        # we cannot use `rename` since it takes a dictionary and at this stage we have
+        # potentially duplicate column names in `X`
+        X.columns = columns
+        return X
 
     def hstack(self, Xs):
         pd = check_library_installed("pandas")
@@ -151,7 +154,10 @@ def is_supported_container(self, X):
         return isinstance(X, pl.DataFrame)
 
     def rename_columns(self, X, columns):
-        return X.rename(dict(zip(X.columns, columns)))
+        # we cannot use `rename` since it takes a dictionary and at this stage we have
+        # potentially duplicate column names in `X`
+        X.columns = columns
+        return X
 
     def hstack(self, Xs):
         pl = check_library_installed("polars")

diff --git a/sklearn/utils/tests/test_set_output.py b/sklearn/utils/tests/test_set_output.py
@@ -58,6 +58,14 @@ def test_pandas_adapter():
     )
     pd.testing.assert_frame_equal(X_stacked, expected_df)
 
+    # check that we properly update the columns even with duplicate column names
+    # this use-case can potentially happen when using ColumnTransformer
+    # non-regression test for gh-28260
+    X_df = pd.DataFrame([[1, 2], [1, 3]], columns=["a", "a"])
+    new_columns = np.array(["x__a", "y__a"], dtype=object)
+    new_df = adapter.rename_columns(X_df, new_columns)
+    assert_array_equal(new_df.columns, new_columns)
+
 
 def test_polars_adapter():
     """Check Polars adapter has expected behavior."""