Thanks to visit codestin.com
Credit goes to github.com

Skip to content
13 changes: 13 additions & 0 deletions doc/whats_new/v1.4.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ Changes impacting all modules

:pr:`28235` by :user:`Julien Jerphanion <jjerphan>`.

- |Fix| Fixes a bug for all scikit-learn transformers when using `set_output` with
`transform` set to `pandas` or `polars`. The bug could lead to wrong naming of the
columns of the returned dataframe.
:pr:`28262` by :user:`Guillaume Lemaitre <glemaitre>`.

Changelog
---------

Expand All @@ -68,6 +73,14 @@ Changelog
- |Enhancement| Pandas and Polars dataframes are validated directly without ducktyping
checks. :pr:`28195` by `Thomas Fan`_.

:mod:`sklearn.compose`
......................

- |Fix| :class:`compose.ColumnTransformer` now transforms into a polars dataframe when
`verbose_feature_names_out=True` and the transformers internally used the same columns
several times. Previously, it would raise an error due to duplicated column names.
:pr:`28262` by :user:`Guillaume Lemaitre <glemaitre>`.

:mod:`sklearn.preprocessing`
............................

Expand Down
98 changes: 71 additions & 27 deletions sklearn/compose/_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -941,7 +941,7 @@ def fit_transform(self, X, y=None, **params):
self._validate_output(Xs)
self._record_output_indices(Xs)

return self._hstack(list(Xs))
return self._hstack(list(Xs), n_samples=n_samples)

def transform(self, X, **params):
"""Transform X separately by each transformer, concatenate results.
Expand Down Expand Up @@ -1024,9 +1024,9 @@ def transform(self, X, **params):
# All transformers are None
return np.zeros((n_samples, 0))

return self._hstack(list(Xs))
return self._hstack(list(Xs), n_samples=n_samples)

def _hstack(self, Xs):
def _hstack(self, Xs, *, n_samples):
"""Stacks Xs horizontally.

This allows subclasses to control the stacking behavior, while reusing
Expand All @@ -1035,6 +1035,10 @@ def _hstack(self, Xs):
Parameters
----------
Xs : list of {array-like, sparse matrix, dataframe}
The container to concatenate.
n_samples : int
The number of samples in the input data, used to check the consistency of
the transformation.
"""
if self.sparse_output_:
try:
Expand All @@ -1056,24 +1060,8 @@ def _hstack(self, Xs):
Xs = [f.toarray() if sparse.issparse(f) else f for f in Xs]
adapter = _get_container_adapter("transform", self)
if adapter and all(adapter.is_supported_container(X) for X in Xs):
output = adapter.hstack(Xs)

output_samples = output.shape[0]
if any(_num_samples(X) != output_samples for X in Xs):
raise ValueError(
"Concatenating DataFrames from the transformer's output lead to"
" an inconsistent number of samples. The output may have Pandas"
" Indexes that do not match."
)

# If all transformers define `get_feature_names_out`, then transform
# will adjust the column names to be consistent with
# verbose_feature_names_out. Here we prefix the feature names if
# verbose_feature_names_out=True.

if not self.verbose_feature_names_out:
return output

# rename before stacking to avoid errors caused by temporarily duplicated
# columns
transformer_names = [
t[0]
for t in self._iter(
Expand All @@ -1083,13 +1071,69 @@ def _hstack(self, Xs):
skip_empty_columns=True,
)
]
# Selection of columns might be empty.
# Hence feature names are filtered for non-emptiness.
feature_names_outs = [X.columns for X in Xs if X.shape[1] != 0]
names_out = self._add_prefix_for_feature_names_out(
list(zip(transformer_names, feature_names_outs))
)
return adapter.rename_columns(output, names_out)
if self.verbose_feature_names_out:
# `_add_prefix_for_feature_names_out` takes care of raising
# an error if there are duplicated columns.
feature_names_outs = self._add_prefix_for_feature_names_out(
list(zip(transformer_names, feature_names_outs))
)
else:
# check for duplicated columns and raise if any
feature_names_outs = list(chain.from_iterable(feature_names_outs))
feature_names_count = Counter(feature_names_outs)
if any(count > 1 for count in feature_names_count.values()):
duplicated_feature_names = sorted(
name
for name, count in feature_names_count.items()
if count > 1
)
err_msg = (
"Duplicated feature names found before concatenating the"
" outputs of the transformers:"
f" {duplicated_feature_names}.\n"
)
for transformer_name, X in zip(transformer_names, Xs):
if X.shape[1] == 0:
continue
dup_cols_in_transformer = sorted(
set(X.columns).intersection(duplicated_feature_names)
)
if len(dup_cols_in_transformer):
err_msg += (
f"Transformer {transformer_name} has conflicting "
f"columns names: {dup_cols_in_transformer}.\n"
)
raise ValueError(
err_msg
+ "Either make sure that the transformers named above "
"do not generate columns with conflicting names or set "
"verbose_feature_names_out=True to automatically "
"prefix to the output feature names with the name "
"of the transformer to prevent any conflicting "
"names."
)

names_idx = 0
for X in Xs:
if X.shape[1] == 0:
continue
names_out = feature_names_outs[names_idx : names_idx + X.shape[1]]
adapter.rename_columns(X, names_out)
names_idx += X.shape[1]
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I find this code ugly but I don't see how to improve it without reshaping boilerplate

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_num_features and _num_samples instead of .shape?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here we are dealing with dataframe-like containers, so they will implement `shape`.


output = adapter.hstack(Xs)
output_samples = output.shape[0]
if output_samples != n_samples:
raise ValueError(
"Concatenating DataFrames from the transformer's output lead to"
" an inconsistent number of samples. The output may have Pandas"
" Indexes that do not match, or that transformers are returning"
" number of samples which are not the same as the number input"
" samples."
)

return output

return np.hstack(Xs)

Expand Down
63 changes: 63 additions & 0 deletions sklearn/compose/tests/test_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2354,6 +2354,69 @@ def test_column_transformer__getitem__():
ct["does_not_exist"]


@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
def test_column_transformer_column_renaming(dataframe_lib):
    """Verify prefixed output names when transformers share input columns.

    Several transformers of a `ColumnTransformer` select overlapping columns
    and one of them selects no column at all. With
    `verbose_feature_names_out=True`, the columns of the returned dataframe
    are expected to be prefixed with the name of their transformer.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/28260
    """
    backend = pytest.importorskip(dataframe_lib)
    data = backend.DataFrame(
        {"x1": [1, 2, 3], "x2": [10, 20, 30], "x3": [100, 200, 300]}
    )

    steps = [
        ("A", "passthrough", ["x1", "x2", "x3"]),
        ("B", FunctionTransformer(), ["x1", "x2"]),
        ("C", StandardScaler(), ["x1", "x3"]),
        # special case of empty transformer
        ("D", FunctionTransformer(lambda x: x[[]]), ["x1", "x2", "x3"]),
    ]
    ct = ColumnTransformer(transformers=steps, verbose_feature_names_out=True)
    ct = ct.set_output(transform=dataframe_lib)

    expected_columns = [
        "A__x1",
        "A__x2",
        "A__x3",
        "B__x1",
        "B__x2",
        "C__x1",
        "C__x3",
    ]
    observed_columns = list(ct.fit_transform(data).columns)
    assert observed_columns == expected_columns


@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
def test_column_transformer_error_with_duplicated_columns(dataframe_lib):
    """Verify that clashing output column names raise a `ValueError`.

    Same setup as the renaming test, but with
    `verbose_feature_names_out=False`: the transformers select overlapping
    columns, and the test expects an error message listing the duplicated
    names and the transformers involved.
    """
    backend = pytest.importorskip(dataframe_lib)
    data = backend.DataFrame(
        {"x1": [1, 2, 3], "x2": [10, 20, 30], "x3": [100, 200, 300]}
    )

    steps = [
        ("A", "passthrough", ["x1", "x2", "x3"]),
        ("B", FunctionTransformer(), ["x1", "x2"]),
        ("C", StandardScaler(), ["x1", "x3"]),
        # special case of empty transformer
        ("D", FunctionTransformer(lambda x: x[[]]), ["x1", "x2", "x3"]),
    ]
    ct = ColumnTransformer(transformers=steps, verbose_feature_names_out=False)
    ct = ct.set_output(transform=dataframe_lib)

    # The message is escaped because it contains regex metacharacters
    # (brackets, dots) and `match` treats its argument as a pattern.
    expected_message = (
        "Duplicated feature names found before concatenating the outputs of the "
        "transformers: ['x1', 'x2', 'x3'].\n"
        "Transformer A has conflicting columns names: ['x1', 'x2', 'x3'].\n"
        "Transformer B has conflicting columns names: ['x1', 'x2'].\n"
        "Transformer C has conflicting columns names: ['x1', 'x3'].\n"
    )
    pattern = re.escape(expected_message)
    with pytest.raises(ValueError, match=pattern):
        ct.fit_transform(data)


# Metadata Routing Tests
# ======================

Expand Down
10 changes: 8 additions & 2 deletions sklearn/utils/_set_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,10 @@ def is_supported_container(self, X):
return isinstance(X, pd.DataFrame)

def rename_columns(self, X, columns):
return X.rename(columns=dict(zip(X.columns, columns)))
# we cannot use `rename` since it takes a dictionary and at this stage we have
# potentially duplicate column names in `X`
X.columns = columns
return X

def hstack(self, Xs):
pd = check_library_installed("pandas")
Expand Down Expand Up @@ -151,7 +154,10 @@ def is_supported_container(self, X):
return isinstance(X, pl.DataFrame)

def rename_columns(self, X, columns):
return X.rename(dict(zip(X.columns, columns)))
# we cannot use `rename` since it takes a dictionary and at this stage we have
# potentially duplicate column names in `X`
X.columns = columns
return X

def hstack(self, Xs):
pl = check_library_installed("polars")
Expand Down
8 changes: 8 additions & 0 deletions sklearn/utils/tests/test_set_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,14 @@ def test_pandas_adapter():
)
pd.testing.assert_frame_equal(X_stacked, expected_df)

# check that we properly update the columns even with duplicate column names
# this use-case can happen when using ColumnTransformer
# non-regression test for gh-28260
X_df = pd.DataFrame([[1, 2], [1, 3]], columns=["a", "a"])
new_columns = np.array(["x__a", "y__a"], dtype=object)
new_df = adapter.rename_columns(X_df, new_columns)
assert_array_equal(new_df.columns, new_columns)


def test_polars_adapter():
"""Check Polars adapter has expected behavior."""
Expand Down