From 9255261a3f5653ae8f65b8d8890921b8db789009 Mon Sep 17 00:00:00 2001 From: nicolas-bolle Date: Tue, 25 Mar 2025 22:06:11 -0400 Subject: [PATCH 1/5] pass through pd.Series index --- sklearn/utils/_set_output.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/_set_output.py b/sklearn/utils/_set_output.py index 6980902594663..0bf472eb627a1 100644 --- a/sklearn/utils/_set_output.py +++ b/sklearn/utils/_set_output.py @@ -125,7 +125,7 @@ def create_container(self, X_output, X_original, columns, inplace=True): # because `list` exposes an `index` attribute. if isinstance(X_output, pd.DataFrame): index = X_output.index - elif isinstance(X_original, pd.DataFrame): + elif isinstance(X_original, (pd.DataFrame, pd.Series)): index = X_original.index else: index = None From 14b55ef2b10fa0d80933d9515dff20fd93dd1b30 Mon Sep 17 00:00:00 2001 From: nicolas-bolle Date: Tue, 25 Mar 2025 22:21:56 -0400 Subject: [PATCH 2/5] use non-default index, add series index unit test --- sklearn/utils/tests/test_set_output.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_set_output.py b/sklearn/utils/tests/test_set_output.py index 2b756ada64a6d..146f0a6c28592 100644 --- a/sklearn/utils/tests/test_set_output.py +++ b/sklearn/utils/tests/test_set_output.py @@ -25,8 +25,9 @@ def test_pandas_adapter(): pd = pytest.importorskip("pandas") X_np = np.asarray([[1, 0, 3], [0, 0, 1]]) columns = np.asarray(["f0", "f1", "f2"], dtype=object) - index = np.asarray([0, 1]) + index = np.asarray([1, 2]) X_df_orig = pd.DataFrame([[1, 2], [1, 3]], index=index) + X_ser_orig = pd.Series([2, 3], index=index) adapter = ADAPTERS_MANAGER.adapters["pandas"] X_container = adapter.create_container(X_np, X_df_orig, columns=lambda: columns) @@ -34,6 +35,12 @@ def test_pandas_adapter(): assert_array_equal(X_container.columns, columns) assert_array_equal(X_container.index, index) + # use original index when the original is a series + X_container 
= adapter.create_container(X_np, X_ser_orig, columns=lambda: columns) + assert isinstance(X_container, pd.DataFrame) + assert_array_equal(X_container.columns, columns) + assert_array_equal(X_container.index, index) + # Input dataframe's index does not change new_columns = np.asarray(["f0", "f1"], dtype=object) X_df = pd.DataFrame([[1, 2], [1, 3]], index=[10, 12]) From 25add4d55fa3584c6dff63450226b1037635902e Mon Sep 17 00:00:00 2001 From: nicolas-bolle Date: Wed, 9 Jul 2025 18:48:33 -0400 Subject: [PATCH 3/5] add pandas adapter column transformer unit test --- sklearn/utils/tests/test_set_output.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_set_output.py b/sklearn/utils/tests/test_set_output.py index 146f0a6c28592..3eace40906b9c 100644 --- a/sklearn/utils/tests/test_set_output.py +++ b/sklearn/utils/tests/test_set_output.py @@ -6,7 +6,9 @@ from numpy.testing import assert_array_equal from sklearn._config import config_context, get_config -from sklearn.preprocessing import StandardScaler +from sklearn.compose import make_column_transformer +from sklearn.feature_extraction import DictVectorizer +from sklearn.preprocessing import FunctionTransformer, StandardScaler from sklearn.utils._set_output import ( ADAPTERS_MANAGER, ContainerAdapterProtocol, @@ -90,6 +92,27 @@ def test_pandas_adapter(): assert list(X_output.columns) == ["a", "b"] +def test_pandas_adapter_with_column_transformer(): + """Check index handling when both pd.Series and + pd.DataFrame slices are used in ColumnTransformer. 
+ """ + pd = pytest.importorskip("pandas") + df = pd.DataFrame( + { + "dict_col": [{"foo": 1, "bar": 2}, {"foo": 3, "baz": 1}], + "dummy_col": [1, 2], + }, + index=[1, 2], + ) + t = make_column_transformer( + (DictVectorizer(sparse=False), "dict_col"), + (FunctionTransformer(), ["dummy_col"]), + ) + t.set_output(transform="pandas") + X = t.fit_transform(df) + assert list(X.index) == [1, 2] + + def test_polars_adapter(): """Check Polars adapter has expected behavior.""" pl = pytest.importorskip("polars") From 2b03d29dd4aa1664301a9aafef72468d22828324 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= Date: Thu, 10 Jul 2025 16:12:25 +0200 Subject: [PATCH 4/5] move test into column transformer tests + changelog --- .../sklearn.compose/31079.fix.rst | 3 +++ .../compose/tests/test_column_transformer.py | 24 ++++++++++++++++++ sklearn/utils/tests/test_set_output.py | 25 +------------------ 3 files changed, 28 insertions(+), 24 deletions(-) create mode 100644 doc/whats_new/upcoming_changes/sklearn.compose/31079.fix.rst diff --git a/doc/whats_new/upcoming_changes/sklearn.compose/31079.fix.rst b/doc/whats_new/upcoming_changes/sklearn.compose/31079.fix.rst new file mode 100644 index 0000000000000..8b556a7cdd110 --- /dev/null +++ b/doc/whats_new/upcoming_changes/sklearn.compose/31079.fix.rst @@ -0,0 +1,3 @@ +- |Fix| :class:`compose.ColumnTransformer` now correctly preserves non-default index + when mixing pandas Series and DataFrames. + By :user:`Nicolas Bolle <nicolas-bolle>`. 
diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index a458d44c53fb4..4fac38defcaa7 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -20,6 +20,7 @@ make_column_transformer, ) from sklearn.exceptions import NotFittedError +from sklearn.feature_extraction import DictVectorizer from sklearn.feature_selection import VarianceThreshold from sklearn.preprocessing import ( FunctionTransformer, @@ -2619,6 +2620,29 @@ def test_column_transformer_auto_memmap(global_random_seed): assert_allclose(Xt, StandardScaler().fit_transform(X[:, [0]])) +def test_column_transformer_non_default_index(): + """Check index handling when both pd.Series and pd.DataFrame slices are used in + ColumnTransformer. + + Non-regression test for issue #31546. + """ + pd = pytest.importorskip("pandas") + df = pd.DataFrame( + { + "dict_col": [{"foo": 1, "bar": 2}, {"foo": 3, "baz": 1}], + "dummy_col": [1, 2], + }, + index=[1, 2], + ) + t = make_column_transformer( + (DictVectorizer(sparse=False), "dict_col"), + (FunctionTransformer(), ["dummy_col"]), + ) + t.set_output(transform="pandas") + X = t.fit_transform(df) + assert list(X.index) == [1, 2] + + # Metadata Routing Tests # ====================== diff --git a/sklearn/utils/tests/test_set_output.py b/sklearn/utils/tests/test_set_output.py index 3eace40906b9c..146f0a6c28592 100644 --- a/sklearn/utils/tests/test_set_output.py +++ b/sklearn/utils/tests/test_set_output.py @@ -6,9 +6,7 @@ from numpy.testing import assert_array_equal from sklearn._config import config_context, get_config -from sklearn.compose import make_column_transformer -from sklearn.feature_extraction import DictVectorizer -from sklearn.preprocessing import FunctionTransformer, StandardScaler +from sklearn.preprocessing import StandardScaler from sklearn.utils._set_output import ( ADAPTERS_MANAGER, ContainerAdapterProtocol, @@ -92,27 +90,6 @@ def 
test_pandas_adapter(): assert list(X_output.columns) == ["a", "b"] -def test_pandas_adapter_with_column_transformer(): - """Check index handling when both pd.Series and - pd.DataFrame slices are used in ColumnTransformer. - """ - pd = pytest.importorskip("pandas") - df = pd.DataFrame( - { - "dict_col": [{"foo": 1, "bar": 2}, {"foo": 3, "baz": 1}], - "dummy_col": [1, 2], - }, - index=[1, 2], - ) - t = make_column_transformer( - (DictVectorizer(sparse=False), "dict_col"), - (FunctionTransformer(), ["dummy_col"]), - ) - t.set_output(transform="pandas") - X = t.fit_transform(df) - assert list(X.index) == [1, 2] - - def test_polars_adapter(): """Check Polars adapter has expected behavior.""" pl = pytest.importorskip("polars") From c62372b538ab04ef845ae79c61ae9a6539872e19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= Date: Fri, 11 Jul 2025 11:07:20 +0200 Subject: [PATCH 5/5] Update doc/whats_new/upcoming_changes/sklearn.compose/31079.fix.rst --- doc/whats_new/upcoming_changes/sklearn.compose/31079.fix.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/upcoming_changes/sklearn.compose/31079.fix.rst b/doc/whats_new/upcoming_changes/sklearn.compose/31079.fix.rst index 8b556a7cdd110..b7ecaf67292b9 100644 --- a/doc/whats_new/upcoming_changes/sklearn.compose/31079.fix.rst +++ b/doc/whats_new/upcoming_changes/sklearn.compose/31079.fix.rst @@ -1,3 +1,3 @@ -- |Fix| :class:`compose.ColumnTransformer` now correctly preserves non-default index +- :class:`compose.ColumnTransformer` now correctly preserves non-default index when mixing pandas Series and DataFrames. By :user:`Nicolas Bolle <nicolas-bolle>`.