From 159aec7889b38a273ead29f1485c53e162882bd1 Mon Sep 17 00:00:00 2001 From: Phillipp Gnan Date: Mon, 15 Sep 2025 10:24:16 +0200 Subject: [PATCH 1/7] FIX ColumnTransformer.fit_transform for polars.DataFrame missing a .size attribute in sparse stacking --- sklearn/compose/_column_transformer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 58570b9676078..37629abcb43e4 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -1014,10 +1014,10 @@ def fit_transform(self, X, y=None, **params): # determine if concatenated output will be sparse or not if any(sparse.issparse(X) for X in Xs): - nnz = sum(X.nnz if sparse.issparse(X) else X.size for X in Xs) - total = sum( - X.shape[0] * X.shape[1] if sparse.issparse(X) else X.size for X in Xs + nnz = sum( + X.nnz if sparse.issparse(X) else X.shape[0] * X.shape[1] for X in Xs ) + total = sum(X.shape[0] * X.shape[1] for X in Xs) density = nnz / total self.sparse_output_ = density < self.sparse_threshold else: From af2879fa672286c2a4e6bfe4578d9aed48328355 Mon Sep 17 00:00:00 2001 From: Phillipp Gnan Date: Mon, 15 Sep 2025 10:26:02 +0200 Subject: [PATCH 2/7] added tests for sparse stacking in ColumnTransformer.fit_transform for polars/pandas DataFrame --- .../compose/tests/test_column_transformer.py | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 0ba240cf5df11..d01fc06ceff63 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -11,6 +11,7 @@ import pytest from numpy.testing import assert_allclose from scipy import sparse +import itertools from sklearn import config_context from sklearn.base import BaseEstimator, TransformerMixin @@ -512,15 +513,25 @@ def test_column_transformer_list(): assert_array_equal(ct.fit(X_list).transform(X_list), expected_result) -@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) -def test_column_transformer_sparse_stacking(csr_container): - X_array = np.array([[0, 1, 2], [2, 4, 6]]).T +@pytest.mark.parametrize( + "csr_container, constructor_name", + itertools.product(CSR_CONTAINERS, ["numpy", "polars", "pandas"]) +) +def test_column_transformer_sparse_stacking(csr_container, constructor_name): + X = np.array([[0, 1, 2], [2, 4, 6]]).T + + if constructor_name != "numpy": + pytest.importorskip(constructor_name) + X = _convert_container( + X, constructor_name, columns_name=["first", "second"] + ) + col_trans = ColumnTransformer( [("trans1", Trans(), [0]), ("trans2", SparseMatrixTrans(csr_container), 1)], sparse_threshold=0.8, ) - col_trans.fit(X_array) - X_trans = col_trans.transform(X_array) + col_trans.fit(X) + X_trans = col_trans.transform(X) assert sparse.issparse(X_trans) assert X_trans.shape == (X_trans.shape[0], X_trans.shape[0] + 1) assert_array_equal(X_trans.toarray()[:, 1:], np.eye(X_trans.shape[0])) @@ -531,8 +542,8 @@ def test_column_transformer_sparse_stacking(csr_container): [("trans1", Trans(), [0]), ("trans2", SparseMatrixTrans(csr_container), 1)], sparse_threshold=0.1, ) - col_trans.fit(X_array) - X_trans = col_trans.transform(X_array) + col_trans.fit(X) + X_trans = col_trans.transform(X) assert not sparse.issparse(X_trans) assert X_trans.shape == (X_trans.shape[0], X_trans.shape[0] + 1) assert_array_equal(X_trans[:, 1:], np.eye(X_trans.shape[0])) From c011e2011ae138f8352102eefb6ed8b50b1b2c1d Mon Sep 17 00:00:00 2001 From: Phillipp Gnan Date: Mon, 15 Sep 2025 10:27:18 +0200 Subject: [PATCH 3/7] updated _convert_container docstring to indicate that constructor_name="pandas" is a valid input --- sklearn/utils/_testing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index c373dbc66f6d6..c3a1b5d6b73b7 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -976,12 +976,12 @@ def _convert_container( container : array-like The container to convert. constructor_name : {"list", "tuple", "array", "sparse", "dataframe", \ - "series", "index", "slice", "sparse_csr", "sparse_csc", \ + "pandas", "series", "index", "slice", "sparse_csr", "sparse_csc", \ "sparse_csr_array", "sparse_csc_array", "pyarrow", "polars", \ "polars_series"} The type of the returned container. columns_name : index or array-like, default=None - For pandas container supporting `columns_names`, it will affect + For pandas/polars container supporting `columns_names`, it will affect specific names. dtype : dtype, default=None Force the dtype of the container. Does not apply to `"slice"` From 3347201a6a17a2230b5edda30ec129eca2be1977 Mon Sep 17 00:00:00 2001 From: Phillipp Gnan Date: Mon, 15 Sep 2025 14:08:35 +0200 Subject: [PATCH 4/7] ruff: fixed position of itertools import to conform with ruff --- sklearn/compose/tests/test_column_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index d01fc06ceff63..d53182880517b 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -2,6 +2,7 @@ Test the ColumnTransformer. """ +import itertools import pickle import re import warnings @@ -11,7 +12,6 @@ import pytest from numpy.testing import assert_allclose from scipy import sparse -import itertools from sklearn import config_context from sklearn.base import BaseEstimator, TransformerMixin From 5853805a10a9a9c9692b207865f9eb8a62cc1648 Mon Sep 17 00:00:00 2001 From: Phillipp Gnan Date: Mon, 15 Sep 2025 20:48:49 +0200 Subject: [PATCH 5/7] ruff formatting --- sklearn/compose/tests/test_column_transformer.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index d53182880517b..db27d06730629 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -514,17 +514,15 @@ def test_column_transformer_list(): @pytest.mark.parametrize( - "csr_container, constructor_name", - itertools.product(CSR_CONTAINERS, ["numpy", "polars", "pandas"]) + "csr_container, constructor_name", + itertools.product(CSR_CONTAINERS, ["numpy", "polars", "pandas"]), ) def test_column_transformer_sparse_stacking(csr_container, constructor_name): X = np.array([[0, 1, 2], [2, 4, 6]]).T if constructor_name != "numpy": pytest.importorskip(constructor_name) - X = _convert_container( - X, constructor_name, columns_name=["first", "second"] - ) + X = _convert_container(X, constructor_name, columns_name=["first", "second"]) col_trans = ColumnTransformer( [("trans1", Trans(), [0]), ("trans2", SparseMatrixTrans(csr_container), 1)], From 98079f11bc5dae4ed45e801d3fd3ccb2ad7e27d3 Mon Sep 17 00:00:00 2001 From: Phillipp Gnan Date: Fri, 26 Sep 2025 22:04:13 +0200 Subject: [PATCH 6/7] added changelog news fragment --- doc/whats_new/upcoming_changes/sklearn.compose/32188.fix.rst | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 doc/whats_new/upcoming_changes/sklearn.compose/32188.fix.rst diff --git a/doc/whats_new/upcoming_changes/sklearn.compose/32188.fix.rst b/doc/whats_new/upcoming_changes/sklearn.compose/32188.fix.rst new file mode 100644 index 0000000000000..f909b11b8b02d --- /dev/null +++ b/doc/whats_new/upcoming_changes/sklearn.compose/32188.fix.rst @@ -0,0 +1,5 @@ +- The :class:`compose.ColumnTransformer` now correctly fits on data + provided as a polars.DataFrame. Previously, this raised an + AttributeError: 'DataFrame' object has no attribute 'size'. + By :user:`Phillipp Gnan `. + \ No newline at end of file From 87b1ae9ebcef15ee6a7e9725abfc33757d2941f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= Date: Tue, 7 Oct 2025 13:58:32 +0200 Subject: [PATCH 7/7] nitpicks --- .../upcoming_changes/sklearn.compose/32188.fix.rst | 6 ++---- sklearn/compose/tests/test_column_transformer.py | 12 +++--------- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/doc/whats_new/upcoming_changes/sklearn.compose/32188.fix.rst b/doc/whats_new/upcoming_changes/sklearn.compose/32188.fix.rst index f909b11b8b02d..1bd73934a426c 100644 --- a/doc/whats_new/upcoming_changes/sklearn.compose/32188.fix.rst +++ b/doc/whats_new/upcoming_changes/sklearn.compose/32188.fix.rst @@ -1,5 +1,3 @@ -- The :class:`compose.ColumnTransformer` now correctly fits on data - provided as a polars.DataFrame. Previously, this raised an - AttributeError: 'DataFrame' object has no attribute 'size'. +- The :class:`compose.ColumnTransformer` now correctly fits on data provided as a + `polars.DataFrame` when any transformer has a sparse output. By :user:`Phillipp Gnan `. - \ No newline at end of file diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index db27d06730629..031414190f87e 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -2,7 +2,6 @@ Test the ColumnTransformer. """ -import itertools import pickle import re import warnings @@ -513,16 +512,11 @@ def test_column_transformer_list(): assert_array_equal(ct.fit(X_list).transform(X_list), expected_result) -@pytest.mark.parametrize( - "csr_container, constructor_name", - itertools.product(CSR_CONTAINERS, ["numpy", "polars", "pandas"]), -) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +@pytest.mark.parametrize("constructor_name", ["array", "pandas", "polars"]) def test_column_transformer_sparse_stacking(csr_container, constructor_name): X = np.array([[0, 1, 2], [2, 4, 6]]).T - - if constructor_name != "numpy": - pytest.importorskip(constructor_name) - X = _convert_container(X, constructor_name, columns_name=["first", "second"]) + X = _convert_container(X, constructor_name, columns_name=["first", "second"]) col_trans = ColumnTransformer( [("trans1", Trans(), [0]), ("trans2", SparseMatrixTrans(csr_container), 1)],