Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/whats_new/upcoming_changes/sklearn.compose/31079.fix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
- :class:`compose.ColumnTransformer` now correctly preserves a non-default index
when mixing pandas Series and DataFrames.
By :user:`Nicolas Bolle <nicolas-bolle>`.
24 changes: 24 additions & 0 deletions sklearn/compose/tests/test_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
make_column_transformer,
)
from sklearn.exceptions import NotFittedError
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import (
FunctionTransformer,
Expand Down Expand Up @@ -2619,6 +2620,29 @@ def test_column_transformer_auto_memmap(global_random_seed):
assert_allclose(Xt, StandardScaler().fit_transform(X[:, [0]]))


def test_column_transformer_non_default_index():
    """Verify that a non-default input index is kept in the pandas output.

    The ColumnTransformer is fed one column through a scalar selector and one
    through a list selector, so that both pd.Series and pd.DataFrame slices
    are handed to the sub-transformers.

    Non-regression test for issue #31546.
    """
    pd = pytest.importorskip("pandas")

    # Index deliberately starts at 1 so it differs from the default RangeIndex.
    frame = pd.DataFrame(
        data={
            "dict_col": [{"foo": 1, "bar": 2}, {"foo": 3, "baz": 1}],
            "dummy_col": [1, 2],
        },
        index=[1, 2],
    )

    dict_step = (DictVectorizer(sparse=False), "dict_col")
    passthrough_step = (FunctionTransformer(), ["dummy_col"])
    transformer = make_column_transformer(dict_step, passthrough_step)
    transformer.set_output(transform="pandas")

    transformed = transformer.fit_transform(frame)

    # The output rows must carry the original labels, not 0..n-1.
    assert list(transformed.index) == [1, 2]


# Metadata Routing Tests
# ======================

Expand Down
2 changes: 1 addition & 1 deletion sklearn/utils/_set_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def create_container(self, X_output, X_original, columns, inplace=True):
# because `list` exposes an `index` attribute.
if isinstance(X_output, pd.DataFrame):
index = X_output.index
elif isinstance(X_original, pd.DataFrame):
elif isinstance(X_original, (pd.DataFrame, pd.Series)):
index = X_original.index
else:
index = None
Expand Down
9 changes: 8 additions & 1 deletion sklearn/utils/tests/test_set_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,22 @@ def test_pandas_adapter():
pd = pytest.importorskip("pandas")
X_np = np.asarray([[1, 0, 3], [0, 0, 1]])
columns = np.asarray(["f0", "f1", "f2"], dtype=object)
index = np.asarray([0, 1])
index = np.asarray([1, 2])
X_df_orig = pd.DataFrame([[1, 2], [1, 3]], index=index)
X_ser_orig = pd.Series([2, 3], index=index)

adapter = ADAPTERS_MANAGER.adapters["pandas"]
X_container = adapter.create_container(X_np, X_df_orig, columns=lambda: columns)
assert isinstance(X_container, pd.DataFrame)
assert_array_equal(X_container.columns, columns)
assert_array_equal(X_container.index, index)

# use original index when the original is a series
X_container = adapter.create_container(X_np, X_ser_orig, columns=lambda: columns)
assert isinstance(X_container, pd.DataFrame)
assert_array_equal(X_container.columns, columns)
assert_array_equal(X_container.index, index)

# Input dataframe's index does not change
new_columns = np.asarray(["f0", "f1"], dtype=object)
X_df = pd.DataFrame([[1, 2], [1, 3]], index=[10, 12])
Expand Down