-
-
Notifications
You must be signed in to change notification settings - Fork 26k
ENH Introduces set_output API for pandas output #23734
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
e1ea0a9
07078a1
1faf347
9f9680a
a6a4b59
4ae72c5
beca084
021d36c
de0db34
63c4204
ee4cdff
9d318b1
609f4f0
64c761a
471e2d5
63c2011
89a854e
20fed9e
91e2448
d63f059
32d9252
126a9aa
0d02e50
fb0abaa
1c5c2ef
e4a663f
c8667b9
390e257
19b6032
0d2610a
531c9c7
321ede0
865edf5
110e50d
1c658ed
5ae531f
4c7fefa
4f8c2ac
50fd9c1
0f63fa2
c9fc072
09d2359
c59d800
128ee66
3477d51
f94870e
9cbb47c
cf0c916
94c4ff5
4e56880
980caf3
9888bdd
2b238aa
2db0dd4
77511b5
903ad04
4d7f594
26853ab
7f13efb
f64b2f5
96ae074
99f9497
2fc486d
fe87f71
cca5548
54964dd
3f56922
88e17ff
072b1a3
78f4a8b
598a94f
08e01d0
0009dc3
244c002
4b1f1e5
c33a307
51fc045
fad1fa5
c8bb076
d25ba8b
01771ae
1421e8a
24c3fc1
b87ad84
5313958
48add35
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
""" | ||
================================ | ||
Introducing the `set_output` API | ||
================================ | ||
|
||
.. currentmodule:: sklearn | ||
|
||
This example will demonstrate the `set_output` API to configure transformers to | ||
output pandas DataFrames. `set_output` can be configured per estimator by calling | ||
the `set_output` method or globally by setting `set_config(transform_output="pandas")`. | ||
For details, see | ||
`SLEP018 <https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep018/proposal.html>`__. | ||
""" # noqa | ||
|
||
# %% | ||
# First, we load the iris dataset as a DataFrame to demonstrate the `set_output` API. | ||
from sklearn.datasets import load_iris | ||
from sklearn.model_selection import train_test_split | ||
|
||
X, y = load_iris(as_frame=True, return_X_y=True) | ||
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0) | ||
X_train.head() | ||
|
||
# %% | ||
# To configure an estimator such as :class:`preprocessing.StandardScaler` to return | ||
# DataFrames, call `set_output`. This feature requires pandas to be installed. | ||
|
||
from sklearn.preprocessing import StandardScaler | ||
|
||
scaler = StandardScaler().set_output(transform="pandas") | ||
|
||
scaler.fit(X_train) | ||
X_test_scaled = scaler.transform(X_test) | ||
X_test_scaled.head() | ||
|
||
# %% | ||
# `set_output` can be called after `fit` to configure `transform` after the fact. | ||
scaler2 = StandardScaler() | ||
|
||
scaler2.fit(X_train) | ||
X_test_np = scaler2.transform(X_test) | ||
print(f"Default output type: {type(X_test_np).__name__}") | ||
|
||
scaler2.set_output(transform="pandas") | ||
X_test_df = scaler2.transform(X_test) | ||
print(f"Configured pandas output type: {type(X_test_df).__name__}") | ||
|
||
# %% | ||
# In a :class:`pipeline.Pipeline`, `set_output` configures all steps to output | ||
# DataFrames. | ||
from sklearn.pipeline import make_pipeline | ||
from sklearn.linear_model import LogisticRegression | ||
from sklearn.feature_selection import SelectPercentile | ||
|
||
clf = make_pipeline( | ||
StandardScaler(), SelectPercentile(percentile=75), LogisticRegression() | ||
) | ||
clf.set_output(transform="pandas") | ||
clf.fit(X_train, y_train) | ||
|
||
# %% | ||
# Each transformer in the pipeline is configured to return DataFrames. This | ||
# means that the final logistic regression step contains the feature names. | ||
clf[-1].feature_names_in_ | ||
|
||
# %% | ||
# Next we load the titanic dataset to demonstrate `set_output` with | ||
# :class:`compose.ColumnTransformer` and heterogeneous data. | ||
from sklearn.datasets import fetch_openml | ||
|
||
X, y = fetch_openml( | ||
"titanic", version=1, as_frame=True, return_X_y=True, parser="pandas" | ||
) | ||
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y) | ||
|
||
# %% | ||
# The `set_output` API can be configured globally by using :func:`set_config` and | ||
# setting the `transform_output` to `"pandas"`. | ||
from sklearn.compose import ColumnTransformer | ||
from sklearn.preprocessing import OneHotEncoder, StandardScaler | ||
from sklearn.impute import SimpleImputer | ||
from sklearn import set_config | ||
|
||
set_config(transform_output="pandas") | ||
|
||
num_pipe = make_pipeline(SimpleImputer(), StandardScaler()) | ||
ct = ColumnTransformer( | ||
( | ||
("numerical", num_pipe, ["age", "fare"]), | ||
( | ||
"categorical", | ||
OneHotEncoder( | ||
sparse_output=False, drop="if_binary", handle_unknown="ignore" | ||
), | ||
["embarked", "sex", "pclass"], | ||
), | ||
), | ||
verbose_feature_names_out=False, | ||
) | ||
clf = make_pipeline(ct, SelectPercentile(percentile=50), LogisticRegression()) | ||
clf.fit(X_train, y_train) | ||
clf.score(X_test, y_test) | ||
|
||
# %% | ||
# With the global configuration, all transformers output DataFrames. This allows us to | ||
# easily plot the logistic regression coefficients with the corresponding feature names. | ||
import pandas as pd | ||
|
||
log_reg = clf[-1] | ||
coef = pd.Series(log_reg.coef_.ravel(), index=log_reg.feature_names_in_) | ||
_ = coef.sort_values().plot.barh() |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,6 +14,7 @@ | |
), | ||
"enable_cython_pairwise_dist": True, | ||
"array_api_dispatch": False, | ||
"transform_output": "default", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we use a more speaking name than There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This option use to be I can get behind from scipy.sparse import csr_matrix
import numpy as np
mat = csr_matrix([[1, 2, 0]])
print(np.asarray(mat).dtype)
# object There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looking at our glossary entry for "array-like", we exclude "sparse matrix" from "array-like". In that case, "array-like" would not be a good default, because it does not cover sparse matrices. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you have a proposal. Naming the default "default" just seems wrong to me. What if we change it in the future? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The semantics for "default" is "the transformer does anything it wants". Here are some options:
I am in favor of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm fine with |
||
} | ||
_threadlocal = threading.local() | ||
|
||
|
@@ -52,6 +53,7 @@ def set_config( | |
pairwise_dist_chunk_size=None, | ||
enable_cython_pairwise_dist=None, | ||
array_api_dispatch=None, | ||
transform_output=None, | ||
): | ||
"""Set global scikit-learn configuration | ||
|
||
|
@@ -120,6 +122,11 @@ def set_config( | |
|
||
.. versionadded:: 1.2 | ||
|
||
transform_output : str, default=None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we use a more speaking name than |
||
Configure the output container for transform. | ||
|
||
.. versionadded:: 1.2 | ||
|
||
See Also | ||
-------- | ||
config_context : Context manager for global scikit-learn configuration. | ||
|
@@ -141,6 +148,8 @@ def set_config( | |
local_config["enable_cython_pairwise_dist"] = enable_cython_pairwise_dist | ||
if array_api_dispatch is not None: | ||
local_config["array_api_dispatch"] = array_api_dispatch | ||
if transform_output is not None: | ||
local_config["transform_output"] = transform_output | ||
|
||
|
||
@contextmanager | ||
|
@@ -153,6 +162,7 @@ def config_context( | |
pairwise_dist_chunk_size=None, | ||
enable_cython_pairwise_dist=None, | ||
array_api_dispatch=None, | ||
transform_output=None, | ||
): | ||
"""Context manager for global scikit-learn configuration. | ||
|
||
|
@@ -220,6 +230,11 @@ def config_context( | |
|
||
.. versionadded:: 1.2 | ||
|
||
transform_output : str, default=None | ||
Configure the output container for transform. | ||
|
||
.. versionadded:: 1.2 | ||
|
||
Yields | ||
------ | ||
None. | ||
|
@@ -256,6 +271,7 @@ def config_context( | |
pairwise_dist_chunk_size=pairwise_dist_chunk_size, | ||
enable_cython_pairwise_dist=enable_cython_pairwise_dist, | ||
array_api_dispatch=array_api_dispatch, | ||
transform_output=transform_output, | ||
) | ||
|
||
try: | ||
|
Uh oh!
There was an error while loading. Please reload this page.