From c9ce4f8f2d2c30d1df489d5cb8a5c3010d609f83 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Wed, 27 Mar 2024 21:52:55 -0400 Subject: [PATCH 01/10] Support column names for polars DataFrames in DecisionBoundaryDisplay.from_estimator --- sklearn/inspection/_plot/decision_boundary.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/sklearn/inspection/_plot/decision_boundary.py b/sklearn/inspection/_plot/decision_boundary.py index 4229aa333507c..07e28e0d18ad4 100644 --- a/sklearn/inspection/_plot/decision_boundary.py +++ b/sklearn/inspection/_plot/decision_boundary.py @@ -7,6 +7,8 @@ from ...utils._response import _get_response_values from ...utils.validation import ( _is_arraylike_not_scalar, + _is_pandas_df, + _is_polars_df, _num_features, check_is_fitted, ) @@ -345,11 +347,18 @@ def from_estimator( np.linspace(x0_min, x0_max, grid_resolution), np.linspace(x1_min, x1_max, grid_resolution), ) - if hasattr(X, "iloc"): + if _is_pandas_df(X): # we need to preserve the feature names and therefore get an empty dataframe X_grid = X.iloc[[], :].copy() X_grid.iloc[:, 0] = xx0.ravel() X_grid.iloc[:, 1] = xx1.ravel() + elif _is_polars_df(X): + X_grid = X.drop(X.columns).with_columns( + **{ + column: series + for column, series in zip(X.columns, (xx0.ravel(), xx1.ravel())) + } + ) else: X_grid = np.c_[xx0.ravel(), xx1.ravel()] From e729040b7108d262bf3b44751360e332a2fe7cd0 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Thu, 28 Mar 2024 08:49:37 -0400 Subject: [PATCH 02/10] Add test for passing polars dataframes into from_estimator --- .../tests/test_boundary_decision_display.py | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/sklearn/inspection/_plot/tests/test_boundary_decision_display.py b/sklearn/inspection/_plot/tests/test_boundary_decision_display.py index 7bb38f55445a0..faffe71b5dc8b 100644 --- a/sklearn/inspection/_plot/tests/test_boundary_decision_display.py +++ 
b/sklearn/inspection/_plot/tests/test_boundary_decision_display.py @@ -468,8 +468,8 @@ def test_string_target(pyplot): ) -def test_dataframe_support(pyplot): - """Check that passing a dataframe at fit and to the Display does not +def test_pandas_dataframe_support(pyplot): + """Check that passing a pandas dataframe at fit and to the Display does not raise warnings. Non-regression test for: @@ -485,6 +485,23 @@ def test_dataframe_support(pyplot): DecisionBoundaryDisplay.from_estimator(estimator, df, response_method="predict") +def test_polars_dataframe_support(pyplot): + """Check that passing a polars dataframe at fit and to the Display does not + raise warnings. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/28717 + """ + pl = pytest.importorskip("polars") + df = pl.DataFrame({"col_x": X[:, 0], "col_y": X[:, 1]}) + estimator = LogisticRegression().fit(df, y) + + with warnings.catch_warnings(): + # no warnings linked to feature names validation should be raised + warnings.simplefilter("error", UserWarning) + DecisionBoundaryDisplay.from_estimator(estimator, df, response_method="predict") + + @pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"]) def test_class_of_interest_binary(pyplot, response_method): """Check the behaviour of passing `class_of_interest` for plotting the output of From 667fff4bf83e726220e21af7e77c380a831b2056 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Fri, 29 Mar 2024 08:02:11 -0400 Subject: [PATCH 03/10] Combine pandas/polars dataframe tests --- .../tests/test_boundary_decision_display.py | 29 +++++-------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/sklearn/inspection/_plot/tests/test_boundary_decision_display.py b/sklearn/inspection/_plot/tests/test_boundary_decision_display.py index faffe71b5dc8b..14cb460d22646 100644 --- a/sklearn/inspection/_plot/tests/test_boundary_decision_display.py +++ 
b/sklearn/inspection/_plot/tests/test_boundary_decision_display.py @@ -468,32 +468,17 @@ def test_string_target(pyplot): ) -def test_pandas_dataframe_support(pyplot): - """Check that passing a pandas dataframe at fit and to the Display does not +@pytest.mark.parametrize("df_module_name", ["pandas", "polars"]) +def test_dataframe_support(pyplot, df_module_name): + """Check that passing a dataframe at fit and to the Display does not raise warnings. Non-regression test for: - https://github.com/scikit-learn/scikit-learn/issues/23311 + * https://github.com/scikit-learn/scikit-learn/issues/23311 + * https://github.com/scikit-learn/scikit-learn/issues/28717 """ - pd = pytest.importorskip("pandas") - df = pd.DataFrame(X, columns=["col_x", "col_y"]) - estimator = LogisticRegression().fit(df, y) - - with warnings.catch_warnings(): - # no warnings linked to feature names validation should be raised - warnings.simplefilter("error", UserWarning) - DecisionBoundaryDisplay.from_estimator(estimator, df, response_method="predict") - - -def test_polars_dataframe_support(pyplot): - """Check that passing a polars dataframe at fit and to the Display does not - raise warnings. 
- - Non-regression test for: - https://github.com/scikit-learn/scikit-learn/issues/28717 - """ - pl = pytest.importorskip("polars") - df = pl.DataFrame({"col_x": X[:, 0], "col_y": X[:, 1]}) + df_module = pytest.importorskip(df_module_name) + df = df_module.DataFrame({"col_x": X[:, 0], "col_y": X[:, 1]}) estimator = LogisticRegression().fit(df, y) with warnings.catch_warnings(): From 4c06adf129d0bbc001130e7dd6a4087f1e702e5e Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sun, 31 Mar 2024 15:00:01 -0400 Subject: [PATCH 04/10] Use Adapters to create both pandas and polars containers --- sklearn/inspection/_plot/decision_boundary.py | 22 ++++++++---------- .../preprocessing/_function_transformer.py | 23 ++++--------------- sklearn/utils/_set_output.py | 18 +++++++++++++++ 3 files changed, 31 insertions(+), 32 deletions(-) diff --git a/sklearn/inspection/_plot/decision_boundary.py b/sklearn/inspection/_plot/decision_boundary.py index 07e28e0d18ad4..92e1a2527400e 100644 --- a/sklearn/inspection/_plot/decision_boundary.py +++ b/sklearn/inspection/_plot/decision_boundary.py @@ -5,6 +5,7 @@ from ...utils import _safe_indexing from ...utils._optional_dependencies import check_matplotlib_support from ...utils._response import _get_response_values +from ...utils._set_output import _get_adapter_from_container from ...utils.validation import ( _is_arraylike_not_scalar, _is_pandas_df, @@ -347,20 +348,15 @@ def from_estimator( np.linspace(x0_min, x0_max, grid_resolution), np.linspace(x1_min, x1_max, grid_resolution), ) - if _is_pandas_df(X): - # we need to preserve the feature names and therefore get an empty dataframe - X_grid = X.iloc[[], :].copy() - X_grid.iloc[:, 0] = xx0.ravel() - X_grid.iloc[:, 1] = xx1.ravel() - elif _is_polars_df(X): - X_grid = X.drop(X.columns).with_columns( - **{ - column: series - for column, series in zip(X.columns, (xx0.ravel(), xx1.ravel())) - } + + X_grid = np.c_[xx0.ravel(), xx1.ravel()] + if _is_pandas_df(X) or _is_polars_df(X): + adapter = 
_get_adapter_from_container(X) + X_grid = adapter.create_container( + X_grid, + X_grid, + columns=X.columns, ) - else: - X_grid = np.c_[xx0.ravel(), xx1.ravel()] prediction_method = _check_boundary_response_method( estimator, response_method, class_of_interest diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 921bd6a01fb71..0442e75346fed 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -4,7 +4,10 @@ from ..base import BaseEstimator, TransformerMixin, _fit_context from ..utils._param_validation import StrOptions -from ..utils._set_output import ADAPTERS_MANAGER, _get_output_config +from ..utils._set_output import ( + _get_adapter_from_container, + _get_output_config, +) from ..utils.metaestimators import available_if from ..utils.validation import ( _allclose_dense_sparse, @@ -16,24 +19,6 @@ ) -def _get_adapter_from_container(container): - """Get the adapter that nows how to handle such container. - - See :class:`sklearn.utils._set_output.ContainerAdapterProtocol` for more - details. - """ - module_name = container.__class__.__module__.split(".")[0] - try: - return ADAPTERS_MANAGER.adapters[module_name] - except KeyError as exc: - available_adapters = list(ADAPTERS_MANAGER.adapters.keys()) - raise ValueError( - "The container does not have a registered adapter in scikit-learn. " - f"Available adapters are: {available_adapters} while the container " - f"provided is: {container!r}." - ) from exc - - def _identity(X): """The identity function.""" return X diff --git a/sklearn/utils/_set_output.py b/sklearn/utils/_set_output.py index cf7364e117320..4944a6edec145 100644 --- a/sklearn/utils/_set_output.py +++ b/sklearn/utils/_set_output.py @@ -197,6 +197,24 @@ def register(self, adapter): ADAPTERS_MANAGER.register(PolarsAdapter()) +def _get_adapter_from_container(container): + """Get the adapter that nows how to handle such container. 
+ + See :class:`sklearn.utils._set_output.ContainerAdapterProtocol` for more + details. + """ + module_name = container.__class__.__module__.split(".")[0] + try: + return ADAPTERS_MANAGER.adapters[module_name] + except KeyError as exc: + available_adapters = list(ADAPTERS_MANAGER.adapters.keys()) + raise ValueError( + "The container does not have a registered adapter in scikit-learn. " + f"Available adapters are: {available_adapters} while the container " + f"provided is: {container!r}." + ) from exc + + def _get_container_adapter(method, estimator=None): """Get container adapter.""" dense_config = _get_output_config(method, estimator)["dense"] From 85340997302a1cd8a577b1c1f284eaadfc201a79 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 3 Apr 2024 11:30:48 +0200 Subject: [PATCH 05/10] use _convert_container in test --- .../_plot/tests/test_boundary_decision_display.py | 10 ++++++---- sklearn/utils/_set_output.py | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/sklearn/inspection/_plot/tests/test_boundary_decision_display.py b/sklearn/inspection/_plot/tests/test_boundary_decision_display.py index 14cb460d22646..f2dae8a684369 100644 --- a/sklearn/inspection/_plot/tests/test_boundary_decision_display.py +++ b/sklearn/inspection/_plot/tests/test_boundary_decision_display.py @@ -17,6 +17,7 @@ from sklearn.preprocessing import scale from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.utils._testing import ( + _convert_container, assert_allclose, assert_array_equal, ) @@ -468,8 +469,8 @@ def test_string_target(pyplot): ) -@pytest.mark.parametrize("df_module_name", ["pandas", "polars"]) -def test_dataframe_support(pyplot, df_module_name): +@pytest.mark.parametrize("constructor_name", ["pandas", "polars"]) +def test_dataframe_support(pyplot, constructor_name): """Check that passing a dataframe at fit and to the Display does not raise warnings. 
@@ -477,8 +478,9 @@ def test_dataframe_support(pyplot, df_module_name): * https://github.com/scikit-learn/scikit-learn/issues/23311 * https://github.com/scikit-learn/scikit-learn/issues/28717 """ - df_module = pytest.importorskip(df_module_name) - df = df_module.DataFrame({"col_x": X[:, 0], "col_y": X[:, 1]}) + df = _convert_container( + X, constructor_name=constructor_name, columns_name=["col_x", "col_y"] + ) estimator = LogisticRegression().fit(df, y) with warnings.catch_warnings(): diff --git a/sklearn/utils/_set_output.py b/sklearn/utils/_set_output.py index 4944a6edec145..d5c23a4c7c6f9 100644 --- a/sklearn/utils/_set_output.py +++ b/sklearn/utils/_set_output.py @@ -198,7 +198,7 @@ def register(self, adapter): def _get_adapter_from_container(container): - """Get the adapter that nows how to handle such container. + """Get the adapter that knows how to handle such container. See :class:`sklearn.utils._set_output.ContainerAdapterProtocol` for more details. From 3593d79d28c079009d65848a83155541a479ba11 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Wed, 3 Apr 2024 16:37:17 -0400 Subject: [PATCH 06/10] Update v1.4.rst --- doc/whats_new/v1.4.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index bde83ed8679e6..bb8d7d47632a1 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -34,6 +34,13 @@ Metadata Routing Changelog --------- +:mod:`sklearn.inspection` +......................... + +- |Fix| :func:`DecisionBoundaryDisplay.from_estimator` no longer warns about + missing feature names when provided a `polars.DataFrame`. + :pr:`28718` by :user:`Patrick Wang `. + :mod:`sklearn.metrics` ...................... 
From 99f61c26446b369da66dd6323923e4d3bebf0e45 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 5 Apr 2024 18:17:35 +0200 Subject: [PATCH 07/10] target 1.5 --- doc/whats_new/v1.4.rst | 7 ------- doc/whats_new/v1.5.rst | 7 +++++++ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index bb8d7d47632a1..bde83ed8679e6 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -34,13 +34,6 @@ Metadata Routing Changelog --------- -:mod:`sklearn.inspection` -......................... - -- |Fix| :func:`DecisionBoundaryDisplay.from_estimator` no longer warns about - missing feature names when provided a `polars.DataFrame`. - :pr:`28718` by :user:`Patrick Wang `. - :mod:`sklearn.metrics` ...................... diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index 9ca6aee7ac83a..300bd11155020 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -220,6 +220,13 @@ Changelog by passing a function in place of a strategy name. :pr:`28053` by :user:`Mark Elliot `. +:mod:`sklearn.inspection` +......................... + +- |Fix| :func:`DecisionBoundaryDisplay.from_estimator` no longer warns about + missing feature names when provided a `polars.DataFrame`. + :pr:`28718` by :user:`Patrick Wang `. + :mod:`sklearn.linear_model` ........................... From 6fa80bb0bc6168e22e187e1183d0fb71c5b0db69 Mon Sep 17 00:00:00 2001 From: Patrick Wang <1263870+patrickkwang@users.noreply.github.com> Date: Tue, 9 Apr 2024 15:00:05 -0400 Subject: [PATCH 08/10] Update doc/whats_new/v1.5.rst Co-authored-by: Guillaume Lemaitre --- doc/whats_new/v1.5.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index 550733cc4a135..aefeb9d2d6ac9 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -223,7 +223,7 @@ Changelog :mod:`sklearn.inspection` ......................... 
-- |Fix| :func:`DecisionBoundaryDisplay.from_estimator` no longer warns about +- |Fix| :meth:`inspection.DecisionBoundaryDisplay.from_estimator` no longer warns about missing feature names when provided a `polars.DataFrame`. :pr:`28718` by :user:`Patrick Wang `. From e7c9890d08105af15979afc47a8e14fe07734440 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Tue, 9 Apr 2024 15:01:13 -0400 Subject: [PATCH 09/10] Fix v1.5.rst line length --- doc/whats_new/v1.5.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index aefeb9d2d6ac9..610a7cad65c39 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -223,8 +223,8 @@ Changelog :mod:`sklearn.inspection` ......................... -- |Fix| :meth:`inspection.DecisionBoundaryDisplay.from_estimator` no longer warns about - missing feature names when provided a `polars.DataFrame`. +- |Fix| :meth:`inspection.DecisionBoundaryDisplay.from_estimator` no longer + warns about missing feature names when provided a `polars.DataFrame`. :pr:`28718` by :user:`Patrick Wang `. 
:mod:`sklearn.linear_model` From b66a38d57a029a4ea9bb8a42f3289bec137f2b98 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Tue, 9 Apr 2024 15:29:21 -0400 Subject: [PATCH 10/10] Move test_get_adapter_from_container to test_set_output.py --- .../preprocessing/tests/test_function_transformer.py | 12 ------------ sklearn/utils/tests/test_set_output.py | 12 ++++++++++++ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/sklearn/preprocessing/tests/test_function_transformer.py b/sklearn/preprocessing/tests/test_function_transformer.py index e7b86e88d1547..81d9d0b8eb843 100644 --- a/sklearn/preprocessing/tests/test_function_transformer.py +++ b/sklearn/preprocessing/tests/test_function_transformer.py @@ -5,7 +5,6 @@ from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer, StandardScaler -from sklearn.preprocessing._function_transformer import _get_adapter_from_container from sklearn.utils._testing import ( _convert_container, assert_allclose_dense_sparse, @@ -14,17 +13,6 @@ from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS -def test_get_adapter_from_container(): - """Check the behavior fo `_get_adapter_from_container`.""" - pd = pytest.importorskip("pandas") - X = pd.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]}) - adapter = _get_adapter_from_container(X) - assert adapter.container_lib == "pandas" - err_msg = "The container does not have a registered adapter in scikit-learn." 
- with pytest.raises(ValueError, match=err_msg): - _get_adapter_from_container(X.to_numpy()) - - def _make_func(args_store, kwargs_store, func=lambda X, *a, **k: X): def _func(X, *args, **kwargs): args_store.append(X) diff --git a/sklearn/utils/tests/test_set_output.py b/sklearn/utils/tests/test_set_output.py index 827627f441ddd..360b081a2a0fb 100644 --- a/sklearn/utils/tests/test_set_output.py +++ b/sklearn/utils/tests/test_set_output.py @@ -10,6 +10,7 @@ from sklearn.utils._set_output import ( ADAPTERS_MANAGER, ContainerAdapterProtocol, + _get_adapter_from_container, _get_output_config, _safe_set_output, _SetOutputMixin, @@ -450,3 +451,14 @@ def patched_import_module(name): msg = "Setting output container to 'pandas' requires" with pytest.raises(ImportError, match=msg): check_library_installed("pandas") + + +def test_get_adapter_from_container(): + """Check the behavior of `_get_adapter_from_container`.""" + pd = pytest.importorskip("pandas") + X = pd.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]}) + adapter = _get_adapter_from_container(X) + assert adapter.container_lib == "pandas" + err_msg = "The container does not have a registered adapter in scikit-learn." + with pytest.raises(ValueError, match=err_msg): + _get_adapter_from_container(X.to_numpy())