From 795bae6866cf7ee314b1c3ffb7d8d7d62a7b3c52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 8 Feb 2024 12:04:31 +0100 Subject: [PATCH 1/4] FIX HistGradientBoosting with pandas nullable dtype --- .../_hist_gradient_boosting/gradient_boosting.py | 16 +++++++++------- .../tests/test_gradient_boosting.py | 12 ++++++++++++ 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 69ae0090b1fb8..e7f69948e7cc2 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -368,7 +368,15 @@ def _check_categorical_features(self, X): Indicates whether a feature is categorical. If no feature is categorical, this is None. """ - if hasattr(X, "__dataframe__"): + # Special code for pandas because of a bug in recent pandas, which is + # fixed in main and maybe included in 2.2.1, see + # https://github.com/pandas-dev/pandas/pull/57173. 
+ # Also pandas versions < 1.5.1 do not support the dataframe interchange protocol + if _is_pandas_df(X): + X_is_dataframe = True + categorical_columns_mask = np.asarray(X.dtypes == "category") + X_has_categorical_columns = categorical_columns_mask.any() + elif hasattr(X, "__dataframe__"): X_is_dataframe = True categorical_columns_mask = np.asarray( [ @@ -377,12 +385,6 @@ def _check_categorical_features(self, X): ] ) X_has_categorical_columns = categorical_columns_mask.any() - # pandas versions < 1.5.1 do not support the dataframe interchange - # protocol so we inspect X.dtypes directly - elif _is_pandas_df(X): - X_is_dataframe = True - categorical_columns_mask = np.asarray(X.dtypes == "category") - X_has_categorical_columns = categorical_columns_mask.any() else: X_is_dataframe = False categorical_columns_mask = None diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index bdc85eccd6607..0ff9b606f49cc 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -1669,3 +1669,15 @@ def joblib_dump_with_different_bitness(): new_clf = joblib.load(joblib_dump_with_different_bitness()) new_score = new_clf.score(X, y) assert score == pytest.approx(new_score) + + +def test_pandas_nullable_dtype(): + # Non regression test for https://github.com/scikit-learn/scikit-learn/issues/28317 + pd = pytest.importorskip("pandas") + + rng = np.random.default_rng(0) + X = pd.DataFrame({"a": rng.integers(10, size=100)}).astype(pd.Int64Dtype) + y = rng.integers(2, size=100) + + clf = HistGradientBoostingClassifier() + clf.fit(X, y) From 3306c5534a2fe8628e1c0ca1fa8f95c2f357e614 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 8 Feb 2024 14:14:03 +0100 Subject: [PATCH 2/4] Fix --- .../_hist_gradient_boosting/tests/test_gradient_boosting.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 0ff9b606f49cc..e14d786b5bc74 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -1676,7 +1676,7 @@ def test_pandas_nullable_dtype(): pd = pytest.importorskip("pandas") rng = np.random.default_rng(0) - X = pd.DataFrame({"a": rng.integers(10, size=100)}).astype(pd.Int64Dtype) + X = pd.DataFrame({"a": rng.integers(10, size=100)}).astype(pd.Int64Dtype()) y = rng.integers(2, size=100) clf = HistGradientBoostingClassifier() From 2f87b64c34f451215c930bc4ea6d70a7e4a6e68b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 8 Feb 2024 15:17:36 +0100 Subject: [PATCH 3/4] Add changelog entry to 1.4.1 --- doc/whats_new/v1.4.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 2baa45e88a115..99dc2c436cfdd 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -103,6 +103,14 @@ Changelog the same columns. Previously, it would raise a due to duplicated column names. :pr:`28262` by :user:`Guillaume Lemaitre <glemaitre>`. +:mod:`sklearn.ensemble` +....................... + +- |Fix| :class:`HistGradientBoostingClassifier` and + :class:`HistGradientBoostingRegressor` when fitted on `pandas` `DataFrame` + with extension dtypes, for example `pd.Int64Dtype`. + :pr:`28385` by :user:`Loïc Estève <lesteve>`. + :mod:`sklearn.inspection` ......................... From bc3c4056586ac86c32963dfbaeb6d84472d6bf05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 8 Feb 2024 15:17:46 +0100 Subject: [PATCH 4/4] [azure parallel]