From 795bae6866cf7ee314b1c3ffb7d8d7d62a7b3c52 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= <loic.esteve@ymail.com>
Date: Thu, 8 Feb 2024 12:04:31 +0100
Subject: [PATCH 1/4] FIX HistGradientBoosting with pandas nullable dtype

---
 .../_hist_gradient_boosting/gradient_boosting.py | 16 +++++++++-------
 .../tests/test_gradient_boosting.py              | 12 ++++++++++++
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
index 69ae0090b1fb8..e7f69948e7cc2 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -368,7 +368,15 @@ def _check_categorical_features(self, X):
             Indicates whether a feature is categorical. If no feature is
             categorical, this is None.
         """
-        if hasattr(X, "__dataframe__"):
+        # Special code for pandas because of a bug in recent pandas, which is
+        # fixed in main and maybe included in 2.2.1, see
+        # https://github.com/pandas-dev/pandas/pull/57173.
+        # Also, pandas < 1.5.1 does not support the dataframe interchange protocol.
+        if _is_pandas_df(X):
+            X_is_dataframe = True
+            categorical_columns_mask = np.asarray(X.dtypes == "category")
+            X_has_categorical_columns = categorical_columns_mask.any()
+        elif hasattr(X, "__dataframe__"):
             X_is_dataframe = True
             categorical_columns_mask = np.asarray(
                 [
@@ -377,12 +385,6 @@ def _check_categorical_features(self, X):
                 ]
             )
             X_has_categorical_columns = categorical_columns_mask.any()
-        # pandas versions < 1.5.1 do not support the dataframe interchange
-        # protocol so we inspect X.dtypes directly
-        elif _is_pandas_df(X):
-            X_is_dataframe = True
-            categorical_columns_mask = np.asarray(X.dtypes == "category")
-            X_has_categorical_columns = categorical_columns_mask.any()
         else:
             X_is_dataframe = False
             categorical_columns_mask = None
diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
index bdc85eccd6607..0ff9b606f49cc 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
@@ -1669,3 +1669,15 @@ def joblib_dump_with_different_bitness():
     new_clf = joblib.load(joblib_dump_with_different_bitness())
     new_score = new_clf.score(X, y)
     assert score == pytest.approx(new_score)
+
+
+def test_pandas_nullable_dtype():
+    # Non-regression test for https://github.com/scikit-learn/scikit-learn/issues/28317
+    pd = pytest.importorskip("pandas")
+
+    rng = np.random.default_rng(0)
+    X = pd.DataFrame({"a": rng.integers(10, size=100)}).astype(pd.Int64Dtype)
+    y = rng.integers(2, size=100)
+
+    clf = HistGradientBoostingClassifier()
+    clf.fit(X, y)

From 3306c5534a2fe8628e1c0ca1fa8f95c2f357e614 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= <loic.esteve@ymail.com>
Date: Thu, 8 Feb 2024 14:14:03 +0100
Subject: [PATCH 2/4] Fix test: pass a pd.Int64Dtype instance to astype

---
 .../_hist_gradient_boosting/tests/test_gradient_boosting.py     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
index 0ff9b606f49cc..e14d786b5bc74 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
@@ -1676,7 +1676,7 @@ def test_pandas_nullable_dtype():
     pd = pytest.importorskip("pandas")
 
     rng = np.random.default_rng(0)
-    X = pd.DataFrame({"a": rng.integers(10, size=100)}).astype(pd.Int64Dtype)
+    X = pd.DataFrame({"a": rng.integers(10, size=100)}).astype(pd.Int64Dtype())
     y = rng.integers(2, size=100)
 
     clf = HistGradientBoostingClassifier()

From 2f87b64c34f451215c930bc4ea6d70a7e4a6e68b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= <loic.esteve@ymail.com>
Date: Thu, 8 Feb 2024 15:17:36 +0100
Subject: [PATCH 3/4] Add changelog entry to 1.4.1

---
 doc/whats_new/v1.4.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst
index 2baa45e88a115..99dc2c436cfdd 100644
--- a/doc/whats_new/v1.4.rst
+++ b/doc/whats_new/v1.4.rst
@@ -103,6 +103,14 @@ Changelog
   the same columns. Previously, it would raise a due to duplicated column names.
   :pr:`28262` by :user:`Guillaume Lemaitre <glemaitre>`.
 
+:mod:`sklearn.ensemble`
+.......................
+
+- |Fix| :class:`HistGradientBoostingClassifier` and
+  :class:`HistGradientBoostingRegressor` no longer fail when fitted on a `pandas`
+  `DataFrame` with extension dtypes, for example `pd.Int64Dtype`.
+  :pr:`28385` by :user:`Loïc Estève <lesteve>`.
+
 :mod:`sklearn.inspection`
 .........................
 

From bc3c4056586ac86c32963dfbaeb6d84472d6bf05 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= <loic.esteve@ymail.com>
Date: Thu, 8 Feb 2024 15:17:46 +0100
Subject: [PATCH 4/4] [azure parallel]