From e4d9ae81c75f5a600cfbe19a239f198c53189c0c Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Thu, 1 Dec 2022 20:01:18 +0100
Subject: [PATCH 1/3] ENH better error message in HGBRT with feature names

---
 .../gradient_boosting.py                      | 20 ++++++++++-----
 .../tests/test_gradient_boosting.py           | 25 +++++++++++++++----
 2 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
index af9225933100c..b3c5a8d57065c 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -271,17 +271,25 @@ def _check_categories(self, X):
                     categories = categories[~missing]
 
                 if categories.size > self.max_bins:
+                    if hasattr(self, "feature_names_in_"):
+                        feature_name = f"'{self.feature_names_in_[f_idx]}'"
+                    else:
+                        feature_name = f"at index {f_idx}"
+
                     raise ValueError(
-                        f"Categorical feature at index {f_idx} is "
-                        "expected to have a "
-                        f"cardinality <= {self.max_bins}"
+                        f"Categorical feature {feature_name} is expected to "
+                        f"have a cardinality <= {self.max_bins}"
                     )
 
                 if (categories >= self.max_bins).any():
+                    if hasattr(self, "feature_names_in_"):
+                        feature_name = f"'{self.feature_names_in_[f_idx]}'"
+                    else:
+                        feature_name = f"at index {f_idx}"
+
                     raise ValueError(
-                        f"Categorical feature at index {f_idx} is "
-                        "expected to be encoded with "
-                        f"values < {self.max_bins}"
+                        f"Categorical feature {feature_name} is expected to "
+                        f"be encoded with values < {self.max_bins}"
                     )
             else:
                 categories = None
diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
index d1a8f56bbd479..84be8eaa38ab0 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
@@ -1141,20 +1141,35 @@ def test_categorical_spec_no_categories(Est, categorical_features, as_array):
 @pytest.mark.parametrize(
     "Est", (HistGradientBoostingClassifier, HistGradientBoostingRegressor)
 )
-def test_categorical_bad_encoding_errors(Est):
+@pytest.mark.parametrize("use_pandas", [False, True])
+def test_categorical_bad_encoding_errors(Est, use_pandas):
     # Test errors when categories are encoded incorrectly
 
     gb = Est(categorical_features=[True], max_bins=2)
 
-    X = np.array([[0, 1, 2]]).T
+    if use_pandas:
+        pd = pytest.importorskip("pandas")
+        X = pd.DataFrame({"f0": [0, 1, 2]})
+    else:
+        X = np.array([[0, 1, 2]]).T
     y = np.arange(3)
-    msg = "Categorical feature at index 0 is expected to have a cardinality <= 2"
+
+    if use_pandas:
+        msg = "Categorical feature 'f0' is expected to have a cardinality <= 2"
+    else:
+        msg = "Categorical feature at index 0 is expected to have a cardinality <= 2"
     with pytest.raises(ValueError, match=msg):
         gb.fit(X, y)
 
-    X = np.array([[0, 2]]).T
+    if use_pandas:
+        X = pd.DataFrame({"f0": [0, 2]})
+    else:
+        X = np.array([[0, 2]]).T
     y = np.arange(2)
-    msg = "Categorical feature at index 0 is expected to be encoded with values < 2"
+    if use_pandas:
+        msg = "Categorical feature 'f0' is expected to be encoded with values < 2"
+    else:
+        msg = "Categorical feature at index 0 is expected to be encoded with values < 2"
     with pytest.raises(ValueError, match=msg):
         gb.fit(X, y)
 

From 26f1e5db9c722daa923de49e1ac7b88e6d347ec6 Mon Sep 17 00:00:00 2001
From: jeremie du boisberranger <jeremiedbb@yahoo.fr>
Date: Fri, 2 Dec 2022 12:04:27 +0100
Subject: [PATCH 2/3] MNT reduce duplicated feature-name logic in HGBRT category checks and tests

---
 .../gradient_boosting.py                       | 15 +++++----------
 .../tests/test_gradient_boosting.py            | 18 ++++++++----------
 2 files changed, 13 insertions(+), 20 deletions(-)

diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
index b3c5a8d57065c..38f021ec5f82d 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -270,23 +270,18 @@ def _check_categories(self, X):
                 if missing.any():
                     categories = categories[~missing]
 
-                if categories.size > self.max_bins:
-                    if hasattr(self, "feature_names_in_"):
-                        feature_name = f"'{self.feature_names_in_[f_idx]}'"
-                    else:
-                        feature_name = f"at index {f_idx}"
+                if hasattr(self, "feature_names_in_"):
+                    feature_name = f"'{self.feature_names_in_[f_idx]}'"
+                else:
+                    feature_name = f"at index {f_idx}"
 
+                if categories.size > self.max_bins:
                     raise ValueError(
                         f"Categorical feature {feature_name} is expected to "
                         f"have a cardinality <= {self.max_bins}"
                     )
 
                 if (categories >= self.max_bins).any():
-                    if hasattr(self, "feature_names_in_"):
-                        feature_name = f"'{self.feature_names_in_[f_idx]}'"
-                    else:
-                        feature_name = f"at index {f_idx}"
-
                     raise ValueError(
                         f"Categorical feature {feature_name} is expected to "
                         f"be encoded with values < {self.max_bins}"
diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
index 84be8eaa38ab0..de44e0c4efa31 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
@@ -1141,8 +1141,10 @@ def test_categorical_spec_no_categories(Est, categorical_features, as_array):
 @pytest.mark.parametrize(
     "Est", (HistGradientBoostingClassifier, HistGradientBoostingRegressor)
 )
-@pytest.mark.parametrize("use_pandas", [False, True])
-def test_categorical_bad_encoding_errors(Est, use_pandas):
+@pytest.mark.parametrize(
+    "use_pandas, feature_name", [(False, "at index 0"), (True, "'f0'")]
+)
+def test_categorical_bad_encoding_errors(Est, use_pandas, feature_name):
     # Test errors when categories are encoded incorrectly
 
     gb = Est(categorical_features=[True], max_bins=2)
@@ -1154,10 +1156,7 @@ def test_categorical_bad_encoding_errors(Est, use_pandas):
         X = np.array([[0, 1, 2]]).T
     y = np.arange(3)
 
-    if use_pandas:
-        msg = "Categorical feature 'f0' is expected to have a cardinality <= 2"
-    else:
-        msg = "Categorical feature at index 0 is expected to have a cardinality <= 2"
+    msg = f"Categorical feature {feature_name} is expected to have a cardinality <= 2"
     with pytest.raises(ValueError, match=msg):
         gb.fit(X, y)
 
@@ -1166,10 +1165,9 @@ def test_categorical_bad_encoding_errors(Est, use_pandas):
     else:
         X = np.array([[0, 2]]).T
     y = np.arange(2)
-    if use_pandas:
-        msg = "Categorical feature 'f0' is expected to be encoded with values < 2"
-    else:
-        msg = "Categorical feature at index 0 is expected to be encoded with values < 2"
+    msg = (
+        f"Categorical feature {feature_name} is expected to be encoded with values < 2"
+    )
     with pytest.raises(ValueError, match=msg):
         gb.fit(X, y)
 

From 4b921bb3a0ccf5c71e065a62a61986ebf9cad6f2 Mon Sep 17 00:00:00 2001
From: jeremie du boisberranger <jeremiedbb@yahoo.fr>
Date: Fri, 2 Dec 2022 12:05:41 +0100
Subject: [PATCH 3/3] CLN remove stray blank line in test_categorical_bad_encoding_errors

---
 .../_hist_gradient_boosting/tests/test_gradient_boosting.py      | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
index de44e0c4efa31..8756d27869e36 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
@@ -1155,7 +1155,6 @@ def test_categorical_bad_encoding_errors(Est, use_pandas, feature_name):
     else:
         X = np.array([[0, 1, 2]]).T
     y = np.arange(3)
-
     msg = f"Categorical feature {feature_name} is expected to have a cardinality <= 2"
     with pytest.raises(ValueError, match=msg):
         gb.fit(X, y)