From e4d9ae81c75f5a600cfbe19a239f198c53189c0c Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 1 Dec 2022 20:01:18 +0100 Subject: [PATCH 1/3] ENH better error message in HGBRT with feature names --- .../gradient_boosting.py | 20 ++++++++++----- .../tests/test_gradient_boosting.py | 25 +++++++++++++++---- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index af9225933100c..b3c5a8d57065c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -271,17 +271,25 @@ def _check_categories(self, X): categories = categories[~missing] if categories.size > self.max_bins: + if hasattr(self, "feature_names_in_"): + feature_name = f"'{self.feature_names_in_[f_idx]}'" + else: + feature_name = f"at index {f_idx}" + raise ValueError( - f"Categorical feature at index {f_idx} is " - "expected to have a " - f"cardinality <= {self.max_bins}" + f"Categorical feature {feature_name} is expected to " + f"have a cardinality <= {self.max_bins}" ) if (categories >= self.max_bins).any(): + if hasattr(self, "feature_names_in_"): + feature_name = f"'{self.feature_names_in_[f_idx]}'" + else: + feature_name = f"at index {f_idx}" + raise ValueError( - f"Categorical feature at index {f_idx} is " - "expected to be encoded with " - f"values < {self.max_bins}" + f"Categorical feature {feature_name} is expected to " + f"be encoded with values < {self.max_bins}" ) else: categories = None diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index d1a8f56bbd479..84be8eaa38ab0 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -1141,20 +1141,35 @@ def test_categorical_spec_no_categories(Est, categorical_features, as_array): @pytest.mark.parametrize( "Est", (HistGradientBoostingClassifier, HistGradientBoostingRegressor) ) -def test_categorical_bad_encoding_errors(Est): +@pytest.mark.parametrize("use_pandas", [False, True]) +def test_categorical_bad_encoding_errors(Est, use_pandas): # Test errors when categories are encoded incorrectly gb = Est(categorical_features=[True], max_bins=2) - X = np.array([[0, 1, 2]]).T + if use_pandas: + pd = pytest.importorskip("pandas") + X = pd.DataFrame({"f0": [0, 1, 2]}) + else: + X = np.array([[0, 1, 2]]).T y = np.arange(3) - msg = "Categorical feature at index 0 is expected to have a cardinality <= 2" + + if use_pandas: + msg = "Categorical feature 'f0' is expected to have a cardinality <= 2" + else: + msg = "Categorical feature at index 0 is expected to have a cardinality <= 2" with pytest.raises(ValueError, match=msg): gb.fit(X, y) - X = np.array([[0, 2]]).T + if use_pandas: + X = pd.DataFrame({"f0": [0, 2]}) + else: + X = np.array([[0, 2]]).T y = np.arange(2) - msg = "Categorical feature at index 0 is expected to be encoded with values < 2" + if use_pandas: + msg = "Categorical feature 'f0' is expected to be encoded with values < 2" + else: + msg = "Categorical feature at index 0 is expected to be encoded with values < 2" with pytest.raises(ValueError, match=msg): gb.fit(X, y) From 26f1e5db9c722daa923de49e1ac7b88e6d347ec6 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 2 Dec 2022 12:04:27 +0100 Subject: [PATCH 2/3] less duplicated code --- .../gradient_boosting.py | 15 +++++---------- .../tests/test_gradient_boosting.py | 18 ++++++++---------- 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index b3c5a8d57065c..38f021ec5f82d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -270,23 +270,18 @@ def _check_categories(self, X): if missing.any(): categories = categories[~missing] - if categories.size > self.max_bins: - if hasattr(self, "feature_names_in_"): - feature_name = f"'{self.feature_names_in_[f_idx]}'" - else: - feature_name = f"at index {f_idx}" + if hasattr(self, "feature_names_in_"): + feature_name = f"'{self.feature_names_in_[f_idx]}'" + else: + feature_name = f"at index {f_idx}" + if categories.size > self.max_bins: raise ValueError( f"Categorical feature {feature_name} is expected to " f"have a cardinality <= {self.max_bins}" ) if (categories >= self.max_bins).any(): - if hasattr(self, "feature_names_in_"): - feature_name = f"'{self.feature_names_in_[f_idx]}'" - else: - feature_name = f"at index {f_idx}" - raise ValueError( f"Categorical feature {feature_name} is expected to " f"be encoded with values < {self.max_bins}" diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 84be8eaa38ab0..de44e0c4efa31 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -1141,8 +1141,10 @@ def test_categorical_spec_no_categories(Est, categorical_features, as_array): @pytest.mark.parametrize( "Est", (HistGradientBoostingClassifier, HistGradientBoostingRegressor) ) -@pytest.mark.parametrize("use_pandas", [False, True]) -def test_categorical_bad_encoding_errors(Est, use_pandas): +@pytest.mark.parametrize( + "use_pandas, feature_name", [(False, "at index 0"), (True, "'f0'")] +) +def test_categorical_bad_encoding_errors(Est, use_pandas, feature_name): # Test errors when categories are encoded incorrectly gb = Est(categorical_features=[True], max_bins=2) @@ -1154,10 +1156,7 @@ def test_categorical_bad_encoding_errors(Est, use_pandas): X = np.array([[0, 1, 2]]).T y = np.arange(3) - if use_pandas: - msg = "Categorical feature 'f0' is expected to have a cardinality <= 2" - else: - msg = "Categorical feature at index 0 is expected to have a cardinality <= 2" + msg = f"Categorical feature {feature_name} is expected to have a cardinality <= 2" with pytest.raises(ValueError, match=msg): gb.fit(X, y) @@ -1166,10 +1165,9 @@ def test_categorical_bad_encoding_errors(Est, use_pandas): else: X = np.array([[0, 2]]).T y = np.arange(2) - if use_pandas: - msg = "Categorical feature 'f0' is expected to be encoded with values < 2" - else: - msg = "Categorical feature at index 0 is expected to be encoded with values < 2" + msg = ( + f"Categorical feature {feature_name} is expected to be encoded with values < 2" + ) with pytest.raises(ValueError, match=msg): gb.fit(X, y) From 4b921bb3a0ccf5c71e065a62a61986ebf9cad6f2 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 2 Dec 2022 12:05:41 +0100 Subject: [PATCH 3/3] cln --- .../_hist_gradient_boosting/tests/test_gradient_boosting.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index de44e0c4efa31..8756d27869e36 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -1155,7 +1155,6 @@ def test_categorical_bad_encoding_errors(Est, use_pandas, feature_name): else: X = np.array([[0, 1, 2]]).T y = np.arange(3) - msg = f"Categorical feature {feature_name} is expected to have a cardinality <= 2" with pytest.raises(ValueError, match=msg): gb.fit(X, y)