From 47997eeac8579de53b54a5c862a94e15b27402f2 Mon Sep 17 00:00:00 2001 From: genvalen Date: Wed, 15 Dec 2021 15:33:46 -0500 Subject: [PATCH 01/41] max_depth: add tests --- sklearn/tree/tests/test_tree.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index cd6b245bee60e..a17b9b9977e21 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -602,8 +602,8 @@ def test_error(): TreeEstimator(min_samples_split=1.1).fit(X, y) with pytest.raises(ValueError): TreeEstimator(min_samples_split=2.5).fit(X, y) - with pytest.raises(ValueError): - TreeEstimator(max_depth=-1).fit(X, y) + # with pytest.raises(ValueError): + # TreeEstimator(max_depth=-1).fit(X, y) with pytest.raises(ValueError): TreeEstimator(max_features=42).fit(X, y) with pytest.raises(ValueError): @@ -662,6 +662,31 @@ def test_error(): est.fit([[0, 1, 2]], [5, -0.1, 2]) +@pytest.mark.parametrize("name, Tree", ALL_TREES.items()) +@pytest.mark.parametrize( + "params, err_type, err_msg", + [ + ({"max_depth": -1}, ValueError, "max_depth == -1, must be > 0"), + ( + {"max_depth": 1.1}, + TypeError, + "max_depth must be an instance of , not", + ), + ], +) +def test_tree_params_validation(name, Tree, params, err_type, err_msg): + """Check parameter validation in DecisionTreeClassifier, DecisionTreeRegressor, + ExtraTreeClassifier, and ExtraTreeRegressor. + """ + if "Classifier" in name: + X, y = iris.data, iris.target + else: + X, y = diabetes.data, diabetes.target + est = Tree(**params) + with pytest.raises(err_type, match=err_msg): + est.fit(X, y) + + def test_min_samples_split(): """Test min_samples_split parameter""" X = np.asfortranarray(iris.data, dtype=tree._tree.DTYPE) From 02fc4aff4c5c48f01339b9cfb260d2cd42ffb5cd Mon Sep 17 00:00:00 2001 From: genvalen Date: Wed, 15 Dec 2021 15:34:05 -0500 Subject: [PATCH 02/41] max_depth: add validation --- sklearn/tree/_classes.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 3cd0e000bd4dd..b6e44d77eda00 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -32,6 +32,7 @@ from ..base import MultiOutputMixin from ..utils import Bunch from ..utils import check_random_state +from ..utils import check_scalar from ..utils.deprecation import deprecated from ..utils.validation import _check_sample_weight from ..utils import compute_sample_weight @@ -225,6 +226,14 @@ def fit(self, X, y, sample_weight=None, check_input=True): y = np.ascontiguousarray(y, dtype=DOUBLE) # Check parameters + if self.max_depth is not None: + check_scalar( + self.max_depth, + "max_depth", + target_type=numbers.Integral, + min_val=0, + include_boundaries="neither", + ) max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes @@ -300,8 +309,8 @@ def fit(self, X, y, sample_weight=None, check_input=True): ) if not 0 <= self.min_weight_fraction_leaf <= 0.5: raise ValueError("min_weight_fraction_leaf must in [0, 0.5]") - if max_depth <= 0: - raise ValueError("max_depth must be greater than zero. ") + # if max_depth <= 0: + # raise ValueError("max_depth must be greater than zero. ") if not (0 < max_features <= self.n_features_in_): raise ValueError("max_features must be in (0, n_features]") if not isinstance(max_leaf_nodes, numbers.Integral): From 0ab553512bc05c3b8bff44c06ea21ae1b054ce77 Mon Sep 17 00:00:00 2001 From: genvalen Date: Wed, 15 Dec 2021 16:49:18 -0500 Subject: [PATCH 03/41] min_sample_split: add tests and validation --- sklearn/tree/_classes.py | 45 ++++++++++++++++++++++----------- sklearn/tree/tests/test_tree.py | 39 ++++++++++++++++++++++------ 2 files changed, 61 insertions(+), 23 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index b6e44d77eda00..a0775f3fe8285 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -253,22 +253,37 @@ def fit(self, X, y, sample_weight=None, check_input=True): min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples)) if isinstance(self.min_samples_split, numbers.Integral): - if not 2 <= self.min_samples_split: - raise ValueError( - "min_samples_split must be an integer " - "greater than 1 or a float in (0.0, 1.0]; " - "got the integer %s" - % self.min_samples_split - ) + # if not 2 <= self.min_samples_split: + # raise ValueError( + # "min_samples_split must be an integer " + # "greater than 1 or a float in (0.0, 1.0]; " + # "got the integer %s" + # % self.min_samples_split + # ) + check_scalar( + self.min_samples_split, + "min_samples_split", + target_type=numbers.Integral, + min_val=1, + include_boundaries="neither", + ) min_samples_split = self.min_samples_split - else: # float - if not 0.0 < self.min_samples_split <= 1.0: - raise ValueError( - "min_samples_split must be an integer " - "greater than 1 or a float in (0.0, 1.0]; " - "got the float %s" - % self.min_samples_split - ) + else: # float, string, other + # if not 0.0 < self.min_samples_split <= 1.0: + # raise ValueError( + # "min_samples_split must be an integer " + # "greater than 1 or a float in (0.0, 1.0]; " + # "got the float %s" + # % self.min_samples_split + # ) + check_scalar( + self.min_samples_split, + "min_samples_split", + target_type=numbers.Real, + min_val=0.0, + max_val=1.0, + include_boundaries="right", + ) min_samples_split = int(ceil(self.min_samples_split * n_samples)) min_samples_split = max(2, min_samples_split) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index a17b9b9977e21..7db4980830340 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -594,14 +594,14 @@ def test_error(): TreeEstimator(min_weight_fraction_leaf=-1).fit(X, y) with pytest.raises(ValueError): TreeEstimator(min_weight_fraction_leaf=0.51).fit(X, y) - with pytest.raises(ValueError): - TreeEstimator(min_samples_split=-1).fit(X, y) - with pytest.raises(ValueError): - TreeEstimator(min_samples_split=0.0).fit(X, y) - with pytest.raises(ValueError): - TreeEstimator(min_samples_split=1.1).fit(X, y) - with pytest.raises(ValueError): - TreeEstimator(min_samples_split=2.5).fit(X, y) + # with pytest.raises(ValueError): + # TreeEstimator(min_samples_split=-1).fit(X, y) + # with pytest.raises(ValueError): + # TreeEstimator(min_samples_split=0.0).fit(X, y) + # with pytest.raises(ValueError): + # TreeEstimator(min_samples_split=1.1).fit(X, y) + # with pytest.raises(ValueError): + # TreeEstimator(min_samples_split=2.5).fit(X, y) # with pytest.raises(ValueError): # TreeEstimator(max_depth=-1).fit(X, y) with pytest.raises(ValueError): @@ -672,6 +672,29 @@ def test_error(): TypeError, "max_depth must be an instance of , not", ), + ({"min_samples_split": -1}, ValueError, "min_samples_split == -1, must be > 1"), + ({"min_samples_split": 1}, ValueError, "min_samples_split == 1, must be > 1"), + ( + {"min_samples_split": 0.0}, + ValueError, + "min_samples_split == 0.0, must be > 0.0", + ), + ( + {"min_samples_split": 1.1}, + ValueError, + "min_samples_split == 1.1, must be <= 1.0", + ), + ( + {"min_samples_split": 2.5}, + ValueError, + "min_samples_split == 2.5, must be <= 1.0", + ), + ( + {"min_samples_split": "foo"}, + TypeError, + "min_samples_split must be an instance of , not" + " .", + ), ], ) def test_tree_params_validation(name, Tree, params, err_type, err_msg): From 88b13d64c0bd7929c7cb70bcf3e66a38a1c25f12 Mon Sep 17 00:00:00 2001 From: genvalen Date: Wed, 15 Dec 2021 17:15:10 -0500 Subject: [PATCH 04/41] min_impurity_decrease: add tests and valiation --- sklearn/tree/_classes.py | 12 +++++++++--- sklearn/tree/tests/test_tree.py | 14 ++++++++++++-- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index a0775f3fe8285..d78e2e156594a 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -354,9 +354,15 @@ def fit(self, X, y, sample_weight=None, check_input=True): else: min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) - if self.min_impurity_decrease < 0.0: - raise ValueError("min_impurity_decrease must be greater than or equal to 0") - + # if self.min_impurity_decrease < 0.0: + # raise ValueError("min_impurity_decrease must be greater than or equal to 0") + check_scalar( + self.min_impurity_decrease, + "min_impurity_decrease", + target_type=numbers.Real, + min_val=0, + include_boundaries="left", + ) # Build tree criterion = self.criterion if not isinstance(criterion, Criterion): diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 7db4980830340..8080ba5ffceb2 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -606,8 +606,8 @@ def test_error(): # TreeEstimator(max_depth=-1).fit(X, y) with pytest.raises(ValueError): TreeEstimator(max_features=42).fit(X, y) - with pytest.raises(ValueError): - TreeEstimator(min_impurity_decrease=-1.0).fit(X, y) + # with pytest.raises(ValueError): + # TreeEstimator(min_impurity_decrease=-1.0).fit(X, y) # Wrong dimensions est = TreeEstimator() @@ -695,6 +695,16 @@ def test_error(): "min_samples_split must be an instance of , not" " .", ), + ( + {"min_impurity_decrease": -1.0}, + ValueError, + "min_impurity_decrease == -1.0, must be >= 0", + ), + ( + {"min_impurity_decrease": "foo"}, + TypeError, + "min_impurity_decrease must be an instance of ", + ), ], ) def test_tree_params_validation(name, Tree, params, err_type, err_msg): From 645fc2291a4c7cdf341dfc9243bd68f77d4149a1 Mon Sep 17 00:00:00 2001 From: genvalen Date: Wed, 15 Dec 2021 18:01:49 -0500 Subject: [PATCH 05/41] fix lint issue --- sklearn/tree/_classes.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index d78e2e156594a..1bc5d1300a63d 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -355,7 +355,9 @@ def fit(self, X, y, sample_weight=None, check_input=True): min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) # if self.min_impurity_decrease < 0.0: - # raise ValueError("min_impurity_decrease must be greater than or equal to 0") + # raise ValueError( + # "min_impurity_decrease must be greater than or equal to 0" + # ) check_scalar( self.min_impurity_decrease, "min_impurity_decrease", From 69c43472cfb76ee225a86822b1c9966a2bb62ae8 Mon Sep 17 00:00:00 2001 From: genvalen Date: Wed, 15 Dec 2021 23:24:54 -0500 Subject: [PATCH 06/41] max_leaf_nodes: add tests and validation --- sklearn/tree/_classes.py | 19 +++++++++---------- sklearn/tree/tests/test_tree.py | 6 ++++++ 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 1bc5d1300a63d..a64698767b445 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -235,6 +235,15 @@ def fit(self, X, y, sample_weight=None, check_input=True): include_boundaries="neither", ) max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth + + if self.max_leaf_nodes is not None: + check_scalar( + self.max_leaf_nodes, + "max_leaf_nodes", + target_type=numbers.Integral, + min_val=2, + include_boundaries="left", + ) max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes if isinstance(self.min_samples_leaf, numbers.Integral): @@ -328,16 +337,6 @@ def fit(self, X, y, sample_weight=None, check_input=True): # raise ValueError("max_depth must be greater than zero. ") if not (0 < max_features <= self.n_features_in_): raise ValueError("max_features must be in (0, n_features]") - if not isinstance(max_leaf_nodes, numbers.Integral): - raise ValueError( - "max_leaf_nodes must be integral number but was %r" % max_leaf_nodes - ) - if -1 < max_leaf_nodes < 2: - raise ValueError( - ("max_leaf_nodes {0} must be either None or larger than 1").format( - max_leaf_nodes - ) - ) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, DOUBLE) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 8080ba5ffceb2..7efab0bbec04f 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -695,6 +695,12 @@ def test_error(): "min_samples_split must be an instance of , not" " .", ), + ({"max_leaf_nodes": 0}, ValueError, "max_leaf_nodes == 0, must be >= 2"), + ( + {"max_leaf_nodes": 1.5}, + TypeError, + "max_leaf_nodes must be an instance of ", + ), ( {"min_impurity_decrease": -1.0}, ValueError, From a2c8c4b9a9fa2b5d830098e3dd87e43e6ae21aeb Mon Sep 17 00:00:00 2001 From: genvalen Date: Thu, 16 Dec 2021 20:27:46 -0500 Subject: [PATCH 07/41] min_weight_fraction_leaf: add tests and validation --- sklearn/tree/_classes.py | 12 ++++++++++-- sklearn/tree/tests/test_tree.py | 11 +++++++---- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index a64698767b445..0642c26347bea 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -331,8 +331,16 @@ def fit(self, X, y, sample_weight=None, check_input=True): "Number of labels=%d does not match number of samples=%d" % (len(y), n_samples) ) - if not 0 <= self.min_weight_fraction_leaf <= 0.5: - raise ValueError("min_weight_fraction_leaf must in [0, 0.5]") + # if not 0 <= self.min_weight_fraction_leaf <= 0.5: + # raise ValueError("min_weight_fraction_leaf must in [0, 0.5]") + check_scalar( + self.min_weight_fraction_leaf, + "min_weight_fraction_leaf", + target_type=numbers.Real, + min_val=0, + max_val=1, + include_boundaries="left" + ) # if max_depth <= 0: # raise ValueError("max_depth must be greater than zero. ") if not (0 < max_features <= self.n_features_in_): diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 7efab0bbec04f..5cfe1bd586ba6 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -590,10 +590,10 @@ def test_error(): TreeEstimator(min_samples_leaf=0.0).fit(X, y) with pytest.raises(ValueError): TreeEstimator(min_samples_leaf=3.0).fit(X, y) - with pytest.raises(ValueError): - TreeEstimator(min_weight_fraction_leaf=-1).fit(X, y) - with pytest.raises(ValueError): - TreeEstimator(min_weight_fraction_leaf=0.51).fit(X, y) + # with pytest.raises(ValueError): + # TreeEstimator(min_weight_fraction_leaf=-1).fit(X, y) + # with pytest.raises(ValueError): + # TreeEstimator(min_weight_fraction_leaf=0.51).fit(X, y) # with pytest.raises(ValueError): # TreeEstimator(min_samples_split=-1).fit(X, y) # with pytest.raises(ValueError): @@ -695,6 +695,9 @@ def test_error(): "min_samples_split must be an instance of , not" " .", ), + ({"min_weight_fraction_leaf": -1}, ValueError, "min_weight_fraction_leaf == -1, must be >= 0"), + ({"min_weight_fraction_leaf": 1.1}, ValueError, "min_weight_fraction_leaf == 1.1, must be < 1"), + ({"min_weight_fraction_leaf": "foo"}, TypeError, "min_weight_fraction_leaf must be an instance of "), ({"max_leaf_nodes": 0}, ValueError, "max_leaf_nodes == 0, must be >= 2"), ( {"max_leaf_nodes": 1.5}, From 4100bbd8b5b00aec830e18ac648df7f7a6cacc6e Mon Sep 17 00:00:00 2001 From: genvalen Date: Thu, 16 Dec 2021 20:29:29 -0500 Subject: [PATCH 08/41] format files w/ black --- sklearn/tree/_classes.py | 2 +- sklearn/tree/tests/test_tree.py | 18 +++++++++++++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 0642c26347bea..d056adbf76ba5 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -339,7 +339,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): target_type=numbers.Real, min_val=0, max_val=1, - include_boundaries="left" + include_boundaries="left", ) # if max_depth <= 0: # raise ValueError("max_depth must be greater than zero. ") diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 5cfe1bd586ba6..7ad1928fa6447 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -695,9 +695,21 @@ def test_error(): "min_samples_split must be an instance of , not" " .", ), - ({"min_weight_fraction_leaf": -1}, ValueError, "min_weight_fraction_leaf == -1, must be >= 0"), - ({"min_weight_fraction_leaf": 1.1}, ValueError, "min_weight_fraction_leaf == 1.1, must be < 1"), - ({"min_weight_fraction_leaf": "foo"}, TypeError, "min_weight_fraction_leaf must be an instance of "), + ( + {"min_weight_fraction_leaf": -1}, + ValueError, + "min_weight_fraction_leaf == -1, must be >= 0", + ), + ( + {"min_weight_fraction_leaf": 1.1}, + ValueError, + "min_weight_fraction_leaf == 1.1, must be < 1", + ), + ( + {"min_weight_fraction_leaf": "foo"}, + TypeError, + "min_weight_fraction_leaf must be an instance of ", + ), ({"max_leaf_nodes": 0}, ValueError, "max_leaf_nodes == 0, must be >= 2"), ( {"max_leaf_nodes": 1.5}, From ec647de9ee3cbadcf5144d5a44bdd7e1d167a482 Mon Sep 17 00:00:00 2001 From: genvalen Date: Fri, 17 Dec 2021 13:09:34 -0500 Subject: [PATCH 09/41] remove comments --- sklearn/tree/_classes.py | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index d056adbf76ba5..70061f0605449 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -262,13 +262,6 @@ def fit(self, X, y, sample_weight=None, check_input=True): min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples)) if isinstance(self.min_samples_split, numbers.Integral): - # if not 2 <= self.min_samples_split: - # raise ValueError( - # "min_samples_split must be an integer " - # "greater than 1 or a float in (0.0, 1.0]; " - # "got the integer %s" - # % self.min_samples_split - # ) check_scalar( self.min_samples_split, "min_samples_split", @@ -278,13 +271,6 @@ def fit(self, X, y, sample_weight=None, check_input=True): ) min_samples_split = self.min_samples_split else: # float, string, other - # if not 0.0 < self.min_samples_split <= 1.0: - # raise ValueError( - # "min_samples_split must be an integer " - # "greater than 1 or a float in (0.0, 1.0]; " - # "got the float %s" - # % self.min_samples_split - # ) check_scalar( self.min_samples_split, "min_samples_split", @@ -331,18 +317,15 @@ def fit(self, X, y, sample_weight=None, check_input=True): "Number of labels=%d does not match number of samples=%d" % (len(y), n_samples) ) - # if not 0 <= self.min_weight_fraction_leaf <= 0.5: - # raise ValueError("min_weight_fraction_leaf must in [0, 0.5]") + check_scalar( self.min_weight_fraction_leaf, "min_weight_fraction_leaf", target_type=numbers.Real, - min_val=0, - max_val=1, + min_val=0.0, + max_val=1.0, include_boundaries="left", ) - # if max_depth <= 0: - # raise ValueError("max_depth must be greater than zero. ") if not (0 < max_features <= self.n_features_in_): raise ValueError("max_features must be in (0, n_features]") @@ -361,15 +344,11 @@ def fit(self, X, y, sample_weight=None, check_input=True): else: min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) - # if self.min_impurity_decrease < 0.0: - # raise ValueError( - # "min_impurity_decrease must be greater than or equal to 0" - # ) check_scalar( self.min_impurity_decrease, "min_impurity_decrease", target_type=numbers.Real, - min_val=0, + min_val=0.0, include_boundaries="left", ) # Build tree From 9bc65821fcea2e542b96d33b17106f24effb8d6e Mon Sep 17 00:00:00 2001 From: genvalen Date: Tue, 28 Dec 2021 04:56:01 -0500 Subject: [PATCH 10/41] min_samples_leaf: add tests and validation --- sklearn/tree/_classes.py | 25 +++++++++++++++---------- sklearn/tree/tests/test_tree.py | 13 +++++++++++++ 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 70061f0605449..9a1fe58c31155 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -247,18 +247,23 @@ def fit(self, X, y, sample_weight=None, check_input=True): max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes if isinstance(self.min_samples_leaf, numbers.Integral): - if not 1 <= self.min_samples_leaf: - raise ValueError( - "min_samples_leaf must be at least 1 or in (0, 0.5], got %s" - % self.min_samples_leaf - ) + check_scalar( + self.min_samples_leaf, + "min_samples_leaf", + target_type=numbers.Integral, + min_val=1, + max_val=n_samples, + ) min_samples_leaf = self.min_samples_leaf else: # float - if not 0.0 < self.min_samples_leaf <= 0.5: - raise ValueError( - "min_samples_leaf must be at least 1 or in (0, 0.5], got %s" - % self.min_samples_leaf - ) + check_scalar( + self.min_samples_leaf, + "min_samples_leaf", + target_type=numbers.Real, + min_val=0.0, + max_val=0.5, + include_boundaries="right", + ) min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples)) if isinstance(self.min_samples_split, numbers.Integral): diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 7ad1928fa6447..993289dcdaf76 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -695,6 +695,19 @@ def test_error(): "min_samples_split must be an instance of , not" " .", ), + ({"min_samples_leaf": 0}, ValueError, "min_samples_leaf == 0, must be >= 1"), + ({"min_samples_leaf": 900}, ValueError, "min_samples_leaf == 900, must be <="), + ({"min_samples_leaf": 0.0}, ValueError, "min_samples_leaf == 0.0, must be > 0"), + ( + {"min_samples_leaf": 0.6}, + ValueError, + "min_samples_leaf == 0.6, must be <= 0.5", + ), + ( + {"min_samples_leaf": "foo"}, + TypeError, + "min_samples_leaf must be an instance of ", + ), ( {"min_weight_fraction_leaf": -1}, ValueError, From c850f6cc6a9c31225b8c388fa53c29d1d97f0a43 Mon Sep 17 00:00:00 2001 From: genvalen Date: Tue, 28 Dec 2021 05:00:19 -0500 Subject: [PATCH 11/41] max_features: add tests and validation --- sklearn/tree/_classes.py | 20 +++++++++++++++++--- sklearn/tree/tests/test_tree.py | 30 +++++++----------------------- 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 9a1fe58c31155..acb508159e734 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -14,6 +14,7 @@ # # License: BSD 3 clause +from io import IncrementalNewlineDecoder import numbers import warnings import copy @@ -308,8 +309,24 @@ def fit(self, X, y, sample_weight=None, check_input=True): elif self.max_features is None: max_features = self.n_features_in_ elif isinstance(self.max_features, numbers.Integral): + check_scalar( + self.max_features, + "max_features", + target_type=numbers.Integral, + min_val=0, + max_val=self.n_features_in_, + include_boundaries="right", + ) max_features = self.max_features else: # float + check_scalar( + self.max_features, + "max_features", + target_type=numbers.Real, + min_val=0.0, + max_val=1.0, + include_boundaries="right", + ) if self.max_features > 0.0: max_features = max(1, int(self.max_features * self.n_features_in_)) else: @@ -331,9 +348,6 @@ def fit(self, X, y, sample_weight=None, check_input=True): max_val=1.0, include_boundaries="left", ) - if not (0 < max_features <= self.n_features_in_): - raise ValueError("max_features must be in (0, n_features]") - if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, DOUBLE) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 993289dcdaf76..d8271f79af488 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -546,27 +546,6 @@ def test_max_features(): est.fit(iris.data, iris.target) assert est.max_features_ == iris.data.shape[1] - # use values of max_features that are invalid - est = TreeEstimator(max_features=10) - with pytest.raises(ValueError): - est.fit(X, y) - - est = TreeEstimator(max_features=-1) - with pytest.raises(ValueError): - est.fit(X, y) - - est = TreeEstimator(max_features=0.0) - with pytest.raises(ValueError): - est.fit(X, y) - - est = TreeEstimator(max_features=1.5) - with pytest.raises(ValueError): - est.fit(X, y) - - est = TreeEstimator(max_features="foobar") - with pytest.raises(ValueError): - est.fit(X, y) - def test_error(): # Test that it gives proper exception on deficient input. @@ -604,8 +583,8 @@ def test_error(): # TreeEstimator(min_samples_split=2.5).fit(X, y) # with pytest.raises(ValueError): # TreeEstimator(max_depth=-1).fit(X, y) - with pytest.raises(ValueError): - TreeEstimator(max_features=42).fit(X, y) + # with pytest.raises(ValueError): + # TreeEstimator(max_features=42).fit(X, y) # with pytest.raises(ValueError): # TreeEstimator(min_impurity_decrease=-1.0).fit(X, y) @@ -723,6 +702,11 @@ def test_error(): TypeError, "min_weight_fraction_leaf must be an instance of ", ), + ({"max_features": 0}, ValueError, "max_features == 0, must be > 0"), + ({"max_features": 1_000}, ValueError, "max_features == 1000, must be <="), + ({"max_features": 0.0}, ValueError, "max_features == 0.0, must be > 0.0"), + ({"max_features": 1.1}, ValueError, "max_features == 1.1, must be <= 1.0"), + ({"max_features": "foobar"}, ValueError, "Invalid value for max_features."), ({"max_leaf_nodes": 0}, ValueError, "max_leaf_nodes == 0, must be >= 2"), ( {"max_leaf_nodes": 1.5}, From 4fe47d0c451a61bf78e9490ddf6006d0eb7ffed6 Mon Sep 17 00:00:00 2001 From: genvalen Date: Tue, 28 Dec 2021 05:08:28 -0500 Subject: [PATCH 12/41] min_samples_split: update --- sklearn/tree/_classes.py | 2 +- sklearn/tree/tests/test_tree.py | 13 +++---------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index acb508159e734..4446f6d695adc 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -282,7 +282,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): "min_samples_split", target_type=numbers.Real, min_val=0.0, - max_val=1.0, + max_val=0.5, include_boundaries="right", ) min_samples_split = int(ceil(self.min_samples_split * n_samples)) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index d8271f79af488..a541557903406 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -651,7 +651,6 @@ def test_error(): TypeError, "max_depth must be an instance of , not", ), - ({"min_samples_split": -1}, ValueError, "min_samples_split == -1, must be > 1"), ({"min_samples_split": 1}, ValueError, "min_samples_split == 1, must be > 1"), ( {"min_samples_split": 0.0}, @@ -659,20 +658,14 @@ def test_error(): "min_samples_split == 0.0, must be > 0.0", ), ( - {"min_samples_split": 1.1}, + {"min_samples_split": 0.6}, ValueError, - "min_samples_split == 1.1, must be <= 1.0", - ), - ( - {"min_samples_split": 2.5}, - ValueError, - "min_samples_split == 2.5, must be <= 1.0", + "min_samples_split == 0.6, must be <= 0.5", ), ( {"min_samples_split": "foo"}, TypeError, - "min_samples_split must be an instance of , not" - " .", + "min_samples_split must be an instance of ", ), ({"min_samples_leaf": 0}, ValueError, "min_samples_leaf == 0, must be >= 1"), ({"min_samples_leaf": 900}, ValueError, "min_samples_leaf == 900, must be <="), From 44c5afc38e5cb91eafb15bae9fc94adbd244dd91 Mon Sep 17 00:00:00 2001 From: genvalen Date: Tue, 28 Dec 2021 05:13:38 -0500 Subject: [PATCH 13/41] max_leaf_nodes: remove redundant tests --- sklearn/tree/tests/test_tree.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index a541557903406..b97cce2698e1b 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1329,17 +1329,6 @@ def test_max_leaf_nodes(): est = TreeEstimator(max_depth=None, max_leaf_nodes=k + 1).fit(X, y) assert est.get_n_leaves() == k + 1 - # max_leaf_nodes in (0, 1) should raise ValueError - est = TreeEstimator(max_depth=None, max_leaf_nodes=0) - with pytest.raises(ValueError): - est.fit(X, y) - est = TreeEstimator(max_depth=None, max_leaf_nodes=1) - with pytest.raises(ValueError): - est.fit(X, y) - est = TreeEstimator(max_depth=None, max_leaf_nodes=0.1) - with pytest.raises(ValueError): - est.fit(X, y) - def test_max_leaf_nodes_max_depth(): # Test precedence of max_leaf_nodes over max_depth. From 12eb88acddc9efe3adfd2e7754f32ab38226f087 Mon Sep 17 00:00:00 2001 From: genvalen Date: Tue, 28 Dec 2021 05:13:38 -0500 Subject: [PATCH 14/41] remove redundant tests --- sklearn/tree/tests/test_tree.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index a541557903406..b97cce2698e1b 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1329,17 +1329,6 @@ def test_max_leaf_nodes(): est = TreeEstimator(max_depth=None, max_leaf_nodes=k + 1).fit(X, y) assert est.get_n_leaves() == k + 1 - # max_leaf_nodes in (0, 1) should raise ValueError - est = TreeEstimator(max_depth=None, max_leaf_nodes=0) - with pytest.raises(ValueError): - est.fit(X, y) - est = TreeEstimator(max_depth=None, max_leaf_nodes=1) - with pytest.raises(ValueError): - est.fit(X, y) - est = TreeEstimator(max_depth=None, max_leaf_nodes=0.1) - with pytest.raises(ValueError): - est.fit(X, y) - def test_max_leaf_nodes_max_depth(): # Test precedence of max_leaf_nodes over max_depth. From 60a215f76f72163bfb916f7b0ec0a095c8ea8e7a Mon Sep 17 00:00:00 2001 From: genvalen Date: Tue, 28 Dec 2021 05:30:01 -0500 Subject: [PATCH 15/41] Fix lint issue --- sklearn/tree/_classes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 4446f6d695adc..abce2789823c9 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -14,7 +14,6 @@ # # License: BSD 3 clause -from io import IncrementalNewlineDecoder import numbers import warnings import copy From e845597267a6aab5223731a4f942f688406623ed Mon Sep 17 00:00:00 2001 From: genvalen Date: Tue, 28 Dec 2021 05:58:10 -0500 Subject: [PATCH 16/41] update range for min_weight_fraction_leaf --- sklearn/tree/_classes.py | 3 +-- sklearn/tree/tests/test_tree.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index abce2789823c9..46065b92863d4 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -344,8 +344,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): "min_weight_fraction_leaf", target_type=numbers.Real, min_val=0.0, - max_val=1.0, - include_boundaries="left", + max_val=0.5, ) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, DOUBLE) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index b97cce2698e1b..67a15b2531045 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -686,9 +686,9 @@ def test_error(): "min_weight_fraction_leaf == -1, must be >= 0", ), ( - {"min_weight_fraction_leaf": 1.1}, + {"min_weight_fraction_leaf": 0.6}, ValueError, - "min_weight_fraction_leaf == 1.1, must be < 1", + "min_weight_fraction_leaf == 0.6, must be <= 0.5", ), ( {"min_weight_fraction_leaf": "foo"}, From 6eb026c43e5a625eaaa1466d310fec95f2f9150b Mon Sep 17 00:00:00 2001 From: genvalen Date: Wed, 29 Dec 2021 19:09:18 -0500 Subject: [PATCH 17/41] remove redundant tests --- sklearn/tree/tests/test_tree.py | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 67a15b2531045..dd22c4976f72d 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -560,34 +560,6 @@ def test_error(): with pytest.raises(ValueError): est.predict_proba(X2) - for name, TreeEstimator in ALL_TREES.items(): - with pytest.raises(ValueError): - TreeEstimator(min_samples_leaf=-1).fit(X, y) - with pytest.raises(ValueError): - TreeEstimator(min_samples_leaf=0.6).fit(X, y) - with pytest.raises(ValueError): - TreeEstimator(min_samples_leaf=0.0).fit(X, y) - with pytest.raises(ValueError): - TreeEstimator(min_samples_leaf=3.0).fit(X, y) - # with pytest.raises(ValueError): - # TreeEstimator(min_weight_fraction_leaf=-1).fit(X, y) - # with pytest.raises(ValueError): - # TreeEstimator(min_weight_fraction_leaf=0.51).fit(X, y) - # with pytest.raises(ValueError): - # TreeEstimator(min_samples_split=-1).fit(X, y) - # with pytest.raises(ValueError): - # TreeEstimator(min_samples_split=0.0).fit(X, y) - # with pytest.raises(ValueError): - # TreeEstimator(min_samples_split=1.1).fit(X, y) - # with pytest.raises(ValueError): - # TreeEstimator(min_samples_split=2.5).fit(X, y) - # with pytest.raises(ValueError): - # TreeEstimator(max_depth=-1).fit(X, y) - # with pytest.raises(ValueError): - # TreeEstimator(max_features=42).fit(X, y) - # with pytest.raises(ValueError): - # TreeEstimator(min_impurity_decrease=-1.0).fit(X, y) - # Wrong dimensions est = TreeEstimator() y2 = y[:-1] @@ -649,7 +621,7 @@ def test_error(): ( {"max_depth": 1.1}, TypeError, - "max_depth must be an instance of , not", + "max_depth must be an instance of ", ), ({"min_samples_split": 1}, ValueError, "min_samples_split == 1, must be > 1"), ( From b9a5091766bec1bd358799d7e9f128f276101387 Mon Sep 17 00:00:00 2001 From: genvalen Date: Wed, 29 Dec 2021 19:29:30 -0500 Subject: [PATCH 18/41] min_samples_split: update to include upper bound --- sklearn/tree/_classes.py | 4 ++-- sklearn/tree/tests/test_tree.py | 7 ++++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 46065b92863d4..e332e2a73255e 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -271,8 +271,8 @@ def fit(self, X, y, sample_weight=None, check_input=True): self.min_samples_split, "min_samples_split", target_type=numbers.Integral, - min_val=1, - include_boundaries="neither", + min_val=2, + max_val=n_samples, ) min_samples_split = self.min_samples_split else: # float, string, other diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index dd22c4976f72d..57a84692a3615 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -623,7 +623,12 @@ def test_error(): TypeError, "max_depth must be an instance of ", ), - ({"min_samples_split": 1}, ValueError, "min_samples_split == 1, must be > 1"), + ({"min_samples_split": 1}, ValueError, "min_samples_split == 1, must be >= 2"), + ( + {"min_samples_split": 900}, + ValueError, + "min_samples_split == 900, must be <=", + ), ( {"min_samples_split": 0.0}, ValueError, From eb8bacb453559900b7458a0b268db5beec6fd729 Mon Sep 17 00:00:00 2001 From: genvalen Date: Wed, 29 Dec 2021 19:34:27 -0500 Subject: [PATCH 19/41] slight style edit to tests --- sklearn/tree/tests/test_tree.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 57a84692a3615..a9d7e11e4566b 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -660,7 +660,7 @@ def test_error(): ( {"min_weight_fraction_leaf": -1}, ValueError, - "min_weight_fraction_leaf == -1, must be >= 0", + "min_weight_fraction_leaf == -1, must be >= 0.0", ), ( {"min_weight_fraction_leaf": 0.6}, @@ -673,7 +673,7 @@ def test_error(): "min_weight_fraction_leaf must be an instance of ", ), ({"max_features": 0}, ValueError, "max_features == 0, must be > 0"), - ({"max_features": 1_000}, ValueError, "max_features == 1000, must be <="), + ({"max_features": 1000}, ValueError, "max_features == 1000, must be <="), ({"max_features": 0.0}, ValueError, "max_features == 0.0, must be > 0.0"), ({"max_features": 1.1}, ValueError, "max_features == 1.1, must be <= 1.0"), ({"max_features": "foobar"}, ValueError, "Invalid value for max_features."), @@ -684,9 +684,9 @@ def test_error(): "max_leaf_nodes must be an instance of ", ), ( - {"min_impurity_decrease": -1.0}, + {"min_impurity_decrease": -1}, ValueError, - "min_impurity_decrease == -1.0, must be >= 0", + "min_impurity_decrease == -1, must be >= 0.0", ), ( {"min_impurity_decrease": "foo"}, From 1060fd74f862612fc246ec065f95b4dfbeaf8532 Mon Sep 17 00:00:00 2001 From: genvalen Date: Wed, 29 Dec 2021 20:40:09 -0500 Subject: [PATCH 20/41] update boundary inclusivity for max_features --- sklearn/tree/_classes.py | 3 +-- sklearn/tree/tests/test_tree.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index e332e2a73255e..a7c1d822c916f 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -312,9 +312,8 @@ def fit(self, X, y, sample_weight=None, check_input=True): self.max_features, "max_features", target_type=numbers.Integral, - min_val=0, + min_val=1, max_val=self.n_features_in_, - include_boundaries="right", ) max_features = self.max_features else: # float diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index a9d7e11e4566b..af76d23557c56 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -672,7 +672,7 @@ def test_error(): TypeError, "min_weight_fraction_leaf must be an instance of ", ), - ({"max_features": 0}, ValueError, "max_features == 0, must be > 0"), + ({"max_features": 0}, ValueError, "max_features == 0, must be >= 1"), ({"max_features": 1000}, ValueError, "max_features == 1000, must be <="), ({"max_features": 0.0}, ValueError, "max_features == 0.0, must be > 0.0"), ({"max_features": 1.1}, ValueError, "max_features == 1.1, must be <= 1.0"), From f5973f842810f3b560b22490a4498511154c82f2 Mon Sep 17 00:00:00 2001 From: genvalen Date: Wed, 29 Dec 2021 20:50:18 -0500 Subject: [PATCH 21/41] update order of tests --- sklearn/tree/tests/test_tree.py | 118 ++++++++++++++++---------------- 1 file changed, 59 insertions(+), 59 deletions(-) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index af76d23557c56..5e466cd06381d 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -501,52 +501,6 @@ def test_importances_gini_equal_squared_error(): assert_array_equal(clf.tree_.n_node_samples, reg.tree_.n_node_samples) -def test_max_features(): - # Check max_features. - for name, TreeRegressor in REG_TREES.items(): - reg = TreeRegressor(max_features="auto") - reg.fit(diabetes.data, diabetes.target) - assert reg.max_features_ == diabetes.data.shape[1] - - for name, TreeClassifier in CLF_TREES.items(): - clf = TreeClassifier(max_features="auto") - clf.fit(iris.data, iris.target) - assert clf.max_features_ == 2 - - for name, TreeEstimator in ALL_TREES.items(): - est = TreeEstimator(max_features="sqrt") - est.fit(iris.data, iris.target) - assert est.max_features_ == int(np.sqrt(iris.data.shape[1])) - - est = TreeEstimator(max_features="log2") - est.fit(iris.data, iris.target) - assert est.max_features_ == int(np.log2(iris.data.shape[1])) - - est = TreeEstimator(max_features=1) - est.fit(iris.data, iris.target) - assert est.max_features_ == 1 - - est = TreeEstimator(max_features=3) - est.fit(iris.data, iris.target) - assert est.max_features_ == 3 - - est = TreeEstimator(max_features=0.01) - est.fit(iris.data, iris.target) - assert est.max_features_ == 1 - - est = TreeEstimator(max_features=0.5) - est.fit(iris.data, iris.target) - assert est.max_features_ == int(0.5 * iris.data.shape[1]) - - est = TreeEstimator(max_features=1.0) - est.fit(iris.data, iris.target) - assert est.max_features_ == iris.data.shape[1] - - est = TreeEstimator(max_features=None) - est.fit(iris.data, iris.target) - assert est.max_features_ == iris.data.shape[1] - - def test_error(): # Test that it gives proper exception on deficient input. for name, TreeEstimator in CLF_TREES.items(): @@ -623,6 +577,19 @@ def test_error(): TypeError, "max_depth must be an instance of ", ), + ({"min_samples_leaf": 0}, ValueError, "min_samples_leaf == 0, must be >= 1"), + ({"min_samples_leaf": 900}, ValueError, "min_samples_leaf == 900, must be <="), + ({"min_samples_leaf": 0.0}, ValueError, "min_samples_leaf == 0.0, must be > 0"), + ( + {"min_samples_leaf": 0.6}, + ValueError, + "min_samples_leaf == 0.6, must be <= 0.5", + ), + ( + {"min_samples_leaf": "foo"}, + TypeError, + "min_samples_leaf must be an instance of ", + ), ({"min_samples_split": 1}, ValueError, "min_samples_split == 1, must be >= 2"), ( {"min_samples_split": 900}, @@ -644,19 +611,6 @@ def test_error(): TypeError, "min_samples_split must be an instance of ", ), - ({"min_samples_leaf": 0}, ValueError, "min_samples_leaf == 0, must be >= 1"), - ({"min_samples_leaf": 900}, ValueError, "min_samples_leaf == 900, must be <="), - ({"min_samples_leaf": 0.0}, ValueError, "min_samples_leaf == 0.0, must be > 0"), - ( - {"min_samples_leaf": 0.6}, - ValueError, - "min_samples_leaf == 0.6, must be <= 0.5", - ), - ( - {"min_samples_leaf": "foo"}, - TypeError, - "min_samples_leaf must be an instance of ", - ), ( {"min_weight_fraction_leaf": -1}, ValueError, @@ -708,6 +662,52 @@ def test_tree_params_validation(name, Tree, params, err_type, err_msg): est.fit(X, y) +def test_max_features(): + # Check max_features. + for name, TreeRegressor in REG_TREES.items(): + reg = TreeRegressor(max_features="auto") + reg.fit(diabetes.data, diabetes.target) + assert reg.max_features_ == diabetes.data.shape[1] + + for name, TreeClassifier in CLF_TREES.items(): + clf = TreeClassifier(max_features="auto") + clf.fit(iris.data, iris.target) + assert clf.max_features_ == 2 + + for name, TreeEstimator in ALL_TREES.items(): + est = TreeEstimator(max_features="sqrt") + est.fit(iris.data, iris.target) + assert est.max_features_ == int(np.sqrt(iris.data.shape[1])) + + est = TreeEstimator(max_features="log2") + est.fit(iris.data, iris.target) + assert est.max_features_ == int(np.log2(iris.data.shape[1])) + + est = TreeEstimator(max_features=1) + est.fit(iris.data, iris.target) + assert est.max_features_ == 1 + + est = TreeEstimator(max_features=3) + est.fit(iris.data, iris.target) + assert est.max_features_ == 3 + + est = TreeEstimator(max_features=0.01) + est.fit(iris.data, iris.target) + assert est.max_features_ == 1 + + est = TreeEstimator(max_features=0.5) + est.fit(iris.data, iris.target) + assert est.max_features_ == int(0.5 * iris.data.shape[1]) + + est = TreeEstimator(max_features=1.0) + est.fit(iris.data, iris.target) + assert est.max_features_ == iris.data.shape[1] + + est = TreeEstimator(max_features=None) + est.fit(iris.data, iris.target) + assert est.max_features_ == iris.data.shape[1] + + def test_min_samples_split(): """Test min_samples_split parameter""" X = np.asfortranarray(iris.data, dtype=tree._tree.DTYPE) From 5c5769bf0badf38a06fb7bcc65c852d16a153670 Mon Sep 17 00:00:00 2001 From: genvalen Date: Wed, 29 Dec 2021 21:09:42 -0500 Subject: [PATCH 22/41] update order of validations to match signature more closely --- sklearn/tree/_classes.py | 50 +++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index a7c1d822c916f..a96967a302e7b 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -236,16 +236,6 @@ def fit(self, X, y, sample_weight=None, check_input=True): ) max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth - if self.max_leaf_nodes is not None: - check_scalar( - self.max_leaf_nodes, - "max_leaf_nodes", - target_type=numbers.Integral, - min_val=2, - include_boundaries="left", - ) - max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes - if isinstance(self.min_samples_leaf, numbers.Integral): check_scalar( self.min_samples_leaf, @@ -289,6 +279,14 @@ def fit(self, X, y, sample_weight=None, check_input=True): min_samples_split = max(min_samples_split, 2 * min_samples_leaf) + check_scalar( + self.min_weight_fraction_leaf, + "min_weight_fraction_leaf", + target_type=numbers.Real, + min_val=0.0, + max_val=0.5, + ) + if isinstance(self.max_features, str): if self.max_features == "auto": if is_classification: @@ -332,19 +330,30 @@ def fit(self, X, y, sample_weight=None, check_input=True): self.max_features_ = max_features - if len(y) != n_samples: - raise ValueError( - "Number of labels=%d does not match number of samples=%d" - % (len(y), n_samples) + if self.max_leaf_nodes is not None: + check_scalar( + self.max_leaf_nodes, + "max_leaf_nodes", + target_type=numbers.Integral, + min_val=2, + include_boundaries="left", ) + max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes check_scalar( - self.min_weight_fraction_leaf, - "min_weight_fraction_leaf", + self.min_impurity_decrease, + "min_impurity_decrease", target_type=numbers.Real, min_val=0.0, - max_val=0.5, + include_boundaries="left", ) + + if len(y) != n_samples: + raise ValueError( + "Number of labels=%d does not match number of samples=%d" + % (len(y), n_samples) + ) + if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, DOUBLE) @@ -360,13 +369,6 @@ def fit(self, X, y, sample_weight=None, check_input=True): else: min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) - check_scalar( - self.min_impurity_decrease, - "min_impurity_decrease", - target_type=numbers.Real, - min_val=0.0, - include_boundaries="left", - ) # Build tree criterion = self.criterion if not isinstance(criterion, Criterion): From 2e4528d96141bb7de0189f7d5e867c585eb29e0a Mon Sep 17 00:00:00 2001 From: genvalen Date: Wed, 29 Dec 2021 22:45:02 -0500 Subject: [PATCH 23/41] remove some boundary args to rely on defaults more and reduce lines --- sklearn/tree/_classes.py | 5 +---- sklearn/tree/tests/test_tree.py | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index a96967a302e7b..03a9cea4c64ef 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -231,8 +231,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): self.max_depth, "max_depth", target_type=numbers.Integral, - min_val=0, - include_boundaries="neither", + min_val=1, ) max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth @@ -336,7 +335,6 @@ def fit(self, X, y, sample_weight=None, check_input=True): "max_leaf_nodes", target_type=numbers.Integral, min_val=2, - include_boundaries="left", ) max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes @@ -345,7 +343,6 @@ def fit(self, X, y, sample_weight=None, check_input=True): "min_impurity_decrease", target_type=numbers.Real, min_val=0.0, - include_boundaries="left", ) if len(y) != n_samples: diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 5e466cd06381d..a83f33a7063e0 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -571,7 +571,7 @@ def test_error(): @pytest.mark.parametrize( "params, err_type, err_msg", [ - ({"max_depth": -1}, ValueError, "max_depth == -1, must be > 0"), + ({"max_depth": -1}, ValueError, "max_depth == -1, must be >= 1"), ( {"max_depth": 1.1}, TypeError, From dfb4b87a8ce5876cae72994c1649cfdc0943fe2b Mon Sep 17 00:00:00 2001 From: genvalen Date: Mon, 3 Jan 2022 23:31:13 -0500 Subject: [PATCH 24/41] ccp_alpha: add tests and validation --- sklearn/tree/_classes.py | 10 +++++++--- sklearn/tree/tests/test_tree.py | 8 +++++++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 03a9cea4c64ef..950f7abdf277b 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -152,8 +152,12 @@ def fit(self, X, y, sample_weight=None, check_input=True): random_state = check_random_state(self.random_state) - if self.ccp_alpha < 0.0: - raise ValueError("ccp_alpha must be greater than or equal to 0") + check_scalar( + self.ccp_alpha, + "ccp_alpha", + target_type=numbers.Real, + min_val=0.0, + ) if check_input: # Need to validate separately here. @@ -570,7 +574,7 @@ def _prune_tree(self): check_is_fitted(self) if self.ccp_alpha < 0.0: - raise ValueError("ccp_alpha must be greater than or equal to 0") + raise ValueError("must be >= 0.0.") if self.ccp_alpha == 0.0: return diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index a83f33a7063e0..ed7cdfb4dfa78 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -647,6 +647,12 @@ def test_error(): TypeError, "min_impurity_decrease must be an instance of ", ), + ({"ccp_alpha": -1.0}, ValueError, "ccp_alpha == -1.0, must be >= 0.0"), + ( + {"ccp_alpha": "foo"}, + TypeError, + "ccp_alpha must be an instance of ", + ), ], ) def test_tree_params_validation(name, Tree, params, err_type, err_msg): @@ -2019,7 +2025,7 @@ def assert_is_subtree(tree, subtree): def test_prune_tree_raises_negative_ccp_alpha(): clf = DecisionTreeClassifier() - msg = "ccp_alpha must be greater than or equal to 0" + msg = "must be >= 0.0." with pytest.raises(ValueError, match=msg): clf.set_params(ccp_alpha=-1.0) From fa7a486b7c34abfdf6c8d37388636eca084d0c41 Mon Sep 17 00:00:00 2001 From: genvalen Date: Mon, 3 Jan 2022 23:45:01 -0500 Subject: [PATCH 25/41] edit comment --- sklearn/tree/_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 950f7abdf277b..ade6e8ec78ffe 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -268,7 +268,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): max_val=n_samples, ) min_samples_split = self.min_samples_split - else: # float, string, other + else: # float check_scalar( self.min_samples_split, "min_samples_split", From 42676787bcdaba12c7a96fa3456d8234b695d589 Mon Sep 17 00:00:00 2001 From: genvalen Date: Tue, 4 Jan 2022 10:22:56 -0500 Subject: [PATCH 26/41] Update sklearn/tree/_classes.py Co-authored-by: Guillaume Lemaitre --- sklearn/tree/_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index ade6e8ec78ffe..2885cd2f56bb5 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -274,7 +274,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): "min_samples_split", target_type=numbers.Real, min_val=0.0, - max_val=0.5, + max_val=1.0, include_boundaries="right", ) min_samples_split = int(ceil(self.min_samples_split * n_samples)) From 9d1bed6f795c08599f479d24580f9444392e6ba7 Mon Sep 17 00:00:00 2001 From: genvalen Date: Tue, 4 Jan 2022 12:14:02 -0500 Subject: [PATCH 27/41] min_samples_split: update tests --- sklearn/tree/tests/test_tree.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index ed7cdfb4dfa78..6ebd806ff01e1 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -602,9 +602,9 @@ def test_error(): "min_samples_split == 0.0, must be > 0.0", ), ( - {"min_samples_split": 0.6}, + {"min_samples_split": 1.1}, ValueError, - "min_samples_split == 0.6, must be <= 0.5", + "min_samples_split == 1.1, must be <= 1.0", ), ( {"min_samples_split": "foo"}, From 63a5a937bee290b4aaef9603aa30adf91080f126 Mon Sep 17 00:00:00 2001 From: genvalen Date: Tue, 4 Jan 2022 12:30:58 -0500 Subject: [PATCH 28/41] ccp_alpha: remove redundant tests --- sklearn/tree/_classes.py | 3 --- sklearn/tree/tests/test_tree.py | 16 ---------------- 2 files changed, 19 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 2885cd2f56bb5..f3304dcc804e5 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -573,9 +573,6 @@ def _prune_tree(self): """Prune tree using Minimal Cost-Complexity Pruning.""" check_is_fitted(self) - if self.ccp_alpha < 0.0: - raise ValueError("must be >= 0.0.") - if self.ccp_alpha == 0.0: return diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 6ebd806ff01e1..e43dc65ce8a8b 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -2023,22 +2023,6 @@ def assert_is_subtree(tree, subtree): ) -def test_prune_tree_raises_negative_ccp_alpha(): - clf = DecisionTreeClassifier() - msg = "must be >= 0.0." - - with pytest.raises(ValueError, match=msg): - clf.set_params(ccp_alpha=-1.0) - clf.fit(X, y) - - clf.set_params(ccp_alpha=0.0) - clf.fit(X, y) - - with pytest.raises(ValueError, match=msg): - clf.set_params(ccp_alpha=-1.0) - clf._prune_tree() - - def check_apply_path_readonly(name): X_readonly = create_memmap_backed_data(X_small.astype(tree._tree.DTYPE, copy=False)) y_readonly = create_memmap_backed_data(np.array(y_small, dtype=tree._tree.DTYPE)) From 5cf4fa3736cba6b9e502a60a694598ee69670cd9 Mon Sep 17 00:00:00 2001 From: genvalen Date: Fri, 7 Jan 2022 21:20:16 -0500 Subject: [PATCH 29/41] update check scalar calls to explicitly reference "name" param --- sklearn/tree/_classes.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index f3304dcc804e5..08a427394080e 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -154,7 +154,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): check_scalar( self.ccp_alpha, - "ccp_alpha", + name="ccp_alpha", target_type=numbers.Real, min_val=0.0, ) @@ -233,7 +233,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): if self.max_depth is not None: check_scalar( self.max_depth, - "max_depth", + name="max_depth", target_type=numbers.Integral, min_val=1, ) @@ -242,7 +242,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): if isinstance(self.min_samples_leaf, numbers.Integral): check_scalar( self.min_samples_leaf, - "min_samples_leaf", + name="min_samples_leaf", target_type=numbers.Integral, min_val=1, max_val=n_samples, @@ -251,7 +251,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): else: # float check_scalar( self.min_samples_leaf, - "min_samples_leaf", + name="min_samples_leaf", target_type=numbers.Real, min_val=0.0, max_val=0.5, @@ -262,7 +262,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): if isinstance(self.min_samples_split, numbers.Integral): check_scalar( self.min_samples_split, - "min_samples_split", + name="min_samples_split", target_type=numbers.Integral, min_val=2, max_val=n_samples, @@ -271,7 +271,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): else: # float check_scalar( self.min_samples_split, - "min_samples_split", + name="min_samples_split", target_type=numbers.Real, min_val=0.0, max_val=1.0, @@ -284,7 +284,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): check_scalar( self.min_weight_fraction_leaf, - "min_weight_fraction_leaf", + name="min_weight_fraction_leaf", target_type=numbers.Real, min_val=0.0, max_val=0.5, @@ -311,7 +311,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): elif isinstance(self.max_features, numbers.Integral): check_scalar( self.max_features, - "max_features", + name="max_features", target_type=numbers.Integral, min_val=1, max_val=self.n_features_in_, @@ -320,7 +320,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): else: # float check_scalar( self.max_features, - "max_features", + name="max_features", target_type=numbers.Real, min_val=0.0, max_val=1.0, @@ -336,7 +336,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): if self.max_leaf_nodes is not None: check_scalar( self.max_leaf_nodes, - "max_leaf_nodes", + name="max_leaf_nodes", target_type=numbers.Integral, min_val=2, ) @@ -344,7 +344,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): check_scalar( self.min_impurity_decrease, - "min_impurity_decrease", + name="min_impurity_decrease", target_type=numbers.Real, min_val=0.0, ) From 0b895876a834860d30c34206cdc447c71a704579 Mon Sep 17 00:00:00 2001 From: genvalen Date: Mon, 10 Jan 2022 23:32:01 -0500 Subject: [PATCH 30/41] Update sklearn/tree/_classes.py Co-authored-by: Thomas J. Fan --- sklearn/tree/_classes.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 08a427394080e..d39d42272d554 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -264,8 +264,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): self.min_samples_split, name="min_samples_split", target_type=numbers.Integral, - min_val=2, - max_val=n_samples, + min_val=1, ) min_samples_split = self.min_samples_split else: # float From a61e73c7d70bff77c1af477f8a6e594462247ddc Mon Sep 17 00:00:00 2001 From: genvalen Date: Tue, 11 Jan 2022 18:24:28 -0500 Subject: [PATCH 31/41] update tests for min_samples_split --- sklearn/tree/tests/test_tree.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index e43dc65ce8a8b..a443e249a249d 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -590,12 +590,7 @@ def test_error(): TypeError, "min_samples_leaf must be an instance of ", ), - ({"min_samples_split": 1}, ValueError, "min_samples_split == 1, must be >= 2"), - ( - {"min_samples_split": 900}, - ValueError, - "min_samples_split == 900, must be <=", - ), + ({"min_samples_split": 0}, ValueError, "min_samples_split == 0, must be >= 1"), ( {"min_samples_split": 0.0}, ValueError, From 51e098040871e33626d60dd2f2fbdcb40cae8bf7 Mon Sep 17 00:00:00 2001 From: genvalen Date: Tue, 11 Jan 2022 22:50:01 -0500 Subject: [PATCH 32/41] update messages in test_gbdt_parameter_checks to pass CI --- .../ensemble/tests/test_gradient_boosting.py | 84 +++++++++++++------ 1 file changed, 58 insertions(+), 26 deletions(-) diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index c2045aa35d652..df1dddf942f12 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -79,31 +79,63 @@ def test_classification_toy(loss): @pytest.mark.parametrize( - "params, err_msg", + "params, err_type, err_msg", [ - ({"n_estimators": 0}, "n_estimators must be greater than 0"), - ({"n_estimators": -1}, "n_estimators must be greater than 0"), - ({"learning_rate": 0}, "learning_rate must be greater than 0"), - ({"learning_rate": -1.0}, "learning_rate must be greater than 0"), - ({"loss": "foobar"}, "Loss 'foobar' not supported"), - ({"min_samples_split": 0.0}, "min_samples_split must be an integer"), - ({"min_samples_split": -1.0}, "min_samples_split must be an integer"), - ({"min_samples_split": 1.1}, "min_samples_split must be an integer"), - ({"min_samples_leaf": 0}, "min_samples_leaf must be at least 1 or"), - ({"min_samples_leaf": -1.0}, "min_samples_leaf must be at least 1 or"), - ({"min_weight_fraction_leaf": -1.0}, "min_weight_fraction_leaf must in"), - ({"min_weight_fraction_leaf": 0.6}, "min_weight_fraction_leaf must in"), - ({"subsample": 0.0}, r"subsample must be in \(0,1\]"), - ({"subsample": 1.1}, r"subsample must be in \(0,1\]"), - ({"subsample": -0.1}, r"subsample must be in \(0,1\]"), - ({"max_depth": -0.1}, "max_depth must be greater than zero"), - ({"max_depth": 0}, "max_depth must be greater than zero"), - ({"init": {}}, "The init parameter must be an estimator or 'zero'"), - ({"max_features": "invalid"}, "Invalid value for max_features:"), - ({"max_features": 0}, r"max_features must be in \(0, n_features\]"), - ({"max_features": 100}, r"max_features must be in \(0, n_features\]"), - ({"max_features": -0.1}, r"max_features must be in \(0, n_features\]"), - ({"n_iter_no_change": "invalid"}, "n_iter_no_change should either be"), + ({"n_estimators": 0}, ValueError, "n_estimators must be greater than 0"), + ({"n_estimators": -1}, ValueError, "n_estimators must be greater than 0"), + ({"learning_rate": 0}, ValueError, "learning_rate must be greater than 0"), + ({"learning_rate": -1.0}, ValueError, "learning_rate must be greater than 0"), + ({"loss": "foobar"}, ValueError, "Loss 'foobar' not supported"), + ( + {"min_samples_split": 0.0}, + ValueError, + "min_samples_split == 0.0, must be > 0.0", + ), + ( + {"min_samples_split": -1.0}, + ValueError, + "min_samples_split == -1.0, must be > 0.0", + ), + ( + {"min_samples_split": 1.1}, + ValueError, + "min_samples_split == 1.1, must be <= 1.0.", + ), + ({"min_samples_leaf": 0}, ValueError, "min_samples_leaf == 0, must be >= 1"), + ( + {"min_samples_leaf": -1.0}, + ValueError, + "min_samples_leaf == -1.0, must be > 0.0.", + ), + ( + {"min_weight_fraction_leaf": -1.0}, + ValueError, + "min_weight_fraction_leaf == -1.0, must be >= 0", + ), + ( + {"min_weight_fraction_leaf": 0.6}, + ValueError, + "min_weight_fraction_leaf == 0.6, must be <= 0.5.", + ), + ({"subsample": 0.0}, ValueError, r"subsample must be in \(0,1\]"), + ({"subsample": 1.1}, ValueError, r"subsample must be in \(0,1\]"), + ({"subsample": -0.1}, ValueError, r"subsample must be in \(0,1\]"), + ({"max_depth": -0.1}, TypeError, "max_depth must be an instance of"), + ({"max_depth": 0}, ValueError, "max_depth == 0, must be >= 1."), + ({"init": {}}, ValueError, "The init parameter must be an estimator or 'zero'"), + ({"max_features": "invalid"}, ValueError, "Invalid value for max_features:"), + ({"max_features": 0}, ValueError, "max_features == 0, must be >= 1"), + ({"max_features": 100}, ValueError, "max_features == 100, must be <="), + ( + {"max_features": -0.1}, + ValueError, + r"max_features must be in \(0, n_features\]", + ), + ( + {"n_iter_no_change": "invalid"}, + ValueError, + "n_iter_no_change should either be", + ), ], # Avoid long error messages in test names: # https://github.com/scikit-learn/scikit-learn/issues/21362 @@ -116,9 +148,9 @@ def test_classification_toy(loss): (GradientBoostingClassifier, iris.data, iris.target), ], ) -def test_gbdt_parameter_checks(GradientBoosting, X, y, params, err_msg): +def test_gbdt_parameter_checks(GradientBoosting, X, y, params, err_type, err_msg): # Check input parameter validation for GradientBoosting - with pytest.raises(ValueError, match=err_msg): + with pytest.raises(err_type, match=err_msg): GradientBoosting(**params).fit(X, y) From 5eda80ac09774370b7649ae907fb0e4038be7285 Mon Sep 17 00:00:00 2001 From: genvalen Date: Wed, 12 Jan 2022 09:46:30 -0500 Subject: [PATCH 33/41] Update sklearn/ensemble/tests/test_gradient_boosting.py Co-authored-by: Julien Jerphanion --- sklearn/ensemble/tests/test_gradient_boosting.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index df1dddf942f12..66d1010eda30c 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -150,8 +150,9 @@ def test_classification_toy(loss): ) def test_gbdt_parameter_checks(GradientBoosting, X, y, params, err_type, err_msg): # Check input parameter validation for GradientBoosting + est = GradientBoosting(**params) with pytest.raises(err_type, match=err_msg): - GradientBoosting(**params).fit(X, y) + est.fit(X, y) @pytest.mark.parametrize( From a65e9bf2959f5309d50f5a9bcbbaebe05ee041a1 Mon Sep 17 00:00:00 2001 From: genvalen Date: Wed, 12 Jan 2022 14:49:52 -0500 Subject: [PATCH 34/41] Put test_max_features back on line 504 --- sklearn/tree/tests/test_tree.py | 46 +++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index a443e249a249d..0f98911094c90 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -501,6 +501,52 @@ def test_importances_gini_equal_squared_error(): assert_array_equal(clf.tree_.n_node_samples, reg.tree_.n_node_samples) +def test_max_features(): + # Check max_features. + for name, TreeRegressor in REG_TREES.items(): + reg = TreeRegressor(max_features="auto") + reg.fit(diabetes.data, diabetes.target) + assert reg.max_features_ == diabetes.data.shape[1] + + for name, TreeClassifier in CLF_TREES.items(): + clf = TreeClassifier(max_features="auto") + clf.fit(iris.data, iris.target) + assert clf.max_features_ == 2 + + for name, TreeEstimator in ALL_TREES.items(): + est = TreeEstimator(max_features="sqrt") + est.fit(iris.data, iris.target) + assert est.max_features_ == int(np.sqrt(iris.data.shape[1])) + + est = TreeEstimator(max_features="log2") + est.fit(iris.data, iris.target) + assert est.max_features_ == int(np.log2(iris.data.shape[1])) + + est = TreeEstimator(max_features=1) + est.fit(iris.data, iris.target) + assert est.max_features_ == 1 + + est = TreeEstimator(max_features=3) + est.fit(iris.data, iris.target) + assert est.max_features_ == 3 + + est = TreeEstimator(max_features=0.01) + est.fit(iris.data, iris.target) + assert est.max_features_ == 1 + + est = TreeEstimator(max_features=0.5) + est.fit(iris.data, iris.target) + assert est.max_features_ == int(0.5 * iris.data.shape[1]) + + est = TreeEstimator(max_features=1.0) + est.fit(iris.data, iris.target) + assert est.max_features_ == iris.data.shape[1] + + est = TreeEstimator(max_features=None) + est.fit(iris.data, iris.target) + assert est.max_features_ == iris.data.shape[1] + + def test_error(): # Test that it gives proper exception on deficient input. for name, TreeEstimator in CLF_TREES.items(): From 5e5b3818dd9536401b1741407eefd8b41e12458e Mon Sep 17 00:00:00 2001 From: genvalen Date: Wed, 12 Jan 2022 15:03:58 -0500 Subject: [PATCH 35/41] fix lint issue --- sklearn/tree/tests/test_tree.py | 46 --------------------------------- 1 file changed, 46 deletions(-) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 0f98911094c90..a6834106a8331 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -709,52 +709,6 @@ def test_tree_params_validation(name, Tree, params, err_type, err_msg): est.fit(X, y) -def test_max_features(): - # Check max_features. - for name, TreeRegressor in REG_TREES.items(): - reg = TreeRegressor(max_features="auto") - reg.fit(diabetes.data, diabetes.target) - assert reg.max_features_ == diabetes.data.shape[1] - - for name, TreeClassifier in CLF_TREES.items(): - clf = TreeClassifier(max_features="auto") - clf.fit(iris.data, iris.target) - assert clf.max_features_ == 2 - - for name, TreeEstimator in ALL_TREES.items(): - est = TreeEstimator(max_features="sqrt") - est.fit(iris.data, iris.target) - assert est.max_features_ == int(np.sqrt(iris.data.shape[1])) - - est = TreeEstimator(max_features="log2") - est.fit(iris.data, iris.target) - assert est.max_features_ == int(np.log2(iris.data.shape[1])) - - est = TreeEstimator(max_features=1) - est.fit(iris.data, iris.target) - assert est.max_features_ == 1 - - est = TreeEstimator(max_features=3) - est.fit(iris.data, iris.target) - assert est.max_features_ == 3 - - est = TreeEstimator(max_features=0.01) - est.fit(iris.data, iris.target) - assert est.max_features_ == 1 - - est = TreeEstimator(max_features=0.5) - est.fit(iris.data, iris.target) - assert est.max_features_ == int(0.5 * iris.data.shape[1]) - - est = TreeEstimator(max_features=1.0) - est.fit(iris.data, iris.target) - assert est.max_features_ == iris.data.shape[1] - - est = TreeEstimator(max_features=None) - est.fit(iris.data, iris.target) - assert est.max_features_ == iris.data.shape[1] - - def test_min_samples_split(): """Test min_samples_split parameter""" X = np.asfortranarray(iris.data, dtype=tree._tree.DTYPE) From 3a424c241ad3b05f1a281c2faa6989ea0747f674 Mon Sep 17 00:00:00 2001 From: genvalen Date: Wed, 12 Jan 2022 19:01:39 -0500 Subject: [PATCH 36/41] Update sklearn/tree/_classes.py Co-authored-by: Thomas J. Fan --- sklearn/tree/_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index d39d42272d554..9490ff52a1e94 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -264,7 +264,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): self.min_samples_split, name="min_samples_split", target_type=numbers.Integral, - min_val=1, + min_val=2, ) min_samples_split = self.min_samples_split else: # float From 521a35c5a03af5b67f70215a3957c671c8383564 Mon Sep 17 00:00:00 2001 From: genvalen Date: Wed, 12 Jan 2022 19:15:11 -0500 Subject: [PATCH 37/41] Revert "update tests for min_samples_split" --- sklearn/tree/tests/test_tree.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index a6834106a8331..4570deee7ef0e 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -636,7 +636,12 @@ def test_error(): TypeError, "min_samples_leaf must be an instance of ", ), - ({"min_samples_split": 0}, ValueError, "min_samples_split == 0, must be >= 1"), + ({"min_samples_split": 1}, ValueError, "min_samples_split == 1, must be >= 2"), + ( + {"min_samples_split": 900}, + ValueError, + "min_samples_split == 900, must be <=", + ), ( {"min_samples_split": 0.0}, ValueError, From ce3f22ddae134ce725a2c59d01e54f46dd01666e Mon Sep 17 00:00:00 2001 From: genvalen Date: Wed, 12 Jan 2022 19:23:48 -0500 Subject: [PATCH 38/41] update min_val for min_samples_split --- sklearn/tree/tests/test_tree.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 4570deee7ef0e..1bff7f1965890 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -637,11 +637,6 @@ def test_error(): "min_samples_leaf must be an instance of ", ), ({"min_samples_split": 1}, ValueError, "min_samples_split == 1, must be >= 2"), - ( - {"min_samples_split": 900}, - ValueError, - "min_samples_split == 900, must be <=", - ), ( {"min_samples_split": 0.0}, ValueError, From 9357c879fbe737717a2efe91a1db0b5fb05df332 Mon Sep 17 00:00:00 2001 From: genvalen Date: Mon, 24 Jan 2022 13:10:45 -0500 Subject: [PATCH 39/41] Update sklearn/tree/_classes.py Co-authored-by: Thomas J. Fan --- sklearn/tree/_classes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 9490ff52a1e94..f3bbcc9a3dc63 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -245,7 +245,6 @@ def fit(self, X, y, sample_weight=None, check_input=True): name="min_samples_leaf", target_type=numbers.Integral, min_val=1, - max_val=n_samples, ) min_samples_leaf = self.min_samples_leaf else: # float From 733a3c801999f256116f4762b5e6c7fe325c1852 Mon Sep 17 00:00:00 2001 From: genvalen Date: Mon, 24 Jan 2022 13:37:06 -0500 Subject: [PATCH 40/41] Update sklearn/tree/_classes.py Co-authored-by: Olivier Grisel --- sklearn/tree/_classes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index f3bbcc9a3dc63..b8eae3975e09d 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -253,7 +253,6 @@ def fit(self, X, y, sample_weight=None, check_input=True): name="min_samples_leaf", target_type=numbers.Real, min_val=0.0, - max_val=0.5, include_boundaries="right", ) min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples)) From 9167c472b730c18d2520d74c8e9337af3f391b2e Mon Sep 17 00:00:00 2001 From: genvalen Date: Mon, 24 Jan 2022 13:40:37 -0500 Subject: [PATCH 41/41] update tests --- sklearn/tree/tests/test_tree.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 1bff7f1965890..9bba3ee861a65 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -624,13 +624,7 @@ def test_error(): "max_depth must be an instance of ", ), ({"min_samples_leaf": 0}, ValueError, "min_samples_leaf == 0, must be >= 1"), - ({"min_samples_leaf": 900}, ValueError, "min_samples_leaf == 900, must be <="), ({"min_samples_leaf": 0.0}, ValueError, "min_samples_leaf == 0.0, must be > 0"), - ( - {"min_samples_leaf": 0.6}, - ValueError, - "min_samples_leaf == 0.6, must be <= 0.5", - ), ( {"min_samples_leaf": "foo"}, TypeError,