diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index c2045aa35d652..66d1010eda30c 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -79,31 +79,63 @@ def test_classification_toy(loss): @pytest.mark.parametrize( - "params, err_msg", + "params, err_type, err_msg", [ - ({"n_estimators": 0}, "n_estimators must be greater than 0"), - ({"n_estimators": -1}, "n_estimators must be greater than 0"), - ({"learning_rate": 0}, "learning_rate must be greater than 0"), - ({"learning_rate": -1.0}, "learning_rate must be greater than 0"), - ({"loss": "foobar"}, "Loss 'foobar' not supported"), - ({"min_samples_split": 0.0}, "min_samples_split must be an integer"), - ({"min_samples_split": -1.0}, "min_samples_split must be an integer"), - ({"min_samples_split": 1.1}, "min_samples_split must be an integer"), - ({"min_samples_leaf": 0}, "min_samples_leaf must be at least 1 or"), - ({"min_samples_leaf": -1.0}, "min_samples_leaf must be at least 1 or"), - ({"min_weight_fraction_leaf": -1.0}, "min_weight_fraction_leaf must in"), - ({"min_weight_fraction_leaf": 0.6}, "min_weight_fraction_leaf must in"), - ({"subsample": 0.0}, r"subsample must be in \(0,1\]"), - ({"subsample": 1.1}, r"subsample must be in \(0,1\]"), - ({"subsample": -0.1}, r"subsample must be in \(0,1\]"), - ({"max_depth": -0.1}, "max_depth must be greater than zero"), - ({"max_depth": 0}, "max_depth must be greater than zero"), - ({"init": {}}, "The init parameter must be an estimator or 'zero'"), - ({"max_features": "invalid"}, "Invalid value for max_features:"), - ({"max_features": 0}, r"max_features must be in \(0, n_features\]"), - ({"max_features": 100}, r"max_features must be in \(0, n_features\]"), - ({"max_features": -0.1}, r"max_features must be in \(0, n_features\]"), - ({"n_iter_no_change": "invalid"}, "n_iter_no_change should either be"), + ({"n_estimators": 0}, ValueError, "n_estimators must be greater than 0"), + ({"n_estimators": -1}, ValueError, "n_estimators must be greater than 0"), + ({"learning_rate": 0}, ValueError, "learning_rate must be greater than 0"), + ({"learning_rate": -1.0}, ValueError, "learning_rate must be greater than 0"), + ({"loss": "foobar"}, ValueError, "Loss 'foobar' not supported"), + ( + {"min_samples_split": 0.0}, + ValueError, + "min_samples_split == 0.0, must be > 0.0", + ), + ( + {"min_samples_split": -1.0}, + ValueError, + "min_samples_split == -1.0, must be > 0.0", + ), + ( + {"min_samples_split": 1.1}, + ValueError, + "min_samples_split == 1.1, must be <= 1.0.", + ), + ({"min_samples_leaf": 0}, ValueError, "min_samples_leaf == 0, must be >= 1"), + ( + {"min_samples_leaf": -1.0}, + ValueError, + "min_samples_leaf == -1.0, must be > 0.0.", + ), + ( + {"min_weight_fraction_leaf": -1.0}, + ValueError, + "min_weight_fraction_leaf == -1.0, must be >= 0", + ), + ( + {"min_weight_fraction_leaf": 0.6}, + ValueError, + "min_weight_fraction_leaf == 0.6, must be <= 0.5.", + ), + ({"subsample": 0.0}, ValueError, r"subsample must be in \(0,1\]"), + ({"subsample": 1.1}, ValueError, r"subsample must be in \(0,1\]"), + ({"subsample": -0.1}, ValueError, r"subsample must be in \(0,1\]"), + ({"max_depth": -0.1}, TypeError, "max_depth must be an instance of"), + ({"max_depth": 0}, ValueError, "max_depth == 0, must be >= 1."), + ({"init": {}}, ValueError, "The init parameter must be an estimator or 'zero'"), + ({"max_features": "invalid"}, ValueError, "Invalid value for max_features:"), + ({"max_features": 0}, ValueError, "max_features == 0, must be >= 1"), + ({"max_features": 100}, ValueError, "max_features == 100, must be <="), + ( + {"max_features": -0.1}, + ValueError, + r"max_features must be in \(0, n_features\]", + ), + ( + {"n_iter_no_change": "invalid"}, + ValueError, + "n_iter_no_change should either be", + ), ], # Avoid long error messages in test names: # https://github.com/scikit-learn/scikit-learn/issues/21362 @@ -116,10 +148,11 @@ def test_classification_toy(loss): (GradientBoostingClassifier, iris.data, iris.target), ], ) -def test_gbdt_parameter_checks(GradientBoosting, X, y, params, err_msg): +def test_gbdt_parameter_checks(GradientBoosting, X, y, params, err_type, err_msg): # Check input parameter validation for GradientBoosting - with pytest.raises(ValueError, match=err_msg): - GradientBoosting(**params).fit(X, y) + est = GradientBoosting(**params) + with pytest.raises(err_type, match=err_msg): + est.fit(X, y) @pytest.mark.parametrize( diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 3cd0e000bd4dd..b8eae3975e09d 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -32,6 +32,7 @@ from ..base import MultiOutputMixin from ..utils import Bunch from ..utils import check_random_state +from ..utils import check_scalar from ..utils.deprecation import deprecated from ..utils.validation import _check_sample_weight from ..utils import compute_sample_weight @@ -151,8 +152,12 @@ def fit(self, X, y, sample_weight=None, check_input=True): random_state = check_random_state(self.random_state) - if self.ccp_alpha < 0.0: - raise ValueError("ccp_alpha must be greater than or equal to 0") + check_scalar( + self.ccp_alpha, + name="ccp_alpha", + target_type=numbers.Real, + min_val=0.0, + ) if check_input: # Need to validate separately here. @@ -225,46 +230,63 @@ def fit(self, X, y, sample_weight=None, check_input=True): y = np.ascontiguousarray(y, dtype=DOUBLE) # Check parameters + if self.max_depth is not None: + check_scalar( + self.max_depth, + name="max_depth", + target_type=numbers.Integral, + min_val=1, + ) max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth - max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes if isinstance(self.min_samples_leaf, numbers.Integral): - if not 1 <= self.min_samples_leaf: - raise ValueError( - "min_samples_leaf must be at least 1 or in (0, 0.5], got %s" - % self.min_samples_leaf - ) + check_scalar( + self.min_samples_leaf, + name="min_samples_leaf", + target_type=numbers.Integral, + min_val=1, + ) min_samples_leaf = self.min_samples_leaf else: # float - if not 0.0 < self.min_samples_leaf <= 0.5: - raise ValueError( - "min_samples_leaf must be at least 1 or in (0, 0.5], got %s" - % self.min_samples_leaf - ) + check_scalar( + self.min_samples_leaf, + name="min_samples_leaf", + target_type=numbers.Real, + min_val=0.0, + include_boundaries="right", + ) min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples)) if isinstance(self.min_samples_split, numbers.Integral): - if not 2 <= self.min_samples_split: - raise ValueError( - "min_samples_split must be an integer " - "greater than 1 or a float in (0.0, 1.0]; " - "got the integer %s" - % self.min_samples_split - ) + check_scalar( + self.min_samples_split, + name="min_samples_split", + target_type=numbers.Integral, + min_val=2, + ) min_samples_split = self.min_samples_split else: # float - if not 0.0 < self.min_samples_split <= 1.0: - raise ValueError( - "min_samples_split must be an integer " - "greater than 1 or a float in (0.0, 1.0]; " - "got the float %s" - % self.min_samples_split - ) + check_scalar( + self.min_samples_split, + name="min_samples_split", + target_type=numbers.Real, + min_val=0.0, + max_val=1.0, + include_boundaries="right", + ) min_samples_split = int(ceil(self.min_samples_split * n_samples)) min_samples_split = max(2, min_samples_split) min_samples_split = max(min_samples_split, 2 * min_samples_leaf) + check_scalar( + self.min_weight_fraction_leaf, + name="min_weight_fraction_leaf", + target_type=numbers.Real, + min_val=0.0, + max_val=0.5, + ) + if isinstance(self.max_features, str): if self.max_features == "auto": if is_classification: @@ -284,8 +306,23 @@ def fit(self, X, y, sample_weight=None, check_input=True): elif self.max_features is None: max_features = self.n_features_in_ elif isinstance(self.max_features, numbers.Integral): + check_scalar( + self.max_features, + name="max_features", + target_type=numbers.Integral, + min_val=1, + max_val=self.n_features_in_, + ) max_features = self.max_features else: # float + check_scalar( + self.max_features, + name="max_features", + target_type=numbers.Real, + min_val=0.0, + max_val=1.0, + include_boundaries="right", + ) if self.max_features > 0.0: max_features = max(1, int(self.max_features * self.n_features_in_)) else: @@ -293,27 +330,27 @@ def fit(self, X, y, sample_weight=None, check_input=True): self.max_features_ = max_features + if self.max_leaf_nodes is not None: + check_scalar( + self.max_leaf_nodes, + name="max_leaf_nodes", + target_type=numbers.Integral, + min_val=2, + ) + max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes + + check_scalar( + self.min_impurity_decrease, + name="min_impurity_decrease", + target_type=numbers.Real, + min_val=0.0, + ) + if len(y) != n_samples: raise ValueError( "Number of labels=%d does not match number of samples=%d" % (len(y), n_samples) ) - if not 0 <= self.min_weight_fraction_leaf <= 0.5: - raise ValueError("min_weight_fraction_leaf must in [0, 0.5]") - if max_depth <= 0: - raise ValueError("max_depth must be greater than zero. ") - if not (0 < max_features <= self.n_features_in_): - raise ValueError("max_features must be in (0, n_features]") - if not isinstance(max_leaf_nodes, numbers.Integral): - raise ValueError( - "max_leaf_nodes must be integral number but was %r" % max_leaf_nodes - ) - if -1 < max_leaf_nodes < 2: - raise ValueError( - ("max_leaf_nodes {0} must be either None or larger than 1").format( - max_leaf_nodes - ) - ) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, DOUBLE) @@ -330,9 +367,6 @@ def fit(self, X, y, sample_weight=None, check_input=True): else: min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) - if self.min_impurity_decrease < 0.0: - raise ValueError("min_impurity_decrease must be greater than or equal to 0") - # Build tree criterion = self.criterion if not isinstance(criterion, Criterion): @@ -536,9 +570,6 @@ def _prune_tree(self): """Prune tree using Minimal Cost-Complexity Pruning.""" check_is_fitted(self) - if self.ccp_alpha < 0.0: - raise ValueError("ccp_alpha must be greater than or equal to 0") - if self.ccp_alpha == 0.0: return diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index cd6b245bee60e..9bba3ee861a65 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -546,27 +546,6 @@ def test_max_features(): est.fit(iris.data, iris.target) assert est.max_features_ == iris.data.shape[1] - # use values of max_features that are invalid - est = TreeEstimator(max_features=10) - with pytest.raises(ValueError): - est.fit(X, y) - - est = TreeEstimator(max_features=-1) - with pytest.raises(ValueError): - est.fit(X, y) - - est = TreeEstimator(max_features=0.0) - with pytest.raises(ValueError): - est.fit(X, y) - - est = TreeEstimator(max_features=1.5) - with pytest.raises(ValueError): - est.fit(X, y) - - est = TreeEstimator(max_features="foobar") - with pytest.raises(ValueError): - est.fit(X, y) - def test_error(): # Test that it gives proper exception on deficient input. @@ -581,34 +560,6 @@ def test_error(): with pytest.raises(ValueError): est.predict_proba(X2) - for name, TreeEstimator in ALL_TREES.items(): - with pytest.raises(ValueError): - TreeEstimator(min_samples_leaf=-1).fit(X, y) - with pytest.raises(ValueError): - TreeEstimator(min_samples_leaf=0.6).fit(X, y) - with pytest.raises(ValueError): - TreeEstimator(min_samples_leaf=0.0).fit(X, y) - with pytest.raises(ValueError): - TreeEstimator(min_samples_leaf=3.0).fit(X, y) - with pytest.raises(ValueError): - TreeEstimator(min_weight_fraction_leaf=-1).fit(X, y) - with pytest.raises(ValueError): - TreeEstimator(min_weight_fraction_leaf=0.51).fit(X, y) - with pytest.raises(ValueError): - TreeEstimator(min_samples_split=-1).fit(X, y) - with pytest.raises(ValueError): - TreeEstimator(min_samples_split=0.0).fit(X, y) - with pytest.raises(ValueError): - TreeEstimator(min_samples_split=1.1).fit(X, y) - with pytest.raises(ValueError): - TreeEstimator(min_samples_split=2.5).fit(X, y) - with pytest.raises(ValueError): - TreeEstimator(max_depth=-1).fit(X, y) - with pytest.raises(ValueError): - TreeEstimator(max_features=42).fit(X, y) - with pytest.raises(ValueError): - TreeEstimator(min_impurity_decrease=-1.0).fit(X, y) - # Wrong dimensions est = TreeEstimator() y2 = y[:-1] @@ -662,6 +613,96 @@ def test_error(): est.fit([[0, 1, 2]], [5, -0.1, 2]) +@pytest.mark.parametrize("name, Tree", ALL_TREES.items()) +@pytest.mark.parametrize( + "params, err_type, err_msg", + [ + ({"max_depth": -1}, ValueError, "max_depth == -1, must be >= 1"), + ( + {"max_depth": 1.1}, + TypeError, + "max_depth must be an instance of ", + ), + ({"min_samples_leaf": 0}, ValueError, "min_samples_leaf == 0, must be >= 1"), + ({"min_samples_leaf": 0.0}, ValueError, "min_samples_leaf == 0.0, must be > 0"), + ( + {"min_samples_leaf": "foo"}, + TypeError, + "min_samples_leaf must be an instance of ", + ), + ({"min_samples_split": 1}, ValueError, "min_samples_split == 1, must be >= 2"), + ( + {"min_samples_split": 0.0}, + ValueError, + "min_samples_split == 0.0, must be > 0.0", + ), + ( + {"min_samples_split": 1.1}, + ValueError, + "min_samples_split == 1.1, must be <= 1.0", + ), + ( + {"min_samples_split": "foo"}, + TypeError, + "min_samples_split must be an instance of ", + ), + ( + {"min_weight_fraction_leaf": -1}, + ValueError, + "min_weight_fraction_leaf == -1, must be >= 0.0", + ), + ( + {"min_weight_fraction_leaf": 0.6}, + ValueError, + "min_weight_fraction_leaf == 0.6, must be <= 0.5", + ), + ( + {"min_weight_fraction_leaf": "foo"}, + TypeError, + "min_weight_fraction_leaf must be an instance of ", + ), + ({"max_features": 0}, ValueError, "max_features == 0, must be >= 1"), + ({"max_features": 1000}, ValueError, "max_features == 1000, must be <="), + ({"max_features": 0.0}, ValueError, "max_features == 0.0, must be > 0.0"), + ({"max_features": 1.1}, ValueError, "max_features == 1.1, must be <= 1.0"), + ({"max_features": "foobar"}, ValueError, "Invalid value for max_features."), + ({"max_leaf_nodes": 0}, ValueError, "max_leaf_nodes == 0, must be >= 2"), + ( + {"max_leaf_nodes": 1.5}, + TypeError, + "max_leaf_nodes must be an instance of ", + ), + ( + {"min_impurity_decrease": -1}, + ValueError, + "min_impurity_decrease == -1, must be >= 0.0", + ), + ( + {"min_impurity_decrease": "foo"}, + TypeError, + "min_impurity_decrease must be an instance of ", + ), + ({"ccp_alpha": -1.0}, ValueError, "ccp_alpha == -1.0, must be >= 0.0"), + ( + {"ccp_alpha": "foo"}, + TypeError, + "ccp_alpha must be an instance of ", + ), + ], +) +def test_tree_params_validation(name, Tree, params, err_type, err_msg): + """Check parameter validation in DecisionTreeClassifier, DecisionTreeRegressor, + ExtraTreeClassifier, and ExtraTreeRegressor. + """ + if "Classifier" in name: + X, y = iris.data, iris.target + else: + X, y = diabetes.data, diabetes.target + est = Tree(**params) + with pytest.raises(err_type, match=err_msg): + est.fit(X, y) + + def test_min_samples_split(): """Test min_samples_split parameter""" X = np.asfortranarray(iris.data, dtype=tree._tree.DTYPE) @@ -1260,17 +1301,6 @@ def test_max_leaf_nodes(): est = TreeEstimator(max_depth=None, max_leaf_nodes=k + 1).fit(X, y) assert est.get_n_leaves() == k + 1 - # max_leaf_nodes in (0, 1) should raise ValueError - est = TreeEstimator(max_depth=None, max_leaf_nodes=0) - with pytest.raises(ValueError): - est.fit(X, y) - est = TreeEstimator(max_depth=None, max_leaf_nodes=1) - with pytest.raises(ValueError): - est.fit(X, y) - est = TreeEstimator(max_depth=None, max_leaf_nodes=0.1) - with pytest.raises(ValueError): - est.fit(X, y) - def test_max_leaf_nodes_max_depth(): # Test precedence of max_leaf_nodes over max_depth. @@ -1982,22 +2012,6 @@ def assert_is_subtree(tree, subtree): ) -def test_prune_tree_raises_negative_ccp_alpha(): - clf = DecisionTreeClassifier() - msg = "ccp_alpha must be greater than or equal to 0" - - with pytest.raises(ValueError, match=msg): - clf.set_params(ccp_alpha=-1.0) - clf.fit(X, y) - - clf.set_params(ccp_alpha=0.0) - clf.fit(X, y) - - with pytest.raises(ValueError, match=msg): - clf.set_params(ccp_alpha=-1.0) - clf._prune_tree() - - def check_apply_path_readonly(name): X_readonly = create_memmap_backed_data(X_small.astype(tree._tree.DTYPE, copy=False)) y_readonly = create_memmap_backed_data(np.array(y_small, dtype=tree._tree.DTYPE))