diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 56934b3d2518a..8724565ed2713 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -24,6 +24,13 @@ Enhancements Bug fixes ......... + - The ``min_weight_fraction_leaf`` parameter of tree-based classifiers and + regressors now assumes uniform sample weights by default if the + ``sample_weight`` argument is not passed to the ``fit`` function. + Previously, the parameter was silently ignored. (`#7301 + <https://github.com/scikit-learn/scikit-learn/pull/7301>`_) by `Nelson + Liu`_. + .. _changes_0_18: Version 0.18 diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index ed0a5e0afe8ed..3d252dfa4ff3c 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -807,8 +807,9 @@ class RandomForestClassifier(ForestClassifier): Added float values for percentages. min_weight_fraction_leaf : float, optional (default=0.) - The minimum weighted fraction of the input samples required to be at a - leaf node. + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. max_leaf_nodes : int or None, optional (default=None) Grow trees with ``max_leaf_nodes`` in best-first fashion. @@ -1018,8 +1019,9 @@ class RandomForestRegressor(ForestRegressor): Added float values for percentages. min_weight_fraction_leaf : float, optional (default=0.) - The minimum weighted fraction of the input samples required to be at a - leaf node. + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. max_leaf_nodes : int or None, optional (default=None) Grow trees with ``max_leaf_nodes`` in best-first fashion. @@ -1189,8 +1191,9 @@ class ExtraTreesClassifier(ForestClassifier): Added float values for percentages. min_weight_fraction_leaf : float, optional (default=0.) 
- The minimum weighted fraction of the input samples required to be at a - leaf node. + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. max_leaf_nodes : int or None, optional (default=None) Grow trees with ``max_leaf_nodes`` in best-first fashion. @@ -1399,8 +1402,9 @@ class ExtraTreesRegressor(ForestRegressor): Added float values for percentages. min_weight_fraction_leaf : float, optional (default=0.) - The minimum weighted fraction of the input samples required to be at a - leaf node. + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. max_leaf_nodes : int or None, optional (default=None) Grow trees with ``max_leaf_nodes`` in best-first fashion. @@ -1556,8 +1560,9 @@ class RandomTreesEmbedding(BaseForest): Added float values for percentages. min_weight_fraction_leaf : float, optional (default=0.) - The minimum weighted fraction of the input samples required to be at a - leaf node. + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. max_leaf_nodes : int or None, optional (default=None) Grow trees with ``max_leaf_nodes`` in best-first fashion. diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index edb72c7f0538a..c9a36aac1bd99 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -1330,8 +1330,9 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin): Added float values for percentages. min_weight_fraction_leaf : float, optional (default=0.) - The minimum weighted fraction of the input samples required to be at a - leaf node. 
+ The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. subsample : float, optional (default=1.0) The fraction of samples to be used for fitting the individual base @@ -1698,8 +1699,9 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin): Added float values for percentages. min_weight_fraction_leaf : float, optional (default=0.) - The minimum weighted fraction of the input samples required to be at a - leaf node. + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. subsample : float, optional (default=1.0) The fraction of samples to be used for fitting the individual base diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 61a010274b934..f6c8f38d43fd0 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -670,6 +670,30 @@ def check_min_weight_fraction_leaf(name, datasets, sparse=False): "min_weight_fraction_leaf={1}".format( name, est.min_weight_fraction_leaf)) + # test case with no weights passed in + total_weight = X.shape[0] + + for max_leaf_nodes, frac in product((None, 1000), np.linspace(0, 0.5, 6)): + est = TreeEstimator(min_weight_fraction_leaf=frac, + max_leaf_nodes=max_leaf_nodes, + random_state=0) + est.fit(X, y) + + if sparse: + out = est.tree_.apply(X.tocsr()) + else: + out = est.tree_.apply(X) + + node_weights = np.bincount(out) + # drop inner nodes + leaf_weights = node_weights[node_weights != 0] + assert_greater_equal( + np.min(leaf_weights), + total_weight * est.min_weight_fraction_leaf, + "Failed with {0} " + "min_weight_fraction_leaf={1}".format( + name, est.min_weight_fraction_leaf)) + def test_min_weight_fraction_leaf(): # Check on dense input @@ -681,6 +705,82 @@ def test_min_weight_fraction_leaf(): yield 
check_min_weight_fraction_leaf, name, "multilabel", True +def check_min_weight_fraction_leaf_with_min_samples_leaf(name, datasets, + sparse=False): + """Test the interaction between min_weight_fraction_leaf and min_samples_leaf + when sample_weights is not provided in fit.""" + if sparse: + X = DATASETS[datasets]["X_sparse"].astype(np.float32) + else: + X = DATASETS[datasets]["X"].astype(np.float32) + y = DATASETS[datasets]["y"] + + total_weight = X.shape[0] + TreeEstimator = ALL_TREES[name] + for max_leaf_nodes, frac in product((None, 1000), np.linspace(0, 0.5, 3)): + # test integer min_samples_leaf + est = TreeEstimator(min_weight_fraction_leaf=frac, + max_leaf_nodes=max_leaf_nodes, + min_samples_leaf=5, + random_state=0) + est.fit(X, y) + + if sparse: + out = est.tree_.apply(X.tocsr()) + else: + out = est.tree_.apply(X) + + node_weights = np.bincount(out) + # drop inner nodes + leaf_weights = node_weights[node_weights != 0] + assert_greater_equal( + np.min(leaf_weights), + max((total_weight * + est.min_weight_fraction_leaf), 5), + "Failed with {0} " + "min_weight_fraction_leaf={1}, " + "min_samples_leaf={2}".format(name, + est.min_weight_fraction_leaf, + est.min_samples_leaf)) + for max_leaf_nodes, frac in product((None, 1000), np.linspace(0, 0.5, 3)): + # test float min_samples_leaf + est = TreeEstimator(min_weight_fraction_leaf=frac, + max_leaf_nodes=max_leaf_nodes, + min_samples_leaf=.1, + random_state=0) + est.fit(X, y) + + if sparse: + out = est.tree_.apply(X.tocsr()) + else: + out = est.tree_.apply(X) + + node_weights = np.bincount(out) + # drop inner nodes + leaf_weights = node_weights[node_weights != 0] + assert_greater_equal( + np.min(leaf_weights), + max((total_weight * est.min_weight_fraction_leaf), + (total_weight * est.min_samples_leaf)), + "Failed with {0} " + "min_weight_fraction_leaf={1}, " + "min_samples_leaf={2}".format(name, + est.min_weight_fraction_leaf, + est.min_samples_leaf)) + + +def test_min_weight_fraction_leaf_with_min_samples_leaf(): 
+ # Check on dense input + for name in ALL_TREES: + yield (check_min_weight_fraction_leaf_with_min_samples_leaf, + name, "iris") + + # Check on sparse input + for name in SPARSE_TREES: + yield (check_min_weight_fraction_leaf_with_min_samples_leaf, + name, "multilabel", True) + + def test_min_impurity_split(): # test if min_impurity_split creates leaves with impurity # [0, min_impurity_split) when min_samples_leaf = 1 and diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 76458c72b35b0..109cb4ad0263f 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -301,11 +301,12 @@ def fit(self, X, y, sample_weight=None, check_input=True, sample_weight = expanded_class_weight # Set min_weight_leaf from min_weight_fraction_leaf - if self.min_weight_fraction_leaf != 0. and sample_weight is not None: + if sample_weight is None: min_weight_leaf = (self.min_weight_fraction_leaf * - np.sum(sample_weight)) + n_samples) else: - min_weight_leaf = 0. + min_weight_leaf = (self.min_weight_fraction_leaf * + np.sum(sample_weight)) if self.min_impurity_split < 0.: raise ValueError("min_impurity_split must be greater than or equal " @@ -592,8 +593,9 @@ class DecisionTreeClassifier(BaseDecisionTree, ClassifierMixin): Added float values for percentages. min_weight_fraction_leaf : float, optional (default=0.) - The minimum weighted fraction of the input samples required to be at a - leaf node. + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. max_leaf_nodes : int or None, optional (default=None) Grow a tree with ``max_leaf_nodes`` in best-first fashion. @@ -862,8 +864,9 @@ class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin): Added float values for percentages. min_weight_fraction_leaf : float, optional (default=0.) - The minimum weighted fraction of the input samples required to be at a - leaf node. 
+ The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. max_leaf_nodes : int or None, optional (default=None) Grow a tree with ``max_leaf_nodes`` in best-first fashion.