From a5168dd3ede578ca26efbbedee733c680e11ff1a Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Wed, 18 Jan 2023 20:36:42 -0800 Subject: [PATCH 01/11] Support user-supplied sample weights when fitting HistGradientBoosting estimator --- .../_hist_gradient_boosting/_predictor.pyx | 4 +- .../_hist_gradient_boosting/common.pxd | 2 +- .../_hist_gradient_boosting/common.pyx | 2 +- .../gradient_boosting.py | 23 +++++++++++ .../_hist_gradient_boosting/grower.py | 40 +++++++++++++++++-- 5 files changed, 63 insertions(+), 8 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx index dab18bdd1d49c..b7c28b317201a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx @@ -236,8 +236,8 @@ def _compute_partial_dependence( # push left child node_idx_stack[stack_size] = current_node.left left_sample_frac = ( - nodes[current_node.left].count / - current_node.count) + nodes[current_node.left].weighted_n_node_samples / + current_node.weighted_n_node_samples) current_weight = weight_stack[stack_size] weight_stack[stack_size] = current_weight * left_sample_frac stack_size += 1 diff --git a/sklearn/ensemble/_hist_gradient_boosting/common.pxd b/sklearn/ensemble/_hist_gradient_boosting/common.pxd index d1c70f0483ed4..c49072af5ba1c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/common.pxd +++ b/sklearn/ensemble/_hist_gradient_boosting/common.pxd @@ -22,7 +22,7 @@ cdef packed struct node_struct: # Equivalent struct to PREDICTOR_RECORD_DTYPE to use in memory views. It # needs to be packed since by default numpy dtypes aren't aligned Y_DTYPE_C value - unsigned int count + Y_DTYPE_C weighted_n_node_samples unsigned int feature_idx X_DTYPE_C num_threshold unsigned char missing_go_to_left diff --git a/sklearn/ensemble/_hist_gradient_boosting/common.pyx b/sklearn/ensemble/_hist_gradient_boosting/common.pyx index f7b36f5796508..6cc11a8eef666 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/common.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/common.pyx @@ -18,7 +18,7 @@ HISTOGRAM_DTYPE = np.dtype([ PREDICTOR_RECORD_DTYPE = np.dtype([ ('value', Y_DTYPE), - ('count', np.uint32), + ('weighted_n_node_samples', Y_DTYPE), ('feature_idx', np.uint32), ('num_threshold', X_DTYPE), ('missing_go_to_left', np.uint8), diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 74322eef79bbc..5143395c7ad33 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -91,6 +91,7 @@ class BaseHistGradientBoosting(BaseEstimator, ABC): "max_leaf_nodes": [Interval(Integral, 2, None, closed="left"), None], "max_depth": [Interval(Integral, 1, None, closed="left"), None], "min_samples_leaf": [Interval(Integral, 1, None, closed="left")], + "min_weight_fraction_leaf": [Interval(Real, 0.0, 0.5, closed="both")], "l2_regularization": [Interval(Real, 0, None, closed="left")], "monotonic_cst": ["array-like", dict, None], "interaction_cst": [ @@ -125,6 +126,7 @@ def __init__( max_leaf_nodes, max_depth, min_samples_leaf, + min_weight_fraction_leaf, l2_regularization, max_bins, categorical_features, @@ -145,6 +147,7 @@ def __init__( self.max_leaf_nodes = max_leaf_nodes self.max_depth = max_depth self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf 
= min_weight_fraction_leaf self.l2_regularization = l2_regularization self.max_bins = max_bins self.monotonic_cst = monotonic_cst @@ -371,6 +374,12 @@ def fit(self, X, y, sample_weight=None): # TODO: remove when PDP supports sample weights self._fitted_with_sw = True + # Set min_weight_leaf from min_weight_fraction_leaf + if sample_weight is None: + min_weight_leaf = self.min_weight_fraction_leaf * n_samples + else: + min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) + sample_weight = self._finalize_sample_weight(sample_weight, y) rng = check_random_state(self.random_state) @@ -674,6 +683,7 @@ def fit(self, X, y, sample_weight=None): X_binned=X_binned_train, gradients=g_view[:, k], hessians=h_view[:, k], + sample_weight=sample_weight_train, n_bins=n_bins, n_bins_non_missing=self._bin_mapper.n_bins_non_missing_, has_missing_values=has_missing_values, @@ -683,6 +693,7 @@ def fit(self, X, y, sample_weight=None): max_leaf_nodes=self.max_leaf_nodes, max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf, + min_weight_leaf=min_weight_leaf, l2_regularization=self.l2_regularization, shrinkage=self.learning_rate, n_threads=n_threads, @@ -1240,6 +1251,10 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): The minimum number of samples per leaf. For small datasets with less than a few hundred samples, it is recommended to lower this value since only very shallow trees would be built. + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. l2_regularization : float, default=0 The L2 regularization parameter. Use ``0`` for no regularization (default). @@ -1434,6 +1449,7 @@ def __init__( max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, + min_weight_fraction_leaf=0.0, l2_regularization=0.0, max_bins=255, categorical_features=None, @@ -1455,6 +1471,7 @@ def __init__( max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, + min_weight_fraction_leaf=min_weight_fraction_leaf, l2_regularization=l2_regularization, max_bins=max_bins, monotonic_cst=monotonic_cst, @@ -1593,6 +1610,10 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting): The minimum number of samples per leaf. For small datasets with less than a few hundred samples, it is recommended to lower this value since only very shallow trees would be built. + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. l2_regularization : float, default=0 The L2 regularization parameter. Use 0 for no regularization. 
max_bins : int, default=255 @@ -1810,6 +1831,7 @@ def __init__( max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, + min_weight_fraction_leaf=0.0, l2_regularization=0.0, max_bins=255, categorical_features=None, @@ -1832,6 +1854,7 @@ def __init__( max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, + min_weight_fraction_leaf=min_weight_fraction_leaf, l2_regularization=l2_regularization, max_bins=max_bins, categorical_features=categorical_features, diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index c4669da4a60a9..3537150ba4b9a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -15,6 +15,7 @@ from .histogram import HistogramBuilder from .predictor import TreePredictor from .utils import sum_parallel +from ...utils.validation import _check_sample_weight from .common import PREDICTOR_RECORD_DTYPE from .common import X_BITSET_INNER_DTYPE from .common import Y_DTYPE @@ -98,6 +99,7 @@ def __init__(self, depth, sample_indices, sum_gradients, sum_hessians, value=Non self.depth = depth self.sample_indices = sample_indices self.n_samples = sample_indices.shape[0] + self.weighted_n_node_samples = sample_indices.shape[0] self.sum_gradients = sum_gradients self.sum_hessians = sum_hessians self.value = value @@ -149,6 +151,8 @@ class TreeGrower: hessians : ndarray of shape (n_samples,) The hessians of each training sample. Those are the hessians of the loss w.r.t the predictions, evaluated at iteration ``i - 1``. + sample_weight : array-like of shape (n_samples,), default=None + Weights of training data. max_leaf_nodes : int, default=None The maximum number of leaves for each tree. If None, there is no maximum limit. @@ -158,6 +162,8 @@ class TreeGrower: Depth isn't constrained by default. min_samples_leaf : int, default=20 The minimum number of samples per leaf. + min_weight_leaf: float, default=0. + The minimum weight of input samples required for a node to be a leaf. min_gain_to_split : float, default=0. The minimum gain needed to split a node. Splits with lower gain will be ignored. @@ -227,9 +233,11 @@ def __init__( X_binned, gradients, hessians, + sample_weight=None, max_leaf_nodes=None, max_depth=None, min_samples_leaf=20, + min_weight_leaf=0.0, min_gain_to_split=0.0, n_bins=256, n_bins_non_missing=None, @@ -264,6 +272,8 @@ def __init__( has_missing_values = [has_missing_values] * X_binned.shape[1] has_missing_values = np.asarray(has_missing_values, dtype=np.uint8) + sample_weight = _check_sample_weight(sample_weight, X_binned, dtype=np.float64) + # `monotonic_cst` validation is done in _validate_monotonic_cst # at the estimator level and therefore the following should not be # needed when using the public API. 
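
Note on the grower hunk above: the new `_check_sample_weight(sample_weight, X_binned, dtype=np.float64)` call normalizes the optional per-sample weights before the grower uses them, so the weighted node counts added later in this patch degrade gracefully to plain sample counts when no weights are supplied. The snippet below is illustrative only — it shows the contract this patch assumes from that private scikit-learn helper, with a made-up toy `X_binned` array.

    # Illustrative sketch of the defaulting behaviour the grower relies on.
    # _check_sample_weight is a private scikit-learn utility: given None it
    # returns an array of ones, otherwise it validates shape and dtype.
    import numpy as np
    from sklearn.utils.validation import _check_sample_weight

    X_binned = np.zeros((5, 2), dtype=np.uint8)   # toy binned data (assumption)

    sw_default = _check_sample_weight(None, X_binned, dtype=np.float64)
    sw_custom = _check_sample_weight([1.0, 2.0, 0.5, 1.0, 3.0], X_binned, dtype=np.float64)

    print(sw_default)  # [1. 1. 1. 1. 1.] -> weighted counts equal unweighted counts
    print(sw_custom)   # validated float64 array of the user-supplied weights
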
@@ -310,6 +320,7 @@ def __init__( ) self.n_bins_non_missing = n_bins_non_missing self.missing_values_bin_idx = missing_values_bin_idx + self.sample_weight = sample_weight self.max_leaf_nodes = max_leaf_nodes self.has_missing_values = has_missing_values self.monotonic_cst = monotonic_cst @@ -319,6 +330,7 @@ def __init__( self.n_features = X_binned.shape[1] self.max_depth = max_depth self.min_samples_leaf = min_samples_leaf + self.min_weight_leaf = min_weight_leaf self.X_binned = X_binned self.min_gain_to_split = min_gain_to_split self.shrinkage = shrinkage @@ -394,10 +406,17 @@ def _intilialize_root(self, gradients, hessians, hessians_are_constant): value=0, ) + self.root.weighted_n_node_samples = self.sample_weight[ + self.root.sample_indices + ].sum() + self.root.partition_start = 0 self.root.partition_stop = n_samples - if self.root.n_samples < 2 * self.min_samples_leaf: + if ( + self.root.n_samples < self.min_samples_leaf * 2 + or self.root.weighted_n_node_samples < self.min_weight_leaf * 2 + ): # Do not even bother computing any splitting statistics. self._finalize_leaf(self.root) return @@ -490,6 +509,13 @@ def split_next(self): node.right_child = right_child_node node.left_child = left_child_node + left_child_node.weighted_n_node_samples = self.sample_weight[ + sample_indices_left + ].sum() + right_child_node.weighted_n_node_samples = self.sample_weight[ + sample_indices_right + ].sum() + # set start and stop indices left_child_node.partition_start = node.partition_start left_child_node.partition_stop = node.partition_start + right_child_pos @@ -531,9 +557,15 @@ def split_next(self): self._finalize_leaf(right_child_node) return left_child_node, right_child_node - if left_child_node.n_samples < self.min_samples_leaf * 2: + if ( + left_child_node.n_samples < self.min_samples_leaf * 2 + or left_child_node.weighted_n_node_samples < self.min_weight_leaf * 2 + ): self._finalize_leaf(left_child_node) - if right_child_node.n_samples < self.min_samples_leaf * 2: + if ( + right_child_node.n_samples < self.min_samples_leaf * 2 + or right_child_node.weighted_n_node_samples < self.min_weight_leaf * 2 + ): self._finalize_leaf(right_child_node) if self.with_monotonic_cst: @@ -718,7 +750,7 @@ def _fill_predictor_arrays( ): """Helper used in make_predictor to set the TreePredictor fields.""" node = predictor_nodes[next_free_node_idx] - node["count"] = grower_node.n_samples + node["weighted_n_node_samples"] = grower_node.weighted_n_node_samples node["depth"] = grower_node.depth if grower_node.split_info is not None: node["gain"] = grower_node.split_info.gain From 964edd04515858331d9a034b6d8c1df0008fe359 Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Thu, 19 Jan 2023 16:15:44 -0800 Subject: [PATCH 02/11] Fix ordering of min_weight_leaf calculation --- .../_hist_gradient_boosting/gradient_boosting.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 5143395c7ad33..e0c47d937a9a1 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -374,12 +374,6 @@ def fit(self, X, y, sample_weight=None): # TODO: remove when PDP supports sample weights self._fitted_with_sw = True - # Set min_weight_leaf from min_weight_fraction_leaf - if sample_weight is None: - min_weight_leaf = self.min_weight_fraction_leaf * n_samples - 
else: - min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) - sample_weight = self._finalize_sample_weight(sample_weight, y) rng = check_random_state(self.random_state) @@ -396,6 +390,12 @@ def fit(self, X, y, sample_weight=None): # used for validation in predict n_samples, self._n_features = X.shape + # Set min_weight_leaf from min_weight_fraction_leaf + if sample_weight is None: + min_weight_leaf = self.min_weight_fraction_leaf * n_samples + else: + min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) + self.is_categorical_, known_categories = self._check_categories(X) # Encode constraints into a list of sets of features indices (integers). From 01f558612c4f0eed3d2fad926b40d9e3a93f51c6 Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Wed, 25 Jan 2023 16:38:50 -0800 Subject: [PATCH 03/11] Update all references of 'count' to 'weighted_n_node_samples' in tests --- .../_hist_gradient_boosting/tests/test_grower.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index c4ae90b7e7d96..47fc47ed4d29f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -250,11 +250,11 @@ def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins, constant_hessian, if n_samples >= min_samples_leaf: for node in predictor.nodes: if node["is_leaf"]: - assert node["count"] >= min_samples_leaf + assert node["weighted_n_node_samples"] >= min_samples_leaf else: assert predictor.nodes.shape[0] == 1 assert predictor.nodes[0]["is_leaf"] - assert predictor.nodes[0]["count"] == n_samples + assert predictor.nodes[0]["weighted_n_node_samples"] == n_samples @pytest.mark.parametrize("n_samples, min_samples_leaf", [(99, 50), (100, 50)]) @@ -376,7 +376,7 @@ def test_missing_value_predict_only(): while not node["is_leaf"]: left = predictor.nodes[node["left"]] right = predictor.nodes[node["right"]] - node = left if left["count"] > right["count"] else right + node = left if left["weighted_n_node_samples"] > right["weighted_n_node_samples"] else right prediction_main_path = node["value"] @@ -466,14 +466,14 @@ def test_grow_tree_categories(): categories = [np.array([4, 9], dtype=X_DTYPE)] predictor = grower.make_predictor(binning_thresholds=categories) root = predictor.nodes[0] - assert root["count"] == 23 + assert root["weighted_n_node_samples"] == 23 assert root["depth"] == 0 assert root["is_categorical"] left, right = predictor.nodes[root["left"]], predictor.nodes[root["right"]] # arbitrary validation, but this means ones go to the left. 
- assert left["count"] >= right["count"] + assert left["weighted_n_node_samples"] >= right["weighted_n_node_samples"] # check binned category value (1) expected_binned_cat_bitset = [2**1] + [0] * 7 From 22d80e34bf6c20cc726792e65fd4d2e9f47a4634 Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Thu, 9 Feb 2023 23:24:59 -0800 Subject: [PATCH 04/11] Add tests --- .../tests/test_gradient_boosting.py | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 7e774d9f09f45..5f3e94c5a3e50 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -788,6 +788,68 @@ def test_sample_weight_effect(problem, duplication): assert np.allclose(est_sw._raw_predict(X_dup), est_dup._raw_predict(X_dup)) +@pytest.mark.parametrize( + "sample_weight_distribution", ("poisson", "exponential", "uniform") +) +def test_sample_weight_leaf_weighted_nodes_classification_random( + sample_weight_distribution +): + # Ensures that the `weighted_n_node_samples` for each node in the predictor + # tree is the sum of `sample_weights` whose samples belong in that node + + n_samples = 1000 + X, y = make_classification(n_samples=n_samples, random_state=0) + + if sample_weight_distribution == "poisson": + sample_weight = np.random.RandomState(0).poisson(lam=1 + 4*y) + elif sample_weight_distribution == "exponential": + sample_weight = np.random.RandomState(0).exponential(scale=1 + 4*y) + else: + sample_weight = np.random.RandomState(0).uniform(high=1 + 4*y) + + hgbc = ( + HistGradientBoostingClassifier(random_state=0, min_samples_leaf=1, max_depth=1) + .fit(X, y, sample_weight) + ) + + for predictor in hgbc._predictors: + nodes = predictor[0].nodes + feat_idx = int(nodes[0][2]) + num_tresh = nodes[0][3] + + assert nodes[0][1] == np.sum(sample_weight) + assert nodes[1][1] == np.sum(sample_weight[X[:, feat_idx] < num_tresh]) + assert nodes[2][1] == np.sum(sample_weight[X[:, feat_idx] >= num_tresh]) + + +@pytest.mark.parametrize( + "left_sample_weight, right_sample_weight", [(2.5, 7.5), (1, 1), (0.5, 0.5)] +) +def test_sample_weight_leaf_weighted_nodes_classification_two_values( + left_sample_weight, right_sample_weight +): + # Ensures that the `weighted_n_node_samples` for each node in the predictor + # tree is the sum of `sample_weights` whose samples belong in that node + + n_samples = 1000 + X = np.array(n_samples*[0] + n_samples*[1]).reshape(-1, 1) + y = np.array(n_samples*[0] + n_samples*[1]) + + sample_weight = n_samples*[left_sample_weight] + n_samples*[right_sample_weight] + + hgbc = ( + HistGradientBoostingClassifier(min_samples_leaf=1, max_depth=1) + .fit(X, y, sample_weight) + ) + + for predictor in hgbc._predictors: + nodes = predictor[0].nodes + + assert nodes[0][1] == (left_sample_weight + right_sample_weight) * n_samples + assert nodes[1][1] == left_sample_weight * n_samples + assert nodes[2][1] == right_sample_weight * n_samples + + @pytest.mark.parametrize("Loss", (HalfSquaredError, AbsoluteError)) def test_sum_hessians_are_sample_weight(Loss): # For losses with constant hessians, the sum_hessians field of the From edadc0de9ba0861a12cf9d8bf5fd0088d2452a45 Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Thu, 9 Feb 2023 23:38:17 -0800 Subject: [PATCH 
05/11] Update changelog --- doc/whats_new/v1.3.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 08ebf4abc92c3..f7bba333a23f1 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -140,6 +140,12 @@ Changelog :class:`ensemble.BaggingRegressor` expose the `allow_nan` tag from the underlying estimator. :pr:`25506` by `Thomas Fan`_. +- |Enhancement| :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor` now take the user-supplied + `sample_weight` into account at `fit` time, so each node in the estimator's + predictors stores and uses the weighted sample count. + :pr:`25431` by `Andrew Wang .` + :mod:`sklearn.exception` ........................ - |Feature| Added :class:`exception.InconsistentVersionWarning` which is raised From 3056dd5ab45c0b0fa5b7a6ea080400a0ac905a72 Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Thu, 9 Feb 2023 23:39:34 -0800 Subject: [PATCH 06/11] Black fixes --- .../tests/test_gradient_boosting.py | 30 +++++++++---------- .../tests/test_grower.py | 6 +++- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 5f3e94c5a3e50..3222c55132b5b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -792,7 +792,7 @@ def test_sample_weight_effect(problem, duplication): "sample_weight_distribution", ("poisson", "exponential", "uniform") ) def test_sample_weight_leaf_weighted_nodes_classification_random( - sample_weight_distribution + sample_weight_distribution, ): # Ensures that the `weighted_n_node_samples` for each node in the predictor # tree is the sum of `sample_weights` whose samples belong in that node @@ -801,17 +801,16 @@ def test_sample_weight_leaf_weighted_nodes_classification_random( X, y = make_classification(n_samples=n_samples, random_state=0) if sample_weight_distribution == "poisson": - sample_weight = np.random.RandomState(0).poisson(lam=1 + 4*y) + sample_weight = np.random.RandomState(0).poisson(lam=1 + 4 * y) elif sample_weight_distribution == "exponential": - sample_weight = np.random.RandomState(0).exponential(scale=1 + 4*y) + sample_weight = np.random.RandomState(0).exponential(scale=1 + 4 * y) else: - sample_weight = np.random.RandomState(0).uniform(high=1 + 4*y) + sample_weight = np.random.RandomState(0).uniform(high=1 + 4 * y) + + hgbc = HistGradientBoostingClassifier( + random_state=0, min_samples_leaf=1, max_depth=1 + ).fit(X, y, sample_weight) - hgbc = ( - HistGradientBoostingClassifier(random_state=0, min_samples_leaf=1, max_depth=1) - .fit(X, y, sample_weight) - ) - for predictor in hgbc._predictors: nodes = predictor[0].nodes feat_idx = int(nodes[0][2]) @@ -832,16 +831,15 @@ def test_sample_weight_leaf_weighted_nodes_classification_two_values( # tree is the sum of `sample_weights` whose samples belong in that node n_samples = 1000 - X = np.array(n_samples*[0] + n_samples*[1]).reshape(-1, 1) - y = np.array(n_samples*[0] + n_samples*[1]) + X = np.array(n_samples * [0] + n_samples * [1]).reshape(-1, 1) + y = np.array(n_samples * [0] + n_samples * [1]) - sample_weight = n_samples*[left_sample_weight] + n_samples*[right_sample_weight] + sample_weight = n_samples * [left_sample_weight] + n_samples * 
[right_sample_weight] - hgbc = ( - HistGradientBoostingClassifier(min_samples_leaf=1, max_depth=1) - .fit(X, y, sample_weight) + hgbc = HistGradientBoostingClassifier(min_samples_leaf=1, max_depth=1).fit( + X, y, sample_weight ) - + for predictor in hgbc._predictors: nodes = predictor[0].nodes diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index 47fc47ed4d29f..eb045dfac11ad 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -376,7 +376,11 @@ def test_missing_value_predict_only(): while not node["is_leaf"]: left = predictor.nodes[node["left"]] right = predictor.nodes[node["right"]] - node = left if left["weighted_n_node_samples"] > right["weighted_n_node_samples"] else right + node = ( + left + if left["weighted_n_node_samples"] > right["weighted_n_node_samples"] + else right + ) prediction_main_path = node["value"] From 0c7e91e9299d4058f6bb1c84386a16f12a92b099 Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Wed, 15 Mar 2023 22:35:00 -0700 Subject: [PATCH 07/11] Fix changelog --- doc/whats_new/v1.3.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 24a715b6e3400..18bc9dec0b656 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -210,15 +210,15 @@ Changelog :class:`ensemble.BaggingRegressor` expose the `allow_nan` tag from the underlying estimator. :pr:`25506` by `Thomas Fan`_. +- |Fix| :meth:`ensemble.RandomForestClassifier.fit` sets `max_samples = 1` + when `max_samples` is a float and `round(n_samples * max_samples) < 1`. + :pr:`25601` by :user:`Jan Fidor `. + - |Enhancement| :class:`ensemble.HistGradientBoostingClassifier` and :class:`ensemble.HistGradientBoostingRegressor` now take the user-supplied `sample_weight` into account at `fit` time, so each node in the estimator's predictors stores and uses the weighted sample count. - :pr:`25431` by `Andrew Wang .` - -- |Fix| :meth:`ensemble.RandomForestClassifier.fit` sets `max_samples = 1` - when `max_samples` is a float and `round(n_samples * max_samples) < 1`. - :pr:`25601` by :user:`Jan Fidor `. + :pr:`25431` by :user:`Andrew Wang .` :mod:`sklearn.exception` ........................ 
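
At this point in the series, every predictor node stores `weighted_n_node_samples` (replacing the old integer `count`), and the tests added in patch 04 check that it equals the sum of the user-supplied weights routed through that node. The sketch below mirrors those tests after fitting; it is a sketch of the behaviour being tested, not a supported API — `_predictors`, the `.nodes` structured array, and the `weighted_n_node_samples` field are private details introduced by this series.

    # Sketch: after fit with sample_weight, each node of each predictor tree
    # carries the summed weight of the samples it saw (private attributes).
    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.ensemble import HistGradientBoostingClassifier

    X, y = make_classification(n_samples=1000, random_state=0)
    sample_weight = np.random.RandomState(0).uniform(high=1 + 4 * y)

    clf = HistGradientBoostingClassifier(
        random_state=0, min_samples_leaf=1, max_depth=1
    ).fit(X, y, sample_weight)

    nodes = clf._predictors[0][0].nodes        # structured array, one row per node
    root = nodes[0]
    assert np.isclose(root["weighted_n_node_samples"], sample_weight.sum())
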
From 5e325bd3831de79d63021f4e0c1867118ab35ae1 Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Mon, 20 Mar 2023 18:26:10 -0700 Subject: [PATCH 08/11] Remove stopping criteria that use sample_weight --- .../gradient_boosting.py | 22 ------------------- .../_hist_gradient_boosting/grower.py | 19 +++------------- 2 files changed, 3 insertions(+), 38 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index f20ede2117f12..25f6a091a0452 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -90,7 +90,6 @@ class BaseHistGradientBoosting(BaseEstimator, ABC): "max_leaf_nodes": [Interval(Integral, 2, None, closed="left"), None], "max_depth": [Interval(Integral, 1, None, closed="left"), None], "min_samples_leaf": [Interval(Integral, 1, None, closed="left")], - "min_weight_fraction_leaf": [Interval(Real, 0.0, 0.5, closed="both")], "l2_regularization": [Interval(Real, 0, None, closed="left")], "monotonic_cst": ["array-like", dict, None], "interaction_cst": [ @@ -125,7 +124,6 @@ def __init__( max_leaf_nodes, max_depth, min_samples_leaf, - min_weight_fraction_leaf, l2_regularization, max_bins, categorical_features, @@ -146,7 +144,6 @@ def __init__( self.max_leaf_nodes = max_leaf_nodes self.max_depth = max_depth self.min_samples_leaf = min_samples_leaf - self.min_weight_fraction_leaf = min_weight_fraction_leaf self.l2_regularization = l2_regularization self.max_bins = max_bins self.monotonic_cst = monotonic_cst @@ -389,12 +386,6 @@ def fit(self, X, y, sample_weight=None): # used for validation in predict n_samples, self._n_features = X.shape - # Set min_weight_leaf from min_weight_fraction_leaf - if sample_weight is None: - min_weight_leaf = self.min_weight_fraction_leaf * n_samples - else: - min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) - self.is_categorical_, known_categories = self._check_categories(X) # Encode constraints into a list of sets of features indices (integers). @@ -692,7 +683,6 @@ def fit(self, X, y, sample_weight=None): max_leaf_nodes=self.max_leaf_nodes, max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf, - min_weight_leaf=min_weight_leaf, l2_regularization=self.l2_regularization, shrinkage=self.learning_rate, n_threads=n_threads, @@ -1254,10 +1244,6 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): The minimum number of samples per leaf. For small datasets with less than a few hundred samples, it is recommended to lower this value since only very shallow trees would be built. - min_weight_fraction_leaf : float, default=0.0 - The minimum weighted fraction of the sum total of weights (of all - the input samples) required to be at a leaf node. Samples have - equal weight when sample_weight is not provided. l2_regularization : float, default=0 The L2 regularization parameter. Use ``0`` for no regularization (default). 
@@ -1460,7 +1446,6 @@ def __init__( max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, - min_weight_fraction_leaf=0.0, l2_regularization=0.0, max_bins=255, categorical_features=None, @@ -1482,7 +1467,6 @@ def __init__( max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, - min_weight_fraction_leaf=min_weight_fraction_leaf, l2_regularization=l2_regularization, max_bins=max_bins, monotonic_cst=monotonic_cst, @@ -1619,10 +1603,6 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting): The minimum number of samples per leaf. For small datasets with less than a few hundred samples, it is recommended to lower this value since only very shallow trees would be built. - min_weight_fraction_leaf : float, default=0.0 - The minimum weighted fraction of the sum total of weights (of all - the input samples) required to be at a leaf node. Samples have - equal weight when sample_weight is not provided. l2_regularization : float, default=0 The L2 regularization parameter. Use 0 for no regularization. max_bins : int, default=255 @@ -1824,7 +1804,6 @@ def __init__( max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, - min_weight_fraction_leaf=0.0, l2_regularization=0.0, max_bins=255, categorical_features=None, @@ -1847,7 +1826,6 @@ def __init__( max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, - min_weight_fraction_leaf=min_weight_fraction_leaf, l2_regularization=l2_regularization, max_bins=max_bins, categorical_features=categorical_features, diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 3537150ba4b9a..e869a1c1e8ef2 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -162,8 +162,6 @@ class TreeGrower: Depth isn't constrained by default. min_samples_leaf : int, default=20 The minimum number of samples per leaf. - min_weight_leaf: float, default=0. - The minimum weight of input samples required for a node to be a leaf. min_gain_to_split : float, default=0. The minimum gain needed to split a node. Splits with lower gain will be ignored. @@ -237,7 +235,6 @@ def __init__( max_leaf_nodes=None, max_depth=None, min_samples_leaf=20, - min_weight_leaf=0.0, min_gain_to_split=0.0, n_bins=256, n_bins_non_missing=None, @@ -330,7 +327,6 @@ def __init__( self.n_features = X_binned.shape[1] self.max_depth = max_depth self.min_samples_leaf = min_samples_leaf - self.min_weight_leaf = min_weight_leaf self.X_binned = X_binned self.min_gain_to_split = min_gain_to_split self.shrinkage = shrinkage @@ -413,10 +409,7 @@ def _intilialize_root(self, gradients, hessians, hessians_are_constant): self.root.partition_start = 0 self.root.partition_stop = n_samples - if ( - self.root.n_samples < self.min_samples_leaf * 2 - or self.root.weighted_n_node_samples < self.min_weight_leaf * 2 - ): + if self.root.n_samples < self.min_samples_leaf * 2: # Do not even bother computing any splitting statistics. 
self._finalize_leaf(self.root) return @@ -557,15 +550,9 @@ def split_next(self): self._finalize_leaf(right_child_node) return left_child_node, right_child_node - if ( - left_child_node.n_samples < self.min_samples_leaf * 2 - or left_child_node.weighted_n_node_samples < self.min_weight_leaf * 2 - ): + if left_child_node.n_samples < self.min_samples_leaf * 2: self._finalize_leaf(left_child_node) - if ( - right_child_node.n_samples < self.min_samples_leaf * 2 - or right_child_node.weighted_n_node_samples < self.min_weight_leaf * 2 - ): + if right_child_node.n_samples < self.min_samples_leaf * 2: self._finalize_leaf(right_child_node) if self.with_monotonic_cst: From ecdcc802943e522a1fd1cb5b4ef4f686652b7aba Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Wed, 22 Mar 2023 20:56:39 -0700 Subject: [PATCH 09/11] Move computation of weighted_n_node_samples to split_indices function --- .../_hist_gradient_boosting/grower.py | 23 ++++++++----------- .../_hist_gradient_boosting/splitting.pyx | 13 +++++++++++ 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index e869a1c1e8ef2..fb5fd5cca88ed 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -39,6 +39,8 @@ class TreeNode: The depth of the node, i.e. its distance from the root. sample_indices : ndarray of shape (n_samples_at_node,), dtype=np.uint The indices of the samples at the node. + weighted_n_node_samples : float + The weighted number of training samples at the node. sum_gradients : float The sum of the gradients of the samples at the node. sum_hessians : float @@ -95,11 +97,11 @@ class TreeNode: partition_start = 0 partition_stop = 0 - def __init__(self, depth, sample_indices, sum_gradients, sum_hessians, value=None): + def __init__(self, depth, sample_indices, weighted_n_node_samples, sum_gradients, sum_hessians, value=None): self.depth = depth self.sample_indices = sample_indices self.n_samples = sample_indices.shape[0] - self.weighted_n_node_samples = sample_indices.shape[0] + self.weighted_n_node_samples = weighted_n_node_samples self.sum_gradients = sum_gradients self.sum_hessians = sum_hessians self.value = value @@ -313,11 +315,11 @@ def __init__( min_samples_leaf, min_gain_to_split, hessians_are_constant, + sample_weight, n_threads, ) self.n_bins_non_missing = n_bins_non_missing self.missing_values_bin_idx = missing_values_bin_idx - self.sample_weight = sample_weight self.max_leaf_nodes = max_leaf_nodes self.has_missing_values = has_missing_values self.monotonic_cst = monotonic_cst @@ -397,15 +399,12 @@ def _intilialize_root(self, gradients, hessians, hessians_are_constant): self.root = TreeNode( depth=depth, sample_indices=self.splitter.partition, + weighted_n_node_samples=np.sum(self.splitter.sample_weight), sum_gradients=sum_gradients, sum_hessians=sum_hessians, value=0, ) - self.root.weighted_n_node_samples = self.sample_weight[ - self.root.sample_indices - ].sum() - self.root.partition_start = 0 self.root.partition_stop = n_samples @@ -476,6 +475,7 @@ def split_next(self): ( sample_indices_left, sample_indices_right, + right_weighted_n_node_samples, right_child_pos, ) = self.splitter.split_indices(node.split_info, node.sample_indices) self.total_apply_split_time += time() - tic @@ -487,6 +487,7 @@ def split_next(self): left_child_node = TreeNode( depth, sample_indices_left, + 
node.weighted_n_node_samples - right_weighted_n_node_samples, node.split_info.sum_gradient_left, node.split_info.sum_hessian_left, value=node.split_info.value_left, @@ -494,6 +495,7 @@ def split_next(self): right_child_node = TreeNode( depth, sample_indices_right, + right_weighted_n_node_samples, node.split_info.sum_gradient_right, node.split_info.sum_hessian_right, value=node.split_info.value_right, @@ -502,13 +504,6 @@ def split_next(self): node.right_child = right_child_node node.left_child = left_child_node - left_child_node.weighted_n_node_samples = self.sample_weight[ - sample_indices_left - ].sum() - right_child_node.weighted_n_node_samples = self.sample_weight[ - sample_indices_right - ].sum() - # set start and stop indices left_child_node.partition_start = node.partition_start left_child_node.partition_stop = node.partition_start + right_child_pos diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index cdeb373350ed4..37e39c77e2581 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -155,6 +155,9 @@ cdef class Splitter: be ignored. hessians_are_constant: bool, default is False Whether hessians are constant. + sample_weight: ndarray of float, shape (n_samples,), default=None + Weights of training data. If not provided, all samples are assumed + to have uniform weight. n_threads : int, default=1 Number of OpenMP threads to use. """ @@ -166,6 +169,7 @@ cdef class Splitter: const unsigned char [::1] has_missing_values const unsigned char [::1] is_categorical const signed char [::1] monotonic_cst + const Y_DTYPE_C [::1] sample_weight unsigned char hessians_are_constant Y_DTYPE_C l2_regularization Y_DTYPE_C min_hessian_to_split @@ -189,6 +193,7 @@ cdef class Splitter: unsigned int min_samples_leaf=20, Y_DTYPE_C min_gain_to_split=0., unsigned char hessians_are_constant=False, + const Y_DTYPE_C [::1] sample_weight=None, unsigned int n_threads=1): self.X_binned = X_binned @@ -203,6 +208,7 @@ cdef class Splitter: self.min_samples_leaf = min_samples_leaf self.min_gain_to_split = min_gain_to_split self.hessians_are_constant = hessians_are_constant + self.sample_weight = sample_weight if sample_weight is not None else np.ones(X_binned.shape[0], dtype=np.float64) self.n_threads = n_threads # The partition array maps each sample index into the leaves of the @@ -247,6 +253,8 @@ cdef class Splitter: right_indices : ndarray of int, shape (n_right_samples,) The indices of the samples in the right child. This is a view on self.partition. + right_weighted_n_node_samples : float + The weighted number of training samples in the right child. right_child_position : int The position of the right child in ``sample_indices``. 
""" @@ -302,6 +310,7 @@ cdef class Splitter: self.X_binned[:, feature_idx] unsigned int [::1] left_indices_buffer = self.left_indices_buffer unsigned int [::1] right_indices_buffer = self.right_indices_buffer + const Y_DTYPE_C [::1] sample_weight = self.sample_weight unsigned char is_categorical = split_info.is_categorical # Cython is unhappy if we set left_cat_bitset to # split_info.left_cat_bitset directly, so we need a tmp var @@ -321,6 +330,7 @@ cdef class Splitter: int i int thread_idx int sample_idx + double right_weighted_n_node_samples int right_child_position unsigned char turn_left int [:] left_offset = np.zeros(n_threads, dtype=np.int32) @@ -339,6 +349,7 @@ cdef class Splitter: offset_in_buffers[thread_idx - 1] + sizes[thread_idx - 1] # map indices from sample_indices to left/right_indices_buffer + right_weighted_n_node_samples = 0 for thread_idx in prange(n_threads, schedule='static', chunksize=1, num_threads=n_threads): left_count = 0 @@ -360,6 +371,7 @@ cdef class Splitter: else: right_indices_buffer[start + right_count] = sample_idx right_count = right_count + 1 + right_weighted_n_node_samples += sample_weight[sample_idx] left_counts[thread_idx] = left_count right_counts[thread_idx] = right_count @@ -410,6 +422,7 @@ cdef class Splitter: return (sample_indices[:right_child_position], sample_indices[right_child_position:], + right_weighted_n_node_samples, right_child_position) def find_node_split( From 228f4b20d754ab30c3d50e00057d12b71bdf9916 Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Wed, 22 Mar 2023 21:26:20 -0700 Subject: [PATCH 10/11] Update test cases --- .../tests/test_gradient_boosting.py | 6 +++--- .../_hist_gradient_boosting/tests/test_splitting.py | 11 +++++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 5b0b59f78f6db..e1811fe450695 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -789,9 +789,9 @@ def test_sample_weight_leaf_weighted_nodes_classification_random( feat_idx = int(nodes[0][2]) num_tresh = nodes[0][3] - assert nodes[0][1] == np.sum(sample_weight) - assert nodes[1][1] == np.sum(sample_weight[X[:, feat_idx] < num_tresh]) - assert nodes[2][1] == np.sum(sample_weight[X[:, feat_idx] >= num_tresh]) + assert_allclose(nodes[0][1], sample_weight.sum()) + assert_allclose(nodes[1][1], sample_weight[X[:, feat_idx] < num_tresh].sum()) + assert_allclose(nodes[2][1], sample_weight[X[:, feat_idx] >= num_tresh].sum()) @pytest.mark.parametrize( diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index d1da34015a2a4..3a3e64a339425 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -158,7 +158,7 @@ def test_gradient_and_hessian_sanity(constant_hessian): si_parent = splitter.find_node_split( n_samples, hists_parent, sum_gradients, sum_hessians, value_parent ) - sample_indices_left, sample_indices_right, _ = splitter.split_indices( + sample_indices_left, sample_indices_right, _, _ = splitter.split_indices( si_parent, sample_indices ) @@ -308,12 +308,15 @@ def test_split_indices(): assert si_root.feature_idx == 1 assert si_root.bin_idx == 3 - 
samples_left, samples_right, position_right = splitter.split_indices( + samples_left, samples_right, right_weight, position_right = splitter.split_indices( si_root, splitter.partition ) assert set(samples_left) == set([0, 1, 3, 4, 5, 6, 8]) assert set(samples_right) == set([2, 7, 9]) + assert len(list(samples_left)) == n_samples - right_weight + assert len(list(samples_right)) == right_weight + assert list(samples_left) == list(splitter.partition[:position_right]) assert list(samples_right) == list(splitter.partition[position_right:]) @@ -557,7 +560,7 @@ def test_splitting_missing_values( # Make sure the split is properly computed. # This also make sure missing values are properly assigned to the correct # child in split_indices() - samples_left, samples_right, _ = splitter.split_indices( + samples_left, samples_right, _, _ = splitter.split_indices( split_info, splitter.partition ) @@ -849,7 +852,7 @@ def test_splitting_categorical_sanity( # is set later in the grower. # make sure samples are split correctly - samples_left, samples_right, _ = splitter.split_indices( + samples_left, samples_right, _, _ = splitter.split_indices( split_info, splitter.partition ) From fb234b79f70d8108f0cd6457b0b64cfaae1a4a9d Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Wed, 22 Mar 2023 21:34:20 -0700 Subject: [PATCH 11/11] Fix formatting --- sklearn/ensemble/_hist_gradient_boosting/grower.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index fb5fd5cca88ed..755fa69f88181 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -97,7 +97,15 @@ class TreeNode: partition_start = 0 partition_stop = 0 - def __init__(self, depth, sample_indices, weighted_n_node_samples, sum_gradients, sum_hessians, value=None): + def __init__( + self, + depth, + sample_indices, + weighted_n_node_samples, + sum_gradients, + sum_hessians, + value=None, + ): self.depth = depth self.sample_indices = sample_indices self.n_samples = sample_indices.shape[0]
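
The last two patches move the weighted-count bookkeeping into `split_indices`, which now also returns the weighted number of samples sent to the right child; the left child's weight is then recovered as the parent's weight minus the right weight, so no second pass over `sample_weight` is needed. The following is a minimal NumPy sketch of that invariant — `goes_right` is a hypothetical stand-in for the split decision that the real Cython splitter computes.

    # Invariant relied on by patch 09: summing only the weights that go right
    # during partitioning is enough, because
    #   left_weight = parent_weight - right_weight.
    import numpy as np

    rng = np.random.RandomState(0)
    sample_weight = rng.exponential(size=10)
    goes_right = rng.rand(10) < 0.4                 # hypothetical split decision

    parent_weight = sample_weight.sum()
    right_weight = sample_weight[goes_right].sum()  # accumulated while splitting
    left_weight = parent_weight - right_weight      # no extra pass over the weights

    assert np.isclose(left_weight, sample_weight[~goes_right].sum())
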