From a5168dd3ede578ca26efbbedee733c680e11ff1a Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Wed, 18 Jan 2023 20:36:42 -0800 Subject: [PATCH 01/11] Support user-supplied sample weights when fitting HistGradientBoosting estimator --- .../_hist_gradient_boosting/_predictor.pyx | 4 +- .../_hist_gradient_boosting/common.pxd | 2 +- .../_hist_gradient_boosting/common.pyx | 2 +- .../gradient_boosting.py | 23 +++++++++++ .../_hist_gradient_boosting/grower.py | 40 +++++++++++++++++-- 5 files changed, 63 insertions(+), 8 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx index dab18bdd1d49c..b7c28b317201a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx @@ -236,8 +236,8 @@ def _compute_partial_dependence( # push left child node_idx_stack[stack_size] = current_node.left left_sample_frac = ( - nodes[current_node.left].count / - current_node.count) + nodes[current_node.left].weighted_n_node_samples / + current_node.weighted_n_node_samples) current_weight = weight_stack[stack_size] weight_stack[stack_size] = current_weight * left_sample_frac stack_size += 1 diff --git a/sklearn/ensemble/_hist_gradient_boosting/common.pxd b/sklearn/ensemble/_hist_gradient_boosting/common.pxd index d1c70f0483ed4..c49072af5ba1c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/common.pxd +++ b/sklearn/ensemble/_hist_gradient_boosting/common.pxd @@ -22,7 +22,7 @@ cdef packed struct node_struct: # Equivalent struct to PREDICTOR_RECORD_DTYPE to use in memory views. It # needs to be packed since by default numpy dtypes aren't aligned Y_DTYPE_C value - unsigned int count + Y_DTYPE_C weighted_n_node_samples unsigned int feature_idx X_DTYPE_C num_threshold unsigned char missing_go_to_left diff --git a/sklearn/ensemble/_hist_gradient_boosting/common.pyx b/sklearn/ensemble/_hist_gradient_boosting/common.pyx index f7b36f5796508..6cc11a8eef666 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/common.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/common.pyx @@ -18,7 +18,7 @@ HISTOGRAM_DTYPE = np.dtype([ PREDICTOR_RECORD_DTYPE = np.dtype([ ('value', Y_DTYPE), - ('count', np.uint32), + ('weighted_n_node_samples', Y_DTYPE), ('feature_idx', np.uint32), ('num_threshold', X_DTYPE), ('missing_go_to_left', np.uint8), diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 74322eef79bbc..5143395c7ad33 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -91,6 +91,7 @@ class BaseHistGradientBoosting(BaseEstimator, ABC): "max_leaf_nodes": [Interval(Integral, 2, None, closed="left"), None], "max_depth": [Interval(Integral, 1, None, closed="left"), None], "min_samples_leaf": [Interval(Integral, 1, None, closed="left")], + "min_weight_fraction_leaf": [Interval(Real, 0.0, 0.5, closed="both")], "l2_regularization": [Interval(Real, 0, None, closed="left")], "monotonic_cst": ["array-like", dict, None], "interaction_cst": [ @@ -125,6 +126,7 @@ def __init__( max_leaf_nodes, max_depth, min_samples_leaf, + min_weight_fraction_leaf, l2_regularization, max_bins, categorical_features, @@ -145,6 +147,7 @@ def __init__( self.max_leaf_nodes = max_leaf_nodes self.max_depth = max_depth self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf 
= min_weight_fraction_leaf self.l2_regularization = l2_regularization self.max_bins = max_bins self.monotonic_cst = monotonic_cst @@ -371,6 +374,12 @@ def fit(self, X, y, sample_weight=None): # TODO: remove when PDP supports sample weights self._fitted_with_sw = True + # Set min_weight_leaf from min_weight_fraction_leaf + if sample_weight is None: + min_weight_leaf = self.min_weight_fraction_leaf * n_samples + else: + min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) + sample_weight = self._finalize_sample_weight(sample_weight, y) rng = check_random_state(self.random_state) @@ -674,6 +683,7 @@ def fit(self, X, y, sample_weight=None): X_binned=X_binned_train, gradients=g_view[:, k], hessians=h_view[:, k], + sample_weight=sample_weight_train, n_bins=n_bins, n_bins_non_missing=self._bin_mapper.n_bins_non_missing_, has_missing_values=has_missing_values, @@ -683,6 +693,7 @@ def fit(self, X, y, sample_weight=None): max_leaf_nodes=self.max_leaf_nodes, max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf, + min_weight_leaf=min_weight_leaf, l2_regularization=self.l2_regularization, shrinkage=self.learning_rate, n_threads=n_threads, @@ -1240,6 +1251,10 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): The minimum number of samples per leaf. For small datasets with less than a few hundred samples, it is recommended to lower this value since only very shallow trees would be built. + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. l2_regularization : float, default=0 The L2 regularization parameter. Use ``0`` for no regularization (default). @@ -1434,6 +1449,7 @@ def __init__( max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, + min_weight_fraction_leaf=0.0, l2_regularization=0.0, max_bins=255, categorical_features=None, @@ -1455,6 +1471,7 @@ def __init__( max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, + min_weight_fraction_leaf=min_weight_fraction_leaf, l2_regularization=l2_regularization, max_bins=max_bins, monotonic_cst=monotonic_cst, @@ -1593,6 +1610,10 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting): The minimum number of samples per leaf. For small datasets with less than a few hundred samples, it is recommended to lower this value since only very shallow trees would be built. + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. l2_regularization : float, default=0 The L2 regularization parameter. Use 0 for no regularization. 
max_bins : int, default=255 @@ -1810,6 +1831,7 @@ def __init__( max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, + min_weight_fraction_leaf=0.0, l2_regularization=0.0, max_bins=255, categorical_features=None, @@ -1832,6 +1854,7 @@ def __init__( max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, + min_weight_fraction_leaf=min_weight_fraction_leaf, l2_regularization=l2_regularization, max_bins=max_bins, categorical_features=categorical_features, diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index c4669da4a60a9..3537150ba4b9a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -15,6 +15,7 @@ from .histogram import HistogramBuilder from .predictor import TreePredictor from .utils import sum_parallel +from ...utils.validation import _check_sample_weight from .common import PREDICTOR_RECORD_DTYPE from .common import X_BITSET_INNER_DTYPE from .common import Y_DTYPE @@ -98,6 +99,7 @@ def __init__(self, depth, sample_indices, sum_gradients, sum_hessians, value=Non self.depth = depth self.sample_indices = sample_indices self.n_samples = sample_indices.shape[0] + self.weighted_n_node_samples = sample_indices.shape[0] self.sum_gradients = sum_gradients self.sum_hessians = sum_hessians self.value = value @@ -149,6 +151,8 @@ class TreeGrower: hessians : ndarray of shape (n_samples,) The hessians of each training sample. Those are the hessians of the loss w.r.t the predictions, evaluated at iteration ``i - 1``. + sample_weight : array-like of shape (n_samples,), default=None + Weights of training data. max_leaf_nodes : int, default=None The maximum number of leaves for each tree. If None, there is no maximum limit. @@ -158,6 +162,8 @@ class TreeGrower: Depth isn't constrained by default. min_samples_leaf : int, default=20 The minimum number of samples per leaf. + min_weight_leaf: float, default=0. + The minimum weight of input samples required for a node to be a leaf. min_gain_to_split : float, default=0. The minimum gain needed to split a node. Splits with lower gain will be ignored. @@ -227,9 +233,11 @@ def __init__( X_binned, gradients, hessians, + sample_weight=None, max_leaf_nodes=None, max_depth=None, min_samples_leaf=20, + min_weight_leaf=0.0, min_gain_to_split=0.0, n_bins=256, n_bins_non_missing=None, @@ -264,6 +272,8 @@ def __init__( has_missing_values = [has_missing_values] * X_binned.shape[1] has_missing_values = np.asarray(has_missing_values, dtype=np.uint8) + sample_weight = _check_sample_weight(sample_weight, X_binned, dtype=np.float64) + # `monotonic_cst` validation is done in _validate_monotonic_cst # at the estimator level and therefore the following should not be # needed when using the public API. 
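
Note on the grower hunk above: the new `_check_sample_weight(sample_weight, X_binned, dtype=np.float64)` call normalizes the optional per-sample weights before the grower uses them, so the weighted node counts added later in this patch degrade gracefully to plain sample counts when no weights are supplied. The snippet below is illustrative only — it shows the contract this patch assumes from that private scikit-learn helper, with a made-up toy `X_binned` array.

    # Illustrative sketch of the defaulting behaviour the grower relies on.
    # _check_sample_weight is a private scikit-learn utility: given None it
    # returns an array of ones, otherwise it validates shape and dtype.
    import numpy as np
    from sklearn.utils.validation import _check_sample_weight

    X_binned = np.zeros((5, 2), dtype=np.uint8)   # toy binned data (assumption)

    sw_default = _check_sample_weight(None, X_binned, dtype=np.float64)
    sw_custom = _check_sample_weight([1.0, 2.0, 0.5, 1.0, 3.0], X_binned, dtype=np.float64)

    print(sw_default)  # [1. 1. 1. 1. 1.] -> weighted counts equal unweighted counts
    print(sw_custom)   # validated float64 array of the user-supplied weights
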
@@ -310,6 +320,7 @@ def __init__( ) self.n_bins_non_missing = n_bins_non_missing self.missing_values_bin_idx = missing_values_bin_idx + self.sample_weight = sample_weight self.max_leaf_nodes = max_leaf_nodes self.has_missing_values = has_missing_values self.monotonic_cst = monotonic_cst @@ -319,6 +330,7 @@ def __init__( self.n_features = X_binned.shape[1] self.max_depth = max_depth self.min_samples_leaf = min_samples_leaf + self.min_weight_leaf = min_weight_leaf self.X_binned = X_binned self.min_gain_to_split = min_gain_to_split self.shrinkage = shrinkage @@ -394,10 +406,17 @@ def _intilialize_root(self, gradients, hessians, hessians_are_constant): value=0, ) + self.root.weighted_n_node_samples = self.sample_weight[ + self.root.sample_indices + ].sum() + self.root.partition_start = 0 self.root.partition_stop = n_samples - if self.root.n_samples < 2 * self.min_samples_leaf: + if ( + self.root.n_samples < self.min_samples_leaf * 2 + or self.root.weighted_n_node_samples < self.min_weight_leaf * 2 + ): # Do not even bother computing any splitting statistics. self._finalize_leaf(self.root) return @@ -490,6 +509,13 @@ def split_next(self): node.right_child = right_child_node node.left_child = left_child_node + left_child_node.weighted_n_node_samples = self.sample_weight[ + sample_indices_left + ].sum() + right_child_node.weighted_n_node_samples = self.sample_weight[ + sample_indices_right + ].sum() + # set start and stop indices left_child_node.partition_start = node.partition_start left_child_node.partition_stop = node.partition_start + right_child_pos @@ -531,9 +557,15 @@ def split_next(self): self._finalize_leaf(right_child_node) return left_child_node, right_child_node - if left_child_node.n_samples < self.min_samples_leaf * 2: + if ( + left_child_node.n_samples < self.min_samples_leaf * 2 + or left_child_node.weighted_n_node_samples < self.min_weight_leaf * 2 + ): self._finalize_leaf(left_child_node) - if right_child_node.n_samples < self.min_samples_leaf * 2: + if ( + right_child_node.n_samples < self.min_samples_leaf * 2 + or right_child_node.weighted_n_node_samples < self.min_weight_leaf * 2 + ): self._finalize_leaf(right_child_node) if self.with_monotonic_cst: @@ -718,7 +750,7 @@ def _fill_predictor_arrays( ): """Helper used in make_predictor to set the TreePredictor fields.""" node = predictor_nodes[next_free_node_idx] - node["count"] = grower_node.n_samples + node["weighted_n_node_samples"] = grower_node.weighted_n_node_samples node["depth"] = grower_node.depth if grower_node.split_info is not None: node["gain"] = grower_node.split_info.gain From 964edd04515858331d9a034b6d8c1df0008fe359 Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Thu, 19 Jan 2023 16:15:44 -0800 Subject: [PATCH 02/11] Fix ordering of min_weight_leaf calculation --- .../_hist_gradient_boosting/gradient_boosting.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 5143395c7ad33..e0c47d937a9a1 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -374,12 +374,6 @@ def fit(self, X, y, sample_weight=None): # TODO: remove when PDP supports sample weights self._fitted_with_sw = True - # Set min_weight_leaf from min_weight_fraction_leaf - if sample_weight is None: - min_weight_leaf = self.min_weight_fraction_leaf * n_samples - 
else: - min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) - sample_weight = self._finalize_sample_weight(sample_weight, y) rng = check_random_state(self.random_state) @@ -396,6 +390,12 @@ def fit(self, X, y, sample_weight=None): # used for validation in predict n_samples, self._n_features = X.shape + # Set min_weight_leaf from min_weight_fraction_leaf + if sample_weight is None: + min_weight_leaf = self.min_weight_fraction_leaf * n_samples + else: + min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) + self.is_categorical_, known_categories = self._check_categories(X) # Encode constraints into a list of sets of features indices (integers). From 01f558612c4f0eed3d2fad926b40d9e3a93f51c6 Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Wed, 25 Jan 2023 16:38:50 -0800 Subject: [PATCH 03/11] Update all references of 'count' to 'weighted_n_node_samples' in tests --- .../_hist_gradient_boosting/tests/test_grower.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index c4ae90b7e7d96..47fc47ed4d29f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -250,11 +250,11 @@ def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins, constant_hessian, if n_samples >= min_samples_leaf: for node in predictor.nodes: if node["is_leaf"]: - assert node["count"] >= min_samples_leaf + assert node["weighted_n_node_samples"] >= min_samples_leaf else: assert predictor.nodes.shape[0] == 1 assert predictor.nodes[0]["is_leaf"] - assert predictor.nodes[0]["count"] == n_samples + assert predictor.nodes[0]["weighted_n_node_samples"] == n_samples @pytest.mark.parametrize("n_samples, min_samples_leaf", [(99, 50), (100, 50)]) @@ -376,7 +376,7 @@ def test_missing_value_predict_only(): while not node["is_leaf"]: left = predictor.nodes[node["left"]] right = predictor.nodes[node["right"]] - node = left if left["count"] > right["count"] else right + node = left if left["weighted_n_node_samples"] > right["weighted_n_node_samples"] else right prediction_main_path = node["value"] @@ -466,14 +466,14 @@ def test_grow_tree_categories(): categories = [np.array([4, 9], dtype=X_DTYPE)] predictor = grower.make_predictor(binning_thresholds=categories) root = predictor.nodes[0] - assert root["count"] == 23 + assert root["weighted_n_node_samples"] == 23 assert root["depth"] == 0 assert root["is_categorical"] left, right = predictor.nodes[root["left"]], predictor.nodes[root["right"]] # arbitrary validation, but this means ones go to the left. 
- assert left["count"] >= right["count"] + assert left["weighted_n_node_samples"] >= right["weighted_n_node_samples"] # check binned category value (1) expected_binned_cat_bitset = [2**1] + [0] * 7 From 22d80e34bf6c20cc726792e65fd4d2e9f47a4634 Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Thu, 9 Feb 2023 23:24:59 -0800 Subject: [PATCH 04/11] Add tests --- .../tests/test_gradient_boosting.py | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 7e774d9f09f45..5f3e94c5a3e50 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -788,6 +788,68 @@ def test_sample_weight_effect(problem, duplication): assert np.allclose(est_sw._raw_predict(X_dup), est_dup._raw_predict(X_dup)) +@pytest.mark.parametrize( + "sample_weight_distribution", ("poisson", "exponential", "uniform") +) +def test_sample_weight_leaf_weighted_nodes_classification_random( + sample_weight_distribution +): + # Ensures that the `weighted_n_node_samples` for each node in the predictor + # tree is the sum of `sample_weights` whose samples belong in that node + + n_samples = 1000 + X, y = make_classification(n_samples=n_samples, random_state=0) + + if sample_weight_distribution == "poisson": + sample_weight = np.random.RandomState(0).poisson(lam=1 + 4*y) + elif sample_weight_distribution == "exponential": + sample_weight = np.random.RandomState(0).exponential(scale=1 + 4*y) + else: + sample_weight = np.random.RandomState(0).uniform(high=1 + 4*y) + + hgbc = ( + HistGradientBoostingClassifier(random_state=0, min_samples_leaf=1, max_depth=1) + .fit(X, y, sample_weight) + ) + + for predictor in hgbc._predictors: + nodes = predictor[0].nodes + feat_idx = int(nodes[0][2]) + num_tresh = nodes[0][3] + + assert nodes[0][1] == np.sum(sample_weight) + assert nodes[1][1] == np.sum(sample_weight[X[:, feat_idx] < num_tresh]) + assert nodes[2][1] == np.sum(sample_weight[X[:, feat_idx] >= num_tresh]) + + +@pytest.mark.parametrize( + "left_sample_weight, right_sample_weight", [(2.5, 7.5), (1, 1), (0.5, 0.5)] +) +def test_sample_weight_leaf_weighted_nodes_classification_two_values( + left_sample_weight, right_sample_weight +): + # Ensures that the `weighted_n_node_samples` for each node in the predictor + # tree is the sum of `sample_weights` whose samples belong in that node + + n_samples = 1000 + X = np.array(n_samples*[0] + n_samples*[1]).reshape(-1, 1) + y = np.array(n_samples*[0] + n_samples*[1]) + + sample_weight = n_samples*[left_sample_weight] + n_samples*[right_sample_weight] + + hgbc = ( + HistGradientBoostingClassifier(min_samples_leaf=1, max_depth=1) + .fit(X, y, sample_weight) + ) + + for predictor in hgbc._predictors: + nodes = predictor[0].nodes + + assert nodes[0][1] == (left_sample_weight + right_sample_weight) * n_samples + assert nodes[1][1] == left_sample_weight * n_samples + assert nodes[2][1] == right_sample_weight * n_samples + + @pytest.mark.parametrize("Loss", (HalfSquaredError, AbsoluteError)) def test_sum_hessians_are_sample_weight(Loss): # For losses with constant hessians, the sum_hessians field of the From edadc0de9ba0861a12cf9d8bf5fd0088d2452a45 Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Thu, 9 Feb 2023 23:38:17 -0800 Subject: [PATCH 
05/11] Update changelog --- doc/whats_new/v1.3.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 08ebf4abc92c3..f7bba333a23f1 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -140,6 +140,12 @@ Changelog :class:`ensemble.BaggingRegressor` expose the `allow_nan` tag from the underlying estimator. :pr:`25506` by `Thomas Fan`_. +- |Enhancement| :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor` now take the user-supplied + `sample_weight` into account at `fit` time, so each node in the estimator's + predictors stores and uses the weighted sample count. + :pr:`25431` by `Andrew Wang .` + :mod:`sklearn.exception` ........................ - |Feature| Added :class:`exception.InconsistentVersionWarning` which is raised From 3056dd5ab45c0b0fa5b7a6ea080400a0ac905a72 Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Thu, 9 Feb 2023 23:39:34 -0800 Subject: [PATCH 06/11] Black fixes --- .../tests/test_gradient_boosting.py | 30 +++++++++---------- .../tests/test_grower.py | 6 +++- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 5f3e94c5a3e50..3222c55132b5b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -792,7 +792,7 @@ def test_sample_weight_effect(problem, duplication): "sample_weight_distribution", ("poisson", "exponential", "uniform") ) def test_sample_weight_leaf_weighted_nodes_classification_random( - sample_weight_distribution + sample_weight_distribution, ): # Ensures that the `weighted_n_node_samples` for each node in the predictor # tree is the sum of `sample_weights` whose samples belong in that node @@ -801,17 +801,16 @@ def test_sample_weight_leaf_weighted_nodes_classification_random( X, y = make_classification(n_samples=n_samples, random_state=0) if sample_weight_distribution == "poisson": - sample_weight = np.random.RandomState(0).poisson(lam=1 + 4*y) + sample_weight = np.random.RandomState(0).poisson(lam=1 + 4 * y) elif sample_weight_distribution == "exponential": - sample_weight = np.random.RandomState(0).exponential(scale=1 + 4*y) + sample_weight = np.random.RandomState(0).exponential(scale=1 + 4 * y) else: - sample_weight = np.random.RandomState(0).uniform(high=1 + 4*y) + sample_weight = np.random.RandomState(0).uniform(high=1 + 4 * y) + + hgbc = HistGradientBoostingClassifier( + random_state=0, min_samples_leaf=1, max_depth=1 + ).fit(X, y, sample_weight) - hgbc = ( - HistGradientBoostingClassifier(random_state=0, min_samples_leaf=1, max_depth=1) - .fit(X, y, sample_weight) - ) - for predictor in hgbc._predictors: nodes = predictor[0].nodes feat_idx = int(nodes[0][2]) @@ -832,16 +831,15 @@ def test_sample_weight_leaf_weighted_nodes_classification_two_values( # tree is the sum of `sample_weights` whose samples belong in that node n_samples = 1000 - X = np.array(n_samples*[0] + n_samples*[1]).reshape(-1, 1) - y = np.array(n_samples*[0] + n_samples*[1]) + X = np.array(n_samples * [0] + n_samples * [1]).reshape(-1, 1) + y = np.array(n_samples * [0] + n_samples * [1]) - sample_weight = n_samples*[left_sample_weight] + n_samples*[right_sample_weight] + sample_weight = n_samples * [left_sample_weight] + n_samples * 
[right_sample_weight] - hgbc = ( - HistGradientBoostingClassifier(min_samples_leaf=1, max_depth=1) - .fit(X, y, sample_weight) + hgbc = HistGradientBoostingClassifier(min_samples_leaf=1, max_depth=1).fit( + X, y, sample_weight ) - + for predictor in hgbc._predictors: nodes = predictor[0].nodes diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index 47fc47ed4d29f..eb045dfac11ad 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -376,7 +376,11 @@ def test_missing_value_predict_only(): while not node["is_leaf"]: left = predictor.nodes[node["left"]] right = predictor.nodes[node["right"]] - node = left if left["weighted_n_node_samples"] > right["weighted_n_node_samples"] else right + node = ( + left + if left["weighted_n_node_samples"] > right["weighted_n_node_samples"] + else right + ) prediction_main_path = node["value"] From 0c7e91e9299d4058f6bb1c84386a16f12a92b099 Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Wed, 15 Mar 2023 22:35:00 -0700 Subject: [PATCH 07/11] Fix changelog --- doc/whats_new/v1.3.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 24a715b6e3400..18bc9dec0b656 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -210,15 +210,15 @@ Changelog :class:`ensemble.BaggingRegressor` expose the `allow_nan` tag from the underlying estimator. :pr:`25506` by `Thomas Fan`_. +- |Fix| :meth:`ensemble.RandomForestClassifier.fit` sets `max_samples = 1` + when `max_samples` is a float and `round(n_samples * max_samples) < 1`. + :pr:`25601` by :user:`Jan Fidor `. + - |Enhancement| :class:`ensemble.HistGradientBoostingClassifier` and :class:`ensemble.HistGradientBoostingRegressor` now take the user-supplied `sample_weight` into account at `fit` time, so each node in the estimator's predictors stores and uses the weighted sample count. - :pr:`25431` by `Andrew Wang .` - -- |Fix| :meth:`ensemble.RandomForestClassifier.fit` sets `max_samples = 1` - when `max_samples` is a float and `round(n_samples * max_samples) < 1`. - :pr:`25601` by :user:`Jan Fidor `. + :pr:`25431` by :user:`Andrew Wang .` :mod:`sklearn.exception` ........................ 
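
At this point in the series, every predictor node stores `weighted_n_node_samples` (replacing the old integer `count`), and the tests added in patch 04 check that it equals the sum of the user-supplied weights routed through that node. The sketch below mirrors those tests after fitting; it is a sketch of the behaviour being tested, not a supported API — `_predictors`, the `.nodes` structured array, and the `weighted_n_node_samples` field are private details introduced by this series.

    # Sketch: after fit with sample_weight, each node of each predictor tree
    # carries the summed weight of the samples it saw (private attributes).
    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.ensemble import HistGradientBoostingClassifier

    X, y = make_classification(n_samples=1000, random_state=0)
    sample_weight = np.random.RandomState(0).uniform(high=1 + 4 * y)

    clf = HistGradientBoostingClassifier(
        random_state=0, min_samples_leaf=1, max_depth=1
    ).fit(X, y, sample_weight)

    nodes = clf._predictors[0][0].nodes        # structured array, one row per node
    root = nodes[0]
    assert np.isclose(root["weighted_n_node_samples"], sample_weight.sum())
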
From 5e325bd3831de79d63021f4e0c1867118ab35ae1 Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Mon, 20 Mar 2023 18:26:10 -0700 Subject: [PATCH 08/11] Remove stopping criteria that use sample_weight --- .../gradient_boosting.py | 22 ------------------- .../_hist_gradient_boosting/grower.py | 19 +++------------- 2 files changed, 3 insertions(+), 38 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index f20ede2117f12..25f6a091a0452 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -90,7 +90,6 @@ class BaseHistGradientBoosting(BaseEstimator, ABC): "max_leaf_nodes": [Interval(Integral, 2, None, closed="left"), None], "max_depth": [Interval(Integral, 1, None, closed="left"), None], "min_samples_leaf": [Interval(Integral, 1, None, closed="left")], - "min_weight_fraction_leaf": [Interval(Real, 0.0, 0.5, closed="both")], "l2_regularization": [Interval(Real, 0, None, closed="left")], "monotonic_cst": ["array-like", dict, None], "interaction_cst": [ @@ -125,7 +124,6 @@ def __init__( max_leaf_nodes, max_depth, min_samples_leaf, - min_weight_fraction_leaf, l2_regularization, max_bins, categorical_features, @@ -146,7 +144,6 @@ def __init__( self.max_leaf_nodes = max_leaf_nodes self.max_depth = max_depth self.min_samples_leaf = min_samples_leaf - self.min_weight_fraction_leaf = min_weight_fraction_leaf self.l2_regularization = l2_regularization self.max_bins = max_bins self.monotonic_cst = monotonic_cst @@ -389,12 +386,6 @@ def fit(self, X, y, sample_weight=None): # used for validation in predict n_samples, self._n_features = X.shape - # Set min_weight_leaf from min_weight_fraction_leaf - if sample_weight is None: - min_weight_leaf = self.min_weight_fraction_leaf * n_samples - else: - min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) - self.is_categorical_, known_categories = self._check_categories(X) # Encode constraints into a list of sets of features indices (integers). @@ -692,7 +683,6 @@ def fit(self, X, y, sample_weight=None): max_leaf_nodes=self.max_leaf_nodes, max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf, - min_weight_leaf=min_weight_leaf, l2_regularization=self.l2_regularization, shrinkage=self.learning_rate, n_threads=n_threads, @@ -1254,10 +1244,6 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): The minimum number of samples per leaf. For small datasets with less than a few hundred samples, it is recommended to lower this value since only very shallow trees would be built. - min_weight_fraction_leaf : float, default=0.0 - The minimum weighted fraction of the sum total of weights (of all - the input samples) required to be at a leaf node. Samples have - equal weight when sample_weight is not provided. l2_regularization : float, default=0 The L2 regularization parameter. Use ``0`` for no regularization (default). 
@@ -1460,7 +1446,6 @@ def __init__( max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, - min_weight_fraction_leaf=0.0, l2_regularization=0.0, max_bins=255, categorical_features=None, @@ -1482,7 +1467,6 @@ def __init__( max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, - min_weight_fraction_leaf=min_weight_fraction_leaf, l2_regularization=l2_regularization, max_bins=max_bins, monotonic_cst=monotonic_cst, @@ -1619,10 +1603,6 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting): The minimum number of samples per leaf. For small datasets with less than a few hundred samples, it is recommended to lower this value since only very shallow trees would be built. - min_weight_fraction_leaf : float, default=0.0 - The minimum weighted fraction of the sum total of weights (of all - the input samples) required to be at a leaf node. Samples have - equal weight when sample_weight is not provided. l2_regularization : float, default=0 The L2 regularization parameter. Use 0 for no regularization. max_bins : int, default=255 @@ -1824,7 +1804,6 @@ def __init__( max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, - min_weight_fraction_leaf=0.0, l2_regularization=0.0, max_bins=255, categorical_features=None, @@ -1847,7 +1826,6 @@ def __init__( max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, - min_weight_fraction_leaf=min_weight_fraction_leaf, l2_regularization=l2_regularization, max_bins=max_bins, categorical_features=categorical_features, diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 3537150ba4b9a..e869a1c1e8ef2 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -162,8 +162,6 @@ class TreeGrower: Depth isn't constrained by default. min_samples_leaf : int, default=20 The minimum number of samples per leaf. - min_weight_leaf: float, default=0. - The minimum weight of input samples required for a node to be a leaf. min_gain_to_split : float, default=0. The minimum gain needed to split a node. Splits with lower gain will be ignored. @@ -237,7 +235,6 @@ def __init__( max_leaf_nodes=None, max_depth=None, min_samples_leaf=20, - min_weight_leaf=0.0, min_gain_to_split=0.0, n_bins=256, n_bins_non_missing=None, @@ -330,7 +327,6 @@ def __init__( self.n_features = X_binned.shape[1] self.max_depth = max_depth self.min_samples_leaf = min_samples_leaf - self.min_weight_leaf = min_weight_leaf self.X_binned = X_binned self.min_gain_to_split = min_gain_to_split self.shrinkage = shrinkage @@ -413,10 +409,7 @@ def _intilialize_root(self, gradients, hessians, hessians_are_constant): self.root.partition_start = 0 self.root.partition_stop = n_samples - if ( - self.root.n_samples < self.min_samples_leaf * 2 - or self.root.weighted_n_node_samples < self.min_weight_leaf * 2 - ): + if self.root.n_samples < self.min_samples_leaf * 2: # Do not even bother computing any splitting statistics. 
self._finalize_leaf(self.root) return @@ -557,15 +550,9 @@ def split_next(self): self._finalize_leaf(right_child_node) return left_child_node, right_child_node - if ( - left_child_node.n_samples < self.min_samples_leaf * 2 - or left_child_node.weighted_n_node_samples < self.min_weight_leaf * 2 - ): + if left_child_node.n_samples < self.min_samples_leaf * 2: self._finalize_leaf(left_child_node) - if ( - right_child_node.n_samples < self.min_samples_leaf * 2 - or right_child_node.weighted_n_node_samples < self.min_weight_leaf * 2 - ): + if right_child_node.n_samples < self.min_samples_leaf * 2: self._finalize_leaf(right_child_node) if self.with_monotonic_cst: From ecdcc802943e522a1fd1cb5b4ef4f686652b7aba Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Wed, 22 Mar 2023 20:56:39 -0700 Subject: [PATCH 09/11] Move computation of weighted_n_node_samples to split_indices function --- .../_hist_gradient_boosting/grower.py | 23 ++++++++----------- .../_hist_gradient_boosting/splitting.pyx | 13 +++++++++++ 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index e869a1c1e8ef2..fb5fd5cca88ed 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -39,6 +39,8 @@ class TreeNode: The depth of the node, i.e. its distance from the root. sample_indices : ndarray of shape (n_samples_at_node,), dtype=np.uint The indices of the samples at the node. + weighted_n_node_samples : float + The weighted number of training samples at the node. sum_gradients : float The sum of the gradients of the samples at the node. sum_hessians : float @@ -95,11 +97,11 @@ class TreeNode: partition_start = 0 partition_stop = 0 - def __init__(self, depth, sample_indices, sum_gradients, sum_hessians, value=None): + def __init__(self, depth, sample_indices, weighted_n_node_samples, sum_gradients, sum_hessians, value=None): self.depth = depth self.sample_indices = sample_indices self.n_samples = sample_indices.shape[0] - self.weighted_n_node_samples = sample_indices.shape[0] + self.weighted_n_node_samples = weighted_n_node_samples self.sum_gradients = sum_gradients self.sum_hessians = sum_hessians self.value = value @@ -313,11 +315,11 @@ def __init__( min_samples_leaf, min_gain_to_split, hessians_are_constant, + sample_weight, n_threads, ) self.n_bins_non_missing = n_bins_non_missing self.missing_values_bin_idx = missing_values_bin_idx - self.sample_weight = sample_weight self.max_leaf_nodes = max_leaf_nodes self.has_missing_values = has_missing_values self.monotonic_cst = monotonic_cst @@ -397,15 +399,12 @@ def _intilialize_root(self, gradients, hessians, hessians_are_constant): self.root = TreeNode( depth=depth, sample_indices=self.splitter.partition, + weighted_n_node_samples=np.sum(self.splitter.sample_weight), sum_gradients=sum_gradients, sum_hessians=sum_hessians, value=0, ) - self.root.weighted_n_node_samples = self.sample_weight[ - self.root.sample_indices - ].sum() - self.root.partition_start = 0 self.root.partition_stop = n_samples @@ -476,6 +475,7 @@ def split_next(self): ( sample_indices_left, sample_indices_right, + right_weighted_n_node_samples, right_child_pos, ) = self.splitter.split_indices(node.split_info, node.sample_indices) self.total_apply_split_time += time() - tic @@ -487,6 +487,7 @@ def split_next(self): left_child_node = TreeNode( depth, sample_indices_left, + 
node.weighted_n_node_samples - right_weighted_n_node_samples, node.split_info.sum_gradient_left, node.split_info.sum_hessian_left, value=node.split_info.value_left, @@ -494,6 +495,7 @@ def split_next(self): right_child_node = TreeNode( depth, sample_indices_right, + right_weighted_n_node_samples, node.split_info.sum_gradient_right, node.split_info.sum_hessian_right, value=node.split_info.value_right, @@ -502,13 +504,6 @@ def split_next(self): node.right_child = right_child_node node.left_child = left_child_node - left_child_node.weighted_n_node_samples = self.sample_weight[ - sample_indices_left - ].sum() - right_child_node.weighted_n_node_samples = self.sample_weight[ - sample_indices_right - ].sum() - # set start and stop indices left_child_node.partition_start = node.partition_start left_child_node.partition_stop = node.partition_start + right_child_pos diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index cdeb373350ed4..37e39c77e2581 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -155,6 +155,9 @@ cdef class Splitter: be ignored. hessians_are_constant: bool, default is False Whether hessians are constant. + sample_weight: ndarray of float, shape (n_samples,), default=None + Weights of training data. If not provided, all samples are assumed + to have uniform weight. n_threads : int, default=1 Number of OpenMP threads to use. """ @@ -166,6 +169,7 @@ cdef class Splitter: const unsigned char [::1] has_missing_values const unsigned char [::1] is_categorical const signed char [::1] monotonic_cst + const Y_DTYPE_C [::1] sample_weight unsigned char hessians_are_constant Y_DTYPE_C l2_regularization Y_DTYPE_C min_hessian_to_split @@ -189,6 +193,7 @@ cdef class Splitter: unsigned int min_samples_leaf=20, Y_DTYPE_C min_gain_to_split=0., unsigned char hessians_are_constant=False, + const Y_DTYPE_C [::1] sample_weight=None, unsigned int n_threads=1): self.X_binned = X_binned @@ -203,6 +208,7 @@ cdef class Splitter: self.min_samples_leaf = min_samples_leaf self.min_gain_to_split = min_gain_to_split self.hessians_are_constant = hessians_are_constant + self.sample_weight = sample_weight if sample_weight is not None else np.ones(X_binned.shape[0], dtype=np.float64) self.n_threads = n_threads # The partition array maps each sample index into the leaves of the @@ -247,6 +253,8 @@ cdef class Splitter: right_indices : ndarray of int, shape (n_right_samples,) The indices of the samples in the right child. This is a view on self.partition. + right_weighted_n_node_samples : float + The weighted number of training samples in the right child. right_child_position : int The position of the right child in ``sample_indices``. 
""" @@ -302,6 +310,7 @@ cdef class Splitter: self.X_binned[:, feature_idx] unsigned int [::1] left_indices_buffer = self.left_indices_buffer unsigned int [::1] right_indices_buffer = self.right_indices_buffer + const Y_DTYPE_C [::1] sample_weight = self.sample_weight unsigned char is_categorical = split_info.is_categorical # Cython is unhappy if we set left_cat_bitset to # split_info.left_cat_bitset directly, so we need a tmp var @@ -321,6 +330,7 @@ cdef class Splitter: int i int thread_idx int sample_idx + double right_weighted_n_node_samples int right_child_position unsigned char turn_left int [:] left_offset = np.zeros(n_threads, dtype=np.int32) @@ -339,6 +349,7 @@ cdef class Splitter: offset_in_buffers[thread_idx - 1] + sizes[thread_idx - 1] # map indices from sample_indices to left/right_indices_buffer + right_weighted_n_node_samples = 0 for thread_idx in prange(n_threads, schedule='static', chunksize=1, num_threads=n_threads): left_count = 0 @@ -360,6 +371,7 @@ cdef class Splitter: else: right_indices_buffer[start + right_count] = sample_idx right_count = right_count + 1 + right_weighted_n_node_samples += sample_weight[sample_idx] left_counts[thread_idx] = left_count right_counts[thread_idx] = right_count @@ -410,6 +422,7 @@ cdef class Splitter: return (sample_indices[:right_child_position], sample_indices[right_child_position:], + right_weighted_n_node_samples, right_child_position) def find_node_split( From 228f4b20d754ab30c3d50e00057d12b71bdf9916 Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Wed, 22 Mar 2023 21:26:20 -0700 Subject: [PATCH 10/11] Update test cases --- .../tests/test_gradient_boosting.py | 6 +++--- .../_hist_gradient_boosting/tests/test_splitting.py | 11 +++++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 5b0b59f78f6db..e1811fe450695 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -789,9 +789,9 @@ def test_sample_weight_leaf_weighted_nodes_classification_random( feat_idx = int(nodes[0][2]) num_tresh = nodes[0][3] - assert nodes[0][1] == np.sum(sample_weight) - assert nodes[1][1] == np.sum(sample_weight[X[:, feat_idx] < num_tresh]) - assert nodes[2][1] == np.sum(sample_weight[X[:, feat_idx] >= num_tresh]) + assert_allclose(nodes[0][1], sample_weight.sum()) + assert_allclose(nodes[1][1], sample_weight[X[:, feat_idx] < num_tresh].sum()) + assert_allclose(nodes[2][1], sample_weight[X[:, feat_idx] >= num_tresh].sum()) @pytest.mark.parametrize( diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index d1da34015a2a4..3a3e64a339425 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -158,7 +158,7 @@ def test_gradient_and_hessian_sanity(constant_hessian): si_parent = splitter.find_node_split( n_samples, hists_parent, sum_gradients, sum_hessians, value_parent ) - sample_indices_left, sample_indices_right, _ = splitter.split_indices( + sample_indices_left, sample_indices_right, _, _ = splitter.split_indices( si_parent, sample_indices ) @@ -308,12 +308,15 @@ def test_split_indices(): assert si_root.feature_idx == 1 assert si_root.bin_idx == 3 - 
samples_left, samples_right, position_right = splitter.split_indices( + samples_left, samples_right, right_weight, position_right = splitter.split_indices( si_root, splitter.partition ) assert set(samples_left) == set([0, 1, 3, 4, 5, 6, 8]) assert set(samples_right) == set([2, 7, 9]) + assert len(list(samples_left)) == n_samples - right_weight + assert len(list(samples_right)) == right_weight + assert list(samples_left) == list(splitter.partition[:position_right]) assert list(samples_right) == list(splitter.partition[position_right:]) @@ -557,7 +560,7 @@ def test_splitting_missing_values( # Make sure the split is properly computed. # This also make sure missing values are properly assigned to the correct # child in split_indices() - samples_left, samples_right, _ = splitter.split_indices( + samples_left, samples_right, _, _ = splitter.split_indices( split_info, splitter.partition ) @@ -849,7 +852,7 @@ def test_splitting_categorical_sanity( # is set later in the grower. # make sure samples are split correctly - samples_left, samples_right, _ = splitter.split_indices( + samples_left, samples_right, _, _ = splitter.split_indices( split_info, splitter.partition ) From fb234b79f70d8108f0cd6457b0b64cfaae1a4a9d Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Wed, 22 Mar 2023 21:34:20 -0700 Subject: [PATCH 11/11] Fix formatting --- sklearn/ensemble/_hist_gradient_boosting/grower.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index fb5fd5cca88ed..755fa69f88181 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -97,7 +97,15 @@ class TreeNode: partition_start = 0 partition_stop = 0 - def __init__(self, depth, sample_indices, weighted_n_node_samples, sum_gradients, sum_hessians, value=None): + def __init__( + self, + depth, + sample_indices, + weighted_n_node_samples, + sum_gradients, + sum_hessians, + value=None, + ): self.depth = depth self.sample_indices = sample_indices self.n_samples = sample_indices.shape[0]
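
The last two patches move the weighted-count bookkeeping into `split_indices`, which now also returns the weighted number of samples sent to the right child; the left child's weight is then recovered as the parent's weight minus the right weight, so no second pass over `sample_weight` is needed. The following is a minimal NumPy sketch of that invariant — `goes_right` is a hypothetical stand-in for the split decision that the real Cython splitter computes.

    # Invariant relied on by patch 09: summing only the weights that go right
    # during partitioning is enough, because
    #   left_weight = parent_weight - right_weight.
    import numpy as np

    rng = np.random.RandomState(0)
    sample_weight = rng.exponential(size=10)
    goes_right = rng.rand(10) < 0.4                 # hypothetical split decision

    parent_weight = sample_weight.sum()
    right_weight = sample_weight[goes_right].sum()  # accumulated while splitting
    left_weight = parent_weight - right_weight      # no extra pass over the weights

    assert np.isclose(left_weight, sample_weight[~goes_right].sum())
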