From 8fe6ae2dda7e471137ae772b6392ec2971254f4a Mon Sep 17 00:00:00 2001
From: Lucy Liu <jliu176@gmail.com>
Date: Mon, 14 Jul 2025 14:58:55 +1000
Subject: [PATCH 1/5] try reverse cum sum

---
 sklearn/utils/stats.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/sklearn/utils/stats.py b/sklearn/utils/stats.py
index 66179e5ea3aba..f2b90249e6c09 100644
--- a/sklearn/utils/stats.py
+++ b/sklearn/utils/stats.py
@@ -7,8 +7,13 @@
 )
 
 
-def _weighted_percentile(array, sample_weight, percentile_rank=50, xp=None):
-    """Compute the weighted percentile with method 'inverted_cdf'.
+def _weighted_percentile(
+    array, sample_weight, percentile_rank=50, average=False, xp=None
+):
+    """Compute the weighted percentile.
+
+    Uses 'inverted_cdf' method when `average=False` (default) and
+    'averaged_inverted_cdf' when `average=True`.
 
     When the percentile lies between two data points of `array`, the function returns
     the lower value.
@@ -38,6 +43,13 @@ def _weighted_percentile(array, sample_weight, percentile_rank=50, xp=None):
         The probability level of the percentile to compute, in percent. Must be between
         0 and 100.
 
+    average : bool, default=False
+        If `True`, uses the "averaged_inverted_cdf" quantile method, otherwise
+        defaults to "inverted_cdf". "averaged_inverted_cdf" is symmetrical such that
+        the total of `sample_weights` below or equal to
+        `_weighted_percentile(percentile_rank)` is the same as the total of
+        `sample_weights` above or equal to `_weighted_percentile(100-percentile_rank).
+
     xp : array_namespace, default=None
         The standard-compatible namespace for `array`. Default: infer.
 
@@ -110,6 +122,12 @@ def _weighted_percentile(array, sample_weight, percentile_rank=50, xp=None):
 
     result = array[percentile_in_sorted, col_indices]
 
+    if average:
+        rev_idx_in_sorted = sorted_idx[1 + percentile_indices, col_indices]
+
+        rev_result = array[rev_idx_in_sorted, col_indices]
+        result = (result + rev_result) / 2
+
     return result[0] if n_dim == 1 else result
 
 

From b9c0c7b0c3a6dc2f048c342acb7c4978390b36fe Mon Sep 17 00:00:00 2001
From: Lucy Liu <jliu176@gmail.com>
Date: Tue, 15 Jul 2025 21:23:23 +1000
Subject: [PATCH 2/5] initial implementation, wip tests

---
 sklearn/utils/stats.py            | 34 +++++++++++++-----
 sklearn/utils/tests/test_stats.py | 58 +++++++++++++++++++++++--------
 2 files changed, 68 insertions(+), 24 deletions(-)

diff --git a/sklearn/utils/stats.py b/sklearn/utils/stats.py
index f2b90249e6c09..87bc50c1ff9aa 100644
--- a/sklearn/utils/stats.py
+++ b/sklearn/utils/stats.py
@@ -45,10 +45,11 @@ def _weighted_percentile(
 
     average : bool, default=False
         If `True`, uses the "averaged_inverted_cdf" quantile method, otherwise
-        defaults to "inverted_cdf". "averaged_inverted_cdf" is symmetrical such that
-        the total of `sample_weights` below or equal to
+        defaults to "inverted_cdf". "averaged_inverted_cdf" is symmetrical with
+        unit `sample_weight`, such that the total of `sample_weight` below or equal to
         `_weighted_percentile(percentile_rank)` is the same as the total of
-        `sample_weights` above or equal to `_weighted_percentile(100-percentile_rank).
+        `sample_weight` above or equal to `_weighted_percentile(100-percentile_rank)`.
+        This symmetry is not guaranteed with non-unit weights.
 
     xp : array_namespace, default=None
         The standard-compatible namespace for `array`. Default: infer.
@@ -120,13 +121,28 @@ def _weighted_percentile(
     col_indices = xp.arange(array.shape[1], device=device)
     percentile_in_sorted = sorted_idx[percentile_indices, col_indices]
 
-    result = array[percentile_in_sorted, col_indices]
-
     if average:
-        rev_idx_in_sorted = sorted_idx[1 + percentile_indices, col_indices]
-
-        rev_result = array[rev_idx_in_sorted, col_indices]
-        result = (result + rev_result) / 2
+        # From Hyndman and Fan (1996), `fraction_above` is `g`
+        fraction_above = (
+            weight_cdf[col_indices, percentile_indices] - adjusted_percentile_rank
+        )
+        # Alternatively, could use
+        # `is_exact_percentile = fraction_above <= xp.finfo(floating_dtype).eps`
+        # but that seems harder to read
+        is_fraction_above = fraction_above > xp.finfo(floating_dtype).eps
+        percentile_plus_one_in_sorted = sorted_idx[percentile_indices + 1, col_indices]
+
+        result = xp.where(
+            is_fraction_above,
+            array[percentile_in_sorted, col_indices],
+            (
+                array[percentile_in_sorted, col_indices]
+                + array[percentile_plus_one_in_sorted, col_indices]
+            )
+            / 2,
+        )
+    else:
+        result = array[percentile_in_sorted, col_indices]
 
     return result[0] if n_dim == 1 else result
 
diff --git a/sklearn/utils/tests/test_stats.py b/sklearn/utils/tests/test_stats.py
index 1c979425f12f8..98a6928fc942c 100644
--- a/sklearn/utils/tests/test_stats.py
+++ b/sklearn/utils/tests/test_stats.py
@@ -15,32 +15,58 @@
 from sklearn.utils.stats import _averaged_weighted_percentile, _weighted_percentile
 
 
+def test_aa():
+    y1 = np.arange(6)
+    sw = np.ones_like(y1)
+    print("fwd")
+    _weighted_percentile(y1, sw, 30)
+    print("reverse")
+    _weighted_percentile(-y1, sw, 70)
+
+
+# param odd and even
+# param average and no average = note that fail for even not averaged
 def test_averaged_weighted_median():
-    y = np.array([0, 1, 2, 3, 4, 5])
-    sw = np.array([1, 1, 1, 1, 1, 1])
+    y = np.array([0, 1, 2, 3, 4, 5, 6])
+    sw = np.array([1, 1, 1, 1, 1, 1, 1])
 
-    score = _averaged_weighted_percentile(y, sw, 50)
+    score = _weighted_percentile(y, sw, 50, average=True)
 
     assert score == np.median(y)
 
 
-def test_averaged_weighted_percentile(global_random_seed):
+# test for 2D
+@pytest.mark.parametrize("percentile_rank", [20, 51, 75])
+@pytest.mark.parametrize("size", [10, 15, 21, 40])
+def test_averaged_weighted_percentile(global_random_seed, size, percentile_rank):
+    """Check `_weighted_percentile` with unit weights is correct.
+
+    Results should be the same as np.percentile/quantile's 'averaged_inverted_cdf'.
+    """
     rng = np.random.RandomState(global_random_seed)
-    y = rng.randint(20, size=10)
+    y = rng.randint(20, size=size)
 
-    sw = np.ones(10)
+    sw = np.ones_like(y)
+    # y = np.arange(10)
+    # score = _averaged_weighted_percentile(y, sw, 20)
+    score = _weighted_percentile(y, sw, percentile_rank)
+    print(f"fwd score {score}")
+    score = _weighted_percentile(-y, sw, percentile_rank)
+    print(f"back score {score}")
 
-    score = _averaged_weighted_percentile(y, sw, 20)
+    print("new method:")
+    score = _weighted_percentile(y, sw, percentile_rank, average=True)
 
-    assert score == np.percentile(y, 20, method="averaged_inverted_cdf")
+    assert score == np.percentile(y, percentile_rank, method="averaged_inverted_cdf")
 
 
 def test_averaged_and_weighted_percentile():
+    """Check `_weighted_percentile`"""
     y = np.array([0, 1, 2])
     sw = np.array([5, 1, 5])
     q = 50
 
-    score_averaged = _averaged_weighted_percentile(y, sw, q)
+    score_averaged = _weighted_percentile(y, sw, q, average=True)
     score = _weighted_percentile(y, sw, q)
 
     assert score_averaged == score
@@ -61,23 +87,22 @@ def test_weighted_percentile():
 
 def test_weighted_percentile_equal():
     """Check `weighted_percentile` with all weights equal to 1."""
-    y = np.empty(102, dtype=np.float64)
-    y.fill(0.0)
+    y = np.zeros(102, dtype=np.float64)
     sw = np.ones(102, dtype=np.float64)
     score = _weighted_percentile(y, sw, 50)
     assert approx(score) == 0
 
 
+# this should probably warn/error
 def test_weighted_percentile_zero_weight():
     """Check `weighted_percentile` with all weights equal to 0."""
-    y = np.empty(102, dtype=np.float64)
-    y.fill(1.0)
-    sw = np.ones(102, dtype=np.float64)
-    sw.fill(0.0)
+    y = np.ones(102, dtype=np.float64)
+    sw = np.zeros(102, dtype=np.float64)
     value = _weighted_percentile(y, sw, 50)
     assert approx(value) == 1.0
 
 
+# param average
 def test_weighted_percentile_zero_weight_zero_percentile():
     """Check `weighted_percentile(percentile_rank=0)` behaves correctly.
 
@@ -96,6 +121,7 @@ def test_weighted_percentile_zero_weight_zero_percentile():
     assert approx(value) == 4
 
 
+# delete?
 def test_weighted_median_equal_weights(global_random_seed):
     """Checks `_weighted_percentile(percentile_rank=50)` is the same as `np.median`.
 
@@ -126,6 +152,7 @@ def test_weighted_median_integer_weights(global_random_seed):
     assert median == approx(w_median)
 
 
+# param
 def test_weighted_percentile_2d(global_random_seed):
     # Check for when array 2D and sample_weight 1D
     rng = np.random.RandomState(global_random_seed)
@@ -234,6 +261,7 @@ def test_weighted_percentile_array_api_consistency(
         assert result_xp_np.dtype == np.float64
 
 
+# paam
 @pytest.mark.parametrize("sample_weight_ndim", [1, 2])
 def test_weighted_percentile_nan_filtered(sample_weight_ndim, global_random_seed):
     """Test that calling _weighted_percentile on an array with nan values returns

From b56fab0c4636fd4a21e28697a18dbe045fbee8b6 Mon Sep 17 00:00:00 2001
From: Lucy Liu <jliu176@gmail.com>
Date: Thu, 17 Jul 2025 21:13:28 +1000
Subject: [PATCH 3/5] fix and add tests, update use

---
 sklearn/metrics/_regression.py           |   6 +-
 sklearn/preprocessing/_discretization.py |  19 +-
 sklearn/utils/stats.py                   |  32 ++--
 sklearn/utils/tests/test_stats.py        | 219 ++++++++++++-----------
 4 files changed, 148 insertions(+), 128 deletions(-)

diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py
index 3e0148345ffa1..618a64e7d2848 100644
--- a/sklearn/metrics/_regression.py
+++ b/sklearn/metrics/_regression.py
@@ -28,7 +28,7 @@
     _xlogy as xlogy,
 )
 from ..utils._param_validation import Interval, StrOptions, validate_params
-from ..utils.stats import _averaged_weighted_percentile, _weighted_percentile
+from ..utils.stats import _weighted_percentile
 from ..utils.validation import (
     _check_sample_weight,
     _num_samples,
@@ -923,8 +923,8 @@ def median_absolute_error(
     if sample_weight is None:
         output_errors = _median(xp.abs(y_pred - y_true), axis=0)
     else:
-        output_errors = _averaged_weighted_percentile(
-            xp.abs(y_pred - y_true), sample_weight=sample_weight
+        output_errors = _weighted_percentile(
+            xp.abs(y_pred - y_true), sample_weight=sample_weight, average=True
         )
     if isinstance(multioutput, str):
         if multioutput == "raw_values":
diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py
index ef5081080bda1..2513ceee7938e 100644
--- a/sklearn/preprocessing/_discretization.py
+++ b/sklearn/preprocessing/_discretization.py
@@ -10,7 +10,7 @@
 from ..base import BaseEstimator, TransformerMixin, _fit_context
 from ..utils import resample
 from ..utils._param_validation import Interval, Options, StrOptions
-from ..utils.stats import _averaged_weighted_percentile, _weighted_percentile
+from ..utils.stats import _weighted_percentile
 from ..utils.validation import (
     _check_feature_names_in,
     _check_sample_weight,
@@ -357,17 +357,20 @@ def fit(self, X, y=None, sample_weight=None):
                         dtype=np.float64,
                     )
                 else:
-                    # TODO: make _weighted_percentile and
-                    # _averaged_weighted_percentile accept an array of
+                    # TODO: make _weighted_percentile accept an array of
                     # quantiles instead of calling it multiple times and
                     # sorting the column multiple times as a result.
-                    percentile_func = {
-                        "inverted_cdf": _weighted_percentile,
-                        "averaged_inverted_cdf": _averaged_weighted_percentile,
-                    }[quantile_method]
+                    average = (
+                        True if quantile_method == "averaged_inverted_cdf" else False
+                    )
                     bin_edges[jj] = np.asarray(
                         [
-                            percentile_func(column, sample_weight, percentile_rank=p)
+                            _weighted_percentile(
+                                column,
+                                sample_weight,
+                                percentile_rank=p,
+                                average=average,
+                            )
                             for p in percentile_levels
                         ],
                         dtype=np.float64,
diff --git a/sklearn/utils/stats.py b/sklearn/utils/stats.py
index 87bc50c1ff9aa..96c5e0c0282b5 100644
--- a/sklearn/utils/stats.py
+++ b/sklearn/utils/stats.py
@@ -114,7 +114,8 @@ def _weighted_percentile(
             for feature_idx in range(weight_cdf.shape[0])
         ],
     )
-    # In rare cases, `percentile_indices` equals to `sorted_idx.shape[0]`
+    # `percentile_indices` may be equal to `sorted_idx.shape[0]` due to floating
+    # point error (see #11813)
     max_idx = sorted_idx.shape[0] - 1
     percentile_indices = xp.clip(percentile_indices, 0, max_idx)
 
@@ -131,6 +132,26 @@ def _weighted_percentile(
         # but that seems harder to read
         is_fraction_above = fraction_above > xp.finfo(floating_dtype).eps
         percentile_plus_one_in_sorted = sorted_idx[percentile_indices + 1, col_indices]
+        # Handle case when when next index ('plus one') has sample weight of 0
+        if xp.any(sample_weight[percentile_plus_one_in_sorted, col_indices] == 0):
+            for idx in col_indices:
+                if (
+                    sample_weight[percentile_plus_one_in_sorted[idx], col_indices[idx]]
+                    == 0
+                ):
+                    cdf_val = weight_cdf[col_indices[idx], percentile_indices[idx]]
+                    # Search for next index where `weighted_cdf` is greater
+                    next_cdf_index = xp.searchsorted(
+                        weight_cdf[idx, ...], cdf_val, side="right"
+                    )
+                    # To account for trailing 0 sample weights, use original
+                    # `percentile_indices` again
+                    if next_cdf_index >= max_idx:
+                        next_cdf_index = percentile_indices[idx]
+
+                    percentile_plus_one_in_sorted[idx] = sorted_idx[
+                        next_cdf_index, col_indices
+                    ]
 
         result = xp.where(
             is_fraction_above,
@@ -145,12 +166,3 @@ def _weighted_percentile(
         result = array[percentile_in_sorted, col_indices]
 
     return result[0] if n_dim == 1 else result
-
-
-# TODO: refactor to do the symmetrisation inside _weighted_percentile to avoid
-# sorting the input array twice.
-def _averaged_weighted_percentile(array, sample_weight, percentile_rank=50, xp=None):
-    return (
-        _weighted_percentile(array, sample_weight, percentile_rank, xp=xp)
-        - _weighted_percentile(-array, sample_weight, 100 - percentile_rank, xp=xp)
-    ) / 2
diff --git a/sklearn/utils/tests/test_stats.py b/sklearn/utils/tests/test_stats.py
index 98a6928fc942c..78a4a9ad88edc 100644
--- a/sklearn/utils/tests/test_stats.py
+++ b/sklearn/utils/tests/test_stats.py
@@ -12,148 +12,138 @@
 from sklearn.utils._array_api import device as array_device
 from sklearn.utils.estimator_checks import _array_api_for_tests
 from sklearn.utils.fixes import np_version, parse_version
-from sklearn.utils.stats import _averaged_weighted_percentile, _weighted_percentile
+from sklearn.utils.stats import _weighted_percentile
 
 
-def test_aa():
-    y1 = np.arange(6)
-    sw = np.ones_like(y1)
-    print("fwd")
-    _weighted_percentile(y1, sw, 30)
-    print("reverse")
-    _weighted_percentile(-y1, sw, 70)
+@pytest.mark.parametrize("average", [True, False])
+@pytest.mark.parametrize("size", [10, 15])
+def test_weighted_percentile_matches_median(size, average):
+    """Ensure `_weighted_percentile` matches `median` when expected.
 
+    With unit `sample_weight`, `_weighted_percentile` should match median except
+    when `average=False` and the number of samples is even.
+    When number of samples is odd, `_weighted_percentile(average=False)` always falls
+    on a single observation (not between 2 values, in which case the lower value would
+    be taken) and is thus equal to `np.median`.
+    For an even number of samples, `median` gives the average between the 2 middle
+    samples, `_weighted_percentile(average=False)` gives the lower (left) sample.
+    """
+    y = np.arange(size)
+    sample_weight = np.ones_like(y)
 
-# param odd and even
-# param average and no average = note that fail for even not averaged
-def test_averaged_weighted_median():
-    y = np.array([0, 1, 2, 3, 4, 5, 6])
-    sw = np.array([1, 1, 1, 1, 1, 1, 1])
-
-    score = _weighted_percentile(y, sw, 50, average=True)
+    score = _weighted_percentile(y, sample_weight, 50, average=average)
 
-    assert score == np.median(y)
+    # `_weighted_percentile(average=False)` does not match `median` when n is even
+    if size == 10 and average is False:
+        assert score != np.median(y)
+    else:
+        assert score == np.median(y)
 
 
-# test for 2D
-@pytest.mark.parametrize("percentile_rank", [20, 51, 75])
-@pytest.mark.parametrize("size", [10, 15, 21, 40])
-def test_averaged_weighted_percentile(global_random_seed, size, percentile_rank):
+# test 2D?
+@pytest.mark.parametrize("average", [True, False])
+@pytest.mark.parametrize("percentile_rank", [20, 35, 61])
+@pytest.mark.parametrize("size", [10, 15])
+def test_weighted_percentile_matches_numpy(
+    global_random_seed, size, percentile_rank, average
+):
     """Check `_weighted_percentile` with unit weights is correct.
 
-    Results should be the same as np.percentile/quantile's 'averaged_inverted_cdf'.
+    `average=True` results should be the same as `np.percentile`'s
+    'averaged_inverted_cdf'.
+    `average=False` results should be the same as `np.percentile`'s
+    'inverted_cdf'.
+    Note `np.percentile` is the same as `np.quantile` except `q` is in range [0, 100].
+
+    We parametrize through different `percentile_rank` and `size` to
+    ensure we get cases where `g=0` and `g>0` (see Hyndman and Fan 1996 for details).
     """
     rng = np.random.RandomState(global_random_seed)
     y = rng.randint(20, size=size)
-
     sw = np.ones_like(y)
-    # y = np.arange(10)
-    # score = _averaged_weighted_percentile(y, sw, 20)
-    score = _weighted_percentile(y, sw, percentile_rank)
-    print(f"fwd score {score}")
-    score = _weighted_percentile(-y, sw, percentile_rank)
-    print(f"back score {score}")
-
-    print("new method:")
-    score = _weighted_percentile(y, sw, percentile_rank, average=True)
-
-    assert score == np.percentile(y, percentile_rank, method="averaged_inverted_cdf")
 
+    score = _weighted_percentile(y, sw, percentile_rank, average=average)
 
-def test_averaged_and_weighted_percentile():
-    """Check `_weighted_percentile`"""
-    y = np.array([0, 1, 2])
-    sw = np.array([5, 1, 5])
-    q = 50
-
-    score_averaged = _weighted_percentile(y, sw, q, average=True)
-    score = _weighted_percentile(y, sw, q)
+    if average:
+        method = "averaged_inverted_cdf"
+    else:
+        method = "inverted_cdf"
 
-    assert score_averaged == score
+    assert score == np.percentile(y, percentile_rank, method=method)
 
 
-def test_weighted_percentile():
-    """Check `weighted_percentile` on artificial data with obvious median."""
+def test_weighted_percentile_zero_sample_weight():
+    """Check `weighted_percentile` on data with obvious median and a 0 weight."""
     y = np.empty(102, dtype=np.float64)
     y[:50] = 0
     y[-51:] = 2
     y[-1] = 100000
     y[50] = 1
-    sw = np.ones(102, dtype=np.float64)
+    sw = np.ones_like(y, dtype=np.float64)
     sw[-1] = 0.0
     value = _weighted_percentile(y, sw, 50)
     assert approx(value) == 1
 
 
 def test_weighted_percentile_equal():
-    """Check `weighted_percentile` with all weights equal to 1."""
+    """Check `weighted_percentile` with unit weights and all 0 values in `array`."""
     y = np.zeros(102, dtype=np.float64)
     sw = np.ones(102, dtype=np.float64)
     score = _weighted_percentile(y, sw, 50)
     assert approx(score) == 0
 
 
-# this should probably warn/error
-def test_weighted_percentile_zero_weight():
-    """Check `weighted_percentile` with all weights equal to 0."""
-    y = np.ones(102, dtype=np.float64)
-    sw = np.zeros(102, dtype=np.float64)
+def test_weighted_percentile_all_zero_weights():
+    """Check `weighted_percentile` with all weights equal to 0 returns last index."""
+    y = np.arange(10)
+    sw = np.zeros(10)
     value = _weighted_percentile(y, sw, 50)
-    assert approx(value) == 1.0
+    assert approx(value) == 9.0
 
 
-# param average
-def test_weighted_percentile_zero_weight_zero_percentile():
+@pytest.mark.parametrize("average", [True, False])
+@pytest.mark.parametrize("percentile_rank, expected_value", [(0, 2), (50, 3), (100, 5)])
+def test_weighted_percentile_ignores_zero_weight(
+    average, percentile_rank, expected_value
+):
     """Check `weighted_percentile(percentile_rank=0)` behaves correctly.
 
-    Ensures that (leading)zero-weight observations ignored when `percentile_rank=0`.
+    Check that leading zero-weight observations are ignored when `percentile_rank=0`.
     See #20528 for details.
+    Check that when `average=True` and the `j+1` ('plus one') index has sample weight
+    of 0, it is ignored. Also check that trailing zero weight observations are ignored
+    (e.g., when `percentile_rank=100`).
     """
-    y = np.array([0, 1, 2, 3, 4, 5])
-    sw = np.array([0, 0, 1, 1, 1, 0])
-    value = _weighted_percentile(y, sw, 0)
-    assert approx(value) == 2
-
-    value = _weighted_percentile(y, sw, 50)
-    assert approx(value) == 3
-
-    value = _weighted_percentile(y, sw, 100)
-    assert approx(value) == 4
-
+    y = np.array([0, 1, 2, 3, 4, 5, 6])
+    sw = np.array([0, 0, 1, 1, 0, 1, 0])
 
-# delete?
-def test_weighted_median_equal_weights(global_random_seed):
-    """Checks `_weighted_percentile(percentile_rank=50)` is the same as `np.median`.
+    value = _weighted_percentile(y, sw, percentile_rank, average=average)
+    assert approx(value) == expected_value
 
-    `sample_weights` are all 1s and the number of samples is odd.
-    When number of samples is odd, `_weighted_percentile` always falls on a single
-    observation (not between 2 values, in which case the lower value would be taken)
-    and is thus equal to `np.median`.
-    For an even number of samples, this check will not always hold as (note that
-    for some other percentile methods it will always hold). See #17370 for details.
-    """
-    rng = np.random.RandomState(global_random_seed)
-    x = rng.randint(10, size=11)
-    weights = np.ones(x.shape)
-    median = np.median(x)
-    w_median = _weighted_percentile(x, weights)
-    assert median == approx(w_median)
 
-
-def test_weighted_median_integer_weights(global_random_seed):
-    # Checks average weighted percentile_rank=0.5 is same as median when manually weight
-    # data
+@pytest.mark.parametrize("average", [True, False])
+@pytest.mark.parametrize("percentile_rank", [20, 35, 61])
+def test_weighted_median_frequency_weights(
+    global_random_seed, percentile_rank, average
+):
+    """Check integer weights give the same result as repeating values."""
     rng = np.random.RandomState(global_random_seed)
     x = rng.randint(20, size=10)
     weights = rng.choice(5, size=10)
-    x_manual = np.repeat(x, weights)
-    median = np.median(x_manual)
-    w_median = _averaged_weighted_percentile(x, weights)
-    assert median == approx(w_median)
 
+    x_repeated = np.repeat(x, weights)
+    percentile_weights = _weighted_percentile(
+        x, weights, percentile_rank, average=average
+    )
+    percentile_repeated = _weighted_percentile(
+        x_repeated, np.ones_like(x_repeated), percentile_rank, average=average
+    )
+    assert percentile_weights == approx(percentile_repeated)
 
-# param
-def test_weighted_percentile_2d(global_random_seed):
+
+@pytest.mark.parametrize("average", [True, False])
+def test_weighted_percentile_2d(global_random_seed, average):
+    """Check `_weighted_percentile` behaviour correct when `array` is 2D."""
     # Check for when array 2D and sample_weight 1D
     rng = np.random.RandomState(global_random_seed)
     x1 = rng.randint(10, size=10)
@@ -162,16 +152,21 @@ def test_weighted_percentile_2d(global_random_seed):
     x2 = rng.randint(20, size=10)
     x_2d = np.vstack((x1, x2)).T
 
-    w_median = _weighted_percentile(x_2d, w1)
-    p_axis_0 = [_weighted_percentile(x_2d[:, i], w1) for i in range(x_2d.shape[1])]
+    w_median = _weighted_percentile(x_2d, w1, average=average)
+    p_axis_0 = [
+        _weighted_percentile(x_2d[:, i], w1, average=average)
+        for i in range(x_2d.shape[1])
+    ]
     assert_allclose(w_median, p_axis_0)
+
     # Check when array and sample_weight both 2D
     w2 = rng.choice(5, size=10)
     w_2d = np.vstack((w1, w2)).T
 
-    w_median = _weighted_percentile(x_2d, w_2d)
+    w_median = _weighted_percentile(x_2d, w_2d, average=average)
     p_axis_0 = [
-        _weighted_percentile(x_2d[:, i], w_2d[:, i]) for i in range(x_2d.shape[1])
+        _weighted_percentile(x_2d[:, i], w_2d[:, i], average=average)
+        for i in range(x_2d.shape[1])
     ]
     assert_allclose(w_median, p_axis_0)
 
@@ -261,13 +256,18 @@ def test_weighted_percentile_array_api_consistency(
         assert result_xp_np.dtype == np.float64
 
 
-# paam
+@pytest.mark.parametrize("average", [True, False])
 @pytest.mark.parametrize("sample_weight_ndim", [1, 2])
-def test_weighted_percentile_nan_filtered(sample_weight_ndim, global_random_seed):
-    """Test that calling _weighted_percentile on an array with nan values returns
-    the same results as calling _weighted_percentile on a filtered version of the data.
+def test_weighted_percentile_nan_filtered(
+    global_random_seed, sample_weight_ndim, average
+):
+    """Test `_weighted_percentile` ignores NaNs.
+
+    Calling `_weighted_percentile` on an array with nan values returns the same
+    results as calling `_weighted_percentile` on a filtered version of the data.
     We test both with sample_weight of the same shape as the data and with
-    one-dimensional sample_weight."""
+    one-dimensional sample_weight.
+    """
 
     rng = np.random.RandomState(global_random_seed)
     array_with_nans = rng.rand(100, 10)
@@ -280,7 +280,7 @@ def test_weighted_percentile_nan_filtered(sample_weight_ndim, global_random_seed
         sample_weight = rng.randint(1, 6, size=(100,))
 
     # Find the weighted percentile on the array with nans:
-    results = _weighted_percentile(array_with_nans, sample_weight, 30)
+    results = _weighted_percentile(array_with_nans, sample_weight, 30, average=average)
 
     # Find the weighted percentile on the filtered array:
     filtered_array = [
@@ -297,7 +297,9 @@ def test_weighted_percentile_nan_filtered(sample_weight_ndim, global_random_seed
 
     expected_results = np.array(
         [
-            _weighted_percentile(filtered_array[col], filtered_weights[col], 30)
+            _weighted_percentile(
+                filtered_array[col], filtered_weights[col], 30, average=average
+            )
             for col in range(array_with_nans.shape[1])
         ]
     )
@@ -335,8 +337,10 @@ def test_weighted_percentile_all_nan_column():
 )
 @pytest.mark.parametrize("percentile", [66, 10, 50])
 def test_weighted_percentile_like_numpy_quantile(percentile, global_random_seed):
-    """Check that _weighted_percentile delivers equivalent results as np.quantile
-    with weights."""
+    """Check `_weighted_percentile` equivalent to `np.quantile` with weights.
+
+    Note currently only "inverted_cdf" method accepts weights.
+    """
 
     rng = np.random.RandomState(global_random_seed)
     array = rng.rand(10, 100)
@@ -358,9 +362,10 @@ def test_weighted_percentile_like_numpy_quantile(percentile, global_random_seed)
 )
 @pytest.mark.parametrize("percentile", [66, 10, 50])
 def test_weighted_percentile_like_numpy_nanquantile(percentile, global_random_seed):
-    """Check that _weighted_percentile delivers equivalent results as np.nanquantile
-    with weights."""
+    """Check `_weighted_percentile` equivalent to `np.nanquantile` with weights.
 
+    Note currently only "inverted_cdf" method accepts weights.
+    """
     rng = np.random.RandomState(global_random_seed)
     array_with_nans = rng.rand(10, 100)
     array_with_nans[rng.rand(*array_with_nans.shape) < 0.5] = np.nan

From f99366c8fd9525aef11e369d908bc338ac49984f Mon Sep 17 00:00:00 2001
From: Lucy Liu <jliu176@gmail.com>
Date: Fri, 18 Jul 2025 23:51:18 +1000
Subject: [PATCH 4/5] fixes and add tests

---
 sklearn/utils/stats.py            | 14 ++++++------
 sklearn/utils/tests/test_stats.py | 36 ++++++++++++++++++++-----------
 2 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/sklearn/utils/stats.py b/sklearn/utils/stats.py
index 96c5e0c0282b5..55e5f3b265816 100644
--- a/sklearn/utils/stats.py
+++ b/sklearn/utils/stats.py
@@ -131,7 +131,10 @@ def _weighted_percentile(
         # `is_exact_percentile = fraction_above <= xp.finfo(floating_dtype).eps`
         # but that seems harder to read
         is_fraction_above = fraction_above > xp.finfo(floating_dtype).eps
-        percentile_plus_one_in_sorted = sorted_idx[percentile_indices + 1, col_indices]
+        percentile_plus_one_indices = xp.clip(percentile_indices + 1, 0, max_idx)
+        percentile_plus_one_in_sorted = sorted_idx[
+            percentile_plus_one_indices, col_indices
+        ]
         # Handle case when when next index ('plus one') has sample weight of 0
         if xp.any(sample_weight[percentile_plus_one_in_sorted, col_indices] == 0):
             for idx in col_indices:
@@ -144,14 +147,13 @@ def _weighted_percentile(
                     next_cdf_index = xp.searchsorted(
                         weight_cdf[idx, ...], cdf_val, side="right"
                     )
-                    # To account for trailing 0 sample weights, use original
-                    # `percentile_indices` again
+                    # This occurs when there are trailing 0 sample weight samples
+                    # and `percentile_rank=100`
                     if next_cdf_index >= max_idx:
+                        # use original `percentile_indices` again
                         next_cdf_index = percentile_indices[idx]
 
-                    percentile_plus_one_in_sorted[idx] = sorted_idx[
-                        next_cdf_index, col_indices
-                    ]
+                    percentile_plus_one_in_sorted[idx] = sorted_idx[next_cdf_index, idx]
 
         result = xp.where(
             is_fraction_above,
diff --git a/sklearn/utils/tests/test_stats.py b/sklearn/utils/tests/test_stats.py
index 78a4a9ad88edc..57df3bdef16b4 100644
--- a/sklearn/utils/tests/test_stats.py
+++ b/sklearn/utils/tests/test_stats.py
@@ -72,17 +72,24 @@ def test_weighted_percentile_matches_numpy(
     assert score == np.percentile(y, percentile_rank, method=method)
 
 
-def test_weighted_percentile_zero_sample_weight():
-    """Check `weighted_percentile` on data with obvious median and a 0 weight."""
-    y = np.empty(102, dtype=np.float64)
-    y[:50] = 0
-    y[-51:] = 2
-    y[-1] = 100000
-    y[50] = 1
-    sw = np.ones_like(y, dtype=np.float64)
-    sw[-1] = 0.0
-    value = _weighted_percentile(y, sw, 50)
-    assert approx(value) == 1
+@pytest.mark.parametrize("percentile_rank", [50, 100])
+def test_weighted_percentile_plus_one_clip_max(percentile_rank):
+    """Check `j+1` index is clipped to max, when `average=True`.
+
+    `percentile_plus_one_indices` can exceed max index when `percentile_indices`
+    is already at max index.
+    Note that when `g` (Hyndman and Fan) / `fraction_above` is greater than 0,
+    `j+1` (Hyndman and Fan) / `percentile_plus_one_indices` is calculated but
+    never used (so it does not matter what this value is).
+    When `g=0` and `percentile_indices` is at max index, we are exactly at 100
+    and take the average of 2x the max index.
+    """
+    # Note for both `percentile_rank`s, `percentile_indices` is already at max index
+    y = np.array([[0, 0], [1, 1]])
+    sw = np.array([[0.1, 0.1], [2, 2]])
+    score = _weighted_percentile(y, sw, percentile_rank)
+    for idx in range(2):
+        assert score[idx] == approx(1.0)
 
 
 def test_weighted_percentile_equal():
@@ -117,8 +124,11 @@ def test_weighted_percentile_ignores_zero_weight(
     y = np.array([0, 1, 2, 3, 4, 5, 6])
     sw = np.array([0, 0, 1, 1, 0, 1, 0])
 
-    value = _weighted_percentile(y, sw, percentile_rank, average=average)
-    assert approx(value) == expected_value
+    value = _weighted_percentile(
+        np.vstack((y, y)).T, np.vstack((sw, sw)).T, percentile_rank, average=average
+    )
+    for idx in range(2):
+        assert approx(value[idx]) == expected_value
 
 
 @pytest.mark.parametrize("average", [True, False])

From ba57727cd66f41ea86add391f9b7eb9ca4117188 Mon Sep 17 00:00:00 2001
From: Lucy Liu <jliu176@gmail.com>
Date: Sat, 19 Jul 2025 00:13:43 +1000
Subject: [PATCH 5/5] simplify zero sample code

---
 sklearn/utils/stats.py            | 34 +++++++++++++++----------------
 sklearn/utils/tests/test_stats.py |  2 +-
 2 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/sklearn/utils/stats.py b/sklearn/utils/stats.py
index 55e5f3b265816..34a76f18a7514 100644
--- a/sklearn/utils/stats.py
+++ b/sklearn/utils/stats.py
@@ -136,24 +136,22 @@ def _weighted_percentile(
             percentile_plus_one_indices, col_indices
         ]
         # Handle case when when next index ('plus one') has sample weight of 0
-        if xp.any(sample_weight[percentile_plus_one_in_sorted, col_indices] == 0):
-            for idx in col_indices:
-                if (
-                    sample_weight[percentile_plus_one_in_sorted[idx], col_indices[idx]]
-                    == 0
-                ):
-                    cdf_val = weight_cdf[col_indices[idx], percentile_indices[idx]]
-                    # Search for next index where `weighted_cdf` is greater
-                    next_cdf_index = xp.searchsorted(
-                        weight_cdf[idx, ...], cdf_val, side="right"
-                    )
-                    # This occurs when there are trailing 0 sample weight samples
-                    # and `percentage_rank=100`
-                    if next_cdf_index >= max_idx:
-                        # use original `percentile_indices` again
-                        next_cdf_index = percentile_indices[idx]
-
-                    percentile_plus_one_in_sorted[idx] = sorted_idx[next_cdf_index, idx]
+        zero_weight_cols = col_indices[
+            sample_weight[percentile_plus_one_in_sorted, col_indices] == 0
+        ]
+        for col_idx in zero_weight_cols:
+            cdf_val = weight_cdf[col_idx, percentile_indices[col_idx]]
+            # Search for next index where `weighted_cdf` is greater
+            next_index = xp.searchsorted(
+                weight_cdf[col_idx, ...], cdf_val, side="right"
+            )
+            # Handle case where there are trailing 0 sample weight samples
+            # and `percentile_indices` is already max index
+            if next_index >= max_idx:
+                # use original `percentile_indices` again
+                next_index = percentile_indices[col_idx]
+
+            percentile_plus_one_in_sorted[col_idx] = sorted_idx[next_index, col_idx]
 
         result = xp.where(
             is_fraction_above,
diff --git a/sklearn/utils/tests/test_stats.py b/sklearn/utils/tests/test_stats.py
index 57df3bdef16b4..ce16bc7eef27e 100644
--- a/sklearn/utils/tests/test_stats.py
+++ b/sklearn/utils/tests/test_stats.py
@@ -102,7 +102,7 @@ def test_weighted_percentile_equal():
 
 def test_weighted_percentile_all_zero_weights():
     """Check `weighted_percentile` with all weights equal to 0 returns last index."""
-    y = np.range(10)
+    y = np.arange(10)
     sw = np.zeros(10)
     value = _weighted_percentile(y, sw, 50)
     assert approx(value) == 9.0