Merged
7 changes: 0 additions & 7 deletions doc/whats_new/upcoming_changes/sklearn.ensemble/31414.fix.rst

This file was deleted.

8 changes: 8 additions & 0 deletions doc/whats_new/upcoming_changes/sklearn.ensemble/32825.fix.rst
@@ -0,0 +1,8 @@
- :class:`ensemble.BaggingClassifier`, :class:`ensemble.BaggingRegressor` and
:class:`ensemble.IsolationForest` now use `sample_weight` to draw the samples
instead of forwarding them multiplied by a uniformly sampled mask to the
underlying estimators. Furthermore, when `max_samples` is a float, it is now
interpreted as a fraction of `sample_weight.sum()` instead of `X.shape[0]`.
The new default `max_samples=None` draws `X.shape[0]` samples, irrespective
of `sample_weight`.
By :user:`Antoine Baker <antoinebaker>`. :pr:`31414` and
96 changes: 73 additions & 23 deletions sklearn/ensemble/_bagging.py
@@ -46,6 +46,63 @@
MAX_INT = np.iinfo(np.int32).max


def _get_n_samples_bootstrap(n_samples, max_samples, sample_weight):
    """
    Get the number of samples in a bootstrap sample.

    Parameters
    ----------
    n_samples : int
        Number of samples in the dataset.

    max_samples : None, int or float
        The maximum number of samples to draw.

        - If None, then draw `n_samples` samples.
        - If int, then draw `max_samples` samples.
        - If float, then draw `max_samples * n_samples` unweighted samples or
          `max_samples * sample_weight.sum()` weighted samples.

    sample_weight : array of shape (n_samples,) or None
        Sample weights with frequency semantics when `max_samples` is explicitly
        set to a float or integer value. With the default `max_samples=None`, the
        equivalence between fitting with integer sample weights and fitting with
        the corresponding repeated data points is no longer guaranteed, because
        the effective bootstrap size may differ between the two.
Member

For reviewers, I updated the notebooks of our sample weight analyzer; in particular I want to highlight the fact that max_samples=None breaks frequency weight semantics:

https://github.com/snath-xoc/sample-weight-audit-nondet/blob/main/reports/sklearn_estimators_sample_weight_audit_investigative_plots.ipynb

The p-value is very small: the null hypothesis that fitting with integer weights is equivalent to fitting with repeated data points is rejected.

Editing the cell to select either max_samples=1.0 or max_samples=100 restores large p-values, even when increasing n_stochastic_fits, meaning that we cannot reject the null hypothesis that frequency weight semantics are respected.

Contributor Author
@antoinebaker antoinebaker Dec 3, 2025

Yes, we could recommend:

  1. max_samples = some_integer as the safest choice for sample_weight semantics (repeated/weighted equivalence for integer weights and safe with rescaled weights); a rough sketch of this check follows below
  2. max_samples = some_float as the second best (repeated/weighted equivalence for integer weights but unsafe with rescaled weights)
  3. default max_samples = None does not respect the repeated/weighted equivalence for integer weights
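
A rough, self-contained sketch of the check behind recommendation 1 (an illustrative aside, not part of the PR; the dataset, base estimator, n_estimators and the prediction-agreement check are arbitrary stand-ins for the statistical audit in the linked notebook):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, random_state=0)
rng = np.random.default_rng(0)
# Integer "frequency" weights: row i should count as sample_weight[i] copies.
sample_weight = rng.integers(1, 4, size=y.shape[0]).astype(float)

# Recommendation 1: an integer max_samples pins the bootstrap size, so fitting
# with integer weights should behave like fitting on explicitly repeated rows
# (up to resampling noise).
clf_weighted = BaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=0),
    n_estimators=50,
    max_samples=200,
    bootstrap=True,
    random_state=0,
).fit(X, y, sample_weight=sample_weight)

# Reference model fit on the materialized repetitions.
X_rep = np.repeat(X, sample_weight.astype(int), axis=0)
y_rep = np.repeat(y, sample_weight.astype(int), axis=0)
clf_repeated = BaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=0),
    n_estimators=50,
    max_samples=200,
    bootstrap=True,
    random_state=1,
).fit(X_rep, y_rep)

# The two fits use different random draws, so they are not identical, but if
# the frequency semantics hold their predictions should mostly agree.
agreement = (clf_weighted.predict(X) == clf_repeated.predict(X)).mean()
print(f"prediction agreement: {agreement:.2f}")

A single agreement score is of course much weaker evidence than the repeated-fit hypothesis test in the audit notebook; it only illustrates what "repeated/weighted equivalence" means operationally.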


    Returns
    -------
    n_samples_bootstrap : int
        The total number of samples to draw for the bootstrap sample.
    """
    if max_samples is None:
        return n_samples
    elif isinstance(max_samples, Integral):
        return max_samples

    if sample_weight is None:
        weighted_n_samples = n_samples
        weighted_n_samples_msg = f"the number of samples is {weighted_n_samples} "
    else:
        weighted_n_samples = sample_weight.sum()
        weighted_n_samples_msg = (
            f"the total sum of sample weights is {weighted_n_samples} "
        )

    # max_samples is a Real fractional value relative to weighted_n_samples.
    n_samples_bootstrap = max(int(max_samples * weighted_n_samples), 1)
    # Warn when the number of bootstrap samples is suspiciously small.
    # This heuristic for "suspiciously small" might be adapted if found
    # unsuitable in practice.
    if n_samples_bootstrap < max(10, n_samples ** (1 / 3)):
        warn(
            f"Using the fractional value {max_samples=} when {weighted_n_samples_msg}"
            f"results in a low number ({n_samples_bootstrap}) of bootstrap samples. "
            "We recommend passing `max_samples` as an integer instead."
        )
    return n_samples_bootstrap
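
As a back-of-the-envelope aside (not part of the diff), the numbers used in the new test further down make the warning heuristic concrete:

import numpy as np
# Illustrative only: mirrors the large-n case in test_get_n_samples_bootstrap.
rng = np.random.default_rng(0)
n_samples = 1_000_000
sample_weight = rng.uniform(size=n_samples)    # sums to roughly 500_000
max_samples = 1e-5                             # interpreted as a fraction of the weight sum
n_samples_bootstrap = max(int(max_samples * sample_weight.sum()), 1)  # about 5
threshold = max(10, n_samples ** (1 / 3))      # 100.0 for one million rows
# 5 < 100, so the UserWarning fires and recommends an integer max_samples.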


def _generate_indices(random_state, bootstrap, n_population, n_samples):
"""Draw randomly sampled indices."""
# Draw sample indices
@@ -273,6 +330,7 @@ class BaseBagging(BaseEnsemble, metaclass=ABCMeta):
"estimator": [HasMethods(["fit", "predict"]), None],
"n_estimators": [Interval(Integral, 1, None, closed="left")],
"max_samples": [
None,
Interval(Integral, 1, None, closed="left"),
Interval(RealNotInt, 0, 1, closed="right"),
],
@@ -295,7 +353,7 @@ def __init__(
estimator=None,
n_estimators=10,
*,
max_samples=1.0,
max_samples=None,
max_features=1.0,
bootstrap=True,
bootstrap_features=False,
@@ -340,7 +398,9 @@ def fit(self, X, y, sample_weight=None, **fit_params):
Sample weights. If None, then samples are equally weighted. Used as
probabilities to sample the training set. Note that the expected
frequency semantics for the `sample_weight` parameter are only
fulfilled when sampling with replacement `bootstrap=True`.
fulfilled when sampling with replacement `bootstrap=True` and using
a float or integer `max_samples` (instead of the default
`max_samples=None`).

**fit_params : dict
Parameters to pass to the underlying estimators.
@@ -462,20 +522,7 @@ def _fit(
if max_samples is None:
max_samples = self.max_samples

if not isinstance(max_samples, numbers.Integral):
if sample_weight is None:
max_samples = max(int(max_samples * X.shape[0]), 1)
else:
sw_sum = np.sum(sample_weight)
if sw_sum <= 1:
raise ValueError(
f"The total sum of sample weights is {sw_sum}, which prevents "
"resampling with a fractional value for max_samples="
f"{max_samples}. Either pass max_samples as an integer or "
"use a larger sample_weight."
)
max_samples = max(int(max_samples * sw_sum), 1)

max_samples = _get_n_samples_bootstrap(X.shape[0], max_samples, sample_weight)
if not self.bootstrap and max_samples > X.shape[0]:
raise ValueError(
f"Effective max_samples={max_samples} must be <= n_samples="
@@ -728,13 +775,14 @@ class BaggingClassifier(ClassifierMixin, BaseBagging):
n_estimators : int, default=10
The number of base estimators in the ensemble.

max_samples : int or float, default=1.0
max_samples : int or float, default=None
The number of samples to draw from X to train each base estimator (with
replacement by default, see `bootstrap` for more details).

- If None, then draw `X.shape[0]` samples irrespective of `sample_weight`.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` unweighted samples
or `max_samples * sample_weight.sum()` weighted samples.
- If float, then draw `max_samples * X.shape[0]` unweighted samples or
`max_samples * sample_weight.sum()` weighted samples.

max_features : int or float, default=1.0
The number of features to draw from X to train each base estimator (
@@ -867,7 +915,7 @@ def __init__(
estimator=None,
n_estimators=10,
*,
max_samples=1.0,
max_samples=None,
max_features=1.0,
bootstrap=True,
bootstrap_features=False,
@@ -1239,12 +1287,14 @@ class BaggingRegressor(RegressorMixin, BaseBagging):
n_estimators : int, default=10
The number of base estimators in the ensemble.

max_samples : int or float, default=1.0
max_samples : int or float, default=None
The number of samples to draw from X to train each base estimator (with
replacement by default, see `bootstrap` for more details).

- If None, then draw `X.shape[0]` samples irrespective of `sample_weight`.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples.
- If float, then draw `max_samples * X.shape[0]` unweighted samples or
`max_samples * sample_weight.sum()` weighted samples.

max_features : int or float, default=1.0
The number of features to draw from X to train each base estimator (
@@ -1368,7 +1418,7 @@ def __init__(
estimator=None,
n_estimators=10,
*,
max_samples=1.0,
max_samples=None,
max_features=1.0,
bootstrap=True,
bootstrap_features=False,
64 changes: 58 additions & 6 deletions sklearn/ensemble/tests/test_bagging.py
@@ -6,6 +6,7 @@
# SPDX-License-Identifier: BSD-3-Clause

import re
import warnings
from itertools import cycle, product

import joblib
@@ -26,6 +27,7 @@
RandomForestClassifier,
RandomForestRegressor,
)
from sklearn.ensemble._bagging import _get_n_samples_bootstrap
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.model_selection import GridSearchCV, ParameterGrid, train_test_split
@@ -706,16 +708,17 @@ def test_warning_bootstrap_sample_weight():
def test_invalid_sample_weight_max_samples_bootstrap_combinations():
X, y = iris.data, iris.target

# Case 1: small weights and fractional max_samples would lead to sampling
# less than 1 sample, which is not allowed.
# Case 1: small weights and fractional max_samples lead to a small
# number of bootstrap samples, which raises a UserWarning.
clf = BaggingClassifier(max_samples=1.0)
sample_weight = np.ones_like(y) / (2 * len(y))
expected_msg = (
r"The total sum of sample weights is 0.5(\d*), which prevents resampling with "
r"a fractional value for max_samples=1\.0\. Either pass max_samples as an "
r"integer or use a larger sample_weight\."
"Using the fractional value max_samples=1.0 when "
r"the total sum of sample weights is 0.5(\d*) "
r"results in a low number \(1\) of bootstrap samples. "
"We recommend passing `max_samples` as an integer."
)
with pytest.raises(ValueError, match=expected_msg):
with pytest.warns(UserWarning, match=expected_msg):
clf.fit(X, y, sample_weight=sample_weight)

# Case 2: large weights and bootstrap=False would lead to sampling without
@@ -813,6 +816,55 @@ def test_draw_indices_using_sample_weight(
assert_allclose(estimator.y_, y[samples])


def test_get_n_samples_bootstrap():
    n_samples, max_samples, sample_weight = 10, None, "not_used"
    assert _get_n_samples_bootstrap(n_samples, max_samples, sample_weight) == n_samples

    n_samples, max_samples, sample_weight = 10, 5, "not_used"
    assert (
        _get_n_samples_bootstrap(n_samples, max_samples, sample_weight) == max_samples
    )

    n_samples, max_samples, sample_weight = 10, 1e-5, None
    assert _get_n_samples_bootstrap(n_samples, max_samples, sample_weight) == 1

    n_samples, max_samples, sample_weight = 10, 0.66, None
    warning_msg = ".+the number of samples.+low number.+max_samples.+as an integer"
    with pytest.warns(UserWarning, match=warning_msg):
        assert _get_n_samples_bootstrap(n_samples, max_samples, sample_weight) == int(
            max_samples * n_samples
        )

    n_samples, max_samples, sample_weight = 10, 1e-5, None
    with pytest.warns(UserWarning, match=warning_msg):
        assert _get_n_samples_bootstrap(n_samples, max_samples, sample_weight) == 1

    warning_msg_with_weights = (
        ".+the total sum of sample weights.+low number.+max_samples.+as an integer"
    )
    rng = np.random.default_rng(0)
    n_samples, max_samples, sample_weight = 1_000_000, 1e-5, rng.uniform(size=1_000_000)
    with pytest.warns(UserWarning, match=warning_msg_with_weights):
        assert _get_n_samples_bootstrap(n_samples, max_samples, sample_weight) == int(
            max_samples * sample_weight.sum()
        )

    sample_weight = np.ones(3)
    with warnings.catch_warnings():
        warnings.simplefilter("error")

        n_samples, max_samples, sample_weight = 100, 30, None
        assert (
            _get_n_samples_bootstrap(n_samples, max_samples, sample_weight)
            == max_samples
        )

        n_samples, max_samples, sample_weight = 100, 0.5, rng.uniform(size=100)
        assert _get_n_samples_bootstrap(n_samples, max_samples, sample_weight) == int(
            max_samples * sample_weight.sum()
        )


def test_oob_score_removed_on_warm_start():
X, y = make_hastie_10_2(n_samples=100, random_state=1)
