diff --git a/doc/whats_new/upcoming_changes/sklearn.ensemble/31414.fix.rst b/doc/whats_new/upcoming_changes/sklearn.ensemble/31414.fix.rst
deleted file mode 100644
index 17c2f765d4b7c..0000000000000
--- a/doc/whats_new/upcoming_changes/sklearn.ensemble/31414.fix.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-- :class:`ensemble.BaggingClassifier`, :class:`ensemble.BaggingRegressor`
-  and :class:`ensemble.IsolationForest` now use `sample_weight` to draw
-  the samples instead of forwarding them multiplied by a uniformly sampled
-  mask to the underlying estimators. Furthermore, `max_samples` is now
-  interpreted as a fraction of `sample_weight.sum()` instead of `X.shape[0]`
-  when passed as a float.
-  By :user:`Antoine Baker <antoinebaker>`.
diff --git a/doc/whats_new/upcoming_changes/sklearn.ensemble/32825.fix.rst b/doc/whats_new/upcoming_changes/sklearn.ensemble/32825.fix.rst
new file mode 100644
index 0000000000000..604ec9421a424
--- /dev/null
+++ b/doc/whats_new/upcoming_changes/sklearn.ensemble/32825.fix.rst
@@ -0,0 +1,8 @@
+- :class:`ensemble.BaggingClassifier`, :class:`ensemble.BaggingRegressor` and
+  :class:`ensemble.IsolationForest` now use `sample_weight` to draw the samples
+  instead of forwarding them multiplied by a uniformly sampled mask to the
+  underlying estimators. Furthermore, when `max_samples` is a float, it is now
+  interpreted as a fraction of `sample_weight.sum()` instead of `X.shape[0]`.
+  The new default `max_samples=None` draws `X.shape[0]` samples, irrespective
+  of `sample_weight`.
+  By :user:`Antoine Baker <antoinebaker>`. :pr:`31414` and
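To make the changelog entry concrete, the sketch below contrasts the two `max_samples` modes (hypothetical data; it assumes a scikit-learn build that includes this change):

```python
import numpy as np
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
X = rng.normal(size=(100, 3))
y = X @ np.array([1.0, 2.0, 3.0])
# Each point counts twice under frequency semantics: sample_weight.sum() == 200.
sample_weight = np.full(100, 2.0)

# With a float max_samples, each bootstrap sample now holds
# int(0.5 * sample_weight.sum()) == 100 draws, not
# int(0.5 * X.shape[0]) == 50 as before this change.
reg = BaggingRegressor(Ridge(), max_samples=0.5).fit(
    X, y, sample_weight=sample_weight
)

# The new default max_samples=None always draws X.shape[0] == 100 samples,
# irrespective of sample_weight.
reg_default = BaggingRegressor(Ridge()).fit(X, y, sample_weight=sample_weight)
```

The implementation of this resolution logic is the `_get_n_samples_bootstrap` helper added below.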
+ """ + if max_samples is None: + return n_samples + elif isinstance(max_samples, Integral): + return max_samples + + if sample_weight is None: + weighted_n_samples = n_samples + weighted_n_samples_msg = f"the number of samples is {weighted_n_samples} " + else: + weighted_n_samples = sample_weight.sum() + weighted_n_samples_msg = ( + f"the total sum of sample weights is {weighted_n_samples} " + ) + + # max_samples Real fractional value relative to weighted_n_samples + n_samples_bootstrap = max(int(max_samples * weighted_n_samples), 1) + # Warn when number of bootstrap samples is suspiciously small + # This heuristic for "suspiciously small" might be adapted if found + # unsuitable in practice + if n_samples_bootstrap < max(10, n_samples ** (1 / 3)): + warn( + f"Using the fractional value {max_samples=} when {weighted_n_samples_msg}" + f"results in a low number ({n_samples_bootstrap}) of bootstrap samples. " + "We recommend passing `max_samples` as an integer instead." + ) + return n_samples_bootstrap + + def _generate_indices(random_state, bootstrap, n_population, n_samples): """Draw randomly sampled indices.""" # Draw sample indices @@ -273,6 +330,7 @@ class BaseBagging(BaseEnsemble, metaclass=ABCMeta): "estimator": [HasMethods(["fit", "predict"]), None], "n_estimators": [Interval(Integral, 1, None, closed="left")], "max_samples": [ + None, Interval(Integral, 1, None, closed="left"), Interval(RealNotInt, 0, 1, closed="right"), ], @@ -295,7 +353,7 @@ def __init__( estimator=None, n_estimators=10, *, - max_samples=1.0, + max_samples=None, max_features=1.0, bootstrap=True, bootstrap_features=False, @@ -340,7 +398,9 @@ def fit(self, X, y, sample_weight=None, **fit_params): Sample weights. If None, then samples are equally weighted. Used as probabilities to sample the training set. Note that the expected frequency semantics for the `sample_weight` parameter are only - fulfilled when sampling with replacement `bootstrap=True`. + fulfilled when sampling with replacement `bootstrap=True` and using + a float or integer `max_samples` (instead of the default + `max_samples=None`). **fit_params : dict Parameters to pass to the underlying estimators. @@ -462,20 +522,7 @@ def _fit( if max_samples is None: max_samples = self.max_samples - if not isinstance(max_samples, numbers.Integral): - if sample_weight is None: - max_samples = max(int(max_samples * X.shape[0]), 1) - else: - sw_sum = np.sum(sample_weight) - if sw_sum <= 1: - raise ValueError( - f"The total sum of sample weights is {sw_sum}, which prevents " - "resampling with a fractional value for max_samples=" - f"{max_samples}. Either pass max_samples as an integer or " - "use a larger sample_weight." - ) - max_samples = max(int(max_samples * sw_sum), 1) - + max_samples = _get_n_samples_bootstrap(X.shape[0], max_samples, sample_weight) if not self.bootstrap and max_samples > X.shape[0]: raise ValueError( f"Effective max_samples={max_samples} must be <= n_samples=" @@ -728,13 +775,14 @@ class BaggingClassifier(ClassifierMixin, BaseBagging): n_estimators : int, default=10 The number of base estimators in the ensemble. - max_samples : int or float, default=1.0 + max_samples : int or float, default=None The number of samples to draw from X to train each base estimator (with replacement by default, see `bootstrap` for more details). + - If None, then draw `X.shape[0]` samples irrespective of `sample_weight`. - If int, then draw `max_samples` samples. 
diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py
index 611ea271b3f91..0b73499467da6 100644
--- a/sklearn/ensemble/tests/test_bagging.py
+++ b/sklearn/ensemble/tests/test_bagging.py
@@ -6,6 +6,7 @@
 # SPDX-License-Identifier: BSD-3-Clause
 
 import re
+import warnings
 from itertools import cycle, product
 
 import joblib
@@ -26,6 +27,7 @@
     RandomForestClassifier,
     RandomForestRegressor,
 )
+from sklearn.ensemble._bagging import _get_n_samples_bootstrap
 from sklearn.feature_selection import SelectKBest
 from sklearn.linear_model import LogisticRegression, Perceptron
 from sklearn.model_selection import GridSearchCV, ParameterGrid, train_test_split
@@ -706,16 +708,17 @@ def test_warning_bootstrap_sample_weight():
 def test_invalid_sample_weight_max_samples_bootstrap_combinations():
     X, y = iris.data, iris.target
 
-    # Case 1: small weights and fractional max_samples would lead to sampling
-    # less than 1 sample, which is not allowed.
+    # Case 1: small weights and fractional max_samples lead to a small
+    # number of bootstrap samples, which raises a UserWarning.
     clf = BaggingClassifier(max_samples=1.0)
     sample_weight = np.ones_like(y) / (2 * len(y))
     expected_msg = (
-        r"The total sum of sample weights is 0.5(\d*), which prevents resampling with "
-        r"a fractional value for max_samples=1\.0\. Either pass max_samples as an "
-        r"integer or use a larger sample_weight\."
+        "Using the fractional value max_samples=1.0 when "
+        r"the total sum of sample weights is 0.5(\d*) "
+        r"results in a low number \(1\) of bootstrap samples. "
+        "We recommend passing `max_samples` as an integer."
     )
-    with pytest.raises(ValueError, match=expected_msg):
+    with pytest.warns(UserWarning, match=expected_msg):
         clf.fit(X, y, sample_weight=sample_weight)
 
     # Case 2: large weights and bootstrap=False would lead to sampling without
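The regex in Case 1 matches the UserWarning added in `_get_n_samples_bootstrap`, which fires when the resolved bootstrap size drops below `max(10, n_samples ** (1 / 3))`. A worked check of that heuristic (a standalone sketch of the same expression, not library code):

```python
def warns_for(n_samples, n_samples_bootstrap):
    # Mirrors the "suspiciously small" test in the diff above.
    return n_samples_bootstrap < max(10, n_samples ** (1 / 3))


print(warns_for(150, 1))         # True: iris with sample_weight.sum() == 0.5
print(warns_for(150, 75))        # False: well above the floor of 10
print(warns_for(1_000_000, 50))  # True: cube root pushes the threshold to ~100
```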
@@ -813,6 +816,52 @@ def test_draw_indices_using_sample_weight(
     assert_allclose(estimator.y_, y[samples])
 
 
+def test_get_n_samples_bootstrap():
+    n_samples, max_samples, sample_weight = 10, None, "not_used"
+    assert _get_n_samples_bootstrap(n_samples, max_samples, sample_weight) == n_samples
+
+    n_samples, max_samples, sample_weight = 10, 5, "not_used"
+    assert (
+        _get_n_samples_bootstrap(n_samples, max_samples, sample_weight) == max_samples
+    )
+
+    n_samples, max_samples, sample_weight = 10, 0.66, None
+    warning_msg = ".+the number of samples.+low number.+max_samples.+as an integer"
+    with pytest.warns(UserWarning, match=warning_msg):
+        assert _get_n_samples_bootstrap(n_samples, max_samples, sample_weight) == int(
+            max_samples * n_samples
+        )
+
+    n_samples, max_samples, sample_weight = 10, 1e-5, None
+    with pytest.warns(UserWarning, match=warning_msg):
+        assert _get_n_samples_bootstrap(n_samples, max_samples, sample_weight) == 1
+
+    warning_msg_with_weights = (
+        ".+the total sum of sample weights.+low number.+max_samples.+as an integer"
+    )
+    rng = np.random.default_rng(0)
+    n_samples, max_samples = 1_000_000, 1e-5
+    sample_weight = rng.uniform(size=n_samples)
+    with pytest.warns(UserWarning, match=warning_msg_with_weights):
+        assert _get_n_samples_bootstrap(n_samples, max_samples, sample_weight) == int(
+            max_samples * sample_weight.sum()
+        )
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
+
+        n_samples, max_samples, sample_weight = 100, 30, None
+        assert (
+            _get_n_samples_bootstrap(n_samples, max_samples, sample_weight)
+            == max_samples
+        )
+
+        n_samples, max_samples, sample_weight = 100, 0.5, rng.uniform(size=100)
+        assert _get_n_samples_bootstrap(n_samples, max_samples, sample_weight) == int(
+            max_samples * sample_weight.sum()
+        )
+
+
 def test_oob_score_removed_on_warm_start():
     X, y = make_hastie_10_2(n_samples=100, random_state=1)
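Finally, an end-to-end sketch of the behavior that Case 1 above now asserts (hypothetical snippet; it assumes a build that includes this change):

```python
import warnings

import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import BaggingClassifier

X, y = load_iris(return_X_y=True)
# Tiny weights: sample_weight.sum() == 0.5, so max_samples=1.0 resolves to a
# single bootstrap draw per estimator and should trigger the new UserWarning.
sample_weight = np.ones_like(y, dtype=float) / (2 * len(y))

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    BaggingClassifier(max_samples=1.0).fit(X, y, sample_weight=sample_weight)

print(any("bootstrap samples" in str(w.message) for w in caught))  # True
```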