Add new default max_samples=None in Bagging estimators #32825
The PR adds a new changelog fragment:

```diff
@@ -0,0 +1,8 @@
+- :class:`ensemble.BaggingClassifier`, :class:`ensemble.BaggingRegressor` and
+  :class:`ensemble.IsolationForest` now use `sample_weight` to draw the samples
+  instead of forwarding them multiplied by a uniformly sampled mask to the
+  underlying estimators. Furthermore, when `max_samples` is a float, it is now
+  interpreted as a fraction of `sample_weight.sum()` instead of `X.shape[0]`.
+  The new default `max_samples=None` draws `X.shape[0]` samples, irrespective
+  of `sample_weight`.
+  By :user:`Antoine Baker <antoinebaker>`. :pr:`31414` and
```
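Schematically, the change to how `sample_weight` enters the resampling looks like the following NumPy sketch. It is illustrative only, not scikit-learn's actual implementation:

```python
import numpy as np

rng = np.random.default_rng(0)
sample_weight = np.array([1.0, 1.0, 4.0, 2.0])
n = len(sample_weight)

# New behavior: weights become draw probabilities for the bootstrap indices,
# and each sub-estimator sees an unweighted resample.
p = sample_weight / sample_weight.sum()
new_indices = rng.choice(n, size=n, replace=True, p=p)

# Old behavior: indices were drawn uniformly, and the weights forwarded to the
# sub-estimator were multiplied by the per-sample bootstrap counts (the "mask").
old_indices = rng.choice(n, size=n, replace=True)
forwarded_weight = np.bincount(old_indices, minlength=n) * sample_weight
```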
The implementation change lands in the bagging module, starting with a new helper that resolves the bootstrap size:

```diff
@@ -46,6 +46,63 @@
 MAX_INT = np.iinfo(np.int32).max
 
 
+def _get_n_samples_bootstrap(n_samples, max_samples, sample_weight):
+    """
+    Get the number of samples in a bootstrap sample.
+
+    Parameters
+    ----------
+    n_samples : int
+        Number of samples in the dataset.
+
+    max_samples : None, int or float
+        The maximum number of samples to draw.
+
+        - If None, then draw `n_samples` samples.
+        - If int, then draw `max_samples` samples.
+        - If float, then draw `max_samples * n_samples` unweighted samples or
+          `max_samples * sample_weight.sum()` weighted samples.
+
+    sample_weight : array of shape (n_samples,) or None
+        Sample weights with frequency semantics when `max_samples` is explicitly
+        set to a float or integer value. When keeping the default
+        `max_samples=None`, the equivalence between fitting with integer
+        weighted data points and fitting with repeated data points is no longer
+        guaranteed, because the effective bootstrap sizes may differ.
```
> **Member:** For reviewers, I updated the notebooks of our sample weight analyzer, in particular to highlight the fact that the p-value is very small: the null hypothesis that fitting with integer weights is equivalent to fitting with repeated data points is rejected. Editing the cell to select either …
>
> **Author (antoinebaker):** Yes, we could recommend: …
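For context, a test of the kind the reviewer describes might look like the sketch below. It is hypothetical (the analyzer notebooks are not part of this diff) and assumes this branch's behavior: fit many seed pairs with integer weights and with repeated rows, collect a scalar fit statistic, and compare the two samples with a Kolmogorov-Smirnov test.

```python
import numpy as np
from scipy.stats import ks_2samp
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier

X, y = make_classification(n_samples=60, random_state=0)
w = np.random.default_rng(0).integers(1, 4, size=60)  # integer weights
Xr, yr = np.repeat(X, w, axis=0), np.repeat(y, w)     # the same data, rows repeated

stat_weighted, stat_repeated = [], []
for seed in range(100):
    clf = BaggingClassifier(n_estimators=5, random_state=seed)  # max_samples=None
    stat_weighted.append(clf.fit(X, y, sample_weight=w).predict_proba(X[:1])[0, 1])
    clf = BaggingClassifier(n_estimators=5, random_state=seed + 10_000)
    stat_repeated.append(clf.fit(Xr, yr).predict_proba(X[:1])[0, 1])

# Under the max_samples=None default the two fits use different bootstrap sizes
# (60 vs. w.sum()), so equivalence is expected to be rejected (small p-value).
print(ks_2samp(stat_weighted, stat_repeated).pvalue)
```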
The hunk continues:

```diff
+
+    Returns
+    -------
+    n_samples_bootstrap : int
+        The total number of samples to draw for the bootstrap sample.
+    """
+    if max_samples is None:
+        return n_samples
+    elif isinstance(max_samples, Integral):
+        return max_samples
+
+    if sample_weight is None:
+        weighted_n_samples = n_samples
+        weighted_n_samples_msg = f"the number of samples is {weighted_n_samples} "
+    else:
+        weighted_n_samples = sample_weight.sum()
+        weighted_n_samples_msg = (
+            f"the total sum of sample weights is {weighted_n_samples} "
+        )
+
+    # max_samples is a Real fraction relative to weighted_n_samples.
+    n_samples_bootstrap = max(int(max_samples * weighted_n_samples), 1)
+    # Warn when the number of bootstrap samples is suspiciously small.
+    # This heuristic for "suspiciously small" might be adapted if found
+    # unsuitable in practice.
+    if n_samples_bootstrap < max(10, n_samples ** (1 / 3)):
+        warn(
+            f"Using the fractional value {max_samples=} when {weighted_n_samples_msg}"
+            f"results in a low number ({n_samples_bootstrap}) of bootstrap samples. "
+            "We recommend passing `max_samples` as an integer instead."
+        )
+    return n_samples_bootstrap
+
+
 def _generate_indices(random_state, bootstrap, n_population, n_samples):
     """Draw randomly sampled indices."""
     # Draw sample indices
```
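For illustration, the three `max_samples` modes resolve as follows. This calls the private helper directly, so the import path is an internal detail that may change:

```python
import numpy as np
from sklearn.ensemble._bagging import _get_n_samples_bootstrap  # private helper

rng = np.random.default_rng(0)
sample_weight = rng.uniform(1.0, 2.0, size=100)  # weights summing to roughly 150

# None: ignore the weights and use the dataset size.
print(_get_n_samples_bootstrap(100, None, sample_weight))  # 100
# int: taken as-is.
print(_get_n_samples_bootstrap(100, 25, sample_weight))    # 25
# float: a fraction of sample_weight.sum(), not of n_samples.
print(_get_n_samples_bootstrap(100, 0.2, sample_weight))   # ~30, i.e. 0.2 * ~150
```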
`None` is added to the accepted parameter constraints:

```diff
@@ -273,6 +330,7 @@ class BaseBagging(BaseEnsemble, metaclass=ABCMeta):
         "estimator": [HasMethods(["fit", "predict"]), None],
         "n_estimators": [Interval(Integral, 1, None, closed="left")],
         "max_samples": [
+            None,
             Interval(Integral, 1, None, closed="left"),
             Interval(RealNotInt, 0, 1, closed="right"),
         ],
```
```diff
@@ -295,7 +353,7 @@ def __init__(
         estimator=None,
         n_estimators=10,
         *,
-        max_samples=1.0,
+        max_samples=None,
         max_features=1.0,
         bootstrap=True,
         bootstrap_features=False,
```
```diff
@@ -340,7 +398,9 @@ def fit(self, X, y, sample_weight=None, **fit_params):
             Sample weights. If None, then samples are equally weighted. Used as
             probabilities to sample the training set. Note that the expected
             frequency semantics for the `sample_weight` parameter are only
-            fulfilled when sampling with replacement `bootstrap=True`.
+            fulfilled when sampling with replacement `bootstrap=True` and using
+            a float or integer `max_samples` (instead of the default
+            `max_samples=None`).
 
         **fit_params : dict
             Parameters to pass to the underlying estimators.
```
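To make the docstring's caveat concrete, the sketch below (illustrative; agreement holds in distribution rather than fit-by-fit) compares fitting with integer weights against fitting on correspondingly repeated rows, with `bootstrap=True` and an explicit `max_samples=1.0`:

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier

X, y = make_classification(n_samples=200, random_state=0)
w = np.random.default_rng(0).integers(1, 4, size=200)  # integer weights
Xr, yr = np.repeat(X, w, axis=0), np.repeat(y, w)

# Explicit float max_samples: each weighted bootstrap draws ~w.sum() samples,
# matching the uniform bootstrap over the repeated rows.
clf_w = BaggingClassifier(n_estimators=300, max_samples=1.0, random_state=0)
clf_w.fit(X, y, sample_weight=w)
clf_r = BaggingClassifier(n_estimators=300, max_samples=1.0, random_state=1)
clf_r.fit(Xr, yr)

# Averaged over many estimators, the two ensembles should agree closely.
print(np.abs(clf_w.predict_proba(X) - clf_r.predict_proba(X)).mean())
```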
The inline size computation in `_fit` is replaced by a call to the new helper:

```diff
@@ -462,20 +522,7 @@ def _fit(
         if max_samples is None:
             max_samples = self.max_samples
 
-        if not isinstance(max_samples, numbers.Integral):
-            if sample_weight is None:
-                max_samples = max(int(max_samples * X.shape[0]), 1)
-            else:
-                sw_sum = np.sum(sample_weight)
-                if sw_sum <= 1:
-                    raise ValueError(
-                        f"The total sum of sample weights is {sw_sum}, which prevents "
-                        "resampling with a fractional value for max_samples="
-                        f"{max_samples}. Either pass max_samples as an integer or "
-                        "use a larger sample_weight."
-                    )
-                max_samples = max(int(max_samples * sw_sum), 1)
-
+        max_samples = _get_n_samples_bootstrap(X.shape[0], max_samples, sample_weight)
         if not self.bootstrap and max_samples > X.shape[0]:
             raise ValueError(
                 f"Effective max_samples={max_samples} must be <= n_samples="
```
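One visible consequence of this refactor: a tiny weight sum combined with a fractional `max_samples` used to raise a `ValueError`, whereas with this diff it should merely warn about a suspiciously small bootstrap. A sketch, assuming nothing else in the fit path objects to bootstraps of size one:

```python
import warnings
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier

X, y = make_classification(n_samples=100, random_state=0)
w = np.full(100, 0.01)  # sample weights summing to 1.0

# Previously: ValueError because the weight sum is <= 1. With this diff, the
# helper clips the bootstrap size to max(int(0.5 * 1.0), 1) = 1 and warns.
clf = BaggingClassifier(max_samples=0.5, random_state=0)
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    clf.fit(X, y, sample_weight=w)
print(caught[0].message if caught else "no warning")
```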
The `BaggingClassifier` docstring is updated to match:

```diff
@@ -728,13 +775,14 @@ class BaggingClassifier(ClassifierMixin, BaseBagging):
     n_estimators : int, default=10
         The number of base estimators in the ensemble.
 
-    max_samples : int or float, default=1.0
+    max_samples : int or float, default=None
         The number of samples to draw from X to train each base estimator (with
         replacement by default, see `bootstrap` for more details).
 
+        - If None, then draw `X.shape[0]` samples irrespective of `sample_weight`.
         - If int, then draw `max_samples` samples.
-        - If float, then draw `max_samples * X.shape[0]` unweighted samples
-          or `max_samples * sample_weight.sum()` weighted samples.
+        - If float, then draw `max_samples * X.shape[0]` unweighted samples or
+          `max_samples * sample_weight.sum()` weighted samples.
 
     max_features : int or float, default=1.0
         The number of features to draw from X to train each base estimator (
```
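Spelled out as constructor calls (illustrative only):

```python
from sklearn.ensemble import BaggingClassifier

BaggingClassifier()                 # None: each bootstrap has X.shape[0] samples
BaggingClassifier(max_samples=256)  # int: exactly 256 samples per base estimator
BaggingClassifier(max_samples=0.5)  # float: half of X.shape[0], or half of
                                    # sample_weight.sum() when weights are passed
```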
```diff
@@ -867,7 +915,7 @@ def __init__(
         estimator=None,
         n_estimators=10,
         *,
-        max_samples=1.0,
+        max_samples=None,
         max_features=1.0,
         bootstrap=True,
         bootstrap_features=False,
```
And the same for `BaggingRegressor`:

```diff
@@ -1239,12 +1287,14 @@ class BaggingRegressor(RegressorMixin, BaseBagging):
     n_estimators : int, default=10
         The number of base estimators in the ensemble.
 
-    max_samples : int or float, default=1.0
+    max_samples : int or float, default=None
         The number of samples to draw from X to train each base estimator (with
         replacement by default, see `bootstrap` for more details).
 
+        - If None, then draw `X.shape[0]` samples irrespective of `sample_weight`.
         - If int, then draw `max_samples` samples.
-        - If float, then draw `max_samples * X.shape[0]` samples.
+        - If float, then draw `max_samples * X.shape[0]` unweighted samples or
+          `max_samples * sample_weight.sum()` weighted samples.
 
     max_features : int or float, default=1.0
         The number of features to draw from X to train each base estimator (
```
```diff
@@ -1368,7 +1418,7 @@ def __init__(
         estimator=None,
         n_estimators=10,
         *,
-        max_samples=1.0,
+        max_samples=None,
         max_features=1.0,
         bootstrap=True,
         bootstrap_features=False,
```
|
|