diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 8a414e5371511..86e595093f814 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -895,13 +895,14 @@ generally recommended to use as many bins as possible, which is the default. The ``l2_regularization`` parameter is a regularizer on the loss function and corresponds to :math:`\lambda` in equation (2) of [XGBoost]_. -The early-stopping behaviour is controlled via the ``scoring``, -``validation_fraction``, ``n_iter_no_change``, and ``tol`` parameters. It is -possible to early-stop using an arbitrary :term:`scorer`, or just the -training or validation loss. By default, early-stopping is performed using -the default :term:`scorer` of the estimator on a validation set but it is -also possible to perform early-stopping based on the loss value, which is -significantly faster. +Note that **early-stopping is enabled by default if the number of samples is +larger than 10,000**. The early-stopping behaviour is controlled via the +``early_stopping``, ``scoring``, ``validation_fraction``, +``n_iter_no_change``, and ``tol`` parameters. It is possible to early-stop +using an arbitrary :term:`scorer`, or just the training or validation loss. +Note that for technical reasons, using a scorer is significantly slower than +using the loss. By default, early-stopping is performed if there are more than +10,000 samples in the training set, using the validation loss. Missing values support ---------------------- diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 34e958bc9957e..26bfa0b599a42 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -110,6 +110,14 @@ Changelog Stumps (trees with one split) are now allowed. :pr: `16182` by :user:`Santhosh B ` +- |Feature| Early stopping in + :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor` is now determined with a + new `early_stopping` parameter instead of `n_iter_no_change`. Default value + is 'auto', which enables early stopping if there are more than 10,000 + samples in the training set. :pr:`14516` by :user:`Johann Faouzi + `. + :mod:`sklearn.feature_extraction` .................................
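The two documentation hunks above describe the new default. As a quick illustration (not part of the patch; a minimal sketch that assumes this diff is applied, with dataset sizes chosen arbitrarily), the 'auto' default enables early stopping only when the training set has more than 10,000 samples::

    from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    from sklearn.ensemble import HistGradientBoostingClassifier
    from sklearn.datasets import make_classification

    # More than 10,000 samples: early_stopping='auto' (the default) enables
    # early stopping, so fitting may stop before max_iter iterations.
    X_big, y_big = make_classification(n_samples=20_000, random_state=0)
    clf_auto = HistGradientBoostingClassifier().fit(X_big, y_big)
    print(clf_auto.n_iter_)  # may be smaller than clf_auto.max_iter

    # 10,000 samples or fewer: early stopping stays off by default and all
    # max_iter iterations are run; it can still be requested explicitly.
    X_small, y_small = make_classification(n_samples=1_000, random_state=0)
    clf_default = HistGradientBoostingClassifier().fit(X_small, y_small)
    clf_forced = HistGradientBoostingClassifier(early_stopping=True).fit(X_small, y_small)
    print(clf_default.n_iter_ == clf_default.max_iter)  # True
    print(clf_forced.n_iter_)  # may stop before max_iter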
diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 756cb204792d5..e63e0285f553f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -28,8 +28,8 @@ class BaseHistGradientBoosting(BaseEstimator, ABC): @abstractmethod def __init__(self, loss, learning_rate, max_iter, max_leaf_nodes, max_depth, min_samples_leaf, l2_regularization, max_bins, - warm_start, scoring, validation_fraction, n_iter_no_change, - tol, verbose, random_state): + warm_start, early_stopping, scoring, validation_fraction, + n_iter_no_change, tol, verbose, random_state): self.loss = loss self.learning_rate = learning_rate self.max_iter = max_iter @@ -39,6 +39,7 @@ def __init__(self, loss, learning_rate, max_iter, max_leaf_nodes, self.l2_regularization = l2_regularization self.max_bins = max_bins self.warm_start = warm_start + self.early_stopping = early_stopping self.scoring = scoring self.validation_fraction = validation_fraction self.n_iter_no_change = n_iter_no_change @@ -64,7 +65,7 @@ def _validate_parameters(self): if self.max_iter < 1: raise ValueError('max_iter={} must not be smaller ' 'than 1.'.format(self.max_iter)) - if self.n_iter_no_change is not None and self.n_iter_no_change < 0: + if self.n_iter_no_change < 0: raise ValueError('n_iter_no_change={} must be ' 'positive.'.format(self.n_iter_no_change)) if (self.validation_fraction is not None and @@ -114,7 +115,7 @@ def fit(self, X, y): dtype='u8') self._validate_parameters() - self.n_features_ = X.shape[1] # used for validation in predict() + n_samples, self.n_features_ = X.shape # used for validation in predict # we need this stateful variable to tell raw_predict() that it was # called from fit() (this current method), and that the data it has @@ -127,9 +128,10 @@ def fit(self, X, y): self._in_fit = True self.loss_ = self._get_loss() - - self.do_early_stopping_ = (self.n_iter_no_change is not None and - self.n_iter_no_change > 0) + if self.early_stopping == 'auto': + self.do_early_stopping_ = n_samples > 10000 + else: + self.do_early_stopping_ = self.early_stopping # create validation data if needed self._use_validation_data = self.validation_fraction is not None @@ -710,21 +712,25 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): and add more estimators to the ensemble. For results to be valid, the estimator should be re-trained on the same data only. See :term:`the Glossary `. - scoring : str or callable or None, optional (default=None) + early_stopping : 'auto' or bool (default='auto') + If 'auto', early stopping is enabled if the sample size is larger than + 10000. If True, early stopping is enabled, otherwise early stopping is + disabled. + scoring : str or callable or None, optional (default='loss') Scoring parameter to use for early stopping. It can be a single string (see :ref:`scoring_parameter`) or a callable (see :ref:`scoring`). If None, the estimator's default scorer is used. If ``scoring='loss'``, early stopping is checked w.r.t the loss value. - Only used if ``n_iter_no_change`` is not None. + Only used if early stopping is performed. validation_fraction : int or float or None, optional (default=0.1) Proportion (or absolute size) of training data to set aside as validation data for early stopping. If None, early stopping is done on - the training data. Only used if ``n_iter_no_change`` is not None. 
- n_iter_no_change : int or None, optional (default=None) + the training data. Only used if early stopping is performed. + n_iter_no_change : int, optional (default=10) Used to determine when to "early stop". The fitting process is stopped when none of the last ``n_iter_no_change`` scores are better than the ``n_iter_no_change - 1`` -th-to-last one, up to some - tolerance. If None or 0, no early-stopping is done. + tolerance. Only used if early stopping is performed. tol : float or None, optional (default=1e-7) The absolute tolerance to use when comparing scores during early stopping. The higher the tolerance, the more likely we are to early @@ -744,8 +750,8 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): Attributes ---------- n_iter_ : int - The number of iterations as selected by early stopping (if - n_iter_no_change is not None). Otherwise it corresponds to max_iter. + The number of iterations as selected by early stopping, depending on + the `early_stopping` parameter. Otherwise it corresponds to max_iter. n_trees_per_iteration_ : int The number of tree that are built at each iteration. For regressors, this is always 1. @@ -778,16 +784,16 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): def __init__(self, loss='least_squares', learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0., max_bins=255, - warm_start=False, scoring=None, validation_fraction=0.1, - n_iter_no_change=None, tol=1e-7, verbose=0, - random_state=None): + warm_start=False, early_stopping='auto', scoring='loss', + validation_fraction=0.1, n_iter_no_change=10, tol=1e-7, + verbose=0, random_state=None): super(HistGradientBoostingRegressor, self).__init__( loss=loss, learning_rate=learning_rate, max_iter=max_iter, max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, l2_regularization=l2_regularization, max_bins=max_bins, - warm_start=warm_start, scoring=scoring, - validation_fraction=validation_fraction, + warm_start=warm_start, early_stopping=early_stopping, + scoring=scoring, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose, random_state=random_state) @@ -894,21 +900,25 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, and add more estimators to the ensemble. For results to be valid, the estimator should be re-trained on the same data only. See :term:`the Glossary `. - scoring : str or callable or None, optional (default=None) + early_stopping : 'auto' or bool (default='auto') + If 'auto', early stopping is enabled if the sample size is larger than + 10000. If True, early stopping is enabled, otherwise early stopping is + disabled. + scoring : str or callable or None, optional (default='loss') Scoring parameter to use for early stopping. It can be a single string (see :ref:`scoring_parameter`) or a callable (see :ref:`scoring`). If None, the estimator's default scorer is used. If ``scoring='loss'``, early stopping is checked - w.r.t the loss value. Only used if ``n_iter_no_change`` is not None. + w.r.t the loss value. Only used if early stopping is performed. validation_fraction : int or float or None, optional (default=0.1) Proportion (or absolute size) of training data to set aside as validation data for early stopping. If None, early stopping is done on - the training data. - n_iter_no_change : int or None, optional (default=None) + the training data. Only used if early stopping is performed. 
+ n_iter_no_change : int, optional (default=10) Used to determine when to "early stop". The fitting process is stopped when none of the last ``n_iter_no_change`` scores are better than the ``n_iter_no_change - 1`` -th-to-last one, up to some - tolerance. If None or 0, no early-stopping is done. + tolerance. Only used if early stopping is performed. tol : float or None, optional (default=1e-7) The absolute tolerance to use when comparing scores. The higher the tolerance, the more likely we are to early stop: higher tolerance @@ -930,8 +940,8 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, classes_ : array, shape = (n_classes,) Class labels. n_iter_ : int - The number of estimators as selected by early stopping (if - n_iter_no_change is not None). Otherwise it corresponds to max_iter. + The number of iterations as selected by early stopping, depending on + the `early_stopping` parameter. Otherwise it corresponds to max_iter. n_trees_per_iteration_ : int The number of tree that are built at each iteration. This is equal to 1 for binary classification, and to ``n_classes`` for multiclass @@ -966,15 +976,16 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, def __init__(self, loss='auto', learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0., max_bins=255, warm_start=False, - scoring=None, validation_fraction=0.1, n_iter_no_change=None, - tol=1e-7, verbose=0, random_state=None): + early_stopping='auto', scoring='loss', + validation_fraction=0.1, n_iter_no_change=10, tol=1e-7, + verbose=0, random_state=None): super(HistGradientBoostingClassifier, self).__init__( loss=loss, learning_rate=learning_rate, max_iter=max_iter, max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, l2_regularization=l2_regularization, max_bins=max_bins, - warm_start=warm_start, scoring=scoring, - validation_fraction=validation_fraction, + warm_start=warm_start, early_stopping=early_stopping, + scoring=scoring, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose, random_state=random_state) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py index 32bb5dee4b197..6ac76a67d07ca 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py @@ -66,7 +66,7 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples, max_iter=max_iter, max_bins=max_bins, learning_rate=1, - n_iter_no_change=None, + early_stopping=False, min_samples_leaf=min_samples_leaf, max_leaf_nodes=max_leaf_nodes) est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm') @@ -119,7 +119,7 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples, max_iter=max_iter, max_bins=max_bins, learning_rate=1, - n_iter_no_change=None, + early_stopping=False, min_samples_leaf=min_samples_leaf, max_leaf_nodes=max_leaf_nodes) est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm') @@ -181,7 +181,7 @@ def test_same_predictions_multiclass_classification( max_iter=max_iter, max_bins=max_bins, learning_rate=lr, - n_iter_no_change=None, + early_stopping=False, min_samples_leaf=min_samples_leaf, max_leaf_nodes=max_leaf_nodes) est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm') diff --git 
a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index b607cdd23b6c9..29d7bfd482b8a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -19,6 +19,14 @@ X_regression, y_regression = make_regression(random_state=0) +def _make_dumb_dataset(n_samples): + """Make a dumb dataset to test early stopping.""" + rng = np.random.RandomState(42) + X_dumb = rng.randn(n_samples, 1) + y_dumb = (X_dumb[:, 0] > 0).astype('int64') + return X_dumb, y_dumb + + @pytest.mark.parametrize('GradientBoosting, X, y', [ (HistGradientBoostingClassifier, X_classification, y_classification), (HistGradientBoostingRegressor, X_regression, y_regression) @@ -57,17 +65,17 @@ def test_invalid_classification_loss(): @pytest.mark.parametrize( - 'scoring, validation_fraction, n_iter_no_change, tol', [ - ('neg_mean_squared_error', .1, 5, 1e-7), # use scorer - ('neg_mean_squared_error', None, 5, 1e-1), # use scorer on train data - (None, .1, 5, 1e-7), # same with default scorer - (None, None, 5, 1e-1), - ('loss', .1, 5, 1e-7), # use loss - ('loss', None, 5, 1e-1), # use loss on training data - (None, None, None, None), # no early stopping + 'scoring, validation_fraction, early_stopping, n_iter_no_change, tol', [ + ('neg_mean_squared_error', .1, True, 5, 1e-7), # use scorer + ('neg_mean_squared_error', None, True, 5, 1e-1), # use scorer on train + (None, .1, True, 5, 1e-7), # same with default scorer + (None, None, True, 5, 1e-1), + ('loss', .1, True, 5, 1e-7), # use loss + ('loss', None, True, 5, 1e-1), # use loss on training data + (None, None, False, 5, None), # no early stopping ]) def test_early_stopping_regression(scoring, validation_fraction, - n_iter_no_change, tol): + early_stopping, n_iter_no_change, tol): max_iter = 200 @@ -78,6 +86,7 @@ def test_early_stopping_regression(scoring, validation_fraction, min_samples_leaf=5, # easier to overfit fast scoring=scoring, tol=tol, + early_stopping=early_stopping, validation_fraction=validation_fraction, max_iter=max_iter, n_iter_no_change=n_iter_no_change, @@ -85,7 +94,7 @@ def test_early_stopping_regression(scoring, validation_fraction, ) gb.fit(X, y) - if n_iter_no_change is not None: + if early_stopping: assert n_iter_no_change <= gb.n_iter_ < max_iter else: assert gb.n_iter_ == max_iter @@ -97,17 +106,17 @@ def test_early_stopping_regression(scoring, validation_fraction, random_state=0) )) @pytest.mark.parametrize( - 'scoring, validation_fraction, n_iter_no_change, tol', [ - ('accuracy', .1, 5, 1e-7), # use scorer - ('accuracy', None, 5, 1e-1), # use scorer on training data - (None, .1, 5, 1e-7), # same with default scorerscor - (None, None, 5, 1e-1), - ('loss', .1, 5, 1e-7), # use loss - ('loss', None, 5, 1e-1), # use loss on training data - (None, None, None, None), # no early stopping + 'scoring, validation_fraction, early_stopping, n_iter_no_change, tol', [ + ('accuracy', .1, True, 5, 1e-7), # use scorer + ('accuracy', None, True, 5, 1e-1), # use scorer on training data + (None, .1, True, 5, 1e-7), # same with default scorer + (None, None, True, 5, 1e-1), + ('loss', .1, True, 5, 1e-7), # use loss + ('loss', None, True, 5, 1e-1), # use loss on training data + (None, None, False, 5, None), # no early stopping ]) def test_early_stopping_classification(data, scoring, validation_fraction, - n_iter_no_change, tol): + early_stopping, n_iter_no_change, tol): max_iter = 
50 @@ -118,6 +127,7 @@ def test_early_stopping_classification(data, scoring, validation_fraction, min_samples_leaf=5, # easier to overfit fast scoring=scoring, tol=tol, + early_stopping=early_stopping, validation_fraction=validation_fraction, max_iter=max_iter, n_iter_no_change=n_iter_no_change, @@ -125,12 +135,29 @@ def test_early_stopping_classification(data, scoring, validation_fraction, ) gb.fit(X, y) - if n_iter_no_change is not None: + if early_stopping is True: assert n_iter_no_change <= gb.n_iter_ < max_iter else: assert gb.n_iter_ == max_iter +@pytest.mark.parametrize('GradientBoosting, X, y', [ + (HistGradientBoostingClassifier, *_make_dumb_dataset(10000)), + (HistGradientBoostingClassifier, *_make_dumb_dataset(10001)), + (HistGradientBoostingRegressor, *_make_dumb_dataset(10000)), + (HistGradientBoostingRegressor, *_make_dumb_dataset(10001)) +]) +def test_early_stopping_default(GradientBoosting, X, y): + # Test that early stopping is enabled by default if and only if there + # are more than 10000 samples + gb = GradientBoosting(max_iter=10, n_iter_no_change=2, tol=1e-1) + gb.fit(X, y) + if X.shape[0] > 10000: + assert gb.n_iter_ < gb.max_iter + else: + assert gb.n_iter_ == gb.max_iter + + @pytest.mark.parametrize( 'scores, n_iter_no_change, tol, stopping', [ @@ -170,7 +197,7 @@ def test_binning_train_validation_are_separated(): rng = np.random.RandomState(0) validation_fraction = .2 gb = HistGradientBoostingClassifier( - n_iter_no_change=5, + early_stopping=True, validation_fraction=validation_fraction, random_state=rng ) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py index e5ec1371f3aa6..2417de4f6cc63 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py @@ -11,6 +11,7 @@ from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.metrics import check_scoring X_classification, y_classification = make_classification(random_state=0) @@ -37,10 +38,11 @@ def test_max_iter_with_warm_start_validation(GradientBoosting, X, y): # is smaller than the number of iterations from the previous fit when warm # start is True. - estimator = GradientBoosting(max_iter=50, warm_start=True) + estimator = GradientBoosting(max_iter=10, early_stopping=False, + warm_start=True) estimator.fit(X, y) - estimator.set_params(max_iter=25) - err_msg = ('max_iter=25 must be larger than or equal to n_iter_=50 ' + estimator.set_params(max_iter=5) + err_msg = ('max_iter=5 must be larger than or equal to n_iter_=10 ' 'when warm_start==True') with pytest.raises(ValueError, match=err_msg): estimator.fit(X, y) @@ -75,14 +77,14 @@ def test_warm_start_yields_identical_results(GradientBoosting, X, y): ]) def test_warm_start_max_depth(GradientBoosting, X, y): # Test if possible to fit trees of different depth in ensemble. 
- gb = GradientBoosting(max_iter=100, min_samples_leaf=1, - warm_start=True, max_depth=2) + gb = GradientBoosting(max_iter=20, min_samples_leaf=1, + warm_start=True, max_depth=2, early_stopping=False) gb.fit(X, y) - gb.set_params(max_iter=110, max_depth=3) + gb.set_params(max_iter=30, max_depth=3, n_iter_no_change=110) gb.fit(X, y) - # First 100 trees have max_depth == 2 - for i in range(100): + # First 20 trees have max_depth == 2 + for i in range(20): assert gb._predictors[i][0].get_max_depth() == 2 # Last 10 trees have max_depth == 3 for i in range(1, 11): @@ -100,14 +102,14 @@ def test_warm_start_early_stopping(GradientBoosting, X, y, scoring): n_iter_no_change = 5 gb = GradientBoosting( - n_iter_no_change=n_iter_no_change, max_iter=10000, + n_iter_no_change=n_iter_no_change, max_iter=10000, early_stopping=True, random_state=42, warm_start=True, tol=1e-3, scoring=scoring, ) gb.fit(X, y) n_iter_first_fit = gb.n_iter_ gb.fit(X, y) n_iter_second_fit = gb.n_iter_ - assert n_iter_second_fit - n_iter_first_fit < n_iter_no_change + assert 0 < n_iter_second_fit - n_iter_first_fit < n_iter_no_change @pytest.mark.parametrize('GradientBoosting, X, y', [ @@ -116,11 +118,12 @@ def test_warm_start_early_stopping(GradientBoosting, X, y, scoring): ]) def test_warm_start_equal_n_estimators(GradientBoosting, X, y): # Test if warm start with equal n_estimators does nothing - gb_1 = GradientBoosting(max_depth=2) + gb_1 = GradientBoosting(max_depth=2, early_stopping=False) gb_1.fit(X, y) gb_2 = clone(gb_1) - gb_2.set_params(max_iter=gb_1.max_iter, warm_start=True) + gb_2.set_params(max_iter=gb_1.max_iter, warm_start=True, + n_iter_no_change=5) gb_2.fit(X, y) # Check that both predictors are equal @@ -169,8 +172,9 @@ def _get_rng(rng_type): return np.random.RandomState(0) random_state = _get_rng(rng_type) - gb_1 = GradientBoosting(n_iter_no_change=5, max_iter=2, + gb_1 = GradientBoosting(early_stopping=True, max_iter=2, random_state=random_state) + gb_1.set_params(scoring=check_scoring(gb_1)) gb_1.fit(X, y) random_seed_1_1 = gb_1._random_seed @@ -178,8 +182,9 @@ def _get_rng(rng_type): random_seed_1_2 = gb_1._random_seed # clear the old state, different seed random_state = _get_rng(rng_type) - gb_2 = GradientBoosting(n_iter_no_change=5, max_iter=2, + gb_2 = GradientBoosting(early_stopping=True, max_iter=2, random_state=random_state, warm_start=True) + gb_2.set_params(scoring=check_scoring(gb_2)) gb_2.fit(X, y) # inits state random_seed_2_1 = gb_2._random_seed gb_2.fit(X, y) # clears old state and equals est diff --git a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx index 4b1188b87e69e..cf2c5a51c90dd 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx @@ -38,7 +38,7 @@ def get_equivalent_estimator(estimator, lib='lightgbm'): if sklearn_params['loss'] == 'auto': raise ValueError('auto loss is not accepted. We need to know if ' 'the problem is binary or multiclass classification.') - if sklearn_params['n_iter_no_change'] is not None: + if sklearn_params['early_stopping']: raise NotImplementedError('Early stopping should be deactivated.') lightgbm_loss_mapping = {
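For reviewers, here is a condensed sketch of how the new ``early_stopping`` parameter is resolved inside ``fit()``, mirroring the gradient_boosting.py hunk above. ``_resolve_early_stopping`` is a hypothetical standalone helper written only for illustration; it is not a function added by this patch::

    def _resolve_early_stopping(early_stopping, n_samples):
        """Return whether early stopping should be performed for this fit."""
        if early_stopping == 'auto':
            # Default: enable early stopping only on reasonably large
            # training sets (strictly more than 10,000 samples).
            return n_samples > 10000
        # Otherwise the user passed an explicit boolean.
        return bool(early_stopping)

    assert _resolve_early_stopping('auto', 10001) is True
    assert _resolve_early_stopping('auto', 10000) is False  # 10,000 exactly: off
    assert _resolve_early_stopping(False, 1_000_000) is False

This matches the new ``test_early_stopping_default`` test, which expects the default to stop early for 10,001 samples but to run all ``max_iter`` iterations for 10,000.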