FEA Turn on early stopping in histogram GBDT by default #14516


Merged
40 commits, merged on Feb 12, 2020

Commits (changes from all commits shown below)
b1bc2de
Update docstrings and change default value
Jul 30, 2019
c30e4d9
Disable early stopping for some tests
Jul 30, 2019
a196c5e
Check that early stopping is enabled by default
Jul 30, 2019
b84c7e2
Fix the random state in the examples
Jul 30, 2019
c02faea
Move sentence at the end of the paragraph
Jul 31, 2019
de41ab3
Disable early stopping in test_estimators
Jul 31, 2019
9e83463
Disable early stopping in partial dependence tests
Jul 31, 2019
6f332d0
Move the new test next to the others tests regarding ES
Jul 31, 2019
460a307
Merge branch 'master' of https://github.com/scikit-learn/scikit-learn…
Aug 6, 2019
5764fac
Update the docstrings and the init for both classes
Aug 6, 2019
42f3d95
Swap warm_start and early_stopping
Aug 6, 2019
16a83e8
Update validation_fraction documentation
Aug 6, 2019
1e993d2
Update docstrings with early stopping
Aug 6, 2019
e08dc0b
Remove n_iter_no_cjange in set_params
Aug 6, 2019
30c7139
Update the tests in _hist...
Aug 6, 2019
36e9975
Replace n_iter_no_change with early_stopping in test_partial_dependence
Aug 6, 2019
081ee18
Remove early_stopping in partial_dependence
Aug 6, 2019
72681fd
One line is enough
Aug 6, 2019
eda29c8
Use 10k sample threshold for auto early stopping and resolve conflicts
johannfaouzi Aug 22, 2019
99c9199
Merge branch 'master' into early_stopping_HGBM
johannfaouzi Aug 22, 2019
3c4cfea
Increase the maximum number of iterations to check early stopping
johannfaouzi Aug 22, 2019
73f6756
Fix min number of samples for early stopping in docstrings
johannfaouzi Aug 22, 2019
df72eef
Update ensemble.rst with new early stopping behavior
johannfaouzi Aug 22, 2019
99a830f
Update the user guide with the new default scoring
johannfaouzi Sep 3, 2019
f019d47
Update code after reviews
johannfaouzi Sep 3, 2019
847ed81
Fix issues in tests
johannfaouzi Nov 14, 2019
39a69db
Update what's new
johannfaouzi Nov 14, 2019
46864d4
Merge branch 'master' into early_stopping_HGBM
johannfaouzi Nov 14, 2019
5b366d9
Make raw_predictions and raw_predictions_val private attributes
johannfaouzi Nov 14, 2019
a308a2f
Merge branch 'early_stopping_HGBM' of https://github.com/johannfaouzi…
johannfaouzi Nov 14, 2019
7f81df7
Remove private attributes for raw predictions
johannfaouzi Nov 15, 2019
58c9bce
Revert changes
johannfaouzi Nov 15, 2019
4e093ad
Revert changes
johannfaouzi Nov 15, 2019
52ca216
Merge branch 'master' into early_stopping_HGBM
johannfaouzi Nov 15, 2019
6ed0735
Add note about scorer
johannfaouzi Nov 15, 2019
29a3722
Fix test_warm_start_early_stopping
johannfaouzi Nov 15, 2019
8827fb5
Merge branch 'master' of github.com:scikit-learn/scikit-learn into pr…
NicolasHug Jan 31, 2020
5f65675
Merge branch 'early_stopping_HGBM' of github.com:johannfaouzi/scikit-…
NicolasHug Jan 31, 2020
82a3796
fixed bad merge
NicolasHug Feb 3, 2020
6f2b70a
Fixed LightGBM tests: properly deactive ES since parameters have changed
NicolasHug Feb 3, 2020
15 changes: 8 additions & 7 deletions doc/modules/ensemble.rst
@@ -895,13 +895,14 @@ generally recommended to use as many bins as possible, which is the default.
The ``l2_regularization`` parameter is a regularizer on the loss function and
corresponds to :math:`\lambda` in equation (2) of [XGBoost]_.

The early-stopping behaviour is controlled via the ``scoring``,
``validation_fraction``, ``n_iter_no_change``, and ``tol`` parameters. It is
possible to early-stop using an arbitrary :term:`scorer`, or just the
training or validation loss. By default, early-stopping is performed using
the default :term:`scorer` of the estimator on a validation set but it is
also possible to perform early-stopping based on the loss value, which is
significantly faster.
Note that **early-stopping is enabled by default if the number of samples is
larger than 10,000**. The early-stopping behaviour is controlled via the
``early-stopping``, ``scoring``, ``validation_fraction``,
``n_iter_no_change``, and ``tol`` parameters. It is possible to early-stop
using an arbitrary :term:`scorer`, or just the training or validation loss.
Note that for technical reasons, using a scorer is significantly slower than
using the loss. By default, early-stopping is performed if there are at least
10,000 samples in the training set, using the validation loss.

Missing values support
----------------------
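To make the new default concrete, here is a minimal sketch of the behaviour described in the early-stopping paragraph of the ensemble.rst hunk above. The dataset sizes are illustrative, and the experimental import reflects the 0.23-era API this PR targets; nothing else is taken from the diff.

import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: still required in 0.23
from sklearn.ensemble import HistGradientBoostingClassifier

rng = np.random.RandomState(0)

# Below the 10,000-sample threshold: early stopping stays off, all iterations are built.
X_small = rng.randn(1000, 5)
y_small = (X_small[:, 0] > 0).astype(int)
clf_small = HistGradientBoostingClassifier(max_iter=50).fit(X_small, y_small)
print(clf_small.n_iter_)  # 50

# Above the threshold: early stopping is on by default and may stop earlier.
X_large = rng.randn(20000, 5)
y_large = (X_large[:, 0] > 0).astype(int)
clf_large = HistGradientBoostingClassifier(max_iter=50).fit(X_large, y_large)
print(clf_large.n_iter_)  # at most 50; typically fewer on such an easy problem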
8 changes: 8 additions & 0 deletions doc/whats_new/v0.23.rst
@@ -110,6 +110,14 @@ Changelog
Stumps (trees with one split) are now allowed.
:pr: `16182` by :user:`Santhosh B <santhoshbala18>`

- |Feature| Early stopping in
:class:`ensemble.HistGradientBoostingClassifier` and
:class:`ensemble.HistGradientBoostingRegressor` is now determined with a
new `early_stopping` parameter instead of `n_iter_no_change`. Default value
is 'auto', which enables early stopping if there are at least 10,000
samples in the training set. :pr:`14516` by :user:`Johann Faouzi
<johannfaouzi>`.

:mod:`sklearn.feature_extraction`
.................................

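Since n_iter_no_change no longer toggles early stopping, the sketch below shows how a user would pin the behaviour explicitly instead of relying on the 'auto' sample-size rule described in the changelog entry above. The data and parameter values are illustrative only.

from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=50000, n_features=10, random_state=0)

# Reproduce the pre-0.23 default: never early-stop, regardless of n_samples.
reg_off = HistGradientBoostingRegressor(early_stopping=False, max_iter=100).fit(X, y)
assert reg_off.n_iter_ == 100

# Force early stopping even on a small training set.
reg_on = HistGradientBoostingRegressor(
    early_stopping=True,       # overrides the 'auto' sample-size rule
    n_iter_no_change=10,       # patience, in iterations
    validation_fraction=0.1,   # held-out fraction used for the check
).fit(X[:2000], y[:2000])
print(reg_on.n_iter_)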
71 changes: 41 additions & 30 deletions sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -28,8 +28,8 @@ class BaseHistGradientBoosting(BaseEstimator, ABC):
@abstractmethod
def __init__(self, loss, learning_rate, max_iter, max_leaf_nodes,
max_depth, min_samples_leaf, l2_regularization, max_bins,
warm_start, scoring, validation_fraction, n_iter_no_change,
tol, verbose, random_state):
warm_start, early_stopping, scoring, validation_fraction,
n_iter_no_change, tol, verbose, random_state):
self.loss = loss
self.learning_rate = learning_rate
self.max_iter = max_iter
@@ -39,6 +39,7 @@ def __init__(self, loss, learning_rate, max_iter, max_leaf_nodes,
self.l2_regularization = l2_regularization
self.max_bins = max_bins
self.warm_start = warm_start
self.early_stopping = early_stopping
self.scoring = scoring
self.validation_fraction = validation_fraction
self.n_iter_no_change = n_iter_no_change
@@ -64,7 +65,7 @@ def _validate_parameters(self):
if self.max_iter < 1:
raise ValueError('max_iter={} must not be smaller '
'than 1.'.format(self.max_iter))
if self.n_iter_no_change is not None and self.n_iter_no_change < 0:
if self.n_iter_no_change < 0:
raise ValueError('n_iter_no_change={} must be '
'positive.'.format(self.n_iter_no_change))
if (self.validation_fraction is not None and
@@ -114,7 +115,7 @@ def fit(self, X, y):
dtype='u8')

self._validate_parameters()
self.n_features_ = X.shape[1] # used for validation in predict()
n_samples, self.n_features_ = X.shape # used for validation in predict

# we need this stateful variable to tell raw_predict() that it was
# called from fit() (this current method), and that the data it has
@@ -127,9 +128,10 @@ def fit(self, X, y):
self._in_fit = True

self.loss_ = self._get_loss()

self.do_early_stopping_ = (self.n_iter_no_change is not None and
self.n_iter_no_change > 0)
if self.early_stopping == 'auto':
self.do_early_stopping_ = n_samples > 10000
else:
self.do_early_stopping_ = self.early_stopping

# create validation data if needed
self._use_validation_data = self.validation_fraction is not None
@@ -710,21 +712,25 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
and add more estimators to the ensemble. For results to be valid, the
estimator should be re-trained on the same data only.
See :term:`the Glossary <warm_start>`.
scoring : str or callable or None, optional (default=None)
early_stopping : 'auto' or bool (default='auto')
Review comment (Member), suggested change:
-    early_stopping : 'auto' or bool (default='auto')
+    early_stopping : 'auto' or bool, default='auto'
but also happy to have all of them fixed in another PR

Reply (Member): yeah we should probably have the whole file fixed in another PR since all docstrings follow the old style rn
If 'auto', early stopping is enabled if the sample size is larger than
10000. If True, early stopping is enabled, otherwise early stopping is
disabled.
scoring : str or callable or None, optional (default='loss')
Scoring parameter to use for early stopping. It can be a single
string (see :ref:`scoring_parameter`) or a callable (see
:ref:`scoring`). If None, the estimator's default scorer is used. If
``scoring='loss'``, early stopping is checked w.r.t the loss value.
Only used if ``n_iter_no_change`` is not None.
Only used if early stopping is performed.
validation_fraction : int or float or None, optional (default=0.1)
Proportion (or absolute size) of training data to set aside as
validation data for early stopping. If None, early stopping is done on
the training data. Only used if ``n_iter_no_change`` is not None.
n_iter_no_change : int or None, optional (default=None)
the training data. Only used if early stopping is performed.
n_iter_no_change : int, optional (default=10)
Used to determine when to "early stop". The fitting process is
stopped when none of the last ``n_iter_no_change`` scores are better
than the ``n_iter_no_change - 1`` -th-to-last one, up to some
tolerance. If None or 0, no early-stopping is done.
tolerance. Only used if early stopping is performed.
tol : float or None, optional (default=1e-7)
The absolute tolerance to use when comparing scores during early
stopping. The higher the tolerance, the more likely we are to early
@@ -744,8 +750,8 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
Attributes
----------
n_iter_ : int
The number of iterations as selected by early stopping (if
n_iter_no_change is not None). Otherwise it corresponds to max_iter.
The number of iterations as selected by early stopping, depending on
the `early_stopping` parameter. Otherwise it corresponds to max_iter.
n_trees_per_iteration_ : int
The number of tree that are built at each iteration. For regressors,
this is always 1.
@@ -778,16 +784,16 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
def __init__(self, loss='least_squares', learning_rate=0.1,
max_iter=100, max_leaf_nodes=31, max_depth=None,
min_samples_leaf=20, l2_regularization=0., max_bins=255,
warm_start=False, scoring=None, validation_fraction=0.1,
n_iter_no_change=None, tol=1e-7, verbose=0,
random_state=None):
warm_start=False, early_stopping='auto', scoring='loss',
validation_fraction=0.1, n_iter_no_change=10, tol=1e-7,
verbose=0, random_state=None):
super(HistGradientBoostingRegressor, self).__init__(
loss=loss, learning_rate=learning_rate, max_iter=max_iter,
max_leaf_nodes=max_leaf_nodes, max_depth=max_depth,
min_samples_leaf=min_samples_leaf,
l2_regularization=l2_regularization, max_bins=max_bins,
warm_start=warm_start, scoring=scoring,
validation_fraction=validation_fraction,
warm_start=warm_start, early_stopping=early_stopping,
scoring=scoring, validation_fraction=validation_fraction,
n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose,
random_state=random_state)

@@ -894,21 +900,25 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting,
and add more estimators to the ensemble. For results to be valid, the
estimator should be re-trained on the same data only.
See :term:`the Glossary <warm_start>`.
scoring : str or callable or None, optional (default=None)
early_stopping : 'auto' or bool (default='auto')
If 'auto', early stopping is enabled if the sample size is larger than
10000. If True, early stopping is enabled, otherwise early stopping is
disabled.
scoring : str or callable or None, optional (default='loss')
Scoring parameter to use for early stopping. It can be a single
string (see :ref:`scoring_parameter`) or a callable (see
:ref:`scoring`). If None, the estimator's default scorer
is used. If ``scoring='loss'``, early stopping is checked
w.r.t the loss value. Only used if ``n_iter_no_change`` is not None.
w.r.t the loss value. Only used if early stopping is performed.
validation_fraction : int or float or None, optional (default=0.1)
Proportion (or absolute size) of training data to set aside as
validation data for early stopping. If None, early stopping is done on
the training data.
n_iter_no_change : int or None, optional (default=None)
the training data. Only used if early stopping is performed.
n_iter_no_change : int, optional (default=10)
Used to determine when to "early stop". The fitting process is
stopped when none of the last ``n_iter_no_change`` scores are better
than the ``n_iter_no_change - 1`` -th-to-last one, up to some
tolerance. If None or 0, no early-stopping is done.
tolerance. Only used if early stopping is performed.
tol : float or None, optional (default=1e-7)
The absolute tolerance to use when comparing scores. The higher the
tolerance, the more likely we are to early stop: higher tolerance
@@ -930,8 +940,8 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting,
classes_ : array, shape = (n_classes,)
Class labels.
n_iter_ : int
The number of estimators as selected by early stopping (if
n_iter_no_change is not None). Otherwise it corresponds to max_iter.
The number of iterations as selected by early stopping, depending on
the `early_stopping` parameter. Otherwise it corresponds to max_iter.
n_trees_per_iteration_ : int
The number of tree that are built at each iteration. This is equal to 1
for binary classification, and to ``n_classes`` for multiclass
@@ -966,15 +976,16 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting,
def __init__(self, loss='auto', learning_rate=0.1, max_iter=100,
max_leaf_nodes=31, max_depth=None, min_samples_leaf=20,
l2_regularization=0., max_bins=255, warm_start=False,
scoring=None, validation_fraction=0.1, n_iter_no_change=None,
tol=1e-7, verbose=0, random_state=None):
early_stopping='auto', scoring='loss',
validation_fraction=0.1, n_iter_no_change=10, tol=1e-7,
verbose=0, random_state=None):
super(HistGradientBoostingClassifier, self).__init__(
loss=loss, learning_rate=learning_rate, max_iter=max_iter,
max_leaf_nodes=max_leaf_nodes, max_depth=max_depth,
min_samples_leaf=min_samples_leaf,
l2_regularization=l2_regularization, max_bins=max_bins,
warm_start=warm_start, scoring=scoring,
validation_fraction=validation_fraction,
warm_start=warm_start, early_stopping=early_stopping,
scoring=scoring, validation_fraction=validation_fraction,
n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose,
random_state=random_state)

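The fit() hunk above resolves the new parameter once per call. The helper below is a hedged paraphrase of that rule, not library code; it only restates the strict n_samples > 10000 comparison visible in the diff.

def resolve_early_stopping(early_stopping, n_samples):
    # 'auto' enables early stopping only when the training set has more
    # than 10,000 samples; True/False are taken at face value.
    if early_stopping == 'auto':
        return n_samples > 10000
    return bool(early_stopping)

assert resolve_early_stopping('auto', 10000) is False   # threshold is strict
assert resolve_early_stopping('auto', 10001) is True
assert resolve_early_stopping(True, 100) is True
assert resolve_early_stopping(False, 10**6) is False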
@@ -66,7 +66,7 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples,
max_iter=max_iter,
max_bins=max_bins,
learning_rate=1,
n_iter_no_change=None,
early_stopping=False,
min_samples_leaf=min_samples_leaf,
max_leaf_nodes=max_leaf_nodes)
est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')
@@ -119,7 +119,7 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
max_iter=max_iter,
max_bins=max_bins,
learning_rate=1,
n_iter_no_change=None,
early_stopping=False,
min_samples_leaf=min_samples_leaf,
max_leaf_nodes=max_leaf_nodes)
est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')
@@ -181,7 +181,7 @@ def test_same_predictions_multiclass_classification(
max_iter=max_iter,
max_bins=max_bins,
learning_rate=lr,
n_iter_no_change=None,
early_stopping=False,
min_samples_leaf=min_samples_leaf,
max_leaf_nodes=max_leaf_nodes)
est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')
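The LightGBM comparison tests above need both libraries to build exactly max_iter trees, which is why they now pass early_stopping=False instead of n_iter_no_change=None. A hedged sketch of that invariant on toy data (the private get_equivalent_estimator helper is omitted, so only the scikit-learn side is shown):

import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier

rng = np.random.RandomState(0)
X = rng.randn(255, 4)
y = (X[:, 0] > 0).astype(int)

# With early stopping disabled, the ensemble always contains max_iter iterations,
# which is what makes a tree-by-tree comparison against LightGBM meaningful.
est = HistGradientBoostingClassifier(max_iter=30, early_stopping=False, learning_rate=1)
est.fit(X, y)
assert est.n_iter_ == 30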
@@ -19,6 +19,14 @@
X_regression, y_regression = make_regression(random_state=0)


def _make_dumb_dataset(n_samples):
"""Make a dumb dataset to test early stopping."""
rng = np.random.RandomState(42)
X_dumb = rng.randn(n_samples, 1)
y_dumb = (X_dumb[:, 0] > 0).astype('int64')
return X_dumb, y_dumb


@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, X_classification, y_classification),
(HistGradientBoostingRegressor, X_regression, y_regression)
@@ -57,17 +65,17 @@ def test_invalid_classification_loss():


@pytest.mark.parametrize(
'scoring, validation_fraction, n_iter_no_change, tol', [
('neg_mean_squared_error', .1, 5, 1e-7), # use scorer
('neg_mean_squared_error', None, 5, 1e-1), # use scorer on train data
(None, .1, 5, 1e-7), # same with default scorer
(None, None, 5, 1e-1),
('loss', .1, 5, 1e-7), # use loss
('loss', None, 5, 1e-1), # use loss on training data
(None, None, None, None), # no early stopping
'scoring, validation_fraction, early_stopping, n_iter_no_change, tol', [
('neg_mean_squared_error', .1, True, 5, 1e-7), # use scorer
('neg_mean_squared_error', None, True, 5, 1e-1), # use scorer on train
(None, .1, True, 5, 1e-7), # same with default scorer
(None, None, True, 5, 1e-1),
('loss', .1, True, 5, 1e-7), # use loss
('loss', None, True, 5, 1e-1), # use loss on training data
(None, None, False, 5, None), # no early stopping
])
def test_early_stopping_regression(scoring, validation_fraction,
n_iter_no_change, tol):
early_stopping, n_iter_no_change, tol):

max_iter = 200

@@ -78,14 +86,15 @@ def test_early_stopping_regression(scoring, validation_fraction,
min_samples_leaf=5, # easier to overfit fast
scoring=scoring,
tol=tol,
early_stopping=early_stopping,
validation_fraction=validation_fraction,
max_iter=max_iter,
n_iter_no_change=n_iter_no_change,
random_state=0
)
gb.fit(X, y)

if n_iter_no_change is not None:
if early_stopping:
assert n_iter_no_change <= gb.n_iter_ < max_iter
else:
assert gb.n_iter_ == max_iter
@@ -97,17 +106,17 @@ def test_early_stopping_classification(data, scoring, validation_fraction,
random_state=0)
))
@pytest.mark.parametrize(
'scoring, validation_fraction, n_iter_no_change, tol', [
('accuracy', .1, 5, 1e-7), # use scorer
('accuracy', None, 5, 1e-1), # use scorer on training data
(None, .1, 5, 1e-7), # same with default scorerscor
(None, None, 5, 1e-1),
('loss', .1, 5, 1e-7), # use loss
('loss', None, 5, 1e-1), # use loss on training data
(None, None, None, None), # no early stopping
'scoring, validation_fraction, early_stopping, n_iter_no_change, tol', [
('accuracy', .1, True, 5, 1e-7), # use scorer
('accuracy', None, True, 5, 1e-1), # use scorer on training data
(None, .1, True, 5, 1e-7), # same with default scorer
(None, None, True, 5, 1e-1),
('loss', .1, True, 5, 1e-7), # use loss
('loss', None, True, 5, 1e-1), # use loss on training data
(None, None, False, 5, None), # no early stopping
])
def test_early_stopping_classification(data, scoring, validation_fraction,
n_iter_no_change, tol):
early_stopping, n_iter_no_change, tol):

max_iter = 50

@@ -118,19 +127,37 @@ def test_early_stopping_classification(data, scoring, validation_fraction,
min_samples_leaf=5, # easier to overfit fast
scoring=scoring,
tol=tol,
early_stopping=early_stopping,
validation_fraction=validation_fraction,
max_iter=max_iter,
n_iter_no_change=n_iter_no_change,
random_state=0
)
gb.fit(X, y)

if n_iter_no_change is not None:
if early_stopping is True:
assert n_iter_no_change <= gb.n_iter_ < max_iter
else:
assert gb.n_iter_ == max_iter


@pytest.mark.parametrize('GradientBoosting, X, y', [
(HistGradientBoostingClassifier, *_make_dumb_dataset(10000)),
(HistGradientBoostingClassifier, *_make_dumb_dataset(10001)),
(HistGradientBoostingRegressor, *_make_dumb_dataset(10000)),
(HistGradientBoostingRegressor, *_make_dumb_dataset(10001))
])
def test_early_stopping_default(GradientBoosting, X, y):
# Test that early stopping is enabled by default if and only if there
# are more than 10000 samples
gb = GradientBoosting(max_iter=10, n_iter_no_change=2, tol=1e-1)
gb.fit(X, y)
if X.shape[0] > 10000:
assert gb.n_iter_ < gb.max_iter
else:
assert gb.n_iter_ == gb.max_iter


@pytest.mark.parametrize(
'scores, n_iter_no_change, tol, stopping',
[
@@ -170,7 +197,7 @@ def test_binning_train_validation_are_separated():
rng = np.random.RandomState(0)
validation_fraction = .2
gb = HistGradientBoostingClassifier(
n_iter_no_change=5,
early_stopping=True,
validation_fraction=validation_fraction,
random_state=rng
)
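The parametrized tests above exercise early stopping both with the faster 'loss' default and with arbitrary scorers. A minimal sketch of the scorer path, under the same illustrative assumptions as the earlier snippets:

from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=5000, random_state=0)

reg = HistGradientBoostingRegressor(
    early_stopping=True,
    scoring='neg_mean_squared_error',  # a scorer string instead of the 'loss' default
    validation_fraction=0.1,
    n_iter_no_change=5,
    tol=1e-7,
    max_iter=200,
    random_state=0,
)
reg.fit(X, y)
print(reg.n_iter_)  # typically stops well before 200 once the validation score plateaus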