From b1bc2deb8119ce4f2314ad5b88658bd9b09a1507 Mon Sep 17 00:00:00 2001 From: "johann.faouzi" Date: Tue, 30 Jul 2019 09:05:31 +0200 Subject: [PATCH 01/33] Update docstrings and change default value --- .../_hist_gradient_boosting/gradient_boosting.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index dc040ed1fa409..5df5102392319 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -622,7 +622,9 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): for big datasets (n_samples >= 10 000). The input data ``X`` is pre-binned into integer-valued bins, which considerably reduces the number of splitting points to consider, and allows the algorithm to leverage - integer-based data structures. For small sample sizes, + integer-based data structures. Early stopping is the default behavior, as + it usually makes the fitting process much faster without a substantial + difference in terms of predictive performance. For small sample sizes, :class:`GradientBoostingRegressor` might be preferred since binning may lead to split points that are too approximate in this setting. @@ -690,7 +692,7 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): Proportion (or absolute size) of training data to set aside as validation data for early stopping. If None, early stopping is done on the training data. Only used if ``n_iter_no_change`` is not None. - n_iter_no_change : int or None, optional (default=None) + n_iter_no_change : int or None, optional (default=10) Used to determine when to "early stop". The fitting process is stopped when none of the last ``n_iter_no_change`` scores are better than the ``n_iter_no_change - 1`` -th-to-last one, up to some @@ -747,7 +749,7 @@ def __init__(self, loss='least_squares', learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0., max_bins=256, warm_start=False, scoring=None, validation_fraction=0.1, - n_iter_no_change=None, tol=1e-7, verbose=0, + n_iter_no_change=10, tol=1e-7, verbose=0, random_state=None): super(HistGradientBoostingRegressor, self).__init__( loss=loss, learning_rate=learning_rate, max_iter=max_iter, @@ -795,7 +797,9 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, for big datasets (n_samples >= 10 000). The input data ``X`` is pre-binned into integer-valued bins, which considerably reduces the number of splitting points to consider, and allows the algorithm to leverage - integer-based data structures. For small sample sizes, + integer-based data structures. Early stopping is the default behavior, as + it usually makes the fitting process much faster without a substantial + difference in terms of predictive performance. For small sample sizes, :class:`GradientBoostingClassifier` might be preferred since binning may lead to split points that are too approximate in this setting. @@ -865,7 +869,7 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, Proportion (or absolute size) of training data to set aside as validation data for early stopping. If None, early stopping is done on the training data. - n_iter_no_change : int or None, optional (default=None) + n_iter_no_change : int or None, optional (default=10) Used to determine when to "early stop". 
The fitting process is stopped when none of the last ``n_iter_no_change`` scores are better than the ``n_iter_no_change - 1`` -th-to-last one, up to some @@ -923,7 +927,7 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, def __init__(self, loss='auto', learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0., max_bins=256, warm_start=False, - scoring=None, validation_fraction=0.1, n_iter_no_change=None, + scoring=None, validation_fraction=0.1, n_iter_no_change=10, tol=1e-7, verbose=0, random_state=None): super(HistGradientBoostingClassifier, self).__init__( loss=loss, learning_rate=learning_rate, max_iter=max_iter, From c30e4d99e7b810e4bfe8a30f5eee9f6b00ba3907 Mon Sep 17 00:00:00 2001 From: "johann.faouzi" Date: Tue, 30 Jul 2019 09:39:37 +0200 Subject: [PATCH 02/33] Disable early stopping for some tests --- .../_hist_gradient_boosting/tests/test_warm_start.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py index 806ad94ccee98..b1ef4a292c4b6 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py @@ -37,7 +37,8 @@ def test_max_iter_with_warm_start_validation(GradientBoosting, X, y): # is smaller than the number of iterations from the previous fit when warm # start is True. - estimator = GradientBoosting(max_iter=50, warm_start=True) + estimator = GradientBoosting(max_iter=50, warm_start=True, + n_iter_no_change=None) estimator.fit(X, y) estimator.set_params(max_iter=25) err_msg = ('max_iter=25 must be larger than or equal to n_iter_=50 ' @@ -76,7 +77,7 @@ def test_warm_start_yields_identical_results(GradientBoosting, X, y): def test_warm_start_max_depth(GradientBoosting, X, y): # Test if possible to fit trees of different depth in ensemble. 
gb = GradientBoosting(max_iter=100, min_samples_leaf=1, - warm_start=True, max_depth=2) + warm_start=True, max_depth=2, n_iter_no_change=None) gb.fit(X, y) gb.set_params(max_iter=110, max_depth=3) gb.fit(X, y) @@ -115,11 +116,12 @@ def test_warm_start_early_stopping(GradientBoosting, X, y): ]) def test_warm_start_equal_n_estimators(GradientBoosting, X, y): # Test if warm start with equal n_estimators does nothing - gb_1 = GradientBoosting(max_depth=2) + gb_1 = GradientBoosting(max_depth=2, n_iter_no_change=None) gb_1.fit(X, y) gb_2 = clone(gb_1) - gb_2.set_params(max_iter=gb_1.max_iter, warm_start=True) + gb_2.set_params(max_iter=gb_1.max_iter, warm_start=True, + n_iter_no_change=None) gb_2.fit(X, y) # Check that both predictors are equal From a196c5ea0fa07866be7f37cb10f96ccc78698456 Mon Sep 17 00:00:00 2001 From: "johann.faouzi" Date: Tue, 30 Jul 2019 09:40:21 +0200 Subject: [PATCH 03/33] Check that early stopping is enabled by default --- .../tests/test_gradient_boosting.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index be7e424a844bc..b69f6743cd178 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -44,7 +44,8 @@ def test_init_parameters_validation(GradientBoosting, X, y, params, err_msg): def test_invalid_classification_loss(): - binary_clf = HistGradientBoostingClassifier(loss="binary_crossentropy") + binary_clf = HistGradientBoostingClassifier( + loss="binary_crossentropy", n_iter_no_change=None) err_msg = ("loss='binary_crossentropy' is not defined for multiclass " "classification with n_classes=3, use " "loss='categorical_crossentropy' instead") @@ -227,6 +228,18 @@ def test_infinite_values(): X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1) y = np.array([0, 0, 1, 1]) - gbdt = HistGradientBoostingRegressor(min_samples_leaf=1) + gbdt = HistGradientBoostingRegressor(min_samples_leaf=1, + n_iter_no_change=None) gbdt.fit(X, y) np.testing.assert_allclose(gbdt.predict(X), y, atol=1e-4) + + +@pytest.mark.parametrize('GradientBoosting, X, y', [ + (HistGradientBoostingClassifier, X_classification, y_classification), + (HistGradientBoostingRegressor, X_regression, y_regression) +]) +def test_early_stopping_default(GradientBoosting, X, y): + # Test that early stopping is enabled by default + gb = GradientBoosting(max_iter=200) + gb.fit(X, y) + assert gb.n_iter_ < gb.max_iter From b84c7e2bdb425083837636ecea45bdd843a0759a Mon Sep 17 00:00:00 2001 From: "johann.faouzi" Date: Tue, 30 Jul 2019 09:40:33 +0200 Subject: [PATCH 04/33] Fix the random state in the examples --- .../ensemble/_hist_gradient_boosting/gradient_boosting.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 5df5102392319..a5d2ab1ffcfbf 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -738,9 +738,9 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): >>> from sklearn.ensemble import HistGradientBoostingRegressor >>> from sklearn.datasets import load_boston >>> X, y = load_boston(return_X_y=True) - >>> est = HistGradientBoostingRegressor().fit(X, 
y) + >>> est = HistGradientBoostingRegressor(random_state=42).fit(X, y) >>> est.score(X, y) - 0.98... + 0.95... """ _VALID_LOSSES = ('least_squares',) @@ -916,9 +916,9 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, >>> from sklearn.ensemble import HistGradientBoostingRegressor >>> from sklearn.datasets import load_iris >>> X, y = load_iris(return_X_y=True) - >>> clf = HistGradientBoostingClassifier().fit(X, y) + >>> clf = HistGradientBoostingClassifier(random_state=42).fit(X, y) >>> clf.score(X, y) - 1.0 + 0.98... """ _VALID_LOSSES = ('binary_crossentropy', 'categorical_crossentropy', From c02faeaa6a767e3db677b0db204c418a62aa723d Mon Sep 17 00:00:00 2001 From: "johann.faouzi" Date: Wed, 31 Jul 2019 10:01:38 +0200 Subject: [PATCH 05/33] Move sentence at the end of the paragraph --- .../_hist_gradient_boosting/gradient_boosting.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index a5d2ab1ffcfbf..09a638cf4e834 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -622,12 +622,12 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): for big datasets (n_samples >= 10 000). The input data ``X`` is pre-binned into integer-valued bins, which considerably reduces the number of splitting points to consider, and allows the algorithm to leverage - integer-based data structures. Early stopping is the default behavior, as - it usually makes the fitting process much faster without a substantial - difference in terms of predictive performance. For small sample sizes, + integer-based data structures. For small sample sizes, :class:`GradientBoostingRegressor` might be preferred since binning may lead to split points that are too - approximate in this setting. + approximate in this setting. Early stopping is the default behavior, as + it usually makes the fitting process much faster without a substantial + difference in terms of predictive performance. This implementation is inspired by `LightGBM `_. @@ -797,12 +797,12 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, for big datasets (n_samples >= 10 000). The input data ``X`` is pre-binned into integer-valued bins, which considerably reduces the number of splitting points to consider, and allows the algorithm to leverage - integer-based data structures. Early stopping is the default behavior, as - it usually makes the fitting process much faster without a substantial - difference in terms of predictive performance. For small sample sizes, + integer-based data structures. For small sample sizes, :class:`GradientBoostingClassifier` might be preferred since binning may lead to split points that are too - approximate in this setting. + approximate in this setting. Early stopping is the default behavior, as + it usually makes the fitting process much faster without a substantial + difference in terms of predictive performance. This implementation is inspired by `LightGBM `_. 
From de41ab38c72ef41c2747518fb7fd707a378687d7 Mon Sep 17 00:00:00 2001 From: "johann.faouzi" Date: Wed, 31 Jul 2019 10:12:23 +0200 Subject: [PATCH 06/33] Disable early stopping in test_estimators --- sklearn/utils/estimator_checks.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index c8a82bc8e623f..bf930af2f9307 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -384,6 +384,9 @@ def set_checking_parameters(estimator): # The default min_samples_leaf (20) isn't appropriate for small # datasets (only very shallow trees are built) that the checks use. estimator.set_params(min_samples_leaf=5) + # Early stopping is not appropriate for some tests in test_estimators + # because the actual training set is smaller than the given data + estimator.set_params(n_iter_no_change=None) # Speed-up by reducing the number of CV or splits for CV estimators loo_cv = ['RidgeCV'] From 9e83463c4f24e632db2a91e5886611abd9a2bc01 Mon Sep 17 00:00:00 2001 From: "johann.faouzi" Date: Wed, 31 Jul 2019 10:39:49 +0200 Subject: [PATCH 07/33] Disable early stopping in partial dependence tests --- sklearn/inspection/tests/test_partial_dependence.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 19399224e07ba..26681b2579947 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -158,8 +158,10 @@ def test_grid_from_X_error(grid_resolution, percentiles, err_msg): (LinearRegression(), 'brute'), (GradientBoostingRegressor(random_state=0), 'brute'), (GradientBoostingRegressor(random_state=0), 'recursion'), - (HistGradientBoostingRegressor(random_state=0), 'brute'), - (HistGradientBoostingRegressor(random_state=0), 'recursion')] + (HistGradientBoostingRegressor(random_state=0, n_iter_no_change=None), + 'brute'), + (HistGradientBoostingRegressor(random_state=0, n_iter_no_change=None), + 'recursion')] ) def test_partial_dependence_helpers(est, method, target_feature): # Check that what is returned by _partial_dependence_brute or From 6f332d0fd6d705259c86f9a45ba7d912f064b5f9 Mon Sep 17 00:00:00 2001 From: "johann.faouzi" Date: Wed, 31 Jul 2019 16:34:58 +0200 Subject: [PATCH 08/33] Move the new test next to the others tests regarding ES --- .../tests/test_gradient_boosting.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index b69f6743cd178..327b0dbd9147a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -128,6 +128,17 @@ def test_early_stopping_classification(data, scoring, validation_fraction, assert gb.n_iter_ == max_iter +@pytest.mark.parametrize('GradientBoosting, X, y', [ + (HistGradientBoostingClassifier, X_classification, y_classification), + (HistGradientBoostingRegressor, X_regression, y_regression) +]) +def test_early_stopping_default(GradientBoosting, X, y): + # Test that early stopping is enabled by default + gb = GradientBoosting(max_iter=200) + gb.fit(X, y) + assert gb.n_iter_ < gb.max_iter + + @pytest.mark.parametrize( 'scores, n_iter_no_change, tol, stopping', [ @@ -232,14 +243,3 @@ def 
test_infinite_values(): n_iter_no_change=None) gbdt.fit(X, y) np.testing.assert_allclose(gbdt.predict(X), y, atol=1e-4) - - -@pytest.mark.parametrize('GradientBoosting, X, y', [ - (HistGradientBoostingClassifier, X_classification, y_classification), - (HistGradientBoostingRegressor, X_regression, y_regression) -]) -def test_early_stopping_default(GradientBoosting, X, y): - # Test that early stopping is enabled by default - gb = GradientBoosting(max_iter=200) - gb.fit(X, y) - assert gb.n_iter_ < gb.max_iter From 5764facd4dd1f69d83b95b38d9dd7e2313a45ee4 Mon Sep 17 00:00:00 2001 From: "johann.faouzi" Date: Tue, 6 Aug 2019 15:36:31 +0200 Subject: [PATCH 09/33] Update the docstrings and the init for both classes --- .../gradient_boosting.py | 66 +++++++------------ 1 file changed, 25 insertions(+), 41 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 9fc5ed28049f8..e09de478e2ea5 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -619,19 +619,7 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): This estimator is much faster than :class:`GradientBoostingRegressor` -<<<<<<< HEAD - for big datasets (n_samples >= 10 000). The input data ``X`` is pre-binned - into integer-valued bins, which considerably reduces the number of - splitting points to consider, and allows the algorithm to leverage - integer-based data structures. For small sample sizes, - :class:`GradientBoostingRegressor` - might be preferred since binning may lead to split points that are too - approximate in this setting. Early stopping is the default behavior, as - it usually makes the fitting process much faster without a substantial - difference in terms of predictive performance. -======= for big datasets (n_samples >= 10 000). ->>>>>>> c64ee34a01ded919fc7fe3ad800260029624433b This implementation is inspired by `LightGBM `_. @@ -682,6 +670,10 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): allows for a much faster training stage. Features with a small number of unique values may use less than ``max_bins`` bins. Must be no larger than 256. + early_stopping : 'auto' or bool (default='auto') + If 'auto', early stopping is enabled if the sample size is larger than + 1000. If True, early stopping is enabled, otherwise early stopping is + disabled. warm_start : bool, optional (default=False) When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble. For results to be valid, the @@ -697,11 +689,11 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): Proportion (or absolute size) of training data to set aside as validation data for early stopping. If None, early stopping is done on the training data. Only used if ``n_iter_no_change`` is not None. - n_iter_no_change : int or None, optional (default=10) + n_iter_no_change : int, optional (default=10) Used to determine when to "early stop". The fitting process is stopped when none of the last ``n_iter_no_change`` scores are better than the ``n_iter_no_change - 1`` -th-to-last one, up to some - tolerance. If None or 0, no early-stopping is done. + tolerance. Ignored if ``early_stopping`` is False. tol : float or None, optional (default=1e-7) The absolute tolerance to use when comparing scores during early stopping. 
The higher the tolerance, the more likely we are to early @@ -743,9 +735,9 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): >>> from sklearn.ensemble import HistGradientBoostingRegressor >>> from sklearn.datasets import load_boston >>> X, y = load_boston(return_X_y=True) - >>> est = HistGradientBoostingRegressor(random_state=42).fit(X, y) + >>> est = HistGradientBoostingRegressor().fit(X, y) >>> est.score(X, y) - 0.95... + 0.98... """ _VALID_LOSSES = ('least_squares',) @@ -753,16 +745,16 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): def __init__(self, loss='least_squares', learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0., max_bins=256, - warm_start=False, scoring=None, validation_fraction=0.1, - n_iter_no_change=10, tol=1e-7, verbose=0, - random_state=None): + warm_start=False, early_stopping='auto', scoring=None, + validation_fraction=0.1, n_iter_no_change=10, tol=1e-7, + verbose=0, random_state=None): super(HistGradientBoostingRegressor, self).__init__( loss=loss, learning_rate=learning_rate, max_iter=max_iter, max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, l2_regularization=l2_regularization, max_bins=max_bins, - warm_start=warm_start, scoring=scoring, - validation_fraction=validation_fraction, + warm_start=warm_start, early_stopping=early_stopping, + scoring=scoring, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose, random_state=random_state) @@ -799,19 +791,7 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, This estimator is much faster than :class:`GradientBoostingClassifier` -<<<<<<< HEAD - for big datasets (n_samples >= 10 000). The input data ``X`` is pre-binned - into integer-valued bins, which considerably reduces the number of - splitting points to consider, and allows the algorithm to leverage - integer-based data structures. For small sample sizes, - :class:`GradientBoostingClassifier` - might be preferred since binning may lead to split points that are too - approximate in this setting. Early stopping is the default behavior, as - it usually makes the fitting process much faster without a substantial - difference in terms of predictive performance. -======= for big datasets (n_samples >= 10 000). ->>>>>>> c64ee34a01ded919fc7fe3ad800260029624433b This implementation is inspired by `LightGBM `_. @@ -865,6 +845,10 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, allows for a much faster training stage. Features with a small number of unique values may use less than ``max_bins`` bins. Must be no larger than 256. + early_stopping : 'auto' or bool (default='auto') + If 'auto', early stopping is enabled if the sample size is larger than + 1000. If True, early stopping is enabled, otherwise early stopping is + disabled. warm_start : bool, optional (default=False) When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble. For results to be valid, the @@ -880,11 +864,11 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, Proportion (or absolute size) of training data to set aside as validation data for early stopping. If None, early stopping is done on the training data. - n_iter_no_change : int or None, optional (default=10) + n_iter_no_change : int, optional (default=10) Used to determine when to "early stop". 
The fitting process is stopped when none of the last ``n_iter_no_change`` scores are better than the ``n_iter_no_change - 1`` -th-to-last one, up to some - tolerance. If None or 0, no early-stopping is done. + tolerance. Ignored if ``early_stopping`` is False. tol : float or None, optional (default=1e-7) The absolute tolerance to use when comparing scores. The higher the tolerance, the more likely we are to early stop: higher tolerance @@ -927,9 +911,9 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, >>> from sklearn.ensemble import HistGradientBoostingRegressor >>> from sklearn.datasets import load_iris >>> X, y = load_iris(return_X_y=True) - >>> clf = HistGradientBoostingClassifier(random_state=42).fit(X, y) + >>> clf = HistGradientBoostingClassifier().fit(X, y) >>> clf.score(X, y) - 0.98... + 1.0 """ _VALID_LOSSES = ('binary_crossentropy', 'categorical_crossentropy', @@ -938,15 +922,15 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, def __init__(self, loss='auto', learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0., max_bins=256, warm_start=False, - scoring=None, validation_fraction=0.1, n_iter_no_change=10, - tol=1e-7, verbose=0, random_state=None): + early_stopping='auto', scoring=None, validation_fraction=0.1, + n_iter_no_change=10, tol=1e-7, verbose=0, random_state=None): super(HistGradientBoostingClassifier, self).__init__( loss=loss, learning_rate=learning_rate, max_iter=max_iter, max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, l2_regularization=l2_regularization, max_bins=max_bins, - warm_start=warm_start, scoring=scoring, - validation_fraction=validation_fraction, + warm_start=warm_start, early_stopping=early_stopping, + scoring=scoring, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose, random_state=random_state) From 42f3d954ebf68df2858ed008a5b9ebfdf6e47f74 Mon Sep 17 00:00:00 2001 From: "johann.faouzi" Date: Tue, 6 Aug 2019 15:38:05 +0200 Subject: [PATCH 10/33] Swap warm_start and early_stopping --- .../_hist_gradient_boosting/gradient_boosting.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index e09de478e2ea5..8421aa29a8267 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -745,7 +745,7 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): def __init__(self, loss='least_squares', learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0., max_bins=256, - warm_start=False, early_stopping='auto', scoring=None, + early_stopping='auto', warm_start=False, scoring=None, validation_fraction=0.1, n_iter_no_change=10, tol=1e-7, verbose=0, random_state=None): super(HistGradientBoostingRegressor, self).__init__( @@ -753,7 +753,7 @@ def __init__(self, loss='least_squares', learning_rate=0.1, max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, l2_regularization=l2_regularization, max_bins=max_bins, - warm_start=warm_start, early_stopping=early_stopping, + early_stopping=early_stopping, warm_start=warm_start, scoring=scoring, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose, 
random_state=random_state) @@ -921,15 +921,15 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, def __init__(self, loss='auto', learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, - l2_regularization=0., max_bins=256, warm_start=False, - early_stopping='auto', scoring=None, validation_fraction=0.1, + l2_regularization=0., max_bins=256, early_stopping='auto', + warm_start=False, scoring=None, validation_fraction=0.1, n_iter_no_change=10, tol=1e-7, verbose=0, random_state=None): super(HistGradientBoostingClassifier, self).__init__( loss=loss, learning_rate=learning_rate, max_iter=max_iter, max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, l2_regularization=l2_regularization, max_bins=max_bins, - warm_start=warm_start, early_stopping=early_stopping, + early_stopping=early_stopping, warm_start=warm_start, scoring=scoring, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose, random_state=random_state) From 16a83e83dd94018d26d2cf3e8ce4320dfe79cac1 Mon Sep 17 00:00:00 2001 From: "johann.faouzi" Date: Tue, 6 Aug 2019 15:40:25 +0200 Subject: [PATCH 11/33] Update validation_fraction documentation --- .../_hist_gradient_boosting/gradient_boosting.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 8421aa29a8267..630732eedbaa4 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -27,8 +27,8 @@ class BaseHistGradientBoosting(BaseEstimator, ABC): @abstractmethod def __init__(self, loss, learning_rate, max_iter, max_leaf_nodes, max_depth, min_samples_leaf, l2_regularization, max_bins, - warm_start, scoring, validation_fraction, n_iter_no_change, - tol, verbose, random_state): + early_stopping, warm_start, scoring, validation_fraction, + n_iter_no_change, tol, verbose, random_state): self.loss = loss self.learning_rate = learning_rate self.max_iter = max_iter @@ -37,6 +37,7 @@ def __init__(self, loss, learning_rate, max_iter, max_leaf_nodes, self.min_samples_leaf = min_samples_leaf self.l2_regularization = l2_regularization self.max_bins = max_bins + self.early_stopping = early_stopping self.warm_start = warm_start self.scoring = scoring self.validation_fraction = validation_fraction @@ -121,9 +122,6 @@ def fit(self, X, y): self.loss_ = self._get_loss() - self.do_early_stopping_ = (self.n_iter_no_change is not None and - self.n_iter_no_change > 0) - # create validation data if needed self._use_validation_data = self.validation_fraction is not None if self.do_early_stopping_ and self._use_validation_data: @@ -688,7 +686,7 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): validation_fraction : int or float or None, optional (default=0.1) Proportion (or absolute size) of training data to set aside as validation data for early stopping. If None, early stopping is done on - the training data. Only used if ``n_iter_no_change`` is not None. + the training data. Only used if ``early_stopping`` is True. n_iter_no_change : int, optional (default=10) Used to determine when to "early stop". 
The fitting process is stopped when none of the last ``n_iter_no_change`` scores are better @@ -863,7 +861,7 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, validation_fraction : int or float or None, optional (default=0.1) Proportion (or absolute size) of training data to set aside as validation data for early stopping. If None, early stopping is done on - the training data. + the training data. Only used if ``early_stopping`` is True. n_iter_no_change : int, optional (default=10) Used to determine when to "early stop". The fitting process is stopped when none of the last ``n_iter_no_change`` scores are better From 1e993d285b79cac142ffcd8d064c37f990d1edae Mon Sep 17 00:00:00 2001 From: "johann.faouzi" Date: Tue, 6 Aug 2019 15:45:58 +0200 Subject: [PATCH 12/33] Update docstrings with early stopping --- .../_hist_gradient_boosting/gradient_boosting.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 630732eedbaa4..424df34ace4c3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -122,6 +122,9 @@ def fit(self, X, y): self.loss_ = self._get_loss() + self.do_early_stopping_ = (self.n_iter_no_change is not None and + self.n_iter_no_change > 0) + # create validation data if needed self._use_validation_data = self.validation_fraction is not None if self.do_early_stopping_ and self._use_validation_data: @@ -682,16 +685,16 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): string (see :ref:`scoring_parameter`) or a callable (see :ref:`scoring`). If None, the estimator's default scorer is used. If ``scoring='loss'``, early stopping is checked w.r.t the loss value. - Only used if ``n_iter_no_change`` is not None. + Only used if early stopping is performed. validation_fraction : int or float or None, optional (default=0.1) Proportion (or absolute size) of training data to set aside as validation data for early stopping. If None, early stopping is done on - the training data. Only used if ``early_stopping`` is True. + the training data. Only used if early stopping is performed. n_iter_no_change : int, optional (default=10) Used to determine when to "early stop". The fitting process is stopped when none of the last ``n_iter_no_change`` scores are better than the ``n_iter_no_change - 1`` -th-to-last one, up to some - tolerance. Ignored if ``early_stopping`` is False. + tolerance. Only used if early stopping is performed. tol : float or None, optional (default=1e-7) The absolute tolerance to use when comparing scores during early stopping. The higher the tolerance, the more likely we are to early @@ -857,16 +860,16 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, string (see :ref:`scoring_parameter`) or a callable (see :ref:`scoring`). If None, the estimator's default scorer is used. If ``scoring='loss'``, early stopping is checked - w.r.t the loss value. Only used if ``n_iter_no_change`` is not None. + w.r.t the loss value. Only used if early stopping is performed. validation_fraction : int or float or None, optional (default=0.1) Proportion (or absolute size) of training data to set aside as validation data for early stopping. If None, early stopping is done on - the training data. Only used if ``early_stopping`` is True. + the training data. Only used if early stopping is performed. 
n_iter_no_change : int, optional (default=10) Used to determine when to "early stop". The fitting process is stopped when none of the last ``n_iter_no_change`` scores are better than the ``n_iter_no_change - 1`` -th-to-last one, up to some - tolerance. Ignored if ``early_stopping`` is False. + tolerance. Only used if early stopping is performed. tol : float or None, optional (default=1e-7) The absolute tolerance to use when comparing scores. The higher the tolerance, the more likely we are to early stop: higher tolerance From e08dc0bb047ac03cf549cb99df0b3b34e7710278 Mon Sep 17 00:00:00 2001 From: "johann.faouzi" Date: Tue, 6 Aug 2019 16:29:06 +0200 Subject: [PATCH 13/33] Remove n_iter_no_cjange in set_params --- sklearn/utils/estimator_checks.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 10f5f4d76dfa8..c17d8f44bf8c2 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -388,9 +388,6 @@ def set_checking_parameters(estimator): # The default min_samples_leaf (20) isn't appropriate for small # datasets (only very shallow trees are built) that the checks use. estimator.set_params(min_samples_leaf=5) - # Early stopping is not appropriate for some tests in test_estimators - # because the actual training set is smaller than the given data - estimator.set_params(n_iter_no_change=None) # Speed-up by reducing the number of CV or splits for CV estimators loo_cv = ['RidgeCV'] From 30c7139eff86ecc657f31e6be1a78b897da03125 Mon Sep 17 00:00:00 2001 From: "johann.faouzi" Date: Tue, 6 Aug 2019 16:40:32 +0200 Subject: [PATCH 14/33] Update the tests in _hist... --- .../gradient_boosting.py | 11 ++-- .../tests/test_gradient_boosting.py | 66 +++++++++++-------- .../tests/test_warm_start.py | 35 +++++----- 3 files changed, 61 insertions(+), 51 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 424df34ace4c3..48646247b040a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -64,7 +64,7 @@ def _validate_parameters(self): if self.max_iter < 1: raise ValueError('max_iter={} must not be smaller ' 'than 1.'.format(self.max_iter)) - if self.n_iter_no_change is not None and self.n_iter_no_change < 0: + if self.n_iter_no_change < 0: raise ValueError('n_iter_no_change={} must be ' 'positive.'.format(self.n_iter_no_change)) if (self.validation_fraction is not None and @@ -108,7 +108,7 @@ def fit(self, X, y): self._rng = rng self._validate_parameters() - self.n_features_ = X.shape[1] # used for validation in predict() + n_samples, self.n_features_ = X.shape # used for validation in predict # we need this stateful variable to tell raw_predict() that it was # called from fit() (this current method), and that the data it has @@ -121,9 +121,10 @@ def fit(self, X, y): self._in_fit = True self.loss_ = self._get_loss() - - self.do_early_stopping_ = (self.n_iter_no_change is not None and - self.n_iter_no_change > 0) + if self.early_stopping == 'auto': + self.do_early_stopping_ = n_samples > 1000 + else: + self.do_early_stopping_ = self.early_stopping # create validation data if needed self._use_validation_data = self.validation_fraction is not None diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 
327b0dbd9147a..8676483339f55 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -45,7 +45,7 @@ def test_init_parameters_validation(GradientBoosting, X, y, params, err_msg): def test_invalid_classification_loss(): binary_clf = HistGradientBoostingClassifier( - loss="binary_crossentropy", n_iter_no_change=None) + loss="binary_crossentropy") err_msg = ("loss='binary_crossentropy' is not defined for multiclass " "classification with n_classes=3, use " "loss='categorical_crossentropy' instead") @@ -54,27 +54,28 @@ def test_invalid_classification_loss(): @pytest.mark.parametrize( - 'scoring, validation_fraction, n_iter_no_change, tol', [ - ('neg_mean_squared_error', .1, 5, 1e-7), # use scorer - ('neg_mean_squared_error', None, 5, 1e-1), # use scorer on train data - (None, .1, 5, 1e-7), # same with default scorer - (None, None, 5, 1e-1), - ('loss', .1, 5, 1e-7), # use loss - ('loss', None, 5, 1e-1), # use loss on training data - (None, None, None, None), # no early stopping + 'scoring, validation_fraction, early_stopping, n_iter_no_change, tol', [ + ('neg_mean_squared_error', .1, True, 5, 1e-7), # use scorer + ('neg_mean_squared_error', None, True, 5, 1e-1), # use scorer on train + (None, .1, True, 5, 1e-7), # same with default scorer + (None, None, True, 5, 1e-1), + ('loss', .1, True, 5, 1e-7), # use loss + ('loss', None, True, 5, 1e-1), # use loss on training data + (None, None, False, 5, None), # no early stopping ]) def test_early_stopping_regression(scoring, validation_fraction, - n_iter_no_change, tol): + early_stopping, n_iter_no_change, tol): max_iter = 200 X, y = make_regression(n_samples=50, random_state=0) gb = HistGradientBoostingRegressor( - verbose=1, # just for coverage + verbose=0, # just for coverage min_samples_leaf=5, # easier to overfit fast scoring=scoring, tol=tol, + early_stopping=early_stopping, validation_fraction=validation_fraction, max_iter=max_iter, n_iter_no_change=n_iter_no_change, @@ -82,7 +83,7 @@ def test_early_stopping_regression(scoring, validation_fraction, ) gb.fit(X, y) - if n_iter_no_change is not None: + if early_stopping is True: assert n_iter_no_change <= gb.n_iter_ < max_iter else: assert gb.n_iter_ == max_iter @@ -94,27 +95,28 @@ def test_early_stopping_regression(scoring, validation_fraction, random_state=0) )) @pytest.mark.parametrize( - 'scoring, validation_fraction, n_iter_no_change, tol', [ - ('accuracy', .1, 5, 1e-7), # use scorer - ('accuracy', None, 5, 1e-1), # use scorer on training data - (None, .1, 5, 1e-7), # same with default scorerscor - (None, None, 5, 1e-1), - ('loss', .1, 5, 1e-7), # use loss - ('loss', None, 5, 1e-1), # use loss on training data - (None, None, None, None), # no early stopping + 'scoring, validation_fraction, early_stopping, n_iter_no_change, tol', [ + ('accuracy', .1, True, 5, 1e-7), # use scorer + ('accuracy', None, True, 5, 1e-1), # use scorer on training data + (None, .1, True, 5, 1e-7), # same with default scorer + (None, None, True, 5, 1e-1), + ('loss', .1, True, 5, 1e-7), # use loss + ('loss', None, True, 5, 1e-1), # use loss on training data + (None, None, False, 5, None), # no early stopping ]) def test_early_stopping_classification(data, scoring, validation_fraction, - n_iter_no_change, tol): + early_stopping, n_iter_no_change, tol): max_iter = 50 X, y = data gb = HistGradientBoostingClassifier( - verbose=1, # just for coverage + verbose=0, # just for coverage min_samples_leaf=5, # easier to 
overfit fast scoring=scoring, tol=tol, + early_stopping=early_stopping, validation_fraction=validation_fraction, max_iter=max_iter, n_iter_no_change=n_iter_no_change, @@ -122,7 +124,7 @@ def test_early_stopping_classification(data, scoring, validation_fraction, ) gb.fit(X, y) - if n_iter_no_change is not None: + if early_stopping is True: assert n_iter_no_change <= gb.n_iter_ < max_iter else: assert gb.n_iter_ == max_iter @@ -130,13 +132,19 @@ def test_early_stopping_classification(data, scoring, validation_fraction, @pytest.mark.parametrize('GradientBoosting, X, y', [ (HistGradientBoostingClassifier, X_classification, y_classification), - (HistGradientBoostingRegressor, X_regression, y_regression) + (HistGradientBoostingClassifier, *make_classification(n_samples=1001)), + (HistGradientBoostingRegressor, X_regression, y_regression), + (HistGradientBoostingRegressor, *make_regression(n_samples=1001)) ]) def test_early_stopping_default(GradientBoosting, X, y): - # Test that early stopping is enabled by default + # Test that early stopping is enabled by default if and only if there + # are more than 1000 samples gb = GradientBoosting(max_iter=200) gb.fit(X, y) - assert gb.n_iter_ < gb.max_iter + if X.shape[0] > 1000: + assert gb.n_iter_ < gb.max_iter + else: + assert gb.n_iter_ == gb.max_iter @pytest.mark.parametrize( @@ -169,7 +177,7 @@ def test_binning_train_validation_are_separated(): rng = np.random.RandomState(0) validation_fraction = .2 gb = HistGradientBoostingClassifier( - n_iter_no_change=5, + early_stopping=True, validation_fraction=validation_fraction, random_state=rng ) @@ -215,7 +223,7 @@ def test_small_trainset(): y = [[class_] * int(prop * n_samples) for (class_, prop) in original_distrib.items()] y = shuffle(np.concatenate(y)) - gb = HistGradientBoostingClassifier() + gb = HistGradientBoostingClassifier(early_stopping=False) # Compute the small training set X_small, y_small = gb._get_small_trainset(X, y, seed=42) @@ -240,6 +248,6 @@ def test_infinite_values(): y = np.array([0, 0, 1, 1]) gbdt = HistGradientBoostingRegressor(min_samples_leaf=1, - n_iter_no_change=None) + early_stopping=False) gbdt.fit(X, y) np.testing.assert_allclose(gbdt.predict(X), y, atol=1e-4) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py index b1ef4a292c4b6..0f3c04f4d6494 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py @@ -37,11 +37,11 @@ def test_max_iter_with_warm_start_validation(GradientBoosting, X, y): # is smaller than the number of iterations from the previous fit when warm # start is True. - estimator = GradientBoosting(max_iter=50, warm_start=True, - n_iter_no_change=None) + estimator = GradientBoosting(max_iter=10, early_stopping=False, + warm_start=True) estimator.fit(X, y) - estimator.set_params(max_iter=25) - err_msg = ('max_iter=25 must be larger than or equal to n_iter_=50 ' + estimator.set_params(max_iter=5) + err_msg = ('max_iter=5 must be larger than or equal to n_iter_=10 ' 'when warm_start==True') with pytest.raises(ValueError, match=err_msg): estimator.fit(X, y) @@ -76,14 +76,14 @@ def test_warm_start_yields_identical_results(GradientBoosting, X, y): ]) def test_warm_start_max_depth(GradientBoosting, X, y): # Test if possible to fit trees of different depth in ensemble. 
- gb = GradientBoosting(max_iter=100, min_samples_leaf=1, - warm_start=True, max_depth=2, n_iter_no_change=None) + gb = GradientBoosting(max_iter=20, min_samples_leaf=1, + warm_start=True, max_depth=2, early_stopping=False) gb.fit(X, y) - gb.set_params(max_iter=110, max_depth=3) + gb.set_params(max_iter=30, max_depth=3, n_iter_no_change=110) gb.fit(X, y) - # First 100 trees have max_depth == 2 - for i in range(100): + # First 20 trees have max_depth == 2 + for i in range(20): assert gb._predictors[i][0].get_max_depth() == 2 # Last 10 trees have max_depth == 3 for i in range(1, 11): @@ -100,8 +100,8 @@ def test_warm_start_early_stopping(GradientBoosting, X, y): n_iter_no_change = 5 gb = GradientBoosting( - n_iter_no_change=n_iter_no_change, max_iter=10000, - random_state=42, warm_start=True, tol=1e-3 + n_iter_no_change=n_iter_no_change, max_iter=200, + random_state=42, warm_start=True, tol=1e-3, early_stopping=False ) gb.fit(X, y) n_iter_first_fit = gb.n_iter_ @@ -116,12 +116,12 @@ def test_warm_start_early_stopping(GradientBoosting, X, y): ]) def test_warm_start_equal_n_estimators(GradientBoosting, X, y): # Test if warm start with equal n_estimators does nothing - gb_1 = GradientBoosting(max_depth=2, n_iter_no_change=None) + gb_1 = GradientBoosting(max_depth=2, n_iter_no_change=5) gb_1.fit(X, y) gb_2 = clone(gb_1) gb_2.set_params(max_iter=gb_1.max_iter, warm_start=True, - n_iter_no_change=None) + n_iter_no_change=5) gb_2.fit(X, y) # Check that both predictors are equal @@ -168,15 +168,16 @@ def _get_rng(rng_type): return np.random.RandomState(0) random_state = _get_rng(rng_type) - gb_1 = GradientBoosting(n_iter_no_change=5, max_iter=2, - random_state=random_state) + gb_1 = GradientBoosting(n_iter_no_change=5, early_stopping=True, + max_iter=2, random_state=random_state) gb_1.fit(X, y) train_val_seed_1 = gb_1._train_val_split_seed small_trainset_seed_1 = gb_1._small_trainset_seed random_state = _get_rng(rng_type) - gb_2 = GradientBoosting(n_iter_no_change=5, max_iter=2, - random_state=random_state, warm_start=True) + gb_2 = GradientBoosting(n_iter_no_change=5, early_stopping=True, + max_iter=2, random_state=random_state, + warm_start=True) gb_2.fit(X, y) # inits state train_val_seed_2 = gb_2._train_val_split_seed small_trainset_seed_2 = gb_2._small_trainset_seed From 36e9975cc3eeeebf01a572ac269422041a91ed99 Mon Sep 17 00:00:00 2001 From: "johann.faouzi" Date: Tue, 6 Aug 2019 16:42:33 +0200 Subject: [PATCH 15/33] Replace n_iter_no_change with early_stopping in test_partial_dependence --- sklearn/inspection/tests/test_partial_dependence.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 26681b2579947..3c4dfcc3381e8 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -158,9 +158,9 @@ def test_grid_from_X_error(grid_resolution, percentiles, err_msg): (LinearRegression(), 'brute'), (GradientBoostingRegressor(random_state=0), 'brute'), (GradientBoostingRegressor(random_state=0), 'recursion'), - (HistGradientBoostingRegressor(random_state=0, n_iter_no_change=None), + (HistGradientBoostingRegressor(random_state=0, early_stopping=False), 'brute'), - (HistGradientBoostingRegressor(random_state=0, n_iter_no_change=None), + (HistGradientBoostingRegressor(random_state=0, early_stopping=False), 'recursion')] ) def test_partial_dependence_helpers(est, method, target_feature): From 
081ee188838ef9167e9034ddaca5695f2e2ae802 Mon Sep 17 00:00:00 2001 From: "johann.faouzi" Date: Tue, 6 Aug 2019 16:44:01 +0200 Subject: [PATCH 16/33] Remove early_stopping in partial_dependence --- sklearn/inspection/tests/test_partial_dependence.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 3c4dfcc3381e8..16ed23a790a8b 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -158,9 +158,9 @@ def test_grid_from_X_error(grid_resolution, percentiles, err_msg): (LinearRegression(), 'brute'), (GradientBoostingRegressor(random_state=0), 'brute'), (GradientBoostingRegressor(random_state=0), 'recursion'), - (HistGradientBoostingRegressor(random_state=0, early_stopping=False), + (HistGradientBoostingRegressor(random_state=0), 'brute'), - (HistGradientBoostingRegressor(random_state=0, early_stopping=False), + (HistGradientBoostingRegressor(random_state=0), 'recursion')] ) def test_partial_dependence_helpers(est, method, target_feature): From 72681fdaf86e067007ab598985bc3b772248a18c Mon Sep 17 00:00:00 2001 From: "johann.faouzi" Date: Tue, 6 Aug 2019 16:45:13 +0200 Subject: [PATCH 17/33] One line is enough --- sklearn/inspection/tests/test_partial_dependence.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 16ed23a790a8b..19399224e07ba 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -158,10 +158,8 @@ def test_grid_from_X_error(grid_resolution, percentiles, err_msg): (LinearRegression(), 'brute'), (GradientBoostingRegressor(random_state=0), 'brute'), (GradientBoostingRegressor(random_state=0), 'recursion'), - (HistGradientBoostingRegressor(random_state=0), - 'brute'), - (HistGradientBoostingRegressor(random_state=0), - 'recursion')] + (HistGradientBoostingRegressor(random_state=0), 'brute'), + (HistGradientBoostingRegressor(random_state=0), 'recursion')] ) def test_partial_dependence_helpers(est, method, target_feature): # Check that what is returned by _partial_dependence_brute or From eda29c8a9821bdce0551034bc472281ec967ef4c Mon Sep 17 00:00:00 2001 From: Johann Faouzi Date: Thu, 22 Aug 2019 13:59:20 +0200 Subject: [PATCH 18/33] Use 10k sample threshold for auto early stopping and resolve conflicts --- .../gradient_boosting.py | 32 ++++++++++--------- .../tests/test_gradient_boosting.py | 8 ++--- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 48646247b040a..fcc0c54cc3208 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -122,7 +122,7 @@ def fit(self, X, y): self.loss_ = self._get_loss() if self.early_stopping == 'auto': - self.do_early_stopping_ = n_samples > 1000 + self.do_early_stopping_ = n_samples > 10000 else: self.do_early_stopping_ = self.early_stopping @@ -666,12 +666,13 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): l2_regularization : float, optional (default=0) The L2 regularization parameter. Use ``0`` for no regularization (default). 
- max_bins : int, optional (default=256) - The maximum number of bins to use. Before training, each feature of - the input array ``X`` is binned into at most ``max_bins`` bins, which - allows for a much faster training stage. Features with a small - number of unique values may use less than ``max_bins`` bins. Must be no - larger than 256. + max_bins : int, optional (default=255) + The maximum number of bins to use for non-missing values. Before + training, each feature of the input array `X` is binned into + integer-valued bins, which allows for a much faster training stage. + Features with a small number of unique values may use less than + ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin + is always reserved for missing values. Must be no larger than 255. early_stopping : 'auto' or bool (default='auto') If 'auto', early stopping is enabled if the sample size is larger than 1000. If True, early stopping is enabled, otherwise early stopping is @@ -746,7 +747,7 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): def __init__(self, loss='least_squares', learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, - min_samples_leaf=20, l2_regularization=0., max_bins=256, + min_samples_leaf=20, l2_regularization=0., max_bins=255, early_stopping='auto', warm_start=False, scoring=None, validation_fraction=0.1, n_iter_no_change=10, tol=1e-7, verbose=0, random_state=None): @@ -841,12 +842,13 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, since only very shallow trees would be built. l2_regularization : float, optional (default=0) The L2 regularization parameter. Use 0 for no regularization. - max_bins : int, optional (default=256) - The maximum number of bins to use. Before training, each feature of - the input array ``X`` is binned into at most ``max_bins`` bins, which - allows for a much faster training stage. Features with a small - number of unique values may use less than ``max_bins`` bins. Must be no - larger than 256. + max_bins : int, optional (default=255) + The maximum number of bins to use for non-missing values. Before + training, each feature of the input array `X` is binned into + integer-valued bins, which allows for a much faster training stage. + Features with a small number of unique values may use less than + ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin + is always reserved for missing values. Must be no larger than 255. early_stopping : 'auto' or bool (default='auto') If 'auto', early stopping is enabled if the sample size is larger than 1000. 
If True, early stopping is enabled, otherwise early stopping is @@ -923,7 +925,7 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, def __init__(self, loss='auto', learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, - l2_regularization=0., max_bins=256, early_stopping='auto', + l2_regularization=0., max_bins=255, early_stopping='auto', warm_start=False, scoring=None, validation_fraction=0.1, n_iter_no_change=10, tol=1e-7, verbose=0, random_state=None): super(HistGradientBoostingClassifier, self).__init__( diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 8676483339f55..c89a0d2079beb 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -132,16 +132,16 @@ def test_early_stopping_classification(data, scoring, validation_fraction, @pytest.mark.parametrize('GradientBoosting, X, y', [ (HistGradientBoostingClassifier, X_classification, y_classification), - (HistGradientBoostingClassifier, *make_classification(n_samples=1001)), + (HistGradientBoostingClassifier, *make_classification(n_samples=10001)), (HistGradientBoostingRegressor, X_regression, y_regression), - (HistGradientBoostingRegressor, *make_regression(n_samples=1001)) + (HistGradientBoostingRegressor, *make_regression(n_samples=10001)) ]) def test_early_stopping_default(GradientBoosting, X, y): # Test that early stopping is enabled by default if and only if there - # are more than 1000 samples + # are more than 10000 samples gb = GradientBoosting(max_iter=200) gb.fit(X, y) - if X.shape[0] > 1000: + if X.shape[0] > 10000: assert gb.n_iter_ < gb.max_iter else: assert gb.n_iter_ == gb.max_iter From 3c4cfeaca743915ee95ee5aa19dab339c86bf513 Mon Sep 17 00:00:00 2001 From: Johann Faouzi Date: Thu, 22 Aug 2019 14:51:37 +0200 Subject: [PATCH 19/33] Increase the maximum number of iterations to check early stopping --- .../_hist_gradient_boosting/tests/test_gradient_boosting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index c6a45d96303e2..a9f9e8eaf1081 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -144,7 +144,7 @@ def test_early_stopping_classification(data, scoring, validation_fraction, def test_early_stopping_default(GradientBoosting, X, y): # Test that early stopping is enabled by default if and only if there # are more than 10000 samples - gb = GradientBoosting(max_iter=200) + gb = GradientBoosting(min_samples_leaf=50, max_iter=1000) gb.fit(X, y) if X.shape[0] > 10000: assert gb.n_iter_ < gb.max_iter From 73f6756221e4cb1b3d63cdb527ea00e21aa560b0 Mon Sep 17 00:00:00 2001 From: Johann Faouzi Date: Thu, 22 Aug 2019 15:30:41 +0200 Subject: [PATCH 20/33] Fix min number of samples for early stopping in docstrings --- sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 161e3dd4311b4..b6bf86115a316 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ 
b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -704,7 +704,7 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): is always reserved for missing values. Must be no larger than 255. early_stopping : 'auto' or bool (default='auto') If 'auto', early stopping is enabled if the sample size is larger than - 1000. If True, early stopping is enabled, otherwise early stopping is + 10000. If True, early stopping is enabled, otherwise early stopping is disabled. warm_start : bool, optional (default=False) When set to ``True``, reuse the solution of the previous call to fit @@ -888,7 +888,7 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, is always reserved for missing values. Must be no larger than 255. early_stopping : 'auto' or bool (default='auto') If 'auto', early stopping is enabled if the sample size is larger than - 1000. If True, early stopping is enabled, otherwise early stopping is + 10000. If True, early stopping is enabled, otherwise early stopping is disabled. warm_start : bool, optional (default=False) When set to ``True``, reuse the solution of the previous call to fit From df72eef5d8742a2a64eb5037279119c3134b0d1a Mon Sep 17 00:00:00 2001 From: Johann Faouzi Date: Thu, 22 Aug 2019 16:51:24 +0200 Subject: [PATCH 21/33] Update ensemble.rst with new early stopping behavior --- doc/modules/ensemble.rst | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index fde8f40db6c8c..4d61a73e5a203 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -888,11 +888,13 @@ generally recommended to use as many bins as possible, which is the default. The ``l2_regularization`` parameter is a regularizer on the loss function and corresponds to :math:`\lambda` in equation (2) of [XGBoost]_. -Note that **early-stopping is enabled by default**. The early-stopping -behaviour is controlled via the ``scoring``, ``validation_fraction``, +Note that **early-stopping is enabled by default if the number of samples is +larger than 10,000**. The early-stopping behaviour is controlled via the +``early-stopping``, ``scoring``, ``validation_fraction``, ``n_iter_no_change``, and ``tol`` parameters. It is possible to early-stop using an arbitrary :term:`scorer`, or just the training or validation loss. By -default, early-stopping is performed using the default :term:`scorer` of +default, early-stopping is performed if there are at least 10,000 samples in +the training set, using the default :term:`scorer` of the estimator on a validation set. Missing values support @@ -1179,7 +1181,7 @@ The following example shows how to fit the VotingRegressor:: >>> # Loading some example data >>> X, y = load_boston(return_X_y=True) - + >>> # Training classifiers >>> reg1 = GradientBoostingRegressor(random_state=1, n_estimators=10) >>> reg2 = RandomForestRegressor(random_state=1, n_estimators=10) From 99a830f270709f2570868f1687a9f103ec9e736e Mon Sep 17 00:00:00 2001 From: Johann Faouzi Date: Tue, 3 Sep 2019 16:36:51 +0200 Subject: [PATCH 22/33] Update the user guide with the new default scoring --- doc/modules/ensemble.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 4d61a73e5a203..4783c613175c1 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -894,8 +894,7 @@ larger than 10,000**. The early-stopping behaviour is controlled via the ``n_iter_no_change``, and ``tol`` parameters. 
It is possible to early-stop using an arbitrary :term:`scorer`, or just the training or validation loss. By default, early-stopping is performed if there are at least 10,000 samples in -the training set, using the default :term:`scorer` of -the estimator on a validation set. +the training set, using the validation loss. Missing values support ---------------------- From f019d47fec92e3bb4ac93a13f439816c586fa325 Mon Sep 17 00:00:00 2001 From: Johann Faouzi Date: Tue, 3 Sep 2019 16:39:39 +0200 Subject: [PATCH 23/33] Update code after reviews --- .../gradient_boosting.py | 37 ++++++++++--------- .../tests/test_gradient_boosting.py | 30 +++++++++------ .../tests/test_warm_start.py | 14 ++++--- 3 files changed, 45 insertions(+), 36 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index b6bf86115a316..e803dd5a00031 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -28,7 +28,7 @@ class BaseHistGradientBoosting(BaseEstimator, ABC): @abstractmethod def __init__(self, loss, learning_rate, max_iter, max_leaf_nodes, max_depth, min_samples_leaf, l2_regularization, max_bins, - early_stopping, warm_start, scoring, validation_fraction, + warm_start, early_stopping, scoring, validation_fraction, n_iter_no_change, tol, verbose, random_state): self.loss = loss self.learning_rate = learning_rate @@ -38,8 +38,8 @@ def __init__(self, loss, learning_rate, max_iter, max_leaf_nodes, self.min_samples_leaf = min_samples_leaf self.l2_regularization = l2_regularization self.max_bins = max_bins - self.early_stopping = early_stopping self.warm_start = warm_start + self.early_stopping = early_stopping self.scoring = scoring self.validation_fraction = validation_fraction self.n_iter_no_change = n_iter_no_change @@ -702,16 +702,16 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): Features with a small number of unique values may use less than ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin is always reserved for missing values. Must be no larger than 255. - early_stopping : 'auto' or bool (default='auto') - If 'auto', early stopping is enabled if the sample size is larger than - 10000. If True, early stopping is enabled, otherwise early stopping is - disabled. warm_start : bool, optional (default=False) When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble. For results to be valid, the estimator should be re-trained on the same data only. See :term:`the Glossary `. - scoring : str or callable or None, optional (default=None) + early_stopping : 'auto' or bool (default='auto') + If 'auto', early stopping is enabled if the sample size is larger than + 10000. If True, early stopping is enabled, otherwise early stopping is + disabled. + scoring : str or callable or None, optional (default='loss') Scoring parameter to use for early stopping. It can be a single string (see :ref:`scoring_parameter`) or a callable (see :ref:`scoring`). If None, the estimator's default scorer is used. 
If @@ -777,7 +777,7 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): def __init__(self, loss='least_squares', learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0., max_bins=255, - early_stopping='auto', warm_start=False, scoring=None, + warm_start=False, early_stopping='auto', scoring='loss', validation_fraction=0.1, n_iter_no_change=10, tol=1e-7, verbose=0, random_state=None): super(HistGradientBoostingRegressor, self).__init__( @@ -785,7 +785,7 @@ def __init__(self, loss='least_squares', learning_rate=0.1, max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, l2_regularization=l2_regularization, max_bins=max_bins, - early_stopping=early_stopping, warm_start=warm_start, + warm_start=warm_start, early_stopping=early_stopping, scoring=scoring, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose, random_state=random_state) @@ -886,16 +886,16 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, Features with a small number of unique values may use less than ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin is always reserved for missing values. Must be no larger than 255. - early_stopping : 'auto' or bool (default='auto') - If 'auto', early stopping is enabled if the sample size is larger than - 10000. If True, early stopping is enabled, otherwise early stopping is - disabled. warm_start : bool, optional (default=False) When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble. For results to be valid, the estimator should be re-trained on the same data only. See :term:`the Glossary `. - scoring : str or callable or None, optional (default=None) + early_stopping : 'auto' or bool (default='auto') + If 'auto', early stopping is enabled if the sample size is larger than + 10000. If True, early stopping is enabled, otherwise early stopping is + disabled. + scoring : str or callable or None, optional (default='loss') Scoring parameter to use for early stopping. It can be a single string (see :ref:`scoring_parameter`) or a callable (see :ref:`scoring`). 
If None, the estimator's default scorer @@ -962,15 +962,16 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, def __init__(self, loss='auto', learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, - l2_regularization=0., max_bins=255, early_stopping='auto', - warm_start=False, scoring=None, validation_fraction=0.1, - n_iter_no_change=10, tol=1e-7, verbose=0, random_state=None): + l2_regularization=0., max_bins=255, warm_start=False, + early_stopping='auto', scoring='loss', + validation_fraction=0.1, n_iter_no_change=10, tol=1e-7, + verbose=0, random_state=None): super(HistGradientBoostingClassifier, self).__init__( loss=loss, learning_rate=learning_rate, max_iter=max_iter, max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, l2_regularization=l2_regularization, max_bins=max_bins, - early_stopping=early_stopping, warm_start=warm_start, + warm_start=warm_start, early_stopping=early_stopping, scoring=scoring, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose, random_state=random_state) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index a9f9e8eaf1081..2ed0f93228343 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -19,6 +19,14 @@ X_regression, y_regression = make_regression(random_state=0) +def _make_dumb_dataset(n_samples): + """Make a dumb dataset to test early stopping.""" + rng = np.random.RandomState(42) + X_dumb = rng.randn(n_samples, 1) + y_dumb = (X_dumb[:, 0] > 0).astype('int64') + return X_dumb, y_dumb + + @pytest.mark.parametrize('GradientBoosting, X, y', [ (HistGradientBoostingClassifier, X_classification, y_classification), (HistGradientBoostingRegressor, X_regression, y_regression) @@ -49,8 +57,7 @@ def test_init_parameters_validation(GradientBoosting, X, y, params, err_msg): def test_invalid_classification_loss(): - binary_clf = HistGradientBoostingClassifier( - loss="binary_crossentropy") + binary_clf = HistGradientBoostingClassifier(loss="binary_crossentropy") err_msg = ("loss='binary_crossentropy' is not defined for multiclass " "classification with n_classes=3, use " "loss='categorical_crossentropy' instead") @@ -76,7 +83,7 @@ def test_early_stopping_regression(scoring, validation_fraction, X, y = make_regression(n_samples=50, random_state=0) gb = HistGradientBoostingRegressor( - verbose=0, # just for coverage + verbose=1, # just for coverage min_samples_leaf=5, # easier to overfit fast scoring=scoring, tol=tol, @@ -88,7 +95,7 @@ def test_early_stopping_regression(scoring, validation_fraction, ) gb.fit(X, y) - if early_stopping is True: + if early_stopping: assert n_iter_no_change <= gb.n_iter_ < max_iter else: assert gb.n_iter_ == max_iter @@ -117,7 +124,7 @@ def test_early_stopping_classification(data, scoring, validation_fraction, X, y = data gb = HistGradientBoostingClassifier( - verbose=0, # just for coverage + verbose=1, # just for coverage min_samples_leaf=5, # easier to overfit fast scoring=scoring, tol=tol, @@ -136,15 +143,15 @@ def test_early_stopping_classification(data, scoring, validation_fraction, @pytest.mark.parametrize('GradientBoosting, X, y', [ - (HistGradientBoostingClassifier, X_classification, y_classification), - (HistGradientBoostingClassifier, *make_classification(n_samples=10001)), - 
(HistGradientBoostingRegressor, X_regression, y_regression), - (HistGradientBoostingRegressor, *make_regression(n_samples=10001)) + (HistGradientBoostingClassifier, *_make_dumb_dataset(10000)), + (HistGradientBoostingClassifier, *_make_dumb_dataset(10001)), + (HistGradientBoostingRegressor, *_make_dumb_dataset(10000)), + (HistGradientBoostingRegressor, *_make_dumb_dataset(10001)) ]) def test_early_stopping_default(GradientBoosting, X, y): # Test that early stopping is enabled by default if and only if there # are more than 10000 samples - gb = GradientBoosting(min_samples_leaf=50, max_iter=1000) + gb = GradientBoosting(max_iter=10, n_iter_no_change=2, tol=1e-1) gb.fit(X, y) if X.shape[0] > 10000: assert gb.n_iter_ < gb.max_iter @@ -415,8 +422,7 @@ def test_infinite_values(): X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1) y = np.array([0, 0, 1, 1]) - gbdt = HistGradientBoostingRegressor(min_samples_leaf=1, - early_stopping=False) + gbdt = HistGradientBoostingRegressor(min_samples_leaf=1) gbdt.fit(X, y) np.testing.assert_allclose(gbdt.predict(X), y, atol=1e-4) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py index 0f3c04f4d6494..fbb20fcba1aef 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py @@ -11,6 +11,7 @@ from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.metrics import check_scoring X_classification, y_classification = make_classification(random_state=0) @@ -116,7 +117,7 @@ def test_warm_start_early_stopping(GradientBoosting, X, y): ]) def test_warm_start_equal_n_estimators(GradientBoosting, X, y): # Test if warm start with equal n_estimators does nothing - gb_1 = GradientBoosting(max_depth=2, n_iter_no_change=5) + gb_1 = GradientBoosting(max_depth=2, early_stopping=False) gb_1.fit(X, y) gb_2 = clone(gb_1) @@ -168,16 +169,17 @@ def _get_rng(rng_type): return np.random.RandomState(0) random_state = _get_rng(rng_type) - gb_1 = GradientBoosting(n_iter_no_change=5, early_stopping=True, - max_iter=2, random_state=random_state) + gb_1 = GradientBoosting(early_stopping=True, max_iter=2, + random_state=random_state) + gb_1.set_params(scoring=check_scoring(gb_1)) gb_1.fit(X, y) train_val_seed_1 = gb_1._train_val_split_seed small_trainset_seed_1 = gb_1._small_trainset_seed random_state = _get_rng(rng_type) - gb_2 = GradientBoosting(n_iter_no_change=5, early_stopping=True, - max_iter=2, random_state=random_state, - warm_start=True) + gb_2 = GradientBoosting(early_stopping=True, max_iter=2, + random_state=random_state, warm_start=True) + gb_2.set_params(scoring=check_scoring(gb_2)) gb_2.fit(X, y) # inits state train_val_seed_2 = gb_2._train_val_split_seed small_trainset_seed_2 = gb_2._small_trainset_seed From 847ed81b7539d2f99e640929dff90c75c86b02a8 Mon Sep 17 00:00:00 2001 From: Johann Faouzi Date: Thu, 14 Nov 2019 09:35:12 +0100 Subject: [PATCH 24/33] Fix issues in tests --- .../_hist_gradient_boosting/tests/test_gradient_boosting.py | 2 +- .../ensemble/_hist_gradient_boosting/tests/test_warm_start.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 
2ed0f93228343..767381cb8ab29 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -290,7 +290,7 @@ def test_small_trainset(): y = [[class_] * int(prop * n_samples) for (class_, prop) in original_distrib.items()] y = shuffle(np.concatenate(y)) - gb = HistGradientBoostingClassifier(early_stopping=False) + gb = HistGradientBoostingClassifier() # Compute the small training set X_small, y_small = gb._get_small_trainset(X, y, seed=42) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py index fbb20fcba1aef..0025080aa274d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py @@ -102,7 +102,7 @@ def test_warm_start_early_stopping(GradientBoosting, X, y): n_iter_no_change = 5 gb = GradientBoosting( n_iter_no_change=n_iter_no_change, max_iter=200, - random_state=42, warm_start=True, tol=1e-3, early_stopping=False + random_state=42, warm_start=True, tol=1e-3, early_stopping=True ) gb.fit(X, y) n_iter_first_fit = gb.n_iter_ From 39a69db43f665a18acd0a0d46beaef91183fc24a Mon Sep 17 00:00:00 2001 From: Johann Faouzi Date: Thu, 14 Nov 2019 09:44:08 +0100 Subject: [PATCH 25/33] Update what's new --- doc/whats_new/v0.22.rst | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 38d8cb2e3285d..4755e1257733f 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -127,6 +127,10 @@ Changelog training loss or score is now monitored on a class-wise stratified subsample to preserve the class balance of the original training set. :pr:`14194` by :user:`Johann Faouzi `. + - |Feature| Early stopping is now determined with a new `early_stopping` + parameter instead of `n_iter_no_change`. Default value is 'auto', which + enables early stopping if there are at least 10,000 samples in the + training set. :pr:`14516` by :user:`Johann Faouzi `. - |Feature| :func:`inspection.partial_dependence` and :func:`inspection.plot_partial_dependence` now support the fast 'recursion' method for both estimators. :pr:`13769` by `Nicolas Hug`_. @@ -298,19 +302,19 @@ Changelog - |Enhancement| SVM now throws more specific error when fit on non-square data and kernel = precomputed. :class:`svm.BaseLibSVM` :pr:`14336` by :user:`Gregory Dexter `. - + :mod:`sklearn.tree` ................... - |Feature| Adds minimal cost complexity pruning, controlled by ``ccp_alpha``, to :class:`tree.DecisionTreeClassifier`, :class:`tree.DecisionTreeRegressor`, :class:`tree.ExtraTreeClassifier`, :class:`tree.ExtraTreeRegressor`, - :class:`ensemble.RandomForestClassifier`, + :class:`ensemble.RandomForestClassifier`, :class:`ensemble.RandomForestRegressor`, - :class:`ensemble.ExtraTreesClassifier`, + :class:`ensemble.ExtraTreesClassifier`, :class:`ensemble.ExtraTreesRegressor`, - :class:`ensemble.RandomTreesEmbedding`, - :class:`ensemble.GradientBoostingClassifier`, + :class:`ensemble.RandomTreesEmbedding`, + :class:`ensemble.GradientBoostingClassifier`, and :class:`ensemble.GradientBoostingRegressor`. :pr:`12887` by `Thomas Fan`_. 
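The tests reworked above pin down the new default behaviour: with ``early_stopping='auto'``, early stopping is turned on only when the training set holds more than 10,000 samples. The following is a minimal sketch of that threshold, assuming an estimator built from this branch; the helper dataset simply mirrors the ``_make_dumb_dataset`` fixture added in the tests and is otherwise hypothetical::

    # Sketch only: assumes this branch, where early_stopping='auto' enables
    # early stopping only when there are more than 10,000 training samples.
    import numpy as np
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    from sklearn.ensemble import HistGradientBoostingClassifier

    rng = np.random.RandomState(42)

    def make_dumb_dataset(n_samples):
        # One feature, trivially separable target, as in _make_dumb_dataset.
        X = rng.randn(n_samples, 1)
        y = (X[:, 0] > 0).astype('int64')
        return X, y

    for n_samples in (10000, 10001):
        X, y = make_dumb_dataset(n_samples)
        # early_stopping is left at its 'auto' default; the other parameters
        # match the ones used in test_early_stopping_default.
        gb = HistGradientBoostingClassifier(max_iter=10, n_iter_no_change=2,
                                            tol=1e-1)
        gb.fit(X, y)
        # 10,000 samples: early stopping stays off, so n_iter_ == max_iter.
        # 10,001 samples: early stopping is active and n_iter_ is expected
        # to be smaller on this easy dataset.
        print(n_samples, gb.n_iter_, gb.max_iter)

Passing ``early_stopping=True`` or ``early_stopping=False`` explicitly overrides this sample-size heuristic, as described in the updated docstrings.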
From 5b366d92aa378cab73796abdbf49f012b7e72f0d Mon Sep 17 00:00:00 2001 From: Johann Faouzi Date: Thu, 14 Nov 2019 10:52:57 +0100 Subject: [PATCH 26/33] Make raw_predictions and raw_predictions_val private attributes --- .../gradient_boosting.py | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index e803dd5a00031..9a40aba76859d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -186,11 +186,11 @@ def fit(self, X, y): self._baseline_prediction = self.loss_.get_baseline_prediction( y_train, self.n_trees_per_iteration_ ) - raw_predictions = np.zeros( + self._raw_predictions = np.zeros( shape=(self.n_trees_per_iteration_, n_samples), dtype=self._baseline_prediction.dtype ) - raw_predictions += self._baseline_prediction + self._raw_predictions += self._baseline_prediction # initialize gradients and hessians (empty arrays). # shape = (n_trees_per_iteration, n_samples). @@ -205,7 +205,8 @@ def fit(self, X, y): # Initialize structures and attributes related to early stopping self.scorer_ = None # set if scoring != loss - raw_predictions_val = None # set if scoring == loss and use val + # set if scoring == loss and use val + self._raw_predictions_val = None self.train_score_ = [] self.validation_score_ = [] @@ -224,16 +225,18 @@ def fit(self, X, y): # the validation data. if self._use_validation_data: - raw_predictions_val = np.zeros( + self._raw_predictions_val = np.zeros( shape=(self.n_trees_per_iteration_, X_binned_val.shape[0]), dtype=self._baseline_prediction.dtype ) - raw_predictions_val += self._baseline_prediction + self._raw_predictions_val += self._baseline_prediction - self._check_early_stopping_loss(raw_predictions, y_train, - raw_predictions_val, y_val) + self._check_early_stopping_loss( + self._raw_predictions, y_train, + self._raw_predictions_val, y_val + ) else: self.scorer_ = check_scoring(self, self.scoring) # scorer_ is a callable with signature (est, X, y) and @@ -273,7 +276,7 @@ def fit(self, X, y): self.validation_score_ = self.validation_score_.tolist() # Compute raw predictions - raw_predictions = self._raw_predict(X_binned_train) + self._raw_predictions = self._raw_predict(X_binned_train) if self.do_early_stopping_ and self.scoring != 'loss': # Compute the subsample set @@ -299,8 +302,8 @@ def fit(self, X, y): end='', flush=True) # Update gradients and hessians, inplace - self.loss_.update_gradients_and_hessians(gradients, hessians, - y_train, raw_predictions) + self.loss_.update_gradients_and_hessians( + gradients, hessians, y_train, self._raw_predictions) # Append a list since there may be more than 1 predictor per iter predictors.append([]) @@ -332,7 +335,7 @@ def fit(self, X, y): # Update raw_predictions with the predictions of the newly # created tree. 
tic_pred = time() - _update_raw_predictions(raw_predictions[k, :], grower) + _update_raw_predictions(self._raw_predictions[k, :], grower) toc_pred = time() acc_prediction_time += toc_pred - tic_pred @@ -342,7 +345,7 @@ def fit(self, X, y): # Update raw_predictions_val with the newest tree(s) if self._use_validation_data: for k, pred in enumerate(self._predictors[-1]): - raw_predictions_val[k, :] += ( + self._raw_predictions_val[k, :] += ( pred.predict_binned( X_binned_val, self.bin_mapper_.missing_values_bin_idx_ @@ -350,8 +353,8 @@ def fit(self, X, y): ) should_early_stop = self._check_early_stopping_loss( - raw_predictions, y_train, - raw_predictions_val, y_val + self._raw_predictions, y_train, + self._raw_predictions_val, y_val ) else: From 7f81df7b3f671ff6b34fb81046d09b3e454bbd71 Mon Sep 17 00:00:00 2001 From: Johann Faouzi Date: Fri, 15 Nov 2019 15:04:03 +0100 Subject: [PATCH 27/33] Remove private attributes for raw predictions --- .../gradient_boosting.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 8d7feef08f555..bfb48614d10ea 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -187,11 +187,11 @@ def fit(self, X, y): self._baseline_prediction = self.loss_.get_baseline_prediction( y_train, self.n_trees_per_iteration_ ) - self._raw_predictions = np.zeros( + raw_predictions = np.zeros( shape=(self.n_trees_per_iteration_, n_samples), dtype=self._baseline_prediction.dtype ) - self._raw_predictions += self._baseline_prediction + raw_predictions += self._baseline_prediction # initialize gradients and hessians (empty arrays). # shape = (n_trees_per_iteration, n_samples). @@ -207,7 +207,7 @@ def fit(self, X, y): # Initialize structures and attributes related to early stopping self.scorer_ = None # set if scoring != loss # set if scoring == loss and use val - self._raw_predictions_val = None + raw_predictions_val = None self.train_score_ = [] self.validation_score_ = [] @@ -226,17 +226,17 @@ def fit(self, X, y): # the validation data. if self._use_validation_data: - self._raw_predictions_val = np.zeros( + raw_predictions_val = np.zeros( shape=(self.n_trees_per_iteration_, X_binned_val.shape[0]), dtype=self._baseline_prediction.dtype ) - self._raw_predictions_val += self._baseline_prediction + raw_predictions_val += self._baseline_prediction self._check_early_stopping_loss( - self._raw_predictions, y_train, - self._raw_predictions_val, y_val + raw_predictions, y_train, + raw_predictions_val, y_val ) else: self.scorer_ = check_scoring(self, self.scoring) @@ -274,7 +274,7 @@ def fit(self, X, y): self.validation_score_ = self.validation_score_.tolist() # Compute raw predictions - self._raw_predictions = self._raw_predict(X_binned_train) + raw_predictions = self._raw_predict(X_binned_train) if self.do_early_stopping_ and self.scoring != 'loss': # Compute the subsample set @@ -301,7 +301,7 @@ def fit(self, X, y): # Update gradients and hessians, inplace self.loss_.update_gradients_and_hessians( - gradients, hessians, y_train, self._raw_predictions) + gradients, hessians, y_train, raw_predictions) # Append a list since there may be more than 1 predictor per iter predictors.append([]) @@ -337,7 +337,7 @@ def fit(self, X, y): # Update raw_predictions with the predictions of the newly # created tree. 
tic_pred = time() - _update_raw_predictions(self._raw_predictions[k, :], grower) + _update_raw_predictions(raw_predictions[k, :], grower) toc_pred = time() acc_prediction_time += toc_pred - tic_pred @@ -347,7 +347,7 @@ def fit(self, X, y): # Update raw_predictions_val with the newest tree(s) if self._use_validation_data: for k, pred in enumerate(self._predictors[-1]): - self._raw_predictions_val[k, :] += ( + raw_predictions_val[k, :] += ( pred.predict_binned( X_binned_val, self.bin_mapper_.missing_values_bin_idx_ @@ -355,8 +355,8 @@ def fit(self, X, y): ) should_early_stop = self._check_early_stopping_loss( - self._raw_predictions, y_train, - self._raw_predictions_val, y_val + raw_predictions, y_train, + raw_predictions_val, y_val ) else: From 58c9bceb2ff5580af62e9a94023bafd2f8345308 Mon Sep 17 00:00:00 2001 From: Johann Faouzi Date: Fri, 15 Nov 2019 15:06:52 +0100 Subject: [PATCH 28/33] Revert changes --- .../gradient_boosting.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index bfb48614d10ea..153b3a7d09e3c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -206,8 +206,7 @@ def fit(self, X, y): # Initialize structures and attributes related to early stopping self.scorer_ = None # set if scoring != loss - # set if scoring == loss and use val - raw_predictions_val = None + raw_predictions_val = None # set if scoring == loss and use val self.train_score_ = [] self.validation_score_ = [] @@ -234,10 +233,8 @@ def fit(self, X, y): raw_predictions_val += self._baseline_prediction - self._check_early_stopping_loss( - raw_predictions, y_train, - raw_predictions_val, y_val - ) + self._check_early_stopping_loss(raw_predictions, y_train, + raw_predictions_val, y_val) else: self.scorer_ = check_scoring(self, self.scoring) # scorer_ is a callable with signature (est, X, y) and @@ -300,8 +297,8 @@ def fit(self, X, y): end='', flush=True) # Update gradients and hessians, inplace - self.loss_.update_gradients_and_hessians( - gradients, hessians, y_train, raw_predictions) + self.loss_.update_gradients_and_hessians(gradients, hessians, + y_train, raw_predictions) # Append a list since there may be more than 1 predictor per iter predictors.append([]) @@ -355,9 +352,7 @@ def fit(self, X, y): ) should_early_stop = self._check_early_stopping_loss( - raw_predictions, y_train, - raw_predictions_val, y_val - ) + raw_predictions, y_train, raw_predictions_val, y_val) else: should_early_stop = self._check_early_stopping_scorer( From 4e093ad9b257a6d22d058bfcb1c31b3baa8fb5c4 Mon Sep 17 00:00:00 2001 From: Johann Faouzi Date: Fri, 15 Nov 2019 15:09:17 +0100 Subject: [PATCH 29/33] Revert changes --- sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 153b3a7d09e3c..fd5711c767161 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -352,7 +352,9 @@ def fit(self, X, y): ) should_early_stop = self._check_early_stopping_loss( - raw_predictions, y_train, raw_predictions_val, y_val) + raw_predictions, y_train, + raw_predictions_val, y_val + ) else: should_early_stop = 
self._check_early_stopping_scorer( From 6ed073508e7b0fd37fa642402f5e01163902b2c2 Mon Sep 17 00:00:00 2001 From: Johann Faouzi Date: Fri, 15 Nov 2019 18:16:49 +0100 Subject: [PATCH 30/33] Add note about scorer --- doc/modules/ensemble.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 5bec1a262200c..a16041e6ca659 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -899,9 +899,10 @@ Note that **early-stopping is enabled by default if the number of samples is larger than 10,000**. The early-stopping behaviour is controlled via the ``early-stopping``, ``scoring``, ``validation_fraction``, ``n_iter_no_change``, and ``tol`` parameters. It is possible to early-stop -using an arbitrary :term:`scorer`, or just the training or validation loss. By -default, early-stopping is performed if there are at least 10,000 samples in -the training set, using the validation loss. +using an arbitrary :term:`scorer`, or just the training or validation loss. +Note that for technical reasons, using a scorer is significantly slower than +using the loss. By default, early-stopping is performed if there are at least +10,000 samples in the training set, using the validation loss. Missing values support ---------------------- From 29a37224643f120ed9199dc9106713fd5722e6db Mon Sep 17 00:00:00 2001 From: Johann Faouzi Date: Fri, 15 Nov 2019 18:19:36 +0100 Subject: [PATCH 31/33] Fix test_warm_start_early_stopping --- .../ensemble/_hist_gradient_boosting/tests/test_warm_start.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py index ccd005be6daed..2417de4f6cc63 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py @@ -102,14 +102,14 @@ def test_warm_start_early_stopping(GradientBoosting, X, y, scoring): n_iter_no_change = 5 gb = GradientBoosting( - n_iter_no_change=n_iter_no_change, max_iter=10000, + n_iter_no_change=n_iter_no_change, max_iter=10000, early_stopping=True, random_state=42, warm_start=True, tol=1e-3, scoring=scoring, ) gb.fit(X, y) n_iter_first_fit = gb.n_iter_ gb.fit(X, y) n_iter_second_fit = gb.n_iter_ - assert n_iter_second_fit - n_iter_first_fit < n_iter_no_change + assert 0 < n_iter_second_fit - n_iter_first_fit < n_iter_no_change @pytest.mark.parametrize('GradientBoosting, X, y', [ From 82a379627bf442debe451186e92015fc4be797fd Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 3 Feb 2020 09:07:20 -0500 Subject: [PATCH 32/33] fixed bad merge --- doc/whats_new/v0.23.rst | 4 ---- 1 file changed, 4 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 6bc9d23c2c637..26bfa0b599a42 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -118,10 +118,6 @@ Changelog samples in the training set. :pr:`14516` by :user:`Johann Faouzi `. -- |Feature| :func:`inspection.partial_dependence` and - :func:`inspection.plot_partial_dependence` now support the fast 'recursion' - method for both estimators. :pr:`13769` by `Nicolas Hug`_. - :mod:`sklearn.feature_extraction` ................................. 
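The user-guide note added above states that scorer-based early stopping is significantly slower than the default ``scoring='loss'``: the loss can be tracked from the raw predictions already maintained during ``fit``, whereas a scorer has to re-predict on a (sub)sampled training set at every check. A rough timing sketch, assuming an estimator built from this branch and an arbitrary synthetic dataset chosen only for illustration (actual timings are machine-dependent)::

    # Sketch only: contrasts loss-based and scorer-based early stopping.
    from time import time

    from sklearn.datasets import make_classification
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    from sklearn.ensemble import HistGradientBoostingClassifier

    X, y = make_classification(n_samples=20000, random_state=0)

    for scoring in ('loss', 'accuracy'):
        gb = HistGradientBoostingClassifier(early_stopping=True,
                                            scoring=scoring, random_state=0)
        tic = time()
        gb.fit(X, y)
        print('scoring=%r: stopped after %d iterations in %.2fs'
              % (scoring, gb.n_iter_, time() - tic))

Both runs are expected to early-stop; only the wall-clock time should differ noticeably between the two ``scoring`` settings.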
From 6f2b70a4c5399919896be7fa45ebe97b4d402526 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 3 Feb 2020 09:51:57 -0500 Subject: [PATCH 33/33] Fixed LightGBM tests: properly deactive ES since parameters have changed --- .../ensemble/_hist_gradient_boosting/gradient_boosting.py | 8 ++++---- .../tests/test_compare_lightgbm.py | 6 +++--- sklearn/ensemble/_hist_gradient_boosting/utils.pyx | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index dacb1f428817e..e63e0285f553f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -750,8 +750,8 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): Attributes ---------- n_iter_ : int - The number of iterations as selected by early stopping (if - n_iter_no_change is not None). Otherwise it corresponds to max_iter. + The number of iterations as selected by early stopping, depending on + the `early_stopping` parameter. Otherwise it corresponds to max_iter. n_trees_per_iteration_ : int The number of tree that are built at each iteration. For regressors, this is always 1. @@ -940,8 +940,8 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, classes_ : array, shape = (n_classes,) Class labels. n_iter_ : int - The number of estimators as selected by early stopping (if - n_iter_no_change is not None). Otherwise it corresponds to max_iter. + The number of iterations as selected by early stopping, depending on + the `early_stopping` parameter. Otherwise it corresponds to max_iter. n_trees_per_iteration_ : int The number of tree that are built at each iteration. 
This is equal to 1 for binary classification, and to ``n_classes`` for multiclass diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py index 32bb5dee4b197..6ac76a67d07ca 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py @@ -66,7 +66,7 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples, max_iter=max_iter, max_bins=max_bins, learning_rate=1, - n_iter_no_change=None, + early_stopping=False, min_samples_leaf=min_samples_leaf, max_leaf_nodes=max_leaf_nodes) est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm') @@ -119,7 +119,7 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples, max_iter=max_iter, max_bins=max_bins, learning_rate=1, - n_iter_no_change=None, + early_stopping=False, min_samples_leaf=min_samples_leaf, max_leaf_nodes=max_leaf_nodes) est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm') @@ -181,7 +181,7 @@ def test_same_predictions_multiclass_classification( max_iter=max_iter, max_bins=max_bins, learning_rate=lr, - n_iter_no_change=None, + early_stopping=False, min_samples_leaf=min_samples_leaf, max_leaf_nodes=max_leaf_nodes) est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm') diff --git a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx index 4b1188b87e69e..cf2c5a51c90dd 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx @@ -38,7 +38,7 @@ def get_equivalent_estimator(estimator, lib='lightgbm'): if sklearn_params['loss'] == 'auto': raise ValueError('auto loss is not accepted. We need to know if ' 'the problem is binary or multiclass classification.') - if sklearn_params['n_iter_no_change'] is not None: + if sklearn_params['early_stopping']: raise NotImplementedError('Early stopping should be deactivated.') lightgbm_loss_mapping = {