diff --git a/benchmarks/bench_hist_gradient_boosting.py b/benchmarks/bench_hist_gradient_boosting.py index 8d055b22c2252..5f070bd45708d 100644 --- a/benchmarks/bench_hist_gradient_boosting.py +++ b/benchmarks/bench_hist_gradient_boosting.py @@ -2,6 +2,7 @@ import argparse import matplotlib.pyplot as plt +import numpy as np from sklearn.model_selection import train_test_split # To use this experimental feature, we need to explicitly ask for it: from sklearn.experimental import enable_hist_gradient_boosting # noqa @@ -25,6 +26,7 @@ parser.add_argument('--learning-rate', type=float, default=.1) parser.add_argument('--problem', type=str, default='classification', choices=['classification', 'regression']) +parser.add_argument('--missing-fraction', type=float, default=0) parser.add_argument('--n-classes', type=int, default=2) parser.add_argument('--n-samples-max', type=int, default=int(1e6)) parser.add_argument('--n-features', type=int, default=20) @@ -52,6 +54,11 @@ def get_estimator_and_data(): X, y, Estimator = get_estimator_and_data() +if args.missing_fraction: + mask = np.random.binomial(1, args.missing_fraction, size=X.shape).astype( + np.bool) + X[mask] = np.nan + X_train_, X_test_, y_train_, y_test_ = train_test_split( X, y, test_size=0.5, random_state=0) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 084a2aca22597..fde8f40db6c8c 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -864,7 +864,7 @@ Usage Most of the parameters are unchanged from :class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor`. One exception is the ``max_iter`` parameter that replaces ``n_estimators``, and -controls the number of iterations of the boosting process: +controls the number of iterations of the boosting process:: >>> from sklearn.experimental import enable_hist_gradient_boosting >>> from sklearn.ensemble import HistGradientBoostingClassifier @@ -873,10 +873,10 @@ controls the number of iterations of the boosting process: >>> X, y = make_hastie_10_2(random_state=0) >>> X_train, X_test = X[:2000], X[2000:] >>> y_train, y_test = y[:2000], y[2000:] - >>> clf = HistGradientBoostingClassifier(max_iter=100).fit(X_train, y_train) + >>> clf = HistGradientBoostingClassifier(max_iter=100).fit(X_train, y_train) >>> clf.score(X_test, y_test) - 0.8998 + 0.8965 The size of the trees can be controlled through the ``max_leaf_nodes``, ``max_depth``, and ``min_samples_leaf`` parameters. @@ -895,6 +895,45 @@ using an arbitrary :term:`scorer`, or just the training or validation loss. By default, early-stopping is performed using the default :term:`scorer` of the estimator on a validation set. +Missing values support +---------------------- + +:class:`HistGradientBoostingClassifier` and +:class:`HistGradientBoostingRegressor` have built-in support for missing +values (NaNs). + +During training, the tree grower learns at each split point whether samples +with missing values should go to the left or right child, based on the +potential gain. 
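Concretely, each candidate split is evaluated twice: once with the samples
from the missing-values bin sent to the right child (left-to-right scan) and,
if the feature has missing values, once with them sent to the left child
(right-to-left scan); the direction with the higher gain is kept (see
algorithm 3 of the XGBoost paper, https://arxiv.org/abs/1603.02754). A
simplified sketch of that decision for a single feature, using toy per-bin
NumPy arrays and a hypothetical function name, and ignoring the
``min_samples_leaf`` and hessian constraints::

    import numpy as np

    def best_split_for_feature(grad_bins, hess_bins, grad_nan, hess_nan,
                               l2=0.0):
        # grad_bins / hess_bins: per-bin gradient and hessian sums for the
        # non-missing bins; grad_nan / hess_nan: sums for the missing bin.
        def neg_loss(g, h):
            return g * g / (h + l2)

        total_g = grad_bins.sum() + grad_nan
        total_h = hess_bins.sum() + hess_nan
        loss_parent = neg_loss(total_g, total_h)
        best = (-1.0, None, False)  # (gain, bin_idx, missing_go_to_left)
        for missing_go_to_left in (False, True):
            # when missing values are sent to the left child, they are
            # counted in the left statistics from the start of the scan
            g_left = grad_nan if missing_go_to_left else 0.0
            h_left = hess_nan if missing_go_to_left else 0.0
            for bin_idx in range(len(grad_bins) - 1):
                g_left += grad_bins[bin_idx]
                h_left += hess_bins[bin_idx]
                gain = (neg_loss(g_left, h_left)
                        + neg_loss(total_g - g_left, total_h - h_left)
                        - loss_parent)
                if gain > best[0]:
                    best = (gain, bin_idx, missing_go_to_left)
        return best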
When predicting, samples with missing values are assigned to +the left or right child consequently:: + + >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa + >>> from sklearn.ensemble import HistGradientBoostingClassifier + >>> import numpy as np + + >>> X = np.array([0, 1, 2, np.nan]).reshape(-1, 1) + >>> y = [0, 0, 1, 1] + + >>> gbdt = HistGradientBoostingClassifier(min_samples_leaf=1).fit(X, y) + >>> gbdt.predict(X) + array([0, 0, 1, 1]) + +When the missingness pattern is predictive, the splits can be done on +whether the feature value is missing or not:: + + >>> X = np.array([0, np.nan, 1, 2, np.nan]).reshape(-1, 1) + >>> y = [0, 1, 0, 0, 1] + >>> gbdt = HistGradientBoostingClassifier(min_samples_leaf=1, + ... max_depth=2, + ... learning_rate=1, + ... max_iter=1).fit(X, y) + >>> gbdt.predict(X) + array([0, 1, 0, 0, 1]) + +If no missing values were encountered for a given feature during training, +then samples with missing values are mapped to whichever child has the most +samples. + Low-level parallelism --------------------- diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 4ac7afe644e89..779a94c2dd1b0 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -23,10 +23,11 @@ random sampling procedures. - :class:`decomposition.SparseCoder` with `algorithm='lasso_lars'` |Fix| - :class:`decomposition.SparsePCA` where `normalize_components` has no effect due to deprecation. - - :class:`linear_model.Ridge` when `X` is sparse. |Fix| - - :class:`cluster.KMeans` when `n_jobs=1`. |Fix| +- :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor` |Fix|, |Feature|, + |Enhancement|. Details are listed in the changelog below. @@ -112,24 +113,31 @@ Changelog :mod:`sklearn.ensemble` ....................... -- |Feature| :class:`ensemble.HistGradientBoostingClassifier` and - :class:`ensemble.HistGradientBoostingRegressor` have an additional - parameter called `warm_start` that enables warm starting. :pr:`14012` by - :user:`Johann Faouzi `. - -- |Fix| :class:`ensemble.HistGradientBoostingClassifier` and - :class:`ensemble.HistGradientBoostingRegressor` now bin the training and - validation data separately to avoid any data leak. :pr:`13933` by - `Nicolas Hug`_. +- Many improvements were made to + :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor`: + + - |MajorFeature| Estimators now natively support dense data with missing + values both for training and predicting. They also support infinite + values. :pr:`13911` and :pr:`14406` by `Nicolas Hug`_, `Adrin Jalali`_ + and `Olivier Grisel`_. + - |Feature| Estimators now have an additional `warm_start` parameter that + enables warm starting. :pr:`14012` by :user:`Johann Faouzi `. + - |Enhancement| for :class:`ensemble.HistGradientBoostingClassifier` the + training loss or score is now monitored on a class-wise stratified + subsample to preserve the class balance of the original training set. + :pr:`14194` by :user:`Johann Faouzi `. + - |Feature| :func:`inspection.partial_dependence` and + :func:`inspection.plot_partial_dependence` now support the fast 'recursion' + method for both estimators. :pr:`13769` by `Nicolas Hug`_. + - |Fix| Estimators now bin the training and validation data separately to + avoid any data leak. :pr:`13933` by `Nicolas Hug`_. + + Note that pickles from 0.21 will not work in 0.22. - |Fix| :func:`ensemble.VotingClassifier.predict_proba` will no longer be present when `voting='hard'`. 
:pr:`14287` by `Thomas Fan`_. -- |Enhancement| :class:`ensemble.HistGradientBoostingClassifier` the training - loss or score is now monitored on a class-wise stratified subsample to - preserve the class balance of the original training set. :pr:`14194` - by :user:`Johann Faouzi `. - - |Fix| Run by default :func:`utils.estimator_checks.check_estimator` on both :class:`ensemble.VotingClassifier` and :class:`ensemble.VotingRegressor`. It @@ -182,6 +190,12 @@ Changelog measure the importance of each feature in an arbitrary trained model with respect to a given scoring function. :issue:`13146` by `Thomas Fan`_. +- |Feature| :func:`inspection.partial_dependence` and + :func:`inspection.plot_partial_dependence` now support the fast 'recursion' + method for :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor`. :pr:`13769` by + `Nicolas Hug`_. + :mod:`sklearn.linear_model` ........................... diff --git a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx index 80eb625cdb676..1ecee3c9ee27e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx @@ -12,11 +12,14 @@ import numpy as np cimport numpy as np from numpy.math cimport INFINITY from cython.parallel import prange +from libc.math cimport isnan -from .types cimport X_DTYPE_C, X_BINNED_DTYPE_C +from .common cimport X_DTYPE_C, X_BINNED_DTYPE_C -cpdef _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds, - X_BINNED_DTYPE_C [::1, :] binned): +def _map_to_bins(const X_DTYPE_C [:, :] data, + list binning_thresholds, + const unsigned char missing_values_bin_idx, + X_BINNED_DTYPE_C [::1, :] binned): """Bin numerical values to discrete integer-coded levels. Parameters @@ -35,11 +38,13 @@ cpdef _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds, for feature_idx in range(data.shape[1]): _map_num_col_to_bins(data[:, feature_idx], binning_thresholds[feature_idx], + missing_values_bin_idx, binned[:, feature_idx]) cdef void _map_num_col_to_bins(const X_DTYPE_C [:] data, const X_DTYPE_C [:] binning_thresholds, + const unsigned char missing_values_bin_idx, X_BINNED_DTYPE_C [:] binned): """Binary search to find the bin index for each value in the data.""" cdef: @@ -49,11 +54,11 @@ cdef void _map_num_col_to_bins(const X_DTYPE_C [:] data, int middle for i in prange(data.shape[0], schedule='static', nogil=True): - if data[i] == INFINITY: - # Special case for +inf. - # -inf is handled properly by binary search. 
- binned[i] = binning_thresholds.shape[0] + + if isnan(data[i]): + binned[i] = missing_values_bin_idx else: + # for known values, use binary search left, right = 0, binning_thresholds.shape[0] while left < right: middle = (right + left - 1) // 2 diff --git a/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx b/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx index 3603e6b2e2d8e..8d307c3806532 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx @@ -10,8 +10,8 @@ from cython.parallel import prange import numpy as np cimport numpy as np -from .types import Y_DTYPE -from .types cimport Y_DTYPE_C +from .common import Y_DTYPE +from .common cimport Y_DTYPE_C def _update_raw_predictions( diff --git a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx index 91c3e53101ed6..ff17654840005 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx @@ -12,8 +12,8 @@ cimport numpy as np from libc.math cimport exp -from .types cimport Y_DTYPE_C -from .types cimport G_H_DTYPE_C +from .common cimport Y_DTYPE_C +from .common cimport G_H_DTYPE_C def _update_gradients_least_squares( diff --git a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx index 21f9038210722..b3234cb5ba945 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx @@ -7,15 +7,16 @@ cimport cython from cython.parallel import prange +from libc.math cimport isnan import numpy as np cimport numpy as np from numpy.math cimport INFINITY -from .types cimport X_DTYPE_C -from .types cimport Y_DTYPE_C -from .types import Y_DTYPE -from .types cimport X_BINNED_DTYPE_C -from .types cimport node_struct +from .common cimport X_DTYPE_C +from .common cimport Y_DTYPE_C +from .common import Y_DTYPE +from .common cimport X_BINNED_DTYPE_C +from .common cimport node_struct def _predict_from_numeric_data( @@ -43,10 +44,12 @@ cdef inline Y_DTYPE_C _predict_one_from_numeric_data( while True: if node.is_leaf: return node.value - if numeric_data[row, node.feature_idx] == INFINITY: - # if data is +inf we always go to the right child, even when the - # threhsold is +inf - node = nodes[node.right] + + if isnan(numeric_data[row, node.feature_idx]): + if node.missing_go_to_left: + node = nodes[node.left] + else: + node = nodes[node.right] else: if numeric_data[row, node.feature_idx] <= node.threshold: node = nodes[node.left] @@ -57,19 +60,22 @@ cdef inline Y_DTYPE_C _predict_one_from_numeric_data( def _predict_from_binned_data( node_struct [:] nodes, const X_BINNED_DTYPE_C [:, :] binned_data, + const unsigned char missing_values_bin_idx, Y_DTYPE_C [:] out): cdef: int i for i in prange(binned_data.shape[0], schedule='static', nogil=True): - out[i] = _predict_one_from_binned_data(nodes, binned_data, i) + out[i] = _predict_one_from_binned_data(nodes, binned_data, i, + missing_values_bin_idx) cdef inline Y_DTYPE_C _predict_one_from_binned_data( node_struct [:] nodes, const X_BINNED_DTYPE_C [:, :] binned_data, - const int row) nogil: + const int row, + const unsigned char missing_values_bin_idx) nogil: # Need to pass the whole array and the row index, else prange won't work. 
# See issue Cython #2798 @@ -79,10 +85,16 @@ cdef inline Y_DTYPE_C _predict_one_from_binned_data( while True: if node.is_leaf: return node.value - if binned_data[row, node.feature_idx] <= node.bin_threshold: - node = nodes[node.left] + if binned_data[row, node.feature_idx] == missing_values_bin_idx: + if node.missing_go_to_left: + node = nodes[node.left] + else: + node = nodes[node.right] else: - node = nodes[node.right] + if binned_data[row, node.feature_idx] <= node.bin_threshold: + node = nodes[node.left] + else: + node = nodes[node.right] def _compute_partial_dependence( node_struct [:] nodes, diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index b35b2a2083b03..a6c779ca0a97b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -13,20 +13,23 @@ from ...base import BaseEstimator, TransformerMixin from ...utils.validation import check_is_fitted from ._binning import _map_to_bins -from .types import X_DTYPE, X_BINNED_DTYPE +from .common import X_DTYPE, X_BINNED_DTYPE, ALMOST_INF def _find_binning_thresholds(data, max_bins, subsample, random_state): """Extract feature-wise quantiles from numerical data. + Missing values are ignored for finding the thresholds. + Parameters ---------- data : array-like, shape (n_samples, n_features) The data to bin. - max_bins : int - The maximum number of bins to use. If for a given feature the number of - unique values is less than ``max_bins``, then those unique values - will be used to compute the bin thresholds, instead of the quantiles. + max_bins: int + The maximum number of bins to use for non-missing values. If for a + given feature the number of unique values is less than ``max_bins``, + then those unique values will be used to compute the bin thresholds, + instead of the quantiles. subsample : int or None If ``n_samples > subsample``, then ``sub_samples`` samples will be randomly choosen to compute the quantiles. If ``None``, the whole data @@ -42,19 +45,19 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): be used to separate the bins. Thus ``len(binning_thresholds) == n_features``. """ - if not (2 <= max_bins <= 256): - raise ValueError('max_bins={} should be no smaller than 2 ' - 'and no larger than 256.'.format(max_bins)) rng = check_random_state(random_state) if subsample is not None and data.shape[0] > subsample: subset = rng.choice(np.arange(data.shape[0]), subsample, replace=False) data = data.take(subset, axis=0) - percentiles = np.linspace(0, 100, num=max_bins + 1) - percentiles = percentiles[1:-1] binning_thresholds = [] for f_idx in range(data.shape[1]): - col_data = np.ascontiguousarray(data[:, f_idx], dtype=X_DTYPE) + col_data = data[:, f_idx] + # ignore missing values when computing bin thresholds + missing_mask = np.isnan(col_data) + if missing_mask.any(): + col_data = col_data[~missing_mask] + col_data = np.ascontiguousarray(col_data, dtype=X_DTYPE) distinct_values = np.unique(col_data) if len(distinct_values) <= max_bins: midpoints = distinct_values[:-1] + distinct_values[1:] @@ -65,9 +68,18 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): # np.unique(col_data, return_counts) instead but this is more # work and the performance benefit will be limited because we # work on a fixed-size subsample of the full data. 
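            # Editor's illustration (not part of the patch; toy input): the
            # NaNs are dropped above before the thresholds are computed, and
            # they only receive a bin at transform() time, namely the
            # reserved last bin (missing_values_bin_idx_ == n_bins - 1).
            # Assuming the private _BinMapper API introduced in this diff:
            #
            #     >>> import numpy as np
            #     >>> from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
            #     >>> X = np.array([[0.], [1.], [2.], [np.nan]])
            #     >>> _BinMapper(n_bins=4).fit_transform(X)
            #     array([[0],
            #            [1],
            #            [2],
            #            [3]], dtype=uint8)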
+ percentiles = np.linspace(0, 100, num=max_bins + 1) + percentiles = percentiles[1:-1] midpoints = np.percentile(col_data, percentiles, interpolation='midpoint').astype(X_DTYPE) + assert midpoints.shape[0] == max_bins - 1 + + # We avoid having +inf thresholds: +inf thresholds are only allowed in + # a "split on nan" situation. + np.clip(midpoints, a_min=None, a_max=ALMOST_INF, out=midpoints) + binning_thresholds.append(midpoints) + return binning_thresholds @@ -80,16 +92,19 @@ class _BinMapper(BaseEstimator, TransformerMixin): For large datasets, quantiles are computed on a subset of the data to speed-up the binning, but the quantiles should remain stable. - If the number of unique values for a given feature is less than - ``max_bins``, then the unique values of this feature are used instead of - the quantiles. + Features with a small number of values may be binned into less than + ``n_bins`` bins. The last bin (at index ``n_bins - 1``) is always reserved + for missing values. Parameters ---------- - max_bins : int, optional (default=256) - The maximum number of bins to use. If for a given feature the number of - unique values is less than ``max_bins``, then those unique values - will be used to compute the bin thresholds, instead of the quantiles. + n_bins : int, optional (default=256) + The maximum number of bins to use (including the bin for missing + values). Non-missing values are binned on ``max_bins = n_bins - 1`` + bins. The last bin is always reserved for missing values. If for a + given feature the number of unique values is less than ``max_bins``, + then those unique values will be used to compute the bin thresholds, + instead of the quantiles. subsample : int or None, optional (default=2e5) If ``n_samples > subsample``, then ``sub_samples`` samples will be randomly choosen to compute the quantiles. If ``None``, the whole data @@ -98,15 +113,35 @@ class _BinMapper(BaseEstimator, TransformerMixin): optional (default=None) Pseudo-random number generator to control the random sub-sampling. See :term:`random_state`. + + Attributes + ---------- + bin_thresholds_ : list of arrays + For each feature, gives the real-valued bin threhsolds. There are + ``max_bins - 1`` thresholds, where ``max_bins = n_bins - 1`` is the + number of bins used for non-missing values. + n_bins_non_missing_ : array of uint32 + For each feature, gives the number of bins actually used for + non-missing values. For features with a lot of unique values, this is + equal to ``n_bins - 1``. + missing_values_bin_idx_ : uint8 + The index of the bin where missing values are mapped. This is a + constant accross all features. This corresponds to the last bin, and + it is always equal to ``n_bins - 1``. Note that if ``n_bins_missing_`` + is less than ``n_bins - 1`` for a given feature, then there are + empty (and unused) bins. """ - def __init__(self, max_bins=256, subsample=int(2e5), random_state=None): - self.max_bins = max_bins + def __init__(self, n_bins=256, subsample=int(2e5), random_state=None): + self.n_bins = n_bins self.subsample = subsample self.random_state = random_state def fit(self, X, y=None): """Fit data X by computing the binning thresholds. + The last bin is reserved for missing values, whether missing values + are present in the data or not. 
+ Parameters ---------- X : array-like, shape (n_samples, n_features) @@ -118,20 +153,30 @@ def fit(self, X, y=None): ------- self : object """ + if not (3 <= self.n_bins <= 256): + # min is 3: at least 2 distinct bins and a missing values bin + raise ValueError('n_bins={} should be no smaller than 3 ' + 'and no larger than 256.'.format(self.n_bins)) + X = check_array(X, dtype=[X_DTYPE], force_all_finite=False) + max_bins = self.n_bins - 1 self.bin_thresholds_ = _find_binning_thresholds( - X, self.max_bins, subsample=self.subsample, + X, max_bins, subsample=self.subsample, random_state=self.random_state) - self.actual_n_bins_ = np.array( + self.n_bins_non_missing_ = np.array( [thresholds.shape[0] + 1 for thresholds in self.bin_thresholds_], dtype=np.uint32) + self.missing_values_bin_idx_ = self.n_bins - 1 + return self def transform(self, X): """Bin data X. + Missing values will be mapped to the last bin. + Parameters ---------- X : array-like, shape (n_samples, n_features) @@ -144,12 +189,13 @@ def transform(self, X): """ X = check_array(X, dtype=[X_DTYPE], force_all_finite=False) check_is_fitted(self) - if X.shape[1] != self.actual_n_bins_.shape[0]: + if X.shape[1] != self.n_bins_non_missing_.shape[0]: raise ValueError( 'This estimator was fitted with {} features but {} got passed ' - 'to transform()'.format(self.actual_n_bins_.shape[0], + 'to transform()'.format(self.n_bins_non_missing_.shape[0], X.shape[1]) ) binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order='F') - _map_to_bins(X, self.bin_thresholds_, binned) + _map_to_bins(X, self.bin_thresholds_, self.missing_values_bin_idx_, + binned) return binned diff --git a/sklearn/ensemble/_hist_gradient_boosting/types.pxd b/sklearn/ensemble/_hist_gradient_boosting/common.pxd similarity index 96% rename from sklearn/ensemble/_hist_gradient_boosting/types.pxd rename to sklearn/ensemble/_hist_gradient_boosting/common.pxd index f72741006a508..fa78f2024aa5c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/types.pxd +++ b/sklearn/ensemble/_hist_gradient_boosting/common.pxd @@ -23,6 +23,7 @@ cdef packed struct node_struct: unsigned int count unsigned int feature_idx X_DTYPE_C threshold + unsigned char missing_go_to_left unsigned int left unsigned int right Y_DTYPE_C gain diff --git a/sklearn/ensemble/_hist_gradient_boosting/types.pyx b/sklearn/ensemble/_hist_gradient_boosting/common.pyx similarity index 87% rename from sklearn/ensemble/_hist_gradient_boosting/types.pyx rename to sklearn/ensemble/_hist_gradient_boosting/common.pyx index 67820337e72bc..8604548e44163 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/types.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/common.pyx @@ -6,7 +6,7 @@ import numpy as np Y_DTYPE = np.float64 X_DTYPE = np.float64 X_BINNED_DTYPE = np.uint8 # hence max_bins == 256 -# dtypes for gradients and hessians arrays +# dtype for gradients and hessians arrays G_H_DTYPE = np.float32 HISTOGRAM_DTYPE = np.dtype([ @@ -20,6 +20,7 @@ PREDICTOR_RECORD_DTYPE = np.dtype([ ('count', np.uint32), ('feature_idx', np.uint32), ('threshold', X_DTYPE), + ('missing_go_to_left', np.uint8), ('left', np.uint32), ('right', np.uint32), ('gain', Y_DTYPE), @@ -27,3 +28,5 @@ PREDICTOR_RECORD_DTYPE = np.dtype([ ('is_leaf', np.uint8), ('bin_threshold', X_BINNED_DTYPE), ]) + +ALMOST_INF = 1e300 # see LightGBM AvoidInf() diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 4c40f662d0656..ad6a5a8ca381b 100644 --- 
a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -2,6 +2,7 @@ # Author: Nicolas Hug from abc import ABC, abstractmethod +from functools import partial import numpy as np from timeit import default_timer as time @@ -14,7 +15,7 @@ from ...model_selection import train_test_split from ...preprocessing import LabelEncoder from ._gradient_boosting import _update_raw_predictions -from .types import Y_DTYPE, X_DTYPE, X_BINNED_DTYPE +from .common import Y_DTYPE, X_DTYPE, X_BINNED_DTYPE from .binning import _BinMapper from .grower import TreeGrower @@ -75,6 +76,10 @@ def _validate_parameters(self): raise ValueError('tol={} ' 'must not be smaller than 0.'.format(self.tol)) + if not (2 <= self.max_bins <= 255): + raise ValueError('max_bins={} should be no smaller than 2 ' + 'and no larger than 255.'.format(self.max_bins)) + def fit(self, X, y): """Fit the gradient boosting model. @@ -143,8 +148,18 @@ def fit(self, X, y): X_train, y_train = X, y X_val, y_val = None, None + has_missing_values = np.isnan(X_train).any(axis=0).astype(np.uint8) + # Bin the data - self.bin_mapper_ = _BinMapper(max_bins=self.max_bins, random_state=rng) + # For ease of use of the API, the user-facing GBDT classes accept the + # parameter max_bins, which doesn't take into account the bin for + # missing values (which is always allocated). However, since max_bins + # isn't the true maximal number of bins, all other private classes + # (binmapper, histbuilder...) accept n_bins instead, which is the + # actual total number of bins. Everywhere in the code, the + # convention is that n_bins == max_bins + 1 + n_bins = self.max_bins + 1 # + 1 for missing values + self.bin_mapper_ = _BinMapper(n_bins=n_bins, random_state=rng) X_binned_train = self._bin_data(X_train, rng, is_training_data=True) if X_val is not None: X_binned_val = self._bin_data(X_val, rng, is_training_data=False) @@ -293,8 +308,9 @@ def fit(self, X, y): grower = TreeGrower( X_binned_train, gradients[k, :], hessians[k, :], - max_bins=self.max_bins, - actual_n_bins=self.bin_mapper_.actual_n_bins_, + n_bins=n_bins, + n_bins_non_missing=self.bin_mapper_.n_bins_non_missing_, + has_missing_values=has_missing_values, max_leaf_nodes=self.max_leaf_nodes, max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf, @@ -325,7 +341,11 @@ def fit(self, X, y): if self._use_validation_data: for k, pred in enumerate(self._predictors[-1]): raw_predictions_val[k, :] += ( - pred.predict_binned(X_binned_val)) + pred.predict_binned( + X_binned_val, + self.bin_mapper_.missing_values_bin_idx_ + ) + ) should_early_stop = self._check_early_stopping_loss( raw_predictions, y_train, @@ -556,8 +576,13 @@ def _raw_predict(self, X): raw_predictions += self._baseline_prediction for predictors_of_ith_iteration in self._predictors: for k, predictor in enumerate(predictors_of_ith_iteration): - predict = (predictor.predict_binned if is_binned - else predictor.predict) + if is_binned: + predict = partial( + predictor.predict_binned, + missing_values_bin_idx=self.bin_mapper_.missing_values_bin_idx_ # noqa + ) + else: + predict = predictor.predict raw_predictions[k, :] += predict(X) return raw_predictions @@ -593,6 +618,9 @@ def _compute_partial_dependence_recursion(self, grid, target_features): return averaged_predictions + def _more_tags(self): + return {'allow_nan': True} + @abstractmethod def _get_loss(self): pass @@ -606,13 +634,6 @@ def n_iter_(self): check_is_fitted(self) return len(self._predictors) - def 
_more_tags(self): - # This is not strictly True, but it's needed since - # force_all_finite=False means accept both nans and infinite values. - # Without the tag, common checks would fail. - # This comment must be removed once we merge PR 13911 - return {'allow_nan': True} - class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): """Histogram-based Gradient Boosting Regression Tree. @@ -621,6 +642,14 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): :class:`GradientBoostingRegressor` for big datasets (n_samples >= 10 000). + This estimator has native support for missing values (NaNs). During + training, the tree grower learns at each split point whether samples + with missing values should go to the left or right child, based on the + potential gain. When predicting, samples with missing values are + assigned to the left or right child consequently. If no missing values + were encountered for a given feature during training, then samples with + missing values are mapped to whichever child has the most samples. + This implementation is inspired by `LightGBM `_. @@ -664,12 +693,13 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): l2_regularization : float, optional (default=0) The L2 regularization parameter. Use ``0`` for no regularization (default). - max_bins : int, optional (default=256) - The maximum number of bins to use. Before training, each feature of - the input array ``X`` is binned into at most ``max_bins`` bins, which - allows for a much faster training stage. Features with a small - number of unique values may use less than ``max_bins`` bins. Must be no - larger than 256. + max_bins : int, optional (default=255) + The maximum number of bins to use for non-missing values. Before + training, each feature of the input array `X` is binned into + integer-valued bins, which allows for a much faster training stage. + Features with a small number of unique values may use less than + ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin + is always reserved for missing values. Must be no larger than 255. warm_start : bool, optional (default=False) When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble. For results to be valid, the @@ -740,7 +770,7 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): def __init__(self, loss='least_squares', learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, - min_samples_leaf=20, l2_regularization=0., max_bins=256, + min_samples_leaf=20, l2_regularization=0., max_bins=255, warm_start=False, scoring=None, validation_fraction=0.1, n_iter_no_change=None, tol=1e-7, verbose=0, random_state=None): @@ -789,6 +819,14 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, :class:`GradientBoostingClassifier` for big datasets (n_samples >= 10 000). + This estimator has native support for missing values (NaNs). During + training, the tree grower learns at each split point whether samples + with missing values should go to the left or right child, based on the + potential gain. When predicting, samples with missing values are + assigned to the left or right child consequently. If no missing values + were encountered for a given feature during training, then samples with + missing values are mapped to whichever child has the most samples. + This implementation is inspired by `LightGBM `_. 
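For reference, a minimal usage sketch (mirroring the example added to the
user guide in this same PR): NaNs can be passed directly to ``fit`` and
``predict``, with no imputation step::

    >>> import numpy as np
    >>> from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    >>> from sklearn.ensemble import HistGradientBoostingClassifier

    >>> X = np.array([0, 1, 2, np.nan]).reshape(-1, 1)
    >>> y = [0, 0, 1, 1]
    >>> gbdt = HistGradientBoostingClassifier(min_samples_leaf=1).fit(X, y)
    >>> gbdt.predict(X)
    array([0, 0, 1, 1])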
@@ -835,12 +873,13 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, since only very shallow trees would be built. l2_regularization : float, optional (default=0) The L2 regularization parameter. Use 0 for no regularization. - max_bins : int, optional (default=256) - The maximum number of bins to use. Before training, each feature of - the input array ``X`` is binned into at most ``max_bins`` bins, which - allows for a much faster training stage. Features with a small - number of unique values may use less than ``max_bins`` bins. Must be no - larger than 256. + max_bins : int, optional (default=255) + The maximum number of bins to use for non-missing values. Before + training, each feature of the input array `X` is binned into + integer-valued bins, which allows for a much faster training stage. + Features with a small number of unique values may use less than + ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin + is always reserved for missing values. Must be no larger than 255. warm_start : bool, optional (default=False) When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble. For results to be valid, the @@ -913,7 +952,7 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, def __init__(self, loss='auto', learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, - l2_regularization=0., max_bins=256, warm_start=False, + l2_regularization=0., max_bins=255, warm_start=False, scoring=None, validation_fraction=0.1, n_iter_no_change=None, tol=1e-7, verbose=0, random_state=None): super(HistGradientBoostingClassifier, self).__init__( diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 7eec680082e97..c7d303b8f6201 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -15,8 +15,8 @@ from .histogram import HistogramBuilder from .predictor import TreePredictor from .utils import sum_parallel -from .types import PREDICTOR_RECORD_DTYPE -from .types import Y_DTYPE +from .common import PREDICTOR_RECORD_DTYPE +from .common import Y_DTYPE EPS = np.finfo(Y_DTYPE).eps # to avoid zero division errors @@ -141,14 +141,18 @@ class TreeGrower: min_gain_to_split : float, optional (default=0.) The minimum gain needed to split a node. Splits with lower gain will be ignored. - max_bins : int, optional (default=256) - The maximum number of bins. Used to define the shape of the - histograms. - actual_n_bins : ndarray of int or int, optional (default=None) - The actual number of bins needed for each feature, which is lower or - equal to ``max_bins``. If it's an int, all features are considered to - have the same number of bins. If None, all features are considered to - have ``max_bins`` bins. + n_bins : int, optional (default=256) + The total number of bins, including the bin for missing values. Used + to define the shape of the histograms. + n_bins_non_missing_ : array of uint32 + For each feature, gives the number of bins actually used for + non-missing values. For features with a lot of unique values, this + is equal to ``n_bins - 1``. If it's an int, all features are + considered to have the same number of bins. If None, all features + are considered to have ``n_bins - 1`` bins. + has_missing_values : ndarray of bool or bool, optional (default=False) + Whether each feature contains missing values (in the training data). 
+ If it's a bool, the same value is used for all features. l2_regularization : float, optional (default=0) The L2 regularization parameter. min_hessian_to_split : float, optional (default=1e-3) @@ -161,32 +165,40 @@ class TreeGrower: """ def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, max_depth=None, min_samples_leaf=20, min_gain_to_split=0., - max_bins=256, actual_n_bins=None, l2_regularization=0., - min_hessian_to_split=1e-3, shrinkage=1.): + n_bins=256, n_bins_non_missing=None, has_missing_values=False, + l2_regularization=0., min_hessian_to_split=1e-3, + shrinkage=1.): self._validate_parameters(X_binned, max_leaf_nodes, max_depth, min_samples_leaf, min_gain_to_split, l2_regularization, min_hessian_to_split) - if actual_n_bins is None: - actual_n_bins = max_bins + if n_bins_non_missing is None: + n_bins_non_missing = n_bins - 1 - if isinstance(actual_n_bins, numbers.Integral): - actual_n_bins = np.array( - [actual_n_bins] * X_binned.shape[1], + if isinstance(n_bins_non_missing, numbers.Integral): + n_bins_non_missing = np.array( + [n_bins_non_missing] * X_binned.shape[1], dtype=np.uint32) else: - actual_n_bins = np.asarray(actual_n_bins, dtype=np.uint32) + n_bins_non_missing = np.asarray(n_bins_non_missing, + dtype=np.uint32) + + if isinstance(has_missing_values, bool): + has_missing_values = [has_missing_values] * X_binned.shape[1] + has_missing_values = np.asarray(has_missing_values, dtype=np.uint8) hessians_are_constant = hessians.shape[0] == 1 self.histogram_builder = HistogramBuilder( - X_binned, max_bins, gradients, hessians, hessians_are_constant) + X_binned, n_bins, gradients, hessians, hessians_are_constant) + missing_values_bin_idx = n_bins - 1 self.splitter = Splitter( - X_binned, actual_n_bins, l2_regularization, - min_hessian_to_split, min_samples_leaf, min_gain_to_split, - hessians_are_constant) + X_binned, n_bins_non_missing, missing_values_bin_idx, + has_missing_values, l2_regularization, min_hessian_to_split, + min_samples_leaf, min_gain_to_split, hessians_are_constant) + self.n_bins_non_missing = n_bins_non_missing self.max_leaf_nodes = max_leaf_nodes - self.max_bins = max_bins + self.has_missing_values = has_missing_values self.n_features = X_binned.shape[1] self.max_depth = max_depth self.min_samples_leaf = min_samples_leaf @@ -333,6 +345,13 @@ def split_next(self): right_child_node.partition_start = left_child_node.partition_stop right_child_node.partition_stop = node.partition_stop + if not self.has_missing_values[node.split_info.feature_idx]: + # If no missing values are encountered at fit time, then samples + # with missing values during predict() will go to whichever child + # has the most samples. 
+ node.split_info.missing_go_to_left = ( + left_child_node.n_samples > right_child_node.n_samples) + self.n_nodes += 2 if self.max_depth is not None and depth == self.max_depth: @@ -428,12 +447,13 @@ def make_predictor(self, bin_thresholds=None): """ predictor_nodes = np.zeros(self.n_nodes, dtype=PREDICTOR_RECORD_DTYPE) _fill_predictor_node_array(predictor_nodes, self.root, - bin_thresholds=bin_thresholds) + bin_thresholds, self.n_bins_non_missing) return TreePredictor(predictor_nodes) def _fill_predictor_node_array(predictor_nodes, grower_node, - bin_thresholds, next_free_idx=0): + bin_thresholds, n_bins_non_missing, + next_free_idx=0): """Helper used in make_predictor to set the TreePredictor fields.""" node = predictor_nodes[next_free_idx] node['count'] = grower_node.n_samples @@ -454,17 +474,27 @@ def _fill_predictor_node_array(predictor_nodes, grower_node, feature_idx, bin_idx = split_info.feature_idx, split_info.bin_idx node['feature_idx'] = feature_idx node['bin_threshold'] = bin_idx - if bin_thresholds is not None: - threshold = bin_thresholds[feature_idx][bin_idx] - node['threshold'] = threshold + node['missing_go_to_left'] = split_info.missing_go_to_left + + if split_info.bin_idx == n_bins_non_missing[feature_idx] - 1: + # Split is on the last non-missing bin: it's a "split on nans". All + # nans go to the right, the rest go to the left. + node['threshold'] = np.inf + elif bin_thresholds is not None: + node['threshold'] = bin_thresholds[feature_idx][bin_idx] + next_free_idx += 1 node['left'] = next_free_idx next_free_idx = _fill_predictor_node_array( predictor_nodes, grower_node.left_child, - bin_thresholds=bin_thresholds, next_free_idx=next_free_idx) + bin_thresholds=bin_thresholds, + n_bins_non_missing=n_bins_non_missing, + next_free_idx=next_free_idx) node['right'] = next_free_idx return _fill_predictor_node_array( predictor_nodes, grower_node.right_child, - bin_thresholds=bin_thresholds, next_free_idx=next_free_idx) + bin_thresholds=bin_thresholds, + n_bins_non_missing=n_bins_non_missing, + next_free_idx=next_free_idx) diff --git a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx index c83fa0c79db71..740e5e002cf4e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx @@ -12,10 +12,10 @@ from cython.parallel import prange import numpy as np cimport numpy as np -from .types import HISTOGRAM_DTYPE -from .types cimport hist_struct -from .types cimport X_BINNED_DTYPE_C -from .types cimport G_H_DTYPE_C +from .common import HISTOGRAM_DTYPE +from .common cimport hist_struct +from .common cimport X_BINNED_DTYPE_C +from .common cimport G_H_DTYPE_C # Notes: # - IN views are read-only, OUT views are write-only @@ -62,9 +62,9 @@ cdef class HistogramBuilder: ---------- X_binned : ndarray of int, shape (n_samples, n_features) The binned input samples. Must be Fortran-aligned. - max_bins : int - The maximum number of bins. Used to define the shape of the - histograms. + n_bins : int + The total number of bins, including the bin for missing values. Used + to define the shape of the histograms. gradients : ndarray, shape (n_samples,) The gradients of each training sample. Those are the gradients of the loss w.r.t the predictions, evaluated at iteration i - 1. 
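As a rough reference, what one feature's histogram accumulates can be written
in plain NumPy as follows (the Cython builder below does this in parallel,
and can also derive a histogram by subtracting a sibling's histogram from the
parent's); ``binned_column`` and ``n_bins`` are toy inputs, with the last bin
collecting the samples whose value was missing::

    import numpy as np

    def toy_histogram(binned_column, gradients, hessians, n_bins):
        # per-bin sums over one feature column of binned (uint8) values
        sum_gradients = np.bincount(binned_column, weights=gradients,
                                    minlength=n_bins)
        sum_hessians = np.bincount(binned_column, weights=hessians,
                                   minlength=n_bins)
        count = np.bincount(binned_column, minlength=n_bins)
        return sum_gradients, sum_hessians, count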
@@ -77,7 +77,7 @@ cdef class HistogramBuilder: cdef public: const X_BINNED_DTYPE_C [::1, :] X_binned unsigned int n_features - unsigned int max_bins + unsigned int n_bins G_H_DTYPE_C [::1] gradients G_H_DTYPE_C [::1] hessians G_H_DTYPE_C [::1] ordered_gradients @@ -85,15 +85,15 @@ cdef class HistogramBuilder: unsigned char hessians_are_constant def __init__(self, const X_BINNED_DTYPE_C [::1, :] X_binned, - unsigned int max_bins, G_H_DTYPE_C [::1] gradients, + unsigned int n_bins, G_H_DTYPE_C [::1] gradients, G_H_DTYPE_C [::1] hessians, unsigned char hessians_are_constant): self.X_binned = X_binned self.n_features = X_binned.shape[1] - # Note: all histograms will have bins, but some of the - # last bins may be unused if actual_n_bins[f] < max_bins - self.max_bins = max_bins + # Note: all histograms will have bins, but some of the + # bins may be unused if a feature has a small number of unique values. + self.n_bins = n_bins self.gradients = gradients self.hessians = hessians # for root node, gradients and hessians are already ordered @@ -115,7 +115,7 @@ cdef class HistogramBuilder: Returns ------- - histograms : ndarray of HISTOGRAM_DTYPE, shape (n_features, max_bins) + histograms : ndarray of HISTOGRAM_DTYPE, shape (n_features, n_bins) The computed histograms of the current node. """ cdef: @@ -131,7 +131,7 @@ cdef class HistogramBuilder: G_H_DTYPE_C [::1] ordered_hessians = self.ordered_hessians G_H_DTYPE_C [::1] hessians = self.hessians hist_struct [:, ::1] histograms = np.zeros( - shape=(self.n_features, self.max_bins), + shape=(self.n_features, self.n_bins), dtype=HISTOGRAM_DTYPE ) @@ -210,15 +210,15 @@ cdef class HistogramBuilder: Parameters ---------- parent_histograms : ndarray of HISTOGRAM_DTYPE, \ - shape (n_features, max_bins) + shape (n_features, n_bins) The histograms of the parent. sibling_histograms : ndarray of HISTOGRAM_DTYPE, \ - shape (n_features, max_bins) + shape (n_features, n_bins) The histograms of the sibling. Returns ------- - histograms : ndarray of HISTOGRAM_DTYPE, shape(n_features, max_bins) + histograms : ndarray of HISTOGRAM_DTYPE, shape(n_features, n_bins) The computed histograms of the current node. 
""" @@ -226,14 +226,14 @@ cdef class HistogramBuilder: int feature_idx int n_features = self.n_features hist_struct [:, ::1] histograms = np.zeros( - shape=(self.n_features, self.max_bins), + shape=(self.n_features, self.n_bins), dtype=HISTOGRAM_DTYPE ) for feature_idx in prange(n_features, schedule='static', nogil=True): # Compute histogram of each feature _subtract_histograms(feature_idx, - self.max_bins, + self.n_bins, parent_histograms, sibling_histograms, histograms) diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index 5d7c68ea0b38f..9e00187d62425 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -15,8 +15,8 @@ except ImportError: from scipy.misc import logsumexp -from .types import Y_DTYPE -from .types import G_H_DTYPE +from .common import Y_DTYPE +from .common import G_H_DTYPE from ._loss import _update_gradients_least_squares from ._loss import _update_gradients_hessians_binary_crossentropy from ._loss import _update_gradients_hessians_categorical_crossentropy diff --git a/sklearn/ensemble/_hist_gradient_boosting/predictor.py b/sklearn/ensemble/_hist_gradient_boosting/predictor.py index c80788d049874..0b359c8f98224 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/predictor.py +++ b/sklearn/ensemble/_hist_gradient_boosting/predictor.py @@ -5,7 +5,7 @@ import numpy as np -from .types import Y_DTYPE +from .common import Y_DTYPE from ._predictor import _predict_from_numeric_data from ._predictor import _predict_from_binned_data from ._predictor import _compute_partial_dependence @@ -47,13 +47,17 @@ def predict(self, X): _predict_from_numeric_data(self.nodes, X, out) return out - def predict_binned(self, X): + def predict_binned(self, X, missing_values_bin_idx): """Predict raw values for binned data. Parameters ---------- X : ndarray, shape (n_samples, n_features) The input samples. + missing_values_bin_idx : uint8 + Index of the bin that is used for missing values. This is the + index of the last bin and is always equal to max_bins (as passed + to the GBDT classes), or equivalently to n_bins - 1. Returns ------- @@ -61,7 +65,7 @@ def predict_binned(self, X): The raw predicted values. """ out = np.empty(X.shape[0], dtype=Y_DTYPE) - _predict_from_binned_data(self.nodes, X, out) + _predict_from_binned_data(self.nodes, X, missing_values_bin_idx, out) return out def compute_partial_dependence(self, grid, target_features, out): diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 6dc6e58d9acff..fda060e238514 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -20,10 +20,10 @@ IF SKLEARN_OPENMP_SUPPORTED: from libc.stdlib cimport malloc, free from libc.string cimport memcpy -from .types cimport X_BINNED_DTYPE_C -from .types cimport Y_DTYPE_C -from .types cimport hist_struct -from .types import HISTOGRAM_DTYPE +from .common cimport X_BINNED_DTYPE_C +from .common cimport Y_DTYPE_C +from .common cimport hist_struct +from .common import HISTOGRAM_DTYPE cdef struct split_info_struct: @@ -32,6 +32,7 @@ cdef struct split_info_struct: Y_DTYPE_C gain int feature_idx unsigned int bin_idx + unsigned char missing_go_to_left Y_DTYPE_C sum_gradient_left Y_DTYPE_C sum_gradient_right Y_DTYPE_C sum_hessian_left @@ -51,6 +52,8 @@ class SplitInfo: The index of the feature to be split. 
bin_idx : int The index of the bin on which the split is made. + missing_go_to_left : bool + Whether missing values should go to the left child. sum_gradient_left : float The sum of the gradients of all the samples in the left child. sum_hessian_left : float @@ -64,12 +67,14 @@ class SplitInfo: n_samples_right : int The number of samples in the right child. """ - def __init__(self, gain, feature_idx, bin_idx, sum_gradient_left, - sum_hessian_left, sum_gradient_right, sum_hessian_right, - n_samples_left, n_samples_right): + def __init__(self, gain, feature_idx, bin_idx, + missing_go_to_left, sum_gradient_left, sum_hessian_left, + sum_gradient_right, sum_hessian_right, n_samples_left, + n_samples_right): self.gain = gain self.feature_idx = feature_idx self.bin_idx = bin_idx + self.missing_go_to_left = missing_go_to_left self.sum_gradient_left = sum_gradient_left self.sum_hessian_left = sum_hessian_left self.sum_gradient_right = sum_gradient_right @@ -91,9 +96,16 @@ cdef class Splitter: ---------- X_binned : ndarray of int, shape (n_samples, n_features) The binned input samples. Must be Fortran-aligned. - actual_n_bins : ndarray, shape (n_features,) - The actual number of bins needed for each feature, which is lower or - equal to max_bins. + n_bins_non_missing : ndarray, shape (n_features,) + For each feature, gives the number of bins actually used for + non-missing values. + missing_values_bin_idx : uint8 + Index of the bin that is used for missing values. This is the index of + the last bin and is always equal to max_bins (as passed to the GBDT + classes), or equivalently to n_bins - 1. + has_missing_values : ndarray, shape (n_features,) + Whether missing values were observed in the training data, for each + feature. l2_regularization : float The L2 regularization parameter. 
min_hessian_to_split : float, default=1e-3 @@ -111,7 +123,9 @@ cdef class Splitter: cdef public: const X_BINNED_DTYPE_C [::1, :] X_binned unsigned int n_features - unsigned int [::1] actual_n_bins + const unsigned int [::1] n_bins_non_missing + unsigned char missing_values_bin_idx + const unsigned char [::1] has_missing_values unsigned char hessians_are_constant Y_DTYPE_C l2_regularization Y_DTYPE_C min_hessian_to_split @@ -122,16 +136,22 @@ cdef class Splitter: unsigned int [::1] left_indices_buffer unsigned int [::1] right_indices_buffer - def __init__(self, const X_BINNED_DTYPE_C [::1, :] X_binned, - np.ndarray[np.uint32_t] actual_n_bins, - Y_DTYPE_C l2_regularization, Y_DTYPE_C - min_hessian_to_split=1e-3, unsigned int - min_samples_leaf=20, Y_DTYPE_C min_gain_to_split=0., + def __init__(self, + const X_BINNED_DTYPE_C [::1, :] X_binned, + const unsigned int [::1] n_bins_non_missing, + const unsigned char missing_values_bin_idx, + const unsigned char [::1] has_missing_values, + Y_DTYPE_C l2_regularization, + Y_DTYPE_C min_hessian_to_split=1e-3, + unsigned int min_samples_leaf=20, + Y_DTYPE_C min_gain_to_split=0., unsigned char hessians_are_constant=False): self.X_binned = X_binned self.n_features = X_binned.shape[1] - self.actual_n_bins = actual_n_bins + self.n_bins_non_missing = n_bins_non_missing + self.missing_values_bin_idx = missing_values_bin_idx + self.has_missing_values = has_missing_values self.l2_regularization = l2_regularization self.min_hessian_to_split = min_hessian_to_split self.min_samples_leaf = min_samples_leaf @@ -228,6 +248,8 @@ cdef class Splitter: cdef: int n_samples = sample_indices.shape[0] X_BINNED_DTYPE_C bin_idx = split_info.bin_idx + unsigned char missing_go_to_left = split_info.missing_go_to_left + unsigned char missing_values_bin_idx = self.missing_values_bin_idx int feature_idx = split_info.feature_idx const X_BINNED_DTYPE_C [::1] X_binned = \ self.X_binned[:, feature_idx] @@ -252,6 +274,7 @@ cdef class Splitter: int thread_idx int sample_idx int right_child_position + unsigned char turn_left int [:] left_offset = np.zeros(n_threads, dtype=np.int32) int [:] right_offset = np.zeros(n_threads, dtype=np.int32) @@ -273,7 +296,12 @@ cdef class Splitter: stop = start + sizes[thread_idx] for i in range(start, stop): sample_idx = sample_indices[i] - if X_binned[sample_idx] <= bin_idx: + turn_left = sample_goes_left( + missing_go_to_left, + missing_values_bin_idx, bin_idx, + X_binned[sample_idx]) + + if turn_left: left_indices_buffer[start + left_count] = sample_idx left_count = left_count + 1 else: @@ -350,6 +378,7 @@ cdef class Splitter: int n_features = self.n_features split_info_struct split_info split_info_struct * split_infos + const unsigned char [:] has_missing_values = self.has_missing_values with nogil: @@ -358,10 +387,32 @@ cdef class Splitter: for feature_idx in prange(n_features, schedule='static'): # For each feature, find best bin to split on - split_info = self._find_best_bin_to_split_helper( - feature_idx, histograms, n_samples, - sum_gradients, sum_hessians) - split_infos[feature_idx] = split_info + # Start with a gain of -1 (if no better split is found, that + # means one of the constraints isn't respected + # (min_samples_leaf, etc) and the grower will later turn the + # node into a leaf. + split_infos[feature_idx].gain = -1 + + # We will scan bins from left to right (in all cases), and if + # there are any missing values, we will also scan bins from + # right to left. 
This way, we can consider whichever case + # yields the best gain: either missing values go to the right + # (left to right scan) or to the left (right to left case). + # See algo 3 from the XGBoost paper + # https://arxiv.org/abs/1603.02754 + + self._find_best_bin_to_split_left_to_right( + feature_idx, has_missing_values[feature_idx], + histograms, n_samples, sum_gradients, sum_hessians, + &split_infos[feature_idx]) + + if has_missing_values[feature_idx]: + # We need to explore both directions to check whether + # sending the nans to the left child would lead to a higher + # gain + self._find_best_bin_to_split_right_to_left( + feature_idx, histograms, n_samples, + sum_gradients, sum_hessians, &split_infos[feature_idx]) # then compute best possible split among all features best_feature_idx = self._find_best_feature_to_split_helper( @@ -372,6 +423,7 @@ cdef class Splitter: split_info.gain, split_info.feature_idx, split_info.bin_idx, + split_info.missing_go_to_left, split_info.sum_gradient_left, split_info.sum_hessian_left, split_info.sum_gradient_right, @@ -382,13 +434,13 @@ cdef class Splitter: free(split_infos) return out - cdef int _find_best_feature_to_split_helper( + cdef unsigned int _find_best_feature_to_split_helper( self, split_info_struct * split_infos) nogil: # IN """Returns the best feature among those in splits_infos.""" cdef: - int feature_idx - int best_feature_idx = 0 + unsigned int feature_idx + unsigned int best_feature_idx = 0 for feature_idx in range(1, self.n_features): if (split_infos[feature_idx].gain > @@ -396,43 +448,50 @@ cdef class Splitter: best_feature_idx = feature_idx return best_feature_idx - cdef split_info_struct _find_best_bin_to_split_helper( + cdef void _find_best_bin_to_split_left_to_right( Splitter self, unsigned int feature_idx, + unsigned char has_missing_values, const hist_struct [:, ::1] histograms, # IN unsigned int n_samples, Y_DTYPE_C sum_gradients, - Y_DTYPE_C sum_hessians) nogil: + Y_DTYPE_C sum_hessians, + split_info_struct * split_info) nogil: # OUT """Find best bin to split on for a given feature. Splits that do not satisfy the splitting constraints - (min_gain_to_split, etc.) are discarded here. If no split can - satisfy the constraints, a SplitInfo with a gain of -1 is returned. - If for a given node the best SplitInfo has a gain of -1, it is - finalized into a leaf in the grower. + (min_gain_to_split, etc.) are discarded here. + + We scan node from left to right. This version is called whether there + are missing values or not. If any, missing values are assigned to the + right node. """ cdef: unsigned int bin_idx unsigned int n_samples_left unsigned int n_samples_right unsigned int n_samples_ = n_samples + # We set the 'end' variable such that the last non-missing-values + # bin never goes to the left child (which would result in and + # empty right child), unless there are missing values, since these + # would go to the right child. + unsigned int end = \ + self.n_bins_non_missing[feature_idx] - 1 + has_missing_values Y_DTYPE_C sum_hessian_left Y_DTYPE_C sum_hessian_right Y_DTYPE_C sum_gradient_left Y_DTYPE_C sum_gradient_right Y_DTYPE_C negative_loss_current_node Y_DTYPE_C gain - split_info_struct best_split - best_split.gain = -1. sum_gradient_left, sum_hessian_left = 0., 0. 
n_samples_left = 0 negative_loss_current_node = negative_loss(sum_gradients, - sum_hessians, self.l2_regularization) + sum_hessians, + self.l2_regularization) + - for bin_idx in range(self.actual_n_bins[feature_idx] - 1): - # Note that considering splitting on the last bin is useless since - # it would result in having 0 samples in the right node (forbidden) + for bin_idx in range(end): n_samples_left += histograms[feature_idx, bin_idx].count n_samples_right = n_samples_ - n_samples_left @@ -463,19 +522,103 @@ cdef class Splitter: negative_loss_current_node, self.l2_regularization) - if gain > best_split.gain and gain > self.min_gain_to_split: - best_split.gain = gain - best_split.feature_idx = feature_idx - best_split.bin_idx = bin_idx - best_split.sum_gradient_left = sum_gradient_left - best_split.sum_gradient_right = sum_gradient_right - best_split.sum_hessian_left = sum_hessian_left - best_split.sum_hessian_right = sum_hessian_right - best_split.n_samples_left = n_samples_left - best_split.n_samples_right = n_samples_right + if gain > split_info.gain and gain > self.min_gain_to_split: + split_info.gain = gain + split_info.feature_idx = feature_idx + split_info.bin_idx = bin_idx + # we scan from left to right so missing values go to the right + split_info.missing_go_to_left = False + split_info.sum_gradient_left = sum_gradient_left + split_info.sum_gradient_right = sum_gradient_right + split_info.sum_hessian_left = sum_hessian_left + split_info.sum_hessian_right = sum_hessian_right + split_info.n_samples_left = n_samples_left + split_info.n_samples_right = n_samples_right + + cdef void _find_best_bin_to_split_right_to_left( + self, + unsigned int feature_idx, + const hist_struct [:, ::1] histograms, # IN + unsigned int n_samples, + Y_DTYPE_C sum_gradients, + Y_DTYPE_C sum_hessians, + split_info_struct * split_info) nogil: # OUT + """Find best bin to split on for a given feature. + + Splits that do not satisfy the splitting constraints + (min_gain_to_split, etc.) are discarded here. + + We scan node from right to left. This version is only called when + there are missing values. Missing values are assigned to the left + child. + + If no missing value are present in the data this method isn't called + since only calling _find_best_bin_to_split_left_to_right is enough. + """ + + cdef: + unsigned int bin_idx + unsigned int n_samples_left + unsigned int n_samples_right + unsigned int n_samples_ = n_samples + Y_DTYPE_C sum_hessian_left + Y_DTYPE_C sum_hessian_right + Y_DTYPE_C sum_gradient_left + Y_DTYPE_C sum_gradient_right + Y_DTYPE_C negative_loss_current_node + Y_DTYPE_C gain + unsigned int start = self.n_bins_non_missing[feature_idx] - 2 + + sum_gradient_right, sum_hessian_right = 0., 0. 
+ n_samples_right = 0 + negative_loss_current_node = negative_loss(sum_gradients, + sum_hessians, + self.l2_regularization) + + for bin_idx in range(start, -1, -1): + n_samples_right += histograms[feature_idx, bin_idx + 1].count + n_samples_left = n_samples_ - n_samples_right + + if self.hessians_are_constant: + sum_hessian_right += histograms[feature_idx, bin_idx + 1].count + else: + sum_hessian_right += \ + histograms[feature_idx, bin_idx + 1].sum_hessians + sum_hessian_left = sum_hessians - sum_hessian_right + + sum_gradient_right += \ + histograms[feature_idx, bin_idx + 1].sum_gradients + sum_gradient_left = sum_gradients - sum_gradient_right + + if n_samples_right < self.min_samples_leaf: + continue + if n_samples_left < self.min_samples_leaf: + # won't get any better + break + + if sum_hessian_right < self.min_hessian_to_split: + continue + if sum_hessian_left < self.min_hessian_to_split: + # won't get any better (hessians are > 0 since loss is convex) + break - return best_split + gain = _split_gain(sum_gradient_left, sum_hessian_left, + sum_gradient_right, sum_hessian_right, + negative_loss_current_node, + self.l2_regularization) + if gain > split_info.gain and gain > self.min_gain_to_split: + split_info.gain = gain + split_info.feature_idx = feature_idx + split_info.bin_idx = bin_idx + # we scan from right to left so missing values go to the left + split_info.missing_go_to_left = True + split_info.sum_gradient_left = sum_gradient_left + split_info.sum_gradient_right = sum_gradient_right + split_info.sum_hessian_left = sum_hessian_left + split_info.sum_hessian_right = sum_hessian_right + split_info.n_samples_left = n_samples_left + split_info.n_samples_right = n_samples_right cdef inline Y_DTYPE_C _split_gain( Y_DTYPE_C sum_gradient_left, @@ -507,3 +650,19 @@ cdef inline Y_DTYPE_C negative_loss( Y_DTYPE_C hessian, Y_DTYPE_C l2_regularization) nogil: return (gradient * gradient) / (hessian + l2_regularization) + +cdef inline unsigned char sample_goes_left( + unsigned char missing_go_to_left, + unsigned char missing_values_bin_idx, + X_BINNED_DTYPE_C split_bin_idx, + X_BINNED_DTYPE_C bin_value) nogil: + """Helper to decide whether sample should go to left or right child.""" + + return ( + ( + missing_go_to_left and + bin_value == missing_values_bin_idx + ) + or ( + bin_value <= split_bin_idx + )) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index 8cbb26fa98178..06e38d62f7638 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -7,8 +7,9 @@ _find_binning_thresholds as _find_binning_thresholds_orig, _map_to_bins ) -from sklearn.ensemble._hist_gradient_boosting.types import X_DTYPE -from sklearn.ensemble._hist_gradient_boosting.types import X_BINNED_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import X_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import ALMOST_INF DATA = np.random.RandomState(42).normal( @@ -16,7 +17,7 @@ ).astype(X_DTYPE) -def _find_binning_thresholds(data, max_bins=256, subsample=int(2e5), +def _find_binning_thresholds(data, max_bins=255, subsample=int(2e5), random_state=None): # Just a redef to avoid having to pass arguments all the time (as the # function is private we don't use default values for parameters) @@ -52,10 +53,11 @@ def 
test_find_binning_thresholds_small_regular_data(): def test_find_binning_thresholds_random_data(): - bin_thresholds = _find_binning_thresholds(DATA, random_state=0) + bin_thresholds = _find_binning_thresholds(DATA, max_bins=255, + random_state=0) assert len(bin_thresholds) == 2 for i in range(len(bin_thresholds)): - assert bin_thresholds[i].shape == (255,) # 256 - 1 + assert bin_thresholds[i].shape == (254,) # 255 - 1 assert bin_thresholds[i].dtype == DATA.dtype assert_allclose(bin_thresholds[0][[64, 128, 192]], @@ -74,25 +76,29 @@ def test_find_binning_thresholds_low_n_bins(): assert bin_thresholds[i].dtype == DATA.dtype -def test_find_binning_thresholds_invalid_n_bins(): - err_msg = 'no smaller than 2 and no larger than 256' +@pytest.mark.parametrize('n_bins', (2, 257)) +def test_invalid_n_bins(n_bins): + err_msg = ( + 'n_bins={} should be no smaller than 3 and no larger than 256' + .format(n_bins)) with pytest.raises(ValueError, match=err_msg): - _find_binning_thresholds(DATA, max_bins=1024) + _BinMapper(n_bins=n_bins).fit(DATA) def test_bin_mapper_n_features_transform(): - mapper = _BinMapper(max_bins=42, random_state=42).fit(DATA) + mapper = _BinMapper(n_bins=42, random_state=42).fit(DATA) err_msg = 'This estimator was fitted with 2 features but 4 got passed' with pytest.raises(ValueError, match=err_msg): mapper.transform(np.repeat(DATA, 2, axis=1)) -@pytest.mark.parametrize('n_bins', [16, 128, 256]) -def test_map_to_bins(n_bins): - bin_thresholds = _find_binning_thresholds(DATA, max_bins=n_bins, +@pytest.mark.parametrize('max_bins', [16, 128, 255]) +def test_map_to_bins(max_bins): + bin_thresholds = _find_binning_thresholds(DATA, max_bins=max_bins, random_state=0) binned = np.zeros_like(DATA, dtype=X_BINNED_DTYPE, order='F') - _map_to_bins(DATA, bin_thresholds, binned) + last_bin_idx = max_bins + _map_to_bins(DATA, bin_thresholds, last_bin_idx, binned) assert binned.shape == DATA.shape assert binned.dtype == np.uint8 assert binned.flags.f_contiguous @@ -103,47 +109,52 @@ def test_map_to_bins(n_bins): for feature_idx, min_idx in enumerate(min_indices): assert binned[min_idx, feature_idx] == 0 for feature_idx, max_idx in enumerate(max_indices): - assert binned[max_idx, feature_idx] == n_bins - 1 + assert binned[max_idx, feature_idx] == max_bins - 1 -@pytest.mark.parametrize("n_bins", [5, 10, 42]) -def test_bin_mapper_random_data(n_bins): +@pytest.mark.parametrize("max_bins", [5, 10, 42]) +def test_bin_mapper_random_data(max_bins): n_samples, n_features = DATA.shape - expected_count_per_bin = n_samples // n_bins + expected_count_per_bin = n_samples // max_bins tol = int(0.05 * expected_count_per_bin) - mapper = _BinMapper(max_bins=n_bins, random_state=42).fit(DATA) + # max_bins is the number of bins for non-missing values + n_bins = max_bins + 1 + mapper = _BinMapper(n_bins=n_bins, random_state=42).fit(DATA) binned = mapper.transform(DATA) assert binned.shape == (n_samples, n_features) assert binned.dtype == np.uint8 assert_array_equal(binned.min(axis=0), np.array([0, 0])) - assert_array_equal(binned.max(axis=0), np.array([n_bins - 1, n_bins - 1])) + assert_array_equal(binned.max(axis=0), + np.array([max_bins - 1, max_bins - 1])) assert len(mapper.bin_thresholds_) == n_features for bin_thresholds_feature in mapper.bin_thresholds_: - assert bin_thresholds_feature.shape == (n_bins - 1,) + assert bin_thresholds_feature.shape == (max_bins - 1,) assert bin_thresholds_feature.dtype == DATA.dtype - assert np.all(mapper.actual_n_bins_ == n_bins) + assert np.all(mapper.n_bins_non_missing_ == 
max_bins) # Check that the binned data is approximately balanced across bins. for feature_idx in range(n_features): - for bin_idx in range(n_bins): + for bin_idx in range(max_bins): count = (binned[:, feature_idx] == bin_idx).sum() assert abs(count - expected_count_per_bin) < tol -@pytest.mark.parametrize("n_samples, n_bins", [ +@pytest.mark.parametrize("n_samples, max_bins", [ (5, 5), (5, 10), (5, 11), (42, 255) ]) -def test_bin_mapper_small_random_data(n_samples, n_bins): +def test_bin_mapper_small_random_data(n_samples, max_bins): data = np.random.RandomState(42).normal(size=n_samples).reshape(-1, 1) assert len(np.unique(data)) == n_samples - mapper = _BinMapper(max_bins=n_bins, random_state=42) + # max_bins is the number of bins for non-missing values + n_bins = max_bins + 1 + mapper = _BinMapper(n_bins=n_bins, random_state=42) binned = mapper.fit_transform(data) assert binned.shape == data.shape @@ -152,14 +163,16 @@ def test_bin_mapper_small_random_data(n_samples, n_bins): np.arange(n_samples)) -@pytest.mark.parametrize("n_bins, n_distinct, multiplier", [ +@pytest.mark.parametrize("max_bins, n_distinct, multiplier", [ (5, 5, 1), (5, 5, 3), (255, 12, 42), ]) -def test_bin_mapper_identity_repeated_values(n_bins, n_distinct, multiplier): +def test_bin_mapper_identity_repeated_values(max_bins, n_distinct, multiplier): data = np.array(list(range(n_distinct)) * multiplier).reshape(-1, 1) - binned = _BinMapper(max_bins=n_bins).fit_transform(data) + # max_bins is the number of bins for non-missing values + n_bins = max_bins + 1 + binned = _BinMapper(n_bins=n_bins).fit_transform(data) assert_array_equal(data, binned) @@ -176,59 +189,62 @@ def test_bin_mapper_repeated_values_invariance(n_distinct): data = data.reshape(-1, 1) - mapper_1 = _BinMapper(max_bins=n_distinct) + mapper_1 = _BinMapper(n_bins=n_distinct + 1) binned_1 = mapper_1.fit_transform(data) assert_array_equal(np.unique(binned_1[:, 0]), np.arange(n_distinct)) # Adding more bins to the mapper yields the same results (same thresholds) - mapper_2 = _BinMapper(max_bins=min(256, n_distinct * 3)) + mapper_2 = _BinMapper(n_bins=min(256, n_distinct * 3) + 1) binned_2 = mapper_2.fit_transform(data) assert_allclose(mapper_1.bin_thresholds_[0], mapper_2.bin_thresholds_[0]) assert_array_equal(binned_1, binned_2) -@pytest.mark.parametrize("n_bins, scale, offset", [ +@pytest.mark.parametrize("max_bins, scale, offset", [ (3, 2, -1), (42, 1, 0), - (256, 0.3, 42), + (255, 0.3, 42), ]) -def test_bin_mapper_identity_small(n_bins, scale, offset): - data = np.arange(n_bins).reshape(-1, 1) * scale + offset - binned = _BinMapper(max_bins=n_bins).fit_transform(data) - assert_array_equal(binned, np.arange(n_bins).reshape(-1, 1)) +def test_bin_mapper_identity_small(max_bins, scale, offset): + data = np.arange(max_bins).reshape(-1, 1) * scale + offset + # max_bins is the number of bins for non-missing values + n_bins = max_bins + 1 + binned = _BinMapper(n_bins=n_bins).fit_transform(data) + assert_array_equal(binned, np.arange(max_bins).reshape(-1, 1)) -@pytest.mark.parametrize('n_bins_small, n_bins_large', [ +@pytest.mark.parametrize('max_bins_small, max_bins_large', [ (2, 2), (3, 3), (4, 4), (42, 42), - (256, 256), + (255, 255), (5, 17), - (42, 256), + (42, 255), ]) -def test_bin_mapper_idempotence(n_bins_small, n_bins_large): - assert n_bins_large >= n_bins_small +def test_bin_mapper_idempotence(max_bins_small, max_bins_large): + assert max_bins_large >= max_bins_small data = np.random.RandomState(42).normal(size=30000).reshape(-1, 1) - mapper_small = 
_BinMapper(max_bins=n_bins_small)
-    mapper_large = _BinMapper(max_bins=n_bins_large)
+    mapper_small = _BinMapper(n_bins=max_bins_small + 1)
+    mapper_large = _BinMapper(n_bins=max_bins_large + 1)
     binned_small = mapper_small.fit_transform(data)
     binned_large = mapper_large.fit_transform(binned_small)
     assert_array_equal(binned_small, binned_large)
 
 
-@pytest.mark.parametrize('max_bins', [10, 100, 256])
+@pytest.mark.parametrize('n_bins', [10, 100, 256])
 @pytest.mark.parametrize('diff', [-5, 0, 5])
-def test_actual_n_bins(max_bins, diff):
-    # Check that actual_n_bins is n_unique_values when
-    # n_unique_values <= max_bins, else max_bins.
+def test_n_bins_non_missing(n_bins, diff):
+    # Check that n_bins_non_missing is n_unique_values when
+    # there are not a lot of unique values, else n_bins - 1.
 
-    n_unique_values = max_bins + diff
+    n_unique_values = n_bins + diff
     X = list(range(n_unique_values)) * 2
     X = np.array(X).reshape(-1, 1)
-    mapper = _BinMapper(max_bins=max_bins).fit(X)
-    assert np.all(mapper.actual_n_bins_ == min(max_bins, n_unique_values))
+    mapper = _BinMapper(n_bins=n_bins).fit(X)
+    assert np.all(mapper.n_bins_non_missing_ == min(
+        n_bins - 1, n_unique_values))
 
 
 def test_subsample():
@@ -242,6 +258,48 @@ def test_subsample():
                            rtol=1e-4)
 
 
+@pytest.mark.parametrize(
+    'n_bins, n_bins_non_missing, X_trans_expected', [
+        (256, [4, 2, 2], [[0, 0, 0],  # 255 <=> missing value
+                          [255, 255, 0],
+                          [1, 0, 0],
+                          [255, 1, 1],
+                          [2, 1, 1],
+                          [3, 0, 0]]),
+        (3, [2, 2, 2], [[0, 0, 0],  # 2 <=> missing value
+                        [2, 2, 0],
+                        [0, 0, 0],
+                        [2, 1, 1],
+                        [1, 1, 1],
+                        [1, 0, 0]])])
+def test_missing_values_support(n_bins, n_bins_non_missing, X_trans_expected):
+    # check for missing values: make sure nans are mapped to the last bin
+    # and that the _BinMapper attributes are correct
+
+    X = [[1, 1, 0],
+         [np.NaN, np.NaN, 0],
+         [2, 1, 0],
+         [np.NaN, 2, 1],
+         [3, 2, 1],
+         [4, 1, 0]]
+
+    X = np.array(X)
+
+    mapper = _BinMapper(n_bins=n_bins)
+    mapper.fit(X)
+
+    assert_array_equal(mapper.n_bins_non_missing_, n_bins_non_missing)
+
+    for feature_idx in range(X.shape[1]):
+        assert len(mapper.bin_thresholds_[feature_idx]) == \
+            n_bins_non_missing[feature_idx] - 1
+
+    assert mapper.missing_values_bin_idx_ == n_bins - 1
+
+    X_trans = mapper.transform(X)
+    assert_array_equal(X_trans, X_trans_expected)
+
+
 def test_infinite_values():
     # Make sure infinite values are properly handled.
bin_mapper = _BinMapper() @@ -249,8 +307,8 @@ def test_infinite_values(): X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1) bin_mapper.fit(X) - assert_allclose(bin_mapper.bin_thresholds_[0], [-np.inf, .5, np.inf]) - assert bin_mapper.actual_n_bins_ == [4] + assert_allclose(bin_mapper.bin_thresholds_[0], [-np.inf, .5, ALMOST_INF]) + assert bin_mapper.n_bins_non_missing_ == [4] expected_binned_X = np.array([0, 1, 2, 3]).reshape(-1, 1) assert_array_equal(bin_mapper.transform(X), expected_binned_X) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py index 95672a60e5c40..63d8c8fb1059d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py @@ -43,7 +43,7 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples, rng = np.random.RandomState(seed=seed) n_samples = n_samples max_iter = 1 - max_bins = 256 + max_bins = 255 X, y = make_regression(n_samples=n_samples, n_features=5, n_informative=5, random_state=0) @@ -51,7 +51,7 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples, if n_samples > 255: # bin data and convert it to float32 so that the estimator doesn't # treat it as pre-binned - X = _BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32) + X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) @@ -95,7 +95,7 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples, rng = np.random.RandomState(seed=seed) n_samples = n_samples max_iter = 1 - max_bins = 256 + max_bins = 255 X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5, n_informative=5, n_redundant=0, random_state=0) @@ -103,7 +103,7 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples, if n_samples > 255: # bin data and convert it to float32 so that the estimator doesn't # treat it as pre-binned - X = _BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32) + X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) @@ -155,7 +155,7 @@ def test_same_predictions_multiclass_classification( rng = np.random.RandomState(seed=seed) n_samples = n_samples max_iter = 1 - max_bins = 256 + max_bins = 255 lr = 1 X, y = make_classification(n_samples=n_samples, n_classes=3, n_features=5, @@ -165,7 +165,7 @@ def test_same_predictions_multiclass_classification( if n_samples > 255: # bin data and convert it to float32 so that the estimator doesn't # treat it as pre-binned - X = _BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32) + X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index be7e424a844bc..1eebdefd5288d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -1,6 +1,11 @@ import numpy as np import pytest +from numpy.testing import assert_allclose from sklearn.datasets import make_classification, make_regression +from sklearn.preprocessing import 
KBinsDiscretizer, MinMaxScaler +from sklearn.model_selection import train_test_split +from sklearn.base import clone, BaseEstimator, TransformerMixin +from sklearn.pipeline import make_pipeline # To use this experimental feature, we need to explicitly ask for it: from sklearn.experimental import enable_hist_gradient_boosting # noqa @@ -31,7 +36,7 @@ ({'min_samples_leaf': 0}, 'min_samples_leaf=0 should not be smaller'), ({'l2_regularization': -1}, 'l2_regularization=-1 must be positive'), ({'max_bins': 1}, 'max_bins=1 should be no smaller than 2 and no larger'), - ({'max_bins': 257}, 'max_bins=257 should be no smaller than 2 and no'), + ({'max_bins': 256}, 'max_bins=256 should be no smaller than 2 and no'), ({'n_iter_no_change': -1}, 'n_iter_no_change=-1 must be positive'), ({'validation_fraction': -1}, 'validation_fraction=-1 must be strictly'), ({'validation_fraction': 0}, 'validation_fraction=0 must be strictly'), @@ -170,10 +175,65 @@ def test_binning_train_validation_are_separated(): mapper_whole_data.fit(X_classification) n_samples = X_classification.shape[0] - assert np.all(mapper_training_data.actual_n_bins_ == + assert np.all(mapper_training_data.n_bins_non_missing_ == int((1 - validation_fraction) * n_samples)) - assert np.all(mapper_training_data.actual_n_bins_ != - mapper_whole_data.actual_n_bins_) + assert np.all(mapper_training_data.n_bins_non_missing_ != + mapper_whole_data.n_bins_non_missing_) + + +def test_missing_values_trivial(): + # sanity check for missing values support. With only one feature and + # y == isnan(X), the gbdt is supposed to reach perfect accuracy on the + # training set. + + n_samples = 100 + n_features = 1 + rng = np.random.RandomState(0) + + X = rng.normal(size=(n_samples, n_features)) + mask = rng.binomial(1, .5, size=X.shape).astype(np.bool) + X[mask] = np.nan + y = mask.ravel() + gb = HistGradientBoostingClassifier() + gb.fit(X, y) + + assert gb.score(X, y) == pytest.approx(1) + + +@pytest.mark.parametrize('problem', ('classification', 'regression')) +@pytest.mark.parametrize( + 'missing_proportion, expected_min_score_classification, ' + 'expected_min_score_regression', [ + (.1, .97, .89), + (.2, .93, .81), + (.5, .79, .52)]) +def test_missing_values_resilience(problem, missing_proportion, + expected_min_score_classification, + expected_min_score_regression): + # Make sure the estimators can deal with missing values and still yield + # decent predictions + + rng = np.random.RandomState(0) + n_samples = 1000 + n_features = 2 + if problem == 'regression': + X, y = make_regression(n_samples=n_samples, n_features=n_features, + n_informative=n_features, random_state=rng) + gb = HistGradientBoostingRegressor() + expected_min_score = expected_min_score_regression + else: + X, y = make_classification(n_samples=n_samples, n_features=n_features, + n_informative=n_features, n_redundant=0, + n_repeated=0, random_state=rng) + gb = HistGradientBoostingClassifier() + expected_min_score = expected_min_score_classification + + mask = rng.binomial(1, missing_proportion, size=X.shape).astype(np.bool) + X[mask] = np.nan + + gb.fit(X, y) + + assert gb.score(X, y) > expected_min_score @pytest.mark.parametrize('data', [ @@ -222,7 +282,115 @@ def test_small_trainset(): assert small_distrib == pytest.approx(original_distrib) +def test_missing_values_minmax_imputation(): + # Compare the buit-in missing value handling of Histogram GBC with an + # a-priori missing value imputation strategy that should yield the same + # results in terms of decision function. 
+ # + # Each feature (containing NaNs) is replaced by 2 features: + # - one where the nans are replaced by min(feature) - 1 + # - one where the nans are replaced by max(feature) + 1 + # A split where nans go to the left has an equivalent split in the + # first (min) feature, and a split where nans go to the right has an + # equivalent split in the second (max) feature. + # + # Assuming the data is such that there is never a tie to select the best + # feature to split on during training, the learned decision trees should be + # strictly equivalent (learn a sequence of splits that encode the same + # decision function). + # + # The MinMaxImputer transformer is meant to be a toy implementation of the + # "Missing In Attributes" (MIA) missing value handling for decision trees + # https://www.sciencedirect.com/science/article/abs/pii/S0167865508000305 + # The implementation of MIA as an imputation transformer was suggested by + # "Remark 3" in https://arxiv.org/abs/1902.06931 + + class MinMaxImputer(BaseEstimator, TransformerMixin): + + def fit(self, X, y=None): + mm = MinMaxScaler().fit(X) + self.data_min_ = mm.data_min_ + self.data_max_ = mm.data_max_ + return self + + def transform(self, X): + X_min, X_max = X.copy(), X.copy() + + for feature_idx in range(X.shape[1]): + nan_mask = np.isnan(X[:, feature_idx]) + X_min[nan_mask, feature_idx] = self.data_min_[feature_idx] - 1 + X_max[nan_mask, feature_idx] = self.data_max_[feature_idx] + 1 + + return np.concatenate([X_min, X_max], axis=1) + + def make_missing_value_data(n_samples=int(1e4), seed=0): + rng = np.random.RandomState(seed) + X, y = make_regression(n_samples=n_samples, n_features=4, + random_state=rng) + + # Pre-bin the data to ensure a deterministic handling by the 2 + # strategies and also make it easier to insert np.nan in a structured + # way: + X = KBinsDiscretizer(n_bins=42, encode="ordinal").fit_transform(X) + + # First feature has missing values completely at random: + rnd_mask = rng.rand(X.shape[0]) > 0.9 + X[rnd_mask, 0] = np.nan + + # Second and third features have missing values for extreme values + # (censoring missingness): + low_mask = X[:, 1] == 0 + X[low_mask, 1] = np.nan + + high_mask = X[:, 2] == X[:, 2].max() + X[high_mask, 2] = np.nan + + # Make the last feature nan pattern very informative: + y_max = np.percentile(y, 70) + y_max_mask = y >= y_max + y[y_max_mask] = y_max + X[y_max_mask, 3] = np.nan + + # Check that there is at least one missing value in each feature: + for feature_idx in range(X.shape[1]): + assert any(np.isnan(X[:, feature_idx])) + + # Let's use a test set to check that the learned decision function is + # the same as evaluated on unseen data. Otherwise it could just be the + # case that we find two independent ways to overfit the training set. + return train_test_split(X, y, random_state=rng) + + # n_samples need to be large enough to minimize the likelihood of having + # several candidate splits with the same gain value in a given tree. + X_train, X_test, y_train, y_test = make_missing_value_data( + n_samples=int(1e4), seed=0) + + # Use a small number of leaf nodes and iterations so as to keep + # under-fitting models to minimize the likelihood of ties when training the + # model. 
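+    # (If a tie did occur, the two pipelines could legitimately pick
+    # different, equally good splits, and the strict equivalence checked
+    # below would not hold.)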
+ gbm1 = HistGradientBoostingRegressor(max_iter=100, + max_leaf_nodes=5, + random_state=0) + gbm1.fit(X_train, y_train) + + gbm2 = make_pipeline(MinMaxImputer(), clone(gbm1)) + gbm2.fit(X_train, y_train) + + # Check that the model reach the same score: + assert gbm1.score(X_train, y_train) == \ + pytest.approx(gbm2.score(X_train, y_train)) + + assert gbm1.score(X_test, y_test) == \ + pytest.approx(gbm2.score(X_test, y_test)) + + # Check the individual prediction match as a finer grained + # decision function check. + assert_allclose(gbm1.predict(X_train), gbm2.predict(X_train)) + assert_allclose(gbm1.predict(X_test), gbm2.predict(X_test)) + + def test_infinite_values(): + # Basic test for infinite values X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1) y = np.array([0, 0, 1, 1]) @@ -230,3 +398,20 @@ def test_infinite_values(): gbdt = HistGradientBoostingRegressor(min_samples_leaf=1) gbdt.fit(X, y) np.testing.assert_allclose(gbdt.predict(X), y, atol=1e-4) + + +def test_infinite_values_missing_values(): + # High level test making sure that inf and nan values are properly handled + # when both are present. This is similar to + # test_split_on_nan_with_infinite_values() in test_grower.py, though we + # cannot check the predicitons for binned values here. + + X = np.asarray([-np.inf, 0, 1, np.inf, np.nan]).reshape(-1, 1) + y_isnan = np.isnan(X.ravel()) + y_isinf = X.ravel() == np.inf + + stump_clf = HistGradientBoostingClassifier(min_samples_leaf=1, max_iter=1, + learning_rate=1, max_depth=2) + + assert stump_clf.fit(X, y_isinf).score(X, y_isinf) == 1 + assert stump_clf.fit(X, y_isnan).score(X, y_isnan) == 1 diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index 49b19ce2778dd..0cc301b7b1b36 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -4,9 +4,9 @@ from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper -from sklearn.ensemble._hist_gradient_boosting.types import X_BINNED_DTYPE -from sklearn.ensemble._hist_gradient_boosting.types import Y_DTYPE -from sklearn.ensemble._hist_gradient_boosting.types import G_H_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import Y_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE def _make_training_data(n_bins=256, constant_hessian=True): @@ -85,7 +85,7 @@ def test_grow_tree(n_bins, constant_hessian, stopping_param, shrinkage): stopping_param = {"min_gain_to_split": 0.01} grower = TreeGrower(X_binned, all_gradients, all_hessians, - max_bins=n_bins, shrinkage=shrinkage, + n_bins=n_bins, shrinkage=shrinkage, min_samples_leaf=1, **stopping_param) # The root node is not yet splitted, but the best possible split has @@ -147,7 +147,7 @@ def test_predictor_from_grower(): X_binned, all_gradients, all_hessians = _make_training_data( n_bins=n_bins) grower = TreeGrower(X_binned, all_gradients, all_hessians, - max_bins=n_bins, shrinkage=1., + n_bins=n_bins, shrinkage=1., max_leaf_nodes=3, min_samples_leaf=5) grower.grow() assert grower.n_nodes == 5 # (2 decision nodes + 3 leaves) @@ -163,22 +163,23 @@ def test_predictor_from_grower(): input_data = np.array([ [0, 0], [42, 99], - [128, 255], + [128, 254], [129, 0], [129, 85], - [255, 85], + [254, 85], [129, 86], - [129, 255], + [129, 
254], [242, 100], ], dtype=np.uint8) - predictions = predictor.predict_binned(input_data) + missing_values_bin_idx = n_bins - 1 + predictions = predictor.predict_binned(input_data, missing_values_bin_idx) expected_targets = [1, 1, 1, 1, 1, 1, -1, -1, -1] assert np.allclose(predictions, expected_targets) # Check that training set can be recovered exactly: - predictions = predictor.predict_binned(X_binned) + predictions = predictor.predict_binned(X_binned, missing_values_bin_idx) assert np.allclose(predictions, -all_gradients) @@ -203,14 +204,14 @@ def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins, if noise: y_scale = y.std() y += rng.normal(scale=noise, size=n_samples) * y_scale - mapper = _BinMapper(max_bins=n_bins) + mapper = _BinMapper(n_bins=n_bins) X = mapper.fit_transform(X) all_gradients = y.astype(G_H_DTYPE) shape_hessian = 1 if constant_hessian else all_gradients.shape all_hessians = np.ones(shape=shape_hessian, dtype=G_H_DTYPE) grower = TreeGrower(X, all_gradients, all_hessians, - max_bins=n_bins, shrinkage=1., + n_bins=n_bins, shrinkage=1., min_samples_leaf=min_samples_leaf, max_leaf_nodes=n_samples) grower.grow() @@ -235,18 +236,18 @@ def test_min_samples_leaf_root(n_samples, min_samples_leaf): # min_samples_leaf rng = np.random.RandomState(seed=0) - max_bins = 255 + n_bins = 256 # data = linear target, 3 features, 1 irrelevant. X = rng.normal(size=(n_samples, 3)) y = X[:, 0] - X[:, 1] - mapper = _BinMapper(max_bins=max_bins) + mapper = _BinMapper(n_bins=n_bins) X = mapper.fit_transform(X) all_gradients = y.astype(G_H_DTYPE) all_hessians = np.ones(shape=1, dtype=G_H_DTYPE) grower = TreeGrower(X, all_gradients, all_hessians, - max_bins=max_bins, shrinkage=1., + n_bins=n_bins, shrinkage=1., min_samples_leaf=min_samples_leaf, max_leaf_nodes=n_samples) grower.grow() @@ -261,13 +262,13 @@ def test_max_depth(max_depth): # Make sure max_depth parameter works as expected rng = np.random.RandomState(seed=0) - max_bins = 255 + n_bins = 256 n_samples = 1000 # data = linear target, 3 features, 1 irrelevant. X = rng.normal(size=(n_samples, 3)) y = X[:, 0] - X[:, 1] - mapper = _BinMapper(max_bins=max_bins) + mapper = _BinMapper(n_bins=n_bins) X = mapper.fit_transform(X) all_gradients = y.astype(G_H_DTYPE) @@ -307,3 +308,80 @@ def test_init_parameters_validation(): match="min_hessian_to_split=-1 must be positive"): TreeGrower(X_binned, all_gradients, all_hessians, min_hessian_to_split=-1) + + +def test_missing_value_predict_only(): + # Make sure that missing values are supported at predict time even if they + # were not encountered in the training data: the missing values are + # assigned to whichever child has the most samples. + + rng = np.random.RandomState(0) + n_samples = 100 + X_binned = rng.randint(0, 256, size=(n_samples, 1), dtype=np.uint8) + X_binned = np.asfortranarray(X_binned) + + gradients = rng.normal(size=n_samples).astype(G_H_DTYPE) + hessians = np.ones(shape=1, dtype=G_H_DTYPE) + + grower = TreeGrower(X_binned, gradients, hessians, min_samples_leaf=5, + has_missing_values=False) + grower.grow() + + predictor = grower.make_predictor() + + # go from root to a leaf, always following node with the most samples. 
+ # That's the path nans are supposed to take + node = predictor.nodes[0] + while not node['is_leaf']: + left = predictor.nodes[node['left']] + right = predictor.nodes[node['right']] + node = left if left['count'] > right['count'] else right + + prediction_main_path = node['value'] + + # now build X_test with only nans, and make sure all predictions are equal + # to prediction_main_path + all_nans = np.full(shape=(n_samples, 1), fill_value=np.nan) + assert np.all(predictor.predict(all_nans) == prediction_main_path) + + +def test_split_on_nan_with_infinite_values(): + # Make sure the split on nan situations are respected even when there are + # samples with +inf values (we set the threshold to +inf when we have a + # split on nan so this test makes sure this does not introduce edge-case + # bugs). We need to use the private API so that we can also test + # predict_binned(). + + X = np.array([0, 1, np.inf, np.nan, np.nan]).reshape(-1, 1) + # the gradient values will force a split on nan situation + gradients = np.array([0, 0, 0, 100, 100], dtype=G_H_DTYPE) + hessians = np.ones(shape=1, dtype=G_H_DTYPE) + + bin_mapper = _BinMapper() + X_binned = bin_mapper.fit_transform(X) + + n_bins_non_missing = 3 + has_missing_values = True + grower = TreeGrower(X_binned, gradients, hessians, + n_bins_non_missing=n_bins_non_missing, + has_missing_values=has_missing_values, + min_samples_leaf=1) + + grower.grow() + + predictor = grower.make_predictor( + bin_thresholds=bin_mapper.bin_thresholds_ + ) + + # sanity check: this was a split on nan + assert predictor.nodes[0]['threshold'] == np.inf + assert predictor.nodes[0]['bin_threshold'] == n_bins_non_missing - 1 + + # Make sure in particular that the +inf sample is mapped to the left child + # Note that lightgbm "fails" here and will assign the inf sample to the + # right child, even though it's a "split on nan" situation. 
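+    # (Recall that a split on nan stores a numerical threshold of +inf and
+    # the predictor sends samples with a value <= threshold to the left
+    # child, so the +inf sample follows the non-missing values to the left.)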
+ predictions = predictor.predict(X) + predictions_binned = predictor.predict_binned( + X_binned, missing_values_bin_idx=bin_mapper.missing_values_bin_idx_) + assert np.all(predictions == -gradients) + assert np.all(predictions_binned == -gradients) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py index c425a0389a789..1ffb08353b30a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py @@ -12,9 +12,9 @@ _build_histogram_root, _subtract_histograms ) -from sklearn.ensemble._hist_gradient_boosting.types import HISTOGRAM_DTYPE -from sklearn.ensemble._hist_gradient_boosting.types import G_H_DTYPE -from sklearn.ensemble._hist_gradient_boosting.types import X_BINNED_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import HISTOGRAM_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE @pytest.mark.parametrize( diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index 29b5b6b47a04a..b49acc52b6e40 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -7,8 +7,8 @@ import pytest from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES -from sklearn.ensemble._hist_gradient_boosting.types import Y_DTYPE -from sklearn.ensemble._hist_gradient_boosting.types import G_H_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import Y_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE def get_derivatives_helper(loss): diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py index cb7d7a804f29e..7df1e616445fc 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py @@ -7,17 +7,17 @@ from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower from sklearn.ensemble._hist_gradient_boosting.predictor import TreePredictor -from sklearn.ensemble._hist_gradient_boosting.types import ( - G_H_DTYPE, PREDICTOR_RECORD_DTYPE) +from sklearn.ensemble._hist_gradient_boosting.common import ( + G_H_DTYPE, PREDICTOR_RECORD_DTYPE, ALMOST_INF) -@pytest.mark.parametrize('max_bins', [200, 256]) -def test_boston_dataset(max_bins): +@pytest.mark.parametrize('n_bins', [200, 256]) +def test_boston_dataset(n_bins): X, y = load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split( X, y, random_state=42) - mapper = _BinMapper(max_bins=max_bins, random_state=42) + mapper = _BinMapper(n_bins=n_bins, random_state=42) X_train_binned = mapper.fit_transform(X_train) # Init gradients and hessians to that of least squares loss @@ -28,8 +28,8 @@ def test_boston_dataset(max_bins): max_leaf_nodes = 31 grower = TreeGrower(X_train_binned, gradients, hessians, min_samples_leaf=min_samples_leaf, - max_leaf_nodes=max_leaf_nodes, max_bins=max_bins, - actual_n_bins=mapper.actual_n_bins_) + max_leaf_nodes=max_leaf_nodes, n_bins=n_bins, + n_bins_non_missing=mapper.n_bins_non_missing_) grower.grow() predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_) @@ 
-42,12 +42,14 @@ def test_boston_dataset(max_bins): (-np.inf, [0, 1, 1, 1]), (10, [0, 0, 1, 1]), (20, [0, 0, 0, 1]), - (np.inf, [0, 0, 0, 1]), + (ALMOST_INF, [0, 0, 0, 1]), + (np.inf, [0, 0, 0, 0]), ]) def test_infinite_values_and_thresholds(threshold, expected_predictions): # Make sure infinite values and infinite thresholds are handled properly. - # In paticular, if a value is +inf and the threhsold is +inf, the sample - # should go to the right child. + # In particular, if a value is +inf and the threshold is ALMOST_INF the + # sample should go to the right child. If the threshold is inf (split on + # nan), the +inf sample will go to the left child. X = np.array([-np.inf, 10, 20, np.inf]).reshape(-1, 1) nodes = np.zeros(3, dtype=PREDICTOR_RECORD_DTYPE) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index c8afac4fbab2c..a0eb6c6ab61c5 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -1,9 +1,9 @@ import numpy as np import pytest -from sklearn.ensemble._hist_gradient_boosting.types import HISTOGRAM_DTYPE -from sklearn.ensemble._hist_gradient_boosting.types import G_H_DTYPE -from sklearn.ensemble._hist_gradient_boosting.types import X_BINNED_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import HISTOGRAM_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE from sklearn.ensemble._hist_gradient_boosting.splitting import Splitter from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder from sklearn.utils.testing import skip_if_32bit @@ -18,7 +18,7 @@ def test_histogram_split(n_bins): min_samples_leaf = 1 min_gain_to_split = 0. 
X_binned = np.asfortranarray( - rng.randint(0, n_bins, size=(int(1e4), 1)), dtype=X_BINNED_DTYPE) + rng.randint(0, n_bins - 1, size=(int(1e4), 1)), dtype=X_BINNED_DTYPE) binned_feature = X_binned.T[feature_idx] sample_indices = np.arange(binned_feature.shape[0], dtype=np.uint32) ordered_hessians = np.ones_like(binned_feature, dtype=G_H_DTYPE) @@ -26,7 +26,7 @@ def test_histogram_split(n_bins): sum_hessians = all_hessians.sum() hessians_are_constant = False - for true_bin in range(1, n_bins - 1): + for true_bin in range(1, n_bins - 2): for sign in [-1, 1]: ordered_gradients = np.full_like(binned_feature, sign, dtype=G_H_DTYPE) @@ -34,15 +34,20 @@ def test_histogram_split(n_bins): all_gradients = ordered_gradients sum_gradients = all_gradients.sum() - actual_n_bins = np.array([n_bins] * X_binned.shape[1], - dtype=np.uint32) builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant) + n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], + dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], + dtype=np.uint8) + missing_values_bin_idx = n_bins - 1 splitter = Splitter(X_binned, - actual_n_bins, + n_bins_non_missing, + missing_values_bin_idx, + has_missing_values, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, @@ -96,13 +101,16 @@ def test_gradient_and_hessian_sanity(constant_hessian): all_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE) sum_hessians = all_hessians.sum() - actual_n_bins = np.array([n_bins] * X_binned.shape[1], - dtype=np.uint32) builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, constant_hessian) - splitter = Splitter(X_binned, actual_n_bins, - l2_regularization, min_hessian_to_split, - min_samples_leaf, min_gain_to_split, constant_hessian) + n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], + dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + missing_values_bin_idx = n_bins - 1 + splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx, + has_missing_values, l2_regularization, + min_hessian_to_split, min_samples_leaf, + min_gain_to_split, constant_hessian) hists_parent = builder.compute_histograms_brute(sample_indices) si_parent = splitter.find_node_split(n_samples, hists_parent, @@ -192,15 +200,17 @@ def test_split_indices(): sum_hessians = 1 * n_samples hessians_are_constant = True - actual_n_bins = np.array([n_bins] * X_binned.shape[1], - dtype=np.uint32) builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant) - splitter = Splitter(X_binned, actual_n_bins, - l2_regularization, min_hessian_to_split, - min_samples_leaf, min_gain_to_split, - hessians_are_constant) + n_bins_non_missing = np.array([n_bins] * X_binned.shape[1], + dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + missing_values_bin_idx = n_bins - 1 + splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx, + has_missing_values, l2_regularization, + min_hessian_to_split, min_samples_leaf, + min_gain_to_split, hessians_are_constant) assert np.all(sample_indices == splitter.partition) @@ -248,11 +258,151 @@ def test_min_gain_to_split(): sum_hessians = all_hessians.sum() hessians_are_constant = False - actual_n_bins = np.array([n_bins] * X_binned.shape[1], - dtype=np.uint32) builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant) - splitter = Splitter(X_binned, 
actual_n_bins, + n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], + dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + missing_values_bin_idx = n_bins - 1 + splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx, + has_missing_values, l2_regularization, + min_hessian_to_split, min_samples_leaf, + min_gain_to_split, hessians_are_constant) + + histograms = builder.compute_histograms_brute(sample_indices) + split_info = splitter.find_node_split(n_samples, histograms, + sum_gradients, sum_hessians) + assert split_info.gain == -1 + + +@pytest.mark.parametrize( + 'X_binned, all_gradients, has_missing_values, n_bins_non_missing, ' + ' expected_split_on_nan, expected_bin_idx, expected_go_to_left', [ + + # basic sanity check with no missing values: given the gradient + # values, the split must occur on bin_idx=3 + ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], # X_binned + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], # gradients + False, # no missing values + 10, # n_bins_non_missing + False, # don't split on nans + 3, # expected_bin_idx + 'not_applicable'), + + # We replace 2 samples by NaNs (bin_idx=8) + # These 2 samples were mapped to the left node before, so they should + # be mapped to left node again + # Notice how the bin_idx threshold changes from 3 to 1. + ([8, 0, 1, 8, 2, 3, 4, 5, 6, 7], # 8 <=> missing + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], + True, # missing values + 8, # n_bins_non_missing + False, # don't split on nans + 1, # cut on bin_idx=1 + True), # missing values go to left + + # same as above, but with non-consecutive missing_values_bin + ([9, 0, 1, 9, 2, 3, 4, 5, 6, 7], # 9 <=> missing + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], + True, # missing values + 8, # n_bins_non_missing + False, # don't split on nans + 1, # cut on bin_idx=1 + True), # missing values go to left + + # this time replacing 2 samples that were on the right. + ([0, 1, 2, 3, 8, 4, 8, 5, 6, 7], # 8 <=> missing + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], + True, # missing values + 8, # n_bins_non_missing + False, # don't split on nans + 3, # cut on bin_idx=3 (like in first case) + False), # missing values go to right + + # same as above, but with non-consecutive missing_values_bin + ([0, 1, 2, 3, 9, 4, 9, 5, 6, 7], # 9 <=> missing + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], + True, # missing values + 8, # n_bins_non_missing + False, # don't split on nans + 3, # cut on bin_idx=3 (like in first case) + False), # missing values go to right + + # For the following cases, split_on_nans is True (we replace all of + # the samples with nans, instead of just 2). 
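+        # (A "split on nan" means the best bin is the last non-missing bin,
+        # i.e. bin_idx == n_bins_non_missing - 1: all non-missing values go
+        # to the left child and the nans go to the right child. E.g. with
+        # n_bins_non_missing=4 below, the cut at bin_idx=3 sends bins 0-3 to
+        # the left and the missing-values bin to the right.)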
+        ([0, 1, 2, 3, 4, 4, 4, 4, 4, 4],  # 4 <=> missing
+         [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
+         True,  # missing values
+         4,  # n_bins_non_missing
+         True,  # split on nans
+         3,  # cut on bin_idx=3
+         False),  # missing values go to right
+
+        # same as above, but with non-consecutive missing_values_bin
+        ([0, 1, 2, 3, 9, 9, 9, 9, 9, 9],  # 9 <=> missing
+         [1, 1, 1, 1, 1, 1, 5, 5, 5, 5],
+         True,  # missing values
+         4,  # n_bins_non_missing
+         True,  # split on nans
+         3,  # cut on bin_idx=3
+         False),  # missing values go to right
+
+        ([6, 6, 6, 6, 0, 1, 2, 3, 4, 5],  # 6 <=> missing
+         [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
+         True,  # missing values
+         6,  # n_bins_non_missing
+         True,  # split on nans
+         5,  # cut on bin_idx=5
+         False),  # missing values go to right
+
+        # same as above, but with non-consecutive missing_values_bin
+        ([9, 9, 9, 9, 0, 1, 2, 3, 4, 5],  # 9 <=> missing
+         [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
+         True,  # missing values
+         6,  # n_bins_non_missing
+         True,  # split on nans
+         5,  # cut on bin_idx=5
+         False),  # missing values go to right
+    ]
+)
+def test_splitting_missing_values(X_binned, all_gradients,
+                                  has_missing_values, n_bins_non_missing,
+                                  expected_split_on_nan, expected_bin_idx,
+                                  expected_go_to_left):
+    # Make sure missing values are properly supported.
+    # We build an artificial example with gradients such that the best split
+    # is on bin_idx=3, when there are no missing values.
+    # Then we introduce missing values and:
+    #   - make sure the chosen bin is correct (find_best_bin()): it's
+    #     still the same split, even though the index of the bin may change
+    #   - make sure the missing values are mapped to the correct child
+    #     (split_indices())
+
+    n_bins = max(X_binned) + 1
+    n_samples = len(X_binned)
+    l2_regularization = 0.
+    min_hessian_to_split = 1e-3
+    min_samples_leaf = 1
+    min_gain_to_split = 0.
+
+    sample_indices = np.arange(n_samples, dtype=np.uint32)
+    X_binned = np.array(X_binned, dtype=X_BINNED_DTYPE).reshape(-1, 1)
+    X_binned = np.asfortranarray(X_binned)
+    all_gradients = np.array(all_gradients, dtype=G_H_DTYPE)
+    has_missing_values = np.array([has_missing_values], dtype=np.uint8)
+    all_hessians = np.ones(1, dtype=G_H_DTYPE)
+    sum_gradients = all_gradients.sum()
+    sum_hessians = 1 * n_samples
+    hessians_are_constant = True
+
+    builder = HistogramBuilder(X_binned, n_bins,
+                               all_gradients, all_hessians,
+                               hessians_are_constant)
+
+    n_bins_non_missing = np.array([n_bins_non_missing], dtype=np.uint32)
+    missing_values_bin_idx = n_bins - 1
+    splitter = Splitter(X_binned, n_bins_non_missing,
+                        missing_values_bin_idx, has_missing_values,
                         l2_regularization, min_hessian_to_split,
                         min_samples_leaf, min_gain_to_split,
                         hessians_are_constant)
@@ -260,4 +410,31 @@ def test_min_gain_to_split():
     histograms = builder.compute_histograms_brute(sample_indices)
     split_info = splitter.find_node_split(n_samples, histograms,
                                           sum_gradients, sum_hessians)
-    assert split_info.gain == -1
+
+    assert split_info.bin_idx == expected_bin_idx
+    if has_missing_values:
+        assert split_info.missing_go_to_left == expected_go_to_left
+
+    split_on_nan = split_info.bin_idx == n_bins_non_missing[0] - 1
+    assert split_on_nan == expected_split_on_nan
+
+    # Make sure the split is properly computed.
+    # This also makes sure missing values are properly assigned to the
+    # correct child in split_indices()
+    samples_left, samples_right, _ = splitter.split_indices(
+        split_info, splitter.partition)
+
+    if not expected_split_on_nan:
+        # When we don't split on nans, the split should always be the same.
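+            # Samples 0 to 3 carry the small gradients, so they must end up
+            # in the left child no matter where the missing values were
+            # inserted in X_binned.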
+ assert set(samples_left) == set([0, 1, 2, 3]) + assert set(samples_right) == set([4, 5, 6, 7, 8, 9]) + else: + # When we split on nans, samples with missing values are always mapped + # to the right child. + missing_samples_indices = np.flatnonzero( + np.array(X_binned) == missing_values_bin_idx) + non_missing_samples_indices = np.flatnonzero( + np.array(X_binned) != missing_values_bin_idx) + + assert set(samples_right) == set(missing_samples_indices) + assert set(samples_left) == set(non_missing_samples_indices) diff --git a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx index fa9556ef9efb5..291c015fec5d3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx @@ -9,8 +9,8 @@ from cython.parallel import prange from ...base import is_classifier from .binning import _BinMapper -from .types cimport G_H_DTYPE_C -from .types cimport Y_DTYPE_C +from .common cimport G_H_DTYPE_C +from .common cimport Y_DTYPE_C def get_equivalent_estimator(estimator, lib='lightgbm'): diff --git a/sklearn/ensemble/setup.py b/sklearn/ensemble/setup.py index 88e1b2e32d98d..4430cb129efcf 100644 --- a/sklearn/ensemble/setup.py +++ b/sklearn/ensemble/setup.py @@ -37,8 +37,8 @@ def configuration(parent_package="", top_path=None): sources=["_hist_gradient_boosting/_loss.pyx"], include_dirs=[numpy.get_include()]) - config.add_extension("_hist_gradient_boosting.types", - sources=["_hist_gradient_boosting/types.pyx"], + config.add_extension("_hist_gradient_boosting.common", + sources=["_hist_gradient_boosting/common.pyx"], include_dirs=[numpy.get_include()]) config.add_extension("_hist_gradient_boosting.utils",