diff --git a/benchmarks/bench_hist_gradient_boosting.py b/benchmarks/bench_hist_gradient_boosting.py index 8d055b22c2252..5f070bd45708d 100644 --- a/benchmarks/bench_hist_gradient_boosting.py +++ b/benchmarks/bench_hist_gradient_boosting.py @@ -2,6 +2,7 @@ import argparse import matplotlib.pyplot as plt +import numpy as np from sklearn.model_selection import train_test_split # To use this experimental feature, we need to explicitly ask for it: from sklearn.experimental import enable_hist_gradient_boosting # noqa @@ -25,6 +26,7 @@ parser.add_argument('--learning-rate', type=float, default=.1) parser.add_argument('--problem', type=str, default='classification', choices=['classification', 'regression']) +parser.add_argument('--missing-fraction', type=float, default=0) parser.add_argument('--n-classes', type=int, default=2) parser.add_argument('--n-samples-max', type=int, default=int(1e6)) parser.add_argument('--n-features', type=int, default=20) @@ -52,6 +54,11 @@ def get_estimator_and_data(): X, y, Estimator = get_estimator_and_data() +if args.missing_fraction: + mask = np.random.binomial(1, args.missing_fraction, size=X.shape).astype( + np.bool) + X[mask] = np.nan + X_train_, X_test_, y_train_, y_test_ = train_test_split( X, y, test_size=0.5, random_state=0) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 084a2aca22597..fde8f40db6c8c 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -864,7 +864,7 @@ Usage Most of the parameters are unchanged from :class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor`. One exception is the ``max_iter`` parameter that replaces ``n_estimators``, and -controls the number of iterations of the boosting process: +controls the number of iterations of the boosting process:: >>> from sklearn.experimental import enable_hist_gradient_boosting >>> from sklearn.ensemble import HistGradientBoostingClassifier @@ -873,10 +873,10 @@ controls the number of iterations of the boosting process: >>> X, y = make_hastie_10_2(random_state=0) >>> X_train, X_test = X[:2000], X[2000:] >>> y_train, y_test = y[:2000], y[2000:] - >>> clf = HistGradientBoostingClassifier(max_iter=100).fit(X_train, y_train) + >>> clf = HistGradientBoostingClassifier(max_iter=100).fit(X_train, y_train) >>> clf.score(X_test, y_test) - 0.8998 + 0.8965 The size of the trees can be controlled through the ``max_leaf_nodes``, ``max_depth``, and ``min_samples_leaf`` parameters. @@ -895,6 +895,45 @@ using an arbitrary :term:`scorer`, or just the training or validation loss. By default, early-stopping is performed using the default :term:`scorer` of the estimator on a validation set. +Missing values support +---------------------- + +:class:`HistGradientBoostingClassifier` and +:class:`HistGradientBoostingRegressor` have built-in support for missing +values (NaNs). + +During training, the tree grower learns at each split point whether samples +with missing values should go to the left or right child, based on the +potential gain. 
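Concretely, each candidate split is evaluated twice: once with the samples
from the missing-values bin sent to the right child (left-to-right scan) and,
if the feature has missing values, once with them sent to the left child
(right-to-left scan); the direction with the higher gain is kept (see
algorithm 3 of the XGBoost paper, https://arxiv.org/abs/1603.02754). A
simplified sketch of that decision for a single feature, using toy per-bin
NumPy arrays and a hypothetical function name, and ignoring the
``min_samples_leaf`` and hessian constraints::

    import numpy as np

    def best_split_for_feature(grad_bins, hess_bins, grad_nan, hess_nan,
                               l2=0.0):
        # grad_bins / hess_bins: per-bin gradient and hessian sums for the
        # non-missing bins; grad_nan / hess_nan: sums for the missing bin.
        def neg_loss(g, h):
            return g * g / (h + l2)

        total_g = grad_bins.sum() + grad_nan
        total_h = hess_bins.sum() + hess_nan
        loss_parent = neg_loss(total_g, total_h)
        best = (-1.0, None, False)  # (gain, bin_idx, missing_go_to_left)
        for missing_go_to_left in (False, True):
            # when missing values are sent to the left child, they are
            # counted in the left statistics from the start of the scan
            g_left = grad_nan if missing_go_to_left else 0.0
            h_left = hess_nan if missing_go_to_left else 0.0
            for bin_idx in range(len(grad_bins) - 1):
                g_left += grad_bins[bin_idx]
                h_left += hess_bins[bin_idx]
                gain = (neg_loss(g_left, h_left)
                        + neg_loss(total_g - g_left, total_h - h_left)
                        - loss_parent)
                if gain > best[0]:
                    best = (gain, bin_idx, missing_go_to_left)
        return best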
When predicting, samples with missing values are assigned to +the left or right child consequently:: + + >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa + >>> from sklearn.ensemble import HistGradientBoostingClassifier + >>> import numpy as np + + >>> X = np.array([0, 1, 2, np.nan]).reshape(-1, 1) + >>> y = [0, 0, 1, 1] + + >>> gbdt = HistGradientBoostingClassifier(min_samples_leaf=1).fit(X, y) + >>> gbdt.predict(X) + array([0, 0, 1, 1]) + +When the missingness pattern is predictive, the splits can be done on +whether the feature value is missing or not:: + + >>> X = np.array([0, np.nan, 1, 2, np.nan]).reshape(-1, 1) + >>> y = [0, 1, 0, 0, 1] + >>> gbdt = HistGradientBoostingClassifier(min_samples_leaf=1, + ... max_depth=2, + ... learning_rate=1, + ... max_iter=1).fit(X, y) + >>> gbdt.predict(X) + array([0, 1, 0, 0, 1]) + +If no missing values were encountered for a given feature during training, +then samples with missing values are mapped to whichever child has the most +samples. + Low-level parallelism --------------------- diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 4ac7afe644e89..779a94c2dd1b0 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -23,10 +23,11 @@ random sampling procedures. - :class:`decomposition.SparseCoder` with `algorithm='lasso_lars'` |Fix| - :class:`decomposition.SparsePCA` where `normalize_components` has no effect due to deprecation. - - :class:`linear_model.Ridge` when `X` is sparse. |Fix| - - :class:`cluster.KMeans` when `n_jobs=1`. |Fix| +- :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor` |Fix|, |Feature|, + |Enhancement|. Details are listed in the changelog below. @@ -112,24 +113,31 @@ Changelog :mod:`sklearn.ensemble` ....................... -- |Feature| :class:`ensemble.HistGradientBoostingClassifier` and - :class:`ensemble.HistGradientBoostingRegressor` have an additional - parameter called `warm_start` that enables warm starting. :pr:`14012` by - :user:`Johann Faouzi `. - -- |Fix| :class:`ensemble.HistGradientBoostingClassifier` and - :class:`ensemble.HistGradientBoostingRegressor` now bin the training and - validation data separately to avoid any data leak. :pr:`13933` by - `Nicolas Hug`_. +- Many improvements were made to + :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor`: + + - |MajorFeature| Estimators now natively support dense data with missing + values both for training and predicting. They also support infinite + values. :pr:`13911` and :pr:`14406` by `Nicolas Hug`_, `Adrin Jalali`_ + and `Olivier Grisel`_. + - |Feature| Estimators now have an additional `warm_start` parameter that + enables warm starting. :pr:`14012` by :user:`Johann Faouzi `. + - |Enhancement| for :class:`ensemble.HistGradientBoostingClassifier` the + training loss or score is now monitored on a class-wise stratified + subsample to preserve the class balance of the original training set. + :pr:`14194` by :user:`Johann Faouzi `. + - |Feature| :func:`inspection.partial_dependence` and + :func:`inspection.plot_partial_dependence` now support the fast 'recursion' + method for both estimators. :pr:`13769` by `Nicolas Hug`_. + - |Fix| Estimators now bin the training and validation data separately to + avoid any data leak. :pr:`13933` by `Nicolas Hug`_. + + Note that pickles from 0.21 will not work in 0.22. - |Fix| :func:`ensemble.VotingClassifier.predict_proba` will no longer be present when `voting='hard'`. 
:pr:`14287` by `Thomas Fan`_. -- |Enhancement| :class:`ensemble.HistGradientBoostingClassifier` the training - loss or score is now monitored on a class-wise stratified subsample to - preserve the class balance of the original training set. :pr:`14194` - by :user:`Johann Faouzi `. - - |Fix| Run by default :func:`utils.estimator_checks.check_estimator` on both :class:`ensemble.VotingClassifier` and :class:`ensemble.VotingRegressor`. It @@ -182,6 +190,12 @@ Changelog measure the importance of each feature in an arbitrary trained model with respect to a given scoring function. :issue:`13146` by `Thomas Fan`_. +- |Feature| :func:`inspection.partial_dependence` and + :func:`inspection.plot_partial_dependence` now support the fast 'recursion' + method for :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor`. :pr:`13769` by + `Nicolas Hug`_. + :mod:`sklearn.linear_model` ........................... diff --git a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx index 80eb625cdb676..1ecee3c9ee27e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx @@ -12,11 +12,14 @@ import numpy as np cimport numpy as np from numpy.math cimport INFINITY from cython.parallel import prange +from libc.math cimport isnan -from .types cimport X_DTYPE_C, X_BINNED_DTYPE_C +from .common cimport X_DTYPE_C, X_BINNED_DTYPE_C -cpdef _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds, - X_BINNED_DTYPE_C [::1, :] binned): +def _map_to_bins(const X_DTYPE_C [:, :] data, + list binning_thresholds, + const unsigned char missing_values_bin_idx, + X_BINNED_DTYPE_C [::1, :] binned): """Bin numerical values to discrete integer-coded levels. Parameters @@ -35,11 +38,13 @@ cpdef _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds, for feature_idx in range(data.shape[1]): _map_num_col_to_bins(data[:, feature_idx], binning_thresholds[feature_idx], + missing_values_bin_idx, binned[:, feature_idx]) cdef void _map_num_col_to_bins(const X_DTYPE_C [:] data, const X_DTYPE_C [:] binning_thresholds, + const unsigned char missing_values_bin_idx, X_BINNED_DTYPE_C [:] binned): """Binary search to find the bin index for each value in the data.""" cdef: @@ -49,11 +54,11 @@ cdef void _map_num_col_to_bins(const X_DTYPE_C [:] data, int middle for i in prange(data.shape[0], schedule='static', nogil=True): - if data[i] == INFINITY: - # Special case for +inf. - # -inf is handled properly by binary search. 
- binned[i] = binning_thresholds.shape[0] + + if isnan(data[i]): + binned[i] = missing_values_bin_idx else: + # for known values, use binary search left, right = 0, binning_thresholds.shape[0] while left < right: middle = (right + left - 1) // 2 diff --git a/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx b/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx index 3603e6b2e2d8e..8d307c3806532 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx @@ -10,8 +10,8 @@ from cython.parallel import prange import numpy as np cimport numpy as np -from .types import Y_DTYPE -from .types cimport Y_DTYPE_C +from .common import Y_DTYPE +from .common cimport Y_DTYPE_C def _update_raw_predictions( diff --git a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx index 91c3e53101ed6..ff17654840005 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx @@ -12,8 +12,8 @@ cimport numpy as np from libc.math cimport exp -from .types cimport Y_DTYPE_C -from .types cimport G_H_DTYPE_C +from .common cimport Y_DTYPE_C +from .common cimport G_H_DTYPE_C def _update_gradients_least_squares( diff --git a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx index 21f9038210722..b3234cb5ba945 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx @@ -7,15 +7,16 @@ cimport cython from cython.parallel import prange +from libc.math cimport isnan import numpy as np cimport numpy as np from numpy.math cimport INFINITY -from .types cimport X_DTYPE_C -from .types cimport Y_DTYPE_C -from .types import Y_DTYPE -from .types cimport X_BINNED_DTYPE_C -from .types cimport node_struct +from .common cimport X_DTYPE_C +from .common cimport Y_DTYPE_C +from .common import Y_DTYPE +from .common cimport X_BINNED_DTYPE_C +from .common cimport node_struct def _predict_from_numeric_data( @@ -43,10 +44,12 @@ cdef inline Y_DTYPE_C _predict_one_from_numeric_data( while True: if node.is_leaf: return node.value - if numeric_data[row, node.feature_idx] == INFINITY: - # if data is +inf we always go to the right child, even when the - # threhsold is +inf - node = nodes[node.right] + + if isnan(numeric_data[row, node.feature_idx]): + if node.missing_go_to_left: + node = nodes[node.left] + else: + node = nodes[node.right] else: if numeric_data[row, node.feature_idx] <= node.threshold: node = nodes[node.left] @@ -57,19 +60,22 @@ cdef inline Y_DTYPE_C _predict_one_from_numeric_data( def _predict_from_binned_data( node_struct [:] nodes, const X_BINNED_DTYPE_C [:, :] binned_data, + const unsigned char missing_values_bin_idx, Y_DTYPE_C [:] out): cdef: int i for i in prange(binned_data.shape[0], schedule='static', nogil=True): - out[i] = _predict_one_from_binned_data(nodes, binned_data, i) + out[i] = _predict_one_from_binned_data(nodes, binned_data, i, + missing_values_bin_idx) cdef inline Y_DTYPE_C _predict_one_from_binned_data( node_struct [:] nodes, const X_BINNED_DTYPE_C [:, :] binned_data, - const int row) nogil: + const int row, + const unsigned char missing_values_bin_idx) nogil: # Need to pass the whole array and the row index, else prange won't work. 
# See issue Cython #2798 @@ -79,10 +85,16 @@ cdef inline Y_DTYPE_C _predict_one_from_binned_data( while True: if node.is_leaf: return node.value - if binned_data[row, node.feature_idx] <= node.bin_threshold: - node = nodes[node.left] + if binned_data[row, node.feature_idx] == missing_values_bin_idx: + if node.missing_go_to_left: + node = nodes[node.left] + else: + node = nodes[node.right] else: - node = nodes[node.right] + if binned_data[row, node.feature_idx] <= node.bin_threshold: + node = nodes[node.left] + else: + node = nodes[node.right] def _compute_partial_dependence( node_struct [:] nodes, diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index b35b2a2083b03..a6c779ca0a97b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -13,20 +13,23 @@ from ...base import BaseEstimator, TransformerMixin from ...utils.validation import check_is_fitted from ._binning import _map_to_bins -from .types import X_DTYPE, X_BINNED_DTYPE +from .common import X_DTYPE, X_BINNED_DTYPE, ALMOST_INF def _find_binning_thresholds(data, max_bins, subsample, random_state): """Extract feature-wise quantiles from numerical data. + Missing values are ignored for finding the thresholds. + Parameters ---------- data : array-like, shape (n_samples, n_features) The data to bin. - max_bins : int - The maximum number of bins to use. If for a given feature the number of - unique values is less than ``max_bins``, then those unique values - will be used to compute the bin thresholds, instead of the quantiles. + max_bins: int + The maximum number of bins to use for non-missing values. If for a + given feature the number of unique values is less than ``max_bins``, + then those unique values will be used to compute the bin thresholds, + instead of the quantiles. subsample : int or None If ``n_samples > subsample``, then ``sub_samples`` samples will be randomly choosen to compute the quantiles. If ``None``, the whole data @@ -42,19 +45,19 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): be used to separate the bins. Thus ``len(binning_thresholds) == n_features``. """ - if not (2 <= max_bins <= 256): - raise ValueError('max_bins={} should be no smaller than 2 ' - 'and no larger than 256.'.format(max_bins)) rng = check_random_state(random_state) if subsample is not None and data.shape[0] > subsample: subset = rng.choice(np.arange(data.shape[0]), subsample, replace=False) data = data.take(subset, axis=0) - percentiles = np.linspace(0, 100, num=max_bins + 1) - percentiles = percentiles[1:-1] binning_thresholds = [] for f_idx in range(data.shape[1]): - col_data = np.ascontiguousarray(data[:, f_idx], dtype=X_DTYPE) + col_data = data[:, f_idx] + # ignore missing values when computing bin thresholds + missing_mask = np.isnan(col_data) + if missing_mask.any(): + col_data = col_data[~missing_mask] + col_data = np.ascontiguousarray(col_data, dtype=X_DTYPE) distinct_values = np.unique(col_data) if len(distinct_values) <= max_bins: midpoints = distinct_values[:-1] + distinct_values[1:] @@ -65,9 +68,18 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): # np.unique(col_data, return_counts) instead but this is more # work and the performance benefit will be limited because we # work on a fixed-size subsample of the full data. 
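            # Editor's illustration (not part of the patch; toy input): the
            # NaNs are dropped above before the thresholds are computed, and
            # they only receive a bin at transform() time, namely the
            # reserved last bin (missing_values_bin_idx_ == n_bins - 1).
            # Assuming the private _BinMapper API introduced in this diff:
            #
            #     >>> import numpy as np
            #     >>> from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
            #     >>> X = np.array([[0.], [1.], [2.], [np.nan]])
            #     >>> _BinMapper(n_bins=4).fit_transform(X)
            #     array([[0],
            #            [1],
            #            [2],
            #            [3]], dtype=uint8)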
+ percentiles = np.linspace(0, 100, num=max_bins + 1) + percentiles = percentiles[1:-1] midpoints = np.percentile(col_data, percentiles, interpolation='midpoint').astype(X_DTYPE) + assert midpoints.shape[0] == max_bins - 1 + + # We avoid having +inf thresholds: +inf thresholds are only allowed in + # a "split on nan" situation. + np.clip(midpoints, a_min=None, a_max=ALMOST_INF, out=midpoints) + binning_thresholds.append(midpoints) + return binning_thresholds @@ -80,16 +92,19 @@ class _BinMapper(BaseEstimator, TransformerMixin): For large datasets, quantiles are computed on a subset of the data to speed-up the binning, but the quantiles should remain stable. - If the number of unique values for a given feature is less than - ``max_bins``, then the unique values of this feature are used instead of - the quantiles. + Features with a small number of values may be binned into less than + ``n_bins`` bins. The last bin (at index ``n_bins - 1``) is always reserved + for missing values. Parameters ---------- - max_bins : int, optional (default=256) - The maximum number of bins to use. If for a given feature the number of - unique values is less than ``max_bins``, then those unique values - will be used to compute the bin thresholds, instead of the quantiles. + n_bins : int, optional (default=256) + The maximum number of bins to use (including the bin for missing + values). Non-missing values are binned on ``max_bins = n_bins - 1`` + bins. The last bin is always reserved for missing values. If for a + given feature the number of unique values is less than ``max_bins``, + then those unique values will be used to compute the bin thresholds, + instead of the quantiles. subsample : int or None, optional (default=2e5) If ``n_samples > subsample``, then ``sub_samples`` samples will be randomly choosen to compute the quantiles. If ``None``, the whole data @@ -98,15 +113,35 @@ class _BinMapper(BaseEstimator, TransformerMixin): optional (default=None) Pseudo-random number generator to control the random sub-sampling. See :term:`random_state`. + + Attributes + ---------- + bin_thresholds_ : list of arrays + For each feature, gives the real-valued bin threhsolds. There are + ``max_bins - 1`` thresholds, where ``max_bins = n_bins - 1`` is the + number of bins used for non-missing values. + n_bins_non_missing_ : array of uint32 + For each feature, gives the number of bins actually used for + non-missing values. For features with a lot of unique values, this is + equal to ``n_bins - 1``. + missing_values_bin_idx_ : uint8 + The index of the bin where missing values are mapped. This is a + constant accross all features. This corresponds to the last bin, and + it is always equal to ``n_bins - 1``. Note that if ``n_bins_missing_`` + is less than ``n_bins - 1`` for a given feature, then there are + empty (and unused) bins. """ - def __init__(self, max_bins=256, subsample=int(2e5), random_state=None): - self.max_bins = max_bins + def __init__(self, n_bins=256, subsample=int(2e5), random_state=None): + self.n_bins = n_bins self.subsample = subsample self.random_state = random_state def fit(self, X, y=None): """Fit data X by computing the binning thresholds. + The last bin is reserved for missing values, whether missing values + are present in the data or not. 
+ Parameters ---------- X : array-like, shape (n_samples, n_features) @@ -118,20 +153,30 @@ def fit(self, X, y=None): ------- self : object """ + if not (3 <= self.n_bins <= 256): + # min is 3: at least 2 distinct bins and a missing values bin + raise ValueError('n_bins={} should be no smaller than 3 ' + 'and no larger than 256.'.format(self.n_bins)) + X = check_array(X, dtype=[X_DTYPE], force_all_finite=False) + max_bins = self.n_bins - 1 self.bin_thresholds_ = _find_binning_thresholds( - X, self.max_bins, subsample=self.subsample, + X, max_bins, subsample=self.subsample, random_state=self.random_state) - self.actual_n_bins_ = np.array( + self.n_bins_non_missing_ = np.array( [thresholds.shape[0] + 1 for thresholds in self.bin_thresholds_], dtype=np.uint32) + self.missing_values_bin_idx_ = self.n_bins - 1 + return self def transform(self, X): """Bin data X. + Missing values will be mapped to the last bin. + Parameters ---------- X : array-like, shape (n_samples, n_features) @@ -144,12 +189,13 @@ def transform(self, X): """ X = check_array(X, dtype=[X_DTYPE], force_all_finite=False) check_is_fitted(self) - if X.shape[1] != self.actual_n_bins_.shape[0]: + if X.shape[1] != self.n_bins_non_missing_.shape[0]: raise ValueError( 'This estimator was fitted with {} features but {} got passed ' - 'to transform()'.format(self.actual_n_bins_.shape[0], + 'to transform()'.format(self.n_bins_non_missing_.shape[0], X.shape[1]) ) binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order='F') - _map_to_bins(X, self.bin_thresholds_, binned) + _map_to_bins(X, self.bin_thresholds_, self.missing_values_bin_idx_, + binned) return binned diff --git a/sklearn/ensemble/_hist_gradient_boosting/types.pxd b/sklearn/ensemble/_hist_gradient_boosting/common.pxd similarity index 96% rename from sklearn/ensemble/_hist_gradient_boosting/types.pxd rename to sklearn/ensemble/_hist_gradient_boosting/common.pxd index f72741006a508..fa78f2024aa5c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/types.pxd +++ b/sklearn/ensemble/_hist_gradient_boosting/common.pxd @@ -23,6 +23,7 @@ cdef packed struct node_struct: unsigned int count unsigned int feature_idx X_DTYPE_C threshold + unsigned char missing_go_to_left unsigned int left unsigned int right Y_DTYPE_C gain diff --git a/sklearn/ensemble/_hist_gradient_boosting/types.pyx b/sklearn/ensemble/_hist_gradient_boosting/common.pyx similarity index 87% rename from sklearn/ensemble/_hist_gradient_boosting/types.pyx rename to sklearn/ensemble/_hist_gradient_boosting/common.pyx index 67820337e72bc..8604548e44163 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/types.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/common.pyx @@ -6,7 +6,7 @@ import numpy as np Y_DTYPE = np.float64 X_DTYPE = np.float64 X_BINNED_DTYPE = np.uint8 # hence max_bins == 256 -# dtypes for gradients and hessians arrays +# dtype for gradients and hessians arrays G_H_DTYPE = np.float32 HISTOGRAM_DTYPE = np.dtype([ @@ -20,6 +20,7 @@ PREDICTOR_RECORD_DTYPE = np.dtype([ ('count', np.uint32), ('feature_idx', np.uint32), ('threshold', X_DTYPE), + ('missing_go_to_left', np.uint8), ('left', np.uint32), ('right', np.uint32), ('gain', Y_DTYPE), @@ -27,3 +28,5 @@ PREDICTOR_RECORD_DTYPE = np.dtype([ ('is_leaf', np.uint8), ('bin_threshold', X_BINNED_DTYPE), ]) + +ALMOST_INF = 1e300 # see LightGBM AvoidInf() diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 4c40f662d0656..ad6a5a8ca381b 100644 --- 
a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -2,6 +2,7 @@ # Author: Nicolas Hug from abc import ABC, abstractmethod +from functools import partial import numpy as np from timeit import default_timer as time @@ -14,7 +15,7 @@ from ...model_selection import train_test_split from ...preprocessing import LabelEncoder from ._gradient_boosting import _update_raw_predictions -from .types import Y_DTYPE, X_DTYPE, X_BINNED_DTYPE +from .common import Y_DTYPE, X_DTYPE, X_BINNED_DTYPE from .binning import _BinMapper from .grower import TreeGrower @@ -75,6 +76,10 @@ def _validate_parameters(self): raise ValueError('tol={} ' 'must not be smaller than 0.'.format(self.tol)) + if not (2 <= self.max_bins <= 255): + raise ValueError('max_bins={} should be no smaller than 2 ' + 'and no larger than 255.'.format(self.max_bins)) + def fit(self, X, y): """Fit the gradient boosting model. @@ -143,8 +148,18 @@ def fit(self, X, y): X_train, y_train = X, y X_val, y_val = None, None + has_missing_values = np.isnan(X_train).any(axis=0).astype(np.uint8) + # Bin the data - self.bin_mapper_ = _BinMapper(max_bins=self.max_bins, random_state=rng) + # For ease of use of the API, the user-facing GBDT classes accept the + # parameter max_bins, which doesn't take into account the bin for + # missing values (which is always allocated). However, since max_bins + # isn't the true maximal number of bins, all other private classes + # (binmapper, histbuilder...) accept n_bins instead, which is the + # actual total number of bins. Everywhere in the code, the + # convention is that n_bins == max_bins + 1 + n_bins = self.max_bins + 1 # + 1 for missing values + self.bin_mapper_ = _BinMapper(n_bins=n_bins, random_state=rng) X_binned_train = self._bin_data(X_train, rng, is_training_data=True) if X_val is not None: X_binned_val = self._bin_data(X_val, rng, is_training_data=False) @@ -293,8 +308,9 @@ def fit(self, X, y): grower = TreeGrower( X_binned_train, gradients[k, :], hessians[k, :], - max_bins=self.max_bins, - actual_n_bins=self.bin_mapper_.actual_n_bins_, + n_bins=n_bins, + n_bins_non_missing=self.bin_mapper_.n_bins_non_missing_, + has_missing_values=has_missing_values, max_leaf_nodes=self.max_leaf_nodes, max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf, @@ -325,7 +341,11 @@ def fit(self, X, y): if self._use_validation_data: for k, pred in enumerate(self._predictors[-1]): raw_predictions_val[k, :] += ( - pred.predict_binned(X_binned_val)) + pred.predict_binned( + X_binned_val, + self.bin_mapper_.missing_values_bin_idx_ + ) + ) should_early_stop = self._check_early_stopping_loss( raw_predictions, y_train, @@ -556,8 +576,13 @@ def _raw_predict(self, X): raw_predictions += self._baseline_prediction for predictors_of_ith_iteration in self._predictors: for k, predictor in enumerate(predictors_of_ith_iteration): - predict = (predictor.predict_binned if is_binned - else predictor.predict) + if is_binned: + predict = partial( + predictor.predict_binned, + missing_values_bin_idx=self.bin_mapper_.missing_values_bin_idx_ # noqa + ) + else: + predict = predictor.predict raw_predictions[k, :] += predict(X) return raw_predictions @@ -593,6 +618,9 @@ def _compute_partial_dependence_recursion(self, grid, target_features): return averaged_predictions + def _more_tags(self): + return {'allow_nan': True} + @abstractmethod def _get_loss(self): pass @@ -606,13 +634,6 @@ def n_iter_(self): check_is_fitted(self) return len(self._predictors) - def 
_more_tags(self): - # This is not strictly True, but it's needed since - # force_all_finite=False means accept both nans and infinite values. - # Without the tag, common checks would fail. - # This comment must be removed once we merge PR 13911 - return {'allow_nan': True} - class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): """Histogram-based Gradient Boosting Regression Tree. @@ -621,6 +642,14 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): :class:`GradientBoostingRegressor` for big datasets (n_samples >= 10 000). + This estimator has native support for missing values (NaNs). During + training, the tree grower learns at each split point whether samples + with missing values should go to the left or right child, based on the + potential gain. When predicting, samples with missing values are + assigned to the left or right child consequently. If no missing values + were encountered for a given feature during training, then samples with + missing values are mapped to whichever child has the most samples. + This implementation is inspired by `LightGBM `_. @@ -664,12 +693,13 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): l2_regularization : float, optional (default=0) The L2 regularization parameter. Use ``0`` for no regularization (default). - max_bins : int, optional (default=256) - The maximum number of bins to use. Before training, each feature of - the input array ``X`` is binned into at most ``max_bins`` bins, which - allows for a much faster training stage. Features with a small - number of unique values may use less than ``max_bins`` bins. Must be no - larger than 256. + max_bins : int, optional (default=255) + The maximum number of bins to use for non-missing values. Before + training, each feature of the input array `X` is binned into + integer-valued bins, which allows for a much faster training stage. + Features with a small number of unique values may use less than + ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin + is always reserved for missing values. Must be no larger than 255. warm_start : bool, optional (default=False) When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble. For results to be valid, the @@ -740,7 +770,7 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): def __init__(self, loss='least_squares', learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, - min_samples_leaf=20, l2_regularization=0., max_bins=256, + min_samples_leaf=20, l2_regularization=0., max_bins=255, warm_start=False, scoring=None, validation_fraction=0.1, n_iter_no_change=None, tol=1e-7, verbose=0, random_state=None): @@ -789,6 +819,14 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, :class:`GradientBoostingClassifier` for big datasets (n_samples >= 10 000). + This estimator has native support for missing values (NaNs). During + training, the tree grower learns at each split point whether samples + with missing values should go to the left or right child, based on the + potential gain. When predicting, samples with missing values are + assigned to the left or right child consequently. If no missing values + were encountered for a given feature during training, then samples with + missing values are mapped to whichever child has the most samples. + This implementation is inspired by `LightGBM `_. 
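For reference, a minimal usage sketch (mirroring the example added to the
user guide in this same PR): NaNs can be passed directly to ``fit`` and
``predict``, with no imputation step::

    >>> import numpy as np
    >>> from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    >>> from sklearn.ensemble import HistGradientBoostingClassifier

    >>> X = np.array([0, 1, 2, np.nan]).reshape(-1, 1)
    >>> y = [0, 0, 1, 1]
    >>> gbdt = HistGradientBoostingClassifier(min_samples_leaf=1).fit(X, y)
    >>> gbdt.predict(X)
    array([0, 0, 1, 1])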
@@ -835,12 +873,13 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, since only very shallow trees would be built. l2_regularization : float, optional (default=0) The L2 regularization parameter. Use 0 for no regularization. - max_bins : int, optional (default=256) - The maximum number of bins to use. Before training, each feature of - the input array ``X`` is binned into at most ``max_bins`` bins, which - allows for a much faster training stage. Features with a small - number of unique values may use less than ``max_bins`` bins. Must be no - larger than 256. + max_bins : int, optional (default=255) + The maximum number of bins to use for non-missing values. Before + training, each feature of the input array `X` is binned into + integer-valued bins, which allows for a much faster training stage. + Features with a small number of unique values may use less than + ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin + is always reserved for missing values. Must be no larger than 255. warm_start : bool, optional (default=False) When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble. For results to be valid, the @@ -913,7 +952,7 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, def __init__(self, loss='auto', learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, - l2_regularization=0., max_bins=256, warm_start=False, + l2_regularization=0., max_bins=255, warm_start=False, scoring=None, validation_fraction=0.1, n_iter_no_change=None, tol=1e-7, verbose=0, random_state=None): super(HistGradientBoostingClassifier, self).__init__( diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 7eec680082e97..c7d303b8f6201 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -15,8 +15,8 @@ from .histogram import HistogramBuilder from .predictor import TreePredictor from .utils import sum_parallel -from .types import PREDICTOR_RECORD_DTYPE -from .types import Y_DTYPE +from .common import PREDICTOR_RECORD_DTYPE +from .common import Y_DTYPE EPS = np.finfo(Y_DTYPE).eps # to avoid zero division errors @@ -141,14 +141,18 @@ class TreeGrower: min_gain_to_split : float, optional (default=0.) The minimum gain needed to split a node. Splits with lower gain will be ignored. - max_bins : int, optional (default=256) - The maximum number of bins. Used to define the shape of the - histograms. - actual_n_bins : ndarray of int or int, optional (default=None) - The actual number of bins needed for each feature, which is lower or - equal to ``max_bins``. If it's an int, all features are considered to - have the same number of bins. If None, all features are considered to - have ``max_bins`` bins. + n_bins : int, optional (default=256) + The total number of bins, including the bin for missing values. Used + to define the shape of the histograms. + n_bins_non_missing_ : array of uint32 + For each feature, gives the number of bins actually used for + non-missing values. For features with a lot of unique values, this + is equal to ``n_bins - 1``. If it's an int, all features are + considered to have the same number of bins. If None, all features + are considered to have ``n_bins - 1`` bins. + has_missing_values : ndarray of bool or bool, optional (default=False) + Whether each feature contains missing values (in the training data). 
+ If it's a bool, the same value is used for all features. l2_regularization : float, optional (default=0) The L2 regularization parameter. min_hessian_to_split : float, optional (default=1e-3) @@ -161,32 +165,40 @@ class TreeGrower: """ def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, max_depth=None, min_samples_leaf=20, min_gain_to_split=0., - max_bins=256, actual_n_bins=None, l2_regularization=0., - min_hessian_to_split=1e-3, shrinkage=1.): + n_bins=256, n_bins_non_missing=None, has_missing_values=False, + l2_regularization=0., min_hessian_to_split=1e-3, + shrinkage=1.): self._validate_parameters(X_binned, max_leaf_nodes, max_depth, min_samples_leaf, min_gain_to_split, l2_regularization, min_hessian_to_split) - if actual_n_bins is None: - actual_n_bins = max_bins + if n_bins_non_missing is None: + n_bins_non_missing = n_bins - 1 - if isinstance(actual_n_bins, numbers.Integral): - actual_n_bins = np.array( - [actual_n_bins] * X_binned.shape[1], + if isinstance(n_bins_non_missing, numbers.Integral): + n_bins_non_missing = np.array( + [n_bins_non_missing] * X_binned.shape[1], dtype=np.uint32) else: - actual_n_bins = np.asarray(actual_n_bins, dtype=np.uint32) + n_bins_non_missing = np.asarray(n_bins_non_missing, + dtype=np.uint32) + + if isinstance(has_missing_values, bool): + has_missing_values = [has_missing_values] * X_binned.shape[1] + has_missing_values = np.asarray(has_missing_values, dtype=np.uint8) hessians_are_constant = hessians.shape[0] == 1 self.histogram_builder = HistogramBuilder( - X_binned, max_bins, gradients, hessians, hessians_are_constant) + X_binned, n_bins, gradients, hessians, hessians_are_constant) + missing_values_bin_idx = n_bins - 1 self.splitter = Splitter( - X_binned, actual_n_bins, l2_regularization, - min_hessian_to_split, min_samples_leaf, min_gain_to_split, - hessians_are_constant) + X_binned, n_bins_non_missing, missing_values_bin_idx, + has_missing_values, l2_regularization, min_hessian_to_split, + min_samples_leaf, min_gain_to_split, hessians_are_constant) + self.n_bins_non_missing = n_bins_non_missing self.max_leaf_nodes = max_leaf_nodes - self.max_bins = max_bins + self.has_missing_values = has_missing_values self.n_features = X_binned.shape[1] self.max_depth = max_depth self.min_samples_leaf = min_samples_leaf @@ -333,6 +345,13 @@ def split_next(self): right_child_node.partition_start = left_child_node.partition_stop right_child_node.partition_stop = node.partition_stop + if not self.has_missing_values[node.split_info.feature_idx]: + # If no missing values are encountered at fit time, then samples + # with missing values during predict() will go to whichever child + # has the most samples. 
+ node.split_info.missing_go_to_left = ( + left_child_node.n_samples > right_child_node.n_samples) + self.n_nodes += 2 if self.max_depth is not None and depth == self.max_depth: @@ -428,12 +447,13 @@ def make_predictor(self, bin_thresholds=None): """ predictor_nodes = np.zeros(self.n_nodes, dtype=PREDICTOR_RECORD_DTYPE) _fill_predictor_node_array(predictor_nodes, self.root, - bin_thresholds=bin_thresholds) + bin_thresholds, self.n_bins_non_missing) return TreePredictor(predictor_nodes) def _fill_predictor_node_array(predictor_nodes, grower_node, - bin_thresholds, next_free_idx=0): + bin_thresholds, n_bins_non_missing, + next_free_idx=0): """Helper used in make_predictor to set the TreePredictor fields.""" node = predictor_nodes[next_free_idx] node['count'] = grower_node.n_samples @@ -454,17 +474,27 @@ def _fill_predictor_node_array(predictor_nodes, grower_node, feature_idx, bin_idx = split_info.feature_idx, split_info.bin_idx node['feature_idx'] = feature_idx node['bin_threshold'] = bin_idx - if bin_thresholds is not None: - threshold = bin_thresholds[feature_idx][bin_idx] - node['threshold'] = threshold + node['missing_go_to_left'] = split_info.missing_go_to_left + + if split_info.bin_idx == n_bins_non_missing[feature_idx] - 1: + # Split is on the last non-missing bin: it's a "split on nans". All + # nans go to the right, the rest go to the left. + node['threshold'] = np.inf + elif bin_thresholds is not None: + node['threshold'] = bin_thresholds[feature_idx][bin_idx] + next_free_idx += 1 node['left'] = next_free_idx next_free_idx = _fill_predictor_node_array( predictor_nodes, grower_node.left_child, - bin_thresholds=bin_thresholds, next_free_idx=next_free_idx) + bin_thresholds=bin_thresholds, + n_bins_non_missing=n_bins_non_missing, + next_free_idx=next_free_idx) node['right'] = next_free_idx return _fill_predictor_node_array( predictor_nodes, grower_node.right_child, - bin_thresholds=bin_thresholds, next_free_idx=next_free_idx) + bin_thresholds=bin_thresholds, + n_bins_non_missing=n_bins_non_missing, + next_free_idx=next_free_idx) diff --git a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx index c83fa0c79db71..740e5e002cf4e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx @@ -12,10 +12,10 @@ from cython.parallel import prange import numpy as np cimport numpy as np -from .types import HISTOGRAM_DTYPE -from .types cimport hist_struct -from .types cimport X_BINNED_DTYPE_C -from .types cimport G_H_DTYPE_C +from .common import HISTOGRAM_DTYPE +from .common cimport hist_struct +from .common cimport X_BINNED_DTYPE_C +from .common cimport G_H_DTYPE_C # Notes: # - IN views are read-only, OUT views are write-only @@ -62,9 +62,9 @@ cdef class HistogramBuilder: ---------- X_binned : ndarray of int, shape (n_samples, n_features) The binned input samples. Must be Fortran-aligned. - max_bins : int - The maximum number of bins. Used to define the shape of the - histograms. + n_bins : int + The total number of bins, including the bin for missing values. Used + to define the shape of the histograms. gradients : ndarray, shape (n_samples,) The gradients of each training sample. Those are the gradients of the loss w.r.t the predictions, evaluated at iteration i - 1. 
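As a rough reference, what one feature's histogram accumulates can be written
in plain NumPy as follows (the Cython builder below does this in parallel,
and can also derive a histogram by subtracting a sibling's histogram from the
parent's); ``binned_column`` and ``n_bins`` are toy inputs, with the last bin
collecting the samples whose value was missing::

    import numpy as np

    def toy_histogram(binned_column, gradients, hessians, n_bins):
        # per-bin sums over one feature column of binned (uint8) values
        sum_gradients = np.bincount(binned_column, weights=gradients,
                                    minlength=n_bins)
        sum_hessians = np.bincount(binned_column, weights=hessians,
                                   minlength=n_bins)
        count = np.bincount(binned_column, minlength=n_bins)
        return sum_gradients, sum_hessians, count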
@@ -77,7 +77,7 @@ cdef class HistogramBuilder: cdef public: const X_BINNED_DTYPE_C [::1, :] X_binned unsigned int n_features - unsigned int max_bins + unsigned int n_bins G_H_DTYPE_C [::1] gradients G_H_DTYPE_C [::1] hessians G_H_DTYPE_C [::1] ordered_gradients @@ -85,15 +85,15 @@ cdef class HistogramBuilder: unsigned char hessians_are_constant def __init__(self, const X_BINNED_DTYPE_C [::1, :] X_binned, - unsigned int max_bins, G_H_DTYPE_C [::1] gradients, + unsigned int n_bins, G_H_DTYPE_C [::1] gradients, G_H_DTYPE_C [::1] hessians, unsigned char hessians_are_constant): self.X_binned = X_binned self.n_features = X_binned.shape[1] - # Note: all histograms will have bins, but some of the - # last bins may be unused if actual_n_bins[f] < max_bins - self.max_bins = max_bins + # Note: all histograms will have bins, but some of the + # bins may be unused if a feature has a small number of unique values. + self.n_bins = n_bins self.gradients = gradients self.hessians = hessians # for root node, gradients and hessians are already ordered @@ -115,7 +115,7 @@ cdef class HistogramBuilder: Returns ------- - histograms : ndarray of HISTOGRAM_DTYPE, shape (n_features, max_bins) + histograms : ndarray of HISTOGRAM_DTYPE, shape (n_features, n_bins) The computed histograms of the current node. """ cdef: @@ -131,7 +131,7 @@ cdef class HistogramBuilder: G_H_DTYPE_C [::1] ordered_hessians = self.ordered_hessians G_H_DTYPE_C [::1] hessians = self.hessians hist_struct [:, ::1] histograms = np.zeros( - shape=(self.n_features, self.max_bins), + shape=(self.n_features, self.n_bins), dtype=HISTOGRAM_DTYPE ) @@ -210,15 +210,15 @@ cdef class HistogramBuilder: Parameters ---------- parent_histograms : ndarray of HISTOGRAM_DTYPE, \ - shape (n_features, max_bins) + shape (n_features, n_bins) The histograms of the parent. sibling_histograms : ndarray of HISTOGRAM_DTYPE, \ - shape (n_features, max_bins) + shape (n_features, n_bins) The histograms of the sibling. Returns ------- - histograms : ndarray of HISTOGRAM_DTYPE, shape(n_features, max_bins) + histograms : ndarray of HISTOGRAM_DTYPE, shape(n_features, n_bins) The computed histograms of the current node. 
""" @@ -226,14 +226,14 @@ cdef class HistogramBuilder: int feature_idx int n_features = self.n_features hist_struct [:, ::1] histograms = np.zeros( - shape=(self.n_features, self.max_bins), + shape=(self.n_features, self.n_bins), dtype=HISTOGRAM_DTYPE ) for feature_idx in prange(n_features, schedule='static', nogil=True): # Compute histogram of each feature _subtract_histograms(feature_idx, - self.max_bins, + self.n_bins, parent_histograms, sibling_histograms, histograms) diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index 5d7c68ea0b38f..9e00187d62425 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -15,8 +15,8 @@ except ImportError: from scipy.misc import logsumexp -from .types import Y_DTYPE -from .types import G_H_DTYPE +from .common import Y_DTYPE +from .common import G_H_DTYPE from ._loss import _update_gradients_least_squares from ._loss import _update_gradients_hessians_binary_crossentropy from ._loss import _update_gradients_hessians_categorical_crossentropy diff --git a/sklearn/ensemble/_hist_gradient_boosting/predictor.py b/sklearn/ensemble/_hist_gradient_boosting/predictor.py index c80788d049874..0b359c8f98224 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/predictor.py +++ b/sklearn/ensemble/_hist_gradient_boosting/predictor.py @@ -5,7 +5,7 @@ import numpy as np -from .types import Y_DTYPE +from .common import Y_DTYPE from ._predictor import _predict_from_numeric_data from ._predictor import _predict_from_binned_data from ._predictor import _compute_partial_dependence @@ -47,13 +47,17 @@ def predict(self, X): _predict_from_numeric_data(self.nodes, X, out) return out - def predict_binned(self, X): + def predict_binned(self, X, missing_values_bin_idx): """Predict raw values for binned data. Parameters ---------- X : ndarray, shape (n_samples, n_features) The input samples. + missing_values_bin_idx : uint8 + Index of the bin that is used for missing values. This is the + index of the last bin and is always equal to max_bins (as passed + to the GBDT classes), or equivalently to n_bins - 1. Returns ------- @@ -61,7 +65,7 @@ def predict_binned(self, X): The raw predicted values. """ out = np.empty(X.shape[0], dtype=Y_DTYPE) - _predict_from_binned_data(self.nodes, X, out) + _predict_from_binned_data(self.nodes, X, missing_values_bin_idx, out) return out def compute_partial_dependence(self, grid, target_features, out): diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 6dc6e58d9acff..fda060e238514 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -20,10 +20,10 @@ IF SKLEARN_OPENMP_SUPPORTED: from libc.stdlib cimport malloc, free from libc.string cimport memcpy -from .types cimport X_BINNED_DTYPE_C -from .types cimport Y_DTYPE_C -from .types cimport hist_struct -from .types import HISTOGRAM_DTYPE +from .common cimport X_BINNED_DTYPE_C +from .common cimport Y_DTYPE_C +from .common cimport hist_struct +from .common import HISTOGRAM_DTYPE cdef struct split_info_struct: @@ -32,6 +32,7 @@ cdef struct split_info_struct: Y_DTYPE_C gain int feature_idx unsigned int bin_idx + unsigned char missing_go_to_left Y_DTYPE_C sum_gradient_left Y_DTYPE_C sum_gradient_right Y_DTYPE_C sum_hessian_left @@ -51,6 +52,8 @@ class SplitInfo: The index of the feature to be split. 
bin_idx : int The index of the bin on which the split is made. + missing_go_to_left : bool + Whether missing values should go to the left child. sum_gradient_left : float The sum of the gradients of all the samples in the left child. sum_hessian_left : float @@ -64,12 +67,14 @@ class SplitInfo: n_samples_right : int The number of samples in the right child. """ - def __init__(self, gain, feature_idx, bin_idx, sum_gradient_left, - sum_hessian_left, sum_gradient_right, sum_hessian_right, - n_samples_left, n_samples_right): + def __init__(self, gain, feature_idx, bin_idx, + missing_go_to_left, sum_gradient_left, sum_hessian_left, + sum_gradient_right, sum_hessian_right, n_samples_left, + n_samples_right): self.gain = gain self.feature_idx = feature_idx self.bin_idx = bin_idx + self.missing_go_to_left = missing_go_to_left self.sum_gradient_left = sum_gradient_left self.sum_hessian_left = sum_hessian_left self.sum_gradient_right = sum_gradient_right @@ -91,9 +96,16 @@ cdef class Splitter: ---------- X_binned : ndarray of int, shape (n_samples, n_features) The binned input samples. Must be Fortran-aligned. - actual_n_bins : ndarray, shape (n_features,) - The actual number of bins needed for each feature, which is lower or - equal to max_bins. + n_bins_non_missing : ndarray, shape (n_features,) + For each feature, gives the number of bins actually used for + non-missing values. + missing_values_bin_idx : uint8 + Index of the bin that is used for missing values. This is the index of + the last bin and is always equal to max_bins (as passed to the GBDT + classes), or equivalently to n_bins - 1. + has_missing_values : ndarray, shape (n_features,) + Whether missing values were observed in the training data, for each + feature. l2_regularization : float The L2 regularization parameter. 
min_hessian_to_split : float, default=1e-3 @@ -111,7 +123,9 @@ cdef class Splitter: cdef public: const X_BINNED_DTYPE_C [::1, :] X_binned unsigned int n_features - unsigned int [::1] actual_n_bins + const unsigned int [::1] n_bins_non_missing + unsigned char missing_values_bin_idx + const unsigned char [::1] has_missing_values unsigned char hessians_are_constant Y_DTYPE_C l2_regularization Y_DTYPE_C min_hessian_to_split @@ -122,16 +136,22 @@ cdef class Splitter: unsigned int [::1] left_indices_buffer unsigned int [::1] right_indices_buffer - def __init__(self, const X_BINNED_DTYPE_C [::1, :] X_binned, - np.ndarray[np.uint32_t] actual_n_bins, - Y_DTYPE_C l2_regularization, Y_DTYPE_C - min_hessian_to_split=1e-3, unsigned int - min_samples_leaf=20, Y_DTYPE_C min_gain_to_split=0., + def __init__(self, + const X_BINNED_DTYPE_C [::1, :] X_binned, + const unsigned int [::1] n_bins_non_missing, + const unsigned char missing_values_bin_idx, + const unsigned char [::1] has_missing_values, + Y_DTYPE_C l2_regularization, + Y_DTYPE_C min_hessian_to_split=1e-3, + unsigned int min_samples_leaf=20, + Y_DTYPE_C min_gain_to_split=0., unsigned char hessians_are_constant=False): self.X_binned = X_binned self.n_features = X_binned.shape[1] - self.actual_n_bins = actual_n_bins + self.n_bins_non_missing = n_bins_non_missing + self.missing_values_bin_idx = missing_values_bin_idx + self.has_missing_values = has_missing_values self.l2_regularization = l2_regularization self.min_hessian_to_split = min_hessian_to_split self.min_samples_leaf = min_samples_leaf @@ -228,6 +248,8 @@ cdef class Splitter: cdef: int n_samples = sample_indices.shape[0] X_BINNED_DTYPE_C bin_idx = split_info.bin_idx + unsigned char missing_go_to_left = split_info.missing_go_to_left + unsigned char missing_values_bin_idx = self.missing_values_bin_idx int feature_idx = split_info.feature_idx const X_BINNED_DTYPE_C [::1] X_binned = \ self.X_binned[:, feature_idx] @@ -252,6 +274,7 @@ cdef class Splitter: int thread_idx int sample_idx int right_child_position + unsigned char turn_left int [:] left_offset = np.zeros(n_threads, dtype=np.int32) int [:] right_offset = np.zeros(n_threads, dtype=np.int32) @@ -273,7 +296,12 @@ cdef class Splitter: stop = start + sizes[thread_idx] for i in range(start, stop): sample_idx = sample_indices[i] - if X_binned[sample_idx] <= bin_idx: + turn_left = sample_goes_left( + missing_go_to_left, + missing_values_bin_idx, bin_idx, + X_binned[sample_idx]) + + if turn_left: left_indices_buffer[start + left_count] = sample_idx left_count = left_count + 1 else: @@ -350,6 +378,7 @@ cdef class Splitter: int n_features = self.n_features split_info_struct split_info split_info_struct * split_infos + const unsigned char [:] has_missing_values = self.has_missing_values with nogil: @@ -358,10 +387,32 @@ cdef class Splitter: for feature_idx in prange(n_features, schedule='static'): # For each feature, find best bin to split on - split_info = self._find_best_bin_to_split_helper( - feature_idx, histograms, n_samples, - sum_gradients, sum_hessians) - split_infos[feature_idx] = split_info + # Start with a gain of -1 (if no better split is found, that + # means one of the constraints isn't respected + # (min_samples_leaf, etc) and the grower will later turn the + # node into a leaf. + split_infos[feature_idx].gain = -1 + + # We will scan bins from left to right (in all cases), and if + # there are any missing values, we will also scan bins from + # right to left. 
This way, we can consider whichever case + # yields the best gain: either missing values go to the right + # (left to right scan) or to the left (right to left case). + # See algo 3 from the XGBoost paper + # https://arxiv.org/abs/1603.02754 + + self._find_best_bin_to_split_left_to_right( + feature_idx, has_missing_values[feature_idx], + histograms, n_samples, sum_gradients, sum_hessians, + &split_infos[feature_idx]) + + if has_missing_values[feature_idx]: + # We need to explore both directions to check whether + # sending the nans to the left child would lead to a higher + # gain + self._find_best_bin_to_split_right_to_left( + feature_idx, histograms, n_samples, + sum_gradients, sum_hessians, &split_infos[feature_idx]) # then compute best possible split among all features best_feature_idx = self._find_best_feature_to_split_helper( @@ -372,6 +423,7 @@ cdef class Splitter: split_info.gain, split_info.feature_idx, split_info.bin_idx, + split_info.missing_go_to_left, split_info.sum_gradient_left, split_info.sum_hessian_left, split_info.sum_gradient_right, @@ -382,13 +434,13 @@ cdef class Splitter: free(split_infos) return out - cdef int _find_best_feature_to_split_helper( + cdef unsigned int _find_best_feature_to_split_helper( self, split_info_struct * split_infos) nogil: # IN """Returns the best feature among those in splits_infos.""" cdef: - int feature_idx - int best_feature_idx = 0 + unsigned int feature_idx + unsigned int best_feature_idx = 0 for feature_idx in range(1, self.n_features): if (split_infos[feature_idx].gain > @@ -396,43 +448,50 @@ cdef class Splitter: best_feature_idx = feature_idx return best_feature_idx - cdef split_info_struct _find_best_bin_to_split_helper( + cdef void _find_best_bin_to_split_left_to_right( Splitter self, unsigned int feature_idx, + unsigned char has_missing_values, const hist_struct [:, ::1] histograms, # IN unsigned int n_samples, Y_DTYPE_C sum_gradients, - Y_DTYPE_C sum_hessians) nogil: + Y_DTYPE_C sum_hessians, + split_info_struct * split_info) nogil: # OUT """Find best bin to split on for a given feature. Splits that do not satisfy the splitting constraints - (min_gain_to_split, etc.) are discarded here. If no split can - satisfy the constraints, a SplitInfo with a gain of -1 is returned. - If for a given node the best SplitInfo has a gain of -1, it is - finalized into a leaf in the grower. + (min_gain_to_split, etc.) are discarded here. + + We scan node from left to right. This version is called whether there + are missing values or not. If any, missing values are assigned to the + right node. """ cdef: unsigned int bin_idx unsigned int n_samples_left unsigned int n_samples_right unsigned int n_samples_ = n_samples + # We set the 'end' variable such that the last non-missing-values + # bin never goes to the left child (which would result in and + # empty right child), unless there are missing values, since these + # would go to the right child. + unsigned int end = \ + self.n_bins_non_missing[feature_idx] - 1 + has_missing_values Y_DTYPE_C sum_hessian_left Y_DTYPE_C sum_hessian_right Y_DTYPE_C sum_gradient_left Y_DTYPE_C sum_gradient_right Y_DTYPE_C negative_loss_current_node Y_DTYPE_C gain - split_info_struct best_split - best_split.gain = -1. sum_gradient_left, sum_hessian_left = 0., 0. 
n_samples_left = 0 negative_loss_current_node = negative_loss(sum_gradients, - sum_hessians, self.l2_regularization) + sum_hessians, + self.l2_regularization) + - for bin_idx in range(self.actual_n_bins[feature_idx] - 1): - # Note that considering splitting on the last bin is useless since - # it would result in having 0 samples in the right node (forbidden) + for bin_idx in range(end): n_samples_left += histograms[feature_idx, bin_idx].count n_samples_right = n_samples_ - n_samples_left @@ -463,19 +522,103 @@ cdef class Splitter: negative_loss_current_node, self.l2_regularization) - if gain > best_split.gain and gain > self.min_gain_to_split: - best_split.gain = gain - best_split.feature_idx = feature_idx - best_split.bin_idx = bin_idx - best_split.sum_gradient_left = sum_gradient_left - best_split.sum_gradient_right = sum_gradient_right - best_split.sum_hessian_left = sum_hessian_left - best_split.sum_hessian_right = sum_hessian_right - best_split.n_samples_left = n_samples_left - best_split.n_samples_right = n_samples_right + if gain > split_info.gain and gain > self.min_gain_to_split: + split_info.gain = gain + split_info.feature_idx = feature_idx + split_info.bin_idx = bin_idx + # we scan from left to right so missing values go to the right + split_info.missing_go_to_left = False + split_info.sum_gradient_left = sum_gradient_left + split_info.sum_gradient_right = sum_gradient_right + split_info.sum_hessian_left = sum_hessian_left + split_info.sum_hessian_right = sum_hessian_right + split_info.n_samples_left = n_samples_left + split_info.n_samples_right = n_samples_right + + cdef void _find_best_bin_to_split_right_to_left( + self, + unsigned int feature_idx, + const hist_struct [:, ::1] histograms, # IN + unsigned int n_samples, + Y_DTYPE_C sum_gradients, + Y_DTYPE_C sum_hessians, + split_info_struct * split_info) nogil: # OUT + """Find best bin to split on for a given feature. + + Splits that do not satisfy the splitting constraints + (min_gain_to_split, etc.) are discarded here. + + We scan node from right to left. This version is only called when + there are missing values. Missing values are assigned to the left + child. + + If no missing value are present in the data this method isn't called + since only calling _find_best_bin_to_split_left_to_right is enough. + """ + + cdef: + unsigned int bin_idx + unsigned int n_samples_left + unsigned int n_samples_right + unsigned int n_samples_ = n_samples + Y_DTYPE_C sum_hessian_left + Y_DTYPE_C sum_hessian_right + Y_DTYPE_C sum_gradient_left + Y_DTYPE_C sum_gradient_right + Y_DTYPE_C negative_loss_current_node + Y_DTYPE_C gain + unsigned int start = self.n_bins_non_missing[feature_idx] - 2 + + sum_gradient_right, sum_hessian_right = 0., 0. 
+ n_samples_right = 0 + negative_loss_current_node = negative_loss(sum_gradients, + sum_hessians, + self.l2_regularization) + + for bin_idx in range(start, -1, -1): + n_samples_right += histograms[feature_idx, bin_idx + 1].count + n_samples_left = n_samples_ - n_samples_right + + if self.hessians_are_constant: + sum_hessian_right += histograms[feature_idx, bin_idx + 1].count + else: + sum_hessian_right += \ + histograms[feature_idx, bin_idx + 1].sum_hessians + sum_hessian_left = sum_hessians - sum_hessian_right + + sum_gradient_right += \ + histograms[feature_idx, bin_idx + 1].sum_gradients + sum_gradient_left = sum_gradients - sum_gradient_right + + if n_samples_right < self.min_samples_leaf: + continue + if n_samples_left < self.min_samples_leaf: + # won't get any better + break + + if sum_hessian_right < self.min_hessian_to_split: + continue + if sum_hessian_left < self.min_hessian_to_split: + # won't get any better (hessians are > 0 since loss is convex) + break - return best_split + gain = _split_gain(sum_gradient_left, sum_hessian_left, + sum_gradient_right, sum_hessian_right, + negative_loss_current_node, + self.l2_regularization) + if gain > split_info.gain and gain > self.min_gain_to_split: + split_info.gain = gain + split_info.feature_idx = feature_idx + split_info.bin_idx = bin_idx + # we scan from right to left so missing values go to the left + split_info.missing_go_to_left = True + split_info.sum_gradient_left = sum_gradient_left + split_info.sum_gradient_right = sum_gradient_right + split_info.sum_hessian_left = sum_hessian_left + split_info.sum_hessian_right = sum_hessian_right + split_info.n_samples_left = n_samples_left + split_info.n_samples_right = n_samples_right cdef inline Y_DTYPE_C _split_gain( Y_DTYPE_C sum_gradient_left, @@ -507,3 +650,19 @@ cdef inline Y_DTYPE_C negative_loss( Y_DTYPE_C hessian, Y_DTYPE_C l2_regularization) nogil: return (gradient * gradient) / (hessian + l2_regularization) + +cdef inline unsigned char sample_goes_left( + unsigned char missing_go_to_left, + unsigned char missing_values_bin_idx, + X_BINNED_DTYPE_C split_bin_idx, + X_BINNED_DTYPE_C bin_value) nogil: + """Helper to decide whether sample should go to left or right child.""" + + return ( + ( + missing_go_to_left and + bin_value == missing_values_bin_idx + ) + or ( + bin_value <= split_bin_idx + )) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index 8cbb26fa98178..06e38d62f7638 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -7,8 +7,9 @@ _find_binning_thresholds as _find_binning_thresholds_orig, _map_to_bins ) -from sklearn.ensemble._hist_gradient_boosting.types import X_DTYPE -from sklearn.ensemble._hist_gradient_boosting.types import X_BINNED_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import X_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import ALMOST_INF DATA = np.random.RandomState(42).normal( @@ -16,7 +17,7 @@ ).astype(X_DTYPE) -def _find_binning_thresholds(data, max_bins=256, subsample=int(2e5), +def _find_binning_thresholds(data, max_bins=255, subsample=int(2e5), random_state=None): # Just a redef to avoid having to pass arguments all the time (as the # function is private we don't use default values for parameters) @@ -52,10 +53,11 @@ def 
test_find_binning_thresholds_small_regular_data(): def test_find_binning_thresholds_random_data(): - bin_thresholds = _find_binning_thresholds(DATA, random_state=0) + bin_thresholds = _find_binning_thresholds(DATA, max_bins=255, + random_state=0) assert len(bin_thresholds) == 2 for i in range(len(bin_thresholds)): - assert bin_thresholds[i].shape == (255,) # 256 - 1 + assert bin_thresholds[i].shape == (254,) # 255 - 1 assert bin_thresholds[i].dtype == DATA.dtype assert_allclose(bin_thresholds[0][[64, 128, 192]], @@ -74,25 +76,29 @@ def test_find_binning_thresholds_low_n_bins(): assert bin_thresholds[i].dtype == DATA.dtype -def test_find_binning_thresholds_invalid_n_bins(): - err_msg = 'no smaller than 2 and no larger than 256' +@pytest.mark.parametrize('n_bins', (2, 257)) +def test_invalid_n_bins(n_bins): + err_msg = ( + 'n_bins={} should be no smaller than 3 and no larger than 256' + .format(n_bins)) with pytest.raises(ValueError, match=err_msg): - _find_binning_thresholds(DATA, max_bins=1024) + _BinMapper(n_bins=n_bins).fit(DATA) def test_bin_mapper_n_features_transform(): - mapper = _BinMapper(max_bins=42, random_state=42).fit(DATA) + mapper = _BinMapper(n_bins=42, random_state=42).fit(DATA) err_msg = 'This estimator was fitted with 2 features but 4 got passed' with pytest.raises(ValueError, match=err_msg): mapper.transform(np.repeat(DATA, 2, axis=1)) -@pytest.mark.parametrize('n_bins', [16, 128, 256]) -def test_map_to_bins(n_bins): - bin_thresholds = _find_binning_thresholds(DATA, max_bins=n_bins, +@pytest.mark.parametrize('max_bins', [16, 128, 255]) +def test_map_to_bins(max_bins): + bin_thresholds = _find_binning_thresholds(DATA, max_bins=max_bins, random_state=0) binned = np.zeros_like(DATA, dtype=X_BINNED_DTYPE, order='F') - _map_to_bins(DATA, bin_thresholds, binned) + last_bin_idx = max_bins + _map_to_bins(DATA, bin_thresholds, last_bin_idx, binned) assert binned.shape == DATA.shape assert binned.dtype == np.uint8 assert binned.flags.f_contiguous @@ -103,47 +109,52 @@ def test_map_to_bins(n_bins): for feature_idx, min_idx in enumerate(min_indices): assert binned[min_idx, feature_idx] == 0 for feature_idx, max_idx in enumerate(max_indices): - assert binned[max_idx, feature_idx] == n_bins - 1 + assert binned[max_idx, feature_idx] == max_bins - 1 -@pytest.mark.parametrize("n_bins", [5, 10, 42]) -def test_bin_mapper_random_data(n_bins): +@pytest.mark.parametrize("max_bins", [5, 10, 42]) +def test_bin_mapper_random_data(max_bins): n_samples, n_features = DATA.shape - expected_count_per_bin = n_samples // n_bins + expected_count_per_bin = n_samples // max_bins tol = int(0.05 * expected_count_per_bin) - mapper = _BinMapper(max_bins=n_bins, random_state=42).fit(DATA) + # max_bins is the number of bins for non-missing values + n_bins = max_bins + 1 + mapper = _BinMapper(n_bins=n_bins, random_state=42).fit(DATA) binned = mapper.transform(DATA) assert binned.shape == (n_samples, n_features) assert binned.dtype == np.uint8 assert_array_equal(binned.min(axis=0), np.array([0, 0])) - assert_array_equal(binned.max(axis=0), np.array([n_bins - 1, n_bins - 1])) + assert_array_equal(binned.max(axis=0), + np.array([max_bins - 1, max_bins - 1])) assert len(mapper.bin_thresholds_) == n_features for bin_thresholds_feature in mapper.bin_thresholds_: - assert bin_thresholds_feature.shape == (n_bins - 1,) + assert bin_thresholds_feature.shape == (max_bins - 1,) assert bin_thresholds_feature.dtype == DATA.dtype - assert np.all(mapper.actual_n_bins_ == n_bins) + assert np.all(mapper.n_bins_non_missing_ == 
max_bins) # Check that the binned data is approximately balanced across bins. for feature_idx in range(n_features): - for bin_idx in range(n_bins): + for bin_idx in range(max_bins): count = (binned[:, feature_idx] == bin_idx).sum() assert abs(count - expected_count_per_bin) < tol -@pytest.mark.parametrize("n_samples, n_bins", [ +@pytest.mark.parametrize("n_samples, max_bins", [ (5, 5), (5, 10), (5, 11), (42, 255) ]) -def test_bin_mapper_small_random_data(n_samples, n_bins): +def test_bin_mapper_small_random_data(n_samples, max_bins): data = np.random.RandomState(42).normal(size=n_samples).reshape(-1, 1) assert len(np.unique(data)) == n_samples - mapper = _BinMapper(max_bins=n_bins, random_state=42) + # max_bins is the number of bins for non-missing values + n_bins = max_bins + 1 + mapper = _BinMapper(n_bins=n_bins, random_state=42) binned = mapper.fit_transform(data) assert binned.shape == data.shape @@ -152,14 +163,16 @@ def test_bin_mapper_small_random_data(n_samples, n_bins): np.arange(n_samples)) -@pytest.mark.parametrize("n_bins, n_distinct, multiplier", [ +@pytest.mark.parametrize("max_bins, n_distinct, multiplier", [ (5, 5, 1), (5, 5, 3), (255, 12, 42), ]) -def test_bin_mapper_identity_repeated_values(n_bins, n_distinct, multiplier): +def test_bin_mapper_identity_repeated_values(max_bins, n_distinct, multiplier): data = np.array(list(range(n_distinct)) * multiplier).reshape(-1, 1) - binned = _BinMapper(max_bins=n_bins).fit_transform(data) + # max_bins is the number of bins for non-missing values + n_bins = max_bins + 1 + binned = _BinMapper(n_bins=n_bins).fit_transform(data) assert_array_equal(data, binned) @@ -176,59 +189,62 @@ def test_bin_mapper_repeated_values_invariance(n_distinct): data = data.reshape(-1, 1) - mapper_1 = _BinMapper(max_bins=n_distinct) + mapper_1 = _BinMapper(n_bins=n_distinct + 1) binned_1 = mapper_1.fit_transform(data) assert_array_equal(np.unique(binned_1[:, 0]), np.arange(n_distinct)) # Adding more bins to the mapper yields the same results (same thresholds) - mapper_2 = _BinMapper(max_bins=min(256, n_distinct * 3)) + mapper_2 = _BinMapper(n_bins=min(256, n_distinct * 3) + 1) binned_2 = mapper_2.fit_transform(data) assert_allclose(mapper_1.bin_thresholds_[0], mapper_2.bin_thresholds_[0]) assert_array_equal(binned_1, binned_2) -@pytest.mark.parametrize("n_bins, scale, offset", [ +@pytest.mark.parametrize("max_bins, scale, offset", [ (3, 2, -1), (42, 1, 0), - (256, 0.3, 42), + (255, 0.3, 42), ]) -def test_bin_mapper_identity_small(n_bins, scale, offset): - data = np.arange(n_bins).reshape(-1, 1) * scale + offset - binned = _BinMapper(max_bins=n_bins).fit_transform(data) - assert_array_equal(binned, np.arange(n_bins).reshape(-1, 1)) +def test_bin_mapper_identity_small(max_bins, scale, offset): + data = np.arange(max_bins).reshape(-1, 1) * scale + offset + # max_bins is the number of bins for non-missing values + n_bins = max_bins + 1 + binned = _BinMapper(n_bins=n_bins).fit_transform(data) + assert_array_equal(binned, np.arange(max_bins).reshape(-1, 1)) -@pytest.mark.parametrize('n_bins_small, n_bins_large', [ +@pytest.mark.parametrize('max_bins_small, max_bins_large', [ (2, 2), (3, 3), (4, 4), (42, 42), - (256, 256), + (255, 255), (5, 17), - (42, 256), + (42, 255), ]) -def test_bin_mapper_idempotence(n_bins_small, n_bins_large): - assert n_bins_large >= n_bins_small +def test_bin_mapper_idempotence(max_bins_small, max_bins_large): + assert max_bins_large >= max_bins_small data = np.random.RandomState(42).normal(size=30000).reshape(-1, 1) - mapper_small = 
_BinMapper(max_bins=n_bins_small)
-    mapper_large = _BinMapper(max_bins=n_bins_large)
+    mapper_small = _BinMapper(n_bins=max_bins_small + 1)
+    mapper_large = _BinMapper(n_bins=max_bins_large + 1)
     binned_small = mapper_small.fit_transform(data)
     binned_large = mapper_large.fit_transform(binned_small)
     assert_array_equal(binned_small, binned_large)
 
 
-@pytest.mark.parametrize('max_bins', [10, 100, 256])
+@pytest.mark.parametrize('n_bins', [10, 100, 256])
 @pytest.mark.parametrize('diff', [-5, 0, 5])
-def test_actual_n_bins(max_bins, diff):
-    # Check that actual_n_bins is n_unique_values when
-    # n_unique_values <= max_bins, else max_bins.
+def test_n_bins_non_missing(n_bins, diff):
+    # Check that n_bins_non_missing is n_unique_values when
+    # there are not a lot of unique values, else n_bins - 1.
 
-    n_unique_values = max_bins + diff
+    n_unique_values = n_bins + diff
     X = list(range(n_unique_values)) * 2
     X = np.array(X).reshape(-1, 1)
-    mapper = _BinMapper(max_bins=max_bins).fit(X)
-    assert np.all(mapper.actual_n_bins_ == min(max_bins, n_unique_values))
+    mapper = _BinMapper(n_bins=n_bins).fit(X)
+    assert np.all(mapper.n_bins_non_missing_ == min(
+        n_bins - 1, n_unique_values))
 
 
 def test_subsample():
@@ -242,6 +258,48 @@ def test_subsample():
                            rtol=1e-4)
 
 
+@pytest.mark.parametrize(
+    'n_bins, n_bins_non_missing, X_trans_expected', [
+        (256, [4, 2, 2], [[0, 0, 0],  # 255 <=> missing value
+                          [255, 255, 0],
+                          [1, 0, 0],
+                          [255, 1, 1],
+                          [2, 1, 1],
+                          [3, 0, 0]]),
+        (3, [2, 2, 2], [[0, 0, 0],  # 2 <=> missing value
+                        [2, 2, 0],
+                        [0, 0, 0],
+                        [2, 1, 1],
+                        [1, 1, 1],
+                        [1, 0, 0]])])
+def test_missing_values_support(n_bins, n_bins_non_missing, X_trans_expected):
+    # check for missing values: make sure nans are mapped to the last bin
+    # and that the _BinMapper attributes are correct
+
+    X = [[1, 1, 0],
+         [np.NaN, np.NaN, 0],
+         [2, 1, 0],
+         [np.NaN, 2, 1],
+         [3, 2, 1],
+         [4, 1, 0]]
+
+    X = np.array(X)
+
+    mapper = _BinMapper(n_bins=n_bins)
+    mapper.fit(X)
+
+    assert_array_equal(mapper.n_bins_non_missing_, n_bins_non_missing)
+
+    for feature_idx in range(X.shape[1]):
+        assert len(mapper.bin_thresholds_[feature_idx]) == \
+            n_bins_non_missing[feature_idx] - 1
+
+    assert mapper.missing_values_bin_idx_ == n_bins - 1
+
+    X_trans = mapper.transform(X)
+    assert_array_equal(X_trans, X_trans_expected)
+
+
 def test_infinite_values():
     # Make sure infinite values are properly handled.
bin_mapper = _BinMapper() @@ -249,8 +307,8 @@ def test_infinite_values(): X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1) bin_mapper.fit(X) - assert_allclose(bin_mapper.bin_thresholds_[0], [-np.inf, .5, np.inf]) - assert bin_mapper.actual_n_bins_ == [4] + assert_allclose(bin_mapper.bin_thresholds_[0], [-np.inf, .5, ALMOST_INF]) + assert bin_mapper.n_bins_non_missing_ == [4] expected_binned_X = np.array([0, 1, 2, 3]).reshape(-1, 1) assert_array_equal(bin_mapper.transform(X), expected_binned_X) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py index 95672a60e5c40..63d8c8fb1059d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py @@ -43,7 +43,7 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples, rng = np.random.RandomState(seed=seed) n_samples = n_samples max_iter = 1 - max_bins = 256 + max_bins = 255 X, y = make_regression(n_samples=n_samples, n_features=5, n_informative=5, random_state=0) @@ -51,7 +51,7 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples, if n_samples > 255: # bin data and convert it to float32 so that the estimator doesn't # treat it as pre-binned - X = _BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32) + X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) @@ -95,7 +95,7 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples, rng = np.random.RandomState(seed=seed) n_samples = n_samples max_iter = 1 - max_bins = 256 + max_bins = 255 X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5, n_informative=5, n_redundant=0, random_state=0) @@ -103,7 +103,7 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples, if n_samples > 255: # bin data and convert it to float32 so that the estimator doesn't # treat it as pre-binned - X = _BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32) + X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) @@ -155,7 +155,7 @@ def test_same_predictions_multiclass_classification( rng = np.random.RandomState(seed=seed) n_samples = n_samples max_iter = 1 - max_bins = 256 + max_bins = 255 lr = 1 X, y = make_classification(n_samples=n_samples, n_classes=3, n_features=5, @@ -165,7 +165,7 @@ def test_same_predictions_multiclass_classification( if n_samples > 255: # bin data and convert it to float32 so that the estimator doesn't # treat it as pre-binned - X = _BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32) + X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index be7e424a844bc..1eebdefd5288d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -1,6 +1,11 @@ import numpy as np import pytest +from numpy.testing import assert_allclose from sklearn.datasets import make_classification, make_regression +from sklearn.preprocessing import 
KBinsDiscretizer, MinMaxScaler +from sklearn.model_selection import train_test_split +from sklearn.base import clone, BaseEstimator, TransformerMixin +from sklearn.pipeline import make_pipeline # To use this experimental feature, we need to explicitly ask for it: from sklearn.experimental import enable_hist_gradient_boosting # noqa @@ -31,7 +36,7 @@ ({'min_samples_leaf': 0}, 'min_samples_leaf=0 should not be smaller'), ({'l2_regularization': -1}, 'l2_regularization=-1 must be positive'), ({'max_bins': 1}, 'max_bins=1 should be no smaller than 2 and no larger'), - ({'max_bins': 257}, 'max_bins=257 should be no smaller than 2 and no'), + ({'max_bins': 256}, 'max_bins=256 should be no smaller than 2 and no'), ({'n_iter_no_change': -1}, 'n_iter_no_change=-1 must be positive'), ({'validation_fraction': -1}, 'validation_fraction=-1 must be strictly'), ({'validation_fraction': 0}, 'validation_fraction=0 must be strictly'), @@ -170,10 +175,65 @@ def test_binning_train_validation_are_separated(): mapper_whole_data.fit(X_classification) n_samples = X_classification.shape[0] - assert np.all(mapper_training_data.actual_n_bins_ == + assert np.all(mapper_training_data.n_bins_non_missing_ == int((1 - validation_fraction) * n_samples)) - assert np.all(mapper_training_data.actual_n_bins_ != - mapper_whole_data.actual_n_bins_) + assert np.all(mapper_training_data.n_bins_non_missing_ != + mapper_whole_data.n_bins_non_missing_) + + +def test_missing_values_trivial(): + # sanity check for missing values support. With only one feature and + # y == isnan(X), the gbdt is supposed to reach perfect accuracy on the + # training set. + + n_samples = 100 + n_features = 1 + rng = np.random.RandomState(0) + + X = rng.normal(size=(n_samples, n_features)) + mask = rng.binomial(1, .5, size=X.shape).astype(np.bool) + X[mask] = np.nan + y = mask.ravel() + gb = HistGradientBoostingClassifier() + gb.fit(X, y) + + assert gb.score(X, y) == pytest.approx(1) + + +@pytest.mark.parametrize('problem', ('classification', 'regression')) +@pytest.mark.parametrize( + 'missing_proportion, expected_min_score_classification, ' + 'expected_min_score_regression', [ + (.1, .97, .89), + (.2, .93, .81), + (.5, .79, .52)]) +def test_missing_values_resilience(problem, missing_proportion, + expected_min_score_classification, + expected_min_score_regression): + # Make sure the estimators can deal with missing values and still yield + # decent predictions + + rng = np.random.RandomState(0) + n_samples = 1000 + n_features = 2 + if problem == 'regression': + X, y = make_regression(n_samples=n_samples, n_features=n_features, + n_informative=n_features, random_state=rng) + gb = HistGradientBoostingRegressor() + expected_min_score = expected_min_score_regression + else: + X, y = make_classification(n_samples=n_samples, n_features=n_features, + n_informative=n_features, n_redundant=0, + n_repeated=0, random_state=rng) + gb = HistGradientBoostingClassifier() + expected_min_score = expected_min_score_classification + + mask = rng.binomial(1, missing_proportion, size=X.shape).astype(np.bool) + X[mask] = np.nan + + gb.fit(X, y) + + assert gb.score(X, y) > expected_min_score @pytest.mark.parametrize('data', [ @@ -222,7 +282,115 @@ def test_small_trainset(): assert small_distrib == pytest.approx(original_distrib) +def test_missing_values_minmax_imputation(): + # Compare the buit-in missing value handling of Histogram GBC with an + # a-priori missing value imputation strategy that should yield the same + # results in terms of decision function. 
+ # + # Each feature (containing NaNs) is replaced by 2 features: + # - one where the nans are replaced by min(feature) - 1 + # - one where the nans are replaced by max(feature) + 1 + # A split where nans go to the left has an equivalent split in the + # first (min) feature, and a split where nans go to the right has an + # equivalent split in the second (max) feature. + # + # Assuming the data is such that there is never a tie to select the best + # feature to split on during training, the learned decision trees should be + # strictly equivalent (learn a sequence of splits that encode the same + # decision function). + # + # The MinMaxImputer transformer is meant to be a toy implementation of the + # "Missing In Attributes" (MIA) missing value handling for decision trees + # https://www.sciencedirect.com/science/article/abs/pii/S0167865508000305 + # The implementation of MIA as an imputation transformer was suggested by + # "Remark 3" in https://arxiv.org/abs/1902.06931 + + class MinMaxImputer(BaseEstimator, TransformerMixin): + + def fit(self, X, y=None): + mm = MinMaxScaler().fit(X) + self.data_min_ = mm.data_min_ + self.data_max_ = mm.data_max_ + return self + + def transform(self, X): + X_min, X_max = X.copy(), X.copy() + + for feature_idx in range(X.shape[1]): + nan_mask = np.isnan(X[:, feature_idx]) + X_min[nan_mask, feature_idx] = self.data_min_[feature_idx] - 1 + X_max[nan_mask, feature_idx] = self.data_max_[feature_idx] + 1 + + return np.concatenate([X_min, X_max], axis=1) + + def make_missing_value_data(n_samples=int(1e4), seed=0): + rng = np.random.RandomState(seed) + X, y = make_regression(n_samples=n_samples, n_features=4, + random_state=rng) + + # Pre-bin the data to ensure a deterministic handling by the 2 + # strategies and also make it easier to insert np.nan in a structured + # way: + X = KBinsDiscretizer(n_bins=42, encode="ordinal").fit_transform(X) + + # First feature has missing values completely at random: + rnd_mask = rng.rand(X.shape[0]) > 0.9 + X[rnd_mask, 0] = np.nan + + # Second and third features have missing values for extreme values + # (censoring missingness): + low_mask = X[:, 1] == 0 + X[low_mask, 1] = np.nan + + high_mask = X[:, 2] == X[:, 2].max() + X[high_mask, 2] = np.nan + + # Make the last feature nan pattern very informative: + y_max = np.percentile(y, 70) + y_max_mask = y >= y_max + y[y_max_mask] = y_max + X[y_max_mask, 3] = np.nan + + # Check that there is at least one missing value in each feature: + for feature_idx in range(X.shape[1]): + assert any(np.isnan(X[:, feature_idx])) + + # Let's use a test set to check that the learned decision function is + # the same as evaluated on unseen data. Otherwise it could just be the + # case that we find two independent ways to overfit the training set. + return train_test_split(X, y, random_state=rng) + + # n_samples need to be large enough to minimize the likelihood of having + # several candidate splits with the same gain value in a given tree. + X_train, X_test, y_train, y_test = make_missing_value_data( + n_samples=int(1e4), seed=0) + + # Use a small number of leaf nodes and iterations so as to keep + # under-fitting models to minimize the likelihood of ties when training the + # model. 
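+    # (If a tie did occur, the two pipelines could legitimately pick
+    # different, equally good splits, and the strict equivalence checked
+    # below would not hold.)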
+ gbm1 = HistGradientBoostingRegressor(max_iter=100, + max_leaf_nodes=5, + random_state=0) + gbm1.fit(X_train, y_train) + + gbm2 = make_pipeline(MinMaxImputer(), clone(gbm1)) + gbm2.fit(X_train, y_train) + + # Check that the model reach the same score: + assert gbm1.score(X_train, y_train) == \ + pytest.approx(gbm2.score(X_train, y_train)) + + assert gbm1.score(X_test, y_test) == \ + pytest.approx(gbm2.score(X_test, y_test)) + + # Check the individual prediction match as a finer grained + # decision function check. + assert_allclose(gbm1.predict(X_train), gbm2.predict(X_train)) + assert_allclose(gbm1.predict(X_test), gbm2.predict(X_test)) + + def test_infinite_values(): + # Basic test for infinite values X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1) y = np.array([0, 0, 1, 1]) @@ -230,3 +398,20 @@ def test_infinite_values(): gbdt = HistGradientBoostingRegressor(min_samples_leaf=1) gbdt.fit(X, y) np.testing.assert_allclose(gbdt.predict(X), y, atol=1e-4) + + +def test_infinite_values_missing_values(): + # High level test making sure that inf and nan values are properly handled + # when both are present. This is similar to + # test_split_on_nan_with_infinite_values() in test_grower.py, though we + # cannot check the predicitons for binned values here. + + X = np.asarray([-np.inf, 0, 1, np.inf, np.nan]).reshape(-1, 1) + y_isnan = np.isnan(X.ravel()) + y_isinf = X.ravel() == np.inf + + stump_clf = HistGradientBoostingClassifier(min_samples_leaf=1, max_iter=1, + learning_rate=1, max_depth=2) + + assert stump_clf.fit(X, y_isinf).score(X, y_isinf) == 1 + assert stump_clf.fit(X, y_isnan).score(X, y_isnan) == 1 diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index 49b19ce2778dd..0cc301b7b1b36 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -4,9 +4,9 @@ from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper -from sklearn.ensemble._hist_gradient_boosting.types import X_BINNED_DTYPE -from sklearn.ensemble._hist_gradient_boosting.types import Y_DTYPE -from sklearn.ensemble._hist_gradient_boosting.types import G_H_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import Y_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE def _make_training_data(n_bins=256, constant_hessian=True): @@ -85,7 +85,7 @@ def test_grow_tree(n_bins, constant_hessian, stopping_param, shrinkage): stopping_param = {"min_gain_to_split": 0.01} grower = TreeGrower(X_binned, all_gradients, all_hessians, - max_bins=n_bins, shrinkage=shrinkage, + n_bins=n_bins, shrinkage=shrinkage, min_samples_leaf=1, **stopping_param) # The root node is not yet splitted, but the best possible split has @@ -147,7 +147,7 @@ def test_predictor_from_grower(): X_binned, all_gradients, all_hessians = _make_training_data( n_bins=n_bins) grower = TreeGrower(X_binned, all_gradients, all_hessians, - max_bins=n_bins, shrinkage=1., + n_bins=n_bins, shrinkage=1., max_leaf_nodes=3, min_samples_leaf=5) grower.grow() assert grower.n_nodes == 5 # (2 decision nodes + 3 leaves) @@ -163,22 +163,23 @@ def test_predictor_from_grower(): input_data = np.array([ [0, 0], [42, 99], - [128, 255], + [128, 254], [129, 0], [129, 85], - [255, 85], + [254, 85], [129, 86], - [129, 255], + [129, 
254], [242, 100], ], dtype=np.uint8) - predictions = predictor.predict_binned(input_data) + missing_values_bin_idx = n_bins - 1 + predictions = predictor.predict_binned(input_data, missing_values_bin_idx) expected_targets = [1, 1, 1, 1, 1, 1, -1, -1, -1] assert np.allclose(predictions, expected_targets) # Check that training set can be recovered exactly: - predictions = predictor.predict_binned(X_binned) + predictions = predictor.predict_binned(X_binned, missing_values_bin_idx) assert np.allclose(predictions, -all_gradients) @@ -203,14 +204,14 @@ def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins, if noise: y_scale = y.std() y += rng.normal(scale=noise, size=n_samples) * y_scale - mapper = _BinMapper(max_bins=n_bins) + mapper = _BinMapper(n_bins=n_bins) X = mapper.fit_transform(X) all_gradients = y.astype(G_H_DTYPE) shape_hessian = 1 if constant_hessian else all_gradients.shape all_hessians = np.ones(shape=shape_hessian, dtype=G_H_DTYPE) grower = TreeGrower(X, all_gradients, all_hessians, - max_bins=n_bins, shrinkage=1., + n_bins=n_bins, shrinkage=1., min_samples_leaf=min_samples_leaf, max_leaf_nodes=n_samples) grower.grow() @@ -235,18 +236,18 @@ def test_min_samples_leaf_root(n_samples, min_samples_leaf): # min_samples_leaf rng = np.random.RandomState(seed=0) - max_bins = 255 + n_bins = 256 # data = linear target, 3 features, 1 irrelevant. X = rng.normal(size=(n_samples, 3)) y = X[:, 0] - X[:, 1] - mapper = _BinMapper(max_bins=max_bins) + mapper = _BinMapper(n_bins=n_bins) X = mapper.fit_transform(X) all_gradients = y.astype(G_H_DTYPE) all_hessians = np.ones(shape=1, dtype=G_H_DTYPE) grower = TreeGrower(X, all_gradients, all_hessians, - max_bins=max_bins, shrinkage=1., + n_bins=n_bins, shrinkage=1., min_samples_leaf=min_samples_leaf, max_leaf_nodes=n_samples) grower.grow() @@ -261,13 +262,13 @@ def test_max_depth(max_depth): # Make sure max_depth parameter works as expected rng = np.random.RandomState(seed=0) - max_bins = 255 + n_bins = 256 n_samples = 1000 # data = linear target, 3 features, 1 irrelevant. X = rng.normal(size=(n_samples, 3)) y = X[:, 0] - X[:, 1] - mapper = _BinMapper(max_bins=max_bins) + mapper = _BinMapper(n_bins=n_bins) X = mapper.fit_transform(X) all_gradients = y.astype(G_H_DTYPE) @@ -307,3 +308,80 @@ def test_init_parameters_validation(): match="min_hessian_to_split=-1 must be positive"): TreeGrower(X_binned, all_gradients, all_hessians, min_hessian_to_split=-1) + + +def test_missing_value_predict_only(): + # Make sure that missing values are supported at predict time even if they + # were not encountered in the training data: the missing values are + # assigned to whichever child has the most samples. + + rng = np.random.RandomState(0) + n_samples = 100 + X_binned = rng.randint(0, 256, size=(n_samples, 1), dtype=np.uint8) + X_binned = np.asfortranarray(X_binned) + + gradients = rng.normal(size=n_samples).astype(G_H_DTYPE) + hessians = np.ones(shape=1, dtype=G_H_DTYPE) + + grower = TreeGrower(X_binned, gradients, hessians, min_samples_leaf=5, + has_missing_values=False) + grower.grow() + + predictor = grower.make_predictor() + + # go from root to a leaf, always following node with the most samples. 
+ # That's the path nans are supposed to take + node = predictor.nodes[0] + while not node['is_leaf']: + left = predictor.nodes[node['left']] + right = predictor.nodes[node['right']] + node = left if left['count'] > right['count'] else right + + prediction_main_path = node['value'] + + # now build X_test with only nans, and make sure all predictions are equal + # to prediction_main_path + all_nans = np.full(shape=(n_samples, 1), fill_value=np.nan) + assert np.all(predictor.predict(all_nans) == prediction_main_path) + + +def test_split_on_nan_with_infinite_values(): + # Make sure the split on nan situations are respected even when there are + # samples with +inf values (we set the threshold to +inf when we have a + # split on nan so this test makes sure this does not introduce edge-case + # bugs). We need to use the private API so that we can also test + # predict_binned(). + + X = np.array([0, 1, np.inf, np.nan, np.nan]).reshape(-1, 1) + # the gradient values will force a split on nan situation + gradients = np.array([0, 0, 0, 100, 100], dtype=G_H_DTYPE) + hessians = np.ones(shape=1, dtype=G_H_DTYPE) + + bin_mapper = _BinMapper() + X_binned = bin_mapper.fit_transform(X) + + n_bins_non_missing = 3 + has_missing_values = True + grower = TreeGrower(X_binned, gradients, hessians, + n_bins_non_missing=n_bins_non_missing, + has_missing_values=has_missing_values, + min_samples_leaf=1) + + grower.grow() + + predictor = grower.make_predictor( + bin_thresholds=bin_mapper.bin_thresholds_ + ) + + # sanity check: this was a split on nan + assert predictor.nodes[0]['threshold'] == np.inf + assert predictor.nodes[0]['bin_threshold'] == n_bins_non_missing - 1 + + # Make sure in particular that the +inf sample is mapped to the left child + # Note that lightgbm "fails" here and will assign the inf sample to the + # right child, even though it's a "split on nan" situation. 
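+    # (Recall that a split on nan stores a numerical threshold of +inf and
+    # the predictor sends samples with a value <= threshold to the left
+    # child, so the +inf sample follows the non-missing values to the left.)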
+ predictions = predictor.predict(X) + predictions_binned = predictor.predict_binned( + X_binned, missing_values_bin_idx=bin_mapper.missing_values_bin_idx_) + assert np.all(predictions == -gradients) + assert np.all(predictions_binned == -gradients) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py index c425a0389a789..1ffb08353b30a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py @@ -12,9 +12,9 @@ _build_histogram_root, _subtract_histograms ) -from sklearn.ensemble._hist_gradient_boosting.types import HISTOGRAM_DTYPE -from sklearn.ensemble._hist_gradient_boosting.types import G_H_DTYPE -from sklearn.ensemble._hist_gradient_boosting.types import X_BINNED_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import HISTOGRAM_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE @pytest.mark.parametrize( diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index 29b5b6b47a04a..b49acc52b6e40 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -7,8 +7,8 @@ import pytest from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES -from sklearn.ensemble._hist_gradient_boosting.types import Y_DTYPE -from sklearn.ensemble._hist_gradient_boosting.types import G_H_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import Y_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE def get_derivatives_helper(loss): diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py index cb7d7a804f29e..7df1e616445fc 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py @@ -7,17 +7,17 @@ from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower from sklearn.ensemble._hist_gradient_boosting.predictor import TreePredictor -from sklearn.ensemble._hist_gradient_boosting.types import ( - G_H_DTYPE, PREDICTOR_RECORD_DTYPE) +from sklearn.ensemble._hist_gradient_boosting.common import ( + G_H_DTYPE, PREDICTOR_RECORD_DTYPE, ALMOST_INF) -@pytest.mark.parametrize('max_bins', [200, 256]) -def test_boston_dataset(max_bins): +@pytest.mark.parametrize('n_bins', [200, 256]) +def test_boston_dataset(n_bins): X, y = load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split( X, y, random_state=42) - mapper = _BinMapper(max_bins=max_bins, random_state=42) + mapper = _BinMapper(n_bins=n_bins, random_state=42) X_train_binned = mapper.fit_transform(X_train) # Init gradients and hessians to that of least squares loss @@ -28,8 +28,8 @@ def test_boston_dataset(max_bins): max_leaf_nodes = 31 grower = TreeGrower(X_train_binned, gradients, hessians, min_samples_leaf=min_samples_leaf, - max_leaf_nodes=max_leaf_nodes, max_bins=max_bins, - actual_n_bins=mapper.actual_n_bins_) + max_leaf_nodes=max_leaf_nodes, n_bins=n_bins, + n_bins_non_missing=mapper.n_bins_non_missing_) grower.grow() predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_) @@ 
-42,12 +42,14 @@ def test_boston_dataset(max_bins): (-np.inf, [0, 1, 1, 1]), (10, [0, 0, 1, 1]), (20, [0, 0, 0, 1]), - (np.inf, [0, 0, 0, 1]), + (ALMOST_INF, [0, 0, 0, 1]), + (np.inf, [0, 0, 0, 0]), ]) def test_infinite_values_and_thresholds(threshold, expected_predictions): # Make sure infinite values and infinite thresholds are handled properly. - # In paticular, if a value is +inf and the threhsold is +inf, the sample - # should go to the right child. + # In particular, if a value is +inf and the threshold is ALMOST_INF the + # sample should go to the right child. If the threshold is inf (split on + # nan), the +inf sample will go to the left child. X = np.array([-np.inf, 10, 20, np.inf]).reshape(-1, 1) nodes = np.zeros(3, dtype=PREDICTOR_RECORD_DTYPE) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index c8afac4fbab2c..a0eb6c6ab61c5 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -1,9 +1,9 @@ import numpy as np import pytest -from sklearn.ensemble._hist_gradient_boosting.types import HISTOGRAM_DTYPE -from sklearn.ensemble._hist_gradient_boosting.types import G_H_DTYPE -from sklearn.ensemble._hist_gradient_boosting.types import X_BINNED_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import HISTOGRAM_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE from sklearn.ensemble._hist_gradient_boosting.splitting import Splitter from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder from sklearn.utils.testing import skip_if_32bit @@ -18,7 +18,7 @@ def test_histogram_split(n_bins): min_samples_leaf = 1 min_gain_to_split = 0. 
X_binned = np.asfortranarray( - rng.randint(0, n_bins, size=(int(1e4), 1)), dtype=X_BINNED_DTYPE) + rng.randint(0, n_bins - 1, size=(int(1e4), 1)), dtype=X_BINNED_DTYPE) binned_feature = X_binned.T[feature_idx] sample_indices = np.arange(binned_feature.shape[0], dtype=np.uint32) ordered_hessians = np.ones_like(binned_feature, dtype=G_H_DTYPE) @@ -26,7 +26,7 @@ def test_histogram_split(n_bins): sum_hessians = all_hessians.sum() hessians_are_constant = False - for true_bin in range(1, n_bins - 1): + for true_bin in range(1, n_bins - 2): for sign in [-1, 1]: ordered_gradients = np.full_like(binned_feature, sign, dtype=G_H_DTYPE) @@ -34,15 +34,20 @@ def test_histogram_split(n_bins): all_gradients = ordered_gradients sum_gradients = all_gradients.sum() - actual_n_bins = np.array([n_bins] * X_binned.shape[1], - dtype=np.uint32) builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant) + n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], + dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], + dtype=np.uint8) + missing_values_bin_idx = n_bins - 1 splitter = Splitter(X_binned, - actual_n_bins, + n_bins_non_missing, + missing_values_bin_idx, + has_missing_values, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, @@ -96,13 +101,16 @@ def test_gradient_and_hessian_sanity(constant_hessian): all_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE) sum_hessians = all_hessians.sum() - actual_n_bins = np.array([n_bins] * X_binned.shape[1], - dtype=np.uint32) builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, constant_hessian) - splitter = Splitter(X_binned, actual_n_bins, - l2_regularization, min_hessian_to_split, - min_samples_leaf, min_gain_to_split, constant_hessian) + n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], + dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + missing_values_bin_idx = n_bins - 1 + splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx, + has_missing_values, l2_regularization, + min_hessian_to_split, min_samples_leaf, + min_gain_to_split, constant_hessian) hists_parent = builder.compute_histograms_brute(sample_indices) si_parent = splitter.find_node_split(n_samples, hists_parent, @@ -192,15 +200,17 @@ def test_split_indices(): sum_hessians = 1 * n_samples hessians_are_constant = True - actual_n_bins = np.array([n_bins] * X_binned.shape[1], - dtype=np.uint32) builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant) - splitter = Splitter(X_binned, actual_n_bins, - l2_regularization, min_hessian_to_split, - min_samples_leaf, min_gain_to_split, - hessians_are_constant) + n_bins_non_missing = np.array([n_bins] * X_binned.shape[1], + dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + missing_values_bin_idx = n_bins - 1 + splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx, + has_missing_values, l2_regularization, + min_hessian_to_split, min_samples_leaf, + min_gain_to_split, hessians_are_constant) assert np.all(sample_indices == splitter.partition) @@ -248,11 +258,151 @@ def test_min_gain_to_split(): sum_hessians = all_hessians.sum() hessians_are_constant = False - actual_n_bins = np.array([n_bins] * X_binned.shape[1], - dtype=np.uint32) builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant) - splitter = Splitter(X_binned, 
actual_n_bins, + n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], + dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + missing_values_bin_idx = n_bins - 1 + splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx, + has_missing_values, l2_regularization, + min_hessian_to_split, min_samples_leaf, + min_gain_to_split, hessians_are_constant) + + histograms = builder.compute_histograms_brute(sample_indices) + split_info = splitter.find_node_split(n_samples, histograms, + sum_gradients, sum_hessians) + assert split_info.gain == -1 + + +@pytest.mark.parametrize( + 'X_binned, all_gradients, has_missing_values, n_bins_non_missing, ' + ' expected_split_on_nan, expected_bin_idx, expected_go_to_left', [ + + # basic sanity check with no missing values: given the gradient + # values, the split must occur on bin_idx=3 + ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], # X_binned + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], # gradients + False, # no missing values + 10, # n_bins_non_missing + False, # don't split on nans + 3, # expected_bin_idx + 'not_applicable'), + + # We replace 2 samples by NaNs (bin_idx=8) + # These 2 samples were mapped to the left node before, so they should + # be mapped to left node again + # Notice how the bin_idx threshold changes from 3 to 1. + ([8, 0, 1, 8, 2, 3, 4, 5, 6, 7], # 8 <=> missing + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], + True, # missing values + 8, # n_bins_non_missing + False, # don't split on nans + 1, # cut on bin_idx=1 + True), # missing values go to left + + # same as above, but with non-consecutive missing_values_bin + ([9, 0, 1, 9, 2, 3, 4, 5, 6, 7], # 9 <=> missing + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], + True, # missing values + 8, # n_bins_non_missing + False, # don't split on nans + 1, # cut on bin_idx=1 + True), # missing values go to left + + # this time replacing 2 samples that were on the right. + ([0, 1, 2, 3, 8, 4, 8, 5, 6, 7], # 8 <=> missing + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], + True, # missing values + 8, # n_bins_non_missing + False, # don't split on nans + 3, # cut on bin_idx=3 (like in first case) + False), # missing values go to right + + # same as above, but with non-consecutive missing_values_bin + ([0, 1, 2, 3, 9, 4, 9, 5, 6, 7], # 9 <=> missing + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], + True, # missing values + 8, # n_bins_non_missing + False, # don't split on nans + 3, # cut on bin_idx=3 (like in first case) + False), # missing values go to right + + # For the following cases, split_on_nans is True (we replace all of + # the samples with nans, instead of just 2). 
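+        # (A "split on nan" means the best bin is the last non-missing bin,
+        # i.e. bin_idx == n_bins_non_missing - 1: all non-missing values go
+        # to the left child and the nans go to the right child. E.g. with
+        # n_bins_non_missing=4 below, the cut at bin_idx=3 sends bins 0-3 to
+        # the left and the missing-values bin to the right.)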
+        ([0, 1, 2, 3, 4, 4, 4, 4, 4, 4],  # 4 <=> missing
+         [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
+         True,  # missing values
+         4,  # n_bins_non_missing
+         True,  # split on nans
+         3,  # cut on bin_idx=3
+         False),  # missing values go to right
+
+        # same as above, but with non-consecutive missing_values_bin
+        ([0, 1, 2, 3, 9, 9, 9, 9, 9, 9],  # 9 <=> missing
+         [1, 1, 1, 1, 1, 1, 5, 5, 5, 5],
+         True,  # missing values
+         4,  # n_bins_non_missing
+         True,  # split on nans
+         3,  # cut on bin_idx=3
+         False),  # missing values go to right
+
+        ([6, 6, 6, 6, 0, 1, 2, 3, 4, 5],  # 6 <=> missing
+         [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
+         True,  # missing values
+         6,  # n_bins_non_missing
+         True,  # split on nans
+         5,  # cut on bin_idx=5
+         False),  # missing values go to right
+
+        # same as above, but with non-consecutive missing_values_bin
+        ([9, 9, 9, 9, 0, 1, 2, 3, 4, 5],  # 9 <=> missing
+         [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
+         True,  # missing values
+         6,  # n_bins_non_missing
+         True,  # split on nans
+         5,  # cut on bin_idx=5
+         False),  # missing values go to right
+    ]
+)
+def test_splitting_missing_values(X_binned, all_gradients,
+                                  has_missing_values, n_bins_non_missing,
+                                  expected_split_on_nan, expected_bin_idx,
+                                  expected_go_to_left):
+    # Make sure missing values are properly supported.
+    # We build an artificial example with gradients such that the best split
+    # is on bin_idx=3, when there are no missing values.
+    # Then we introduce missing values and:
+    #   - make sure the chosen bin is correct (find_best_bin()): it's
+    #     still the same split, even though the index of the bin may change
+    #   - make sure the missing values are mapped to the correct child
+    #     (split_indices())
+
+    n_bins = max(X_binned) + 1
+    n_samples = len(X_binned)
+    l2_regularization = 0.
+    min_hessian_to_split = 1e-3
+    min_samples_leaf = 1
+    min_gain_to_split = 0.
+
+    sample_indices = np.arange(n_samples, dtype=np.uint32)
+    X_binned = np.array(X_binned, dtype=X_BINNED_DTYPE).reshape(-1, 1)
+    X_binned = np.asfortranarray(X_binned)
+    all_gradients = np.array(all_gradients, dtype=G_H_DTYPE)
+    has_missing_values = np.array([has_missing_values], dtype=np.uint8)
+    all_hessians = np.ones(1, dtype=G_H_DTYPE)
+    sum_gradients = all_gradients.sum()
+    sum_hessians = 1 * n_samples
+    hessians_are_constant = True
+
+    builder = HistogramBuilder(X_binned, n_bins,
+                               all_gradients, all_hessians,
+                               hessians_are_constant)
+
+    n_bins_non_missing = np.array([n_bins_non_missing], dtype=np.uint32)
+    missing_values_bin_idx = n_bins - 1
+    splitter = Splitter(X_binned, n_bins_non_missing,
+                        missing_values_bin_idx, has_missing_values,
                         l2_regularization, min_hessian_to_split,
                         min_samples_leaf, min_gain_to_split,
                         hessians_are_constant)
@@ -260,4 +410,31 @@ def test_min_gain_to_split():
     histograms = builder.compute_histograms_brute(sample_indices)
     split_info = splitter.find_node_split(n_samples, histograms,
                                           sum_gradients, sum_hessians)
-    assert split_info.gain == -1
+
+    assert split_info.bin_idx == expected_bin_idx
+    if has_missing_values:
+        assert split_info.missing_go_to_left == expected_go_to_left
+
+    split_on_nan = split_info.bin_idx == n_bins_non_missing[0] - 1
+    assert split_on_nan == expected_split_on_nan
+
+    # Make sure the split is properly computed.
+    # This also makes sure missing values are properly assigned to the
+    # correct child in split_indices()
+    samples_left, samples_right, _ = splitter.split_indices(
+        split_info, splitter.partition)
+
+    if not expected_split_on_nan:
+        # When we don't split on nans, the split should always be the same.
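+            # Samples 0 to 3 carry the small gradients, so they must end up
+            # in the left child no matter where the missing values were
+            # inserted in X_binned.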
+ assert set(samples_left) == set([0, 1, 2, 3]) + assert set(samples_right) == set([4, 5, 6, 7, 8, 9]) + else: + # When we split on nans, samples with missing values are always mapped + # to the right child. + missing_samples_indices = np.flatnonzero( + np.array(X_binned) == missing_values_bin_idx) + non_missing_samples_indices = np.flatnonzero( + np.array(X_binned) != missing_values_bin_idx) + + assert set(samples_right) == set(missing_samples_indices) + assert set(samples_left) == set(non_missing_samples_indices) diff --git a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx index fa9556ef9efb5..291c015fec5d3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx @@ -9,8 +9,8 @@ from cython.parallel import prange from ...base import is_classifier from .binning import _BinMapper -from .types cimport G_H_DTYPE_C -from .types cimport Y_DTYPE_C +from .common cimport G_H_DTYPE_C +from .common cimport Y_DTYPE_C def get_equivalent_estimator(estimator, lib='lightgbm'): diff --git a/sklearn/ensemble/setup.py b/sklearn/ensemble/setup.py index 88e1b2e32d98d..4430cb129efcf 100644 --- a/sklearn/ensemble/setup.py +++ b/sklearn/ensemble/setup.py @@ -37,8 +37,8 @@ def configuration(parent_package="", top_path=None): sources=["_hist_gradient_boosting/_loss.pyx"], include_dirs=[numpy.get_include()]) - config.add_extension("_hist_gradient_boosting.types", - sources=["_hist_gradient_boosting/types.pyx"], + config.add_extension("_hist_gradient_boosting.common", + sources=["_hist_gradient_boosting/common.pyx"], include_dirs=[numpy.get_include()]) config.add_extension("_hist_gradient_boosting.utils",