From e2791618112b0faf06bdd7a994e63235db35e080 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Mon, 20 May 2019 16:23:04 -0400
Subject: [PATCH 01/76] Added NaN support in mapper

---
 .../_hist_gradient_boosting/_binning.pyx | 35 +++++--
 .../_hist_gradient_boosting/binning.py   | 50 +++++++---
 .../tests/test_binning.py                | 98 ++++++++++++++++++-
 3 files changed, 157 insertions(+), 26 deletions(-)

diff --git a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx
index be958948bec6a..2bbe1f71a3546 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx
+++ b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx
@@ -11,10 +11,13 @@ cimport cython
 import numpy as np
 cimport numpy as np
 from cython.parallel import prange
+from libc.math cimport isnan
 
 from .types cimport X_DTYPE_C, X_BINNED_DTYPE_C
 
 
-cpdef _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds,
+cpdef _map_to_bins(const X_DTYPE_C [:, :] data,
+                   list binning_thresholds,
+                   const unsigned int [:] actual_n_bins,
                    X_BINNED_DTYPE_C [::1, :] binned):
     """Bin numerical values to discrete integer-coded levels.
 
@@ -25,6 +28,9 @@ cpdef _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds,
     binning_thresholds : list of arrays
         For each feature, stores the increasing numeric values that are used
        to separate the bins.
+    actual_n_bins : ndarray, shape (n_features,)
+        For each feature, indicate the actual number of bins, including the bin
+        for missing values, if any.
     binned : ndarray, shape (n_samples, n_features)
         Output array, must be fortran aligned.
     """
@@ -32,13 +38,16 @@ cpdef _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds,
         int feature_idx
 
     for feature_idx in range(data.shape[1]):
+
         _map_num_col_to_bins(data[:, feature_idx],
                              binning_thresholds[feature_idx],
+                             actual_n_bins[feature_idx],
                              binned[:, feature_idx])
 
 
 cpdef void _map_num_col_to_bins(const X_DTYPE_C [:] data,
                                 const X_DTYPE_C [:] binning_thresholds,
+                                const unsigned int actual_n_bins,
                                 X_BINNED_DTYPE_C [:] binned):
     """Binary search to find the bin index for each value in the data."""
     cdef:
@@ -48,11 +57,19 @@ cpdef void _map_num_col_to_bins(const X_DTYPE_C [:] data,
         int middle
 
     for i in prange(data.shape[0], schedule='static', nogil=True):
-        left, right = 0, binning_thresholds.shape[0]
-        while left < right:
-            middle = (right + left - 1) // 2
-            if data[i] <= binning_thresholds[middle]:
-                right = middle
-            else:
-                left = middle + 1
-        binned[i] = left
+
+        if isnan(data[i]):
+            # unknown values are mapped to last bin
+            # Note that this is only correct if missing values were
+            # encountered at fit time (else actual_n_bins is incorrect).
+            binned[i] = actual_n_bins - 1
+        else:
+            # for known values, use binary search
+            left, right = 0, binning_thresholds.shape[0]
+            while left < right:
+                middle = (right + left - 1) // 2
+                if data[i] <= binning_thresholds[middle]:
+                    right = middle
+                else:
+                    left = middle + 1
+            binned[i] = left
diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py
index 075ed4f175ac3..83e149acd5949 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/binning.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py
@@ -19,6 +19,8 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state):
     """Extract feature-wise quantiles from numerical data.
 
+    Missing values are ignored for finding the thresholds.
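# A rough NumPy sketch of the per-column mapping implemented in the Cython
# code above (illustration only, not the actual implementation): non-missing
# values are placed with a binary search over the sorted thresholds, and NaNs
# are sent to the last bin, which is only valid if a bin was reserved for
# them at fit time.
import numpy as np

def map_column_to_bins_sketch(col, thresholds, actual_n_bins):
    binned = np.searchsorted(thresholds, col, side='left').astype(np.uint8)
    binned[np.isnan(col)] = actual_n_bins - 1  # missing values -> last bin
    return binned

col = np.array([0.5, 2.5, np.nan, 10.0])
thresholds = np.array([1.0, 3.0])  # 3 regular bins + 1 bin reserved for NaNs
print(map_column_to_bins_sketch(col, thresholds, actual_n_bins=4))  # [0 1 3 2]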
+ Parameters ---------- data : array-like, shape (n_samples, n_features) @@ -50,13 +52,20 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): subset = rng.choice(np.arange(data.shape[0]), subsample, replace=False) data = data.take(subset, axis=0) - percentiles = np.linspace(0, 100, num=max_bins + 1) - percentiles = percentiles[1:-1] binning_thresholds = [] + has_missing_values = [] for f_idx in range(data.shape[1]): - col_data = np.ascontiguousarray(data[:, f_idx], dtype=X_DTYPE) + col_data = data[:, f_idx] + # ignore missing values when computing bin thresholds + missing_mask = np.isnan(col_data) + if missing_mask.any(): + col_data = col_data[~missing_mask] + has_missing_values.append(True) + else: + has_missing_values.append(False) + col_data = np.ascontiguousarray(col_data, dtype=X_DTYPE) distinct_values = np.unique(col_data) - if len(distinct_values) <= max_bins: + if len(distinct_values) + has_missing_values[-1] <= max_bins: midpoints = distinct_values[:-1] + distinct_values[1:] midpoints *= .5 else: @@ -65,10 +74,13 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): # np.unique(col_data, return_counts) instead but this is more # work and the performance benefit will be limited because we # work on a fixed-size subsample of the full data. + n_percentiles = max_bins if has_missing_values[-1] else max_bins + 1 + percentiles = np.linspace(0, 100, num=n_percentiles) + percentiles = percentiles[1:-1] midpoints = np.percentile(col_data, percentiles, interpolation='midpoint').astype(X_DTYPE) binning_thresholds.append(midpoints) - return binning_thresholds + return binning_thresholds, has_missing_values class _BinMapper(BaseEstimator, TransformerMixin): @@ -87,9 +99,10 @@ class _BinMapper(BaseEstimator, TransformerMixin): Parameters ---------- max_bins : int, optional (default=256) - The maximum number of bins to use. If for a given feature the number of - unique values is less than ``max_bins``, then those unique values - will be used to compute the bin thresholds, instead of the quantiles. + The maximum number of bins to use (including the bin for missing + values, if any). If for a given feature the number of unique values + is less than ``max_bins``, then those unique values will be used to + compute the bin thresholds, instead of the quantiles. subsample : int or None, optional (default=2e5) If ``n_samples > subsample``, then ``sub_samples`` samples will be randomly choosen to compute the quantiles. If ``None``, the whole data @@ -107,6 +120,8 @@ def __init__(self, max_bins=256, subsample=int(2e5), random_state=None): def fit(self, X, y=None): """Fit data X by computing the binning thresholds. + The last bin is reserved for missing values, if any. 
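# A rough sketch of the per-feature threshold computation introduced in this
# patch (illustration only; find_thresholds_sketch is a hypothetical helper
# and the distinct-values shortcut is omitted): NaNs are dropped before taking
# percentiles, and when they are present one bin is kept aside for them, so
# one fewer percentile is needed.
import numpy as np

def find_thresholds_sketch(col, max_bins):
    has_missing = np.isnan(col).any()
    col = col[~np.isnan(col)]
    n_percentiles = max_bins if has_missing else max_bins + 1
    percentiles = np.linspace(0, 100, num=n_percentiles)[1:-1]
    # (the real code uses midpoint interpolation)
    return np.percentile(col, percentiles), has_missing

col = np.array([1., 2., np.nan, 3., 4., np.nan, 5.])
thresholds, has_missing = find_thresholds_sketch(col, max_bins=4)
print(len(thresholds), has_missing)  # 2 thresholds -> 3 non-missing bins, True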
+ Parameters ---------- X : array-like, shape (n_samples, n_features) @@ -118,13 +133,15 @@ def fit(self, X, y=None): ------- self : object """ - X = check_array(X, dtype=[X_DTYPE]) - self.bin_thresholds_ = _find_binning_thresholds( + X = check_array(X, dtype=[X_DTYPE], force_all_finite='allow-nan') + self.bin_thresholds_, self.has_missing_values_ = _find_binning_thresholds( X, self.max_bins, subsample=self.subsample, random_state=self.random_state) self.actual_n_bins_ = np.array( - [thresholds.shape[0] + 1 for thresholds in self.bin_thresholds_], + [thresholds.shape[0] + 1 + has_missing_values + for (thresholds, has_missing_values) + in zip(self.bin_thresholds_, self.has_missing_values_)], dtype=np.uint32) return self @@ -132,17 +149,22 @@ def fit(self, X, y=None): def transform(self, X): """Bin data X. + Missing values will be mapped to the last bin (whether or not missing + values were encountered at fit time). For this reason, `X` should be + the fitting data, though we do not enforce this. Note that the GBDT + code only ever uses mapper.fit_transform(), so this assumption is OK. + Parameters ---------- X : array-like, shape (n_samples, n_features) - The data to bin. + The data to bin. Must be the fitting data. Returns ------- X_binned : array-like, shape (n_samples, n_features) The binned data. """ - X = check_array(X, dtype=[X_DTYPE]) + X = check_array(X, dtype=[X_DTYPE], force_all_finite='allow-nan') check_is_fitted(self, ['bin_thresholds_', 'actual_n_bins_']) if X.shape[1] != self.actual_n_bins_.shape[0]: raise ValueError( @@ -151,5 +173,5 @@ def transform(self, X): X.shape[1]) ) binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order='F') - _map_to_bins(X, self.bin_thresholds_, binned) + _map_to_bins(X, self.bin_thresholds_, self.actual_n_bins_, binned) return binned diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index 4f4def6199411..f8bf7adf51bb8 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -20,8 +20,9 @@ def _find_binning_thresholds(data, max_bins=256, subsample=int(2e5), random_state=None): # Just a redef to avoid having to pass arguments all the time (as the # function is private we don't use default values for parameters) - return _find_binning_thresholds_orig(data, max_bins, subsample, - random_state) + binning_thresholds, _ = _find_binning_thresholds_orig( + data, max_bins, subsample, random_state) + return binning_thresholds def test_find_binning_thresholds_regular_data(): @@ -92,7 +93,11 @@ def test_map_to_bins(n_bins): bin_thresholds = _find_binning_thresholds(DATA, max_bins=n_bins, random_state=0) binned = np.zeros_like(DATA, dtype=X_BINNED_DTYPE, order='F') - _map_to_bins(DATA, bin_thresholds, binned) + actual_n_bins = np.array( + [len(thresholds) + 1 for thresholds in bin_thresholds], + dtype=np.uint32 + ) + _map_to_bins(DATA, bin_thresholds, actual_n_bins, binned) assert binned.shape == DATA.shape assert binned.dtype == np.uint8 assert binned.flags.f_contiguous @@ -240,3 +245,90 @@ def test_subsample(): assert not np.allclose(mapper_no_subsample.bin_thresholds_[feature], mapper_subsample.bin_thresholds_[feature], rtol=1e-4) + + +@pytest.mark.parametrize( + 'max_bins, actual_n_bins, X_trans_expected', [ + (256, [5, 3, 2], [[0, 0, 0], + [4, 2, 0], + [1, 0, 0], + [4, 1, 1], + [2, 1, 1], + [3, 0, 0]]), + # With max_bins=2, we expect all non-nan values to be mapped to bin 0 + # 
and all nans to be mapped to bin 1 + (2, [2, 2, 2], [[0, 0, 0], + [1, 1, 0], + [0, 0, 0], + [1, 0, 1], + [0, 0, 1], + [0, 0, 0]]), + + (3, [3, 3, 2], [[0, 0, 0], + [2, 2, 0], + [0, 0, 0], + [2, 1, 1], + [1, 1, 1], + [1, 0, 0]]) +]) +def test_missing_values_support(max_bins, actual_n_bins, X_trans_expected): + # basic check for missing values: make sure nans are mapped to last bins + # and that attributes are correct + + # Note that the extra bin for missing values is only allocated if needed + # (no need to allocate extra bin for third column here.) + + X = [[1, 1, 1], + [np.NaN, np.NaN, 1], + [2, 1, 1], + [np.NaN, 2, 2], + [3, 2, 2], + [4, 1, 1]] + + X = np.array(X) + + mapper = _BinMapper(max_bins=max_bins) + mapper.fit(X) + assert_array_equal(mapper.actual_n_bins_, actual_n_bins) + assert_array_equal(mapper.has_missing_values_, [True, True, False]) + X_trans = mapper.transform(X) + assert_array_equal(X_trans, X_trans_expected) + + +def test_missing_values_different_X_fit_transform(): + # Test to illustrate the fact that missing values are always mapped to the + # last bin. + # If there are no missing values at fit time (second column), then during + # transform(), missing values are treated as the biggest values, which is + # not a desired behaviour in general. + + # Note that in practice this case never happens, since the GBDT code only + # ever uses mapper.fit_transform(). + + X = [[1, 1], + [np.NaN, 1], + [1, 1], + [2, 2], + [2, 2], + [1, 1]] + + X = np.array(X) + + mapper = _BinMapper() + mapper.fit(X) + + X2 = [[1, 1], + [3, 1], + [1, np.NaN], # Nan mapped in same bin as the biggest value + [2, 2], + [np.NaN, 2], # Nan mapped in a special bin, as expected + [1, 1]] + + X2_trans = mapper.transform(X2) + X2_trans_expected = [[0, 0], + [1, 0], + [0, 1], + [1, 1], + [2, 1], + [0, 0]] + assert_array_equal(X2_trans, X2_trans_expected) From 91105a660ae2ebe99a9178d8fcbe35268e6ffaf2 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 20 May 2019 16:30:26 -0400 Subject: [PATCH 02/76] pep --- sklearn/ensemble/_hist_gradient_boosting/binning.py | 3 ++- sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 83e149acd5949..c4ebb6d90bb9f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -74,7 +74,8 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): # np.unique(col_data, return_counts) instead but this is more # work and the performance benefit will be limited because we # work on a fixed-size subsample of the full data. 
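# Usage sketch of the mapper behaviour exercised by the tests above (hedged:
# this assumes the private _BinMapper API as modified in this patch, not a
# public interface).  NaNs end up in the last bin of every feature that
# contained NaNs at fit time.
import numpy as np
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper

X = np.array([[1., 1.],
              [2., np.nan],
              [3., 2.]])
mapper = _BinMapper(max_bins=4)
X_binned = mapper.fit_transform(X)
# Only the second feature had NaNs, so only it gets an extra bin; its NaNs
# are mapped to bin index actual_n_bins_[1] - 1.
print(mapper.actual_n_bins_)
print(X_binned)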
- n_percentiles = max_bins if has_missing_values[-1] else max_bins + 1 + n_percentiles = (max_bins if has_missing_values[-1] + else max_bins + 1) percentiles = np.linspace(0, 100, num=n_percentiles) percentiles = percentiles[1:-1] midpoints = np.percentile(col_data, percentiles, diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index f8bf7adf51bb8..4c640464814d4 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -269,8 +269,7 @@ def test_subsample(): [0, 0, 0], [2, 1, 1], [1, 1, 1], - [1, 0, 0]]) -]) + [1, 0, 0]])]) def test_missing_values_support(max_bins, actual_n_bins, X_trans_expected): # basic check for missing values: make sure nans are mapped to last bins # and that attributes are correct From 000ab9a9d8dac66455ed0120796d4fdeea8913a2 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 20 May 2019 18:07:26 -0400 Subject: [PATCH 03/76] WIP --- .../gradient_boosting.py | 1 + .../_hist_gradient_boosting/grower.py | 15 +- .../_hist_gradient_boosting/splitting.pyx | 151 ++++++++++++++---- .../tests/test_splitting.py | 15 +- 4 files changed, 143 insertions(+), 39 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 719756061f896..d261702a788bd 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -250,6 +250,7 @@ def fit(self, X, y): X_binned_train, gradients[k, :], hessians[k, :], max_bins=self.max_bins, actual_n_bins=self.bin_mapper_.actual_n_bins_, + has_missing_values=self.bin_mapper_.has_missing_values_, max_leaf_nodes=self.max_leaf_nodes, max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf, diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 064c7ce8b6411..ce72513a07f4c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -156,8 +156,9 @@ class TreeGrower: """ def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, max_depth=None, min_samples_leaf=20, min_gain_to_split=0., - max_bins=256, actual_n_bins=None, l2_regularization=0., - min_hessian_to_split=1e-3, shrinkage=1.): + max_bins=256, actual_n_bins=None, has_missing_values=False, + l2_regularization=0., min_hessian_to_split=1e-3, + shrinkage=1.): self._validate_parameters(X_binned, max_leaf_nodes, max_depth, min_samples_leaf, min_gain_to_split, @@ -173,13 +174,17 @@ def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, else: actual_n_bins = np.asarray(actual_n_bins, dtype=np.uint32) + if isinstance(has_missing_values, bool): + has_missing_values = [has_missing_values] * actual_n_bins.shape[0] + has_missing_values = np.array(has_missing_values, dtype=np.uint8) + hessians_are_constant = hessians.shape[0] == 1 self.histogram_builder = HistogramBuilder( X_binned, max_bins, gradients, hessians, hessians_are_constant) self.splitter = Splitter( - X_binned, max_bins, actual_n_bins, l2_regularization, - min_hessian_to_split, min_samples_leaf, min_gain_to_split, - hessians_are_constant) + X_binned, max_bins, actual_n_bins, has_missing_values, + l2_regularization, min_hessian_to_split, min_samples_leaf, + min_gain_to_split, hessians_are_constant) self.max_leaf_nodes = 
max_leaf_nodes self.max_bins = max_bins self.n_features = X_binned.shape[1] diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 2f7c7d3453326..d3da6a908fd8e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -32,6 +32,7 @@ cdef struct split_info_struct: Y_DTYPE_C gain int feature_idx unsigned int bin_idx + unsigned char missing_go_to_left Y_DTYPE_C sum_gradient_left Y_DTYPE_C sum_gradient_right Y_DTYPE_C sum_hessian_left @@ -64,12 +65,13 @@ class SplitInfo: n_samples_right : int The number of samples in the right child. """ - def __init__(self, gain, feature_idx, bin_idx, sum_gradient_left, - sum_hessian_left, sum_gradient_right, sum_hessian_right, - n_samples_left, n_samples_right): + def __init__(self, gain, feature_idx, bin_idx, missing_go_to_left, + sum_gradient_left, sum_hessian_left, sum_gradient_right, + sum_hessian_right, n_samples_left, n_samples_right): self.gain = gain self.feature_idx = feature_idx self.bin_idx = bin_idx + self.missing_go_to_left = missing_go_to_left self.sum_gradient_left = sum_gradient_left self.sum_hessian_left = sum_hessian_left self.sum_gradient_right = sum_gradient_right @@ -116,6 +118,7 @@ cdef class Splitter: unsigned int n_features unsigned int max_bins unsigned int [::1] actual_n_bins + unsigned char [::1] has_missing_values, unsigned char hessians_are_constant Y_DTYPE_C l2_regularization Y_DTYPE_C min_hessian_to_split @@ -126,11 +129,15 @@ cdef class Splitter: unsigned int [::1] left_indices_buffer unsigned int [::1] right_indices_buffer - def __init__(self, const X_BINNED_DTYPE_C [::1, :] X_binned, unsigned int - max_bins, np.ndarray[np.uint32_t] actual_n_bins, - Y_DTYPE_C l2_regularization, Y_DTYPE_C - min_hessian_to_split=1e-3, unsigned int - min_samples_leaf=20, Y_DTYPE_C min_gain_to_split=0., + def __init__(self, + const X_BINNED_DTYPE_C [::1, :] X_binned, + unsigned int max_bins, + np.ndarray[np.uint32_t] actual_n_bins, + np.ndarray[np.uint8_t] has_missing_values, + Y_DTYPE_C l2_regularization, + Y_DTYPE_C min_hessian_to_split=1e-3, + unsigned int min_samples_leaf=20, + Y_DTYPE_C min_gain_to_split=0., unsigned char hessians_are_constant=False): self.X_binned = X_binned @@ -139,6 +146,7 @@ cdef class Splitter: # last bins may be unused if actual_n_bins[f] < max_bins self.max_bins = max_bins self.actual_n_bins = actual_n_bins + self.has_missing_values = has_missing_values self.l2_regularization = l2_regularization self.min_hessian_to_split = min_hessian_to_split self.min_samples_leaf = min_samples_leaf @@ -367,10 +375,16 @@ cdef class Splitter: for feature_idx in prange(n_features, schedule='static'): # For each feature, find best bin to split on - split_info = self._find_best_bin_to_split_helper( + split_infos[feature_idx].gain = -1 + + self._find_best_bin_to_split_left_to_right( feature_idx, histograms, n_samples, - sum_gradients, sum_hessians) - split_infos[feature_idx] = split_info + sum_gradients, sum_hessians, &split_infos[feature_idx]) + + if self.has_missing_values[feature_idx]: + self._find_best_bin_to_split_right_to_left( + feature_idx, histograms, n_samples, + sum_gradients, sum_hessians, &split_infos[feature_idx]) # then compute best possible split among all features best_feature_idx = self._find_best_feature_to_split_helper( @@ -381,6 +395,7 @@ cdef class Splitter: split_info.gain, split_info.feature_idx, split_info.bin_idx, + split_info.missing_go_to_left, 
split_info.sum_gradient_left, split_info.sum_hessian_left, split_info.sum_gradient_right, @@ -405,38 +420,40 @@ cdef class Splitter: best_feature_idx = feature_idx return best_feature_idx - cdef split_info_struct _find_best_bin_to_split_helper( + cdef void _find_best_bin_to_split_left_to_right( self, unsigned int feature_idx, const hist_struct [:, ::1] histograms, # IN unsigned int n_samples, Y_DTYPE_C sum_gradients, - Y_DTYPE_C sum_hessians) nogil: + Y_DTYPE_C sum_hessians, + split_info_struct * split_info) nogil: # OUT """Find best bin to split on for a given feature. Splits that do not satisfy the splitting constraints - (min_gain_to_split, etc.) are discarded here. If no split can - satisfy the constraints, a SplitInfo with a gain of -1 is returned. - If for a given node the best SplitInfo has a gain of -1, it is - finalized into a leaf in the grower. + (min_gain_to_split, etc.) are discarded here. """ cdef: unsigned int bin_idx unsigned int n_samples_left unsigned int n_samples_right unsigned int n_samples_ = n_samples + unsigned int n_bins Y_DTYPE_C sum_hessian_left Y_DTYPE_C sum_hessian_right Y_DTYPE_C sum_gradient_left Y_DTYPE_C sum_gradient_right Y_DTYPE_C gain - split_info_struct best_split - best_split.gain = -1. sum_gradient_left, sum_hessian_left = 0., 0. n_samples_left = 0 - for bin_idx in range(self.actual_n_bins[feature_idx]): + n_bins = self.actual_n_bins[feature_idx] + if self.has_missing_values[feature_idx]: + # if there are missing values, ignore the last bin + n_bins -= 1 + + for bin_idx in range(n_bins): n_samples_left += histograms[feature_idx, bin_idx].count n_samples_right = n_samples_ - n_samples_left @@ -467,19 +484,91 @@ cdef class Splitter: sum_gradients, sum_hessians, self.l2_regularization) - if gain > best_split.gain and gain > self.min_gain_to_split: - best_split.gain = gain - best_split.feature_idx = feature_idx - best_split.bin_idx = bin_idx - best_split.sum_gradient_left = sum_gradient_left - best_split.sum_gradient_right = sum_gradient_right - best_split.sum_hessian_left = sum_hessian_left - best_split.sum_hessian_right = sum_hessian_right - best_split.n_samples_left = n_samples_left - best_split.n_samples_right = n_samples_right + if gain > split_info.gain and gain > self.min_gain_to_split: + split_info.gain = gain + split_info.feature_idx = feature_idx + split_info.bin_idx = bin_idx + # since we scan from left to right, missing values go to the + # right. + split_info.missing_go_to_left = False + split_info.sum_gradient_left = sum_gradient_left + split_info.sum_gradient_right = sum_gradient_right + split_info.sum_hessian_left = sum_hessian_left + split_info.sum_hessian_right = sum_hessian_right + split_info.n_samples_left = n_samples_left + split_info.n_samples_right = n_samples_right + + cdef void _find_best_bin_to_split_right_to_left( + self, + unsigned int feature_idx, + const hist_struct [:, ::1] histograms, # IN + unsigned int n_samples, + Y_DTYPE_C sum_gradients, + Y_DTYPE_C sum_hessians, + split_info_struct * split_info) nogil: # OUT + # Only called if there are missing values. we ignore the last bin + + cdef: + unsigned int bin_idx + unsigned int n_samples_left + unsigned int n_samples_right + unsigned int n_samples_ = n_samples + unsigned int second_to_last_bin + Y_DTYPE_C sum_hessian_left + Y_DTYPE_C sum_hessian_right + Y_DTYPE_C sum_gradient_left + Y_DTYPE_C sum_gradient_right + Y_DTYPE_C gain + + sum_gradient_left, sum_hessian_left = 0., 0. 
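# A condensed Python sketch of the left-to-right scan above (hedged
# illustration; the real code works on Cython histogram structs and also
# applies min_samples_leaf / min_hessian_to_split constraints): left-child
# sums are accumulated bin by bin and the bin with the best XGBoost-style
# gain is kept.  The right-to-left variant mirrors this so that the
# missing-values bin ends up in the left child instead of the right one.
def best_split_left_to_right_sketch(grad_per_bin, hess_per_bin, l2=0.0):
    total_g, total_h = sum(grad_per_bin), sum(hess_per_bin)
    g_left = h_left = 0.0
    best_gain, best_bin = -1.0, None
    for bin_idx in range(len(grad_per_bin) - 1):  # last bin cannot be a split
        g_left += grad_per_bin[bin_idx]
        h_left += hess_per_bin[bin_idx]
        g_right, h_right = total_g - g_left, total_h - h_left
        gain = (g_left ** 2 / (h_left + l2) + g_right ** 2 / (h_right + l2)
                - total_g ** 2 / (total_h + l2))
        if gain > best_gain:
            best_gain, best_bin = gain, bin_idx
    return best_gain, best_bin

# Gradients jump between bins 2 and 3, so the best split is after bin 2.
print(best_split_left_to_right_sketch([1, 1, 1, 5, 5], [1, 1, 1, 1, 1]))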
+ n_samples_left = 0 + + second_to_last_bin = self.actual_n_bins[feature_idx] - 1 - return best_split + for bin_idx in range(second_to_last_bin, -1, -1): + n_samples_right += histograms[feature_idx, bin_idx].count + n_samples_left = n_samples_ - n_samples_right + + if self.hessians_are_constant: + sum_hessian_right += histograms[feature_idx, bin_idx].count + else: + sum_hessian_right += \ + histograms[feature_idx, bin_idx].sum_hessians + sum_hessian_left = sum_hessians - sum_hessian_right + + sum_gradient_right += histograms[feature_idx, bin_idx].sum_gradients + sum_gradient_left = sum_gradients - sum_gradient_right + + if n_samples_right < self.min_samples_leaf: + continue + if n_samples_left < self.min_samples_leaf: + # won't get any better + break + + if sum_hessian_right < self.min_hessian_to_split: + continue + if sum_hessian_left < self.min_hessian_to_split: + # won't get any better (hessians are > 0 since loss is convex) + break + + gain = _split_gain(sum_gradient_left, sum_hessian_left, + sum_gradient_right, sum_hessian_right, + sum_gradients, sum_hessians, + self.l2_regularization) + if gain > split_info.gain and gain > self.min_gain_to_split: + split_info.gain = gain + split_info.feature_idx = feature_idx + split_info.bin_idx = bin_idx + # since we scan from right to left, missing values go to the + # left. + split_info.missing_go_to_left = True + split_info.sum_gradient_left = sum_gradient_left + split_info.sum_gradient_right = sum_gradient_right + split_info.sum_hessian_left = sum_hessian_left + split_info.sum_hessian_right = sum_hessian_right + split_info.n_samples_left = n_samples_left + split_info.n_samples_right = n_samples_right cdef inline Y_DTYPE_C _split_gain( Y_DTYPE_C sum_gradient_left, diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index d34f5ef064137..c2da7ffdd03a2 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -35,6 +35,8 @@ def test_histogram_split(n_bins): actual_n_bins = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], + dtype=np.uint8) builder = HistogramBuilder(X_binned, n_bins, all_gradients, @@ -43,6 +45,7 @@ def test_histogram_split(n_bins): splitter = Splitter(X_binned, n_bins, actual_n_bins, + has_missing_values, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, @@ -97,9 +100,11 @@ def test_gradient_and_hessian_sanity(constant_hessian): actual_n_bins = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], + dtype=np.uint8) builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, constant_hessian) - splitter = Splitter(X_binned, n_bins, actual_n_bins, + splitter = Splitter(X_binned, n_bins, actual_n_bins, has_missing_values, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, constant_hessian) @@ -193,10 +198,12 @@ def test_split_indices(): actual_n_bins = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], + dtype=np.uint8) builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant) - splitter = Splitter(X_binned, n_bins, actual_n_bins, + splitter = Splitter(X_binned, n_bins, actual_n_bins, has_missing_values, l2_regularization, min_hessian_to_split, 
min_samples_leaf, min_gain_to_split, hessians_are_constant) @@ -249,9 +256,11 @@ def test_min_gain_to_split(): actual_n_bins = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], + dtype=np.uint8) builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant) - splitter = Splitter(X_binned, n_bins, actual_n_bins, + splitter = Splitter(X_binned, n_bins, actual_n_bins, has_missing_values, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, hessians_are_constant) From 66c25020ee8433fef0e42ad45f1204bd10256bae Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 20 May 2019 19:29:29 -0400 Subject: [PATCH 04/76] some more --- .../_hist_gradient_boosting/_predictor.pyx | 14 ++++++++--- .../gradient_boosting.py | 5 ++-- .../_hist_gradient_boosting/grower.py | 2 ++ .../_hist_gradient_boosting/predictor.py | 1 + .../_hist_gradient_boosting/splitting.pyx | 24 ++++++++++++++----- 5 files changed, 35 insertions(+), 11 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx index 45ba70095c3c7..fb20f57206439 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx @@ -7,6 +7,7 @@ cimport cython from cython.parallel import prange +from libc.math cimport isnan import numpy as np cimport numpy as np @@ -22,6 +23,7 @@ cdef packed struct node_struct: unsigned int count unsigned int feature_idx X_DTYPE_C threshold + unsigned char missing_go_to_left unsigned int left unsigned int right Y_DTYPE_C gain @@ -63,10 +65,16 @@ cdef inline Y_DTYPE_C _predict_one_from_numeric_data( while True: if node.is_leaf: return node.value - if numeric_data[row, node.feature_idx] <= node.threshold: - node = nodes[node.left] + if isnan(numeric_data[row, node.feature_idx]): + if node.missing_go_to_left: + node = nodes[node.left] + else: + node = nodes[node.right] else: - node = nodes[node.right] + if numeric_data[row, node.feature_idx] <= node.threshold: + node = nodes[node.left] + else: + node = nodes[node.right] cdef void _predict_from_binned_data_parallel( diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index d261702a788bd..7e5ca50fcebe3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -95,7 +95,7 @@ def fit(self, X, y): acc_compute_hist_time = 0. # time spent computing histograms # time spent predicting X for gradient and hessians update acc_prediction_time = 0. - X, y = check_X_y(X, y, dtype=[X_DTYPE]) + X, y = check_X_y(X, y, dtype=[X_DTYPE], force_all_finite='allow-nan') y = self._encode_y(y) rng = check_random_state(self.random_state) @@ -441,7 +441,8 @@ def _raw_predict(self, X): raw_predictions : array, shape (n_samples * n_trees_per_iteration,) The raw predicted values. 
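# Hedged sketch of the predict-time routing added to _predictor.pyx in this
# commit (pure Python, hypothetical node dicts instead of the packed node
# struct): a NaN simply follows the precomputed missing_go_to_left flag of
# each split node, everything else uses the usual threshold comparison.
import math

def predict_one_sketch(nodes, x):
    node = nodes[0]
    while not node['is_leaf']:
        value = x[node['feature_idx']]
        if math.isnan(value):
            node = (nodes[node['left']] if node['missing_go_to_left']
                    else nodes[node['right']])
        elif value <= node['threshold']:
            node = nodes[node['left']]
        else:
            node = nodes[node['right']]
    return node['value']

nodes = [
    {'is_leaf': False, 'feature_idx': 0, 'threshold': 2.0,
     'missing_go_to_left': True, 'left': 1, 'right': 2},
    {'is_leaf': True, 'value': -1.0},
    {'is_leaf': True, 'value': +1.0},
]
print(predict_one_sketch(nodes, [float('nan')]))  # -1.0: NaN goes left here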
""" - X = check_array(X, dtype=[X_DTYPE, X_BINNED_DTYPE]) + X = check_array(X, dtype=[X_DTYPE, X_BINNED_DTYPE], + force_all_finite='allow-nan') check_is_fitted(self, '_predictors') if X.shape[1] != self.n_features_: raise ValueError( diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index ce72513a07f4c..aa278266ac38e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -452,8 +452,10 @@ def _fill_predictor_node_array(predictor_nodes, grower_node, # Decision node split_info = grower_node.split_info feature_idx, bin_idx = split_info.feature_idx, split_info.bin_idx + print(bin_idx) node['feature_idx'] = feature_idx node['bin_threshold'] = bin_idx + node['missing_go_to_left'] = split_info.missing_go_to_left if bin_thresholds is not None: threshold = bin_thresholds[feature_idx][bin_idx] node['threshold'] = threshold diff --git a/sklearn/ensemble/_hist_gradient_boosting/predictor.py b/sklearn/ensemble/_hist_gradient_boosting/predictor.py index 5b18048cc24e2..7151c8d1017e6 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/predictor.py +++ b/sklearn/ensemble/_hist_gradient_boosting/predictor.py @@ -17,6 +17,7 @@ ('count', np.uint32), ('feature_idx', np.uint32), ('threshold', X_DTYPE), + ('missing_go_to_left', np.uint8), ('left', np.uint32), ('right', np.uint32), ('gain', Y_DTYPE), diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index d3da6a908fd8e..7b6a404115185 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -243,6 +243,7 @@ cdef class Splitter: cdef: int n_samples = sample_indices.shape[0] X_BINNED_DTYPE_C bin_idx = split_info.bin_idx + unsigned char missing_go_to_left = split_info.missing_go_to_left int feature_idx = split_info.feature_idx const X_BINNED_DTYPE_C [::1] X_binned = \ self.X_binned[:, feature_idx] @@ -288,12 +289,23 @@ cdef class Splitter: stop = start + sizes[thread_idx] for i in range(start, stop): sample_idx = sample_indices[i] - if X_binned[sample_idx] <= bin_idx: - left_indices_buffer[start + left_count] = sample_idx - left_count = left_count + 1 + if (self.has_missing_values[feature_idx] and + X_binned[sample_idx] == self.actual_n_bins[feature_idx] - 1): + # missing value + if missing_go_to_left: + left_indices_buffer[start + left_count] = sample_idx + left_count = left_count + 1 + else: + right_indices_buffer[start + right_count] = sample_idx + right_count = right_count + 1 else: - right_indices_buffer[start + right_count] = sample_idx - right_count = right_count + 1 + # non-missing value + if X_binned[sample_idx] <= bin_idx: + left_indices_buffer[start + left_count] = sample_idx + left_count = left_count + 1 + else: + right_indices_buffer[start + right_count] = sample_idx + right_count = right_count + 1 left_counts[thread_idx] = left_count right_counts[thread_idx] = right_count @@ -513,7 +525,7 @@ cdef class Splitter: unsigned int n_samples_left unsigned int n_samples_right unsigned int n_samples_ = n_samples - unsigned int second_to_last_bin + unsigned int second_to_last_bin Y_DTYPE_C sum_hessian_left Y_DTYPE_C sum_hessian_right Y_DTYPE_C sum_gradient_left From 810b7b005384433eed3c00357c039a89b8d4e96c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 21 May 2019 07:58:32 -0400 Subject: [PATCH 05/76] WIP --- sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py | 1 
+ sklearn/ensemble/_hist_gradient_boosting/grower.py | 3 +-- sklearn/ensemble/_hist_gradient_boosting/splitting.pyx | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 7e5ca50fcebe3..cfa9d6ca78dd4 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -123,6 +123,7 @@ def fit(self, X, y): if self.verbose: duration = toc - tic print("{:.3f} s".format(duration)) + print(self.bin_mapper_.actual_n_bins_) self.loss_ = self._get_loss() diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index aa278266ac38e..db5a9af3c8227 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -452,12 +452,11 @@ def _fill_predictor_node_array(predictor_nodes, grower_node, # Decision node split_info = grower_node.split_info feature_idx, bin_idx = split_info.feature_idx, split_info.bin_idx - print(bin_idx) node['feature_idx'] = feature_idx node['bin_threshold'] = bin_idx node['missing_go_to_left'] = split_info.missing_go_to_left if bin_thresholds is not None: - threshold = bin_thresholds[feature_idx][bin_idx] + threshold = bin_thresholds[feature_idx][min(bin_idx, 253)] node['threshold'] = threshold next_free_idx += 1 diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 7b6a404115185..25981d5cd685f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -535,7 +535,7 @@ cdef class Splitter: sum_gradient_left, sum_hessian_left = 0., 0. 
n_samples_left = 0 - second_to_last_bin = self.actual_n_bins[feature_idx] - 1 + second_to_last_bin = self.actual_n_bins[feature_idx] - 2 for bin_idx in range(second_to_last_bin, -1, -1): n_samples_right += histograms[feature_idx, bin_idx].count From 5fd59cb5dc9d71f03f5d41469d93ff48d6a2264b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 21 May 2019 09:22:50 -0400 Subject: [PATCH 06/76] WIP --- .../gradient_boosting.py | 1 - .../_hist_gradient_boosting/grower.py | 2 +- .../_hist_gradient_boosting/splitting.pyx | 21 ++++++++++++------- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index cfa9d6ca78dd4..7e5ca50fcebe3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -123,7 +123,6 @@ def fit(self, X, y): if self.verbose: duration = toc - tic print("{:.3f} s".format(duration)) - print(self.bin_mapper_.actual_n_bins_) self.loss_ = self._get_loss() diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index db5a9af3c8227..45b688e822062 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -456,7 +456,7 @@ def _fill_predictor_node_array(predictor_nodes, grower_node, node['bin_threshold'] = bin_idx node['missing_go_to_left'] = split_info.missing_go_to_left if bin_thresholds is not None: - threshold = bin_thresholds[feature_idx][min(bin_idx, 253)] + threshold = bin_thresholds[feature_idx][bin_idx] node['threshold'] = threshold next_free_idx += 1 diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 25981d5cd685f..1a9ccca837e82 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -450,7 +450,7 @@ cdef class Splitter: unsigned int n_samples_left unsigned int n_samples_right unsigned int n_samples_ = n_samples - unsigned int n_bins + unsigned int end Y_DTYPE_C sum_hessian_left Y_DTYPE_C sum_hessian_right Y_DTYPE_C sum_gradient_left @@ -460,12 +460,14 @@ cdef class Splitter: sum_gradient_left, sum_hessian_left = 0., 0. n_samples_left = 0 - n_bins = self.actual_n_bins[feature_idx] + # We don't need to consider splitting on the last bin since this would + # result in having 0 samples in the right child + end = self.actual_n_bins[feature_idx] - 1 if self.has_missing_values[feature_idx]: - # if there are missing values, ignore the last bin - n_bins -= 1 + # if there are missing values, skip one more bin + end -= 1 - for bin_idx in range(n_bins): + for bin_idx in range(end): n_samples_left += histograms[feature_idx, bin_idx].count n_samples_right = n_samples_ - n_samples_left @@ -525,7 +527,7 @@ cdef class Splitter: unsigned int n_samples_left unsigned int n_samples_right unsigned int n_samples_ = n_samples - unsigned int second_to_last_bin + unsigned int start Y_DTYPE_C sum_hessian_left Y_DTYPE_C sum_hessian_right Y_DTYPE_C sum_gradient_left @@ -535,9 +537,12 @@ cdef class Splitter: sum_gradient_left, sum_hessian_left = 0., 0. 
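# Worked example (hedged illustration) of the candidate split bins for the
# right-to-left scan when the last bin holds the missing values.  With
# actual_n_bins = 6 the bins are 0 1 2 3 4 | 5, bin 5 being the NaN bin:
# splitting on bin 4 would leave nothing in the right child, so the scan can
# start at bin 3, i.e. actual_n_bins - 3.
actual_n_bins = 6
start = actual_n_bins - 3
print(list(range(start, -1, -1)))  # candidate split bins: [3, 2, 1, 0]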
n_samples_left = 0 - second_to_last_bin = self.actual_n_bins[feature_idx] - 2 + # - Skip last bin (where the missing values are) + # - Skip second to last bin (considering this split would result in 0 + # samples on the right node) + start = self.actual_n_bins[feature_idx] - 3 - for bin_idx in range(second_to_last_bin, -1, -1): + for bin_idx in range(start, -1, -1): n_samples_right += histograms[feature_idx, bin_idx].count n_samples_left = n_samples_ - n_samples_right From 670566b71f50720bd3b2dc10e37031f5af72746a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 21 May 2019 16:22:49 -0400 Subject: [PATCH 07/76] bug fix --- .../_hist_gradient_boosting/grower.py | 8 +++++++ .../_hist_gradient_boosting/splitting.pyx | 23 ++++++++++--------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 45b688e822062..42a6cf6aa7de4 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -187,6 +187,7 @@ def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, min_gain_to_split, hessians_are_constant) self.max_leaf_nodes = max_leaf_nodes self.max_bins = max_bins + self.has_missing_values = has_missing_values self.n_features = X_binned.shape[1] self.max_depth = max_depth self.min_samples_leaf = min_samples_leaf @@ -333,6 +334,13 @@ def split_next(self): right_child_node.partition_start = left_child_node.partition_stop right_child_node.partition_stop = node.partition_stop + if not self.has_missing_values[node.split_info.feature_idx]: + # If no missing values are encountered at fit time, then samples + # with missing values during predict() will go to whichever child + # has the most samples. + node.split_info.missing_go_to_left = ( + left_child_node.n_samples > right_child_node.n_samples) + self.n_nodes += 2 if self.max_depth is not None and depth == self.max_depth: diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 1a9ccca837e82..1d906b0b76977 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -461,7 +461,7 @@ cdef class Splitter: n_samples_left = 0 # We don't need to consider splitting on the last bin since this would - # result in having 0 samples in the right child + # result in having 0 samples in the right node end = self.actual_n_bins[feature_idx] - 1 if self.has_missing_values[feature_idx]: # if there are missing values, skip one more bin @@ -502,8 +502,7 @@ cdef class Splitter: split_info.gain = gain split_info.feature_idx = feature_idx split_info.bin_idx = bin_idx - # since we scan from left to right, missing values go to the - # right. + # we scan from left to right so missing values go to the right split_info.missing_go_to_left = False split_info.sum_gradient_left = sum_gradient_left split_info.sum_gradient_right = sum_gradient_right @@ -520,27 +519,30 @@ cdef class Splitter: Y_DTYPE_C sum_gradients, Y_DTYPE_C sum_hessians, split_info_struct * split_info) nogil: # OUT - # Only called if there are missing values. 
we ignore the last bin cdef: unsigned int bin_idx unsigned int n_samples_left unsigned int n_samples_right unsigned int n_samples_ = n_samples - unsigned int start Y_DTYPE_C sum_hessian_left Y_DTYPE_C sum_hessian_right Y_DTYPE_C sum_gradient_left Y_DTYPE_C sum_gradient_right Y_DTYPE_C gain - - sum_gradient_left, sum_hessian_left = 0., 0. - n_samples_left = 0 + unsigned int n_bins = self.actual_n_bins[feature_idx] + unsigned int start # - Skip last bin (where the missing values are) # - Skip second to last bin (considering this split would result in 0 # samples on the right node) - start = self.actual_n_bins[feature_idx] - 3 + start = n_bins - 3 + + # n_bins - 2 is the index of the second to last bin, which we consider + # being on the right child. + sum_gradient_right = histograms[feature_idx, n_bins - 2].sum_gradients + sum_hessian_right = histograms[feature_idx, n_bins - 2].sum_hessians + n_samples_right = histograms[feature_idx, n_bins - 2].count for bin_idx in range(start, -1, -1): n_samples_right += histograms[feature_idx, bin_idx].count @@ -577,8 +579,7 @@ cdef class Splitter: split_info.gain = gain split_info.feature_idx = feature_idx split_info.bin_idx = bin_idx - # since we scan from right to left, missing values go to the - # left. + # we scan from right to left so missing values go to the left split_info.missing_go_to_left = True split_info.sum_gradient_left = sum_gradient_left split_info.sum_gradient_right = sum_gradient_right From e338e0adbb8ae635e3d034a390ffd05bd95facca Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 21 May 2019 16:34:23 -0400 Subject: [PATCH 08/76] basic tests --- .../tests/test_gradient_boosting.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 790597b07fa15..74a725a9b7425 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -145,3 +145,39 @@ def test_should_stop(scores, n_iter_no_change, tol, stopping): n_iter_no_change=n_iter_no_change, tol=tol ) assert gbdt._should_stop(scores) == stopping + + +def test_missing_values(): + # sanity check for missing value support. With only one feature and + # y == isnan(X), the gbdt is supposed to reach perfect accuracy. 
+ + n_samples = 100 + n_features = 1 + rng = np.random.RandomState(0) + + X = rng.normal(size=(n_samples, n_features)) + mask = rng.binomial(1, .5, size=X.shape).astype(np.bool) + X[mask] = np.nan + y = mask.ravel() + gb = HistGradientBoostingClassifier() + gb.fit(X, y) + + assert gb.score(X, y) == 1 + + +def test_missing_value_predict_only(): + # Make sure that misisng values are supported at predict time even if they + # were not encountered during fit time + + rng = np.random.RandomState(0) + X, y = make_classification(random_state=rng) + + gb = HistGradientBoostingClassifier() + gb.fit(X, y) + assert gb.score(X, y) == 1 + + # Half the values are now missing + missing_mask = rng.binomial(1, .5, size=X.shape).astype(np.bool) + X[missing_mask] = np.nan + + assert gb.score(X, y) >= .8 From d288518b66980608edbc02697f9db6fc504af198 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 21 May 2019 17:24:04 -0400 Subject: [PATCH 09/76] some doc --- .../_hist_gradient_boosting/_predictor.pyx | 14 ++-- .../_hist_gradient_boosting/binning.py | 9 ++- .../_hist_gradient_boosting/grower.py | 3 + .../_hist_gradient_boosting/splitting.pyx | 64 +++++++++++++------ .../tests/test_binning.py | 2 +- .../tests/test_gradient_boosting.py | 3 +- 6 files changed, 61 insertions(+), 34 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx index fb20f57206439..d4c3227212528 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx @@ -65,16 +65,12 @@ cdef inline Y_DTYPE_C _predict_one_from_numeric_data( while True: if node.is_leaf: return node.value - if isnan(numeric_data[row, node.feature_idx]): - if node.missing_go_to_left: - node = nodes[node.left] - else: - node = nodes[node.right] + if ((isnan(numeric_data[row, node.feature_idx]) and + node.missing_go_to_left) or + numeric_data[row, node.feature_idx] <= node.threshold): + node = nodes[node.left] else: - if numeric_data[row, node.feature_idx] <= node.threshold: - node = nodes[node.left] - else: - node = nodes[node.right] + node = nodes[node.right] cdef void _predict_from_binned_data_parallel( diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index c4ebb6d90bb9f..891d131c41774 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -43,6 +43,8 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): For each feature, stores the increasing numeric values that can be used to separate the bins. Thus ``len(binning_thresholds) == n_features``. + has_missing_values: list of bool + For each feature, indicates whether missing values were encountered. 
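# Why the collapsed NaN test in the _predictor.pyx hunk above is safe (hedged
# illustration): comparisons against NaN are always False, so a NaN can only
# be routed to the left child through the missing_go_to_left flag.
import math

nan, threshold = float('nan'), 2.0
assert not (nan <= threshold)  # NaN never passes the <= test
for missing_go_to_left in (True, False):
    goes_left = (math.isnan(nan) and missing_go_to_left) or (nan <= threshold)
    print(missing_go_to_left, goes_left)  # True -> True, False -> False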
""" if not (2 <= max_bins <= 256): raise ValueError('max_bins={} should be no smaller than 2 ' @@ -135,9 +137,10 @@ def fit(self, X, y=None): self : object """ X = check_array(X, dtype=[X_DTYPE], force_all_finite='allow-nan') - self.bin_thresholds_, self.has_missing_values_ = _find_binning_thresholds( - X, self.max_bins, subsample=self.subsample, - random_state=self.random_state) + self.bin_thresholds_, self.has_missing_values_ = \ + _find_binning_thresholds( + X, self.max_bins, subsample=self.subsample, + random_state=self.random_state) self.actual_n_bins_ = np.array( [thresholds.shape[0] + 1 + has_missing_values diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 42a6cf6aa7de4..277e6e150ba7e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -144,6 +144,9 @@ class TreeGrower: equal to ``max_bins``. If it's an int, all features are considered to have the same number of bins. If None, all features are considered to have ``max_bins`` bins. + has_missing_values : ndarray of bool or bool, optional (default=False) + Whether each feature contains missing values. If it's a bool, the same + values is used for all features. l2_regularization : float, optional (default=0) The L2 regularization parameter. min_hessian_to_split : float, optional (default=1e-3) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 1d906b0b76977..abde4227ac546 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -52,6 +52,8 @@ class SplitInfo: The index of the feature to be split. bin_idx : int The index of the bin on which the split is made. + missing_go_to_left : bool + Whether missing values should go to the left child sum_gradient_left : float The sum of the gradients of all the samples in the left child. sum_hessian_left : float @@ -99,6 +101,8 @@ cdef class Splitter: actual_n_bins : ndarray, shape (n_features,) The actual number of bins needed for each feature, which is lower or equal to max_bins. + has_missing_values : ndarray, shape (n_features,) + Whether each feature contains missing values. l2_regularization : float The L2 regularization parameter. 
min_hessian_to_split : float, default=1e-3 @@ -289,23 +293,15 @@ cdef class Splitter: stop = start + sizes[thread_idx] for i in range(start, stop): sample_idx = sample_indices[i] - if (self.has_missing_values[feature_idx] and - X_binned[sample_idx] == self.actual_n_bins[feature_idx] - 1): - # missing value - if missing_go_to_left: - left_indices_buffer[start + left_count] = sample_idx - left_count = left_count + 1 - else: - right_indices_buffer[start + right_count] = sample_idx - right_count = right_count + 1 + if ((self.has_missing_values[feature_idx] and + X_binned[sample_idx] == self.actual_n_bins[feature_idx] - 1 and + missing_go_to_left) or + X_binned[sample_idx] <= bin_idx): + left_indices_buffer[start + left_count] = sample_idx + left_count = left_count + 1 else: - # non-missing value - if X_binned[sample_idx] <= bin_idx: - left_indices_buffer[start + left_count] = sample_idx - left_count = left_count + 1 - else: - right_indices_buffer[start + right_count] = sample_idx - right_count = right_count + 1 + right_indices_buffer[start + right_count] = sample_idx + right_count = right_count + 1 left_counts[thread_idx] = left_count right_counts[thread_idx] = right_count @@ -387,8 +383,20 @@ cdef class Splitter: for feature_idx in prange(n_features, schedule='static'): # For each feature, find best bin to split on + # Start with a gain of -1 (if no better split is found, that + # means one of the constraints isn't respected + # (min_samples_leaf, etc) and the grower will later turn the + # node into a leaf. split_infos[feature_idx].gain = -1 + # We will scan bins from left to right (in all cases), and if + # there are any missing values, we will also scan bins from + # right to left. This way, we can consider whichever case + # yields the best gain: either missing values go to the right + # (left to right scan) or to the left (left to right case). + # See algo 3 from the XGBoost paper + # https://arxiv.org/abs/1603.02754 + self._find_best_bin_to_split_left_to_right( feature_idx, histograms, n_samples, sum_gradients, sum_hessians, &split_infos[feature_idx]) @@ -444,6 +452,10 @@ cdef class Splitter: Splits that do not satisfy the splitting constraints (min_gain_to_split, etc.) are discarded here. + + We scan node from left to right. This version is called whether there + are missing values or not. If any, missing values are assigned to the + right node. """ cdef: unsigned int bin_idx @@ -460,11 +472,12 @@ cdef class Splitter: sum_gradient_left, sum_hessian_left = 0., 0. n_samples_left = 0 - # We don't need to consider splitting on the last bin since this would - # result in having 0 samples in the right node + # We don't need to consider splitting on the last bin (or second to + # last bin if there are missing values) since this would result in + # having 0 samples in the right node end = self.actual_n_bins[feature_idx] - 1 if self.has_missing_values[feature_idx]: - # if there are missing values, skip one more bin + # if there are missing values (in the last bin), skip one more bin end -= 1 for bin_idx in range(end): @@ -519,6 +532,16 @@ cdef class Splitter: Y_DTYPE_C sum_gradients, Y_DTYPE_C sum_hessians, split_info_struct * split_info) nogil: # OUT + """Find best bin to split on for a given feature. + + Splits that do not satisfy the splitting constraints + (min_gain_to_split, etc.) are discarded here. + + We scan node from right to left. This version is only called when + there are missing values. If there's no missing value, calling + _find_best_bin_to_split_left_to_right is enough. 
If any, missing + values are assigned to the left node. + """ cdef: unsigned int bin_idx @@ -555,7 +578,8 @@ cdef class Splitter: histograms[feature_idx, bin_idx].sum_hessians sum_hessian_left = sum_hessians - sum_hessian_right - sum_gradient_right += histograms[feature_idx, bin_idx].sum_gradients + sum_gradient_right += \ + histograms[feature_idx, bin_idx].sum_gradients sum_gradient_left = sum_gradients - sum_gradient_right if n_samples_right < self.min_samples_leaf: diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index 4c640464814d4..99f664660c0e2 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -271,7 +271,7 @@ def test_subsample(): [1, 1, 1], [1, 0, 0]])]) def test_missing_values_support(max_bins, actual_n_bins, X_trans_expected): - # basic check for missing values: make sure nans are mapped to last bins + # check for missing values: make sure nans are mapped to last bins # and that attributes are correct # Note that the extra bin for missing values is only allocated if needed diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 74a725a9b7425..6f410baa49a3f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -166,8 +166,9 @@ def test_missing_values(): def test_missing_value_predict_only(): - # Make sure that misisng values are supported at predict time even if they + # Make sure that missing values are supported at predict time even if they # were not encountered during fit time + # The missing values are assigned to whichever child has the most samples rng = np.random.RandomState(0) X, y = make_classification(random_state=rng) From 2d1659b3c4b2b35c52d206b1d076b954607b13b7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 21 May 2019 17:40:08 -0400 Subject: [PATCH 10/76] avoid some interactions --- .../_hist_gradient_boosting/splitting.pyx | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index abde4227ac546..b7c6b778d3dbb 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -249,6 +249,10 @@ cdef class Splitter: X_BINNED_DTYPE_C bin_idx = split_info.bin_idx unsigned char missing_go_to_left = split_info.missing_go_to_left int feature_idx = split_info.feature_idx + unsigned char has_missing_values = \ + self.has_missing_values[feature_idx] + X_BINNED_DTYPE_C missing_values_bin = \ + self.actual_n_bins[feature_idx] - 1 const X_BINNED_DTYPE_C [::1] X_binned = \ self.X_binned[:, feature_idx] unsigned int [::1] left_indices_buffer = self.left_indices_buffer @@ -293,8 +297,8 @@ cdef class Splitter: stop = start + sizes[thread_idx] for i in range(start, stop): sample_idx = sample_indices[i] - if ((self.has_missing_values[feature_idx] and - X_binned[sample_idx] == self.actual_n_bins[feature_idx] - 1 and + if ((has_missing_values and + X_binned[sample_idx] == missing_values_bin and missing_go_to_left) or X_binned[sample_idx] <= bin_idx): left_indices_buffer[start + left_count] = sample_idx @@ -374,6 +378,7 @@ cdef class Splitter: int n_features = 
self.n_features split_info_struct split_info split_info_struct * split_infos + unsigned char [:] has_missing_values = self.has_missing_values with nogil: n_samples = sample_indices.shape[0] @@ -401,7 +406,7 @@ cdef class Splitter: feature_idx, histograms, n_samples, sum_gradients, sum_hessians, &split_infos[feature_idx]) - if self.has_missing_values[feature_idx]: + if has_missing_values[feature_idx]: self._find_best_bin_to_split_right_to_left( feature_idx, histograms, n_samples, sum_gradients, sum_hessians, &split_infos[feature_idx]) @@ -462,7 +467,10 @@ cdef class Splitter: unsigned int n_samples_left unsigned int n_samples_right unsigned int n_samples_ = n_samples - unsigned int end + # We don't need to consider splitting on the last bin (or second to + # last bin if there are missing values) since this would result in + # having 0 samples in the right node + unsigned int end = self.actual_n_bins[feature_idx] - 1 Y_DTYPE_C sum_hessian_left Y_DTYPE_C sum_hessian_right Y_DTYPE_C sum_gradient_left @@ -472,10 +480,6 @@ cdef class Splitter: sum_gradient_left, sum_hessian_left = 0., 0. n_samples_left = 0 - # We don't need to consider splitting on the last bin (or second to - # last bin if there are missing values) since this would result in - # having 0 samples in the right node - end = self.actual_n_bins[feature_idx] - 1 if self.has_missing_values[feature_idx]: # if there are missing values (in the last bin), skip one more bin end -= 1 From f2a83a0fc41e150bfd7a6599d27df1cdfded7a06 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 21 May 2019 17:46:29 -0400 Subject: [PATCH 11/76] Added tag --- sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 7e5ca50fcebe3..242182b7be63e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -464,6 +464,9 @@ def _raw_predict(self, X): return raw_predictions + def _more_tags(self): + return {'allow_nan': True} + @abstractmethod def _get_loss(self): pass From cd1de3c1c0e6a30494c8c80f8650fc2f3313186b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 22 May 2019 13:17:54 -0400 Subject: [PATCH 12/76] better test --- .../tests/test_gradient_boosting.py | 19 ----------- .../tests/test_grower.py | 33 +++++++++++++++++++ 2 files changed, 33 insertions(+), 19 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 6f410baa49a3f..be43f8dbf5c99 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -163,22 +163,3 @@ def test_missing_values(): gb.fit(X, y) assert gb.score(X, y) == 1 - - -def test_missing_value_predict_only(): - # Make sure that missing values are supported at predict time even if they - # were not encountered during fit time - # The missing values are assigned to whichever child has the most samples - - rng = np.random.RandomState(0) - X, y = make_classification(random_state=rng) - - gb = HistGradientBoostingClassifier() - gb.fit(X, y) - assert gb.score(X, y) == 1 - - # Half the values are now missing - missing_mask = rng.binomial(1, .5, size=X.shape).astype(np.bool) - X[missing_mask] = np.nan - - assert gb.score(X, y) >= .8 diff 
--git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index 49b19ce2778dd..df2a8d9f80169 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -307,3 +307,36 @@ def test_init_parameters_validation(): match="min_hessian_to_split=-1 must be positive"): TreeGrower(X_binned, all_gradients, all_hessians, min_hessian_to_split=-1) + + +def test_missing_value_predict_only(): + # Make sure that missing values are supported at predict time even if they + # were not encountered during fit time: the missing values are assigned to + # whichever child has the most samples + + rng = np.random.RandomState(0) + n_samples = 100 + X_binned = rng.randint(0, 256, size=(n_samples, 1), dtype=np.uint8) + + gradients = rng.normal(size=n_samples).astype(G_H_DTYPE) + hessians = np.ones(shape=1, dtype=G_H_DTYPE) + + grower = TreeGrower(X_binned, gradients, hessians, min_samples_leaf=5) + grower.grow() + + predictor = grower.make_predictor() + + # go from root to a leaf, always following node with the most samples. + # That's the path nans are supposed to take + node = predictor.nodes[0] + while not node['is_leaf']: + left = predictor.nodes[node['left']] + right = predictor.nodes[node['right']] + node = left if left['count'] > right['count'] else right + + prediction_main_path = node['value'] + + # now build X_test with only nans, and make sure all predictions are equal + # to prediction_main_path + all_nans = np.full(shape=(n_samples, 1), fill_value=np.nan) + assert np.all(predictor.predict(all_nans) == prediction_main_path) From 5cd8e5972f5fdfeb377291f89b3f651e22108091 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 22 May 2019 18:40:59 -0400 Subject: [PATCH 13/76] decent test + fix bug --- .../_hist_gradient_boosting/splitting.pyx | 13 ++- .../tests/test_splitting.py | 83 +++++++++++++++++++ 2 files changed, 89 insertions(+), 7 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index b7c6b778d3dbb..c48a900d24387 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -567,23 +567,22 @@ cdef class Splitter: # n_bins - 2 is the index of the second to last bin, which we consider # being on the right child. - sum_gradient_right = histograms[feature_idx, n_bins - 2].sum_gradients - sum_hessian_right = histograms[feature_idx, n_bins - 2].sum_hessians - n_samples_right = histograms[feature_idx, n_bins - 2].count + sum_gradient_right, sum_hessian_right = 0., 0. 
+ n_samples_right = 0 for bin_idx in range(start, -1, -1): - n_samples_right += histograms[feature_idx, bin_idx].count + n_samples_right += histograms[feature_idx, bin_idx + 1].count n_samples_left = n_samples_ - n_samples_right if self.hessians_are_constant: - sum_hessian_right += histograms[feature_idx, bin_idx].count + sum_hessian_right += histograms[feature_idx, bin_idx + 1].count else: sum_hessian_right += \ - histograms[feature_idx, bin_idx].sum_hessians + histograms[feature_idx, bin_idx + 1].sum_hessians sum_hessian_left = sum_hessians - sum_hessian_right sum_gradient_right += \ - histograms[feature_idx, bin_idx].sum_gradients + histograms[feature_idx, bin_idx + 1].sum_gradients sum_gradient_left = sum_gradients - sum_gradient_right if n_samples_right < self.min_samples_leaf: diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index c2da7ffdd03a2..0fb3c188a175a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -269,3 +269,86 @@ def test_min_gain_to_split(): split_info = splitter.find_node_split(sample_indices, histograms, sum_gradients, sum_hessians) assert split_info.gain == -1 + + +@pytest.mark.parametrize( + 'X_binned, all_gradients, has_missing_values, expected_bin_idx, ' + 'expected_go_to_left', [ + + # basic sanity check: given the gradient values, the split must occur + # on bin_idx=3 + ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], # X_binned + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], # gradients + False, # no missing values + 3, # expected_bin_idx + 'not_applicable'), + + # We replace 2 samples by NaNs (bin_idx=8) + # These 2 samples were mapped to the left node before, so they should + # be mapped to left node again + # Notice how the bin_idx threshold changes from 3 to 1. + ([8, 0, 1, 8, 2, 3, 4, 5, 6, 7], + [1, 1, 1, 1, 2, 2, 2, 2, 2, 2], + True, # missing values (bin_idx=8) + 1, # cut on bin_idx=1 + True), # missing values go to left + + # Same, this time replacing 2 samples that were on the right. + ([0, 1, 2, 3, 8, 4, 8, 5, 6, 7], + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], + True, # missing values (bin_idx=8) + 3, # cut on bin_idx=3 (like in first case) + False), # missing values go to right + ] +) +def test_splitting_missing_values(X_binned, all_gradients, + has_missing_values, expected_bin_idx, + expected_go_to_left): + # Make sure missing values are properly supported. + # we build an artificial example with gradients such that the best split + # is on bin_idx=3. + # Then we introduce missing values and: + # - make sure the chosen bin is still correct (find_best_bin()) + # - make sure the missing values are mapped to the correct child + # (split_indices()) + + max_bins = 10 # TO REMOVE + n_samples = len(X_binned) + l2_regularization = 0. + min_hessian_to_split = 1e-3 + min_samples_leaf = 1 + min_gain_to_split = 0. 
+ + sample_indices = np.arange(n_samples, dtype=np.uint32) + X_binned = np.asfortranarray(X_binned, dtype=X_BINNED_DTYPE).reshape(-1, 1) + all_gradients = np.array(all_gradients, dtype=G_H_DTYPE) + has_missing_values = np.array([has_missing_values], dtype=np.uint8) + all_hessians = np.ones(1, dtype=G_H_DTYPE) + sum_gradients = all_gradients.sum() + sum_hessians = 1 * n_samples + hessians_are_constant = True + + actual_n_bins = np.array([X_binned.max() + 1], dtype=np.uint32) + builder = HistogramBuilder(X_binned, max_bins, + all_gradients, all_hessians, + hessians_are_constant) + splitter = Splitter(X_binned, max_bins, actual_n_bins, has_missing_values, + l2_regularization, min_hessian_to_split, + min_samples_leaf, min_gain_to_split, + hessians_are_constant) + + histograms = builder.compute_histograms_brute(sample_indices) + split_info = splitter.find_node_split(sample_indices, histograms, + sum_gradients, sum_hessians) + + assert split_info.bin_idx == expected_bin_idx + if has_missing_values: + assert split_info.missing_go_to_left == expected_go_to_left + + # Whatever the missing values, the split shouhld always be the same. This + # also make sure missing values are properly assigned to the correct child + # in split_indices() + samples_left, samples_right, _ = splitter.split_indices( + split_info, splitter.partition) + assert set(samples_left) == set([0, 1, 2, 3]) + assert set(samples_right) == set([4, 5, 6, 7, 8, 9]) From d6b73ed74ac8290eec1145c8b5faa5674844eb8e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 23 May 2019 09:38:07 -0400 Subject: [PATCH 14/76] add missing_fraction param to benchmark --- benchmarks/bench_hist_gradient_boosting.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/benchmarks/bench_hist_gradient_boosting.py b/benchmarks/bench_hist_gradient_boosting.py index 8d055b22c2252..5f070bd45708d 100644 --- a/benchmarks/bench_hist_gradient_boosting.py +++ b/benchmarks/bench_hist_gradient_boosting.py @@ -2,6 +2,7 @@ import argparse import matplotlib.pyplot as plt +import numpy as np from sklearn.model_selection import train_test_split # To use this experimental feature, we need to explicitly ask for it: from sklearn.experimental import enable_hist_gradient_boosting # noqa @@ -25,6 +26,7 @@ parser.add_argument('--learning-rate', type=float, default=.1) parser.add_argument('--problem', type=str, default='classification', choices=['classification', 'regression']) +parser.add_argument('--missing-fraction', type=float, default=0) parser.add_argument('--n-classes', type=int, default=2) parser.add_argument('--n-samples-max', type=int, default=int(1e6)) parser.add_argument('--n-features', type=int, default=20) @@ -52,6 +54,11 @@ def get_estimator_and_data(): X, y, Estimator = get_estimator_and_data() +if args.missing_fraction: + mask = np.random.binomial(1, args.missing_fraction, size=X.shape).astype( + np.bool) + X[mask] = np.nan + X_train_, X_test_, y_train_, y_test_ = train_test_split( X, y, test_size=0.5, random_state=0) From 5e06fa7a2c7fa4341a1cef36b2081cadde98ce6f Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 23 May 2019 10:36:58 -0400 Subject: [PATCH 15/76] bin training and validation data separately --- .../_hist_gradient_boosting/binning.py | 2 +- .../gradient_boosting.py | 59 ++++++++++++------- .../tests/test_gradient_boosting.py | 22 +++++++ 3 files changed, 62 insertions(+), 21 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 075ed4f175ac3..34bd43cde4061 
100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -140,7 +140,7 @@ def transform(self, X): Returns ------- X_binned : array-like, shape (n_samples, n_features) - The binned data. + The binned data (fortran-aligned). """ X = check_array(X, dtype=[X_DTYPE]) check_is_fitted(self, ['bin_thresholds_', 'actual_n_bins_']) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 719756061f896..7a1926d0258ca 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -112,17 +112,6 @@ def fit(self, X, y): # data. self._in_fit = True - # bin the data - if self.verbose: - print("Binning {:.3f} GB of data: ".format(X.nbytes / 1e9), end="", - flush=True) - tic = time() - self.bin_mapper_ = _BinMapper(max_bins=self.max_bins, random_state=rng) - X_binned = self.bin_mapper_.fit_transform(X) - toc = time() - if self.verbose: - duration = toc - tic - print("{:.3f} s".format(duration)) self.loss_ = self._get_loss() @@ -135,17 +124,19 @@ def fit(self, X, y): # stratify for classification stratify = y if hasattr(self.loss_, 'predict_proba') else None - X_binned_train, X_binned_val, y_train, y_val = train_test_split( - X_binned, y, test_size=self.validation_fraction, - stratify=stratify, random_state=rng) + X_train, X_val, y_train, y_val = train_test_split( + X, y, test_size=self.validation_fraction, stratify=stratify, + random_state=rng) + else: + X_train, y_train = X, y + X_val, y_val = None, None - # Predicting is faster of C-contiguous arrays, training is faster - # on Fortran arrays. - X_binned_val = np.ascontiguousarray(X_binned_val) - X_binned_train = np.asfortranarray(X_binned_train) + # Bin the data + X_binned_train = self._bin_data(X_train, rng, is_training_data=True) + if X_val is not None: + X_binned_val = self._bin_data(X_val, rng, is_training_data=False) else: - X_binned_train, y_train = X_binned, y - X_binned_val, y_val = None, None + X_binned_val = None if self.verbose: print("Fitting gradient boosted rounds:") @@ -387,6 +378,34 @@ def _should_stop(self, scores): for score in recent_scores] return not any(recent_improvements) + def _bin_data(self, X, rng, is_training_data): + """Bin data X. + + If is_training_data, then set the bin_mapper_ attribute. + Else, the binned data is converted to a C-contiguous array. + """ + + description = 'training' if is_training_data else 'validation' + if self.verbose: + print("Binning {:.3f} GB of {} data: ".format( + X.nbytes / 1e9, description), end="", flush=True) + tic = time() + bin_mapper = _BinMapper(max_bins=self.max_bins, random_state=rng) + X_binned = bin_mapper.fit_transform(X) # F-aligned array + toc = time() + if self.verbose: + duration = toc - tic + print("{:.3f} s".format(duration)) + + if is_training_data: + self.bin_mapper_ = bin_mapper + else: + # Validation data. 
We convert the array to C-contiguous since + # predicting is faster with this layout (training is faster on + # F-arrays though) + X_binned = np.ascontiguousarray(X_binned) + return X_binned + def _print_iteration_stats(self, iteration_start_time): """Print info about the current fitting iteration.""" log_msg = '' diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 790597b07fa15..f80ec4aa253c4 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -6,6 +6,7 @@ from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper X_classification, y_classification = make_classification(random_state=0) @@ -145,3 +146,24 @@ def test_should_stop(scores, n_iter_no_change, tol, stopping): n_iter_no_change=n_iter_no_change, tol=tol ) assert gbdt._should_stop(scores) == stopping + + +def test_binning_train_validation_are_separated(): + # Make sure training and validation data are binned separately. + + rng = np.random.RandomState(0) + gb = HistGradientBoostingClassifier(n_iter_no_change=5, + validation_fraction=.2, + random_state=rng) + gb.fit(X_classification, y_classification) + mapper_training_data = gb.bin_mapper_ + + # Note that since the data is small there is no subsampling and the + # random_state doesn't matter + mapper_whole_data = _BinMapper(random_state=0) + mapper_whole_data.fit(X_classification) + + for feature_idx in range(X_classification.shape[1]): + n_bins_training = mapper_training_data.actual_n_bins_[feature_idx] + n_bins_whole = mapper_whole_data.actual_n_bins_[feature_idx] + assert n_bins_training != n_bins_whole From 1a3485666fb6375e35d883823b1873a537a91b6a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 23 May 2019 10:42:02 -0400 Subject: [PATCH 16/76] shorter test --- .../tests/test_gradient_boosting.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index f80ec4aa253c4..089d7e76c7d82 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -150,6 +150,7 @@ def test_should_stop(scores, n_iter_no_change, tol, stopping): def test_binning_train_validation_are_separated(): # Make sure training and validation data are binned separately. 
+ # See issue 13926 rng = np.random.RandomState(0) gb = HistGradientBoostingClassifier(n_iter_no_change=5, @@ -163,7 +164,5 @@ def test_binning_train_validation_are_separated(): mapper_whole_data = _BinMapper(random_state=0) mapper_whole_data.fit(X_classification) - for feature_idx in range(X_classification.shape[1]): - n_bins_training = mapper_training_data.actual_n_bins_[feature_idx] - n_bins_whole = mapper_whole_data.actual_n_bins_[feature_idx] - assert n_bins_training != n_bins_whole + assert np.all(mapper_training_data.actual_n_bins_ != + mapper_whole_data.actual_n_bins_) From aae10a2afbc98d9b4da94024b3c8b37084324750 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 23 May 2019 12:00:23 -0400 Subject: [PATCH 17/76] Map missing values to first bin instead of last --- .../_hist_gradient_boosting/_binning.pyx | 18 +++--- .../_hist_gradient_boosting/_predictor.pyx | 15 +++-- .../_hist_gradient_boosting/binning.py | 6 +- .../_hist_gradient_boosting/splitting.pyx | 47 ++++++++------- .../tests/test_binning.py | 60 ++++++++++--------- .../tests/test_splitting.py | 18 +++--- 6 files changed, 87 insertions(+), 77 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx index 2bbe1f71a3546..194b859a485f7 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx @@ -17,7 +17,7 @@ from .types cimport X_DTYPE_C, X_BINNED_DTYPE_C cpdef _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds, - const unsigned int [:] actual_n_bins, + list has_missing_values, X_BINNED_DTYPE_C [::1, :] binned): """Bin numerical values to discrete integer-coded levels. @@ -28,9 +28,6 @@ cpdef _map_to_bins(const X_DTYPE_C [:, :] data, binning_thresholds : list of arrays For each feature, stores the increasing numeric values that are used to separate the bins. - actual_n_bins : ndarray, shape (n_features,) - For each feature, indicate the actual number of bins, including the bin - for missing values, if any. binned : ndarray, shape (n_samples, n_features) Output array, must be fortran aligned. """ @@ -41,13 +38,13 @@ cpdef _map_to_bins(const X_DTYPE_C [:, :] data, _map_num_col_to_bins(data[:, feature_idx], binning_thresholds[feature_idx], - actual_n_bins[feature_idx], + has_missing_values[feature_idx], binned[:, feature_idx]) cpdef void _map_num_col_to_bins(const X_DTYPE_C [:] data, const X_DTYPE_C [:] binning_thresholds, - const unsigned int actual_n_bins, + const int has_missing_values, X_BINNED_DTYPE_C [:] binned): """Binary search to find the bin index for each value in the data.""" cdef: @@ -59,10 +56,11 @@ cpdef void _map_num_col_to_bins(const X_DTYPE_C [:] data, for i in prange(data.shape[0], schedule='static', nogil=True): if isnan(data[i]): - # unkown values are mapped to last bin + # unkown values are mapped to first bin # Note that this is only correct if missing values were - # encountered at fit time (else actual_n_bins is incorrect). 
- binned[i] = actual_n_bins - 1 + # encountered at fit time (else non-missing values will also be + # mapped to this bin) + binned[i] = 0 else: # for known values, use binary search left, right = 0, binning_thresholds.shape[0] @@ -72,4 +70,4 @@ cpdef void _map_num_col_to_bins(const X_DTYPE_C [:] data, right = middle else: left = middle + 1 - binned[i] = left + binned[i] = left + has_missing_values diff --git a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx index d4c3227212528..33bf993dab7d8 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx @@ -65,12 +65,17 @@ cdef inline Y_DTYPE_C _predict_one_from_numeric_data( while True: if node.is_leaf: return node.value - if ((isnan(numeric_data[row, node.feature_idx]) and - node.missing_go_to_left) or - numeric_data[row, node.feature_idx] <= node.threshold): - node = nodes[node.left] + + if isnan(numeric_data[row, node.feature_idx]): + if node.missing_go_to_left: + node = nodes[node.left] + else: + node = nodes[node.right] else: - node = nodes[node.right] + if numeric_data[row, node.feature_idx] <= node.threshold: + node = nodes[node.left] + else: + node = nodes[node.right] cdef void _predict_from_binned_data_parallel( diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 891d131c41774..65ffd625af0cc 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -123,7 +123,7 @@ def __init__(self, max_bins=256, subsample=int(2e5), random_state=None): def fit(self, X, y=None): """Fit data X by computing the binning thresholds. - The last bin is reserved for missing values, if any. + The first bin is reserved for missing values, if any. Parameters ---------- @@ -153,7 +153,7 @@ def fit(self, X, y=None): def transform(self, X): """Bin data X. - Missing values will be mapped to the last bin (whether or not missing + Missing values will be mapped to the first bin (whether or not missing values were encountered at fit time). For this reason, `X` should be the fitting data, though we do not enforce this. Note that the GBDT code only ever uses mapper.fit_transform(), so this assumption is OK. 
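For illustration (not part of the patch): at this point in the series the first bin is reserved for missing values, and a rough NumPy-level sketch of the per-column mapping performed by the Cython kernel looks as follows. The function name and the example values are illustrative only.

    import numpy as np

    def map_column_to_bins(col, thresholds, has_missing_values):
        # Locate non-missing values in the sorted thresholds (a value equal
        # to a threshold goes to the lower bin, like the binary search in
        # _binning.pyx), shift every regular bin by one when the feature had
        # missing values at fit time, and send NaNs to bin 0.
        binned = np.searchsorted(thresholds, col, side='left')
        binned = (binned + int(has_missing_values)).astype(np.uint8)
        binned[np.isnan(col)] = 0
        return binned

    # e.g. map_column_to_bins(np.array([0., 1., 2., np.nan]),
    #                         np.array([0.5, 1.5]), True)
    # -> array([1, 2, 3, 0], dtype=uint8)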
@@ -177,5 +177,5 @@ def transform(self, X): X.shape[1]) ) binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order='F') - _map_to_bins(X, self.bin_thresholds_, self.actual_n_bins_, binned) + _map_to_bins(X, self.bin_thresholds_, self.has_missing_values_, binned) return binned diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index c48a900d24387..67e88d29642aa 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -251,8 +251,6 @@ cdef class Splitter: int feature_idx = split_info.feature_idx unsigned char has_missing_values = \ self.has_missing_values[feature_idx] - X_BINNED_DTYPE_C missing_values_bin = \ - self.actual_n_bins[feature_idx] - 1 const X_BINNED_DTYPE_C [::1] X_binned = \ self.X_binned[:, feature_idx] unsigned int [::1] left_indices_buffer = self.left_indices_buffer @@ -297,15 +295,20 @@ cdef class Splitter: stop = start + sizes[thread_idx] for i in range(start, stop): sample_idx = sample_indices[i] - if ((has_missing_values and - X_binned[sample_idx] == missing_values_bin and - missing_go_to_left) or - X_binned[sample_idx] <= bin_idx): - left_indices_buffer[start + left_count] = sample_idx - left_count = left_count + 1 + if (has_missing_values and X_binned[sample_idx] == 0): + if missing_go_to_left: + left_indices_buffer[start + left_count] = sample_idx + left_count = left_count + 1 + else: + right_indices_buffer[start + right_count] = sample_idx + right_count = right_count + 1 else: - right_indices_buffer[start + right_count] = sample_idx - right_count = right_count + 1 + if X_binned[sample_idx] <= bin_idx: + left_indices_buffer[start + left_count] = sample_idx + left_count = left_count + 1 + else: + right_indices_buffer[start + right_count] = sample_idx + right_count = right_count + 1 left_counts[thread_idx] = left_count right_counts[thread_idx] = right_count @@ -467,10 +470,7 @@ cdef class Splitter: unsigned int n_samples_left unsigned int n_samples_right unsigned int n_samples_ = n_samples - # We don't need to consider splitting on the last bin (or second to - # last bin if there are missing values) since this would result in - # having 0 samples in the right node - unsigned int end = self.actual_n_bins[feature_idx] - 1 + unsigned int start = 0 Y_DTYPE_C sum_hessian_left Y_DTYPE_C sum_hessian_right Y_DTYPE_C sum_gradient_left @@ -481,10 +481,12 @@ cdef class Splitter: n_samples_left = 0 if self.has_missing_values[feature_idx]: - # if there are missing values (in the last bin), skip one more bin - end -= 1 + # if there are missing values (in the first bin), skip it + start = 1 - for bin_idx in range(end): + for bin_idx in range(start, self.actual_n_bins[feature_idx] - 1): + # Note that considering splitting on the last bin is useless since + # it would result in having 0 samples in the right node (forbidden) n_samples_left += histograms[feature_idx, bin_idx].count n_samples_right = n_samples_ - n_samples_left @@ -560,17 +562,16 @@ cdef class Splitter: unsigned int n_bins = self.actual_n_bins[feature_idx] unsigned int start - # - Skip last bin (where the missing values are) - # - Skip second to last bin (considering this split would result in 0 - # samples on the right node) - start = n_bins - 3 - # n_bins - 2 is the index of the second to last bin, which we consider # being on the right child. sum_gradient_right, sum_hessian_right = 0., 0. 
n_samples_right = 0 - for bin_idx in range(start, -1, -1): + for bin_idx in range(self.actual_n_bins[feature_idx] - 2, 0, -1): + # We start at the second to last bin (we don't need to consider + # splitting on the last bin since it would result in having zero + # samples on the right node). + # We also skip the first bin (where the missing values are) n_samples_right += histograms[feature_idx, bin_idx + 1].count n_samples_left = n_samples_ - n_samples_right diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index 99f664660c0e2..f8a9144fc3877 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -249,33 +249,35 @@ def test_subsample(): @pytest.mark.parametrize( 'max_bins, actual_n_bins, X_trans_expected', [ - (256, [5, 3, 2], [[0, 0, 0], - [4, 2, 0], - [1, 0, 0], - [4, 1, 1], - [2, 1, 1], - [3, 0, 0]]), - # With max_bins=2, we expect all non-nan values to be mapped to bin 0 - # and all nans to be mapped to bin 1 - (2, [2, 2, 2], [[0, 0, 0], - [1, 1, 0], + (256, [5, 3, 2], [[1, 1, 0], + [0, 0, 0], + [2, 1, 0], + [0, 2, 1], + [3, 2, 1], + [4, 1, 0]]), + # With max_bins=2, we expect all nan values to be mapped to bin 0 + # and all non-nans to be mapped to bin 1 + (2, [2, 2, 2], [[1, 1, 0], [0, 0, 0], - [1, 0, 1], - [0, 0, 1], - [0, 0, 0]]), + [1, 1, 0], + [0, 1, 1], + [1, 1, 1], + [1, 1, 0]]), - (3, [3, 3, 2], [[0, 0, 0], - [2, 2, 0], + (3, [3, 3, 2], [[1, 1, 0], [0, 0, 0], - [2, 1, 1], - [1, 1, 1], - [1, 0, 0]])]) + [1, 1, 0], + [0, 2, 1], + [2, 2, 1], + [2, 1, 0]])]) def test_missing_values_support(max_bins, actual_n_bins, X_trans_expected): - # check for missing values: make sure nans are mapped to last bins + # check for missing values: make sure nans are mapped to the first bin # and that attributes are correct - # Note that the extra bin for missing values is only allocated if needed - # (no need to allocate extra bin for third column here.) + # Note that the extra bin for missing values is only allocated if needed: + # - no need to allocate extra bin for third column here + # - due to the extra bin, the features with missing values are "shifted" + # with an offset of 1 X = [[1, 1, 1], [np.NaN, np.NaN, 1], @@ -291,14 +293,16 @@ def test_missing_values_support(max_bins, actual_n_bins, X_trans_expected): assert_array_equal(mapper.actual_n_bins_, actual_n_bins) assert_array_equal(mapper.has_missing_values_, [True, True, False]) X_trans = mapper.transform(X) + print() + print(X_trans) assert_array_equal(X_trans, X_trans_expected) def test_missing_values_different_X_fit_transform(): # Test to illustrate the fact that missing values are always mapped to the - # last bin. + # first bin. # If there are no missing values at fit time (second column), then during - # transform(), missing values are treated as the biggest values, which is + # transform(), missing values are treated as the smallest values, which is # not a desired behaviour in general. 
# Note that in practice this case never happens, since the GBDT code only @@ -318,16 +322,16 @@ def test_missing_values_different_X_fit_transform(): X2 = [[1, 1], [3, 1], - [1, np.NaN], # Nan mapped in same bin as the biggest value + [1, np.NaN], # Nan mapped in same bin as the smallest value [2, 2], [np.NaN, 2], # Nan mapped in a special bin, as expected [1, 1]] X2_trans = mapper.transform(X2) - X2_trans_expected = [[0, 0], + X2_trans_expected = [[1, 0], + [2, 0], [1, 0], - [0, 1], - [1, 1], [2, 1], - [0, 0]] + [0, 1], + [1, 0]] assert_array_equal(X2_trans, X2_trans_expected) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index 0fb3c188a175a..866fee5fa382e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -283,21 +283,23 @@ def test_min_gain_to_split(): 3, # expected_bin_idx 'not_applicable'), - # We replace 2 samples by NaNs (bin_idx=8) + # We replace 2 samples by NaNs (bin_idx=0) # These 2 samples were mapped to the left node before, so they should # be mapped to left node again - # Notice how the bin_idx threshold changes from 3 to 1. - ([8, 0, 1, 8, 2, 3, 4, 5, 6, 7], + # Notice how the bin_idx threshold changes from 3 to 2. + # Also, the bins of the previous non-nan samples have bin shiffted by + # one + ([0, 1, 2, 0, 3, 4, 5, 6, 7, 8], [1, 1, 1, 1, 2, 2, 2, 2, 2, 2], - True, # missing values (bin_idx=8) - 1, # cut on bin_idx=1 + True, # missing values (bin_idx=0) + 2, # cut on bin_idx=2 True), # missing values go to left # Same, this time replacing 2 samples that were on the right. - ([0, 1, 2, 3, 8, 4, 8, 5, 6, 7], + ([1, 2, 3, 4, 0, 5, 0, 6, 7, 8], [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], True, # missing values (bin_idx=8) - 3, # cut on bin_idx=3 (like in first case) + 4, # cut on bin_idx=4 (like in first case, with +1 because of offset) False), # missing values go to right ] ) @@ -345,7 +347,7 @@ def test_splitting_missing_values(X_binned, all_gradients, if has_missing_values: assert split_info.missing_go_to_left == expected_go_to_left - # Whatever the missing values, the split shouhld always be the same. This + # Whatever the missing values, the split should always be the same. 
This # also make sure missing values are properly assigned to the correct child # in split_indices() samples_left, samples_right, _ = splitter.split_indices( From 35eda6e69c45e5fb829e2615b9a5720b002bdd95 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 23 May 2019 12:12:45 -0400 Subject: [PATCH 18/76] pep8 --- .../tests/test_splitting.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index 866fee5fa382e..05464b0c40807 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -100,8 +100,7 @@ def test_gradient_and_hessian_sanity(constant_hessian): actual_n_bins = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) - has_missing_values = np.array([False] * X_binned.shape[1], - dtype=np.uint8) + has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, constant_hessian) splitter = Splitter(X_binned, n_bins, actual_n_bins, has_missing_values, @@ -198,8 +197,7 @@ def test_split_indices(): actual_n_bins = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) - has_missing_values = np.array([False] * X_binned.shape[1], - dtype=np.uint8) + has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant) @@ -256,8 +254,7 @@ def test_min_gain_to_split(): actual_n_bins = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) - has_missing_values = np.array([False] * X_binned.shape[1], - dtype=np.uint8) + has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant) splitter = Splitter(X_binned, n_bins, actual_n_bins, has_missing_values, @@ -298,7 +295,7 @@ def test_min_gain_to_split(): # Same, this time replacing 2 samples that were on the right. ([1, 2, 3, 4, 0, 5, 0, 6, 7, 8], [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], - True, # missing values (bin_idx=8) + True, # missing values (bin_idx=0) 4, # cut on bin_idx=4 (like in first case, with +1 because of offset) False), # missing values go to right ] @@ -308,9 +305,10 @@ def test_splitting_missing_values(X_binned, all_gradients, expected_go_to_left): # Make sure missing values are properly supported. # we build an artificial example with gradients such that the best split - # is on bin_idx=3. + # is on bin_idx=3, when there are no missing values. # Then we introduce missing values and: - # - make sure the chosen bin is still correct (find_best_bin()) + # - make sure the chosen bin is still correct (find_best_bin()): it's + # still the same bin, even though its index changes # - make sure the missing values are mapped to the correct child # (split_indices()) From 1f63282e5462b1dc2300a092d543ab395675a061 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 23 May 2019 14:13:06 -0400 Subject: [PATCH 19/76] Added whats new entry --- doc/whats_new/v0.22.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 0518d6c9e0de4..b0e56518da55b 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -39,6 +39,14 @@ Changelog :pr:`123456` by :user:`Joe Bloggs `. where 123456 is the *pull request* number, not the issue number. 
+:mod:`sklearn.ensemble` +....................... + +- |Fix| :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor` now bin the training and + validation data separately to avoid any data leak. :pr:`13933` by + :user:`NicolasHug`_. + :mod:`sklearn.linear_model` .................. From e3d34a9046aca8282977af38f4c03803995b6d95 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 23 May 2019 16:45:03 -0400 Subject: [PATCH 20/76] avoid some python interactions --- sklearn/ensemble/_hist_gradient_boosting/splitting.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 67e88d29642aa..62bff071fb043 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -471,6 +471,7 @@ cdef class Splitter: unsigned int n_samples_right unsigned int n_samples_ = n_samples unsigned int start = 0 + unsigned int end = self.actual_n_bins[feature_idx] - 1 Y_DTYPE_C sum_hessian_left Y_DTYPE_C sum_hessian_right Y_DTYPE_C sum_gradient_left @@ -484,7 +485,7 @@ cdef class Splitter: # if there are missing values (in the first bin), skip it start = 1 - for bin_idx in range(start, self.actual_n_bins[feature_idx] - 1): + for bin_idx in range(start, end): # Note that considering splitting on the last bin is useless since # it would result in having 0 samples in the right node (forbidden) n_samples_left += histograms[feature_idx, bin_idx].count @@ -559,15 +560,14 @@ cdef class Splitter: Y_DTYPE_C sum_gradient_left Y_DTYPE_C sum_gradient_right Y_DTYPE_C gain - unsigned int n_bins = self.actual_n_bins[feature_idx] - unsigned int start + unsigned int start = self.actual_n_bins[feature_idx] - 2 # n_bins - 2 is the index of the second to last bin, which we consider # being on the right child. sum_gradient_right, sum_hessian_right = 0., 0. n_samples_right = 0 - for bin_idx in range(self.actual_n_bins[feature_idx] - 2, 0, -1): + for bin_idx in range(start, 0, -1): # We start at the second to last bin (we don't need to consider # splitting on the last bin since it would result in having zero # samples on the right node). From 542cb25d5d1520135406a093ae552c3f108f47ca Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 23 May 2019 17:22:01 -0400 Subject: [PATCH 21/76] make predict_binned work --- .../_hist_gradient_boosting/_binning.pyx | 6 +++-- .../_hist_gradient_boosting/_predictor.pyx | 23 ++++++++++++++----- .../_hist_gradient_boosting/binning.py | 5 ++-- .../gradient_boosting.py | 15 +++++++++--- .../_hist_gradient_boosting/predictor.py | 9 ++++++-- .../tests/test_binning.py | 7 ++---- 6 files changed, 45 insertions(+), 20 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx index 194b859a485f7..18f0e7edce95e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx @@ -17,7 +17,7 @@ from .types cimport X_DTYPE_C, X_BINNED_DTYPE_C cpdef _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds, - list has_missing_values, + const unsigned char [:] has_missing_values, X_BINNED_DTYPE_C [::1, :] binned): """Bin numerical values to discrete integer-coded levels. 
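For illustration (not part of the patch): with this signature change the caller is expected to pass has_missing_values as a uint8 array rather than a Python list, so that it matches the const unsigned char [:] memoryview. A minimal sketch, with illustrative values:

    import numpy as np

    # One flag per feature, stored as uint8 for the typed memoryview.
    has_missing_values = np.array([True, False, True], dtype=np.uint8)

    # The binned output must still be a Fortran-aligned uint8 array:
    # _map_to_bins(data, binning_thresholds, has_missing_values, binned)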
@@ -28,6 +28,8 @@ cpdef _map_to_bins(const X_DTYPE_C [:, :] data, binning_thresholds : list of arrays For each feature, stores the increasing numeric values that are used to separate the bins. + has_missing_values : ndarray, shape (n_features,) + Whether each feature has missing values. binned : ndarray, shape (n_samples, n_features) Output array, must be fortran aligned. """ @@ -44,7 +46,7 @@ cpdef _map_to_bins(const X_DTYPE_C [:, :] data, cpdef void _map_num_col_to_bins(const X_DTYPE_C [:] data, const X_DTYPE_C [:] binning_thresholds, - const int has_missing_values, + const unsigned char has_missing_values, X_BINNED_DTYPE_C [:] binned): """Binary search to find the bin index for each value in the data.""" cdef: diff --git a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx index 33bf993dab7d8..f0bd29d429b5a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx @@ -36,8 +36,9 @@ def _predict_from_numeric_data(nodes, numeric_data, out): _predict_from_numeric_data_parallel(nodes, numeric_data, out) -def _predict_from_binned_data(nodes, binned_data, out): - _predict_from_binned_data_parallel(nodes, binned_data, out) +def _predict_from_binned_data(nodes, binned_data, has_missing_values, out): + _predict_from_binned_data_parallel(nodes, binned_data, + has_missing_values, out) cdef void _predict_from_numeric_data_parallel( @@ -81,18 +82,21 @@ cdef inline Y_DTYPE_C _predict_one_from_numeric_data( cdef void _predict_from_binned_data_parallel( node_struct [:] nodes, const X_BINNED_DTYPE_C [:, :] binned_data, + const unsigned char [:] has_missing_features, Y_DTYPE_C [:] out): cdef: int i for i in prange(binned_data.shape[0], schedule='static', nogil=True): - out[i] = _predict_one_from_binned_data(nodes, binned_data, i) + out[i] = _predict_one_from_binned_data(nodes, binned_data, + has_missing_features, i) cdef inline Y_DTYPE_C _predict_one_from_binned_data( node_struct [:] nodes, const X_BINNED_DTYPE_C [:, :] binned_data, + const unsigned char [:] has_missing_features, const int row) nogil: # Need to pass the whole array and the row index, else prange won't work. 
# See issue Cython #2798 @@ -103,7 +107,14 @@ cdef inline Y_DTYPE_C _predict_one_from_binned_data( while True: if node.is_leaf: return node.value - if binned_data[row, node.feature_idx] <= node.bin_threshold: - node = nodes[node.left] + if (has_missing_features[node.feature_idx] and + binned_data[row, node.feature_idx] == 0): + if node.missing_go_to_left: + node = nodes[node.left] + else: + node = nodes[node.right] else: - node = nodes[node.right] + if binned_data[row, node.feature_idx] <= node.bin_threshold: + node = nodes[node.left] + else: + node = nodes[node.right] diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 65ffd625af0cc..276cf37dfdf4d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -137,11 +137,12 @@ def fit(self, X, y=None): self : object """ X = check_array(X, dtype=[X_DTYPE], force_all_finite='allow-nan') - self.bin_thresholds_, self.has_missing_values_ = \ - _find_binning_thresholds( + self.bin_thresholds_, has_missing_values = _find_binning_thresholds( X, self.max_bins, subsample=self.subsample, random_state=self.random_state) + self.has_missing_values_ = np.array(has_missing_values, dtype=np.uint8) + self.actual_n_bins_ = np.array( [thresholds.shape[0] + 1 + has_missing_values for (thresholds, has_missing_values) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 242182b7be63e..d26ad80f55f80 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -2,6 +2,7 @@ # Author: Nicolas Hug from abc import ABC, abstractmethod +from functools import partial import numpy as np from timeit import default_timer as time @@ -281,7 +282,10 @@ def fit(self, X, y): if self._use_validation_data: for k, pred in enumerate(self._predictors[-1]): raw_predictions_val[k, :] += ( - pred.predict_binned(X_binned_val)) + pred.predict_binned( + X_binned_val, + self.bin_mapper_.has_missing_values_) + ) should_early_stop = self._check_early_stopping_loss( raw_predictions, y_train, @@ -458,8 +462,13 @@ def _raw_predict(self, X): raw_predictions += self._baseline_prediction for predictors_of_ith_iteration in self._predictors: for k, predictor in enumerate(predictors_of_ith_iteration): - predict = (predictor.predict_binned if is_binned - else predictor.predict) + if is_binned: + predict = partial( + predictor.predict_binned, + has_missing_values=self.bin_mapper_.has_missing_values_ + ) + else: + predict = predictor.predict raw_predictions[k, :] += predict(X) return raw_predictions diff --git a/sklearn/ensemble/_hist_gradient_boosting/predictor.py b/sklearn/ensemble/_hist_gradient_boosting/predictor.py index 7151c8d1017e6..f03692a1af118 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/predictor.py +++ b/sklearn/ensemble/_hist_gradient_boosting/predictor.py @@ -63,13 +63,18 @@ def predict(self, X): _predict_from_numeric_data(self.nodes, X, out) return out - def predict_binned(self, X): + def predict_binned(self, X, has_missing_values): """Predict raw values for binned data. Parameters ---------- X : ndarray, shape (n_samples, n_features) The input samples. + has_missing_values : ndarray, shape (n_features,) + Whether each feature has missing values (at fit time). 
+ This parameter is only needed for predict_binned(): we need to know + whether the first bin should be treated as the bin for missing + data. Returns ------- @@ -77,5 +82,5 @@ def predict_binned(self, X): The raw predicted values. """ out = np.empty(X.shape[0], dtype=Y_DTYPE) - _predict_from_binned_data(self.nodes, X, out) + _predict_from_binned_data(self.nodes, X, has_missing_values, out) return out diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index f8a9144fc3877..6a02b9993ac83 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -93,11 +93,8 @@ def test_map_to_bins(n_bins): bin_thresholds = _find_binning_thresholds(DATA, max_bins=n_bins, random_state=0) binned = np.zeros_like(DATA, dtype=X_BINNED_DTYPE, order='F') - actual_n_bins = np.array( - [len(thresholds) + 1 for thresholds in bin_thresholds], - dtype=np.uint32 - ) - _map_to_bins(DATA, bin_thresholds, actual_n_bins, binned) + has_missing_values = np.array([False] * DATA.shape[1], dtype=np.uint8) + _map_to_bins(DATA, bin_thresholds, has_missing_values, binned) assert binned.shape == DATA.shape assert binned.dtype == np.uint8 assert binned.flags.f_contiguous From bf822b494be6471cf5e93fbee29cd6ba35cdef71 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 23 May 2019 17:58:46 -0400 Subject: [PATCH 22/76] fixed bug due to offset in bin_thresholds_ attribute --- .../_hist_gradient_boosting/_binning.pyx | 4 ++-- .../_hist_gradient_boosting/binning.py | 23 +++++++++++++------ .../tests/test_grower.py | 5 ++-- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx index 18f0e7edce95e..51ed4901fa4f1 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx @@ -65,11 +65,11 @@ cpdef void _map_num_col_to_bins(const X_DTYPE_C [:] data, binned[i] = 0 else: # for known values, use binary search - left, right = 0, binning_thresholds.shape[0] + left, right = has_missing_values, binning_thresholds.shape[0] while left < right: middle = (right + left - 1) // 2 if data[i] <= binning_thresholds[middle]: right = middle else: left = middle + 1 - binned[i] = left + has_missing_values + binned[i] = left diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 276cf37dfdf4d..0a52bb50abb86 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -137,16 +137,25 @@ def fit(self, X, y=None): self : object """ X = check_array(X, dtype=[X_DTYPE], force_all_finite='allow-nan') - self.bin_thresholds_, has_missing_values = _find_binning_thresholds( - X, self.max_bins, subsample=self.subsample, - random_state=self.random_state) - + all_bin_thresholds, has_missing_values = _find_binning_thresholds( + X, self.max_bins, subsample=self.subsample, + random_state=self.random_state) + + # If there are missing value in a given feature, we prepend a fake + # threshold (nan) corresponding to the first bin were missing values + # are mapped. This threshold is never used in practice, but we use it + # to keep the indexes of the bins synchronized with the + # bin_thresholds_ attribute. 
+ for feature_idx, bin_thresholds in enumerate(all_bin_thresholds): + if has_missing_values[feature_idx]: + all_bin_thresholds[feature_idx] = \ + np.insert(bin_thresholds, 0, np.nan) + + self.bin_thresholds_ = all_bin_thresholds self.has_missing_values_ = np.array(has_missing_values, dtype=np.uint8) self.actual_n_bins_ = np.array( - [thresholds.shape[0] + 1 + has_missing_values - for (thresholds, has_missing_values) - in zip(self.bin_thresholds_, self.has_missing_values_)], + [thresholds.shape[0] + 1 for thresholds in self.bin_thresholds_], dtype=np.uint32) return self diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index df2a8d9f80169..d5272e24302b9 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -173,12 +173,13 @@ def test_predictor_from_grower(): [129, 255], [242, 100], ], dtype=np.uint8) - predictions = predictor.predict_binned(input_data) + has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + predictions = predictor.predict_binned(input_data, has_missing_values) expected_targets = [1, 1, 1, 1, 1, 1, -1, -1, -1] assert np.allclose(predictions, expected_targets) # Check that training set can be recovered exactly: - predictions = predictor.predict_binned(X_binned) + predictions = predictor.predict_binned(X_binned, has_missing_values) assert np.allclose(predictions, -all_gradients) From 112b4008c8bf2123ee0ebd4f7c1bdebab08a8b0a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 23 May 2019 18:12:37 -0400 Subject: [PATCH 23/76] more sensible binning strat --- .../_hist_gradient_boosting/_binning.pyx | 5 +---- .../ensemble/_hist_gradient_boosting/binning.py | 11 +++++++---- .../tests/test_binning.py | 17 ++++++++--------- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx index 51ed4901fa4f1..99637af296624 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx @@ -57,11 +57,8 @@ cpdef void _map_num_col_to_bins(const X_DTYPE_C [:] data, for i in prange(data.shape[0], schedule='static', nogil=True): - if isnan(data[i]): + if has_missing_values and isnan(data[i]): # unkown values are mapped to first bin - # Note that this is only correct if missing values were - # encountered at fit time (else non-missing values will also be - # mapped to this bin) binned[i] = 0 else: # for known values, use binary search diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 0a52bb50abb86..46d29f5b9e44d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -163,10 +163,13 @@ def fit(self, X, y=None): def transform(self, X): """Bin data X. - Missing values will be mapped to the first bin (whether or not missing - values were encountered at fit time). For this reason, `X` should be - the fitting data, though we do not enforce this. Note that the GBDT - code only ever uses mapper.fit_transform(), so this assumption is OK. + Missing values will be mapped to the first bin, but only if missing + values were encountered at fit time. Else, due to side effect of + comparing with NaNs (always results in False), missing values are + mapped to the last bin. 
+ For this reason, `X` should be the fitting data, though we do not + enforce this. Note that the GBDT code only ever uses + mapper.fit_transform(), so this assumption is OK. Parameters ---------- diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index 6a02b9993ac83..9d0ace8592ca2 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -290,17 +290,15 @@ def test_missing_values_support(max_bins, actual_n_bins, X_trans_expected): assert_array_equal(mapper.actual_n_bins_, actual_n_bins) assert_array_equal(mapper.has_missing_values_, [True, True, False]) X_trans = mapper.transform(X) - print() - print(X_trans) assert_array_equal(X_trans, X_trans_expected) def test_missing_values_different_X_fit_transform(): - # Test to illustrate the fact that missing values are always mapped to the - # first bin. + # Test to illustrate the fact that missing values are mapped to the + # first bin only if missing values were encountered at fit time. # If there are no missing values at fit time (second column), then during - # transform(), missing values are treated as the smallest values, which is - # not a desired behaviour in general. + # transform(), missing values will be mapped to the last bin, not a + # desired behaviour in general. # Note that in practice this case never happens, since the GBDT code only # ever uses mapper.fit_transform(). @@ -319,16 +317,17 @@ def test_missing_values_different_X_fit_transform(): X2 = [[1, 1], [3, 1], - [1, np.NaN], # Nan mapped in same bin as the smallest value + [1, np.NaN], # Nan mapped in biggest bin [2, 2], - [np.NaN, 2], # Nan mapped in a special bin, as expected + [np.NaN, 2], # Nan mapped in a first bin, as expected [1, 1]] X2_trans = mapper.transform(X2) X2_trans_expected = [[1, 0], [2, 0], - [1, 0], + [1, 1], [2, 1], [0, 1], [1, 0]] + print(X2_trans) assert_array_equal(X2_trans, X2_trans_expected) From 21a3ee37c2f06dcaeb0857af6555831c79ccdde6 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 23 May 2019 18:21:12 -0400 Subject: [PATCH 24/76] typo --- sklearn/ensemble/_hist_gradient_boosting/splitting.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 62bff071fb043..cdd9b0999daf3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -401,7 +401,7 @@ cdef class Splitter: # there are any missing values, we will also scan bins from # right to left. This way, we can consider whichever case # yields the best gain: either missing values go to the right - # (left to right scan) or to the left (left to right case). + # (left to right scan) or to the left (right to left case). 
# See algo 3 from the XGBoost paper # https://arxiv.org/abs/1603.02754 From 28c15b2257cd6d77e8815b7b6dfdf7efbfe70421 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 23 May 2019 19:36:27 -0400 Subject: [PATCH 25/76] user name --- doc/whats_new/v0.22.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index b0e56518da55b..118d5a38b9ba3 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -45,7 +45,7 @@ Changelog - |Fix| :class:`ensemble.HistGradientBoostingClassifier` and :class:`ensemble.HistGradientBoostingRegressor` now bin the training and validation data separately to avoid any data leak. :pr:`13933` by - :user:`NicolasHug`_. + `NicolasHug`_. :mod:`sklearn.linear_model` .................. From 5a5f39da4284896a16b348ac9b6370d5cd36c6a4 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 24 May 2019 14:55:53 -0400 Subject: [PATCH 26/76] Add small test --- .../tests/test_binning.py | 17 ++++++++++++----- .../tests/test_gradient_boosting.py | 2 +- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index 9d0ace8592ca2..35de3f11309a5 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -289,6 +289,14 @@ def test_missing_values_support(max_bins, actual_n_bins, X_trans_expected): mapper.fit(X) assert_array_equal(mapper.actual_n_bins_, actual_n_bins) assert_array_equal(mapper.has_missing_values_, [True, True, False]) + for feature_idx in range(X.shape[1]): + assert len(mapper.bin_thresholds_[feature_idx]) == \ + actual_n_bins[feature_idx] - 1 + for feature_idx in (0, 1): + # For features with missing values, we add a fake threshold (nan) to + # keep the bin_thresholds_ array synchronized with the bin values, i.e. + # bin k has threhold at index k. + assert np.isnan(mapper.bin_thresholds_[feature_idx][0]) X_trans = mapper.transform(X) assert_array_equal(X_trans, X_trans_expected) @@ -317,17 +325,16 @@ def test_missing_values_different_X_fit_transform(): X2 = [[1, 1], [3, 1], - [1, np.NaN], # Nan mapped in biggest bin + [1, np.NaN], [2, 2], - [np.NaN, 2], # Nan mapped in a first bin, as expected + [np.NaN, 2], [1, 1]] X2_trans = mapper.transform(X2) X2_trans_expected = [[1, 0], [2, 0], - [1, 1], + [1, 1], # Nan mapped in biggest bin (treated as 2) [2, 1], - [0, 1], + [0, 1], # Nan mapped in the first bin, as expected [1, 0]] - print(X2_trans) assert_array_equal(X2_trans, X2_trans_expected) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index be43f8dbf5c99..0b2657770255f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -148,7 +148,7 @@ def test_should_stop(scores, n_iter_no_change, tol, stopping): def test_missing_values(): - # sanity check for missing value support. With only one feature and + # sanity check for missing values support. With only one feature and # y == isnan(X), the gbdt is supposed to reach perfect accuracy. 
n_samples = 100 From b07fed94e21095ac9a2f2c68b768e1854bd17099 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 24 May 2019 15:20:41 -0400 Subject: [PATCH 27/76] convert to fortran array in tests --- sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py | 1 + .../ensemble/_hist_gradient_boosting/tests/test_splitting.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index d5272e24302b9..0fec7bb0cd4a2 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -318,6 +318,7 @@ def test_missing_value_predict_only(): rng = np.random.RandomState(0) n_samples = 100 X_binned = rng.randint(0, 256, size=(n_samples, 1), dtype=np.uint8) + X_binned = np.asfortranarray(X_binned) gradients = rng.normal(size=n_samples).astype(G_H_DTYPE) hessians = np.ones(shape=1, dtype=G_H_DTYPE) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index 05464b0c40807..b50c89ec5f387 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -320,7 +320,8 @@ def test_splitting_missing_values(X_binned, all_gradients, min_gain_to_split = 0. sample_indices = np.arange(n_samples, dtype=np.uint32) - X_binned = np.asfortranarray(X_binned, dtype=X_BINNED_DTYPE).reshape(-1, 1) + X_binned = np.array(X_binned, dtype=X_BINNED_DTYPE).reshape(-1, 1) + X_binned = np.asfortranarray(X_binned) all_gradients = np.array(all_gradients, dtype=G_H_DTYPE) has_missing_values = np.array([has_missing_values], dtype=np.uint8) all_hessians = np.ones(1, dtype=G_H_DTYPE) From b78e96b0d0bf377b007476bb135e45fb8921d5e4 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 24 May 2019 15:40:29 -0400 Subject: [PATCH 28/76] some doc --- doc/whats_new/v0.22.rst | 8 ++++++ .../_hist_gradient_boosting/binning.py | 2 +- .../gradient_boosting.py | 26 ++++++++++++++++--- 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 0518d6c9e0de4..a83da6c0189c5 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -39,6 +39,14 @@ Changelog :pr:`123456` by :user:`Joe Bloggs `. where 123456 is the *pull request* number, not the issue number. +:mod:`sklearn.ensemble` +.................. + +- |Feature| :class:`ensemble.HistGradientBoostingClassifier` + and :class:`ensemble.HistGradientBoostingRegressor` now natively supports + data with missing values both for training and predicting. :pr:`13911` by + `NicolasHug`_. + :mod:`sklearn.linear_model` .................. diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 46d29f5b9e44d..2ac89c6162f9c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -145,7 +145,7 @@ def fit(self, X, y=None): # threshold (nan) corresponding to the first bin were missing values # are mapped. This threshold is never used in practice, but we use it # to keep the indexes of the bins synchronized with the - # bin_thresholds_ attribute. + # bin_thresholds_ attribute: bin k is at index k. 
for feature_idx, bin_thresholds in enumerate(all_bin_thresholds): if has_missing_values[feature_idx]: all_bin_thresholds[feature_idx] = \ diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index d26ad80f55f80..83af91f1429f1 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -503,6 +503,14 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): might be preferred since binning may lead to split points that are too approximate in this setting. + This estimator has native support for missing values (Nans). During + training, the tree grower learns at each split point whether nodes with + missing values should go to the left or right child, based on the + potential gain. When predicting, nodes with missing values are assigned to + the left or right child consequently. If no missing values were encountered + for a given feature during training, then nodes with missing values are + mapped to whichever child has the most samples. + This implementation is inspired by `LightGBM `_. @@ -549,8 +557,9 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): The maximum number of bins to use. Before training, each feature of the input array ``X`` is binned into at most ``max_bins`` bins, which allows for a much faster training stage. Features with a small - number of unique values may use less than ``max_bins`` bins. Must be no - larger than 256. + number of unique values may use less than ``max_bins`` bins. One bin is + specifically allocated for missing values, if any. Must be no larger + than 256. scoring : str or callable or None, optional (default=None) Scoring parameter to use for early stopping. It can be a single string (see :ref:`scoring_parameter`) or a callable (see @@ -669,6 +678,14 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, might be preferred since binning may lead to split points that are too approximate in this setting. + This estimator has native support for missing values (Nans). During + training, the tree grower learns at each split point whether nodes with + missing values should go to the left or right child, based on the + potential gain. When predicting, nodes with missing values are assigned to + the left or right child consequently. If no missing values were encountered + for a given feature during training, then nodes with missing values are + mapped to whichever child has the most samples. + This implementation is inspired by `LightGBM `_. @@ -717,8 +734,9 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, The maximum number of bins to use. Before training, each feature of the input array ``X`` is binned into at most ``max_bins`` bins, which allows for a much faster training stage. Features with a small - number of unique values may use less than ``max_bins`` bins. Must be no - larger than 256. + number of unique values may use less than ``max_bins`` bins. One bin is + specifically allocated for missing values, if any. Must be no larger + than 256. scoring : str or callable or None, optional (default=None) Scoring parameter to use for early stopping. 
It can be a single string (see :ref:`scoring_parameter`) or a callable (see From a9f878c453db3e26e64d430c67ce90b6b8a5d4a0 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 24 May 2019 16:35:25 -0400 Subject: [PATCH 29/76] Added function test --- .../_hist_gradient_boosting/splitting.pyx | 2 +- .../tests/test_binning.py | 4 +- .../tests/test_gradient_boosting.py | 39 ++++++++++++++++++- 3 files changed, 41 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index cdd9b0999daf3..942ae155c7f5e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -546,7 +546,7 @@ cdef class Splitter: We scan node from right to left. This version is only called when there are missing values. If there's no missing value, calling - _find_best_bin_to_split_left_to_right is enough. If any, missing + _find_best_bin_to_split_left_to_right is enough. Missing values are assigned to the left node. """ diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index 35de3f11309a5..10ac2ebb7868e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -273,8 +273,8 @@ def test_missing_values_support(max_bins, actual_n_bins, X_trans_expected): # Note that the extra bin for missing values is only allocated if needed: # - no need to allocate extra bin for third column here - # - due to the extra bin, the features with missing values are "shifted" - # with an offset of 1 + # - due to the extra bin allocated for missing values, the bin values of + # feature 0 and 1 are "shifted" with an offset of 1 X = [[1, 1, 1], [np.NaN, np.NaN, 1], diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 0b2657770255f..acdac73bb3d01 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -147,7 +147,7 @@ def test_should_stop(scores, n_iter_no_change, tol, stopping): assert gbdt._should_stop(scores) == stopping -def test_missing_values(): +def test_missing_values_trivial(): # sanity check for missing values support. With only one feature and # y == isnan(X), the gbdt is supposed to reach perfect accuracy. 
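# A minimal usage sketch of the behaviour exercised here and in the test added
# below (illustrative only, not part of the patch; it assumes the
# ``enable_hist_gradient_boosting`` experimental import that these estimators
# require at this point):
#
#     import numpy as np
#     from sklearn.experimental import enable_hist_gradient_boosting  # noqa
#     from sklearn.ensemble import HistGradientBoostingClassifier
#
#     rng = np.random.RandomState(0)
#     X = rng.normal(size=(100, 1))
#     X[::2] = np.nan                     # half of the values are missing
#     y = np.isnan(X[:, 0]).astype(int)
#     clf = HistGradientBoostingClassifier().fit(X, y)  # no imputation needed
#     clf.score(X, y)                     # expected to reach 1.0, as asserted above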
@@ -163,3 +163,40 @@ def test_missing_values(): gb.fit(X, y) assert gb.score(X, y) == 1 + + +@pytest.mark.parametrize('problem', ('classification', 'regression')) +@pytest.mark.parametrize('missing_proportion, ' + 'expected_min_score_classification, ' + 'expected_min_score_regression', [ + (.1, .97, .9), + (.2, .94, .82), + (.5, .79, .52), +]) +def test_missing_values_resilience(problem, missing_proportion, + expected_min_score_classification, + expected_min_score_regression): + # Make sure the estimators can deal with missing values and still yield + # decent predictions + + rng = np.random.RandomState(0) + n_samples = 1000 + n_features = 2 + if problem == 'regression': + X, y = make_regression(n_samples=n_samples, n_features=n_features, + n_informative=n_features, random_state=rng) + gb = HistGradientBoostingRegressor() + expected_min_score = expected_min_score_regression + else: + X, y = make_classification(n_samples=n_samples, n_features=n_features, + n_informative=n_features, n_redundant=0, + n_repeated=0, random_state=rng) + gb = HistGradientBoostingClassifier() + expected_min_score = expected_min_score_classification + + mask = rng.binomial(1, missing_proportion, size=X.shape).astype(np.bool) + X[mask] = np.nan + + gb.fit(X, y) + + assert gb.score(X, y) > expected_min_score From 71b64e821f619c4c646fc1ec41ce2f41f380551f Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 24 May 2019 16:36:53 -0400 Subject: [PATCH 30/76] pep8 --- .../tests/test_gradient_boosting.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index acdac73bb3d01..dfa11af7b60ec 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -166,13 +166,12 @@ def test_missing_values_trivial(): @pytest.mark.parametrize('problem', ('classification', 'regression')) -@pytest.mark.parametrize('missing_proportion, ' - 'expected_min_score_classification, ' - 'expected_min_score_regression', [ - (.1, .97, .9), - (.2, .94, .82), - (.5, .79, .52), -]) +@pytest.mark.parametrize( + 'missing_proportion, expected_min_score_classification, ' + 'expected_min_score_regression', [ + (.1, .97, .9), + (.2, .94, .82), + (.5, .79, .52)]) def test_missing_values_resilience(problem, missing_proportion, expected_min_score_classification, expected_min_score_regression): From 2c2373e23bd9fae953c72099e43b9e67468f3ca2 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 25 May 2019 21:45:53 -0400 Subject: [PATCH 31/76] Bin validation data using binmaper of training data --- .../gradient_boosting.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 7a1926d0258ca..466181f445ee8 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -132,6 +132,7 @@ def fit(self, X, y): X_val, y_val = None, None # Bin the data + self.bin_mapper_ = _BinMapper(max_bins=self.max_bins, random_state=rng) X_binned_train = self._bin_data(X_train, rng, is_training_data=True) if X_val is not None: X_binned_val = self._bin_data(X_val, rng, is_training_data=False) @@ -390,20 +391,18 @@ def _bin_data(self, X, rng, is_training_data): 
print("Binning {:.3f} GB of {} data: ".format( X.nbytes / 1e9, description), end="", flush=True) tic = time() - bin_mapper = _BinMapper(max_bins=self.max_bins, random_state=rng) - X_binned = bin_mapper.fit_transform(X) # F-aligned array + if is_training_data: + X_binned = self.bin_mapper_.fit_transform(X) # F-aligned array + else: + X_binned = self.bin_mapper_.transform(X) # F-aligned array + # We convert the array to C-contiguous since predicting is faster + # with this layout (training is faster on F-arrays though) + X_binned = np.ascontiguousarray(X_binned) toc = time() if self.verbose: duration = toc - tic print("{:.3f} s".format(duration)) - if is_training_data: - self.bin_mapper_ = bin_mapper - else: - # Validation data. We convert the array to C-contiguous since - # predicting is faster with this layout (training is faster on - # F-arrays though) - X_binned = np.ascontiguousarray(X_binned) return X_binned def _print_iteration_stats(self, iteration_start_time): From deda3487ccd99b935099be7682d4abfed79efd99 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 27 May 2019 09:23:41 -0400 Subject: [PATCH 32/76] Allocate first bin for missing entries based on the whole data, not just training data. --- .../_hist_gradient_boosting/_binning.pyx | 18 +++--- .../_hist_gradient_boosting/_predictor.pyx | 12 ++-- .../_hist_gradient_boosting/binning.py | 61 +++++++++--------- .../gradient_boosting.py | 29 +++++++-- .../_hist_gradient_boosting/grower.py | 25 ++++++-- .../_hist_gradient_boosting/predictor.py | 12 ++-- .../_hist_gradient_boosting/splitting.pyx | 24 ++++--- .../tests/test_binning.py | 62 +++++++++++-------- .../tests/test_grower.py | 15 +++-- .../tests/test_splitting.py | 29 +++++---- 10 files changed, 177 insertions(+), 110 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx index 99637af296624..435a3eded2aa0 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx @@ -17,7 +17,7 @@ from .types cimport X_DTYPE_C, X_BINNED_DTYPE_C cpdef _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds, - const unsigned char [:] has_missing_values, + const unsigned char [:] support_missing_values, X_BINNED_DTYPE_C [::1, :] binned): """Bin numerical values to discrete integer-coded levels. @@ -28,8 +28,9 @@ cpdef _map_to_bins(const X_DTYPE_C [:, :] data, binning_thresholds : list of arrays For each feature, stores the increasing numeric values that are used to separate the bins. - has_missing_values : ndarray, shape (n_features,) - Whether each feature has missing values. + support_missing_values: ndarray, shape (n_features,) + For each feature, indicate whether the first bin is reserved for + missing values. binned : ndarray, shape (n_samples, n_features) Output array, must be fortran aligned. 
""" @@ -40,13 +41,13 @@ cpdef _map_to_bins(const X_DTYPE_C [:, :] data, _map_num_col_to_bins(data[:, feature_idx], binning_thresholds[feature_idx], - has_missing_values[feature_idx], + support_missing_values[feature_idx], binned[:, feature_idx]) cpdef void _map_num_col_to_bins(const X_DTYPE_C [:] data, const X_DTYPE_C [:] binning_thresholds, - const unsigned char has_missing_values, + const unsigned char support_missing_values, X_BINNED_DTYPE_C [:] binned): """Binary search to find the bin index for each value in the data.""" cdef: @@ -57,12 +58,13 @@ cpdef void _map_num_col_to_bins(const X_DTYPE_C [:] data, for i in prange(data.shape[0], schedule='static', nogil=True): - if has_missing_values and isnan(data[i]): - # unkown values are mapped to first bin + if isnan(data[i]): + # unkown values are mapped to first bin. For this to be correct, + # support_missing_values must have been correctly set at fit time. binned[i] = 0 else: # for known values, use binary search - left, right = has_missing_values, binning_thresholds.shape[0] + left, right = support_missing_values, binning_thresholds.shape[0] while left < right: middle = (right + left - 1) // 2 if data[i] <= binning_thresholds[middle]: diff --git a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx index f0bd29d429b5a..a851cba441c2e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx @@ -36,9 +36,9 @@ def _predict_from_numeric_data(nodes, numeric_data, out): _predict_from_numeric_data_parallel(nodes, numeric_data, out) -def _predict_from_binned_data(nodes, binned_data, has_missing_values, out): +def _predict_from_binned_data(nodes, binned_data, support_missing_values, out): _predict_from_binned_data_parallel(nodes, binned_data, - has_missing_values, out) + support_missing_values, out) cdef void _predict_from_numeric_data_parallel( @@ -82,7 +82,7 @@ cdef inline Y_DTYPE_C _predict_one_from_numeric_data( cdef void _predict_from_binned_data_parallel( node_struct [:] nodes, const X_BINNED_DTYPE_C [:, :] binned_data, - const unsigned char [:] has_missing_features, + const unsigned char [:] support_missing_values, Y_DTYPE_C [:] out): cdef: @@ -90,13 +90,13 @@ cdef void _predict_from_binned_data_parallel( for i in prange(binned_data.shape[0], schedule='static', nogil=True): out[i] = _predict_one_from_binned_data(nodes, binned_data, - has_missing_features, i) + support_missing_values, i) cdef inline Y_DTYPE_C _predict_one_from_binned_data( node_struct [:] nodes, const X_BINNED_DTYPE_C [:, :] binned_data, - const unsigned char [:] has_missing_features, + const unsigned char [:] support_missing_values, const int row) nogil: # Need to pass the whole array and the row index, else prange won't work. 
# See issue Cython #2798 @@ -107,7 +107,7 @@ cdef inline Y_DTYPE_C _predict_one_from_binned_data( while True: if node.is_leaf: return node.value - if (has_missing_features[node.feature_idx] and + if (support_missing_values[node.feature_idx] and binned_data[row, node.feature_idx] == 0): if node.missing_go_to_left: node = nodes[node.left] diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 8ecfd27ca9221..18aae5200e621 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -16,7 +16,8 @@ from .types import X_DTYPE, X_BINNED_DTYPE -def _find_binning_thresholds(data, max_bins, subsample, random_state): +def _find_binning_thresholds(data, max_bins, support_missing_values, + subsample, random_state): """Extract feature-wise quantiles from numerical data. Missing values are ignored for finding the thresholds. @@ -29,6 +30,9 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): The maximum number of bins to use. If for a given feature the number of unique values is less than ``max_bins``, then those unique values will be used to compute the bin thresholds, instead of the quantiles. + support_missing_values : ndarray, shape (n_features,) + For each feature, indicates whether the first bin should be reserved + for missing values. subsample : int or None If ``n_samples > subsample``, then ``sub_samples`` samples will be randomly choosen to compute the quantiles. If ``None``, the whole data @@ -55,19 +59,15 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): data = data.take(subset, axis=0) binning_thresholds = [] - has_missing_values = [] for f_idx in range(data.shape[1]): col_data = data[:, f_idx] # ignore missing values when computing bin thresholds missing_mask = np.isnan(col_data) if missing_mask.any(): col_data = col_data[~missing_mask] - has_missing_values.append(True) - else: - has_missing_values.append(False) col_data = np.ascontiguousarray(col_data, dtype=X_DTYPE) distinct_values = np.unique(col_data) - if len(distinct_values) + has_missing_values[-1] <= max_bins: + if len(distinct_values) + support_missing_values[f_idx] <= max_bins: midpoints = distinct_values[:-1] + distinct_values[1:] midpoints *= .5 else: @@ -76,14 +76,14 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): # np.unique(col_data, return_counts) instead but this is more # work and the performance benefit will be limited because we # work on a fixed-size subsample of the full data. - n_percentiles = (max_bins if has_missing_values[-1] + n_percentiles = (max_bins if support_missing_values[f_idx] else max_bins + 1) percentiles = np.linspace(0, 100, num=n_percentiles) percentiles = percentiles[1:-1] midpoints = np.percentile(col_data, percentiles, interpolation='midpoint').astype(X_DTYPE) binning_thresholds.append(midpoints) - return binning_thresholds, has_missing_values + return binning_thresholds class _BinMapper(BaseEstimator, TransformerMixin): @@ -120,7 +120,7 @@ def __init__(self, max_bins=256, subsample=int(2e5), random_state=None): self.subsample = subsample self.random_state = random_state - def fit(self, X, y=None): + def fit(self, X, support_missing_values=False): """Fit data X by computing the binning thresholds. The first bin is reserved for missing values, if any. @@ -129,30 +129,35 @@ def fit(self, X, y=None): ---------- X : array-like, shape (n_samples, n_features) The data to bin. - y: None - Ignored. 
+ support_missing_values : ndarray of bool or bool, shape (n_features,) + For each feature, indicates whether the first bin should be + reserved for missing values. Returns ------- self : object """ X = check_array(X, dtype=[X_DTYPE], force_all_finite='allow-nan') - all_bin_thresholds, has_missing_values = _find_binning_thresholds( - X, self.max_bins, subsample=self.subsample, - random_state=self.random_state) - - # If there are missing value in a given feature, we prepend a fake - # threshold (nan) corresponding to the first bin were missing values - # are mapped. This threshold is never used in practice, but we use it - # to keep the indexes of the bins synchronized with the - # bin_thresholds_ attribute: bin k is at index k. + + if isinstance(support_missing_values, bool): + support_missing_values = \ + np.array([support_missing_values] * X.shape[1], dtype=np.uint8) + self.support_missing_values = support_missing_values + + all_bin_thresholds = _find_binning_thresholds( + X, self.max_bins, self.support_missing_values, + subsample=self.subsample, random_state=self.random_state) + + # If the first bin is reserved for missing vaules, we prepend a fake + # threshold (nan) for the first bin. This threshold is never used in + # practice, but we use it to keep the indexes of the bins synchronized + # with the bin_thresholds_ attribute: bin k is at index k. for feature_idx, bin_thresholds in enumerate(all_bin_thresholds): - if has_missing_values[feature_idx]: + if support_missing_values[feature_idx]: all_bin_thresholds[feature_idx] = \ np.insert(bin_thresholds, 0, np.nan) self.bin_thresholds_ = all_bin_thresholds - self.has_missing_values_ = np.array(has_missing_values, dtype=np.uint8) self.actual_n_bins_ = np.array( [thresholds.shape[0] + 1 for thresholds in self.bin_thresholds_], @@ -163,13 +168,8 @@ def fit(self, X, y=None): def transform(self, X): """Bin data X. - Missing values will be mapped to the first bin, but only if missing - values were encountered at fit time. Else, due to side effect of - comparing with NaNs (always results in False), missing values are - mapped to the last bin. - For this reason, `X` should be the fitting data, though we do not - enforce this. Note that the GBDT code only ever uses - mapper.fit_transform(), so this assumption is OK. + Missing values will be mapped to the first bin, provided that the + support_missing_values parameter was correctly set when calling fit(). Parameters ---------- @@ -190,5 +190,6 @@ def transform(self, X): X.shape[1]) ) binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order='F') - _map_to_bins(X, self.bin_thresholds_, self.has_missing_values_, binned) + _map_to_bins(X, self.bin_thresholds_, self.support_missing_values, + binned) return binned diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index f3dcf1311d12a..585e2925e9c64 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -103,6 +103,12 @@ def fit(self, X, y): self._validate_parameters() self.n_features_ = X.shape[1] # used for validation in predict() + # support_missing_values_ indicates whether the first bin should be + # reserved for missing values. In order for the training and + # validation data to be treated equally, we need to determine this + # before the train/val split. 
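        # For instance, with a hypothetical X of [[1., np.nan], [2., 3.]],
        # np.isnan(X).any(axis=0) is [False, True]: only the second feature
        # will have its first bin reserved for missing values.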
+ self.support_missing_values_ = np.isnan(X).any(axis=0).astype(np.uint8) + # we need this stateful variable to tell raw_predict() that it was # called from fit() (this current method), and that the data it has # received is pre-binned. @@ -128,9 +134,11 @@ def fit(self, X, y): X_train, X_val, y_train, y_val = train_test_split( X, y, test_size=self.validation_fraction, stratify=stratify, random_state=rng) + has_missing_values = np.isnan(X_train).any(axis=0).astype(np.uint8) else: X_train, y_train = X, y X_val, y_val = None, None + has_missing_values = self.support_missing_values_ # Bin the data self.bin_mapper_ = _BinMapper(max_bins=self.max_bins, random_state=rng) @@ -243,7 +251,8 @@ def fit(self, X, y): X_binned_train, gradients[k, :], hessians[k, :], max_bins=self.max_bins, actual_n_bins=self.bin_mapper_.actual_n_bins_, - has_missing_values=self.bin_mapper_.has_missing_values_, + has_missing_values=has_missing_values, + support_missing_values=self.support_missing_values_, max_leaf_nodes=self.max_leaf_nodes, max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf, @@ -276,7 +285,7 @@ def fit(self, X, y): raw_predictions_val[k, :] += ( pred.predict_binned( X_binned_val, - self.bin_mapper_.has_missing_values_) + self.support_missing_values_) ) should_early_stop = self._check_early_stopping_loss( @@ -396,13 +405,21 @@ def _bin_data(self, X, rng, is_training_data): print("Binning {:.3f} GB of {} data: ".format( X.nbytes / 1e9, description), end="", flush=True) tic = time() + if is_training_data: - X_binned = self.bin_mapper_.fit_transform(X) # F-aligned array - else: - X_binned = self.bin_mapper_.transform(X) # F-aligned array + # Fit X. If missing values were found in the original data (before + # any train/val split), the first bin is reserved for missing + # values, even if there aren't missing value in the training data. + self.bin_mapper_.fit( + X, support_missing_values=self.support_missing_values_) + + X_binned = self.bin_mapper_.transform(X) # F-aligned array + + if not is_training_data: # We convert the array to C-contiguous since predicting is faster # with this layout (training is faster on F-arrays though) X_binned = np.ascontiguousarray(X_binned) + toc = time() if self.verbose: duration = toc - tic @@ -483,7 +500,7 @@ def _raw_predict(self, X): if is_binned: predict = partial( predictor.predict_binned, - has_missing_values=self.bin_mapper_.has_missing_values_ + support_missing_values=self.support_missing_values_ ) else: predict = predictor.predict diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 277e6e150ba7e..0b186612533ce 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -145,8 +145,15 @@ class TreeGrower: have the same number of bins. If None, all features are considered to have ``max_bins`` bins. has_missing_values : ndarray of bool or bool, optional (default=False) - Whether each feature contains missing values. If it's a bool, the same - values is used for all features. + Whether each feature contains missing values (in the training data). + If it's a bool, the same values is used for all features. + support_missing_values : ndarray of bool or bool, optional (default=False) + Whether the first bin is reserved for missing values, for each + feature. Naturally, has_missing_values implies + support_missing_values. 
However, support_missing_values might True + while has_missing_values is False, in the case where there are + missing values in the training data before the train/val split, but + not after. If it's a bool, the same values is used for all features. l2_regularization : float, optional (default=0) The L2 regularization parameter. min_hessian_to_split : float, optional (default=1e-3) @@ -160,8 +167,8 @@ class TreeGrower: def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, max_depth=None, min_samples_leaf=20, min_gain_to_split=0., max_bins=256, actual_n_bins=None, has_missing_values=False, - l2_regularization=0., min_hessian_to_split=1e-3, - shrinkage=1.): + support_missing_values=False, l2_regularization=0., + min_hessian_to_split=1e-3, shrinkage=1.): self._validate_parameters(X_binned, max_leaf_nodes, max_depth, min_samples_leaf, min_gain_to_split, @@ -181,13 +188,19 @@ def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, has_missing_values = [has_missing_values] * actual_n_bins.shape[0] has_missing_values = np.array(has_missing_values, dtype=np.uint8) + if isinstance(support_missing_values, bool): + support_missing_values = \ + [support_missing_values] * actual_n_bins.shape[0] + support_missing_values = np.array(support_missing_values, + dtype=np.uint8) + hessians_are_constant = hessians.shape[0] == 1 self.histogram_builder = HistogramBuilder( X_binned, max_bins, gradients, hessians, hessians_are_constant) self.splitter = Splitter( X_binned, max_bins, actual_n_bins, has_missing_values, - l2_regularization, min_hessian_to_split, min_samples_leaf, - min_gain_to_split, hessians_are_constant) + support_missing_values, l2_regularization, min_hessian_to_split, + min_samples_leaf, min_gain_to_split, hessians_are_constant) self.max_leaf_nodes = max_leaf_nodes self.max_bins = max_bins self.has_missing_values = has_missing_values diff --git a/sklearn/ensemble/_hist_gradient_boosting/predictor.py b/sklearn/ensemble/_hist_gradient_boosting/predictor.py index f03692a1af118..b1ac900f59674 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/predictor.py +++ b/sklearn/ensemble/_hist_gradient_boosting/predictor.py @@ -63,18 +63,16 @@ def predict(self, X): _predict_from_numeric_data(self.nodes, X, out) return out - def predict_binned(self, X, has_missing_values): + def predict_binned(self, X, support_missing_values): """Predict raw values for binned data. Parameters ---------- X : ndarray, shape (n_samples, n_features) The input samples. - has_missing_values : ndarray, shape (n_features,) - Whether each feature has missing values (at fit time). - This parameter is only needed for predict_binned(): we need to know - whether the first bin should be treated as the bin for missing - data. + support_missing_values : ndarray, shape (n_features,) + For each feature, indicates whether the first bin is reserved + for missing values. Returns ------- @@ -82,5 +80,5 @@ def predict_binned(self, X, has_missing_values): The raw predicted values. 
""" out = np.empty(X.shape[0], dtype=Y_DTYPE) - _predict_from_binned_data(self.nodes, X, has_missing_values, out) + _predict_from_binned_data(self.nodes, X, support_missing_values, out) return out diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 942ae155c7f5e..5ee492b73dfde 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -102,7 +102,14 @@ cdef class Splitter: The actual number of bins needed for each feature, which is lower or equal to max_bins. has_missing_values : ndarray, shape (n_features,) - Whether each feature contains missing values. + Whether each feature contains missing values (in the training data). + support_missing_values : ndarray, shape (n_features,) + Whether the first bin is reserved for missing values, for each + feature. Naturally, has_missing_values implies + support_missing_values. However, support_missing_values might True + while has_missing_values is False, in the case where there are + missing values in the training data before the train/val split, but + not after. l2_regularization : float The L2 regularization parameter. min_hessian_to_split : float, default=1e-3 @@ -122,7 +129,8 @@ cdef class Splitter: unsigned int n_features unsigned int max_bins unsigned int [::1] actual_n_bins - unsigned char [::1] has_missing_values, + unsigned char [::1] has_missing_values + unsigned char [::1] support_missing_values unsigned char hessians_are_constant Y_DTYPE_C l2_regularization Y_DTYPE_C min_hessian_to_split @@ -138,6 +146,7 @@ cdef class Splitter: unsigned int max_bins, np.ndarray[np.uint32_t] actual_n_bins, np.ndarray[np.uint8_t] has_missing_values, + np.ndarray[np.uint8_t] support_missing_values, Y_DTYPE_C l2_regularization, Y_DTYPE_C min_hessian_to_split=1e-3, unsigned int min_samples_leaf=20, @@ -151,6 +160,7 @@ cdef class Splitter: self.max_bins = max_bins self.actual_n_bins = actual_n_bins self.has_missing_values = has_missing_values + self.support_missing_values = support_missing_values self.l2_regularization = l2_regularization self.min_hessian_to_split = min_hessian_to_split self.min_samples_leaf = min_samples_leaf @@ -249,8 +259,8 @@ cdef class Splitter: X_BINNED_DTYPE_C bin_idx = split_info.bin_idx unsigned char missing_go_to_left = split_info.missing_go_to_left int feature_idx = split_info.feature_idx - unsigned char has_missing_values = \ - self.has_missing_values[feature_idx] + unsigned char support_missing_values = \ + self.support_missing_values[feature_idx] const X_BINNED_DTYPE_C [::1] X_binned = \ self.X_binned[:, feature_idx] unsigned int [::1] left_indices_buffer = self.left_indices_buffer @@ -295,7 +305,7 @@ cdef class Splitter: stop = start + sizes[thread_idx] for i in range(start, stop): sample_idx = sample_indices[i] - if (has_missing_values and X_binned[sample_idx] == 0): + if (support_missing_values and X_binned[sample_idx] == 0): if missing_go_to_left: left_indices_buffer[start + left_count] = sample_idx left_count = left_count + 1 @@ -481,8 +491,8 @@ cdef class Splitter: sum_gradient_left, sum_hessian_left = 0., 0. 
n_samples_left = 0 - if self.has_missing_values[feature_idx]: - # if there are missing values (in the first bin), skip it + if self.support_missing_values[feature_idx]: + # if first bin is reserved for missing values, skip it start = 1 for bin_idx in range(start, end): diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index 10ac2ebb7868e..463774ef0cfc7 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -20,8 +20,9 @@ def _find_binning_thresholds(data, max_bins=256, subsample=int(2e5), random_state=None): # Just a redef to avoid having to pass arguments all the time (as the # function is private we don't use default values for parameters) - binning_thresholds, _ = _find_binning_thresholds_orig( - data, max_bins, subsample, random_state) + support_missing_values = np.array([False] * data.shape[1], dtype=np.uint8) + binning_thresholds = _find_binning_thresholds_orig( + data, max_bins, support_missing_values, subsample, random_state) return binning_thresholds @@ -271,7 +272,8 @@ def test_missing_values_support(max_bins, actual_n_bins, X_trans_expected): # check for missing values: make sure nans are mapped to the first bin # and that attributes are correct - # Note that the extra bin for missing values is only allocated if needed: + # The extra bin for missing values is only allocated if + # support_missing_values is True: # - no need to allocate extra bin for third column here # - due to the extra bin allocated for missing values, the bin values of # feature 0 and 1 are "shifted" with an offset of 1 @@ -286,16 +288,16 @@ def test_missing_values_support(max_bins, actual_n_bins, X_trans_expected): X = np.array(X) mapper = _BinMapper(max_bins=max_bins) - mapper.fit(X) + support_missing_values = np.array([True, True, False], dtype=np.uint8) + mapper.fit(X, support_missing_values) assert_array_equal(mapper.actual_n_bins_, actual_n_bins) - assert_array_equal(mapper.has_missing_values_, [True, True, False]) for feature_idx in range(X.shape[1]): assert len(mapper.bin_thresholds_[feature_idx]) == \ actual_n_bins[feature_idx] - 1 for feature_idx in (0, 1): - # For features with missing values, we add a fake threshold (nan) to - # keep the bin_thresholds_ array synchronized with the bin values, i.e. - # bin k has threhold at index k. + # If the first bin is reserved, we add a fake threshold (nan) to keep + # the bin_thresholds_ array synchronized with the bin values, i.e. bin + # k has threhold at index k. assert np.isnan(mapper.bin_thresholds_[feature_idx][0]) X_trans = mapper.transform(X) assert_array_equal(X_trans, X_trans_expected) @@ -303,25 +305,33 @@ def test_missing_values_support(max_bins, actual_n_bins, X_trans_expected): def test_missing_values_different_X_fit_transform(): # Test to illustrate the fact that missing values are mapped to the - # first bin only if missing values were encountered at fit time. - # If there are no missing values at fit time (second column), then during - # transform(), missing values will be mapped to the last bin, not a - # desired behaviour in general. - - # Note that in practice this case never happens, since the GBDT code only - # ever uses mapper.fit_transform(). - - X = [[1, 1], - [np.NaN, 1], - [1, 1], - [2, 2], - [2, 2], - [1, 1]] + # first bin, even if the first bin isn't allocated for missing values. 
+ # The first bin is only reserved for missing values if + # support_missing_values is passed as True for a given feature. + # This means that if it is set to False and missing values are encountered + # during transform(), the missing values are incorrectly treated as the + # smallest values (which are also mapped to the first bin). + + # In practice, this does not happen since: + # - We only call transform() on the training and validation data + # - We set the support_missing_values parameter according to the *whole* + # data, i.e. union(training, validation). + # So if we ever call transform() and there is a missing value, the first + # bin would have been correctly reserved. + + X = [[1, 1], + [1, 1], + [1, 1], + [2, 2], + [2, 2], + [1, 1]] X = np.array(X) mapper = _BinMapper() - mapper.fit(X) + + support_missing_values = np.array([True, False], dtype=np.uint8) + mapper.fit(X, support_missing_values) X2 = [[1, 1], [3, 1], @@ -333,8 +343,10 @@ def test_missing_values_different_X_fit_transform(): X2_trans = mapper.transform(X2) X2_trans_expected = [[1, 0], [2, 0], - [1, 1], # Nan mapped in biggest bin (treated as 2) + [1, 0], # Nan mapped in first bin (treated as 1) [2, 1], - [0, 1], # Nan mapped in the first bin, as expected + [0, 1], # Nan mapped in the first bin, alone [1, 0]] + # Note also how the bins of the first feature have bin shifted because the + # first bin is reserved, while they aren't shifted for the second feature assert_array_equal(X2_trans, X2_trans_expected) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index 0fec7bb0cd4a2..08c4597e80427 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -310,10 +310,15 @@ def test_init_parameters_validation(): min_hessian_to_split=-1) -def test_missing_value_predict_only(): +@pytest.mark.parametrize('support_missing_values', [True, False]) +def test_missing_value_predict_only(support_missing_values): # Make sure that missing values are supported at predict time even if they - # were not encountered during fit time: the missing values are assigned to - # whichever child has the most samples + # were not encountered in the training data: the missing values are + # assigned to whichever child has the most samples. + + # Passing support_missing_values=True tests the case where missing values + # were in the original data (train + val), but not present anymore in the + # tradata after the train/val split. 
rng = np.random.RandomState(0) n_samples = 100 @@ -323,7 +328,9 @@ def test_missing_value_predict_only(): gradients = rng.normal(size=n_samples).astype(G_H_DTYPE) hessians = np.ones(shape=1, dtype=G_H_DTYPE) - grower = TreeGrower(X_binned, gradients, hessians, min_samples_leaf=5) + grower = TreeGrower(X_binned, gradients, hessians, min_samples_leaf=5, + has_missing_values=False, + support_missing_values=support_missing_values) grower.grow() predictor = grower.make_predictor() diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index b50c89ec5f387..2df7ea2dff57e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -37,6 +37,7 @@ def test_histogram_split(n_bins): dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + support_missing_values = has_missing_values builder = HistogramBuilder(X_binned, n_bins, all_gradients, @@ -46,6 +47,7 @@ def test_histogram_split(n_bins): n_bins, actual_n_bins, has_missing_values, + support_missing_values, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, @@ -101,11 +103,13 @@ def test_gradient_and_hessian_sanity(constant_hessian): actual_n_bins = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + support_missing_values = has_missing_values builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, constant_hessian) splitter = Splitter(X_binned, n_bins, actual_n_bins, has_missing_values, - l2_regularization, min_hessian_to_split, - min_samples_leaf, min_gain_to_split, constant_hessian) + support_missing_values, l2_regularization, + min_hessian_to_split, min_samples_leaf, + min_gain_to_split, constant_hessian) hists_parent = builder.compute_histograms_brute(sample_indices) si_parent = splitter.find_node_split(sample_indices, hists_parent, @@ -198,13 +202,14 @@ def test_split_indices(): actual_n_bins = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + support_missing_values = has_missing_values builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant) splitter = Splitter(X_binned, n_bins, actual_n_bins, has_missing_values, - l2_regularization, min_hessian_to_split, - min_samples_leaf, min_gain_to_split, - hessians_are_constant) + support_missing_values, l2_regularization, + min_hessian_to_split, min_samples_leaf, + min_gain_to_split, hessians_are_constant) assert np.all(sample_indices == splitter.partition) @@ -255,12 +260,13 @@ def test_min_gain_to_split(): actual_n_bins = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + support_missing_values = has_missing_values builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant) splitter = Splitter(X_binned, n_bins, actual_n_bins, has_missing_values, - l2_regularization, min_hessian_to_split, - min_samples_leaf, min_gain_to_split, - hessians_are_constant) + support_missing_values, l2_regularization, + min_hessian_to_split, min_samples_leaf, + min_gain_to_split, hessians_are_constant) histograms = builder.compute_histograms_brute(sample_indices) split_info = splitter.find_node_split(sample_indices, histograms, @@ -324,6 +330,7 @@ 
def test_splitting_missing_values(X_binned, all_gradients, X_binned = np.asfortranarray(X_binned) all_gradients = np.array(all_gradients, dtype=G_H_DTYPE) has_missing_values = np.array([has_missing_values], dtype=np.uint8) + support_missing_values = has_missing_values all_hessians = np.ones(1, dtype=G_H_DTYPE) sum_gradients = all_gradients.sum() sum_hessians = 1 * n_samples @@ -334,9 +341,9 @@ def test_splitting_missing_values(X_binned, all_gradients, all_gradients, all_hessians, hessians_are_constant) splitter = Splitter(X_binned, max_bins, actual_n_bins, has_missing_values, - l2_regularization, min_hessian_to_split, - min_samples_leaf, min_gain_to_split, - hessians_are_constant) + support_missing_values, l2_regularization, + min_hessian_to_split, min_samples_leaf, + min_gain_to_split, hessians_are_constant) histograms = builder.compute_histograms_brute(sample_indices) split_info = splitter.find_node_split(sample_indices, histograms, From 3fed0ab39a786fc8ea24f9e1544dc91a105425ba Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 30 May 2019 10:43:28 -0400 Subject: [PATCH 33/76] Addressed Thomas' comments --- .../_hist_gradient_boosting/binning.py | 38 ++++++++++--------- .../_hist_gradient_boosting/splitting.pyx | 23 +++++------ 2 files changed, 30 insertions(+), 31 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 18aae5200e621..cdf253cf61def 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -46,9 +46,8 @@ def _find_binning_thresholds(data, max_bins, support_missing_values, binning_thresholds: list of arrays For each feature, stores the increasing numeric values that can be used to separate the bins. Thus ``len(binning_thresholds) == - n_features``. - has_missing_values: list of bool - For each feature, indicates whether missing values were encountered. + n_features``. If support_missing_values is True for a given feature, + the first threshold is set to NaN. """ if not (2 <= max_bins <= 256): raise ValueError('max_bins={} should be no smaller than 2 ' @@ -76,12 +75,19 @@ def _find_binning_thresholds(data, max_bins, support_missing_values, # np.unique(col_data, return_counts) instead but this is more # work and the performance benefit will be limited because we # work on a fixed-size subsample of the full data. - n_percentiles = (max_bins if support_missing_values[f_idx] - else max_bins + 1) + n_percentiles = max_bins + 1 - support_missing_values[f_idx] percentiles = np.linspace(0, 100, num=n_percentiles) percentiles = percentiles[1:-1] midpoints = np.percentile(col_data, percentiles, interpolation='midpoint').astype(X_DTYPE) + + # If the first bin is reserved for missing vaules, we prepend a fake + # threshold (nan) for the first bin. This threshold is never used in + # practice, but we use it to keep the indexes of the bins synchronized + # with the bin_thresholds_ attribute: bin k must be at index k. + if support_missing_values[f_idx]: + midpoints = np.insert(midpoints, 0, np.nan) + binning_thresholds.append(midpoints) return binning_thresholds @@ -131,7 +137,12 @@ def fit(self, X, support_missing_values=False): The data to bin. support_missing_values : ndarray of bool or bool, shape (n_features,) For each feature, indicates whether the first bin should be - reserved for missing values. + reserved for missing values. Note that inferring this from X would + be incorrect here, in general. 
The X that is passed here is the + training data (after the train/val split). + support_missing_values must be computed on the whole data + (before the split) so that the first bin is allocated if there are + missing values in the training data OR in the validation data. Returns ------- @@ -142,21 +153,12 @@ def fit(self, X, support_missing_values=False): if isinstance(support_missing_values, bool): support_missing_values = \ np.array([support_missing_values] * X.shape[1], dtype=np.uint8) - self.support_missing_values = support_missing_values + self.support_missing_values_ = support_missing_values all_bin_thresholds = _find_binning_thresholds( - X, self.max_bins, self.support_missing_values, + X, self.max_bins, self.support_missing_values_, subsample=self.subsample, random_state=self.random_state) - # If the first bin is reserved for missing vaules, we prepend a fake - # threshold (nan) for the first bin. This threshold is never used in - # practice, but we use it to keep the indexes of the bins synchronized - # with the bin_thresholds_ attribute: bin k is at index k. - for feature_idx, bin_thresholds in enumerate(all_bin_thresholds): - if support_missing_values[feature_idx]: - all_bin_thresholds[feature_idx] = \ - np.insert(bin_thresholds, 0, np.nan) - self.bin_thresholds_ = all_bin_thresholds self.actual_n_bins_ = np.array( @@ -190,6 +192,6 @@ def transform(self, X): X.shape[1]) ) binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order='F') - _map_to_bins(X, self.bin_thresholds_, self.support_missing_values, + _map_to_bins(X, self.bin_thresholds_, self.support_missing_values_, binned) return binned diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 65d367c649380..c4d7e079c2d81 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -124,9 +124,9 @@ cdef class Splitter: cdef public: const X_BINNED_DTYPE_C [::1, :] X_binned unsigned int n_features - unsigned int [::1] actual_n_bins - unsigned char [::1] has_missing_values - unsigned char [::1] support_missing_values + const unsigned int [::1] actual_n_bins + const unsigned char [::1] has_missing_values + const unsigned char [::1] support_missing_values unsigned char hessians_are_constant Y_DTYPE_C l2_regularization Y_DTYPE_C min_hessian_to_split @@ -139,9 +139,9 @@ cdef class Splitter: def __init__(self, const X_BINNED_DTYPE_C [::1, :] X_binned, - np.ndarray[np.uint32_t] actual_n_bins, - np.ndarray[np.uint8_t] has_missing_values, - np.ndarray[np.uint8_t] support_missing_values, + const unsigned int [::1] actual_n_bins, + const unsigned char [::1] has_missing_values, + const unsigned char [::1] support_missing_values, Y_DTYPE_C l2_regularization, Y_DTYPE_C min_hessian_to_split=1e-3, unsigned int min_samples_leaf=20, @@ -383,7 +383,7 @@ cdef class Splitter: int n_features = self.n_features split_info_struct split_info split_info_struct * split_infos - unsigned char [:] has_missing_values = self.has_missing_values + const unsigned char [:] has_missing_values = self.has_missing_values with nogil: n_samples = sample_indices.shape[0] @@ -472,7 +472,8 @@ cdef class Splitter: unsigned int n_samples_left unsigned int n_samples_right unsigned int n_samples_ = n_samples - unsigned int start = 0 + # if first bin is reserved for missing values, skip it + unsigned int start = self.support_missing_values[feature_idx] unsigned int end = self.actual_n_bins[feature_idx] - 1 Y_DTYPE_C sum_hessian_left 
Y_DTYPE_C sum_hessian_right @@ -487,9 +488,6 @@ cdef class Splitter: sum_hessians, self.l2_regularization) - if self.support_missing_values[feature_idx]: - # if first bin is reserved for missing values, skip it - start = 1 for bin_idx in range(start, end): # Note that considering splitting on the last bin is useless since @@ -569,8 +567,7 @@ cdef class Splitter: Y_DTYPE_C gain unsigned int start = self.actual_n_bins[feature_idx] - 2 - # n_bins - 2 is the index of the second to last bin, which we consider - # being on the right child. + # n_bins - 2 is the index of the second to last bin sum_gradient_right, sum_hessian_right = 0., 0. n_samples_right = 0 negative_loss_current_node = negative_loss(sum_gradients, From 7ad5bce2ad5d88398db39619dfdad2ec819b2ee4 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 30 May 2019 11:35:01 -0400 Subject: [PATCH 34/76] Update sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py --- sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index 08c4597e80427..133558d2bc2f3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -318,7 +318,7 @@ def test_missing_value_predict_only(support_missing_values): # Passing support_missing_values=True tests the case where missing values # were in the original data (train + val), but not present anymore in the - # tradata after the train/val split. + # training data after the train/val split. rng = np.random.RandomState(0) n_samples = 100 From e83b39e0311929621fb3bd482bdbefd35ec7b84d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 3 Jun 2019 12:00:54 -0400 Subject: [PATCH 35/76] Addressed Guillaume's comments --- doc/whats_new/v0.22.rst | 4 ++-- sklearn/ensemble/_hist_gradient_boosting/binning.py | 3 ++- .../ensemble/_hist_gradient_boosting/gradient_boosting.py | 4 ++-- sklearn/ensemble/_hist_gradient_boosting/grower.py | 6 +++--- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index e27461ae3ac91..2671b612a7d6f 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -44,8 +44,8 @@ Changelog - |Feature| :class:`ensemble.HistGradientBoostingClassifier` and :class:`ensemble.HistGradientBoostingRegressor` now natively supports - data with missing values both for training and predicting. :pr:`13911` by - `NicolasHug`_. + dense data with missing values both for training and predicting. + :pr:`13911` by `NicolasHug`_. - |Fix| :class:`ensemble.HistGradientBoostingClassifier` and :class:`ensemble.HistGradientBoostingRegressor` now bin the training and diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index cdf253cf61def..d32a8aab0c1b7 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -135,7 +135,7 @@ def fit(self, X, support_missing_values=False): ---------- X : array-like, shape (n_samples, n_features) The data to bin. - support_missing_values : ndarray of bool or bool, shape (n_features,) + support_missing_values : bool or ndarray of bool, shape (n_features,) For each feature, indicates whether the first bin should be reserved for missing values. 
Note that inferring this from X would be incorrect here, in general. The X that is passed here is the @@ -143,6 +143,7 @@ def fit(self, X, support_missing_values=False): support_missing_values must be computed on the whole data (before the split) so that the first bin is allocated if there are missing values in the training data OR in the validation data. + If it's a bool, the same values is used for all features. Returns ------- diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 585e2925e9c64..daeb6ae1e4e0f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -538,7 +538,7 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): might be preferred since binning may lead to split points that are too approximate in this setting. - This estimator has native support for missing values (Nans). During + This estimator has native support for missing values (NaNs). During training, the tree grower learns at each split point whether nodes with missing values should go to the left or right child, based on the potential gain. When predicting, nodes with missing values are assigned to @@ -713,7 +713,7 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, might be preferred since binning may lead to split points that are too approximate in this setting. - This estimator has native support for missing values (Nans). During + This estimator has native support for missing values (NaNs). During training, the tree grower learns at each split point whether nodes with missing values should go to the left or right child, based on the potential gain. 
When predicting, nodes with missing values are assigned to diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 48090654b4a19..9f64c4a3a0f5b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -187,13 +187,13 @@ def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, if isinstance(has_missing_values, bool): has_missing_values = [has_missing_values] * actual_n_bins.shape[0] - has_missing_values = np.array(has_missing_values, dtype=np.uint8) + has_missing_values = np.asarray(has_missing_values, dtype=np.uint8) if isinstance(support_missing_values, bool): support_missing_values = \ [support_missing_values] * actual_n_bins.shape[0] - support_missing_values = np.array(support_missing_values, - dtype=np.uint8) + support_missing_values = np.asarray(support_missing_values, + dtype=np.uint8) hessians_are_constant = hessians.shape[0] == 1 self.histogram_builder = HistogramBuilder( From d2de00b86d6abefc8194c569934f8e3abd0bc524 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 15 Jun 2019 17:57:28 -0400 Subject: [PATCH 36/76] always allocate first bin for missing values --- .../_hist_gradient_boosting/_binning.pyx | 11 +- .../_hist_gradient_boosting/_predictor.pyx | 8 +- .../_hist_gradient_boosting/binning.py | 61 ++---- .../gradient_boosting.py | 45 ++-- .../_hist_gradient_boosting/grower.py | 23 +- .../_hist_gradient_boosting/predictor.py | 7 +- .../_hist_gradient_boosting/splitting.pyx | 19 +- .../tests/test_binning.py | 198 +++++++----------- .../tests/test_gradient_boosting.py | 2 +- .../tests/test_grower.py | 21 +- .../tests/test_splitting.py | 66 +++--- 11 files changed, 161 insertions(+), 300 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx index 2ada9b9fd352a..273d257beed97 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx @@ -17,7 +17,6 @@ from .types cimport X_DTYPE_C, X_BINNED_DTYPE_C cpdef _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds, - const unsigned char [:] support_missing_values, X_BINNED_DTYPE_C [::1, :] binned): """Bin numerical values to discrete integer-coded levels. @@ -28,9 +27,6 @@ cpdef _map_to_bins(const X_DTYPE_C [:, :] data, binning_thresholds : list of arrays For each feature, stores the increasing numeric values that are used to separate the bins. - support_missing_values: ndarray, shape (n_features,) - For each feature, indicate whether the first bin is reserved for - missing values. binned : ndarray, shape (n_samples, n_features) Output array, must be fortran aligned. """ @@ -41,13 +37,11 @@ cpdef _map_to_bins(const X_DTYPE_C [:, :] data, _map_num_col_to_bins(data[:, feature_idx], binning_thresholds[feature_idx], - support_missing_values[feature_idx], binned[:, feature_idx]) cdef void _map_num_col_to_bins(const X_DTYPE_C [:] data, const X_DTYPE_C [:] binning_thresholds, - const unsigned char support_missing_values, X_BINNED_DTYPE_C [:] binned): """Binary search to find the bin index for each value in the data.""" cdef: @@ -59,12 +53,11 @@ cdef void _map_num_col_to_bins(const X_DTYPE_C [:] data, for i in prange(data.shape[0], schedule='static', nogil=True): if isnan(data[i]): - # unkown values are mapped to first bin. For this to be correct, - # support_missing_values must have been correctly set at fit time. 
+ # unkown values are mapped to first bin. binned[i] = 0 else: # for known values, use binary search - left, right = support_missing_values, binning_thresholds.shape[0] + left, right = 1, binning_thresholds.shape[0] while left < right: middle = (right + left - 1) // 2 if data[i] <= binning_thresholds[middle]: diff --git a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx index c4de0043cbbf9..78db4af23b072 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx @@ -58,21 +58,18 @@ cdef inline Y_DTYPE_C _predict_one_from_numeric_data( def _predict_from_binned_data( node_struct [:] nodes, const X_BINNED_DTYPE_C [:, :] binned_data, - const unsigned char [:] support_missing_values, Y_DTYPE_C [:] out): cdef: int i for i in prange(binned_data.shape[0], schedule='static', nogil=True): - out[i] = _predict_one_from_binned_data(nodes, binned_data, - support_missing_values, i) + out[i] = _predict_one_from_binned_data(nodes, binned_data, i) cdef inline Y_DTYPE_C _predict_one_from_binned_data( node_struct [:] nodes, const X_BINNED_DTYPE_C [:, :] binned_data, - const unsigned char [:] support_missing_values, const int row) nogil: # Need to pass the whole array and the row index, else prange won't work. # See issue Cython #2798 @@ -83,8 +80,7 @@ cdef inline Y_DTYPE_C _predict_one_from_binned_data( while True: if node.is_leaf: return node.value - if (support_missing_values[node.feature_idx] and - binned_data[row, node.feature_idx] == 0): + if binned_data[row, node.feature_idx] == 0: # missing value if node.missing_go_to_left: node = nodes[node.left] else: diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index d32a8aab0c1b7..8125cb9a9db25 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -16,8 +16,7 @@ from .types import X_DTYPE, X_BINNED_DTYPE -def _find_binning_thresholds(data, max_bins, support_missing_values, - subsample, random_state): +def _find_binning_thresholds(data, max_bins, subsample, random_state): """Extract feature-wise quantiles from numerical data. Missing values are ignored for finding the thresholds. @@ -30,9 +29,6 @@ def _find_binning_thresholds(data, max_bins, support_missing_values, The maximum number of bins to use. If for a given feature the number of unique values is less than ``max_bins``, then those unique values will be used to compute the bin thresholds, instead of the quantiles. - support_missing_values : ndarray, shape (n_features,) - For each feature, indicates whether the first bin should be reserved - for missing values. subsample : int or None If ``n_samples > subsample``, then ``sub_samples`` samples will be randomly choosen to compute the quantiles. If ``None``, the whole data @@ -46,8 +42,7 @@ def _find_binning_thresholds(data, max_bins, support_missing_values, binning_thresholds: list of arrays For each feature, stores the increasing numeric values that can be used to separate the bins. Thus ``len(binning_thresholds) == - n_features``. If support_missing_values is True for a given feature, - the first threshold is set to NaN. + n_features``. The first threshold (for missing values) is always NaN. 
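To make the threshold layout at this point of the series concrete, here is a small sketch (toy values, not part of the patch) of the midpoints and the NaN placeholder described above:

    import numpy as np

    col = np.array([1.0, 2.0, 4.0, 8.0])               # distinct non-missing values
    distinct = np.unique(col)
    midpoints = (distinct[:-1] + distinct[1:]) * 0.5    # [1.5, 3.0, 6.0]
    thresholds = np.insert(midpoints, 0, np.nan)        # [nan, 1.5, 3.0, 6.0]
    # bin 0 is reserved for missing values; a non-missing value v goes to the
    # first bin k >= 1 with v <= thresholds[k], or to the last bin if v is
    # larger than every threshold, so bin k is always described by thresholds[k]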
""" if not (2 <= max_bins <= 256): raise ValueError('max_bins={} should be no smaller than 2 ' @@ -66,7 +61,7 @@ def _find_binning_thresholds(data, max_bins, support_missing_values, col_data = col_data[~missing_mask] col_data = np.ascontiguousarray(col_data, dtype=X_DTYPE) distinct_values = np.unique(col_data) - if len(distinct_values) + support_missing_values[f_idx] <= max_bins: + if len(distinct_values) <= max_bins - 1: # - 1 for missing values bin midpoints = distinct_values[:-1] + distinct_values[1:] midpoints *= .5 else: @@ -75,20 +70,19 @@ def _find_binning_thresholds(data, max_bins, support_missing_values, # np.unique(col_data, return_counts) instead but this is more # work and the performance benefit will be limited because we # work on a fixed-size subsample of the full data. - n_percentiles = max_bins + 1 - support_missing_values[f_idx] - percentiles = np.linspace(0, 100, num=n_percentiles) + percentiles = np.linspace(0, 100, num=max_bins) percentiles = percentiles[1:-1] midpoints = np.percentile(col_data, percentiles, interpolation='midpoint').astype(X_DTYPE) - # If the first bin is reserved for missing vaules, we prepend a fake - # threshold (nan) for the first bin. This threshold is never used in - # practice, but we use it to keep the indexes of the bins synchronized - # with the bin_thresholds_ attribute: bin k must be at index k. - if support_missing_values[f_idx]: - midpoints = np.insert(midpoints, 0, np.nan) + # We prepend a fake threshold (nan) for the first bin (reserved for + # missing values). This threshold is never used in practice, but we + # use it to keep the indexes of the bins synchronized with the + # bin_thresholds_ attribute: bin k must be at index k. + midpoints = np.insert(midpoints, 0, np.nan) binning_thresholds.append(midpoints) + return binning_thresholds @@ -110,8 +104,10 @@ class _BinMapper(BaseEstimator, TransformerMixin): max_bins : int, optional (default=256) The maximum number of bins to use (including the bin for missing values, if any). If for a given feature the number of unique values - is less than ``max_bins``, then those unique values will be used to - compute the bin thresholds, instead of the quantiles. + is less than ``max_bins - 1``, then those unique values will be used + to compute the bin thresholds, instead of the quantiles. The first bin + is always reserved for missing values, so the number of bins used + for non-missing values is actually ``max_bins - 1``. subsample : int or None, optional (default=2e5) If ``n_samples > subsample``, then ``sub_samples`` samples will be randomly choosen to compute the quantiles. If ``None``, the whole data @@ -126,24 +122,16 @@ def __init__(self, max_bins=256, subsample=int(2e5), random_state=None): self.subsample = subsample self.random_state = random_state - def fit(self, X, support_missing_values=False): + def fit(self, X): """Fit data X by computing the binning thresholds. - The first bin is reserved for missing values, if any. + The first bin is reserved for missing values, whether there are + missing values or not. Parameters ---------- X : array-like, shape (n_samples, n_features) The data to bin. - support_missing_values : bool or ndarray of bool, shape (n_features,) - For each feature, indicates whether the first bin should be - reserved for missing values. Note that inferring this from X would - be incorrect here, in general. The X that is passed here is the - training data (after the train/val split). 
- support_missing_values must be computed on the whole data - (before the split) so that the first bin is allocated if there are - missing values in the training data OR in the validation data. - If it's a bool, the same values is used for all features. Returns ------- @@ -151,14 +139,9 @@ def fit(self, X, support_missing_values=False): """ X = check_array(X, dtype=[X_DTYPE], force_all_finite='allow-nan') - if isinstance(support_missing_values, bool): - support_missing_values = \ - np.array([support_missing_values] * X.shape[1], dtype=np.uint8) - self.support_missing_values_ = support_missing_values - all_bin_thresholds = _find_binning_thresholds( - X, self.max_bins, self.support_missing_values_, - subsample=self.subsample, random_state=self.random_state) + X, self.max_bins, subsample=self.subsample, + random_state=self.random_state) self.bin_thresholds_ = all_bin_thresholds @@ -171,8 +154,7 @@ def fit(self, X, support_missing_values=False): def transform(self, X): """Bin data X. - Missing values will be mapped to the first bin, provided that the - support_missing_values parameter was correctly set when calling fit(). + Missing values will be mapped to the first bin. Parameters ---------- @@ -193,6 +175,5 @@ def transform(self, X): X.shape[1]) ) binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order='F') - _map_to_bins(X, self.bin_thresholds_, self.support_missing_values_, - binned) + _map_to_bins(X, self.bin_thresholds_, binned) return binned diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 309f26fa8a4ba..91b89a4123f40 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -2,7 +2,6 @@ # Author: Nicolas Hug from abc import ABC, abstractmethod -from functools import partial import numpy as np from timeit import default_timer as time @@ -103,12 +102,6 @@ def fit(self, X, y): self._validate_parameters() self.n_features_ = X.shape[1] # used for validation in predict() - # support_missing_values_ indicates whether the first bin should be - # reserved for missing values. In order for the training and - # validation data to be treated equally, we need to determine this - # before the train/val split. - self.support_missing_values_ = np.isnan(X).any(axis=0).astype(np.uint8) - # we need this stateful variable to tell raw_predict() that it was # called from fit() (this current method), and that the data it has # received is pre-binned. 
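The net effect of the binning changes above can be sketched with the private mapper (a toy example, not part of the patch, assuming the behaviour introduced in this commit):

    import numpy as np
    from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper

    X = np.array([[1.0], [2.0], [np.nan], [3.0]])
    mapper = _BinMapper(max_bins=4).fit(X)
    X_binned = mapper.transform(X)
    # expected layout at this point of the series: the NaN row is mapped to
    # bin 0 (always reserved for missing values), while 1.0, 2.0 and 3.0 are
    # mapped to bins 1, 2 and 3 respectively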
@@ -134,11 +127,11 @@ def fit(self, X, y): X_train, X_val, y_train, y_val = train_test_split( X, y, test_size=self.validation_fraction, stratify=stratify, random_state=rng) - has_missing_values = np.isnan(X_train).any(axis=0).astype(np.uint8) else: X_train, y_train = X, y X_val, y_val = None, None - has_missing_values = self.support_missing_values_ + + has_missing_values = np.isnan(X_train).any(axis=0).astype(np.uint8) # Bin the data self.bin_mapper_ = _BinMapper(max_bins=self.max_bins, random_state=rng) @@ -252,7 +245,6 @@ def fit(self, X, y): max_bins=self.max_bins, actual_n_bins=self.bin_mapper_.actual_n_bins_, has_missing_values=has_missing_values, - support_missing_values=self.support_missing_values_, max_leaf_nodes=self.max_leaf_nodes, max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf, @@ -282,11 +274,8 @@ def fit(self, X, y): # Update raw_predictions_val with the newest tree(s) if self._use_validation_data: for k, pred in enumerate(self._predictors[-1]): - raw_predictions_val[k, :] += ( - pred.predict_binned( - X_binned_val, - self.support_missing_values_) - ) + raw_predictions_val[k, :] += \ + pred.predict_binned(X_binned_val) should_early_stop = self._check_early_stopping_loss( raw_predictions, y_train, @@ -410,8 +399,7 @@ def _bin_data(self, X, rng, is_training_data): # Fit X. If missing values were found in the original data (before # any train/val split), the first bin is reserved for missing # values, even if there aren't missing value in the training data. - self.bin_mapper_.fit( - X, support_missing_values=self.support_missing_values_) + self.bin_mapper_.fit(X) X_binned = self.bin_mapper_.transform(X) # F-aligned array @@ -497,13 +485,8 @@ def _raw_predict(self, X): raw_predictions += self._baseline_prediction for predictors_of_ith_iteration in self._predictors: for k, predictor in enumerate(predictors_of_ith_iteration): - if is_binned: - predict = partial( - predictor.predict_binned, - support_missing_values=self.support_missing_values_ - ) - else: - predict = predictor.predict + predict = (predictor.predict_binned if is_binned + else predictor.predict) raw_predictions[k, :] += predict(X) return raw_predictions @@ -592,9 +575,10 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): The maximum number of bins to use. Before training, each feature of the input array ``X`` is binned into at most ``max_bins`` bins, which allows for a much faster training stage. Features with a small - number of unique values may use less than ``max_bins`` bins. One bin is - specifically allocated for missing values, if any. Must be no larger - than 256. + number of unique values may use less than ``max_bins`` bins. The + first bin is specifically allocated for missing values, whether they + exist or not. As a result, the number of bins used for non-missing + values is at most ``max_bins - 1``. Must be no larger than 256. scoring : str or callable or None, optional (default=None) Scoring parameter to use for early stopping. It can be a single string (see :ref:`scoring_parameter`) or a callable (see @@ -769,9 +753,10 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, The maximum number of bins to use. Before training, each feature of the input array ``X`` is binned into at most ``max_bins`` bins, which allows for a much faster training stage. Features with a small - number of unique values may use less than ``max_bins`` bins. One bin is - specifically allocated for missing values, if any. Must be no larger - than 256. 
+ number of unique values may use less than ``max_bins`` bins. The + first bin is specifically allocated for missing values, whether they + exist or not. As a result, the number of bins used for non-missing + values is at most ``max_bins - 1``. Must be no larger than 256. scoring : str or callable or None, optional (default=None) Scoring parameter to use for early stopping. It can be a single string (see :ref:`scoring_parameter`) or a callable (see diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 8a98b68310b69..e03c72d54e163 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -152,13 +152,6 @@ class TreeGrower: has_missing_values : ndarray of bool or bool, optional (default=False) Whether each feature contains missing values (in the training data). If it's a bool, the same values is used for all features. - support_missing_values : ndarray of bool or bool, optional (default=False) - Whether the first bin is reserved for missing values, for each - feature. Naturally, has_missing_values implies - support_missing_values. However, support_missing_values might True - while has_missing_values is False, in the case where there are - missing values in the training data before the train/val split, but - not after. If it's a bool, the same values is used for all features. l2_regularization : float, optional (default=0) The L2 regularization parameter. min_hessian_to_split : float, optional (default=1e-3) @@ -172,8 +165,8 @@ class TreeGrower: def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, max_depth=None, min_samples_leaf=20, min_gain_to_split=0., max_bins=256, actual_n_bins=None, has_missing_values=False, - support_missing_values=False, l2_regularization=0., - min_hessian_to_split=1e-3, shrinkage=1.): + l2_regularization=0., min_hessian_to_split=1e-3, + shrinkage=1.): self._validate_parameters(X_binned, max_leaf_nodes, max_depth, min_samples_leaf, min_gain_to_split, @@ -193,19 +186,13 @@ def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, has_missing_values = [has_missing_values] * actual_n_bins.shape[0] has_missing_values = np.asarray(has_missing_values, dtype=np.uint8) - if isinstance(support_missing_values, bool): - support_missing_values = \ - [support_missing_values] * actual_n_bins.shape[0] - support_missing_values = np.asarray(support_missing_values, - dtype=np.uint8) - hessians_are_constant = hessians.shape[0] == 1 self.histogram_builder = HistogramBuilder( X_binned, max_bins, gradients, hessians, hessians_are_constant) self.splitter = Splitter( - X_binned, actual_n_bins, has_missing_values, - support_missing_values, l2_regularization, min_hessian_to_split, - min_samples_leaf, min_gain_to_split, hessians_are_constant) + X_binned, actual_n_bins, has_missing_values, l2_regularization, + min_hessian_to_split, min_samples_leaf, min_gain_to_split, + hessians_are_constant) self.max_leaf_nodes = max_leaf_nodes self.max_bins = max_bins self.has_missing_values = has_missing_values diff --git a/sklearn/ensemble/_hist_gradient_boosting/predictor.py b/sklearn/ensemble/_hist_gradient_boosting/predictor.py index 6ffe0bc933ae1..d5586986d8a5c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/predictor.py +++ b/sklearn/ensemble/_hist_gradient_boosting/predictor.py @@ -46,16 +46,13 @@ def predict(self, X): _predict_from_numeric_data(self.nodes, X, out) return out - def predict_binned(self, X, support_missing_values): + 
def predict_binned(self, X): """Predict raw values for binned data. Parameters ---------- X : ndarray, shape (n_samples, n_features) The input samples. - support_missing_values : ndarray, shape (n_features,) - For each feature, indicates whether the first bin is reserved - for missing values. Returns ------- @@ -63,5 +60,5 @@ def predict_binned(self, X, support_missing_values): The raw predicted values. """ out = np.empty(X.shape[0], dtype=Y_DTYPE) - _predict_from_binned_data(self.nodes, X, support_missing_values, out) + _predict_from_binned_data(self.nodes, X, out) return out diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index c64bb0477c7e3..08de0e7c74413 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -100,13 +100,6 @@ cdef class Splitter: equal to max_bins. has_missing_values : ndarray, shape (n_features,) Whether each feature contains missing values (in the training data). - support_missing_values : ndarray, shape (n_features,) - Whether the first bin is reserved for missing values, for each - feature. Naturally, has_missing_values implies - support_missing_values. However, support_missing_values might True - while has_missing_values is False, in the case where there are - missing values in the training data before the train/val split, but - not after. l2_regularization : float The L2 regularization parameter. min_hessian_to_split : float, default=1e-3 @@ -126,7 +119,6 @@ cdef class Splitter: unsigned int n_features const unsigned int [::1] actual_n_bins const unsigned char [::1] has_missing_values - const unsigned char [::1] support_missing_values unsigned char hessians_are_constant Y_DTYPE_C l2_regularization Y_DTYPE_C min_hessian_to_split @@ -141,7 +133,6 @@ cdef class Splitter: const X_BINNED_DTYPE_C [::1, :] X_binned, const unsigned int [::1] actual_n_bins, const unsigned char [::1] has_missing_values, - const unsigned char [::1] support_missing_values, Y_DTYPE_C l2_regularization, Y_DTYPE_C min_hessian_to_split=1e-3, unsigned int min_samples_leaf=20, @@ -152,7 +143,6 @@ cdef class Splitter: self.n_features = X_binned.shape[1] self.actual_n_bins = actual_n_bins self.has_missing_values = has_missing_values - self.support_missing_values = support_missing_values self.l2_regularization = l2_regularization self.min_hessian_to_split = min_hessian_to_split self.min_samples_leaf = min_samples_leaf @@ -251,8 +241,6 @@ cdef class Splitter: X_BINNED_DTYPE_C bin_idx = split_info.bin_idx unsigned char missing_go_to_left = split_info.missing_go_to_left int feature_idx = split_info.feature_idx - unsigned char support_missing_values = \ - self.support_missing_values[feature_idx] const X_BINNED_DTYPE_C [::1] X_binned = \ self.X_binned[:, feature_idx] unsigned int [::1] left_indices_buffer = self.left_indices_buffer @@ -297,7 +285,7 @@ cdef class Splitter: stop = start + sizes[thread_idx] for i in range(start, stop): sample_idx = sample_indices[i] - if (support_missing_values and X_binned[sample_idx] == 0): + if X_binned[sample_idx] == 0: # missing value if missing_go_to_left: left_indices_buffer[start + left_count] = sample_idx left_count = left_count + 1 @@ -470,8 +458,6 @@ cdef class Splitter: unsigned int n_samples_left unsigned int n_samples_right unsigned int n_samples_ = n_samples - # if first bin is reserved for missing values, skip it - unsigned int start = self.support_missing_values[feature_idx] # Note that considering 
splitting on the last bin is useless since # it would result in having 0 samples in the right node (forbidden) unsigned int end = self.actual_n_bins[feature_idx] - 1 @@ -489,7 +475,8 @@ cdef class Splitter: self.l2_regularization) - for bin_idx in range(start, end): + for bin_idx in range(1, end): + # we skip the first bin which is reserved for missing values n_samples_left += histograms[feature_idx, bin_idx].count n_samples_right = n_samples_ - n_samples_left diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index 463774ef0cfc7..c5a6c9cc83fef 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -20,37 +20,38 @@ def _find_binning_thresholds(data, max_bins=256, subsample=int(2e5), random_state=None): # Just a redef to avoid having to pass arguments all the time (as the # function is private we don't use default values for parameters) - support_missing_values = np.array([False] * data.shape[1], dtype=np.uint8) binning_thresholds = _find_binning_thresholds_orig( - data, max_bins, support_missing_values, subsample, random_state) + data, max_bins, subsample, random_state) return binning_thresholds def test_find_binning_thresholds_regular_data(): data = np.linspace(0, 10, 1001).reshape(-1, 1) - bin_thresholds = _find_binning_thresholds(data, max_bins=10) - assert_allclose(bin_thresholds[0], [1, 2, 3, 4, 5, 6, 7, 8, 9]) + bin_thresholds = _find_binning_thresholds(data, max_bins=11) + assert_allclose(bin_thresholds[0], [np.nan, 1, 2, 3, 4, 5, 6, 7, 8, 9]) assert len(bin_thresholds) == 1 - bin_thresholds = _find_binning_thresholds(data, max_bins=5) - assert_allclose(bin_thresholds[0], [2, 4, 6, 8]) + bin_thresholds = _find_binning_thresholds(data, max_bins=6) + assert_allclose(bin_thresholds[0], [np.nan, 2, 4, 6, 8]) assert len(bin_thresholds) == 1 def test_find_binning_thresholds_small_regular_data(): data = np.linspace(0, 10, 11).reshape(-1, 1) - bin_thresholds = _find_binning_thresholds(data, max_bins=5) - assert_allclose(bin_thresholds[0], [2, 4, 6, 8]) - - bin_thresholds = _find_binning_thresholds(data, max_bins=10) - assert_allclose(bin_thresholds[0], [1, 2, 3, 4, 5, 6, 7, 8, 9]) + bin_thresholds = _find_binning_thresholds(data, max_bins=6) + assert_allclose(bin_thresholds[0], [np.nan, 2, 4, 6, 8]) bin_thresholds = _find_binning_thresholds(data, max_bins=11) - assert_allclose(bin_thresholds[0], np.arange(10) + .5) + assert_allclose(bin_thresholds[0], [np.nan, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + + bin_thresholds = _find_binning_thresholds(data, max_bins=12) + expected = np.arange(10) + .5 + expected = np.insert(expected, 0, np.nan) + assert_allclose(bin_thresholds[0], expected) bin_thresholds = _find_binning_thresholds(data, max_bins=255) - assert_allclose(bin_thresholds[0], np.arange(10) + .5) + assert_allclose(bin_thresholds[0], expected) def test_find_binning_thresholds_random_data(): @@ -94,8 +95,7 @@ def test_map_to_bins(n_bins): bin_thresholds = _find_binning_thresholds(DATA, max_bins=n_bins, random_state=0) binned = np.zeros_like(DATA, dtype=X_BINNED_DTYPE, order='F') - has_missing_values = np.array([False] * DATA.shape[1], dtype=np.uint8) - _map_to_bins(DATA, bin_thresholds, has_missing_values, binned) + _map_to_bins(DATA, bin_thresholds, binned) assert binned.shape == DATA.shape assert binned.dtype == np.uint8 assert binned.flags.f_contiguous @@ -104,7 +104,7 @@ def test_map_to_bins(n_bins): max_indices = 
DATA.argmax(axis=0) for feature_idx, min_idx in enumerate(min_indices): - assert binned[min_idx, feature_idx] == 0 + assert binned[min_idx, feature_idx] == 1 for feature_idx, max_idx in enumerate(max_indices): assert binned[max_idx, feature_idx] == n_bins - 1 @@ -113,15 +113,12 @@ def test_map_to_bins(n_bins): def test_bin_mapper_random_data(n_bins): n_samples, n_features = DATA.shape - expected_count_per_bin = n_samples // n_bins - tol = int(0.05 * expected_count_per_bin) - mapper = _BinMapper(max_bins=n_bins, random_state=42).fit(DATA) binned = mapper.transform(DATA) assert binned.shape == (n_samples, n_features) assert binned.dtype == np.uint8 - assert_array_equal(binned.min(axis=0), np.array([0, 0])) + assert_array_equal(binned.min(axis=0), np.array([1, 1])) assert_array_equal(binned.max(axis=0), np.array([n_bins - 1, n_bins - 1])) assert len(mapper.bin_thresholds_) == n_features for bin_thresholds_feature in mapper.bin_thresholds_: @@ -129,41 +126,47 @@ def test_bin_mapper_random_data(n_bins): assert bin_thresholds_feature.dtype == DATA.dtype assert np.all(mapper.actual_n_bins_ == n_bins) - # Check that the binned data is approximately balanced across bins. + # Check that the binned data is approximately balanced across bins + # (ignoring first bin since there are no missing values) + expected_count_per_bin = n_samples // (n_bins - 1) + tol = int(0.05 * expected_count_per_bin) for feature_idx in range(n_features): - for bin_idx in range(n_bins): + for bin_idx in range(1, n_bins): count = (binned[:, feature_idx] == bin_idx).sum() assert abs(count - expected_count_per_bin) < tol -@pytest.mark.parametrize("n_samples, n_bins", [ +@pytest.mark.parametrize("n_samples, n_bins_for_non_missing", [ (5, 5), (5, 10), (5, 11), (42, 255) ]) -def test_bin_mapper_small_random_data(n_samples, n_bins): +def test_bin_mapper_small_random_data(n_samples, n_bins_for_non_missing): data = np.random.RandomState(42).normal(size=n_samples).reshape(-1, 1) assert len(np.unique(data)) == n_samples - mapper = _BinMapper(max_bins=n_bins, random_state=42) + max_bins = n_bins_for_non_missing + 1 # first bin reserved + mapper = _BinMapper(max_bins=max_bins, random_state=42) binned = mapper.fit_transform(data) assert binned.shape == data.shape assert binned.dtype == np.uint8 assert_array_equal(binned.ravel()[np.argsort(data.ravel())], - np.arange(n_samples)) + np.arange(n_samples) + 1) -@pytest.mark.parametrize("n_bins, n_distinct, multiplier", [ +@pytest.mark.parametrize("n_bins_for_non_missing, n_distinct, multiplier", [ (5, 5, 1), (5, 5, 3), (255, 12, 42), ]) -def test_bin_mapper_identity_repeated_values(n_bins, n_distinct, multiplier): +def test_bin_mapper_identity_repeated_values(n_bins_for_non_missing, + n_distinct, multiplier): data = np.array(list(range(n_distinct)) * multiplier).reshape(-1, 1) - binned = _BinMapper(max_bins=n_bins).fit_transform(data) - assert_array_equal(data, binned) + max_bins = n_bins_for_non_missing + 1 # first bin reserved + binned = _BinMapper(max_bins=max_bins).fit_transform(data) + assert_array_equal(data, binned - 1) @pytest.mark.parametrize('n_distinct', [2, 7, 42]) @@ -179,9 +182,9 @@ def test_bin_mapper_repeated_values_invariance(n_distinct): data = data.reshape(-1, 1) - mapper_1 = _BinMapper(max_bins=n_distinct) + mapper_1 = _BinMapper(max_bins=n_distinct + 1) binned_1 = mapper_1.fit_transform(data) - assert_array_equal(np.unique(binned_1[:, 0]), np.arange(n_distinct)) + assert_array_equal(np.unique(binned_1[:, 0]), np.arange(n_distinct) + 1) # Adding more bins to the mapper 
yields the same results (same thresholds) mapper_2 = _BinMapper(max_bins=min(256, n_distinct * 3)) @@ -191,15 +194,17 @@ def test_bin_mapper_repeated_values_invariance(n_distinct): assert_array_equal(binned_1, binned_2) -@pytest.mark.parametrize("n_bins, scale, offset", [ +@pytest.mark.parametrize("n_bins_for_non_missing, scale, offset", [ (3, 2, -1), (42, 1, 0), - (256, 0.3, 42), + (255, 0.3, 42), ]) -def test_bin_mapper_identity_small(n_bins, scale, offset): - data = np.arange(n_bins).reshape(-1, 1) * scale + offset - binned = _BinMapper(max_bins=n_bins).fit_transform(data) - assert_array_equal(binned, np.arange(n_bins).reshape(-1, 1)) +def test_bin_mapper_identity_small(n_bins_for_non_missing, scale, offset): + data = np.arange(n_bins_for_non_missing).reshape(-1, 1) * scale + offset + max_bins = n_bins_for_non_missing + 1 # first bin reserved + binned = _BinMapper(max_bins=max_bins).fit_transform(data) + assert_array_equal(binned, + np.arange(n_bins_for_non_missing).reshape(-1, 1) + 1) @pytest.mark.parametrize('n_bins_small, n_bins_large', [ @@ -224,14 +229,14 @@ def test_bin_mapper_idempotence(n_bins_small, n_bins_large): @pytest.mark.parametrize('max_bins', [10, 100, 256]) @pytest.mark.parametrize('diff', [-5, 0, 5]) def test_actual_n_bins(max_bins, diff): - # Check that actual_n_bins is n_unique_values when - # n_unique_values <= max_bins, else max_bins. + # Check that actual_n_bins is n_unique_values + 1 when + # n_unique_values <= max_bins - 1, else max_bins. n_unique_values = max_bins + diff X = list(range(n_unique_values)) * 2 X = np.array(X).reshape(-1, 1) mapper = _BinMapper(max_bins=max_bins).fit(X) - assert np.all(mapper.actual_n_bins_ == min(max_bins, n_unique_values)) + assert np.all(mapper.actual_n_bins_ == min(max_bins, n_unique_values + 1)) def test_subsample(): @@ -247,106 +252,49 @@ def test_subsample(): @pytest.mark.parametrize( 'max_bins, actual_n_bins, X_trans_expected', [ - (256, [5, 3, 2], [[1, 1, 0], - [0, 0, 0], - [2, 1, 0], - [0, 2, 1], - [3, 2, 1], - [4, 1, 0]]), + (256, [5, 3, 3], [[1, 1, 1], + [0, 0, 1], + [2, 1, 1], + [0, 2, 2], + [3, 2, 2], + [4, 1, 1]]), # With max_bins=2, we expect all nan values to be mapped to bin 0 # and all non-nans to be mapped to bin 1 - (2, [2, 2, 2], [[1, 1, 0], - [0, 0, 0], - [1, 1, 0], + (2, [2, 2, 2], [[1, 1, 1], + [0, 0, 1], + [1, 1, 1], [0, 1, 1], [1, 1, 1], - [1, 1, 0]]), - - (3, [3, 3, 2], [[1, 1, 0], - [0, 0, 0], - [1, 1, 0], - [0, 2, 1], - [2, 2, 1], - [2, 1, 0]])]) + [1, 1, 1]]), + + (3, [3, 3, 3], [[1, 1, 1], + [0, 0, 1], + [1, 1, 1], + [0, 2, 2], + [2, 2, 2], + [2, 1, 1]])]) def test_missing_values_support(max_bins, actual_n_bins, X_trans_expected): # check for missing values: make sure nans are mapped to the first bin # and that attributes are correct - # The extra bin for missing values is only allocated if - # support_missing_values is True: - # - no need to allocate extra bin for third column here - # - due to the extra bin allocated for missing values, the bin values of - # feature 0 and 1 are "shifted" with an offset of 1 - - X = [[1, 1, 1], - [np.NaN, np.NaN, 1], - [2, 1, 1], - [np.NaN, 2, 2], - [3, 2, 2], - [4, 1, 1]] + X = [[1, 1, 0], + [np.NaN, np.NaN, 0], + [2, 1, 0], + [np.NaN, 2, 1], + [3, 2, 1], + [4, 1, 0]] X = np.array(X) mapper = _BinMapper(max_bins=max_bins) - support_missing_values = np.array([True, True, False], dtype=np.uint8) - mapper.fit(X, support_missing_values) + mapper.fit(X) + assert_array_equal(mapper.actual_n_bins_, actual_n_bins) + for feature_idx in range(X.shape[1]): assert 
len(mapper.bin_thresholds_[feature_idx]) == \ actual_n_bins[feature_idx] - 1 - for feature_idx in (0, 1): - # If the first bin is reserved, we add a fake threshold (nan) to keep - # the bin_thresholds_ array synchronized with the bin values, i.e. bin - # k has threhold at index k. assert np.isnan(mapper.bin_thresholds_[feature_idx][0]) + X_trans = mapper.transform(X) assert_array_equal(X_trans, X_trans_expected) - - -def test_missing_values_different_X_fit_transform(): - # Test to illustrate the fact that missing values are mapped to the - # first bin, even if the first bin isn't allocated for missing values. - # The first bin is only reserved for missing values if - # support_missing_values is passed as True for a given feature. - # This means that if it is set to False and missing values are encountered - # during transform(), the missing values are incorrectly treated as the - # smallest values (which are also mapped to the first bin). - - # In practice, this does not happen since: - # - We only call transform() on the training and validation data - # - We set the support_missing_values parameter according to the *whole* - # data, i.e. union(training, validation). - # So if we ever call transform() and there is a missing value, the first - # bin would have been correctly reserved. - - X = [[1, 1], - [1, 1], - [1, 1], - [2, 2], - [2, 2], - [1, 1]] - - X = np.array(X) - - mapper = _BinMapper() - - support_missing_values = np.array([True, False], dtype=np.uint8) - mapper.fit(X, support_missing_values) - - X2 = [[1, 1], - [3, 1], - [1, np.NaN], - [2, 2], - [np.NaN, 2], - [1, 1]] - - X2_trans = mapper.transform(X2) - X2_trans_expected = [[1, 0], - [2, 0], - [1, 0], # Nan mapped in first bin (treated as 1) - [2, 1], - [0, 1], # Nan mapped in the first bin, alone - [1, 0]] - # Note also how the bins of the first feature have bin shifted because the - # first bin is reserved, while they aren't shifted for the second feature - assert_array_equal(X2_trans, X2_trans_expected) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 32d00249a6d9e..806d583dd6bd9 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -169,7 +169,7 @@ def test_binning_train_validation_are_separated(): n_samples = X_classification.shape[0] assert np.all(mapper_training_data.actual_n_bins_ == - int((1 - validation_fraction) * n_samples)) + int((1 - validation_fraction) * n_samples) + 1) assert np.all(mapper_training_data.actual_n_bins_ != mapper_whole_data.actual_n_bins_) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index 133558d2bc2f3..3763cc10e06a2 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -15,7 +15,7 @@ def _make_training_data(n_bins=256, constant_hessian=True): # Generate some test data directly binned so as to test the grower code # independently of the binning logic. 
- X_binned = rng.randint(0, n_bins - 1, size=(n_samples, 2), + X_binned = rng.randint(1, n_bins - 1, size=(n_samples, 2), dtype=X_BINNED_DTYPE) X_binned = np.asfortranarray(X_binned) @@ -161,11 +161,11 @@ def test_predictor_from_grower(): # Probe some predictions for each leaf of the tree # each group of 3 samples corresponds to a condition in _make_training_data input_data = np.array([ - [0, 0], + [1, 1], [42, 99], [128, 255], - [129, 0], + [129, 1], [129, 85], [255, 85], @@ -173,13 +173,12 @@ def test_predictor_from_grower(): [129, 255], [242, 100], ], dtype=np.uint8) - has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) - predictions = predictor.predict_binned(input_data, has_missing_values) + predictions = predictor.predict_binned(input_data) expected_targets = [1, 1, 1, 1, 1, 1, -1, -1, -1] assert np.allclose(predictions, expected_targets) # Check that training set can be recovered exactly: - predictions = predictor.predict_binned(X_binned, has_missing_values) + predictions = predictor.predict_binned(X_binned) assert np.allclose(predictions, -all_gradients) @@ -310,16 +309,11 @@ def test_init_parameters_validation(): min_hessian_to_split=-1) -@pytest.mark.parametrize('support_missing_values', [True, False]) -def test_missing_value_predict_only(support_missing_values): +def test_missing_value_predict_only(): # Make sure that missing values are supported at predict time even if they # were not encountered in the training data: the missing values are # assigned to whichever child has the most samples. - # Passing support_missing_values=True tests the case where missing values - # were in the original data (train + val), but not present anymore in the - # training data after the train/val split. - rng = np.random.RandomState(0) n_samples = 100 X_binned = rng.randint(0, 256, size=(n_samples, 1), dtype=np.uint8) @@ -329,8 +323,7 @@ def test_missing_value_predict_only(support_missing_values): hessians = np.ones(shape=1, dtype=G_H_DTYPE) grower = TreeGrower(X_binned, gradients, hessians, min_samples_leaf=5, - has_missing_values=False, - support_missing_values=support_missing_values) + has_missing_values=False) grower.grow() predictor = grower.make_predictor() diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index 8dd6fd6062961..e618ca83c5168 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -37,7 +37,6 @@ def test_histogram_split(n_bins): dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) - support_missing_values = has_missing_values builder = HistogramBuilder(X_binned, n_bins, all_gradients, @@ -46,7 +45,6 @@ def test_histogram_split(n_bins): splitter = Splitter(X_binned, actual_n_bins, has_missing_values, - support_missing_values, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, @@ -102,13 +100,12 @@ def test_gradient_and_hessian_sanity(constant_hessian): actual_n_bins = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) - support_missing_values = has_missing_values builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, constant_hessian) splitter = Splitter(X_binned, actual_n_bins, has_missing_values, - support_missing_values, l2_regularization, - min_hessian_to_split, min_samples_leaf, - min_gain_to_split, 
constant_hessian) + l2_regularization, min_hessian_to_split, + min_samples_leaf, min_gain_to_split, + constant_hessian) hists_parent = builder.compute_histograms_brute(sample_indices) si_parent = splitter.find_node_split(n_samples, hists_parent, @@ -180,16 +177,16 @@ def test_split_indices(): min_gain_to_split = 0. # split will happen on feature 1 and on bin 3 - X_binned = [[0, 0], - [0, 3], - [0, 4], - [0, 0], - [0, 0], - [0, 0], - [0, 0], - [0, 4], - [0, 0], - [0, 4]] + X_binned = [[1, 1], + [1, 3], + [1, 4], + [1, 1], + [1, 1], + [1, 1], + [1, 1], + [1, 4], + [1, 1], + [1, 4]] X_binned = np.asfortranarray(X_binned, dtype=X_BINNED_DTYPE) sample_indices = np.arange(n_samples, dtype=np.uint32) all_gradients = rng.randn(n_samples).astype(G_H_DTYPE) @@ -201,14 +198,13 @@ def test_split_indices(): actual_n_bins = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) - support_missing_values = has_missing_values builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant) splitter = Splitter(X_binned, actual_n_bins, has_missing_values, - support_missing_values, l2_regularization, - min_hessian_to_split, min_samples_leaf, - min_gain_to_split, hessians_are_constant) + l2_regularization, min_hessian_to_split, + min_samples_leaf, min_gain_to_split, + hessians_are_constant) assert np.all(sample_indices == splitter.partition) @@ -259,13 +255,12 @@ def test_min_gain_to_split(): actual_n_bins = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) - support_missing_values = has_missing_values builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant) splitter = Splitter(X_binned, actual_n_bins, has_missing_values, - support_missing_values, l2_regularization, - min_hessian_to_split, min_samples_leaf, - min_gain_to_split, hessians_are_constant) + l2_regularization, min_hessian_to_split, + min_samples_leaf, min_gain_to_split, + hessians_are_constant) histograms = builder.compute_histograms_brute(sample_indices) split_info = splitter.find_node_split(n_samples, histograms, @@ -277,12 +272,12 @@ def test_min_gain_to_split(): 'X_binned, all_gradients, has_missing_values, expected_bin_idx, ' 'expected_go_to_left', [ - # basic sanity check: given the gradient values, the split must occur - # on bin_idx=3 - ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], # X_binned + # basic sanity check with no missing values: given the gradient + # values, the split must occur on bin_idx=4 + ([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], # X_binned [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], # gradients False, # no missing values - 3, # expected_bin_idx + 4, # expected_bin_idx 'not_applicable'), # We replace 2 samples by NaNs (bin_idx=0) @@ -291,7 +286,7 @@ def test_min_gain_to_split(): # Notice how the bin_idx threshold changes from 3 to 2. # Also, the bins of the previous non-nan samples have bin shiffted by # one - ([0, 1, 2, 0, 3, 4, 5, 6, 7, 8], + ([0, 1, 2, 0, 3, 4, 5, 6, 7, 8], # missing values are the zeros [1, 1, 1, 1, 2, 2, 2, 2, 2, 2], True, # missing values (bin_idx=0) 2, # cut on bin_idx=2 @@ -312,12 +307,12 @@ def test_splitting_missing_values(X_binned, all_gradients, # we build an artificial example with gradients such that the best split # is on bin_idx=3, when there are no missing values. 
# Then we introduce missing values and: - # - make sure the chosen bin is still correct (find_best_bin()): it's - # still the same bin, even though its index changes + # - make sure the chosen bin is correct (find_best_bin()): it's + # still the same split, even though the index of the bin changes # - make sure the missing values are mapped to the correct child # (split_indices()) - max_bins = 10 # TO REMOVE + max_bins = max(X_binned) + 1 n_samples = len(X_binned) l2_regularization = 0. min_hessian_to_split = 1e-3 @@ -329,7 +324,6 @@ def test_splitting_missing_values(X_binned, all_gradients, X_binned = np.asfortranarray(X_binned) all_gradients = np.array(all_gradients, dtype=G_H_DTYPE) has_missing_values = np.array([has_missing_values], dtype=np.uint8) - support_missing_values = has_missing_values all_hessians = np.ones(1, dtype=G_H_DTYPE) sum_gradients = all_gradients.sum() sum_hessians = 1 * n_samples @@ -340,9 +334,9 @@ def test_splitting_missing_values(X_binned, all_gradients, all_gradients, all_hessians, hessians_are_constant) splitter = Splitter(X_binned, actual_n_bins, has_missing_values, - support_missing_values, l2_regularization, - min_hessian_to_split, min_samples_leaf, - min_gain_to_split, hessians_are_constant) + l2_regularization, min_hessian_to_split, + min_samples_leaf, min_gain_to_split, + hessians_are_constant) histograms = builder.compute_histograms_brute(sample_indices) split_info = splitter.find_node_split(n_samples, histograms, From 26b66ab1c97b7ab427f7f6f13bec5b4a0392b1e5 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 15 Jun 2019 18:17:08 -0400 Subject: [PATCH 37/76] reduce diff --- .../_hist_gradient_boosting/_binning.pyx | 8 +++----- .../ensemble/_hist_gradient_boosting/binning.py | 17 ++++++++--------- .../gradient_boosting.py | 17 +++++------------ .../tests/test_binning.py | 5 ++--- 4 files changed, 18 insertions(+), 29 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx index 273d257beed97..37a9390eb8b7b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx @@ -15,8 +15,7 @@ from libc.math cimport isnan from .types cimport X_DTYPE_C, X_BINNED_DTYPE_C -cpdef _map_to_bins(const X_DTYPE_C [:, :] data, - list binning_thresholds, +cpdef _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds, X_BINNED_DTYPE_C [::1, :] binned): """Bin numerical values to discrete integer-coded levels. @@ -34,15 +33,14 @@ cpdef _map_to_bins(const X_DTYPE_C [:, :] data, int feature_idx for feature_idx in range(data.shape[1]): - _map_num_col_to_bins(data[:, feature_idx], binning_thresholds[feature_idx], binned[:, feature_idx]) cdef void _map_num_col_to_bins(const X_DTYPE_C [:] data, - const X_DTYPE_C [:] binning_thresholds, - X_BINNED_DTYPE_C [:] binned): + const X_DTYPE_C [:] binning_thresholds, + X_BINNED_DTYPE_C [:] binned): """Binary search to find the bin index for each value in the data.""" cdef: int i diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 8125cb9a9db25..335d405188040 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -103,9 +103,9 @@ class _BinMapper(BaseEstimator, TransformerMixin): ---------- max_bins : int, optional (default=256) The maximum number of bins to use (including the bin for missing - values, if any). 
If for a given feature the number of unique values - is less than ``max_bins - 1``, then those unique values will be used - to compute the bin thresholds, instead of the quantiles. The first bin + values). If for a given feature the number of unique values is less + than ``max_bins - 1``, then those unique values will be used to + compute the bin thresholds, instead of the quantiles. The first bin is always reserved for missing values, so the number of bins used for non-missing values is actually ``max_bins - 1``. subsample : int or None, optional (default=2e5) @@ -122,7 +122,7 @@ def __init__(self, max_bins=256, subsample=int(2e5), random_state=None): self.subsample = subsample self.random_state = random_state - def fit(self, X): + def fit(self, X, y=None): """Fit data X by computing the binning thresholds. The first bin is reserved for missing values, whether there are @@ -132,19 +132,18 @@ def fit(self, X): ---------- X : array-like, shape (n_samples, n_features) The data to bin. + y: None + Ignored. Returns ------- self : object """ X = check_array(X, dtype=[X_DTYPE], force_all_finite='allow-nan') - - all_bin_thresholds = _find_binning_thresholds( + self.bin_thresholds_ = _find_binning_thresholds( X, self.max_bins, subsample=self.subsample, random_state=self.random_state) - self.bin_thresholds_ = all_bin_thresholds - self.actual_n_bins_ = np.array( [thresholds.shape[0] + 1 for thresholds in self.bin_thresholds_], dtype=np.uint32) @@ -159,7 +158,7 @@ def transform(self, X): Parameters ---------- X : array-like, shape (n_samples, n_features) - The data to bin. Must be the fitting data. + The data to bin. Returns ------- diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 91b89a4123f40..ee04f6b780017 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -274,8 +274,8 @@ def fit(self, X, y): # Update raw_predictions_val with the newest tree(s) if self._use_validation_data: for k, pred in enumerate(self._predictors[-1]): - raw_predictions_val[k, :] += \ - pred.predict_binned(X_binned_val) + raw_predictions_val[k, :] += ( + pred.predict_binned(X_binned_val)) should_early_stop = self._check_early_stopping_loss( raw_predictions, y_train, @@ -394,20 +394,13 @@ def _bin_data(self, X, rng, is_training_data): print("Binning {:.3f} GB of {} data: ".format( X.nbytes / 1e9, description), end="", flush=True) tic = time() - if is_training_data: - # Fit X. If missing values were found in the original data (before - # any train/val split), the first bin is reserved for missing - # values, even if there aren't missing value in the training data. 
- self.bin_mapper_.fit(X) - - X_binned = self.bin_mapper_.transform(X) # F-aligned array - - if not is_training_data: + X_binned = self.bin_mapper_.fit_transform(X) # F-aligned array + else: + X_binned = self.bin_mapper_.transform(X) # F-aligned array # We convert the array to C-contiguous since predicting is faster # with this layout (training is faster on F-arrays though) X_binned = np.ascontiguousarray(X_binned) - toc = time() if self.verbose: duration = toc - tic diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index c5a6c9cc83fef..f28df67f1e0c3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -20,9 +20,8 @@ def _find_binning_thresholds(data, max_bins=256, subsample=int(2e5), random_state=None): # Just a redef to avoid having to pass arguments all the time (as the # function is private we don't use default values for parameters) - binning_thresholds = _find_binning_thresholds_orig( - data, max_bins, subsample, random_state) - return binning_thresholds + return _find_binning_thresholds_orig(data, max_bins, subsample, + random_state) def test_find_binning_thresholds_regular_data(): From f370a713e0160b6ab6fce713fe1197f81254f80a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 18 Jun 2019 11:39:33 -0400 Subject: [PATCH 38/76] minor more consistent test --- .../ensemble/_hist_gradient_boosting/tests/test_splitting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index e618ca83c5168..7b925ba989dc2 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -287,7 +287,7 @@ def test_min_gain_to_split(): # Also, the bins of the previous non-nan samples have bin shiffted by # one ([0, 1, 2, 0, 3, 4, 5, 6, 7, 8], # missing values are the zeros - [1, 1, 1, 1, 2, 2, 2, 2, 2, 2], + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], True, # missing values (bin_idx=0) 2, # cut on bin_idx=2 True), # missing values go to left From ec571710809905c5c01b461c355e53c54944460f Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 18 Jun 2019 11:40:00 -0400 Subject: [PATCH 39/76] typo --- .../ensemble/_hist_gradient_boosting/tests/test_splitting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index 7b925ba989dc2..b23111fe0142b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -305,7 +305,7 @@ def test_splitting_missing_values(X_binned, all_gradients, expected_go_to_left): # Make sure missing values are properly supported. # we build an artificial example with gradients such that the best split - # is on bin_idx=3, when there are no missing values. + # is on bin_idx=4, when there are no missing values. 
# Then we introduce missing values and: # - make sure the chosen bin is correct (find_best_bin()): it's # still the same split, even though the index of the bin changes From 2dfaad8de18bdce5e380fe518393c3170ab8d8a6 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 27 Jun 2019 18:13:18 -0400 Subject: [PATCH 40/76] WIP --- .../_hist_gradient_boosting/_binning.pyx | 13 +- .../_hist_gradient_boosting/_predictor.pyx | 9 +- .../_hist_gradient_boosting/binning.py | 66 +++--- .../gradient_boosting.py | 61 +++--- .../_hist_gradient_boosting/grower.py | 27 +-- .../_hist_gradient_boosting/predictor.py | 4 +- .../_hist_gradient_boosting/splitting.pyx | 30 ++- .../tests/test_binning.py | 201 +++++++++--------- .../tests/test_compare_lightgbm.py | 12 +- .../tests/test_gradient_boosting.py | 10 +- .../tests/test_grower.py | 37 ++-- .../tests/test_predictor.py | 10 +- .../tests/test_splitting.py | 113 +++++----- 13 files changed, 309 insertions(+), 284 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx index 37a9390eb8b7b..1c5550d6ddd19 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx @@ -15,8 +15,10 @@ from libc.math cimport isnan from .types cimport X_DTYPE_C, X_BINNED_DTYPE_C -cpdef _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds, - X_BINNED_DTYPE_C [::1, :] binned): +def _map_to_bins(const X_DTYPE_C [:, :] data, + list binning_thresholds, + const unsigned char missing_values_bin_idx, + X_BINNED_DTYPE_C [::1, :] binned): """Bin numerical values to discrete integer-coded levels. Parameters @@ -35,11 +37,13 @@ cpdef _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds, for feature_idx in range(data.shape[1]): _map_num_col_to_bins(data[:, feature_idx], binning_thresholds[feature_idx], + missing_values_bin_idx, binned[:, feature_idx]) cdef void _map_num_col_to_bins(const X_DTYPE_C [:] data, const X_DTYPE_C [:] binning_thresholds, + const unsigned char missing_values_bin_idx, X_BINNED_DTYPE_C [:] binned): """Binary search to find the bin index for each value in the data.""" cdef: @@ -51,11 +55,10 @@ cdef void _map_num_col_to_bins(const X_DTYPE_C [:] data, for i in prange(data.shape[0], schedule='static', nogil=True): if isnan(data[i]): - # unkown values are mapped to first bin. 
- binned[i] = 0 + binned[i] = missing_values_bin_idx else: # for known values, use binary search - left, right = 1, binning_thresholds.shape[0] + left, right = 0, binning_thresholds.shape[0] while left < right: middle = (right + left - 1) // 2 if data[i] <= binning_thresholds[middle]: diff --git a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx index 78db4af23b072..6f37e7efb83f0 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx @@ -58,19 +58,22 @@ cdef inline Y_DTYPE_C _predict_one_from_numeric_data( def _predict_from_binned_data( node_struct [:] nodes, const X_BINNED_DTYPE_C [:, :] binned_data, + const unsigned char missing_values_bin_idx, Y_DTYPE_C [:] out): cdef: int i for i in prange(binned_data.shape[0], schedule='static', nogil=True): - out[i] = _predict_one_from_binned_data(nodes, binned_data, i) + out[i] = _predict_one_from_binned_data(nodes, binned_data, i, + missing_values_bin_idx) cdef inline Y_DTYPE_C _predict_one_from_binned_data( node_struct [:] nodes, const X_BINNED_DTYPE_C [:, :] binned_data, - const int row) nogil: + const int row, + const unsigned char missing_values_bin_idx) nogil: # Need to pass the whole array and the row index, else prange won't work. # See issue Cython #2798 @@ -80,7 +83,7 @@ cdef inline Y_DTYPE_C _predict_one_from_binned_data( while True: if node.is_leaf: return node.value - if binned_data[row, node.feature_idx] == 0: # missing value + if binned_data[row, node.feature_idx] == missing_values_bin_idx: if node.missing_go_to_left: node = nodes[node.left] else: diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 335d405188040..8a0bbbdd99985 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -16,7 +16,7 @@ from .types import X_DTYPE, X_BINNED_DTYPE -def _find_binning_thresholds(data, max_bins, subsample, random_state): +def _find_binning_thresholds(data, n_bins, subsample, random_state): """Extract feature-wise quantiles from numerical data. Missing values are ignored for finding the thresholds. @@ -25,10 +25,13 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): ---------- data : array-like, shape (n_samples, n_features) The data to bin. - max_bins : int - The maximum number of bins to use. If for a given feature the number of - unique values is less than ``max_bins``, then those unique values - will be used to compute the bin thresholds, instead of the quantiles. + n_bins : int, optional (default=256) + The maximum number of bins to use (including the bin for missing + values). Non-missing values are binned on ``max_bins = n_bins - 1`` + bins. The last bin is always reserved for missing values. If for a + given feature the number of unique values is less than ``max_bins``, + then those unique values will be used to compute the bin thresholds, + instead of the quantiles. subsample : int or None If ``n_samples > subsample``, then ``sub_samples`` samples will be randomly choosen to compute the quantiles. If ``None``, the whole data @@ -44,9 +47,9 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): be used to separate the bins. Thus ``len(binning_thresholds) == n_features``. The first threshold (for missing values) is always NaN. 
""" - if not (2 <= max_bins <= 256): - raise ValueError('max_bins={} should be no smaller than 2 ' - 'and no larger than 256.'.format(max_bins)) + if not (3 <= n_bins <= 256): + raise ValueError('n_bins={} should be no smaller than 3 ' + 'and no larger than 256.'.format(n_bins)) rng = check_random_state(random_state) if subsample is not None and data.shape[0] > subsample: subset = rng.choice(np.arange(data.shape[0]), subsample, replace=False) @@ -61,7 +64,10 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): col_data = col_data[~missing_mask] col_data = np.ascontiguousarray(col_data, dtype=X_DTYPE) distinct_values = np.unique(col_data) - if len(distinct_values) <= max_bins - 1: # - 1 for missing values bin + + # maximum number of bins used for non missing values + max_n_bins_non_missing = n_bins - 1 + if len(distinct_values) <= max_n_bins_non_missing: midpoints = distinct_values[:-1] + distinct_values[1:] midpoints *= .5 else: @@ -70,16 +76,11 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): # np.unique(col_data, return_counts) instead but this is more # work and the performance benefit will be limited because we # work on a fixed-size subsample of the full data. - percentiles = np.linspace(0, 100, num=max_bins) + percentiles = np.linspace(0, 100, num=n_bins) percentiles = percentiles[1:-1] midpoints = np.percentile(col_data, percentiles, interpolation='midpoint').astype(X_DTYPE) - - # We prepend a fake threshold (nan) for the first bin (reserved for - # missing values). This threshold is never used in practice, but we - # use it to keep the indexes of the bins synchronized with the - # bin_thresholds_ attribute: bin k must be at index k. - midpoints = np.insert(midpoints, 0, np.nan) + assert midpoints.shape[0] == max_n_bins_non_missing - 1 binning_thresholds.append(midpoints) @@ -96,18 +97,18 @@ class _BinMapper(BaseEstimator, TransformerMixin): speed-up the binning, but the quantiles should remain stable. If the number of unique values for a given feature is less than - ``max_bins``, then the unique values of this feature are used instead of + ``n_bins``, then the unique values of this feature are used instead of the quantiles. Parameters ---------- - max_bins : int, optional (default=256) + n_bins : int, optional (default=256) The maximum number of bins to use (including the bin for missing - values). If for a given feature the number of unique values is less - than ``max_bins - 1``, then those unique values will be used to - compute the bin thresholds, instead of the quantiles. The first bin - is always reserved for missing values, so the number of bins used - for non-missing values is actually ``max_bins - 1``. + values). Non-missing values are binned on ``max_bins = n_bins - 1`` + bins. The last bin is always reserved for missing values. If for a + given feature the number of unique values is less than ``max_bins``, + then those unique values will be used to compute the bin thresholds, + instead of the quantiles. subsample : int or None, optional (default=2e5) If ``n_samples > subsample``, then ``sub_samples`` samples will be randomly choosen to compute the quantiles. If ``None``, the whole data @@ -117,8 +118,8 @@ class _BinMapper(BaseEstimator, TransformerMixin): Pseudo-random number generator to control the random sub-sampling. See :term:`random_state`. 
""" - def __init__(self, max_bins=256, subsample=int(2e5), random_state=None): - self.max_bins = max_bins + def __init__(self, n_bins=256, subsample=int(2e5), random_state=None): + self.n_bins = n_bins self.subsample = subsample self.random_state = random_state @@ -141,13 +142,15 @@ def fit(self, X, y=None): """ X = check_array(X, dtype=[X_DTYPE], force_all_finite='allow-nan') self.bin_thresholds_ = _find_binning_thresholds( - X, self.max_bins, subsample=self.subsample, + X, self.n_bins, subsample=self.subsample, random_state=self.random_state) - self.actual_n_bins_ = np.array( + self.n_bins_non_missing_ = np.array( [thresholds.shape[0] + 1 for thresholds in self.bin_thresholds_], dtype=np.uint32) + self.missing_values_bin_idx_ = self.n_bins - 1 + return self def transform(self, X): @@ -166,13 +169,14 @@ def transform(self, X): The binned data (fortran-aligned). """ X = check_array(X, dtype=[X_DTYPE], force_all_finite='allow-nan') - check_is_fitted(self, ['bin_thresholds_', 'actual_n_bins_']) - if X.shape[1] != self.actual_n_bins_.shape[0]: + check_is_fitted(self, ['bin_thresholds_', 'n_bins_non_missing_']) + if X.shape[1] != self.n_bins_non_missing_.shape[0]: raise ValueError( 'This estimator was fitted with {} features but {} got passed ' - 'to transform()'.format(self.actual_n_bins_.shape[0], + 'to transform()'.format(self.n_bins_non_missing_.shape[0], X.shape[1]) ) binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order='F') - _map_to_bins(X, self.bin_thresholds_, binned) + _map_to_bins(X, self.bin_thresholds_, self.missing_values_bin_idx_, + binned) return binned diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 05c41a38f88b2..bbab7cd154052 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -2,6 +2,7 @@ # Author: Nicolas Hug from abc import ABC, abstractmethod +from functools import partial import numpy as np from timeit import default_timer as time @@ -74,6 +75,10 @@ def _validate_parameters(self): raise ValueError('tol={} ' 'must not be smaller than 0.'.format(self.tol)) + if not (2 <= self.max_bins <= 255): + raise ValueError('max_bins={} should be no smaller than 2 ' + 'and no larger than 255.'.format(self.max_bins)) + def fit(self, X, y): """Fit the gradient boosting model. 
@@ -145,7 +150,8 @@ def fit(self, X, y): has_missing_values = np.isnan(X_train).any(axis=0).astype(np.uint8) # Bin the data - self.bin_mapper_ = _BinMapper(max_bins=self.max_bins, random_state=rng) + n_bins = self.max_bins + 1 # + 1 for missing values + self.bin_mapper_ = _BinMapper(n_bins=n_bins, random_state=rng) X_binned_train = self._bin_data(X_train, rng, is_training_data=True) if X_val is not None: X_binned_val = self._bin_data(X_val, rng, is_training_data=False) @@ -294,8 +300,8 @@ def fit(self, X, y): grower = TreeGrower( X_binned_train, gradients[k, :], hessians[k, :], - max_bins=self.max_bins, - actual_n_bins=self.bin_mapper_.actual_n_bins_, + n_bins=n_bins, + n_bins_non_missing=self.bin_mapper_.n_bins_non_missing_, has_missing_values=has_missing_values, max_leaf_nodes=self.max_leaf_nodes, max_depth=self.max_depth, @@ -327,7 +333,11 @@ def fit(self, X, y): if self._use_validation_data: for k, pred in enumerate(self._predictors[-1]): raw_predictions_val[k, :] += ( - pred.predict_binned(X_binned_val)) + pred.predict_binned( + X_binned_val, + self.bin_mapper_.missing_values_bin_idx_ + ) + ) should_early_stop = self._check_early_stopping_loss( raw_predictions, y_train, @@ -555,8 +565,13 @@ def _raw_predict(self, X): raw_predictions += self._baseline_prediction for predictors_of_ith_iteration in self._predictors: for k, predictor in enumerate(predictors_of_ith_iteration): - predict = (predictor.predict_binned if is_binned - else predictor.predict) + if is_binned: + predict = partial( + predictor.predict_binned, + missing_values_bin_idx=self.bin_mapper_.missing_values_bin_idx_ # noqa + ) + else: + predict = predictor.predict raw_predictions[k, :] += predict(X) return raw_predictions @@ -641,14 +656,13 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): l2_regularization : float, optional (default=0) The L2 regularization parameter. Use ``0`` for no regularization (default). - max_bins : int, optional (default=256) - The maximum number of bins to use. Before training, each feature of - the input array ``X`` is binned into at most ``max_bins`` bins, which - allows for a much faster training stage. Features with a small - number of unique values may use less than ``max_bins`` bins. The - first bin is specifically allocated for missing values, whether they - exist or not. As a result, the number of bins used for non-missing - values is at most ``max_bins - 1``. Must be no larger than 256. + max_bins : int, optional (default=255) + The maximum number of bins to use for non-missing values. Before + training, each feature of the input array ``X`` is binned into + integer-valued bins, which allows for a much faster training stage. + Features with a small number of unique values may use less than + ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin + is reserved for missing values. Must be no larger than 255. warm_start : bool, optional (default=False) When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble. 
For results to be valid, the @@ -719,7 +733,7 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): def __init__(self, loss='least_squares', learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, - min_samples_leaf=20, l2_regularization=0., max_bins=256, + min_samples_leaf=20, l2_regularization=0., max_bins=255, warm_start=False, scoring=None, validation_fraction=0.1, n_iter_no_change=None, tol=1e-7, verbose=0, random_state=None): @@ -826,14 +840,13 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, since only very shallow trees would be built. l2_regularization : float, optional (default=0) The L2 regularization parameter. Use 0 for no regularization. - max_bins : int, optional (default=256) - The maximum number of bins to use. Before training, each feature of - the input array ``X`` is binned into at most ``max_bins`` bins, which - allows for a much faster training stage. Features with a small - number of unique values may use less than ``max_bins`` bins. The - first bin is specifically allocated for missing values, whether they - exist or not. As a result, the number of bins used for non-missing - values is at most ``max_bins - 1``. Must be no larger than 256. + max_bins : int, optional (default=255) + The maximum number of bins to use for non-missing values. Before + training, each feature of the input array ``X`` is binned into + integer-valued bins, which allows for a much faster training stage. + Features with a small number of unique values may use less than + ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin + is reserved for missing values. Must be no larger than 255. warm_start : bool, optional (default=False) When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble. 
For results to be valid, the @@ -906,7 +919,7 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, def __init__(self, loss='auto', learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, - l2_regularization=0., max_bins=256, warm_start=False, + l2_regularization=0., max_bins=255, warm_start=False, scoring=None, validation_fraction=0.1, n_iter_no_change=None, tol=1e-7, verbose=0, random_state=None): super(HistGradientBoostingClassifier, self).__init__( diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index e03c72d54e163..869806b4bf2d4 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -164,7 +164,7 @@ class TreeGrower: """ def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, max_depth=None, min_samples_leaf=20, min_gain_to_split=0., - max_bins=256, actual_n_bins=None, has_missing_values=False, + n_bins=256, n_bins_non_missing=None, has_missing_values=False, l2_regularization=0., min_hessian_to_split=1e-3, shrinkage=1.): @@ -172,29 +172,30 @@ def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, min_samples_leaf, min_gain_to_split, l2_regularization, min_hessian_to_split) - if actual_n_bins is None: - actual_n_bins = max_bins + if n_bins_non_missing is None: + n_bins_non_missing = n_bins - 1 - if isinstance(actual_n_bins, numbers.Integral): - actual_n_bins = np.array( - [actual_n_bins] * X_binned.shape[1], + if isinstance(n_bins_non_missing, numbers.Integral): + n_bins_non_missing = np.array( + [n_bins_non_missing] * X_binned.shape[1], dtype=np.uint32) else: - actual_n_bins = np.asarray(actual_n_bins, dtype=np.uint32) + n_bins_non_missing = np.asarray(n_bins_non_missing, + dtype=np.uint32) if isinstance(has_missing_values, bool): - has_missing_values = [has_missing_values] * actual_n_bins.shape[0] + has_missing_values = [has_missing_values] * X_binned.shape[1] has_missing_values = np.asarray(has_missing_values, dtype=np.uint8) hessians_are_constant = hessians.shape[0] == 1 self.histogram_builder = HistogramBuilder( - X_binned, max_bins, gradients, hessians, hessians_are_constant) + X_binned, n_bins, gradients, hessians, hessians_are_constant) + missing_values_bin_idx = n_bins - 1 self.splitter = Splitter( - X_binned, actual_n_bins, has_missing_values, l2_regularization, - min_hessian_to_split, min_samples_leaf, min_gain_to_split, - hessians_are_constant) + X_binned, n_bins_non_missing, missing_values_bin_idx, + has_missing_values, l2_regularization, min_hessian_to_split, + min_samples_leaf, min_gain_to_split, hessians_are_constant) self.max_leaf_nodes = max_leaf_nodes - self.max_bins = max_bins self.has_missing_values = has_missing_values self.n_features = X_binned.shape[1] self.max_depth = max_depth diff --git a/sklearn/ensemble/_hist_gradient_boosting/predictor.py b/sklearn/ensemble/_hist_gradient_boosting/predictor.py index d5586986d8a5c..5323e0002db6b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/predictor.py +++ b/sklearn/ensemble/_hist_gradient_boosting/predictor.py @@ -46,7 +46,7 @@ def predict(self, X): _predict_from_numeric_data(self.nodes, X, out) return out - def predict_binned(self, X): + def predict_binned(self, X, missing_values_bin_idx): """Predict raw values for binned data. Parameters @@ -60,5 +60,5 @@ def predict_binned(self, X): The raw predicted values. 
""" out = np.empty(X.shape[0], dtype=Y_DTYPE) - _predict_from_binned_data(self.nodes, X, out) + _predict_from_binned_data(self.nodes, X, missing_values_bin_idx, out) return out diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 08de0e7c74413..e51b5e7cb4307 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -95,7 +95,7 @@ cdef class Splitter: ---------- X_binned : ndarray of int, shape (n_samples, n_features) The binned input samples. Must be Fortran-aligned. - actual_n_bins : ndarray, shape (n_features,) + n_bins_non_missing : ndarray, shape (n_features,) The actual number of bins needed for each feature, which is lower or equal to max_bins. has_missing_values : ndarray, shape (n_features,) @@ -117,7 +117,8 @@ cdef class Splitter: cdef public: const X_BINNED_DTYPE_C [::1, :] X_binned unsigned int n_features - const unsigned int [::1] actual_n_bins + const unsigned int [::1] n_bins_non_missing + unsigned char missing_values_bin_idx const unsigned char [::1] has_missing_values unsigned char hessians_are_constant Y_DTYPE_C l2_regularization @@ -131,7 +132,8 @@ cdef class Splitter: def __init__(self, const X_BINNED_DTYPE_C [::1, :] X_binned, - const unsigned int [::1] actual_n_bins, + const unsigned int [::1] n_bins_non_missing, + const unsigned char missing_values_bin_idx, const unsigned char [::1] has_missing_values, Y_DTYPE_C l2_regularization, Y_DTYPE_C min_hessian_to_split=1e-3, @@ -141,7 +143,8 @@ cdef class Splitter: self.X_binned = X_binned self.n_features = X_binned.shape[1] - self.actual_n_bins = actual_n_bins + self.n_bins_non_missing = n_bins_non_missing + self.missing_values_bin_idx = missing_values_bin_idx self.has_missing_values = has_missing_values self.l2_regularization = l2_regularization self.min_hessian_to_split = min_hessian_to_split @@ -240,6 +243,7 @@ cdef class Splitter: int n_samples = sample_indices.shape[0] X_BINNED_DTYPE_C bin_idx = split_info.bin_idx unsigned char missing_go_to_left = split_info.missing_go_to_left + unsigned char missing_values_bin_idx = self.missing_values_bin_idx int feature_idx = split_info.feature_idx const X_BINNED_DTYPE_C [::1] X_binned = \ self.X_binned[:, feature_idx] @@ -285,7 +289,7 @@ cdef class Splitter: stop = start + sizes[thread_idx] for i in range(start, stop): sample_idx = sample_indices[i] - if X_binned[sample_idx] == 0: # missing value + if X_binned[sample_idx] == missing_values_bin_idx: if missing_go_to_left: left_indices_buffer[start + left_count] = sample_idx left_count = left_count + 1 @@ -458,9 +462,7 @@ cdef class Splitter: unsigned int n_samples_left unsigned int n_samples_right unsigned int n_samples_ = n_samples - # Note that considering splitting on the last bin is useless since - # it would result in having 0 samples in the right node (forbidden) - unsigned int end = self.actual_n_bins[feature_idx] - 1 + unsigned int end = self.n_bins_non_missing[feature_idx] - 1 Y_DTYPE_C sum_hessian_left Y_DTYPE_C sum_hessian_right Y_DTYPE_C sum_gradient_left @@ -475,8 +477,7 @@ cdef class Splitter: self.l2_regularization) - for bin_idx in range(1, end): - # we skip the first bin which is reserved for missing values + for bin_idx in range(end): n_samples_left += histograms[feature_idx, bin_idx].count n_samples_right = n_samples_ - n_samples_left @@ -550,20 +551,15 @@ cdef class Splitter: Y_DTYPE_C sum_gradient_right Y_DTYPE_C negative_loss_current_node Y_DTYPE_C gain - 
unsigned int start = self.actual_n_bins[feature_idx] - 2 + unsigned int start = self.n_bins_non_missing[feature_idx] - 2 - # n_bins - 2 is the index of the second to last bin sum_gradient_right, sum_hessian_right = 0., 0. n_samples_right = 0 negative_loss_current_node = negative_loss(sum_gradients, sum_hessians, self.l2_regularization) - for bin_idx in range(start, 0, -1): - # We start at the second to last bin (we don't need to consider - # splitting on the last bin since it would result in having zero - # samples on the right node). - # We also skip the first bin (where the missing values are) + for bin_idx in range(start, -1, -1): n_samples_right += histograms[feature_idx, bin_idx + 1].count n_samples_left = n_samples_ - n_samples_right diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index f28df67f1e0c3..86623f2c6ec3c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -16,48 +16,45 @@ ).astype(X_DTYPE) -def _find_binning_thresholds(data, max_bins=256, subsample=int(2e5), +def _find_binning_thresholds(data, max_bins=255, subsample=int(2e5), random_state=None): - # Just a redef to avoid having to pass arguments all the time (as the - # function is private we don't use default values for parameters) - return _find_binning_thresholds_orig(data, max_bins, subsample, - random_state) + n_bins = max_bins + 1 + return _find_binning_thresholds_orig(data, n_bins, subsample, random_state) def test_find_binning_thresholds_regular_data(): data = np.linspace(0, 10, 1001).reshape(-1, 1) - bin_thresholds = _find_binning_thresholds(data, max_bins=11) - assert_allclose(bin_thresholds[0], [np.nan, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + bin_thresholds = _find_binning_thresholds(data, max_bins=10) + assert_allclose(bin_thresholds[0], [1, 2, 3, 4, 5, 6, 7, 8, 9]) assert len(bin_thresholds) == 1 - bin_thresholds = _find_binning_thresholds(data, max_bins=6) - assert_allclose(bin_thresholds[0], [np.nan, 2, 4, 6, 8]) + bin_thresholds = _find_binning_thresholds(data, max_bins=5) + assert_allclose(bin_thresholds[0], [2, 4, 6, 8]) assert len(bin_thresholds) == 1 def test_find_binning_thresholds_small_regular_data(): data = np.linspace(0, 10, 11).reshape(-1, 1) - bin_thresholds = _find_binning_thresholds(data, max_bins=6) - assert_allclose(bin_thresholds[0], [np.nan, 2, 4, 6, 8]) + bin_thresholds = _find_binning_thresholds(data, max_bins=5) + assert_allclose(bin_thresholds[0], [2, 4, 6, 8]) - bin_thresholds = _find_binning_thresholds(data, max_bins=11) - assert_allclose(bin_thresholds[0], [np.nan, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + bin_thresholds = _find_binning_thresholds(data, max_bins=10) + assert_allclose(bin_thresholds[0], [1, 2, 3, 4, 5, 6, 7, 8, 9]) - bin_thresholds = _find_binning_thresholds(data, max_bins=12) - expected = np.arange(10) + .5 - expected = np.insert(expected, 0, np.nan) - assert_allclose(bin_thresholds[0], expected) + bin_thresholds = _find_binning_thresholds(data, max_bins=11) + assert_allclose(bin_thresholds[0], np.arange(10) + .5) bin_thresholds = _find_binning_thresholds(data, max_bins=255) - assert_allclose(bin_thresholds[0], expected) + assert_allclose(bin_thresholds[0], np.arange(10) + .5) def test_find_binning_thresholds_random_data(): - bin_thresholds = _find_binning_thresholds(DATA, random_state=0) + bin_thresholds = _find_binning_thresholds(DATA, max_bins=255, + random_state=0) assert len(bin_thresholds) == 2 
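A short note on the arithmetic behind the shapes asserted in these binning tests (a sketch of the new convention, not part of the patch):

# With n_bins counting the missing-value bin, 255 bins remain for
# non-missing values, which requires 254 thresholds; bin index 255 is
# reserved for missing values.
n_bins = 256
max_bins = n_bins - 1            # bins available for non-missing values
n_thresholds = max_bins - 1      # boundaries between those bins
missing_values_bin_idx = n_bins - 1
assert (max_bins, n_thresholds, missing_values_bin_idx) == (255, 254, 255)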
for i in range(len(bin_thresholds)): - assert bin_thresholds[i].shape == (255,) # 256 - 1 + assert bin_thresholds[i].shape == (254,) # 255 - 1 assert bin_thresholds[i].dtype == DATA.dtype assert_allclose(bin_thresholds[0][[64, 128, 192]], @@ -77,24 +74,26 @@ def test_find_binning_thresholds_low_n_bins(): def test_find_binning_thresholds_invalid_n_bins(): - err_msg = 'no smaller than 2 and no larger than 256' + err_msg = 'n_bins=1024 should be no smaller than 3 and no larger than 256' with pytest.raises(ValueError, match=err_msg): - _find_binning_thresholds(DATA, max_bins=1024) + _find_binning_thresholds_orig(DATA, n_bins=1024, subsample=10, + random_state=None) def test_bin_mapper_n_features_transform(): - mapper = _BinMapper(max_bins=42, random_state=42).fit(DATA) + mapper = _BinMapper(n_bins=42, random_state=42).fit(DATA) err_msg = 'This estimator was fitted with 2 features but 4 got passed' with pytest.raises(ValueError, match=err_msg): mapper.transform(np.repeat(DATA, 2, axis=1)) -@pytest.mark.parametrize('n_bins', [16, 128, 256]) -def test_map_to_bins(n_bins): - bin_thresholds = _find_binning_thresholds(DATA, max_bins=n_bins, +@pytest.mark.parametrize('max_bins', [16, 128, 255]) +def test_map_to_bins(max_bins): + bin_thresholds = _find_binning_thresholds(DATA, max_bins=max_bins, random_state=0) binned = np.zeros_like(DATA, dtype=X_BINNED_DTYPE, order='F') - _map_to_bins(DATA, bin_thresholds, binned) + last_bin_idx = max_bins + _map_to_bins(DATA, bin_thresholds, last_bin_idx, binned) assert binned.shape == DATA.shape assert binned.dtype == np.uint8 assert binned.flags.f_contiguous @@ -103,69 +102,73 @@ def test_map_to_bins(n_bins): max_indices = DATA.argmax(axis=0) for feature_idx, min_idx in enumerate(min_indices): - assert binned[min_idx, feature_idx] == 1 + assert binned[min_idx, feature_idx] == 0 for feature_idx, max_idx in enumerate(max_indices): - assert binned[max_idx, feature_idx] == n_bins - 1 + assert binned[max_idx, feature_idx] == max_bins - 1 -@pytest.mark.parametrize("n_bins", [5, 10, 42]) -def test_bin_mapper_random_data(n_bins): +@pytest.mark.parametrize("max_bins", [5, 10, 42]) +def test_bin_mapper_random_data(max_bins): n_samples, n_features = DATA.shape - mapper = _BinMapper(max_bins=n_bins, random_state=42).fit(DATA) + expected_count_per_bin = n_samples // max_bins + tol = int(0.05 * expected_count_per_bin) + + # max_bins is the number of bins for non-missing values + n_bins = max_bins + 1 + mapper = _BinMapper(n_bins=n_bins, random_state=42).fit(DATA) binned = mapper.transform(DATA) assert binned.shape == (n_samples, n_features) assert binned.dtype == np.uint8 - assert_array_equal(binned.min(axis=0), np.array([1, 1])) - assert_array_equal(binned.max(axis=0), np.array([n_bins - 1, n_bins - 1])) + assert_array_equal(binned.min(axis=0), np.array([0, 0])) + assert_array_equal(binned.max(axis=0), + np.array([max_bins - 1, max_bins - 1])) assert len(mapper.bin_thresholds_) == n_features for bin_thresholds_feature in mapper.bin_thresholds_: - assert bin_thresholds_feature.shape == (n_bins - 1,) + assert bin_thresholds_feature.shape == (max_bins - 1,) assert bin_thresholds_feature.dtype == DATA.dtype - assert np.all(mapper.actual_n_bins_ == n_bins) + assert np.all(mapper.n_bins_non_missing_ == max_bins) - # Check that the binned data is approximately balanced across bins - # (ignoring first bin since there are no missing values) - expected_count_per_bin = n_samples // (n_bins - 1) - tol = int(0.05 * expected_count_per_bin) + # Check that the binned data is approximately 
balanced across bins. for feature_idx in range(n_features): - for bin_idx in range(1, n_bins): + for bin_idx in range(max_bins): count = (binned[:, feature_idx] == bin_idx).sum() assert abs(count - expected_count_per_bin) < tol -@pytest.mark.parametrize("n_samples, n_bins_for_non_missing", [ +@pytest.mark.parametrize("n_samples, max_bins", [ (5, 5), (5, 10), (5, 11), (42, 255) ]) -def test_bin_mapper_small_random_data(n_samples, n_bins_for_non_missing): +def test_bin_mapper_small_random_data(n_samples, max_bins): data = np.random.RandomState(42).normal(size=n_samples).reshape(-1, 1) assert len(np.unique(data)) == n_samples - max_bins = n_bins_for_non_missing + 1 # first bin reserved - mapper = _BinMapper(max_bins=max_bins, random_state=42) + # max_bins is the number of bins for non-missing values + n_bins = max_bins + 1 + mapper = _BinMapper(n_bins=n_bins, random_state=42) binned = mapper.fit_transform(data) assert binned.shape == data.shape assert binned.dtype == np.uint8 assert_array_equal(binned.ravel()[np.argsort(data.ravel())], - np.arange(n_samples) + 1) + np.arange(n_samples)) -@pytest.mark.parametrize("n_bins_for_non_missing, n_distinct, multiplier", [ +@pytest.mark.parametrize("max_bins, n_distinct, multiplier", [ (5, 5, 1), (5, 5, 3), (255, 12, 42), ]) -def test_bin_mapper_identity_repeated_values(n_bins_for_non_missing, - n_distinct, multiplier): +def test_bin_mapper_identity_repeated_values(max_bins, n_distinct, multiplier): data = np.array(list(range(n_distinct)) * multiplier).reshape(-1, 1) - max_bins = n_bins_for_non_missing + 1 # first bin reserved - binned = _BinMapper(max_bins=max_bins).fit_transform(data) - assert_array_equal(data, binned - 1) + # max_bins is the number of bins for non-missing values + n_bins = max_bins + 1 + binned = _BinMapper(n_bins=n_bins).fit_transform(data) + assert_array_equal(data, binned) @pytest.mark.parametrize('n_distinct', [2, 7, 42]) @@ -181,61 +184,62 @@ def test_bin_mapper_repeated_values_invariance(n_distinct): data = data.reshape(-1, 1) - mapper_1 = _BinMapper(max_bins=n_distinct + 1) + mapper_1 = _BinMapper(n_bins=n_distinct + 1) binned_1 = mapper_1.fit_transform(data) - assert_array_equal(np.unique(binned_1[:, 0]), np.arange(n_distinct) + 1) + assert_array_equal(np.unique(binned_1[:, 0]), np.arange(n_distinct)) # Adding more bins to the mapper yields the same results (same thresholds) - mapper_2 = _BinMapper(max_bins=min(256, n_distinct * 3)) + mapper_2 = _BinMapper(n_bins=min(256, n_distinct * 3) + 1) binned_2 = mapper_2.fit_transform(data) assert_allclose(mapper_1.bin_thresholds_[0], mapper_2.bin_thresholds_[0]) assert_array_equal(binned_1, binned_2) -@pytest.mark.parametrize("n_bins_for_non_missing, scale, offset", [ +@pytest.mark.parametrize("max_bins, scale, offset", [ (3, 2, -1), (42, 1, 0), (255, 0.3, 42), ]) -def test_bin_mapper_identity_small(n_bins_for_non_missing, scale, offset): - data = np.arange(n_bins_for_non_missing).reshape(-1, 1) * scale + offset - max_bins = n_bins_for_non_missing + 1 # first bin reserved - binned = _BinMapper(max_bins=max_bins).fit_transform(data) - assert_array_equal(binned, - np.arange(n_bins_for_non_missing).reshape(-1, 1) + 1) +def test_bin_mapper_identity_small(max_bins, scale, offset): + data = np.arange(max_bins).reshape(-1, 1) * scale + offset + # max_bins is the number of bins for non-missing values + n_bins = max_bins + 1 + binned = _BinMapper(n_bins=n_bins).fit_transform(data) + assert_array_equal(binned, np.arange(max_bins).reshape(-1, 1)) -@pytest.mark.parametrize('n_bins_small, 
n_bins_large', [ +@pytest.mark.parametrize('max_bins_small, max_bins_large', [ (2, 2), (3, 3), (4, 4), (42, 42), - (256, 256), + (255, 255), (5, 17), - (42, 256), + (42, 255), ]) -def test_bin_mapper_idempotence(n_bins_small, n_bins_large): - assert n_bins_large >= n_bins_small +def test_bin_mapper_idempotence(max_bins_small, max_bins_large): + assert max_bins_large >= max_bins_small data = np.random.RandomState(42).normal(size=30000).reshape(-1, 1) - mapper_small = _BinMapper(max_bins=n_bins_small) - mapper_large = _BinMapper(max_bins=n_bins_large) + mapper_small = _BinMapper(n_bins=max_bins_small + 1) + mapper_large = _BinMapper(n_bins=max_bins_small + 1) binned_small = mapper_small.fit_transform(data) binned_large = mapper_large.fit_transform(binned_small) assert_array_equal(binned_small, binned_large) -@pytest.mark.parametrize('max_bins', [10, 100, 256]) +@pytest.mark.parametrize('n_bins', [10, 100, 256]) @pytest.mark.parametrize('diff', [-5, 0, 5]) -def test_actual_n_bins(max_bins, diff): - # Check that actual_n_bins is n_unique_values + 1 when - # n_unique_values <= max_bins - 1, else max_bins. +def test_n_bins_non_missing(n_bins, diff): + # Check that n_bins_non_missing is n_unique_values when + # there are not a lot of unique values, else n_bins - 1. - n_unique_values = max_bins + diff + n_unique_values = n_bins + diff X = list(range(n_unique_values)) * 2 X = np.array(X).reshape(-1, 1) - mapper = _BinMapper(max_bins=max_bins).fit(X) - assert np.all(mapper.actual_n_bins_ == min(max_bins, n_unique_values + 1)) + mapper = _BinMapper(n_bins=n_bins).fit(X) + assert np.all(mapper.n_bins_non_missing_ == min( + n_bins - 1, n_unique_values)) def test_subsample(): @@ -248,32 +252,22 @@ def test_subsample(): mapper_subsample.bin_thresholds_[feature], rtol=1e-4) - @pytest.mark.parametrize( - 'max_bins, actual_n_bins, X_trans_expected', [ - (256, [5, 3, 3], [[1, 1, 1], - [0, 0, 1], - [2, 1, 1], - [0, 2, 2], - [3, 2, 2], - [4, 1, 1]]), - # With max_bins=2, we expect all nan values to be mapped to bin 0 - # and all non-nans to be mapped to bin 1 - (2, [2, 2, 2], [[1, 1, 1], - [0, 0, 1], - [1, 1, 1], - [0, 1, 1], - [1, 1, 1], - [1, 1, 1]]), - - (3, [3, 3, 3], [[1, 1, 1], - [0, 0, 1], + 'n_bins, n_bins_non_missing, X_trans_expected', [ + (256, [4, 2, 2], [[0, 0, 0], # 255 <=> missing value + [255, 255, 0], + [1, 0, 0], + [255, 1, 1], + [2, 1, 1], + [3, 0, 0]]), + (3, [2, 2, 2], [[0, 0, 0], # 2 <=> missing value + [2, 2, 0], + [0, 0, 0], + [2, 1, 1], [1, 1, 1], - [0, 2, 2], - [2, 2, 2], - [2, 1, 1]])]) -def test_missing_values_support(max_bins, actual_n_bins, X_trans_expected): - # check for missing values: make sure nans are mapped to the first bin + [1, 0, 0]])]) +def test_missing_values_support(n_bins, n_bins_non_missing, X_trans_expected): + # check for missing values: make sure nans are mapped to the last bin # and that attributes are correct X = [[1, 1, 0], @@ -285,15 +279,14 @@ def test_missing_values_support(max_bins, actual_n_bins, X_trans_expected): X = np.array(X) - mapper = _BinMapper(max_bins=max_bins) + mapper = _BinMapper(n_bins=n_bins) mapper.fit(X) - assert_array_equal(mapper.actual_n_bins_, actual_n_bins) + assert_array_equal(mapper.n_bins_non_missing_, n_bins_non_missing) for feature_idx in range(X.shape[1]): assert len(mapper.bin_thresholds_[feature_idx]) == \ - actual_n_bins[feature_idx] - 1 - assert np.isnan(mapper.bin_thresholds_[feature_idx][0]) + n_bins_non_missing[feature_idx] - 1 X_trans = mapper.transform(X) assert_array_equal(X_trans, X_trans_expected) diff --git 
a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py index 95672a60e5c40..63d8c8fb1059d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py @@ -43,7 +43,7 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples, rng = np.random.RandomState(seed=seed) n_samples = n_samples max_iter = 1 - max_bins = 256 + max_bins = 255 X, y = make_regression(n_samples=n_samples, n_features=5, n_informative=5, random_state=0) @@ -51,7 +51,7 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples, if n_samples > 255: # bin data and convert it to float32 so that the estimator doesn't # treat it as pre-binned - X = _BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32) + X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) @@ -95,7 +95,7 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples, rng = np.random.RandomState(seed=seed) n_samples = n_samples max_iter = 1 - max_bins = 256 + max_bins = 255 X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5, n_informative=5, n_redundant=0, random_state=0) @@ -103,7 +103,7 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples, if n_samples > 255: # bin data and convert it to float32 so that the estimator doesn't # treat it as pre-binned - X = _BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32) + X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) @@ -155,7 +155,7 @@ def test_same_predictions_multiclass_classification( rng = np.random.RandomState(seed=seed) n_samples = n_samples max_iter = 1 - max_bins = 256 + max_bins = 255 lr = 1 X, y = make_classification(n_samples=n_samples, n_classes=3, n_features=5, @@ -165,7 +165,7 @@ def test_same_predictions_multiclass_classification( if n_samples > 255: # bin data and convert it to float32 so that the estimator doesn't # treat it as pre-binned - X = _BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32) + X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 007ade8163c2d..9e5f1d2088428 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -30,7 +30,7 @@ ({'min_samples_leaf': 0}, 'min_samples_leaf=0 should not be smaller'), ({'l2_regularization': -1}, 'l2_regularization=-1 must be positive'), ({'max_bins': 1}, 'max_bins=1 should be no smaller than 2 and no larger'), - ({'max_bins': 257}, 'max_bins=257 should be no smaller than 2 and no'), + ({'max_bins': 256}, 'max_bins=256 should be no smaller than 2 and no'), ({'n_iter_no_change': -1}, 'n_iter_no_change=-1 must be positive'), ({'validation_fraction': -1}, 'validation_fraction=-1 must be strictly'), ({'validation_fraction': 0}, 'validation_fraction=0 must be strictly'), @@ -169,10 +169,10 @@ def test_binning_train_validation_are_separated(): mapper_whole_data.fit(X_classification) 
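The assertions that follow rely on a simple rule: a feature with k distinct non-missing values uses k bins when k fits within the non-missing bins, and n_bins - 1 bins otherwise. A numpy-only sketch of that rule (the helper name is hypothetical, not an API from the patch):

import numpy as np

def expected_n_bins_non_missing(col, n_bins=256):
    # A feature with k distinct non-missing values gets k bins if k fits,
    # and n_bins - 1 bins otherwise.
    n_unique = np.unique(col[~np.isnan(col)]).size
    return min(n_bins - 1, n_unique)

rng = np.random.RandomState(0)
assert expected_n_bins_non_missing(rng.normal(size=60)) == 60       # k fits
assert expected_n_bins_non_missing(rng.normal(size=10000)) == 255   # capped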
n_samples = X_classification.shape[0] - assert np.all(mapper_training_data.actual_n_bins_ == - int((1 - validation_fraction) * n_samples) + 1) - assert np.all(mapper_training_data.actual_n_bins_ != - mapper_whole_data.actual_n_bins_) + assert np.all(mapper_training_data.n_bins_non_missing_ == + int((1 - validation_fraction) * n_samples)) + assert np.all(mapper_training_data.n_bins_non_missing_ != + mapper_whole_data.n_bins_non_missing_) def test_missing_values_trivial(): diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index 3763cc10e06a2..49b9a9dc031ba 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -15,7 +15,7 @@ def _make_training_data(n_bins=256, constant_hessian=True): # Generate some test data directly binned so as to test the grower code # independently of the binning logic. - X_binned = rng.randint(1, n_bins - 1, size=(n_samples, 2), + X_binned = rng.randint(0, n_bins - 1, size=(n_samples, 2), dtype=X_BINNED_DTYPE) X_binned = np.asfortranarray(X_binned) @@ -85,7 +85,7 @@ def test_grow_tree(n_bins, constant_hessian, stopping_param, shrinkage): stopping_param = {"min_gain_to_split": 0.01} grower = TreeGrower(X_binned, all_gradients, all_hessians, - max_bins=n_bins, shrinkage=shrinkage, + n_bins=n_bins, shrinkage=shrinkage, min_samples_leaf=1, **stopping_param) # The root node is not yet splitted, but the best possible split has @@ -147,7 +147,7 @@ def test_predictor_from_grower(): X_binned, all_gradients, all_hessians = _make_training_data( n_bins=n_bins) grower = TreeGrower(X_binned, all_gradients, all_hessians, - max_bins=n_bins, shrinkage=1., + n_bins=n_bins, shrinkage=1., max_leaf_nodes=3, min_samples_leaf=5) grower.grow() assert grower.n_nodes == 5 # (2 decision nodes + 3 leaves) @@ -161,24 +161,27 @@ def test_predictor_from_grower(): # Probe some predictions for each leaf of the tree # each group of 3 samples corresponds to a condition in _make_training_data input_data = np.array([ - [1, 1], + [0, 0], [42, 99], - [128, 255], + [128, 254], - [129, 1], + [129, 0], [129, 85], - [255, 85], + [254, 85], [129, 86], - [129, 255], + [129, 254], [242, 100], ], dtype=np.uint8) - predictions = predictor.predict_binned(input_data) + missing_values_bin_idx = n_bins - 1 + predictions = predictor.predict_binned(input_data, missing_values_bin_idx) expected_targets = [1, 1, 1, 1, 1, 1, -1, -1, -1] assert np.allclose(predictions, expected_targets) # Check that training set can be recovered exactly: - predictions = predictor.predict_binned(X_binned) + predictions = predictor.predict_binned(X_binned, missing_values_bin_idx) + print() + print(np.sum(predictions != -all_gradients)) assert np.allclose(predictions, -all_gradients) @@ -203,14 +206,14 @@ def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins, if noise: y_scale = y.std() y += rng.normal(scale=noise, size=n_samples) * y_scale - mapper = _BinMapper(max_bins=n_bins) + mapper = _BinMapper(n_bins=n_bins) X = mapper.fit_transform(X) all_gradients = y.astype(G_H_DTYPE) shape_hessian = 1 if constant_hessian else all_gradients.shape all_hessians = np.ones(shape=shape_hessian, dtype=G_H_DTYPE) grower = TreeGrower(X, all_gradients, all_hessians, - max_bins=n_bins, shrinkage=1., + n_bins=n_bins, shrinkage=1., min_samples_leaf=min_samples_leaf, max_leaf_nodes=n_samples) grower.grow() @@ -235,18 +238,18 @@ def test_min_samples_leaf_root(n_samples, 
min_samples_leaf): # min_samples_leaf rng = np.random.RandomState(seed=0) - max_bins = 255 + n_bins = 256 # data = linear target, 3 features, 1 irrelevant. X = rng.normal(size=(n_samples, 3)) y = X[:, 0] - X[:, 1] - mapper = _BinMapper(max_bins=max_bins) + mapper = _BinMapper(n_bins=n_bins) X = mapper.fit_transform(X) all_gradients = y.astype(G_H_DTYPE) all_hessians = np.ones(shape=1, dtype=G_H_DTYPE) grower = TreeGrower(X, all_gradients, all_hessians, - max_bins=max_bins, shrinkage=1., + n_bins=n_bins, shrinkage=1., min_samples_leaf=min_samples_leaf, max_leaf_nodes=n_samples) grower.grow() @@ -261,13 +264,13 @@ def test_max_depth(max_depth): # Make sure max_depth parameter works as expected rng = np.random.RandomState(seed=0) - max_bins = 255 + n_bins = 256 n_samples = 1000 # data = linear target, 3 features, 1 irrelevant. X = rng.normal(size=(n_samples, 3)) y = X[:, 0] - X[:, 1] - mapper = _BinMapper(max_bins=max_bins) + mapper = _BinMapper(n_bins=n_bins) X = mapper.fit_transform(X) all_gradients = y.astype(G_H_DTYPE) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py index 80a56bfe78ded..c7bd1abc0b347 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py @@ -9,13 +9,13 @@ from sklearn.ensemble._hist_gradient_boosting.types import G_H_DTYPE -@pytest.mark.parametrize('max_bins', [200, 256]) -def test_boston_dataset(max_bins): +@pytest.mark.parametrize('n_bins', [200, 256]) +def test_boston_dataset(n_bins): boston = load_boston() X_train, X_test, y_train, y_test = train_test_split( boston.data, boston.target, random_state=42) - mapper = _BinMapper(max_bins=max_bins, random_state=42) + mapper = _BinMapper(n_bins=n_bins, random_state=42) X_train_binned = mapper.fit_transform(X_train) # Init gradients and hessians to that of least squares loss @@ -26,8 +26,8 @@ def test_boston_dataset(max_bins): max_leaf_nodes = 31 grower = TreeGrower(X_train_binned, gradients, hessians, min_samples_leaf=min_samples_leaf, - max_leaf_nodes=max_leaf_nodes, max_bins=max_bins, - actual_n_bins=mapper.actual_n_bins_) + max_leaf_nodes=max_leaf_nodes, n_bins=n_bins, + n_bins_non_missing=mapper.n_bins_non_missing_) grower.grow() predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index b23111fe0142b..f7fe8c6d20d6a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -17,7 +17,7 @@ def test_histogram_split(n_bins): min_samples_leaf = 1 min_gain_to_split = 0. 
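For reference, the routing rule that predict_binned now applies to binned data (exercised by the grower and predictor tests above) can be sketched in plain Python. This is a simplified stand-in for the Cython traversal; the dict-based node layout is chosen for the sketch only and is not the patch's node struct:

# A sample whose binned value equals missing_values_bin_idx follows the
# branch recorded in the node's missing_go_to_left flag; other samples are
# compared against the node's bin threshold.
def predict_one_binned(nodes, binned_row, missing_values_bin_idx):
    node = nodes[0]  # root
    while True:
        if node['is_leaf']:
            return node['value']
        if binned_row[node['feature_idx']] == missing_values_bin_idx:
            go_left = node['missing_go_to_left']
        else:
            go_left = binned_row[node['feature_idx']] <= node['bin_threshold']
        node = nodes[node['left'] if go_left else node['right']]

nodes = [
    {'is_leaf': False, 'feature_idx': 0, 'bin_threshold': 3,
     'missing_go_to_left': True, 'left': 1, 'right': 2},
    {'is_leaf': True, 'value': -1.0},
    {'is_leaf': True, 'value': +1.0},
]
assert predict_one_binned(nodes, [255], missing_values_bin_idx=255) == -1.0
assert predict_one_binned(nodes, [7], missing_values_bin_idx=255) == +1.0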
X_binned = np.asfortranarray( - rng.randint(0, n_bins, size=(int(1e4), 1)), dtype=X_BINNED_DTYPE) + rng.randint(0, n_bins - 1, size=(int(1e4), 1)), dtype=X_BINNED_DTYPE) binned_feature = X_binned.T[feature_idx] sample_indices = np.arange(binned_feature.shape[0], dtype=np.uint32) ordered_hessians = np.ones_like(binned_feature, dtype=G_H_DTYPE) @@ -25,7 +25,7 @@ def test_histogram_split(n_bins): sum_hessians = all_hessians.sum() hessians_are_constant = False - for true_bin in range(1, n_bins - 1): + for true_bin in range(1, n_bins - 2): for sign in [-1, 1]: ordered_gradients = np.full_like(binned_feature, sign, dtype=G_H_DTYPE) @@ -33,8 +33,8 @@ def test_histogram_split(n_bins): all_gradients = ordered_gradients sum_gradients = all_gradients.sum() - actual_n_bins = np.array([n_bins] * X_binned.shape[1], - dtype=np.uint32) + n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], + dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) builder = HistogramBuilder(X_binned, @@ -42,8 +42,10 @@ def test_histogram_split(n_bins): all_gradients, all_hessians, hessians_are_constant) + missing_values_bin_idx = n_bins - 1 splitter = Splitter(X_binned, - actual_n_bins, + n_bins_non_missing, + missing_values_bin_idx, has_missing_values, l2_regularization, min_hessian_to_split, @@ -97,15 +99,16 @@ def test_gradient_and_hessian_sanity(constant_hessian): all_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE) sum_hessians = all_hessians.sum() - actual_n_bins = np.array([n_bins] * X_binned.shape[1], + n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, constant_hessian) - splitter = Splitter(X_binned, actual_n_bins, has_missing_values, - l2_regularization, min_hessian_to_split, - min_samples_leaf, min_gain_to_split, - constant_hessian) + missing_values_bin_idx = n_bins - 1 + splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx, + has_missing_values, l2_regularization, + min_hessian_to_split, min_samples_leaf, + min_gain_to_split, constant_hessian) hists_parent = builder.compute_histograms_brute(sample_indices) si_parent = splitter.find_node_split(n_samples, hists_parent, @@ -177,16 +180,16 @@ def test_split_indices(): min_gain_to_split = 0. 
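The loop bounds used by the splitter earlier in this patch follow from a counting argument: for a feature without missing values, splitting on its last used bin would leave the right child empty, so candidate split bins run from 0 to n_bins_non_missing - 2. A small illustration with made-up counts (not part of the patch):

import numpy as np

n_bins_non_missing = 6
counts = np.array([4, 3, 0, 5, 2, 1])   # per-bin sample counts for a feature

for bin_idx in range(n_bins_non_missing - 1):   # bins 0..4, as in the patch
    n_left = counts[:bin_idx + 1].sum()
    n_right = counts.sum() - n_left
    print(bin_idx, n_left, n_right)
# Splitting on bin 5 (the last used bin) would give n_right == 0 here,
# which is why the scan stops at n_bins_non_missing - 2.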
# split will happen on feature 1 and on bin 3 - X_binned = [[1, 1], - [1, 3], - [1, 4], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 4], - [1, 1], - [1, 4]] + X_binned = [[0, 0], + [0, 3], + [0, 4], + [0, 0], + [0, 0], + [0, 0], + [0, 0], + [0, 4], + [0, 0], + [0, 4]] X_binned = np.asfortranarray(X_binned, dtype=X_BINNED_DTYPE) sample_indices = np.arange(n_samples, dtype=np.uint32) all_gradients = rng.randn(n_samples).astype(G_H_DTYPE) @@ -195,16 +198,17 @@ def test_split_indices(): sum_hessians = 1 * n_samples hessians_are_constant = True - actual_n_bins = np.array([n_bins] * X_binned.shape[1], - dtype=np.uint32) + n_bins_non_missing = np.array([n_bins] * X_binned.shape[1], + dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant) - splitter = Splitter(X_binned, actual_n_bins, has_missing_values, - l2_regularization, min_hessian_to_split, - min_samples_leaf, min_gain_to_split, - hessians_are_constant) + missing_values_bin_idx = n_bins - 1 + splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx, + has_missing_values, l2_regularization, + min_hessian_to_split, min_samples_leaf, + min_gain_to_split, hessians_are_constant) assert np.all(sample_indices == splitter.partition) @@ -252,15 +256,16 @@ def test_min_gain_to_split(): sum_hessians = all_hessians.sum() hessians_are_constant = False - actual_n_bins = np.array([n_bins] * X_binned.shape[1], + n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant) - splitter = Splitter(X_binned, actual_n_bins, has_missing_values, - l2_regularization, min_hessian_to_split, - min_samples_leaf, min_gain_to_split, - hessians_are_constant) + missing_values_bin_idx = n_bins - 1 + splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx, + has_missing_values, l2_regularization, + min_hessian_to_split, min_samples_leaf, + min_gain_to_split, hessians_are_constant) histograms = builder.compute_histograms_brute(sample_indices) split_info = splitter.find_node_split(n_samples, histograms, @@ -269,40 +274,41 @@ def test_min_gain_to_split(): @pytest.mark.parametrize( - 'X_binned, all_gradients, has_missing_values, expected_bin_idx, ' - 'expected_go_to_left', [ + 'X_binned, all_gradients, has_missing_values, n_bins_non_missing, ' + ' expected_bin_idx, expected_go_to_left', [ # basic sanity check with no missing values: given the gradient - # values, the split must occur on bin_idx=4 - ([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], # X_binned + # values, the split must occur on bin_idx=3 + ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], # X_binned [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], # gradients False, # no missing values - 4, # expected_bin_idx + 10, # n_bins_non_missing + 3, # expected_bin_idx 'not_applicable'), - # We replace 2 samples by NaNs (bin_idx=0) + # We replace 2 samples by NaNs (bin_idx=9) # These 2 samples were mapped to the left node before, so they should # be mapped to left node again - # Notice how the bin_idx threshold changes from 3 to 2. - # Also, the bins of the previous non-nan samples have bin shiffted by - # one - ([0, 1, 2, 0, 3, 4, 5, 6, 7, 8], # missing values are the zeros + # Notice how the bin_idx threshold changes from 3 to 1. 
+ ([9, 0, 1, 9, 2, 3, 4, 5, 6, 7], # 9 <=> missing [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], - True, # missing values (bin_idx=0) - 2, # cut on bin_idx=2 + True, # missing values + 8, # n_bins_non_missing + 1, # cut on bin_idx=1 True), # missing values go to left # Same, this time replacing 2 samples that were on the right. - ([1, 2, 3, 4, 0, 5, 0, 6, 7, 8], + ([0, 1, 2, 3, 9, 4, 9, 5, 6, 7], # 9 <=> missing [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], - True, # missing values (bin_idx=0) - 4, # cut on bin_idx=4 (like in first case, with +1 because of offset) + True, # missing values + 8, # n_bins_non_missing + 3, # cut on bin_idx=3 (like in first case) False), # missing values go to right ] ) def test_splitting_missing_values(X_binned, all_gradients, - has_missing_values, expected_bin_idx, - expected_go_to_left): + has_missing_values, n_bins_non_missing, + expected_bin_idx, expected_go_to_left): # Make sure missing values are properly supported. # we build an artificial example with gradients such that the best split # is on bin_idx=4, when there are no missing values. @@ -312,7 +318,7 @@ def test_splitting_missing_values(X_binned, all_gradients, # - make sure the missing values are mapped to the correct child # (split_indices()) - max_bins = max(X_binned) + 1 + n_bins = max(X_binned) + 1 n_samples = len(X_binned) l2_regularization = 0. min_hessian_to_split = 1e-3 @@ -329,11 +335,14 @@ def test_splitting_missing_values(X_binned, all_gradients, sum_hessians = 1 * n_samples hessians_are_constant = True - actual_n_bins = np.array([X_binned.max() + 1], dtype=np.uint32) - builder = HistogramBuilder(X_binned, max_bins, + builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant) - splitter = Splitter(X_binned, actual_n_bins, has_missing_values, + + n_bins_non_missing = np.array([n_bins_non_missing], dtype=np.uint32) + missing_values_bin_idx = n_bins - 1 + splitter = Splitter(X_binned, n_bins_non_missing, + missing_values_bin_idx, has_missing_values, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, hessians_are_constant) From 457e720f78fa173bbcad0a039a2a0e52219462c1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 28 Jun 2019 10:25:06 -0400 Subject: [PATCH 41/76] some doc --- .../_hist_gradient_boosting/binning.py | 31 +++++++++++++++---- .../gradient_boosting.py | 11 +++++-- .../_hist_gradient_boosting/grower.py | 19 ++++++------ .../_hist_gradient_boosting/histogram.pyx | 30 +++++++++--------- .../_hist_gradient_boosting/splitting.pyx | 11 +++++-- .../tests/test_binning.py | 2 ++ .../tests/test_gradient_boosting.py | 5 +-- .../tests/test_splitting.py | 24 +++++++------- 8 files changed, 84 insertions(+), 49 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 8a0bbbdd99985..176fd73a44e95 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -47,6 +47,8 @@ def _find_binning_thresholds(data, n_bins, subsample, random_state): be used to separate the bins. Thus ``len(binning_thresholds) == n_features``. The first threshold (for missing values) is always NaN. """ + # n_bins must be >= 3: 1 bin is for missing values, and it woudn't make + # sense to bin non-missing values into only 1 bin. 
if not (3 <= n_bins <= 256): raise ValueError('n_bins={} should be no smaller than 3 ' 'and no larger than 256.'.format(n_bins)) @@ -96,9 +98,9 @@ class _BinMapper(BaseEstimator, TransformerMixin): For large datasets, quantiles are computed on a subset of the data to speed-up the binning, but the quantiles should remain stable. - If the number of unique values for a given feature is less than - ``n_bins``, then the unique values of this feature are used instead of - the quantiles. + Features with a small number of values may be binned into less than + ``n_bins`` bins. The last bin (at index ``n_bins - 1``) is always reserved + for missing values. Parameters ---------- @@ -117,6 +119,23 @@ class _BinMapper(BaseEstimator, TransformerMixin): optional (default=None) Pseudo-random number generator to control the random sub-sampling. See :term:`random_state`. + + Attributes + ---------- + bin_thresholds_ : list of arrays + For each feature, gives the real-valued bin threhsolds. There are + ``max_bins - 1`` thresholds, where ``max_bins = n_bins - 1`` is the + number of bins used for non-missing values. + n_bins_non_missing_ : array of uint32 + For each feature, gives the number of bins actually used for + non-missing values. For features with a lot of unique values, this is + equal to ``n_bins - 1``. + missing_values_bin_idx_ : uint8 + The index of the bin where missing values are mapped. This is a + constant accross all features. This corresponds to the last bin, and + it is always equal to ``n_bins - 1``. Note that if ``n_bins_missing_`` + is less than ``n_bins - 1`` for a given feature, then there are + empty (an unused) bins. """ def __init__(self, n_bins=256, subsample=int(2e5), random_state=None): self.n_bins = n_bins @@ -126,8 +145,8 @@ def __init__(self, n_bins=256, subsample=int(2e5), random_state=None): def fit(self, X, y=None): """Fit data X by computing the binning thresholds. - The first bin is reserved for missing values, whether there are - missing values or not. + The last bin is reserved for missing values, whether there are + missing values present in the data or not. Parameters ---------- @@ -156,7 +175,7 @@ def fit(self, X, y=None): def transform(self, X): """Bin data X. - Missing values will be mapped to the first bin. + Missing values will be mapped to the last bin. Parameters ---------- diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index bbab7cd154052..bf9be94fde872 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -150,6 +150,13 @@ def fit(self, X, y): has_missing_values = np.isnan(X_train).any(axis=0).astype(np.uint8) # Bin the data + # For ease of use of the API, the user-facing GBDT classes accept the + # parameter max_bins, which doesn't take into account the bin for + # missing values (which is always allocated). However, since max_bins + # isn't the true maximal number of bins, all other private classes + # (binmapper, histbuilder...) accept n_bins instead, which is the + # actual total number of bins. 
Everywhere in the code, the + # convention is that n_bins == max_bins + 1 n_bins = self.max_bins + 1 # + 1 for missing values self.bin_mapper_ = _BinMapper(n_bins=n_bins, random_state=rng) X_binned_train = self._bin_data(X_train, rng, is_training_data=True) @@ -662,7 +669,7 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): integer-valued bins, which allows for a much faster training stage. Features with a small number of unique values may use less than ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin - is reserved for missing values. Must be no larger than 255. + is always reserved for missing values. Must be no larger than 255. warm_start : bool, optional (default=False) When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble. For results to be valid, the @@ -846,7 +853,7 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, integer-valued bins, which allows for a much faster training stage. Features with a small number of unique values may use less than ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin - is reserved for missing values. Must be no larger than 255. + is always reserved for missing values. Must be no larger than 255. warm_start : bool, optional (default=False) When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble. For results to be valid, the diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 869806b4bf2d4..62635ff086ded 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -141,17 +141,18 @@ class TreeGrower: min_gain_to_split : float, optional (default=0.) The minimum gain needed to split a node. Splits with lower gain will be ignored. - max_bins : int, optional (default=256) - The maximum number of bins. Used to define the shape of the - histograms. - actual_n_bins : ndarray of int or int, optional (default=None) - The actual number of bins needed for each feature, which is lower or - equal to ``max_bins``. If it's an int, all features are considered to - have the same number of bins. If None, all features are considered to - have ``max_bins`` bins. + n_bins : int, optional (default=256) + The total number of bins, including the bin for missing values. Used + to define the shape of the histograms. + n_bins_non_missing_ : array of uint32 + For each feature, gives the number of bins actually used for + non-missing values. For features with a lot of unique values, this + is equal to ``n_bins - 1``. If it's an int, all features are + considered to have the same number of bins. If None, all features + are considered to have ``n_bins - 1`` bins. has_missing_values : ndarray of bool or bool, optional (default=False) Whether each feature contains missing values (in the training data). - If it's a bool, the same values is used for all features. + If it's a bool, the same value is used for all features. l2_regularization : float, optional (default=0) The L2 regularization parameter. 
min_hessian_to_split : float, optional (default=1e-3) diff --git a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx index c83fa0c79db71..b6031bc86846f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx @@ -62,9 +62,9 @@ cdef class HistogramBuilder: ---------- X_binned : ndarray of int, shape (n_samples, n_features) The binned input samples. Must be Fortran-aligned. - max_bins : int - The maximum number of bins. Used to define the shape of the - histograms. + n_bins : int + The total number of bins, including the bin for missing values. Used + to define the shape of the histograms. gradients : ndarray, shape (n_samples,) The gradients of each training sample. Those are the gradients of the loss w.r.t the predictions, evaluated at iteration i - 1. @@ -77,7 +77,7 @@ cdef class HistogramBuilder: cdef public: const X_BINNED_DTYPE_C [::1, :] X_binned unsigned int n_features - unsigned int max_bins + unsigned int n_bins G_H_DTYPE_C [::1] gradients G_H_DTYPE_C [::1] hessians G_H_DTYPE_C [::1] ordered_gradients @@ -85,15 +85,15 @@ cdef class HistogramBuilder: unsigned char hessians_are_constant def __init__(self, const X_BINNED_DTYPE_C [::1, :] X_binned, - unsigned int max_bins, G_H_DTYPE_C [::1] gradients, + unsigned int n_bins, G_H_DTYPE_C [::1] gradients, G_H_DTYPE_C [::1] hessians, unsigned char hessians_are_constant): self.X_binned = X_binned self.n_features = X_binned.shape[1] - # Note: all histograms will have bins, but some of the - # last bins may be unused if actual_n_bins[f] < max_bins - self.max_bins = max_bins + # Note: all histograms will have bins, but some of the + # bins may be unused if a feature has a small number of unique values. + self.n_bins = n_bins self.gradients = gradients self.hessians = hessians # for root node, gradients and hessians are already ordered @@ -115,7 +115,7 @@ cdef class HistogramBuilder: Returns ------- - histograms : ndarray of HISTOGRAM_DTYPE, shape (n_features, max_bins) + histograms : ndarray of HISTOGRAM_DTYPE, shape (n_features, n_bins) The computed histograms of the current node. """ cdef: @@ -131,7 +131,7 @@ cdef class HistogramBuilder: G_H_DTYPE_C [::1] ordered_hessians = self.ordered_hessians G_H_DTYPE_C [::1] hessians = self.hessians hist_struct [:, ::1] histograms = np.zeros( - shape=(self.n_features, self.max_bins), + shape=(self.n_features, self.n_bins), dtype=HISTOGRAM_DTYPE ) @@ -210,15 +210,15 @@ cdef class HistogramBuilder: Parameters ---------- parent_histograms : ndarray of HISTOGRAM_DTYPE, \ - shape (n_features, max_bins) + shape (n_features, n_bins) The histograms of the parent. sibling_histograms : ndarray of HISTOGRAM_DTYPE, \ - shape (n_features, max_bins) + shape (n_features, n_bins) The histograms of the sibling. Returns ------- - histograms : ndarray of HISTOGRAM_DTYPE, shape(n_features, max_bins) + histograms : ndarray of HISTOGRAM_DTYPE, shape(n_features, n_bins) The computed histograms of the current node. 
""" @@ -226,14 +226,14 @@ cdef class HistogramBuilder: int feature_idx int n_features = self.n_features hist_struct [:, ::1] histograms = np.zeros( - shape=(self.n_features, self.max_bins), + shape=(self.n_features, self.n_bins), dtype=HISTOGRAM_DTYPE ) for feature_idx in prange(n_features, schedule='static', nogil=True): # Compute histogram of each feature _subtract_histograms(feature_idx, - self.max_bins, + self.n_bins, parent_histograms, sibling_histograms, histograms) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index e51b5e7cb4307..923669c6049b6 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -96,10 +96,15 @@ cdef class Splitter: X_binned : ndarray of int, shape (n_samples, n_features) The binned input samples. Must be Fortran-aligned. n_bins_non_missing : ndarray, shape (n_features,) - The actual number of bins needed for each feature, which is lower or - equal to max_bins. + For each feature, gives the number of bins actually used for + non-missing values. + missing_values_bin_idx : uint8 + Index of the bin that is used for missing values. This is the index of + the last bin and is always equal to max_bins (as passed to the GBDT + classes), or equivalently to n_bins - 1. has_missing_values : ndarray, shape (n_features,) - Whether each feature contains missing values (in the training data). + Whether missing values were observed in the training data, for each + feature. l2_regularization : float The L2 regularization parameter. min_hessian_to_split : float, default=1e-3 diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index 86623f2c6ec3c..9aba284c1d9de 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -288,5 +288,7 @@ def test_missing_values_support(n_bins, n_bins_non_missing, X_trans_expected): assert len(mapper.bin_thresholds_[feature_idx]) == \ n_bins_non_missing[feature_idx] - 1 + assert mapper.missing_values_bin_idx_ == n_bins - 1 + X_trans = mapper.transform(X) assert_array_equal(X_trans, X_trans_expected) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 9e5f1d2088428..4121151db025b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -177,7 +177,8 @@ def test_binning_train_validation_are_separated(): def test_missing_values_trivial(): # sanity check for missing values support. With only one feature and - # y == isnan(X), the gbdt is supposed to reach perfect accuracy. + # y == isnan(X), the gbdt is supposed to reach perfect accuracy on the + # training set. 
n_samples = 100 n_features = 1 @@ -190,7 +191,7 @@ def test_missing_values_trivial(): gb = HistGradientBoostingClassifier() gb.fit(X, y) - assert gb.score(X, y) == 1 + assert gb.score(X, y) == pytest.approx(1) @pytest.mark.parametrize('problem', ('classification', 'regression')) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index f7fe8c6d20d6a..746dc6bb0dd2b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -33,15 +33,15 @@ def test_histogram_split(n_bins): all_gradients = ordered_gradients sum_gradients = all_gradients.sum() - n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], - dtype=np.uint32) - has_missing_values = np.array([False] * X_binned.shape[1], - dtype=np.uint8) builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant) + n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], + dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], + dtype=np.uint8) missing_values_bin_idx = n_bins - 1 splitter = Splitter(X_binned, n_bins_non_missing, @@ -99,11 +99,11 @@ def test_gradient_and_hessian_sanity(constant_hessian): all_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE) sum_hessians = all_hessians.sum() + builder = HistogramBuilder(X_binned, n_bins, all_gradients, + all_hessians, constant_hessian) n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) - builder = HistogramBuilder(X_binned, n_bins, all_gradients, - all_hessians, constant_hessian) missing_values_bin_idx = n_bins - 1 splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx, has_missing_values, l2_regularization, @@ -198,12 +198,12 @@ def test_split_indices(): sum_hessians = 1 * n_samples hessians_are_constant = True - n_bins_non_missing = np.array([n_bins] * X_binned.shape[1], - dtype=np.uint32) - has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant) + n_bins_non_missing = np.array([n_bins] * X_binned.shape[1], + dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) missing_values_bin_idx = n_bins - 1 splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx, has_missing_values, l2_regularization, @@ -256,11 +256,11 @@ def test_min_gain_to_split(): sum_hessians = all_hessians.sum() hessians_are_constant = False + builder = HistogramBuilder(X_binned, n_bins, all_gradients, + all_hessians, hessians_are_constant) n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) - builder = HistogramBuilder(X_binned, n_bins, all_gradients, - all_hessians, hessians_are_constant) missing_values_bin_idx = n_bins - 1 splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx, has_missing_values, l2_regularization, @@ -311,7 +311,7 @@ def test_splitting_missing_values(X_binned, all_gradients, expected_bin_idx, expected_go_to_left): # Make sure missing values are properly supported. # we build an artificial example with gradients such that the best split - # is on bin_idx=4, when there are no missing values. 
+ # is on bin_idx=3, when there are no missing values. # Then we introduce missing values and: # - make sure the chosen bin is correct (find_best_bin()): it's # still the same split, even though the index of the bin changes From 45c5068ae1571acb09931cd7f31666a8f26e0a45 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 28 Jun 2019 10:57:51 -0400 Subject: [PATCH 42/76] reduce diff --- .../_hist_gradient_boosting/binning.py | 27 +++++++++---------- .../_hist_gradient_boosting/predictor.py | 4 +++ .../tests/test_binning.py | 16 ++++++----- 3 files changed, 26 insertions(+), 21 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 176fd73a44e95..6ec263aa822b0 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -16,7 +16,7 @@ from .types import X_DTYPE, X_BINNED_DTYPE -def _find_binning_thresholds(data, n_bins, subsample, random_state): +def _find_binning_thresholds(data, max_bins, subsample, random_state): """Extract feature-wise quantiles from numerical data. Missing values are ignored for finding the thresholds. @@ -25,10 +25,8 @@ def _find_binning_thresholds(data, n_bins, subsample, random_state): ---------- data : array-like, shape (n_samples, n_features) The data to bin. - n_bins : int, optional (default=256) - The maximum number of bins to use (including the bin for missing - values). Non-missing values are binned on ``max_bins = n_bins - 1`` - bins. The last bin is always reserved for missing values. If for a + max_bins: int + The maximum number of bins to use for non-missing values. If for a given feature the number of unique values is less than ``max_bins``, then those unique values will be used to compute the bin thresholds, instead of the quantiles. @@ -47,11 +45,6 @@ def _find_binning_thresholds(data, n_bins, subsample, random_state): be used to separate the bins. Thus ``len(binning_thresholds) == n_features``. The first threshold (for missing values) is always NaN. """ - # n_bins must be >= 3: 1 bin is for missing values, and it woudn't make - # sense to bin non-missing values into only 1 bin. - if not (3 <= n_bins <= 256): - raise ValueError('n_bins={} should be no smaller than 3 ' - 'and no larger than 256.'.format(n_bins)) rng = check_random_state(random_state) if subsample is not None and data.shape[0] > subsample: subset = rng.choice(np.arange(data.shape[0]), subsample, replace=False) @@ -68,8 +61,7 @@ def _find_binning_thresholds(data, n_bins, subsample, random_state): distinct_values = np.unique(col_data) # maximum number of bins used for non missing values - max_n_bins_non_missing = n_bins - 1 - if len(distinct_values) <= max_n_bins_non_missing: + if len(distinct_values) <= max_bins: midpoints = distinct_values[:-1] + distinct_values[1:] midpoints *= .5 else: @@ -78,11 +70,11 @@ def _find_binning_thresholds(data, n_bins, subsample, random_state): # np.unique(col_data, return_counts) instead but this is more # work and the performance benefit will be limited because we # work on a fixed-size subsample of the full data. 
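A small runnable sketch of the quantile-based thresholds computed just below (illustration only, not part of the diff): with ``max_bins`` bins for non-missing values, the inner percentiles of the column are used, so ``len(thresholds) == max_bins - 1``. It mirrors the ``interpolation='midpoint'`` call in the patch (newer NumPy spells this ``method='midpoint'``):

    import numpy as np

    max_bins = 4
    col_data = np.arange(100, dtype=np.float64)
    percentiles = np.linspace(0, 100, num=max_bins + 1)[1:-1]
    midpoints = np.percentile(col_data, percentiles, interpolation='midpoint')
    assert midpoints.shape[0] == max_bins - 1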
- percentiles = np.linspace(0, 100, num=n_bins) + percentiles = np.linspace(0, 100, num=max_bins + 1) percentiles = percentiles[1:-1] midpoints = np.percentile(col_data, percentiles, interpolation='midpoint').astype(X_DTYPE) - assert midpoints.shape[0] == max_n_bins_non_missing - 1 + assert midpoints.shape[0] == max_bins - 1 binning_thresholds.append(midpoints) @@ -159,9 +151,14 @@ def fit(self, X, y=None): ------- self : object """ + if not (3 <= self.n_bins <= 256): + raise ValueError('n_bins={} should be no smaller than 3 ' + 'and no larger than 256.'.format(self.n_bins)) + X = check_array(X, dtype=[X_DTYPE], force_all_finite='allow-nan') + max_bins = self.n_bins - 1 self.bin_thresholds_ = _find_binning_thresholds( - X, self.n_bins, subsample=self.subsample, + X, max_bins, subsample=self.subsample, random_state=self.random_state) self.n_bins_non_missing_ = np.array( diff --git a/sklearn/ensemble/_hist_gradient_boosting/predictor.py b/sklearn/ensemble/_hist_gradient_boosting/predictor.py index 5323e0002db6b..d82082e0e8a97 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/predictor.py +++ b/sklearn/ensemble/_hist_gradient_boosting/predictor.py @@ -53,6 +53,10 @@ def predict_binned(self, X, missing_values_bin_idx): ---------- X : ndarray, shape (n_samples, n_features) The input samples. + missing_values_bin_idx : uint8 + Index of the bin that is used for missing values. This is the + index of the last bin and is always equal to max_bins (as passed + to the GBDT classes), or equivalently to n_bins - 1. Returns ------- diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index 9aba284c1d9de..e3664c87a1cec 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -18,8 +18,10 @@ def _find_binning_thresholds(data, max_bins=255, subsample=int(2e5), random_state=None): - n_bins = max_bins + 1 - return _find_binning_thresholds_orig(data, n_bins, subsample, random_state) + # Just a redef to avoid having to pass arguments all the time (as the + # function is private we don't use default values for parameters) + return _find_binning_thresholds_orig(data, max_bins, subsample, + random_state) def test_find_binning_thresholds_regular_data(): @@ -73,11 +75,13 @@ def test_find_binning_thresholds_low_n_bins(): assert bin_thresholds[i].dtype == DATA.dtype -def test_find_binning_thresholds_invalid_n_bins(): - err_msg = 'n_bins=1024 should be no smaller than 3 and no larger than 256' +@pytest.mark.parametrize('n_bins', (2, 257)) +def test_invalid_n_bins(n_bins): + err_msg = ( + 'n_bins={} should be no smaller than 3 and no larger than 256' + .format(n_bins)) with pytest.raises(ValueError, match=err_msg): - _find_binning_thresholds_orig(DATA, n_bins=1024, subsample=10, - random_state=None) + _BinMapper(n_bins=n_bins).fit(DATA) def test_bin_mapper_n_features_transform(): From 5a8fbe5478569a4a9070298167ce836152790070 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 28 Jun 2019 10:58:51 -0400 Subject: [PATCH 43/76] pep8 --- sklearn/ensemble/_hist_gradient_boosting/binning.py | 2 +- .../ensemble/_hist_gradient_boosting/tests/test_binning.py | 1 + .../ensemble/_hist_gradient_boosting/tests/test_splitting.py | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 6ec263aa822b0..1cb5ab494a79b 100644 --- 
a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -153,7 +153,7 @@ def fit(self, X, y=None): """ if not (3 <= self.n_bins <= 256): raise ValueError('n_bins={} should be no smaller than 3 ' - 'and no larger than 256.'.format(self.n_bins)) + 'and no larger than 256.'.format(self.n_bins)) X = check_array(X, dtype=[X_DTYPE], force_all_finite='allow-nan') max_bins = self.n_bins - 1 diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index e3664c87a1cec..199a9081f5c93 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -256,6 +256,7 @@ def test_subsample(): mapper_subsample.bin_thresholds_[feature], rtol=1e-4) + @pytest.mark.parametrize( 'n_bins, n_bins_non_missing, X_trans_expected', [ (256, [4, 2, 2], [[0, 0, 0], # 255 <=> missing value diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index 746dc6bb0dd2b..d2e70e045a04a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -102,7 +102,7 @@ def test_gradient_and_hessian_sanity(constant_hessian): builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, constant_hessian) n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], - dtype=np.uint32) + dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) missing_values_bin_idx = n_bins - 1 splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx, @@ -259,7 +259,7 @@ def test_min_gain_to_split(): builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant) n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], - dtype=np.uint32) + dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) missing_values_bin_idx = n_bins - 1 splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx, From 889835a619697834c375d7ee11b7e85c578b9503 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 28 Jun 2019 15:23:57 -0400 Subject: [PATCH 44/76] minor --- sklearn/ensemble/_hist_gradient_boosting/binning.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 1cb5ab494a79b..c090449e067b6 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -59,8 +59,6 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): col_data = col_data[~missing_mask] col_data = np.ascontiguousarray(col_data, dtype=X_DTYPE) distinct_values = np.unique(col_data) - - # maximum number of bins used for non missing values if len(distinct_values) <= max_bins: midpoints = distinct_values[:-1] + distinct_values[1:] midpoints *= .5 From 8d5e36edf5aca1edde8cf0cbde31ce88ec9e6c93 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 1 Jul 2019 10:22:00 -0400 Subject: [PATCH 45/76] remove prints --- sklearn/ensemble/_hist_gradient_boosting/binning.py | 2 +- sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py 
b/sklearn/ensemble/_hist_gradient_boosting/binning.py index c090449e067b6..5c8c10e412bd1 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -43,7 +43,7 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): binning_thresholds: list of arrays For each feature, stores the increasing numeric values that can be used to separate the bins. Thus ``len(binning_thresholds) == - n_features``. The first threshold (for missing values) is always NaN. + n_features``. """ rng = check_random_state(random_state) if subsample is not None and data.shape[0] > subsample: diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index 49b9a9dc031ba..14affe89166e6 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -180,8 +180,6 @@ def test_predictor_from_grower(): # Check that training set can be recovered exactly: predictions = predictor.predict_binned(X_binned, missing_values_bin_idx) - print() - print(np.sum(predictions != -all_gradients)) assert np.allclose(predictions, -all_gradients) From d28ab14ace0c7170820317ab4b38228ceef6712f Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 5 Jul 2019 11:06:05 +0200 Subject: [PATCH 46/76] towards nan only splits --- .../_hist_gradient_boosting/_predictor.pyx | 7 +- .../_hist_gradient_boosting/grower.py | 4 +- .../_hist_gradient_boosting/splitting.pyx | 76 +++++++++++++------ .../tests/test_gradient_boosting.py | 4 +- .../tests/test_splitting.py | 73 ++++++++++++++++++ 5 files changed, 137 insertions(+), 27 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx index 6f37e7efb83f0..e0eb9c5d448e7 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx @@ -43,7 +43,12 @@ cdef inline Y_DTYPE_C _predict_one_from_numeric_data( if node.is_leaf: return node.value - if isnan(numeric_data[row, node.feature_idx]): + if isnan(node.threshold): + if isnan(numeric_data[row, node.feature_idx]): + node = nodes[node.right] + else: + node = nodes[node.left] + elif isnan(numeric_data[row, node.feature_idx]): if node.missing_go_to_left: node = nodes[node.left] else: diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 62635ff086ded..4f4bd6aba4014 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -473,7 +473,9 @@ def _fill_predictor_node_array(predictor_nodes, grower_node, node['feature_idx'] = feature_idx node['bin_threshold'] = bin_idx node['missing_go_to_left'] = split_info.missing_go_to_left - if bin_thresholds is not None: + if split_info.bin_is_nan: + node['threshold'] = np.nan + elif bin_thresholds is not None: threshold = bin_thresholds[feature_idx][bin_idx] node['threshold'] = threshold next_free_idx += 1 diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 923669c6049b6..05413fa050a00 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -32,6 +32,7 @@ cdef struct split_info_struct: Y_DTYPE_C gain int feature_idx unsigned int bin_idx + unsigned char bin_is_nan 
unsigned char missing_go_to_left Y_DTYPE_C sum_gradient_left Y_DTYPE_C sum_gradient_right @@ -52,8 +53,10 @@ class SplitInfo: The index of the feature to be split. bin_idx : int The index of the bin on which the split is made. + bin_is_nan : bool + Whether the split has only NaN on one side. missing_go_to_left : bool - Whether missing values should go to the left child + Whether missing values should go to the left child. sum_gradient_left : float The sum of the gradients of all the samples in the left child. sum_hessian_left : float @@ -67,12 +70,14 @@ class SplitInfo: n_samples_right : int The number of samples in the right child. """ - def __init__(self, gain, feature_idx, bin_idx, missing_go_to_left, - sum_gradient_left, sum_hessian_left, sum_gradient_right, - sum_hessian_right, n_samples_left, n_samples_right): + def __init__(self, gain, feature_idx, bin_idx, bin_is_nan, + missing_go_to_left, sum_gradient_left, sum_hessian_left, + sum_gradient_right, sum_hessian_right, n_samples_left, + n_samples_right): self.gain = gain self.feature_idx = feature_idx self.bin_idx = bin_idx + self.bin_is_nan = bin_is_nan self.missing_go_to_left = missing_go_to_left self.sum_gradient_left = sum_gradient_left self.sum_hessian_left = sum_hessian_left @@ -248,6 +253,7 @@ cdef class Splitter: int n_samples = sample_indices.shape[0] X_BINNED_DTYPE_C bin_idx = split_info.bin_idx unsigned char missing_go_to_left = split_info.missing_go_to_left + unsigned char split_on_nan = split_info.bin_is_nan unsigned char missing_values_bin_idx = self.missing_values_bin_idx int feature_idx = split_info.feature_idx const X_BINNED_DTYPE_C [::1] X_binned = \ @@ -273,6 +279,7 @@ cdef class Splitter: int thread_idx int sample_idx int right_child_position + unsigned char turn_left int [:] left_offset = np.zeros(n_threads, dtype=np.int32) int [:] right_offset = np.zeros(n_threads, dtype=np.int32) @@ -294,20 +301,17 @@ cdef class Splitter: stop = start + sizes[thread_idx] for i in range(start, stop): sample_idx = sample_indices[i] - if X_binned[sample_idx] == missing_values_bin_idx: - if missing_go_to_left: - left_indices_buffer[start + left_count] = sample_idx - left_count = left_count + 1 - else: - right_indices_buffer[start + right_count] = sample_idx - right_count = right_count + 1 + turn_left = sample_goes_left( + split_on_nan, missing_go_to_left, + missing_values_bin_idx, bin_idx, + X_binned[sample_idx]) + + if turn_left: + left_indices_buffer[start + left_count] = sample_idx + left_count = left_count + 1 else: - if X_binned[sample_idx] <= bin_idx: - left_indices_buffer[start + left_count] = sample_idx - left_count = left_count + 1 - else: - right_indices_buffer[start + right_count] = sample_idx - right_count = right_count + 1 + right_indices_buffer[start + right_count] = sample_idx + right_count = right_count + 1 left_counts[thread_idx] = left_count right_counts[thread_idx] = right_count @@ -420,6 +424,7 @@ cdef class Splitter: split_info.gain, split_info.feature_idx, split_info.bin_idx, + split_info.bin_is_nan, split_info.missing_go_to_left, split_info.sum_gradient_left, split_info.sum_hessian_left, @@ -431,13 +436,13 @@ cdef class Splitter: free(split_infos) return out - cdef int _find_best_feature_to_split_helper( + cdef unsigned int _find_best_feature_to_split_helper( self, split_info_struct * split_infos) nogil: # IN """Returns the best feature among those in splits_infos.""" cdef: - int feature_idx - int best_feature_idx = 0 + unsigned int feature_idx + unsigned int best_feature_idx = 0 for feature_idx in range(1, 
self.n_features): if (split_infos[feature_idx].gain > @@ -467,7 +472,7 @@ cdef class Splitter: unsigned int n_samples_left unsigned int n_samples_right unsigned int n_samples_ = n_samples - unsigned int end = self.n_bins_non_missing[feature_idx] - 1 + unsigned int end = self.n_bins_non_missing[feature_idx] Y_DTYPE_C sum_hessian_left Y_DTYPE_C sum_hessian_right Y_DTYPE_C sum_gradient_left @@ -512,11 +517,14 @@ cdef class Splitter: sum_gradient_right, sum_hessian_right, negative_loss_current_node, self.l2_regularization) - + with gil: + print(gain) if gain > split_info.gain and gain > self.min_gain_to_split: split_info.gain = gain split_info.feature_idx = feature_idx split_info.bin_idx = bin_idx + # the split is on NaN if bin_idx happens at the end + split_info.bin_is_nan = bin_idx == end - 1 # we scan from left to right so missing values go to the right split_info.missing_go_to_left = False split_info.sum_gradient_left = sum_gradient_left @@ -556,7 +564,7 @@ cdef class Splitter: Y_DTYPE_C sum_gradient_right Y_DTYPE_C negative_loss_current_node Y_DTYPE_C gain - unsigned int start = self.n_bins_non_missing[feature_idx] - 2 + unsigned int start = self.n_bins_non_missing[feature_idx] - 1 sum_gradient_right, sum_hessian_right = 0., 0. n_samples_right = 0 @@ -596,10 +604,14 @@ cdef class Splitter: negative_loss_current_node, self.l2_regularization) + with gil: + print(gain) if gain > split_info.gain and gain > self.min_gain_to_split: split_info.gain = gain split_info.feature_idx = feature_idx split_info.bin_idx = bin_idx + # the split is on NaN if bin_idx happens at the end + split_info.bin_is_nan = bin_idx == start # we scan from right to left so missing values go to the left split_info.missing_go_to_left = True split_info.sum_gradient_left = sum_gradient_left @@ -639,3 +651,21 @@ cdef inline Y_DTYPE_C negative_loss( Y_DTYPE_C hessian, Y_DTYPE_C l2_regularization) nogil: return (gradient * gradient) / (hessian + l2_regularization) + +cdef inline unsigned char sample_goes_left( + unsigned char split_on_nan, + unsigned char missing_go_to_left, + unsigned char missing_values_bin_idx, + X_BINNED_DTYPE_C split_bin_idx, + X_BINNED_DTYPE_C bin_value) nogil: + return ( + ( + split_on_nan and + bin_value != missing_values_bin_idx) + or ( + missing_go_to_left and + bin_value == missing_values_bin_idx + ) + or ( + bin_value <= split_bin_idx + )) \ No newline at end of file diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 4121151db025b..2df5bf6409309 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -198,8 +198,8 @@ def test_missing_values_trivial(): @pytest.mark.parametrize( 'missing_proportion, expected_min_score_classification, ' 'expected_min_score_regression', [ - (.1, .97, .9), - (.2, .94, .82), + (.1, .97, .89), + (.2, .93, .81), (.5, .79, .52)]) def test_missing_values_resilience(problem, missing_proportion, expected_min_score_classification, diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index d2e70e045a04a..01c8a33088849 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -362,3 +362,76 @@ def test_splitting_missing_values(X_binned, all_gradients, split_info, 
splitter.partition) assert set(samples_left) == set([0, 1, 2, 3]) assert set(samples_right) == set([4, 5, 6, 7, 8, 9]) + + +@pytest.mark.parametrize( + 'X_binned, all_gradients, has_missing_values, n_bins_non_missing, ' + ' expected_bin_idx, bin_is_nan, expected_go_to_left', [ + + ([0, 1, 2, 3, 7, 8, 9, 9, 9, 9], # 9 <=> missing + [1, 1, 1, 1, 1, 1, 5, 5, 5, 5], + True, # missing values + 8, # n_bins_non_missing + 7, # cut on bin_idx=max_bins - 1 + True, # bin_is_nan + False), # missing values go to right + + ([9, 9, 9, 9, 9, 9, 1, 3, 8, 6], # 9 <=> missing + [1, 1, 1, 1, 1, 1, 5, 5, 5, 5], + True, # missing values + 8, # n_bins_non_missing + 7, # cut on bin_idx=max_bins - 1 + True, # bin_is_nan + False), # missing values go to right + ] +) +def test_splitting_missing_values_edge_case(X_binned, all_gradients, + has_missing_values, n_bins_non_missing, + expected_bin_idx, bin_is_nan, + expected_go_to_left): + print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%") + n_bins = max(X_binned) + 1 + n_samples = len(X_binned) + l2_regularization = 0. + min_hessian_to_split = 1e-3 + min_samples_leaf = 1 + min_gain_to_split = 0. + + sample_indices = np.arange(n_samples, dtype=np.uint32) + X_binned = np.array(X_binned, dtype=X_BINNED_DTYPE).reshape(-1, 1) + X_binned = np.asfortranarray(X_binned) + all_gradients = np.array(all_gradients, dtype=G_H_DTYPE) + has_missing_values = np.array([has_missing_values], dtype=np.uint8) + all_hessians = np.ones(1, dtype=G_H_DTYPE) + sum_gradients = all_gradients.sum() + sum_hessians = 1 * n_samples + hessians_are_constant = True + + builder = HistogramBuilder(X_binned, n_bins, + all_gradients, all_hessians, + hessians_are_constant) + + n_bins_non_missing = np.array([n_bins_non_missing], dtype=np.uint32) + missing_values_bin_idx = n_bins - 1 + splitter = Splitter(X_binned, n_bins_non_missing, + missing_values_bin_idx, has_missing_values, + l2_regularization, min_hessian_to_split, + min_samples_leaf, min_gain_to_split, + hessians_are_constant) + + histograms = builder.compute_histograms_brute(sample_indices) + split_info = splitter.find_node_split(n_samples, histograms, + sum_gradients, sum_hessians) + + #assert split_info.bin_idx == expected_bin_idx + if has_missing_values: + assert split_info.missing_go_to_left == expected_go_to_left + + # Whatever the missing values, the split should always be the same. 
This + # also make sure missing values are properly assigned to the correct child + # in split_indices() + samples_left, samples_right, _ = splitter.split_indices( + split_info, splitter.partition) + + assert set(samples_left) == set([0, 1, 2, 3, 4, 5]) + assert set(samples_right) == set([6, 7, 8, 9]) From 48fa14965c85444c928f8cb915f080c42a68e442 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 11 Jul 2019 12:47:58 +0200 Subject: [PATCH 47/76] don't check right to left on split_on_nan --- .../_hist_gradient_boosting/splitting.pyx | 27 +++++++++---------- .../tests/test_splitting.py | 27 ++++++++++--------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 05413fa050a00..94be93e07df0b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -32,7 +32,7 @@ cdef struct split_info_struct: Y_DTYPE_C gain int feature_idx unsigned int bin_idx - unsigned char bin_is_nan + unsigned char split_is_nan unsigned char missing_go_to_left Y_DTYPE_C sum_gradient_left Y_DTYPE_C sum_gradient_right @@ -53,7 +53,7 @@ class SplitInfo: The index of the feature to be split. bin_idx : int The index of the bin on which the split is made. - bin_is_nan : bool + split_is_nan : bool Whether the split has only NaN on one side. missing_go_to_left : bool Whether missing values should go to the left child. @@ -70,14 +70,14 @@ class SplitInfo: n_samples_right : int The number of samples in the right child. """ - def __init__(self, gain, feature_idx, bin_idx, bin_is_nan, + def __init__(self, gain, feature_idx, bin_idx, split_is_nan, missing_go_to_left, sum_gradient_left, sum_hessian_left, sum_gradient_right, sum_hessian_right, n_samples_left, n_samples_right): self.gain = gain self.feature_idx = feature_idx self.bin_idx = bin_idx - self.bin_is_nan = bin_is_nan + self.split_is_nan = split_is_nan self.missing_go_to_left = missing_go_to_left self.sum_gradient_left = sum_gradient_left self.sum_hessian_left = sum_hessian_left @@ -253,7 +253,7 @@ cdef class Splitter: int n_samples = sample_indices.shape[0] X_BINNED_DTYPE_C bin_idx = split_info.bin_idx unsigned char missing_go_to_left = split_info.missing_go_to_left - unsigned char split_on_nan = split_info.bin_is_nan + unsigned char split_on_nan = split_info.split_is_nan unsigned char missing_values_bin_idx = self.missing_values_bin_idx int feature_idx = split_info.feature_idx const X_BINNED_DTYPE_C [::1] X_binned = \ @@ -262,7 +262,8 @@ cdef class Splitter: unsigned int [::1] right_indices_buffer = self.right_indices_buffer IF SKLEARN_OPENMP_SUPPORTED: - int n_threads = omp_get_max_threads() + #int n_threads = omp_get_max_threads() + int n_threads = 1 ELSE: int n_threads = 1 @@ -410,7 +411,8 @@ cdef class Splitter: feature_idx, histograms, n_samples, sum_gradients, sum_hessians, &split_infos[feature_idx]) - if has_missing_values[feature_idx]: + if (has_missing_values[feature_idx] + and not split_infos[feature_idx].split_is_nan): self._find_best_bin_to_split_right_to_left( feature_idx, histograms, n_samples, sum_gradients, sum_hessians, &split_infos[feature_idx]) @@ -424,7 +426,7 @@ cdef class Splitter: split_info.gain, split_info.feature_idx, split_info.bin_idx, - split_info.bin_is_nan, + split_info.split_is_nan, split_info.missing_go_to_left, split_info.sum_gradient_left, split_info.sum_hessian_left, @@ -517,14 +519,13 @@ cdef class Splitter: sum_gradient_right, 
sum_hessian_right, negative_loss_current_node, self.l2_regularization) - with gil: - print(gain) + if gain > split_info.gain and gain > self.min_gain_to_split: split_info.gain = gain split_info.feature_idx = feature_idx split_info.bin_idx = bin_idx # the split is on NaN if bin_idx happens at the end - split_info.bin_is_nan = bin_idx == end - 1 + split_info.split_is_nan = bin_idx == end - 1 # we scan from left to right so missing values go to the right split_info.missing_go_to_left = False split_info.sum_gradient_left = sum_gradient_left @@ -604,14 +605,12 @@ cdef class Splitter: negative_loss_current_node, self.l2_regularization) - with gil: - print(gain) if gain > split_info.gain and gain > self.min_gain_to_split: split_info.gain = gain split_info.feature_idx = feature_idx split_info.bin_idx = bin_idx # the split is on NaN if bin_idx happens at the end - split_info.bin_is_nan = bin_idx == start + split_info.split_is_nan = bin_idx == start # we scan from right to left so missing values go to the left split_info.missing_go_to_left = True split_info.sum_gradient_left = sum_gradient_left diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index 01c8a33088849..6dc2401a70bcf 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -371,24 +371,25 @@ def test_splitting_missing_values(X_binned, all_gradients, ([0, 1, 2, 3, 7, 8, 9, 9, 9, 9], # 9 <=> missing [1, 1, 1, 1, 1, 1, 5, 5, 5, 5], True, # missing values - 8, # n_bins_non_missing - 7, # cut on bin_idx=max_bins - 1 - True, # bin_is_nan + 9, # n_bins_non_missing + 8, # cut on bin_idx=max_bins - 1 + True, # bin_is_nan False), # missing values go to right ([9, 9, 9, 9, 9, 9, 1, 3, 8, 6], # 9 <=> missing [1, 1, 1, 1, 1, 1, 5, 5, 5, 5], True, # missing values - 8, # n_bins_non_missing - 7, # cut on bin_idx=max_bins - 1 - True, # bin_is_nan + 9, # n_bins_non_missing + 8, # cut on bin_idx=max_bins - 1 + True, # bin_is_nan False), # missing values go to right ] ) -def test_splitting_missing_values_edge_case(X_binned, all_gradients, - has_missing_values, n_bins_non_missing, - expected_bin_idx, bin_is_nan, - expected_go_to_left): +def test_splitting_missing_values_edge_case( + X_binned, all_gradients, + has_missing_values, n_bins_non_missing, + expected_bin_idx, bin_is_nan, + expected_go_to_left): print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%") n_bins = max(X_binned) + 1 n_samples = len(X_binned) @@ -423,7 +424,7 @@ def test_splitting_missing_values_edge_case(X_binned, all_gradients, split_info = splitter.find_node_split(n_samples, histograms, sum_gradients, sum_hessians) - #assert split_info.bin_idx == expected_bin_idx + assert split_info.bin_idx == expected_bin_idx if has_missing_values: assert split_info.missing_go_to_left == expected_go_to_left @@ -433,5 +434,5 @@ def test_splitting_missing_values_edge_case(X_binned, all_gradients, samples_left, samples_right, _ = splitter.split_indices( split_info, splitter.partition) - assert set(samples_left) == set([0, 1, 2, 3, 4, 5]) - assert set(samples_right) == set([6, 7, 8, 9]) + nan_idx = np.flatnonzero(np.array(X_binned) == n_bins_non_missing) + assert set(samples_right) == set(nan_idx) From 76e18f8fdecdc473c199d7b3f5360d08b98a4840 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 11 Jul 2019 15:29:45 +0200 Subject: [PATCH 48/76] cleaups --- sklearn/ensemble/_hist_gradient_boosting/grower.py | 2 +- 
.../_hist_gradient_boosting/tests/test_splitting.py | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 4f4bd6aba4014..570231448fbc0 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -473,7 +473,7 @@ def _fill_predictor_node_array(predictor_nodes, grower_node, node['feature_idx'] = feature_idx node['bin_threshold'] = bin_idx node['missing_go_to_left'] = split_info.missing_go_to_left - if split_info.bin_is_nan: + if split_info.split_is_nan: node['threshold'] = np.nan elif bin_thresholds is not None: threshold = bin_thresholds[feature_idx][bin_idx] diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index 6dc2401a70bcf..859c6ccdb783d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -366,14 +366,14 @@ def test_splitting_missing_values(X_binned, all_gradients, @pytest.mark.parametrize( 'X_binned, all_gradients, has_missing_values, n_bins_non_missing, ' - ' expected_bin_idx, bin_is_nan, expected_go_to_left', [ + ' expected_bin_idx, split_is_nan, expected_go_to_left', [ ([0, 1, 2, 3, 7, 8, 9, 9, 9, 9], # 9 <=> missing [1, 1, 1, 1, 1, 1, 5, 5, 5, 5], True, # missing values 9, # n_bins_non_missing 8, # cut on bin_idx=max_bins - 1 - True, # bin_is_nan + True, # split_is_nan False), # missing values go to right ([9, 9, 9, 9, 9, 9, 1, 3, 8, 6], # 9 <=> missing @@ -381,16 +381,15 @@ def test_splitting_missing_values(X_binned, all_gradients, True, # missing values 9, # n_bins_non_missing 8, # cut on bin_idx=max_bins - 1 - True, # bin_is_nan + True, # split_is_nan False), # missing values go to right ] ) def test_splitting_missing_values_edge_case( X_binned, all_gradients, has_missing_values, n_bins_non_missing, - expected_bin_idx, bin_is_nan, + expected_bin_idx, split_is_nan, expected_go_to_left): - print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%") n_bins = max(X_binned) + 1 n_samples = len(X_binned) l2_regularization = 0. From eb0f7e6884ebcaecd0b4ec8d05094294dcb57fa4 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 11 Jul 2019 16:24:08 +0200 Subject: [PATCH 49/76] format and comment --- sklearn/ensemble/_hist_gradient_boosting/splitting.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 94be93e07df0b..f444248ae437a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -659,8 +659,10 @@ cdef inline unsigned char sample_goes_left( X_BINNED_DTYPE_C bin_value) nogil: return ( ( + # if we split on nan, nans always go to right child. 
split_on_nan and - bin_value != missing_values_bin_idx) + bin_value != missing_values_bin_idx + ) or ( missing_go_to_left and bin_value == missing_values_bin_idx From e0abc50da8065b877738bf90e36702987e1417bb Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 12 Jul 2019 10:57:30 -0400 Subject: [PATCH 50/76] Fixed bug + added more tests --- .../_hist_gradient_boosting/splitting.pyx | 2 +- .../tests/test_splitting.py | 56 +++++++++++++++---- 2 files changed, 47 insertions(+), 11 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index f444248ae437a..b08b580e6be06 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -565,7 +565,7 @@ cdef class Splitter: Y_DTYPE_C sum_gradient_right Y_DTYPE_C negative_loss_current_node Y_DTYPE_C gain - unsigned int start = self.n_bins_non_missing[feature_idx] - 1 + unsigned int start = self.n_bins_non_missing[feature_idx] - 2 sum_gradient_right, sum_hessian_right = 0., 0. n_samples_right = 0 diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index 859c6ccdb783d..77bae28a40298 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -286,10 +286,18 @@ def test_min_gain_to_split(): 3, # expected_bin_idx 'not_applicable'), - # We replace 2 samples by NaNs (bin_idx=9) + # We replace 2 samples by NaNs (bin_idx=8) # These 2 samples were mapped to the left node before, so they should # be mapped to left node again # Notice how the bin_idx threshold changes from 3 to 1. + ([8, 0, 1, 8, 2, 3, 4, 5, 6, 7], # 8 <=> missing + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], + True, # missing values + 8, # n_bins_non_missing + 1, # cut on bin_idx=1 + True), # missing values go to left + + # same as above, but with non-consecutive missing_values_bin ([9, 0, 1, 9, 2, 3, 4, 5, 6, 7], # 9 <=> missing [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], True, # missing values @@ -297,7 +305,15 @@ def test_min_gain_to_split(): 1, # cut on bin_idx=1 True), # missing values go to left - # Same, this time replacing 2 samples that were on the right. + # this time replacing 2 samples that were on the right. 
+ ([0, 1, 2, 3, 8, 4, 8, 5, 6, 7], # 8 <=> missing + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], + True, # missing values + 8, # n_bins_non_missing + 3, # cut on bin_idx=3 (like in first case) + False), # missing values go to right + + # same as above, but with non-consecutive missing_values_bin ([0, 1, 2, 3, 9, 4, 9, 5, 6, 7], # 9 <=> missing [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], True, # missing values @@ -368,21 +384,40 @@ def test_splitting_missing_values(X_binned, all_gradients, 'X_binned, all_gradients, has_missing_values, n_bins_non_missing, ' ' expected_bin_idx, split_is_nan, expected_go_to_left', [ - ([0, 1, 2, 3, 7, 8, 9, 9, 9, 9], # 9 <=> missing + ([0, 1, 2, 3, 4, 5, 6, 6, 6, 6], # 6 <=> missing [1, 1, 1, 1, 1, 1, 5, 5, 5, 5], True, # missing values - 9, # n_bins_non_missing - 8, # cut on bin_idx=max_bins - 1 + 6, # n_bins_non_missing + 5, # cut on bin_idx=max_bins - 1 True, # split_is_nan False), # missing values go to right - ([9, 9, 9, 9, 9, 9, 1, 3, 8, 6], # 9 <=> missing + # same as above, but with non-consecutive missing_values_bin + ([0, 1, 2, 3, 4, 5, 9, 9, 9, 9], # 9 <=> missing [1, 1, 1, 1, 1, 1, 5, 5, 5, 5], True, # missing values - 9, # n_bins_non_missing - 8, # cut on bin_idx=max_bins - 1 + 6, # n_bins_non_missing + 5, True, # split_is_nan False), # missing values go to right + + ([4, 4, 4, 4, 4, 4, 0, 1, 2, 3], # 4 <=> missing + [1, 1, 1, 1, 1, 1, 5, 5, 5, 5], + True, # missing values + 4, # n_bins_non_missing + 3, # cut on bin_idx=max_bins - 1 + True, # split_is_nan + False), # missing values go to right + + # same as above, but with non-consecutive missing_values_bin + ([9, 9, 9, 9, 9, 9, 0, 1, 2, 3], # 9 <=> missing + [1, 1, 1, 1, 1, 1, 5, 5, 5, 5], + True, # missing values + 4, # n_bins_non_missing + 3, # cut on bin_idx=max_bins - 1 + True, # split_is_nan + False), # missing values go to right + ] ) def test_splitting_missing_values_edge_case( @@ -433,5 +468,6 @@ def test_splitting_missing_values_edge_case( samples_left, samples_right, _ = splitter.split_indices( split_info, splitter.partition) - nan_idx = np.flatnonzero(np.array(X_binned) == n_bins_non_missing) - assert set(samples_right) == set(nan_idx) + missing_samples_indices = np.flatnonzero( + np.array(X_binned) == missing_values_bin_idx) + assert set(samples_right) == set(missing_samples_indices) From 77846a3e97e785b6edcd1a7bf653bfdb316cdae6 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 12 Jul 2019 12:13:45 -0400 Subject: [PATCH 51/76] refactor tests --- .../_hist_gradient_boosting/grower.py | 2 +- .../_hist_gradient_boosting/splitting.pyx | 18 +-- .../tests/test_splitting.py | 132 +++++++----------- 3 files changed, 58 insertions(+), 94 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 570231448fbc0..85e5115de9157 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -473,7 +473,7 @@ def _fill_predictor_node_array(predictor_nodes, grower_node, node['feature_idx'] = feature_idx node['bin_threshold'] = bin_idx node['missing_go_to_left'] = split_info.missing_go_to_left - if split_info.split_is_nan: + if split_info.split_on_nan: node['threshold'] = np.nan elif bin_thresholds is not None: threshold = bin_thresholds[feature_idx][bin_idx] diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index b08b580e6be06..95fc23dd88b0e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ 
b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -32,7 +32,7 @@ cdef struct split_info_struct: Y_DTYPE_C gain int feature_idx unsigned int bin_idx - unsigned char split_is_nan + unsigned char split_on_nan unsigned char missing_go_to_left Y_DTYPE_C sum_gradient_left Y_DTYPE_C sum_gradient_right @@ -53,7 +53,7 @@ class SplitInfo: The index of the feature to be split. bin_idx : int The index of the bin on which the split is made. - split_is_nan : bool + split_on_nan : bool Whether the split has only NaN on one side. missing_go_to_left : bool Whether missing values should go to the left child. @@ -70,14 +70,14 @@ class SplitInfo: n_samples_right : int The number of samples in the right child. """ - def __init__(self, gain, feature_idx, bin_idx, split_is_nan, + def __init__(self, gain, feature_idx, bin_idx, split_on_nan, missing_go_to_left, sum_gradient_left, sum_hessian_left, sum_gradient_right, sum_hessian_right, n_samples_left, n_samples_right): self.gain = gain self.feature_idx = feature_idx self.bin_idx = bin_idx - self.split_is_nan = split_is_nan + self.split_on_nan = split_on_nan self.missing_go_to_left = missing_go_to_left self.sum_gradient_left = sum_gradient_left self.sum_hessian_left = sum_hessian_left @@ -253,7 +253,7 @@ cdef class Splitter: int n_samples = sample_indices.shape[0] X_BINNED_DTYPE_C bin_idx = split_info.bin_idx unsigned char missing_go_to_left = split_info.missing_go_to_left - unsigned char split_on_nan = split_info.split_is_nan + unsigned char split_on_nan = split_info.split_on_nan unsigned char missing_values_bin_idx = self.missing_values_bin_idx int feature_idx = split_info.feature_idx const X_BINNED_DTYPE_C [::1] X_binned = \ @@ -412,7 +412,7 @@ cdef class Splitter: sum_gradients, sum_hessians, &split_infos[feature_idx]) if (has_missing_values[feature_idx] - and not split_infos[feature_idx].split_is_nan): + and not split_infos[feature_idx].split_on_nan): self._find_best_bin_to_split_right_to_left( feature_idx, histograms, n_samples, sum_gradients, sum_hessians, &split_infos[feature_idx]) @@ -426,7 +426,7 @@ cdef class Splitter: split_info.gain, split_info.feature_idx, split_info.bin_idx, - split_info.split_is_nan, + split_info.split_on_nan, split_info.missing_go_to_left, split_info.sum_gradient_left, split_info.sum_hessian_left, @@ -525,7 +525,7 @@ cdef class Splitter: split_info.feature_idx = feature_idx split_info.bin_idx = bin_idx # the split is on NaN if bin_idx happens at the end - split_info.split_is_nan = bin_idx == end - 1 + split_info.split_on_nan = bin_idx == end - 1 # we scan from left to right so missing values go to the right split_info.missing_go_to_left = False split_info.sum_gradient_left = sum_gradient_left @@ -610,7 +610,7 @@ cdef class Splitter: split_info.feature_idx = feature_idx split_info.bin_idx = bin_idx # the split is on NaN if bin_idx happens at the end - split_info.split_is_nan = bin_idx == start + split_info.split_on_nan = bin_idx == start # we scan from right to left so missing values go to the left split_info.missing_go_to_left = True split_info.sum_gradient_left = sum_gradient_left diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index 77bae28a40298..06552d1b44468 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -275,7 +275,7 @@ def test_min_gain_to_split(): @pytest.mark.parametrize( 'X_binned, all_gradients, 
has_missing_values, n_bins_non_missing, ' - ' expected_bin_idx, expected_go_to_left', [ + ' expected_split_on_nan, expected_bin_idx, expected_go_to_left', [ # basic sanity check with no missing values: given the gradient # values, the split must occur on bin_idx=3 @@ -283,6 +283,7 @@ def test_min_gain_to_split(): [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], # gradients False, # no missing values 10, # n_bins_non_missing + False, # don't split on nans 3, # expected_bin_idx 'not_applicable'), @@ -294,6 +295,7 @@ def test_min_gain_to_split(): [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], True, # missing values 8, # n_bins_non_missing + False, # don't split on nans 1, # cut on bin_idx=1 True), # missing values go to left @@ -302,6 +304,7 @@ def test_min_gain_to_split(): [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], True, # missing values 8, # n_bins_non_missing + False, # don't split on nans 1, # cut on bin_idx=1 True), # missing values go to left @@ -310,6 +313,7 @@ def test_min_gain_to_split(): [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], True, # missing values 8, # n_bins_non_missing + False, # don't split on nans 3, # cut on bin_idx=3 (like in first case) False), # missing values go to right @@ -318,78 +322,18 @@ def test_min_gain_to_split(): [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], True, # missing values 8, # n_bins_non_missing + False, # don't split on nans 3, # cut on bin_idx=3 (like in first case) False), # missing values go to right - ] -) -def test_splitting_missing_values(X_binned, all_gradients, - has_missing_values, n_bins_non_missing, - expected_bin_idx, expected_go_to_left): - # Make sure missing values are properly supported. - # we build an artificial example with gradients such that the best split - # is on bin_idx=3, when there are no missing values. - # Then we introduce missing values and: - # - make sure the chosen bin is correct (find_best_bin()): it's - # still the same split, even though the index of the bin changes - # - make sure the missing values are mapped to the correct child - # (split_indices()) - - n_bins = max(X_binned) + 1 - n_samples = len(X_binned) - l2_regularization = 0. - min_hessian_to_split = 1e-3 - min_samples_leaf = 1 - min_gain_to_split = 0. - - sample_indices = np.arange(n_samples, dtype=np.uint32) - X_binned = np.array(X_binned, dtype=X_BINNED_DTYPE).reshape(-1, 1) - X_binned = np.asfortranarray(X_binned) - all_gradients = np.array(all_gradients, dtype=G_H_DTYPE) - has_missing_values = np.array([has_missing_values], dtype=np.uint8) - all_hessians = np.ones(1, dtype=G_H_DTYPE) - sum_gradients = all_gradients.sum() - sum_hessians = 1 * n_samples - hessians_are_constant = True - - builder = HistogramBuilder(X_binned, n_bins, - all_gradients, all_hessians, - hessians_are_constant) - - n_bins_non_missing = np.array([n_bins_non_missing], dtype=np.uint32) - missing_values_bin_idx = n_bins - 1 - splitter = Splitter(X_binned, n_bins_non_missing, - missing_values_bin_idx, has_missing_values, - l2_regularization, min_hessian_to_split, - min_samples_leaf, min_gain_to_split, - hessians_are_constant) - - histograms = builder.compute_histograms_brute(sample_indices) - split_info = splitter.find_node_split(n_samples, histograms, - sum_gradients, sum_hessians) - - assert split_info.bin_idx == expected_bin_idx - if has_missing_values: - assert split_info.missing_go_to_left == expected_go_to_left - - # Whatever the missing values, the split should always be the same. 
This - # also make sure missing values are properly assigned to the correct child - # in split_indices() - samples_left, samples_right, _ = splitter.split_indices( - split_info, splitter.partition) - assert set(samples_left) == set([0, 1, 2, 3]) - assert set(samples_right) == set([4, 5, 6, 7, 8, 9]) - - -@pytest.mark.parametrize( - 'X_binned, all_gradients, has_missing_values, n_bins_non_missing, ' - ' expected_bin_idx, split_is_nan, expected_go_to_left', [ + # For the following case, split_on_nans is True (we replace all of the + # samples with nans, instead of just 2). ([0, 1, 2, 3, 4, 5, 6, 6, 6, 6], # 6 <=> missing [1, 1, 1, 1, 1, 1, 5, 5, 5, 5], True, # missing values 6, # n_bins_non_missing - 5, # cut on bin_idx=max_bins - 1 - True, # split_is_nan + True, # split on nans + 5, # cut on bin_idx=5 False), # missing values go to right # same as above, but with non-consecutive missing_values_bin @@ -397,16 +341,16 @@ def test_splitting_missing_values(X_binned, all_gradients, [1, 1, 1, 1, 1, 1, 5, 5, 5, 5], True, # missing values 6, # n_bins_non_missing - 5, - True, # split_is_nan + True, # split on nans + 5, # cut on bin_idx=5 False), # missing values go to right ([4, 4, 4, 4, 4, 4, 0, 1, 2, 3], # 4 <=> missing [1, 1, 1, 1, 1, 1, 5, 5, 5, 5], True, # missing values 4, # n_bins_non_missing - 3, # cut on bin_idx=max_bins - 1 - True, # split_is_nan + True, # split on nans + 3, # cut on bin_idx=3 False), # missing values go to right # same as above, but with non-consecutive missing_values_bin @@ -414,17 +358,24 @@ def test_splitting_missing_values(X_binned, all_gradients, [1, 1, 1, 1, 1, 1, 5, 5, 5, 5], True, # missing values 4, # n_bins_non_missing - 3, # cut on bin_idx=max_bins - 1 - True, # split_is_nan + True, # split on nans + 3, # cut on bin_idx=3 False), # missing values go to right - ] ) -def test_splitting_missing_values_edge_case( - X_binned, all_gradients, - has_missing_values, n_bins_non_missing, - expected_bin_idx, split_is_nan, - expected_go_to_left): +def test_splitting_missing_values(X_binned, all_gradients, + has_missing_values, n_bins_non_missing, + expected_split_on_nan, expected_bin_idx, + expected_go_to_left): + # Make sure missing values are properly supported. + # we build an artificial example with gradients such that the best split + # is on bin_idx=3, when there are no missing values. + # Then we introduce missing values and: + # - make sure the chosen bin is correct (find_best_bin()): it's + # still the same split, even though the index of the bin may + # - make sure the missing values are mapped to the correct child + # (split_indices()) + n_bins = max(X_binned) + 1 n_samples = len(X_binned) l2_regularization = 0. @@ -462,12 +413,25 @@ def test_splitting_missing_values_edge_case( if has_missing_values: assert split_info.missing_go_to_left == expected_go_to_left - # Whatever the missing values, the split should always be the same. This - # also make sure missing values are properly assigned to the correct child - # in split_indices() + assert split_info.split_on_nan == expected_split_on_nan + + # Make sure the split is properly computed. 
+ # This also make sure missing values are properly assigned to the correct + # child in split_indices() samples_left, samples_right, _ = splitter.split_indices( split_info, splitter.partition) - missing_samples_indices = np.flatnonzero( - np.array(X_binned) == missing_values_bin_idx) - assert set(samples_right) == set(missing_samples_indices) + if not expected_split_on_nan: + # When we don't split on nans, the split should always be the same. + assert set(samples_left) == set([0, 1, 2, 3]) + assert set(samples_right) == set([4, 5, 6, 7, 8, 9]) + else: + # When we split on nans, samples with missing values are always mapped + # to the right child. + missing_samples_indices = np.flatnonzero( + np.array(X_binned) == missing_values_bin_idx) + non_missing_samples_indices = np.flatnonzero( + np.array(X_binned) != missing_values_bin_idx) + + assert set(samples_right) == set(missing_samples_indices) + assert set(samples_left) == set(non_missing_samples_indices) From 14d444fb15be2f5a57e28d39ecddba8175ac3161 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 12 Jul 2019 12:17:14 -0400 Subject: [PATCH 52/76] put back n_threads to max value --- sklearn/ensemble/_hist_gradient_boosting/splitting.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 95fc23dd88b0e..32c7a2fa54c54 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -262,8 +262,7 @@ cdef class Splitter: unsigned int [::1] right_indices_buffer = self.right_indices_buffer IF SKLEARN_OPENMP_SUPPORTED: - #int n_threads = omp_get_max_threads() - int n_threads = 1 + int n_threads = omp_get_max_threads() ELSE: int n_threads = 1 From 8fb80fd508924d1a8dce6b64948f0594046d9e31 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 12 Jul 2019 12:23:25 -0400 Subject: [PATCH 53/76] minor changes --- sklearn/ensemble/_hist_gradient_boosting/splitting.pyx | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 32c7a2fa54c54..ce5860e375d23 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -405,6 +405,9 @@ cdef class Splitter: # (left to right scan) or to the left (right to left case). # See algo 3 from the XGBoost paper # https://arxiv.org/abs/1603.02754 + # If we know that the right child only contains nans + # (split_on_nan is True), then there is no need to scan nodes + # from right to left. self._find_best_bin_to_split_left_to_right( feature_idx, histograms, n_samples, @@ -656,6 +659,8 @@ cdef inline unsigned char sample_goes_left( unsigned char missing_values_bin_idx, X_BINNED_DTYPE_C split_bin_idx, X_BINNED_DTYPE_C bin_value) nogil: + """Helper to decide whether sample should go to left or right child.""" + return ( ( # if we split on nan, nans always go to right child. 
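For reference, the ``sample_goes_left`` helper above reduces to the following pure-Python predicate (a sketch, not part of the diff):

    def sample_goes_left(split_on_nan, missing_go_to_left,
                         missing_values_bin_idx, split_bin_idx, bin_value):
        # nans (mapped to missing_values_bin_idx) always go right when the
        # split is on nan; otherwise they follow missing_go_to_left; all
        # other bins use the usual <= threshold comparison.
        return ((split_on_nan and bin_value != missing_values_bin_idx)
                or (missing_go_to_left and bin_value == missing_values_bin_idx)
                or bin_value <= split_bin_idx)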
@@ -668,4 +673,4 @@ cdef inline unsigned char sample_goes_left( ) or ( bin_value <= split_bin_idx - )) \ No newline at end of file + )) From 044039841b0b4392587fdb9b7969d48451dcbba8 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 12 Jul 2019 12:29:15 -0400 Subject: [PATCH 54/76] minor cleaning --- sklearn/ensemble/_hist_gradient_boosting/splitting.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index ce5860e375d23..7cf53f6a258bb 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -611,8 +611,8 @@ cdef class Splitter: split_info.gain = gain split_info.feature_idx = feature_idx split_info.bin_idx = bin_idx - # the split is on NaN if bin_idx happens at the end - split_info.split_on_nan = bin_idx == start + # split_on_nan is only possible when we go from left to right + split_info.split_on_nan = False # we scan from right to left so missing values go to the left split_info.missing_go_to_left = True split_info.sum_gradient_left = sum_gradient_left From 4b0176a2e323cfd69d6e5277781ee39e7b5fb5ce Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 12 Jul 2019 23:55:01 +0200 Subject: [PATCH 55/76] Add (failing) test that checks equivalence with min max imputation --- .../tests/test_gradient_boosting.py | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index b7dbba993ff51..3cabed6eba6ae 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -1,6 +1,9 @@ import numpy as np import pytest +from numpy.testing import assert_allclose from sklearn.datasets import make_classification, make_regression +from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler +from sklearn.model_selection import train_test_split # To use this experimental feature, we need to explicitly ask for it: from sklearn.experimental import enable_hist_gradient_boosting # noqa @@ -275,3 +278,63 @@ def test_small_trainset(): # Test that the class distributions in the whole dataset and in the small # training set are identical assert small_distrib == pytest.approx(original_distrib) + + +def test_missing_values_minmax_imputation(): + # Compare the buit-in missing value handling of Histogram GBC with an + # a-priori missing value imputation strategy that should yield the same + # results in terms of decision function. + rng = np.random.RandomState(42) + X, y = make_regression(n_samples=int(1e3), n_features=4, random_state=rng) + + # Pre-bin the data to ensure a deterministic handling by the 2 strategies + # and also make it easier to insert np.nan in a structured way: + X = KBinsDiscretizer(n_bins=42, encode="ordinal").fit_transform(X) + + # First feature has missing values completely at random: + rnd_mask = rng.rand(X.shape[0]) > 0.6 + X[rnd_mask, 0] = np.nan + + # Second and third features have missing values for extreme values + # (censoring missingness). 
+ low_mask = X[:, 1] <= 3 + X[low_mask, 1] = np.nan + + high_mask = X[:, 2] >= 40 + X[high_mask, 2] = np.nan + + # Last feature has a missing pattern that is highly predictive of the + # target variable + target_mask = y > 0 + X[target_mask, 3] = np.nan + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) + + builtin_gbm = HistGradientBoostingRegressor(max_iter=10, random_state=0) + builtin_gbm.fit(X_train, y_train) + y_builtin_predict_train = builtin_gbm.predict(X_train) + y_builtin_predict_test = builtin_gbm.predict(X_test) + + # Implement min-max feature imputation + mm = MinMaxScaler().fit(X_train) + X_train_min, X_train_max = X_train.copy(), X_train.copy() + X_test_min, X_test_max = X_test.copy(), X_test.copy() + for feature_idx in range(X.shape[1]): + nan_mask = np.isnan(X_train[:, feature_idx]) + X_train_min[nan_mask, feature_idx] = mm.data_min_[feature_idx] - 1 + X_train_max[nan_mask, feature_idx] = mm.data_max_[feature_idx] + 1 + + nan_mask = np.isnan(X_test[:, feature_idx]) + X_test_min[nan_mask, feature_idx] = mm.data_min_[feature_idx] - 1 + X_test_max[nan_mask, feature_idx] = mm.data_max_[feature_idx] + 1 + + X_train_imputed = np.concatenate([X_train_min, X_train_max], axis=1) + X_test_imputed = np.concatenate([X_test_min, X_test_max], axis=1) + + imputed_gbm = HistGradientBoostingRegressor(max_iter=10, random_state=0) + imputed_gbm.fit(X_train_imputed, y_train) + y_imputed_predict_train = imputed_gbm.predict(X_train_imputed) + y_imputed_predict_test = imputed_gbm.predict(X_test_imputed) + + assert_allclose(y_builtin_predict_train, y_imputed_predict_train) + assert_allclose(y_builtin_predict_test, y_imputed_predict_test) From d38881c2caeff62f8a27e894aafed36f862ad084 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 15 Jul 2019 10:09:31 +0200 Subject: [PATCH 56/76] Decrease the likelihood of ties when training the trees --- .../tests/test_gradient_boosting.py | 36 ++++++++++++++----- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 3cabed6eba6ae..5cc964651a0b0 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -4,6 +4,7 @@ from sklearn.datasets import make_classification, make_regression from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler from sklearn.model_selection import train_test_split +from sklearn.base import clone # To use this experimental feature, we need to explicitly ask for it: from sklearn.experimental import enable_hist_gradient_boosting # noqa @@ -284,38 +285,57 @@ def test_missing_values_minmax_imputation(): # Compare the buit-in missing value handling of Histogram GBC with an # a-priori missing value imputation strategy that should yield the same # results in terms of decision function. + # + # Assuming the data is such that there is never a tie to select the best + # feature to split on during training, the learned decision trees should be + # strictly equivalent (learn a sequence of splits that encode the same + # decision function). 
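That equivalence can be checked by hand on a toy column (values assumed purely for illustration): imputing NaNs below the observed minimum reproduces any "NaNs go left" split, and imputing them above the observed maximum reproduces any "NaNs go right" split.

import numpy as np

x = np.array([1., 2., np.nan, 4.])        # toy feature
x_min = np.where(np.isnan(x), 0., x)      # NaN -> min(x) - 1
x_max = np.where(np.isnan(x), 5., x)      # NaN -> max(x) + 1

# "x <= 2, NaNs go left" on the original column is exactly the plain
# threshold split "x_min <= 2" on the min-imputed copy:
assert np.array_equal((x <= 2) | np.isnan(x), x_min <= 2)

# "x <= 2, NaNs go right" is exactly "x_max <= 2" on the max-imputed copy:
assert np.array_equal(x <= 2, x_max <= 2)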
rng = np.random.RandomState(42) - X, y = make_regression(n_samples=int(1e3), n_features=4, random_state=rng) + X, y = make_regression(n_samples=int(1e4), n_features=4, random_state=rng) # Pre-bin the data to ensure a deterministic handling by the 2 strategies # and also make it easier to insert np.nan in a structured way: X = KBinsDiscretizer(n_bins=42, encode="ordinal").fit_transform(X) # First feature has missing values completely at random: - rnd_mask = rng.rand(X.shape[0]) > 0.6 + rnd_mask = rng.rand(X.shape[0]) > 0.9 X[rnd_mask, 0] = np.nan # Second and third features have missing values for extreme values # (censoring missingness). - low_mask = X[:, 1] <= 3 + low_mask = X[:, 1] == 0 X[low_mask, 1] = np.nan - high_mask = X[:, 2] >= 40 + high_mask = X[:, 2] == X[:, 2].max() X[high_mask, 2] = np.nan # Last feature has a missing pattern that is highly predictive of the # target variable - target_mask = y > 0 + target_mask = y > np.percentile(y, 90) X[target_mask, 3] = np.nan + # Check that there is at least one missing value in each feature: + for feature_idx in range(X.shape[1]): + assert any(np.isnan(X[:, feature_idx])) + + # Let's use a test set to check that the learned decision function is the + # same as evaluated on unseen data. Otherwise it could just be the case + # that we find two independent ways to overfit the training set. X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) - builtin_gbm = HistGradientBoostingRegressor(max_iter=10, random_state=0) + # Use a small number of leaf nodes and iterations so as to keep + # under-fitting models to minimize the likelihood of ties when training the + # model. + builtin_gbm = HistGradientBoostingRegressor(max_iter=10, + max_leaf_nodes=5, + random_state=0) builtin_gbm.fit(X_train, y_train) y_builtin_predict_train = builtin_gbm.predict(X_train) y_builtin_predict_test = builtin_gbm.predict(X_test) - # Implement min-max feature imputation + # Implement min-max feature imputation: we use MinMaxScaler to easily + # extract the min and max values of non-missing numerical data for each + # feature. 
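Note that MinMaxScaler ignores NaNs when fitting, so data_min_ and data_max_ are per-feature bounds over the non-missing entries only; the same bounds could be obtained with np.nanmin / np.nanmax, as this toy check (illustration only) shows.

import numpy as np
from sklearn.preprocessing import MinMaxScaler

X_toy = np.array([[1., np.nan],
                  [3., 5.],
                  [np.nan, 7.]])
mm_toy = MinMaxScaler().fit(X_toy)        # NaNs are ignored during fit
assert np.array_equal(mm_toy.data_min_, np.nanmin(X_toy, axis=0))  # [1., 5.]
assert np.array_equal(mm_toy.data_max_, np.nanmax(X_toy, axis=0))  # [3., 7.]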
mm = MinMaxScaler().fit(X_train) X_train_min, X_train_max = X_train.copy(), X_train.copy() X_test_min, X_test_max = X_test.copy(), X_test.copy() @@ -331,7 +351,7 @@ def test_missing_values_minmax_imputation(): X_train_imputed = np.concatenate([X_train_min, X_train_max], axis=1) X_test_imputed = np.concatenate([X_test_min, X_test_max], axis=1) - imputed_gbm = HistGradientBoostingRegressor(max_iter=10, random_state=0) + imputed_gbm = clone(builtin_gbm) imputed_gbm.fit(X_train_imputed, y_train) y_imputed_predict_train = imputed_gbm.predict(X_train_imputed) y_imputed_predict_test = imputed_gbm.predict(X_test_imputed) From f5e8e45820aa479e1ac7b68ecfc66e82976e7249 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 15 Jul 2019 10:40:57 +0200 Subject: [PATCH 57/76] More robust test --- .../tests/test_gradient_boosting.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 5cc964651a0b0..539ecc3426489 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -281,7 +281,8 @@ def test_small_trainset(): assert small_distrib == pytest.approx(original_distrib) -def test_missing_values_minmax_imputation(): +@pytest.mark.parametrize('seed', range(100)) +def test_missing_values_minmax_imputation(seed=0): # Compare the buit-in missing value handling of Histogram GBC with an # a-priori missing value imputation strategy that should yield the same # results in terms of decision function. @@ -290,8 +291,8 @@ def test_missing_values_minmax_imputation(): # feature to split on during training, the learned decision trees should be # strictly equivalent (learn a sequence of splits that encode the same # decision function). 
- rng = np.random.RandomState(42) - X, y = make_regression(n_samples=int(1e4), n_features=4, random_state=rng) + rng = np.random.RandomState(seed) + X, y = make_regression(n_samples=int(1e4), n_features=3, random_state=rng) # Pre-bin the data to ensure a deterministic handling by the 2 strategies # and also make it easier to insert np.nan in a structured way: @@ -309,11 +310,6 @@ def test_missing_values_minmax_imputation(): high_mask = X[:, 2] == X[:, 2].max() X[high_mask, 2] = np.nan - # Last feature has a missing pattern that is highly predictive of the - # target variable - target_mask = y > np.percentile(y, 90) - X[target_mask, 3] = np.nan - # Check that there is at least one missing value in each feature: for feature_idx in range(X.shape[1]): assert any(np.isnan(X[:, feature_idx])) From a0963fbcb98f050c3a489c96405be5da34b8898c Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 15 Jul 2019 10:56:26 +0200 Subject: [PATCH 58/76] Fix pytest parametrization --- .../_hist_gradient_boosting/tests/test_gradient_boosting.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 539ecc3426489..8f4f01a14c2a1 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -281,8 +281,7 @@ def test_small_trainset(): assert small_distrib == pytest.approx(original_distrib) -@pytest.mark.parametrize('seed', range(100)) -def test_missing_values_minmax_imputation(seed=0): +def test_missing_values_minmax_imputation(): # Compare the buit-in missing value handling of Histogram GBC with an # a-priori missing value imputation strategy that should yield the same # results in terms of decision function. @@ -291,7 +290,7 @@ def test_missing_values_minmax_imputation(seed=0): # feature to split on during training, the learned decision trees should be # strictly equivalent (learn a sequence of splits that encode the same # decision function). 
- rng = np.random.RandomState(seed) + rng = np.random.RandomState(0) X, y = make_regression(n_samples=int(1e4), n_features=3, random_state=rng) # Pre-bin the data to ensure a deterministic handling by the 2 strategies From d0be6cbf9d482a229701e06f54b66a01689bc254 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 15 Jul 2019 13:01:54 +0200 Subject: [PATCH 59/76] Check bin thresholds in test --- .../tests/test_gradient_boosting.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 8f4f01a14c2a1..9220957c8f629 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -325,6 +325,13 @@ def test_missing_values_minmax_imputation(): max_leaf_nodes=5, random_state=0) builtin_gbm.fit(X_train, y_train) + assert_allclose(builtin_gbm.bin_mapper_.bin_thresholds_[0], + np.arange(0, 41) + .5) + assert_allclose(builtin_gbm.bin_mapper_.bin_thresholds_[1], + np.arange(1, 41) + .5) + assert_allclose(builtin_gbm.bin_mapper_.bin_thresholds_[2], + np.arange(0, 40) + .5) + y_builtin_predict_train = builtin_gbm.predict(X_train) y_builtin_predict_test = builtin_gbm.predict(X_test) @@ -348,6 +355,19 @@ def test_missing_values_minmax_imputation(): imputed_gbm = clone(builtin_gbm) imputed_gbm.fit(X_train_imputed, y_train) + assert_allclose(imputed_gbm.bin_mapper_.bin_thresholds_[0], + np.arange(-1, 41) + .5) + assert_allclose(imputed_gbm.bin_mapper_.bin_thresholds_[1], + np.arange(0, 41) + .5) + assert_allclose(imputed_gbm.bin_mapper_.bin_thresholds_[2], + np.arange(-1, 40) + .5) + assert_allclose(imputed_gbm.bin_mapper_.bin_thresholds_[3], + np.arange(0, 42) + .5) + assert_allclose(imputed_gbm.bin_mapper_.bin_thresholds_[4], + np.arange(1, 42) + .5) + assert_allclose(imputed_gbm.bin_mapper_.bin_thresholds_[5], + np.arange(0, 41) + .5) + y_imputed_predict_train = imputed_gbm.predict(X_train_imputed) y_imputed_predict_test = imputed_gbm.predict(X_test_imputed) From 9c9d7e57f0c2b084b3464af9f50feb702790e366 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 15 Jul 2019 14:32:33 +0200 Subject: [PATCH 60/76] Try to make the test even easier to see if the Linux 32bit build would pass in this case --- .../_hist_gradient_boosting/tests/test_gradient_boosting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 9220957c8f629..928e9f1cdd2ec 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -321,7 +321,7 @@ def test_missing_values_minmax_imputation(): # Use a small number of leaf nodes and iterations so as to keep # under-fitting models to minimize the likelihood of ties when training the # model. 
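A note on the bin-threshold assertions added in the "Check bin thresholds in test" commit above: KBinsDiscretizer(n_bins=42, encode="ordinal") leaves integer codes 0..41 in each column, and since 42 distinct values fit within the default number of bins, _BinMapper uses midpoints between consecutive distinct values, i.e. k + 0.5. Turning the lowest code into NaN (second feature) or the highest code into NaN (third feature) simply shifts that range, which is what the expected np.arange(...) + .5 arrays encode. A small sanity check of the midpoint rule (illustration only):

import numpy as np

codes = np.arange(42, dtype=float)        # ordinal codes 0..41
midpoints = (codes[:-1] + codes[1:]) / 2  # 0.5, 1.5, ..., 40.5
assert np.allclose(midpoints, np.arange(0, 41) + .5)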
- builtin_gbm = HistGradientBoostingRegressor(max_iter=10, + builtin_gbm = HistGradientBoostingRegressor(max_iter=1, max_leaf_nodes=5, random_state=0) builtin_gbm.fit(X_train, y_train) From 75dc126ff81177355c1741fd9d925be03ab3f7d0 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 17 Jul 2019 09:02:59 -0400 Subject: [PATCH 61/76] Don't check last non-missing bin if there's no nan --- sklearn/ensemble/_hist_gradient_boosting/splitting.pyx | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 7cf53f6a258bb..20bbcf4c7bb88 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -410,8 +410,9 @@ cdef class Splitter: # from right to left. self._find_best_bin_to_split_left_to_right( - feature_idx, histograms, n_samples, - sum_gradients, sum_hessians, &split_infos[feature_idx]) + feature_idx, has_missing_values[feature_idx], + histograms, n_samples, sum_gradients, sum_hessians, + &split_infos[feature_idx]) if (has_missing_values[feature_idx] and not split_infos[feature_idx].split_on_nan): @@ -457,6 +458,7 @@ cdef class Splitter: cdef void _find_best_bin_to_split_left_to_right( Splitter self, unsigned int feature_idx, + unsigned char has_missing_values, const hist_struct [:, ::1] histograms, # IN unsigned int n_samples, Y_DTYPE_C sum_gradients, @@ -476,7 +478,7 @@ cdef class Splitter: unsigned int n_samples_left unsigned int n_samples_right unsigned int n_samples_ = n_samples - unsigned int end = self.n_bins_non_missing[feature_idx] + unsigned int end = self.n_bins_non_missing[feature_idx] - 1 + has_missing_values Y_DTYPE_C sum_hessian_left Y_DTYPE_C sum_hessian_right Y_DTYPE_C sum_gradient_left @@ -527,7 +529,7 @@ cdef class Splitter: split_info.feature_idx = feature_idx split_info.bin_idx = bin_idx # the split is on NaN if bin_idx happens at the end - split_info.split_on_nan = bin_idx == end - 1 + split_info.split_on_nan = has_missing_values and (bin_idx == end - 1) # we scan from left to right so missing values go to the right split_info.missing_go_to_left = False split_info.sum_gradient_left = sum_gradient_left From 3b2075ce9324247787002e274dfadb742c7abb54 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 19 Jul 2019 12:35:30 +0200 Subject: [PATCH 62/76] Improve min-max imputation test --- .../tests/test_gradient_boosting.py | 165 +++++++++--------- 1 file changed, 86 insertions(+), 79 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 928e9f1cdd2ec..c4f8b1b946b33 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -4,7 +4,8 @@ from sklearn.datasets import make_classification, make_regression from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler from sklearn.model_selection import train_test_split -from sklearn.base import clone +from sklearn.base import clone, BaseEstimator, TransformerMixin +from sklearn.pipeline import make_pipeline # To use this experimental feature, we need to explicitly ask for it: from sklearn.experimental import enable_hist_gradient_boosting # noqa @@ -290,86 +291,92 @@ def test_missing_values_minmax_imputation(): # feature to split on during training, the learned decision trees should be # 
strictly equivalent (learn a sequence of splits that encode the same # decision function). - rng = np.random.RandomState(0) - X, y = make_regression(n_samples=int(1e4), n_features=3, random_state=rng) - - # Pre-bin the data to ensure a deterministic handling by the 2 strategies - # and also make it easier to insert np.nan in a structured way: - X = KBinsDiscretizer(n_bins=42, encode="ordinal").fit_transform(X) - - # First feature has missing values completely at random: - rnd_mask = rng.rand(X.shape[0]) > 0.9 - X[rnd_mask, 0] = np.nan + # + # The MinMaxImputer transformer is meant to be a toy implementation of the + # "Missing In Attributes" (MIA) missing value handling for decision trees + # https://www.sciencedirect.com/science/article/abs/pii/S0167865508000305 + # The implementation of MIA as an imputation transformer was suggested by + # "Remark 3" in https://arxiv.org/abs/1902.06931 + + class MinMaxImputer(BaseEstimator, TransformerMixin): + + def fit(self, X, y=None): + mm = MinMaxScaler().fit(X) + self.data_min_ = mm.data_min_ + self.data_max_ = mm.data_max_ + return self + + def transform(self, X): + X_min, X_max = X.copy(), X.copy() + + for feature_idx in range(X.shape[1]): + nan_mask = np.isnan(X[:, feature_idx]) + X_min[nan_mask, feature_idx] = self.data_min_[feature_idx] - 1 + X_max[nan_mask, feature_idx] = self.data_max_[feature_idx] + 1 + + return np.concatenate([X_min, X_max], axis=1) + + def make_missing_value_data(n_samples=int(1e4), seed=0): + rng = np.random.RandomState(seed) + X, y = make_regression(n_samples=n_samples, n_features=4, + random_state=rng) + + # Pre-bin the data to ensure a deterministic handling by the 2 + # strategies and also make it easier to insert np.nan in a structured + # way: + X = KBinsDiscretizer(n_bins=42, encode="ordinal").fit_transform(X) + + # First feature has missing values completely at random: + rnd_mask = rng.rand(X.shape[0]) > 0.9 + X[rnd_mask, 0] = np.nan + + # Second and third features have missing values for extreme values + # (censoring missingness): + low_mask = X[:, 1] == 0 + X[low_mask, 1] = np.nan + + high_mask = X[:, 2] == X[:, 2].max() + X[high_mask, 2] = np.nan + + # Make the last feature nan pattern very informative: + y_max = np.percentile(y, 70) + y_max_mask = y >= y_max + y[y_max_mask] = y_max + X[y_max_mask, 3] = np.nan + + # Check that there is at least one missing value in each feature: + for feature_idx in range(X.shape[1]): + assert any(np.isnan(X[:, feature_idx])) + + # Let's use a test set to check that the learned decision function is + # the same as evaluated on unseen data. Otherwise it could just be the + # case that we find two independent ways to overfit the training set. + return train_test_split(X, y, random_state=rng) + + # n_samples need to be large enough to minimize the likelihood of having + # several candidate splits with the same gain value in a given tree. + X_train, X_test, y_train, y_test = make_missing_value_data( + n_samples=int(1e4), seed=0) - # Second and third features have missing values for extreme values - # (censoring missingness). - low_mask = X[:, 1] == 0 - X[low_mask, 1] = np.nan + # Use a small number of leaf nodes and iterations so as to keep + # under-fitting models to minimize the likelihood of ties when training the + # model. 
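To make the MinMaxImputer defined above concrete, here is its behaviour on a toy column (assuming the class definition above is in scope): each original feature becomes two features, one where NaNs fall below the observed minimum and one where they fall above the observed maximum.

import numpy as np

X_toy = np.array([[1.], [3.], [np.nan]])
imputer = MinMaxImputer().fit(X_toy)   # learns data_min_ = [1.], data_max_ = [3.]
print(imputer.transform(X_toy))
# [[1. 1.]
#  [3. 3.]
#  [0. 4.]]   NaN becomes 1. - 1 = 0. in the "min" copy and 3. + 1 = 4. in the "max" copy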
+ gbm1 = HistGradientBoostingRegressor(max_iter=100, + max_leaf_nodes=5, + random_state=0) + gbm1.fit(X_train, y_train) - high_mask = X[:, 2] == X[:, 2].max() - X[high_mask, 2] = np.nan + gbm2 = make_pipeline(MinMaxImputer(), clone(gbm1)) + gbm2.fit(X_train, y_train) - # Check that there is at least one missing value in each feature: - for feature_idx in range(X.shape[1]): - assert any(np.isnan(X[:, feature_idx])) + # Check that the model reach the same score: + assert gbm1.score(X_train, y_train) == \ + pytest.approx(gbm2.score(X_train, y_train)) - # Let's use a test set to check that the learned decision function is the - # same as evaluated on unseen data. Otherwise it could just be the case - # that we find two independent ways to overfit the training set. - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) + assert gbm1.score(X_test, y_test) == \ + pytest.approx(gbm2.score(X_test, y_test)) - # Use a small number of leaf nodes and iterations so as to keep - # under-fitting models to minimize the likelihood of ties when training the - # model. - builtin_gbm = HistGradientBoostingRegressor(max_iter=1, - max_leaf_nodes=5, - random_state=0) - builtin_gbm.fit(X_train, y_train) - assert_allclose(builtin_gbm.bin_mapper_.bin_thresholds_[0], - np.arange(0, 41) + .5) - assert_allclose(builtin_gbm.bin_mapper_.bin_thresholds_[1], - np.arange(1, 41) + .5) - assert_allclose(builtin_gbm.bin_mapper_.bin_thresholds_[2], - np.arange(0, 40) + .5) - - y_builtin_predict_train = builtin_gbm.predict(X_train) - y_builtin_predict_test = builtin_gbm.predict(X_test) - - # Implement min-max feature imputation: we use MinMaxScaler to easily - # extract the min and max values of non-missing numerical data for each - # feature. - mm = MinMaxScaler().fit(X_train) - X_train_min, X_train_max = X_train.copy(), X_train.copy() - X_test_min, X_test_max = X_test.copy(), X_test.copy() - for feature_idx in range(X.shape[1]): - nan_mask = np.isnan(X_train[:, feature_idx]) - X_train_min[nan_mask, feature_idx] = mm.data_min_[feature_idx] - 1 - X_train_max[nan_mask, feature_idx] = mm.data_max_[feature_idx] + 1 - - nan_mask = np.isnan(X_test[:, feature_idx]) - X_test_min[nan_mask, feature_idx] = mm.data_min_[feature_idx] - 1 - X_test_max[nan_mask, feature_idx] = mm.data_max_[feature_idx] + 1 - - X_train_imputed = np.concatenate([X_train_min, X_train_max], axis=1) - X_test_imputed = np.concatenate([X_test_min, X_test_max], axis=1) - - imputed_gbm = clone(builtin_gbm) - imputed_gbm.fit(X_train_imputed, y_train) - assert_allclose(imputed_gbm.bin_mapper_.bin_thresholds_[0], - np.arange(-1, 41) + .5) - assert_allclose(imputed_gbm.bin_mapper_.bin_thresholds_[1], - np.arange(0, 41) + .5) - assert_allclose(imputed_gbm.bin_mapper_.bin_thresholds_[2], - np.arange(-1, 40) + .5) - assert_allclose(imputed_gbm.bin_mapper_.bin_thresholds_[3], - np.arange(0, 42) + .5) - assert_allclose(imputed_gbm.bin_mapper_.bin_thresholds_[4], - np.arange(1, 42) + .5) - assert_allclose(imputed_gbm.bin_mapper_.bin_thresholds_[5], - np.arange(0, 41) + .5) - - y_imputed_predict_train = imputed_gbm.predict(X_train_imputed) - y_imputed_predict_test = imputed_gbm.predict(X_test_imputed) - - assert_allclose(y_builtin_predict_train, y_imputed_predict_train) - assert_allclose(y_builtin_predict_test, y_imputed_predict_test) + # Check the individual prediction match as a finer grained + # decision function check. 
+ assert_allclose(gbm1.predict(X_train), gbm2.predict(X_train)) + assert_allclose(gbm1.predict(X_test), gbm2.predict(X_test)) From a66103cbc7548fb0451888a614af98d17094b5d3 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 19 Jul 2019 12:36:22 +0200 Subject: [PATCH 63/76] FIX: _find_best_bin_to_split_right_to_left is still required even when left to right wants to split on nans --- sklearn/ensemble/_hist_gradient_boosting/splitting.pyx | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 20bbcf4c7bb88..291f838a9eec7 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -414,8 +414,11 @@ cdef class Splitter: histograms, n_samples, sum_gradients, sum_hessians, &split_infos[feature_idx]) - if (has_missing_values[feature_idx] - and not split_infos[feature_idx].split_on_nan): + if has_missing_values[feature_idx]: + # We need to explore both directions to check whether + # sending the nans to the left child would lead to a higher + # gain # numerical splits that all the splits explored when + # sending the nans to the right (or to split them appart): self._find_best_bin_to_split_right_to_left( feature_idx, histograms, n_samples, sum_gradients, sum_hessians, &split_infos[feature_idx]) From 49140c29863ae48cef662ce29f91b98b0bde223e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 19 Jul 2019 10:12:52 -0400 Subject: [PATCH 64/76] comments --- sklearn/ensemble/_hist_gradient_boosting/lol | 22 ++++ .../_hist_gradient_boosting/plotting.py | 112 ++++++++++++++++++ .../_hist_gradient_boosting/splitting.pyx | 13 +- .../tests/test_gradient_boosting.py | 7 ++ 4 files changed, 148 insertions(+), 6 deletions(-) create mode 100644 sklearn/ensemble/_hist_gradient_boosting/lol create mode 100644 sklearn/ensemble/_hist_gradient_boosting/plotting.py diff --git a/sklearn/ensemble/_hist_gradient_boosting/lol b/sklearn/ensemble/_hist_gradient_boosting/lol new file mode 100644 index 0000000000000..717a86c4d0210 --- /dev/null +++ b/sklearn/ensemble/_hist_gradient_boosting/lol @@ -0,0 +1,22 @@ +- [ ] `sklearn/cluster/tests/` +- [ ] `sklearn/compose/tests/` +- [ ] `sklearn/covariance/tests/` +- [ ] `sklearn/datasets/tests/` +- [ ] `sklearn/decomposition/tests/` +- [ ] `sklearn/ensemble/tests/` +- [ ] `sklearn/feature_extraction/tests/` +- [ ] `sklearn/feature_selection/tests/` +- [ ] `sklearn/linear_model/tests/` +- [ ] `sklearn/manifold/tests/` +- [ ] `sklearn/metrics/cluster/tests/` +- [ ] `sklearn/metrics/tests/` +- [ ] `sklearn/model_selection/tests/` +- [ ] `sklearn/neighbors/tests/` +- [ ] `sklearn/neural_network/tests/` +- [ ] `sklearn/preprocessing/tests/` +- [ ] `sklearn/semi_supervised/tests/` +- [ ] `sklearn/svm/tests/` +- [ ] `sklearn/tests/` +- [ ] `sklearn/tree/tests/` +- [ ] `sklearn/utils/estimator_checks.py` +- [ ] `sklearn/utils/tests/` \ No newline at end of file diff --git a/sklearn/ensemble/_hist_gradient_boosting/plotting.py b/sklearn/ensemble/_hist_gradient_boosting/plotting.py new file mode 100644 index 0000000000000..92230483bcbac --- /dev/null +++ b/sklearn/ensemble/_hist_gradient_boosting/plotting.py @@ -0,0 +1,112 @@ +from graphviz import Digraph + +from sklearn.ensemble._hist_gradient_boosting.gradient_boosting import BaseHistGradientBoosting + + +def plot_tree(est_or_grower, est_lightgbm=None, tree_index=0, view=True, + **kwargs): + """Plot the i'th predictor tree of 
an estimator, or a grower's tree + + est_or_grower can either be a GradientBoostingMachine instance or a + TreeGrower. In this latter case tree_index is ignored, and more debugging + info are displayed. Trees displayed from TreeGrower has additional + profiling information that are not kept in the predictor trees that + result from fitting a GradientBoostingMachine. + + tree_index corresponds to the ith built tree. In a multiclass setting, the + ith tree isn't necessarily the tree built durint the ith iteration because + there are K trees per iteration. For example with 3 classes, + tree_index=5 will print the third tree of the second iteration. + + Can also plot a LightGBM estimator (on the left) for comparison. + + Requires matplotlib and graphviz (both python package and binary program). + + kwargs are passed to graphviz.Digraph() + + Example: plotting.plot_tree(est_pygbm, est_lightgbm, view=False, + filename='output') will silently save output to output.pdf + """ + def make_pygbm_tree(): + def add_predictor_node(node_idx, parent=None, decision=None): + iteration = tree_index // est_or_grower.n_trees_per_iteration_ + k = tree_index % est_or_grower.n_trees_per_iteration_ + predictor_tree = est_or_grower._predictors[iteration][k] + node = predictor_tree.nodes[node_idx] + name = 'split__{}'.format(node_idx) + label = 'split_feature_index: {}'.format( + node['feature_idx']) + label += r'\nthreshold: {:.3f}'.format(node['threshold']) + label += r'\ngain: {:.3E}'.format(node['gain']) + label += r'\nvalue: {:.3f}'.format(node['value']) + label += r'\ncount: {:,}'.format(node['count']) + label += r'\nnans_go_left: {:,}'.format(node['missing_go_to_left']) + + graph.node(name, label=label) + if not node['is_leaf']: + add_predictor_node(node['left'], name, decision='<=') + add_predictor_node(node['right'], name, decision='>') + + if parent is not None: + graph.edge(parent, name, decision) + + def add_grower_node(node, parent=None, decision=None): + name = 'split__{0}'.format(id(node)) + si = node.split_info + if si is None: + feature_idx = 0 + bin_idx = 0 + gain = 0. + sum_gradients = 0. + sum_hessians = 0. + else: + feature_idx = si.feature_idx + gain = 0. if si.gain is None else si.gain + bin_idx = si.bin_idx + sum_gradients = si.gradient_left + si.gradient_right + sum_hessians = si.hessian_left + si.hessian_right + + value = 0. 
if node.value is None else node.value + label = 'split_feature_index: {}'.format(feature_idx) + label += r'\nbin threshold: {}'.format(bin_idx) + label += r'\ngain: {:.3E}'.format(gain) + label += r'\nvalue: {:.3f}'.format(value) + label += r'\ncount: {:,}'.format(node.sample_indices.shape[0]) + label += r'\nhist substration: {}'.format(node.hist_subtraction) + label += r'\nhist speed: {:.3E}'.format( + node.construction_speed) + label += r'\nfind split time: {:.4f}'.format(node.find_split_time) + label += r'\napply split time: {:.4f}'.format( + node.apply_split_time) + label += r'\nsum gradients: {:.3E}'.format(sum_gradients) + label += r'\nsum hessians: {:.3E}'.format(sum_hessians) + + graph.node(name, label=label) + if node.value is None: # not a leaf node + add_grower_node(node.left_child, name, decision='<=') + add_grower_node(node.right_child, name, decision='>') + + if parent is not None: + graph.edge(parent, name, decision) + + if isinstance(est_or_grower, BaseHistGradientBoosting): + add_predictor_node(0) + # elif isinstance(est_or_grower, pygbm.grower.TreeGrower): + # add_grower_node(est_or_grower.root) + + # make lightgbm tree + if est_lightgbm is not None: + import lightgbm as lb + graph = lb.create_tree_digraph( + est_lightgbm, + tree_index=tree_index, + show_info=['split_gain', 'internal_value', 'internal_count', + 'leaf_count'], + **kwargs) + else: + graph = Digraph(**kwargs) + + # make pygbm tree + make_pygbm_tree() + + graph.render(view=view) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 291f838a9eec7..e095344237728 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -405,9 +405,6 @@ cdef class Splitter: # (left to right scan) or to the left (right to left case). # See algo 3 from the XGBoost paper # https://arxiv.org/abs/1603.02754 - # If we know that the right child only contains nans - # (split_on_nan is True), then there is no need to scan nodes - # from right to left. self._find_best_bin_to_split_left_to_right( feature_idx, has_missing_values[feature_idx], @@ -417,8 +414,7 @@ cdef class Splitter: if has_missing_values[feature_idx]: # We need to explore both directions to check whether # sending the nans to the left child would lead to a higher - # gain # numerical splits that all the splits explored when - # sending the nans to the right (or to split them appart): + # gain self._find_best_bin_to_split_right_to_left( feature_idx, histograms, n_samples, sum_gradients, sum_hessians, &split_infos[feature_idx]) @@ -481,7 +477,12 @@ cdef class Splitter: unsigned int n_samples_left unsigned int n_samples_right unsigned int n_samples_ = n_samples - unsigned int end = self.n_bins_non_missing[feature_idx] - 1 + has_missing_values + # We set the 'end' variable such that the last non-missing bin + # never goes to the left child (which would result in and empty + # right child), unless there are missing values, since these would + # go to the right child. 
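Spelling out the arithmetic behind 'end' with assumed numbers:

# With n_bins_non_missing = 5 for the scanned feature:
#   no missing values:   end = 5 - 1 + 0 = 4  -> candidate split bins 0..3
#   with missing values: end = 5 - 1 + 1 = 5  -> candidate split bins 0..4
# Splitting on bin 4 (the last non-missing bin) sends only the
# missing-values bin to the right child, i.e. it is a "split on nans".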
+ unsigned int end = \ + self.n_bins_non_missing[feature_idx] - 1 + has_missing_values Y_DTYPE_C sum_hessian_left Y_DTYPE_C sum_hessian_right Y_DTYPE_C sum_gradient_left diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index e6aceff10a41c..ac863f300998c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -287,6 +287,13 @@ def test_missing_values_minmax_imputation(): # a-priori missing value imputation strategy that should yield the same # results in terms of decision function. # + # Each feature (containing NaNs) is replaced by 2 features: + # - one where the nans are replaced by min(feature) - 1 + # - one where the nans are replaced by max(feature) + 1 + # A split where nans go to the left has an equivalent split in the + # first (min) feature, and a split where nans go to the right has an + # equivalent split in the second (max) feature. + # # Assuming the data is such that there is never a tie to select the best # feature to split on during training, the learned decision trees should be # strictly equivalent (learn a sequence of splits that encode the same From e39f48e0c8d3c738e3c550ae644a7daa4b83ab8d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 19 Jul 2019 11:02:12 -0400 Subject: [PATCH 65/76] remove split_on_nan --- .../_hist_gradient_boosting/grower.py | 18 ++++++++++++----- .../_hist_gradient_boosting/splitting.pyx | 20 ++----------------- .../tests/test_splitting.py | 3 ++- 3 files changed, 17 insertions(+), 24 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 85e5115de9157..b5f79052f5a1a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -196,6 +196,7 @@ def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, X_binned, n_bins_non_missing, missing_values_bin_idx, has_missing_values, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, hessians_are_constant) + self.n_bins_non_missing = n_bins_non_missing self.max_leaf_nodes = max_leaf_nodes self.has_missing_values = has_missing_values self.n_features = X_binned.shape[1] @@ -446,12 +447,13 @@ def make_predictor(self, bin_thresholds=None): """ predictor_nodes = np.zeros(self.n_nodes, dtype=PREDICTOR_RECORD_DTYPE) _fill_predictor_node_array(predictor_nodes, self.root, - bin_thresholds=bin_thresholds) + bin_thresholds, self.n_bins_non_missing) return TreePredictor(predictor_nodes) def _fill_predictor_node_array(predictor_nodes, grower_node, - bin_thresholds, next_free_idx=0): + bin_thresholds, n_bins_non_missing, + next_free_idx=0): """Helper used in make_predictor to set the TreePredictor fields.""" node = predictor_nodes[next_free_idx] node['count'] = grower_node.n_samples @@ -473,7 +475,9 @@ def _fill_predictor_node_array(predictor_nodes, grower_node, node['feature_idx'] = feature_idx node['bin_threshold'] = bin_idx node['missing_go_to_left'] = split_info.missing_go_to_left - if split_info.split_on_nan: + if split_info.bin_idx == n_bins_non_missing[feature_idx] - 1: + # Split is on the last non-missing bin: it's a "split on nans". All + # nans go to the right, the rest go to the left. 
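For the prediction side, a rough Python sketch of how such a node is traversed on raw (non-binned) data in the form the series eventually settles on (a later commit encodes the split-on-nan case with an infinite threshold, so only NaNs need a dedicated branch). This is an illustration, not the _predictor.pyx implementation.

import numpy as np

def predict_one_sketch(nodes, x):
    node = nodes[0]
    while not node['is_leaf']:
        value = x[node['feature_idx']]
        if np.isnan(value):
            # missing values follow the direction learned during training
            child = 'left' if node['missing_go_to_left'] else 'right'
        elif value <= node['threshold']:
            child = 'left'
        else:
            child = 'right'
        node = nodes[node[child]]
    return node['value']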
node['threshold'] = np.nan elif bin_thresholds is not None: threshold = bin_thresholds[feature_idx][bin_idx] @@ -483,9 +487,13 @@ def _fill_predictor_node_array(predictor_nodes, grower_node, node['left'] = next_free_idx next_free_idx = _fill_predictor_node_array( predictor_nodes, grower_node.left_child, - bin_thresholds=bin_thresholds, next_free_idx=next_free_idx) + bin_thresholds=bin_thresholds, + n_bins_non_missing=n_bins_non_missing, + next_free_idx=next_free_idx) node['right'] = next_free_idx return _fill_predictor_node_array( predictor_nodes, grower_node.right_child, - bin_thresholds=bin_thresholds, next_free_idx=next_free_idx) + bin_thresholds=bin_thresholds, + n_bins_non_missing=n_bins_non_missing, + next_free_idx=next_free_idx) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index e095344237728..bc2477c3ecf1c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -32,7 +32,6 @@ cdef struct split_info_struct: Y_DTYPE_C gain int feature_idx unsigned int bin_idx - unsigned char split_on_nan unsigned char missing_go_to_left Y_DTYPE_C sum_gradient_left Y_DTYPE_C sum_gradient_right @@ -53,8 +52,6 @@ class SplitInfo: The index of the feature to be split. bin_idx : int The index of the bin on which the split is made. - split_on_nan : bool - Whether the split has only NaN on one side. missing_go_to_left : bool Whether missing values should go to the left child. sum_gradient_left : float @@ -70,14 +67,13 @@ class SplitInfo: n_samples_right : int The number of samples in the right child. """ - def __init__(self, gain, feature_idx, bin_idx, split_on_nan, + def __init__(self, gain, feature_idx, bin_idx, missing_go_to_left, sum_gradient_left, sum_hessian_left, sum_gradient_right, sum_hessian_right, n_samples_left, n_samples_right): self.gain = gain self.feature_idx = feature_idx self.bin_idx = bin_idx - self.split_on_nan = split_on_nan self.missing_go_to_left = missing_go_to_left self.sum_gradient_left = sum_gradient_left self.sum_hessian_left = sum_hessian_left @@ -253,7 +249,6 @@ cdef class Splitter: int n_samples = sample_indices.shape[0] X_BINNED_DTYPE_C bin_idx = split_info.bin_idx unsigned char missing_go_to_left = split_info.missing_go_to_left - unsigned char split_on_nan = split_info.split_on_nan unsigned char missing_values_bin_idx = self.missing_values_bin_idx int feature_idx = split_info.feature_idx const X_BINNED_DTYPE_C [::1] X_binned = \ @@ -302,7 +297,7 @@ cdef class Splitter: for i in range(start, stop): sample_idx = sample_indices[i] turn_left = sample_goes_left( - split_on_nan, missing_go_to_left, + missing_go_to_left, missing_values_bin_idx, bin_idx, X_binned[sample_idx]) @@ -428,7 +423,6 @@ cdef class Splitter: split_info.gain, split_info.feature_idx, split_info.bin_idx, - split_info.split_on_nan, split_info.missing_go_to_left, split_info.sum_gradient_left, split_info.sum_hessian_left, @@ -532,8 +526,6 @@ cdef class Splitter: split_info.gain = gain split_info.feature_idx = feature_idx split_info.bin_idx = bin_idx - # the split is on NaN if bin_idx happens at the end - split_info.split_on_nan = has_missing_values and (bin_idx == end - 1) # we scan from left to right so missing values go to the right split_info.missing_go_to_left = False split_info.sum_gradient_left = sum_gradient_left @@ -617,8 +609,6 @@ cdef class Splitter: split_info.gain = gain split_info.feature_idx = feature_idx split_info.bin_idx = bin_idx - # 
split_on_nan is only possible when we go from left to right - split_info.split_on_nan = False # we scan from right to left so missing values go to the left split_info.missing_go_to_left = True split_info.sum_gradient_left = sum_gradient_left @@ -660,7 +650,6 @@ cdef inline Y_DTYPE_C negative_loss( return (gradient * gradient) / (hessian + l2_regularization) cdef inline unsigned char sample_goes_left( - unsigned char split_on_nan, unsigned char missing_go_to_left, unsigned char missing_values_bin_idx, X_BINNED_DTYPE_C split_bin_idx, @@ -669,11 +658,6 @@ cdef inline unsigned char sample_goes_left( return ( ( - # if we split on nan, nans always go to right child. - split_on_nan and - bin_value != missing_values_bin_idx - ) - or ( missing_go_to_left and bin_value == missing_values_bin_idx ) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index cb24a99827483..2a8a15aea6894 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -415,7 +415,8 @@ def test_splitting_missing_values(X_binned, all_gradients, if has_missing_values: assert split_info.missing_go_to_left == expected_go_to_left - assert split_info.split_on_nan == expected_split_on_nan + split_on_nan = split_info.bin_idx == n_bins_non_missing[0] - 1 + assert split_on_nan == expected_split_on_nan # Make sure the split is properly computed. # This also make sure missing values are properly assigned to the correct From f89c1c57c2c6dcd9bcecf54ea4a3a1a66a411de5 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 19 Jul 2019 11:25:32 -0400 Subject: [PATCH 66/76] ooops deleted useless files --- sklearn/ensemble/_hist_gradient_boosting/lol | 22 ---- .../_hist_gradient_boosting/plotting.py | 112 ------------------ 2 files changed, 134 deletions(-) delete mode 100644 sklearn/ensemble/_hist_gradient_boosting/lol delete mode 100644 sklearn/ensemble/_hist_gradient_boosting/plotting.py diff --git a/sklearn/ensemble/_hist_gradient_boosting/lol b/sklearn/ensemble/_hist_gradient_boosting/lol deleted file mode 100644 index 717a86c4d0210..0000000000000 --- a/sklearn/ensemble/_hist_gradient_boosting/lol +++ /dev/null @@ -1,22 +0,0 @@ -- [ ] `sklearn/cluster/tests/` -- [ ] `sklearn/compose/tests/` -- [ ] `sklearn/covariance/tests/` -- [ ] `sklearn/datasets/tests/` -- [ ] `sklearn/decomposition/tests/` -- [ ] `sklearn/ensemble/tests/` -- [ ] `sklearn/feature_extraction/tests/` -- [ ] `sklearn/feature_selection/tests/` -- [ ] `sklearn/linear_model/tests/` -- [ ] `sklearn/manifold/tests/` -- [ ] `sklearn/metrics/cluster/tests/` -- [ ] `sklearn/metrics/tests/` -- [ ] `sklearn/model_selection/tests/` -- [ ] `sklearn/neighbors/tests/` -- [ ] `sklearn/neural_network/tests/` -- [ ] `sklearn/preprocessing/tests/` -- [ ] `sklearn/semi_supervised/tests/` -- [ ] `sklearn/svm/tests/` -- [ ] `sklearn/tests/` -- [ ] `sklearn/tree/tests/` -- [ ] `sklearn/utils/estimator_checks.py` -- [ ] `sklearn/utils/tests/` \ No newline at end of file diff --git a/sklearn/ensemble/_hist_gradient_boosting/plotting.py b/sklearn/ensemble/_hist_gradient_boosting/plotting.py deleted file mode 100644 index 92230483bcbac..0000000000000 --- a/sklearn/ensemble/_hist_gradient_boosting/plotting.py +++ /dev/null @@ -1,112 +0,0 @@ -from graphviz import Digraph - -from sklearn.ensemble._hist_gradient_boosting.gradient_boosting import BaseHistGradientBoosting - - -def plot_tree(est_or_grower, est_lightgbm=None, 
tree_index=0, view=True, - **kwargs): - """Plot the i'th predictor tree of an estimator, or a grower's tree - - est_or_grower can either be a GradientBoostingMachine instance or a - TreeGrower. In this latter case tree_index is ignored, and more debugging - info are displayed. Trees displayed from TreeGrower has additional - profiling information that are not kept in the predictor trees that - result from fitting a GradientBoostingMachine. - - tree_index corresponds to the ith built tree. In a multiclass setting, the - ith tree isn't necessarily the tree built durint the ith iteration because - there are K trees per iteration. For example with 3 classes, - tree_index=5 will print the third tree of the second iteration. - - Can also plot a LightGBM estimator (on the left) for comparison. - - Requires matplotlib and graphviz (both python package and binary program). - - kwargs are passed to graphviz.Digraph() - - Example: plotting.plot_tree(est_pygbm, est_lightgbm, view=False, - filename='output') will silently save output to output.pdf - """ - def make_pygbm_tree(): - def add_predictor_node(node_idx, parent=None, decision=None): - iteration = tree_index // est_or_grower.n_trees_per_iteration_ - k = tree_index % est_or_grower.n_trees_per_iteration_ - predictor_tree = est_or_grower._predictors[iteration][k] - node = predictor_tree.nodes[node_idx] - name = 'split__{}'.format(node_idx) - label = 'split_feature_index: {}'.format( - node['feature_idx']) - label += r'\nthreshold: {:.3f}'.format(node['threshold']) - label += r'\ngain: {:.3E}'.format(node['gain']) - label += r'\nvalue: {:.3f}'.format(node['value']) - label += r'\ncount: {:,}'.format(node['count']) - label += r'\nnans_go_left: {:,}'.format(node['missing_go_to_left']) - - graph.node(name, label=label) - if not node['is_leaf']: - add_predictor_node(node['left'], name, decision='<=') - add_predictor_node(node['right'], name, decision='>') - - if parent is not None: - graph.edge(parent, name, decision) - - def add_grower_node(node, parent=None, decision=None): - name = 'split__{0}'.format(id(node)) - si = node.split_info - if si is None: - feature_idx = 0 - bin_idx = 0 - gain = 0. - sum_gradients = 0. - sum_hessians = 0. - else: - feature_idx = si.feature_idx - gain = 0. if si.gain is None else si.gain - bin_idx = si.bin_idx - sum_gradients = si.gradient_left + si.gradient_right - sum_hessians = si.hessian_left + si.hessian_right - - value = 0. 
if node.value is None else node.value - label = 'split_feature_index: {}'.format(feature_idx) - label += r'\nbin threshold: {}'.format(bin_idx) - label += r'\ngain: {:.3E}'.format(gain) - label += r'\nvalue: {:.3f}'.format(value) - label += r'\ncount: {:,}'.format(node.sample_indices.shape[0]) - label += r'\nhist substration: {}'.format(node.hist_subtraction) - label += r'\nhist speed: {:.3E}'.format( - node.construction_speed) - label += r'\nfind split time: {:.4f}'.format(node.find_split_time) - label += r'\napply split time: {:.4f}'.format( - node.apply_split_time) - label += r'\nsum gradients: {:.3E}'.format(sum_gradients) - label += r'\nsum hessians: {:.3E}'.format(sum_hessians) - - graph.node(name, label=label) - if node.value is None: # not a leaf node - add_grower_node(node.left_child, name, decision='<=') - add_grower_node(node.right_child, name, decision='>') - - if parent is not None: - graph.edge(parent, name, decision) - - if isinstance(est_or_grower, BaseHistGradientBoosting): - add_predictor_node(0) - # elif isinstance(est_or_grower, pygbm.grower.TreeGrower): - # add_grower_node(est_or_grower.root) - - # make lightgbm tree - if est_lightgbm is not None: - import lightgbm as lb - graph = lb.create_tree_digraph( - est_lightgbm, - tree_index=tree_index, - show_info=['split_gain', 'internal_value', 'internal_count', - 'leaf_count'], - **kwargs) - else: - graph = Digraph(**kwargs) - - # make pygbm tree - make_pygbm_tree() - - graph.render(view=view) From 299d3e0dcbcb6d217ae1264a1bb1fc74dd6c7557 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 22 Jul 2019 13:57:34 -0400 Subject: [PATCH 67/76] Got rid of individual checks in predictor code +inf thresholds are only allowed in a split on nan situation. Thresholds that are computed as +inf are capped to a very high constant value --- .../_hist_gradient_boosting/_predictor.pyx | 11 +---- .../_hist_gradient_boosting/binning.py | 4 ++ .../_hist_gradient_boosting/grower.py | 7 ++-- .../tests/test_binning.py | 2 +- .../tests/test_grower.py | 41 +++++++++++++++++++ .../tests/test_predictor.py | 6 +-- 6 files changed, 54 insertions(+), 17 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx index 6eb2d1aeb8a9b..2160823c112bd 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx @@ -45,16 +45,7 @@ cdef inline Y_DTYPE_C _predict_one_from_numeric_data( if node.is_leaf: return node.value - if isnan(node.threshold): - if isnan(numeric_data[row, node.feature_idx]): - node = nodes[node.right] - else: - node = nodes[node.left] - elif numeric_data[row, node.feature_idx] == INFINITY: - # if data is +inf we always go to the right child, even when the - # threhsold is +inf - node = nodes[node.right] - elif isnan(numeric_data[row, node.feature_idx]): + if isnan(numeric_data[row, node.feature_idx]): if node.missing_go_to_left: node = nodes[node.left] else: diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 90c4af77f1863..445387a7f8f41 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -74,6 +74,10 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): interpolation='midpoint').astype(X_DTYPE) assert midpoints.shape[0] == max_bins - 1 + # We avoid having +inf thresholds: +inf thresholds are only allowed in + # a "split on 
nan" situation. + np.clip(midpoints, a_min=None, a_max=1e300, out=midpoints) + binning_thresholds.append(midpoints) return binning_thresholds diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index b5f79052f5a1a..e5d862409e2ac 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -475,13 +475,14 @@ def _fill_predictor_node_array(predictor_nodes, grower_node, node['feature_idx'] = feature_idx node['bin_threshold'] = bin_idx node['missing_go_to_left'] = split_info.missing_go_to_left + if split_info.bin_idx == n_bins_non_missing[feature_idx] - 1: # Split is on the last non-missing bin: it's a "split on nans". All # nans go to the right, the rest go to the left. - node['threshold'] = np.nan + node['threshold'] = np.inf elif bin_thresholds is not None: - threshold = bin_thresholds[feature_idx][bin_idx] - node['threshold'] = threshold + node['threshold'] = bin_thresholds[feature_idx][bin_idx] + next_free_idx += 1 node['left'] = next_free_idx diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index d365059eeaed5..d9c7f0d6102f0 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -306,7 +306,7 @@ def test_infinite_values(): X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1) bin_mapper.fit(X) - assert_allclose(bin_mapper.bin_thresholds_[0], [-np.inf, .5, np.inf]) + assert_allclose(bin_mapper.bin_thresholds_[0], [-np.inf, .5, 1e300]) assert bin_mapper.n_bins_non_missing_ == [4] expected_binned_X = np.array([0, 1, 2, 3]).reshape(-1, 1) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index 14affe89166e6..6679738635cba 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -343,3 +343,44 @@ def test_missing_value_predict_only(): # to prediction_main_path all_nans = np.full(shape=(n_samples, 1), fill_value=np.nan) assert np.all(predictor.predict(all_nans) == prediction_main_path) + + +def test_split_on_nan_with_infinite_values(): + # Make sure the split on nan situations are respected even when there are + # samples with +inf values (we set the threshold to +inf when we have a + # split on nan so this test make sure this does not introduce edge-case + # bugs) + + X = np.array([0, 1, np.inf, np.nan, np.nan]).reshape(-1, 1) + # the gradient values will force a split on nan situation + gradients = np.array([0, 0, 0, 100, 100], dtype=G_H_DTYPE) + hessians = np.ones(shape=1, dtype=G_H_DTYPE) + + bin_mapper = _BinMapper() + X_binned = bin_mapper.fit_transform(X) + + n_bins_non_missing = 3 + has_missing_values = True + grower = TreeGrower(X_binned, gradients, hessians, + n_bins_non_missing=n_bins_non_missing, + has_missing_values=has_missing_values, + min_samples_leaf=1) + + grower.grow() + + predictor = grower.make_predictor( + bin_thresholds=bin_mapper.bin_thresholds_ + ) + + # sanity check: this was a split on nan + assert predictor.nodes[0]['threshold'] == np.inf + assert predictor.nodes[0]['bin_threshold'] == n_bins_non_missing - 1 + + # Make sure in particular that the +inf sample is mapped to the left child + # Note that lightgbm "fails" here and will assign the inf sample to the + # right child, even 
though it's a "split on nan" situation. + predictions = predictor.predict(X) + predictions_binned = predictor.predict_binned( + X_binned, missing_values_bin_idx=bin_mapper.missing_values_bin_idx_) + assert np.all(predictions == -gradients) + assert np.all(predictions_binned == -gradients) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py index 7abdb34307de3..3d4dd49950359 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py @@ -42,12 +42,12 @@ def test_boston_dataset(n_bins): (-np.inf, [0, 1, 1, 1]), (10, [0, 0, 1, 1]), (20, [0, 0, 0, 1]), - (np.inf, [0, 0, 0, 1]), + (1e300, [0, 0, 0, 1]), ]) def test_infinite_values_and_thresholds(threshold, expected_predictions): # Make sure infinite values and infinite thresholds are handled properly. - # In paticular, if a value is +inf and the threhsold is +inf, the sample - # should go to the right child. + # In particular, if a value is +inf and the threshold is +inf (1e300), the + # sample should go to the right child. X = np.array([-np.inf, 10, 20, np.inf]).reshape(-1, 1) nodes = np.zeros(3, dtype=PREDICTOR_RECORD_DTYPE) From 9540f99b6b4268d2366497f5caee1d920a52ce95 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 23 Jul 2019 07:55:07 -0400 Subject: [PATCH 68/76] can also remove special case in binning code --- sklearn/ensemble/_hist_gradient_boosting/_binning.pyx | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx index 0d13b65c99b0c..c6648e8510b87 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx @@ -57,10 +57,6 @@ cdef void _map_num_col_to_bins(const X_DTYPE_C [:] data, if isnan(data[i]): binned[i] = missing_values_bin_idx - elif data[i] == INFINITY: - # Special case for +inf. - # -inf is handled properly by binary search. - binned[i] = binning_thresholds.shape[0] else: # for known values, use binary search left, right = 0, binning_thresholds.shape[0] From cb3936d54db74e32118f89e85f502aaf806d91f6 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 23 Jul 2019 11:37:22 -0400 Subject: [PATCH 69/76] minor typos + more consistent test --- .../_hist_gradient_boosting/binning.py | 7 ++-- .../gradient_boosting.py | 28 +++++++-------- .../tests/test_grower.py | 2 +- .../tests/test_splitting.py | 34 +++++++++---------- 4 files changed, 36 insertions(+), 35 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 445387a7f8f41..cb9e12d73bd6c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -129,7 +129,7 @@ class _BinMapper(BaseEstimator, TransformerMixin): constant accross all features. This corresponds to the last bin, and it is always equal to ``n_bins - 1``. Note that if ``n_bins_missing_`` is less than ``n_bins - 1`` for a given feature, then there are - empty (an unused) bins. + empty (and unused) bins. """ def __init__(self, n_bins=256, subsample=int(2e5), random_state=None): self.n_bins = n_bins @@ -139,8 +139,8 @@ def __init__(self, n_bins=256, subsample=int(2e5), random_state=None): def fit(self, X, y=None): """Fit data X by computing the binning thresholds. 
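A hypothetical numpy rendering of the binning loop helps explain why the +inf special case removed above is no longer needed: binary search already places +inf past every threshold, i.e. in the last non-missing bin, and only NaNs need the dedicated missing-values bin.

import numpy as np

def map_col_to_bins_sketch(col, thresholds, missing_values_bin_idx):
    # Sketch only; the real loop is the prange in _binning.pyx.
    binned = np.searchsorted(thresholds, col, side='left')
    binned[np.isnan(col)] = missing_values_bin_idx
    return binned

thresholds = np.array([0.5, 1.5])                 # 3 non-missing bins: 0, 1, 2
col = np.array([-np.inf, 1.0, np.inf, np.nan])
print(map_col_to_bins_sketch(col, thresholds, 255))
# [  0   1   2 255]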
- The last bin is reserved for missing values, whether there are - missing values present in the data or not. + The last bin is reserved for missing values, whether missing values + are present in the data or not. Parameters ---------- @@ -154,6 +154,7 @@ def fit(self, X, y=None): self : object """ if not (3 <= self.n_bins <= 256): + # min is 3: at least 2 distinct bins and a missing values bin raise ValueError('n_bins={} should be no smaller than 3 ' 'and no larger than 256.'.format(self.n_bins)) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index ae7e4e51dbf26..b18adeba04b6e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -649,12 +649,12 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): approximate in this setting. This estimator has native support for missing values (NaNs). During - training, the tree grower learns at each split point whether nodes with - missing values should go to the left or right child, based on the - potential gain. When predicting, nodes with missing values are assigned to - the left or right child consequently. If no missing values were encountered - for a given feature during training, then nodes with missing values are - mapped to whichever child has the most samples. + training, the tree grower learns at each split point whether samples + with missing values should go to the left or right child, based on the + potential gain. When predicting, samples with missing values are + assigned to the left or right child consequently. If no missing values + were encountered for a given feature during training, then samples with + missing values are mapped to whichever child has the most samples. This implementation is inspired by `LightGBM `_. @@ -700,7 +700,7 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): (default). max_bins : int, optional (default=255) The maximum number of bins to use for non-missing values. Before - training, each feature of the input array ``X`` is binned into + training, each feature of the input array `X` is binned into integer-valued bins, which allows for a much faster training stage. Features with a small number of unique values may use less than ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin @@ -831,12 +831,12 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, approximate in this setting. This estimator has native support for missing values (NaNs). During - training, the tree grower learns at each split point whether nodes with - missing values should go to the left or right child, based on the - potential gain. When predicting, nodes with missing values are assigned to - the left or right child consequently. If no missing values were encountered - for a given feature during training, then nodes with missing values are - mapped to whichever child has the most samples. + training, the tree grower learns at each split point whether samples + with missing values should go to the left or right child, based on the + potential gain. When predicting, samples with missing values are + assigned to the left or right child consequently. If no missing values + were encountered for a given feature during training, then samples with + missing values are mapped to whichever child has the most samples. This implementation is inspired by `LightGBM `_. 
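The predict-time rule described in the docstrings above, for features that had no missing values during fit, can be illustrated with a small sketch (the toy data and variable names below are made up for illustration; the imports and the estimator are the ones used throughout this patch series)::

    import numpy as np
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    from sklearn.ensemble import HistGradientBoostingClassifier

    # No NaN at fit time for this feature: the split near x=6.5 sends four
    # training samples to the left child and two to the right child.
    X_train = np.array([[0.], [1.], [2.], [3.], [10.], [11.]])
    y_train = [0, 0, 0, 0, 1, 1]
    clf = HistGradientBoostingClassifier(min_samples_leaf=1).fit(X_train, y_train)

    # A NaN seen only at predict time is sent to the child with the most
    # training samples (the left one here), so it should be predicted as 0.
    print(clf.predict(np.array([[np.nan]])))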
@@ -884,7 +884,7 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, The L2 regularization parameter. Use 0 for no regularization. max_bins : int, optional (default=255) The maximum number of bins to use for non-missing values. Before - training, each feature of the input array ``X`` is binned into + training, each feature of the input array `X` is binned into integer-valued bins, which allows for a much faster training stage. Features with a small number of unique values may use less than ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index 6679738635cba..ec0c21c3a2433 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -348,7 +348,7 @@ def test_missing_value_predict_only(): def test_split_on_nan_with_infinite_values(): # Make sure the split on nan situations are respected even when there are # samples with +inf values (we set the threshold to +inf when we have a - # split on nan so this test make sure this does not introduce edge-case + # split on nan so this test makes sure this does not introduce edge-case # bugs) X = np.array([0, 1, np.inf, np.nan, np.nan]).reshape(-1, 1) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index 2a8a15aea6894..e2bad66d6d05c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -328,40 +328,40 @@ def test_min_gain_to_split(): 3, # cut on bin_idx=3 (like in first case) False), # missing values go to right - # For the following case, split_on_nans is True (we replace all of the - # samples with nans, instead of just 2). - ([0, 1, 2, 3, 4, 5, 6, 6, 6, 6], # 6 <=> missing - [1, 1, 1, 1, 1, 1, 5, 5, 5, 5], + # For the following cases, split_on_nans is True (we replace all of + # the samples with nans, instead of just 2). 
+ ([0, 1, 2, 3, 4, 4, 4, 4, 4, 4], # 4 <=> missing + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], True, # missing values - 6, # n_bins_non_missing + 4, # n_bins_non_missing True, # split on nans - 5, # cut on bin_idx=5 + 3, # cut on bin_idx=3 False), # missing values go to right # same as above, but with non-consecutive missing_values_bin - ([0, 1, 2, 3, 4, 5, 9, 9, 9, 9], # 9 <=> missing + ([0, 1, 2, 3, 9, 9, 9, 9, 9, 9], # 9 <=> missing [1, 1, 1, 1, 1, 1, 5, 5, 5, 5], True, # missing values - 6, # n_bins_non_missing + 4, # n_bins_non_missing True, # split on nans - 5, # cut on bin_idx=5 + 3, # cut on bin_idx=3 False), # missing values go to right - ([4, 4, 4, 4, 4, 4, 0, 1, 2, 3], # 4 <=> missing - [1, 1, 1, 1, 1, 1, 5, 5, 5, 5], + ([6, 6, 6, 6, 0, 1, 2, 3, 4, 5], # 6 <=> missing + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], True, # missing values - 4, # n_bins_non_missing + 6, # n_bins_non_missing True, # split on nans - 3, # cut on bin_idx=3 + 5, # cut on bin_idx=5 False), # missing values go to right # same as above, but with non-consecutive missing_values_bin - ([9, 9, 9, 9, 9, 9, 0, 1, 2, 3], # 9 <=> missing - [1, 1, 1, 1, 1, 1, 5, 5, 5, 5], + ([9, 9, 9, 9, 0, 1, 2, 3, 4, 5], # 9 <=> missing + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], True, # missing values - 4, # n_bins_non_missing + 6, # n_bins_non_missing True, # split on nans - 3, # cut on bin_idx=3 + 5, # cut on bin_idx=5 False), # missing values go to right ] ) From c8f64099cf5f91b3e3fc80d6a3d406e15522d226 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 23 Jul 2019 11:51:57 -0400 Subject: [PATCH 70/76] renamed types -> common --- sklearn/ensemble/_hist_gradient_boosting/_binning.pyx | 2 +- .../_hist_gradient_boosting/_gradient_boosting.pyx | 4 ++-- sklearn/ensemble/_hist_gradient_boosting/_loss.pyx | 4 ++-- .../ensemble/_hist_gradient_boosting/_predictor.pyx | 10 +++++----- sklearn/ensemble/_hist_gradient_boosting/binning.py | 2 +- .../_hist_gradient_boosting/{types.pxd => common.pxd} | 0 .../_hist_gradient_boosting/{types.pyx => common.pyx} | 2 +- .../_hist_gradient_boosting/gradient_boosting.py | 2 +- sklearn/ensemble/_hist_gradient_boosting/grower.py | 4 ++-- sklearn/ensemble/_hist_gradient_boosting/histogram.pyx | 8 ++++---- sklearn/ensemble/_hist_gradient_boosting/loss.py | 4 ++-- sklearn/ensemble/_hist_gradient_boosting/predictor.py | 2 +- sklearn/ensemble/_hist_gradient_boosting/splitting.pyx | 8 ++++---- .../_hist_gradient_boosting/tests/test_binning.py | 4 ++-- .../_hist_gradient_boosting/tests/test_grower.py | 6 +++--- .../_hist_gradient_boosting/tests/test_histogram.py | 6 +++--- .../_hist_gradient_boosting/tests/test_loss.py | 4 ++-- .../_hist_gradient_boosting/tests/test_predictor.py | 2 +- .../_hist_gradient_boosting/tests/test_splitting.py | 6 +++--- sklearn/ensemble/_hist_gradient_boosting/utils.pyx | 4 ++-- sklearn/ensemble/setup.py | 4 ++-- 21 files changed, 44 insertions(+), 44 deletions(-) rename sklearn/ensemble/_hist_gradient_boosting/{types.pxd => common.pxd} (100%) rename sklearn/ensemble/_hist_gradient_boosting/{types.pyx => common.pyx} (95%) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx index c6648e8510b87..1ecee3c9ee27e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx @@ -14,7 +14,7 @@ from numpy.math cimport INFINITY from cython.parallel import prange from libc.math cimport isnan -from .types cimport X_DTYPE_C, X_BINNED_DTYPE_C +from .common cimport X_DTYPE_C, 
X_BINNED_DTYPE_C def _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds, diff --git a/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx b/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx index 3603e6b2e2d8e..8d307c3806532 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx @@ -10,8 +10,8 @@ from cython.parallel import prange import numpy as np cimport numpy as np -from .types import Y_DTYPE -from .types cimport Y_DTYPE_C +from .common import Y_DTYPE +from .common cimport Y_DTYPE_C def _update_raw_predictions( diff --git a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx index 91c3e53101ed6..ff17654840005 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx @@ -12,8 +12,8 @@ cimport numpy as np from libc.math cimport exp -from .types cimport Y_DTYPE_C -from .types cimport G_H_DTYPE_C +from .common cimport Y_DTYPE_C +from .common cimport G_H_DTYPE_C def _update_gradients_least_squares( diff --git a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx index 2160823c112bd..b3234cb5ba945 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx @@ -12,11 +12,11 @@ import numpy as np cimport numpy as np from numpy.math cimport INFINITY -from .types cimport X_DTYPE_C -from .types cimport Y_DTYPE_C -from .types import Y_DTYPE -from .types cimport X_BINNED_DTYPE_C -from .types cimport node_struct +from .common cimport X_DTYPE_C +from .common cimport Y_DTYPE_C +from .common import Y_DTYPE +from .common cimport X_BINNED_DTYPE_C +from .common cimport node_struct def _predict_from_numeric_data( diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index cb9e12d73bd6c..263de93c848b2 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -13,7 +13,7 @@ from ...base import BaseEstimator, TransformerMixin from ...utils.validation import check_is_fitted from ._binning import _map_to_bins -from .types import X_DTYPE, X_BINNED_DTYPE +from .common import X_DTYPE, X_BINNED_DTYPE def _find_binning_thresholds(data, max_bins, subsample, random_state): diff --git a/sklearn/ensemble/_hist_gradient_boosting/types.pxd b/sklearn/ensemble/_hist_gradient_boosting/common.pxd similarity index 100% rename from sklearn/ensemble/_hist_gradient_boosting/types.pxd rename to sklearn/ensemble/_hist_gradient_boosting/common.pxd diff --git a/sklearn/ensemble/_hist_gradient_boosting/types.pyx b/sklearn/ensemble/_hist_gradient_boosting/common.pyx similarity index 95% rename from sklearn/ensemble/_hist_gradient_boosting/types.pyx rename to sklearn/ensemble/_hist_gradient_boosting/common.pyx index 4838e5d152873..0a8ba0e1fa4b5 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/types.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/common.pyx @@ -6,7 +6,7 @@ import numpy as np Y_DTYPE = np.float64 X_DTYPE = np.float64 X_BINNED_DTYPE = np.uint8 # hence max_bins == 256 -# dtypes for gradients and hessians arrays +# dtype for gradients and hessians arrays G_H_DTYPE = np.float32 HISTOGRAM_DTYPE = np.dtype([ diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py 
b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index b18adeba04b6e..67955ad8df904 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -15,7 +15,7 @@ from ...model_selection import train_test_split from ...preprocessing import LabelEncoder from ._gradient_boosting import _update_raw_predictions -from .types import Y_DTYPE, X_DTYPE, X_BINNED_DTYPE +from .common import Y_DTYPE, X_DTYPE, X_BINNED_DTYPE from .binning import _BinMapper from .grower import TreeGrower diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index e5d862409e2ac..c7d303b8f6201 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -15,8 +15,8 @@ from .histogram import HistogramBuilder from .predictor import TreePredictor from .utils import sum_parallel -from .types import PREDICTOR_RECORD_DTYPE -from .types import Y_DTYPE +from .common import PREDICTOR_RECORD_DTYPE +from .common import Y_DTYPE EPS = np.finfo(Y_DTYPE).eps # to avoid zero division errors diff --git a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx index b6031bc86846f..740e5e002cf4e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx @@ -12,10 +12,10 @@ from cython.parallel import prange import numpy as np cimport numpy as np -from .types import HISTOGRAM_DTYPE -from .types cimport hist_struct -from .types cimport X_BINNED_DTYPE_C -from .types cimport G_H_DTYPE_C +from .common import HISTOGRAM_DTYPE +from .common cimport hist_struct +from .common cimport X_BINNED_DTYPE_C +from .common cimport G_H_DTYPE_C # Notes: # - IN views are read-only, OUT views are write-only diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index 5d7c68ea0b38f..9e00187d62425 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -15,8 +15,8 @@ except ImportError: from scipy.misc import logsumexp -from .types import Y_DTYPE -from .types import G_H_DTYPE +from .common import Y_DTYPE +from .common import G_H_DTYPE from ._loss import _update_gradients_least_squares from ._loss import _update_gradients_hessians_binary_crossentropy from ._loss import _update_gradients_hessians_categorical_crossentropy diff --git a/sklearn/ensemble/_hist_gradient_boosting/predictor.py b/sklearn/ensemble/_hist_gradient_boosting/predictor.py index f7215a03831e5..0b359c8f98224 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/predictor.py +++ b/sklearn/ensemble/_hist_gradient_boosting/predictor.py @@ -5,7 +5,7 @@ import numpy as np -from .types import Y_DTYPE +from .common import Y_DTYPE from ._predictor import _predict_from_numeric_data from ._predictor import _predict_from_binned_data from ._predictor import _compute_partial_dependence diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index bc2477c3ecf1c..9aa3c643ffcaa 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -20,10 +20,10 @@ IF SKLEARN_OPENMP_SUPPORTED: from libc.stdlib cimport malloc, free from libc.string cimport memcpy -from .types cimport X_BINNED_DTYPE_C -from .types cimport 
Y_DTYPE_C -from .types cimport hist_struct -from .types import HISTOGRAM_DTYPE +from .common cimport X_BINNED_DTYPE_C +from .common cimport Y_DTYPE_C +from .common cimport hist_struct +from .common import HISTOGRAM_DTYPE cdef struct split_info_struct: diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index d9c7f0d6102f0..c25d1cf0b5aac 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -7,8 +7,8 @@ _find_binning_thresholds as _find_binning_thresholds_orig, _map_to_bins ) -from sklearn.ensemble._hist_gradient_boosting.types import X_DTYPE -from sklearn.ensemble._hist_gradient_boosting.types import X_BINNED_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import X_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE DATA = np.random.RandomState(42).normal( diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index ec0c21c3a2433..5785c08dba02b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -4,9 +4,9 @@ from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper -from sklearn.ensemble._hist_gradient_boosting.types import X_BINNED_DTYPE -from sklearn.ensemble._hist_gradient_boosting.types import Y_DTYPE -from sklearn.ensemble._hist_gradient_boosting.types import G_H_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import Y_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE def _make_training_data(n_bins=256, constant_hessian=True): diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py index c425a0389a789..1ffb08353b30a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py @@ -12,9 +12,9 @@ _build_histogram_root, _subtract_histograms ) -from sklearn.ensemble._hist_gradient_boosting.types import HISTOGRAM_DTYPE -from sklearn.ensemble._hist_gradient_boosting.types import G_H_DTYPE -from sklearn.ensemble._hist_gradient_boosting.types import X_BINNED_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import HISTOGRAM_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE @pytest.mark.parametrize( diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index 29b5b6b47a04a..b49acc52b6e40 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -7,8 +7,8 @@ import pytest from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES -from sklearn.ensemble._hist_gradient_boosting.types import Y_DTYPE -from sklearn.ensemble._hist_gradient_boosting.types import G_H_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import Y_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE def get_derivatives_helper(loss): diff --git 
a/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py index 3d4dd49950359..4960b5d295a91 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py @@ -7,7 +7,7 @@ from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower from sklearn.ensemble._hist_gradient_boosting.predictor import TreePredictor -from sklearn.ensemble._hist_gradient_boosting.types import ( +from sklearn.ensemble._hist_gradient_boosting.common import ( G_H_DTYPE, PREDICTOR_RECORD_DTYPE) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index e2bad66d6d05c..004bad56786ca 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -1,9 +1,9 @@ import numpy as np import pytest -from sklearn.ensemble._hist_gradient_boosting.types import HISTOGRAM_DTYPE -from sklearn.ensemble._hist_gradient_boosting.types import G_H_DTYPE -from sklearn.ensemble._hist_gradient_boosting.types import X_BINNED_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import HISTOGRAM_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE from sklearn.ensemble._hist_gradient_boosting.splitting import Splitter from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder from sklearn.utils.testing import skip_if_32bit diff --git a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx index fa9556ef9efb5..291c015fec5d3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx @@ -9,8 +9,8 @@ from cython.parallel import prange from ...base import is_classifier from .binning import _BinMapper -from .types cimport G_H_DTYPE_C -from .types cimport Y_DTYPE_C +from .common cimport G_H_DTYPE_C +from .common cimport Y_DTYPE_C def get_equivalent_estimator(estimator, lib='lightgbm'): diff --git a/sklearn/ensemble/setup.py b/sklearn/ensemble/setup.py index 88e1b2e32d98d..4430cb129efcf 100644 --- a/sklearn/ensemble/setup.py +++ b/sklearn/ensemble/setup.py @@ -37,8 +37,8 @@ def configuration(parent_package="", top_path=None): sources=["_hist_gradient_boosting/_loss.pyx"], include_dirs=[numpy.get_include()]) - config.add_extension("_hist_gradient_boosting.types", - sources=["_hist_gradient_boosting/types.pyx"], + config.add_extension("_hist_gradient_boosting.common", + sources=["_hist_gradient_boosting/common.pyx"], include_dirs=[numpy.get_include()]) config.add_extension("_hist_gradient_boosting.utils", From 6f0e1912c040634654405eee9e5af4a3bcf400a8 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 23 Jul 2019 11:55:33 -0400 Subject: [PATCH 71/76] 1e300 -> almost inf --- sklearn/ensemble/_hist_gradient_boosting/binning.py | 4 ++-- sklearn/ensemble/_hist_gradient_boosting/common.pyx | 2 ++ .../_hist_gradient_boosting/tests/test_binning.py | 3 ++- .../_hist_gradient_boosting/tests/test_predictor.py | 10 ++++++---- 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 
263de93c848b2..ebc2d2ad1da51 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -13,7 +13,7 @@ from ...base import BaseEstimator, TransformerMixin from ...utils.validation import check_is_fitted from ._binning import _map_to_bins -from .common import X_DTYPE, X_BINNED_DTYPE +from .common import X_DTYPE, X_BINNED_DTYPE, ALMOST_INF def _find_binning_thresholds(data, max_bins, subsample, random_state): @@ -76,7 +76,7 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): # We avoid having +inf thresholds: +inf thresholds are only allowed in # a "split on nan" situation. - np.clip(midpoints, a_min=None, a_max=1e300, out=midpoints) + np.clip(midpoints, a_min=None, a_max=ALMOST_INF, out=midpoints) binning_thresholds.append(midpoints) diff --git a/sklearn/ensemble/_hist_gradient_boosting/common.pyx b/sklearn/ensemble/_hist_gradient_boosting/common.pyx index 0a8ba0e1fa4b5..8604548e44163 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/common.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/common.pyx @@ -28,3 +28,5 @@ PREDICTOR_RECORD_DTYPE = np.dtype([ ('is_leaf', np.uint8), ('bin_threshold', X_BINNED_DTYPE), ]) + +ALMOST_INF = 1e300 # see LightGBM AvoidInf() diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index c25d1cf0b5aac..f277309e3d3cb 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -9,6 +9,7 @@ ) from sklearn.ensemble._hist_gradient_boosting.common import X_DTYPE from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import ALMOST_INF DATA = np.random.RandomState(42).normal( @@ -306,7 +307,7 @@ def test_infinite_values(): X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1) bin_mapper.fit(X) - assert_allclose(bin_mapper.bin_thresholds_[0], [-np.inf, .5, 1e300]) + assert_allclose(bin_mapper.bin_thresholds_[0], [-np.inf, .5, ALMOST_INF]) assert bin_mapper.n_bins_non_missing_ == [4] expected_binned_X = np.array([0, 1, 2, 3]).reshape(-1, 1) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py index 4960b5d295a91..2956a660144e5 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py @@ -8,7 +8,7 @@ from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower from sklearn.ensemble._hist_gradient_boosting.predictor import TreePredictor from sklearn.ensemble._hist_gradient_boosting.common import ( - G_H_DTYPE, PREDICTOR_RECORD_DTYPE) + G_H_DTYPE, PREDICTOR_RECORD_DTYPE, ALMOST_INF) @pytest.mark.parametrize('n_bins', [200, 256]) @@ -42,12 +42,14 @@ def test_boston_dataset(n_bins): (-np.inf, [0, 1, 1, 1]), (10, [0, 0, 1, 1]), (20, [0, 0, 0, 1]), - (1e300, [0, 0, 0, 1]), + (ALMOST_INF, [0, 0, 0, 1]), + (np.inf, [0, 0, 0, 0]), ]) def test_infinite_values_and_thresholds(threshold, expected_predictions): # Make sure infinite values and infinite thresholds are handled properly. - # In particular, if a value is +inf and the threshold is +inf (1e300), the - # sample should go to the right child. + # In particular, if a value is +inf and the threshold is ALMOST_INF the + # sample should go to the right child. 
If the threshold is inf (split on + # nan), the +inf sample will go to the left child. X = np.array([-np.inf, 10, 20, np.inf]).reshape(-1, 1) nodes = np.zeros(3, dtype=PREDICTOR_RECORD_DTYPE) From c1123356be60b36abf5085dee60f2ede7b24c175 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 5 Aug 2019 13:15:04 -0400 Subject: [PATCH 72/76] added user guide section on missing values --- doc/modules/ensemble.rst | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index e1bcf47b8ff7b..8dec4b6c400f7 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -876,7 +876,7 @@ controls the number of iterations of the boosting process: >>> clf = HistGradientBoostingClassifier(max_iter=100).fit(X_train, y_train) >>> clf.score(X_test, y_test) - 0.8998 + 0.8965 The size of the trees can be controlled through the ``max_leaf_nodes``, ``max_depth``, and ``min_samples_leaf`` parameters. @@ -895,6 +895,32 @@ using an arbitrary :term:`scorer`, or just the training or validation loss. By default, early-stopping is performed using the default :term:`scorer` of the estimator on a validation set. +Missing values support +---------------------- + +:class:`HistGradientBoostingClassifier` and +:class:`HistGradientBoostingRegressor` have built-in support for missing +values (NaNs). + +During training, the tree grower learns at each split point whether samples +with missing values should go to the left or right child, based on the +potential gain. When predicting, samples with missing values are assigned to +the left or right child consequently.: + + >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa + >>> from sklearn.ensemble import HistGradientBoostingClassifier + >>> import numpy as np + + >>> X = np.array([0, 1, 2, np.nan]).reshape(-1, 1) + >>> y = [0, 0, 1, 1] + >>> gbdt = HistGradientBoostingClassifier(min_samples_leaf=1).fit(X, y) + >>> gbdt.predict(X) + array([0, 0, 1, 1]) + +If no missing values were encountered for a given feature during training, +then samples with missing values are mapped to whichever child has the most +samples. + Low-level parallelism --------------------- From 3b0c2bac0caa9eab6d68cde05d0a6ad1c9a7d621 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 20 Aug 2019 11:30:16 -0400 Subject: [PATCH 73/76] Addressed Olivier's comment + updated whatsnew --- doc/whats_new/v0.22.rst | 50 +++++++++++-------- .../_hist_gradient_boosting/splitting.pyx | 16 +++--- .../tests/test_binning.py | 2 +- .../tests/test_gradient_boosting.py | 18 +++++++ .../tests/test_grower.py | 3 +- 5 files changed, 58 insertions(+), 31 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index c2a3f09c3aae4..23ab8acb81afd 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -23,10 +23,11 @@ random sampling procedures. - :class:`decomposition.SparseCoder` with `algorithm='lasso_lars'` |Fix| - :class:`decomposition.SparsePCA` where `normalize_components` has no effect due to deprecation. - - :class:`linear_model.Ridge` when `X` is sparse. |Fix| - - :class:`cluster.KMeans` when `n_jobs=1`. |Fix| +- :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor` |Fix|, |Feature|, + |Enhancement|. Details are listed in the changelog below. @@ -112,30 +113,29 @@ Changelog :mod:`sklearn.ensemble` ....................... 
-- |Feature| :class:`ensemble.HistGradientBoostingClassifier` - and :class:`ensemble.HistGradientBoostingRegressor` now natively supports - dense data with missing values both for training and predicting. They also - support infinite values. :pr:`13911` and :pr:`14406` by `NicolasHug`_, - `Adrin Jalali`_ and `Olivier Grisel`_. - -- |Feature| :class:`ensemble.HistGradientBoostingClassifier` and - :class:`ensemble.HistGradientBoostingRegressor` have an additional - parameter called `warm_start` that enables warm starting. :pr:`14012` by - :user:`Johann Faouzi `. - -- |Fix| :class:`ensemble.HistGradientBoostingClassifier` and - :class:`ensemble.HistGradientBoostingRegressor` now bin the training and - validation data separately to avoid any data leak. :pr:`13933` by - `Nicolas Hug`_. +- Many improvements were made to + :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor`: + + - |MajorFeature| Estimators now natively support dense data with missing + values both for training and predicting. They also support infinite + values. :pr:`13911` and :pr:`14406` by `Nicolas Hug`_, `Adrin Jalali`_ + and `Olivier Grisel`_. + - |Feature| Estimators now have an additional `warm_start` parameter that + enables warm starting. :pr:`14012` by :user:`Johann Faouzi `. + - |Enhancement| for :class:`ensemble.HistGradientBoostingClassifier` the + training loss or score is now monitored on a class-wise stratified + subsample to preserve the class balance of the original training set. + :pr:`14194` by :user:`Johann Faouzi `. + - |Feature| :func:`inspection.partial_dependence` and + :func:`inspection.plot_partial_dependence` now support the fast 'recursion' + method for both estimators. :pr:`13769` by `Nicolas Hug`_. + - |Fix| Estimators now bin the training and validation data separately to + avoid any data leak. :pr:`13933` by `Nicolas Hug`_. - |Fix| :func:`ensemble.VotingClassifier.predict_proba` will no longer be present when `voting='hard'`. :pr:`14287` by `Thomas Fan`_. -- |Enhancement| :class:`ensemble.HistGradientBoostingClassifier` the training - loss or score is now monitored on a class-wise stratified subsample to - preserve the class balance of the original training set. :pr:`14194` - by :user:`Johann Faouzi `. - - |Fix| Run by default :func:`utils.estimator_checks.check_estimator` on both :class:`ensemble.VotingClassifier` and :class:`ensemble.VotingRegressor`. It @@ -188,6 +188,12 @@ Changelog measure the importance of each feature in an arbitrary trained model with respect to a given scoring function. :issue:`13146` by `Thomas Fan`_. +- |Feature| :func:`inspection.partial_dependence` and + :func:`inspection.plot_partial_dependence` now support the fast 'recursion' + method for :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor`. :pr:`13769` by + `Nicolas Hug`_. + :mod:`sklearn.linear_model` ........................... 
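As a quick illustration of the 'recursion' method mentioned in the changelog entries above, here is a sketch with made-up data (it assumes the 0.22-era partial_dependence API from sklearn.inspection, which returns an (averaged_predictions, grid_values) tuple)::

    import numpy as np
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    from sklearn.ensemble import HistGradientBoostingRegressor
    from sklearn.inspection import partial_dependence

    rng = np.random.RandomState(0)
    X = rng.normal(size=(200, 3))
    y = X[:, 0] - 2 * X[:, 1] + 0.1 * rng.normal(size=200)
    est = HistGradientBoostingRegressor().fit(X, y)

    # 'recursion' walks the fitted trees directly instead of evaluating the
    # model on modified copies of X, which is much faster for these estimators.
    averaged_predictions, grid_values = partial_dependence(
        est, X, features=[0], method='recursion')
    print(averaged_predictions.shape, grid_values[0].shape)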
diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 9aa3c643ffcaa..fda060e238514 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -471,10 +471,10 @@ cdef class Splitter: unsigned int n_samples_left unsigned int n_samples_right unsigned int n_samples_ = n_samples - # We set the 'end' variable such that the last non-missing bin - # never goes to the left child (which would result in and empty - # right child), unless there are missing values, since these would - # go to the right child. + # We set the 'end' variable such that the last non-missing-values + # bin never goes to the left child (which would result in an + # empty right child), unless there are missing values, since these + # would go to the right child. unsigned int end = \ self.n_bins_non_missing[feature_idx] - 1 + has_missing_values Y_DTYPE_C sum_hessian_left @@ -549,9 +549,11 @@ cdef class Splitter: (min_gain_to_split, etc.) are discarded here. We scan node from right to left. This version is only called when - there are missing values. If there's no missing value, calling - _find_best_bin_to_split_left_to_right is enough. Missing - values are assigned to the left node. + there are missing values. Missing values are assigned to the left + child. + + If no missing values are present in the data this method isn't called + since only calling _find_best_bin_to_split_left_to_right is enough. """ cdef: diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index f277309e3d3cb..06e38d62f7638 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -274,7 +274,7 @@ def test_subsample(): [1, 0, 0]])]) def test_missing_values_support(n_bins, n_bins_non_missing, X_trans_expected): # check for missing values: make sure nans are mapped to the last bin - # and that attributes are correct + # and that the _BinMapper attributes are correct X = [[1, 1, 0], [np.NaN, np.NaN, 0], diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index ac863f300998c..1eebdefd5288d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -390,6 +390,7 @@ def make_missing_value_data(n_samples=int(1e4), seed=0): def test_infinite_values(): + # Basic test for infinite values X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1) y = np.array([0, 0, 1, 1]) @@ -397,3 +398,20 @@ def test_infinite_values(): gbdt = HistGradientBoostingRegressor(min_samples_leaf=1) gbdt.fit(X, y) np.testing.assert_allclose(gbdt.predict(X), y, atol=1e-4) + + +def test_infinite_values_missing_values(): + # High level test making sure that inf and nan values are properly handled + # when both are present. This is similar to + # test_split_on_nan_with_infinite_values() in test_grower.py, though we + # cannot check the predictions for binned values here. 
+ + X = np.asarray([-np.inf, 0, 1, np.inf, np.nan]).reshape(-1, 1) + y_isnan = np.isnan(X.ravel()) + y_isinf = X.ravel() == np.inf + + stump_clf = HistGradientBoostingClassifier(min_samples_leaf=1, max_iter=1, + learning_rate=1, max_depth=2) + + assert stump_clf.fit(X, y_isinf).score(X, y_isinf) == 1 + assert stump_clf.fit(X, y_isnan).score(X, y_isnan) == 1 diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index 5785c08dba02b..0cc301b7b1b36 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -349,7 +349,8 @@ def test_split_on_nan_with_infinite_values(): # Make sure the split on nan situations are respected even when there are # samples with +inf values (we set the threshold to +inf when we have a # split on nan so this test makes sure this does not introduce edge-case - # bugs) + # bugs). We need to use the private API so that we can also test + # predict_binned(). X = np.array([0, 1, np.inf, np.nan, np.nan]).reshape(-1, 1) # the gradient values will force a split on nan situation From 7c868ae193225fc2c27da85bad622c334009944f Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 20 Aug 2019 13:07:30 -0400 Subject: [PATCH 74/76] addressed comments --- doc/modules/ensemble.rst | 17 ++++++++++++++--- doc/whats_new/v0.22.rst | 2 ++ .../tests/test_splitting.py | 2 +- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 34980965f4203..032093f1f6752 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -864,7 +864,7 @@ Usage Most of the parameters are unchanged from :class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor`. One exception is the ``max_iter`` parameter that replaces ``n_estimators``, and -controls the number of iterations of the boosting process: +controls the number of iterations of the boosting process:: >>> from sklearn.experimental import enable_hist_gradient_boosting >>> from sklearn.ensemble import HistGradientBoostingClassifier @@ -873,8 +873,8 @@ controls the number of iterations of the boosting process: >>> X, y = make_hastie_10_2(random_state=0) >>> X_train, X_test = X[:2000], X[2000:] >>> y_train, y_test = y[:2000], y[2000:] - >>> clf = HistGradientBoostingClassifier(max_iter=100).fit(X_train, y_train) + >>> clf = HistGradientBoostingClassifier(max_iter=100).fit(X_train, y_train) >>> clf.score(X_test, y_test) 0.8965 @@ -905,7 +905,7 @@ values (NaNs). During training, the tree grower learns at each split point whether samples with missing values should go to the left or right child, based on the potential gain. 
When predicting, samples with missing values are assigned to -the left or right child consequently.: +the left or right child consequently:: >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa >>> from sklearn.ensemble import HistGradientBoostingClassifier @@ -913,10 +913,21 @@ the left or right child consequently.: >>> X = np.array([0, 1, 2, np.nan]).reshape(-1, 1) >>> y = [0, 0, 1, 1] + >>> gbdt = HistGradientBoostingClassifier(min_samples_leaf=1).fit(X, y) >>> gbdt.predict(X) array([0, 0, 1, 1]) +When the missingness pattern is predictive, the splits can be done on +whether the feature value is missing or not:: + + >>> X = np.array([0, np.nan, 1, 2, np.nan]).reshape(-1, 1) + >>> y = [0, 1, 0, 0, 1] + >>> gbdt = HistGradientBoostingClassifier(min_samples_leaf=1, + max_iter=1, max_depth=2).fit(X, y) + >>> gbdt.predict(X) + [0, 1, 0, 0, 1] + If no missing values were encountered for a given feature during training, then samples with missing values are mapped to whichever child has the most samples. diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 23ab8acb81afd..779a94c2dd1b0 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -133,6 +133,8 @@ Changelog - |Fix| Estimators now bin the training and validation data separately to avoid any data leak. :pr:`13933` by `Nicolas Hug`_. + Note that pickles from 0.21 will not work in 0.22. + - |Fix| :func:`ensemble.VotingClassifier.predict_proba` will no longer be present when `voting='hard'`. :pr:`14287` by `Thomas Fan`_. diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index 004bad56786ca..a0eb6c6ab61c5 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -374,7 +374,7 @@ def test_splitting_missing_values(X_binned, all_gradients, # is on bin_idx=3, when there are no missing values. # Then we introduce missing values and: # - make sure the chosen bin is correct (find_best_bin()): it's - # still the same split, even though the index of the bin may + # still the same split, even though the index of the bin may change # - make sure the missing values are mapped to the correct child # (split_indices()) From 876f538d3d177ddbfc6bed7d6ccf054cf1fbb391 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 21 Aug 2019 09:22:03 +0200 Subject: [PATCH 75/76] Fix doctest formatting --- doc/modules/ensemble.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 032093f1f6752..4da95eb6cd9e0 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -924,7 +924,7 @@ whether the feature value is missing or not:: >>> X = np.array([0, np.nan, 1, 2, np.nan]).reshape(-1, 1) >>> y = [0, 1, 0, 0, 1] >>> gbdt = HistGradientBoostingClassifier(min_samples_leaf=1, - max_iter=1, max_depth=2).fit(X, y) + ... 
max_iter=1, max_depth=2).fit(X, y) >>> gbdt.predict(X) [0, 1, 0, 0, 1] From 601dc2245675b98a6968f1c6f32c32b2ec423c5c Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 21 Aug 2019 09:57:37 +0200 Subject: [PATCH 76/76] Fix nan predictive doctest --- doc/modules/ensemble.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 4da95eb6cd9e0..fde8f40db6c8c 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -924,9 +924,11 @@ whether the feature value is missing or not:: >>> X = np.array([0, np.nan, 1, 2, np.nan]).reshape(-1, 1) >>> y = [0, 1, 0, 0, 1] >>> gbdt = HistGradientBoostingClassifier(min_samples_leaf=1, - ... max_iter=1, max_depth=2).fit(X, y) + ... max_depth=2, + ... learning_rate=1, + ... max_iter=1).fit(X, y) >>> gbdt.predict(X) - [0, 1, 0, 0, 1] + array([0, 1, 0, 0, 1]) If no missing values were encountered for a given feature during training, then samples with missing values are mapped to whichever child has the most