diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py
index 37a19aad6cb97..7e4f081247836 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/binning.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py
@@ -16,76 +16,53 @@
 from .common import X_DTYPE, X_BINNED_DTYPE, ALMOST_INF
 
 
-def _find_binning_thresholds(data, max_bins, subsample, random_state):
-    """Extract feature-wise quantiles from numerical data.
+def _find_binning_thresholds(col_data, max_bins):
+    """Extract quantiles from a numerical feature.
 
     Missing values are ignored for finding the thresholds.
 
     Parameters
     ----------
-    data : array-like of shape (n_samples, n_features)
-        The data to bin.
-
+    col_data : array-like, shape (n_samples,)
+        The numerical feature to bin.
     max_bins: int
         The maximum number of bins to use for non-missing values. If for a
         given feature the number of unique values is less than ``max_bins``,
         then those unique values will be used to compute the bin thresholds,
-        instead of the quantiles.
-
-    subsample : int or None
-        If ``n_samples > subsample``, then ``sub_samples`` samples will be
-        randomly chosen to compute the quantiles. If ``None``, the whole data
-        is used.
-
-    random_state: int, RandomState instance or None
-        Pseudo-random number generator to control the random sub-sampling.
-        Pass an int for reproducible output across multiple
-        function calls.
-        See :term:`Glossary <random_state>`.
+        instead of the quantiles.
 
     Return
     ------
-    binning_thresholds: list of ndarray
-       For each feature, stores the increasing numeric values that can
-       be used to separate the bins. Thus ``len(binning_thresholds) ==
-       n_features``.
+    binning_thresholds: ndarray
+        The increasing numeric values that can be used to separate the
+        bins for this feature.
     """
-    rng = check_random_state(random_state)
-    if subsample is not None and data.shape[0] > subsample:
-        subset = rng.choice(data.shape[0], subsample, replace=False)
-        data = data.take(subset, axis=0)
-
-    binning_thresholds = []
-    for f_idx in range(data.shape[1]):
-        col_data = data[:, f_idx]
-        # ignore missing values when computing bin thresholds
-        missing_mask = np.isnan(col_data)
-        if missing_mask.any():
-            col_data = col_data[~missing_mask]
-        col_data = np.ascontiguousarray(col_data, dtype=X_DTYPE)
-        distinct_values = np.unique(col_data)
-        if len(distinct_values) <= max_bins:
-            midpoints = distinct_values[:-1] + distinct_values[1:]
-            midpoints *= .5
-        else:
-            # We sort again the data in this case. We could compute
-            # approximate midpoint percentiles using the output of
-            # np.unique(col_data, return_counts) instead but this is more
-            # work and the performance benefit will be limited because we
-            # work on a fixed-size subsample of the full data.
-            percentiles = np.linspace(0, 100, num=max_bins + 1)
-            percentiles = percentiles[1:-1]
-            midpoints = np.percentile(col_data, percentiles,
-                                      interpolation='midpoint').astype(X_DTYPE)
-            assert midpoints.shape[0] == max_bins - 1
-
-        # We avoid having +inf thresholds: +inf thresholds are only allowed in
-        # a "split on nan" situation.
-        np.clip(midpoints, a_min=None, a_max=ALMOST_INF, out=midpoints)
-
-        binning_thresholds.append(midpoints)
-
-    return binning_thresholds
+    # ignore missing values when computing bin thresholds
+    missing_mask = np.isnan(col_data)
+    if missing_mask.any():
+        col_data = col_data[~missing_mask]
+    col_data = np.ascontiguousarray(col_data, dtype=X_DTYPE)
+    distinct_values = np.unique(col_data)
+    if len(distinct_values) <= max_bins:
+        midpoints = distinct_values[:-1] + distinct_values[1:]
+        midpoints *= .5
+    else:
+        # We sort again the data in this case. We could compute
+        # approximate midpoint percentiles using the output of
+        # np.unique(col_data, return_counts) instead but this is more
+        # work and the performance benefit will be limited because we
+        # work on a fixed-size subsample of the full data.
+        percentiles = np.linspace(0, 100, num=max_bins + 1)
+        percentiles = percentiles[1:-1]
+        midpoints = np.percentile(col_data, percentiles,
+                                  interpolation='midpoint').astype(X_DTYPE)
+        assert midpoints.shape[0] == max_bins - 1
+
+    # We avoid having +inf thresholds: +inf thresholds are only allowed in
+    # a "split on nan" situation.
+    np.clip(midpoints, a_min=None, a_max=ALMOST_INF, out=midpoints)
+    return midpoints
 
 
 class _BinMapper(TransformerMixin, BaseEstimator):
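For reference, the two code paths in the rewritten helper behave as follows: with at most max_bins distinct values, the thresholds are the midpoints between consecutive unique values; otherwise they are the max_bins - 1 interior midpoint percentiles. A minimal standalone sketch of this logic, using hypothetical names, plain float64 in place of the sklearn-internal X_DTYPE, and with the ALMOST_INF clipping omitted:

    import numpy as np

    def thresholds_sketch(col_data, max_bins):
        # Drop missing values first, as the patched helper does.
        col_data = col_data[~np.isnan(col_data)].astype(np.float64)
        distinct = np.unique(col_data)
        if len(distinct) <= max_bins:
            # Few distinct values: thresholds are midpoints between them.
            return (distinct[:-1] + distinct[1:]) * 0.5
        # Otherwise take the max_bins - 1 interior midpoint percentiles.
        percentiles = np.linspace(0, 100, num=max_bins + 1)[1:-1]
        # numpy >= 1.22 spells this method='midpoint'; the patch targets
        # the older interpolation= keyword.
        return np.percentile(col_data, percentiles, method='midpoint')

    col = np.array([0.0, 1.0, 2.0, np.nan, 3.0, 4.0])
    print(thresholds_sketch(col, max_bins=10))  # [0.5 1.5 2.5 3.5]
    print(thresholds_sketch(col, max_bins=3))   # [1.5 2.5]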
@@ -170,9 +147,15 @@ def fit(self, X, y=None):
         X = check_array(X, dtype=[X_DTYPE], force_all_finite=False)
         max_bins = self.n_bins - 1
-        self.bin_thresholds_ = _find_binning_thresholds(
-            X, max_bins, subsample=self.subsample,
-            random_state=self.random_state)
+
+        rng = check_random_state(self.random_state)
+        if self.subsample is not None and X.shape[0] > self.subsample:
+            subset = rng.choice(X.shape[0], self.subsample, replace=False)
+            X = X.take(subset, axis=0)
+
+        self.bin_thresholds_ = [
+            _find_binning_thresholds(X[:, f_idx], max_bins)
+            for f_idx in range(X.shape[1])]
 
         self.n_bins_non_missing_ = np.array(
             [thresholds.shape[0] + 1 for thresholds in self.bin_thresholds_],
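The row subsampling that used to live inside the helper now runs once in fit, ahead of the per-column loop, so every feature sees the same subsample. A rough sketch of that pattern under the patched per-column signature (fit_thresholds is a made-up name, not part of the patch):

    import numpy as np
    from sklearn.utils import check_random_state
    from sklearn.ensemble._hist_gradient_boosting.binning import (
        _find_binning_thresholds)

    def fit_thresholds(X, max_bins=255, subsample=int(2e5),
                       random_state=None):
        # Subsample rows once, then find thresholds column by column,
        # mirroring the restructured _BinMapper.fit.
        rng = check_random_state(random_state)
        if subsample is not None and X.shape[0] > subsample:
            subset = rng.choice(X.shape[0], subsample, replace=False)
            X = X.take(subset, axis=0)
        return [_find_binning_thresholds(X[:, f], max_bins)
                for f in range(X.shape[1])]

    X = np.random.RandomState(0).normal(size=(1000, 2))
    thresholds = fit_thresholds(X, subsample=500, random_state=0)
    print([t.shape for t in thresholds])  # [(254,), (254,)]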
diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py
index 06e38d62f7638..305eecc6a965c 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py
@@ -4,7 +4,7 @@
 from sklearn.ensemble._hist_gradient_boosting.binning import (
     _BinMapper,
-    _find_binning_thresholds as _find_binning_thresholds_orig,
+    _find_binning_thresholds,
     _map_to_bins
 )
 from sklearn.ensemble._hist_gradient_boosting.common import X_DTYPE
@@ -17,45 +17,34 @@
 ).astype(X_DTYPE)
 
 
-def _find_binning_thresholds(data, max_bins=255, subsample=int(2e5),
-                             random_state=None):
-    # Just a redef to avoid having to pass arguments all the time (as the
-    # function is private we don't use default values for parameters)
-    return _find_binning_thresholds_orig(data, max_bins, subsample,
-                                         random_state)
-
-
 def test_find_binning_thresholds_regular_data():
     data = np.linspace(0, 10, 1001).reshape(-1, 1)
     bin_thresholds = _find_binning_thresholds(data, max_bins=10)
-    assert_allclose(bin_thresholds[0], [1, 2, 3, 4, 5, 6, 7, 8, 9])
-    assert len(bin_thresholds) == 1
+    assert_allclose(bin_thresholds, [1, 2, 3, 4, 5, 6, 7, 8, 9])
 
     bin_thresholds = _find_binning_thresholds(data, max_bins=5)
-    assert_allclose(bin_thresholds[0], [2, 4, 6, 8])
-    assert len(bin_thresholds) == 1
+    assert_allclose(bin_thresholds, [2, 4, 6, 8])
 
 
 def test_find_binning_thresholds_small_regular_data():
     data = np.linspace(0, 10, 11).reshape(-1, 1)
 
     bin_thresholds = _find_binning_thresholds(data, max_bins=5)
-    assert_allclose(bin_thresholds[0], [2, 4, 6, 8])
+    assert_allclose(bin_thresholds, [2, 4, 6, 8])
 
     bin_thresholds = _find_binning_thresholds(data, max_bins=10)
-    assert_allclose(bin_thresholds[0], [1, 2, 3, 4, 5, 6, 7, 8, 9])
+    assert_allclose(bin_thresholds, [1, 2, 3, 4, 5, 6, 7, 8, 9])
 
     bin_thresholds = _find_binning_thresholds(data, max_bins=11)
-    assert_allclose(bin_thresholds[0], np.arange(10) + .5)
+    assert_allclose(bin_thresholds, np.arange(10) + .5)
 
     bin_thresholds = _find_binning_thresholds(data, max_bins=255)
-    assert_allclose(bin_thresholds[0], np.arange(10) + .5)
+    assert_allclose(bin_thresholds, np.arange(10) + .5)
 
 
 def test_find_binning_thresholds_random_data():
-    bin_thresholds = _find_binning_thresholds(DATA, max_bins=255,
-                                              random_state=0)
-    assert len(bin_thresholds) == 2
+    bin_thresholds = [_find_binning_thresholds(DATA[:, i], max_bins=255)
+                      for i in range(2)]
     for i in range(len(bin_thresholds)):
         assert bin_thresholds[i].shape == (254,)  # 255 - 1
         assert bin_thresholds[i].dtype == DATA.dtype
@@ -68,9 +57,8 @@ def test_find_binning_thresholds_random_data():
 
 
 def test_find_binning_thresholds_low_n_bins():
-    bin_thresholds = _find_binning_thresholds(DATA, max_bins=128,
-                                              random_state=0)
-    assert len(bin_thresholds) == 2
+    bin_thresholds = [_find_binning_thresholds(DATA[:, i], max_bins=128)
+                      for i in range(2)]
     for i in range(len(bin_thresholds)):
         assert bin_thresholds[i].shape == (127,)  # 128 - 1
         assert bin_thresholds[i].dtype == DATA.dtype
@@ -94,8 +82,8 @@ def test_bin_mapper_n_features_transform():
 
 @pytest.mark.parametrize('max_bins', [16, 128, 255])
 def test_map_to_bins(max_bins):
-    bin_thresholds = _find_binning_thresholds(DATA, max_bins=max_bins,
-                                              random_state=0)
+    bin_thresholds = [_find_binning_thresholds(DATA[:, i], max_bins=max_bins)
+                      for i in range(2)]
     binned = np.zeros_like(DATA, dtype=X_BINNED_DTYPE, order='F')
     last_bin_idx = max_bins
     _map_to_bins(DATA, bin_thresholds, last_bin_idx, binned)
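End to end, the behavior these tests pin down is unchanged at the _BinMapper level: thresholds are computed per feature and values are mapped to integer bins. A quick smoke check under the patch might look like this (a sketch only; _BinMapper is private API and the parameter values are illustrative):

    import numpy as np
    from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper

    X = np.random.RandomState(0).uniform(size=(1000, 2))
    mapper = _BinMapper(n_bins=256, subsample=None, random_state=0)
    X_binned = mapper.fit_transform(X)
    # n_bins=256 allows at most 255 bins for non-missing values; the last
    # bin index is reserved for missing values.
    print(X_binned.dtype)              # uint8
    print(mapper.n_bins_non_missing_)  # [255 255] for continuous data
    print(X_binned.max())              # 254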