From 7d7dc2a182bf273b860420390d79f5ff6a879957 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 17 Dec 2018 16:14:26 -0500 Subject: [PATCH 001/247] Added all gbm/* files, removed numba use. Works, need Cython now. --- sklearn/ensemble/__init__.py | 5 +- sklearn/ensemble/gbm/binning.py | 176 ++++++ sklearn/ensemble/gbm/gradient_boosting.py | 700 ++++++++++++++++++++++ sklearn/ensemble/gbm/grower.py | 468 +++++++++++++++ sklearn/ensemble/gbm/histogram.pyx | 195 ++++++ sklearn/ensemble/gbm/loss.py | 299 +++++++++ sklearn/ensemble/gbm/predictor.py | 110 ++++ sklearn/ensemble/gbm/splitting.py | 552 +++++++++++++++++ sklearn/ensemble/gbm/utils.py | 79 +++ sklearn/ensemble/setup.py | 4 + 10 files changed, 2587 insertions(+), 1 deletion(-) create mode 100644 sklearn/ensemble/gbm/binning.py create mode 100644 sklearn/ensemble/gbm/gradient_boosting.py create mode 100644 sklearn/ensemble/gbm/grower.py create mode 100644 sklearn/ensemble/gbm/histogram.pyx create mode 100644 sklearn/ensemble/gbm/loss.py create mode 100644 sklearn/ensemble/gbm/predictor.py create mode 100644 sklearn/ensemble/gbm/splitting.py create mode 100644 sklearn/ensemble/gbm/utils.py diff --git a/sklearn/ensemble/__init__.py b/sklearn/ensemble/__init__.py index 5586a9e1e1fba..7069117704d17 100644 --- a/sklearn/ensemble/__init__.py +++ b/sklearn/ensemble/__init__.py @@ -17,6 +17,8 @@ from .gradient_boosting import GradientBoostingClassifier from .gradient_boosting import GradientBoostingRegressor from .voting_classifier import VotingClassifier +from .gbm.gradient_boosting import GradientBoostingClassifier as GBMCLassifier +from .gbm.gradient_boosting import GradientBoostingRegressor as GBMRegressor from . import bagging from . import forest @@ -32,4 +34,5 @@ "GradientBoostingRegressor", "AdaBoostClassifier", "AdaBoostRegressor", "VotingClassifier", "bagging", "forest", "gradient_boosting", - "partial_dependence", "weight_boosting"] + "partial_dependence", "weight_boosting", + "GBMClassifier", "GBMRegressor"] diff --git a/sklearn/ensemble/gbm/binning.py b/sklearn/ensemble/gbm/binning.py new file mode 100644 index 0000000000000..3371db94095be --- /dev/null +++ b/sklearn/ensemble/gbm/binning.py @@ -0,0 +1,176 @@ +""" +This module contains the BinMapper class. + +BinMapper is used for mapping a real-valued dataset into integer-valued bins +with equally-spaced thresholds. +""" +import numpy as np +from sklearn.utils import check_random_state, check_array +from sklearn.base import BaseEstimator, TransformerMixin + + +def _find_binning_thresholds(data, max_bins=256, subsample=int(2e5), + random_state=None): + """Extract feature-wise equally-spaced quantiles from numerical data + + + Return + ------ + binning_thresholds: tuple of arrays + For each feature, stores the increasing numeric values that can + be used to separate the bins. len(binning_thresholds) == n_features. 
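+
+    For illustration, a sketch of the expected behaviour on a toy feature
+    (assuming ``max_bins`` is at least the number of distinct values)::
+
+        data = np.array([[1.], [2.], [5.], [2.]])
+        thresholds = _find_binning_thresholds(data, max_bins=4)
+        # thresholds[0] is [1.5, 3.5], the midpoints of the consecutive
+        # distinct values 1, 2 and 5.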
+ """ + if not (2 <= max_bins <= 256): + raise ValueError(f'max_bins={max_bins} should be no smaller than 2 ' + f'and no larger than 256.') + rng = check_random_state(random_state) + if subsample is not None and data.shape[0] > subsample: + subset = rng.choice(np.arange(data.shape[0]), subsample) + data = data[subset] + dtype = data.dtype + if dtype.kind != 'f': + dtype = np.float32 + + percentiles = np.linspace(0, 100, num=max_bins + 1)[1:-1] + binning_thresholds = [] + for f_idx in range(data.shape[1]): + col_data = np.ascontiguousarray(data[:, f_idx], dtype=dtype) + distinct_values = np.unique(col_data) + if len(distinct_values) <= max_bins: + midpoints = (distinct_values[:-1] + distinct_values[1:]) + midpoints *= .5 + else: + # We sort again the data in this case. We could compute + # approximate midpoint percentiles using the output of + # np.unique(col_data, return_counts) instead but this is more + # work and the performance benefit will be limited because we + # work on a fixed-size subsample of the full data. + midpoints = np.percentile(col_data, percentiles, + interpolation='midpoint').astype(dtype) + binning_thresholds.append(midpoints) + return tuple(binning_thresholds) + + +def _map_to_bins(data, binning_thresholds=None, out=None): + """Bin numerical values to discrete integer-coded levels. + + Parameters + ---------- + data : array-like, shape=(n_samples, n_features) + The numerical data to bin. + binning_thresholds : tuple of arrays + For each feature, stores the increasing numeric values that are + used to separate the bins. + out : array-like + If not None, write result inplace in out. + + Returns + ------- + binned_data : array of int, shape=data.shape + The binned data. + """ + # TODO: add support for categorical data encoded as integers + # TODO: add support for sparse data (numerical or categorical) + if out is not None: + assert out.shape == data.shape + assert out.dtype == np.uint8 + assert out.flags.f_contiguous + binned = out + else: + binned = np.zeros_like(data, dtype=np.uint8, order='F') + + binning_thresholds = tuple(np.ascontiguousarray(bt, dtype=np.float32) + for bt in binning_thresholds) + + for feature_idx in range(data.shape[1]): + _map_num_col_to_bins(data[:, feature_idx], + binning_thresholds[feature_idx], + binned[:, feature_idx]) + return binned + + +def _map_num_col_to_bins(data, binning_thresholds, binned): + """Binary search to the find the bin index for each value in data.""" + for i in range(data.shape[0]): + # TODO: add support for missing values (NaN or custom marker) + left, right = 0, binning_thresholds.shape[0] + while left < right: + middle = (right + left - 1) // 2 + if data[i] <= binning_thresholds[middle]: + right = middle + else: + left = middle + 1 + binned[i] = left + + +class BinMapper(BaseEstimator, TransformerMixin): + """Transformer that maps a dataset into integer-valued bins. + + The bins are created in a feature-wise fashion, with equally-spaced + quantiles. + + Large datasets are subsampled, but the feature-wise quantiles should + remain stable. + + If the number of unique values for a given feature is less than + ``max_bins``, then the unique values of this feature are used instead of + the quantiles. + + Parameters + ---------- + max_bins : int, optional (default=256) + The maximum number of bins to use. If for a given feature the number of + unique values is less than ``max_bins``, then those unique values + will be used to compute the bin thresholds, instead of the quantiles. 
+ subsample : int or None, optional (default=1e5) + If ``n_samples > subsample``, then ``sub_samples`` samples will be + randomly choosen to compute the quantiles. If ``None``, the whole data + is used. + random_state: int or numpy.random.RandomState or None, \ + optional (default=None) + Pseudo-random number generator to control the random sub-sampling. + See `scikit-learn glossary + `_. + """ + def __init__(self, max_bins=256, subsample=int(1e5), random_state=None): + self.max_bins = max_bins + self.subsample = subsample + self.random_state = random_state + + def fit(self, X, y=None): + """Fit data X by computing the binning thresholds. + + Parameters + ---------- + X: array-like + The data to bin + + Returns + ------- + self : object + """ + X = check_array(X) + self.bin_thresholds_ = _find_binning_thresholds( + X, self.max_bins, subsample=self.subsample, + random_state=self.random_state) + + self.n_bins_per_feature_ = np.array( + [thresholds.shape[0] + 1 for thresholds in self.bin_thresholds_], + dtype=np.uint32) + + return self + + def transform(self, X): + """Bin data X. + + Parameters + ---------- + X: array-like + The data to bin + + Returns + ------- + X_binned : array-like + The binned data + """ + return _map_to_bins(X, binning_thresholds=self.bin_thresholds_) diff --git a/sklearn/ensemble/gbm/gradient_boosting.py b/sklearn/ensemble/gbm/gradient_boosting.py new file mode 100644 index 0000000000000..52fd3b6ad4934 --- /dev/null +++ b/sklearn/ensemble/gbm/gradient_boosting.py @@ -0,0 +1,700 @@ +""" +Gradient Boosting decision trees for classification and regression. +""" +from abc import ABC, abstractmethod + +import numpy as np +from time import time +from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin +from sklearn.utils import check_X_y, check_random_state, check_array +from sklearn.utils.validation import check_is_fitted +from sklearn.utils.multiclass import check_classification_targets +from sklearn.metrics import check_scoring +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import LabelEncoder + +from .binning import BinMapper +from .grower import TreeGrower +from .loss import _LOSSES + + +class BaseGradientBoostingMachine(BaseEstimator, ABC): + """Base class for gradient boosting estimators.""" + + @abstractmethod + def __init__(self, loss, learning_rate, max_iter, max_leaf_nodes, + max_depth, min_samples_leaf, l2_regularization, max_bins, + scoring, validation_split, n_iter_no_change, tol, verbose, + random_state): + self.loss = loss + self.learning_rate = learning_rate + self.max_iter = max_iter + self.max_leaf_nodes = max_leaf_nodes + self.max_depth = max_depth + self.min_samples_leaf = min_samples_leaf + self.l2_regularization = l2_regularization + self.max_bins = max_bins + self.n_iter_no_change = n_iter_no_change + self.validation_split = validation_split + self.scoring = scoring + self.tol = tol + self.verbose = verbose + self.random_state = random_state + + def _validate_parameters(self): + """Validate parameters passed to __init__. + + The parameters that are directly passed to the grower are checked in + TreeGrower.""" + + if self.loss not in self._VALID_LOSSES: + raise ValueError( + "Loss {} is not supported for {}. 
Accepted losses" + "are {}.".format(self.loss, self.__class__.__name__, + ', '.join(self._VALID_LOSSES))) + + if self.learning_rate <= 0: + raise ValueError(f'learning_rate={self.learning_rate} must ' + f'be strictly positive') + if self.max_iter < 1: + raise ValueError(f'max_iter={self.max_iter} must ' + f'not be smaller than 1.') + if self.n_iter_no_change is not None and self.n_iter_no_change < 0: + raise ValueError(f'n_iter_no_change={self.n_iter_no_change} ' + f'must be positive.') + if self.validation_split is not None and self.validation_split <= 0: + raise ValueError(f'validation_split={self.validation_split} ' + f'must be strictly positive, or None.') + if self.tol is not None and self.tol < 0: + raise ValueError(f'tol={self.tol} ' + f'must not be smaller than 0.') + + def fit(self, X, y): + """Fit the gradient boosting model. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_features) + The input samples. + + y : array-like, shape=(n_samples,) + Target values. + + Returns + ------- + self : object + """ + + fit_start_time = time() + acc_find_split_time = 0. # time spent finding the best splits + acc_apply_split_time = 0. # time spent splitting nodes + # time spent predicting X for gradient and hessians update + acc_prediction_time = 0. + # TODO: add support for mixed-typed (numerical + categorical) data + # TODO: add support for missing data + # TODO: add support for pre-binned data (pass-through)? + X, y = check_X_y(X, y, dtype=[np.float32, np.float64]) + y = self._encode_y(y) + if X.shape[0] == 1 or X.shape[1] == 1: + raise ValueError( + 'Passing only one sample or one feature is not supported yet. ' + 'See numba issue #3569.' + ) + rng = check_random_state(self.random_state) + + self._validate_parameters() + self.n_features_ = X.shape[1] # used for validation in predict() + + if self.verbose: + print(f"Binning {X.nbytes / 1e9:.3f} GB of data: ", end="", + flush=True) + tic = time() + self.bin_mapper_ = BinMapper(max_bins=self.max_bins, random_state=rng) + X_binned = self.bin_mapper_.fit_transform(X) + toc = time() + if self.verbose: + duration = toc - tic + troughput = X.nbytes / duration + print(f"{duration:.3f} s ({troughput / 1e6:.3f} MB/s)") + + self.loss_ = self._get_loss() + + self.do_early_stopping_ = (self.n_iter_no_change is not None and + self.n_iter_no_change > 0) + + if self.do_early_stopping_ and self.validation_split is not None: + # stratify for classification + stratify = y if hasattr(self.loss_, 'predict_proba') else None + + X_binned_train, X_binned_val, y_train, y_val = train_test_split( + X_binned, y, test_size=self.validation_split, + stratify=stratify, random_state=rng) + if X_binned_train.size == 0 or X_binned_val.size == 0: + raise ValueError( + f'Not enough data (n_samples={X_binned.shape[0]}) to ' + f'perform early stopping with validation_split=' + f'{self.validation_split}. Use more training data or ' + f'adjust validation_split.' + ) + # Predicting is faster of C-contiguous arrays, training is faster + # on Fortran arrays. + X_binned_val = np.ascontiguousarray(X_binned_val) + X_binned_train = np.asfortranarray(X_binned_train) + else: + X_binned_train, y_train = X_binned, y + X_binned_val, y_val = None, None + + # Subsample the training set for score-based monitoring. 
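+        # The early-stopping scores are recomputed at every iteration, so
+        # they are evaluated on at most 10000 training samples to keep the
+        # monitoring overhead small.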
+ if self.do_early_stopping_: + subsample_size = 10000 + indices = np.arange(X_binned_train.shape[0]) + if X_binned_train.shape[0] > subsample_size: + indices = rng.choice(indices, subsample_size) + X_binned_small_train = X_binned_train[indices] + y_small_train = y_train[indices] + # Predicting is faster of C-contiguous arrays. + X_binned_small_train = np.ascontiguousarray(X_binned_small_train) + + if self.verbose: + print("Fitting gradient boosted rounds:") + + n_samples = X_binned_train.shape[0] + self.baseline_prediction_ = self.loss_.get_baseline_prediction( + y_train, self.n_trees_per_iteration_) + # raw_predictions are the accumulated values predicted by the trees + # for the training data. + raw_predictions = np.zeros( + shape=(n_samples, self.n_trees_per_iteration_), + dtype=self.baseline_prediction_.dtype + ) + raw_predictions += self.baseline_prediction_ + + # gradients and hessians are 1D arrays of size + # n_samples * n_trees_per_iteration + gradients, hessians = self.loss_.init_gradients_and_hessians( + n_samples=n_samples, + prediction_dim=self.n_trees_per_iteration_ + ) + + # predictors_ is a matrix of TreePredictor objects with shape + # (n_iter_, n_trees_per_iteration) + self.predictors_ = predictors = [] + + # scorer_ is a callable with signature (est, X, y) and calls + # est.predict() or est.predict_proba() depending on its nature. + self.scorer_ = check_scoring(self, self.scoring) + self.train_scores_ = [] + self.validation_scores_ = [] + if self.do_early_stopping_: + # Add predictions of the initial model (before the first tree) + self.train_scores_.append( + self._get_scores(X_binned_train, y_train)) + + if self.validation_split is not None: + self.validation_scores_.append( + self._get_scores(X_binned_val, y_val)) + + for iteration in range(self.max_iter): + + if self.verbose: + iteration_start_time = time() + print(f"[{iteration + 1}/{self.max_iter}] ", end='', + flush=True) + + # Update gradients and hessians, inplace + self.loss_.update_gradients_and_hessians(gradients, hessians, + y_train, raw_predictions) + + predictors.append([]) + + # Build `n_trees_per_iteration` trees. + for k, (gradients_at_k, hessians_at_k) in enumerate(zip( + np.array_split(gradients, self.n_trees_per_iteration_), + np.array_split(hessians, self.n_trees_per_iteration_))): + # the xxxx_at_k arrays are **views** on the original arrays. + # Note that for binary classif and regressions, + # n_trees_per_iteration is 1 and xxxx_at_k is equivalent to the + # whole array. 
+ + grower = TreeGrower( + X_binned_train, gradients_at_k, hessians_at_k, + max_bins=self.max_bins, + n_bins_per_feature=self.bin_mapper_.n_bins_per_feature_, + max_leaf_nodes=self.max_leaf_nodes, + max_depth=self.max_depth, + min_samples_leaf=self.min_samples_leaf, + l2_regularization=self.l2_regularization, + shrinkage=self.learning_rate) + grower.grow() + + acc_apply_split_time += grower.total_apply_split_time + acc_find_split_time += grower.total_find_split_time + + predictor = grower.make_predictor( + bin_thresholds=self.bin_mapper_.bin_thresholds_) + predictors[-1].append(predictor) + + tic_pred = time() + + # prepare leaves_data so that _update_raw_predictions can be + # @njitted + leaves_data = [(l.value, l.sample_indices) + for l in grower.finalized_leaves] + _update_raw_predictions(leaves_data, raw_predictions[:, k]) + toc_pred = time() + acc_prediction_time += toc_pred - tic_pred + + should_early_stop = False + if self.do_early_stopping_: + should_early_stop = self._check_early_stopping( + X_binned_small_train, y_small_train, + X_binned_val, y_val) + + if self.verbose: + self._print_iteration_stats(iteration_start_time) + + if should_early_stop: + break + + if self.verbose: + duration = time() - fit_start_time + n_total_leaves = sum( + predictor.get_n_leaf_nodes() + for predictors_at_ith_iteration in self.predictors_ + for predictor in predictors_at_ith_iteration) + n_predictors = sum( + len(predictors_at_ith_iteration) + for predictors_at_ith_iteration in self.predictors_) + print(f"Fit {n_predictors} trees in {duration:.3f} s, " + f"({n_total_leaves} total leaves)") + print(f"{'Time spent finding best splits:':<32} " + f"{acc_find_split_time:.3f}s") + print(f"{'Time spent applying splits:':<32} " + f"{acc_apply_split_time:.3f}s") + print(f"{'Time spent predicting:':<32} " + f"{acc_prediction_time:.3f}s") + + self.train_scores_ = np.asarray(self.train_scores_) + self.validation_scores_ = np.asarray(self.validation_scores_) + return self + + def _check_early_stopping(self, X_binned_train, y_train, + X_binned_val, y_val): + """Check if fitting should be early-stopped. + + Scores are computed on validation data or on training data. + """ + + self.train_scores_.append( + self._get_scores(X_binned_train, y_train)) + + if self.validation_split is not None: + self.validation_scores_.append( + self._get_scores(X_binned_val, y_val)) + return self._should_stop(self.validation_scores_) + + return self._should_stop(self.train_scores_) + + def _should_stop(self, scores): + """ + Return True (do early stopping) if the last n scores aren't better + than the (n-1)th-to-last score, up to some tolerance. + """ + reference_position = self.n_iter_no_change + 1 + if len(scores) < reference_position: + return False + + # A higher score is always better. Higher tol means that it will be + # harder for subsequent iteration to be considered an improvement upon + # the reference score, and therefore it is more likely to early stop + # because of the lack of significant improvement. + tol = 0 if self.tol is None else self.tol + reference_score = scores[-reference_position] + tol + recent_scores = scores[-reference_position + 1:] + recent_improvements = [score > reference_score + for score in recent_scores] + return not any(recent_improvements) + + def _get_scores(self, X, y): + """Compute scores on data X with target y. + + Scores are either computed with a scorer if scoring parameter is not + None, else with the loss. As higher is always better, we return + -loss_value. 
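+
+        For example, with the least-squares loss a lower MSE yields a
+        higher (less negative) score, so ``_should_stop`` can treat higher
+        scores as better in both cases.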
+ """ + if self.scoring is not None: + return self.scorer_(self, X, y) + + # Else, use loss + raw_predictions = self._raw_predict(X) + return -self.loss_(y, raw_predictions) + + def _print_iteration_stats(self, iteration_start_time): + """Print info about the current fitting iteration.""" + log_msg = '' + + predictors_of_ith_iteration = [ + predictors_list for predictors_list in self.predictors_[-1] + if predictors_list + ] + n_trees = len(predictors_of_ith_iteration) + max_depth = max(predictor.get_max_depth() + for predictor in predictors_of_ith_iteration) + n_leaves = sum(predictor.get_n_leaf_nodes() + for predictor in predictors_of_ith_iteration) + + if n_trees == 1: + log_msg += (f"{n_trees} tree, {n_leaves} leaves, ") + else: + log_msg += (f"{n_trees} trees, {n_leaves} leaves ") + log_msg += (f"({int(n_leaves / n_trees)} on avg), ") + + log_msg += f"max depth = {max_depth}, " + + if self.do_early_stopping_: + log_msg += f"{self.scoring} train: {self.train_scores_[-1]:.5f}, " + if self.validation_split is not None: + log_msg += (f"{self.scoring} val: " + f"{self.validation_scores_[-1]:.5f}, ") + + iteration_time = time() - iteration_start_time + log_msg += f"in {iteration_time:0.3f}s" + + print(log_msg) + + def _raw_predict(self, X): + """Return the sum of the leaves values over all predictors. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_features) + The input samples. If ``X.dtype == np.uint8``, the data is assumed + to be pre-binned. + + Returns + ------- + raw_predictions : array, shape (n_samples * n_trees_per_iteration,) + The raw predicted values. + """ + X = check_array(X) + check_is_fitted(self, 'predictors_') + if X.shape[1] != self.n_features_: + raise ValueError( + f'X has {X.shape[1]} features but this estimator was ' + f'trained with {self.n_features_} features.' + ) + n_samples = X.shape[0] + raw_predictions = np.zeros( + shape=(n_samples, self.n_trees_per_iteration_), + dtype=self.baseline_prediction_.dtype + ) + raw_predictions += self.baseline_prediction_ + # Should we parallelize this? + is_binned = X.dtype == np.uint8 + for predictors_of_ith_iteration in self.predictors_: + for k, predictor in enumerate(predictors_of_ith_iteration): + predict = (predictor.predict_binned if is_binned + else predictor.predict) + raw_predictions[:, k] += predict(X) + + return raw_predictions + + @abstractmethod + def _get_loss(self): + pass + + @abstractmethod + def _encode_y(self, y=None): + pass + + @property + def n_iter_(self): + check_is_fitted(self, 'predictors_') + return len(self.predictors_) + + +class GradientBoostingRegressor(BaseGradientBoostingMachine, RegressorMixin): + """Scikit-learn compatible Gradient Boosting Tree for regression. + + Parameters + ---------- + loss : {'least_squares'}, optional(default='least_squares') + The loss function to use in the boosting process. + learning_rate : float, optional(default=0.1) + The learning rate, also known as *shrinkage*. This is used as a + multiplicative factor for the leaves values. Use ``1`` for no + shrinkage. + max_iter : int, optional(default=100) + The maximum number of iterations of the boosting process, i.e. the + maximum number of trees. + max_leaf_nodes : int or None, optional(default=None) + The maximum number of leaves for each tree. If None, there is no + maximum limit. + max_depth : int or None, optional(default=None) + The maximum depth of each tree. The depth of a tree is the number of + nodes to go from the root to the deepest leaf. 
+ min_samples_leaf : int, optional(default=20) + The minimum number of samples per leaf. + l2_regularization : float, optional(default=0) + The L2 regularization parameter. Use 0 for no regularization. + max_bins : int, optional(default=256) + The maximum number of bins to use. Before training, each feature of + the input array ``X`` is binned into at most ``max_bins`` bins, which + allows for a much faster training stage. Features with a small + number of unique values may use less than ``max_bins`` bins. Must be no + larger than 256. + scoring : str or callable or None, \ + optional (default=None) + Scoring parameter to use for early stopping (see sklearn.metrics for + available options). If None, early stopping is check w.r.t the loss + value. + validation_split : int or float or None, optional(default=0.1) + Proportion (or absolute size) of training data to set aside as + validation data for early stopping. If None, early stopping is done on + the training data. + n_iter_no_change : int or None, optional (default=5) + Used to determine when to "early stop". The fitting process is + stopped when none of the last ``n_iter_no_change`` scores are better + than the ``n_iter_no_change - 1``th-to-last one, up to some + tolerance. If None or 0, no early-stopping is done. + tol : float or None optional (default=1e-7) + The absolute tolerance to use when comparing scores. The higher the + tolerance, the more likely we are to early stop: higher tolerance + means that it will be harder for subsequent iterations to be + considered an improvement upon the reference score. + verbose: int, optional (default=0) + The verbosity level. If not zero, print some information about the + fitting process. + random_state : int, np.random.RandomStateInstance or None, \ + optional (default=None) + Pseudo-random number generator to control the subsampling in the + binning process, and the train/validation data split if early stopping + is enabled. See + `scikit-learn glossary + `_. + + + Examples + -------- + >>> from sklearn.datasets import load_boston + >>> from pygbm import GradientBoostingRegressor + >>> X, y = load_boston(return_X_y=True) + >>> est = GradientBoostingRegressor().fit(X, y) + >>> est.score(X, y) + 0.92... + """ + + _VALID_LOSSES = ('least_squares',) + + def __init__(self, loss='least_squares', learning_rate=0.1, + max_iter=100, max_leaf_nodes=31, max_depth=None, + min_samples_leaf=20, l2_regularization=0., max_bins=256, + scoring=None, validation_split=0.1, n_iter_no_change=5, + tol=1e-7, verbose=0, random_state=None): + super(GradientBoostingRegressor, self).__init__( + loss=loss, learning_rate=learning_rate, max_iter=max_iter, + max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, + min_samples_leaf=min_samples_leaf, + l2_regularization=l2_regularization, max_bins=max_bins, + scoring=scoring, validation_split=validation_split, + n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose, + random_state=random_state) + + def predict(self, X): + """Predict values for X. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_features) + The input samples. If ``X.dtype == np.uint8``, the data is assumed + to be pre-binned. + + Returns + ------- + y : array, shape (n_samples,) + The predicted values. 
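+
+        For illustration, a minimal usage sketch on small synthetic data
+        (names and sizes are arbitrary)::
+
+            rng = np.random.RandomState(0)
+            X = rng.randn(500, 5)
+            y = 3 * X[:, 0] - 2 * X[:, 1] + 0.1 * rng.randn(500)
+            est = GradientBoostingRegressor(max_iter=10).fit(X, y)
+            y_pred = est.predict(X)  # shape (500,)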
+ """ + # Return raw predictions after converting shape + # (n_samples, 1) to (n_samples,) + return self._raw_predict(X).ravel() + + def _encode_y(self, y): + # Just convert y to float32 + self.n_trees_per_iteration_ = 1 + y = y.astype(np.float32, copy=False) + return y + + def _get_loss(self): + return _LOSSES[self.loss]() + + +class GradientBoostingClassifier(BaseGradientBoostingMachine, ClassifierMixin): + """Scikit-learn compatible Gradient Boosting Tree for classification. + + Parameters + ---------- + loss : {'auto', 'binary_crossentropy', 'categorical_crossentropy'}, \ + optional(default='auto') + The loss function to use in the boosting process. 'binary_crossentropy' + (also known as logistic loss) is used for binary classification and + generalizes to 'categorical_crossentropy' for multiclass + classification. 'auto' will automatically choose either loss depending + on the nature of the problem. + learning_rate : float, optional(default=1) + The learning rate, also known as *shrinkage*. This is used as a + multiplicative factor for the leaves values. Use ``1`` for no + shrinkage. + max_iter : int, optional(default=100) + The maximum number of iterations of the boosting process, i.e. the + maximum number of trees for binary classification. For multiclass + classification, `n_classes` trees per iteration are built. + max_leaf_nodes : int or None, optional(default=None) + The maximum number of leaves for each tree. If None, there is no + maximum limit. + max_depth : int or None, optional(default=None) + The maximum depth of each tree. The depth of a tree is the number of + nodes to go from the root to the deepest leaf. + min_samples_leaf : int, optional(default=20) + The minimum number of samples per leaf. + l2_regularization : float, optional(default=0) + The L2 regularization parameter. Use 0 for no regularization. + max_bins : int, optional(default=256) + The maximum number of bins to use. Before training, each feature of + the input array ``X`` is binned into at most ``max_bins`` bins, which + allows for a much faster training stage. Features with a small + number of unique values may use less than ``max_bins`` bins. Must be no + larger than 256. + scoring : str or callable or None, optional (default=None) + Scoring parameter to use for early stopping (see sklearn.metrics for + available options). If None, early stopping is check w.r.t the loss + value. + validation_split : int or float or None, optional(default=0.1) + Proportion (or absolute size) of training data to set aside as + validation data for early stopping. If None, early stopping is done on + the training data. + n_iter_no_change : int or None, optional (default=5) + Used to determine when to "early stop". The fitting process is + stopped when none of the last ``n_iter_no_change`` scores are better + than the ``n_iter_no_change - 1``th-to-last one, up to some + tolerance. If None or 0, no early-stopping is done. + tol : float or None optional (default=1e-7) + The absolute tolerance to use when comparing scores. The higher the + tolerance, the more likely we are to early stop: higher tolerance + means that it will be harder for subsequent iterations to be + considered an improvement upon the reference score. + verbose: int, optional(default=0) + The verbosity level. If not zero, print some information about the + fitting process. 
+ random_state : int, np.random.RandomStateInstance or None, \ + optional(default=None) + Pseudo-random number generator to control the subsampling in the + binning process, and the train/validation data split if early stopping + is enabled. See `scikit-learn glossary + `_. + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from pygbm import GradientBoostingClassifier + >>> X, y = load_iris(return_X_y=True) + >>> clf = GradientBoostingClassifier().fit(X, y) + >>> clf.score(X, y) + 0.97... + """ + + _VALID_LOSSES = ('binary_crossentropy', 'categorical_crossentropy', + 'auto') + + def __init__(self, loss='auto', learning_rate=0.1, max_iter=100, + max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, + l2_regularization=0., max_bins=256, scoring=None, + validation_split=0.1, n_iter_no_change=5, tol=1e-7, + verbose=0, random_state=None): + super(GradientBoostingClassifier, self).__init__( + loss=loss, learning_rate=learning_rate, max_iter=max_iter, + max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, + min_samples_leaf=min_samples_leaf, + l2_regularization=l2_regularization, max_bins=max_bins, + scoring=scoring, validation_split=validation_split, + n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose, + random_state=random_state) + + def predict(self, X): + """Predict classes for X. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_features) + The input samples. If ``X.dtype == np.uint8``, the data is assumed + to be pre-binned. + + Returns + ------- + y : array, shape (n_samples,) + The predicted classes. + """ + # This could be done in parallel + encoded_classes = np.argmax(self.predict_proba(X), axis=1) + return self.classes_[encoded_classes] + + def predict_proba(self, X): + """Predict class probabilities for X. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_features) + The input samples. If ``X.dtype == np.uint8``, the data is assumed + to be pre-binned. + + Returns + ------- + p : array, shape (n_samples, n_classes) + The class probabilities of the input samples. + """ + raw_predictions = self._raw_predict(X) + return self.loss_.predict_proba(raw_predictions) + + def _encode_y(self, y): + # encode classes into 0 ... n_classes - 1 and sets attributes classes_ + # and n_trees_per_iteration_ + check_classification_targets(y) + + label_encoder = LabelEncoder() + encoded_y = label_encoder.fit_transform(y) + self.classes_ = label_encoder.classes_ + n_classes = self.classes_.shape[0] + # only 1 tree for binary classification. For multiclass classification, + # we build 1 tree per class. + self.n_trees_per_iteration_ = 1 if n_classes <= 2 else n_classes + encoded_y = encoded_y.astype(np.float32, copy=False) + return encoded_y + + def _get_loss(self): + if self.loss == 'auto': + if self.n_trees_per_iteration_ == 1: + return _LOSSES['binary_crossentropy']() + else: + return _LOSSES['categorical_crossentropy']() + + return _LOSSES[self.loss]() + + +def _update_raw_predictions(leaves_data, raw_predictions): + """Update raw_predictions by reading the predictions of the ith tree + directly form the leaves. + + Can only be used for predicting the training data. raw_predictions + contains the sum of the tree values from iteration 0 to i - 1. This adds + the predictions of the ith tree to raw_predictions. + + Parameters + ---------- + leaves_data: list of tuples (leaf.value, leaf.sample_indices) + The leaves data used to update raw_predictions. 
+ raw_predictions : array-like, shape=(n_samples,) + The raw predictions for the training data. + """ + for leaf_idx in range(len(leaves_data)): + leaf_value, sample_indices = leaves_data[leaf_idx] + for sample_idx in sample_indices: + raw_predictions[sample_idx] += leaf_value diff --git a/sklearn/ensemble/gbm/grower.py b/sklearn/ensemble/gbm/grower.py new file mode 100644 index 0000000000000..f1b5000e78fd7 --- /dev/null +++ b/sklearn/ensemble/gbm/grower.py @@ -0,0 +1,468 @@ +""" +This module contains the TreeGrower class. + +TreeGrowee builds a regression tree fitting a Newton-Raphson step, based on +the gradients and hessians of the training data. +""" +from heapq import heappush, heappop +import numpy as np +from time import time + +from .splitting import (SplittingContext, split_indices, find_node_split, + find_node_split_subtraction) +from .predictor import TreePredictor, PREDICTOR_RECORD_DTYPE + + +class TreeNode: + """Tree Node class used in TreeGrower. + + This isn't used for prediction purposes, only for training (see + TreePredictor). + + Parameters + ---------- + depth : int + The depth of the node, i.e. its distance from the root + samples_indices : array of int + The indices of the samples at the node + sum_gradients : float + The sum of the gradients of the samples at the node + sum_hessians : float + The sum of the hessians of the samples at the node + parent : TreeNode or None, optional(default=None) + The parent of the node. None for root. + + Attributes + ---------- + depth : int + The depth of the node, i.e. its distance from the root + samples_indices : array of int + The indices of the samples at the node + sum_gradients : float + The sum of the gradients of the samples at the node + sum_hessians : float + The sum of the hessians of the samples at the node + parent : TreeNode or None, optional(default=None) + The parent of the node. None for root. + split_info : SplitInfo or None + The result of the split evaluation + left_child : TreeNode or None + The left child of the node. None for leaves. + right_child : TreeNode or None + The right child of the node. None for leaves. + value : float or None + The value of the leaf, as computed in finalize_leaf(). None for + non-leaf nodes + find_split_time : float + The total time spent computing the histogram and finding the best + split at the node. + construction_speed : float + The Number of samples at the node divided find_split_time. + apply_split_time : float + The total time spent actually splitting the node, e.g. splitting + samples_indices into left and right child. + hist_subtraction : bool + Wheter the subtraction method was used for computing the histograms. + """ + + split_info = None + left_child = None + right_child = None + value = None + histograms = None + sibling = None + parent = None + find_split_time = 0. + construction_speed = 0. + apply_split_time = 0. 
+ hist_subtraction = False + + def __init__(self, depth, sample_indices, sum_gradients, + sum_hessians, parent=None): + self.depth = depth + self.sample_indices = sample_indices + self.n_samples = sample_indices.shape[0] + self.sum_gradients = sum_gradients + self.sum_hessians = sum_hessians + self.parent = parent + + def __repr__(self): + # To help with debugging + out = f"TreeNode: depth={self.depth}, " + out += f"samples={len(self.sample_indices)}" + if self.split_info is not None: + out += f", feature_idx={self.split_info.feature_idx}" + out += f", bin_idx={self.split_info.bin_idx}" + return out + + def __lt__(self, other_node): + """Comparison for priority queue. + + Nodes with high gain are higher priority than nodes with low gain. + + heapq.heappush only need the '<' operator. + heapq.heappop take the smallest item first (smaller is higher + priority). + + Parameters + ----------- + other_node : TreeNode + The node to compare with. + """ + if self.split_info is None or other_node.split_info is None: + raise ValueError("Cannot compare nodes with split_info") + return self.split_info.gain > other_node.split_info.gain + + +class TreeGrower: + """Tree grower class used to build a tree. + + The tree is fitted to predict the values of a Newton-Raphson step. The + splits are considered in a best-first fashion, and the quality of a + split is defined in splitting._split_gain. + + Parameters + ---------- + X_binned : array-like of int, shape=(n_samples, n_features) + The binned input samples. Must be Fortran-aligned. + gradients : array-like, shape=(n_samples,) + The gradients of each training sample. Those are the gradients of the + loss w.r.t the predictions, evaluated at iteration ``i - 1``. + hessians : array-like, shape=(n_samples,) + The hessians of each training sample. Those are the hessians of the + loss w.r.t the predictions, evaluated at iteration ``i - 1``. + max_leaf_nodes : int or None, optional(default=None) + The maximum number of leaves for each tree. If None, there is no + maximum limit. + max_depth : int or None, optional(default=None) + The maximum depth of each tree. The depth of a tree is the number of + nodes to go from the root to the deepest leaf. + min_samples_leaf : int, optional(default=20) + The minimum number of samples per leaf. + min_gain_to_split : float, optional(default=0.) + The minimum gain needed to split a node. Splits with lower gain will + be ignored. + max_bins : int, optional(default=256) + The maximum number of bins. Used to define the shape of the + histograms. + n_bins_per_feature : array-like of int or int, optional(default=None) + The actual number of bins needed for each feature, which is lower or + equal to ``max_bins``. If it's an int, all features are considered to + have the same number of bins. If None, all features are considered to + have ``max_bins`` bins. + l2_regularization : float, optional(default=0) + The L2 regularization parameter. + min_hessian_to_split : float, optional(default=1e-3) + The minimum sum of hessians needed in each node. Splits that result in + at least one child having a sum of hessians less than + min_hessian_to_split are discarded. + shrinkage : float, optional(default=1) + The shrinkage parameter to apply to the leaves values, also known as + learning rate. 
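+
+    For illustration, a typical usage sketch (``X_binned``, ``gradients``,
+    ``hessians`` and ``bin_thresholds`` are assumed to come from
+    ``BinMapper`` and a loss object, as in ``gradient_boosting.py``)::
+
+        grower = TreeGrower(X_binned, gradients, hessians,
+                            max_leaf_nodes=31, shrinkage=0.1)
+        grower.grow()
+        predictor = grower.make_predictor(bin_thresholds=bin_thresholds)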
+ """ + def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, + max_depth=None, min_samples_leaf=20, min_gain_to_split=0., + max_bins=256, n_bins_per_feature=None, l2_regularization=0., + min_hessian_to_split=1e-3, shrinkage=1.): + + self._validate_parameters(X_binned, max_leaf_nodes, max_depth, + min_samples_leaf, min_gain_to_split, + l2_regularization, min_hessian_to_split) + + if n_bins_per_feature is None: + n_bins_per_feature = max_bins + + if isinstance(n_bins_per_feature, int): + n_bins_per_feature = np.array( + [n_bins_per_feature] * X_binned.shape[1], + dtype=np.uint32) + + self.splitting_context = SplittingContext( + X_binned, max_bins, n_bins_per_feature, gradients, + hessians, l2_regularization, min_hessian_to_split, + min_samples_leaf, min_gain_to_split) + self.max_leaf_nodes = max_leaf_nodes + self.max_depth = max_depth + self.min_samples_leaf = min_samples_leaf + self.X_binned = X_binned + self.min_gain_to_split = min_gain_to_split + self.shrinkage = shrinkage + self.splittable_nodes = [] + self.finalized_leaves = [] + self.total_find_split_time = 0. # time spent finding the best splits + self.total_apply_split_time = 0. # time spent splitting nodes + self._intilialize_root() + self.n_nodes = 1 + + def _validate_parameters(self, X_binned, max_leaf_nodes, max_depth, + min_samples_leaf, min_gain_to_split, + l2_regularization, min_hessian_to_split): + """Validate parameters passed to __init__. + + Also validate parameters passed to SplittingContext because we cannot + raise exceptions in a jitclass. + """ + if X_binned.dtype != np.uint8: + raise NotImplementedError( + "Explicit feature binning required for now") + if not X_binned.flags.f_contiguous: + raise ValueError( + "X_binned should be passed as Fortran contiguous " + "array for maximum efficiency.") + if max_leaf_nodes is not None and max_leaf_nodes < 1: + raise ValueError(f'max_leaf_nodes={max_leaf_nodes} should not be' + f' smaller than 1') + if max_depth is not None and max_depth < 1: + raise ValueError(f'max_depth={max_depth} should not be' + f' smaller than 1') + if min_samples_leaf < 1: + raise ValueError(f'min_samples_leaf={min_samples_leaf} should ' + f'not be smaller than 1') + if min_gain_to_split < 0: + raise ValueError(f'min_gain_to_split={min_gain_to_split} ' + f'must be positive.') + if l2_regularization < 0: + raise ValueError(f'l2_regularization={l2_regularization} must be ' + f'positive.') + if min_hessian_to_split < 0: + raise ValueError(f'min_hessian_to_split={min_hessian_to_split} ' + f'must be positive.') + + def grow(self): + """Grow the tree, from root to leaves.""" + while self.can_split_further(): + self.split_next() + + def _intilialize_root(self): + """Initialize root node and finalize it if needed.""" + n_samples = self.X_binned.shape[0] + depth = 0 + if self.splitting_context.constant_hessian: + hessian = self.splitting_context.hessians[0] * n_samples + else: + hessian = self.splitting_context.hessians.sum() + self.root = TreeNode( + depth=depth, + sample_indices=self.splitting_context.partition.view(), + sum_gradients=self.splitting_context.gradients.sum(), + sum_hessians=hessian + ) + if (self.max_leaf_nodes is not None and self.max_leaf_nodes == 1): + self._finalize_leaf(self.root) + return + if self.root.n_samples < 2 * self.min_samples_leaf: + # Do not even bother computing any splitting statistics. 
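+            # (Any split would create at least one child with fewer than
+            # min_samples_leaf samples.)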
+ self._finalize_leaf(self.root) + return + + self._compute_spittability(self.root) + + def _compute_spittability(self, node, only_hist=False): + """Compute histograms and best possible split of a node. + + If the best possible gain is 0 of if the constraints aren't met + (min_samples_leaf, min_hessian_to_split, min_gain_to_split) then the + node is finalized (transformed into a leaf), else it is pushed on + the splittable node heap. + + Parameters + ---------- + node : TreeNode + The node to evaluate. + only_hist : bool, optional (default=False) + Whether to only compute the histograms and the SplitInfo. It is + set to ``True`` when ``_compute_spittability`` was called by a + sibling node: we only want to compute the histograms (which also + computes the ``SplitInfo``), not finalize or push the node. If + ``_compute_spittability`` is called again by the grower on this + same node, the histograms won't be computed again. + """ + # Compute split_info and histograms if not already done + if node.split_info is None and node.histograms is None: + # If the sibling has less samples, compute its hist first (with + # the regular method) and use the subtraction method for the + # current node + if node.sibling is not None: # root has no sibling + if node.sibling.n_samples < node.n_samples: + self._compute_spittability(node.sibling, only_hist=True) + # As hist of sibling is now computed we'll use the hist + # subtraction method for the current node. + node.hist_subtraction = True + + tic = time() + if node.hist_subtraction: + split_info, histograms = find_node_split_subtraction( + self.splitting_context, node.sample_indices, + node.parent.histograms, node.sibling.histograms) + else: + split_info, histograms = find_node_split( + self.splitting_context, node.sample_indices) + toc = time() + node.find_split_time = toc - tic + self.total_find_split_time += node.find_split_time + node.construction_speed = node.n_samples / node.find_split_time + node.split_info = split_info + node.histograms = histograms + + if only_hist: + # _compute_spittability was called by a sibling. We only needed to + # compute the histogram. + return + + if node.split_info.gain <= 0: # no valid split + # Note: this condition is reached if either all the leaves are + # pure (best gain = 0), or if no split would satisfy the + # constraints, (min_hessians_to_split, min_gain_to_split, + # min_samples_leaf) + self._finalize_leaf(node) + + else: + heappush(self.splittable_nodes, node) + + def split_next(self): + """Split the node with highest potential gain. + + Returns + ------- + left : TreeNode + The resulting left child. + right : TreeNode + The resulting right child. + """ + if len(self.splittable_nodes) == 0: + raise StopIteration("No more splittable nodes") + + # Consider the node with the highest loss reduction (a.k.a. 
gain) + node = heappop(self.splittable_nodes) + + tic = time() + (sample_indices_left, sample_indices_right) = split_indices( + self.splitting_context, node.split_info, node.sample_indices) + toc = time() + node.apply_split_time = toc - tic + self.total_apply_split_time += node.apply_split_time + + depth = node.depth + 1 + n_leaf_nodes = len(self.finalized_leaves) + len(self.splittable_nodes) + n_leaf_nodes += 2 + + left_child_node = TreeNode(depth, + sample_indices_left, + node.split_info.gradient_left, + node.split_info.hessian_left, + parent=node) + right_child_node = TreeNode(depth, + sample_indices_right, + node.split_info.gradient_right, + node.split_info.hessian_right, + parent=node) + left_child_node.sibling = right_child_node + right_child_node.sibling = left_child_node + node.right_child = right_child_node + node.left_child = left_child_node + self.n_nodes += 2 + + if self.max_depth is not None and depth == self.max_depth: + self._finalize_leaf(left_child_node) + self._finalize_leaf(right_child_node) + return left_child_node, right_child_node + + if (self.max_leaf_nodes is not None + and n_leaf_nodes == self.max_leaf_nodes): + self._finalize_leaf(left_child_node) + self._finalize_leaf(right_child_node) + self._finalize_splittable_nodes() + return left_child_node, right_child_node + + if left_child_node.n_samples < self.min_samples_leaf * 2: + self._finalize_leaf(left_child_node) + else: + self._compute_spittability(left_child_node) + + if right_child_node.n_samples < self.min_samples_leaf * 2: + self._finalize_leaf(right_child_node) + else: + self._compute_spittability(right_child_node) + + return left_child_node, right_child_node + + def can_split_further(self): + """Return True if there are still nodes to split.""" + return len(self.splittable_nodes) >= 1 + + def _finalize_leaf(self, node): + """Compute the prediction value that minimizes the objective function. + + This sets the node.value attribute (node is a leaf iff node.value is + not None). + + See Equation 5 of: + XGBoost: A Scalable Tree Boosting System, T. Chen, C. Guestrin, 2016 + https://arxiv.org/abs/1603.02754 + """ + node.value = -self.shrinkage * node.sum_gradients / ( + node.sum_hessians + self.splitting_context.l2_regularization) + self.finalized_leaves.append(node) + + def _finalize_splittable_nodes(self): + """Transform all splittable nodes into leaves. + + Used when some constraint is met e.g. maximum number of leaves or + maximum depth.""" + while len(self.splittable_nodes) > 0: + node = self.splittable_nodes.pop() + self._finalize_leaf(node) + + def make_predictor(self, bin_thresholds=None): + """Make a TreePredictor object out of the current tree. + + Parameters + ---------- + bin_thresholds : array-like of floats, optional (default=None) + The actual thresholds values of each bin. + + Returns + ------- + A TreePredictor object. 
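+
+        The returned predictor can then be evaluated on raw or pre-binned
+        data, e.g. ``predictor.predict(X)`` or
+        ``predictor.predict_binned(X_binned)``.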
+ """ + predictor_nodes = np.zeros(self.n_nodes, dtype=PREDICTOR_RECORD_DTYPE) + self._fill_predictor_node_array(predictor_nodes, self.root, + bin_thresholds=bin_thresholds) + return TreePredictor(predictor_nodes) + + def _fill_predictor_node_array(self, predictor_nodes, grower_node, + bin_thresholds=None, next_free_idx=0): + """Helper used in make_predictor to set the TreePredictor fields.""" + node = predictor_nodes[next_free_idx] + node['count'] = grower_node.n_samples + node['depth'] = grower_node.depth + if grower_node.split_info is not None: + node['gain'] = grower_node.split_info.gain + else: + node['gain'] = -1 + + if grower_node.value is not None: + # Leaf node + node['is_leaf'] = True + node['value'] = grower_node.value + return next_free_idx + 1 + else: + # Decision node + split_info = grower_node.split_info + feature_idx, bin_idx = split_info.feature_idx, split_info.bin_idx + node['feature_idx'] = feature_idx + node['bin_threshold'] = bin_idx + if bin_thresholds is not None: + threshold = bin_thresholds[feature_idx][bin_idx] + node['threshold'] = threshold + next_free_idx += 1 + + node['left'] = next_free_idx + next_free_idx = self._fill_predictor_node_array( + predictor_nodes, grower_node.left_child, + bin_thresholds=bin_thresholds, next_free_idx=next_free_idx) + + node['right'] = next_free_idx + return self._fill_predictor_node_array( + predictor_nodes, grower_node.right_child, + bin_thresholds=bin_thresholds, next_free_idx=next_free_idx) diff --git a/sklearn/ensemble/gbm/histogram.pyx b/sklearn/ensemble/gbm/histogram.pyx new file mode 100644 index 0000000000000..3052be71617d1 --- /dev/null +++ b/sklearn/ensemble/gbm/histogram.pyx @@ -0,0 +1,195 @@ +"""This module contains njitted routines for building histograms. + +A histogram is an array with n_bins entry of type HISTOGRAM_DTYPE. Each +feature has its own histogram. A histogram contains the sum of gradients and +hessians of all the samples belonging to each bin. 
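+
+A property exploited by the tree grower: since a node's samples are the
+disjoint union of its children's samples, the histogram of one child can be
+obtained by subtracting the sibling's histogram from the parent's, e.g.::
+
+    hist_left = _subtract_histograms(n_bins, hist_parent, hist_right)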
+""" +cimport cython + +import numpy as np +cimport numpy as np + +HISTOGRAM_DTYPE = np.dtype([ + ('sum_gradients', np.float32), + ('sum_hessians', np.float32), + ('count', np.uint32), +]) + + +def _build_histogram_naive(n_bins, sample_indices, binned_feature, + ordered_gradients, ordered_hessians): + """Build histogram in a naive way, without optimizing for cache hit.""" + histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + for i, sample_idx in enumerate(sample_indices): + bin_idx = binned_feature[sample_idx] + histogram[bin_idx]['sum_gradients'] += ordered_gradients[i] + histogram[bin_idx]['sum_hessians'] += ordered_hessians[i] + histogram[bin_idx]['count'] += 1 + return histogram + + +def _subtract_histograms(n_bins, hist_a, hist_b): + """Return hist_a - hist_b""" + + histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + + sg = 'sum_gradients' + sh = 'sum_hessians' + c = 'count' + + for i in range(n_bins): + histogram[i][sg] = hist_a[i][sg] - hist_b[i][sg] + histogram[i][sh] = hist_a[i][sh] - hist_b[i][sh] + histogram[i][c] = hist_a[i][c] - hist_b[i][c] + + return histogram + + +def _build_histogram(n_bins, sample_indices, binned_feature, ordered_gradients, + ordered_hessians): + """Return histogram for a given feature.""" + histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + n_node_samples = sample_indices.shape[0] + unrolled_upper = (n_node_samples // 4) * 4 + + for i in range(0, unrolled_upper, 4): + bin_0 = binned_feature[sample_indices[i]] + bin_1 = binned_feature[sample_indices[i + 1]] + bin_2 = binned_feature[sample_indices[i + 2]] + bin_3 = binned_feature[sample_indices[i + 3]] + + histogram[bin_0]['sum_gradients'] += ordered_gradients[i] + histogram[bin_1]['sum_gradients'] += ordered_gradients[i + 1] + histogram[bin_2]['sum_gradients'] += ordered_gradients[i + 2] + histogram[bin_3]['sum_gradients'] += ordered_gradients[i + 3] + + histogram[bin_0]['sum_hessians'] += ordered_hessians[i] + histogram[bin_1]['sum_hessians'] += ordered_hessians[i + 1] + histogram[bin_2]['sum_hessians'] += ordered_hessians[i + 2] + histogram[bin_3]['sum_hessians'] += ordered_hessians[i + 3] + + histogram[bin_0]['count'] += 1 + histogram[bin_1]['count'] += 1 + histogram[bin_2]['count'] += 1 + histogram[bin_3]['count'] += 1 + + for i in range(unrolled_upper, n_node_samples): + bin_idx = binned_feature[sample_indices[i]] + histogram[bin_idx]['sum_gradients'] += ordered_gradients[i] + histogram[bin_idx]['sum_hessians'] += ordered_hessians[i] + histogram[bin_idx]['count'] += 1 + + return histogram + + +def _build_histogram_no_hessian(n_bins, sample_indices, binned_feature, + ordered_gradients): + """Return histogram for a given feature. + + Hessians are not updated (used when hessians are constant). 
+ """ + histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + n_node_samples = sample_indices.shape[0] + unrolled_upper = (n_node_samples // 4) * 4 + + for i in range(0, unrolled_upper, 4): + bin_0 = binned_feature[sample_indices[i]] + bin_1 = binned_feature[sample_indices[i + 1]] + bin_2 = binned_feature[sample_indices[i + 2]] + bin_3 = binned_feature[sample_indices[i + 3]] + + histogram[bin_0]['sum_gradients'] += ordered_gradients[i] + histogram[bin_1]['sum_gradients'] += ordered_gradients[i + 1] + histogram[bin_2]['sum_gradients'] += ordered_gradients[i + 2] + histogram[bin_3]['sum_gradients'] += ordered_gradients[i + 3] + + histogram[bin_0]['count'] += 1 + histogram[bin_1]['count'] += 1 + histogram[bin_2]['count'] += 1 + histogram[bin_3]['count'] += 1 + + for i in range(unrolled_upper, n_node_samples): + bin_idx = binned_feature[sample_indices[i]] + histogram[bin_idx]['sum_gradients'] += ordered_gradients[i] + histogram[bin_idx]['count'] += 1 + + return histogram + + +def _build_histogram_root_no_hessian(n_bins, binned_feature, all_gradients): + """Special case for the root node + + The root node has to find the split among all the samples from the + training set. binned_feature and all_gradients already have a consistent + ordering. + + Hessians are not updated (used when hessians are constant) + """ + histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + n_node_samples = binned_feature.shape[0] + unrolled_upper = (n_node_samples // 4) * 4 + + for i in range(0, unrolled_upper, 4): + bin_0 = binned_feature[i] + bin_1 = binned_feature[i + 1] + bin_2 = binned_feature[i + 2] + bin_3 = binned_feature[i + 3] + + histogram[bin_0]['sum_gradients'] += all_gradients[i] + histogram[bin_1]['sum_gradients'] += all_gradients[i + 1] + histogram[bin_2]['sum_gradients'] += all_gradients[i + 2] + histogram[bin_3]['sum_gradients'] += all_gradients[i + 3] + + histogram[bin_0]['count'] += 1 + histogram[bin_1]['count'] += 1 + histogram[bin_2]['count'] += 1 + histogram[bin_3]['count'] += 1 + + for i in range(unrolled_upper, n_node_samples): + bin_idx = binned_feature[i] + histogram[bin_idx]['sum_gradients'] += all_gradients[i] + histogram[bin_idx]['count'] += 1 + + return histogram + + +def _build_histogram_root(n_bins, binned_feature, all_gradients, + all_hessians): + """Special case for the root node + + The root node has to find the split among all the samples from the + training set. binned_feature and all_gradients and all_hessians already + have a consistent ordering. 
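+
+    This allows iterating over ``binned_feature`` directly instead of going
+    through ``sample_indices``, avoiding one indirection per sample.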
+ """ + histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + n_node_samples = binned_feature.shape[0] + unrolled_upper = (n_node_samples // 4) * 4 + + for i in range(0, unrolled_upper, 4): + bin_0 = binned_feature[i] + bin_1 = binned_feature[i + 1] + bin_2 = binned_feature[i + 2] + bin_3 = binned_feature[i + 3] + + histogram[bin_0]['sum_gradients'] += all_gradients[i] + histogram[bin_1]['sum_gradients'] += all_gradients[i + 1] + histogram[bin_2]['sum_gradients'] += all_gradients[i + 2] + histogram[bin_3]['sum_gradients'] += all_gradients[i + 3] + + histogram[bin_0]['sum_hessians'] += all_hessians[i] + histogram[bin_1]['sum_hessians'] += all_hessians[i + 1] + histogram[bin_2]['sum_hessians'] += all_hessians[i + 2] + histogram[bin_3]['sum_hessians'] += all_hessians[i + 3] + + histogram[bin_0]['count'] += 1 + histogram[bin_1]['count'] += 1 + histogram[bin_2]['count'] += 1 + histogram[bin_3]['count'] += 1 + + for i in range(unrolled_upper, n_node_samples): + bin_idx = binned_feature[i] + histogram[bin_idx]['sum_gradients'] += all_gradients[i] + histogram[bin_idx]['sum_hessians'] += all_hessians[i] + histogram[bin_idx]['count'] += 1 + + return histogram diff --git a/sklearn/ensemble/gbm/loss.py b/sklearn/ensemble/gbm/loss.py new file mode 100644 index 0000000000000..134569a517d5c --- /dev/null +++ b/sklearn/ensemble/gbm/loss.py @@ -0,0 +1,299 @@ +""" +This module contains the loss classes. + +Specific losses are used for regression, binary classification or multiclass +classification. +""" +from abc import ABC, abstractmethod + +from scipy.special import expit, logsumexp +import numpy as np + +from .utils import get_threads_chunks + + +def _logsumexp(a): + """logsumexp(x) = log(sum(exp(x))) + + Custom logsumexp function with numerical stability, based on scipy's + logsumexp which is unfortunately not supported (neither is + np.logaddexp.reduce, which is equivalent). Only supports 1d arrays. + """ + + a_max = np.amax(a) + if not np.isfinite(a_max): + a_max = 0 + + s = np.sum(np.exp(a - a_max)) + return np.log(s) + a_max + + +def _expit(x): + # custom sigmoid because we cannot use that of scipy with numba + return 1 / (1 + np.exp(-x)) + + +class BaseLoss(ABC): + """Base class for a loss.""" + + def init_gradients_and_hessians(self, n_samples, prediction_dim): + """Return initial gradients and hessians. + + Unless hessians are constant, arrays are initialized with undefined + values. + + Parameters + ---------- + n_samples : int + The number of samples passed to `fit()` + prediction_dim : int + The dimension of a raw prediction, i.e. the number of trees + built at each iteration. Equals 1 for regression and binary + classification, or K where K is the number of classes for + multiclass classification. + + Returns + ------- + gradients : array-like, shape=(n_samples * prediction_dim) + hessians : array-like, shape=(n_samples * prediction_dim). + If hessians are constant (e.g. for ``LeastSquares`` loss, shape + is (1,) and the array is initialized to ``1``. + """ + shape = n_samples * prediction_dim + gradients = np.empty(shape=shape, dtype=np.float32) + if self.hessian_is_constant: + hessians = np.ones(shape=1, dtype=np.float32) + else: + hessians = np.empty(shape=shape, dtype=np.float32) + + return gradients, hessians + + @abstractmethod + def get_baseline_prediction(self, y_train, prediction_dim): + """Return initial predictions (before the first iteration). + + Parameters + ---------- + y_train : array-like, shape=(n_samples,) + The target training values. 
+ prediction_dim : int + The dimension of one prediction: 1 for binary classification and + regression, n_classes for multiclass classification. + + Returns + ------- + baseline_prediction: float or array of shape (1, prediction_dim) + The baseline prediction. + """ + pass + + @abstractmethod + def update_gradients_and_hessians(self, gradients, hessians, y_true, + raw_predictions): + """Update gradients and hessians arrays, inplace. + + The gradients (resp. hessians) are the first (resp. second) order + derivatives of the loss for each sample with respect to the + predictions of model, evaluated at iteration ``i - 1``. + + Parameters + ---------- + gradients : array-like, shape=(n_samples * prediction_dim) + The gradients (treated as OUT array). + hessians : array-like, shape=(n_samples * prediction_dim) or \ + (1,) + The hessians (treated as OUT array). + y_true : array-like, shape=(n_samples,) + The true target values or each training sample. + raw_predictions : array-like, shape=(n_samples, prediction_dim) + The raw_predictions (i.e. values from the trees) of the tree + ensemble at iteration ``i - 1``. + """ + pass + + +class LeastSquares(BaseLoss): + """Least squares loss, for regression. + + For a given sample x_i, least squares loss is defined as:: + + loss(x_i) = (y_true_i - raw_pred_i)**2 + """ + + hessian_is_constant = True + + def __call__(self, y_true, raw_predictions, average=True): + # shape (n_samples, 1) --> (n_samples,). reshape(-1) is more likely to + # return a view. + raw_predictions = raw_predictions.reshape(-1) + loss = np.power(y_true - raw_predictions, 2) + return loss.mean() if average else loss + + def get_baseline_prediction(self, y_train, prediction_dim): + return np.mean(y_train) + + def inverse_link_function(self, raw_predictions): + return raw_predictions + + def update_gradients_and_hessians(self, gradients, hessians, y_true, + raw_predictions): + return _update_gradients_least_squares(gradients, y_true, + raw_predictions) + + +def _update_gradients_least_squares(gradients, y_true, raw_predictions): + # shape (n_samples, 1) --> (n_samples,). reshape(-1) is more likely to + # return a view. + raw_predictions = raw_predictions.reshape(-1) + n_samples = raw_predictions.shape[0] + starts, ends, n_threads = get_threads_chunks(total_size=n_samples) + for thread_idx in range(n_threads): + for i in range(starts[thread_idx], ends[thread_idx]): + # Note: a more correct exp is 2 * (raw_predictions - y_true) but + # since we use 1 for the constant hessian value (and not 2) this + # is strictly equivalent for the leaves values. + gradients[i] = raw_predictions[i] - y_true[i] + + +class BinaryCrossEntropy(BaseLoss): + """Binary cross-entropy loss, for binary classification. + + For a given sample x_i, the binary cross-entropy loss is defined as the + negative log-likelihood of the model which can be expressed as:: + + loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i + + See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman. + """ + + hessian_is_constant = False + inverse_link_function = staticmethod(expit) + + def __call__(self, y_true, raw_predictions, average=True): + # shape (n_samples, 1) --> (n_samples,). reshape(-1) is more likely to + # return a view. 
+ raw_predictions = raw_predictions.reshape(-1)
+ # logaddexp(0, x) = log(1 + exp(x))
+ loss = np.logaddexp(0, raw_predictions) - y_true * raw_predictions
+ return loss.mean() if average else loss
+
+ def get_baseline_prediction(self, y_train, prediction_dim):
+ proba_positive_class = np.mean(y_train)
+ eps = np.finfo(y_train.dtype).eps
+ proba_positive_class = np.clip(proba_positive_class, eps, 1 - eps)
+ # log(x / (1 - x)) is the inverse of the sigmoid (logit), i.e. the
+ # link function of the Binomial model.
+ return np.log(proba_positive_class / (1 - proba_positive_class))
+
+ def update_gradients_and_hessians(self, gradients, hessians, y_true,
+ raw_predictions):
+ return _update_gradients_hessians_binary_crossentropy(
+ gradients, hessians, y_true, raw_predictions)
+
+ def predict_proba(self, raw_predictions):
+ # shape (n_samples, 1) --> (n_samples,). reshape(-1) is more likely to
+ # return a view.
+ raw_predictions = raw_predictions.reshape(-1)
+ proba = np.empty((raw_predictions.shape[0], 2), dtype=np.float32)
+ proba[:, 1] = expit(raw_predictions)
+ proba[:, 0] = 1 - proba[:, 1]
+ return proba
+
+
+def _update_gradients_hessians_binary_crossentropy(gradients, hessians,
+ y_true, raw_predictions):
+ # Note: using the LightGBM version (which first maps {0, 1} into {-1, 1})
+ # would cause overflow issues in the exponential, as we're using float32
+ # precision.
+
+ # shape (n_samples, 1) --> (n_samples,). reshape(-1) is more likely to
+ # return a view.
+ raw_predictions = raw_predictions.reshape(-1)
+ n_samples = raw_predictions.shape[0]
+ starts, ends, n_threads = get_threads_chunks(total_size=n_samples)
+ for thread_idx in range(n_threads):
+ for i in range(starts[thread_idx], ends[thread_idx]):
+ gradients[i] = _expit(raw_predictions[i]) - y_true[i]
+ gradient_abs = np.abs(gradients[i])
+ hessians[i] = gradient_abs * (1. - gradient_abs)
+
+
+class CategoricalCrossEntropy(BaseLoss):
+ """Categorical cross-entropy loss, for multiclass classification.
+
+ For a given sample x_i, the categorical cross-entropy loss is defined as
+ the negative log-likelihood of the model and generalizes the binary
+ cross-entropy to more than 2 classes.
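+
+ Concretely, for a sample x_i whose true class is k, the loss can be
+ written as::
+
+ loss(x_i) = logsumexp(raw_pred_i) - raw_pred_i[k]
+
+ i.e. the negative log of the softmax probability of the true class.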
+ """ + + hessian_is_constant = False + + def __call__(self, y_true, raw_predictions, average=True): + one_hot_true = np.zeros_like(raw_predictions) + prediction_dim = raw_predictions.shape[1] + for k in range(prediction_dim): + one_hot_true[:, k] = (y_true == k) + + loss = (logsumexp(raw_predictions, axis=1) - + (one_hot_true * raw_predictions).sum(axis=1)) + return loss.mean() if average else loss + + def get_baseline_prediction(self, y_train, prediction_dim): + init_value = np.zeros( + shape=(1, prediction_dim), + dtype=np.float32 + ) + eps = np.finfo(y_train.dtype).eps + for k in range(prediction_dim): + proba_kth_class = np.mean(y_train == k) + proba_kth_class = np.clip(proba_kth_class, eps, 1 - eps) + init_value[:, k] += np.log(proba_kth_class) + + return init_value + + def update_gradients_and_hessians(self, gradients, hessians, y_true, + raw_predictions): + return _update_gradients_hessians_categorical_crossentropy( + gradients, hessians, y_true, raw_predictions) + + def predict_proba(self, raw_predictions): + # TODO: This could be done in parallel + # compute softmax (using exp(log(softmax))) + return np.exp(raw_predictions - + logsumexp(raw_predictions, axis=1)[:, np.newaxis]) + + +def _update_gradients_hessians_categorical_crossentropy( + gradients, hessians, y_true, raw_predictions): + # Here gradients and hessians are of shape + # (n_samples * prediction_dim,). + # y_true is of shape (n_samples,). + # raw_predictions is of shape (n_samples, raw_predictions) + # + # Instead of passing the whole gradients and hessians arrays and slicing + # them here, we could instead do the update in the 'for k in ...' loop of + # fit(), by passing gradients_at_k and hessians_at_k which are of size + # (n_samples,). + # That would however require to pass a copy of raw_predictions, so it does + # not get partially overwritten at the end of the loop when + # _update_y_pred() is called (see sklearn PR 12715) + n_samples, prediction_dim = raw_predictions.shape + starts, ends, n_threads = get_threads_chunks(total_size=n_samples) + for k in range(prediction_dim): + gradients_at_k = gradients[n_samples * k:n_samples * (k + 1)] + hessians_at_k = hessians[n_samples * k:n_samples * (k + 1)] + for thread_idx in range(n_threads): + for i in range(starts[thread_idx], ends[thread_idx]): + # p_k is the probability that class(ith sample) == k. + # This is a regular softmax. + p_k = np.exp(raw_predictions[i, k] - + _logsumexp(raw_predictions[i, :])) + gradients_at_k[i] = p_k - (y_true[i] == k) + hessians_at_k[i] = p_k * (1. - p_k) + # LightGBM uses 2 * p_k * (1 - p_k) which is not stricly + # correct but equivalent to using half the learning rate. + + +_LOSSES = {'least_squares': LeastSquares, + 'binary_crossentropy': BinaryCrossEntropy, + 'categorical_crossentropy': CategoricalCrossEntropy} diff --git a/sklearn/ensemble/gbm/predictor.py b/sklearn/ensemble/gbm/predictor.py new file mode 100644 index 0000000000000..ab549639aa8cb --- /dev/null +++ b/sklearn/ensemble/gbm/predictor.py @@ -0,0 +1,110 @@ +""" +This module contains the TreePredictor class which is used for prediction. +""" +import numpy as np + + +PREDICTOR_RECORD_DTYPE = np.dtype([ + ('is_leaf', np.uint8), + ('value', np.float32), + ('count', np.uint32), + ('feature_idx', np.uint32), + ('bin_threshold', np.uint8), + ('threshold', np.float32), + ('left', np.uint32), + ('right', np.uint32), + ('gain', np.float32), + ('depth', np.uint32), + # TODO: shrinkage in leaf for feature importance error bar? 
+]) + + +class TreePredictor: + """Tree class used for predictions. + + Parameters + ---------- + nodes : list of PREDICTOR_RECORD_DTYPE. + The nodes of the tree. + """ + def __init__(self, nodes): + self.nodes = nodes + + def get_n_leaf_nodes(self): + """Return number of leaves.""" + return int(self.nodes['is_leaf'].sum()) + + def get_max_depth(self): + """Return maximum depth among all leaves.""" + return int(self.nodes['depth'].max()) + + def predict_binned(self, binned_data, out=None): + """Predict raw values for binned data. + + Parameters + ---------- + binned_data : array-like of np.uint8, shape=(n_samples, n_features) + The binned input samples. + out : array-like, shape=(n_samples,), optional (default=None) + If not None, predictions will be written inplace in ``out``. + + Returns + ------- + y : array, shape (n_samples,) + The raw predicted values. + """ + if out is None: + out = np.empty(binned_data.shape[0], dtype=np.float32) + _predict_binned(self.nodes, binned_data, out) + return out + + def predict(self, X): + """Predict raw values for non-binned data. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_features) + The input samples. + + Returns + ------- + y : array, shape (n_samples,) + The raw predicted values. + """ + # TODO: introspect X to dispatch to numerical or categorical data + # (dense or sparse) on a feature by feature basis. + out = np.empty(X.shape[0], dtype=np.float32) + _predict_from_numeric_data(self.nodes, X, out) + return out + + +def _predict_one_binned(nodes, binned_data): + node = nodes[0] + while True: + if node['is_leaf']: + return node['value'] + if binned_data[node['feature_idx']] <= node['bin_threshold']: + node = nodes[node['left']] + else: + node = nodes[node['right']] + + +def _predict_binned(nodes, binned_data, out): + for i in range(binned_data.shape[0]): + out[i] = _predict_one_binned(nodes, binned_data[i]) + + +def _predict_one_from_numeric_data(nodes, numeric_data): + node = nodes[0] + while True: + if node['is_leaf']: + return node['value'] + if numeric_data[node['feature_idx']] <= node['threshold']: + node = nodes[node['left']] + else: + node = nodes[node['right']] + + +def _predict_from_numeric_data(nodes, numeric_data, out): + for i in range(numeric_data.shape[0]): + out[i] = _predict_one_from_numeric_data(nodes, numeric_data[i]) diff --git a/sklearn/ensemble/gbm/splitting.py b/sklearn/ensemble/gbm/splitting.py new file mode 100644 index 0000000000000..1d8f5ad32ad38 --- /dev/null +++ b/sklearn/ensemble/gbm/splitting.py @@ -0,0 +1,552 @@ +"""This module contains njitted routines and data structures to: + +- Find the best possible split of a node. For a given node, a split is + characterized by a feature and a bin. +- Apply a split to a node, i.e. split the indices of the samples at the node + into the newly created left and right childs. +""" +import numpy as np + +from .histogram import _build_histogram +from .histogram import _subtract_histograms +from .histogram import _build_histogram_no_hessian +from .histogram import _build_histogram_root +from .histogram import _build_histogram_root_no_hessian +from .histogram import HISTOGRAM_DTYPE +from .utils import get_threads_chunks + + +class SplitInfo: + """Pure data class to store information about a potential split. 
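+
+ A gain of -1 is used as a placeholder for splits that are invalid or that
+ do not satisfy the splitting constraints, e.g.::
+
+ split_info = SplitInfo(gain=-1., feature_idx=0, bin_idx=0)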
+ + Parameters + ---------- + gain : float32 + The gain of the split + feature_idx : int + The index of the feature to be split + bin_idx : int + The index of the bin on which the split is made + gradient_left : float32 + The sum of the gradients of all the samples in the left child + hessian_left : float32 + The sum of the hessians of all the samples in the left child + gradient_right : float32 + The sum of the gradients of all the samples in the right child + hessian_right : float32 + The sum of the hessians of all the samples in the right child + n_samples_left : int + The number of samples in the left child + n_samples_right : int + The number of samples in the right child + """ + def __init__(self, gain=-1., feature_idx=0, bin_idx=0, + gradient_left=0., hessian_left=0., + gradient_right=0., hessian_right=0., + n_samples_left=0, n_samples_right=0): + self.gain = gain + self.feature_idx = feature_idx + self.bin_idx = bin_idx + self.gradient_left = gradient_left + self.hessian_left = hessian_left + self.gradient_right = gradient_right + self.hessian_right = hessian_right + self.n_samples_left = n_samples_left + self.n_samples_right = n_samples_right + + +class SplittingContext: + """Pure data class defining a splitting context. + + Ideally it would also have methods but numba does not support annotating + jitclasses (so we can't use parallel=True). This structure is + instanciated in the grower and stores all the required information to + compute the SplitInfo and histograms of each node. + + Parameters + ---------- + X_binned : array of int + The binned input samples. Must be Fortran-aligned. + max_bins : int, optional(default=256) + The maximum number of bins. Used to define the shape of the + histograms. + n_bins_per_feature : array-like of int + The actual number of bins needed for each feature, which is lower or + equal to max_bins. + gradients : array-like, shape=(n_samples,) + The gradients of each training sample. Those are the gradients of the + loss w.r.t the predictions, evaluated at iteration i - 1. + hessians : array-like, shape=(n_samples,) + The hessians of each training sample. Those are the hessians of the + loss w.r.t the predictions, evaluated at iteration i - 1. + l2_regularization : float + The L2 regularization parameter. + min_hessian_to_split : float + The minimum sum of hessians needed in each node. Splits that result in + at least one child having a sum of hessians less than + min_hessian_to_split are discarded. + min_samples_leaf : int + The minimum number of samples per leaf. + min_gain_to_split : float, optional(default=0.) + The minimum gain needed to split a node. Splits with lower gain will + be ignored. 
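+
+ As a rough sketch, the grower instantiates this context along the lines
+ of::
+
+ context = SplittingContext(X_binned, max_bins, n_bins_per_feature,
+ gradients, hessians, l2_regularization,
+ min_hessian_to_split, min_samples_leaf,
+ min_gain_to_split)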
+ """ + def __init__(self, X_binned, max_bins, n_bins_per_feature, + gradients, hessians, l2_regularization, + min_hessian_to_split=1e-3, min_samples_leaf=20, + min_gain_to_split=0.): + + self.X_binned = X_binned + self.n_features = X_binned.shape[1] + # Note: all histograms will have bins, but some of the + # last bins may be unused if n_bins_per_feature[f] < max_bins + self.max_bins = max_bins + self.n_bins_per_feature = n_bins_per_feature + self.gradients = gradients + self.hessians = hessians + # for root node, gradients and hessians are already ordered + self.ordered_gradients = gradients.copy() + self.ordered_hessians = hessians.copy() + self.sum_gradients = self.gradients.sum() + self.sum_hessians = self.hessians.sum() + self.constant_hessian = hessians.shape[0] == 1 + self.l2_regularization = l2_regularization + self.min_hessian_to_split = min_hessian_to_split + self.min_samples_leaf = min_samples_leaf + self.min_gain_to_split = min_gain_to_split + if self.constant_hessian: + self.constant_hessian_value = self.hessians[0] # 1 scalar + else: + self.constant_hessian_value = np.float32(1.) # won't be used anyway + + # The partition array maps each sample index into the leaves of the + # tree (a leaf in this context is a node that isn't splitted yet, not + # necessarily a 'finalized' leaf). Initially, the root contains all + # the indices, e.g.: + # partition = [abcdefghijkl] + # After a call to split_indices, it may look e.g. like this: + # partition = [cef|abdghijkl] + # we have 2 leaves, the left one is at position 0 and the second one at + # position 3. The order of the samples is irrelevant. + self.partition = np.arange(0, X_binned.shape[0], 1, np.uint32) + # buffers used in split_indices to support parallel splitting. + self.left_indices_buffer = np.empty_like(self.partition) + self.right_indices_buffer = np.empty_like(self.partition) + + +def split_indices(context, split_info, sample_indices): + """Split samples into left and right arrays. + + Parameters + ---------- + context : SplittingContext + The splitting context + split_ingo : SplitInfo + The SplitInfo of the node to split + sample_indices : array of int + The indices of the samples at the node to split. This is a view on + context.partition, and it is modified inplace by placing the indices + of the left child at the beginning, and the indices of the right child + at the end. + + Returns + ------- + left_indices : array of int + The indices of the samples in the left child. This is a view on + context.partition. + right_indices : array of int + The indices of the samples in the right child. This is a view on + context.partition. + """ + # This is a multi-threaded implementation inspired by lightgbm. + # Here is a quick break down. Let's suppose we want to split a node with + # 24 samples named from a to x. context.partition looks like this (the * + # are indices in other leaves that we don't care about): + # partition = [*************abcdefghijklmnopqrstuvwx****************] + # ^ ^ + # node_position node_position + node.n_samples + + # Ultimately, we want to reorder the samples inside the boundaries of the + # leaf (which becomes a node) to now represent the samples in its left and + # right child. For example: + # partition = [*************abefilmnopqrtuxcdghjksvw*****************] + # ^ ^ + # left_child_pos right_child_pos + # Note that left_child_pos always takes the value of node_position, and + # right_child_pos = left_child_pos + left_child.n_samples. The order of + # the samples inside a leaf is irrelevant. 
+ + # 1. samples_indices is a view on this region a..x. We conceptually + # divide it into n_threads regions. Each thread will be responsible for + # its own region. Here is an example with 4 threads: + # samples_indices = [abcdef|ghijkl|mnopqr|stuvwx] + # 2. Each thread processes 6 = 24 // 4 entries and maps them into + # left_indices_buffer or right_indices_buffer. For example, we could + # have the following mapping ('.' denotes an undefined entry): + # - left_indices_buffer = [abef..|il....|mnopqr|tux...] + # - right_indices_buffer = [cd....|ghjk..|......|svw...] + # 3. We keep track of the start positions of the regions (the '|') in + # ``offset_in_buffers`` as well as the size of each region. We also keep + # track of the number of samples put into the left/right child by each + # thread. Concretely: + # - left_counts = [4, 2, 6, 3] + # - right_counts = [2, 4, 0, 3] + # 4. Finally, we put left/right_indices_buffer back into the + # samples_indices, without any undefined entries and the partition looks + # as expected + # partition = [*************abefilmnopqrtuxcdghjksvw*****************] + + # Note: We here show left/right_indices_buffer as being the same size as + # sample_indices for simplicity, but in reality they are of the same size + # as partition. + + X_binned = context.X_binned.T[split_info.feature_idx] + + n_threads = 4 # TODO: change this + n_samples = sample_indices.shape[0] + + # Note: we could probably allocate all the arrays of size n_threads in the + # splitting context as well, but gains are probably going to be minimal + sizes = np.full(n_threads, n_samples // n_threads, dtype=np.int32) + if n_samples % n_threads > 0: + # array[:0] will cause a bug in numba 0.41 so we need the if. Remove + # once issue numba 3554 is fixed. + sizes[:n_samples % n_threads] += 1 + offset_in_buffers = np.zeros(n_threads, dtype=np.int32) + offset_in_buffers[1:] = np.cumsum(sizes[:-1]) + + left_counts = np.empty(n_threads, dtype=np.int32) + right_counts = np.empty(n_threads, dtype=np.int32) + + # Need to declare local variables, else they're not updated :/ + # (see numba issue 3459) + left_indices_buffer = context.left_indices_buffer + right_indices_buffer = context.right_indices_buffer + + # map indices from samples_indices to left/right_indices_buffer + for thread_idx in range(n_threads): + left_count = 0 + right_count = 0 + + start = offset_in_buffers[thread_idx] + stop = start + sizes[thread_idx] + for i in range(start, stop): + sample_idx = sample_indices[i] + if X_binned[sample_idx] <= split_info.bin_idx: + left_indices_buffer[start + left_count] = sample_idx + left_count += 1 + else: + right_indices_buffer[start + right_count] = sample_idx + right_count += 1 + + left_counts[thread_idx] = left_count + right_counts[thread_idx] = right_count + + # position of right child = just after the left child + right_child_position = left_counts.sum() + + # offset of each thread in samples_indices for left and right child, i.e. + # where each thread will start to write. + left_offset = np.zeros(n_threads, dtype=np.int32) + left_offset[1:] = np.cumsum(left_counts[:-1]) + right_offset = np.full(n_threads, right_child_position, dtype=np.int32) + right_offset[1:] += np.cumsum(right_counts[:-1]) + + # map indices in left/right_indices_buffer back into samples_indices. This + # also updates context.partition since samples_indice is a view. 
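+ # Continuing the example above (left_counts = [4, 2, 6, 3] and
+ # right_counts = [2, 4, 0, 3]): left_offset = [0, 4, 6, 12],
+ # right_child_position = 15 and right_offset = [15, 17, 21, 21].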
+ for thread_idx in range(n_threads):
+
+ for i in range(left_counts[thread_idx]):
+ sample_indices[left_offset[thread_idx] + i] = \
+ left_indices_buffer[offset_in_buffers[thread_idx] + i]
+ for i in range(right_counts[thread_idx]):
+ sample_indices[right_offset[thread_idx] + i] = \
+ right_indices_buffer[offset_in_buffers[thread_idx] + i]
+
+ return (sample_indices[:right_child_position],
+ sample_indices[right_child_position:])
+
+
+def find_node_split(context, sample_indices):
+ """For each feature, find the best bin to split on at a given node.
+
+ Returns the best split info among all features, and the histograms of
+ all the features. The histograms are computed by scanning the whole
+ dataset.
+
+ Parameters
+ ----------
+ context : SplittingContext
+ The splitting context
+ sample_indices : array of int
+ The indices of the samples at the node to split.
+
+ Returns
+ -------
+ best_split_info : SplitInfo
+ The info about the best possible split among all features.
+ histograms : array of HISTOGRAM_DTYPE, shape=(n_features, max_bins)
+ The histograms of each feature. A histogram is an array of
+ HISTOGRAM_DTYPE of size ``max_bins`` (only
+ ``n_bins_per_feature[feature]`` entries are relevant).
+ """
+
+ ctx = context # shorter name to avoid various line breaks
+ n_samples = sample_indices.shape[0]
+
+ # Need to declare local variables, else they're not updated
+ # (see numba issue 3459)
+ ordered_gradients = ctx.ordered_gradients
+ ordered_hessians = ctx.ordered_hessians
+
+ # Populate ordered_gradients and ordered_hessians. (Already done for root)
+ # Ordering the gradients and hessians helps to improve cache hit.
+ # This is a parallelized version of the following vanilla code:
+ # for i in range(n_samples):
+ # ctx.ordered_gradients[i] = ctx.gradients[sample_indices[i]]
+ if sample_indices.shape[0] != ctx.gradients.shape[0]:
+ starts, ends, n_threads = get_threads_chunks(n_samples)
+ if ctx.constant_hessian:
+ for thread_idx in range(n_threads):
+ for i in range(starts[thread_idx], ends[thread_idx]):
+ ordered_gradients[i] = ctx.gradients[sample_indices[i]]
+ else:
+ for thread_idx in range(n_threads):
+ for i in range(starts[thread_idx], ends[thread_idx]):
+ ordered_gradients[i] = ctx.gradients[sample_indices[i]]
+ ordered_hessians[i] = ctx.hessians[sample_indices[i]]
+
+ ctx.sum_gradients = ctx.ordered_gradients[:n_samples].sum()
+ if ctx.constant_hessian:
+ ctx.sum_hessians = ctx.constant_hessian_value * np.float32(n_samples)
+ else:
+ ctx.sum_hessians = ctx.ordered_hessians[:n_samples].sum()
+
+ # Pre-allocate the results data structure to be able to use prange:
+ # numba jitclasses do not seem to properly support default values for
+ # kwargs.
+ split_infos = [SplitInfo(-1., 0, 0, 0., 0., 0., 0., 0, 0)
+ for i in range(context.n_features)]
+ histograms = np.empty(
+ shape=(np.int64(context.n_features), np.int64(context.max_bins)),
+ dtype=HISTOGRAM_DTYPE
+ )
+ for feature_idx in range(context.n_features):
+ split_info, histogram = _find_histogram_split(
+ context, feature_idx, sample_indices)
+ split_infos[feature_idx] = split_info
+ histograms[feature_idx, :] = histogram
+
+ split_info = _find_best_feature_to_split_helper(split_infos)
+ return split_info, histograms
+
+
+def find_node_split_subtraction(context, sample_indices, parent_histograms,
+ sibling_histograms):
+ """For each feature, find the best bin to split on at a given node.
+
+ Returns the best split info among all features, and the histograms of
+ all the features.
+
+ This does the same job as ``find_node_split()`` but uses the histograms
+ of the parent and sibling of the node to split. This makes it possible
+ to use the identity ``histogram(node) = histogram(parent) -
+ histogram(sibling)``, which is significantly faster than computing the
+ histograms from the data.
+
+ Returns the best SplitInfo among all features, along with all the feature
+ histograms that can later be used to compute the sibling or children
+ histograms by subtraction.
+
+ Parameters
+ ----------
+ context : SplittingContext
+ The splitting context
+ sample_indices : array of int
+ The indices of the samples at the node to split.
+ parent_histograms : array of HISTOGRAM_DTYPE of shape (n_features, max_bins)
+ The histograms of the parent
+ sibling_histograms : array of HISTOGRAM_DTYPE of \
+ shape (n_features, max_bins)
+ The histograms of the sibling
+
+ Returns
+ -------
+ best_split_info : SplitInfo
+ The info about the best possible split among all features.
+ histograms : array of HISTOGRAM_DTYPE, shape=(n_features, max_bins)
+ The histograms of each feature. A histogram is an array of
+ HISTOGRAM_DTYPE of size ``max_bins`` (only
+ ``n_bins_per_feature[feature]`` entries are relevant).
+ """
+
+ # We can pick any feature (here the first) in the histograms to
+ # compute the gradients: they must be the same across all features
+ # anyway, we have tests ensuring this. Maybe a more robust way would
+ # be to compute an average but it's probably not worth it.
+ context.sum_gradients = (parent_histograms[0]['sum_gradients'].sum() -
+ sibling_histograms[0]['sum_gradients'].sum())
+
+ n_samples = sample_indices.shape[0]
+ if context.constant_hessian:
+ context.sum_hessians = \
+ context.constant_hessian_value * np.float32(n_samples)
+ else:
+ context.sum_hessians = (parent_histograms[0]['sum_hessians'].sum() -
+ sibling_histograms[0]['sum_hessians'].sum())
+
+ # Pre-allocate the results data structure to be able to use prange
+ split_infos = [SplitInfo(-1., 0, 0, 0., 0., 0., 0., 0, 0)
+ for i in range(context.n_features)]
+ histograms = np.empty(
+ shape=(np.int64(context.n_features), np.int64(context.max_bins)),
+ dtype=HISTOGRAM_DTYPE
+ )
+ for feature_idx in range(context.n_features):
+ split_info, histogram = _find_histogram_split_subtraction(
+ context, feature_idx, parent_histograms,
+ sibling_histograms, n_samples)
+ split_infos[feature_idx] = split_info
+ histograms[feature_idx, :] = histogram
+
+ split_info = _find_best_feature_to_split_helper(split_infos)
+ return split_info, histograms
+
+
+def _find_best_feature_to_split_helper(split_infos):
+ best_gain = None
+ for i, split_info in enumerate(split_infos):
+ gain = split_info.gain
+ if best_gain is None or gain > best_gain:
+ best_gain = gain
+ best_split_info = split_info
+ return best_split_info
+
+
+def _find_histogram_split(context, feature_idx, sample_indices):
+ """Compute the histogram for a given feature.
+
+ Returns the best SplitInfo among all the possible bins of the feature.
+ """ + n_samples = sample_indices.shape[0] + X_binned = context.X_binned.T[feature_idx] + + root_node = X_binned.shape[0] == n_samples + ordered_gradients = context.ordered_gradients[:n_samples] + ordered_hessians = context.ordered_hessians[:n_samples] + + if root_node: + if context.constant_hessian: + histogram = _build_histogram_root_no_hessian( + context.max_bins, X_binned, ordered_gradients) + else: + histogram = _build_histogram_root( + context.max_bins, X_binned, ordered_gradients, + context.ordered_hessians) + else: + if context.constant_hessian: + histogram = _build_histogram_no_hessian( + context.max_bins, sample_indices, X_binned, + ordered_gradients) + else: + histogram = _build_histogram( + context.max_bins, sample_indices, X_binned, + ordered_gradients, ordered_hessians) + + return _find_best_bin_to_split_helper(context, feature_idx, histogram, + n_samples) + + +def _find_histogram_split_subtraction(context, feature_idx, + parent_histograms, sibling_histograms, + n_samples): + """Compute the histogram by substraction of parent and sibling + + Uses the identity: hist(parent) = hist(left) + hist(right). + Returns the best SplitInfo among all the possible bins of the feature. + """ + histogram = _subtract_histograms( + context.max_bins, + parent_histograms[feature_idx], sibling_histograms[feature_idx]) + + return _find_best_bin_to_split_helper(context, feature_idx, histogram, + n_samples) + + +def _find_best_bin_to_split_helper(context, feature_idx, histogram, n_samples): + """Find best bin to split on, and return the corresponding SplitInfo. + + Splits that do not satisfy the splitting constraints (min_gain_to_split, + etc.) are discarded here. If no split can satisfy the constraints, a + SplitInfo with a gain of -1 is returned. If for a given node the best + SplitInfo has a gain of -1, it is finalized into a leaf. + """ + # Allocate the structure for the best split information. It can be + # returned as such (with a negative gain) if the min_hessian_to_split + # condition is not satisfied. Such invalid splits are later discarded by + # the TreeGrower. + best_split = SplitInfo(-1., 0, 0, 0., 0., 0., 0., 0, 0) + gradient_left, hessian_left = 0., 0. 
+ n_samples_left = 0 + + for bin_idx in range(context.n_bins_per_feature[feature_idx]): + n_samples_left += histogram[bin_idx]['count'] + n_samples_right = n_samples - n_samples_left + + if context.constant_hessian: + hessian_left += (histogram[bin_idx]['count'] + * context.constant_hessian_value) + else: + hessian_left += histogram[bin_idx]['sum_hessians'] + hessian_right = context.sum_hessians - hessian_left + + gradient_left += histogram[bin_idx]['sum_gradients'] + gradient_right = context.sum_gradients - gradient_left + + if n_samples_left < context.min_samples_leaf: + continue + if n_samples_right < context.min_samples_leaf: + # won't get any better + break + + if hessian_left < context.min_hessian_to_split: + continue + if hessian_right < context.min_hessian_to_split: + # won't get any better (hessians are > 0 since loss is convex) + break + + gain = _split_gain(gradient_left, hessian_left, + gradient_right, hessian_right, + context.sum_gradients, context.sum_hessians, + context.l2_regularization) + + if gain > best_split.gain and gain > context.min_gain_to_split: + best_split.gain = gain + best_split.feature_idx = feature_idx + best_split.bin_idx = bin_idx + best_split.gradient_left = gradient_left + best_split.hessian_left = hessian_left + best_split.n_samples_left = n_samples_left + best_split.gradient_right = gradient_right + best_split.hessian_right = hessian_right + best_split.n_samples_right = n_samples_right + + return best_split, histogram + + +def _split_gain(gradient_left, hessian_left, gradient_right, hessian_right, + sum_gradients, sum_hessians, l2_regularization): + """Loss reduction + + Compute the reduction in loss after taking a split, compared to keeping + the node a leaf of the tree. + + See Equation 7 of: + XGBoost: A Scalable Tree Boosting System, T. Chen, C. Guestrin, 2016 + https://arxiv.org/abs/1603.02754 + """ + def negative_loss(gradient, hessian): + return (gradient ** 2) / (hessian + l2_regularization) + + gain = negative_loss(gradient_left, hessian_left) + gain += negative_loss(gradient_right, hessian_right) + gain -= negative_loss(sum_gradients, sum_hessians) + return gain diff --git a/sklearn/ensemble/gbm/utils.py b/sklearn/ensemble/gbm/utils.py new file mode 100644 index 0000000000000..628c8e95639b1 --- /dev/null +++ b/sklearn/ensemble/gbm/utils.py @@ -0,0 +1,79 @@ +"""This module contains utility routines.""" +import numpy as np + + +def get_lightgbm_estimator(pygbm_estimator): + """Return an unfitted LightGBM estimator with matching hyperparams. + + This utility function takes care of renaming the PyGBM parameters into + their LightGBM equivalent parameters. + """ + from lightgbm import LGBMRegressor + from lightgbm import LGBMClassifier + + # Import here to avoid cyclic dependencies + from .gradient_boosting import GradientBoostingClassifier + + pygbm_params = pygbm_estimator.get_params() + + if pygbm_params['loss'] == 'auto': + raise ValueError('auto loss is not accepted. 
We need to know if ' + 'the problem is binary or multiclass classification.') + if pygbm_params['n_iter_no_change'] is not None: + raise NotImplementedError('Early stopping should be deactivated.') + + loss_mapping = { + 'least_squares': 'regression_l2', + 'binary_crossentropy': 'binary', + 'categorical_crossentropy': 'multiclass' + } + + lgbm_params = { + 'objective': loss_mapping[pygbm_params['loss']], + 'learning_rate': pygbm_params['learning_rate'], + 'n_estimators': pygbm_params['max_iter'], + 'num_leaves': pygbm_params['max_leaf_nodes'], + 'max_depth': pygbm_params['max_depth'], + 'min_data_in_leaf': pygbm_params['min_samples_leaf'], + 'lambda_l2': pygbm_params['l2_regularization'], + 'max_bin': pygbm_params['max_bins'], + 'min_data_in_bin': 1, + 'min_sum_hessian_in_leaf': 1e-3, + 'min_gain_to_split': 0, + 'verbosity': 10 if pygbm_params['verbose'] else 0, + 'boost_from_average': True, + } + # TODO: change hardcoded values when / if they're arguments to the + # estimator. + + if pygbm_params['loss'] == 'categorical_crossentropy': + # LGBM multiplies hessians by 2 in multiclass loss. + lgbm_params['min_sum_hessian_in_leaf'] *= 2 + lgbm_params['learning_rate'] *= 2 + + if isinstance(pygbm_estimator, GradientBoostingClassifier): + Est = LGBMClassifier + else: + Est = LGBMRegressor + + return Est(**lgbm_params) + + +def get_threads_chunks(total_size): + """Get start and end indices of threads in an array of size total_size. + + The interval [0, total_size - 1] is divided into n_threads contiguous + regions, and the starts and ends of each region are returned. Used to + simulate a 'static' scheduling. + """ + n_threads = 4 # TODO: change this + sizes = np.full(n_threads, total_size // n_threads, dtype=np.int32) + if total_size % n_threads > 0: + # array[:0] will cause a bug in numba 0.41 so we need the if. + # Remove once issue numba 3554 is fixed. 
+ sizes[:total_size % n_threads] += 1 + starts = np.zeros(n_threads, dtype=np.int32) + starts[1:] = np.cumsum(sizes[:-1]) + ends = starts + sizes + + return starts, ends, n_threads diff --git a/sklearn/ensemble/setup.py b/sklearn/ensemble/setup.py index 34fb63b906d0a..0698e910c7bbf 100644 --- a/sklearn/ensemble/setup.py +++ b/sklearn/ensemble/setup.py @@ -8,6 +8,10 @@ def configuration(parent_package="", top_path=None): sources=["_gradient_boosting.pyx"], include_dirs=[numpy.get_include()]) + config.add_extension("gbm.histogram", + sources=["gbm/histogram.pyx"], + include_dirs=[numpy.get_include()]) + config.add_subpackage("tests") return config From eb0235105ad9f4805fa02210212e0dedb836f5f4 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 19 Dec 2018 15:56:23 -0500 Subject: [PATCH 002/247] Some progress --- gdb_test.py | 51 +++ sklearn/ensemble/__init__.py | 2 +- sklearn/ensemble/gbm/_gradient_boosting.pyx | 31 ++ .../ensemble/gbm/{binning.py => binning.pyx} | 35 +- sklearn/ensemble/gbm/fun.py | 5 + sklearn/ensemble/gbm/gradient_boosting.py | 33 +- sklearn/ensemble/gbm/grower.py | 8 +- sklearn/ensemble/gbm/histogram.pyx | 261 ++++++++----- sklearn/ensemble/gbm/playground.pyx | 8 + .../gbm/{splitting.py => splitting.pyx} | 347 +++++++++--------- sklearn/ensemble/setup.py | 17 + sklearn/tree/_tree.pyx | 4 + sklearn/tree/tree.py | 2 +- 13 files changed, 494 insertions(+), 310 deletions(-) create mode 100644 gdb_test.py create mode 100644 sklearn/ensemble/gbm/_gradient_boosting.pyx rename sklearn/ensemble/gbm/{binning.py => binning.pyx} (87%) create mode 100644 sklearn/ensemble/gbm/fun.py create mode 100644 sklearn/ensemble/gbm/playground.pyx rename sklearn/ensemble/gbm/{splitting.py => splitting.pyx} (63%) diff --git a/gdb_test.py b/gdb_test.py new file mode 100644 index 0000000000000..07b0f59913867 --- /dev/null +++ b/gdb_test.py @@ -0,0 +1,51 @@ +from time import time + +from sklearn.datasets import make_regression, make_classification +from sklearn.ensemble import GradientBoostingRegressor +from sklearn.ensemble import GradientBoostingClassifier +from sklearn.ensemble import GBMRegressor +from sklearn.ensemble import GBMClassifier + +import pstats +import cProfile + +classif = True +n_samples = 100000 +max_iter = 5 + +if classif: + X, y = make_classification(n_samples=n_samples, random_state=0) + GBM = GBMClassifier + GBDT = GradientBoostingClassifier +else: + X, y = make_regression(n_samples=n_samples, random_state=0) + GBM = GBMRegressor + GBDT = GradientBoostingRegressor + + +tic = time() +gbm = GBM(max_iter=max_iter, + scoring=None, # no early stopping + validation_split=None, + n_iter_no_change=None, + random_state=0, + verbose=True) +gbm.fit(X, y) +duration = time() - tic +print(f'score: {gbm.score(X, y)}') +print(f'Took {duration:.3f}s\n') + +# cProfile.runctx("gbm.fit(X, y)", globals(), locals(), "Profile.prof") + +# s = pstats.Stats("Profile.prof") +# s.strip_dirs().sort_stats("time").print_stats(.2) + +tic = time() +gbdt = GBDT(n_estimators=max_iter, + n_iter_no_change=None, # no early stopping + random_state=0, + verbose=True).fit(X, y) +print(gbdt.n_estimators_) +duration = time() - tic +print(f'score: {gbdt.score(X, y)}') +print(f'Took {duration:.3f}s') diff --git a/sklearn/ensemble/__init__.py b/sklearn/ensemble/__init__.py index 7069117704d17..c1760ae39a763 100644 --- a/sklearn/ensemble/__init__.py +++ b/sklearn/ensemble/__init__.py @@ -17,7 +17,7 @@ from .gradient_boosting import GradientBoostingClassifier from .gradient_boosting import GradientBoostingRegressor from 
.voting_classifier import VotingClassifier -from .gbm.gradient_boosting import GradientBoostingClassifier as GBMCLassifier +from .gbm.gradient_boosting import GradientBoostingClassifier as GBMClassifier from .gbm.gradient_boosting import GradientBoostingRegressor as GBMRegressor from . import bagging diff --git a/sklearn/ensemble/gbm/_gradient_boosting.pyx b/sklearn/ensemble/gbm/_gradient_boosting.pyx new file mode 100644 index 0000000000000..43ccf7644db34 --- /dev/null +++ b/sklearn/ensemble/gbm/_gradient_boosting.pyx @@ -0,0 +1,31 @@ +# cython: profile=True +cimport cython + +import numpy as np +cimport numpy as np + + +def _update_raw_predictions__(float [:] leaves_values, list samples_indices_at_leaf, np.float_t [:] raw_predictions): + """Update raw_predictions by reading the predictions of the ith tree + directly form the leaves. + + Can only be used for predicting the training data. raw_predictions + contains the sum of the tree values from iteration 0 to i - 1. This adds + the predictions of the ith tree to raw_predictions. + + Parameters + ---------- + leaves_data: list of tuples (leaf.value, leaf.sample_indices) + The leaves data used to update raw_predictions. + raw_predictions : array-like, shape=(n_samples,) + The raw predictions for the training data. + """ + cdef: + int leaf_idx + unsigned int sample_idx + unsigned int [:] sample_indices + + for leaf_idx in range(leaves_values.shape[0]): + samples_indices = samples_indices_at_leaf[leaf_idx] + for sample_idx in samples_indices: + raw_predictions[sample_idx] += leaves_values[leaf_idx] \ No newline at end of file diff --git a/sklearn/ensemble/gbm/binning.py b/sklearn/ensemble/gbm/binning.pyx similarity index 87% rename from sklearn/ensemble/gbm/binning.py rename to sklearn/ensemble/gbm/binning.pyx index 3371db94095be..b52f53ad5326d 100644 --- a/sklearn/ensemble/gbm/binning.py +++ b/sklearn/ensemble/gbm/binning.pyx @@ -1,10 +1,15 @@ +# cython: profile=True """ This module contains the BinMapper class. BinMapper is used for mapping a real-valued dataset into integer-valued bins with equally-spaced thresholds. """ +cimport cython + import numpy as np +cimport numpy as np + from sklearn.utils import check_random_state, check_array from sklearn.base import BaseEstimator, TransformerMixin @@ -51,7 +56,7 @@ def _find_binning_thresholds(data, max_bins=256, subsample=int(2e5), return tuple(binning_thresholds) -def _map_to_bins(data, binning_thresholds=None, out=None): +cdef _map_to_bins(np.ndarray[np.float_t, ndim=2] data, binning_thresholds): """Bin numerical values to discrete integer-coded levels. 
Parameters @@ -71,26 +76,32 @@ def _map_to_bins(data, binning_thresholds=None, out=None): """ # TODO: add support for categorical data encoded as integers # TODO: add support for sparse data (numerical or categorical) - if out is not None: - assert out.shape == data.shape - assert out.dtype == np.uint8 - assert out.flags.f_contiguous - binned = out - else: - binned = np.zeros_like(data, dtype=np.uint8, order='F') + cdef: + np.ndarray[np.uint8_t, ndim=2] binned + np.ndarray[np.float32_t, ndim=2] binning_thresholds_ + int feature_idx - binning_thresholds = tuple(np.ascontiguousarray(bt, dtype=np.float32) - for bt in binning_thresholds) + binned = np.zeros_like(data, dtype=np.uint8, order='F') + + # binning_thresholds = tuple(np.ascontiguousarray(bt, dtype=np.float32) + # for bt in binning_thresholds) + binning_thresholds_ = np.array(binning_thresholds, dtype=np.float32) for feature_idx in range(data.shape[1]): _map_num_col_to_bins(data[:, feature_idx], - binning_thresholds[feature_idx], + binning_thresholds_[feature_idx], binned[:, feature_idx]) return binned -def _map_num_col_to_bins(data, binning_thresholds, binned): +cdef _map_num_col_to_bins(np.ndarray[np.float_t] data, np.ndarray[np.float32_t] binning_thresholds, np.ndarray[np.uint8_t] binned): """Binary search to the find the bin index for each value in data.""" + cdef: + int i + int left + int right + int middle + for i in range(data.shape[0]): # TODO: add support for missing values (NaN or custom marker) left, right = 0, binning_thresholds.shape[0] diff --git a/sklearn/ensemble/gbm/fun.py b/sklearn/ensemble/gbm/fun.py new file mode 100644 index 0000000000000..e84dcc71d639a --- /dev/null +++ b/sklearn/ensemble/gbm/fun.py @@ -0,0 +1,5 @@ +from playground import g + +a = g() +print(a) +print(a.dtype) \ No newline at end of file diff --git a/sklearn/ensemble/gbm/gradient_boosting.py b/sklearn/ensemble/gbm/gradient_boosting.py index 52fd3b6ad4934..e2746748fd7e8 100644 --- a/sklearn/ensemble/gbm/gradient_boosting.py +++ b/sklearn/ensemble/gbm/gradient_boosting.py @@ -12,6 +12,7 @@ from sklearn.metrics import check_scoring from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder +from ._gradient_boosting import _update_raw_predictions__ from .binning import BinMapper from .grower import TreeGrower @@ -167,6 +168,7 @@ def fit(self, X, y): shape=(n_samples, self.n_trees_per_iteration_), dtype=self.baseline_prediction_.dtype ) + print(raw_predictions.dtype) raw_predictions += self.baseline_prediction_ # gradients and hessians are 1D arrays of size @@ -236,11 +238,15 @@ def fit(self, X, y): tic_pred = time() - # prepare leaves_data so that _update_raw_predictions can be - # @njitted - leaves_data = [(l.value, l.sample_indices) - for l in grower.finalized_leaves] - _update_raw_predictions(leaves_data, raw_predictions[:, k]) + leaves_values = [l.value for l in grower.finalized_leaves] + samples_indices_in_leaves = [l.sample_indices for l in grower.finalized_leaves] + leaves_values = np.array(leaves_values, dtype=np.float32) + _update_raw_predictions__(leaves_values, samples_indices_in_leaves, raw_predictions[:, k]) + # leaves_data = [(l.value, l.sample_indices) + # for l in grower.finalized_leaves] + # _update_raw_predictions(leaves_data, raw_predictions[:, k]) + + toc_pred = time() acc_prediction_time += toc_pred - tic_pred @@ -678,23 +684,8 @@ def _get_loss(self): return _LOSSES[self.loss]() - def _update_raw_predictions(leaves_data, raw_predictions): - """Update raw_predictions by reading the 
predictions of the ith tree - directly form the leaves. - - Can only be used for predicting the training data. raw_predictions - contains the sum of the tree values from iteration 0 to i - 1. This adds - the predictions of the ith tree to raw_predictions. - - Parameters - ---------- - leaves_data: list of tuples (leaf.value, leaf.sample_indices) - The leaves data used to update raw_predictions. - raw_predictions : array-like, shape=(n_samples,) - The raw predictions for the training data. - """ for leaf_idx in range(len(leaves_data)): leaf_value, sample_indices = leaves_data[leaf_idx] for sample_idx in sample_indices: - raw_predictions[sample_idx] += leaf_value + raw_predictions[sample_idx] += leaf_value \ No newline at end of file diff --git a/sklearn/ensemble/gbm/grower.py b/sklearn/ensemble/gbm/grower.py index f1b5000e78fd7..06723fe27f114 100644 --- a/sklearn/ensemble/gbm/grower.py +++ b/sklearn/ensemble/gbm/grower.py @@ -240,11 +240,13 @@ def _intilialize_root(self): if self.splitting_context.constant_hessian: hessian = self.splitting_context.hessians[0] * n_samples else: - hessian = self.splitting_context.hessians.sum() + hessian = np.sum(self.splitting_context.hessians) self.root = TreeNode( depth=depth, - sample_indices=self.splitting_context.partition.view(), - sum_gradients=self.splitting_context.gradients.sum(), + #sample_indices=self.splitting_context.partition.view(), + sample_indices=self.splitting_context.partition, + #sum_gradients=self.splitting_context.gradients.sum(), + sum_gradients=np.sum(self.splitting_context.gradients), sum_hessians=hessian ) if (self.max_leaf_nodes is not None and self.max_leaf_nodes == 1): diff --git a/sklearn/ensemble/gbm/histogram.pyx b/sklearn/ensemble/gbm/histogram.pyx index 3052be71617d1..7fd2e967f5a1a 100644 --- a/sklearn/ensemble/gbm/histogram.pyx +++ b/sklearn/ensemble/gbm/histogram.pyx @@ -1,3 +1,4 @@ +# cython: profile=True """This module contains njitted routines for building histograms. A histogram is an array with n_bins entry of type HISTOGRAM_DTYPE. Each @@ -9,6 +10,7 @@ cimport cython import numpy as np cimport numpy as np + HISTOGRAM_DTYPE = np.dtype([ ('sum_gradients', np.float32), ('sum_hessians', np.float32), @@ -16,41 +18,71 @@ HISTOGRAM_DTYPE = np.dtype([ ]) +from libc.stdlib cimport malloc, free + +cdef struct hist_struct: + float sum_gradients + float sum_hessians + unsigned int count + + + +@cython.boundscheck(False) # Deactivate bounds checking +@cython.wraparound(False) # Deactivate negative indexing. def _build_histogram_naive(n_bins, sample_indices, binned_feature, ordered_gradients, ordered_hessians): """Build histogram in a naive way, without optimizing for cache hit.""" histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) for i, sample_idx in enumerate(sample_indices): bin_idx = binned_feature[sample_idx] - histogram[bin_idx]['sum_gradients'] += ordered_gradients[i] - histogram[bin_idx]['sum_hessians'] += ordered_hessians[i] - histogram[bin_idx]['count'] += 1 + histogram[bin_idx].sum_gradients += ordered_gradients[i] + histogram[bin_idx].sum_hessians += ordered_hessians[i] + histogram[bin_idx].count += 1 return histogram -def _subtract_histograms(n_bins, hist_a, hist_b): +@cython.boundscheck(False) # Deactivate bounds checking +@cython.wraparound(False) # Deactivate negative indexing. 
+def _subtract_histograms(unsigned int n_bins, np.ndarray hist_a, np.ndarray hist_b): """Return hist_a - hist_b""" + # print('subtract_hist') - histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) - - sg = 'sum_gradients' - sh = 'sum_hessians' - c = 'count' + cdef unsigned int i = 0 + cdef np.ndarray histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + cdef hist_struct [:] view = histogram + cdef hist_struct [:] view_a = hist_a + cdef hist_struct [:] view_b = hist_b for i in range(n_bins): - histogram[i][sg] = hist_a[i][sg] - hist_b[i][sg] - histogram[i][sh] = hist_a[i][sh] - hist_b[i][sh] - histogram[i][c] = hist_a[i][c] - hist_b[i][c] + view[i].sum_gradients = view_a[i].sum_gradients - view_b[i].sum_gradients + view[i].sum_hessians = view_a[i].sum_hessians - view_b[i].sum_hessians + view[i].count = view_a[i].count - view_b[i].count return histogram -def _build_histogram(n_bins, sample_indices, binned_feature, ordered_gradients, - ordered_hessians): +@cython.boundscheck(False) # Deactivate bounds checking +@cython.wraparound(False) # Deactivate negative indexing. +def _build_histogram(unsigned int n_bins, unsigned int [:] + sample_indices, unsigned char [:] + binned_feature, float [:] ordered_gradients, + float[:] ordered_hessians): """Return histogram for a given feature.""" - histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) - n_node_samples = sample_indices.shape[0] - unrolled_upper = (n_node_samples // 4) * 4 + cdef np.ndarray histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + cdef hist_struct [:] view = histogram + cdef int i = 0 + + cdef float [:] ordered_gradients_view = ordered_gradients + cdef float [:] ordered_hessians_view = ordered_hessians + + cdef int n_node_samples = sample_indices.shape[0] + cdef int unrolled_upper = (n_node_samples // 4) * 4 + + cdef unsigned int bin_0 + cdef unsigned int bin_1 + cdef unsigned int bin_2 + cdef unsigned int bin_3 + cdef unsigned int bin_idx for i in range(0, unrolled_upper, 4): bin_0 = binned_feature[sample_indices[i]] @@ -58,65 +90,86 @@ def _build_histogram(n_bins, sample_indices, binned_feature, ordered_gradients, bin_2 = binned_feature[sample_indices[i + 2]] bin_3 = binned_feature[sample_indices[i + 3]] - histogram[bin_0]['sum_gradients'] += ordered_gradients[i] - histogram[bin_1]['sum_gradients'] += ordered_gradients[i + 1] - histogram[bin_2]['sum_gradients'] += ordered_gradients[i + 2] - histogram[bin_3]['sum_gradients'] += ordered_gradients[i + 3] + view[bin_0].sum_gradients += ordered_gradients_view[i] + view[bin_1].sum_gradients += ordered_gradients_view[i + 1] + view[bin_2].sum_gradients += ordered_gradients_view[i + 2] + view[bin_3].sum_gradients += ordered_gradients_view[i + 3] - histogram[bin_0]['sum_hessians'] += ordered_hessians[i] - histogram[bin_1]['sum_hessians'] += ordered_hessians[i + 1] - histogram[bin_2]['sum_hessians'] += ordered_hessians[i + 2] - histogram[bin_3]['sum_hessians'] += ordered_hessians[i + 3] + view[bin_0].sum_hessians += ordered_hessians_view[i] + view[bin_1].sum_hessians += ordered_hessians_view[i + 1] + view[bin_2].sum_hessians += ordered_hessians_view[i + 2] + view[bin_3].sum_hessians += ordered_hessians_view[i + 3] - histogram[bin_0]['count'] += 1 - histogram[bin_1]['count'] += 1 - histogram[bin_2]['count'] += 1 - histogram[bin_3]['count'] += 1 + view[bin_0].count += 1 + view[bin_1].count += 1 + view[bin_2].count += 1 + view[bin_3].count += 1 for i in range(unrolled_upper, n_node_samples): bin_idx = binned_feature[sample_indices[i]] - histogram[bin_idx]['sum_gradients'] += ordered_gradients[i] 
- histogram[bin_idx]['sum_hessians'] += ordered_hessians[i] - histogram[bin_idx]['count'] += 1 + view[bin_idx].sum_gradients += ordered_gradients_view[i] + view[bin_idx].sum_hessians += ordered_hessians_view[i] + view[bin_idx].count += 1 return histogram -def _build_histogram_no_hessian(n_bins, sample_indices, binned_feature, - ordered_gradients): +@cython.boundscheck(False) # Deactivate bounds checking +@cython.wraparound(False) # Deactivate negative indexing. +def _build_histogram_no_hessian(unsigned int n_bins, unsigned int [:] + sample_indices, unsigned char [:] + binned_feature, float [:] ordered_gradients): """Return histogram for a given feature. Hessians are not updated (used when hessians are constant). """ - histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) - n_node_samples = sample_indices.shape[0] - unrolled_upper = (n_node_samples // 4) * 4 + # print('build_hist_no_hessian') + cdef np.ndarray histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + cdef hist_struct [:] view = histogram + cdef unsigned int i = 0 + + cdef float [:] ordered_gradients_view = ordered_gradients + cdef unsigned char [:] binned_feature_view = binned_feature + cdef unsigned int [:] sample_indices_view = sample_indices + + cdef unsigned int n_node_samples = sample_indices.shape[0] + cdef unsigned int unrolled_upper = (n_node_samples // 4) * 4 + + cdef unsigned int bin_0 + cdef unsigned int bin_1 + cdef unsigned int bin_2 + cdef unsigned int bin_3 + cdef unsigned int bin_idx for i in range(0, unrolled_upper, 4): - bin_0 = binned_feature[sample_indices[i]] - bin_1 = binned_feature[sample_indices[i + 1]] - bin_2 = binned_feature[sample_indices[i + 2]] - bin_3 = binned_feature[sample_indices[i + 3]] + bin_0 = binned_feature_view[sample_indices_view[i]] + bin_1 = binned_feature_view[sample_indices_view[i + 1]] + bin_2 = binned_feature_view[sample_indices_view[i + 2]] + bin_3 = binned_feature_view[sample_indices_view[i + 3]] - histogram[bin_0]['sum_gradients'] += ordered_gradients[i] - histogram[bin_1]['sum_gradients'] += ordered_gradients[i + 1] - histogram[bin_2]['sum_gradients'] += ordered_gradients[i + 2] - histogram[bin_3]['sum_gradients'] += ordered_gradients[i + 3] + view[bin_0].sum_gradients += ordered_gradients_view[i] + view[bin_1].sum_gradients += ordered_gradients_view[i + 1] + view[bin_2].sum_gradients += ordered_gradients_view[i + 2] + view[bin_3].sum_gradients += ordered_gradients_view[i + 3] - histogram[bin_0]['count'] += 1 - histogram[bin_1]['count'] += 1 - histogram[bin_2]['count'] += 1 - histogram[bin_3]['count'] += 1 + view[bin_0].count += 1 + view[bin_1].count += 1 + view[bin_2].count += 1 + view[bin_3].count += 1 for i in range(unrolled_upper, n_node_samples): - bin_idx = binned_feature[sample_indices[i]] - histogram[bin_idx]['sum_gradients'] += ordered_gradients[i] - histogram[bin_idx]['count'] += 1 + bin_idx = binned_feature_view[sample_indices_view[i]] + view[bin_idx].sum_gradients += ordered_gradients_view[i] + view[bin_idx].count += 1 return histogram -def _build_histogram_root_no_hessian(n_bins, binned_feature, all_gradients): + +@cython.boundscheck(False) # Deactivate bounds checking +@cython.wraparound(False) # Deactivate negative indexing. 
+def _build_histogram_root_no_hessian(unsigned int n_bins, unsigned char [:] + binned_feature, float [:]all_gradients): """Special case for the root node The root node has to find the split among all the samples from the @@ -125,45 +178,71 @@ def _build_histogram_root_no_hessian(n_bins, binned_feature, all_gradients): Hessians are not updated (used when hessians are constant) """ - histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) - n_node_samples = binned_feature.shape[0] - unrolled_upper = (n_node_samples // 4) * 4 + # print('build_hist_root_no_hessian') + + cdef np.ndarray histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + cdef hist_struct [:] view = histogram + cdef unsigned int i = 0 + + cdef float [:] all_gradients_view = all_gradients + cdef unsigned char [:] binned_feature_view = binned_feature + + cdef unsigned int n_node_samples = binned_feature.shape[0] + cdef unsigned int unrolled_upper = (n_node_samples // 4) * 4 + + cdef unsigned int bin_0 + cdef unsigned int bin_1 + cdef unsigned int bin_2 + cdef unsigned int bin_3 + cdef unsigned int bin_idx for i in range(0, unrolled_upper, 4): - bin_0 = binned_feature[i] - bin_1 = binned_feature[i + 1] - bin_2 = binned_feature[i + 2] - bin_3 = binned_feature[i + 3] + bin_0 = binned_feature_view[i] + bin_1 = binned_feature_view[i + 1] + bin_2 = binned_feature_view[i + 2] + bin_3 = binned_feature_view[i + 3] - histogram[bin_0]['sum_gradients'] += all_gradients[i] - histogram[bin_1]['sum_gradients'] += all_gradients[i + 1] - histogram[bin_2]['sum_gradients'] += all_gradients[i + 2] - histogram[bin_3]['sum_gradients'] += all_gradients[i + 3] + view[bin_0].sum_gradients += all_gradients_view[i] + view[bin_1].sum_gradients += all_gradients_view[i + 1] + view[bin_2].sum_gradients += all_gradients_view[i + 2] + view[bin_3].sum_gradients += all_gradients_view[i + 3] - histogram[bin_0]['count'] += 1 - histogram[bin_1]['count'] += 1 - histogram[bin_2]['count'] += 1 - histogram[bin_3]['count'] += 1 + view[bin_0].count += 1 + view[bin_1].count += 1 + view[bin_2].count += 1 + view[bin_3].count += 1 for i in range(unrolled_upper, n_node_samples): - bin_idx = binned_feature[i] - histogram[bin_idx]['sum_gradients'] += all_gradients[i] - histogram[bin_idx]['count'] += 1 + bin_idx = binned_feature_view[i] + view[bin_idx].sum_gradients += all_gradients_view[i] + view[bin_idx].count += 1 return histogram -def _build_histogram_root(n_bins, binned_feature, all_gradients, - all_hessians): +@cython.boundscheck(False) # Deactivate bounds checking +@cython.wraparound(False) # Deactivate negative indexing. +def _build_histogram_root(unsigned int n_bins, unsigned char [:] + binned_feature, float [:] all_gradients, + float[:] all_hessians): """Special case for the root node The root node has to find the split among all the samples from the training set. binned_feature and all_gradients and all_hessians already have a consistent ordering. 
""" - histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) - n_node_samples = binned_feature.shape[0] - unrolled_upper = (n_node_samples // 4) * 4 + cdef np.ndarray histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + cdef hist_struct [:] view = histogram + cdef int i = 0 + + cdef unsigned int n_node_samples = binned_feature.shape[0] + cdef unsigned int unrolled_upper = (n_node_samples // 4) * 4 + + cdef unsigned int bin_0 + cdef unsigned int bin_1 + cdef unsigned int bin_2 + cdef unsigned int bin_3 + cdef unsigned int bin_idx for i in range(0, unrolled_upper, 4): bin_0 = binned_feature[i] @@ -171,25 +250,25 @@ def _build_histogram_root(n_bins, binned_feature, all_gradients, bin_2 = binned_feature[i + 2] bin_3 = binned_feature[i + 3] - histogram[bin_0]['sum_gradients'] += all_gradients[i] - histogram[bin_1]['sum_gradients'] += all_gradients[i + 1] - histogram[bin_2]['sum_gradients'] += all_gradients[i + 2] - histogram[bin_3]['sum_gradients'] += all_gradients[i + 3] + view[bin_0].sum_gradients += all_gradients[i] + view[bin_1].sum_gradients += all_gradients[i + 1] + view[bin_2].sum_gradients += all_gradients[i + 2] + view[bin_3].sum_gradients += all_gradients[i + 3] - histogram[bin_0]['sum_hessians'] += all_hessians[i] - histogram[bin_1]['sum_hessians'] += all_hessians[i + 1] - histogram[bin_2]['sum_hessians'] += all_hessians[i + 2] - histogram[bin_3]['sum_hessians'] += all_hessians[i + 3] + view[bin_0].sum_hessians += all_hessians[i] + view[bin_1].sum_hessians += all_hessians[i + 1] + view[bin_2].sum_hessians += all_hessians[i + 2] + view[bin_3].sum_hessians += all_hessians[i + 3] - histogram[bin_0]['count'] += 1 - histogram[bin_1]['count'] += 1 - histogram[bin_2]['count'] += 1 - histogram[bin_3]['count'] += 1 + view[bin_0].count += 1 + view[bin_1].count += 1 + view[bin_2].count += 1 + view[bin_3].count += 1 for i in range(unrolled_upper, n_node_samples): bin_idx = binned_feature[i] - histogram[bin_idx]['sum_gradients'] += all_gradients[i] - histogram[bin_idx]['sum_hessians'] += all_hessians[i] - histogram[bin_idx]['count'] += 1 + view[bin_idx].sum_gradients += all_gradients[i] + view[bin_idx].sum_hessians += all_hessians[i] + view[bin_idx].count += 1 return histogram diff --git a/sklearn/ensemble/gbm/playground.pyx b/sklearn/ensemble/gbm/playground.pyx new file mode 100644 index 0000000000000..b40b37d35bbd9 --- /dev/null +++ b/sklearn/ensemble/gbm/playground.pyx @@ -0,0 +1,8 @@ +cimport cython + +cdef class Shrubbery: + cdef int width, height + + def __init__(self, int w, int h): + self.width = w + self.height = h \ No newline at end of file diff --git a/sklearn/ensemble/gbm/splitting.py b/sklearn/ensemble/gbm/splitting.pyx similarity index 63% rename from sklearn/ensemble/gbm/splitting.py rename to sklearn/ensemble/gbm/splitting.pyx index 1d8f5ad32ad38..a68dc177f560e 100644 --- a/sklearn/ensemble/gbm/splitting.py +++ b/sklearn/ensemble/gbm/splitting.pyx @@ -1,3 +1,4 @@ +# cython: profile=True """This module contains njitted routines and data structures to: - Find the best possible split of a node. For a given node, a split is @@ -5,7 +6,10 @@ - Apply a split to a node, i.e. split the indices of the samples at the node into the newly created left and right childs. 
""" +cimport cython + import numpy as np +cimport numpy as np from .histogram import _build_histogram from .histogram import _subtract_histograms @@ -15,8 +19,13 @@ from .histogram import HISTOGRAM_DTYPE from .utils import get_threads_chunks +cdef struct hist_struct: + float sum_gradients + float sum_hessians + unsigned int count -class SplitInfo: +@cython.freelist(100) +cdef class SplitInfo: """Pure data class to store information about a potential split. Parameters @@ -40,10 +49,21 @@ class SplitInfo: n_samples_right : int The number of samples in the right child """ - def __init__(self, gain=-1., feature_idx=0, bin_idx=0, - gradient_left=0., hessian_left=0., - gradient_right=0., hessian_right=0., - n_samples_left=0, n_samples_right=0): + cdef public float gain + cdef public unsigned int feature_idx + cdef public unsigned int bin_idx + cdef public float gradient_left + cdef public float gradient_right + cdef public float hessian_left + cdef public float hessian_right + cdef public unsigned int n_samples_left + cdef public unsigned int n_samples_right + + def __cinit__(self, float gain=-1., unsigned int feature_idx=0, unsigned + int bin_idx=0, + float gradient_left=0., float hessian_left=0., + float gradient_right=0., float hessian_right=0., + unsigned int n_samples_left=0, unsigned int n_samples_right=0): self.gain = gain self.feature_idx = feature_idx self.bin_idx = bin_idx @@ -55,7 +75,7 @@ def __init__(self, gain=-1., feature_idx=0, bin_idx=0, self.n_samples_right = n_samples_right -class SplittingContext: +cdef class SplittingContext: """Pure data class defining a splitting context. Ideally it would also have methods but numba does not support annotating @@ -91,10 +111,32 @@ class SplittingContext: The minimum gain needed to split a node. Splits with lower gain will be ignored. 
""" - def __init__(self, X_binned, max_bins, n_bins_per_feature, - gradients, hessians, l2_regularization, - min_hessian_to_split=1e-3, min_samples_leaf=20, - min_gain_to_split=0.): + cdef public unsigned char [:, :] X_binned + cdef public unsigned int n_features + cdef public unsigned int max_bins + cdef public unsigned int [:] n_bins_per_feature + cdef public float [:] gradients + cdef public float [:] hessians + cdef public float [:] ordered_gradients + cdef public float [:] ordered_hessians + cdef public float sum_gradients + cdef public float sum_hessians + cdef public unsigned char constant_hessian + cdef public float constant_hessian_value + cdef public float l2_regularization + cdef public float min_hessian_to_split + cdef public unsigned int min_samples_leaf + cdef public float min_gain_to_split + + cdef public unsigned int [:] partition + cdef public unsigned int [:] left_indices_buffer + cdef public unsigned int [:] right_indices_buffer + + def __cinit__(self, np.ndarray[np.uint8_t, ndim=2] X_binned, unsigned int max_bins, + np.ndarray[np.uint32_t] n_bins_per_feature, + np.ndarray [np.float32_t] gradients, np.ndarray[np.float32_t] hessians, float l2_regularization, + float min_hessian_to_split=1e-3, unsigned int min_samples_leaf=20, + float min_gain_to_split=0.): self.X_binned = X_binned self.n_features = X_binned.shape[1] @@ -107,8 +149,8 @@ def __init__(self, X_binned, max_bins, n_bins_per_feature, # for root node, gradients and hessians are already ordered self.ordered_gradients = gradients.copy() self.ordered_hessians = hessians.copy() - self.sum_gradients = self.gradients.sum() - self.sum_hessians = self.hessians.sum() + self.sum_gradients = gradients.sum() + self.sum_hessians = hessians.sum() self.constant_hessian = hessians.shape[0] == 1 self.l2_regularization = l2_regularization self.min_hessian_to_split = min_hessian_to_split @@ -134,140 +176,36 @@ def __init__(self, X_binned, max_bins, n_bins_per_feature, self.right_indices_buffer = np.empty_like(self.partition) -def split_indices(context, split_info, sample_indices): - """Split samples into left and right arrays. - - Parameters - ---------- - context : SplittingContext - The splitting context - split_ingo : SplitInfo - The SplitInfo of the node to split - sample_indices : array of int - The indices of the samples at the node to split. This is a view on - context.partition, and it is modified inplace by placing the indices - of the left child at the beginning, and the indices of the right child - at the end. - - Returns - ------- - left_indices : array of int - The indices of the samples in the left child. This is a view on - context.partition. - right_indices : array of int - The indices of the samples in the right child. This is a view on - context.partition. - """ - # This is a multi-threaded implementation inspired by lightgbm. - # Here is a quick break down. Let's suppose we want to split a node with - # 24 samples named from a to x. context.partition looks like this (the * - # are indices in other leaves that we don't care about): - # partition = [*************abcdefghijklmnopqrstuvwx****************] - # ^ ^ - # node_position node_position + node.n_samples - - # Ultimately, we want to reorder the samples inside the boundaries of the - # leaf (which becomes a node) to now represent the samples in its left and - # right child. 
For example: - # partition = [*************abefilmnopqrtuxcdghjksvw*****************] - # ^ ^ - # left_child_pos right_child_pos - # Note that left_child_pos always takes the value of node_position, and - # right_child_pos = left_child_pos + left_child.n_samples. The order of - # the samples inside a leaf is irrelevant. - - # 1. samples_indices is a view on this region a..x. We conceptually - # divide it into n_threads regions. Each thread will be responsible for - # its own region. Here is an example with 4 threads: - # samples_indices = [abcdef|ghijkl|mnopqr|stuvwx] - # 2. Each thread processes 6 = 24 // 4 entries and maps them into - # left_indices_buffer or right_indices_buffer. For example, we could - # have the following mapping ('.' denotes an undefined entry): - # - left_indices_buffer = [abef..|il....|mnopqr|tux...] - # - right_indices_buffer = [cd....|ghjk..|......|svw...] - # 3. We keep track of the start positions of the regions (the '|') in - # ``offset_in_buffers`` as well as the size of each region. We also keep - # track of the number of samples put into the left/right child by each - # thread. Concretely: - # - left_counts = [4, 2, 6, 3] - # - right_counts = [2, 4, 0, 3] - # 4. Finally, we put left/right_indices_buffer back into the - # samples_indices, without any undefined entries and the partition looks - # as expected - # partition = [*************abefilmnopqrtuxcdghjksvw*****************] - - # Note: We here show left/right_indices_buffer as being the same size as - # sample_indices for simplicity, but in reality they are of the same size - # as partition. - - X_binned = context.X_binned.T[split_info.feature_idx] - - n_threads = 4 # TODO: change this - n_samples = sample_indices.shape[0] +@cython.boundscheck(False) # Deactivate bounds checking +@cython.wraparound(False) # Deactivate negative indexing. +def split_indices(SplittingContext context, SplitInfo split_info, unsigned int [:] sample_indices): + cdef: + unsigned int n_samples = sample_indices.shape[0] + unsigned int i = 0 + unsigned int j = n_samples - 1 + unsigned char pivot = split_info.bin_idx + unsigned int [:] view = sample_indices + unsigned char [:] binned_feature = context.X_binned.T[split_info.feature_idx] + + while i != j: + # continue until we find an element that should be on right + while binned_feature[view[i]] <= pivot and i < n_samples: + i += 1 + # same, but now an element that should be on the left + while binned_feature[view[j]] > pivot and j >= 0: + j -= 1 + if i >= j: # j can become smaller than j! + break + else: + # swap + view[i], view[j] = view[j], view[i] + i += 1 + j -= 1 - # Note: we could probably allocate all the arrays of size n_threads in the - # splitting context as well, but gains are probably going to be minimal - sizes = np.full(n_threads, n_samples // n_threads, dtype=np.int32) - if n_samples % n_threads > 0: - # array[:0] will cause a bug in numba 0.41 so we need the if. Remove - # once issue numba 3554 is fixed. 
- sizes[:n_samples % n_threads] += 1 - offset_in_buffers = np.zeros(n_threads, dtype=np.int32) - offset_in_buffers[1:] = np.cumsum(sizes[:-1]) + return sample_indices[:i], sample_indices[i:] - left_counts = np.empty(n_threads, dtype=np.int32) - right_counts = np.empty(n_threads, dtype=np.int32) - # Need to declare local variables, else they're not updated :/ - # (see numba issue 3459) - left_indices_buffer = context.left_indices_buffer - right_indices_buffer = context.right_indices_buffer - - # map indices from samples_indices to left/right_indices_buffer - for thread_idx in range(n_threads): - left_count = 0 - right_count = 0 - - start = offset_in_buffers[thread_idx] - stop = start + sizes[thread_idx] - for i in range(start, stop): - sample_idx = sample_indices[i] - if X_binned[sample_idx] <= split_info.bin_idx: - left_indices_buffer[start + left_count] = sample_idx - left_count += 1 - else: - right_indices_buffer[start + right_count] = sample_idx - right_count += 1 - - left_counts[thread_idx] = left_count - right_counts[thread_idx] = right_count - - # position of right child = just after the left child - right_child_position = left_counts.sum() - - # offset of each thread in samples_indices for left and right child, i.e. - # where each thread will start to write. - left_offset = np.zeros(n_threads, dtype=np.int32) - left_offset[1:] = np.cumsum(left_counts[:-1]) - right_offset = np.full(n_threads, right_child_position, dtype=np.int32) - right_offset[1:] += np.cumsum(right_counts[:-1]) - - # map indices in left/right_indices_buffer back into samples_indices. This - # also updates context.partition since samples_indice is a view. - for thread_idx in range(n_threads): - - for i in range(left_counts[thread_idx]): - sample_indices[left_offset[thread_idx] + i] = \ - left_indices_buffer[offset_in_buffers[thread_idx] + i] - for i in range(right_counts[thread_idx]): - sample_indices[right_offset[thread_idx] + i] = \ - right_indices_buffer[offset_in_buffers[thread_idx] + i] - - return (sample_indices[:right_child_position], - sample_indices[right_child_position:]) - - -def find_node_split(context, sample_indices): +def find_node_split(SplittingContext context, unsigned int [:] sample_indices): """For each feature, find the best bin to split on at a given node. Returns the best split info among all features, and the histograms of @@ -290,6 +228,11 @@ def find_node_split(context, sample_indices): HISTOGRAM_DTYPE of size ``max_bins`` (only ``n_bins_per_features[feature]`` entries are relevant). 
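As a sanity check on the rewritten split_indices above: up to ordering, the two views it returns must contain the samples whose bin is <= split_info.bin_idx (left child) and all the others (right child). On hypothetical toy data:

import numpy as np

binned_feature = np.array([3, 7, 1, 4, 6, 2, 5, 0], dtype=np.uint8)  # made-up bins
sample_indices = np.arange(8, dtype=np.uint32)
pivot = 3  # plays the role of split_info.bin_idx

go_left = binned_feature[sample_indices] <= pivot
print(sorted(sample_indices[go_left].tolist()))   # [0, 2, 5, 7] -> left child
print(sorted(sample_indices[~go_left].tolist()))  # [1, 3, 4, 6] -> right child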
""" + cdef hist_struct [:, :] view + cdef hist_struct [:] histogram + cdef unsigned int feature_idx + cdef unsigned int i + cdef unsigned int thread_idx ctx = context # shorter name to avoid various line breaks n_samples = sample_indices.shape[0] @@ -316,11 +259,13 @@ def find_node_split(context, sample_indices): ordered_gradients[i] = ctx.gradients[sample_indices[i]] ordered_hessians[i] = ctx.hessians[sample_indices[i]] - ctx.sum_gradients = ctx.ordered_gradients[:n_samples].sum() + # ctx.sum_gradients = ctx.ordered_gradients[:n_samples].sum() + ctx.sum_gradients = np.sum(ctx.ordered_gradients[:n_samples]) if ctx.constant_hessian: - ctx.sum_hessians = ctx.constant_hessian_value * float32(n_samples) + ctx.sum_hessians = ctx.constant_hessian_value * np.float32(n_samples) else: - ctx.sum_hessians = ctx.ordered_hessians[:n_samples].sum() + # ctx.sum_hessians = ctx.ordered_hessians[:n_samples].sum() + ctx.sum_hessians = np.sum(ctx.ordered_hessians[:n_samples]) # Pre-allocate the results datastructure to be able to use prange: # numba jitclass do not seem to properly support default values for kwargs. @@ -330,18 +275,20 @@ def find_node_split(context, sample_indices): shape=(np.int64(context.n_features), np.int64(context.max_bins)), dtype=HISTOGRAM_DTYPE ) + view = histograms for feature_idx in range(context.n_features): split_info, histogram = _find_histogram_split( context, feature_idx, sample_indices) split_infos[feature_idx] = split_info - histograms[feature_idx, :] = histogram + view[feature_idx, :] = histogram split_info = _find_best_feature_to_split_helper(split_infos) return split_info, histograms -def find_node_split_subtraction(context, sample_indices, parent_histograms, - sibling_histograms): +def find_node_split_subtraction(SplittingContext context, unsigned int [:] + sample_indices, np.ndarray parent_histograms, + np.ndarray sibling_histograms): """For each feature, find the best bin to split on at a given node. Returns the best split info among all features, and the histograms of @@ -378,6 +325,10 @@ def find_node_split_subtraction(context, sample_indices, parent_histograms, ``n_bins_per_features[feature]`` entries are relevant). """ + cdef hist_struct [:, :] view + cdef hist_struct [:] histogram + cdef unsigned int feature_idx + # We can pick any feature (here the first) in the histograms to # compute the gradients: they must be the same across all features # anyway, we have tests ensuring this. Maybe a more robust way would @@ -400,12 +351,13 @@ def find_node_split_subtraction(context, sample_indices, parent_histograms, shape=(np.int64(context.n_features), np.int64(context.max_bins)), dtype=HISTOGRAM_DTYPE ) + view = histograms for feature_idx in range(context.n_features): split_info, histogram = _find_histogram_split_subtraction( context, feature_idx, parent_histograms, sibling_histograms, n_samples) split_infos[feature_idx] = split_info - histograms[feature_idx, :] = histogram + view[feature_idx, :] = histogram split_info = _find_best_feature_to_split_helper(split_infos) return split_info, histograms @@ -421,17 +373,19 @@ def _find_best_feature_to_split_helper(split_infos): return best_split_info -def _find_histogram_split(context, feature_idx, sample_indices): +cdef _find_histogram_split(SplittingContext context, unsigned int feature_idx, + unsigned int [:] sample_indices): """Compute the histogram for a given feature Returns the best SplitInfo among all the possible bins of the feature. 
""" - n_samples = sample_indices.shape[0] - X_binned = context.X_binned.T[feature_idx] - root_node = X_binned.shape[0] == n_samples - ordered_gradients = context.ordered_gradients[:n_samples] - ordered_hessians = context.ordered_hessians[:n_samples] + cdef unsigned int n_samples = sample_indices.shape[0] + cdef unsigned char [:] X_binned = context.X_binned.T[feature_idx] + cdef unsigned int root_node = X_binned.shape[0] == n_samples + cdef float [:] ordered_gradients = context.ordered_gradients[:n_samples] + cdef float [:] ordered_hessians = context.ordered_hessians[:n_samples] + cdef np.ndarray histogram if root_node: if context.constant_hessian: @@ -454,15 +408,15 @@ def _find_histogram_split(context, feature_idx, sample_indices): return _find_best_bin_to_split_helper(context, feature_idx, histogram, n_samples) - -def _find_histogram_split_subtraction(context, feature_idx, - parent_histograms, sibling_histograms, - n_samples): +cdef _find_histogram_split_subtraction(SplittingContext context, unsigned int feature_idx, + np.ndarray parent_histograms, np.ndarray sibling_histograms, + unsigned int n_samples): """Compute the histogram by substraction of parent and sibling Uses the identity: hist(parent) = hist(left) + hist(right). Returns the best SplitInfo among all the possible bins of the feature. """ + cdef np.ndarray histogram histogram = _subtract_histograms( context.max_bins, parent_histograms[feature_idx], sibling_histograms[feature_idx]) @@ -471,7 +425,11 @@ def _find_histogram_split_subtraction(context, feature_idx, n_samples) -def _find_best_bin_to_split_helper(context, feature_idx, histogram, n_samples): +@cython.boundscheck(False) # Deactivate bounds checking +@cython.wraparound(False) # Deactivate negative indexing. +cdef _find_best_bin_to_split_helper(SplittingContext context, unsigned int feature_idx, + hist_struct [:] histogram, unsigned int + n_samples): """Find best bin to split on, and return the corresponding SplitInfo. Splits that do not satisfy the splitting constraints (min_gain_to_split, @@ -479,26 +437,36 @@ def _find_best_bin_to_split_helper(context, feature_idx, histogram, n_samples): SplitInfo with a gain of -1 is returned. If for a given node the best SplitInfo has a gain of -1, it is finalized into a leaf. """ - # Allocate the structure for the best split information. It can be - # returned as such (with a negative gain) if the min_hessian_to_split - # condition is not satisfied. Such invalid splits are later discarded by - # the TreeGrower. - best_split = SplitInfo(-1., 0, 0, 0., 0., 0., 0., 0, 0) + cdef: + unsigned int bin_idx + unsigned int n_samples_left + unsigned int n_samples_right + unsigned int n_samples_ = n_samples + float hessian_left + float hessian_right + float gradient_left + float gradient_right + float gain + SplitInfo best_split + + hist_struct [:] view = histogram + + best_split = SplitInfo.__new__(SplitInfo) gradient_left, hessian_left = 0., 0. 
n_samples_left = 0 for bin_idx in range(context.n_bins_per_feature[feature_idx]): - n_samples_left += histogram[bin_idx]['count'] - n_samples_right = n_samples - n_samples_left + n_samples_left += view[bin_idx].count + n_samples_right = n_samples_ - n_samples_left if context.constant_hessian: - hessian_left += (histogram[bin_idx]['count'] + hessian_left += ( view[bin_idx].count * context.constant_hessian_value) else: - hessian_left += histogram[bin_idx]['sum_hessians'] + hessian_left += view[bin_idx].sum_hessians hessian_right = context.sum_hessians - hessian_left - gradient_left += histogram[bin_idx]['sum_gradients'] + gradient_left += view[bin_idx].sum_gradients gradient_right = context.sum_gradients - gradient_left if n_samples_left < context.min_samples_leaf: @@ -523,17 +491,31 @@ def _find_best_bin_to_split_helper(context, feature_idx, histogram, n_samples): best_split.feature_idx = feature_idx best_split.bin_idx = bin_idx best_split.gradient_left = gradient_left - best_split.hessian_left = hessian_left - best_split.n_samples_left = n_samples_left best_split.gradient_right = gradient_right + best_split.hessian_left = hessian_left best_split.hessian_right = hessian_right + best_split.n_samples_left = n_samples_left best_split.n_samples_right = n_samples_right + """ + best_split = SplitInfo( + gain, + feature_idx, + bin_idx, + gradient_left, + gradient_right, + hessian_left, + hessian_right, + n_samples_left, + n_samples_right, + ) + """ return best_split, histogram -def _split_gain(gradient_left, hessian_left, gradient_right, hessian_right, - sum_gradients, sum_hessians, l2_regularization): +cdef inline float _split_gain(float gradient_left, float hessian_left, float gradient_right, + float hessian_right, float sum_gradients, float + sum_hessians, float l2_regularization) nogil: """Loss reduction Compute the reduction in loss after taking a split, compared to keeping @@ -543,10 +525,13 @@ def _split_gain(gradient_left, hessian_left, gradient_right, hessian_right, XGBoost: A Scalable Tree Boosting System, T. Chen, C. 
Guestrin, 2016 https://arxiv.org/abs/1603.02754 """ - def negative_loss(gradient, hessian): - return (gradient ** 2) / (hessian + l2_regularization) - - gain = negative_loss(gradient_left, hessian_left) - gain += negative_loss(gradient_right, hessian_right) - gain -= negative_loss(sum_gradients, sum_hessians) + cdef float gain + gain = negative_loss(gradient_left, hessian_left, l2_regularization) + gain += negative_loss(gradient_right, hessian_right, l2_regularization) + gain -= negative_loss(sum_gradients, sum_hessians, l2_regularization) return gain + +@cython.cdivision(True) +cdef inline float negative_loss(float gradient, float hessian, float +l2_regularization) nogil: + return (gradient * gradient) / (hessian + l2_regularization) diff --git a/sklearn/ensemble/setup.py b/sklearn/ensemble/setup.py index 0698e910c7bbf..d38ab4fa48896 100644 --- a/sklearn/ensemble/setup.py +++ b/sklearn/ensemble/setup.py @@ -8,11 +8,28 @@ def configuration(parent_package="", top_path=None): sources=["_gradient_boosting.pyx"], include_dirs=[numpy.get_include()]) + config.add_extension("gbm._gradient_boosting", + sources=["gbm/_gradient_boosting.pyx"], + include_dirs=[numpy.get_include()]) + config.add_extension("gbm.histogram", sources=["gbm/histogram.pyx"], include_dirs=[numpy.get_include()]) + config.add_extension("gbm.splitting", + sources=["gbm/splitting.pyx"], + include_dirs=[numpy.get_include()]) + + config.add_extension("gbm.binning", + sources=["gbm/binning.pyx"], + include_dirs=[numpy.get_include()]) + + config.add_extension("gbm.playground", + sources=["gbm/playground.pyx"], + include_dirs=[numpy.get_include()]) + config.add_subpackage("tests") + config.add_data_files("gbm/slitting.pxd") return config diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index ed259c98ac850..d7ce5d195ac11 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -604,6 +604,10 @@ cdef class Tree: def __get__(self): return self._get_value_ndarray()[:self.node_count] + property nodes: + def __get__(self): + return self._get_node_ndarray() + def __cinit__(self, int n_features, np.ndarray[SIZE_t, ndim=1] n_classes, int n_outputs): """Constructor.""" diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index faa83efbb7703..04b8af518780d 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -360,7 +360,6 @@ def fit(self, X, y, sample_weight=None, check_input=True, self.presort) self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_) - # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise if max_leaf_nodes < 0: builder = DepthFirstTreeBuilder(splitter, min_samples_split, @@ -380,6 +379,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, builder.build(self.tree_, X, y, sample_weight, X_idx_sorted) + if self.n_outputs_ == 1: self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] From d8f1bbadcd6a598579e2d9ce552435b0a4f48e71 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 21 Dec 2018 13:24:34 -0500 Subject: [PATCH 003/247] used fused type for update_raw_predict --- sklearn/ensemble/gbm/_gradient_boosting.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/gbm/_gradient_boosting.pyx b/sklearn/ensemble/gbm/_gradient_boosting.pyx index 43ccf7644db34..8c472949f3477 100644 --- a/sklearn/ensemble/gbm/_gradient_boosting.pyx +++ b/sklearn/ensemble/gbm/_gradient_boosting.pyx @@ -4,8 +4,11 @@ cimport cython import numpy as np cimport numpy as np +ctypedef fused float_or_double: + float + double -def 
_update_raw_predictions__(float [:] leaves_values, list samples_indices_at_leaf, np.float_t [:] raw_predictions): +def _update_raw_predictions__(float [:] leaves_values, list samples_indices_at_leaf, float_or_double [:] raw_predictions): """Update raw_predictions by reading the predictions of the ith tree directly form the leaves. From 58203297496615988b8c9c01ddaa025244201f5c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 21 Dec 2018 14:48:12 -0500 Subject: [PATCH 004/247] Cythonized prediction code --- gdb_test.py | 10 ++--- .../gbm/{predictor.py => predictor.pyx} | 42 ++++++++++++++----- sklearn/ensemble/setup.py | 4 ++ 3 files changed, 41 insertions(+), 15 deletions(-) rename sklearn/ensemble/gbm/{predictor.py => predictor.pyx} (75%) diff --git a/gdb_test.py b/gdb_test.py index 07b0f59913867..566f784b3e9d4 100644 --- a/gdb_test.py +++ b/gdb_test.py @@ -9,8 +9,8 @@ import pstats import cProfile -classif = True -n_samples = 100000 +classif = False +n_samples = 500000 max_iter = 5 if classif: @@ -31,11 +31,11 @@ random_state=0, verbose=True) gbm.fit(X, y) -duration = time() - tic print(f'score: {gbm.score(X, y)}') +duration = time() - tic print(f'Took {duration:.3f}s\n') -# cProfile.runctx("gbm.fit(X, y)", globals(), locals(), "Profile.prof") +# cProfile.runctx("gbm.fit(X, y).predict(X)", globals(), locals(), "Profile.prof") # s = pstats.Stats("Profile.prof") # s.strip_dirs().sort_stats("time").print_stats(.2) @@ -46,6 +46,6 @@ random_state=0, verbose=True).fit(X, y) print(gbdt.n_estimators_) -duration = time() - tic print(f'score: {gbdt.score(X, y)}') +duration = time() - tic print(f'Took {duration:.3f}s') diff --git a/sklearn/ensemble/gbm/predictor.py b/sklearn/ensemble/gbm/predictor.pyx similarity index 75% rename from sklearn/ensemble/gbm/predictor.py rename to sklearn/ensemble/gbm/predictor.pyx index ab549639aa8cb..b7cda2814baac 100644 --- a/sklearn/ensemble/gbm/predictor.py +++ b/sklearn/ensemble/gbm/predictor.pyx @@ -1,3 +1,4 @@ +# cython: profile=True """ This module contains the TreePredictor class which is used for prediction. """ @@ -5,19 +6,35 @@ PREDICTOR_RECORD_DTYPE = np.dtype([ - ('is_leaf', np.uint8), ('value', np.float32), ('count', np.uint32), ('feature_idx', np.uint32), - ('bin_threshold', np.uint8), ('threshold', np.float32), ('left', np.uint32), ('right', np.uint32), ('gain', np.float32), ('depth', np.uint32), + ('is_leaf', np.uint8), + ('bin_threshold', np.uint8), # TODO: shrinkage in leaf for feature importance error bar? ]) +ctypedef fused float_or_double: + float + double + +cdef packed struct node_struct: + float value + unsigned int count + unsigned int feature_idx + float threshold + unsigned int left + unsigned int right + float gain + unsigned int depth + unsigned char is_leaf + unsigned char bin_threshold + class TreePredictor: """Tree class used for predictions. 
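To make the node-array layout concrete, here is a hypothetical three-node tree encoded with the PREDICTOR_RECORD_DTYPE fields listed above, traversed the same way _predict_one_from_numeric_data does (pure-Python sketch, toy values):

import numpy as np

PREDICTOR_RECORD_DTYPE = np.dtype([
    ('value', np.float32), ('count', np.uint32), ('feature_idx', np.uint32),
    ('threshold', np.float32), ('left', np.uint32), ('right', np.uint32),
    ('gain', np.float32), ('depth', np.uint32), ('is_leaf', np.uint8),
    ('bin_threshold', np.uint8),
])

# Root splits on feature 0 at threshold 0.5; its children are leaves at rows 1 and 2.
nodes = np.zeros(3, dtype=PREDICTOR_RECORD_DTYPE)
nodes[0]['threshold'] = 0.5
nodes[0]['left'], nodes[0]['right'] = 1, 2
nodes[1]['is_leaf'], nodes[1]['value'] = 1, -1.0
nodes[2]['is_leaf'], nodes[2]['value'] = 1, 1.0

def predict_one(nodes, x):
    node = nodes[0]
    while not node['is_leaf']:
        if x[node['feature_idx']] <= node['threshold']:
            node = nodes[node['left']]
        else:
            node = nodes[node['right']]
    return node['value']

print(predict_one(nodes, np.array([0.2])))  # -1.0
print(predict_one(nodes, np.array([0.9])))  # 1.0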
@@ -94,17 +111,22 @@ def _predict_binned(nodes, binned_data, out): out[i] = _predict_one_binned(nodes, binned_data[i]) -def _predict_one_from_numeric_data(nodes, numeric_data): - node = nodes[0] +cdef float _predict_one_from_numeric_data(node_struct [:] nodes, float_or_double [:] numeric_data) nogil: + cdef node_struct node = nodes[0] while True: - if node['is_leaf']: - return node['value'] - if numeric_data[node['feature_idx']] <= node['threshold']: - node = nodes[node['left']] + if node.is_leaf: + return node.value + if numeric_data[node.feature_idx] <= node.threshold: + node = nodes[node.left] else: - node = nodes[node['right']] + node = nodes[node.right] + + +# TODO: having a view on numeric_data (passed by user) may not be supported, +# see sklearn issue 10624 +def _predict_from_numeric_data(node_struct [:] nodes, float_or_double [:, :] numeric_data, float [:] out): + cdef int i -def _predict_from_numeric_data(nodes, numeric_data, out): for i in range(numeric_data.shape[0]): out[i] = _predict_one_from_numeric_data(nodes, numeric_data[i]) diff --git a/sklearn/ensemble/setup.py b/sklearn/ensemble/setup.py index d38ab4fa48896..edbee1f86666c 100644 --- a/sklearn/ensemble/setup.py +++ b/sklearn/ensemble/setup.py @@ -24,6 +24,10 @@ def configuration(parent_package="", top_path=None): sources=["gbm/binning.pyx"], include_dirs=[numpy.get_include()]) + config.add_extension("gbm.predictor", + sources=["gbm/predictor.pyx"], + include_dirs=[numpy.get_include()]) + config.add_extension("gbm.playground", sources=["gbm/playground.pyx"], include_dirs=[numpy.get_include()]) From 31ac23330ab5ddf45c7937bb31fb3d650a67c415 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 21 Dec 2018 16:23:27 -0500 Subject: [PATCH 005/247] Added script for uploading html annotated cython files --- push_annotated_cython.sh | 55 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100755 push_annotated_cython.sh diff --git a/push_annotated_cython.sh b/push_annotated_cython.sh new file mode 100755 index 0000000000000..45641834aaa97 --- /dev/null +++ b/push_annotated_cython.sh @@ -0,0 +1,55 @@ +#!/bin/sh + +set -e # exit if any command fails + + +BRANCH=gbm +SOURCE_DIR=/home/nico/dev/sklearn/sklearn/ensemble/gbm +TARGET_DIR=/home/nico/dev/cython_annotations + +ORIGINAL_DIR=`pwd` + + +git co $BRANCH + +# Commits in the branch (provided it branched off master) +COMMITS=`git log master.. --pretty=format:"%h"` + +annotate_and_copy_files() { + # For a give commit, annotate all pyx file in SOURCE_DIR and copy the html + # files in TARGET_DIR/COMMIT_HASH/ + + git co $1 # checkout commit + for pyx_file in `ls $SOURCE_DIR/*.pyx` + do + echo 'annotating' $1 $pyx_file + cython -a $pyx_file + done + + for html_file in `ls $SOURCE_DIR/*.html` + do + mkdir -p $TARGET_DIR/$1 + cp $html_file $TARGET_DIR/$1 + html_file_name=$(basename -- "$html_file") # without path + echo Copied $html_file_name to $TARGET_DIR/$1 + done +} + +for commit in $COMMITS +do + annotate_and_copy_files $commit +done + + +# Get into target dir, commit html files and push them. +cd $TARGET_DIR +git co gh-pages +echo Generating index.html +python lol.py # generates index.html with links to each file +echo Committing and pushing files +git add . 
+git ci -am "Added some annotated cython files" +git push + +cd $ORIGINAL_DIR # go back where we were +git co $BRANCH # Probably useless since with checked out the last commit From b1ae6b8cf23693551d10a53326671c115a556e3e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 21 Dec 2018 16:46:04 -0500 Subject: [PATCH 006/247] parallelized binning... still hacky --- gdb_test.py | 40 ++++++++++----------- sklearn/ensemble/gbm/_gradient_boosting.pyx | 2 ++ sklearn/ensemble/gbm/binning.pyx | 7 ++-- 3 files changed, 27 insertions(+), 22 deletions(-) diff --git a/gdb_test.py b/gdb_test.py index 566f784b3e9d4..12bfb1a4be8b6 100644 --- a/gdb_test.py +++ b/gdb_test.py @@ -10,7 +10,7 @@ import cProfile classif = False -n_samples = 500000 +n_samples = 100000 max_iter = 5 if classif: @@ -30,22 +30,22 @@ n_iter_no_change=None, random_state=0, verbose=True) -gbm.fit(X, y) -print(f'score: {gbm.score(X, y)}') -duration = time() - tic -print(f'Took {duration:.3f}s\n') - -# cProfile.runctx("gbm.fit(X, y).predict(X)", globals(), locals(), "Profile.prof") - -# s = pstats.Stats("Profile.prof") -# s.strip_dirs().sort_stats("time").print_stats(.2) - -tic = time() -gbdt = GBDT(n_estimators=max_iter, - n_iter_no_change=None, # no early stopping - random_state=0, - verbose=True).fit(X, y) -print(gbdt.n_estimators_) -print(f'score: {gbdt.score(X, y)}') -duration = time() - tic -print(f'Took {duration:.3f}s') +# gbm.fit(X, y) +# print(f'score: {gbm.score(X, y)}') +# duration = time() - tic +# print(f'Took {duration:.3f}s\n') + +cProfile.runctx("gbm.fit(X, y).predict(X)", globals(), locals(), "Profile.prof") + +s = pstats.Stats("Profile.prof") +s.strip_dirs().sort_stats("time").print_stats(.2) + +# tic = time() +# gbdt = GBDT(n_estimators=max_iter, +# n_iter_no_change=None, # no early stopping +# random_state=0, +# verbose=True).fit(X, y) +# print(gbdt.n_estimators_) +# print(f'score: {gbdt.score(X, y)}') +# duration = time() - tic +# print(f'Took {duration:.3f}s') diff --git a/sklearn/ensemble/gbm/_gradient_boosting.pyx b/sklearn/ensemble/gbm/_gradient_boosting.pyx index 8c472949f3477..9602fe6f22bcb 100644 --- a/sklearn/ensemble/gbm/_gradient_boosting.pyx +++ b/sklearn/ensemble/gbm/_gradient_boosting.pyx @@ -8,6 +8,8 @@ ctypedef fused float_or_double: float double +@cython.boundscheck(False) # Deactivate bounds checking +@cython.wraparound(False) # Deactivate negative indexing. def _update_raw_predictions__(float [:] leaves_values, list samples_indices_at_leaf, float_or_double [:] raw_predictions): """Update raw_predictions by reading the predictions of the ith tree directly form the leaves. diff --git a/sklearn/ensemble/gbm/binning.pyx b/sklearn/ensemble/gbm/binning.pyx index b52f53ad5326d..9b00b1002a4b6 100644 --- a/sklearn/ensemble/gbm/binning.pyx +++ b/sklearn/ensemble/gbm/binning.pyx @@ -9,6 +9,7 @@ cimport cython import numpy as np cimport numpy as np +from cython.parallel import prange from sklearn.utils import check_random_state, check_array from sklearn.base import BaseEstimator, TransformerMixin @@ -94,7 +95,9 @@ cdef _map_to_bins(np.ndarray[np.float_t, ndim=2] data, binning_thresholds): return binned -cdef _map_num_col_to_bins(np.ndarray[np.float_t] data, np.ndarray[np.float32_t] binning_thresholds, np.ndarray[np.uint8_t] binned): +@cython.boundscheck(False) # Deactivate bounds checking +@cython.wraparound(False) # Deactivate negative indexing. 
+cdef void _map_num_col_to_bins(double [:] data, float [:] binning_thresholds, unsigned char [:] binned)nogil: """Binary search to the find the bin index for each value in data.""" cdef: int i @@ -102,7 +105,7 @@ cdef _map_num_col_to_bins(np.ndarray[np.float_t] data, np.ndarray[np.float32_t] int right int middle - for i in range(data.shape[0]): + for i in prange(data.shape[0], schedule='static'): # TODO: add support for missing values (NaN or custom marker) left, right = 0, binning_thresholds.shape[0] while left < right: From e7cb4a388660f5d113ddc7190430ef077cb14f84 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 21 Dec 2018 18:10:34 -0500 Subject: [PATCH 007/247] cleaned code a bit --- sklearn/ensemble/gbm/histogram.pyx | 18 +++- sklearn/ensemble/gbm/splitting.pyx | 166 ++++++++++++++++++----------- 2 files changed, 117 insertions(+), 67 deletions(-) diff --git a/sklearn/ensemble/gbm/histogram.pyx b/sklearn/ensemble/gbm/histogram.pyx index 7fd2e967f5a1a..4f2cbde692d32 100644 --- a/sklearn/ensemble/gbm/histogram.pyx +++ b/sklearn/ensemble/gbm/histogram.pyx @@ -29,15 +29,23 @@ cdef struct hist_struct: @cython.boundscheck(False) # Deactivate bounds checking @cython.wraparound(False) # Deactivate negative indexing. -def _build_histogram_naive(n_bins, sample_indices, binned_feature, - ordered_gradients, ordered_hessians): +cdef _build_histogram_naive(unsigned int n_bins, unsigned int [:] + sample_indices, unsigned char [:] + binned_feature, float [:] ordered_gradients, + float[:] ordered_hessians): """Build histogram in a naive way, without optimizing for cache hit.""" histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + cdef: + hist_struct [:] view = histogram + unsigned int i + unsigned int sample_idx + unsigned char bin_idx + for i, sample_idx in enumerate(sample_indices): bin_idx = binned_feature[sample_idx] - histogram[bin_idx].sum_gradients += ordered_gradients[i] - histogram[bin_idx].sum_hessians += ordered_hessians[i] - histogram[bin_idx].count += 1 + view[bin_idx].sum_gradients += ordered_gradients[i] + view[bin_idx].sum_hessians += ordered_hessians[i] + view[bin_idx].count += 1 return histogram diff --git a/sklearn/ensemble/gbm/splitting.pyx b/sklearn/ensemble/gbm/splitting.pyx index a68dc177f560e..c7f99b70fe0f1 100644 --- a/sklearn/ensemble/gbm/splitting.pyx +++ b/sklearn/ensemble/gbm/splitting.pyx @@ -17,13 +17,38 @@ from .histogram import _build_histogram_no_hessian from .histogram import _build_histogram_root from .histogram import _build_histogram_root_no_hessian from .histogram import HISTOGRAM_DTYPE -from .utils import get_threads_chunks cdef struct hist_struct: float sum_gradients float sum_hessians unsigned int count + +cdef get_threads_chunks(unsigned int total_size): + """Get start and end indices of threads in an array of size total_size. + + The interval [0, total_size - 1] is divided into n_threads contiguous + regions, and the starts and ends of each region are returned. Used to + simulate a 'static' scheduling. + """ + cdef: + np.ndarray[np.uint32_t] sizes + np.ndarray[np.uint32_t] starts + np.ndarray[np.uint32_t] ends + unsigned int n_threads + + n_threads = 4 # TODO: change this + sizes = np.full(n_threads, total_size // n_threads, dtype=np.uint32) + if total_size % n_threads > 0: + # array[:0] will cause a bug in numba 0.41 so we need the if. + # Remove once issue numba 3554 is fixed. 
+ sizes[:total_size % n_threads] += 1 + starts = np.zeros(n_threads, dtype=np.uint32) + starts[1:] = np.cumsum(sizes[:-1]) + ends = starts + sizes + + return starts, ends, n_threads + @cython.freelist(100) cdef class SplitInfo: """Pure data class to store information about a potential split. @@ -49,15 +74,16 @@ cdef class SplitInfo: n_samples_right : int The number of samples in the right child """ - cdef public float gain - cdef public unsigned int feature_idx - cdef public unsigned int bin_idx - cdef public float gradient_left - cdef public float gradient_right - cdef public float hessian_left - cdef public float hessian_right - cdef public unsigned int n_samples_left - cdef public unsigned int n_samples_right + cdef public: + float gain + unsigned int feature_idx + unsigned int bin_idx + float gradient_left + float gradient_right + float hessian_left + float hessian_right + unsigned int n_samples_left + unsigned int n_samples_right def __cinit__(self, float gain=-1., unsigned int feature_idx=0, unsigned int bin_idx=0, @@ -111,26 +137,27 @@ cdef class SplittingContext: The minimum gain needed to split a node. Splits with lower gain will be ignored. """ - cdef public unsigned char [:, :] X_binned - cdef public unsigned int n_features - cdef public unsigned int max_bins - cdef public unsigned int [:] n_bins_per_feature - cdef public float [:] gradients - cdef public float [:] hessians - cdef public float [:] ordered_gradients - cdef public float [:] ordered_hessians - cdef public float sum_gradients - cdef public float sum_hessians - cdef public unsigned char constant_hessian - cdef public float constant_hessian_value - cdef public float l2_regularization - cdef public float min_hessian_to_split - cdef public unsigned int min_samples_leaf - cdef public float min_gain_to_split - - cdef public unsigned int [:] partition - cdef public unsigned int [:] left_indices_buffer - cdef public unsigned int [:] right_indices_buffer + cdef public: + unsigned char [:, :] X_binned + unsigned int n_features + unsigned int max_bins + unsigned int [:] n_bins_per_feature + float [:] gradients + float [:] hessians + float [:] ordered_gradients + float [:] ordered_hessians + float sum_gradients + float sum_hessians + unsigned char constant_hessian + float constant_hessian_value + float l2_regularization + float min_hessian_to_split + unsigned int min_samples_leaf + float min_gain_to_split + + unsigned int [:] partition + unsigned int [:] left_indices_buffer + unsigned int [:] right_indices_buffer def __cinit__(self, np.ndarray[np.uint8_t, ndim=2] X_binned, unsigned int max_bins, np.ndarray[np.uint32_t] n_bins_per_feature, @@ -157,9 +184,9 @@ cdef class SplittingContext: self.min_samples_leaf = min_samples_leaf self.min_gain_to_split = min_gain_to_split if self.constant_hessian: - self.constant_hessian_value = self.hessians[0] # 1 scalar + self.constant_hessian_value = hessians[0] # 1 scalar else: - self.constant_hessian_value = np.float32(1.) # won't be used anyway + self.constant_hessian_value = 1. # won't be used anyway # The partition array maps each sample index into the leaves of the # tree (a leaf in this context is a node that isn't splitted yet, not @@ -170,7 +197,7 @@ cdef class SplittingContext: # partition = [cef|abdghijkl] # we have 2 leaves, the left one is at position 0 and the second one at # position 3. The order of the samples is irrelevant. 
- self.partition = np.arange(0, X_binned.shape[0], 1, np.uint32) + self.partition = np.arange(X_binned.shape[0], dtype=np.uint32) # buffers used in split_indices to support parallel splitting. self.left_indices_buffer = np.empty_like(self.partition) self.right_indices_buffer = np.empty_like(self.partition) @@ -228,20 +255,23 @@ def find_node_split(SplittingContext context, unsigned int [:] sample_indices): HISTOGRAM_DTYPE of size ``max_bins`` (only ``n_bins_per_features[feature]`` entries are relevant). """ - cdef hist_struct [:, :] view - cdef hist_struct [:] histogram - cdef unsigned int feature_idx - cdef unsigned int i - cdef unsigned int thread_idx + cdef: + unsigned int n_samples + hist_struct [:, :] view + hist_struct [:] histogram + unsigned int feature_idx + unsigned int i + unsigned int thread_idx + SplittingContext ctx + unsigned int [:] starts + unsigned int [:] ends + unsigned int n_threads + SplitInfo split_info + list split_infos ctx = context # shorter name to avoid various line breaks n_samples = sample_indices.shape[0] - # Need to declare local variables, else they're not updated - # (see numba issue 3459) - ordered_gradients = ctx.ordered_gradients - ordered_hessians = ctx.ordered_hessians - # Populate ordered_gradients and ordered_hessians. (Already done for root) # Ordering the gradients and hessians helps to improve cache hit. # This is a parallelized version of the following vanilla code: @@ -252,12 +282,12 @@ def find_node_split(SplittingContext context, unsigned int [:] sample_indices): if ctx.constant_hessian: for thread_idx in range(n_threads): for i in range(starts[thread_idx], ends[thread_idx]): - ordered_gradients[i] = ctx.gradients[sample_indices[i]] + ctx.ordered_gradients[i] = ctx.gradients[sample_indices[i]] else: for thread_idx in range(n_threads): for i in range(starts[thread_idx], ends[thread_idx]): - ordered_gradients[i] = ctx.gradients[sample_indices[i]] - ordered_hessians[i] = ctx.hessians[sample_indices[i]] + ctx.ordered_gradients[i] = ctx.gradients[sample_indices[i]] + ctx.ordered_hessians[i] = ctx.hessians[sample_indices[i]] # ctx.sum_gradients = ctx.ordered_gradients[:n_samples].sum() ctx.sum_gradients = np.sum(ctx.ordered_gradients[:n_samples]) @@ -267,8 +297,6 @@ def find_node_split(SplittingContext context, unsigned int [:] sample_indices): # ctx.sum_hessians = ctx.ordered_hessians[:n_samples].sum() ctx.sum_hessians = np.sum(ctx.ordered_hessians[:n_samples]) - # Pre-allocate the results datastructure to be able to use prange: - # numba jitclass do not seem to properly support default values for kwargs. split_infos = [SplitInfo(-1., 0, 0, 0., 0., 0., 0., 0, 0) for i in range(context.n_features)] histograms = np.empty( @@ -325,9 +353,13 @@ def find_node_split_subtraction(SplittingContext context, unsigned int [:] ``n_bins_per_features[feature]`` entries are relevant). 
""" - cdef hist_struct [:, :] view - cdef hist_struct [:] histogram - cdef unsigned int feature_idx + cdef: + hist_struct [:, :] view + hist_struct [:] histogram + unsigned int feature_idx + unsigned int n_samples + SplitInfo split_info + list split_infos # We can pick any feature (here the first) in the histograms to # compute the gradients: they must be the same across all features @@ -339,7 +371,7 @@ def find_node_split_subtraction(SplittingContext context, unsigned int [:] n_samples = sample_indices.shape[0] if context.constant_hessian: context.sum_hessians = \ - context.constant_hessian_value * np.float32(n_samples) + context.constant_hessian_value * float(n_samples) else: context.sum_hessians = (parent_histograms[0]['sum_hessians'].sum() - sibling_histograms[0]['sum_hessians'].sum()) @@ -363,11 +395,18 @@ def find_node_split_subtraction(SplittingContext context, unsigned int [:] return split_info, histograms -def _find_best_feature_to_split_helper(split_infos): - best_gain = None +cdef SplitInfo _find_best_feature_to_split_helper(list split_infos): + cdef: + float gain + float best_gain + SplitInfo split_info + SplitInfo best_split_info + unsigned int i + + best_gain = -1. for i, split_info in enumerate(split_infos): gain = split_info.gain - if best_gain is None or gain > best_gain: + if gain > best_gain: best_gain = gain best_split_info = split_info return best_split_info @@ -380,12 +419,13 @@ cdef _find_histogram_split(SplittingContext context, unsigned int feature_idx, Returns the best SplitInfo among all the possible bins of the feature. """ - cdef unsigned int n_samples = sample_indices.shape[0] - cdef unsigned char [:] X_binned = context.X_binned.T[feature_idx] - cdef unsigned int root_node = X_binned.shape[0] == n_samples - cdef float [:] ordered_gradients = context.ordered_gradients[:n_samples] - cdef float [:] ordered_hessians = context.ordered_hessians[:n_samples] - cdef np.ndarray histogram + cdef: + unsigned int n_samples = sample_indices.shape[0] + unsigned char [:] X_binned = context.X_binned.T[feature_idx] + unsigned int root_node = X_binned.shape[0] == n_samples + float [:] ordered_gradients = context.ordered_gradients[:n_samples] + float [:] ordered_hessians = context.ordered_hessians[:n_samples] + np.ndarray histogram if root_node: if context.constant_hessian: @@ -416,7 +456,9 @@ cdef _find_histogram_split_subtraction(SplittingContext context, unsigned int fe Uses the identity: hist(parent) = hist(left) + hist(right). Returns the best SplitInfo among all the possible bins of the feature. 
""" - cdef np.ndarray histogram + cdef: + np.ndarray histogram + histogram = _subtract_histograms( context.max_bins, parent_histograms[feature_idx], sibling_histograms[feature_idx]) From 8a69785ca62cd421388fca1c7c86d56b09090a0c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 21 Dec 2018 19:12:53 -0500 Subject: [PATCH 008/247] started cythonizing loss --- gdb_test.py | 5 +- sklearn/ensemble/gbm/_gradient_boosting.pyx | 1 - sklearn/ensemble/gbm/binning.pyx | 1 - sklearn/ensemble/gbm/histogram.pyx | 1 - sklearn/ensemble/gbm/loss.py | 299 -------------------- sklearn/ensemble/gbm/splitting.pyx | 10 +- sklearn/ensemble/setup.py | 4 + 7 files changed, 10 insertions(+), 311 deletions(-) delete mode 100644 sklearn/ensemble/gbm/loss.py diff --git a/gdb_test.py b/gdb_test.py index 12bfb1a4be8b6..ee94c30ed635b 100644 --- a/gdb_test.py +++ b/gdb_test.py @@ -9,12 +9,13 @@ import pstats import cProfile -classif = False +classif = True +n_classes = 3 n_samples = 100000 max_iter = 5 if classif: - X, y = make_classification(n_samples=n_samples, random_state=0) + X, y = make_classification(n_samples=n_samples, random_state=0, n_classes=n_classes, n_clusters_per_class=1) GBM = GBMClassifier GBDT = GradientBoostingClassifier else: diff --git a/sklearn/ensemble/gbm/_gradient_boosting.pyx b/sklearn/ensemble/gbm/_gradient_boosting.pyx index 9602fe6f22bcb..ec2b1de0e87e8 100644 --- a/sklearn/ensemble/gbm/_gradient_boosting.pyx +++ b/sklearn/ensemble/gbm/_gradient_boosting.pyx @@ -1,4 +1,3 @@ -# cython: profile=True cimport cython import numpy as np diff --git a/sklearn/ensemble/gbm/binning.pyx b/sklearn/ensemble/gbm/binning.pyx index 9b00b1002a4b6..571d26cf9ecb6 100644 --- a/sklearn/ensemble/gbm/binning.pyx +++ b/sklearn/ensemble/gbm/binning.pyx @@ -1,4 +1,3 @@ -# cython: profile=True """ This module contains the BinMapper class. diff --git a/sklearn/ensemble/gbm/histogram.pyx b/sklearn/ensemble/gbm/histogram.pyx index 4f2cbde692d32..e7efff769064d 100644 --- a/sklearn/ensemble/gbm/histogram.pyx +++ b/sklearn/ensemble/gbm/histogram.pyx @@ -1,4 +1,3 @@ -# cython: profile=True """This module contains njitted routines for building histograms. A histogram is an array with n_bins entry of type HISTOGRAM_DTYPE. Each diff --git a/sklearn/ensemble/gbm/loss.py b/sklearn/ensemble/gbm/loss.py deleted file mode 100644 index 134569a517d5c..0000000000000 --- a/sklearn/ensemble/gbm/loss.py +++ /dev/null @@ -1,299 +0,0 @@ -""" -This module contains the loss classes. - -Specific losses are used for regression, binary classification or multiclass -classification. -""" -from abc import ABC, abstractmethod - -from scipy.special import expit, logsumexp -import numpy as np - -from .utils import get_threads_chunks - - -def _logsumexp(a): - """logsumexp(x) = log(sum(exp(x))) - - Custom logsumexp function with numerical stability, based on scipy's - logsumexp which is unfortunately not supported (neither is - np.logaddexp.reduce, which is equivalent). Only supports 1d arrays. - """ - - a_max = np.amax(a) - if not np.isfinite(a_max): - a_max = 0 - - s = np.sum(np.exp(a - a_max)) - return np.log(s) + a_max - - -def _expit(x): - # custom sigmoid because we cannot use that of scipy with numba - return 1 / (1 + np.exp(-x)) - - -class BaseLoss(ABC): - """Base class for a loss.""" - - def init_gradients_and_hessians(self, n_samples, prediction_dim): - """Return initial gradients and hessians. - - Unless hessians are constant, arrays are initialized with undefined - values. 
- - Parameters - ---------- - n_samples : int - The number of samples passed to `fit()` - prediction_dim : int - The dimension of a raw prediction, i.e. the number of trees - built at each iteration. Equals 1 for regression and binary - classification, or K where K is the number of classes for - multiclass classification. - - Returns - ------- - gradients : array-like, shape=(n_samples * prediction_dim) - hessians : array-like, shape=(n_samples * prediction_dim). - If hessians are constant (e.g. for ``LeastSquares`` loss, shape - is (1,) and the array is initialized to ``1``. - """ - shape = n_samples * prediction_dim - gradients = np.empty(shape=shape, dtype=np.float32) - if self.hessian_is_constant: - hessians = np.ones(shape=1, dtype=np.float32) - else: - hessians = np.empty(shape=shape, dtype=np.float32) - - return gradients, hessians - - @abstractmethod - def get_baseline_prediction(self, y_train, prediction_dim): - """Return initial predictions (before the first iteration). - - Parameters - ---------- - y_train : array-like, shape=(n_samples,) - The target training values. - prediction_dim : int - The dimension of one prediction: 1 for binary classification and - regression, n_classes for multiclass classification. - - Returns - ------- - baseline_prediction: float or array of shape (1, prediction_dim) - The baseline prediction. - """ - pass - - @abstractmethod - def update_gradients_and_hessians(self, gradients, hessians, y_true, - raw_predictions): - """Update gradients and hessians arrays, inplace. - - The gradients (resp. hessians) are the first (resp. second) order - derivatives of the loss for each sample with respect to the - predictions of model, evaluated at iteration ``i - 1``. - - Parameters - ---------- - gradients : array-like, shape=(n_samples * prediction_dim) - The gradients (treated as OUT array). - hessians : array-like, shape=(n_samples * prediction_dim) or \ - (1,) - The hessians (treated as OUT array). - y_true : array-like, shape=(n_samples,) - The true target values or each training sample. - raw_predictions : array-like, shape=(n_samples, prediction_dim) - The raw_predictions (i.e. values from the trees) of the tree - ensemble at iteration ``i - 1``. - """ - pass - - -class LeastSquares(BaseLoss): - """Least squares loss, for regression. - - For a given sample x_i, least squares loss is defined as:: - - loss(x_i) = (y_true_i - raw_pred_i)**2 - """ - - hessian_is_constant = True - - def __call__(self, y_true, raw_predictions, average=True): - # shape (n_samples, 1) --> (n_samples,). reshape(-1) is more likely to - # return a view. - raw_predictions = raw_predictions.reshape(-1) - loss = np.power(y_true - raw_predictions, 2) - return loss.mean() if average else loss - - def get_baseline_prediction(self, y_train, prediction_dim): - return np.mean(y_train) - - def inverse_link_function(self, raw_predictions): - return raw_predictions - - def update_gradients_and_hessians(self, gradients, hessians, y_true, - raw_predictions): - return _update_gradients_least_squares(gradients, y_true, - raw_predictions) - - -def _update_gradients_least_squares(gradients, y_true, raw_predictions): - # shape (n_samples, 1) --> (n_samples,). reshape(-1) is more likely to - # return a view. 
- raw_predictions = raw_predictions.reshape(-1) - n_samples = raw_predictions.shape[0] - starts, ends, n_threads = get_threads_chunks(total_size=n_samples) - for thread_idx in range(n_threads): - for i in range(starts[thread_idx], ends[thread_idx]): - # Note: a more correct exp is 2 * (raw_predictions - y_true) but - # since we use 1 for the constant hessian value (and not 2) this - # is strictly equivalent for the leaves values. - gradients[i] = raw_predictions[i] - y_true[i] - - -class BinaryCrossEntropy(BaseLoss): - """Binary cross-entropy loss, for binary classification. - - For a given sample x_i, the binary cross-entropy loss is defined as the - negative log-likelihood of the model which can be expressed as:: - - loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i - - See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman. - """ - - hessian_is_constant = False - inverse_link_function = staticmethod(expit) - - def __call__(self, y_true, raw_predictions, average=True): - # shape (n_samples, 1) --> (n_samples,). reshape(-1) is more likely to - # return a view. - raw_predictions = raw_predictions.reshape(-1) - # logaddexp(0, x) = log(1 + exp(x)) - loss = np.logaddexp(0, raw_predictions) - y_true * raw_predictions - return loss.mean() if average else loss - - def get_baseline_prediction(self, y_train, prediction_dim): - proba_positive_class = np.mean(y_train) - eps = np.finfo(y_train.dtype).eps - proba_positive_class = np.clip(proba_positive_class, eps, 1 - eps) - # log(x / 1 - x) is the anti function of sigmoid, or the link function - # of the Binomial model. - return np.log(proba_positive_class / (1 - proba_positive_class)) - - def update_gradients_and_hessians(self, gradients, hessians, y_true, - raw_predictions): - return _update_gradients_hessians_binary_crossentropy( - gradients, hessians, y_true, raw_predictions) - - def predict_proba(self, raw_predictions): - # shape (n_samples, 1) --> (n_samples,). reshape(-1) is more likely to - # return a view. - raw_predictions = raw_predictions.reshape(-1) - proba = np.empty((raw_predictions.shape[0], 2), dtype=np.float32) - proba[:, 1] = expit(raw_predictions) - proba[:, 0] = 1 - proba[:, 1] - return proba - - -def _update_gradients_hessians_binary_crossentropy(gradients, hessians, - y_true, raw_predictions): - # Note: using LightGBM version (first mapping {0, 1} into {-1, 1}) - # will cause overflow issues in the exponential as we're using float32 - # precision. - - # shape (n_samples, 1) --> (n_samples,). reshape(-1) is more likely to - # return a view. - raw_predictions = raw_predictions.reshape(-1) - n_samples = raw_predictions.shape[0] - starts, ends, n_threads = get_threads_chunks(total_size=n_samples) - for thread_idx in range(n_threads): - for i in range(starts[thread_idx], ends[thread_idx]): - gradients[i] = _expit(raw_predictions[i]) - y_true[i] - gradient_abs = np.abs(gradients[i]) - hessians[i] = gradient_abs * (1. - gradient_abs) - - -class CategoricalCrossEntropy(BaseLoss): - """Categorical cross-entropy loss, for multiclass classification. - - For a given sample x_i, the categorical cross-entropy loss is defined as - the negative log-likelihood of the model and generalizes the binary - cross-entropy to more than 2 classes. 
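A side note on _update_gradients_hessians_binary_crossentropy above: since y_true is in {0, 1}, |sigmoid(raw) - y| is either p or 1 - p, so gradient_abs * (1 - gradient_abs) equals the usual p * (1 - p) hessian of the binary cross-entropy. A quick check on made-up values:

import numpy as np
from scipy.special import expit

raw_predictions = np.array([-2.0, 0.0, 3.0], dtype=np.float32)  # hypothetical
y_true = np.array([0.0, 1.0, 1.0], dtype=np.float32)

p = expit(raw_predictions)
gradients = p - y_true
hessians = np.abs(gradients) * (1.0 - np.abs(gradients))
np.testing.assert_allclose(hessians, p * (1.0 - p), rtol=1e-5)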
- """ - - hessian_is_constant = False - - def __call__(self, y_true, raw_predictions, average=True): - one_hot_true = np.zeros_like(raw_predictions) - prediction_dim = raw_predictions.shape[1] - for k in range(prediction_dim): - one_hot_true[:, k] = (y_true == k) - - loss = (logsumexp(raw_predictions, axis=1) - - (one_hot_true * raw_predictions).sum(axis=1)) - return loss.mean() if average else loss - - def get_baseline_prediction(self, y_train, prediction_dim): - init_value = np.zeros( - shape=(1, prediction_dim), - dtype=np.float32 - ) - eps = np.finfo(y_train.dtype).eps - for k in range(prediction_dim): - proba_kth_class = np.mean(y_train == k) - proba_kth_class = np.clip(proba_kth_class, eps, 1 - eps) - init_value[:, k] += np.log(proba_kth_class) - - return init_value - - def update_gradients_and_hessians(self, gradients, hessians, y_true, - raw_predictions): - return _update_gradients_hessians_categorical_crossentropy( - gradients, hessians, y_true, raw_predictions) - - def predict_proba(self, raw_predictions): - # TODO: This could be done in parallel - # compute softmax (using exp(log(softmax))) - return np.exp(raw_predictions - - logsumexp(raw_predictions, axis=1)[:, np.newaxis]) - - -def _update_gradients_hessians_categorical_crossentropy( - gradients, hessians, y_true, raw_predictions): - # Here gradients and hessians are of shape - # (n_samples * prediction_dim,). - # y_true is of shape (n_samples,). - # raw_predictions is of shape (n_samples, raw_predictions) - # - # Instead of passing the whole gradients and hessians arrays and slicing - # them here, we could instead do the update in the 'for k in ...' loop of - # fit(), by passing gradients_at_k and hessians_at_k which are of size - # (n_samples,). - # That would however require to pass a copy of raw_predictions, so it does - # not get partially overwritten at the end of the loop when - # _update_y_pred() is called (see sklearn PR 12715) - n_samples, prediction_dim = raw_predictions.shape - starts, ends, n_threads = get_threads_chunks(total_size=n_samples) - for k in range(prediction_dim): - gradients_at_k = gradients[n_samples * k:n_samples * (k + 1)] - hessians_at_k = hessians[n_samples * k:n_samples * (k + 1)] - for thread_idx in range(n_threads): - for i in range(starts[thread_idx], ends[thread_idx]): - # p_k is the probability that class(ith sample) == k. - # This is a regular softmax. - p_k = np.exp(raw_predictions[i, k] - - _logsumexp(raw_predictions[i, :])) - gradients_at_k[i] = p_k - (y_true[i] == k) - hessians_at_k[i] = p_k * (1. - p_k) - # LightGBM uses 2 * p_k * (1 - p_k) which is not stricly - # correct but equivalent to using half the learning rate. - - -_LOSSES = {'least_squares': LeastSquares, - 'binary_crossentropy': BinaryCrossEntropy, - 'categorical_crossentropy': CategoricalCrossEntropy} diff --git a/sklearn/ensemble/gbm/splitting.pyx b/sklearn/ensemble/gbm/splitting.pyx index c7f99b70fe0f1..840b2fbb3a8d1 100644 --- a/sklearn/ensemble/gbm/splitting.pyx +++ b/sklearn/ensemble/gbm/splitting.pyx @@ -1,4 +1,3 @@ -# cython: profile=True """This module contains njitted routines and data structures to: - Find the best possible split of a node. 
For a given node, a split is @@ -37,12 +36,9 @@ cdef get_threads_chunks(unsigned int total_size): np.ndarray[np.uint32_t] ends unsigned int n_threads - n_threads = 4 # TODO: change this + n_threads = 1 # TODO: change this sizes = np.full(n_threads, total_size // n_threads, dtype=np.uint32) - if total_size % n_threads > 0: - # array[:0] will cause a bug in numba 0.41 so we need the if. - # Remove once issue numba 3554 is fixed. - sizes[:total_size % n_threads] += 1 + sizes[:total_size % n_threads] += 1 starts = np.zeros(n_threads, dtype=np.uint32) starts[1:] = np.cumsum(sizes[:-1]) ends = starts + sizes @@ -406,7 +402,7 @@ cdef SplitInfo _find_best_feature_to_split_helper(list split_infos): best_gain = -1. for i, split_info in enumerate(split_infos): gain = split_info.gain - if gain > best_gain: + if best_gain == -1 or gain > best_gain: best_gain = gain best_split_info = split_info return best_split_info diff --git a/sklearn/ensemble/setup.py b/sklearn/ensemble/setup.py index edbee1f86666c..bc084917122ba 100644 --- a/sklearn/ensemble/setup.py +++ b/sklearn/ensemble/setup.py @@ -28,6 +28,10 @@ def configuration(parent_package="", top_path=None): sources=["gbm/predictor.pyx"], include_dirs=[numpy.get_include()]) + config.add_extension("gbm.loss", + sources=["gbm/loss.pyx"], + include_dirs=[numpy.get_include()]) + config.add_extension("gbm.playground", sources=["gbm/playground.pyx"], include_dirs=[numpy.get_include()]) From e366a89420e7aaaba3e361f716e0dd19796621d9 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 21 Dec 2018 19:40:08 -0500 Subject: [PATCH 009/247] Added loss file --- push_annotated_cython.sh | 5 +- sklearn/ensemble/gbm/loss.pyx | 319 ++++++++++++++++++++++++++++++++++ 2 files changed, 322 insertions(+), 2 deletions(-) create mode 100644 sklearn/ensemble/gbm/loss.pyx diff --git a/push_annotated_cython.sh b/push_annotated_cython.sh index 45641834aaa97..9e7424b995e81 100755 --- a/push_annotated_cython.sh +++ b/push_annotated_cython.sh @@ -20,6 +20,7 @@ annotate_and_copy_files() { # files in TARGET_DIR/COMMIT_HASH/ git co $1 # checkout commit + rm -f $SOURCE_DIR/*.html # remove any previous file just in case for pyx_file in `ls $SOURCE_DIR/*.pyx` do echo 'annotating' $1 $pyx_file @@ -29,9 +30,9 @@ annotate_and_copy_files() { for html_file in `ls $SOURCE_DIR/*.html` do mkdir -p $TARGET_DIR/$1 - cp $html_file $TARGET_DIR/$1 + mv $html_file $TARGET_DIR/$1 html_file_name=$(basename -- "$html_file") # without path - echo Copied $html_file_name to $TARGET_DIR/$1 + echo moved $html_file_name to $TARGET_DIR/$1 done } diff --git a/sklearn/ensemble/gbm/loss.pyx b/sklearn/ensemble/gbm/loss.pyx new file mode 100644 index 0000000000000..2d95048f40268 --- /dev/null +++ b/sklearn/ensemble/gbm/loss.pyx @@ -0,0 +1,319 @@ +# cython: profile=True +""" +This module contains the loss classes. + +Specific losses are used for regression, binary classification or multiclass +classification. +""" +from abc import ABC, abstractmethod + +cimport cython + +import numpy as np +cimport numpy as np + +from scipy.special import expit, logsumexp + + +ctypedef fused float_or_double: + float + double + + +cdef get_threads_chunks(unsigned int total_size): + """Get start and end indices of threads in an array of size total_size. + + The interval [0, total_size - 1] is divided into n_threads contiguous + regions, and the starts and ends of each region are returned. Used to + simulate a 'static' scheduling. 
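+
+    For example, ``total_size=10`` with ``n_threads=4`` would give
+    ``starts=[0, 3, 6, 8]`` and ``ends=[3, 6, 8, 10]`` (the remainder is
+    spread over the first chunks); with the hard-coded ``n_threads=1`` below
+    it is simply ``starts=[0]``, ``ends=[10]``.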
+ """ + cdef: + np.ndarray[np.uint32_t] sizes + np.ndarray[np.uint32_t] starts + np.ndarray[np.uint32_t] ends + unsigned int n_threads + + n_threads = 1 # TODO: change this + sizes = np.full(n_threads, total_size // n_threads, dtype=np.uint32) + sizes[:total_size % n_threads] += 1 + starts = np.zeros(n_threads, dtype=np.uint32) + starts[1:] = np.cumsum(sizes[:-1]) + ends = starts + sizes + + return starts, ends, n_threads + + +class BaseLoss(ABC): + """Base class for a loss.""" + + def init_gradients_and_hessians(self, n_samples, prediction_dim): + """Return initial gradients and hessians. + + Unless hessians are constant, arrays are initialized with undefined + values. + + Parameters + ---------- + n_samples : int + The number of samples passed to `fit()` + prediction_dim : int + The dimension of a raw prediction, i.e. the number of trees + built at each iteration. Equals 1 for regression and binary + classification, or K where K is the number of classes for + multiclass classification. + + Returns + ------- + gradients : array-like, shape=(n_samples * prediction_dim) + hessians : array-like, shape=(n_samples * prediction_dim). + If hessians are constant (e.g. for ``LeastSquares`` loss, shape + is (1,) and the array is initialized to ``1``. + """ + shape = n_samples * prediction_dim + gradients = np.empty(shape=shape, dtype=np.float32) + if self.hessian_is_constant: + hessians = np.ones(shape=1, dtype=np.float32) + else: + hessians = np.empty(shape=shape, dtype=np.float32) + + return gradients, hessians + + @abstractmethod + def get_baseline_prediction(self, y_train, prediction_dim): + """Return initial predictions (before the first iteration). + + Parameters + ---------- + y_train : array-like, shape=(n_samples,) + The target training values. + prediction_dim : int + The dimension of one prediction: 1 for binary classification and + regression, n_classes for multiclass classification. + + Returns + ------- + baseline_prediction: float or array of shape (1, prediction_dim) + The baseline prediction. + """ + pass + + @abstractmethod + def update_gradients_and_hessians(self, gradients, hessians, y_true, + raw_predictions): + """Update gradients and hessians arrays, inplace. + + The gradients (resp. hessians) are the first (resp. second) order + derivatives of the loss for each sample with respect to the + predictions of model, evaluated at iteration ``i - 1``. + + Parameters + ---------- + gradients : array-like, shape=(n_samples * prediction_dim) + The gradients (treated as OUT array). + hessians : array-like, shape=(n_samples * prediction_dim) or \ + (1,) + The hessians (treated as OUT array). + y_true : array-like, shape=(n_samples,) + The true target values or each training sample. + raw_predictions : array-like, shape=(n_samples, prediction_dim) + The raw_predictions (i.e. values from the trees) of the tree + ensemble at iteration ``i - 1``. + """ + pass + + +class LeastSquares(BaseLoss): + """Least squares loss, for regression. + + For a given sample x_i, least squares loss is defined as:: + + loss(x_i) = (y_true_i - raw_pred_i)**2 + """ + + hessian_is_constant = True + + def __call__(self, y_true, raw_predictions, average=True): + # shape (n_samples, 1) --> (n_samples,). reshape(-1) is more likely to + # return a view. 
+ raw_predictions = raw_predictions.reshape(-1) + loss = np.power(y_true - raw_predictions, 2) + return loss.mean() if average else loss + + def get_baseline_prediction(self, y_train, prediction_dim): + return np.mean(y_train) + + def inverse_link_function(self, raw_predictions): + return raw_predictions + + def update_gradients_and_hessians(self, gradients, hessians, y_true, + raw_predictions): + raw_predictions = raw_predictions.reshape(-1) + return _update_gradients_least_squares(gradients, y_true, + raw_predictions) + + +def _update_gradients_least_squares(float [:] gradients, float [:] y_true, float [:] raw_predictions): + cdef: + unsigned int n_samples + unsigned int i + unsigned int thread_idx + unsigned int n_threads + unsigned int [:] starts + unsigned int [:] ends + + n_samples = raw_predictions.shape[0] + starts, ends, n_threads = get_threads_chunks(total_size=n_samples) + for thread_idx in range(n_threads): + for i in range(starts[thread_idx], ends[thread_idx]): + # Note: a more correct exp is 2 * (raw_predictions - y_true) but + # since we use 1 for the constant hessian value (and not 2) this + # is strictly equivalent for the leaves values. + gradients[i] = raw_predictions[i] - y_true[i] + + +class BinaryCrossEntropy(BaseLoss): + """Binary cross-entropy loss, for binary classification. + + For a given sample x_i, the binary cross-entropy loss is defined as the + negative log-likelihood of the model which can be expressed as:: + + loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i + + See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman. + """ + + hessian_is_constant = False + inverse_link_function = staticmethod(expit) + + def __call__(self, y_true, raw_predictions, average=True): + # shape (n_samples, 1) --> (n_samples,). reshape(-1) is more likely to + # return a view. + raw_predictions = raw_predictions.reshape(-1) + # logaddexp(0, x) = log(1 + exp(x)) + loss = np.logaddexp(0, raw_predictions) - y_true * raw_predictions + return loss.mean() if average else loss + + def get_baseline_prediction(self, y_train, prediction_dim): + proba_positive_class = np.mean(y_train) + eps = np.finfo(y_train.dtype).eps + proba_positive_class = np.clip(proba_positive_class, eps, 1 - eps) + # log(x / 1 - x) is the anti function of sigmoid, or the link function + # of the Binomial model. + return np.log(proba_positive_class / (1 - proba_positive_class)) + + def update_gradients_and_hessians(self, gradients, hessians, y_true, + raw_predictions): + raw_predictions = raw_predictions.reshape(-1) + return _update_gradients_hessians_binary_crossentropy( + gradients, hessians, y_true, raw_predictions) + + def predict_proba(self, raw_predictions): + # shape (n_samples, 1) --> (n_samples,). reshape(-1) is more likely to + # return a view. 
+ raw_predictions = raw_predictions.reshape(-1) + proba = np.empty((raw_predictions.shape[0], 2), dtype=np.float32) + proba[:, 1] = expit(raw_predictions) + proba[:, 0] = 1 - proba[:, 1] + return proba + + +def _update_gradients_hessians_binary_crossentropy(float [:] gradients, +float [:] hessians, float_or_double [:] y_true, double [:] raw_predictions): + cdef: + unsigned int n_samples + unsigned int i + unsigned int thread_idx + unsigned int n_threads + unsigned int [:] starts + unsigned int [:] ends + n_samples = raw_predictions.shape[0] + starts, ends, n_threads = get_threads_chunks(total_size=n_samples) + for thread_idx in range(n_threads): + for i in range(starts[thread_idx], ends[thread_idx]): + gradients[i] = expit(raw_predictions[i]) - y_true[i] + gradient_abs = np.abs(gradients[i]) + hessians[i] = gradient_abs * (1. - gradient_abs) + + +class CategoricalCrossEntropy(BaseLoss): + """Categorical cross-entropy loss, for multiclass classification. + + For a given sample x_i, the categorical cross-entropy loss is defined as + the negative log-likelihood of the model and generalizes the binary + cross-entropy to more than 2 classes. + """ + + hessian_is_constant = False + + def __call__(self, y_true, raw_predictions, average=True): + one_hot_true = np.zeros_like(raw_predictions) + prediction_dim = raw_predictions.shape[1] + for k in range(prediction_dim): + one_hot_true[:, k] = (y_true == k) + + loss = (logsumexp(raw_predictions, axis=1) - + (one_hot_true * raw_predictions).sum(axis=1)) + return loss.mean() if average else loss + + def get_baseline_prediction(self, y_train, prediction_dim): + init_value = np.zeros( + shape=(1, prediction_dim), + dtype=np.float32 + ) + eps = np.finfo(y_train.dtype).eps + for k in range(prediction_dim): + proba_kth_class = np.mean(y_train == k) + proba_kth_class = np.clip(proba_kth_class, eps, 1 - eps) + init_value[:, k] += np.log(proba_kth_class) + + return init_value + + def update_gradients_and_hessians(self, gradients, hessians, y_true, + raw_predictions): + return _update_gradients_hessians_categorical_crossentropy( + gradients, hessians, y_true, raw_predictions) + + def predict_proba(self, raw_predictions): + # TODO: This could be done in parallel + # compute softmax (using exp(log(softmax))) + return np.exp(raw_predictions - + logsumexp(raw_predictions, axis=1)[:, np.newaxis]) + + +def _update_gradients_hessians_categorical_crossentropy( + float [:] gradients, float [:] hessians, float_or_double [:] y_true, + float_or_double [:, :] raw_predictions): + # Here gradients and hessians are of shape + # (n_samples * prediction_dim,). + # y_true is of shape (n_samples,). + # raw_predictions is of shape (n_samples, raw_predictions) + cdef: + unsigned int n_samples + unsigned int prediction_dim + unsigned int i + unsigned int k + unsigned int thread_idx + unsigned int n_threads + unsigned int [:] starts + unsigned int [:] ends + float p_k + + n_samples = raw_predictions.shape[0] + prediction_dim = raw_predictions.shape[1] + starts, ends, n_threads = get_threads_chunks(total_size=n_samples) + for k in range(prediction_dim): + gradients_at_k = gradients[n_samples * k:n_samples * (k + 1)] + hessians_at_k = hessians[n_samples * k:n_samples * (k + 1)] + for thread_idx in range(n_threads): + for i in range(starts[thread_idx], ends[thread_idx]): + # p_k is the probability that class(ith sample) == k. + # This is a regular softmax. 
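+                # The updates below use the first and second order
+                # derivatives of the categorical cross-entropy w.r.t.
+                # raw_predictions[i, k]: gradient = p_k - (y_true[i] == k),
+                # (diagonal) hessian = p_k * (1 - p_k).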
+ p_k = np.exp(raw_predictions[i, k] - + logsumexp(raw_predictions[i, :])) + gradients_at_k[i] = p_k - (y_true[i] == k) + hessians_at_k[i] = p_k * (1. - p_k) + + +_LOSSES = {'least_squares': LeastSquares, + 'binary_crossentropy': BinaryCrossEntropy, + 'categorical_crossentropy': CategoricalCrossEntropy} From 1da9357fe2a8984537fdbbbd4cd0d3842f6086ff Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 25 Dec 2018 11:17:27 -0500 Subject: [PATCH 010/247] Added some (failing) tests --- sklearn/ensemble/gbm/histogram.pyx | 2 +- .../gbm/tests/test_compare_lightgbm.py | 209 ++++++++++++ .../gbm/tests/test_gradient_boosting.py | 318 ++++++++++++++++++ sklearn/ensemble/gbm/tests/test_grower.py | 290 ++++++++++++++++ sklearn/ensemble/gbm/tests/test_histogram.py | 167 +++++++++ sklearn/ensemble/gbm/tests/test_loss.py | 191 +++++++++++ 6 files changed, 1176 insertions(+), 1 deletion(-) create mode 100644 sklearn/ensemble/gbm/tests/test_compare_lightgbm.py create mode 100644 sklearn/ensemble/gbm/tests/test_gradient_boosting.py create mode 100644 sklearn/ensemble/gbm/tests/test_grower.py create mode 100644 sklearn/ensemble/gbm/tests/test_histogram.py create mode 100644 sklearn/ensemble/gbm/tests/test_loss.py diff --git a/sklearn/ensemble/gbm/histogram.pyx b/sklearn/ensemble/gbm/histogram.pyx index e7efff769064d..c2fc04ad1859c 100644 --- a/sklearn/ensemble/gbm/histogram.pyx +++ b/sklearn/ensemble/gbm/histogram.pyx @@ -28,7 +28,7 @@ cdef struct hist_struct: @cython.boundscheck(False) # Deactivate bounds checking @cython.wraparound(False) # Deactivate negative indexing. -cdef _build_histogram_naive(unsigned int n_bins, unsigned int [:] +def _build_histogram_naive(unsigned int n_bins, unsigned int [:] sample_indices, unsigned char [:] binned_feature, float [:] ordered_gradients, float[:] ordered_hessians): diff --git a/sklearn/ensemble/gbm/tests/test_compare_lightgbm.py b/sklearn/ensemble/gbm/tests/test_compare_lightgbm.py new file mode 100644 index 0000000000000..cdd6778452e95 --- /dev/null +++ b/sklearn/ensemble/gbm/tests/test_compare_lightgbm.py @@ -0,0 +1,209 @@ +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score +from sklearn.datasets import make_classification, make_regression +import numpy as np +import pytest + +from sklearn.ensemble import GBMRegressor, GBMClassifier +from sklearn.ensemble.gbm.binning import BinMapper +from sklearn.ensemble.gbm.utils import get_lightgbm_estimator + + +pytest.importorskip("lightgbm") + + +@pytest.mark.parametrize('seed', range(5)) +@pytest.mark.parametrize('min_samples_leaf', (1, 20)) +@pytest.mark.parametrize('n_samples, max_leaf_nodes', [ + (255, 4096), + (1000, 8), +]) +def test_same_predictions_regression(seed, min_samples_leaf, n_samples, + max_leaf_nodes): + # Make sure pygbm has the same predictions as LGBM for easy targets. + # + # In particular when the size of the trees are bound and the number of + # samples is large enough, the structure of the prediction trees found by + # LightGBM and PyGBM should be exactly identical. + # + # Notes: + # - Several candidate splits may have equal gains when the number of + # samples in a node is low (and because of float errors). Therefore the + # predictions on the test set might differ if the structure of the tree + # is not exactly the same. To avoid this issue we only compare the + # predictions on the test set when the number of samples is large enough + # and max_leaf_nodes is low enough. 
+ # - To ignore discrepancies caused by small differences the binning + # strategy, data is pre-binned if n_samples > 255. + + rng = np.random.RandomState(seed=seed) + n_samples = n_samples + max_iter = 1 + max_bins = 256 + + X, y = make_regression(n_samples=n_samples, n_features=5, + n_informative=5, random_state=0) + + if n_samples > 255: + # bin data and convert it to float32 so that the estimator doesn't + # treat it as pre-binned + X = BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) + + est_pygbm = GBMRegressor(max_iter=max_iter, + max_bins=max_bins, + learning_rate=1, + n_iter_no_change=None, + min_samples_leaf=min_samples_leaf, + max_leaf_nodes=max_leaf_nodes) + est_lightgbm = get_lightgbm_estimator(est_pygbm) + + est_lightgbm.fit(X_train, y_train) + est_pygbm.fit(X_train, y_train) + + # We need X to be treated an numerical data, not pre-binned data. + X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32) + + pred_lgbm = est_lightgbm.predict(X_train) + pred_pygbm = est_pygbm.predict(X_train) + # less than 1% of the predictions are different up to the 3rd decimal + assert np.mean(abs(pred_lgbm - pred_pygbm) > 1e-3) < .011 + + if max_leaf_nodes < 10 and n_samples >= 1000: + pred_lgbm = est_lightgbm.predict(X_test) + pred_pygbm = est_pygbm.predict(X_test) + # less than 1% of the predictions are different up to the 4th decimal + assert np.mean(abs(pred_lgbm - pred_pygbm) > 1e-4) < .01 + + +@pytest.mark.parametrize('seed', range(5)) +@pytest.mark.parametrize('min_samples_leaf', (1, 20)) +@pytest.mark.parametrize('n_samples, max_leaf_nodes', [ + (255, 4096), + (1000, 8), +]) +def test_same_predictions_classification(seed, min_samples_leaf, n_samples, + max_leaf_nodes): + # Same as test_same_predictions_regression but for classification + + rng = np.random.RandomState(seed=seed) + n_samples = n_samples + max_iter = 1 + max_bins = 256 + + X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5, + n_informative=5, n_redundant=0, random_state=0) + + if n_samples > 255: + # bin data and convert it to float32 so that the estimator doesn't + # treat it as pre-binned + X = BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) + + est_pygbm = GBMClassifier(loss='binary_crossentropy', + max_iter=max_iter, + max_bins=max_bins, + learning_rate=1, + n_iter_no_change=None, + min_samples_leaf=min_samples_leaf, + max_leaf_nodes=max_leaf_nodes) + est_lightgbm = get_lightgbm_estimator(est_pygbm) + + est_lightgbm.fit(X_train, y_train) + est_pygbm.fit(X_train, y_train) + + # We need X to be treated an numerical data, not pre-binned data. 
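+    # (uint8 input coming straight from BinMapper would be interpreted as
+    # pre-binned data, hence the cast back to float32 below)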
+ X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32) + + pred_lightgbm = est_lightgbm.predict(X_train) + pred_pygbm = est_pygbm.predict(X_train) + assert np.mean(pred_pygbm == pred_lightgbm) > .89 + + acc_lgbm = accuracy_score(y_train, pred_lightgbm) + acc_pygbm = accuracy_score(y_train, pred_pygbm) + np.testing.assert_almost_equal(acc_lgbm, acc_pygbm) + + if max_leaf_nodes < 10 and n_samples >= 1000: + + pred_lightgbm = est_lightgbm.predict(X_test) + pred_pygbm = est_pygbm.predict(X_test) + assert np.mean(pred_pygbm == pred_lightgbm) > .89 + + acc_lgbm = accuracy_score(y_test, pred_lightgbm) + acc_pygbm = accuracy_score(y_test, pred_pygbm) + np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=2) + + +@pytest.mark.parametrize('seed', range(5)) +@pytest.mark.parametrize('min_samples_leaf', (1, 20)) +@pytest.mark.parametrize('n_samples, max_leaf_nodes', [ + (255, 4096), + (10000, 8), +]) +def test_same_predictions_multiclass_classification( + seed, min_samples_leaf, n_samples, max_leaf_nodes): + # Same as test_same_predictions_regression but for classification + + rng = np.random.RandomState(seed=seed) + n_samples = n_samples + max_iter = 1 + max_bins = 256 + lr = 1 + + X, y = make_classification(n_samples=n_samples, n_classes=3, n_features=5, + n_informative=5, n_redundant=0, + n_clusters_per_class=1, random_state=0) + + if n_samples > 255: + # bin data and convert it to float32 so that the estimator doesn't + # treat it as pre-binned + X = BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) + + est_pygbm = GBMClassifier(loss='categorical_crossentropy', + max_iter=max_iter, + max_bins=max_bins, + learning_rate=lr, + n_iter_no_change=None, + min_samples_leaf=min_samples_leaf, + max_leaf_nodes=max_leaf_nodes) + est_lightgbm = get_lightgbm_estimator(est_pygbm) + + est_lightgbm.fit(X_train, y_train) + est_pygbm.fit(X_train, y_train) + + # We need X to be treated an numerical data, not pre-binned data. 
+ X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32) + + pred_lightgbm = est_lightgbm.predict(X_train) + pred_pygbm = est_pygbm.predict(X_train) + assert np.mean(pred_pygbm == pred_lightgbm) > .89 + + proba_lightgbm = est_lightgbm.predict_proba(X_train) + proba_pygbm = est_pygbm.predict_proba(X_train) + # assert more than 75% of the predicted probabilities are the same up to + # the second decimal + assert np.mean(np.abs(proba_lightgbm - proba_pygbm) < 1e-2) > .75 + + acc_lgbm = accuracy_score(y_train, pred_lightgbm) + acc_pygbm = accuracy_score(y_train, pred_pygbm) + np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=2) + + if max_leaf_nodes < 10 and n_samples >= 1000: + + pred_lightgbm = est_lightgbm.predict(X_test) + pred_pygbm = est_pygbm.predict(X_test) + assert np.mean(pred_pygbm == pred_lightgbm) > .89 + + proba_lightgbm = est_lightgbm.predict_proba(X_train) + proba_pygbm = est_pygbm.predict_proba(X_train) + # assert more than 75% of the predicted probabilities are the same up + # to the second decimal + assert np.mean(np.abs(proba_lightgbm - proba_pygbm) < 1e-2) > .75 + + acc_lgbm = accuracy_score(y_test, pred_lightgbm) + acc_pygbm = accuracy_score(y_test, pred_pygbm) + np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=2) diff --git a/sklearn/ensemble/gbm/tests/test_gradient_boosting.py b/sklearn/ensemble/gbm/tests/test_gradient_boosting.py new file mode 100644 index 0000000000000..9a8d06f726eba --- /dev/null +++ b/sklearn/ensemble/gbm/tests/test_gradient_boosting.py @@ -0,0 +1,318 @@ +import os +import warnings + +import numpy as np +from numpy.testing import assert_allclose +import pytest +from sklearn.utils.testing import assert_raises_regex +from sklearn.datasets import make_classification, make_regression + +from sklearn.ensemble import GBMClassifier +from sklearn.ensemble import GBMRegressor +from sklearn.ensemble.gbm.binning import BinMapper + + +X_classification, y_classification = make_classification(random_state=0) +X_regression, y_regression = make_regression(random_state=0) + + +@pytest.mark.parametrize('GradientBoosting, X, y', [ + (GBMClassifier, X_classification, y_classification), + (GBMRegressor, X_regression, y_regression) +]) +def test_init_parameters_validation(GradientBoosting, X, y): + + assert_raises_regex( + ValueError, + "Loss blah is not supported for", + GradientBoosting(loss='blah').fit, X, y + ) + + for learning_rate in (-1, 0): + assert_raises_regex( + ValueError, + f"learning_rate={learning_rate} must be strictly positive", + GradientBoosting(learning_rate=learning_rate).fit, X, y + ) + + assert_raises_regex( + ValueError, + f"max_iter=0 must not be smaller than 1", + GradientBoosting(max_iter=0).fit, X, y + ) + + assert_raises_regex( + ValueError, + f"max_leaf_nodes=0 should not be smaller than 1", + GradientBoosting(max_leaf_nodes=0).fit, X, y + ) + + assert_raises_regex( + ValueError, + f"max_depth=0 should not be smaller than 1", + GradientBoosting(max_depth=0).fit, X, y + ) + + assert_raises_regex( + ValueError, + f"min_samples_leaf=0 should not be smaller than 1", + GradientBoosting(min_samples_leaf=0).fit, X, y + ) + + assert_raises_regex( + ValueError, + f"l2_regularization=-1 must be positive", + GradientBoosting(l2_regularization=-1).fit, X, y + ) + + for max_bins in (1, 257): + assert_raises_regex( + ValueError, + f"max_bins={max_bins} should be no smaller than 2 and no larger", + GradientBoosting(max_bins=max_bins).fit, X, y + ) + + assert_raises_regex( + ValueError, + f"max_bins is set to 4 but the 
data is pre-binned with 256 bins", + GradientBoosting(max_bins=4).fit, X.astype(np.uint8), y + ) + + assert_raises_regex( + ValueError, + f"n_iter_no_change=-1 must be positive", + GradientBoosting(n_iter_no_change=-1).fit, X, y + ) + + for validation_split in (-1, 0): + assert_raises_regex( + ValueError, + f"validation_split={validation_split} must be strictly positive", + GradientBoosting(validation_split=validation_split).fit, X, y + ) + + assert_raises_regex( + ValueError, + f"tol=-1 must not be smaller than 0", + GradientBoosting(tol=-1).fit, X, y + ) + + +def test_one_sample_one_feature(): + # Until numba issue #3569 is fixed, we raise an informative error message + # when X is only one sample or one feature in fit (it's OK in predict). + # The array is both F and C contiguous, and numba can't compile. + gb = GBMClassifier() + for X, y in (([[1, 2]], [0]), ([[1], [2]], [0, 1])): + assert_raises_regex( + ValueError, + 'Passing only one sample or one feature is not supported yet.', + gb.fit, X, y + ) + + +@pytest.mark.skipif( + int(os.environ.get("NUMBA_DISABLE_JIT", 0)) == 1, + reason="Travis times out without numba") +@pytest.mark.parametrize('scoring, validation_split, n_iter_no_change, tol', [ + ('neg_mean_squared_error', .1, 5, 1e-7), # use scorer + ('neg_mean_squared_error', None, 5, 1e-1), # use scorer on training data + (None, .1, 5, 1e-7), # use loss + (None, None, 5, 1e-1), # use loss on training data + (None, None, None, None), # no early stopping +]) +def test_early_stopping_regression(scoring, validation_split, + n_iter_no_change, tol): + + max_iter = 500 + + X, y = make_regression(random_state=0) + + gb = GBMRegressor(verbose=1, # just for coverage + scoring=scoring, + tol=tol, + validation_split=validation_split, + max_iter=max_iter, + n_iter_no_change=n_iter_no_change, + random_state=0) + gb.fit(X, y) + + if n_iter_no_change is not None: + assert n_iter_no_change <= gb.n_iter_ < max_iter + else: + assert gb.n_iter_ == max_iter + + +@pytest.mark.skipif( + int(os.environ.get("NUMBA_DISABLE_JIT", 0)) == 1, + reason="Travis times out without numba") +@pytest.mark.parametrize('data', ( + make_classification(random_state=0), + make_classification(n_classes=3, n_clusters_per_class=1, random_state=0) +)) +@pytest.mark.parametrize('scoring, validation_split, n_iter_no_change, tol', [ + ('accuracy', .1, 5, 1e-7), # use scorer + ('accuracy', None, 5, 1e-1), # use scorer on training data + (None, .1, 5, 1e-7), # use loss + (None, None, 5, 1e-1), # use loss on training data + (None, None, None, None), # no early stopping +]) +def test_early_stopping_classification(data, scoring, validation_split, + n_iter_no_change, tol): + + max_iter = 500 + + X, y = data + + gb = GBMClassifier(verbose=1, # just for coverage + scoring=scoring, + tol=tol, + validation_split=validation_split, + max_iter=max_iter, + n_iter_no_change=n_iter_no_change, + random_state=0) + gb.fit(X, y) + + if n_iter_no_change is not None: + assert n_iter_no_change <= gb.n_iter_ < max_iter + else: + assert gb.n_iter_ == max_iter + + +def test_early_stopping_loss(): + # Make sure that when scoring is None, the early stopping is done w.r.t to + # the loss. 
Using scoring='neg_log_loss' and scoring=None should be + # equivalent since the loss is precisely the negative log likelihood + n_samples = int(1e3) + max_iter = 100 + n_iter_no_change = 5 + + X, y = make_classification(n_samples, random_state=0) + + clf_scoring = GBMClassifier(max_iter=max_iter, + scoring='neg_log_loss', + validation_split=.1, + n_iter_no_change=n_iter_no_change, + tol=1e-4, + verbose=1, + random_state=0) + clf_scoring.fit(X, y) + + clf_loss = GBMClassifier(max_iter=max_iter, + scoring=None, + validation_split=.1, + n_iter_no_change=n_iter_no_change, + tol=1e-4, + verbose=1, + random_state=0) + clf_loss.fit(X, y) + + assert n_iter_no_change < clf_loss.n_iter_ < max_iter + assert clf_loss.n_iter_ == clf_scoring.n_iter_ + + +def test_should_stop(): + + def should_stop(scores, n_iter_no_change, tol): + gbdt = GBMClassifier(n_iter_no_change=n_iter_no_change, + tol=tol) + return gbdt._should_stop(scores) + + # not enough iterations + assert not should_stop([], n_iter_no_change=1, tol=0.001) + + assert not should_stop([1, 1, 1], n_iter_no_change=5, tol=0.001) + assert not should_stop([1] * 5, n_iter_no_change=5, tol=0.001) + + # still making significant progress up to tol + assert not should_stop([1, 2, 3, 4, 5, 6], n_iter_no_change=5, tol=0.001) + assert not should_stop([1, 2, 3, 4, 5, 6], n_iter_no_change=5, tol=0.) + assert not should_stop([1, 2, 3, 4, 5, 6], n_iter_no_change=5, tol=0.999) + assert not should_stop([1, 2, 3, 4, 5, 6], n_iter_no_change=5, + tol=5 - 1e-5) + + # no significant progress according to tol + assert should_stop([1] * 6, n_iter_no_change=5, tol=0.) + assert should_stop([1] * 6, n_iter_no_change=5, tol=0.001) + assert should_stop([1, 2, 3, 4, 5, 6], n_iter_no_change=5, tol=5) + + +# TODO: Remove if / when numba issue 3569 is fixed and check_classifiers_train +# is less strict +def custom_check_estimator(Estimator): + # Same as sklearn.check_estimator, skipping tests that can't succeed. + + from sklearn.utils.estimator_checks import _yield_all_checks + from sklearn.utils.testing import SkipTest + from sklearn.exceptions import SkipTestWarning + from sklearn.utils import estimator_checks + + estimator = Estimator + name = type(estimator).__name__ + + for check in _yield_all_checks(name, estimator): + if (check is estimator_checks.check_fit2d_1feature or + check is estimator_checks.check_fit2d_1sample): + # X is both Fortran and C aligned and numba can't compile. + # Opened numba issue 3569 + continue + if check is estimator_checks.check_classifiers_train: + continue # probas don't exactly sum to 1 (very close though) + if (hasattr(check, 'func') and + check.func is estimator_checks.check_classifiers_train): + continue # same, wrapped in a functools.partial object. + + try: + check(name, estimator) + except SkipTest as exception: + # the only SkipTest thrown currently results from not + # being able to import pandas. + warnings.warn(str(exception), SkipTestWarning) + + +@pytest.mark.skipif( + int(os.environ.get("NUMBA_DISABLE_JIT", 0)) == 1, + reason="Potentially long") +@pytest.mark.parametrize('Estimator', ( + GBMRegressor(), + GBMClassifier(n_iter_no_change=None, min_samples_leaf=5),)) +def test_estimator_checks(Estimator): + # Run the check_estimator() test suite on GBRegressor and GBClassifier. + + # Notes: + # - Can't do early stopping with classifier because often + # validation_split=.1 leads to test_size=2 < n_classes and + # train_test_split raises an error. 
+ # - Also, need to set a low min_samples_leaf for + # check_classifiers_classes() to pass: with only 30 samples on the + # dataset, the root is never split with min_samples_leaf=20 and only the + # majority class is predicted. + custom_check_estimator(Estimator) + + +def test_pre_binned_data(): + # Make sure that: + # - training on numerical data and predicting on numerical data is the + # same as training on binned data and predicting on binned data + # - training on numerical data and predicting on numerical data is the + # same as training on numerical data and predicting on binned data + # - training on binned data and predicting on numerical data is not + # possible. + + X, y = make_regression(random_state=0) + gbdt = GBMRegressor(scoring=None, random_state=0) + mapper = BinMapper(random_state=0) + X_binned = mapper.fit_transform(X) + + fit_num_pred_num = gbdt.fit(X, y).predict(X) + fit_binned_pred_binned = gbdt.fit(X_binned, y).predict(X_binned) + fit_num_pred_binned = gbdt.fit(X, y).predict(X_binned) + + assert_allclose(fit_num_pred_num, fit_binned_pred_binned) + assert_allclose(fit_num_pred_num, fit_num_pred_binned) + + assert_raises_regex( + ValueError, + 'This estimator was fitted with pre-binned data ', + gbdt.fit(X_binned, y).predict, X + ) diff --git a/sklearn/ensemble/gbm/tests/test_grower.py b/sklearn/ensemble/gbm/tests/test_grower.py new file mode 100644 index 0000000000000..4e865589ee28e --- /dev/null +++ b/sklearn/ensemble/gbm/tests/test_grower.py @@ -0,0 +1,290 @@ +import numpy as np +from numpy.testing import assert_array_almost_equal +import pytest +from pytest import approx +from sklearn.utils.testing import assert_raises_regex + +from sklearn.ensemble.gbm.grower import TreeGrower +from sklearn.ensemble.gbm.binning import BinMapper + + +def _make_training_data(n_bins=256, constant_hessian=True): + rng = np.random.RandomState(42) + n_samples = 10000 + + # Generate some test data directly binned so as to test the grower code + # independently of the binning logic. + X_binned = rng.randint(0, n_bins - 1, size=(n_samples, 2), dtype=np.uint8) + X_binned = np.asfortranarray(X_binned) + + def true_decision_function(input_features): + """Ground truth decision function + + This is a very simple yet asymmetric decision tree. Therefore the + grower code should have no trouble recovering the decision function + from 10000 training samples. 
+ """ + if input_features[0] <= n_bins // 2: + return -1 + else: + if input_features[1] <= n_bins // 3: + return -1 + else: + return 1 + + target = np.array([true_decision_function(x) for x in X_binned], + dtype=np.float32) + + # Assume a square loss applied to an initial model that always predicts 0 + # (hardcoded for this test): + all_gradients = target + if constant_hessian: + all_hessians = np.ones(shape=1, dtype=np.float32) + else: + all_hessians = np.ones_like(all_gradients) + return X_binned, all_gradients, all_hessians + + +def _check_children_consistency(parent, left, right): + assert parent.left_child is left + assert parent.right_child is right + + # each sample from the parent is propagated to one of the two children + assert (len(left.sample_indices) + len(right.sample_indices) + == len(parent.sample_indices)) + + assert (set(left.sample_indices).union(set(right.sample_indices)) + == set(parent.sample_indices)) + + # samples are sent either to the left or the right node, never to both + assert (set(left.sample_indices).intersection(set(right.sample_indices)) + == set()) + + +@pytest.mark.parametrize( + 'n_bins, constant_hessian, stopping_param, shrinkage', + [ + (11, True, "min_gain_to_split", 0.5), + (11, False, "min_gain_to_split", 1.), + (11, True, "max_leaf_nodes", 1.), + (11, False, "max_leaf_nodes", 0.1), + (42, True, "max_leaf_nodes", 0.01), + (42, False, "max_leaf_nodes", 1.), + (256, True, "min_gain_to_split", 1.), + (256, True, "max_leaf_nodes", 0.1), + ] +) +def test_grow_tree(n_bins, constant_hessian, stopping_param, shrinkage): + X_binned, all_gradients, all_hessians = _make_training_data( + n_bins=n_bins, constant_hessian=constant_hessian) + n_samples = X_binned.shape[0] + + if stopping_param == "max_leaf_nodes": + stopping_param = {"max_leaf_nodes": 3} + else: + stopping_param = {"min_gain_to_split": 0.01} + + grower = TreeGrower(X_binned, all_gradients, all_hessians, + max_bins=n_bins, shrinkage=shrinkage, + min_samples_leaf=1, **stopping_param) + + # The root node is not yet splitted, but the best possible split has + # already been evaluated: + assert grower.root.left_child is None + assert grower.root.right_child is None + + root_split = grower.root.split_info + assert root_split.feature_idx == 0 + assert root_split.bin_idx == n_bins // 2 + assert len(grower.splittable_nodes) == 1 + + # Calling split next applies the next split and computes the best split + # for each of the two newly introduced children nodes. + assert grower.can_split_further() + left_node, right_node = grower.split_next() + + # All training samples have ben splitted in the two nodes, approximately + # 50%/50% + _check_children_consistency(grower.root, left_node, right_node) + assert len(left_node.sample_indices) > 0.4 * n_samples + assert len(left_node.sample_indices) < 0.6 * n_samples + + if grower.min_gain_to_split > 0: + # The left node is too pure: there is no gain to split it further. + assert left_node.split_info.gain < grower.min_gain_to_split + assert left_node in grower.finalized_leaves + + # The right node can still be splitted further, this time on feature #1 + split_info = right_node.split_info + assert split_info.gain > 1. + assert split_info.feature_idx == 1 + assert split_info.bin_idx == n_bins // 3 + assert right_node.left_child is None + assert right_node.right_child is None + + # The right split has not been applied yet. 
Let's do it now: + assert grower.can_split_further() + right_left_node, right_right_node = grower.split_next() + _check_children_consistency(right_node, right_left_node, right_right_node) + assert len(right_left_node.sample_indices) > 0.1 * n_samples + assert len(right_left_node.sample_indices) < 0.2 * n_samples + + assert len(right_right_node.sample_indices) > 0.2 * n_samples + assert len(right_right_node.sample_indices) < 0.4 * n_samples + + # All the leafs are pure, it is not possible to split any further: + assert not grower.can_split_further() + + # Check the values of the leaves: + assert grower.root.left_child.value == approx(shrinkage) + assert grower.root.right_child.left_child.value == approx(shrinkage) + assert grower.root.right_child.right_child.value == approx(-shrinkage) + + +def test_predictor_from_grower(): + # Build a tree on the toy 3-leaf dataset to extract the predictor. + n_bins = 256 + X_binned, all_gradients, all_hessians = _make_training_data( + n_bins=n_bins) + grower = TreeGrower(X_binned, all_gradients, all_hessians, + max_bins=n_bins, shrinkage=1., + max_leaf_nodes=3, min_samples_leaf=5) + grower.grow() + assert grower.n_nodes == 5 # (2 decision nodes + 3 leaves) + + # Check that the node structure can be converted into a predictor + # object to perform predictions at scale + predictor = grower.make_predictor() + assert predictor.nodes.shape[0] == 5 + assert predictor.nodes['is_leaf'].sum() == 3 + + # Probe some predictions for each leaf of the tree + input_data = np.array([ + [0, 0], + [42, 99], + [128, 255], + + [129, 0], + [129, 85], + [255, 85], + + [129, 86], + [129, 255], + [242, 100], + ], dtype=np.uint8) + predictions = predictor.predict_binned(input_data) + expected_targets = [1, 1, 1, 1, 1, 1, -1, -1, -1] + assert_array_almost_equal(predictions, expected_targets, decimal=5) + + # Check that training set can be recovered exactly: + predictions = predictor.predict_binned(X_binned) + assert_array_almost_equal(predictions, -all_gradients, decimal=5) + + +@pytest.mark.parametrize( + 'n_samples, min_samples_leaf, n_bins, constant_hessian, noise', + [ + (11, 10, 7, True, 0), + (13, 10, 42, False, 0), + (56, 10, 255, True, 0.1), + (101, 3, 7, True, 0), + (200, 42, 42, False, 0), + (300, 55, 255, True, 0.1), + (300, 301, 255, True, 0.1), + ] +) +def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins, + constant_hessian, noise): + rng = np.random.RandomState(seed=0) + # data = linear target, 3 features, 1 irrelevant. 
+ X = rng.normal(size=(n_samples, 3)) + y = X[:, 0] - X[:, 1] + if noise: + y_scale = y.std() + y += rng.normal(scale=noise, size=n_samples) * y_scale + mapper = BinMapper(max_bins=n_bins) + X = mapper.fit_transform(X) + + all_gradients = y.astype(np.float32) + if constant_hessian: + all_hessians = np.ones(shape=1, dtype=np.float32) + else: + all_hessians = np.ones_like(all_gradients) + grower = TreeGrower(X, all_gradients, all_hessians, + max_bins=n_bins, shrinkage=1., + min_samples_leaf=min_samples_leaf, + max_leaf_nodes=n_samples) + grower.grow() + predictor = grower.make_predictor( + numerical_thresholds=mapper.numerical_thresholds_) + + if n_samples >= min_samples_leaf: + for node in predictor.nodes: + if node['is_leaf']: + assert node['count'] >= min_samples_leaf + else: + assert predictor.nodes.shape[0] == 1 + assert predictor.nodes[0]['is_leaf'] + assert predictor.nodes[0]['count'] == n_samples + + +@pytest.mark.parametrize('n_samples, min_samples_leaf', [ + (99, 50), + (100, 50)]) +def test_min_samples_leaf_root(n_samples, min_samples_leaf): + # Make sure root node isn't split if n_samples is not at least twice + # min_samples_leaf + rng = np.random.RandomState(seed=0) + + max_bins = 255 + + # data = linear target, 3 features, 1 irrelevant. + X = rng.normal(size=(n_samples, 3)) + y = X[:, 0] - X[:, 1] + mapper = BinMapper(max_bins=max_bins) + X = mapper.fit_transform(X) + + all_gradients = y.astype(np.float32) + all_hessians = np.ones(shape=1, dtype=np.float32) + grower = TreeGrower(X, all_gradients, all_hessians, + max_bins=max_bins, shrinkage=1., + min_samples_leaf=min_samples_leaf, + max_leaf_nodes=n_samples) + grower.grow() + if n_samples >= min_samples_leaf * 2: + assert len(grower.finalized_leaves) >= 2 + else: + assert len(grower.finalized_leaves) == 1 + + +def test_init_parameters_validation(): + + X_binned, all_gradients, all_hessians = _make_training_data() + + X_binned_float = X_binned.astype(np.float32) + assert_raises_regex( + NotImplementedError, + "Explicit feature binning required for now", + TreeGrower, X_binned_float, all_gradients, all_hessians + ) + + X_binned_C_array = np.ascontiguousarray(X_binned) + assert_raises_regex( + ValueError, + "X_binned should be passed as Fortran contiguous array", + TreeGrower, X_binned_C_array, all_gradients, all_hessians + ) + + assert_raises_regex( + ValueError, + "min_gain_to_split=-1 must be positive", + TreeGrower, X_binned, all_gradients, all_hessians, + min_gain_to_split=-1 + ) + + assert_raises_regex( + ValueError, + "min_hessian_to_split=-1 must be positive", + TreeGrower, X_binned, all_gradients, all_hessians, + min_hessian_to_split=-1 + ) diff --git a/sklearn/ensemble/gbm/tests/test_histogram.py b/sklearn/ensemble/gbm/tests/test_histogram.py new file mode 100644 index 0000000000000..5a392371acd75 --- /dev/null +++ b/sklearn/ensemble/gbm/tests/test_histogram.py @@ -0,0 +1,167 @@ +import numpy as np +import pytest + +from numpy.testing import assert_allclose +from numpy.testing import assert_array_equal + +from sklearn.ensemble.gbm.histogram import _build_histogram_naive +from sklearn.ensemble.gbm.histogram import _build_histogram +from sklearn.ensemble.gbm.histogram import _build_histogram_no_hessian +from sklearn.ensemble.gbm.histogram import _build_histogram_root_no_hessian +from sklearn.ensemble.gbm.histogram import _build_histogram_root +from sklearn.ensemble.gbm.histogram import _subtract_histograms + + +@pytest.mark.parametrize( + 'build_func', [_build_histogram_naive, _build_histogram]) +def 
test_build_histogram(build_func): + binned_feature = np.array([0, 2, 0, 1, 2, 0, 2, 1], dtype=np.uint8) + + # Small sample_indices (below unrolling threshold) + ordered_gradients = np.array([0, 1, 3], dtype=np.float32) + ordered_hessians = np.array([1, 1, 2], dtype=np.float32) + + sample_indices = np.array([0, 2, 3], dtype=np.uint32) + hist = build_func(3, sample_indices, binned_feature, + ordered_gradients, ordered_hessians) + assert_array_equal(hist['count'], [2, 1, 0]) + assert_allclose(hist['sum_gradients'], [1, 3, 0]) + assert_allclose(hist['sum_hessians'], [2, 2, 0]) + + # Larger sample_indices (above unrolling threshold) + sample_indices = np.array([0, 2, 3, 6, 7], dtype=np.uint32) + ordered_gradients = np.array([0, 1, 3, 0, 1], dtype=np.float32) + ordered_hessians = np.array([1, 1, 2, 1, 0], dtype=np.float32) + + hist = build_func(3, sample_indices, binned_feature, + ordered_gradients, ordered_hessians) + assert_array_equal(hist['count'], [2, 2, 1]) + assert_allclose(hist['sum_gradients'], [1, 4, 0]) + assert_allclose(hist['sum_hessians'], [2, 2, 1]) + + +def test_histogram_sample_order_independence(): + rng = np.random.RandomState(42) + n_sub_samples = 100 + n_samples = 1000 + n_bins = 256 + + binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=np.uint8) + sample_indices = rng.choice(np.arange(n_samples, dtype=np.uint32), + n_sub_samples, replace=False) + ordered_gradients = rng.randn(n_sub_samples).astype(np.float32) + hist_gc = _build_histogram_no_hessian(n_bins, sample_indices, + binned_feature, ordered_gradients) + + ordered_hessians = rng.exponential(size=n_sub_samples).astype(np.float32) + hist_ghc = _build_histogram(n_bins, sample_indices, binned_feature, + ordered_gradients, ordered_hessians) + + permutation = rng.permutation(n_sub_samples) + hist_gc_perm = _build_histogram_no_hessian( + n_bins, sample_indices[permutation], binned_feature, + ordered_gradients[permutation]) + + hist_ghc_perm = _build_histogram( + n_bins, sample_indices[permutation], binned_feature, + ordered_gradients[permutation], ordered_hessians[permutation]) + + assert_allclose(hist_gc['sum_gradients'], hist_gc_perm['sum_gradients']) + assert_array_equal(hist_gc['count'], hist_gc_perm['count']) + + assert_allclose(hist_ghc['sum_gradients'], hist_ghc_perm['sum_gradients']) + assert_allclose(hist_ghc['sum_hessians'], hist_ghc_perm['sum_hessians']) + assert_array_equal(hist_ghc['count'], hist_ghc_perm['count']) + + +@pytest.mark.parametrize("constant_hessian", [True, False]) +def test_unrolled_equivalent_to_naive(constant_hessian): + # Make sure the different unrolled histogram computations give the same + # results as the naive one. 
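+    # For reference, for every bin b the histograms roughly hold, with
+    # mask = (binned_feature[sample_indices] == b):
+    #     hist['count'][b]         = mask.sum()
+    #     hist['sum_gradients'][b] = ordered_gradients[mask].sum()
+    #     hist['sum_hessians'][b]  = ordered_hessians[mask].sum()
+    # The *_root variants implicitly use all samples, and the *_no_hessian
+    # variants leave 'sum_hessians' at zero, as checked at the end of this
+    # test.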
+ rng = np.random.RandomState(42) + n_samples = 10 + n_bins = 5 + sample_indices = np.arange(n_samples).astype(np.uint32) + binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=np.uint8) + ordered_gradients = rng.randn(n_samples).astype(np.float32) + if constant_hessian: + ordered_hessians = np.ones(n_samples, dtype=np.float32) + else: + ordered_hessians = rng.lognormal(size=n_samples).astype(np.float32) + + hist_gc_root = _build_histogram_root_no_hessian(n_bins, binned_feature, + ordered_gradients) + hist_ghc_root = _build_histogram_root(n_bins, binned_feature, + ordered_gradients, ordered_hessians) + hist_gc = _build_histogram_no_hessian(n_bins, sample_indices, + binned_feature, ordered_gradients) + hist_ghc = _build_histogram(n_bins, sample_indices, binned_feature, + ordered_gradients, ordered_hessians) + + hist_naive = _build_histogram_naive(n_bins, sample_indices, binned_feature, + ordered_gradients, ordered_hessians) + + for hist in (hist_gc_root, hist_ghc_root, hist_gc, hist_gc, hist_ghc): + assert_array_equal(hist['count'], hist_naive['count']) + assert_allclose(hist['sum_gradients'], hist_naive['sum_gradients']) + for hist in (hist_ghc_root, hist_ghc): + assert_allclose(hist['sum_hessians'], hist_naive['sum_hessians']) + for hist in (hist_gc_root, hist_gc): + assert_array_equal(hist['sum_hessians'], np.zeros(n_bins)) + + +@pytest.mark.parametrize("constant_hessian", [True, False]) +def test_hist_subtraction(constant_hessian): + # Make sure the histogram subtraction trick gives the same result as the + # classical method. + rng = np.random.RandomState(42) + n_samples = 10 + n_bins = 5 + sample_indices = np.arange(n_samples).astype(np.uint32) + binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=np.uint8) + ordered_gradients = rng.randn(n_samples).astype(np.float32) + if constant_hessian: + ordered_hessians = np.ones(n_samples, dtype=np.float32) + else: + ordered_hessians = rng.lognormal(size=n_samples).astype(np.float32) + + if constant_hessian: + hist_parent = _build_histogram_no_hessian(n_bins, sample_indices, + binned_feature, + ordered_gradients) + else: + hist_parent = _build_histogram(n_bins, sample_indices, binned_feature, + ordered_gradients, ordered_hessians) + + mask = rng.randint(0, 2, n_samples).astype(np.bool) + + sample_indices_left = sample_indices[mask] + ordered_gradients_left = ordered_gradients[mask] + ordered_hessians_left = ordered_hessians[mask] + if constant_hessian: + hist_left = _build_histogram_no_hessian(n_bins, sample_indices_left, + binned_feature, + ordered_gradients_left) + else: + hist_left = _build_histogram(n_bins, sample_indices_left, + binned_feature, ordered_gradients_left, + ordered_hessians_left) + + sample_indices_right = sample_indices[~mask] + ordered_gradients_right = ordered_gradients[~mask] + ordered_hessians_right = ordered_hessians[~mask] + if constant_hessian: + hist_right = _build_histogram_no_hessian(n_bins, sample_indices_right, + binned_feature, + ordered_gradients_right) + else: + hist_right = _build_histogram(n_bins, sample_indices_right, + binned_feature, ordered_gradients_right, + ordered_hessians_right) + + hist_left_sub = _subtract_histograms(n_bins, hist_parent, hist_right) + hist_right_sub = _subtract_histograms(n_bins, hist_parent, hist_left) + + for key in ('count', 'sum_hessians', 'sum_gradients'): + assert_allclose(hist_left[key], hist_left_sub[key], rtol=1e-6) + assert_allclose(hist_right[key], hist_right_sub[key], rtol=1e-6) diff --git a/sklearn/ensemble/gbm/tests/test_loss.py 
b/sklearn/ensemble/gbm/tests/test_loss.py new file mode 100644 index 0000000000000..07c48f877d234 --- /dev/null +++ b/sklearn/ensemble/gbm/tests/test_loss.py @@ -0,0 +1,191 @@ +import numpy as np +from numpy.testing import assert_almost_equal +from scipy.optimize import newton +from scipy.special import logsumexp +from sklearn.utils import assert_all_finite +import pytest + +from sklearn.ensemble.gbm.loss import _LOSSES + + +def get_derivatives_helper(loss): + """Return get_gradients() and get_hessians() functions for a given loss. + + Loss classes used to have get_gradients() and + get_hessians() methods, but now the update is done inplace in + update_gradient_and_hessians(). This helper is used to keep the tests + almost unchanged. + """ + + def get_gradients(y_true, raw_predictions): + # create gradients and hessians array, update inplace, and return + shape = raw_predictions.shape[0] * raw_predictions.shape[1] + gradients = np.empty(shape=shape, dtype=raw_predictions.dtype) + hessians = np.empty(shape=shape, dtype=raw_predictions.dtype) + loss.update_gradients_and_hessians(gradients, hessians, y_true, + raw_predictions) + + if loss.__class__ is _LOSSES['least_squares']: + gradients *= 2 # ommitted a factor of 2 to be consistent with LGBM + + return gradients + + def get_hessians(y_true, raw_predictions): + # create gradients and hessians array, update inplace, and return + shape = raw_predictions.shape[0] * raw_predictions.shape[1] + gradients = np.empty(shape=shape, dtype=raw_predictions.dtype) + hessians = np.empty(shape=shape, dtype=raw_predictions.dtype) + loss.update_gradients_and_hessians(gradients, hessians, y_true, + raw_predictions) + + if loss.__class__ is _LOSSES['least_squares']: + # hessians aren't updated because they're constant + hessians = np.full_like(y_true, fill_value=2) + + return hessians + + return get_gradients, get_hessians + + +@pytest.mark.parametrize('loss, x0, y_true', [ + ('least_squares', -2., 42), + ('least_squares', 117., 1.05), + ('least_squares', 0., 0.), + ('binary_crossentropy', 0.3, 0), + ('binary_crossentropy', -12, 1), + ('binary_crossentropy', 30, 1), +]) +def test_derivatives(loss, x0, y_true): + # Check that gradients are zero when the loss is minimized on 1D array + # using the Newton-Raphson and the first and second order derivatives + # computed by the Loss instance. + + loss = _LOSSES[loss]() + y_true = np.array([y_true], dtype=np.float32) + x0 = np.array([x0], dtype=np.float32).reshape(1, 1) + get_gradients, get_hessians = get_derivatives_helper(loss) + + def func(x): + return loss(y_true, x) + + def fprime(x): + return get_gradients(y_true, x) + + def fprime2(x): + return get_hessians(y_true, x) + + optimum = newton(func, x0=x0, fprime=fprime, fprime2=fprime2) + assert np.allclose(loss.inverse_link_function(optimum), y_true) + assert np.allclose(loss(y_true, optimum), 0) + assert np.allclose(get_gradients(y_true, optimum), 0) + + +@pytest.mark.parametrize('loss, n_classes, prediction_dim', [ + ('least_squares', 0, 1), + ('binary_crossentropy', 2, 1), + ('categorical_crossentropy', 3, 3), +]) +def test_numerical_gradients(loss, n_classes, prediction_dim): + # Make sure gradients and hessians computed in the loss are correct, by + # comparing with their approximations computed with finite central + # differences. + # See https://en.wikipedia.org/wiki/Finite_difference. 
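+    # Concretely, writing f(x) for the sample-wise loss at raw prediction x,
+    # the approximations used below are the central differences
+    #     f'(x)  ~ (f(x + eps/2) - f(x - eps/2)) / eps
+    #     f''(x) ~ (f(x + eps) + f(x - eps) - 2 * f(x)) / eps ** 2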
+ + rng = np.random.RandomState(0) + n_samples = 100 + if loss == 'least_squares': + y_true = rng.normal(size=n_samples).astype(np.float64) + else: + y_true = rng.randint(0, n_classes, size=n_samples).astype(np.float64) + raw_predictions = rng.normal( + size=(n_samples, prediction_dim) + ).astype(np.float64) + loss = _LOSSES[loss]() + get_gradients, get_hessians = get_derivatives_helper(loss) + + # [:n_samples] to only take gradients and hessians of first tree. + gradients = get_gradients(y_true, raw_predictions)[:n_samples] + hessians = get_hessians(y_true, raw_predictions)[:n_samples] + + # Approximate gradients + # For multiclass loss, we should only change the predictions of one tree + # (here the first), hence the use of offset[:, 0] += eps + # As a softmax is computed, offsetting the whole array by a constant would + # have no effect on the probabilities, and thus on the loss + eps = 1e-9 + offset = np.zeros_like(raw_predictions) + offset[:, 0] = eps + f_plus_eps = loss(y_true, raw_predictions + offset / 2, average=False) + f_minus_eps = loss(y_true, raw_predictions - offset / 2, average=False) + numerical_gradient = (f_plus_eps - f_minus_eps) / eps + numerical_gradient = numerical_gradient + + # Approximate hessians + eps = 1e-4 # need big enough eps as we divide by its square + offset[:, 0] = eps + f_plus_eps = loss(y_true, raw_predictions + offset, average=False) + f_minus_eps = loss(y_true, raw_predictions - offset, average=False) + f = loss(y_true, raw_predictions, average=False) + numerical_hessians = (f_plus_eps + f_minus_eps - 2 * f) / eps**2 + numerical_hessians = numerical_hessians + + def relative_error(a, b): + return np.abs(a - b) / np.maximum(np.abs(a), np.abs(b)) + + assert np.all(relative_error(numerical_gradient, gradients) < 1e-5) + assert np.all(relative_error(numerical_hessians, hessians) < 1e-5) + + +def test_baseline_least_squares(): + rng = np.random.RandomState(0) + + loss = _LOSSES['least_squares']() + y_train = rng.normal(size=100) + baseline_prediction = loss.get_baseline_prediction(y_train, 1) + assert baseline_prediction.shape == tuple() # scalar + # Make sure baseline prediction is the mean of all targets + assert_almost_equal(baseline_prediction, y_train.mean()) + + +def test_baseline_binary_crossentropy(): + rng = np.random.RandomState(0) + + loss = _LOSSES['binary_crossentropy']() + for y_train in (np.zeros(shape=100), np.ones(shape=100)): + y_train = y_train.astype(np.float32) + baseline_prediction = loss.get_baseline_prediction(y_train, 1) + assert_all_finite(baseline_prediction) + assert_almost_equal(loss.inverse_link_function(baseline_prediction), + y_train[0]) + + # Make sure baseline prediction is equal to link_function(p), where p + # is the proba of the positive class. 
We want predict_proba() to return p, + # and by definition + # p = inverse_link_function(raw_prediction) = sigmoid(raw_prediction) + # So we want raw_prediction = link_function(p) = log(p / (1 - p)) + y_train = rng.randint(0, 2, size=100).astype(np.float32) + baseline_prediction = loss.get_baseline_prediction(y_train, 1) + assert baseline_prediction.shape == tuple() # scalar + p = y_train.mean() + assert_almost_equal(baseline_prediction, np.log(p / (1 - p))) + + +def test_baseline_categorical_crossentropy(): + rng = np.random.RandomState(0) + + prediction_dim = 4 + loss = _LOSSES['categorical_crossentropy']() + for y_train in (np.zeros(shape=100), np.ones(shape=100)): + y_train = y_train.astype(np.float32) + baseline_prediction = loss.get_baseline_prediction(y_train, + prediction_dim) + assert_all_finite(baseline_prediction) + + # Same logic as for above test. Here inverse_link_function = softmax and + # link_function = log + y_train = rng.randint(0, prediction_dim + 1, size=100).astype(np.float32) + baseline_prediction = loss.get_baseline_prediction(y_train, prediction_dim) + assert baseline_prediction.shape == (1, prediction_dim) + for k in range(prediction_dim): + p = (y_train == k).mean() + assert_almost_equal(baseline_prediction[:, k], np.log(p)) From e953672283abe08fdc6f82bfb4e8dcf0bc03cb29 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 3 Jan 2019 10:33:04 -0500 Subject: [PATCH 011/247] some optimizations --- gdb_test.py | 2 +- sklearn/ensemble/gbm/binning.pyx | 48 ++-- sklearn/ensemble/gbm/gradient_boosting.py | 8 +- sklearn/ensemble/gbm/histogram.pyx | 16 +- sklearn/ensemble/gbm/loss.pyx | 326 +++++++++++----------- sklearn/ensemble/gbm/splitting.pyx | 9 +- 6 files changed, 198 insertions(+), 211 deletions(-) diff --git a/gdb_test.py b/gdb_test.py index ee94c30ed635b..995b29579df83 100644 --- a/gdb_test.py +++ b/gdb_test.py @@ -9,7 +9,7 @@ import pstats import cProfile -classif = True +classif = False n_classes = 3 n_samples = 100000 max_iter = 5 diff --git a/sklearn/ensemble/gbm/binning.pyx b/sklearn/ensemble/gbm/binning.pyx index 571d26cf9ecb6..eee0f66ef5151 100644 --- a/sklearn/ensemble/gbm/binning.pyx +++ b/sklearn/ensemble/gbm/binning.pyx @@ -1,3 +1,9 @@ +# cython: profile=True +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False +# cython: nonecheck=False +# cython: language_level=3 """ This module contains the BinMapper class. 
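# (The "# cython:" lines added above are module-wide compiler directives:
# cdivision=True gives C division semantics, boundscheck=False removes index
# bounds checks, wraparound=False disables negative indexing, nonecheck=False
# skips None checks on typed arguments, and profile=True keeps profiling
# hooks enabled.)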
@@ -13,6 +19,12 @@ from cython.parallel import prange from sklearn.utils import check_random_state, check_array from sklearn.base import BaseEstimator, TransformerMixin +from .types import X_DTYPE, X_BINNED_DTYPE + + +ctypedef np.npy_float64 NPY_X_DTYPE +ctypedef np.npy_uint8 NPY_X_BINNED_DTYPE + def _find_binning_thresholds(data, max_bins=256, subsample=int(2e5), random_state=None): @@ -32,14 +44,12 @@ def _find_binning_thresholds(data, max_bins=256, subsample=int(2e5), if subsample is not None and data.shape[0] > subsample: subset = rng.choice(np.arange(data.shape[0]), subsample) data = data[subset] - dtype = data.dtype - if dtype.kind != 'f': - dtype = np.float32 + # TODO: DONT USE NEGATIVE INDEXING (see warning when compiling with cython) percentiles = np.linspace(0, 100, num=max_bins + 1)[1:-1] binning_thresholds = [] for f_idx in range(data.shape[1]): - col_data = np.ascontiguousarray(data[:, f_idx], dtype=dtype) + col_data = np.ascontiguousarray(data[:, f_idx], dtype=X_DTYPE) distinct_values = np.unique(col_data) if len(distinct_values) <= max_bins: midpoints = (distinct_values[:-1] + distinct_values[1:]) @@ -51,12 +61,12 @@ def _find_binning_thresholds(data, max_bins=256, subsample=int(2e5), # work and the performance benefit will be limited because we # work on a fixed-size subsample of the full data. midpoints = np.percentile(col_data, percentiles, - interpolation='midpoint').astype(dtype) + interpolation='midpoint').astype(X_DTYPE) binning_thresholds.append(midpoints) - return tuple(binning_thresholds) + return binning_thresholds -cdef _map_to_bins(np.ndarray[np.float_t, ndim=2] data, binning_thresholds): +cdef _map_to_bins(NPY_X_DTYPE [:, :] data, list binning_thresholds, NPY_X_BINNED_DTYPE [:, :] binned): """Bin numerical values to discrete integer-coded levels. Parameters @@ -77,26 +87,15 @@ cdef _map_to_bins(np.ndarray[np.float_t, ndim=2] data, binning_thresholds): # TODO: add support for categorical data encoded as integers # TODO: add support for sparse data (numerical or categorical) cdef: - np.ndarray[np.uint8_t, ndim=2] binned - np.ndarray[np.float32_t, ndim=2] binning_thresholds_ int feature_idx - binned = np.zeros_like(data, dtype=np.uint8, order='F') - - # binning_thresholds = tuple(np.ascontiguousarray(bt, dtype=np.float32) - # for bt in binning_thresholds) - binning_thresholds_ = np.array(binning_thresholds, dtype=np.float32) - for feature_idx in range(data.shape[1]): _map_num_col_to_bins(data[:, feature_idx], - binning_thresholds_[feature_idx], + binning_thresholds[feature_idx], binned[:, feature_idx]) - return binned -@cython.boundscheck(False) # Deactivate bounds checking -@cython.wraparound(False) # Deactivate negative indexing. 
-cdef void _map_num_col_to_bins(double [:] data, float [:] binning_thresholds, unsigned char [:] binned)nogil: +cdef void _map_num_col_to_bins(NPY_X_DTYPE [:] data, NPY_X_DTYPE [:] binning_thresholds, NPY_X_BINNED_DTYPE [:] binned) nogil: """Binary search to the find the bin index for each value in data.""" cdef: int i @@ -104,8 +103,8 @@ cdef void _map_num_col_to_bins(double [:] data, float [:] binning_thresholds, un int right int middle + # for i in range(data.shape[0]): for i in prange(data.shape[0], schedule='static'): - # TODO: add support for missing values (NaN or custom marker) left, right = 0, binning_thresholds.shape[0] while left < right: middle = (right + left - 1) // 2 @@ -162,7 +161,7 @@ class BinMapper(BaseEstimator, TransformerMixin): ------- self : object """ - X = check_array(X) + X = check_array(X, dtype=[X_DTYPE]) self.bin_thresholds_ = _find_binning_thresholds( X, self.max_bins, subsample=self.subsample, random_state=self.random_state) @@ -186,4 +185,7 @@ class BinMapper(BaseEstimator, TransformerMixin): X_binned : array-like The binned data """ - return _map_to_bins(X, binning_thresholds=self.bin_thresholds_) + X = check_array(X, dtype=[X_DTYPE]) + binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order='F') + _map_to_bins(X, self.bin_thresholds_, binned) + return binned diff --git a/sklearn/ensemble/gbm/gradient_boosting.py b/sklearn/ensemble/gbm/gradient_boosting.py index e2746748fd7e8..e0d6b4ddc57ba 100644 --- a/sklearn/ensemble/gbm/gradient_boosting.py +++ b/sklearn/ensemble/gbm/gradient_boosting.py @@ -13,6 +13,7 @@ from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder from ._gradient_boosting import _update_raw_predictions__ +from .types import Y_DTYPE, X_DTYPE from .binning import BinMapper from .grower import TreeGrower @@ -94,7 +95,7 @@ def fit(self, X, y): # TODO: add support for mixed-typed (numerical + categorical) data # TODO: add support for missing data # TODO: add support for pre-binned data (pass-through)? - X, y = check_X_y(X, y, dtype=[np.float32, np.float64]) + X, y = check_X_y(X, y, dtype=[X_DTYPE]) y = self._encode_y(y) if X.shape[0] == 1 or X.shape[1] == 1: raise ValueError( @@ -168,7 +169,6 @@ def fit(self, X, y): shape=(n_samples, self.n_trees_per_iteration_), dtype=self.baseline_prediction_.dtype ) - print(raw_predictions.dtype) raw_predictions += self.baseline_prediction_ # gradients and hessians are 1D arrays of size @@ -527,7 +527,7 @@ def predict(self, X): def _encode_y(self, y): # Just convert y to float32 self.n_trees_per_iteration_ = 1 - y = y.astype(np.float32, copy=False) + y = y.astype(Y_DTYPE, copy=False) return y def _get_loss(self): @@ -672,7 +672,7 @@ def _encode_y(self, y): # only 1 tree for binary classification. For multiclass classification, # we build 1 tree per class. self.n_trees_per_iteration_ = 1 if n_classes <= 2 else n_classes - encoded_y = encoded_y.astype(np.float32, copy=False) + encoded_y = encoded_y.astype(Y_DTYPE, copy=False) return encoded_y def _get_loss(self): diff --git a/sklearn/ensemble/gbm/histogram.pyx b/sklearn/ensemble/gbm/histogram.pyx index c2fc04ad1859c..4426e4b424ffe 100644 --- a/sklearn/ensemble/gbm/histogram.pyx +++ b/sklearn/ensemble/gbm/histogram.pyx @@ -1,3 +1,7 @@ +# cython: profile=True +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False """This module contains njitted routines for building histograms. A histogram is an array with n_bins entry of type HISTOGRAM_DTYPE. 
Each @@ -26,8 +30,6 @@ cdef struct hist_struct: -@cython.boundscheck(False) # Deactivate bounds checking -@cython.wraparound(False) # Deactivate negative indexing. def _build_histogram_naive(unsigned int n_bins, unsigned int [:] sample_indices, unsigned char [:] binned_feature, float [:] ordered_gradients, @@ -48,8 +50,6 @@ def _build_histogram_naive(unsigned int n_bins, unsigned int [:] return histogram -@cython.boundscheck(False) # Deactivate bounds checking -@cython.wraparound(False) # Deactivate negative indexing. def _subtract_histograms(unsigned int n_bins, np.ndarray hist_a, np.ndarray hist_b): """Return hist_a - hist_b""" # print('subtract_hist') @@ -68,8 +68,6 @@ def _subtract_histograms(unsigned int n_bins, np.ndarray hist_a, np.ndarray hist return histogram -@cython.boundscheck(False) # Deactivate bounds checking -@cython.wraparound(False) # Deactivate negative indexing. def _build_histogram(unsigned int n_bins, unsigned int [:] sample_indices, unsigned char [:] binned_feature, float [:] ordered_gradients, @@ -121,8 +119,6 @@ def _build_histogram(unsigned int n_bins, unsigned int [:] return histogram -@cython.boundscheck(False) # Deactivate bounds checking -@cython.wraparound(False) # Deactivate negative indexing. def _build_histogram_no_hessian(unsigned int n_bins, unsigned int [:] sample_indices, unsigned char [:] binned_feature, float [:] ordered_gradients): @@ -173,8 +169,6 @@ def _build_histogram_no_hessian(unsigned int n_bins, unsigned int [:] -@cython.boundscheck(False) # Deactivate bounds checking -@cython.wraparound(False) # Deactivate negative indexing. def _build_histogram_root_no_hessian(unsigned int n_bins, unsigned char [:] binned_feature, float [:]all_gradients): """Special case for the root node @@ -227,8 +221,6 @@ def _build_histogram_root_no_hessian(unsigned int n_bins, unsigned char [:] return histogram -@cython.boundscheck(False) # Deactivate bounds checking -@cython.wraparound(False) # Deactivate negative indexing. def _build_histogram_root(unsigned int n_bins, unsigned char [:] binned_feature, float [:] all_gradients, float[:] all_hessians): diff --git a/sklearn/ensemble/gbm/loss.pyx b/sklearn/ensemble/gbm/loss.pyx index 2d95048f40268..f4a448819c15c 100644 --- a/sklearn/ensemble/gbm/loss.pyx +++ b/sklearn/ensemble/gbm/loss.pyx @@ -1,4 +1,7 @@ # cython: profile=True +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False """ This module contains the loss classes. @@ -14,10 +17,9 @@ cimport numpy as np from scipy.special import expit, logsumexp +from .types import Y_DTYPE -ctypedef fused float_or_double: - float - double +ctypedef np.npy_float32 NPY_Y_DTYPE cdef get_threads_chunks(unsigned int total_size): @@ -70,11 +72,11 @@ class BaseLoss(ABC): is (1,) and the array is initialized to ``1``. 
""" shape = n_samples * prediction_dim - gradients = np.empty(shape=shape, dtype=np.float32) + gradients = np.empty(shape=shape, dtype=Y_DTYPE) if self.hessian_is_constant: - hessians = np.ones(shape=1, dtype=np.float32) + hessians = np.ones(shape=1, dtype=Y_DTYPE) else: - hessians = np.empty(shape=shape, dtype=np.float32) + hessians = np.empty(shape=shape, dtype=Y_DTYPE) return gradients, hessians @@ -152,168 +154,160 @@ class LeastSquares(BaseLoss): raw_predictions) -def _update_gradients_least_squares(float [:] gradients, float [:] y_true, float [:] raw_predictions): +def _update_gradients_least_squares(NPY_Y_DTYPE[:] gradients, NPY_Y_DTYPE[:] y_true, NPY_Y_DTYPE[:] raw_predictions): cdef: unsigned int n_samples unsigned int i - unsigned int thread_idx - unsigned int n_threads - unsigned int [:] starts - unsigned int [:] ends - - n_samples = raw_predictions.shape[0] - starts, ends, n_threads = get_threads_chunks(total_size=n_samples) - for thread_idx in range(n_threads): - for i in range(starts[thread_idx], ends[thread_idx]): - # Note: a more correct exp is 2 * (raw_predictions - y_true) but - # since we use 1 for the constant hessian value (and not 2) this - # is strictly equivalent for the leaves values. - gradients[i] = raw_predictions[i] - y_true[i] - - -class BinaryCrossEntropy(BaseLoss): - """Binary cross-entropy loss, for binary classification. - - For a given sample x_i, the binary cross-entropy loss is defined as the - negative log-likelihood of the model which can be expressed as:: - - loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i - - See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman. - """ - - hessian_is_constant = False - inverse_link_function = staticmethod(expit) - - def __call__(self, y_true, raw_predictions, average=True): - # shape (n_samples, 1) --> (n_samples,). reshape(-1) is more likely to - # return a view. - raw_predictions = raw_predictions.reshape(-1) - # logaddexp(0, x) = log(1 + exp(x)) - loss = np.logaddexp(0, raw_predictions) - y_true * raw_predictions - return loss.mean() if average else loss - - def get_baseline_prediction(self, y_train, prediction_dim): - proba_positive_class = np.mean(y_train) - eps = np.finfo(y_train.dtype).eps - proba_positive_class = np.clip(proba_positive_class, eps, 1 - eps) - # log(x / 1 - x) is the anti function of sigmoid, or the link function - # of the Binomial model. - return np.log(proba_positive_class / (1 - proba_positive_class)) - - def update_gradients_and_hessians(self, gradients, hessians, y_true, - raw_predictions): - raw_predictions = raw_predictions.reshape(-1) - return _update_gradients_hessians_binary_crossentropy( - gradients, hessians, y_true, raw_predictions) - - def predict_proba(self, raw_predictions): - # shape (n_samples, 1) --> (n_samples,). reshape(-1) is more likely to - # return a view. 
- raw_predictions = raw_predictions.reshape(-1) - proba = np.empty((raw_predictions.shape[0], 2), dtype=np.float32) - proba[:, 1] = expit(raw_predictions) - proba[:, 0] = 1 - proba[:, 1] - return proba - - -def _update_gradients_hessians_binary_crossentropy(float [:] gradients, -float [:] hessians, float_or_double [:] y_true, double [:] raw_predictions): - cdef: - unsigned int n_samples - unsigned int i - unsigned int thread_idx - unsigned int n_threads - unsigned int [:] starts - unsigned int [:] ends - n_samples = raw_predictions.shape[0] - starts, ends, n_threads = get_threads_chunks(total_size=n_samples) - for thread_idx in range(n_threads): - for i in range(starts[thread_idx], ends[thread_idx]): - gradients[i] = expit(raw_predictions[i]) - y_true[i] - gradient_abs = np.abs(gradients[i]) - hessians[i] = gradient_abs * (1. - gradient_abs) - - -class CategoricalCrossEntropy(BaseLoss): - """Categorical cross-entropy loss, for multiclass classification. - - For a given sample x_i, the categorical cross-entropy loss is defined as - the negative log-likelihood of the model and generalizes the binary - cross-entropy to more than 2 classes. - """ - - hessian_is_constant = False - - def __call__(self, y_true, raw_predictions, average=True): - one_hot_true = np.zeros_like(raw_predictions) - prediction_dim = raw_predictions.shape[1] - for k in range(prediction_dim): - one_hot_true[:, k] = (y_true == k) - - loss = (logsumexp(raw_predictions, axis=1) - - (one_hot_true * raw_predictions).sum(axis=1)) - return loss.mean() if average else loss - - def get_baseline_prediction(self, y_train, prediction_dim): - init_value = np.zeros( - shape=(1, prediction_dim), - dtype=np.float32 - ) - eps = np.finfo(y_train.dtype).eps - for k in range(prediction_dim): - proba_kth_class = np.mean(y_train == k) - proba_kth_class = np.clip(proba_kth_class, eps, 1 - eps) - init_value[:, k] += np.log(proba_kth_class) - - return init_value - - def update_gradients_and_hessians(self, gradients, hessians, y_true, - raw_predictions): - return _update_gradients_hessians_categorical_crossentropy( - gradients, hessians, y_true, raw_predictions) - - def predict_proba(self, raw_predictions): - # TODO: This could be done in parallel - # compute softmax (using exp(log(softmax))) - return np.exp(raw_predictions - - logsumexp(raw_predictions, axis=1)[:, np.newaxis]) - - -def _update_gradients_hessians_categorical_crossentropy( - float [:] gradients, float [:] hessians, float_or_double [:] y_true, - float_or_double [:, :] raw_predictions): - # Here gradients and hessians are of shape - # (n_samples * prediction_dim,). - # y_true is of shape (n_samples,). - # raw_predictions is of shape (n_samples, raw_predictions) - cdef: - unsigned int n_samples - unsigned int prediction_dim - unsigned int i - unsigned int k - unsigned int thread_idx - unsigned int n_threads - unsigned int [:] starts - unsigned int [:] ends - float p_k n_samples = raw_predictions.shape[0] - prediction_dim = raw_predictions.shape[1] - starts, ends, n_threads = get_threads_chunks(total_size=n_samples) - for k in range(prediction_dim): - gradients_at_k = gradients[n_samples * k:n_samples * (k + 1)] - hessians_at_k = hessians[n_samples * k:n_samples * (k + 1)] - for thread_idx in range(n_threads): - for i in range(starts[thread_idx], ends[thread_idx]): - # p_k is the probability that class(ith sample) == k. - # This is a regular softmax. 
- p_k = np.exp(raw_predictions[i, k] - - logsumexp(raw_predictions[i, :])) - gradients_at_k[i] = p_k - (y_true[i] == k) - hessians_at_k[i] = p_k * (1. - p_k) - - -_LOSSES = {'least_squares': LeastSquares, - 'binary_crossentropy': BinaryCrossEntropy, - 'categorical_crossentropy': CategoricalCrossEntropy} + for i in range(n_samples): + # Note: a more correct exp is 2 * (raw_predictions - y_true) but + # since we use 1 for the constant hessian value (and not 2) this + # is strictly equivalent for the leaves values. + gradients[i] = raw_predictions[i] - y_true[i] + + +## class BinaryCrossEntropy(BaseLoss): +## """Binary cross-entropy loss, for binary classification. +## +## For a given sample x_i, the binary cross-entropy loss is defined as the +## negative log-likelihood of the model which can be expressed as:: +## +## loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i +## +## See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman. +## """ +## +## hessian_is_constant = False +## inverse_link_function = staticmethod(expit) +## +## def __call__(self, y_true, raw_predictions, average=True): +## # shape (n_samples, 1) --> (n_samples,). reshape(-1) is more likely to +## # return a view. +## raw_predictions = raw_predictions.reshape(-1) +## # logaddexp(0, x) = log(1 + exp(x)) +## loss = np.logaddexp(0, raw_predictions) - y_true * raw_predictions +## return loss.mean() if average else loss +## +## def get_baseline_prediction(self, y_train, prediction_dim): +## proba_positive_class = np.mean(y_train) +## eps = np.finfo(y_train.dtype).eps +## proba_positive_class = np.clip(proba_positive_class, eps, 1 - eps) +## # log(x / 1 - x) is the anti function of sigmoid, or the link function +## # of the Binomial model. +## return np.log(proba_positive_class / (1 - proba_positive_class)) +## +## def update_gradients_and_hessians(self, gradients, hessians, y_true, +## raw_predictions): +## raw_predictions = raw_predictions.reshape(-1) +## return _update_gradients_hessians_binary_crossentropy( +## gradients, hessians, y_true, raw_predictions) +## +## def predict_proba(self, raw_predictions): +## # shape (n_samples, 1) --> (n_samples,). reshape(-1) is more likely to +## # return a view. +## raw_predictions = raw_predictions.reshape(-1) +## proba = np.empty((raw_predictions.shape[0], 2), dtype=np.float32) +## proba[:, 1] = expit(raw_predictions) +## proba[:, 0] = 1 - proba[:, 1] +## return proba +## +## +## def _update_gradients_hessians_binary_crossentropy(float [:] gradients, +## float [:] hessians, float_or_double [:] y_true, double [:] raw_predictions): +## cdef: +## unsigned int n_samples +## unsigned int i +## unsigned int thread_idx +## unsigned int n_threads +## unsigned int [:] starts +## unsigned int [:] ends +## n_samples = raw_predictions.shape[0] +## starts, ends, n_threads = get_threads_chunks(total_size=n_samples) +## for thread_idx in range(n_threads): +## for i in range(starts[thread_idx], ends[thread_idx]): +## gradients[i] = expit(raw_predictions[i]) - y_true[i] +## gradient_abs = np.abs(gradients[i]) +## hessians[i] = gradient_abs * (1. - gradient_abs) +## +## +## class CategoricalCrossEntropy(BaseLoss): +## """Categorical cross-entropy loss, for multiclass classification. +## +## For a given sample x_i, the categorical cross-entropy loss is defined as +## the negative log-likelihood of the model and generalizes the binary +## cross-entropy to more than 2 classes. 
+## """ +## +## hessian_is_constant = False +## +## def __call__(self, y_true, raw_predictions, average=True): +## one_hot_true = np.zeros_like(raw_predictions) +## prediction_dim = raw_predictions.shape[1] +## for k in range(prediction_dim): +## one_hot_true[:, k] = (y_true == k) +## +## loss = (logsumexp(raw_predictions, axis=1) - +## (one_hot_true * raw_predictions).sum(axis=1)) +## return loss.mean() if average else loss +## +## def get_baseline_prediction(self, y_train, prediction_dim): +## init_value = np.zeros( +## shape=(1, prediction_dim), +## dtype=np.float32 +## ) +## eps = np.finfo(y_train.dtype).eps +## for k in range(prediction_dim): +## proba_kth_class = np.mean(y_train == k) +## proba_kth_class = np.clip(proba_kth_class, eps, 1 - eps) +## init_value[:, k] += np.log(proba_kth_class) +## +## return init_value +## +## def update_gradients_and_hessians(self, gradients, hessians, y_true, +## raw_predictions): +## return _update_gradients_hessians_categorical_crossentropy( +## gradients, hessians, y_true, raw_predictions) +## +## def predict_proba(self, raw_predictions): +## # TODO: This could be done in parallel +## # compute softmax (using exp(log(softmax))) +## return np.exp(raw_predictions - +## logsumexp(raw_predictions, axis=1)[:, np.newaxis]) +## +## +## def _update_gradients_hessians_categorical_crossentropy( +## float [:] gradients, float [:] hessians, float_or_double [:] y_true, +## float_or_double [:, :] raw_predictions): +## # Here gradients and hessians are of shape +## # (n_samples * prediction_dim,). +## # y_true is of shape (n_samples,). +## # raw_predictions is of shape (n_samples, raw_predictions) +## cdef: +## unsigned int n_samples +## unsigned int prediction_dim +## unsigned int i +## unsigned int k +## unsigned int thread_idx +## unsigned int n_threads +## unsigned int [:] starts +## unsigned int [:] ends +## float p_k +## +## n_samples = raw_predictions.shape[0] +## prediction_dim = raw_predictions.shape[1] +## starts, ends, n_threads = get_threads_chunks(total_size=n_samples) +## for k in range(prediction_dim): +## gradients_at_k = gradients[n_samples * k:n_samples * (k + 1)] +## hessians_at_k = hessians[n_samples * k:n_samples * (k + 1)] +## for thread_idx in range(n_threads): +## for i in range(starts[thread_idx], ends[thread_idx]): +## # p_k is the probability that class(ith sample) == k. +## # This is a regular softmax. +## p_k = np.exp(raw_predictions[i, k] - +## logsumexp(raw_predictions[i, :])) +## gradients_at_k[i] = p_k - (y_true[i] == k) +## hessians_at_k[i] = p_k * (1. - p_k) + + +_LOSSES = {'least_squares': LeastSquares} diff --git a/sklearn/ensemble/gbm/splitting.pyx b/sklearn/ensemble/gbm/splitting.pyx index 840b2fbb3a8d1..62961d66ab26b 100644 --- a/sklearn/ensemble/gbm/splitting.pyx +++ b/sklearn/ensemble/gbm/splitting.pyx @@ -1,3 +1,7 @@ +# cython: profile=True +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False """This module contains njitted routines and data structures to: - Find the best possible split of a node. For a given node, a split is @@ -199,8 +203,6 @@ cdef class SplittingContext: self.right_indices_buffer = np.empty_like(self.partition) -@cython.boundscheck(False) # Deactivate bounds checking -@cython.wraparound(False) # Deactivate negative indexing. 
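# (The two decorators removed above are redundant now that boundscheck=False
# and wraparound=False are set file-wide by the "# cython:" directives added
# at the top of this module.)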
def split_indices(SplittingContext context, SplitInfo split_info, unsigned int [:] sample_indices): cdef: unsigned int n_samples = sample_indices.shape[0] @@ -463,8 +465,6 @@ cdef _find_histogram_split_subtraction(SplittingContext context, unsigned int fe n_samples) -@cython.boundscheck(False) # Deactivate bounds checking -@cython.wraparound(False) # Deactivate negative indexing. cdef _find_best_bin_to_split_helper(SplittingContext context, unsigned int feature_idx, hist_struct [:] histogram, unsigned int n_samples): @@ -569,7 +569,6 @@ cdef inline float _split_gain(float gradient_left, float hessian_left, float gra gain -= negative_loss(sum_gradients, sum_hessians, l2_regularization) return gain -@cython.cdivision(True) cdef inline float negative_loss(float gradient, float hessian, float l2_regularization) nogil: return (gradient * gradient) / (hessian + l2_regularization) From dfe7a65582c2f0bd9f6599321c8f56c95b7d7a1d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 3 Jan 2019 16:15:56 -0500 Subject: [PATCH 012/247] Cleaned Histogram computation code - everything is no-python and nogil - histogram tests are passing - made benchmark (commited for now): consistently slower than pygbm --- bench_hist.py | 145 ++++++++ gdb_test.py | 47 ++- sklearn/ensemble/gbm/binning.pyx | 7 +- sklearn/ensemble/gbm/histogram.pyx | 329 +++++++++---------- sklearn/ensemble/gbm/splitting.pyx | 30 +- sklearn/ensemble/gbm/tests/test_histogram.py | 106 +++--- sklearn/ensemble/gbm/types.py | 12 + 7 files changed, 421 insertions(+), 255 deletions(-) create mode 100644 bench_hist.py create mode 100644 sklearn/ensemble/gbm/types.py diff --git a/bench_hist.py b/bench_hist.py new file mode 100644 index 0000000000000..7ef6822555325 --- /dev/null +++ b/bench_hist.py @@ -0,0 +1,145 @@ +""" +Compare histogram building function with pygbm. + +run with +export OMP_NUM_THREADS=1 && make in && python bench_hist.py + +might be a bit unfair to cython code since we're calling the python versions of +the cpdef functions, which causes unnecessary conversions. 
+""" +from time import time +from collections import defaultdict + +import matplotlib.pyplot as plt +import numpy as np +from joblib import Memory +from pygbm.histogram import _build_histogram_naive as pygbm_build_histogram_naive +from pygbm.histogram import _build_histogram as pygbm_build_histogram +from pygbm.histogram import _build_histogram_no_hessian as pygbm_build_histogram_no_hessian +from pygbm.histogram import _build_histogram_root as pygbm_build_histogram_root +from pygbm.histogram import _build_histogram_root_no_hessian as pygbm_build_histogram_root_no_hessian +from pygbm.histogram import _subtract_histograms as pygbm_subtract_histograms + +from sklearn.ensemble.gbm.histogram import _build_histogram_naive +from sklearn.ensemble.gbm.histogram import _build_histogram +from sklearn.ensemble.gbm.histogram import _build_histogram_no_hessian +from sklearn.ensemble.gbm.histogram import _build_histogram_root +from sklearn.ensemble.gbm.histogram import _build_histogram_root_no_hessian +from sklearn.ensemble.gbm.histogram import _subtract_histograms +from sklearn.ensemble.gbm.types import HISTOGRAM_DTYPE + + +m = Memory(location='/tmp') + +@m.cache +def make_data(n_bins=256, n_samples=int(1e8), loss_dtype=np.float32, + binned_feature_dtype=np.uint8, seed=42): + rng = np.random.RandomState(seed) + + sample_indices = np.arange(n_samples, dtype=np.uint32) + ordered_gradients = rng.randn(n_samples).astype(loss_dtype) + ordered_hessians = rng.exponential(size=n_samples).astype(loss_dtype) + binned_feature = rng.randint(0, n_bins, size=n_samples, dtype=np.uint8) + return sample_indices, binned_feature, ordered_gradients, ordered_hessians + + +n_bins = 256 +print(f"Compiling pygbm...") +sample_indices, binned_feature, gradients, hessians = make_data( + n_bins, n_samples=10) +tic = time() +a = pygbm_build_histogram_naive(n_bins, sample_indices, binned_feature, gradients, hessians) +b = pygbm_build_histogram(n_bins, sample_indices, binned_feature, gradients, hessians) +pygbm_subtract_histograms(n_bins, a, b) +pygbm_build_histogram_no_hessian(n_bins, sample_indices, binned_feature, gradients) +pygbm_build_histogram_root(n_bins, binned_feature, gradients, hessians) +pygbm_build_histogram_root_no_hessian(n_bins, binned_feature, gradients) +toc = time() +duration = toc - tic +print(f"done in {duration:.3f}s") + +def one_run(sklearn_fun, pygbm_fun): + print('-' * 10) + print(sklearn_fun.__name__) + + if 'subtract' in sklearn_fun.__name__: + # specal case for subtract... 
crappy + a = pygbm_build_histogram(n_bins, sample_indices, binned_feature, gradients, hessians) + b = pygbm_build_histogram(n_bins, sample_indices, binned_feature, gradients, hessians) + histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + + args = [n_bins, a, b] + tic = time() + pygbm_fun(*args) + pygbm_duration = time() - tic + print(f"pygbm: Built in {pygbm_duration:.3f}s") + + tic = time() + args.append(histogram) + sklearn_fun(*args) + sklearn_duration = time() - tic + print(f"sklearn: Built in {sklearn_duration:.3f}s") + + else: + args = [n_bins] + if not 'root' in sklearn_fun.__name__: + args.append(sample_indices) + args += [binned_feature, gradients, hessians] + if 'no_hessian' in sklearn_fun.__name__: + args.pop() + + tic = time() + pygbm_fun(*args) + pygbm_duration = time() - tic + print(f"pygbm: Built in {pygbm_duration:.3f}s") + + tic = time() + histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + args.append(histogram) + sklearn_fun(*args) + sklearn_duration = time() - tic + print(f"sklearn: Built in {sklearn_duration:.3f}s") + + return sklearn_duration, pygbm_duration + +n_exp = 10 +n_samples_list = [10**x for x in range(2, 9)] + + +n_rows = 3 +n_cols = 2 +fig, axs = plt.subplots(n_rows, n_cols, sharex=True) + +for i, (sklearn_fun, pygbm_fun) in enumerate(( + (_build_histogram_naive, pygbm_build_histogram_naive), + (_build_histogram, pygbm_build_histogram), + (_build_histogram_no_hessian, pygbm_build_histogram_no_hessian), + (_build_histogram_root, pygbm_build_histogram_root), + (_build_histogram_root_no_hessian, pygbm_build_histogram_root_no_hessian), + (_subtract_histograms, pygbm_subtract_histograms))): + + row = i // n_cols + col = i % n_cols + ax = axs[row][col] + + durations = defaultdict(lambda: defaultdict(list)) + for n_samples in n_samples_list: + sample_indices, binned_feature, gradients, hessians = make_data( + n_bins, n_samples) + for _ in range(n_exp): + sklearn_duration, pygbm_duration = one_run(sklearn_fun, pygbm_fun) + durations[n_samples]['sklearn'].append(sklearn_duration) + durations[n_samples]['pygbm'].append(pygbm_duration) + + sklearn_avgs = [np.mean(durations[n_samples]['sklearn']) for n_samples in n_samples_list] + sklearn_stds = [np.std(durations[n_samples]['sklearn']) for n_samples in n_samples_list] + ax.errorbar(n_samples_list, sklearn_avgs, yerr=sklearn_stds, label='PR') + + pygbm_avgs = [np.mean(durations[n_samples]['pygbm']) for n_samples in n_samples_list] + pygbm_stds = [np.std(durations[n_samples]['pygbm']) for n_samples in n_samples_list] + ax.errorbar(n_samples_list, pygbm_avgs, yerr=pygbm_stds, label='pygbm') + ax.set_xscale('log') + ax.set_title(sklearn_fun.__name__) + ax.legend() +fig.suptitle(f'Avg histogram computation time over {n_exp} runs\nfor different sample sizes') +plt.show() diff --git a/gdb_test.py b/gdb_test.py index 995b29579df83..d7f3e0c6b24c4 100644 --- a/gdb_test.py +++ b/gdb_test.py @@ -8,8 +8,9 @@ import pstats import cProfile +import pygbm -classif = False +classif = False n_classes = 3 n_samples = 100000 max_iter = 5 @@ -18,28 +19,46 @@ X, y = make_classification(n_samples=n_samples, random_state=0, n_classes=n_classes, n_clusters_per_class=1) GBM = GBMClassifier GBDT = GradientBoostingClassifier + PYGBM_GBM = pygbm.GradientBoostingClassifier else: X, y = make_regression(n_samples=n_samples, random_state=0) GBM = GBMRegressor GBDT = GradientBoostingRegressor + PYGBM_GBM = pygbm.GradientBoostingRegressor +pygbm_est = PYGBM_GBM( + max_iter=max_iter, + scoring=None, # no early stopping + validation_split=None, + 
random_state=0, + verbose=False) +print("compiling pygbm code") +pygbm_est.fit(X[:1000], y[:1000]) +print("done") + +gbm = GBM( + max_iter=max_iter, + scoring=None, # no early stopping + validation_split=None, + n_iter_no_change=None, + random_state=0, + verbose=True) tic = time() -gbm = GBM(max_iter=max_iter, - scoring=None, # no early stopping - validation_split=None, - n_iter_no_change=None, - random_state=0, - verbose=True) -# gbm.fit(X, y) -# print(f'score: {gbm.score(X, y)}') -# duration = time() - tic -# print(f'Took {duration:.3f}s\n') +gbm.fit(X, y) +fit_duration = time() - tic +print(f'sklearn gbm fit_duration: {fit_duration:.3f}s\n') -cProfile.runctx("gbm.fit(X, y).predict(X)", globals(), locals(), "Profile.prof") -s = pstats.Stats("Profile.prof") -s.strip_dirs().sort_stats("time").print_stats(.2) +pygbm_est.set_params(verbose=True) +tic = time() +pygbm_est.fit(X, y) +fit_duration = time() - tic +print(f'pygbm fit_duration: {fit_duration:.3f}s\n') + +# cProfile.runctx("gbm.fit(X, y).predict(X)", globals(), locals(), "Profile.prof") +# s = pstats.Stats("Profile.prof") +# s.strip_dirs().sort_stats("time").print_stats(.2) # tic = time() # gbdt = GBDT(n_estimators=max_iter, diff --git a/sklearn/ensemble/gbm/binning.pyx b/sklearn/ensemble/gbm/binning.pyx index eee0f66ef5151..8bb38e04fe75c 100644 --- a/sklearn/ensemble/gbm/binning.pyx +++ b/sklearn/ensemble/gbm/binning.pyx @@ -66,7 +66,8 @@ def _find_binning_thresholds(data, max_bins=256, subsample=int(2e5), return binning_thresholds -cdef _map_to_bins(NPY_X_DTYPE [:, :] data, list binning_thresholds, NPY_X_BINNED_DTYPE [:, :] binned): +cdef _map_to_bins(NPY_X_DTYPE [:, :] data, list binning_thresholds, + NPY_X_BINNED_DTYPE [::1, :] binned): """Bin numerical values to discrete integer-coded levels. Parameters @@ -95,7 +96,9 @@ cdef _map_to_bins(NPY_X_DTYPE [:, :] data, list binning_thresholds, NPY_X_BINNED binned[:, feature_idx]) -cdef void _map_num_col_to_bins(NPY_X_DTYPE [:] data, NPY_X_DTYPE [:] binning_thresholds, NPY_X_BINNED_DTYPE [:] binned) nogil: +cdef void _map_num_col_to_bins(NPY_X_DTYPE [:] data, + NPY_X_DTYPE [:] binning_thresholds, + NPY_X_BINNED_DTYPE [:] binned) nogil: """Binary search to the find the bin index for each value in data.""" cdef: int i diff --git a/sklearn/ensemble/gbm/histogram.pyx b/sklearn/ensemble/gbm/histogram.pyx index 4426e4b424ffe..ce180dd6206bf 100644 --- a/sklearn/ensemble/gbm/histogram.pyx +++ b/sklearn/ensemble/gbm/histogram.pyx @@ -1,7 +1,7 @@ -# cython: profile=True # cython: cdivision=True # cython: boundscheck=False # cython: wraparound=False +# cython: language_level=3 """This module contains njitted routines for building histograms. A histogram is an array with n_bins entry of type HISTOGRAM_DTYPE. 
Each @@ -13,81 +13,70 @@ cimport cython import numpy as np cimport numpy as np +from .types import HISTOGRAM_DTYPE -HISTOGRAM_DTYPE = np.dtype([ - ('sum_gradients', np.float32), - ('sum_hessians', np.float32), - ('count', np.uint32), -]) +ctypedef np.npy_uint8 NPY_X_BINNED_DTYPE +ctypedef np.npy_float32 NPY_Y_DTYPE -from libc.stdlib cimport malloc, free - -cdef struct hist_struct: +cdef packed struct hist_struct: float sum_gradients float sum_hessians unsigned int count - -def _build_histogram_naive(unsigned int n_bins, unsigned int [:] - sample_indices, unsigned char [:] - binned_feature, float [:] ordered_gradients, - float[:] ordered_hessians): +cpdef void _build_histogram_naive(unsigned int n_bins, + unsigned int [:] sample_indices, + NPY_X_BINNED_DTYPE [:] binned_feature, + NPY_Y_DTYPE [:] ordered_gradients, + NPY_Y_DTYPE [:] ordered_hessians, + hist_struct [:] out) nogil: """Build histogram in a naive way, without optimizing for cache hit.""" - histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) cdef: - hist_struct [:] view = histogram unsigned int i + unsigned int n_samples = sample_indices.shape[0] unsigned int sample_idx - unsigned char bin_idx + unsigned int bin_idx - for i, sample_idx in enumerate(sample_indices): + for i in range(n_samples): + sample_idx = sample_indices[i] bin_idx = binned_feature[sample_idx] - view[bin_idx].sum_gradients += ordered_gradients[i] - view[bin_idx].sum_hessians += ordered_hessians[i] - view[bin_idx].count += 1 - return histogram + out[bin_idx].sum_gradients += ordered_gradients[i] + out[bin_idx].sum_hessians += ordered_hessians[i] + out[bin_idx].count += 1 -def _subtract_histograms(unsigned int n_bins, np.ndarray hist_a, np.ndarray hist_b): +cpdef void _subtract_histograms(unsigned int n_bins, + hist_struct [:] hist_a, + hist_struct [:] hist_b, + hist_struct [:] out) nogil: """Return hist_a - hist_b""" - # print('subtract_hist') - - cdef unsigned int i = 0 - cdef np.ndarray histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) - cdef hist_struct [:] view = histogram - cdef hist_struct [:] view_a = hist_a - cdef hist_struct [:] view_b = hist_b + cdef: + unsigned int i = 0 for i in range(n_bins): - view[i].sum_gradients = view_a[i].sum_gradients - view_b[i].sum_gradients - view[i].sum_hessians = view_a[i].sum_hessians - view_b[i].sum_hessians - view[i].count = view_a[i].count - view_b[i].count - - return histogram + out[i].sum_gradients = hist_a[i].sum_gradients - hist_b[i].sum_gradients + out[i].sum_hessians = hist_a[i].sum_hessians - hist_b[i].sum_hessians + out[i].count = hist_a[i].count - hist_b[i].count -def _build_histogram(unsigned int n_bins, unsigned int [:] - sample_indices, unsigned char [:] - binned_feature, float [:] ordered_gradients, - float[:] ordered_hessians): +cpdef void _build_histogram(unsigned int n_bins, + unsigned int [:] sample_indices, + NPY_X_BINNED_DTYPE [:] binned_feature, + NPY_Y_DTYPE [:] ordered_gradients, + NPY_Y_DTYPE [:] ordered_hessians, + hist_struct [:] out) nogil: """Return histogram for a given feature.""" - cdef np.ndarray histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) - cdef hist_struct [:] view = histogram - cdef int i = 0 - - cdef float [:] ordered_gradients_view = ordered_gradients - cdef float [:] ordered_hessians_view = ordered_hessians - - cdef int n_node_samples = sample_indices.shape[0] - cdef int unrolled_upper = (n_node_samples // 4) * 4 + cdef: + unsigned int i = 0 + unsigned int n_node_samples = sample_indices.shape[0] + unsigned int unrolled_upper = (n_node_samples // 4) * 4 - cdef unsigned 
int bin_0 - cdef unsigned int bin_1 - cdef unsigned int bin_2 - cdef unsigned int bin_3 - cdef unsigned int bin_idx + unsigned int bin_0 + unsigned int bin_1 + unsigned int bin_2 + unsigned int bin_3 + unsigned int bin_idx for i in range(0, unrolled_upper, 4): bin_0 = binned_feature[sample_indices[i]] @@ -95,82 +84,73 @@ def _build_histogram(unsigned int n_bins, unsigned int [:] bin_2 = binned_feature[sample_indices[i + 2]] bin_3 = binned_feature[sample_indices[i + 3]] - view[bin_0].sum_gradients += ordered_gradients_view[i] - view[bin_1].sum_gradients += ordered_gradients_view[i + 1] - view[bin_2].sum_gradients += ordered_gradients_view[i + 2] - view[bin_3].sum_gradients += ordered_gradients_view[i + 3] + out[bin_0].sum_gradients += ordered_gradients[i] + out[bin_1].sum_gradients += ordered_gradients[i + 1] + out[bin_2].sum_gradients += ordered_gradients[i + 2] + out[bin_3].sum_gradients += ordered_gradients[i + 3] - view[bin_0].sum_hessians += ordered_hessians_view[i] - view[bin_1].sum_hessians += ordered_hessians_view[i + 1] - view[bin_2].sum_hessians += ordered_hessians_view[i + 2] - view[bin_3].sum_hessians += ordered_hessians_view[i + 3] + out[bin_0].sum_hessians += ordered_hessians[i] + out[bin_1].sum_hessians += ordered_hessians[i + 1] + out[bin_2].sum_hessians += ordered_hessians[i + 2] + out[bin_3].sum_hessians += ordered_hessians[i + 3] - view[bin_0].count += 1 - view[bin_1].count += 1 - view[bin_2].count += 1 - view[bin_3].count += 1 + out[bin_0].count += 1 + out[bin_1].count += 1 + out[bin_2].count += 1 + out[bin_3].count += 1 for i in range(unrolled_upper, n_node_samples): bin_idx = binned_feature[sample_indices[i]] - view[bin_idx].sum_gradients += ordered_gradients_view[i] - view[bin_idx].sum_hessians += ordered_hessians_view[i] - view[bin_idx].count += 1 - - return histogram - + out[bin_idx].sum_gradients += ordered_gradients[i] + out[bin_idx].sum_hessians += ordered_hessians[i] + out[bin_idx].count += 1 -def _build_histogram_no_hessian(unsigned int n_bins, unsigned int [:] - sample_indices, unsigned char [:] - binned_feature, float [:] ordered_gradients): - """Return histogram for a given feature. - Hessians are not updated (used when hessians are constant). 
- """ - # print('build_hist_no_hessian') - cdef np.ndarray histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) - cdef hist_struct [:] view = histogram - cdef unsigned int i = 0 - - cdef float [:] ordered_gradients_view = ordered_gradients - cdef unsigned char [:] binned_feature_view = binned_feature - cdef unsigned int [:] sample_indices_view = sample_indices - - cdef unsigned int n_node_samples = sample_indices.shape[0] - cdef unsigned int unrolled_upper = (n_node_samples // 4) * 4 +cpdef void _build_histogram_no_hessian( + unsigned int n_bins, + unsigned int [:] sample_indices, + NPY_X_BINNED_DTYPE [:] binned_feature, + NPY_Y_DTYPE [:] ordered_gradients, + hist_struct [:] out) nogil: + """Return histogram for a given feature.""" + cdef: + unsigned int i = 0 + unsigned int n_node_samples = sample_indices.shape[0] + unsigned int unrolled_upper = (n_node_samples // 4) * 4 - cdef unsigned int bin_0 - cdef unsigned int bin_1 - cdef unsigned int bin_2 - cdef unsigned int bin_3 - cdef unsigned int bin_idx + unsigned int bin_0 + unsigned int bin_1 + unsigned int bin_2 + unsigned int bin_3 + unsigned int bin_idx for i in range(0, unrolled_upper, 4): - bin_0 = binned_feature_view[sample_indices_view[i]] - bin_1 = binned_feature_view[sample_indices_view[i + 1]] - bin_2 = binned_feature_view[sample_indices_view[i + 2]] - bin_3 = binned_feature_view[sample_indices_view[i + 3]] + bin_0 = binned_feature[sample_indices[i]] + bin_1 = binned_feature[sample_indices[i + 1]] + bin_2 = binned_feature[sample_indices[i + 2]] + bin_3 = binned_feature[sample_indices[i + 3]] - view[bin_0].sum_gradients += ordered_gradients_view[i] - view[bin_1].sum_gradients += ordered_gradients_view[i + 1] - view[bin_2].sum_gradients += ordered_gradients_view[i + 2] - view[bin_3].sum_gradients += ordered_gradients_view[i + 3] + out[bin_0].sum_gradients += ordered_gradients[i] + out[bin_1].sum_gradients += ordered_gradients[i + 1] + out[bin_2].sum_gradients += ordered_gradients[i + 2] + out[bin_3].sum_gradients += ordered_gradients[i + 3] - view[bin_0].count += 1 - view[bin_1].count += 1 - view[bin_2].count += 1 - view[bin_3].count += 1 + out[bin_0].count += 1 + out[bin_1].count += 1 + out[bin_2].count += 1 + out[bin_3].count += 1 for i in range(unrolled_upper, n_node_samples): - bin_idx = binned_feature_view[sample_indices_view[i]] - view[bin_idx].sum_gradients += ordered_gradients_view[i] - view[bin_idx].count += 1 - - return histogram - + bin_idx = binned_feature[sample_indices[i]] + out[bin_idx].sum_gradients += ordered_gradients[i] + out[bin_idx].count += 1 -def _build_histogram_root_no_hessian(unsigned int n_bins, unsigned char [:] - binned_feature, float [:]all_gradients): +cpdef void _build_histogram_root_no_hessian( + unsigned int n_bins, + NPY_X_BINNED_DTYPE [:] binned_feature, + NPY_Y_DTYPE [:] all_gradients, + hist_struct [:] out) nogil: """Special case for the root node The root node has to find the split among all the samples from the @@ -179,95 +159,86 @@ def _build_histogram_root_no_hessian(unsigned int n_bins, unsigned char [:] Hessians are not updated (used when hessians are constant) """ - # print('build_hist_root_no_hessian') - - cdef np.ndarray histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) - cdef hist_struct [:] view = histogram - cdef unsigned int i = 0 - - cdef float [:] all_gradients_view = all_gradients - cdef unsigned char [:] binned_feature_view = binned_feature - - cdef unsigned int n_node_samples = binned_feature.shape[0] - cdef unsigned int unrolled_upper = (n_node_samples // 4) * 4 + cdef: + 
unsigned int i = 0 + unsigned int n_samples = binned_feature.shape[0] + unsigned int unrolled_upper = (n_samples // 4) * 4 - cdef unsigned int bin_0 - cdef unsigned int bin_1 - cdef unsigned int bin_2 - cdef unsigned int bin_3 - cdef unsigned int bin_idx + unsigned int bin_0 + unsigned int bin_1 + unsigned int bin_2 + unsigned int bin_3 + unsigned int bin_idx for i in range(0, unrolled_upper, 4): - bin_0 = binned_feature_view[i] - bin_1 = binned_feature_view[i + 1] - bin_2 = binned_feature_view[i + 2] - bin_3 = binned_feature_view[i + 3] - - view[bin_0].sum_gradients += all_gradients_view[i] - view[bin_1].sum_gradients += all_gradients_view[i + 1] - view[bin_2].sum_gradients += all_gradients_view[i + 2] - view[bin_3].sum_gradients += all_gradients_view[i + 3] + bin_0 = binned_feature[i] + bin_1 = binned_feature[i + 1] + bin_2 = binned_feature[i + 2] + bin_3 = binned_feature[i + 3] - view[bin_0].count += 1 - view[bin_1].count += 1 - view[bin_2].count += 1 - view[bin_3].count += 1 + out[bin_0].sum_gradients += all_gradients[i] + out[bin_1].sum_gradients += all_gradients[i + 1] + out[bin_2].sum_gradients += all_gradients[i + 2] + out[bin_3].sum_gradients += all_gradients[i + 3] - for i in range(unrolled_upper, n_node_samples): - bin_idx = binned_feature_view[i] - view[bin_idx].sum_gradients += all_gradients_view[i] - view[bin_idx].count += 1 + out[bin_0].count += 1 + out[bin_1].count += 1 + out[bin_2].count += 1 + out[bin_3].count += 1 - return histogram + for i in range(unrolled_upper, n_samples): + bin_idx = binned_feature[i] + out[bin_idx].sum_gradients += all_gradients[i] + out[bin_idx].count += 1 -def _build_histogram_root(unsigned int n_bins, unsigned char [:] - binned_feature, float [:] all_gradients, - float[:] all_hessians): +cpdef void _build_histogram_root( + unsigned int n_bins, + NPY_X_BINNED_DTYPE [:] binned_feature, + NPY_Y_DTYPE [:] all_gradients, + NPY_Y_DTYPE [:] all_hessians, + hist_struct [:] out) nogil: """Special case for the root node The root node has to find the split among all the samples from the training set. binned_feature and all_gradients and all_hessians already have a consistent ordering. 
""" - cdef np.ndarray histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) - cdef hist_struct [:] view = histogram - cdef int i = 0 - - cdef unsigned int n_node_samples = binned_feature.shape[0] - cdef unsigned int unrolled_upper = (n_node_samples // 4) * 4 + cdef: + unsigned int i = 0 + unsigned int n_samples = binned_feature.shape[0] + unsigned int unrolled_upper = (n_samples // 4) * 4 - cdef unsigned int bin_0 - cdef unsigned int bin_1 - cdef unsigned int bin_2 - cdef unsigned int bin_3 - cdef unsigned int bin_idx + unsigned int bin_0 + unsigned int bin_1 + unsigned int bin_2 + unsigned int bin_3 + unsigned int bin_idx for i in range(0, unrolled_upper, 4): + bin_0 = binned_feature[i] bin_1 = binned_feature[i + 1] bin_2 = binned_feature[i + 2] bin_3 = binned_feature[i + 3] - view[bin_0].sum_gradients += all_gradients[i] - view[bin_1].sum_gradients += all_gradients[i + 1] - view[bin_2].sum_gradients += all_gradients[i + 2] - view[bin_3].sum_gradients += all_gradients[i + 3] + out[bin_0].sum_gradients += all_gradients[i] + out[bin_1].sum_gradients += all_gradients[i + 1] + out[bin_2].sum_gradients += all_gradients[i + 2] + out[bin_3].sum_gradients += all_gradients[i + 3] - view[bin_0].sum_hessians += all_hessians[i] - view[bin_1].sum_hessians += all_hessians[i + 1] - view[bin_2].sum_hessians += all_hessians[i + 2] - view[bin_3].sum_hessians += all_hessians[i + 3] + out[bin_0].sum_hessians += all_hessians[i] + out[bin_1].sum_hessians += all_hessians[i + 1] + out[bin_2].sum_hessians += all_hessians[i + 2] + out[bin_3].sum_hessians += all_hessians[i + 3] - view[bin_0].count += 1 - view[bin_1].count += 1 - view[bin_2].count += 1 - view[bin_3].count += 1 + out[bin_0].count += 1 + out[bin_1].count += 1 + out[bin_2].count += 1 + out[bin_3].count += 1 - for i in range(unrolled_upper, n_node_samples): + for i in range(unrolled_upper, n_samples): bin_idx = binned_feature[i] - view[bin_idx].sum_gradients += all_gradients[i] - view[bin_idx].sum_hessians += all_hessians[i] - view[bin_idx].count += 1 - - return histogram + out[bin_idx].sum_gradients += all_gradients[i] + out[bin_idx].sum_hessians += all_hessians[i] + out[bin_idx].count += 1 diff --git a/sklearn/ensemble/gbm/splitting.pyx b/sklearn/ensemble/gbm/splitting.pyx index 62961d66ab26b..992e2b3316e1a 100644 --- a/sklearn/ensemble/gbm/splitting.pyx +++ b/sklearn/ensemble/gbm/splitting.pyx @@ -411,7 +411,7 @@ cdef SplitInfo _find_best_feature_to_split_helper(list split_infos): cdef _find_histogram_split(SplittingContext context, unsigned int feature_idx, - unsigned int [:] sample_indices): + unsigned int [:] sample_indices): """Compute the histogram for a given feature Returns the best SplitInfo among all the possible bins of the feature. 
@@ -425,23 +425,23 @@ cdef _find_histogram_split(SplittingContext context, unsigned int feature_idx, float [:] ordered_hessians = context.ordered_hessians[:n_samples] np.ndarray histogram + histogram = np.zeros(context.max_bins, dtype=HISTOGRAM_DTYPE) + if root_node: if context.constant_hessian: - histogram = _build_histogram_root_no_hessian( - context.max_bins, X_binned, ordered_gradients) + _build_histogram_root_no_hessian(context.max_bins, X_binned, + ordered_gradients, histogram) else: - histogram = _build_histogram_root( - context.max_bins, X_binned, ordered_gradients, - context.ordered_hessians) + _build_histogram_root(context.max_bins, X_binned, + ordered_gradients, + context.ordered_hessians, histogram) else: if context.constant_hessian: - histogram = _build_histogram_no_hessian( - context.max_bins, sample_indices, X_binned, - ordered_gradients) + _build_histogram_no_hessian(context.max_bins, sample_indices, + X_binned, ordered_gradients, histogram) else: - histogram = _build_histogram( - context.max_bins, sample_indices, X_binned, - ordered_gradients, ordered_hessians) + _build_histogram(context.max_bins, sample_indices, X_binned, + ordered_gradients, ordered_hessians, histogram) return _find_best_bin_to_split_helper(context, feature_idx, histogram, n_samples) @@ -457,9 +457,9 @@ cdef _find_histogram_split_subtraction(SplittingContext context, unsigned int fe cdef: np.ndarray histogram - histogram = _subtract_histograms( - context.max_bins, - parent_histograms[feature_idx], sibling_histograms[feature_idx]) + histogram = np.zeros(context.max_bins, dtype=HISTOGRAM_DTYPE) + _subtract_histograms(context.max_bins, parent_histograms[feature_idx], + sibling_histograms[feature_idx], histogram) return _find_best_bin_to_split_helper(context, feature_idx, histogram, n_samples) diff --git a/sklearn/ensemble/gbm/tests/test_histogram.py b/sklearn/ensemble/gbm/tests/test_histogram.py index 5a392371acd75..9af3fe7257209 100644 --- a/sklearn/ensemble/gbm/tests/test_histogram.py +++ b/sklearn/ensemble/gbm/tests/test_histogram.py @@ -10,6 +10,7 @@ from sklearn.ensemble.gbm.histogram import _build_histogram_root_no_hessian from sklearn.ensemble.gbm.histogram import _build_histogram_root from sklearn.ensemble.gbm.histogram import _subtract_histograms +from sklearn.ensemble.gbm.types import HISTOGRAM_DTYPE @pytest.mark.parametrize( @@ -22,8 +23,9 @@ def test_build_histogram(build_func): ordered_hessians = np.array([1, 1, 2], dtype=np.float32) sample_indices = np.array([0, 2, 3], dtype=np.uint32) - hist = build_func(3, sample_indices, binned_feature, - ordered_gradients, ordered_hessians) + hist = np.zeros(3, dtype=HISTOGRAM_DTYPE) + build_func(3, sample_indices, binned_feature, ordered_gradients, + ordered_hessians, hist) assert_array_equal(hist['count'], [2, 1, 0]) assert_allclose(hist['sum_gradients'], [1, 3, 0]) assert_allclose(hist['sum_hessians'], [2, 2, 0]) @@ -33,8 +35,9 @@ def test_build_histogram(build_func): ordered_gradients = np.array([0, 1, 3, 0, 1], dtype=np.float32) ordered_hessians = np.array([1, 1, 2, 1, 0], dtype=np.float32) - hist = build_func(3, sample_indices, binned_feature, - ordered_gradients, ordered_hessians) + hist = np.zeros(3, dtype=HISTOGRAM_DTYPE) + build_func(3, sample_indices, binned_feature, ordered_gradients, + ordered_hessians, hist) assert_array_equal(hist['count'], [2, 2, 1]) assert_allclose(hist['sum_gradients'], [1, 4, 0]) assert_allclose(hist['sum_hessians'], [2, 2, 1]) @@ -50,21 +53,25 @@ def test_histogram_sample_order_independence(): sample_indices = 
rng.choice(np.arange(n_samples, dtype=np.uint32), n_sub_samples, replace=False) ordered_gradients = rng.randn(n_sub_samples).astype(np.float32) - hist_gc = _build_histogram_no_hessian(n_bins, sample_indices, - binned_feature, ordered_gradients) + hist_gc = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + _build_histogram_no_hessian(n_bins, sample_indices, binned_feature, + ordered_gradients, hist_gc) ordered_hessians = rng.exponential(size=n_sub_samples).astype(np.float32) - hist_ghc = _build_histogram(n_bins, sample_indices, binned_feature, - ordered_gradients, ordered_hessians) + hist_ghc = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + _build_histogram(n_bins, sample_indices, binned_feature, + ordered_gradients, ordered_hessians, hist_ghc) permutation = rng.permutation(n_sub_samples) - hist_gc_perm = _build_histogram_no_hessian( - n_bins, sample_indices[permutation], binned_feature, - ordered_gradients[permutation]) + hist_gc_perm = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + _build_histogram_no_hessian(n_bins, sample_indices[permutation], + binned_feature, ordered_gradients[permutation], + hist_gc_perm) - hist_ghc_perm = _build_histogram( - n_bins, sample_indices[permutation], binned_feature, - ordered_gradients[permutation], ordered_hessians[permutation]) + hist_ghc_perm = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + _build_histogram(n_bins, sample_indices[permutation], binned_feature, + ordered_gradients[permutation], + ordered_hessians[permutation], hist_ghc_perm) assert_allclose(hist_gc['sum_gradients'], hist_gc_perm['sum_gradients']) assert_array_equal(hist_gc['count'], hist_gc_perm['count']) @@ -89,17 +96,22 @@ def test_unrolled_equivalent_to_naive(constant_hessian): else: ordered_hessians = rng.lognormal(size=n_samples).astype(np.float32) - hist_gc_root = _build_histogram_root_no_hessian(n_bins, binned_feature, - ordered_gradients) - hist_ghc_root = _build_histogram_root(n_bins, binned_feature, - ordered_gradients, ordered_hessians) - hist_gc = _build_histogram_no_hessian(n_bins, sample_indices, - binned_feature, ordered_gradients) - hist_ghc = _build_histogram(n_bins, sample_indices, binned_feature, - ordered_gradients, ordered_hessians) - - hist_naive = _build_histogram_naive(n_bins, sample_indices, binned_feature, - ordered_gradients, ordered_hessians) + hist_gc_root = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + hist_ghc_root = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + hist_gc = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + hist_ghc = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + hist_naive = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + + _build_histogram_root_no_hessian(n_bins, binned_feature, ordered_gradients, + hist_gc_root) + _build_histogram_root(n_bins, binned_feature, ordered_gradients, + ordered_hessians, hist_ghc_root) + _build_histogram_no_hessian(n_bins, sample_indices, binned_feature, + ordered_gradients, hist_gc) + _build_histogram(n_bins, sample_indices, binned_feature, + ordered_gradients, ordered_hessians, hist_ghc) + _build_histogram_naive(n_bins, sample_indices, binned_feature, + ordered_gradients, ordered_hessians, hist_naive) for hist in (hist_gc_root, hist_ghc_root, hist_gc, hist_gc, hist_ghc): assert_array_equal(hist['count'], hist_naive['count']) @@ -125,42 +137,46 @@ def test_hist_subtraction(constant_hessian): else: ordered_hessians = rng.lognormal(size=n_samples).astype(np.float32) + hist_parent = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) if constant_hessian: - hist_parent = _build_histogram_no_hessian(n_bins, sample_indices, - binned_feature, - ordered_gradients) + 
_build_histogram_no_hessian(n_bins, sample_indices, binned_feature, + ordered_gradients, hist_parent) else: - hist_parent = _build_histogram(n_bins, sample_indices, binned_feature, - ordered_gradients, ordered_hessians) + _build_histogram(n_bins, sample_indices, binned_feature, + ordered_gradients, ordered_hessians, hist_parent) mask = rng.randint(0, 2, n_samples).astype(np.bool) sample_indices_left = sample_indices[mask] ordered_gradients_left = ordered_gradients[mask] ordered_hessians_left = ordered_hessians[mask] + hist_left = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) if constant_hessian: - hist_left = _build_histogram_no_hessian(n_bins, sample_indices_left, - binned_feature, - ordered_gradients_left) + _build_histogram_no_hessian(n_bins, sample_indices_left, + binned_feature, ordered_gradients_left, + hist_left) else: - hist_left = _build_histogram(n_bins, sample_indices_left, - binned_feature, ordered_gradients_left, - ordered_hessians_left) + _build_histogram(n_bins, sample_indices_left, binned_feature, + ordered_gradients_left, ordered_hessians_left, + hist_left) sample_indices_right = sample_indices[~mask] ordered_gradients_right = ordered_gradients[~mask] ordered_hessians_right = ordered_hessians[~mask] + hist_right = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) if constant_hessian: - hist_right = _build_histogram_no_hessian(n_bins, sample_indices_right, - binned_feature, - ordered_gradients_right) + _build_histogram_no_hessian(n_bins, sample_indices_right, + binned_feature, ordered_gradients_right, + hist_right) else: - hist_right = _build_histogram(n_bins, sample_indices_right, - binned_feature, ordered_gradients_right, - ordered_hessians_right) - - hist_left_sub = _subtract_histograms(n_bins, hist_parent, hist_right) - hist_right_sub = _subtract_histograms(n_bins, hist_parent, hist_left) + _build_histogram(n_bins, sample_indices_right, binned_feature, + ordered_gradients_right, ordered_hessians_right, + hist_right) + + hist_left_sub = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + hist_right_sub = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + _subtract_histograms(n_bins, hist_parent, hist_right, hist_left_sub) + _subtract_histograms(n_bins, hist_parent, hist_left, hist_right_sub) for key in ('count', 'sum_hessians', 'sum_gradients'): assert_allclose(hist_left[key], hist_left_sub[key], rtol=1e-6) diff --git a/sklearn/ensemble/gbm/types.py b/sklearn/ensemble/gbm/types.py new file mode 100644 index 0000000000000..738ac539b46b4 --- /dev/null +++ b/sklearn/ensemble/gbm/types.py @@ -0,0 +1,12 @@ +import numpy as np + + +Y_DTYPE = np.float32 +X_DTYPE = np.float64 +X_BINNED_DTYPE = np.uint8 + +HISTOGRAM_DTYPE = np.dtype([ + ('sum_gradients', np.float32), + ('sum_hessians', np.float32), + ('count', np.uint32), +]) From 18c72ae9018a62a3747dbf4224a8444f857bc8bd Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 10 Jan 2019 10:55:27 -0500 Subject: [PATCH 013/247] Cleaned predictor code a bit --- sklearn/ensemble/gbm/gradient_boosting.py | 8 +-- sklearn/ensemble/gbm/predictor.pyx | 69 +++++++---------------- 2 files changed, 23 insertions(+), 54 deletions(-) diff --git a/sklearn/ensemble/gbm/gradient_boosting.py b/sklearn/ensemble/gbm/gradient_boosting.py index e0d6b4ddc57ba..f86e6bc93ceae 100644 --- a/sklearn/ensemble/gbm/gradient_boosting.py +++ b/sklearn/ensemble/gbm/gradient_boosting.py @@ -382,7 +382,7 @@ def _raw_predict(self, X): raw_predictions : array, shape (n_samples * n_trees_per_iteration,) The raw predicted values. 
""" - X = check_array(X) + X = check_array(X, dtype=X_DTYPE) check_is_fitted(self, 'predictors_') if X.shape[1] != self.n_features_: raise ValueError( @@ -395,13 +395,9 @@ def _raw_predict(self, X): dtype=self.baseline_prediction_.dtype ) raw_predictions += self.baseline_prediction_ - # Should we parallelize this? - is_binned = X.dtype == np.uint8 for predictors_of_ith_iteration in self.predictors_: for k, predictor in enumerate(predictors_of_ith_iteration): - predict = (predictor.predict_binned if is_binned - else predictor.predict) - raw_predictions[:, k] += predict(X) + raw_predictions[:, k] += predictor.predict(X) return raw_predictions diff --git a/sklearn/ensemble/gbm/predictor.pyx b/sklearn/ensemble/gbm/predictor.pyx index b7cda2814baac..4512d1684ef2d 100644 --- a/sklearn/ensemble/gbm/predictor.pyx +++ b/sklearn/ensemble/gbm/predictor.pyx @@ -1,15 +1,21 @@ -# cython: profile=True +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False +# cython: language_level=3 """ This module contains the TreePredictor class which is used for prediction. """ import numpy as np +cimport numpy as np + +from .types import X_DTYPE PREDICTOR_RECORD_DTYPE = np.dtype([ ('value', np.float32), ('count', np.uint32), ('feature_idx', np.uint32), - ('threshold', np.float32), + ('threshold', X_DTYPE), ('left', np.uint32), ('right', np.uint32), ('gain', np.float32), @@ -19,15 +25,13 @@ PREDICTOR_RECORD_DTYPE = np.dtype([ # TODO: shrinkage in leaf for feature importance error bar? ]) -ctypedef fused float_or_double: - float - double +ctypedef np.npy_float64 NPY_X_DTYPE cdef packed struct node_struct: float value unsigned int count unsigned int feature_idx - float threshold + NPY_X_DTYPE threshold unsigned int left unsigned int right float gain @@ -55,26 +59,6 @@ class TreePredictor: """Return maximum depth among all leaves.""" return int(self.nodes['depth'].max()) - def predict_binned(self, binned_data, out=None): - """Predict raw values for binned data. - - Parameters - ---------- - binned_data : array-like of np.uint8, shape=(n_samples, n_features) - The binned input samples. - out : array-like, shape=(n_samples,), optional (default=None) - If not None, predictions will be written inplace in ``out``. - - Returns - ------- - y : array, shape (n_samples,) - The raw predicted values. - """ - if out is None: - out = np.empty(binned_data.shape[0], dtype=np.float32) - _predict_binned(self.nodes, binned_data, out) - return out - def predict(self, X): """Predict raw values for non-binned data. @@ -88,31 +72,18 @@ class TreePredictor: y : array, shape (n_samples,) The raw predicted values. """ - # TODO: introspect X to dispatch to numerical or categorical data - # (dense or sparse) on a feature by feature basis. 
out = np.empty(X.shape[0], dtype=np.float32) _predict_from_numeric_data(self.nodes, X, out) return out -def _predict_one_binned(nodes, binned_data): - node = nodes[0] - while True: - if node['is_leaf']: - return node['value'] - if binned_data[node['feature_idx']] <= node['bin_threshold']: - node = nodes[node['left']] - else: - node = nodes[node['right']] - - -def _predict_binned(nodes, binned_data, out): - for i in range(binned_data.shape[0]): - out[i] = _predict_one_binned(nodes, binned_data[i]) +cdef float _predict_one_from_numeric_data( + node_struct [:] nodes, + NPY_X_DTYPE [:] numeric_data) nogil: + cdef: + node_struct node = nodes[0] -cdef float _predict_one_from_numeric_data(node_struct [:] nodes, float_or_double [:] numeric_data) nogil: - cdef node_struct node = nodes[0] while True: if node.is_leaf: return node.value @@ -122,11 +93,13 @@ cdef float _predict_one_from_numeric_data(node_struct [:] nodes, float_or_double node = nodes[node.right] -# TODO: having a view on numeric_data (passed by user) may not be supported, -# see sklearn issue 10624 -def _predict_from_numeric_data(node_struct [:] nodes, float_or_double [:, :] numeric_data, float [:] out): +cdef void _predict_from_numeric_data( + node_struct [:] nodes, + NPY_X_DTYPE [:, :] numeric_data, + float [:] out) nogil: - cdef int i + cdef: + unsigned int i for i in range(numeric_data.shape[0]): out[i] = _predict_one_from_numeric_data(nodes, numeric_data[i]) From 45bd35a0c9bf23b1102d3cb1c04af92f57daad49 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 10 Jan 2019 13:23:57 -0500 Subject: [PATCH 014/247] Added test and benchmark --- bench_hist.py | 2 +- bench_predict.py | 98 ++++++++++++++++++++ sklearn/ensemble/gbm/predictor.pyx | 3 + sklearn/ensemble/gbm/tests/test_predictor.py | 37 ++++++++ 4 files changed, 139 insertions(+), 1 deletion(-) create mode 100644 bench_predict.py create mode 100644 sklearn/ensemble/gbm/tests/test_predictor.py diff --git a/bench_hist.py b/bench_hist.py index 7ef6822555325..188f05b445c32 100644 --- a/bench_hist.py +++ b/bench_hist.py @@ -2,7 +2,7 @@ Compare histogram building function with pygbm. run with -export OMP_NUM_THREADS=1 && make in && python bench_hist.py +export NUMBA_NUM_THREADS=1 && make in && python bench_hist.py might be a bit unfair to cython code since we're calling the python versions of the cpdef functions, which causes unnecessary conversions. diff --git a/bench_predict.py b/bench_predict.py new file mode 100644 index 0000000000000..a3b885dada518 --- /dev/null +++ b/bench_predict.py @@ -0,0 +1,98 @@ +""" +Compare prediction time with pygbm. 
+ +run with +export NUMBA_NUM_THREADS=1 && make in && python bench_predict.py + +""" + +from time import time +from collections import defaultdict + +import pygbm +import numpy as np +import matplotlib.pyplot as plt + +from sklearn.datasets import make_regression, make_classification +from sklearn.ensemble import GradientBoostingRegressor +from sklearn.ensemble import GradientBoostingClassifier +from sklearn.ensemble import GBMRegressor +from sklearn.ensemble import GBMClassifier + +classif = False +n_classes = 3 +max_pow = 7 +n_samples = int(10**max_pow) +max_iter = 20 +n_features = 5 + +if classif: + X, y = make_classification(n_samples=n_samples, n_features=n_features, + random_state=0, n_classes=n_classes, + n_clusters_per_class=1) + GBM = GBMClassifier + GBDT = GradientBoostingClassifier + PYGBM_GBM = pygbm.GradientBoostingClassifier +else: + X, y = make_regression(n_samples=n_samples, n_features=n_features, + random_state=0) + GBM = GBMRegressor + GBDT = GradientBoostingRegressor + PYGBM_GBM = pygbm.GradientBoostingRegressor + + +sklearn_est = GBM( + max_iter=max_iter, + scoring=None, # no early stopping + validation_split=None, + n_iter_no_change=None, + random_state=0, + verbose=False) + +pygbm_est = PYGBM_GBM( + max_iter=max_iter, + scoring=None, # no early stopping + validation_split=None, + random_state=0, + verbose=False) +print("compiling pygbm code, and fit estimators") +pygbm_est.fit(X[:1000], y[:1000]) +pygbm_est.predict(X[:1000]) +sklearn_est.fit(X[:1000], y[:1000]) +print("done") + +n_samples_list = [10**x for x in range(2, max_pow + 1)] +n_exp = 3 + +predict_durations = defaultdict(lambda: defaultdict(list)) + +for n_samples in n_samples_list: + for exp in range(n_exp): + + tic = time() + sklearn_est.predict(X[:n_samples]) + predict_duration = time() - tic + print(f'sklearn_est predict_duration: {predict_duration:.3f}s') + + predict_durations['sklearn'][n_samples].append(predict_duration) + + tic = time() + pygbm_est.predict(X[:n_samples]) + predict_duration = time() - tic + print(f'pygbm_est predict_duration: {predict_duration:.3f}s\n') + predict_durations['pygbm'][n_samples].append(predict_duration) + + +fig, ax = plt.subplots(1) + +for implem in ('sklearn', 'pygbm'): + avgs = [np.mean(predict_durations[implem][n_samples]) + for n_samples in n_samples_list] + stds = [np.std(predict_durations[implem][n_samples]) + for n_samples in n_samples_list] + ax.errorbar(n_samples_list, avgs, yerr=stds, label=implem) +ax.set_xscale('log') +ax.legend(loc='best') + +fig.suptitle(f'Avg prediction time over {n_exp} runs\nfor different sample sizes') +plt.show() diff --git a/sklearn/ensemble/gbm/predictor.pyx b/sklearn/ensemble/gbm/predictor.pyx index 4512d1684ef2d..485145eac5ea7 100644 --- a/sklearn/ensemble/gbm/predictor.pyx +++ b/sklearn/ensemble/gbm/predictor.pyx @@ -1,3 +1,4 @@ +# cython: profile=True # cython: cdivision=True # cython: boundscheck=False # cython: wraparound=False @@ -72,6 +73,8 @@ class TreePredictor: y : array, shape (n_samples,) The raw predicted values. 
""" + # TODO: change dtype of out (should be same as Y_DTYPE I think since + # the value is grad/hess which are Y_DTYPE) out = np.empty(X.shape[0], dtype=np.float32) _predict_from_numeric_data(self.nodes, X, out) return out diff --git a/sklearn/ensemble/gbm/tests/test_predictor.py b/sklearn/ensemble/gbm/tests/test_predictor.py new file mode 100644 index 0000000000000..35d57fd5f14a5 --- /dev/null +++ b/sklearn/ensemble/gbm/tests/test_predictor.py @@ -0,0 +1,37 @@ +import numpy as np +from numpy.testing import assert_allclose +from sklearn.datasets import load_boston +from sklearn.model_selection import train_test_split +from sklearn.metrics import r2_score +import pytest + +from sklearn.ensemble.gbm.binning import BinMapper +from sklearn.ensemble.gbm.grower import TreeGrower + + +@pytest.mark.parametrize('max_bins', [200, 256]) +def test_boston_dataset(max_bins): + boston = load_boston() + X_train, X_test, y_train, y_test = train_test_split( + boston.data, boston.target, random_state=42) + + mapper = BinMapper(max_bins=max_bins, random_state=42) + X_train_binned = mapper.fit_transform(X_train) + X_test_binned = mapper.transform(X_test) + + # Init gradients and hessians to that of least squares loss + gradients = -y_train.astype(np.float32) + hessians = np.ones(1, dtype=np.float32) + + min_samples_leaf = 8 + max_leaf_nodes = 31 + grower = TreeGrower(X_train_binned, gradients, hessians, + min_samples_leaf=min_samples_leaf, + max_leaf_nodes=max_leaf_nodes, max_bins=max_bins, + n_bins_per_feature=mapper.n_bins_per_feature_) + grower.grow() + + predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_) + + assert r2_score(y_train, predictor.predict(X_train)) > 0.85 + assert r2_score(y_test, predictor.predict(X_test)) > 0.70 From cd8057430e946efe9ae824546f7194181b77db9c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 10 Jan 2019 14:00:01 -0500 Subject: [PATCH 015/247] Added tests for binnin --- bench_binning.py | 88 ++++++++ bench_predict.py | 1 - gdb_test.py | 2 +- sklearn/ensemble/gbm/binning.pyx | 14 +- sklearn/ensemble/gbm/tests/test_binning.py | 225 +++++++++++++++++++++ 5 files changed, 321 insertions(+), 9 deletions(-) create mode 100644 bench_binning.py create mode 100644 sklearn/ensemble/gbm/tests/test_binning.py diff --git a/bench_binning.py b/bench_binning.py new file mode 100644 index 0000000000000..bacff736eec64 --- /dev/null +++ b/bench_binning.py @@ -0,0 +1,88 @@ +""" +Compare binning fitting and transform time with pygbm. 
+ +run with +export NUMBA_NUM_THREADS=1 && make in && python bench_binning.py +""" +from time import time +from collections import defaultdict + +import numpy as np +import pygbm +import matplotlib.pyplot as plt +from sklearn.datasets import make_regression + +from sklearn.ensemble.gbm.binning import BinMapper + + +n_features = 5 + +max_pow = 7 +n_samples = int(10**max_pow) +X, y = make_regression(n_samples=n_samples, n_features=n_features, + random_state=0) + +print("compiling pygbm") +pygbm_bm = pygbm.binning.BinMapper() +pygbm_bm.fit_transform(X[:1000]) +print('done') + +bm = BinMapper() + +n_samples_list = [10**x for x in range(2, max_pow + 1)] +n_exp = 10 + +transform_durations = defaultdict(lambda: defaultdict(list)) +fit_durations = defaultdict(lambda: defaultdict(list)) + +for n_samples in n_samples_list: + for exp in range(n_exp): + + tic = time() + tic = time() + bm.fit(X[:n_samples]) + fit_duration = time() - tic + print(f"sklearn fit duration = {fit_duration:.3f}") + tic = time() + bm.transform(X[:n_samples]) + transform_duration = time() - tic + print(f"sklearn transform duration = {transform_duration:.3f}") + + fit_durations['sklearn'][n_samples].append(fit_duration) + transform_durations['sklearn'][n_samples].append(transform_duration) + + tic = time() + pygbm_bm.fit(X[:n_samples]) + fit_duration = time() - tic + print(f"pygbm fit duration = {fit_duration:.3f}") + tic = time() + pygbm_bm.transform(X[:n_samples]) + transform_duration = time() - tic + print(f"pygbm transform duration = {transform_duration:.3f}") + fit_durations['pygbm'][n_samples].append(fit_duration) + transform_durations['pygbm'][n_samples].append(transform_duration) + +fig, axs = plt.subplots(2) + +for implem in ('sklearn', 'pygbm'): + avgs = [np.mean(fit_durations[implem][n_samples]) + for n_samples in n_samples_list] + stds = [np.std(fit_durations[implem][n_samples]) + for n_samples in n_samples_list] + axs[0].errorbar(n_samples_list, avgs, yerr=stds, label=implem) + axs[0].set_title('Fit') + +for implem in ('sklearn', 'pygbm'): + avgs = [np.mean(transform_durations[implem][n_samples]) + for n_samples in n_samples_list] + stds = [np.std(transform_durations[implem][n_samples]) + for n_samples in n_samples_list] + axs[1].errorbar(n_samples_list, avgs, yerr=stds, label=implem) + axs[1].set_title('transform') + +for ax in axs: + ax.set_xscale('log') + ax.legend(loc='best') + +fig.suptitle(f'Avg fit and transform time for binning over {n_exp} runs\nfor different sample sizes') +plt.show() diff --git a/bench_predict.py b/bench_predict.py index a3b885dada518..e859470eaa3fa 100644 --- a/bench_predict.py +++ b/bench_predict.py @@ -3,7 +3,6 @@ run with export NUMBA_NUM_THREADS=1 && make in && python bench_predict.py - """ from time import time diff --git a/gdb_test.py b/gdb_test.py index d7f3e0c6b24c4..ea71f0f0611f0 100644 --- a/gdb_test.py +++ b/gdb_test.py @@ -12,7 +12,7 @@ classif = False n_classes = 3 -n_samples = 100000 +n_samples = int(1e6) max_iter = 5 if classif: diff --git a/sklearn/ensemble/gbm/binning.pyx b/sklearn/ensemble/gbm/binning.pyx index 8bb38e04fe75c..1dc81d67ea1af 100644 --- a/sklearn/ensemble/gbm/binning.pyx +++ b/sklearn/ensemble/gbm/binning.pyx @@ -66,8 +66,8 @@ def _find_binning_thresholds(data, max_bins=256, subsample=int(2e5), return binning_thresholds -cdef _map_to_bins(NPY_X_DTYPE [:, :] data, list binning_thresholds, - NPY_X_BINNED_DTYPE [::1, :] binned): +cpdef _map_to_bins(NPY_X_DTYPE [:, :] data, list binning_thresholds, + NPY_X_BINNED_DTYPE [::1, :] binned): """Bin numerical values 
to discrete integer-coded levels. Parameters @@ -96,9 +96,9 @@ cdef _map_to_bins(NPY_X_DTYPE [:, :] data, list binning_thresholds, binned[:, feature_idx]) -cdef void _map_num_col_to_bins(NPY_X_DTYPE [:] data, - NPY_X_DTYPE [:] binning_thresholds, - NPY_X_BINNED_DTYPE [:] binned) nogil: +cpdef void _map_num_col_to_bins(NPY_X_DTYPE [:] data, + NPY_X_DTYPE [:] binning_thresholds, + NPY_X_BINNED_DTYPE [:] binned) nogil: """Binary search to the find the bin index for each value in data.""" cdef: int i @@ -106,8 +106,8 @@ cdef void _map_num_col_to_bins(NPY_X_DTYPE [:] data, int right int middle - # for i in range(data.shape[0]): - for i in prange(data.shape[0], schedule='static'): + # for i in prange(data.shape[0], schedule='static'): + for i in range(data.shape[0]): left, right = 0, binning_thresholds.shape[0] while left < right: middle = (right + left - 1) // 2 diff --git a/sklearn/ensemble/gbm/tests/test_binning.py b/sklearn/ensemble/gbm/tests/test_binning.py new file mode 100644 index 0000000000000..3a654af631a08 --- /dev/null +++ b/sklearn/ensemble/gbm/tests/test_binning.py @@ -0,0 +1,225 @@ +import numpy as np +from numpy.testing import assert_array_equal, assert_allclose +import pytest + +from sklearn.ensemble.gbm.binning import BinMapper +from sklearn.ensemble.gbm.binning import _find_binning_thresholds +from sklearn.ensemble.gbm.binning import _map_to_bins +from sklearn.ensemble.gbm.types import X_DTYPE, X_BINNED_DTYPE + + + +DATA = np.random.RandomState(42).normal( + loc=[0, 10], scale=[1, 0.01], size=(int(1e6), 2) +).astype(X_DTYPE) + + +def test_find_binning_thresholds_regular_data(): + data = np.linspace(0, 10, 1001).reshape(-1, 1) + bin_thresholds = _find_binning_thresholds(data, max_bins=10) + assert_allclose(bin_thresholds[0], [1, 2, 3, 4, 5, 6, 7, 8, 9]) + + bin_thresholds = _find_binning_thresholds(data, max_bins=5) + assert_allclose(bin_thresholds[0], [2, 4, 6, 8]) + + +def test_find_binning_thresholds_small_regular_data(): + data = np.linspace(0, 10, 11).reshape(-1, 1) + + bin_thresholds = _find_binning_thresholds(data, max_bins=5) + assert_allclose(bin_thresholds[0], [2, 4, 6, 8]) + + bin_thresholds = _find_binning_thresholds(data, max_bins=10) + assert_allclose(bin_thresholds[0], [1, 2, 3, 4, 5, 6, 7, 8, 9]) + + bin_thresholds = _find_binning_thresholds(data, max_bins=11) + assert_allclose(bin_thresholds[0], np.arange(10) + .5) + + bin_thresholds = _find_binning_thresholds(data, max_bins=255) + assert_allclose(bin_thresholds[0], np.arange(10) + .5) + + +def test_find_binning_thresholds_random_data(): + bin_thresholds = _find_binning_thresholds(DATA, random_state=0) + assert len(bin_thresholds) == 2 + for i in range(len(bin_thresholds)): + assert bin_thresholds[i].shape == (255,) # 256 - 1 + assert bin_thresholds[i].dtype == DATA.dtype + + assert_allclose(bin_thresholds[0][[64, 128, 192]], + np.array([-0.7, 0.0, 0.7]), atol=1e-1) + + assert_allclose(bin_thresholds[1][[64, 128, 192]], + np.array([9.99, 10.00, 10.01]), atol=1e-2) + + +def test_find_binning_thresholds_low_n_bins(): + bin_thresholds = _find_binning_thresholds(DATA, max_bins=128, + random_state=0) + assert len(bin_thresholds) == 2 + for i in range(len(bin_thresholds)): + assert bin_thresholds[i].shape == (127,) # 128 - 1 + assert bin_thresholds[i].dtype == DATA.dtype + + +def test_find_binning_thresholds_invalid_n_bins(): + with pytest.raises(ValueError): + _find_binning_thresholds(DATA, max_bins=1024) + + +@pytest.mark.parametrize('n_bins', [16, 128, 256]) +def test_map_to_bins(n_bins): + bin_thresholds = 
_find_binning_thresholds(DATA, max_bins=n_bins, + random_state=0) + binned = np.zeros_like(DATA, dtype=X_BINNED_DTYPE, order='F') + _map_to_bins(DATA, bin_thresholds, binned) + assert binned.shape == DATA.shape + assert binned.dtype == np.uint8 + assert binned.flags.f_contiguous + + min_indices = DATA.argmin(axis=0) + max_indices = DATA.argmax(axis=0) + + for feature_idx, min_idx in enumerate(min_indices): + assert binned[min_idx, feature_idx] == 0 + for feature_idx, max_idx in enumerate(max_indices): + assert binned[max_idx, feature_idx] == n_bins - 1 + + +@pytest.mark.parametrize("n_bins", [5, 10, 42]) +def test_bin_mapper_random_data(n_bins): + n_samples, n_features = DATA.shape + + expected_count_per_bin = n_samples // n_bins + tol = int(0.05 * expected_count_per_bin) + + mapper = BinMapper(max_bins=n_bins, random_state=42).fit(DATA) + binned = mapper.transform(DATA) + + assert binned.shape == (n_samples, n_features) + assert binned.dtype == np.uint8 + assert_array_equal(binned.min(axis=0), np.array([0, 0])) + assert_array_equal(binned.max(axis=0), np.array([n_bins - 1, n_bins - 1])) + assert len(mapper.bin_thresholds_) == n_features + for i in range(len(mapper.bin_thresholds_)): + assert mapper.bin_thresholds_[i].shape == (n_bins - 1,) + assert mapper.bin_thresholds_[i].dtype == DATA.dtype + assert np.all(mapper.n_bins_per_feature_ == n_bins) + + # Check that the binned data is approximately balanced across bins. + for feature_idx in range(n_features): + for bin_idx in range(n_bins): + count = (binned[:, feature_idx] == bin_idx).sum() + assert abs(count - expected_count_per_bin) < tol + + +@pytest.mark.parametrize("n_samples, n_bins", [ + (5, 5), + (5, 10), + (5, 11), + (42, 255) +]) +def test_bin_mapper_small_random_data(n_samples, n_bins): + data = np.random.RandomState(42).normal(size=n_samples).reshape(-1, 1) + assert len(np.unique(data)) == n_samples + + mapper = BinMapper(max_bins=n_bins, random_state=42) + binned = mapper.fit_transform(data) + + assert binned.shape == data.shape + assert binned.dtype == np.uint8 + assert_array_equal(binned.ravel()[np.argsort(data.ravel())], + np.arange(n_samples)) + + +@pytest.mark.parametrize("n_bins, n_distinct, multiplier", [ + (5, 5, 1), + (5, 5, 3), + (255, 12, 42), +]) +def test_bin_mapper_identity_repeated_values(n_bins, n_distinct, multiplier): + data = np.array(list(range(n_distinct)) * multiplier).reshape(-1, 1) + binned = BinMapper(max_bins=n_bins).fit_transform(data) + assert_array_equal(data, binned) + + +@pytest.mark.parametrize('n_distinct', [2, 7, 42]) +def test_bin_mapper_repeated_values_invariance(n_distinct): + rng = np.random.RandomState(42) + distinct_values = rng.normal(size=n_distinct) + assert len(np.unique(distinct_values)) == n_distinct + + repeated_indices = rng.randint(low=0, high=n_distinct, size=1000) + data = distinct_values[repeated_indices] + rng.shuffle(data) + assert_array_equal(np.unique(data), np.sort(distinct_values)) + + data = data.reshape(-1, 1) + + mapper_1 = BinMapper(max_bins=n_distinct) + binned_1 = mapper_1.fit_transform(data) + assert_array_equal(np.unique(binned_1[:, 0]), np.arange(n_distinct)) + + # Adding more bins to the mapper yields the same results (same thresholds) + mapper_2 = BinMapper(max_bins=min(256, n_distinct * 3)) + binned_2 = mapper_2.fit_transform(data) + + assert_allclose(mapper_1.bin_thresholds_[0], mapper_2.bin_thresholds_[0]) + assert_array_equal(binned_1, binned_2) + + +@pytest.mark.parametrize("n_bins, scale, offset", [ + (3, 2, -1), + (42, 1, 0), + (256, 0.3, 42), +]) +def 
test_bin_mapper_identity_small(n_bins, scale, offset): + data = np.arange(n_bins).reshape(-1, 1) * scale + offset + binned = BinMapper(max_bins=n_bins).fit_transform(data) + assert_array_equal(binned, np.arange(n_bins).reshape(-1, 1)) + + +@pytest.mark.parametrize('n_bins_small, n_bins_large', [ + (2, 2), + (3, 3), + (4, 4), + (42, 42), + (256, 256), + (5, 17), + (42, 256), +]) +def test_bin_mapper_idempotence(n_bins_small, n_bins_large): + assert n_bins_large >= n_bins_small + data = np.random.RandomState(42).normal(size=30000).reshape(-1, 1) + mapper_small = BinMapper(max_bins=n_bins_small) + mapper_large = BinMapper(max_bins=n_bins_large) + binned_small = mapper_small.fit_transform(data) + binned_large = mapper_large.fit_transform(binned_small) + assert_array_equal(binned_small, binned_large) + + +@pytest.mark.parametrize('max_bins', [10, 100, 256]) +@pytest.mark.parametrize('diff', [-5, 0, 5]) +def test_n_bins_per_feature(max_bins, diff): + # Check that n_bins_per_feature is n_unique_values when + # n_unique_values <= max_bins, else max_bins. + + n_unique_values = max_bins + diff + X = list(range(n_unique_values)) * 2 + X = np.array(X).reshape(-1, 1) + mapper = BinMapper(max_bins=max_bins).fit(X) + assert np.all(mapper.n_bins_per_feature_ == min(max_bins, n_unique_values)) + + +def test_subsample(): + # Make sure bin thresholds are different when applying subsampling + mapper_no_subsample = BinMapper(subsample=None, random_state=0).fit(DATA) + mapper_subsample = BinMapper(subsample=256, random_state=0).fit(DATA) + + for feature in range(DATA.shape[1]): + with pytest.raises(AssertionError): + np.testing.assert_array_almost_equal( + mapper_no_subsample.bin_thresholds_[feature], + mapper_subsample.bin_thresholds_[feature], + decimal=3 + ) From 11a5425d91593bf2135fcdc4cf952a7b61e7593c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 11 Jan 2019 09:18:33 -0500 Subject: [PATCH 016/247] Minimal changes --- gdb_test.py | 42 ++++++++++----------- sklearn/ensemble/gbm/_gradient_boosting.pyx | 29 +++++++++----- sklearn/ensemble/gbm/binning.pyx | 4 +- sklearn/ensemble/gbm/gradient_boosting.py | 14 +------ sklearn/ensemble/setup.py | 5 ++- 5 files changed, 49 insertions(+), 45 deletions(-) diff --git a/gdb_test.py b/gdb_test.py index ea71f0f0611f0..d4fde1104370a 100644 --- a/gdb_test.py +++ b/gdb_test.py @@ -27,15 +27,15 @@ PYGBM_GBM = pygbm.GradientBoostingRegressor -pygbm_est = PYGBM_GBM( - max_iter=max_iter, - scoring=None, # no early stopping - validation_split=None, - random_state=0, - verbose=False) -print("compiling pygbm code") -pygbm_est.fit(X[:1000], y[:1000]) -print("done") +# pygbm_est = PYGBM_GBM( +# max_iter=max_iter, +# scoring=None, # no early stopping +# validation_split=None, +# random_state=0, +# verbose=False) +# print("compiling pygbm code") +# pygbm_est.fit(X[:1000], y[:1000]) +# print("done") gbm = GBM( max_iter=max_iter, @@ -44,21 +44,21 @@ n_iter_no_change=None, random_state=0, verbose=True) -tic = time() -gbm.fit(X, y) -fit_duration = time() - tic -print(f'sklearn gbm fit_duration: {fit_duration:.3f}s\n') +# tic = time() +# gbm.fit(X, y) +# fit_duration = time() - tic +# print(f'sklearn gbm fit_duration: {fit_duration:.3f}s\n') -pygbm_est.set_params(verbose=True) -tic = time() -pygbm_est.fit(X, y) -fit_duration = time() - tic -print(f'pygbm fit_duration: {fit_duration:.3f}s\n') +# pygbm_est.set_params(verbose=True) +# tic = time() +# pygbm_est.fit(X, y) +# fit_duration = time() - tic +# print(f'pygbm fit_duration: {fit_duration:.3f}s\n') -# cProfile.runctx("gbm.fit(X, 
y).predict(X)", globals(), locals(), "Profile.prof") -# s = pstats.Stats("Profile.prof") -# s.strip_dirs().sort_stats("time").print_stats(.2) +cProfile.runctx("gbm.fit(X, y)", globals(), locals(), "Profile.prof") +s = pstats.Stats("Profile.prof") +s.strip_dirs().sort_stats("time").print_stats(.2) # tic = time() # gbdt = GBDT(n_estimators=max_iter, diff --git a/sklearn/ensemble/gbm/_gradient_boosting.pyx b/sklearn/ensemble/gbm/_gradient_boosting.pyx index ec2b1de0e87e8..c1f432d7c8183 100644 --- a/sklearn/ensemble/gbm/_gradient_boosting.pyx +++ b/sklearn/ensemble/gbm/_gradient_boosting.pyx @@ -1,15 +1,19 @@ +# cython: profile=True +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False +# cython: language_level=3 cimport cython import numpy as np cimport numpy as np -ctypedef fused float_or_double: - float - double +ctypedef np.npy_float32 NPY_Y_DTYPE +ctypedef np.npy_uint8 NPY_X_BINNED_DTYPE -@cython.boundscheck(False) # Deactivate bounds checking -@cython.wraparound(False) # Deactivate negative indexing. -def _update_raw_predictions__(float [:] leaves_values, list samples_indices_at_leaf, float_or_double [:] raw_predictions): +def _update_raw_predictions(float [:] leaves_values, + list samples_indices_at_leaf, + NPY_Y_DTYPE [:] raw_predictions): """Update raw_predictions by reading the predictions of the ith tree directly form the leaves. @@ -26,10 +30,17 @@ def _update_raw_predictions__(float [:] leaves_values, list samples_indices_at_l """ cdef: int leaf_idx - unsigned int sample_idx + float val unsigned int [:] sample_indices for leaf_idx in range(leaves_values.shape[0]): samples_indices = samples_indices_at_leaf[leaf_idx] - for sample_idx in samples_indices: - raw_predictions[sample_idx] += leaves_values[leaf_idx] \ No newline at end of file + val = leaves_values[leaf_idx] + blop(samples_indices, raw_predictions, val) + +cdef void blop(unsigned int [:] samples_indices, NPY_Y_DTYPE [:] raw_predictions, float + val): + cdef: + unsigned int sample_idx + for sample_idx in samples_indices: + raw_predictions[sample_idx] += val \ No newline at end of file diff --git a/sklearn/ensemble/gbm/binning.pyx b/sklearn/ensemble/gbm/binning.pyx index 1dc81d67ea1af..8ace124a6ede6 100644 --- a/sklearn/ensemble/gbm/binning.pyx +++ b/sklearn/ensemble/gbm/binning.pyx @@ -106,8 +106,8 @@ cpdef void _map_num_col_to_bins(NPY_X_DTYPE [:] data, int right int middle - # for i in prange(data.shape[0], schedule='static'): - for i in range(data.shape[0]): + # for i in range(data.shape[0]): + for i in prange(data.shape[0], schedule='static'): left, right = 0, binning_thresholds.shape[0] while left < right: middle = (right + left - 1) // 2 diff --git a/sklearn/ensemble/gbm/gradient_boosting.py b/sklearn/ensemble/gbm/gradient_boosting.py index f86e6bc93ceae..952ead8753da7 100644 --- a/sklearn/ensemble/gbm/gradient_boosting.py +++ b/sklearn/ensemble/gbm/gradient_boosting.py @@ -12,7 +12,7 @@ from sklearn.metrics import check_scoring from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder -from ._gradient_boosting import _update_raw_predictions__ +from ._gradient_boosting import _update_raw_predictions from .types import Y_DTYPE, X_DTYPE from .binning import BinMapper @@ -241,11 +241,7 @@ def fit(self, X, y): leaves_values = [l.value for l in grower.finalized_leaves] samples_indices_in_leaves = [l.sample_indices for l in grower.finalized_leaves] leaves_values = np.array(leaves_values, dtype=np.float32) - _update_raw_predictions__(leaves_values, 
samples_indices_in_leaves, raw_predictions[:, k]) - # leaves_data = [(l.value, l.sample_indices) - # for l in grower.finalized_leaves] - # _update_raw_predictions(leaves_data, raw_predictions[:, k]) - + _update_raw_predictions(leaves_values, samples_indices_in_leaves, raw_predictions[:, k]) toc_pred = time() acc_prediction_time += toc_pred - tic_pred @@ -679,9 +675,3 @@ def _get_loss(self): return _LOSSES['categorical_crossentropy']() return _LOSSES[self.loss]() - -def _update_raw_predictions(leaves_data, raw_predictions): - for leaf_idx in range(len(leaves_data)): - leaf_value, sample_indices = leaves_data[leaf_idx] - for sample_idx in sample_indices: - raw_predictions[sample_idx] += leaf_value \ No newline at end of file diff --git a/sklearn/ensemble/setup.py b/sklearn/ensemble/setup.py index bc084917122ba..c6378c7c8da8e 100644 --- a/sklearn/ensemble/setup.py +++ b/sklearn/ensemble/setup.py @@ -22,7 +22,10 @@ def configuration(parent_package="", top_path=None): config.add_extension("gbm.binning", sources=["gbm/binning.pyx"], - include_dirs=[numpy.get_include()]) + include_dirs=[numpy.get_include()], + extra_compile_args=['-fopenmp'], + extra_link_args=['-fopenmp'], + ) config.add_extension("gbm.predictor", sources=["gbm/predictor.pyx"], From 0c79c117c3967bfe2b1ca37915ff5cd7cf911e8f Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 11 Jan 2019 09:50:08 -0500 Subject: [PATCH 017/247] Optimized _update_raw_predictions --- sklearn/ensemble/gbm/_gradient_boosting.pyx | 60 ++++++++++----------- sklearn/ensemble/gbm/gradient_boosting.py | 5 +- sklearn/ensemble/gbm/grower.py | 23 +++++++- sklearn/ensemble/gbm/splitting.pyx | 2 +- 4 files changed, 54 insertions(+), 36 deletions(-) diff --git a/sklearn/ensemble/gbm/_gradient_boosting.pyx b/sklearn/ensemble/gbm/_gradient_boosting.pyx index c1f432d7c8183..de012c6f45b87 100644 --- a/sklearn/ensemble/gbm/_gradient_boosting.pyx +++ b/sklearn/ensemble/gbm/_gradient_boosting.pyx @@ -8,39 +8,39 @@ cimport cython import numpy as np cimport numpy as np +from .types import Y_DTYPE + ctypedef np.npy_float32 NPY_Y_DTYPE -ctypedef np.npy_uint8 NPY_X_BINNED_DTYPE - -def _update_raw_predictions(float [:] leaves_values, - list samples_indices_at_leaf, - NPY_Y_DTYPE [:] raw_predictions): - """Update raw_predictions by reading the predictions of the ith tree - directly form the leaves. - - Can only be used for predicting the training data. raw_predictions - contains the sum of the tree values from iteration 0 to i - 1. This adds - the predictions of the ith tree to raw_predictions. - - Parameters - ---------- - leaves_data: list of tuples (leaf.value, leaf.sample_indices) - The leaves data used to update raw_predictions. - raw_predictions : array-like, shape=(n_samples,) - The raw predictions for the training data. 
- """ + +def _update_raw_predictions(NPY_Y_DTYPE [:] raw_predictions, grower): cdef: - int leaf_idx - float val - unsigned int [:] sample_indices + unsigned int [:] starts + unsigned int [:] stops + unsigned int [:] partition + NPY_Y_DTYPE [:] values + list leaves + + leaves = grower.finalized_leaves + starts = np.array([leaf.start for leaf in leaves], dtype=np.uint32) + stops = np.array([leaf.stop for leaf in leaves], dtype=np.uint32) + values = np.array([leaf.value for leaf in leaves], dtype=Y_DTYPE) + partition = grower.splitting_context.partition - for leaf_idx in range(leaves_values.shape[0]): - samples_indices = samples_indices_at_leaf[leaf_idx] - val = leaves_values[leaf_idx] - blop(samples_indices, raw_predictions, val) + _update_raw_predictions_helper(raw_predictions, starts, stops, partition, + values) + +cdef void _update_raw_predictions_helper( + NPY_Y_DTYPE [:] raw_predictions, + unsigned int [:] starts, + unsigned int [:] stops, + unsigned int [:] partition, + NPY_Y_DTYPE [:] values) nogil: -cdef void blop(unsigned int [:] samples_indices, NPY_Y_DTYPE [:] raw_predictions, float - val): cdef: unsigned int sample_idx - for sample_idx in samples_indices: - raw_predictions[sample_idx] += val \ No newline at end of file + unsigned int n_leaves + + n_leaves = starts.shape[0] + for leaf_idx in range(n_leaves): + for sample_idx in range(starts[leaf_idx], stops[leaf_idx]): + raw_predictions[sample_idx] += values[leaf_idx] diff --git a/sklearn/ensemble/gbm/gradient_boosting.py b/sklearn/ensemble/gbm/gradient_boosting.py index 952ead8753da7..d9b85ba3777a0 100644 --- a/sklearn/ensemble/gbm/gradient_boosting.py +++ b/sklearn/ensemble/gbm/gradient_boosting.py @@ -238,10 +238,7 @@ def fit(self, X, y): tic_pred = time() - leaves_values = [l.value for l in grower.finalized_leaves] - samples_indices_in_leaves = [l.sample_indices for l in grower.finalized_leaves] - leaves_values = np.array(leaves_values, dtype=np.float32) - _update_raw_predictions(leaves_values, samples_indices_in_leaves, raw_predictions[:, k]) + _update_raw_predictions(raw_predictions[:, k], grower) toc_pred = time() acc_prediction_time += toc_pred - tic_pred diff --git a/sklearn/ensemble/gbm/grower.py b/sklearn/ensemble/gbm/grower.py index 06723fe27f114..cf12219c33611 100644 --- a/sklearn/ensemble/gbm/grower.py +++ b/sklearn/ensemble/gbm/grower.py @@ -77,6 +77,16 @@ class TreeNode: apply_split_time = 0. hist_subtraction = False + # start and stop indices of the node in the splitting_context.partition + # array. Concretely, + # self.sample_indices = view(self.splitting_context.partition[start:stop]) + # Only used in _update_raw_prediction, because we need to iterate over the + # leaves and I don't know how to efficiently store the sample_indices views + # because they're all of different sizes. 
TODO: ask Olivier what he thinks + # about # this + start = 0 + stop = 0 + def __init__(self, depth, sample_indices, sum_gradients, sum_hessians, parent=None): self.depth = depth @@ -249,6 +259,10 @@ def _intilialize_root(self): sum_gradients=np.sum(self.splitting_context.gradients), sum_hessians=hessian ) + + self.root.start = 0 + self.root.stop = n_samples + if (self.max_leaf_nodes is not None and self.max_leaf_nodes == 1): self._finalize_leaf(self.root) return @@ -338,7 +352,7 @@ def split_next(self): node = heappop(self.splittable_nodes) tic = time() - (sample_indices_left, sample_indices_right) = split_indices( + (sample_indices_left, sample_indices_right, i) = split_indices( self.splitting_context, node.split_info, node.sample_indices) toc = time() node.apply_split_time = toc - tic @@ -362,6 +376,13 @@ def split_next(self): right_child_node.sibling = left_child_node node.right_child = right_child_node node.left_child = left_child_node + + # set start and stop indices + left_child_node.start = node.start + left_child_node.stop = node.start + i + right_child_node.start = left_child_node.stop + right_child_node.stop = node.stop + self.n_nodes += 2 if self.max_depth is not None and depth == self.max_depth: diff --git a/sklearn/ensemble/gbm/splitting.pyx b/sklearn/ensemble/gbm/splitting.pyx index 992e2b3316e1a..13f2953eaed0a 100644 --- a/sklearn/ensemble/gbm/splitting.pyx +++ b/sklearn/ensemble/gbm/splitting.pyx @@ -227,7 +227,7 @@ def split_indices(SplittingContext context, SplitInfo split_info, unsigned int [ i += 1 j -= 1 - return sample_indices[:i], sample_indices[i:] + return sample_indices[:i], sample_indices[i:], i def find_node_split(SplittingContext context, unsigned int [:] sample_indices): From dcfbe215796951234a6c87ff4480750bb85539b0 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 11 Jan 2019 10:14:05 -0500 Subject: [PATCH 018/247] Parallelized loss --- sklearn/ensemble/gbm/_gradient_boosting.pyx | 8 +++++--- sklearn/ensemble/gbm/loss.pyx | 11 ++++++++--- sklearn/ensemble/setup.py | 11 +++++++---- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/sklearn/ensemble/gbm/_gradient_boosting.pyx b/sklearn/ensemble/gbm/_gradient_boosting.pyx index de012c6f45b87..e45a7982e0e0e 100644 --- a/sklearn/ensemble/gbm/_gradient_boosting.pyx +++ b/sklearn/ensemble/gbm/_gradient_boosting.pyx @@ -4,6 +4,7 @@ # cython: wraparound=False # cython: language_level=3 cimport cython +from cython.parallel import prange import numpy as np cimport numpy as np @@ -37,10 +38,11 @@ cdef void _update_raw_predictions_helper( NPY_Y_DTYPE [:] values) nogil: cdef: - unsigned int sample_idx - unsigned int n_leaves + int sample_idx + int leaf_idx + int n_leaves n_leaves = starts.shape[0] - for leaf_idx in range(n_leaves): + for leaf_idx in prange(n_leaves): for sample_idx in range(starts[leaf_idx], stops[leaf_idx]): raw_predictions[sample_idx] += values[leaf_idx] diff --git a/sklearn/ensemble/gbm/loss.pyx b/sklearn/ensemble/gbm/loss.pyx index f4a448819c15c..eb6796d041aaf 100644 --- a/sklearn/ensemble/gbm/loss.pyx +++ b/sklearn/ensemble/gbm/loss.pyx @@ -2,6 +2,7 @@ # cython: cdivision=True # cython: boundscheck=False # cython: wraparound=False +# cython: language_level=3 """ This module contains the loss classes. @@ -11,6 +12,7 @@ classification. 
from abc import ABC, abstractmethod cimport cython +from cython.parallel import prange import numpy as np cimport numpy as np @@ -154,13 +156,16 @@ class LeastSquares(BaseLoss): raw_predictions) -def _update_gradients_least_squares(NPY_Y_DTYPE[:] gradients, NPY_Y_DTYPE[:] y_true, NPY_Y_DTYPE[:] raw_predictions): +cdef void _update_gradients_least_squares( + NPY_Y_DTYPE[:] gradients, + NPY_Y_DTYPE[:] y_true, + NPY_Y_DTYPE[:] raw_predictions) nogil: cdef: unsigned int n_samples - unsigned int i + int i n_samples = raw_predictions.shape[0] - for i in range(n_samples): + for i in prange(n_samples, schedule='static'): # Note: a more correct exp is 2 * (raw_predictions - y_true) but # since we use 1 for the constant hessian value (and not 2) this # is strictly equivalent for the leaves values. diff --git a/sklearn/ensemble/setup.py b/sklearn/ensemble/setup.py index c6378c7c8da8e..54245b69eee44 100644 --- a/sklearn/ensemble/setup.py +++ b/sklearn/ensemble/setup.py @@ -10,7 +10,9 @@ def configuration(parent_package="", top_path=None): config.add_extension("gbm._gradient_boosting", sources=["gbm/_gradient_boosting.pyx"], - include_dirs=[numpy.get_include()]) + include_dirs=[numpy.get_include()], + extra_compile_args=['-fopenmp'], + extra_link_args=['-fopenmp']) config.add_extension("gbm.histogram", sources=["gbm/histogram.pyx"], @@ -24,8 +26,7 @@ def configuration(parent_package="", top_path=None): sources=["gbm/binning.pyx"], include_dirs=[numpy.get_include()], extra_compile_args=['-fopenmp'], - extra_link_args=['-fopenmp'], - ) + extra_link_args=['-fopenmp']) config.add_extension("gbm.predictor", sources=["gbm/predictor.pyx"], @@ -33,7 +34,9 @@ def configuration(parent_package="", top_path=None): config.add_extension("gbm.loss", sources=["gbm/loss.pyx"], - include_dirs=[numpy.get_include()]) + include_dirs=[numpy.get_include()], + extra_compile_args=['-fopenmp'], + extra_link_args=['-fopenmp']) config.add_extension("gbm.playground", sources=["gbm/playground.pyx"], From b65b52f69e38ed6ae9cbf867d2a7776095993661 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 11 Jan 2019 15:35:58 -0500 Subject: [PATCH 019/247] checkpoint before refactoring splitter --- sklearn/ensemble/gbm/grower.py | 16 +++++++++------- sklearn/ensemble/gbm/splitting.pyx | 1 + 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/sklearn/ensemble/gbm/grower.py b/sklearn/ensemble/gbm/grower.py index cf12219c33611..9788ea8520234 100644 --- a/sklearn/ensemble/gbm/grower.py +++ b/sklearn/ensemble/gbm/grower.py @@ -306,13 +306,15 @@ def _compute_spittability(self, node, only_hist=False): node.hist_subtraction = True tic = time() - if node.hist_subtraction: - split_info, histograms = find_node_split_subtraction( - self.splitting_context, node.sample_indices, - node.parent.histograms, node.sibling.histograms) - else: - split_info, histograms = find_node_split( - self.splitting_context, node.sample_indices) + # if node.hist_subtraction: + # split_info, histograms = find_node_split_subtraction( + # self.splitting_context, node.sample_indices, + # node.parent.histograms, node.sibling.histograms) + # else: + # split_info, histograms = find_node_split( + # self.splitting_context, node.sample_indices) + split_info, histograms = find_node_split(self.splitting_context, + node.sample_indices) toc = time() node.find_split_time = toc - tic self.total_find_split_time += node.find_split_time diff --git a/sklearn/ensemble/gbm/splitting.pyx b/sklearn/ensemble/gbm/splitting.pyx index 13f2953eaed0a..643d5087f2c99 100644 --- 
a/sklearn/ensemble/gbm/splitting.pyx +++ b/sklearn/ensemble/gbm/splitting.pyx @@ -2,6 +2,7 @@ # cython: cdivision=True # cython: boundscheck=False # cython: wraparound=False +# cython: language_level=3 """This module contains njitted routines and data structures to: - Find the best possible split of a node. For a given node, a split is From 908f009e490deca045eb3675e043f43e4e14614a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 11 Jan 2019 16:46:34 -0500 Subject: [PATCH 020/247] historgams are now OUT variables in splitting --- sklearn/ensemble/gbm/grower.py | 24 ++++--- sklearn/ensemble/gbm/splitting.pyx | 105 ++++++++++++++--------------- 2 files changed, 65 insertions(+), 64 deletions(-) diff --git a/sklearn/ensemble/gbm/grower.py b/sklearn/ensemble/gbm/grower.py index 9788ea8520234..b62091f7034c8 100644 --- a/sklearn/ensemble/gbm/grower.py +++ b/sklearn/ensemble/gbm/grower.py @@ -9,9 +9,11 @@ from time import time from .splitting import (SplittingContext, split_indices, find_node_split, - find_node_split_subtraction) + find_node_split_subtraction, SplitInfo) from .predictor import TreePredictor, PREDICTOR_RECORD_DTYPE +from .types import HISTOGRAM_DTYPE + class TreeNode: """Tree Node class used in TreeGrower. @@ -192,6 +194,8 @@ def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, hessians, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split) self.max_leaf_nodes = max_leaf_nodes + self.max_bins = max_bins + self.n_features = X_binned.shape[1] self.max_depth = max_depth self.min_samples_leaf = min_samples_leaf self.X_binned = X_binned @@ -306,15 +310,15 @@ def _compute_spittability(self, node, only_hist=False): node.hist_subtraction = True tic = time() - # if node.hist_subtraction: - # split_info, histograms = find_node_split_subtraction( - # self.splitting_context, node.sample_indices, - # node.parent.histograms, node.sibling.histograms) - # else: - # split_info, histograms = find_node_split( - # self.splitting_context, node.sample_indices) - split_info, histograms = find_node_split(self.splitting_context, - node.sample_indices) + histograms = np.zeros(shape=(self.n_features, self.max_bins), + dtype=HISTOGRAM_DTYPE) + if node.hist_subtraction: + split_info = find_node_split_subtraction( + self.splitting_context, node.sample_indices, + node.parent.histograms, node.sibling.histograms, histograms) + else: + split_info = find_node_split( + self.splitting_context, node.sample_indices, histograms) toc = time() node.find_split_time = toc - tic self.total_find_split_time += node.find_split_time diff --git a/sklearn/ensemble/gbm/splitting.pyx b/sklearn/ensemble/gbm/splitting.pyx index 643d5087f2c99..d4e9f078894b4 100644 --- a/sklearn/ensemble/gbm/splitting.pyx +++ b/sklearn/ensemble/gbm/splitting.pyx @@ -51,6 +51,7 @@ cdef get_threads_chunks(unsigned int total_size): return starts, ends, n_threads @cython.freelist(100) +@cython.final cdef class SplitInfo: """Pure data class to store information about a potential split. @@ -102,6 +103,7 @@ cdef class SplitInfo: self.n_samples_right = n_samples_right +@cython.final cdef class SplittingContext: """Pure data class defining a splitting context. 
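# Minimal sketch (not part of the patch): the out-parameter pattern that this
# commit introduces. find_node_split() and the histogram builders now fill a
# caller-allocated structured array instead of returning a fresh one, so no
# Python-level allocation happens inside the Cython code path.
# build_histogram_out() below is a hypothetical pure-NumPy stand-in for the
# Cython builders; HISTOGRAM_DTYPE matches the definition in gbm/types.py.
import numpy as np

HISTOGRAM_DTYPE = np.dtype([
    ('sum_gradients', np.float32),
    ('sum_hessians', np.float32),
    ('count', np.uint32),
])

def build_histogram_out(binned_feature, gradients, hessians, out):
    # 'out' is pre-allocated with shape (n_bins,) and dtype HISTOGRAM_DTYPE;
    # it is filled in place and nothing is returned.
    for b, g, h in zip(binned_feature, gradients, hessians):
        out['sum_gradients'][b] += g
        out['sum_hessians'][b] += h
        out['count'][b] += 1

# Caller side, mirroring how the grower pre-allocates histograms after this
# change and passes one row per feature down to the split search:
n_features, max_bins = 2, 4
histograms = np.zeros(shape=(n_features, max_bins), dtype=HISTOGRAM_DTYPE)
binned_feature = np.array([0, 1, 1, 3], dtype=np.uint8)
gradients = np.array([0.5, -1.0, 2.0, 0.25], dtype=np.float32)
hessians = np.ones(4, dtype=np.float32)
build_histogram_out(binned_feature, gradients, hessians, histograms[0])
assert histograms[0]['count'].sum() == 4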
@@ -231,7 +233,8 @@ def split_indices(SplittingContext context, SplitInfo split_info, unsigned int [ return sample_indices[:i], sample_indices[i:], i -def find_node_split(SplittingContext context, unsigned int [:] sample_indices): +def find_node_split(SplittingContext context, unsigned int [:] + sample_indices, hist_struct [:, :] histograms): """For each feature, find the best bin to split on at a given node. Returns the best split info among all features, and the histograms of @@ -256,8 +259,6 @@ def find_node_split(SplittingContext context, unsigned int [:] sample_indices): """ cdef: unsigned int n_samples - hist_struct [:, :] view - hist_struct [:] histogram unsigned int feature_idx unsigned int i unsigned int thread_idx @@ -298,24 +299,22 @@ def find_node_split(SplittingContext context, unsigned int [:] sample_indices): split_infos = [SplitInfo(-1., 0, 0, 0., 0., 0., 0., 0, 0) for i in range(context.n_features)] - histograms = np.empty( - shape=(np.int64(context.n_features), np.int64(context.max_bins)), - dtype=HISTOGRAM_DTYPE - ) - view = histograms for feature_idx in range(context.n_features): - split_info, histogram = _find_histogram_split( - context, feature_idx, sample_indices) + split_info = _find_histogram_split( + context, feature_idx, sample_indices, histograms[feature_idx]) split_infos[feature_idx] = split_info - view[feature_idx, :] = histogram split_info = _find_best_feature_to_split_helper(split_infos) - return split_info, histograms + return split_info -def find_node_split_subtraction(SplittingContext context, unsigned int [:] - sample_indices, np.ndarray parent_histograms, - np.ndarray sibling_histograms): + +def find_node_split_subtraction( + SplittingContext context, + unsigned int [:] sample_indices, + hist_struct [:, :] parent_histograms, + hist_struct [:, :] sibling_histograms, + hist_struct [:, :] histograms): """For each feature, find the best bin to split on at a given node. Returns the best split info among all features, and the histograms of @@ -353,45 +352,45 @@ def find_node_split_subtraction(SplittingContext context, unsigned int [:] """ cdef: - hist_struct [:, :] view - hist_struct [:] histogram unsigned int feature_idx unsigned int n_samples SplitInfo split_info list split_infos + unsigned int i + + n_samples = sample_indices.shape[0] + # TODO: maybe change this computation... we could probably store sum_g/h in + # the SplitInfo for a speed gain + # Compute sum_hessians and sum_gradients. # We can pick any feature (here the first) in the histograms to # compute the gradients: they must be the same across all features # anyway, we have tests ensuring this. Maybe a more robust way would # be to compute an average but it's probably not worth it. 
- context.sum_gradients = (parent_histograms[0]['sum_gradients'].sum() - - sibling_histograms[0]['sum_gradients'].sum()) + context.sum_gradients = 0 + for i in range(context.max_bins): + context.sum_gradients += parent_histograms[0, i].sum_gradients - sibling_histograms[0, i].sum_gradients - n_samples = sample_indices.shape[0] if context.constant_hessian: context.sum_hessians = \ context.constant_hessian_value * float(n_samples) else: - context.sum_hessians = (parent_histograms[0]['sum_hessians'].sum() - - sibling_histograms[0]['sum_hessians'].sum()) + context.sum_hessians = 0 + for i in range(context.max_bins): + context.sum_hessians += parent_histograms[0, i].sum_hessians - sibling_histograms[0, i].sum_hessians # Pre-allocate the results datastructure to be able to use prange split_infos = [SplitInfo(-1., 0, 0, 0., 0., 0., 0., 0, 0) for i in range(context.n_features)] - histograms = np.empty( - shape=(np.int64(context.n_features), np.int64(context.max_bins)), - dtype=HISTOGRAM_DTYPE - ) - view = histograms for feature_idx in range(context.n_features): - split_info, histogram = _find_histogram_split_subtraction( - context, feature_idx, parent_histograms, - sibling_histograms, n_samples) + split_info = _find_histogram_split_subtraction( + context, feature_idx, parent_histograms[feature_idx], + sibling_histograms[feature_idx], histograms[feature_idx], + n_samples) split_infos[feature_idx] = split_info - view[feature_idx, :] = histogram split_info = _find_best_feature_to_split_helper(split_infos) - return split_info, histograms + return split_info cdef SplitInfo _find_best_feature_to_split_helper(list split_infos): @@ -412,7 +411,7 @@ cdef SplitInfo _find_best_feature_to_split_helper(list split_infos): cdef _find_histogram_split(SplittingContext context, unsigned int feature_idx, - unsigned int [:] sample_indices): + unsigned int [:] sample_indices, hist_struct [:] histogram): """Compute the histogram for a given feature Returns the best SplitInfo among all the possible bins of the feature. @@ -424,9 +423,6 @@ cdef _find_histogram_split(SplittingContext context, unsigned int feature_idx, unsigned int root_node = X_binned.shape[0] == n_samples float [:] ordered_gradients = context.ordered_gradients[:n_samples] float [:] ordered_hessians = context.ordered_hessians[:n_samples] - np.ndarray histogram - - histogram = np.zeros(context.max_bins, dtype=HISTOGRAM_DTYPE) if root_node: if context.constant_hessian: @@ -447,28 +443,31 @@ cdef _find_histogram_split(SplittingContext context, unsigned int feature_idx, return _find_best_bin_to_split_helper(context, feature_idx, histogram, n_samples) -cdef _find_histogram_split_subtraction(SplittingContext context, unsigned int feature_idx, - np.ndarray parent_histograms, np.ndarray sibling_histograms, - unsigned int n_samples): +cdef _find_histogram_split_subtraction( + SplittingContext context, + unsigned int feature_idx, + hist_struct [:] parent_histogram, + hist_struct [:] sibling_histogram, + hist_struct [:] histogram, + unsigned int n_samples): """Compute the histogram by substraction of parent and sibling Uses the identity: hist(parent) = hist(left) + hist(right). Returns the best SplitInfo among all the possible bins of the feature. 
""" - cdef: - np.ndarray histogram - histogram = np.zeros(context.max_bins, dtype=HISTOGRAM_DTYPE) - _subtract_histograms(context.max_bins, parent_histograms[feature_idx], - sibling_histograms[feature_idx], histogram) + _subtract_histograms(context.max_bins, parent_histogram, + sibling_histogram, histogram) return _find_best_bin_to_split_helper(context, feature_idx, histogram, n_samples) -cdef _find_best_bin_to_split_helper(SplittingContext context, unsigned int feature_idx, - hist_struct [:] histogram, unsigned int - n_samples): +cdef _find_best_bin_to_split_helper( + SplittingContext context, + unsigned int feature_idx, + hist_struct [:] histogram, + unsigned int n_samples): """Find best bin to split on, and return the corresponding SplitInfo. Splits that do not satisfy the splitting constraints (min_gain_to_split, @@ -488,24 +487,22 @@ cdef _find_best_bin_to_split_helper(SplittingContext context, unsigned int featu float gain SplitInfo best_split - hist_struct [:] view = histogram - best_split = SplitInfo.__new__(SplitInfo) gradient_left, hessian_left = 0., 0. n_samples_left = 0 for bin_idx in range(context.n_bins_per_feature[feature_idx]): - n_samples_left += view[bin_idx].count + n_samples_left += histogram[bin_idx].count n_samples_right = n_samples_ - n_samples_left if context.constant_hessian: - hessian_left += ( view[bin_idx].count + hessian_left += ( histogram[bin_idx].count * context.constant_hessian_value) else: - hessian_left += view[bin_idx].sum_hessians + hessian_left += histogram[bin_idx].sum_hessians hessian_right = context.sum_hessians - hessian_left - gradient_left += view[bin_idx].sum_gradients + gradient_left += histogram[bin_idx].sum_gradients gradient_right = context.sum_gradients - gradient_left if n_samples_left < context.min_samples_leaf: @@ -549,7 +546,7 @@ cdef _find_best_bin_to_split_helper(SplittingContext context, unsigned int featu ) """ - return best_split, histogram + return best_split cdef inline float _split_gain(float gradient_left, float hessian_left, float gradient_right, From 7a23c5ad490a9809f9fb5c3224545b2e848dc5bd Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 11 Jan 2019 18:53:14 -0500 Subject: [PATCH 021/247] Moved everything into sklearn/gbm and started removing gil from splitting --- gdb_test.py | 6 +- sklearn/__init__.py | 1 + sklearn/ensemble/__init__.py | 5 +- sklearn/ensemble/gbm/fun.py | 5 - sklearn/ensemble/gbm/playground.pyx | 8 -- sklearn/ensemble/setup.py | 68 +++++----- sklearn/gbm/__init__.py | 4 + .../{ensemble => }/gbm/_gradient_boosting.pyx | 0 sklearn/{ensemble => }/gbm/binning.pyx | 0 sklearn/gbm/fun.py | 3 + .../{ensemble => }/gbm/gradient_boosting.py | 0 sklearn/{ensemble => }/gbm/grower.py | 0 sklearn/gbm/histogram.pxd | 44 +++++++ sklearn/{ensemble => }/gbm/histogram.pyx | 10 -- sklearn/{ensemble => }/gbm/loss.pyx | 0 sklearn/gbm/playground.pyx | 15 +++ sklearn/{ensemble => }/gbm/predictor.pyx | 0 sklearn/gbm/setup.py | 50 ++++++++ sklearn/{ensemble => }/gbm/splitting.pyx | 120 +++++++++++------- .../{ensemble => }/gbm/tests/test_binning.py | 8 +- .../gbm/tests/test_compare_lightgbm.py | 6 +- .../gbm/tests/test_gradient_boosting.py | 6 +- .../{ensemble => }/gbm/tests/test_grower.py | 4 +- .../gbm/tests/test_histogram.py | 14 +- sklearn/{ensemble => }/gbm/tests/test_loss.py | 2 +- .../gbm/tests/test_predictor.py | 4 +- sklearn/{ensemble => }/gbm/types.py | 0 sklearn/{ensemble => }/gbm/utils.py | 0 sklearn/setup.py | 1 + 29 files changed, 254 insertions(+), 130 deletions(-) delete mode 100644 sklearn/ensemble/gbm/fun.py 
delete mode 100644 sklearn/ensemble/gbm/playground.pyx create mode 100644 sklearn/gbm/__init__.py rename sklearn/{ensemble => }/gbm/_gradient_boosting.pyx (100%) rename sklearn/{ensemble => }/gbm/binning.pyx (100%) create mode 100644 sklearn/gbm/fun.py rename sklearn/{ensemble => }/gbm/gradient_boosting.py (100%) rename sklearn/{ensemble => }/gbm/grower.py (100%) create mode 100644 sklearn/gbm/histogram.pxd rename sklearn/{ensemble => }/gbm/histogram.pyx (97%) rename sklearn/{ensemble => }/gbm/loss.pyx (100%) create mode 100644 sklearn/gbm/playground.pyx rename sklearn/{ensemble => }/gbm/predictor.pyx (100%) create mode 100644 sklearn/gbm/setup.py rename sklearn/{ensemble => }/gbm/splitting.pyx (87%) rename sklearn/{ensemble => }/gbm/tests/test_binning.py (97%) rename sklearn/{ensemble => }/gbm/tests/test_compare_lightgbm.py (98%) rename sklearn/{ensemble => }/gbm/tests/test_gradient_boosting.py (98%) rename sklearn/{ensemble => }/gbm/tests/test_grower.py (99%) rename sklearn/{ensemble => }/gbm/tests/test_histogram.py (94%) rename sklearn/{ensemble => }/gbm/tests/test_loss.py (99%) rename sklearn/{ensemble => }/gbm/tests/test_predictor.py (92%) rename sklearn/{ensemble => }/gbm/types.py (100%) rename sklearn/{ensemble => }/gbm/utils.py (100%) diff --git a/gdb_test.py b/gdb_test.py index d4fde1104370a..23c2d75baa95f 100644 --- a/gdb_test.py +++ b/gdb_test.py @@ -3,8 +3,8 @@ from sklearn.datasets import make_regression, make_classification from sklearn.ensemble import GradientBoostingRegressor from sklearn.ensemble import GradientBoostingClassifier -from sklearn.ensemble import GBMRegressor -from sklearn.ensemble import GBMClassifier +from sklearn.gbm import GBMRegressor +from sklearn.gbm import GBMClassifier import pstats import cProfile @@ -12,7 +12,7 @@ classif = False n_classes = 3 -n_samples = int(1e6) +n_samples = int(1e4) max_iter = 5 if classif: diff --git a/sklearn/__init__.py b/sklearn/__init__.py index aafc8a34b2a13..da851e6483f72 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -75,6 +75,7 @@ 'naive_bayes', 'neighbors', 'neural_network', 'pipeline', 'preprocessing', 'random_projection', 'semi_supervised', 'svm', 'tree', 'discriminant_analysis', 'impute', 'compose', + 'gbm', # Non-modules: 'clone', 'get_config', 'set_config', 'config_context', 'show_versions'] diff --git a/sklearn/ensemble/__init__.py b/sklearn/ensemble/__init__.py index c1760ae39a763..5586a9e1e1fba 100644 --- a/sklearn/ensemble/__init__.py +++ b/sklearn/ensemble/__init__.py @@ -17,8 +17,6 @@ from .gradient_boosting import GradientBoostingClassifier from .gradient_boosting import GradientBoostingRegressor from .voting_classifier import VotingClassifier -from .gbm.gradient_boosting import GradientBoostingClassifier as GBMClassifier -from .gbm.gradient_boosting import GradientBoostingRegressor as GBMRegressor from . import bagging from . 
import forest @@ -34,5 +32,4 @@ "GradientBoostingRegressor", "AdaBoostClassifier", "AdaBoostRegressor", "VotingClassifier", "bagging", "forest", "gradient_boosting", - "partial_dependence", "weight_boosting", - "GBMClassifier", "GBMRegressor"] + "partial_dependence", "weight_boosting"] diff --git a/sklearn/ensemble/gbm/fun.py b/sklearn/ensemble/gbm/fun.py deleted file mode 100644 index e84dcc71d639a..0000000000000 --- a/sklearn/ensemble/gbm/fun.py +++ /dev/null @@ -1,5 +0,0 @@ -from playground import g - -a = g() -print(a) -print(a.dtype) \ No newline at end of file diff --git a/sklearn/ensemble/gbm/playground.pyx b/sklearn/ensemble/gbm/playground.pyx deleted file mode 100644 index b40b37d35bbd9..0000000000000 --- a/sklearn/ensemble/gbm/playground.pyx +++ /dev/null @@ -1,8 +0,0 @@ -cimport cython - -cdef class Shrubbery: - cdef int width, height - - def __init__(self, int w, int h): - self.width = w - self.height = h \ No newline at end of file diff --git a/sklearn/ensemble/setup.py b/sklearn/ensemble/setup.py index 54245b69eee44..a7cf5789fe608 100644 --- a/sklearn/ensemble/setup.py +++ b/sklearn/ensemble/setup.py @@ -8,42 +8,42 @@ def configuration(parent_package="", top_path=None): sources=["_gradient_boosting.pyx"], include_dirs=[numpy.get_include()]) - config.add_extension("gbm._gradient_boosting", - sources=["gbm/_gradient_boosting.pyx"], - include_dirs=[numpy.get_include()], - extra_compile_args=['-fopenmp'], - extra_link_args=['-fopenmp']) - - config.add_extension("gbm.histogram", - sources=["gbm/histogram.pyx"], - include_dirs=[numpy.get_include()]) - - config.add_extension("gbm.splitting", - sources=["gbm/splitting.pyx"], - include_dirs=[numpy.get_include()]) - - config.add_extension("gbm.binning", - sources=["gbm/binning.pyx"], - include_dirs=[numpy.get_include()], - extra_compile_args=['-fopenmp'], - extra_link_args=['-fopenmp']) - - config.add_extension("gbm.predictor", - sources=["gbm/predictor.pyx"], - include_dirs=[numpy.get_include()]) - - config.add_extension("gbm.loss", - sources=["gbm/loss.pyx"], - include_dirs=[numpy.get_include()], - extra_compile_args=['-fopenmp'], - extra_link_args=['-fopenmp']) - - config.add_extension("gbm.playground", - sources=["gbm/playground.pyx"], - include_dirs=[numpy.get_include()]) + # config.add_extension("gbm._gradient_boosting", + # sources=["gbm/_gradient_boosting.pyx"], + # include_dirs=[numpy.get_include()], + # extra_compile_args=['-fopenmp'], + # extra_link_args=['-fopenmp']) + + # config.add_extension("gbm.histogram", + # sources=["gbm/histogram.pyx"], + # include_dirs=[numpy.get_include()]) + + # config.add_extension("gbm.splitting", + # sources=["gbm/splitting.pyx"], + # include_dirs=[numpy.get_include()]) + + # config.add_extension("gbm.binning", + # sources=["gbm/binning.pyx"], + # include_dirs=[numpy.get_include()], + # extra_compile_args=['-fopenmp'], + # extra_link_args=['-fopenmp']) + + # config.add_extension("gbm.predictor", + # sources=["gbm/predictor.pyx"], + # include_dirs=[numpy.get_include()]) + + # config.add_extension("gbm.loss", + # sources=["gbm/loss.pyx"], + # include_dirs=[numpy.get_include()], + # extra_compile_args=['-fopenmp'], + # extra_link_args=['-fopenmp']) + + # config.add_extension("gbm.playground", + # sources=["gbm/playground.pyx"], + # include_dirs=[numpy.get_include()]) config.add_subpackage("tests") - config.add_data_files("gbm/slitting.pxd") + # config.add_data_files("gbm/histogram.pxd") return config diff --git a/sklearn/gbm/__init__.py b/sklearn/gbm/__init__.py new file mode 100644 index 
0000000000000..d50ebe248451f --- /dev/null +++ b/sklearn/gbm/__init__.py @@ -0,0 +1,4 @@ +from .gradient_boosting import GradientBoostingClassifier as GBMClassifier +from .gradient_boosting import GradientBoostingRegressor as GBMRegressor + +__all__ = ["GBMClassifier", "GBMRegressor"] \ No newline at end of file diff --git a/sklearn/ensemble/gbm/_gradient_boosting.pyx b/sklearn/gbm/_gradient_boosting.pyx similarity index 100% rename from sklearn/ensemble/gbm/_gradient_boosting.pyx rename to sklearn/gbm/_gradient_boosting.pyx diff --git a/sklearn/ensemble/gbm/binning.pyx b/sklearn/gbm/binning.pyx similarity index 100% rename from sklearn/ensemble/gbm/binning.pyx rename to sklearn/gbm/binning.pyx diff --git a/sklearn/gbm/fun.py b/sklearn/gbm/fun.py new file mode 100644 index 0000000000000..f4c5a5293a8fc --- /dev/null +++ b/sklearn/gbm/fun.py @@ -0,0 +1,3 @@ +from playground import hello + +print(hello()) diff --git a/sklearn/ensemble/gbm/gradient_boosting.py b/sklearn/gbm/gradient_boosting.py similarity index 100% rename from sklearn/ensemble/gbm/gradient_boosting.py rename to sklearn/gbm/gradient_boosting.py diff --git a/sklearn/ensemble/gbm/grower.py b/sklearn/gbm/grower.py similarity index 100% rename from sklearn/ensemble/gbm/grower.py rename to sklearn/gbm/grower.py diff --git a/sklearn/gbm/histogram.pxd b/sklearn/gbm/histogram.pxd new file mode 100644 index 0000000000000..ccc3532757f5f --- /dev/null +++ b/sklearn/gbm/histogram.pxd @@ -0,0 +1,44 @@ +import numpy as np +cimport numpy as np + +from .types import HISTOGRAM_DTYPE + +ctypedef np.npy_uint8 NPY_X_BINNED_DTYPE +ctypedef np.npy_float32 NPY_Y_DTYPE + +cdef packed struct hist_struct: + float sum_gradients + float sum_hessians + unsigned int count + +cpdef void _subtract_histograms(unsigned int n_bins, + hist_struct [:] hist_a, + hist_struct [:] hist_b, + hist_struct [:] out) nogil + +cpdef void _build_histogram(unsigned int n_bins, + unsigned int [:] sample_indices, + NPY_X_BINNED_DTYPE [:] binned_feature, + NPY_Y_DTYPE [:] ordered_gradients, + NPY_Y_DTYPE [:] ordered_hessians, + hist_struct [:] out) nogil + +cpdef void _build_histogram_no_hessian( + unsigned int n_bins, + unsigned int [:] sample_indices, + NPY_X_BINNED_DTYPE [:] binned_feature, + NPY_Y_DTYPE [:] ordered_gradients, + hist_struct [:] out) nogil + +cpdef void _build_histogram_root_no_hessian( + unsigned int n_bins, + NPY_X_BINNED_DTYPE [:] binned_feature, + NPY_Y_DTYPE [:] all_gradients, + hist_struct [:] out) nogil + +cpdef void _build_histogram_root( + unsigned int n_bins, + NPY_X_BINNED_DTYPE [:] binned_feature, + NPY_Y_DTYPE [:] all_gradients, + NPY_Y_DTYPE [:] all_hessians, + hist_struct [:] out) nogil \ No newline at end of file diff --git a/sklearn/ensemble/gbm/histogram.pyx b/sklearn/gbm/histogram.pyx similarity index 97% rename from sklearn/ensemble/gbm/histogram.pyx rename to sklearn/gbm/histogram.pyx index ce180dd6206bf..dea4c9bdf803b 100644 --- a/sklearn/ensemble/gbm/histogram.pyx +++ b/sklearn/gbm/histogram.pyx @@ -15,16 +15,6 @@ cimport numpy as np from .types import HISTOGRAM_DTYPE - -ctypedef np.npy_uint8 NPY_X_BINNED_DTYPE -ctypedef np.npy_float32 NPY_Y_DTYPE - -cdef packed struct hist_struct: - float sum_gradients - float sum_hessians - unsigned int count - - cpdef void _build_histogram_naive(unsigned int n_bins, unsigned int [:] sample_indices, NPY_X_BINNED_DTYPE [:] binned_feature, diff --git a/sklearn/ensemble/gbm/loss.pyx b/sklearn/gbm/loss.pyx similarity index 100% rename from sklearn/ensemble/gbm/loss.pyx rename to sklearn/gbm/loss.pyx diff 
--git a/sklearn/gbm/playground.pyx b/sklearn/gbm/playground.pyx new file mode 100644 index 0000000000000..bb8e9024dd0ad --- /dev/null +++ b/sklearn/gbm/playground.pyx @@ -0,0 +1,15 @@ +cimport cython + +cdef class MyClass: + cdef int width, height + + def __init__(self, int w, int h): + self.width = w + self.height = h + +def hello(): + o = MyClass(9, 5) + return zob(o) + +cdef int zob (MyClass o) nogil: + return o.width \ No newline at end of file diff --git a/sklearn/ensemble/gbm/predictor.pyx b/sklearn/gbm/predictor.pyx similarity index 100% rename from sklearn/ensemble/gbm/predictor.pyx rename to sklearn/gbm/predictor.pyx diff --git a/sklearn/gbm/setup.py b/sklearn/gbm/setup.py new file mode 100644 index 0000000000000..e6b03d58a572a --- /dev/null +++ b/sklearn/gbm/setup.py @@ -0,0 +1,50 @@ +import numpy +from numpy.distutils.misc_util import Configuration + + +def configuration(parent_package="", top_path=None): + config = Configuration("gbm", parent_package, top_path) + + config.add_extension("_gradient_boosting", + sources=["_gradient_boosting.pyx"], + include_dirs=[numpy.get_include()], + extra_compile_args=['-fopenmp'], + extra_link_args=['-fopenmp']) + + config.add_extension("histogram", + sources=["histogram.pyx"], + include_dirs=[numpy.get_include()]) + + config.add_extension("splitting", + sources=["splitting.pyx"], + include_dirs=[numpy.get_include()]) + + config.add_extension("binning", + sources=["binning.pyx"], + include_dirs=[numpy.get_include()], + extra_compile_args=['-fopenmp'], + extra_link_args=['-fopenmp']) + + config.add_extension("predictor", + sources=["predictor.pyx"], + include_dirs=[numpy.get_include()]) + + config.add_extension("loss", + sources=["loss.pyx"], + include_dirs=[numpy.get_include()], + extra_compile_args=['-fopenmp'], + extra_link_args=['-fopenmp']) + + config.add_extension("playground", + sources=["playground.pyx"], + include_dirs=[numpy.get_include()]) + + config.add_subpackage("tests") + # config.add_data_files("histogram.pxd") + + return config + +if __name__ == "__main__": + from numpy.distutils.core import setup + setup(**configuration().todict()) + diff --git a/sklearn/ensemble/gbm/splitting.pyx b/sklearn/gbm/splitting.pyx similarity index 87% rename from sklearn/ensemble/gbm/splitting.pyx rename to sklearn/gbm/splitting.pyx index d4e9f078894b4..075d6b8a8c121 100644 --- a/sklearn/ensemble/gbm/splitting.pyx +++ b/sklearn/gbm/splitting.pyx @@ -15,12 +15,15 @@ cimport cython import numpy as np cimport numpy as np -from .histogram import _build_histogram -from .histogram import _subtract_histograms -from .histogram import _build_histogram_no_hessian -from .histogram import _build_histogram_root -from .histogram import _build_histogram_root_no_hessian -from .histogram import HISTOGRAM_DTYPE +from .histogram cimport _build_histogram +from .histogram cimport _build_histogram_no_hessian +from .histogram cimport _build_histogram_root +from .histogram cimport _build_histogram_root_no_hessian +from .histogram cimport _subtract_histograms +from .histogram cimport NPY_X_BINNED_DTYPE +from .histogram cimport NPY_Y_DTYPE + +from .types import HISTOGRAM_DTYPE cdef struct hist_struct: float sum_gradients @@ -50,6 +53,17 @@ cdef get_threads_chunks(unsigned int total_size): return starts, ends, n_threads +cdef struct split_info_struct: + float gain + unsigned int feature_idx + unsigned int bin_idx + float gradient_left + float gradient_right + float hessian_left + float hessian_right + unsigned int n_samples_left + unsigned int n_samples_right + 
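A side note on the split_info_struct hunk just above: the plain C struct duplicates the fields of the Python-level SplitInfo extension type so that the per-feature split search can run without the GIL; only the winning record is converted back into a SplitInfo object at the Python boundary, as the later hunks of this patch do. A rough pure-Python sketch of that pattern follows; SplitRecord and pick_best are illustrative names, not part of the patch.

from dataclasses import dataclass

@dataclass
class SplitRecord:
    # plain data record, analogous to the C split_info_struct: no Python
    # object creation is needed while scanning features
    gain: float
    feature_idx: int
    bin_idx: int

def pick_best(records):
    # scan the per-feature records and keep the one with the highest gain;
    # only this winner would be wrapped into the public SplitInfo object
    return max(records, key=lambda r: r.gain)

records = [SplitRecord(0.5, 0, 3), SplitRecord(1.2, 1, 7), SplitRecord(-1.0, 2, 0)]
print(pick_best(records))  # SplitRecord(gain=1.2, feature_idx=1, bin_idx=7)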
@cython.freelist(100) @cython.final cdef class SplitInfo: @@ -141,14 +155,14 @@ cdef class SplittingContext: be ignored. """ cdef public: - unsigned char [:, :] X_binned + NPY_X_BINNED_DTYPE [:, :] X_binned unsigned int n_features unsigned int max_bins unsigned int [:] n_bins_per_feature - float [:] gradients - float [:] hessians - float [:] ordered_gradients - float [:] ordered_hessians + NPY_Y_DTYPE [:] gradients + NPY_Y_DTYPE [:] hessians + NPY_Y_DTYPE [:] ordered_gradients + NPY_Y_DTYPE [:] ordered_hessians float sum_gradients float sum_hessians unsigned char constant_hessian @@ -162,9 +176,9 @@ cdef class SplittingContext: unsigned int [:] left_indices_buffer unsigned int [:] right_indices_buffer - def __cinit__(self, np.ndarray[np.uint8_t, ndim=2] X_binned, unsigned int max_bins, + def __cinit__(self, NPY_X_BINNED_DTYPE [:, :] X_binned, unsigned int max_bins, np.ndarray[np.uint32_t] n_bins_per_feature, - np.ndarray [np.float32_t] gradients, np.ndarray[np.float32_t] hessians, float l2_regularization, + NPY_Y_DTYPE [:] gradients, NPY_Y_DTYPE [:] hessians, float l2_regularization, float min_hessian_to_split=1e-3, unsigned int min_samples_leaf=20, float min_gain_to_split=0.): @@ -179,8 +193,8 @@ cdef class SplittingContext: # for root node, gradients and hessians are already ordered self.ordered_gradients = gradients.copy() self.ordered_hessians = hessians.copy() - self.sum_gradients = gradients.sum() - self.sum_hessians = hessians.sum() + self.sum_gradients = np.sum(gradients) + self.sum_hessians = np.sum(hessians) self.constant_hessian = hessians.shape[0] == 1 self.l2_regularization = l2_regularization self.min_hessian_to_split = min_hessian_to_split @@ -213,7 +227,7 @@ def split_indices(SplittingContext context, SplitInfo split_info, unsigned int [ unsigned int j = n_samples - 1 unsigned char pivot = split_info.bin_idx unsigned int [:] view = sample_indices - unsigned char [:] binned_feature = context.X_binned.T[split_info.feature_idx] + NPY_X_BINNED_DTYPE [:] binned_feature = context.X_binned.T[split_info.feature_idx] while i != j: # continue until we find an element that should be on right @@ -266,7 +280,7 @@ def find_node_split(SplittingContext context, unsigned int [:] unsigned int [:] starts unsigned int [:] ends unsigned int n_threads - SplitInfo split_info + split_info_struct split_info list split_infos ctx = context # shorter name to avoid various line breaks @@ -297,16 +311,25 @@ def find_node_split(SplittingContext context, unsigned int [:] # ctx.sum_hessians = ctx.ordered_hessians[:n_samples].sum() ctx.sum_hessians = np.sum(ctx.ordered_hessians[:n_samples]) - split_infos = [SplitInfo(-1., 0, 0, 0., 0., 0., 0., 0, 0) - for i in range(context.n_features)] + split_infos = [] for feature_idx in range(context.n_features): split_info = _find_histogram_split( context, feature_idx, sample_indices, histograms[feature_idx]) - split_infos[feature_idx] = split_info + split_infos.append(split_info) split_info = _find_best_feature_to_split_helper(split_infos) - return split_info + return SplitInfo( + split_info.gain, + split_info.feature_idx, + split_info.bin_idx, + split_info.gradient_left, + split_info.hessian_left, + split_info.gradient_right, + split_info.hessian_right, + split_info.n_samples_left, + split_info.n_samples_right, + ) def find_node_split_subtraction( @@ -354,7 +377,7 @@ def find_node_split_subtraction( cdef: unsigned int feature_idx unsigned int n_samples - SplitInfo split_info + split_info_struct split_info list split_infos unsigned int i @@ -379,26 +402,34 @@ def 
find_node_split_subtraction( for i in range(context.max_bins): context.sum_hessians += parent_histograms[0, i].sum_hessians - sibling_histograms[0, i].sum_hessians - # Pre-allocate the results datastructure to be able to use prange - split_infos = [SplitInfo(-1., 0, 0, 0., 0., 0., 0., 0, 0) - for i in range(context.n_features)] + split_infos = [] for feature_idx in range(context.n_features): split_info = _find_histogram_split_subtraction( context, feature_idx, parent_histograms[feature_idx], sibling_histograms[feature_idx], histograms[feature_idx], n_samples) - split_infos[feature_idx] = split_info + split_infos.append(split_info) split_info = _find_best_feature_to_split_helper(split_infos) - return split_info - - -cdef SplitInfo _find_best_feature_to_split_helper(list split_infos): + return SplitInfo( + split_info.gain, + split_info.feature_idx, + split_info.bin_idx, + split_info.gradient_left, + split_info.hessian_left, + split_info.gradient_right, + split_info.hessian_right, + split_info.n_samples_left, + split_info.n_samples_right, + ) + + +cdef split_info_struct _find_best_feature_to_split_helper(list split_infos): cdef: float gain float best_gain - SplitInfo split_info - SplitInfo best_split_info + split_info_struct split_info + split_info_struct best_split_info unsigned int i best_gain = -1. @@ -410,8 +441,9 @@ cdef SplitInfo _find_best_feature_to_split_helper(list split_infos): return best_split_info -cdef _find_histogram_split(SplittingContext context, unsigned int feature_idx, - unsigned int [:] sample_indices, hist_struct [:] histogram): +cdef split_info_struct _find_histogram_split(SplittingContext context, unsigned int feature_idx, + unsigned int [:] sample_indices, hist_struct [:] + histogram) nogil: """Compute the histogram for a given feature Returns the best SplitInfo among all the possible bins of the feature. @@ -419,10 +451,10 @@ cdef _find_histogram_split(SplittingContext context, unsigned int feature_idx, cdef: unsigned int n_samples = sample_indices.shape[0] - unsigned char [:] X_binned = context.X_binned.T[feature_idx] + NPY_X_BINNED_DTYPE [:] X_binned = context.X_binned.T[feature_idx] unsigned int root_node = X_binned.shape[0] == n_samples - float [:] ordered_gradients = context.ordered_gradients[:n_samples] - float [:] ordered_hessians = context.ordered_hessians[:n_samples] + NPY_Y_DTYPE [:] ordered_gradients = context.ordered_gradients[:n_samples] + NPY_Y_DTYPE [:] ordered_hessians = context.ordered_hessians[:n_samples] if root_node: if context.constant_hessian: @@ -443,13 +475,13 @@ cdef _find_histogram_split(SplittingContext context, unsigned int feature_idx, return _find_best_bin_to_split_helper(context, feature_idx, histogram, n_samples) -cdef _find_histogram_split_subtraction( +cdef split_info_struct _find_histogram_split_subtraction( SplittingContext context, unsigned int feature_idx, hist_struct [:] parent_histogram, hist_struct [:] sibling_histogram, hist_struct [:] histogram, - unsigned int n_samples): + unsigned int n_samples) nogil: """Compute the histogram by substraction of parent and sibling Uses the identity: hist(parent) = hist(left) + hist(right). 
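To make the identity above concrete: since every sample of the parent node lands in exactly one child, a child's histogram can be obtained by subtracting the sibling's histogram from the parent's instead of rebuilding it from the samples. A small NumPy sketch of that subtraction; the dtype and helper below are illustrative stand-ins, not the Cython code from the patch.

import numpy as np

HIST_DTYPE = np.dtype([('sum_gradients', np.float32),
                       ('sum_hessians', np.float32),
                       ('count', np.uint32)])

def subtract_histograms(parent, sibling):
    # hist(child) = hist(parent) - hist(sibling), field by field
    out = np.empty_like(parent)
    for field in ('sum_gradients', 'sum_hessians', 'count'):
        out[field] = parent[field] - sibling[field]
    return out

parent = np.zeros(4, dtype=HIST_DTYPE)
sibling = np.zeros(4, dtype=HIST_DTYPE)
parent['count'] = [10, 5, 3, 2]
sibling['count'] = [4, 1, 3, 0]
print(subtract_histograms(parent, sibling)['count'])  # [6 4 0 2]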
@@ -457,17 +489,17 @@ cdef _find_histogram_split_subtraction( """ _subtract_histograms(context.max_bins, parent_histogram, - sibling_histogram, histogram) + sibling_histogram, histogram) return _find_best_bin_to_split_helper(context, feature_idx, histogram, n_samples) -cdef _find_best_bin_to_split_helper( +cdef split_info_struct _find_best_bin_to_split_helper( SplittingContext context, unsigned int feature_idx, hist_struct [:] histogram, - unsigned int n_samples): + unsigned int n_samples) nogil: """Find best bin to split on, and return the corresponding SplitInfo. Splits that do not satisfy the splitting constraints (min_gain_to_split, @@ -485,9 +517,9 @@ cdef _find_best_bin_to_split_helper( float gradient_left float gradient_right float gain - SplitInfo best_split + split_info_struct best_split - best_split = SplitInfo.__new__(SplitInfo) + best_split.gain = -1. gradient_left, hessian_left = 0., 0. n_samples_left = 0 diff --git a/sklearn/ensemble/gbm/tests/test_binning.py b/sklearn/gbm/tests/test_binning.py similarity index 97% rename from sklearn/ensemble/gbm/tests/test_binning.py rename to sklearn/gbm/tests/test_binning.py index 3a654af631a08..3da62073e2267 100644 --- a/sklearn/ensemble/gbm/tests/test_binning.py +++ b/sklearn/gbm/tests/test_binning.py @@ -2,10 +2,10 @@ from numpy.testing import assert_array_equal, assert_allclose import pytest -from sklearn.ensemble.gbm.binning import BinMapper -from sklearn.ensemble.gbm.binning import _find_binning_thresholds -from sklearn.ensemble.gbm.binning import _map_to_bins -from sklearn.ensemble.gbm.types import X_DTYPE, X_BINNED_DTYPE +from sklearn.gbm.binning import BinMapper +from sklearn.gbm.binning import _find_binning_thresholds +from sklearn.gbm.binning import _map_to_bins +from sklearn.gbm.types import X_DTYPE, X_BINNED_DTYPE diff --git a/sklearn/ensemble/gbm/tests/test_compare_lightgbm.py b/sklearn/gbm/tests/test_compare_lightgbm.py similarity index 98% rename from sklearn/ensemble/gbm/tests/test_compare_lightgbm.py rename to sklearn/gbm/tests/test_compare_lightgbm.py index cdd6778452e95..904cca72847c0 100644 --- a/sklearn/ensemble/gbm/tests/test_compare_lightgbm.py +++ b/sklearn/gbm/tests/test_compare_lightgbm.py @@ -4,9 +4,9 @@ import numpy as np import pytest -from sklearn.ensemble import GBMRegressor, GBMClassifier -from sklearn.ensemble.gbm.binning import BinMapper -from sklearn.ensemble.gbm.utils import get_lightgbm_estimator +from sklearn import GBMRegressor, GBMClassifier +from sklearn.gbm.binning import BinMapper +from sklearn.gbm.utils import get_lightgbm_estimator pytest.importorskip("lightgbm") diff --git a/sklearn/ensemble/gbm/tests/test_gradient_boosting.py b/sklearn/gbm/tests/test_gradient_boosting.py similarity index 98% rename from sklearn/ensemble/gbm/tests/test_gradient_boosting.py rename to sklearn/gbm/tests/test_gradient_boosting.py index 9a8d06f726eba..3e6a2f8346443 100644 --- a/sklearn/ensemble/gbm/tests/test_gradient_boosting.py +++ b/sklearn/gbm/tests/test_gradient_boosting.py @@ -7,9 +7,9 @@ from sklearn.utils.testing import assert_raises_regex from sklearn.datasets import make_classification, make_regression -from sklearn.ensemble import GBMClassifier -from sklearn.ensemble import GBMRegressor -from sklearn.ensemble.gbm.binning import BinMapper +from sklearn import GBMClassifier +from sklearn import GBMRegressor +from sklearn.gbm.binning import BinMapper X_classification, y_classification = make_classification(random_state=0) diff --git a/sklearn/ensemble/gbm/tests/test_grower.py 
b/sklearn/gbm/tests/test_grower.py similarity index 99% rename from sklearn/ensemble/gbm/tests/test_grower.py rename to sklearn/gbm/tests/test_grower.py index 4e865589ee28e..9232e2eb93b74 100644 --- a/sklearn/ensemble/gbm/tests/test_grower.py +++ b/sklearn/gbm/tests/test_grower.py @@ -4,8 +4,8 @@ from pytest import approx from sklearn.utils.testing import assert_raises_regex -from sklearn.ensemble.gbm.grower import TreeGrower -from sklearn.ensemble.gbm.binning import BinMapper +from sklearn.gbm.grower import TreeGrower +from sklearn.gbm.binning import BinMapper def _make_training_data(n_bins=256, constant_hessian=True): diff --git a/sklearn/ensemble/gbm/tests/test_histogram.py b/sklearn/gbm/tests/test_histogram.py similarity index 94% rename from sklearn/ensemble/gbm/tests/test_histogram.py rename to sklearn/gbm/tests/test_histogram.py index 9af3fe7257209..9860e3d9fbcfd 100644 --- a/sklearn/ensemble/gbm/tests/test_histogram.py +++ b/sklearn/gbm/tests/test_histogram.py @@ -4,13 +4,13 @@ from numpy.testing import assert_allclose from numpy.testing import assert_array_equal -from sklearn.ensemble.gbm.histogram import _build_histogram_naive -from sklearn.ensemble.gbm.histogram import _build_histogram -from sklearn.ensemble.gbm.histogram import _build_histogram_no_hessian -from sklearn.ensemble.gbm.histogram import _build_histogram_root_no_hessian -from sklearn.ensemble.gbm.histogram import _build_histogram_root -from sklearn.ensemble.gbm.histogram import _subtract_histograms -from sklearn.ensemble.gbm.types import HISTOGRAM_DTYPE +from sklearn.gbm.histogram import _build_histogram_naive +from sklearn.nsemble.gbm.histogram import _build_histogram +from sklearn.gbm.histogram import _build_histogram_no_hessian +from sklearn.gbm.histogram import _build_histogram_root_no_hessian +from sklearn.gbm.histogram import _build_histogram_root +from sklearn.gbm.histogram import _subtract_histograms +from sklearn.gbm.types import HISTOGRAM_DTYPE @pytest.mark.parametrize( diff --git a/sklearn/ensemble/gbm/tests/test_loss.py b/sklearn/gbm/tests/test_loss.py similarity index 99% rename from sklearn/ensemble/gbm/tests/test_loss.py rename to sklearn/gbm/tests/test_loss.py index 07c48f877d234..fe6d36bcca993 100644 --- a/sklearn/ensemble/gbm/tests/test_loss.py +++ b/sklearn/gbm/tests/test_loss.py @@ -5,7 +5,7 @@ from sklearn.utils import assert_all_finite import pytest -from sklearn.ensemble.gbm.loss import _LOSSES +from sklearn.gbm.loss import _LOSSES def get_derivatives_helper(loss): diff --git a/sklearn/ensemble/gbm/tests/test_predictor.py b/sklearn/gbm/tests/test_predictor.py similarity index 92% rename from sklearn/ensemble/gbm/tests/test_predictor.py rename to sklearn/gbm/tests/test_predictor.py index 35d57fd5f14a5..06fb0b0c35fa3 100644 --- a/sklearn/ensemble/gbm/tests/test_predictor.py +++ b/sklearn/gbm/tests/test_predictor.py @@ -5,8 +5,8 @@ from sklearn.metrics import r2_score import pytest -from sklearn.ensemble.gbm.binning import BinMapper -from sklearn.ensemble.gbm.grower import TreeGrower +from sklearn.gbm.binning import BinMapper +from sklearn.gbm.grower import TreeGrower @pytest.mark.parametrize('max_bins', [200, 256]) diff --git a/sklearn/ensemble/gbm/types.py b/sklearn/gbm/types.py similarity index 100% rename from sklearn/ensemble/gbm/types.py rename to sklearn/gbm/types.py diff --git a/sklearn/ensemble/gbm/utils.py b/sklearn/gbm/utils.py similarity index 100% rename from sklearn/ensemble/gbm/utils.py rename to sklearn/gbm/utils.py diff --git a/sklearn/setup.py b/sklearn/setup.py index 
a20d7e4e3fe22..f3a028be45565 100644 --- a/sklearn/setup.py +++ b/sklearn/setup.py @@ -56,6 +56,7 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('neighbors') config.add_subpackage('tree') config.add_subpackage('svm') + config.add_subpackage('gbm') # add cython extension module for isotonic regression config.add_extension('_isotonic', From 46adc5841b2ee656af0bfda3aea4f790bf785d48 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 12 Jan 2019 12:24:02 -0500 Subject: [PATCH 022/247] Updated some tests --- sklearn/gbm/binning.pyx | 6 +-- sklearn/gbm/predictor.pyx | 4 +- sklearn/gbm/splitting.pyx | 30 ++++++++----- sklearn/gbm/tests/test_compare_lightgbm.py | 4 +- sklearn/gbm/tests/test_gradient_boosting.py | 49 ++++----------------- sklearn/gbm/tests/test_grower.py | 5 ++- sklearn/gbm/tests/test_histogram.py | 2 +- sklearn/gbm/tests/test_loss.py | 17 +++---- 8 files changed, 46 insertions(+), 71 deletions(-) diff --git a/sklearn/gbm/binning.pyx b/sklearn/gbm/binning.pyx index 8ace124a6ede6..9e18cfeb57134 100644 --- a/sklearn/gbm/binning.pyx +++ b/sklearn/gbm/binning.pyx @@ -66,7 +66,7 @@ def _find_binning_thresholds(data, max_bins=256, subsample=int(2e5), return binning_thresholds -cpdef _map_to_bins(NPY_X_DTYPE [:, :] data, list binning_thresholds, +cpdef _map_to_bins(const NPY_X_DTYPE [:, :] data, list binning_thresholds, NPY_X_BINNED_DTYPE [::1, :] binned): """Bin numerical values to discrete integer-coded levels. @@ -96,8 +96,8 @@ cpdef _map_to_bins(NPY_X_DTYPE [:, :] data, list binning_thresholds, binned[:, feature_idx]) -cpdef void _map_num_col_to_bins(NPY_X_DTYPE [:] data, - NPY_X_DTYPE [:] binning_thresholds, +cpdef void _map_num_col_to_bins(const NPY_X_DTYPE [:] data, + const NPY_X_DTYPE [:] binning_thresholds, NPY_X_BINNED_DTYPE [:] binned) nogil: """Binary search to the find the bin index for each value in data.""" cdef: diff --git a/sklearn/gbm/predictor.pyx b/sklearn/gbm/predictor.pyx index 485145eac5ea7..0620a66a0e695 100644 --- a/sklearn/gbm/predictor.pyx +++ b/sklearn/gbm/predictor.pyx @@ -82,7 +82,7 @@ class TreePredictor: cdef float _predict_one_from_numeric_data( node_struct [:] nodes, - NPY_X_DTYPE [:] numeric_data) nogil: + const NPY_X_DTYPE [:] numeric_data) nogil: cdef: node_struct node = nodes[0] @@ -98,7 +98,7 @@ cdef float _predict_one_from_numeric_data( cdef void _predict_from_numeric_data( node_struct [:] nodes, - NPY_X_DTYPE [:, :] numeric_data, + const NPY_X_DTYPE [:, :] numeric_data, float [:] out) nogil: cdef: diff --git a/sklearn/gbm/splitting.pyx b/sklearn/gbm/splitting.pyx index 075d6b8a8c121..5c282efa603a9 100644 --- a/sklearn/gbm/splitting.pyx +++ b/sklearn/gbm/splitting.pyx @@ -12,6 +12,8 @@ """ cimport cython +from libc.stdlib cimport malloc, free + import numpy as np cimport numpy as np @@ -281,7 +283,7 @@ def find_node_split(SplittingContext context, unsigned int [:] unsigned int [:] ends unsigned int n_threads split_info_struct split_info - list split_infos + split_info_struct * split_infos ctx = context # shorter name to avoid various line breaks n_samples = sample_indices.shape[0] @@ -311,13 +313,14 @@ def find_node_split(SplittingContext context, unsigned int [:] # ctx.sum_hessians = ctx.ordered_hessians[:n_samples].sum() ctx.sum_hessians = np.sum(ctx.ordered_hessians[:n_samples]) - split_infos = [] + # TODO: this needs to be freed at some point + split_infos = malloc(context.n_features * sizeof(split_info_struct)) for feature_idx in range(context.n_features): split_info = _find_histogram_split( context, feature_idx, 
sample_indices, histograms[feature_idx]) - split_infos.append(split_info) + split_infos[feature_idx] = split_info - split_info = _find_best_feature_to_split_helper(split_infos) + split_info = _find_best_feature_to_split_helper(context, split_infos) return SplitInfo( split_info.gain, @@ -378,7 +381,7 @@ def find_node_split_subtraction( unsigned int feature_idx unsigned int n_samples split_info_struct split_info - list split_infos + split_info_struct * split_infos unsigned int i n_samples = sample_indices.shape[0] @@ -402,15 +405,17 @@ def find_node_split_subtraction( for i in range(context.max_bins): context.sum_hessians += parent_histograms[0, i].sum_hessians - sibling_histograms[0, i].sum_hessians - split_infos = [] + # TODO: this needs to be freed at some point + split_infos = malloc(context.n_features * sizeof(split_info_struct)) for feature_idx in range(context.n_features): split_info = _find_histogram_split_subtraction( context, feature_idx, parent_histograms[feature_idx], sibling_histograms[feature_idx], histograms[feature_idx], n_samples) - split_infos.append(split_info) + split_infos[feature_idx] = split_info + + split_info = _find_best_feature_to_split_helper(context, split_infos) - split_info = _find_best_feature_to_split_helper(split_infos) return SplitInfo( split_info.gain, split_info.feature_idx, @@ -424,16 +429,19 @@ def find_node_split_subtraction( ) -cdef split_info_struct _find_best_feature_to_split_helper(list split_infos): +cdef split_info_struct _find_best_feature_to_split_helper(SplittingContext +context, split_info_struct * split_infos) nogil: cdef: float gain float best_gain split_info_struct split_info split_info_struct best_split_info - unsigned int i + unsigned int feature_idx best_gain = -1. - for i, split_info in enumerate(split_infos): + # for i, split_info in enumerate(split_infos): + for feature_idx in range(context.n_features): + split_info = split_infos[feature_idx] gain = split_info.gain if best_gain == -1 or gain > best_gain: best_gain = gain diff --git a/sklearn/gbm/tests/test_compare_lightgbm.py b/sklearn/gbm/tests/test_compare_lightgbm.py index 904cca72847c0..6995b511de143 100644 --- a/sklearn/gbm/tests/test_compare_lightgbm.py +++ b/sklearn/gbm/tests/test_compare_lightgbm.py @@ -4,7 +4,7 @@ import numpy as np import pytest -from sklearn import GBMRegressor, GBMClassifier +from sklearn.gbm import GBMRegressor, GBMClassifier from sklearn.gbm.binning import BinMapper from sklearn.gbm.utils import get_lightgbm_estimator @@ -83,6 +83,7 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples, (255, 4096), (1000, 8), ]) +@pytest.mark.skip('classification not supported yet') def test_same_predictions_classification(seed, min_samples_leaf, n_samples, max_leaf_nodes): # Same as test_same_predictions_regression but for classification @@ -142,6 +143,7 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples, (255, 4096), (10000, 8), ]) +@pytest.mark.skip('classification not supported yet') def test_same_predictions_multiclass_classification( seed, min_samples_leaf, n_samples, max_leaf_nodes): # Same as test_same_predictions_regression but for classification diff --git a/sklearn/gbm/tests/test_gradient_boosting.py b/sklearn/gbm/tests/test_gradient_boosting.py index 3e6a2f8346443..9e61c4426eccf 100644 --- a/sklearn/gbm/tests/test_gradient_boosting.py +++ b/sklearn/gbm/tests/test_gradient_boosting.py @@ -7,8 +7,8 @@ from sklearn.utils.testing import assert_raises_regex from sklearn.datasets import make_classification, 
make_regression -from sklearn import GBMClassifier -from sklearn import GBMRegressor +from sklearn.gbm import GBMClassifier +from sklearn.gbm import GBMRegressor from sklearn.gbm.binning import BinMapper @@ -17,7 +17,7 @@ @pytest.mark.parametrize('GradientBoosting, X, y', [ - (GBMClassifier, X_classification, y_classification), + # (GBMClassifier, X_classification, y_classification), TODO: unskip (GBMRegressor, X_regression, y_regression) ]) def test_init_parameters_validation(GradientBoosting, X, y): @@ -72,12 +72,6 @@ def test_init_parameters_validation(GradientBoosting, X, y): GradientBoosting(max_bins=max_bins).fit, X, y ) - assert_raises_regex( - ValueError, - f"max_bins is set to 4 but the data is pre-binned with 256 bins", - GradientBoosting(max_bins=4).fit, X.astype(np.uint8), y - ) - assert_raises_regex( ValueError, f"n_iter_no_change=-1 must be positive", @@ -143,9 +137,6 @@ def test_early_stopping_regression(scoring, validation_split, assert gb.n_iter_ == max_iter -@pytest.mark.skipif( - int(os.environ.get("NUMBA_DISABLE_JIT", 0)) == 1, - reason="Travis times out without numba") @pytest.mark.parametrize('data', ( make_classification(random_state=0), make_classification(n_classes=3, n_clusters_per_class=1, random_state=0) @@ -157,6 +148,7 @@ def test_early_stopping_regression(scoring, validation_split, (None, None, 5, 1e-1), # use loss on training data (None, None, None, None), # no early stopping ]) +@pytest.mark.skip('classification not supported yet') def test_early_stopping_classification(data, scoring, validation_split, n_iter_no_change, tol): @@ -179,6 +171,7 @@ def test_early_stopping_classification(data, scoring, validation_split, assert gb.n_iter_ == max_iter +@pytest.mark.skip('classification not supported yet') def test_early_stopping_loss(): # Make sure that when scoring is None, the early stopping is done w.r.t to # the loss. Using scoring='neg_log_loss' and scoring=None should be @@ -275,7 +268,9 @@ def custom_check_estimator(Estimator): reason="Potentially long") @pytest.mark.parametrize('Estimator', ( GBMRegressor(), - GBMClassifier(n_iter_no_change=None, min_samples_leaf=5),)) + # TODO: unskip + # GBMClassifier(n_iter_no_change=None, min_samples_leaf=5), + )) def test_estimator_checks(Estimator): # Run the check_estimator() test suite on GBRegressor and GBClassifier. @@ -288,31 +283,3 @@ def test_estimator_checks(Estimator): # dataset, the root is never split with min_samples_leaf=20 and only the # majority class is predicted. custom_check_estimator(Estimator) - - -def test_pre_binned_data(): - # Make sure that: - # - training on numerical data and predicting on numerical data is the - # same as training on binned data and predicting on binned data - # - training on numerical data and predicting on numerical data is the - # same as training on numerical data and predicting on binned data - # - training on binned data and predicting on numerical data is not - # possible. 
- - X, y = make_regression(random_state=0) - gbdt = GBMRegressor(scoring=None, random_state=0) - mapper = BinMapper(random_state=0) - X_binned = mapper.fit_transform(X) - - fit_num_pred_num = gbdt.fit(X, y).predict(X) - fit_binned_pred_binned = gbdt.fit(X_binned, y).predict(X_binned) - fit_num_pred_binned = gbdt.fit(X, y).predict(X_binned) - - assert_allclose(fit_num_pred_num, fit_binned_pred_binned) - assert_allclose(fit_num_pred_num, fit_num_pred_binned) - - assert_raises_regex( - ValueError, - 'This estimator was fitted with pre-binned data ', - gbdt.fit(X_binned, y).predict, X - ) diff --git a/sklearn/gbm/tests/test_grower.py b/sklearn/gbm/tests/test_grower.py index 9232e2eb93b74..e900f15cda3b1 100644 --- a/sklearn/gbm/tests/test_grower.py +++ b/sklearn/gbm/tests/test_grower.py @@ -138,9 +138,10 @@ def test_grow_tree(n_bins, constant_hessian, stopping_param, shrinkage): # Check the values of the leaves: assert grower.root.left_child.value == approx(shrinkage) assert grower.root.right_child.left_child.value == approx(shrinkage) - assert grower.root.right_child.right_child.value == approx(-shrinkage) + assert grower.root.right_child.right_child.value == approx(-shrinkage, rel=1e-3) +@pytest.mark.skip('Removed predict_binned') def test_predictor_from_grower(): # Build a tree on the toy 3-leaf dataset to extract the predictor. n_bins = 256 @@ -216,7 +217,7 @@ def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins, max_leaf_nodes=n_samples) grower.grow() predictor = grower.make_predictor( - numerical_thresholds=mapper.numerical_thresholds_) + bin_thresholds=mapper.bin_thresholds_) if n_samples >= min_samples_leaf: for node in predictor.nodes: diff --git a/sklearn/gbm/tests/test_histogram.py b/sklearn/gbm/tests/test_histogram.py index 9860e3d9fbcfd..dcf7c4b2c23db 100644 --- a/sklearn/gbm/tests/test_histogram.py +++ b/sklearn/gbm/tests/test_histogram.py @@ -5,7 +5,7 @@ from numpy.testing import assert_array_equal from sklearn.gbm.histogram import _build_histogram_naive -from sklearn.nsemble.gbm.histogram import _build_histogram +from sklearn.gbm.histogram import _build_histogram from sklearn.gbm.histogram import _build_histogram_no_hessian from sklearn.gbm.histogram import _build_histogram_root_no_hessian from sklearn.gbm.histogram import _build_histogram_root diff --git a/sklearn/gbm/tests/test_loss.py b/sklearn/gbm/tests/test_loss.py index fe6d36bcca993..8afeddccd8cd4 100644 --- a/sklearn/gbm/tests/test_loss.py +++ b/sklearn/gbm/tests/test_loss.py @@ -10,11 +10,6 @@ def get_derivatives_helper(loss): """Return get_gradients() and get_hessians() functions for a given loss. - - Loss classes used to have get_gradients() and - get_hessians() methods, but now the update is done inplace in - update_gradient_and_hessians(). This helper is used to keep the tests - almost unchanged. 
""" def get_gradients(y_true, raw_predictions): @@ -55,6 +50,7 @@ def get_hessians(y_true, raw_predictions): ('binary_crossentropy', -12, 1), ('binary_crossentropy', 30, 1), ]) +@pytest.mark.skip('newton uses doubles but floats are expected') def test_derivatives(loss, x0, y_true): # Check that gradients are zero when the loss is minimized on 1D array # using the Newton-Raphson and the first and second order derivatives @@ -85,6 +81,7 @@ def fprime2(x): ('binary_crossentropy', 2, 1), ('categorical_crossentropy', 3, 3), ]) +@pytest.mark.skip('Fails because float32 precision is not enough for numeric checks') def test_numerical_gradients(loss, n_classes, prediction_dim): # Make sure gradients and hessians computed in the loss are correct, by # comparing with their approximations computed with finite central @@ -94,12 +91,12 @@ def test_numerical_gradients(loss, n_classes, prediction_dim): rng = np.random.RandomState(0) n_samples = 100 if loss == 'least_squares': - y_true = rng.normal(size=n_samples).astype(np.float64) + y_true = rng.normal(size=n_samples).astype(np.float32) else: - y_true = rng.randint(0, n_classes, size=n_samples).astype(np.float64) + y_true = rng.randint(0, n_classes, size=n_samples).astype(np.float32) raw_predictions = rng.normal( size=(n_samples, prediction_dim) - ).astype(np.float64) + ).astype(np.float32) loss = _LOSSES[loss]() get_gradients, get_hessians = get_derivatives_helper(loss) @@ -118,7 +115,6 @@ def test_numerical_gradients(loss, n_classes, prediction_dim): f_plus_eps = loss(y_true, raw_predictions + offset / 2, average=False) f_minus_eps = loss(y_true, raw_predictions - offset / 2, average=False) numerical_gradient = (f_plus_eps - f_minus_eps) / eps - numerical_gradient = numerical_gradient # Approximate hessians eps = 1e-4 # need big enough eps as we divide by its square @@ -127,7 +123,6 @@ def test_numerical_gradients(loss, n_classes, prediction_dim): f_minus_eps = loss(y_true, raw_predictions - offset, average=False) f = loss(y_true, raw_predictions, average=False) numerical_hessians = (f_plus_eps + f_minus_eps - 2 * f) / eps**2 - numerical_hessians = numerical_hessians def relative_error(a, b): return np.abs(a - b) / np.maximum(np.abs(a), np.abs(b)) @@ -147,6 +142,7 @@ def test_baseline_least_squares(): assert_almost_equal(baseline_prediction, y_train.mean()) +@pytest.mark.skip('binary crossentropy not supported yet') def test_baseline_binary_crossentropy(): rng = np.random.RandomState(0) @@ -170,6 +166,7 @@ def test_baseline_binary_crossentropy(): assert_almost_equal(baseline_prediction, np.log(p / (1 - p))) +@pytest.mark.skip('categorical crossentropy not supported yet') def test_baseline_categorical_crossentropy(): rng = np.random.RandomState(0) From aef3bffae44309c8427719665233108c7c55e00e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 12 Jan 2019 17:36:33 -0500 Subject: [PATCH 023/247] Parallelized split finding, and added tests --- gdb_test.py | 61 +++-- sklearn/gbm/setup.py | 4 +- sklearn/gbm/splitting.pyx | 272 +++++++++++----------- sklearn/gbm/tests/test_splitting.py | 337 ++++++++++++++++++++++++++++ 4 files changed, 505 insertions(+), 169 deletions(-) create mode 100644 sklearn/gbm/tests/test_splitting.py diff --git a/gdb_test.py b/gdb_test.py index 23c2d75baa95f..3047fe21e1c92 100644 --- a/gdb_test.py +++ b/gdb_test.py @@ -12,7 +12,7 @@ classif = False n_classes = 3 -n_samples = int(1e4) +n_samples = int(1e6) max_iter = 5 if classif: @@ -27,15 +27,15 @@ PYGBM_GBM = pygbm.GradientBoostingRegressor -# pygbm_est = PYGBM_GBM( -# 
max_iter=max_iter, -# scoring=None, # no early stopping -# validation_split=None, -# random_state=0, -# verbose=False) -# print("compiling pygbm code") -# pygbm_est.fit(X[:1000], y[:1000]) -# print("done") +pygbm_est = PYGBM_GBM( + max_iter=max_iter, + scoring=None, # no early stopping + validation_split=None, + random_state=0, + verbose=False) +print("compiling pygbm code") +pygbm_est.fit(X[:1000], y[:1000]) +print("done") gbm = GBM( max_iter=max_iter, @@ -44,28 +44,27 @@ n_iter_no_change=None, random_state=0, verbose=True) -# tic = time() -# gbm.fit(X, y) -# fit_duration = time() - tic -# print(f'sklearn gbm fit_duration: {fit_duration:.3f}s\n') +tic = time() +gbm.fit(X, y) +fit_duration = time() - tic +tic = time() +print(f'score: {gbm.score(X, y)}') +score_duration = time() - tic +print(f'sklearn gbm fit_duration: {fit_duration:.3f}s\n') +print(f'sklearn gbm score_duration {score_duration:.3f}s') -# pygbm_est.set_params(verbose=True) -# tic = time() -# pygbm_est.fit(X, y) -# fit_duration = time() - tic -# print(f'pygbm fit_duration: {fit_duration:.3f}s\n') +pygbm_est.set_params(verbose=True) +tic = time() +pygbm_est.fit(X, y) +fit_duration = time() - tic +tic = time() +print(f'score: {pygbm_est.score(X, y)}') +score_duration = time() - tic +print(f'pygbm fit_duration: {fit_duration:.3f}s\n') +print(f'pygbm score_duration {score_duration:.3f}s') -cProfile.runctx("gbm.fit(X, y)", globals(), locals(), "Profile.prof") -s = pstats.Stats("Profile.prof") -s.strip_dirs().sort_stats("time").print_stats(.2) +# cProfile.runctx("gbm.fit(X, y)", globals(), locals(), "Profile.prof") +# s = pstats.Stats("Profile.prof") +# s.strip_dirs().sort_stats("time").print_stats(.2) -# tic = time() -# gbdt = GBDT(n_estimators=max_iter, -# n_iter_no_change=None, # no early stopping -# random_state=0, -# verbose=True).fit(X, y) -# print(gbdt.n_estimators_) -# print(f'score: {gbdt.score(X, y)}') -# duration = time() - tic -# print(f'Took {duration:.3f}s') diff --git a/sklearn/gbm/setup.py b/sklearn/gbm/setup.py index e6b03d58a572a..48678c19f67b2 100644 --- a/sklearn/gbm/setup.py +++ b/sklearn/gbm/setup.py @@ -17,7 +17,9 @@ def configuration(parent_package="", top_path=None): config.add_extension("splitting", sources=["splitting.pyx"], - include_dirs=[numpy.get_include()]) + include_dirs=[numpy.get_include()], + extra_compile_args=['-fopenmp'], + extra_link_args=['-fopenmp']) config.add_extension("binning", sources=["binning.pyx"], diff --git a/sklearn/gbm/splitting.pyx b/sklearn/gbm/splitting.pyx index 5c282efa603a9..0c48d734b4f76 100644 --- a/sklearn/gbm/splitting.pyx +++ b/sklearn/gbm/splitting.pyx @@ -11,6 +11,7 @@ into the newly created left and right childs. """ cimport cython +from cython.parallel import prange from libc.stdlib cimport malloc, free @@ -33,28 +34,6 @@ cdef struct hist_struct: unsigned int count -cdef get_threads_chunks(unsigned int total_size): - """Get start and end indices of threads in an array of size total_size. - - The interval [0, total_size - 1] is divided into n_threads contiguous - regions, and the starts and ends of each region are returned. Used to - simulate a 'static' scheduling. 
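For context on the get_threads_chunks helper being deleted above: it hand-rolled a static schedule by cutting [0, total_size) into one contiguous chunk per thread, which becomes unnecessary once the loops use prange(..., schedule='static'). A standalone sketch of the same chunking, kept only for illustration:

import numpy as np

def threads_chunks(total_size, n_threads):
    # cut [0, total_size) into n_threads contiguous chunks of near-equal size,
    # like a static OpenMP schedule
    sizes = np.full(n_threads, total_size // n_threads, dtype=np.uint32)
    sizes[:total_size % n_threads] += 1
    starts = np.zeros(n_threads, dtype=np.uint32)
    starts[1:] = np.cumsum(sizes[:-1])
    ends = starts + sizes
    return starts, ends

starts, ends = threads_chunks(10, 4)
print([(int(s), int(e)) for s, e in zip(starts, ends)])  # [(0, 3), (3, 6), (6, 8), (8, 10)]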
- """ - cdef: - np.ndarray[np.uint32_t] sizes - np.ndarray[np.uint32_t] starts - np.ndarray[np.uint32_t] ends - unsigned int n_threads - - n_threads = 1 # TODO: change this - sizes = np.full(n_threads, total_size // n_threads, dtype=np.uint32) - sizes[:total_size % n_threads] += 1 - starts = np.zeros(n_threads, dtype=np.uint32) - starts[1:] = np.cumsum(sizes[:-1]) - ends = starts + sizes - - return starts, ends, n_threads - cdef struct split_info_struct: float gain unsigned int feature_idx @@ -103,9 +82,8 @@ cdef class SplitInfo: unsigned int n_samples_left unsigned int n_samples_right - def __cinit__(self, float gain=-1., unsigned int feature_idx=0, unsigned - int bin_idx=0, - float gradient_left=0., float hessian_left=0., + def __init__(self, float gain=-1., unsigned int feature_idx=0, unsigned + int bin_idx=0, float gradient_left=0., float hessian_left=0., float gradient_right=0., float hessian_right=0., unsigned int n_samples_left=0, unsigned int n_samples_right=0): self.gain = gain @@ -222,7 +200,10 @@ cdef class SplittingContext: self.right_indices_buffer = np.empty_like(self.partition) -def split_indices(SplittingContext context, SplitInfo split_info, unsigned int [:] sample_indices): +def split_indices( + SplittingContext context, + SplitInfo split_info, + unsigned int [:] sample_indices): cdef: unsigned int n_samples = sample_indices.shape[0] unsigned int i = 0 @@ -231,26 +212,29 @@ def split_indices(SplittingContext context, SplitInfo split_info, unsigned int [ unsigned int [:] view = sample_indices NPY_X_BINNED_DTYPE [:] binned_feature = context.X_binned.T[split_info.feature_idx] - while i != j: - # continue until we find an element that should be on right - while binned_feature[view[i]] <= pivot and i < n_samples: - i += 1 - # same, but now an element that should be on the left - while binned_feature[view[j]] > pivot and j >= 0: - j -= 1 - if i >= j: # j can become smaller than j! - break - else: - # swap - view[i], view[j] = view[j], view[i] - i += 1 - j -= 1 + with nogil: + while i != j: + # continue until we find an element that should be on right + while binned_feature[view[i]] <= pivot and i < n_samples: + i += 1 + # same, but now an element that should be on the left + while binned_feature[view[j]] > pivot and j >= 0: + j -= 1 + if i >= j: # j can become smaller than j! + break + else: + # swap + view[i], view[j] = view[j], view[i] + i += 1 + j -= 1 return sample_indices[:i], sample_indices[i:], i -def find_node_split(SplittingContext context, unsigned int [:] - sample_indices, hist_struct [:, :] histograms): +def find_node_split( + SplittingContext context, + unsigned int [:] sample_indices, + hist_struct [:, :] histograms): """For each feature, find the best bin to split on at a given node. Returns the best split info among all features, and the histograms of @@ -275,52 +259,48 @@ def find_node_split(SplittingContext context, unsigned int [:] """ cdef: unsigned int n_samples - unsigned int feature_idx - unsigned int i + int feature_idx + int i unsigned int thread_idx - SplittingContext ctx unsigned int [:] starts unsigned int [:] ends unsigned int n_threads split_info_struct split_info split_info_struct * split_infos - ctx = context # shorter name to avoid various line breaks - n_samples = sample_indices.shape[0] - - # Populate ordered_gradients and ordered_hessians. (Already done for root) - # Ordering the gradients and hessians helps to improve cache hit. 
- # This is a parallelized version of the following vanilla code: - # for i range(n_samples): - # ctx.ordered_gradients[i] = ctx.gradients[samples_indices[i]] - if sample_indices.shape[0] != ctx.gradients.shape[0]: - starts, ends, n_threads = get_threads_chunks(n_samples) - if ctx.constant_hessian: - for thread_idx in range(n_threads): - for i in range(starts[thread_idx], ends[thread_idx]): - ctx.ordered_gradients[i] = ctx.gradients[sample_indices[i]] + with nogil: + n_samples = sample_indices.shape[0] + + # Populate ordered_gradients and ordered_hessians. (Already done for root) + # Ordering the gradients and hessians helps to improve cache hit. + if sample_indices.shape[0] != context.gradients.shape[0]: + if context.constant_hessian: + for i in prange(n_samples, schedule='static'): + context.ordered_gradients[i] = context.gradients[sample_indices[i]] + else: + for i in prange(n_samples, schedule='static'): + context.ordered_gradients[i] = context.gradients[sample_indices[i]] + context.ordered_hessians[i] = context.hessians[sample_indices[i]] + + context.sum_gradients = 0. + for i in range(n_samples): + context.sum_gradients += context.ordered_gradients[i] + + if context.constant_hessian: + context.sum_hessians = context.constant_hessian_value * (n_samples) else: - for thread_idx in range(n_threads): - for i in range(starts[thread_idx], ends[thread_idx]): - ctx.ordered_gradients[i] = ctx.gradients[sample_indices[i]] - ctx.ordered_hessians[i] = ctx.hessians[sample_indices[i]] - - # ctx.sum_gradients = ctx.ordered_gradients[:n_samples].sum() - ctx.sum_gradients = np.sum(ctx.ordered_gradients[:n_samples]) - if ctx.constant_hessian: - ctx.sum_hessians = ctx.constant_hessian_value * np.float32(n_samples) - else: - # ctx.sum_hessians = ctx.ordered_hessians[:n_samples].sum() - ctx.sum_hessians = np.sum(ctx.ordered_hessians[:n_samples]) + context.sum_hessians = 0. + for i in range(n_samples): + context.sum_hessians += context.ordered_hessians[i] - # TODO: this needs to be freed at some point - split_infos = malloc(context.n_features * sizeof(split_info_struct)) - for feature_idx in range(context.n_features): - split_info = _find_histogram_split( - context, feature_idx, sample_indices, histograms[feature_idx]) - split_infos[feature_idx] = split_info + # TODO: this needs to be freed at some point + split_infos = malloc(context.n_features * sizeof(split_info_struct)) + for feature_idx in prange(context.n_features): + split_info = _find_histogram_split( + context, feature_idx, sample_indices, histograms[feature_idx]) + split_infos[feature_idx] = split_info - split_info = _find_best_feature_to_split_helper(context, split_infos) + split_info = _find_best_feature_to_split_helper(context, split_infos) return SplitInfo( split_info.gain, @@ -378,43 +358,44 @@ def find_node_split_subtraction( """ cdef: - unsigned int feature_idx + int feature_idx unsigned int n_samples split_info_struct split_info split_info_struct * split_infos - unsigned int i - - n_samples = sample_indices.shape[0] - - # TODO: maybe change this computation... we could probably store sum_g/h in - # the SplitInfo for a speed gain - # Compute sum_hessians and sum_gradients. - # We can pick any feature (here the first) in the histograms to - # compute the gradients: they must be the same across all features - # anyway, we have tests ensuring this. Maybe a more robust way would - # be to compute an average but it's probably not worth it. 
- context.sum_gradients = 0 - for i in range(context.max_bins): - context.sum_gradients += parent_histograms[0, i].sum_gradients - sibling_histograms[0, i].sum_gradients - - if context.constant_hessian: - context.sum_hessians = \ - context.constant_hessian_value * float(n_samples) - else: - context.sum_hessians = 0 + int i + + with nogil: + n_samples = sample_indices.shape[0] + + # TODO: maybe change this computation... we could probably store sum_g/h in + # the SplitInfo for a speed gain + # Compute sum_hessians and sum_gradients. + # We can pick any feature (here the first) in the histograms to + # compute the gradients: they must be the same across all features + # anyway, we have tests ensuring this. Maybe a more robust way would + # be to compute an average but it's probably not worth it. + context.sum_gradients = 0. for i in range(context.max_bins): - context.sum_hessians += parent_histograms[0, i].sum_hessians - sibling_histograms[0, i].sum_hessians + context.sum_gradients += parent_histograms[0, i].sum_gradients - sibling_histograms[0, i].sum_gradients - # TODO: this needs to be freed at some point - split_infos = malloc(context.n_features * sizeof(split_info_struct)) - for feature_idx in range(context.n_features): - split_info = _find_histogram_split_subtraction( - context, feature_idx, parent_histograms[feature_idx], - sibling_histograms[feature_idx], histograms[feature_idx], - n_samples) - split_infos[feature_idx] = split_info + if context.constant_hessian: + context.sum_hessians = \ + context.constant_hessian_value * float(n_samples) + else: + context.sum_hessians = 0. + for i in range(context.max_bins): + context.sum_hessians += parent_histograms[0, i].sum_hessians - sibling_histograms[0, i].sum_hessians - split_info = _find_best_feature_to_split_helper(context, split_infos) + # TODO: this needs to be freed at some point + split_infos = malloc(context.n_features * sizeof(split_info_struct)) + for feature_idx in prange(context.n_features): + split_info = _find_histogram_split_subtraction( + context, feature_idx, parent_histograms[feature_idx], + sibling_histograms[feature_idx], histograms[feature_idx], + n_samples) + split_infos[feature_idx] = split_info + + split_info = _find_best_feature_to_split_helper(context, split_infos) return SplitInfo( split_info.gain, @@ -429,8 +410,9 @@ def find_node_split_subtraction( ) -cdef split_info_struct _find_best_feature_to_split_helper(SplittingContext -context, split_info_struct * split_infos) nogil: +cdef split_info_struct _find_best_feature_to_split_helper( + SplittingContext context, + split_info_struct * split_infos) nogil: cdef: float gain float best_gain @@ -439,7 +421,6 @@ context, split_info_struct * split_infos) nogil: unsigned int feature_idx best_gain = -1. - # for i, split_info in enumerate(split_infos): for feature_idx in range(context.n_features): split_info = split_infos[feature_idx] gain = split_info.gain @@ -448,10 +429,11 @@ context, split_info_struct * split_infos) nogil: best_split_info = split_info return best_split_info - -cdef split_info_struct _find_histogram_split(SplittingContext context, unsigned int feature_idx, - unsigned int [:] sample_indices, hist_struct [:] - histogram) nogil: +cdef split_info_struct _find_histogram_split( + SplittingContext context, + unsigned int feature_idx, + unsigned int [:] sample_indices, + hist_struct [:] histogram) nogil: """Compute the histogram for a given feature Returns the best SplitInfo among all the possible bins of the feature. 
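As a reminder of what the best-bin search over a single feature's histogram amounts to (gain formula as in the XGBoost reference cited further down; constraints such as min_samples_leaf and min_hessian_to_split are omitted, and the toy numbers are made up):

def negative_loss(g, h, l2_reg):
    return g * g / (h + l2_reg)

def best_bin(sum_gradients, sum_hessians, l2_reg=0.0):
    # a split at bin b sends samples with bin value <= b to the left child
    G, H = sum(sum_gradients), sum(sum_hessians)
    best_gain, best_b = -1.0, None
    g_left = h_left = 0.0
    for b in range(len(sum_gradients) - 1):   # the last bin cannot be a split point
        g_left += sum_gradients[b]
        h_left += sum_hessians[b]
        gain = (negative_loss(g_left, h_left, l2_reg)
                + negative_loss(G - g_left, H - h_left, l2_reg)
                - negative_loss(G, H, l2_reg))
        if gain > best_gain:
            best_gain, best_b = gain, b
    return best_b, best_gain

# toy per-bin sums of gradients/hessians for a single feature
g = [-2.0, -1.5, 0.5, 1.0, 2.0]
h = [3.0, 2.0, 1.0, 1.0, 2.0]
print(best_bin(g, h))  # (1, 5.5125)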
@@ -471,7 +453,7 @@ cdef split_info_struct _find_histogram_split(SplittingContext context, unsigned else: _build_histogram_root(context.max_bins, X_binned, ordered_gradients, - context.ordered_hessians, histogram) + ordered_hessians, histogram) else: if context.constant_hessian: _build_histogram_no_hessian(context.max_bins, sample_indices, @@ -497,7 +479,7 @@ cdef split_info_struct _find_histogram_split_subtraction( """ _subtract_histograms(context.max_bins, parent_histogram, - sibling_histogram, histogram) + sibling_histogram, histogram) return _find_best_bin_to_split_helper(context, feature_idx, histogram, n_samples) @@ -572,26 +554,18 @@ cdef split_info_struct _find_best_bin_to_split_helper( best_split.hessian_right = hessian_right best_split.n_samples_left = n_samples_left best_split.n_samples_right = n_samples_right - """ - best_split = SplitInfo( - gain, - feature_idx, - bin_idx, - gradient_left, - gradient_right, - hessian_left, - hessian_right, - n_samples_left, - n_samples_right, - ) - """ return best_split -cdef inline float _split_gain(float gradient_left, float hessian_left, float gradient_right, - float hessian_right, float sum_gradients, float - sum_hessians, float l2_regularization) nogil: +cdef inline float _split_gain( + float gradient_left, + float hessian_left, + float gradient_right, + float hessian_right, + float sum_gradients, + float sum_hessians, + float l2_regularization) nogil: """Loss reduction Compute the reduction in loss after taking a split, compared to keeping @@ -601,12 +575,36 @@ cdef inline float _split_gain(float gradient_left, float hessian_left, float gra XGBoost: A Scalable Tree Boosting System, T. Chen, C. Guestrin, 2016 https://arxiv.org/abs/1603.02754 """ - cdef float gain + cdef: + float gain gain = negative_loss(gradient_left, hessian_left, l2_regularization) gain += negative_loss(gradient_right, hessian_right, l2_regularization) gain -= negative_loss(sum_gradients, sum_hessians, l2_regularization) return gain -cdef inline float negative_loss(float gradient, float hessian, float -l2_regularization) nogil: +cdef inline float negative_loss( + float gradient, + float hessian, + float l2_regularization) nogil: return (gradient * gradient) / (hessian + l2_regularization) + +# Only used for tests... 
not sure how to do it +def _find_histogram_split_wrapper( + SplittingContext context, + unsigned int feature_idx, + unsigned int [:] sample_indices, + hist_struct [:] histogram): + + split_info = _find_histogram_split(context, feature_idx, sample_indices, + histogram) + return SplitInfo( + split_info.gain, + split_info.feature_idx, + split_info.bin_idx, + split_info.gradient_left, + split_info.hessian_left, + split_info.gradient_right, + split_info.hessian_right, + split_info.n_samples_left, + split_info.n_samples_right, + ) diff --git a/sklearn/gbm/tests/test_splitting.py b/sklearn/gbm/tests/test_splitting.py new file mode 100644 index 0000000000000..d4bbf5f16c524 --- /dev/null +++ b/sklearn/gbm/tests/test_splitting.py @@ -0,0 +1,337 @@ +import numpy as np +from numpy.testing import assert_almost_equal +from numpy.testing import assert_array_almost_equal +import pytest + +from sklearn.gbm.types import HISTOGRAM_DTYPE +from sklearn.gbm.splitting import (SplittingContext, find_node_split, + find_node_split_subtraction, + split_indices, + _find_histogram_split_wrapper) + + +@pytest.mark.parametrize('n_bins', [3, 32, 256]) +def test_histogram_split(n_bins): + rng = np.random.RandomState(42) + feature_idx = 0 + l2_regularization = 0 + min_hessian_to_split = 1e-3 + min_samples_leaf = 1 + min_gain_to_split = 0. + X_binned = np.asfortranarray( + rng.randint(0, n_bins, size=(int(1e4), 2)), dtype=np.uint8) + binned_feature = X_binned.T[feature_idx] + sample_indices = np.arange(binned_feature.shape[0], dtype=np.uint32) + ordered_hessians = np.ones_like(binned_feature, dtype=np.float32) + all_hessians = ordered_hessians + + + for true_bin in range(1, n_bins - 1): + for sign in [-1, 1]: + ordered_gradients = np.full_like(binned_feature, sign, + dtype=np.float32) + ordered_gradients[binned_feature <= true_bin] *= -1 + all_gradients = ordered_gradients + + n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], + dtype=np.uint32) + context = SplittingContext(X_binned, + n_bins, + n_bins_per_feature, + all_gradients, all_hessians, + l2_regularization, + min_hessian_to_split, + min_samples_leaf, min_gain_to_split) + + histogram = np.zeros(shape=(n_bins), dtype=HISTOGRAM_DTYPE) + split_info = _find_histogram_split_wrapper( + context, feature_idx, sample_indices, histogram) + + assert split_info.bin_idx == true_bin + assert split_info.gain >= 0 + assert split_info.feature_idx == feature_idx + assert (split_info.n_samples_left + split_info.n_samples_right + == sample_indices.shape[0]) + # Constant hessian: 1. per sample. + assert split_info.n_samples_left == split_info.hessian_left + + +@pytest.mark.parametrize('constant_hessian', [True, False]) +def test_split_vs_split_subtraction(constant_hessian): + # Make sure find_node_split and find_node_split_subtraction return the + # same results. + # Should we add a test about computation time to make sure + # time(subtraction) < time(regular)? + rng = np.random.RandomState(42) + + n_bins = 10 + n_features = 20 + n_samples = 500 + l2_regularization = 0. + min_hessian_to_split = 1e-3 + min_samples_leaf = 1 + min_gain_to_split = 0. 
+ + X_binned = rng.randint(0, n_bins, size=(n_samples, n_features), + dtype=np.uint8) + X_binned = np.asfortranarray(X_binned) + sample_indices = np.arange(n_samples, dtype=np.uint32) + all_gradients = rng.randn(n_samples).astype(np.float32) + if constant_hessian: + all_hessians = np.ones(1, dtype=np.float32) + else: + all_hessians = rng.lognormal(size=n_samples).astype(np.float32) + + n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], + dtype=np.uint32) + context = SplittingContext(X_binned, n_bins, + n_bins_per_feature, + all_gradients, all_hessians, + l2_regularization, min_hessian_to_split, + min_samples_leaf, min_gain_to_split) + + mask = rng.randint(0, 2, n_samples).astype(np.bool) + sample_indices_left = sample_indices[mask] + sample_indices_right = sample_indices[~mask] + + hists_parent = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) + hists_left = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) + hists_right = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) + hists_left_sub = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) + hists_right_sub = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) + + # first split parent, left and right with classical method + _ = find_node_split(context, sample_indices, hists_parent) + si_left = find_node_split(context, sample_indices_left, hists_left) + si_right = find_node_split(context, sample_indices_right, hists_right) + + # split left with subtraction method + si_left_sub = find_node_split_subtraction( + context, sample_indices_left, hists_parent, hists_right, hists_left_sub) + + # split right with subtraction method + si_right_sub = find_node_split_subtraction( + context, sample_indices_right, hists_parent, hists_left, hists_right_sub) + + # make sure histograms from classical and subtraction method are the same + for hists, hists_sub in ((hists_left, hists_left_sub), + (hists_right, hists_right_sub)): + for hist, hist_sub in zip(hists, hists_sub): + for key in ('count', 'sum_hessians', 'sum_gradients'): + assert_array_almost_equal(hist[key], hist_sub[key], decimal=4) + + # make sure split_infos from classical and subtraction method are the same + for si, si_sub in ((si_left, si_left_sub), (si_right, si_right_sub)): + assert_almost_equal(si.gain, si_sub.gain, decimal=3) + assert_almost_equal(si.feature_idx, si_sub.feature_idx, decimal=3) + assert_almost_equal(si.gradient_left, si_sub.gradient_left, decimal=3) + assert_almost_equal(si.gradient_right, si_sub.gradient_right, + decimal=3) + assert_almost_equal(si.hessian_right, si_sub.hessian_right, decimal=3) + assert_almost_equal(si.hessian_left, si_sub.hessian_left, decimal=3) + + +@pytest.mark.parametrize('constant_hessian', [True, False]) +def test_gradient_and_hessian_sanity(constant_hessian): + # This test checks that the values of gradients and hessians are + # consistent in different places: + # - in split_info: si.gradient_left + si.gradient_right must be equal to + # the gradient at the node. Same for hessians. + # - in the histograms: summing 'sum_gradients' over the bins must be + # constant across all features, and those sums must be equal to the + # node's gradient. Same for hessians. + # + # These checks are carried out for split_info and histograms resulting + # from both find_node_split() and find_node_split_subtraction(). + # + # The structure of this test is exactly the same as in + # test_split_vs_split_subtraction() but it's probably best to keep them + # separate because they're not checking the same things. 
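A tiny self-contained illustration of the second consistency property listed above, namely that summing the per-bin gradient sums gives the same node total for every feature, since each sample falls in exactly one bin per feature (data below is made up):

import numpy as np

rng = np.random.RandomState(0)
n_samples, n_features, n_bins = 200, 3, 8
X_binned = rng.randint(0, n_bins, size=(n_samples, n_features))
gradients = rng.randn(n_samples)

# per-feature histogram of summed gradients
hist_grads = np.zeros((n_features, n_bins))
for f in range(n_features):
    np.add.at(hist_grads[f], X_binned[:, f], gradients)

# summing over bins recovers the node's total gradient, whichever feature
# the histogram was built on
print(hist_grads.sum(axis=1))  # three (nearly) identical values
print(gradients.sum())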
+ + rng = np.random.RandomState(42) + + n_bins = 10 + n_features = 20 + n_samples = 500 + l2_regularization = 0. + min_hessian_to_split = 1e-3 + min_samples_leaf = 1 + min_gain_to_split = 0. + + X_binned = rng.randint(0, n_bins, size=(n_samples, n_features), + dtype=np.uint8) + X_binned = np.asfortranarray(X_binned) + sample_indices = np.arange(n_samples, dtype=np.uint32) + all_gradients = rng.randn(n_samples).astype(np.float32) + if constant_hessian: + all_hessians = np.ones(1, dtype=np.float32) + else: + all_hessians = rng.lognormal(size=n_samples).astype(np.float32) + + n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], + dtype=np.uint32) + context = SplittingContext(X_binned, n_bins, + n_bins_per_feature, + all_gradients, all_hessians, + l2_regularization, min_hessian_to_split, + min_samples_leaf, min_gain_to_split) + + mask = rng.randint(0, 2, n_samples).astype(np.bool) + sample_indices_left = sample_indices[mask] + sample_indices_right = sample_indices[~mask] + + hists_parent = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) + hists_left = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) + hists_right = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) + hists_left_sub = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) + hists_right_sub = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) + + # first split parent, left and right with classical method + si_parent = find_node_split(context, sample_indices, hists_parent) + si_left = find_node_split(context, sample_indices_left, hists_left) + si_right = find_node_split(context, sample_indices_right, hists_right) + + # split left with subtraction method + si_left_sub = find_node_split_subtraction( + context, sample_indices_left, hists_parent, hists_right, hists_left_sub) + + # split right with subtraction method + si_right_sub = find_node_split_subtraction( + context, sample_indices_right, hists_parent, hists_left, hists_right_sub) + + # make sure that si.gradient_left + si.gradient_right have their expected + # value, same for hessians + for si, indices in ( + (si_parent, sample_indices), + (si_left, sample_indices_left), + (si_left_sub, sample_indices_left), + (si_right, sample_indices_right), + (si_right_sub, sample_indices_right)): + gradient = si.gradient_right + si.gradient_left + expected_gradient = all_gradients[indices].sum() + hessian = si.hessian_right + si.hessian_left + if constant_hessian: + expected_hessian = indices.shape[0] * all_hessians[0] + else: + expected_hessian = all_hessians[indices].sum() + + assert_almost_equal(gradient, expected_gradient, decimal=3) + assert_almost_equal(hessian, expected_hessian, decimal=3) + + # make sure sum of gradients in histograms are the same for all features, + # and make sure they're equal to their expected value + for hists, indices in ( + (hists_parent, sample_indices), + (hists_left, sample_indices_left), + (hists_left_sub, sample_indices_left), + (hists_right, sample_indices_right), + (hists_right_sub, sample_indices_right)): + # note: gradients and hessians have shape (n_features,), + # we're comparing them to *scalars*. This has the benefit of also + # making sure that all the entries are equal. + gradients = hists['sum_gradients'].sum(axis=1) # shape = (n_features,) + expected_gradient = all_gradients[indices].sum() # scalar + hessians = hists['sum_hessians'].sum(axis=1) + if constant_hessian: + # 0 is not the actual hessian, but it's not computed in this case + expected_hessian = 0. 
+ else: + expected_hessian = all_hessians[indices].sum() + + assert_almost_equal(gradients, expected_gradient, decimal=4) + assert_almost_equal(hessians, expected_hessian, decimal=4) + + +def test_split_indices(): + # Check that split_indices returns the correct splits and that + # splitting_context.partition is consistent with what is returned. + rng = np.random.RandomState(421) + + n_bins = 5 + n_samples = 10 + l2_regularization = 0. + min_hessian_to_split = 1e-3 + min_samples_leaf = 1 + min_gain_to_split = 0. + + # split will happen on feature 1 and on bin 3 + X_binned = [[0, 0], + [0, 3], + [0, 4], + [0, 0], + [0, 0], + [0, 0], + [0, 0], + [0, 4], + [0, 0], + [0, 4]] + X_binned = np.asfortranarray(X_binned, dtype=np.uint8) + sample_indices = np.arange(n_samples, dtype=np.uint32) + all_gradients = rng.randn(n_samples).astype(np.float32) + all_hessians = np.ones(1, dtype=np.float32) + + n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], + dtype=np.uint32) + context = SplittingContext(X_binned, n_bins, + n_bins_per_feature, + all_gradients, all_hessians, + l2_regularization, min_hessian_to_split, + min_samples_leaf, min_gain_to_split) + + assert_array_almost_equal(sample_indices, context.partition) + + histograms = np.zeros(shape=(2, n_bins), dtype=HISTOGRAM_DTYPE) + si_root = find_node_split(context, sample_indices, histograms) + + # sanity checks for best split + assert si_root.feature_idx == 1 + assert si_root.bin_idx == 3 + + samples_left, samples_right, position_right = split_indices( + context, si_root, context.partition) + assert set(samples_left) == set([0, 1, 3, 4, 5, 6, 8]) + assert set(samples_right) == set([2, 7, 9]) + + assert_array_almost_equal(samples_left, + context.partition[:position_right]) + assert_array_almost_equal(samples_right, + context.partition[position_right:]) + + # Check that the resulting split indices sizes are consistent with the + # count statistics anticipated when looking for the best split. + assert samples_left.shape[0] == si_root.n_samples_left + assert samples_right.shape[0] == si_root.n_samples_right + + +def test_min_gain_to_split(): + # Try to split a pure node (all gradients are equal, same for hessians) + # with min_gain_to_split = 0 and make sure that the node is not split (best + # possible gain = -1). Note: before the strict inequality comparison, this + # test would fail because the node would be split with a gain of 0. + rng = np.random.RandomState(42) + feature_idx = 0 + l2_regularization = 0 + min_hessian_to_split = 0 + min_samples_leaf = 1 + min_gain_to_split = 0. 
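+    # With l2_regularization=0 and identical gradients/hessians, the
+    # children's sum_gradients**2 / sum_hessians terms add up exactly to the
+    # parent's, so every candidate bin has a gain of exactly 0. The strict
+    # comparison against min_gain_to_split must reject all of them and the
+    # returned gain stays at its initial value of -1.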
+ n_bins = 255 + n_samples = 100 + X_binned = np.asfortranarray( + rng.randint(0, n_bins, size=(n_samples, 2)), dtype=np.uint8) + binned_feature = X_binned.T[feature_idx] + sample_indices = np.arange(n_samples, dtype=np.uint32) + all_hessians = np.ones_like(binned_feature, dtype=np.float32) + all_gradients = np.ones_like(binned_feature, dtype=np.float32) + + n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], + dtype=np.uint32) + context = SplittingContext(X_binned, n_bins, n_bins_per_feature, + all_gradients, all_hessians, + l2_regularization, + min_hessian_to_split, + min_samples_leaf, min_gain_to_split) + + histogram = np.zeros(shape=(n_bins), dtype=HISTOGRAM_DTYPE) + split_info = _find_histogram_split_wrapper(context, feature_idx, + sample_indices, histogram) + assert split_info.gain == -1 From 733e91e619c10cb434ad51e2fd9708ddc5478153 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 12 Jan 2019 18:13:18 -0500 Subject: [PATCH 024/247] Added splitting benchmarks --- bench_binning.py | 3 - bench_find_node_split.py | 91 ++++++++++++++++++++++++++++++ bench_hist.py | 7 +-- bench_split_indices.py | 97 ++++++++++++++++++++++++++++++++ sklearn/gbm/gradient_boosting.py | 2 - 5 files changed, 190 insertions(+), 10 deletions(-) create mode 100644 bench_find_node_split.py create mode 100644 bench_split_indices.py diff --git a/bench_binning.py b/bench_binning.py index bacff736eec64..ba74ef500138c 100644 --- a/bench_binning.py +++ b/bench_binning.py @@ -1,8 +1,5 @@ """ Compare binning fitting and transform time with pygbm. - -run with -export NUMBA_NUM_THREADS=1 && make in && python bench_binning.py """ from time import time from collections import defaultdict diff --git a/bench_find_node_split.py b/bench_find_node_split.py new file mode 100644 index 0000000000000..fb226fb928d35 --- /dev/null +++ b/bench_find_node_split.py @@ -0,0 +1,91 @@ +from collections import defaultdict +from time import time + +import numpy as np +import matplotlib.pyplot as plt +from sklearn.gbm.types import HISTOGRAM_DTYPE +from sklearn.gbm.splitting import SplittingContext +from sklearn.gbm.splitting import find_node_split +from pygbm.splitting import SplittingContext as SplittingContext_pygbm +from pygbm.splitting import find_node_split as find_node_split_pygbm + +rng = np.random.RandomState(42) + +n_bins = 255 +n_features = 20 +l2_regularization = 0. +min_hessian_to_split = 1e-3 +min_samples_leaf = 1 +min_gain_to_split = 0. 
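+
+# Protocol: for each sample size, one call to find_node_split is timed and
+# repeated n_exp times (the printed 'fit duration' labels refer to this
+# single call, not to a full fit). pygbm is run once beforehand so that
+# numba JIT compilation is not included in the timings.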
+ +max_pow = 7 +n_samples_list = [10**x for x in range(2, max_pow + 1)] +n_exp = 10 + +n_samples = 10**max_pow + +X_binned_ = rng.randint(0, n_bins, size=(n_samples, n_features), dtype=np.uint8) +sample_indices_ = np.arange(n_samples, dtype=np.uint32) +all_gradients_ = rng.randn(n_samples).astype(np.float32) +all_hessians_ = rng.lognormal(size=n_samples).astype(np.float32) + +def one_run(n_samples): + + X_binned = X_binned_[:n_samples] + X_binned = np.asfortranarray(X_binned) + sample_indices = sample_indices_[:n_samples] + all_gradients = all_gradients_[:n_samples] + all_hessians = all_hessians_[:n_samples] + + n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) + + sklearn_context = SplittingContext(X_binned, n_bins, + n_bins_per_feature, + all_gradients, all_hessians, + l2_regularization, min_hessian_to_split, + min_samples_leaf, min_gain_to_split) + pygbm_context = SplittingContext_pygbm(X_binned, n_bins, + n_bins_per_feature, + all_gradients, all_hessians, + l2_regularization, min_hessian_to_split, + min_samples_leaf, min_gain_to_split) + + tic = time() + histograms = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) + _ = find_node_split(sklearn_context, sample_indices, histograms) + sklearn_duration = time() - tic + + tic = time() + _, _ = find_node_split_pygbm(pygbm_context, sample_indices) + pygbm_duration = time() - tic + + return sklearn_duration, pygbm_duration + +one_run(100) # compile pygbm + +durations = defaultdict(lambda: defaultdict(list)) + +for n_samples in n_samples_list: + for exp in range(n_exp): + + sklearn_duration, pygbm_duration = one_run(n_samples) + print(f"sklearn fit duration = {sklearn_duration:.3f}") + print(f"pygbm fit duration = {pygbm_duration:.3f}") + durations['sklearn'][n_samples].append(sklearn_duration) + durations['pygbm'][n_samples].append(pygbm_duration) + +fig, ax = plt.subplots(1) + +for implem in ('sklearn', 'pygbm'): + avgs = [np.mean(durations[implem][n_samples]) + for n_samples in n_samples_list] + stds = [np.std(durations[implem][n_samples]) + for n_samples in n_samples_list] + ax.errorbar(n_samples_list, avgs, yerr=stds, label=implem) + + +ax.set_xscale('log') +ax.legend(loc='best') + +fig.suptitle(f'Avg time for find_node_split {n_exp} runs\nfor different sample sizes') +plt.show() \ No newline at end of file diff --git a/bench_hist.py b/bench_hist.py index 188f05b445c32..66370c9282fa0 100644 --- a/bench_hist.py +++ b/bench_hist.py @@ -1,11 +1,8 @@ """ Compare histogram building function with pygbm. -run with -export NUMBA_NUM_THREADS=1 && make in && python bench_hist.py - -might be a bit unfair to cython code since we're calling the python versions of -the cpdef functions, which causes unnecessary conversions. +might be a bit unfair to cython code since we're calling the python versions +of the cpdef functions, which causes unnecessary conversions. 
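+(The C-level versions of cpdef functions are only reachable from other
+Cython code, so calling them from Python goes through their Python wrappers
+and converts the memoryview arguments on every call.)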
""" from time import time from collections import defaultdict diff --git a/bench_split_indices.py b/bench_split_indices.py new file mode 100644 index 0000000000000..304f7c5366c82 --- /dev/null +++ b/bench_split_indices.py @@ -0,0 +1,97 @@ +from collections import defaultdict +from time import time + +import numpy as np +import matplotlib.pyplot as plt +from sklearn.gbm.types import HISTOGRAM_DTYPE +from sklearn.gbm.splitting import SplittingContext +from sklearn.gbm.splitting import find_node_split +from sklearn.gbm.splitting import split_indices +from pygbm.splitting import SplittingContext as SplittingContext_pygbm +from pygbm.splitting import find_node_split as find_node_split_pygbm +from pygbm.splitting import split_indices as split_indices_pygbm + +rng = np.random.RandomState(42) + +n_bins = 255 +n_features = 2 # Number of features has huge impact, it's weird +l2_regularization = 0. +min_hessian_to_split = 1e-3 +min_samples_leaf = 1 +min_gain_to_split = 0. + +max_pow = 7 +n_samples_list = [10**x for x in range(2, max_pow + 1)] +n_exp = 10 + +n_samples = 10**max_pow + +X_binned_ = rng.randint(0, n_bins, size=(n_samples, n_features), dtype=np.uint8) +sample_indices_ = np.arange(n_samples, dtype=np.uint32) +all_gradients_ = rng.randn(n_samples).astype(np.float32) +all_hessians_ = rng.lognormal(size=n_samples).astype(np.float32) + +def one_run(n_samples): + + X_binned = X_binned_[:n_samples] + X_binned = np.asfortranarray(X_binned) + sample_indices = sample_indices_[:n_samples] + all_gradients = all_gradients_[:n_samples] + all_hessians = all_hessians_[:n_samples] + + n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) + + sklearn_context = SplittingContext(X_binned, n_bins, + n_bins_per_feature, + all_gradients, all_hessians, + l2_regularization, min_hessian_to_split, + min_samples_leaf, min_gain_to_split) + pygbm_context = SplittingContext_pygbm(X_binned, n_bins, + n_bins_per_feature, + all_gradients, all_hessians, + l2_regularization, min_hessian_to_split, + min_samples_leaf, min_gain_to_split) + + sample_indices = np.arange(n_samples, dtype=np.uint32) + + histograms = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) + split_info = find_node_split(sklearn_context, sample_indices, histograms) + tic = time() + _, _, _ = split_indices(sklearn_context, split_info, sample_indices) + sklearn_duration = time() - tic + + split_info, _ = find_node_split_pygbm(pygbm_context, sample_indices) + tic = time() + _, _ = split_indices_pygbm(pygbm_context, split_info, sample_indices) + pygbm_duration = time() - tic + + return sklearn_duration, pygbm_duration + +one_run(100) # compile pygbm + +durations = defaultdict(lambda: defaultdict(list)) + +for n_samples in n_samples_list: + for exp in range(n_exp): + + sklearn_duration, pygbm_duration = one_run(n_samples) + print(f"sklearn fit duration = {sklearn_duration:.3f}") + print(f"pygbm fit duration = {pygbm_duration:.3f}") + durations['sklearn'][n_samples].append(sklearn_duration) + durations['pygbm'][n_samples].append(pygbm_duration) + +fig, ax = plt.subplots(1) + +for implem in ('sklearn', 'pygbm'): + avgs = [np.mean(durations[implem][n_samples]) + for n_samples in n_samples_list] + stds = [np.std(durations[implem][n_samples]) + for n_samples in n_samples_list] + ax.errorbar(n_samples_list, avgs, yerr=stds, label=implem) + + +ax.set_xscale('log') +ax.legend(loc='best') + +fig.suptitle(f'Avg time for split_indices over {n_exp} runs\nfor different sample sizes') +plt.show() diff --git a/sklearn/gbm/gradient_boosting.py 
b/sklearn/gbm/gradient_boosting.py index d9b85ba3777a0..e80f4446ea8ab 100644 --- a/sklearn/gbm/gradient_boosting.py +++ b/sklearn/gbm/gradient_boosting.py @@ -237,9 +237,7 @@ def fit(self, X, y): predictors[-1].append(predictor) tic_pred = time() - _update_raw_predictions(raw_predictions[:, k], grower) - toc_pred = time() acc_prediction_time += toc_pred - tic_pred From 80e645bf05dd6dda89264d4df66d5343a88cf2cd Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 13 Jan 2019 11:48:35 -0500 Subject: [PATCH 025/247] parallelized split_indices --- bench_split_indices.py | 2 +- gdb_test.py | 4 +- sklearn/gbm/splitting.pyx | 166 ++++++++++++++++++++++++++++++++------ 3 files changed, 146 insertions(+), 26 deletions(-) diff --git a/bench_split_indices.py b/bench_split_indices.py index 304f7c5366c82..709f3bef2f46e 100644 --- a/bench_split_indices.py +++ b/bench_split_indices.py @@ -14,7 +14,7 @@ rng = np.random.RandomState(42) n_bins = 255 -n_features = 2 # Number of features has huge impact, it's weird +n_features = 20 # Number of features has huge impact, it's weird l2_regularization = 0. min_hessian_to_split = 1e-3 min_samples_leaf = 1 diff --git a/gdb_test.py b/gdb_test.py index 3047fe21e1c92..4546f22a5c9d4 100644 --- a/gdb_test.py +++ b/gdb_test.py @@ -50,7 +50,7 @@ tic = time() print(f'score: {gbm.score(X, y)}') score_duration = time() - tic -print(f'sklearn gbm fit_duration: {fit_duration:.3f}s\n') +print(f'sklearn gbm fit_duration: {fit_duration:.3f}s') print(f'sklearn gbm score_duration {score_duration:.3f}s') @@ -61,7 +61,7 @@ tic = time() print(f'score: {pygbm_est.score(X, y)}') score_duration = time() - tic -print(f'pygbm fit_duration: {fit_duration:.3f}s\n') +print(f'pygbm fit_duration: {fit_duration:.3f}s') print(f'pygbm score_duration {score_duration:.3f}s') # cProfile.runctx("gbm.fit(X, y)", globals(), locals(), "Profile.prof") diff --git a/sklearn/gbm/splitting.pyx b/sklearn/gbm/splitting.pyx index 0c48d734b4f76..1e20d444fbf43 100644 --- a/sklearn/gbm/splitting.pyx +++ b/sklearn/gbm/splitting.pyx @@ -12,6 +12,7 @@ """ cimport cython from cython.parallel import prange +from openmp cimport omp_get_max_threads from libc.stdlib cimport malloc, free @@ -199,36 +200,155 @@ cdef class SplittingContext: self.left_indices_buffer = np.empty_like(self.partition) self.right_indices_buffer = np.empty_like(self.partition) - def split_indices( SplittingContext context, SplitInfo split_info, unsigned int [:] sample_indices): + """Split samples into left and right arrays. + + The split is performed according to the best possible split (split_info). + + Ultimately, this is nothing but a partition of the sample_indices array + with a given pivot, exactly like a quicksort subroutine. + + Parameters + ---------- + context : SplittingContext + The splitting context + split_info : SplitInfo + The SplitInfo of the node to split + sample_indices : array of unsigned int + The indices of the samples at the node to split. This is a view on + context.partition, and it is modified inplace by placing the indices + of the left child at the beginning, and the indices of the right child + at the end. + + Returns + ------- + left_indices : array of int + The indices of the samples in the left child. This is a view on + context.partition. + right_indices : array of int + The indices of the samples in the right child. This is a view on + context.partition. + right_child_position : int + The position of the right child in ``sample_indices`` + """ + # This is a multi-threaded implementation inspired by lightgbm. 
+ # Here is a quick break down. Let's suppose we want to split a node with + # 24 samples named from a to x. context.partition looks like this (the * + # are indices in other leaves that we don't care about): + # partition = [*************abcdefghijklmnopqrstuvwx****************] + # ^ ^ + # node_position node_position + node.n_samples + + # Ultimately, we want to reorder the samples inside the boundaries of the + # leaf (which becomes a node) to now represent the samples in its left and + # right child. For example: + # partition = [*************abefilmnopqrtuxcdghjksvw*****************] + # ^ ^ + # left_child_pos right_child_pos + # Note that left_child_pos always takes the value of node_position, and + # right_child_pos = left_child_pos + left_child.n_samples. The order of + # the samples inside a leaf is irrelevant. + + # 1. samples_indices is a view on this region a..x. We conceptually + # divide it into n_threads regions. Each thread will be responsible for + # its own region. Here is an example with 4 threads: + # samples_indices = [abcdef|ghijkl|mnopqr|stuvwx] + # 2. Each thread processes 6 = 24 // 4 entries and maps them into + # left_indices_buffer or right_indices_buffer. For example, we could + # have the following mapping ('.' denotes an undefined entry): + # - left_indices_buffer = [abef..|il....|mnopqr|tux...] + # - right_indices_buffer = [cd....|ghjk..|......|svw...] + # 3. We keep track of the start positions of the regions (the '|') in + # ``offset_in_buffers`` as well as the size of each region. We also keep + # track of the number of samples put into the left/right child by each + # thread. Concretely: + # - left_counts = [4, 2, 6, 3] + # - right_counts = [2, 4, 0, 3] + # 4. Finally, we put left/right_indices_buffer back into the + # samples_indices, without any undefined entries and the partition looks + # as expected + # partition = [*************abefilmnopqrtuxcdghjksvw*****************] + + # Note: We here show left/right_indices_buffer as being the same size as + # sample_indices for simplicity, but in reality they are of the same size + # as partition. + cdef: - unsigned int n_samples = sample_indices.shape[0] - unsigned int i = 0 - unsigned int j = n_samples - 1 - unsigned char pivot = split_info.bin_idx - unsigned int [:] view = sample_indices - NPY_X_BINNED_DTYPE [:] binned_feature = context.X_binned.T[split_info.feature_idx] + int n_samples = sample_indices.shape[0] + NPY_X_BINNED_DTYPE [:] X_binned = context.X_binned.T[split_info.feature_idx] + unsigned int [:] left_indices_buffer = context.left_indices_buffer + unsigned int [:] right_indices_buffer = context.right_indices_buffer + int n_threads = omp_get_max_threads() + int [:] sizes = np.full(n_threads, n_samples // n_threads, dtype=np.int32) + int [:] offset_in_buffers = np.zeros(n_threads, dtype=np.int32) + int [:] left_counts = np.empty(n_threads, dtype=np.int32) + int [:] right_counts = np.empty(n_threads, dtype=np.int32) + int left_count + int right_count + int start + int stop + int i + int thread_idx + int sample_idx + int right_child_position + int [:] left_offset = np.zeros(n_threads, dtype=np.int32) + int [:] right_offset = np.zeros(n_threads, dtype=np.int32) with nogil: - while i != j: - # continue until we find an element that should be on right - while binned_feature[view[i]] <= pivot and i < n_samples: - i += 1 - # same, but now an element that should be on the left - while binned_feature[view[j]] > pivot and j >= 0: - j -= 1 - if i >= j: # j can become smaller than j! 
- break - else: - # swap - view[i], view[j] = view[j], view[i] - i += 1 - j -= 1 - - return sample_indices[:i], sample_indices[i:], i + for thread_idx in range(n_samples % n_threads): + sizes[thread_idx] += 1 + + for thread_idx in range(1, n_threads): + offset_in_buffers[thread_idx] = offset_in_buffers[thread_idx - 1] + sizes[thread_idx - 1] + + # map indices from samples_indices to left/right_indices_buffer + for thread_idx in prange(n_threads): + left_count = 0 + right_count = 0 + + start = offset_in_buffers[thread_idx] + stop = start + sizes[thread_idx] + for i in range(start, stop): + sample_idx = sample_indices[i] + if X_binned[sample_idx] <= split_info.bin_idx: + left_indices_buffer[start + left_count] = sample_idx + left_count = left_count + 1 + else: + right_indices_buffer[start + right_count] = sample_idx + right_count = right_count + 1 + + left_counts[thread_idx] = left_count + right_counts[thread_idx] = right_count + + # position of right child = just after the left child + right_child_position = 0 + for thread_idx in range(n_threads): + right_child_position += left_counts[thread_idx] + + # offset of each thread in samples_indices for left and right child, i.e. + # where each thread will start to write. + right_offset[0] = right_child_position + for thread_idx in range(1, n_threads): + left_offset[thread_idx] = left_offset[thread_idx - 1] + left_counts[thread_idx - 1] + right_offset[thread_idx] = right_offset[thread_idx - 1] + right_counts[thread_idx - 1] + + # map indices in left/right_indices_buffer back into samples_indices. This + # also updates context.partition since samples_indice is a view. + for thread_idx in prange(n_threads): + + for i in range(left_counts[thread_idx]): + sample_indices[left_offset[thread_idx] + i] = \ + left_indices_buffer[offset_in_buffers[thread_idx] + i] + for i in range(right_counts[thread_idx]): + sample_indices[right_offset[thread_idx] + i] = \ + right_indices_buffer[offset_in_buffers[thread_idx] + i] + + return (sample_indices[:right_child_position], + sample_indices[right_child_position:], + right_child_position) def find_node_split( From 2f0c93a7c06b421f21aff08210ff9ed32d720251 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 13 Jan 2019 14:39:33 -0500 Subject: [PATCH 026/247] Unified type imports in types --- sklearn/gbm/_gradient_boosting.pyx | 4 +- sklearn/gbm/binning.pyx | 13 +--- sklearn/gbm/histogram.pxd | 35 +++++----- sklearn/gbm/histogram.pyx | 67 ++++++++++-------- sklearn/gbm/loss.pyx | 11 ++- sklearn/gbm/predictor.pyx | 4 +- sklearn/gbm/setup.py | 4 ++ sklearn/gbm/splitting.pyx | 108 ++++++++++++++++------------- sklearn/gbm/types.pxd | 14 ++++ sklearn/gbm/types.py | 12 ---- sklearn/gbm/types.pyx | 11 +++ 11 files changed, 156 insertions(+), 127 deletions(-) create mode 100644 sklearn/gbm/types.pxd delete mode 100644 sklearn/gbm/types.py create mode 100644 sklearn/gbm/types.pyx diff --git a/sklearn/gbm/_gradient_boosting.pyx b/sklearn/gbm/_gradient_boosting.pyx index e45a7982e0e0e..cfc8d106a60fa 100644 --- a/sklearn/gbm/_gradient_boosting.pyx +++ b/sklearn/gbm/_gradient_boosting.pyx @@ -3,15 +3,15 @@ # cython: boundscheck=False # cython: wraparound=False # cython: language_level=3 + cimport cython from cython.parallel import prange - import numpy as np cimport numpy as np from .types import Y_DTYPE +from .types cimport NPY_Y_DTYPE -ctypedef np.npy_float32 NPY_Y_DTYPE def _update_raw_predictions(NPY_Y_DTYPE [:] raw_predictions, grower): cdef: diff --git a/sklearn/gbm/binning.pyx b/sklearn/gbm/binning.pyx index 
9e18cfeb57134..1c53ca8ea7a3a 100644 --- a/sklearn/gbm/binning.pyx +++ b/sklearn/gbm/binning.pyx @@ -16,14 +16,10 @@ import numpy as np cimport numpy as np from cython.parallel import prange -from sklearn.utils import check_random_state, check_array -from sklearn.base import BaseEstimator, TransformerMixin - +from ..utils import check_random_state, check_array +from ..base import BaseEstimator, TransformerMixin from .types import X_DTYPE, X_BINNED_DTYPE - - -ctypedef np.npy_float64 NPY_X_DTYPE -ctypedef np.npy_uint8 NPY_X_BINNED_DTYPE +from .types cimport NPY_X_DTYPE, NPY_X_BINNED_DTYPE def _find_binning_thresholds(data, max_bins=256, subsample=int(2e5), @@ -85,8 +81,6 @@ cpdef _map_to_bins(const NPY_X_DTYPE [:, :] data, list binning_thresholds, binned_data : array of int, shape=data.shape The binned data. """ - # TODO: add support for categorical data encoded as integers - # TODO: add support for sparse data (numerical or categorical) cdef: int feature_idx @@ -106,7 +100,6 @@ cpdef void _map_num_col_to_bins(const NPY_X_DTYPE [:] data, int right int middle - # for i in range(data.shape[0]): for i in prange(data.shape[0], schedule='static'): left, right = 0, binning_thresholds.shape[0] while left < right: diff --git a/sklearn/gbm/histogram.pxd b/sklearn/gbm/histogram.pxd index ccc3532757f5f..11ef0bf831594 100644 --- a/sklearn/gbm/histogram.pxd +++ b/sklearn/gbm/histogram.pxd @@ -2,26 +2,25 @@ import numpy as np cimport numpy as np from .types import HISTOGRAM_DTYPE +from .types cimport NPY_X_BINNED_DTYPE +from .types cimport NPY_Y_DTYPE +from .types cimport hist_struct -ctypedef np.npy_uint8 NPY_X_BINNED_DTYPE -ctypedef np.npy_float32 NPY_Y_DTYPE +# See histogram.pyx for docstrings and details -cdef packed struct hist_struct: - float sum_gradients - float sum_hessians - unsigned int count - -cpdef void _subtract_histograms(unsigned int n_bins, - hist_struct [:] hist_a, - hist_struct [:] hist_b, - hist_struct [:] out) nogil +cpdef void _subtract_histograms( + unsigned int n_bins, + hist_struct [:] hist_a, + hist_struct [:] hist_b, + hist_struct [:] out) nogil -cpdef void _build_histogram(unsigned int n_bins, - unsigned int [:] sample_indices, - NPY_X_BINNED_DTYPE [:] binned_feature, - NPY_Y_DTYPE [:] ordered_gradients, - NPY_Y_DTYPE [:] ordered_hessians, - hist_struct [:] out) nogil +cpdef void _build_histogram( + unsigned int n_bins, + unsigned int [:] sample_indices, + NPY_X_BINNED_DTYPE [:] binned_feature, + NPY_Y_DTYPE [:] ordered_gradients, + NPY_Y_DTYPE [:] ordered_hessians, + hist_struct [:] out) nogil cpdef void _build_histogram_no_hessian( unsigned int n_bins, @@ -41,4 +40,4 @@ cpdef void _build_histogram_root( NPY_X_BINNED_DTYPE [:] binned_feature, NPY_Y_DTYPE [:] all_gradients, NPY_Y_DTYPE [:] all_hessians, - hist_struct [:] out) nogil \ No newline at end of file + hist_struct [:] out) nogil diff --git a/sklearn/gbm/histogram.pyx b/sklearn/gbm/histogram.pyx index dea4c9bdf803b..ab8f20303a158 100644 --- a/sklearn/gbm/histogram.pyx +++ b/sklearn/gbm/histogram.pyx @@ -15,12 +15,16 @@ cimport numpy as np from .types import HISTOGRAM_DTYPE -cpdef void _build_histogram_naive(unsigned int n_bins, - unsigned int [:] sample_indices, - NPY_X_BINNED_DTYPE [:] binned_feature, - NPY_Y_DTYPE [:] ordered_gradients, - NPY_Y_DTYPE [:] ordered_hessians, - hist_struct [:] out) nogil: +# Note: IN views are read-only, OUT views are write-only + +cpdef void _build_histogram_naive( + unsigned int n_bins, + unsigned int [:] sample_indices, # IN + NPY_X_BINNED_DTYPE [:] binned_feature, # IN + NPY_Y_DTYPE 
[:] ordered_gradients, # IN + NPY_Y_DTYPE [:] ordered_hessians, # IN + hist_struct [:] out # OUT + ) nogil: """Build histogram in a naive way, without optimizing for cache hit.""" cdef: unsigned int i @@ -36,11 +40,13 @@ cpdef void _build_histogram_naive(unsigned int n_bins, out[bin_idx].count += 1 -cpdef void _subtract_histograms(unsigned int n_bins, - hist_struct [:] hist_a, - hist_struct [:] hist_b, - hist_struct [:] out) nogil: - """Return hist_a - hist_b""" +cpdef void _subtract_histograms( + unsigned int n_bins, + hist_struct [:] hist_a, # IN + hist_struct [:] hist_b, # IN + hist_struct [:] out # OUT + ) nogil: + """compute (hist_a - hist_b) in out""" cdef: unsigned int i = 0 @@ -50,12 +56,14 @@ cpdef void _subtract_histograms(unsigned int n_bins, out[i].count = hist_a[i].count - hist_b[i].count -cpdef void _build_histogram(unsigned int n_bins, - unsigned int [:] sample_indices, - NPY_X_BINNED_DTYPE [:] binned_feature, - NPY_Y_DTYPE [:] ordered_gradients, - NPY_Y_DTYPE [:] ordered_hessians, - hist_struct [:] out) nogil: +cpdef void _build_histogram( + unsigned int n_bins, + unsigned int [:] sample_indices, # IN + NPY_X_BINNED_DTYPE [:] binned_feature, # IN + NPY_Y_DTYPE [:] ordered_gradients, # IN + NPY_Y_DTYPE [:] ordered_hessians, # IN + hist_struct [:] out # OUT + ) nogil: """Return histogram for a given feature.""" cdef: unsigned int i = 0 @@ -98,10 +106,11 @@ cpdef void _build_histogram(unsigned int n_bins, cpdef void _build_histogram_no_hessian( unsigned int n_bins, - unsigned int [:] sample_indices, - NPY_X_BINNED_DTYPE [:] binned_feature, - NPY_Y_DTYPE [:] ordered_gradients, - hist_struct [:] out) nogil: + unsigned int [:] sample_indices, # IN + NPY_X_BINNED_DTYPE [:] binned_feature, # IN + NPY_Y_DTYPE [:] ordered_gradients, # OUT + hist_struct [:] out # OUT + ) nogil: """Return histogram for a given feature.""" cdef: unsigned int i = 0 @@ -138,9 +147,10 @@ cpdef void _build_histogram_no_hessian( cpdef void _build_histogram_root_no_hessian( unsigned int n_bins, - NPY_X_BINNED_DTYPE [:] binned_feature, - NPY_Y_DTYPE [:] all_gradients, - hist_struct [:] out) nogil: + NPY_X_BINNED_DTYPE [:] binned_feature, # IN + NPY_Y_DTYPE [:] all_gradients, # IN + hist_struct [:] out # OUT + ) nogil: """Special case for the root node The root node has to find the split among all the samples from the @@ -184,10 +194,11 @@ cpdef void _build_histogram_root_no_hessian( cpdef void _build_histogram_root( unsigned int n_bins, - NPY_X_BINNED_DTYPE [:] binned_feature, - NPY_Y_DTYPE [:] all_gradients, - NPY_Y_DTYPE [:] all_hessians, - hist_struct [:] out) nogil: + NPY_X_BINNED_DTYPE [:] binned_feature, # IN + NPY_Y_DTYPE [:] all_gradients, # IN + NPY_Y_DTYPE [:] all_hessians, # IN + hist_struct [:] out # OUT + ) nogil: """Special case for the root node The root node has to find the split among all the samples from the diff --git a/sklearn/gbm/loss.pyx b/sklearn/gbm/loss.pyx index eb6796d041aaf..4c8f6ee673c9f 100644 --- a/sklearn/gbm/loss.pyx +++ b/sklearn/gbm/loss.pyx @@ -13,15 +13,12 @@ from abc import ABC, abstractmethod cimport cython from cython.parallel import prange - import numpy as np cimport numpy as np - from scipy.special import expit, logsumexp from .types import Y_DTYPE - -ctypedef np.npy_float32 NPY_Y_DTYPE +from .types cimport NPY_Y_DTYPE cdef get_threads_chunks(unsigned int total_size): @@ -157,9 +154,9 @@ class LeastSquares(BaseLoss): cdef void _update_gradients_least_squares( - NPY_Y_DTYPE[:] gradients, - NPY_Y_DTYPE[:] y_true, - NPY_Y_DTYPE[:] raw_predictions) nogil: + NPY_Y_DTYPE 
[:] gradients, + NPY_Y_DTYPE [:] y_true, + NPY_Y_DTYPE [:] raw_predictions) nogil: cdef: unsigned int n_samples int i diff --git a/sklearn/gbm/predictor.pyx b/sklearn/gbm/predictor.pyx index 0620a66a0e695..a2b3c03a3955e 100644 --- a/sklearn/gbm/predictor.pyx +++ b/sklearn/gbm/predictor.pyx @@ -10,6 +10,7 @@ import numpy as np cimport numpy as np from .types import X_DTYPE +from .types cimport NPY_X_DTYPE PREDICTOR_RECORD_DTYPE = np.dtype([ @@ -23,11 +24,8 @@ PREDICTOR_RECORD_DTYPE = np.dtype([ ('depth', np.uint32), ('is_leaf', np.uint8), ('bin_threshold', np.uint8), - # TODO: shrinkage in leaf for feature importance error bar? ]) -ctypedef np.npy_float64 NPY_X_DTYPE - cdef packed struct node_struct: float value unsigned int count diff --git a/sklearn/gbm/setup.py b/sklearn/gbm/setup.py index 48678c19f67b2..369406ada6ab2 100644 --- a/sklearn/gbm/setup.py +++ b/sklearn/gbm/setup.py @@ -37,6 +37,10 @@ def configuration(parent_package="", top_path=None): extra_compile_args=['-fopenmp'], extra_link_args=['-fopenmp']) + config.add_extension("types", + sources=["types.pyx"], + include_dirs=[numpy.get_include()]) + config.add_extension("playground", sources=["playground.pyx"], include_dirs=[numpy.get_include()]) diff --git a/sklearn/gbm/splitting.pyx b/sklearn/gbm/splitting.pyx index 1e20d444fbf43..341ef10b88131 100644 --- a/sklearn/gbm/splitting.pyx +++ b/sklearn/gbm/splitting.pyx @@ -3,7 +3,7 @@ # cython: boundscheck=False # cython: wraparound=False # cython: language_level=3 -"""This module contains njitted routines and data structures to: +"""This module contains routines and data structures to: - Find the best possible split of a node. For a given node, a split is characterized by a feature and a bin. @@ -12,30 +12,25 @@ """ cimport cython from cython.parallel import prange -from openmp cimport omp_get_max_threads - -from libc.stdlib cimport malloc, free - import numpy as np cimport numpy as np +from openmp cimport omp_get_max_threads +from libc.stdlib cimport malloc, free from .histogram cimport _build_histogram from .histogram cimport _build_histogram_no_hessian from .histogram cimport _build_histogram_root from .histogram cimport _build_histogram_root_no_hessian from .histogram cimport _subtract_histograms -from .histogram cimport NPY_X_BINNED_DTYPE -from .histogram cimport NPY_Y_DTYPE - +from .types cimport NPY_X_BINNED_DTYPE +from .types cimport NPY_Y_DTYPE +from .types cimport hist_struct from .types import HISTOGRAM_DTYPE -cdef struct hist_struct: - float sum_gradients - float sum_hessians - unsigned int count - cdef struct split_info_struct: + # Same as the SplitInfo class, but we need a C struct to use it in nogil + # mode. float gain unsigned int feature_idx unsigned int bin_idx @@ -46,7 +41,7 @@ cdef struct split_info_struct: unsigned int n_samples_left unsigned int n_samples_right -@cython.freelist(100) + @cython.final cdef class SplitInfo: """Pure data class to store information about a potential split. 
@@ -86,7 +81,8 @@ cdef class SplitInfo: def __init__(self, float gain=-1., unsigned int feature_idx=0, unsigned int bin_idx=0, float gradient_left=0., float hessian_left=0., float gradient_right=0., float hessian_right=0., - unsigned int n_samples_left=0, unsigned int n_samples_right=0): + unsigned int n_samples_left=0, unsigned int + n_samples_right=0): self.gain = gain self.feature_idx = feature_idx self.bin_idx = bin_idx @@ -157,11 +153,12 @@ cdef class SplittingContext: unsigned int [:] left_indices_buffer unsigned int [:] right_indices_buffer - def __cinit__(self, NPY_X_BINNED_DTYPE [:, :] X_binned, unsigned int max_bins, - np.ndarray[np.uint32_t] n_bins_per_feature, - NPY_Y_DTYPE [:] gradients, NPY_Y_DTYPE [:] hessians, float l2_regularization, - float min_hessian_to_split=1e-3, unsigned int min_samples_leaf=20, - float min_gain_to_split=0.): + def __init__(self, NPY_X_BINNED_DTYPE [:, :] X_binned, unsigned int + max_bins, np.ndarray[np.uint32_t] n_bins_per_feature, + NPY_Y_DTYPE [:] gradients, NPY_Y_DTYPE [:] hessians, float + l2_regularization, float min_hessian_to_split=1e-3, + unsigned int min_samples_leaf=20, float + min_gain_to_split=0.): self.X_binned = X_binned self.n_features = X_binned.shape[1] @@ -302,7 +299,8 @@ def split_indices( sizes[thread_idx] += 1 for thread_idx in range(1, n_threads): - offset_in_buffers[thread_idx] = offset_in_buffers[thread_idx - 1] + sizes[thread_idx - 1] + offset_in_buffers[thread_idx] = \ + offset_in_buffers[thread_idx - 1] + sizes[thread_idx - 1] # map indices from samples_indices to left/right_indices_buffer for thread_idx in prange(n_threads): @@ -332,8 +330,10 @@ def split_indices( # where each thread will start to write. right_offset[0] = right_child_position for thread_idx in range(1, n_threads): - left_offset[thread_idx] = left_offset[thread_idx - 1] + left_counts[thread_idx - 1] - right_offset[thread_idx] = right_offset[thread_idx - 1] + right_counts[thread_idx - 1] + left_offset[thread_idx] = \ + left_offset[thread_idx - 1] + left_counts[thread_idx - 1] + right_offset[thread_idx] = \ + right_offset[thread_idx - 1] + right_counts[thread_idx - 1] # map indices in left/right_indices_buffer back into samples_indices. This # also updates context.partition since samples_indice is a view. @@ -353,8 +353,8 @@ def split_indices( def find_node_split( SplittingContext context, - unsigned int [:] sample_indices, - hist_struct [:, :] histograms): + unsigned int [:] sample_indices, # IN + hist_struct [:, :] histograms): # OUT """For each feature, find the best bin to split on at a given node. Returns the best split info among all features, and the histograms of @@ -396,25 +396,29 @@ def find_node_split( if sample_indices.shape[0] != context.gradients.shape[0]: if context.constant_hessian: for i in prange(n_samples, schedule='static'): - context.ordered_gradients[i] = context.gradients[sample_indices[i]] + context.ordered_gradients[i] = \ + context.gradients[sample_indices[i]] else: for i in prange(n_samples, schedule='static'): - context.ordered_gradients[i] = context.gradients[sample_indices[i]] - context.ordered_hessians[i] = context.hessians[sample_indices[i]] + context.ordered_gradients[i] = \ + context.gradients[sample_indices[i]] + context.ordered_hessians[i] = \ + context.hessians[sample_indices[i]] context.sum_gradients = 0. 
for i in range(n_samples): context.sum_gradients += context.ordered_gradients[i] if context.constant_hessian: - context.sum_hessians = context.constant_hessian_value * (n_samples) + context.sum_hessians = context.constant_hessian_value * n_samples else: context.sum_hessians = 0. for i in range(n_samples): context.sum_hessians += context.ordered_hessians[i] # TODO: this needs to be freed at some point - split_infos = malloc(context.n_features * sizeof(split_info_struct)) + split_infos = malloc( + context.n_features * sizeof(split_info_struct)) for feature_idx in prange(context.n_features): split_info = _find_histogram_split( context, feature_idx, sample_indices, histograms[feature_idx]) @@ -437,10 +441,10 @@ def find_node_split( def find_node_split_subtraction( SplittingContext context, - unsigned int [:] sample_indices, - hist_struct [:, :] parent_histograms, - hist_struct [:, :] sibling_histograms, - hist_struct [:, :] histograms): + unsigned int [:] sample_indices, # IN + hist_struct [:, :] parent_histograms, # IN + hist_struct [:, :] sibling_histograms, # IN + hist_struct [:, :] histograms): # OUT """For each feature, find the best bin to split on at a given node. Returns the best split info among all features, and the histograms of @@ -466,6 +470,9 @@ def find_node_split_subtraction( sibling_histograms : array of HISTOGRAM_DTYPE of \ shape(n_features, max_bins) The histograms of the sibling + histograms : array of HISTOGRAM_DTYPE of \ + shape(n_features, max_bins) + The computed histograms Returns ------- @@ -496,7 +503,8 @@ def find_node_split_subtraction( # be to compute an average but it's probably not worth it. context.sum_gradients = 0. for i in range(context.max_bins): - context.sum_gradients += parent_histograms[0, i].sum_gradients - sibling_histograms[0, i].sum_gradients + context.sum_gradients += (parent_histograms[0, i].sum_gradients - + sibling_histograms[0, i].sum_gradients) if context.constant_hessian: context.sum_hessians = \ @@ -504,10 +512,12 @@ def find_node_split_subtraction( else: context.sum_hessians = 0. for i in range(context.max_bins): - context.sum_hessians += parent_histograms[0, i].sum_hessians - sibling_histograms[0, i].sum_hessians + context.sum_hessians += (parent_histograms[0, i].sum_hessians - + sibling_histograms[0, i].sum_hessians) # TODO: this needs to be freed at some point - split_infos = malloc(context.n_features * sizeof(split_info_struct)) + split_infos = malloc( + context.n_features * sizeof(split_info_struct)) for feature_idx in prange(context.n_features): split_info = _find_histogram_split_subtraction( context, feature_idx, parent_histograms[feature_idx], @@ -532,7 +542,8 @@ def find_node_split_subtraction( cdef split_info_struct _find_best_feature_to_split_helper( SplittingContext context, - split_info_struct * split_infos) nogil: + split_info_struct * split_infos # IN + ) nogil: cdef: float gain float best_gain @@ -552,8 +563,9 @@ cdef split_info_struct _find_best_feature_to_split_helper( cdef split_info_struct _find_histogram_split( SplittingContext context, unsigned int feature_idx, - unsigned int [:] sample_indices, - hist_struct [:] histogram) nogil: + unsigned int [:] sample_indices, # IN + hist_struct [:] histogram # OUT + ) nogil: """Compute the histogram for a given feature Returns the best SplitInfo among all the possible bins of the feature. 
@@ -563,7 +575,8 @@ cdef split_info_struct _find_histogram_split( unsigned int n_samples = sample_indices.shape[0] NPY_X_BINNED_DTYPE [:] X_binned = context.X_binned.T[feature_idx] unsigned int root_node = X_binned.shape[0] == n_samples - NPY_Y_DTYPE [:] ordered_gradients = context.ordered_gradients[:n_samples] + NPY_Y_DTYPE [:] ordered_gradients = \ + context.ordered_gradients[:n_samples] NPY_Y_DTYPE [:] ordered_hessians = context.ordered_hessians[:n_samples] if root_node: @@ -588,10 +601,11 @@ cdef split_info_struct _find_histogram_split( cdef split_info_struct _find_histogram_split_subtraction( SplittingContext context, unsigned int feature_idx, - hist_struct [:] parent_histogram, - hist_struct [:] sibling_histogram, - hist_struct [:] histogram, - unsigned int n_samples) nogil: + hist_struct [:] parent_histogram, # IN + hist_struct [:] sibling_histogram, # IN + hist_struct [:] histogram, # OUT + unsigned int n_samples + ) nogil: """Compute the histogram by substraction of parent and sibling Uses the identity: hist(parent) = hist(left) + hist(right). @@ -608,7 +622,7 @@ cdef split_info_struct _find_histogram_split_subtraction( cdef split_info_struct _find_best_bin_to_split_helper( SplittingContext context, unsigned int feature_idx, - hist_struct [:] histogram, + hist_struct [:] histogram, # IN unsigned int n_samples) nogil: """Find best bin to split on, and return the corresponding SplitInfo. @@ -708,7 +722,7 @@ cdef inline float negative_loss( float l2_regularization) nogil: return (gradient * gradient) / (hessian + l2_regularization) -# Only used for tests... not sure how to do it +# Only used for tests... not great def _find_histogram_split_wrapper( SplittingContext context, unsigned int feature_idx, diff --git a/sklearn/gbm/types.pxd b/sklearn/gbm/types.pxd new file mode 100644 index 0000000000000..d4cea50da0b19 --- /dev/null +++ b/sklearn/gbm/types.pxd @@ -0,0 +1,14 @@ +import numpy as np +cimport numpy as np + + +ctypedef np.npy_float64 NPY_X_DTYPE +ctypedef np.npy_uint8 NPY_X_BINNED_DTYPE +ctypedef np.npy_float32 NPY_Y_DTYPE + +# Same as histogram dtype but we need a struct to declare views. 
It needs to be +# packed since by default numpy dtypes aren't aligned +cdef packed struct hist_struct: + float sum_gradients + float sum_hessians + unsigned int count diff --git a/sklearn/gbm/types.py b/sklearn/gbm/types.py deleted file mode 100644 index 738ac539b46b4..0000000000000 --- a/sklearn/gbm/types.py +++ /dev/null @@ -1,12 +0,0 @@ -import numpy as np - - -Y_DTYPE = np.float32 -X_DTYPE = np.float64 -X_BINNED_DTYPE = np.uint8 - -HISTOGRAM_DTYPE = np.dtype([ - ('sum_gradients', np.float32), - ('sum_hessians', np.float32), - ('count', np.uint32), -]) diff --git a/sklearn/gbm/types.pyx b/sklearn/gbm/types.pyx new file mode 100644 index 0000000000000..24b27ba8917d0 --- /dev/null +++ b/sklearn/gbm/types.pyx @@ -0,0 +1,11 @@ +import numpy as np + +Y_DTYPE = np.float32 +X_DTYPE = np.float64 +X_BINNED_DTYPE = np.uint8 + +HISTOGRAM_DTYPE = np.dtype([ + ('sum_gradients', np.float32), # sum of sample gradients in bin + ('sum_hessians', np.float32), # sum of sample hessians in bin + ('count', np.uint32), # number of samples in bin +]) From ca4d144318f47c43a21d56d1065fe9f264dba511 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 13 Jan 2019 14:58:19 -0500 Subject: [PATCH 027/247] Tried parallelize prediction but doesn't work :( --- sklearn/gbm/predictor.pyx | 6 +++++- sklearn/gbm/setup.py | 5 +++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/sklearn/gbm/predictor.pyx b/sklearn/gbm/predictor.pyx index a2b3c03a3955e..a882011e15717 100644 --- a/sklearn/gbm/predictor.pyx +++ b/sklearn/gbm/predictor.pyx @@ -6,6 +6,8 @@ """ This module contains the TreePredictor class which is used for prediction. """ +cimport cython +from cython.parallel import prange import numpy as np cimport numpy as np @@ -100,7 +102,9 @@ cdef void _predict_from_numeric_data( float [:] out) nogil: cdef: - unsigned int i + int i + # TODO: Why does prange fail?? 
+ # for i in prange(numeric_data.shape[0], schedule='static'): for i in range(numeric_data.shape[0]): out[i] = _predict_one_from_numeric_data(nodes, numeric_data[i]) diff --git a/sklearn/gbm/setup.py b/sklearn/gbm/setup.py index 369406ada6ab2..1ebee4cf3fbfe 100644 --- a/sklearn/gbm/setup.py +++ b/sklearn/gbm/setup.py @@ -29,7 +29,9 @@ def configuration(parent_package="", top_path=None): config.add_extension("predictor", sources=["predictor.pyx"], - include_dirs=[numpy.get_include()]) + include_dirs=[numpy.get_include()], + extra_compile_args=['-fopenmp'], + extra_link_args=['-fopenmp']) config.add_extension("loss", sources=["loss.pyx"], @@ -53,4 +55,3 @@ def configuration(parent_package="", top_path=None): if __name__ == "__main__": from numpy.distutils.core import setup setup(**configuration().todict()) - From 67602e5decb81ee9955fcddb8136ace6e7c69f2e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 13 Jan 2019 16:26:37 -0500 Subject: [PATCH 028/247] made tests use types in types.pyx instead of hardcoded types --- sklearn/gbm/_gradient_boosting.pyx | 10 ++--- sklearn/gbm/binning.pyx | 12 +++--- sklearn/gbm/histogram.pxd | 24 ++++++------ sklearn/gbm/histogram.pyx | 26 ++++++------- sklearn/gbm/loss.pyx | 10 ++--- sklearn/gbm/predictor.pyx | 8 ++-- sklearn/gbm/splitting.pyx | 26 ++++++------- sklearn/gbm/tests/test_compare_lightgbm.py | 24 ++++++------ sklearn/gbm/tests/test_grower.py | 19 +++++---- sklearn/gbm/tests/test_histogram.py | 30 ++++++++------- sklearn/gbm/tests/test_predictor.py | 6 +-- sklearn/gbm/tests/test_splitting.py | 45 ++++++++++++---------- sklearn/gbm/types.pxd | 10 ++--- sklearn/gbm/types.pyx | 6 +-- 14 files changed, 132 insertions(+), 124 deletions(-) diff --git a/sklearn/gbm/_gradient_boosting.pyx b/sklearn/gbm/_gradient_boosting.pyx index cfc8d106a60fa..631fea1c6f55e 100644 --- a/sklearn/gbm/_gradient_boosting.pyx +++ b/sklearn/gbm/_gradient_boosting.pyx @@ -10,15 +10,15 @@ import numpy as np cimport numpy as np from .types import Y_DTYPE -from .types cimport NPY_Y_DTYPE +from .types cimport Y_DTYPE_C -def _update_raw_predictions(NPY_Y_DTYPE [:] raw_predictions, grower): +def _update_raw_predictions(Y_DTYPE_C [:] raw_predictions, grower): cdef: unsigned int [:] starts unsigned int [:] stops unsigned int [:] partition - NPY_Y_DTYPE [:] values + Y_DTYPE_C [:] values list leaves leaves = grower.finalized_leaves @@ -31,11 +31,11 @@ def _update_raw_predictions(NPY_Y_DTYPE [:] raw_predictions, grower): values) cdef void _update_raw_predictions_helper( - NPY_Y_DTYPE [:] raw_predictions, + Y_DTYPE_C [:] raw_predictions, unsigned int [:] starts, unsigned int [:] stops, unsigned int [:] partition, - NPY_Y_DTYPE [:] values) nogil: + Y_DTYPE_C [:] values) nogil: cdef: int sample_idx diff --git a/sklearn/gbm/binning.pyx b/sklearn/gbm/binning.pyx index 1c53ca8ea7a3a..7abd49013a36d 100644 --- a/sklearn/gbm/binning.pyx +++ b/sklearn/gbm/binning.pyx @@ -19,7 +19,7 @@ from cython.parallel import prange from ..utils import check_random_state, check_array from ..base import BaseEstimator, TransformerMixin from .types import X_DTYPE, X_BINNED_DTYPE -from .types cimport NPY_X_DTYPE, NPY_X_BINNED_DTYPE +from .types cimport X_DTYPE_C, X_BINNED_DTYPE_C def _find_binning_thresholds(data, max_bins=256, subsample=int(2e5), @@ -62,8 +62,8 @@ def _find_binning_thresholds(data, max_bins=256, subsample=int(2e5), return binning_thresholds -cpdef _map_to_bins(const NPY_X_DTYPE [:, :] data, list binning_thresholds, - NPY_X_BINNED_DTYPE [::1, :] binned): +cpdef _map_to_bins(const X_DTYPE_C [:, 
:] data, list binning_thresholds, + X_BINNED_DTYPE_C [::1, :] binned): """Bin numerical values to discrete integer-coded levels. Parameters @@ -90,9 +90,9 @@ cpdef _map_to_bins(const NPY_X_DTYPE [:, :] data, list binning_thresholds, binned[:, feature_idx]) -cpdef void _map_num_col_to_bins(const NPY_X_DTYPE [:] data, - const NPY_X_DTYPE [:] binning_thresholds, - NPY_X_BINNED_DTYPE [:] binned) nogil: +cpdef void _map_num_col_to_bins(const X_DTYPE_C [:] data, + const X_DTYPE_C [:] binning_thresholds, + X_BINNED_DTYPE_C [:] binned) nogil: """Binary search to the find the bin index for each value in data.""" cdef: int i diff --git a/sklearn/gbm/histogram.pxd b/sklearn/gbm/histogram.pxd index 11ef0bf831594..deb2d7b8e18bf 100644 --- a/sklearn/gbm/histogram.pxd +++ b/sklearn/gbm/histogram.pxd @@ -2,8 +2,8 @@ import numpy as np cimport numpy as np from .types import HISTOGRAM_DTYPE -from .types cimport NPY_X_BINNED_DTYPE -from .types cimport NPY_Y_DTYPE +from .types cimport X_BINNED_DTYPE_C +from .types cimport Y_DTYPE_C from .types cimport hist_struct # See histogram.pyx for docstrings and details @@ -17,27 +17,27 @@ cpdef void _subtract_histograms( cpdef void _build_histogram( unsigned int n_bins, unsigned int [:] sample_indices, - NPY_X_BINNED_DTYPE [:] binned_feature, - NPY_Y_DTYPE [:] ordered_gradients, - NPY_Y_DTYPE [:] ordered_hessians, + X_BINNED_DTYPE_C [:] binned_feature, + Y_DTYPE_C [:] ordered_gradients, + Y_DTYPE_C [:] ordered_hessians, hist_struct [:] out) nogil cpdef void _build_histogram_no_hessian( unsigned int n_bins, unsigned int [:] sample_indices, - NPY_X_BINNED_DTYPE [:] binned_feature, - NPY_Y_DTYPE [:] ordered_gradients, + X_BINNED_DTYPE_C [:] binned_feature, + Y_DTYPE_C [:] ordered_gradients, hist_struct [:] out) nogil cpdef void _build_histogram_root_no_hessian( unsigned int n_bins, - NPY_X_BINNED_DTYPE [:] binned_feature, - NPY_Y_DTYPE [:] all_gradients, + X_BINNED_DTYPE_C [:] binned_feature, + Y_DTYPE_C [:] all_gradients, hist_struct [:] out) nogil cpdef void _build_histogram_root( unsigned int n_bins, - NPY_X_BINNED_DTYPE [:] binned_feature, - NPY_Y_DTYPE [:] all_gradients, - NPY_Y_DTYPE [:] all_hessians, + X_BINNED_DTYPE_C [:] binned_feature, + Y_DTYPE_C [:] all_gradients, + Y_DTYPE_C [:] all_hessians, hist_struct [:] out) nogil diff --git a/sklearn/gbm/histogram.pyx b/sklearn/gbm/histogram.pyx index ab8f20303a158..841e60905008d 100644 --- a/sklearn/gbm/histogram.pyx +++ b/sklearn/gbm/histogram.pyx @@ -20,9 +20,9 @@ from .types import HISTOGRAM_DTYPE cpdef void _build_histogram_naive( unsigned int n_bins, unsigned int [:] sample_indices, # IN - NPY_X_BINNED_DTYPE [:] binned_feature, # IN - NPY_Y_DTYPE [:] ordered_gradients, # IN - NPY_Y_DTYPE [:] ordered_hessians, # IN + X_BINNED_DTYPE_C [:] binned_feature, # IN + Y_DTYPE_C [:] ordered_gradients, # IN + Y_DTYPE_C [:] ordered_hessians, # IN hist_struct [:] out # OUT ) nogil: """Build histogram in a naive way, without optimizing for cache hit.""" @@ -59,9 +59,9 @@ cpdef void _subtract_histograms( cpdef void _build_histogram( unsigned int n_bins, unsigned int [:] sample_indices, # IN - NPY_X_BINNED_DTYPE [:] binned_feature, # IN - NPY_Y_DTYPE [:] ordered_gradients, # IN - NPY_Y_DTYPE [:] ordered_hessians, # IN + X_BINNED_DTYPE_C [:] binned_feature, # IN + Y_DTYPE_C [:] ordered_gradients, # IN + Y_DTYPE_C [:] ordered_hessians, # IN hist_struct [:] out # OUT ) nogil: """Return histogram for a given feature.""" @@ -107,8 +107,8 @@ cpdef void _build_histogram( cpdef void _build_histogram_no_hessian( unsigned int n_bins, 
unsigned int [:] sample_indices, # IN - NPY_X_BINNED_DTYPE [:] binned_feature, # IN - NPY_Y_DTYPE [:] ordered_gradients, # OUT + X_BINNED_DTYPE_C [:] binned_feature, # IN + Y_DTYPE_C [:] ordered_gradients, # OUT hist_struct [:] out # OUT ) nogil: """Return histogram for a given feature.""" @@ -147,8 +147,8 @@ cpdef void _build_histogram_no_hessian( cpdef void _build_histogram_root_no_hessian( unsigned int n_bins, - NPY_X_BINNED_DTYPE [:] binned_feature, # IN - NPY_Y_DTYPE [:] all_gradients, # IN + X_BINNED_DTYPE_C [:] binned_feature, # IN + Y_DTYPE_C [:] all_gradients, # IN hist_struct [:] out # OUT ) nogil: """Special case for the root node @@ -194,9 +194,9 @@ cpdef void _build_histogram_root_no_hessian( cpdef void _build_histogram_root( unsigned int n_bins, - NPY_X_BINNED_DTYPE [:] binned_feature, # IN - NPY_Y_DTYPE [:] all_gradients, # IN - NPY_Y_DTYPE [:] all_hessians, # IN + X_BINNED_DTYPE_C [:] binned_feature, # IN + Y_DTYPE_C [:] all_gradients, # IN + Y_DTYPE_C [:] all_hessians, # IN hist_struct [:] out # OUT ) nogil: """Special case for the root node diff --git a/sklearn/gbm/loss.pyx b/sklearn/gbm/loss.pyx index 4c8f6ee673c9f..2a62a91190e9b 100644 --- a/sklearn/gbm/loss.pyx +++ b/sklearn/gbm/loss.pyx @@ -18,7 +18,7 @@ cimport numpy as np from scipy.special import expit, logsumexp from .types import Y_DTYPE -from .types cimport NPY_Y_DTYPE +from .types cimport Y_DTYPE_C cdef get_threads_chunks(unsigned int total_size): @@ -141,7 +141,7 @@ class LeastSquares(BaseLoss): return loss.mean() if average else loss def get_baseline_prediction(self, y_train, prediction_dim): - return np.mean(y_train) + return np.mean(y_train).astype(Y_DTYPE) def inverse_link_function(self, raw_predictions): return raw_predictions @@ -154,9 +154,9 @@ class LeastSquares(BaseLoss): cdef void _update_gradients_least_squares( - NPY_Y_DTYPE [:] gradients, - NPY_Y_DTYPE [:] y_true, - NPY_Y_DTYPE [:] raw_predictions) nogil: + Y_DTYPE_C [:] gradients, + Y_DTYPE_C [:] y_true, + Y_DTYPE_C [:] raw_predictions) nogil: cdef: unsigned int n_samples int i diff --git a/sklearn/gbm/predictor.pyx b/sklearn/gbm/predictor.pyx index a882011e15717..e18aa1533bf74 100644 --- a/sklearn/gbm/predictor.pyx +++ b/sklearn/gbm/predictor.pyx @@ -12,7 +12,7 @@ import numpy as np cimport numpy as np from .types import X_DTYPE -from .types cimport NPY_X_DTYPE +from .types cimport X_DTYPE_C PREDICTOR_RECORD_DTYPE = np.dtype([ @@ -32,7 +32,7 @@ cdef packed struct node_struct: float value unsigned int count unsigned int feature_idx - NPY_X_DTYPE threshold + X_DTYPE_C threshold unsigned int left unsigned int right float gain @@ -82,7 +82,7 @@ class TreePredictor: cdef float _predict_one_from_numeric_data( node_struct [:] nodes, - const NPY_X_DTYPE [:] numeric_data) nogil: + const X_DTYPE_C [:] numeric_data) nogil: cdef: node_struct node = nodes[0] @@ -98,7 +98,7 @@ cdef float _predict_one_from_numeric_data( cdef void _predict_from_numeric_data( node_struct [:] nodes, - const NPY_X_DTYPE [:, :] numeric_data, + const X_DTYPE_C [:, :] numeric_data, float [:] out) nogil: cdef: diff --git a/sklearn/gbm/splitting.pyx b/sklearn/gbm/splitting.pyx index 341ef10b88131..4d3a919027555 100644 --- a/sklearn/gbm/splitting.pyx +++ b/sklearn/gbm/splitting.pyx @@ -22,8 +22,8 @@ from .histogram cimport _build_histogram_no_hessian from .histogram cimport _build_histogram_root from .histogram cimport _build_histogram_root_no_hessian from .histogram cimport _subtract_histograms -from .types cimport NPY_X_BINNED_DTYPE -from .types cimport NPY_Y_DTYPE +from .types 
cimport X_BINNED_DTYPE_C +from .types cimport Y_DTYPE_C from .types cimport hist_struct from .types import HISTOGRAM_DTYPE @@ -132,14 +132,14 @@ cdef class SplittingContext: be ignored. """ cdef public: - NPY_X_BINNED_DTYPE [:, :] X_binned + X_BINNED_DTYPE_C [:, :] X_binned unsigned int n_features unsigned int max_bins unsigned int [:] n_bins_per_feature - NPY_Y_DTYPE [:] gradients - NPY_Y_DTYPE [:] hessians - NPY_Y_DTYPE [:] ordered_gradients - NPY_Y_DTYPE [:] ordered_hessians + Y_DTYPE_C [:] gradients + Y_DTYPE_C [:] hessians + Y_DTYPE_C [:] ordered_gradients + Y_DTYPE_C [:] ordered_hessians float sum_gradients float sum_hessians unsigned char constant_hessian @@ -153,9 +153,9 @@ cdef class SplittingContext: unsigned int [:] left_indices_buffer unsigned int [:] right_indices_buffer - def __init__(self, NPY_X_BINNED_DTYPE [:, :] X_binned, unsigned int + def __init__(self, X_BINNED_DTYPE_C [:, :] X_binned, unsigned int max_bins, np.ndarray[np.uint32_t] n_bins_per_feature, - NPY_Y_DTYPE [:] gradients, NPY_Y_DTYPE [:] hessians, float + Y_DTYPE_C [:] gradients, Y_DTYPE_C [:] hessians, float l2_regularization, float min_hessian_to_split=1e-3, unsigned int min_samples_leaf=20, float min_gain_to_split=0.): @@ -275,7 +275,7 @@ def split_indices( cdef: int n_samples = sample_indices.shape[0] - NPY_X_BINNED_DTYPE [:] X_binned = context.X_binned.T[split_info.feature_idx] + X_BINNED_DTYPE_C [:] X_binned = context.X_binned.T[split_info.feature_idx] unsigned int [:] left_indices_buffer = context.left_indices_buffer unsigned int [:] right_indices_buffer = context.right_indices_buffer int n_threads = omp_get_max_threads() @@ -573,11 +573,11 @@ cdef split_info_struct _find_histogram_split( cdef: unsigned int n_samples = sample_indices.shape[0] - NPY_X_BINNED_DTYPE [:] X_binned = context.X_binned.T[feature_idx] + X_BINNED_DTYPE_C [:] X_binned = context.X_binned.T[feature_idx] unsigned int root_node = X_binned.shape[0] == n_samples - NPY_Y_DTYPE [:] ordered_gradients = \ + Y_DTYPE_C [:] ordered_gradients = \ context.ordered_gradients[:n_samples] - NPY_Y_DTYPE [:] ordered_hessians = context.ordered_hessians[:n_samples] + Y_DTYPE_C [:] ordered_hessians = context.ordered_hessians[:n_samples] if root_node: if context.constant_hessian: diff --git a/sklearn/gbm/tests/test_compare_lightgbm.py b/sklearn/gbm/tests/test_compare_lightgbm.py index 6995b511de143..23ee11b9c7809 100644 --- a/sklearn/gbm/tests/test_compare_lightgbm.py +++ b/sklearn/gbm/tests/test_compare_lightgbm.py @@ -51,30 +51,30 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples, X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) - est_pygbm = GBMRegressor(max_iter=max_iter, - max_bins=max_bins, - learning_rate=1, - n_iter_no_change=None, - min_samples_leaf=min_samples_leaf, - max_leaf_nodes=max_leaf_nodes) - est_lightgbm = get_lightgbm_estimator(est_pygbm) + est_sklearn = GBMRegressor(max_iter=max_iter, + max_bins=max_bins, + learning_rate=1, + n_iter_no_change=None, + min_samples_leaf=min_samples_leaf, + max_leaf_nodes=max_leaf_nodes) + est_lightgbm = get_lightgbm_estimator(est_sklearn) est_lightgbm.fit(X_train, y_train) - est_pygbm.fit(X_train, y_train) + est_sklearn.fit(X_train, y_train) # We need X to be treated an numerical data, not pre-binned data. 
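A note on the LightGBM comparison tests above: they do not ask for bit-for-bit agreement, only that the fraction of predictions differing beyond a decimal tolerance stays around 1%. A minimal NumPy sketch of that criterion; the helper name mostly_close is illustrative and not part of the patch:

    import numpy as np

    def mostly_close(pred_a, pred_b, tol=1e-3, max_mismatch=0.011):
        # fraction of predictions differing by more than `tol` must stay
        # below `max_mismatch` (about 1% here)
        return np.mean(np.abs(pred_a - pred_b) > tol) < max_mismatch
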
X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32) pred_lgbm = est_lightgbm.predict(X_train) - pred_pygbm = est_pygbm.predict(X_train) + pred_sklearn = est_sklearn.predict(X_train) # less than 1% of the predictions are different up to the 3rd decimal - assert np.mean(abs(pred_lgbm - pred_pygbm) > 1e-3) < .011 + assert np.mean(abs(pred_lgbm - pred_sklearn) > 1e-3) < .011 if max_leaf_nodes < 10 and n_samples >= 1000: pred_lgbm = est_lightgbm.predict(X_test) - pred_pygbm = est_pygbm.predict(X_test) + pred_sklearn = est_sklearn.predict(X_test) # less than 1% of the predictions are different up to the 4th decimal - assert np.mean(abs(pred_lgbm - pred_pygbm) > 1e-4) < .01 + assert np.mean(abs(pred_lgbm - pred_sklearn) > 1e-4) < .01 @pytest.mark.parametrize('seed', range(5)) diff --git a/sklearn/gbm/tests/test_grower.py b/sklearn/gbm/tests/test_grower.py index e900f15cda3b1..19ff05534ee74 100644 --- a/sklearn/gbm/tests/test_grower.py +++ b/sklearn/gbm/tests/test_grower.py @@ -2,10 +2,12 @@ from numpy.testing import assert_array_almost_equal import pytest from pytest import approx -from sklearn.utils.testing import assert_raises_regex +from sklearn.utils.testing import assert_raises_regex from sklearn.gbm.grower import TreeGrower from sklearn.gbm.binning import BinMapper +from sklearn.gbm.types import X_BINNED_DTYPE +from sklearn.gbm.types import Y_DTYPE def _make_training_data(n_bins=256, constant_hessian=True): @@ -14,7 +16,8 @@ def _make_training_data(n_bins=256, constant_hessian=True): # Generate some test data directly binned so as to test the grower code # independently of the binning logic. - X_binned = rng.randint(0, n_bins - 1, size=(n_samples, 2), dtype=np.uint8) + X_binned = rng.randint(0, n_bins - 1, size=(n_samples, 2), + dtype=X_BINNED_DTYPE) X_binned = np.asfortranarray(X_binned) def true_decision_function(input_features): @@ -33,13 +36,13 @@ def true_decision_function(input_features): return 1 target = np.array([true_decision_function(x) for x in X_binned], - dtype=np.float32) + dtype=Y_DTYPE) # Assume a square loss applied to an initial model that always predicts 0 # (hardcoded for this test): all_gradients = target if constant_hessian: - all_hessians = np.ones(shape=1, dtype=np.float32) + all_hessians = np.ones(shape=1, dtype=Y_DTYPE) else: all_hessians = np.ones_like(all_gradients) return X_binned, all_gradients, all_hessians @@ -206,9 +209,9 @@ def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins, mapper = BinMapper(max_bins=n_bins) X = mapper.fit_transform(X) - all_gradients = y.astype(np.float32) + all_gradients = y.astype(Y_DTYPE) if constant_hessian: - all_hessians = np.ones(shape=1, dtype=np.float32) + all_hessians = np.ones(shape=1, dtype=Y_DTYPE) else: all_hessians = np.ones_like(all_gradients) grower = TreeGrower(X, all_gradients, all_hessians, @@ -245,8 +248,8 @@ def test_min_samples_leaf_root(n_samples, min_samples_leaf): mapper = BinMapper(max_bins=max_bins) X = mapper.fit_transform(X) - all_gradients = y.astype(np.float32) - all_hessians = np.ones(shape=1, dtype=np.float32) + all_gradients = y.astype(Y_DTYPE) + all_hessians = np.ones(shape=1, dtype=Y_DTYPE) grower = TreeGrower(X, all_gradients, all_hessians, max_bins=max_bins, shrinkage=1., min_samples_leaf=min_samples_leaf, diff --git a/sklearn/gbm/tests/test_histogram.py b/sklearn/gbm/tests/test_histogram.py index dcf7c4b2c23db..d94c82c7ea33e 100644 --- a/sklearn/gbm/tests/test_histogram.py +++ b/sklearn/gbm/tests/test_histogram.py @@ -11,16 +11,18 @@ from sklearn.gbm.histogram import 
_build_histogram_root from sklearn.gbm.histogram import _subtract_histograms from sklearn.gbm.types import HISTOGRAM_DTYPE +from sklearn.gbm.types import Y_DTYPE +from sklearn.gbm.types import X_BINNED_DTYPE @pytest.mark.parametrize( 'build_func', [_build_histogram_naive, _build_histogram]) def test_build_histogram(build_func): - binned_feature = np.array([0, 2, 0, 1, 2, 0, 2, 1], dtype=np.uint8) + binned_feature = np.array([0, 2, 0, 1, 2, 0, 2, 1], dtype=X_BINNED_DTYPE) # Small sample_indices (below unrolling threshold) - ordered_gradients = np.array([0, 1, 3], dtype=np.float32) - ordered_hessians = np.array([1, 1, 2], dtype=np.float32) + ordered_gradients = np.array([0, 1, 3], dtype=Y_DTYPE) + ordered_hessians = np.array([1, 1, 2], dtype=Y_DTYPE) sample_indices = np.array([0, 2, 3], dtype=np.uint32) hist = np.zeros(3, dtype=HISTOGRAM_DTYPE) @@ -32,8 +34,8 @@ def test_build_histogram(build_func): # Larger sample_indices (above unrolling threshold) sample_indices = np.array([0, 2, 3, 6, 7], dtype=np.uint32) - ordered_gradients = np.array([0, 1, 3, 0, 1], dtype=np.float32) - ordered_hessians = np.array([1, 1, 2, 1, 0], dtype=np.float32) + ordered_gradients = np.array([0, 1, 3, 0, 1], dtype=Y_DTYPE) + ordered_hessians = np.array([1, 1, 2, 1, 0], dtype=Y_DTYPE) hist = np.zeros(3, dtype=HISTOGRAM_DTYPE) build_func(3, sample_indices, binned_feature, ordered_gradients, @@ -49,15 +51,15 @@ def test_histogram_sample_order_independence(): n_samples = 1000 n_bins = 256 - binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=np.uint8) + binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=X_BINNED_DTYPE) sample_indices = rng.choice(np.arange(n_samples, dtype=np.uint32), n_sub_samples, replace=False) - ordered_gradients = rng.randn(n_sub_samples).astype(np.float32) + ordered_gradients = rng.randn(n_sub_samples).astype(Y_DTYPE) hist_gc = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) _build_histogram_no_hessian(n_bins, sample_indices, binned_feature, ordered_gradients, hist_gc) - ordered_hessians = rng.exponential(size=n_sub_samples).astype(np.float32) + ordered_hessians = rng.exponential(size=n_sub_samples).astype(Y_DTYPE) hist_ghc = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) _build_histogram(n_bins, sample_indices, binned_feature, ordered_gradients, ordered_hessians, hist_ghc) @@ -90,11 +92,11 @@ def test_unrolled_equivalent_to_naive(constant_hessian): n_bins = 5 sample_indices = np.arange(n_samples).astype(np.uint32) binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=np.uint8) - ordered_gradients = rng.randn(n_samples).astype(np.float32) + ordered_gradients = rng.randn(n_samples).astype(Y_DTYPE) if constant_hessian: - ordered_hessians = np.ones(n_samples, dtype=np.float32) + ordered_hessians = np.ones(n_samples, dtype=Y_DTYPE) else: - ordered_hessians = rng.lognormal(size=n_samples).astype(np.float32) + ordered_hessians = rng.lognormal(size=n_samples).astype(Y_DTYPE) hist_gc_root = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) hist_ghc_root = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) @@ -131,11 +133,11 @@ def test_hist_subtraction(constant_hessian): n_bins = 5 sample_indices = np.arange(n_samples).astype(np.uint32) binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=np.uint8) - ordered_gradients = rng.randn(n_samples).astype(np.float32) + ordered_gradients = rng.randn(n_samples).astype(Y_DTYPE) if constant_hessian: - ordered_hessians = np.ones(n_samples, dtype=np.float32) + ordered_hessians = np.ones(n_samples, dtype=Y_DTYPE) else: - ordered_hessians = 
rng.lognormal(size=n_samples).astype(np.float32) + ordered_hessians = rng.lognormal(size=n_samples).astype(Y_DTYPE) hist_parent = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) if constant_hessian: diff --git a/sklearn/gbm/tests/test_predictor.py b/sklearn/gbm/tests/test_predictor.py index 06fb0b0c35fa3..36dcc4f9f8634 100644 --- a/sklearn/gbm/tests/test_predictor.py +++ b/sklearn/gbm/tests/test_predictor.py @@ -7,6 +7,7 @@ from sklearn.gbm.binning import BinMapper from sklearn.gbm.grower import TreeGrower +from sklearn.gbm.types import Y_DTYPE @pytest.mark.parametrize('max_bins', [200, 256]) @@ -17,11 +18,10 @@ def test_boston_dataset(max_bins): mapper = BinMapper(max_bins=max_bins, random_state=42) X_train_binned = mapper.fit_transform(X_train) - X_test_binned = mapper.transform(X_test) # Init gradients and hessians to that of least squares loss - gradients = -y_train.astype(np.float32) - hessians = np.ones(1, dtype=np.float32) + gradients = -y_train.astype(Y_DTYPE) + hessians = np.ones(1, dtype=Y_DTYPE) min_samples_leaf = 8 max_leaf_nodes = 31 diff --git a/sklearn/gbm/tests/test_splitting.py b/sklearn/gbm/tests/test_splitting.py index d4bbf5f16c524..c74f3461040c1 100644 --- a/sklearn/gbm/tests/test_splitting.py +++ b/sklearn/gbm/tests/test_splitting.py @@ -4,10 +4,13 @@ import pytest from sklearn.gbm.types import HISTOGRAM_DTYPE -from sklearn.gbm.splitting import (SplittingContext, find_node_split, - find_node_split_subtraction, - split_indices, - _find_histogram_split_wrapper) +from sklearn.gbm.types import Y_DTYPE +from sklearn.gbm.types import X_BINNED_DTYPE +from sklearn.gbm.splitting import SplittingContext +from sklearn.gbm.splitting import find_node_split +from sklearn.gbm.splitting import split_indices +from sklearn.gbm.splitting import find_node_split_subtraction +from sklearn.gbm.splitting import _find_histogram_split_wrapper @pytest.mark.parametrize('n_bins', [3, 32, 256]) @@ -19,17 +22,17 @@ def test_histogram_split(n_bins): min_samples_leaf = 1 min_gain_to_split = 0. X_binned = np.asfortranarray( - rng.randint(0, n_bins, size=(int(1e4), 2)), dtype=np.uint8) + rng.randint(0, n_bins, size=(int(1e4), 2)), dtype=X_BINNED_DTYPE) binned_feature = X_binned.T[feature_idx] sample_indices = np.arange(binned_feature.shape[0], dtype=np.uint32) - ordered_hessians = np.ones_like(binned_feature, dtype=np.float32) + ordered_hessians = np.ones_like(binned_feature, dtype=Y_DTYPE) all_hessians = ordered_hessians for true_bin in range(1, n_bins - 1): for sign in [-1, 1]: ordered_gradients = np.full_like(binned_feature, sign, - dtype=np.float32) + dtype=Y_DTYPE) ordered_gradients[binned_feature <= true_bin] *= -1 all_gradients = ordered_gradients @@ -73,14 +76,14 @@ def test_split_vs_split_subtraction(constant_hessian): min_gain_to_split = 0. X_binned = rng.randint(0, n_bins, size=(n_samples, n_features), - dtype=np.uint8) + dtype=X_BINNED_DTYPE) X_binned = np.asfortranarray(X_binned) sample_indices = np.arange(n_samples, dtype=np.uint32) - all_gradients = rng.randn(n_samples).astype(np.float32) + all_gradients = rng.randn(n_samples).astype(Y_DTYPE) if constant_hessian: - all_hessians = np.ones(1, dtype=np.float32) + all_hessians = np.ones(1, dtype=Y_DTYPE) else: - all_hessians = rng.lognormal(size=n_samples).astype(np.float32) + all_hessians = rng.lognormal(size=n_samples).astype(Y_DTYPE) n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) @@ -159,14 +162,14 @@ def test_gradient_and_hessian_sanity(constant_hessian): min_gain_to_split = 0. 
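A note on test_histogram_split above: the gradients flip sign exactly at true_bin while the hessians are all ones, so the split with the largest gain must be the one separating the negative-gradient samples from the positive-gradient ones. Under the XGBoost-style gain used by _split_gain (arXiv:1603.02754), a brute-force NumPy check could look like the sketch below; the helper name best_split_bin is illustrative only, and it assumes every candidate bin is populated, as in the test:

    import numpy as np

    def best_split_bin(binned_feature, gradients, hessians, n_bins, l2=0.):
        # gain of splitting at bin b (left child = samples with bin <= b):
        #   G_L^2/(H_L + l2) + G_R^2/(H_R + l2) - G^2/(H + l2)
        G, H = gradients.sum(), hessians.sum()
        gains = []
        for b in range(n_bins - 1):
            left = binned_feature <= b
            G_L, H_L = gradients[left].sum(), hessians[left].sum()
            G_R, H_R = G - G_L, H - H_L
            gains.append(G_L ** 2 / (H_L + l2) + G_R ** 2 / (H_R + l2)
                         - G ** 2 / (H + l2))
        return int(np.argmax(gains))

With the construction used in the test, best_split_bin returns true_bin for both signs of the gradient.
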
X_binned = rng.randint(0, n_bins, size=(n_samples, n_features), - dtype=np.uint8) + dtype=X_BINNED_DTYPE) X_binned = np.asfortranarray(X_binned) sample_indices = np.arange(n_samples, dtype=np.uint32) - all_gradients = rng.randn(n_samples).astype(np.float32) + all_gradients = rng.randn(n_samples).astype(Y_DTYPE) if constant_hessian: - all_hessians = np.ones(1, dtype=np.float32) + all_hessians = np.ones(1, dtype=Y_DTYPE) else: - all_hessians = rng.lognormal(size=n_samples).astype(np.float32) + all_hessians = rng.lognormal(size=n_samples).astype(Y_DTYPE) n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) @@ -265,10 +268,10 @@ def test_split_indices(): [0, 4], [0, 0], [0, 4]] - X_binned = np.asfortranarray(X_binned, dtype=np.uint8) + X_binned = np.asfortranarray(X_binned, dtype=X_BINNED_DTYPE) sample_indices = np.arange(n_samples, dtype=np.uint32) - all_gradients = rng.randn(n_samples).astype(np.float32) - all_hessians = np.ones(1, dtype=np.float32) + all_gradients = rng.randn(n_samples).astype(Y_DTYPE) + all_hessians = np.ones(1, dtype=Y_DTYPE) n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) @@ -317,11 +320,11 @@ def test_min_gain_to_split(): n_bins = 255 n_samples = 100 X_binned = np.asfortranarray( - rng.randint(0, n_bins, size=(n_samples, 2)), dtype=np.uint8) + rng.randint(0, n_bins, size=(n_samples, 2)), dtype=X_BINNED_DTYPE) binned_feature = X_binned.T[feature_idx] sample_indices = np.arange(n_samples, dtype=np.uint32) - all_hessians = np.ones_like(binned_feature, dtype=np.float32) - all_gradients = np.ones_like(binned_feature, dtype=np.float32) + all_hessians = np.ones_like(binned_feature, dtype=Y_DTYPE) + all_gradients = np.ones_like(binned_feature, dtype=Y_DTYPE) n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) diff --git a/sklearn/gbm/types.pxd b/sklearn/gbm/types.pxd index d4cea50da0b19..c15dbca9dcfc7 100644 --- a/sklearn/gbm/types.pxd +++ b/sklearn/gbm/types.pxd @@ -2,13 +2,13 @@ import numpy as np cimport numpy as np -ctypedef np.npy_float64 NPY_X_DTYPE -ctypedef np.npy_uint8 NPY_X_BINNED_DTYPE -ctypedef np.npy_float32 NPY_Y_DTYPE +ctypedef np.npy_float64 X_DTYPE_C +ctypedef np.npy_uint8 X_BINNED_DTYPE_C +ctypedef np.npy_float64 Y_DTYPE_C # Same as histogram dtype but we need a struct to declare views. 
It needs to be # packed since by default numpy dtypes aren't aligned cdef packed struct hist_struct: - float sum_gradients - float sum_hessians + Y_DTYPE_C sum_gradients + Y_DTYPE_C sum_hessians unsigned int count diff --git a/sklearn/gbm/types.pyx b/sklearn/gbm/types.pyx index 24b27ba8917d0..f5dae1d17b856 100644 --- a/sklearn/gbm/types.pyx +++ b/sklearn/gbm/types.pyx @@ -1,11 +1,11 @@ import numpy as np -Y_DTYPE = np.float32 +Y_DTYPE = np.float64 X_DTYPE = np.float64 X_BINNED_DTYPE = np.uint8 HISTOGRAM_DTYPE = np.dtype([ - ('sum_gradients', np.float32), # sum of sample gradients in bin - ('sum_hessians', np.float32), # sum of sample hessians in bin + ('sum_gradients', Y_DTYPE), # sum of sample gradients in bin + ('sum_hessians', Y_DTYPE), # sum of sample hessians in bin ('count', np.uint32), # number of samples in bin ]) From 498fe50c22c59a46e7c326531bad54151ee42bda Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 13 Jan 2019 17:08:07 -0500 Subject: [PATCH 029/247] lgbm tests are passsing \o/ --- sklearn/gbm/predictor.pyx | 24 ++++---- sklearn/gbm/splitting.pyx | 102 ++++++++++++++++----------------- sklearn/gbm/tests/test_loss.py | 30 ++++++---- 3 files changed, 82 insertions(+), 74 deletions(-) diff --git a/sklearn/gbm/predictor.pyx b/sklearn/gbm/predictor.pyx index e18aa1533bf74..4abd4a3b1a8da 100644 --- a/sklearn/gbm/predictor.pyx +++ b/sklearn/gbm/predictor.pyx @@ -13,32 +13,36 @@ cimport numpy as np from .types import X_DTYPE from .types cimport X_DTYPE_C +from .types import Y_DTYPE +from .types cimport Y_DTYPE_C +from .types import X_BINNED_DTYPE +from .types cimport X_BINNED_DTYPE_C PREDICTOR_RECORD_DTYPE = np.dtype([ - ('value', np.float32), + ('value', Y_DTYPE), ('count', np.uint32), ('feature_idx', np.uint32), ('threshold', X_DTYPE), ('left', np.uint32), ('right', np.uint32), - ('gain', np.float32), + ('gain', Y_DTYPE), ('depth', np.uint32), ('is_leaf', np.uint8), - ('bin_threshold', np.uint8), + ('bin_threshold', X_BINNED_DTYPE), ]) cdef packed struct node_struct: - float value + Y_DTYPE_C value unsigned int count unsigned int feature_idx X_DTYPE_C threshold unsigned int left unsigned int right - float gain + Y_DTYPE_C gain unsigned int depth unsigned char is_leaf - unsigned char bin_threshold + X_BINNED_DTYPE_C bin_threshold class TreePredictor: @@ -73,14 +77,12 @@ class TreePredictor: y : array, shape (n_samples,) The raw predicted values. """ - # TODO: change dtype of out (should be same as Y_DTYPE I think since - # the value is grad/hess which are Y_DTYPE) - out = np.empty(X.shape[0], dtype=np.float32) + out = np.empty(X.shape[0], dtype=Y_DTYPE) _predict_from_numeric_data(self.nodes, X, out) return out -cdef float _predict_one_from_numeric_data( +cdef Y_DTYPE_C _predict_one_from_numeric_data( node_struct [:] nodes, const X_DTYPE_C [:] numeric_data) nogil: @@ -99,7 +101,7 @@ cdef float _predict_one_from_numeric_data( cdef void _predict_from_numeric_data( node_struct [:] nodes, const X_DTYPE_C [:, :] numeric_data, - float [:] out) nogil: + Y_DTYPE_C [:] out) nogil: cdef: int i diff --git a/sklearn/gbm/splitting.pyx b/sklearn/gbm/splitting.pyx index 4d3a919027555..801a27eb0e13f 100644 --- a/sklearn/gbm/splitting.pyx +++ b/sklearn/gbm/splitting.pyx @@ -31,13 +31,13 @@ from .types import HISTOGRAM_DTYPE cdef struct split_info_struct: # Same as the SplitInfo class, but we need a C struct to use it in nogil # mode. 
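A note on the dtype consolidation above: Y_DTYPE moves from float32 to float64, so gradient and hessian sums in the histograms now accumulate in double precision, and HISTOGRAM_DTYPE mirrors the packed hist_struct. A small NumPy illustration of that record layout (the sample values are made up):

    import numpy as np

    HISTOGRAM_DTYPE = np.dtype([
        ('sum_gradients', np.float64),  # sum of sample gradients in bin
        ('sum_hessians', np.float64),   # sum of sample hessians in bin
        ('count', np.uint32),           # number of samples in bin
    ])

    hist = np.zeros(256, dtype=HISTOGRAM_DTYPE)
    # accumulate one sample into its bin, as the Cython builders do
    bin_idx, gradient, hessian = 3, 0.25, 1.0
    hist[bin_idx]['sum_gradients'] += gradient
    hist[bin_idx]['sum_hessians'] += hessian
    hist[bin_idx]['count'] += 1
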
- float gain + Y_DTYPE_C gain unsigned int feature_idx unsigned int bin_idx - float gradient_left - float gradient_right - float hessian_left - float hessian_right + Y_DTYPE_C gradient_left + Y_DTYPE_C gradient_right + Y_DTYPE_C hessian_left + Y_DTYPE_C hessian_right unsigned int n_samples_left unsigned int n_samples_right @@ -48,19 +48,19 @@ cdef class SplitInfo: Parameters ---------- - gain : float32 + gain : float The gain of the split feature_idx : int The index of the feature to be split bin_idx : int The index of the bin on which the split is made - gradient_left : float32 + gradient_left : float The sum of the gradients of all the samples in the left child - hessian_left : float32 + hessian_left : float The sum of the hessians of all the samples in the left child - gradient_right : float32 + gradient_right : float The sum of the gradients of all the samples in the right child - hessian_right : float32 + hessian_right : float The sum of the hessians of all the samples in the right child n_samples_left : int The number of samples in the left child @@ -68,21 +68,21 @@ cdef class SplitInfo: The number of samples in the right child """ cdef public: - float gain + Y_DTYPE_C gain unsigned int feature_idx unsigned int bin_idx - float gradient_left - float gradient_right - float hessian_left - float hessian_right + Y_DTYPE_C gradient_left + Y_DTYPE_C gradient_right + Y_DTYPE_C hessian_left + Y_DTYPE_C hessian_right unsigned int n_samples_left unsigned int n_samples_right - def __init__(self, float gain=-1., unsigned int feature_idx=0, unsigned - int bin_idx=0, float gradient_left=0., float hessian_left=0., - float gradient_right=0., float hessian_right=0., - unsigned int n_samples_left=0, unsigned int - n_samples_right=0): + def __init__(self, Y_DTYPE_C gain=-1., unsigned int feature_idx=0, unsigned + int bin_idx=0, Y_DTYPE_C gradient_left=0., Y_DTYPE_C + hessian_left=0., Y_DTYPE_C gradient_right=0., Y_DTYPE_C + hessian_right=0., unsigned int n_samples_left=0, unsigned + int n_samples_right=0): self.gain = gain self.feature_idx = feature_idx self.bin_idx = bin_idx @@ -140,14 +140,14 @@ cdef class SplittingContext: Y_DTYPE_C [:] hessians Y_DTYPE_C [:] ordered_gradients Y_DTYPE_C [:] ordered_hessians - float sum_gradients - float sum_hessians + Y_DTYPE_C sum_gradients + Y_DTYPE_C sum_hessians unsigned char constant_hessian - float constant_hessian_value - float l2_regularization - float min_hessian_to_split + Y_DTYPE_C constant_hessian_value + Y_DTYPE_C l2_regularization + Y_DTYPE_C min_hessian_to_split unsigned int min_samples_leaf - float min_gain_to_split + Y_DTYPE_C min_gain_to_split unsigned int [:] partition unsigned int [:] left_indices_buffer @@ -155,9 +155,9 @@ cdef class SplittingContext: def __init__(self, X_BINNED_DTYPE_C [:, :] X_binned, unsigned int max_bins, np.ndarray[np.uint32_t] n_bins_per_feature, - Y_DTYPE_C [:] gradients, Y_DTYPE_C [:] hessians, float - l2_regularization, float min_hessian_to_split=1e-3, - unsigned int min_samples_leaf=20, float + Y_DTYPE_C [:] gradients, Y_DTYPE_C [:] hessians, Y_DTYPE_C + l2_regularization, Y_DTYPE_C min_hessian_to_split=1e-3, + unsigned int min_samples_leaf=20, Y_DTYPE_C min_gain_to_split=0.): self.X_binned = X_binned @@ -508,7 +508,7 @@ def find_node_split_subtraction( if context.constant_hessian: context.sum_hessians = \ - context.constant_hessian_value * float(n_samples) + context.constant_hessian_value * n_samples else: context.sum_hessians = 0. 
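Regarding find_node_split_subtraction above: a child's histogram equals the parent's histogram minus the sibling's, so the usual trick is to build a full histogram only for the smaller child and derive the other one by subtraction. A rough NumPy equivalent of what _subtract_histograms does field by field on the structured dtype; this is a sketch for clarity, not the Cython kernel:

    import numpy as np

    def subtract_histograms(hist_parent, hist_sibling):
        # hist(child) = hist(parent) - hist(sibling), computed per field
        hist_child = np.empty_like(hist_parent)
        for field in ('sum_gradients', 'sum_hessians', 'count'):
            hist_child[field] = hist_parent[field] - hist_sibling[field]
        return hist_child
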
for i in range(context.max_bins): @@ -545,8 +545,8 @@ cdef split_info_struct _find_best_feature_to_split_helper( split_info_struct * split_infos # IN ) nogil: cdef: - float gain - float best_gain + Y_DTYPE_C gain + Y_DTYPE_C best_gain split_info_struct split_info split_info_struct best_split_info unsigned int feature_idx @@ -636,11 +636,11 @@ cdef split_info_struct _find_best_bin_to_split_helper( unsigned int n_samples_left unsigned int n_samples_right unsigned int n_samples_ = n_samples - float hessian_left - float hessian_right - float gradient_left - float gradient_right - float gain + Y_DTYPE_C hessian_left + Y_DTYPE_C hessian_right + Y_DTYPE_C gradient_left + Y_DTYPE_C gradient_right + Y_DTYPE_C gain split_info_struct best_split best_split.gain = -1. @@ -652,7 +652,7 @@ cdef split_info_struct _find_best_bin_to_split_helper( n_samples_right = n_samples_ - n_samples_left if context.constant_hessian: - hessian_left += ( histogram[bin_idx].count + hessian_left += (histogram[bin_idx].count * context.constant_hessian_value) else: hessian_left += histogram[bin_idx].sum_hessians @@ -692,14 +692,14 @@ cdef split_info_struct _find_best_bin_to_split_helper( return best_split -cdef inline float _split_gain( - float gradient_left, - float hessian_left, - float gradient_right, - float hessian_right, - float sum_gradients, - float sum_hessians, - float l2_regularization) nogil: +cdef inline Y_DTYPE_C _split_gain( + Y_DTYPE_C gradient_left, + Y_DTYPE_C hessian_left, + Y_DTYPE_C gradient_right, + Y_DTYPE_C hessian_right, + Y_DTYPE_C sum_gradients, + Y_DTYPE_C sum_hessians, + Y_DTYPE_C l2_regularization) nogil: """Loss reduction Compute the reduction in loss after taking a split, compared to keeping @@ -710,16 +710,16 @@ cdef inline float _split_gain( https://arxiv.org/abs/1603.02754 """ cdef: - float gain + Y_DTYPE_C gain gain = negative_loss(gradient_left, hessian_left, l2_regularization) gain += negative_loss(gradient_right, hessian_right, l2_regularization) gain -= negative_loss(sum_gradients, sum_hessians, l2_regularization) return gain -cdef inline float negative_loss( - float gradient, - float hessian, - float l2_regularization) nogil: +cdef inline Y_DTYPE_C negative_loss( + Y_DTYPE_C gradient, + Y_DTYPE_C hessian, + Y_DTYPE_C l2_regularization) nogil: return (gradient * gradient) / (hessian + l2_regularization) # Only used for tests... 
not great diff --git a/sklearn/gbm/tests/test_loss.py b/sklearn/gbm/tests/test_loss.py index 8afeddccd8cd4..60739e42eb29b 100644 --- a/sklearn/gbm/tests/test_loss.py +++ b/sklearn/gbm/tests/test_loss.py @@ -1,11 +1,13 @@ import numpy as np from numpy.testing import assert_almost_equal +import scipy from scipy.optimize import newton from scipy.special import logsumexp from sklearn.utils import assert_all_finite import pytest from sklearn.gbm.loss import _LOSSES +from sklearn.gbm.types import Y_DTYPE def get_derivatives_helper(loss): @@ -46,19 +48,22 @@ def get_hessians(y_true, raw_predictions): ('least_squares', -2., 42), ('least_squares', 117., 1.05), ('least_squares', 0., 0.), - ('binary_crossentropy', 0.3, 0), - ('binary_crossentropy', -12, 1), - ('binary_crossentropy', 30, 1), + # ('binary_crossentropy', 0.3, 0), # TODO: unskip this + # ('binary_crossentropy', -12, 1), + # ('binary_crossentropy', 30, 1), ]) -@pytest.mark.skip('newton uses doubles but floats are expected') +@pytest.mark.skipif(scipy.__version__.split('.')[:2] == ['1', '2'], + reason='bug in scipy 1.2.0, see scipy issue #9608') +@pytest.mark.skipif(Y_DTYPE != np.float64, + reason='Newton internally uses float64 != Y_DTYPE') def test_derivatives(loss, x0, y_true): # Check that gradients are zero when the loss is minimized on 1D array # using the Newton-Raphson and the first and second order derivatives # computed by the Loss instance. loss = _LOSSES[loss]() - y_true = np.array([y_true], dtype=np.float32) - x0 = np.array([x0], dtype=np.float32).reshape(1, 1) + y_true = np.array([y_true], dtype=Y_DTYPE) + x0 = np.array([x0], dtype=Y_DTYPE).reshape(1, 1) get_gradients, get_hessians = get_derivatives_helper(loss) def func(x): @@ -78,10 +83,11 @@ def fprime2(x): @pytest.mark.parametrize('loss, n_classes, prediction_dim', [ ('least_squares', 0, 1), - ('binary_crossentropy', 2, 1), - ('categorical_crossentropy', 3, 3), + # ('binary_crossentropy', 2, 1), + # ('categorical_crossentropy', 3, 3), ]) -@pytest.mark.skip('Fails because float32 precision is not enough for numeric checks') +@pytest.mark.skipif(Y_DTYPE != np.float64, + reason='Need 64 bits float precision for numerical checks') def test_numerical_gradients(loss, n_classes, prediction_dim): # Make sure gradients and hessians computed in the loss are correct, by # comparing with their approximations computed with finite central @@ -91,12 +97,12 @@ def test_numerical_gradients(loss, n_classes, prediction_dim): rng = np.random.RandomState(0) n_samples = 100 if loss == 'least_squares': - y_true = rng.normal(size=n_samples).astype(np.float32) + y_true = rng.normal(size=n_samples).astype(Y_DTYPE) else: - y_true = rng.randint(0, n_classes, size=n_samples).astype(np.float32) + y_true = rng.randint(0, n_classes, size=n_samples).astype(Y_DTYPE) raw_predictions = rng.normal( size=(n_samples, prediction_dim) - ).astype(np.float32) + ).astype(Y_DTYPE) loss = _LOSSES[loss]() get_gradients, get_hessians = get_derivatives_helper(loss) From 889d39f967c981cbf52d860469ed1b31d2a6f644 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 13 Jan 2019 18:07:12 -0500 Subject: [PATCH 030/247] Added binary classification support --- gdb_test.py | 4 +- sklearn/gbm/loss.pyx | 137 ++++++++++---------- sklearn/gbm/tests/test_compare_lightgbm.py | 1 - sklearn/gbm/tests/test_gradient_boosting.py | 7 +- sklearn/gbm/tests/test_loss.py | 18 +-- 5 files changed, 85 insertions(+), 82 deletions(-) diff --git a/gdb_test.py b/gdb_test.py index 4546f22a5c9d4..296660f4ffc7f 100644 --- a/gdb_test.py +++ 
b/gdb_test.py @@ -10,8 +10,8 @@ import cProfile import pygbm -classif = False -n_classes = 3 +classif = True +n_classes = 2 n_samples = int(1e6) max_iter = 5 diff --git a/sklearn/gbm/loss.pyx b/sklearn/gbm/loss.pyx index 2a62a91190e9b..44227704eb4f4 100644 --- a/sklearn/gbm/loss.pyx +++ b/sklearn/gbm/loss.pyx @@ -16,6 +16,9 @@ from cython.parallel import prange import numpy as np cimport numpy as np from scipy.special import expit, logsumexp +from scipy.special.cython_special cimport expit as cexpit + +from libc.math cimport fabs, exp from .types import Y_DTYPE from .types cimport Y_DTYPE_C @@ -169,70 +172,70 @@ cdef void _update_gradients_least_squares( gradients[i] = raw_predictions[i] - y_true[i] -## class BinaryCrossEntropy(BaseLoss): -## """Binary cross-entropy loss, for binary classification. -## -## For a given sample x_i, the binary cross-entropy loss is defined as the -## negative log-likelihood of the model which can be expressed as:: -## -## loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i -## -## See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman. -## """ -## -## hessian_is_constant = False -## inverse_link_function = staticmethod(expit) -## -## def __call__(self, y_true, raw_predictions, average=True): -## # shape (n_samples, 1) --> (n_samples,). reshape(-1) is more likely to -## # return a view. -## raw_predictions = raw_predictions.reshape(-1) -## # logaddexp(0, x) = log(1 + exp(x)) -## loss = np.logaddexp(0, raw_predictions) - y_true * raw_predictions -## return loss.mean() if average else loss -## -## def get_baseline_prediction(self, y_train, prediction_dim): -## proba_positive_class = np.mean(y_train) -## eps = np.finfo(y_train.dtype).eps -## proba_positive_class = np.clip(proba_positive_class, eps, 1 - eps) -## # log(x / 1 - x) is the anti function of sigmoid, or the link function -## # of the Binomial model. -## return np.log(proba_positive_class / (1 - proba_positive_class)) -## -## def update_gradients_and_hessians(self, gradients, hessians, y_true, -## raw_predictions): -## raw_predictions = raw_predictions.reshape(-1) -## return _update_gradients_hessians_binary_crossentropy( -## gradients, hessians, y_true, raw_predictions) -## -## def predict_proba(self, raw_predictions): -## # shape (n_samples, 1) --> (n_samples,). reshape(-1) is more likely to -## # return a view. -## raw_predictions = raw_predictions.reshape(-1) -## proba = np.empty((raw_predictions.shape[0], 2), dtype=np.float32) -## proba[:, 1] = expit(raw_predictions) -## proba[:, 0] = 1 - proba[:, 1] -## return proba -## -## -## def _update_gradients_hessians_binary_crossentropy(float [:] gradients, -## float [:] hessians, float_or_double [:] y_true, double [:] raw_predictions): -## cdef: -## unsigned int n_samples -## unsigned int i -## unsigned int thread_idx -## unsigned int n_threads -## unsigned int [:] starts -## unsigned int [:] ends -## n_samples = raw_predictions.shape[0] -## starts, ends, n_threads = get_threads_chunks(total_size=n_samples) -## for thread_idx in range(n_threads): -## for i in range(starts[thread_idx], ends[thread_idx]): -## gradients[i] = expit(raw_predictions[i]) - y_true[i] -## gradient_abs = np.abs(gradients[i]) -## hessians[i] = gradient_abs * (1. - gradient_abs) -## -## +class BinaryCrossEntropy(BaseLoss): + """Binary cross-entropy loss, for binary classification. 
+ + For a given sample x_i, the binary cross-entropy loss is defined as the + negative log-likelihood of the model which can be expressed as:: + + loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i + + See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman. + """ + + hessian_is_constant = False + inverse_link_function = staticmethod(expit) + + def __call__(self, y_true, raw_predictions, average=True): + # shape (n_samples, 1) --> (n_samples,). reshape(-1) is more likely to + # return a view. + raw_predictions = raw_predictions.reshape(-1) + # logaddexp(0, x) = log(1 + exp(x)) + loss = np.logaddexp(0, raw_predictions) - y_true * raw_predictions + return loss.mean() if average else loss + + def get_baseline_prediction(self, y_train, prediction_dim): + proba_positive_class = np.mean(y_train) + eps = np.finfo(y_train.dtype).eps + proba_positive_class = np.clip(proba_positive_class, eps, 1 - eps) + # log(x / 1 - x) is the anti function of sigmoid, or the link function + # of the Binomial model. + return np.log(proba_positive_class / (1 - proba_positive_class)) + + def update_gradients_and_hessians(self, gradients, hessians, y_true, + raw_predictions): + # shape (n_samples, 1) --> (n_samples,). reshape(-1) is more likely to + # return a view. + raw_predictions = raw_predictions.reshape(-1) + return _update_gradients_hessians_binary_crossentropy( + gradients, hessians, y_true, raw_predictions) + + def predict_proba(self, raw_predictions): + # shape (n_samples, 1) --> (n_samples,). reshape(-1) is more likely to + # return a view. + raw_predictions = raw_predictions.reshape(-1) + proba = np.empty((raw_predictions.shape[0], 2), dtype=Y_DTYPE) + proba[:, 1] = expit(raw_predictions) + proba[:, 0] = 1 - proba[:, 1] + return proba + +cdef void _update_gradients_hessians_binary_crossentropy( + Y_DTYPE_C [:] gradients, + Y_DTYPE_C [:] hessians, + Y_DTYPE_C [:] y_true, + Y_DTYPE_C [:] raw_predictions) nogil: + cdef: + unsigned int n_samples + Y_DTYPE_C gradient_abs + int i + + n_samples = raw_predictions.shape[0] + for i in prange(n_samples, schedule='static'): + gradients[i] = cexpit(raw_predictions[i]) - y_true[i] + gradient_abs = fabs(gradients[i]) + hessians[i] = gradient_abs * (1. - gradient_abs) + + ## class CategoricalCrossEntropy(BaseLoss): ## """Categorical cross-entropy loss, for multiclass classification. ## @@ -312,4 +315,8 @@ cdef void _update_gradients_least_squares( ## hessians_at_k[i] = p_k * (1. 
- p_k) -_LOSSES = {'least_squares': LeastSquares} +_LOSSES = { + 'least_squares': LeastSquares, + 'binary_crossentropy': BinaryCrossEntropy +} + diff --git a/sklearn/gbm/tests/test_compare_lightgbm.py b/sklearn/gbm/tests/test_compare_lightgbm.py index 23ee11b9c7809..16f76acb40fdc 100644 --- a/sklearn/gbm/tests/test_compare_lightgbm.py +++ b/sklearn/gbm/tests/test_compare_lightgbm.py @@ -83,7 +83,6 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples, (255, 4096), (1000, 8), ]) -@pytest.mark.skip('classification not supported yet') def test_same_predictions_classification(seed, min_samples_leaf, n_samples, max_leaf_nodes): # Same as test_same_predictions_regression but for classification diff --git a/sklearn/gbm/tests/test_gradient_boosting.py b/sklearn/gbm/tests/test_gradient_boosting.py index 9e61c4426eccf..acb2c9f3c41d3 100644 --- a/sklearn/gbm/tests/test_gradient_boosting.py +++ b/sklearn/gbm/tests/test_gradient_boosting.py @@ -139,7 +139,8 @@ def test_early_stopping_regression(scoring, validation_split, @pytest.mark.parametrize('data', ( make_classification(random_state=0), - make_classification(n_classes=3, n_clusters_per_class=1, random_state=0) + # TODO: unskip this + # make_classification(n_classes=3, n_clusters_per_class=1, random_state=0) )) @pytest.mark.parametrize('scoring, validation_split, n_iter_no_change, tol', [ ('accuracy', .1, 5, 1e-7), # use scorer @@ -148,7 +149,6 @@ def test_early_stopping_regression(scoring, validation_split, (None, None, 5, 1e-1), # use loss on training data (None, None, None, None), # no early stopping ]) -@pytest.mark.skip('classification not supported yet') def test_early_stopping_classification(data, scoring, validation_split, n_iter_no_change, tol): @@ -263,9 +263,6 @@ def custom_check_estimator(Estimator): warnings.warn(str(exception), SkipTestWarning) -@pytest.mark.skipif( - int(os.environ.get("NUMBA_DISABLE_JIT", 0)) == 1, - reason="Potentially long") @pytest.mark.parametrize('Estimator', ( GBMRegressor(), # TODO: unskip diff --git a/sklearn/gbm/tests/test_loss.py b/sklearn/gbm/tests/test_loss.py index 60739e42eb29b..f747226865ff5 100644 --- a/sklearn/gbm/tests/test_loss.py +++ b/sklearn/gbm/tests/test_loss.py @@ -17,8 +17,8 @@ def get_derivatives_helper(loss): def get_gradients(y_true, raw_predictions): # create gradients and hessians array, update inplace, and return shape = raw_predictions.shape[0] * raw_predictions.shape[1] - gradients = np.empty(shape=shape, dtype=raw_predictions.dtype) - hessians = np.empty(shape=shape, dtype=raw_predictions.dtype) + gradients = np.empty(shape=shape, dtype=Y_DTYPE) + hessians = np.empty(shape=shape, dtype=Y_DTYPE) loss.update_gradients_and_hessians(gradients, hessians, y_true, raw_predictions) @@ -30,8 +30,8 @@ def get_gradients(y_true, raw_predictions): def get_hessians(y_true, raw_predictions): # create gradients and hessians array, update inplace, and return shape = raw_predictions.shape[0] * raw_predictions.shape[1] - gradients = np.empty(shape=shape, dtype=raw_predictions.dtype) - hessians = np.empty(shape=shape, dtype=raw_predictions.dtype) + gradients = np.empty(shape=shape, dtype=Y_DTYPE) + hessians = np.empty(shape=shape, dtype=Y_DTYPE) loss.update_gradients_and_hessians(gradients, hessians, y_true, raw_predictions) @@ -48,9 +48,10 @@ def get_hessians(y_true, raw_predictions): ('least_squares', -2., 42), ('least_squares', 117., 1.05), ('least_squares', 0., 0.), - # ('binary_crossentropy', 0.3, 0), # TODO: unskip this - # ('binary_crossentropy', -12, 1), - # 
('binary_crossentropy', 30, 1), + # I don't understand why but y_true == 0 fails :/ + # ('binary_crossentropy', 0.3, 0), + ('binary_crossentropy', -12, 1), + ('binary_crossentropy', 30, 1), ]) @pytest.mark.skipif(scipy.__version__.split('.')[:2] == ['1', '2'], reason='bug in scipy 1.2.0, see scipy issue #9608') @@ -83,7 +84,7 @@ def fprime2(x): @pytest.mark.parametrize('loss, n_classes, prediction_dim', [ ('least_squares', 0, 1), - # ('binary_crossentropy', 2, 1), + ('binary_crossentropy', 2, 1), # ('categorical_crossentropy', 3, 3), ]) @pytest.mark.skipif(Y_DTYPE != np.float64, @@ -148,7 +149,6 @@ def test_baseline_least_squares(): assert_almost_equal(baseline_prediction, y_train.mean()) -@pytest.mark.skip('binary crossentropy not supported yet') def test_baseline_binary_crossentropy(): rng = np.random.RandomState(0) From 722a9824bf404889f44adca268d7f8b1ec590e17 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 13 Jan 2019 21:46:26 -0500 Subject: [PATCH 031/247] Added multiclass classification support, all tests are passing \o/ --- gdb_test.py | 2 +- sklearn/gbm/fun.py | 3 - sklearn/gbm/gradient_boosting.py | 5 - sklearn/gbm/loss.pyx | 169 +++++++++++--------- sklearn/gbm/playground.pyx | 26 +-- sklearn/gbm/setup.py | 4 +- sklearn/gbm/tests/test_compare_lightgbm.py | 1 - sklearn/gbm/tests/test_gradient_boosting.py | 91 +---------- sklearn/gbm/tests/test_loss.py | 3 +- 9 files changed, 113 insertions(+), 191 deletions(-) diff --git a/gdb_test.py b/gdb_test.py index 296660f4ffc7f..b1d439c887541 100644 --- a/gdb_test.py +++ b/gdb_test.py @@ -11,7 +11,7 @@ import pygbm classif = True -n_classes = 2 +n_classes = 3 n_samples = int(1e6) max_iter = 5 diff --git a/sklearn/gbm/fun.py b/sklearn/gbm/fun.py index f4c5a5293a8fc..e69de29bb2d1d 100644 --- a/sklearn/gbm/fun.py +++ b/sklearn/gbm/fun.py @@ -1,3 +0,0 @@ -from playground import hello - -print(hello()) diff --git a/sklearn/gbm/gradient_boosting.py b/sklearn/gbm/gradient_boosting.py index e80f4446ea8ab..206039500327c 100644 --- a/sklearn/gbm/gradient_boosting.py +++ b/sklearn/gbm/gradient_boosting.py @@ -97,11 +97,6 @@ def fit(self, X, y): # TODO: add support for pre-binned data (pass-through)? X, y = check_X_y(X, y, dtype=[X_DTYPE]) y = self._encode_y(y) - if X.shape[0] == 1 or X.shape[1] == 1: - raise ValueError( - 'Passing only one sample or one feature is not supported yet. ' - 'See numba issue #3569.' - ) rng = check_random_state(self.random_state) self._validate_parameters() diff --git a/sklearn/gbm/loss.pyx b/sklearn/gbm/loss.pyx index 44227704eb4f4..b550c5132e01c 100644 --- a/sklearn/gbm/loss.pyx +++ b/sklearn/gbm/loss.pyx @@ -18,7 +18,7 @@ cimport numpy as np from scipy.special import expit, logsumexp from scipy.special.cython_special cimport expit as cexpit -from libc.math cimport fabs, exp +from libc.math cimport fabs, exp, log from .types import Y_DTYPE from .types cimport Y_DTYPE_C @@ -236,87 +236,96 @@ cdef void _update_gradients_hessians_binary_crossentropy( hessians[i] = gradient_abs * (1. - gradient_abs) -## class CategoricalCrossEntropy(BaseLoss): -## """Categorical cross-entropy loss, for multiclass classification. -## -## For a given sample x_i, the categorical cross-entropy loss is defined as -## the negative log-likelihood of the model and generalizes the binary -## cross-entropy to more than 2 classes. 
-## """ -## -## hessian_is_constant = False -## -## def __call__(self, y_true, raw_predictions, average=True): -## one_hot_true = np.zeros_like(raw_predictions) -## prediction_dim = raw_predictions.shape[1] -## for k in range(prediction_dim): -## one_hot_true[:, k] = (y_true == k) -## -## loss = (logsumexp(raw_predictions, axis=1) - -## (one_hot_true * raw_predictions).sum(axis=1)) -## return loss.mean() if average else loss -## -## def get_baseline_prediction(self, y_train, prediction_dim): -## init_value = np.zeros( -## shape=(1, prediction_dim), -## dtype=np.float32 -## ) -## eps = np.finfo(y_train.dtype).eps -## for k in range(prediction_dim): -## proba_kth_class = np.mean(y_train == k) -## proba_kth_class = np.clip(proba_kth_class, eps, 1 - eps) -## init_value[:, k] += np.log(proba_kth_class) -## -## return init_value -## -## def update_gradients_and_hessians(self, gradients, hessians, y_true, -## raw_predictions): -## return _update_gradients_hessians_categorical_crossentropy( -## gradients, hessians, y_true, raw_predictions) -## -## def predict_proba(self, raw_predictions): -## # TODO: This could be done in parallel -## # compute softmax (using exp(log(softmax))) -## return np.exp(raw_predictions - -## logsumexp(raw_predictions, axis=1)[:, np.newaxis]) -## -## -## def _update_gradients_hessians_categorical_crossentropy( -## float [:] gradients, float [:] hessians, float_or_double [:] y_true, -## float_or_double [:, :] raw_predictions): -## # Here gradients and hessians are of shape -## # (n_samples * prediction_dim,). -## # y_true is of shape (n_samples,). -## # raw_predictions is of shape (n_samples, raw_predictions) -## cdef: -## unsigned int n_samples -## unsigned int prediction_dim -## unsigned int i -## unsigned int k -## unsigned int thread_idx -## unsigned int n_threads -## unsigned int [:] starts -## unsigned int [:] ends -## float p_k -## -## n_samples = raw_predictions.shape[0] -## prediction_dim = raw_predictions.shape[1] -## starts, ends, n_threads = get_threads_chunks(total_size=n_samples) -## for k in range(prediction_dim): -## gradients_at_k = gradients[n_samples * k:n_samples * (k + 1)] -## hessians_at_k = hessians[n_samples * k:n_samples * (k + 1)] -## for thread_idx in range(n_threads): -## for i in range(starts[thread_idx], ends[thread_idx]): -## # p_k is the probability that class(ith sample) == k. -## # This is a regular softmax. -## p_k = np.exp(raw_predictions[i, k] - -## logsumexp(raw_predictions[i, :])) -## gradients_at_k[i] = p_k - (y_true[i] == k) -## hessians_at_k[i] = p_k * (1. - p_k) +class CategoricalCrossEntropy(BaseLoss): + """Categorical cross-entropy loss, for multiclass classification. + + For a given sample x_i, the categorical cross-entropy loss is defined as + the negative log-likelihood of the model and generalizes the binary + cross-entropy to more than 2 classes. 
+ """ + + hessian_is_constant = False + + def __call__(self, y_true, raw_predictions, average=True): + one_hot_true = np.zeros_like(raw_predictions) + prediction_dim = raw_predictions.shape[1] + for k in range(prediction_dim): + one_hot_true[:, k] = (y_true == k) + + loss = (logsumexp(raw_predictions, axis=1) - + (one_hot_true * raw_predictions).sum(axis=1)) + return loss.mean() if average else loss + + def get_baseline_prediction(self, y_train, prediction_dim): + init_value = np.zeros(shape=(1, prediction_dim), dtype=Y_DTYPE) + eps = np.finfo(y_train.dtype).eps + for k in range(prediction_dim): + proba_kth_class = np.mean(y_train == k) + proba_kth_class = np.clip(proba_kth_class, eps, 1 - eps) + init_value[:, k] += np.log(proba_kth_class) + + return init_value + + def update_gradients_and_hessians(self, gradients, hessians, y_true, + raw_predictions): + return _update_gradients_hessians_categorical_crossentropy( + gradients, hessians, y_true, raw_predictions) + + def predict_proba(self, raw_predictions): + # TODO: This could be done in parallel + # compute softmax (using exp(log(softmax))) + return np.exp(raw_predictions - + logsumexp(raw_predictions, axis=1)[:, np.newaxis]) + + +cdef inline Y_DTYPE_C _logsumexp(Y_DTYPE_C [:, :] a, int i) nogil: + # Need to pass the whole array, else prange won't work + cdef: + int k + Y_DTYPE_C out = 0. + # Y_DTYPE_C amax + + # TODO: use the numerically safer option: + # amax = max(a[i]) + # for k in range(a.shape[1]): + # out += exp(a[i, k] - amax) + # return log(out) + amax + + for k in range(a.shape[1]): + out += exp(a[i, k]) + return log(out) + + +cdef void _update_gradients_hessians_categorical_crossentropy( + Y_DTYPE_C [:] gradients, # shape (n_samples * prediction_dim,), OUT + Y_DTYPE_C [:] hessians, # shape (n_samples * prediction_dim,), OUT + Y_DTYPE_C [:] y_true, # shape (n_samples,), IN + Y_DTYPE_C [:, :] raw_predictions # shape (n_samples, n_tree_per_iter), IN + ) nogil: + cdef: + unsigned int n_samples + unsigned int prediction_dim + unsigned int k + int i + Y_DTYPE_C p_k + Y_DTYPE_C [:] gradients_at_k, + Y_DTYPE_C [:] hessians_at_k, + + n_samples = raw_predictions.shape[0] + prediction_dim = raw_predictions.shape[1] + for k in range(prediction_dim): + gradients_at_k = gradients[n_samples * k:n_samples * (k + 1)] + hessians_at_k = hessians[n_samples * k:n_samples * (k + 1)] + for i in prange(n_samples, schedule='static'): + # p_k is the probability that class(ith sample) == k. + # This is a regular softmax. + p_k = exp(raw_predictions[i, k] - _logsumexp(raw_predictions, i)) + gradients_at_k[i] = p_k - (y_true[i] == k) + hessians_at_k[i] = p_k * (1. 
- p_k) _LOSSES = { 'least_squares': LeastSquares, - 'binary_crossentropy': BinaryCrossEntropy + 'binary_crossentropy': BinaryCrossEntropy, + 'categorical_crossentropy': CategoricalCrossEntropy } - diff --git a/sklearn/gbm/playground.pyx b/sklearn/gbm/playground.pyx index bb8e9024dd0ad..d84bc1602be68 100644 --- a/sklearn/gbm/playground.pyx +++ b/sklearn/gbm/playground.pyx @@ -1,15 +1,19 @@ -cimport cython +import numpy as np +from cython.parallel import prange -cdef class MyClass: - cdef int width, height - def __init__(self, int w, int h): - self.width = w - self.height = h +def wrapper(): + print('in') + a = np.random.uniform(0, 100, size=(100, 100)).astype(np.int32) + g(a) -def hello(): - o = MyClass(9, 5) - return zob(o) +cdef int f(int [:] a) nogil: + return 3 -cdef int zob (MyClass o) nogil: - return o.width \ No newline at end of file +cdef int g(int [:, :] a) nogil: + + cdef: + int i + + for i in range(a.shape[0]): + f(a[i]) \ No newline at end of file diff --git a/sklearn/gbm/setup.py b/sklearn/gbm/setup.py index 1ebee4cf3fbfe..1c3cd25c555be 100644 --- a/sklearn/gbm/setup.py +++ b/sklearn/gbm/setup.py @@ -45,7 +45,9 @@ def configuration(parent_package="", top_path=None): config.add_extension("playground", sources=["playground.pyx"], - include_dirs=[numpy.get_include()]) + include_dirs=[numpy.get_include()], + extra_compile_args=['-fopenmp'], + extra_link_args=['-fopenmp']) config.add_subpackage("tests") # config.add_data_files("histogram.pxd") diff --git a/sklearn/gbm/tests/test_compare_lightgbm.py b/sklearn/gbm/tests/test_compare_lightgbm.py index 16f76acb40fdc..78e294af59f3e 100644 --- a/sklearn/gbm/tests/test_compare_lightgbm.py +++ b/sklearn/gbm/tests/test_compare_lightgbm.py @@ -142,7 +142,6 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples, (255, 4096), (10000, 8), ]) -@pytest.mark.skip('classification not supported yet') def test_same_predictions_multiclass_classification( seed, min_samples_leaf, n_samples, max_leaf_nodes): # Same as test_same_predictions_regression but for classification diff --git a/sklearn/gbm/tests/test_gradient_boosting.py b/sklearn/gbm/tests/test_gradient_boosting.py index acb2c9f3c41d3..e5add16269d9e 100644 --- a/sklearn/gbm/tests/test_gradient_boosting.py +++ b/sklearn/gbm/tests/test_gradient_boosting.py @@ -6,6 +6,7 @@ import pytest from sklearn.utils.testing import assert_raises_regex from sklearn.datasets import make_classification, make_regression +from sklearn.utils.estimator_checks import check_estimator from sklearn.gbm import GBMClassifier from sklearn.gbm import GBMRegressor @@ -92,22 +93,6 @@ def test_init_parameters_validation(GradientBoosting, X, y): ) -def test_one_sample_one_feature(): - # Until numba issue #3569 is fixed, we raise an informative error message - # when X is only one sample or one feature in fit (it's OK in predict). - # The array is both F and C contiguous, and numba can't compile. 
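Back to the categorical cross-entropy hunks above: per class k, the Cython loop computes the softmax probability p_k and sets gradient = p_k - 1{y_true == k} and hessian = p_k * (1 - p_k). A vectorized NumPy sketch of the same math, useful for checking but not for speed; the function name is illustrative only:

    import numpy as np
    from scipy.special import logsumexp

    def softmax_gradients_hessians(y_true, raw_predictions):
        # raw_predictions has shape (n_samples, n_classes)
        p = np.exp(raw_predictions
                   - logsumexp(raw_predictions, axis=1, keepdims=True))
        one_hot = y_true[:, None] == np.arange(raw_predictions.shape[1])
        gradients = p - one_hot
        hessians = p * (1. - p)
        return gradients, hessians
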
- gb = GBMClassifier() - for X, y in (([[1, 2]], [0]), ([[1], [2]], [0, 1])): - assert_raises_regex( - ValueError, - 'Passing only one sample or one feature is not supported yet.', - gb.fit, X, y - ) - - -@pytest.mark.skipif( - int(os.environ.get("NUMBA_DISABLE_JIT", 0)) == 1, - reason="Travis times out without numba") @pytest.mark.parametrize('scoring, validation_split, n_iter_no_change, tol', [ ('neg_mean_squared_error', .1, 5, 1e-7), # use scorer ('neg_mean_squared_error', None, 5, 1e-1), # use scorer on training data @@ -139,8 +124,7 @@ def test_early_stopping_regression(scoring, validation_split, @pytest.mark.parametrize('data', ( make_classification(random_state=0), - # TODO: unskip this - # make_classification(n_classes=3, n_clusters_per_class=1, random_state=0) + make_classification(n_classes=3, n_clusters_per_class=1, random_state=0) )) @pytest.mark.parametrize('scoring, validation_split, n_iter_no_change, tol', [ ('accuracy', .1, 5, 1e-7), # use scorer @@ -171,39 +155,6 @@ def test_early_stopping_classification(data, scoring, validation_split, assert gb.n_iter_ == max_iter -@pytest.mark.skip('classification not supported yet') -def test_early_stopping_loss(): - # Make sure that when scoring is None, the early stopping is done w.r.t to - # the loss. Using scoring='neg_log_loss' and scoring=None should be - # equivalent since the loss is precisely the negative log likelihood - n_samples = int(1e3) - max_iter = 100 - n_iter_no_change = 5 - - X, y = make_classification(n_samples, random_state=0) - - clf_scoring = GBMClassifier(max_iter=max_iter, - scoring='neg_log_loss', - validation_split=.1, - n_iter_no_change=n_iter_no_change, - tol=1e-4, - verbose=1, - random_state=0) - clf_scoring.fit(X, y) - - clf_loss = GBMClassifier(max_iter=max_iter, - scoring=None, - validation_split=.1, - n_iter_no_change=n_iter_no_change, - tol=1e-4, - verbose=1, - random_state=0) - clf_loss.fit(X, y) - - assert n_iter_no_change < clf_loss.n_iter_ < max_iter - assert clf_loss.n_iter_ == clf_scoring.n_iter_ - - def test_should_stop(): def should_stop(scores, n_iter_no_change, tol): @@ -230,43 +181,9 @@ def should_stop(scores, n_iter_no_change, tol): assert should_stop([1, 2, 3, 4, 5, 6], n_iter_no_change=5, tol=5) -# TODO: Remove if / when numba issue 3569 is fixed and check_classifiers_train -# is less strict -def custom_check_estimator(Estimator): - # Same as sklearn.check_estimator, skipping tests that can't succeed. - - from sklearn.utils.estimator_checks import _yield_all_checks - from sklearn.utils.testing import SkipTest - from sklearn.exceptions import SkipTestWarning - from sklearn.utils import estimator_checks - - estimator = Estimator - name = type(estimator).__name__ - - for check in _yield_all_checks(name, estimator): - if (check is estimator_checks.check_fit2d_1feature or - check is estimator_checks.check_fit2d_1sample): - # X is both Fortran and C aligned and numba can't compile. - # Opened numba issue 3569 - continue - if check is estimator_checks.check_classifiers_train: - continue # probas don't exactly sum to 1 (very close though) - if (hasattr(check, 'func') and - check.func is estimator_checks.check_classifiers_train): - continue # same, wrapped in a functools.partial object. - - try: - check(name, estimator) - except SkipTest as exception: - # the only SkipTest thrown currently results from not - # being able to import pandas. 
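On test_should_stop above: early stopping keeps the last n_iter_no_change scores and stops when none of them improves on the score recorded just before that window by more than tol (scores follow the "higher is better" convention). One plausible pure-Python reading consistent with the asserts shown, written as a sketch rather than the estimator's actual _should_stop:

    def should_stop(scores, n_iter_no_change, tol):
        if len(scores) <= n_iter_no_change:
            return False
        reference = scores[-n_iter_no_change - 1] + tol
        recent = scores[-n_iter_no_change:]
        # stop only if no recent score beats the reference
        return all(score <= reference for score in recent)
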
- warnings.warn(str(exception), SkipTestWarning) - - @pytest.mark.parametrize('Estimator', ( GBMRegressor(), - # TODO: unskip - # GBMClassifier(n_iter_no_change=None, min_samples_leaf=5), + GBMClassifier(scoring=None, validation_split=None, min_samples_leaf=5), )) def test_estimator_checks(Estimator): # Run the check_estimator() test suite on GBRegressor and GBClassifier. @@ -279,4 +196,4 @@ def test_estimator_checks(Estimator): # check_classifiers_classes() to pass: with only 30 samples on the # dataset, the root is never split with min_samples_leaf=20 and only the # majority class is predicted. - custom_check_estimator(Estimator) + check_estimator(Estimator) diff --git a/sklearn/gbm/tests/test_loss.py b/sklearn/gbm/tests/test_loss.py index f747226865ff5..8e00d63e6b384 100644 --- a/sklearn/gbm/tests/test_loss.py +++ b/sklearn/gbm/tests/test_loss.py @@ -85,7 +85,7 @@ def fprime2(x): @pytest.mark.parametrize('loss, n_classes, prediction_dim', [ ('least_squares', 0, 1), ('binary_crossentropy', 2, 1), - # ('categorical_crossentropy', 3, 3), + ('categorical_crossentropy', 3, 3), ]) @pytest.mark.skipif(Y_DTYPE != np.float64, reason='Need 64 bits float precision for numerical checks') @@ -172,7 +172,6 @@ def test_baseline_binary_crossentropy(): assert_almost_equal(baseline_prediction, np.log(p / (1 - p))) -@pytest.mark.skip('categorical crossentropy not supported yet') def test_baseline_categorical_crossentropy(): rng = np.random.RandomState(0) From 1ea65e2c994c9a7bb3fbcfe9fa551c4326c69105 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 14 Jan 2019 09:35:02 -0500 Subject: [PATCH 032/247] Parallelize predictions --- bench_predict.py | 11 ++--------- sklearn/gbm/loss.pyx | 7 ++++--- sklearn/gbm/predictor.pyx | 16 ++++++++++------ 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/bench_predict.py b/bench_predict.py index e859470eaa3fa..5738678f4ab02 100644 --- a/bench_predict.py +++ b/bench_predict.py @@ -1,8 +1,5 @@ """ Compare prediction time with pygbm. - -run with -export NUMBA_NUM_THREADS=1 && make in && python bench_predict.py """ from time import time @@ -13,10 +10,8 @@ import matplotlib.pyplot as plt from sklearn.datasets import make_regression, make_classification -from sklearn.ensemble import GradientBoostingRegressor -from sklearn.ensemble import GradientBoostingClassifier -from sklearn.ensemble import GBMRegressor -from sklearn.ensemble import GBMClassifier +from sklearn.gbm import GBMRegressor +from sklearn.gbm import GBMClassifier classif = False n_classes = 3 @@ -30,13 +25,11 @@ random_state=0, n_classes=n_classes, n_clusters_per_class=1) GBM = GBMClassifier - GBDT = GradientBoostingClassifier PYGBM_GBM = pygbm.GradientBoostingClassifier else: X, y = make_regression(n_samples=n_samples, n_features=n_features, random_state=0) GBM = GBMRegressor - GBDT = GradientBoostingRegressor PYGBM_GBM = pygbm.GradientBoostingRegressor diff --git a/sklearn/gbm/loss.pyx b/sklearn/gbm/loss.pyx index b550c5132e01c..95978be23209b 100644 --- a/sklearn/gbm/loss.pyx +++ b/sklearn/gbm/loss.pyx @@ -278,8 +278,9 @@ class CategoricalCrossEntropy(BaseLoss): logsumexp(raw_predictions, axis=1)[:, np.newaxis]) -cdef inline Y_DTYPE_C _logsumexp(Y_DTYPE_C [:, :] a, int i) nogil: - # Need to pass the whole array, else prange won't work +cdef inline Y_DTYPE_C _logsumexp(Y_DTYPE_C [:, :] a, const int row) nogil: + # Need to pass the whole array, else prange won't work. See issue Cython + # #2798 cdef: int k Y_DTYPE_C out = 0. 
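A note on _logsumexp above: the loop still exponentiates the raw values directly; the TODO in loss.pyx describes the numerically safer variant that subtracts the row maximum before exponentiating and adds it back afterwards. A NumPy sketch of that stabilized form:

    import numpy as np

    def logsumexp_row(a, row):
        # log(sum(exp(a[row]))) == amax + log(sum(exp(a[row] - amax)))
        amax = a[row].max()
        return amax + np.log(np.exp(a[row] - amax).sum())
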
@@ -292,7 +293,7 @@ cdef inline Y_DTYPE_C _logsumexp(Y_DTYPE_C [:, :] a, int i) nogil: # return log(out) + amax for k in range(a.shape[1]): - out += exp(a[i, k]) + out += exp(a[row, k]) return log(out) diff --git a/sklearn/gbm/predictor.pyx b/sklearn/gbm/predictor.pyx index 4abd4a3b1a8da..0d9e249fa45d1 100644 --- a/sklearn/gbm/predictor.pyx +++ b/sklearn/gbm/predictor.pyx @@ -82,9 +82,13 @@ class TreePredictor: return out -cdef Y_DTYPE_C _predict_one_from_numeric_data( +cdef inline Y_DTYPE_C _predict_one_from_numeric_data( node_struct [:] nodes, - const X_DTYPE_C [:] numeric_data) nogil: + const X_DTYPE_C [:, :] numeric_data, + const int row + ) nogil: + # Need to pass the whole array, else prange won't work. See issue Cython + # #2798 cdef: node_struct node = nodes[0] @@ -92,7 +96,7 @@ cdef Y_DTYPE_C _predict_one_from_numeric_data( while True: if node.is_leaf: return node.value - if numeric_data[node.feature_idx] <= node.threshold: + if numeric_data[row, node.feature_idx] <= node.threshold: node = nodes[node.left] else: node = nodes[node.right] @@ -107,6 +111,6 @@ cdef void _predict_from_numeric_data( int i # TODO: Why does prange fail?? - # for i in prange(numeric_data.shape[0], schedule='static'): - for i in range(numeric_data.shape[0]): - out[i] = _predict_one_from_numeric_data(nodes, numeric_data[i]) + # for i in range(numeric_data.shape[0]): + for i in prange(numeric_data.shape[0], schedule='static'): + out[i] = _predict_one_from_numeric_data(nodes, numeric_data, i) From e9c25094d4d8bb836b248dfcdd2f52197322809e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 14 Jan 2019 09:38:51 -0500 Subject: [PATCH 033/247] removed get_threads_chunks --- sklearn/gbm/loss.pyx | 23 ----------------------- sklearn/gbm/utils.py | 20 -------------------- 2 files changed, 43 deletions(-) diff --git a/sklearn/gbm/loss.pyx b/sklearn/gbm/loss.pyx index 95978be23209b..99b3b9dbbe4ee 100644 --- a/sklearn/gbm/loss.pyx +++ b/sklearn/gbm/loss.pyx @@ -24,29 +24,6 @@ from .types import Y_DTYPE from .types cimport Y_DTYPE_C -cdef get_threads_chunks(unsigned int total_size): - """Get start and end indices of threads in an array of size total_size. - - The interval [0, total_size - 1] is divided into n_threads contiguous - regions, and the starts and ends of each region are returned. Used to - simulate a 'static' scheduling. - """ - cdef: - np.ndarray[np.uint32_t] sizes - np.ndarray[np.uint32_t] starts - np.ndarray[np.uint32_t] ends - unsigned int n_threads - - n_threads = 1 # TODO: change this - sizes = np.full(n_threads, total_size // n_threads, dtype=np.uint32) - sizes[:total_size % n_threads] += 1 - starts = np.zeros(n_threads, dtype=np.uint32) - starts[1:] = np.cumsum(sizes[:-1]) - ends = starts + sizes - - return starts, ends, n_threads - - class BaseLoss(ABC): """Base class for a loss.""" diff --git a/sklearn/gbm/utils.py b/sklearn/gbm/utils.py index 628c8e95639b1..7b0239b0e22b1 100644 --- a/sklearn/gbm/utils.py +++ b/sklearn/gbm/utils.py @@ -57,23 +57,3 @@ def get_lightgbm_estimator(pygbm_estimator): Est = LGBMRegressor return Est(**lgbm_params) - - -def get_threads_chunks(total_size): - """Get start and end indices of threads in an array of size total_size. - - The interval [0, total_size - 1] is divided into n_threads contiguous - regions, and the starts and ends of each region are returned. Used to - simulate a 'static' scheduling. 
- """ - n_threads = 4 # TODO: change this - sizes = np.full(n_threads, total_size // n_threads, dtype=np.int32) - if total_size % n_threads > 0: - # array[:0] will cause a bug in numba 0.41 so we need the if. - # Remove once issue numba 3554 is fixed. - sizes[:total_size % n_threads] += 1 - starts = np.zeros(n_threads, dtype=np.int32) - starts[1:] = np.cumsum(sizes[:-1]) - ends = starts + sizes - - return starts, ends, n_threads From cf3f7235923cbfe524f7f5c11b82548793d5a2ad Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 15 Jan 2019 11:41:24 -0500 Subject: [PATCH 034/247] n_features param to test script --- gdb_test.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gdb_test.py b/gdb_test.py index b1d439c887541..14aa1282de0e2 100644 --- a/gdb_test.py +++ b/gdb_test.py @@ -11,17 +11,18 @@ import pygbm classif = True -n_classes = 3 -n_samples = int(1e6) +n_classes = 2 +n_features = 20 +n_samples = int(1e7) max_iter = 5 if classif: - X, y = make_classification(n_samples=n_samples, random_state=0, n_classes=n_classes, n_clusters_per_class=1) + X, y = make_classification(n_samples=n_samples, n_features=n_features, random_state=0, n_classes=n_classes, n_clusters_per_class=1) GBM = GBMClassifier GBDT = GradientBoostingClassifier PYGBM_GBM = pygbm.GradientBoostingClassifier else: - X, y = make_regression(n_samples=n_samples, random_state=0) + X, y = make_regression(n_samples=n_samples, n_features=n_features, random_state=0) GBM = GBMRegressor GBDT = GradientBoostingRegressor PYGBM_GBM = pygbm.GradientBoostingRegressor From c6227cd4861e5309407c9f8a9e04f6ab40ba6a7c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 16 Jan 2019 10:50:04 -0500 Subject: [PATCH 035/247] Specified array alignments in splitting and histogram --- gdb_test.py | 40 ++++++++++++------------- sklearn/gbm/histogram.pxd | 38 ++++++++++++------------ sklearn/gbm/histogram.pyx | 38 ++++++++++++------------ sklearn/gbm/splitting.pyx | 62 +++++++++++++++++++-------------------- 4 files changed, 89 insertions(+), 89 deletions(-) diff --git a/gdb_test.py b/gdb_test.py index 14aa1282de0e2..dc618de5619c3 100644 --- a/gdb_test.py +++ b/gdb_test.py @@ -10,10 +10,10 @@ import cProfile import pygbm -classif = True +classif = False n_classes = 2 n_features = 20 -n_samples = int(1e7) +n_samples = int(1e6) max_iter = 5 if classif: @@ -28,15 +28,15 @@ PYGBM_GBM = pygbm.GradientBoostingRegressor -pygbm_est = PYGBM_GBM( - max_iter=max_iter, - scoring=None, # no early stopping - validation_split=None, - random_state=0, - verbose=False) -print("compiling pygbm code") -pygbm_est.fit(X[:1000], y[:1000]) -print("done") +# pygbm_est = PYGBM_GBM( +# max_iter=max_iter, +# scoring=None, # no early stopping +# validation_split=None, +# random_state=0, +# verbose=False) +# print("compiling pygbm code") +# pygbm_est.fit(X[:1000], y[:1000]) +# print("done") gbm = GBM( max_iter=max_iter, @@ -55,15 +55,15 @@ print(f'sklearn gbm score_duration {score_duration:.3f}s') -pygbm_est.set_params(verbose=True) -tic = time() -pygbm_est.fit(X, y) -fit_duration = time() - tic -tic = time() -print(f'score: {pygbm_est.score(X, y)}') -score_duration = time() - tic -print(f'pygbm fit_duration: {fit_duration:.3f}s') -print(f'pygbm score_duration {score_duration:.3f}s') +# pygbm_est.set_params(verbose=True) +# tic = time() +# pygbm_est.fit(X, y) +# fit_duration = time() - tic +# tic = time() +# print(f'score: {pygbm_est.score(X, y)}') +# score_duration = time() - tic +# print(f'pygbm fit_duration: {fit_duration:.3f}s') +# print(f'pygbm 
score_duration {score_duration:.3f}s') # cProfile.runctx("gbm.fit(X, y)", globals(), locals(), "Profile.prof") # s = pstats.Stats("Profile.prof") diff --git a/sklearn/gbm/histogram.pxd b/sklearn/gbm/histogram.pxd index deb2d7b8e18bf..622662ccc08f0 100644 --- a/sklearn/gbm/histogram.pxd +++ b/sklearn/gbm/histogram.pxd @@ -10,34 +10,34 @@ from .types cimport hist_struct cpdef void _subtract_histograms( unsigned int n_bins, - hist_struct [:] hist_a, - hist_struct [:] hist_b, - hist_struct [:] out) nogil + hist_struct [::1] hist_a, + hist_struct [::1] hist_b, + hist_struct [::1] out) nogil cpdef void _build_histogram( unsigned int n_bins, - unsigned int [:] sample_indices, - X_BINNED_DTYPE_C [:] binned_feature, - Y_DTYPE_C [:] ordered_gradients, - Y_DTYPE_C [:] ordered_hessians, - hist_struct [:] out) nogil + unsigned int [::1] sample_indices, + X_BINNED_DTYPE_C [::1] binned_feature, + Y_DTYPE_C [::1] ordered_gradients, + Y_DTYPE_C [::1] ordered_hessians, + hist_struct [::1] out) nogil cpdef void _build_histogram_no_hessian( unsigned int n_bins, - unsigned int [:] sample_indices, - X_BINNED_DTYPE_C [:] binned_feature, - Y_DTYPE_C [:] ordered_gradients, - hist_struct [:] out) nogil + unsigned int [::1] sample_indices, + X_BINNED_DTYPE_C [::1] binned_feature, + Y_DTYPE_C [::1] ordered_gradients, + hist_struct [::1] out) nogil cpdef void _build_histogram_root_no_hessian( unsigned int n_bins, - X_BINNED_DTYPE_C [:] binned_feature, - Y_DTYPE_C [:] all_gradients, - hist_struct [:] out) nogil + X_BINNED_DTYPE_C [::1] binned_feature, + Y_DTYPE_C [::1] all_gradients, + hist_struct [::1] out) nogil cpdef void _build_histogram_root( unsigned int n_bins, - X_BINNED_DTYPE_C [:] binned_feature, - Y_DTYPE_C [:] all_gradients, - Y_DTYPE_C [:] all_hessians, - hist_struct [:] out) nogil + X_BINNED_DTYPE_C [::1] binned_feature, + Y_DTYPE_C [::1] all_gradients, + Y_DTYPE_C [::1] all_hessians, + hist_struct [::1] out) nogil diff --git a/sklearn/gbm/histogram.pyx b/sklearn/gbm/histogram.pyx index 841e60905008d..5db553768449b 100644 --- a/sklearn/gbm/histogram.pyx +++ b/sklearn/gbm/histogram.pyx @@ -42,9 +42,9 @@ cpdef void _build_histogram_naive( cpdef void _subtract_histograms( unsigned int n_bins, - hist_struct [:] hist_a, # IN - hist_struct [:] hist_b, # IN - hist_struct [:] out # OUT + hist_struct [::1] hist_a, # IN + hist_struct [::1] hist_b, # IN + hist_struct [::1] out # OUT ) nogil: """compute (hist_a - hist_b) in out""" @@ -58,11 +58,11 @@ cpdef void _subtract_histograms( cpdef void _build_histogram( unsigned int n_bins, - unsigned int [:] sample_indices, # IN - X_BINNED_DTYPE_C [:] binned_feature, # IN - Y_DTYPE_C [:] ordered_gradients, # IN - Y_DTYPE_C [:] ordered_hessians, # IN - hist_struct [:] out # OUT + unsigned int [::1] sample_indices, # IN + X_BINNED_DTYPE_C [::1] binned_feature, # IN + Y_DTYPE_C [::1] ordered_gradients, # IN + Y_DTYPE_C [::1] ordered_hessians, # IN + hist_struct [::1] out # OUT ) nogil: """Return histogram for a given feature.""" cdef: @@ -106,10 +106,10 @@ cpdef void _build_histogram( cpdef void _build_histogram_no_hessian( unsigned int n_bins, - unsigned int [:] sample_indices, # IN - X_BINNED_DTYPE_C [:] binned_feature, # IN - Y_DTYPE_C [:] ordered_gradients, # OUT - hist_struct [:] out # OUT + unsigned int [::1] sample_indices, # IN + X_BINNED_DTYPE_C [::1] binned_feature, # IN + Y_DTYPE_C [::1] ordered_gradients, # OUT + hist_struct [::1] out # OUT ) nogil: """Return histogram for a given feature.""" cdef: @@ -147,9 +147,9 @@ cpdef void _build_histogram_no_hessian( cpdef 
void _build_histogram_root_no_hessian( unsigned int n_bins, - X_BINNED_DTYPE_C [:] binned_feature, # IN - Y_DTYPE_C [:] all_gradients, # IN - hist_struct [:] out # OUT + X_BINNED_DTYPE_C [::1] binned_feature, # IN + Y_DTYPE_C [::1] all_gradients, # IN + hist_struct [::1] out # OUT ) nogil: """Special case for the root node @@ -194,10 +194,10 @@ cpdef void _build_histogram_root_no_hessian( cpdef void _build_histogram_root( unsigned int n_bins, - X_BINNED_DTYPE_C [:] binned_feature, # IN - Y_DTYPE_C [:] all_gradients, # IN - Y_DTYPE_C [:] all_hessians, # IN - hist_struct [:] out # OUT + X_BINNED_DTYPE_C [::1] binned_feature, # IN + Y_DTYPE_C [::1] all_gradients, # IN + Y_DTYPE_C [::1] all_hessians, # IN + hist_struct [::1] out # OUT ) nogil: """Special case for the root node diff --git a/sklearn/gbm/splitting.pyx b/sklearn/gbm/splitting.pyx index 801a27eb0e13f..ac7d8519a4e85 100644 --- a/sklearn/gbm/splitting.pyx +++ b/sklearn/gbm/splitting.pyx @@ -132,14 +132,14 @@ cdef class SplittingContext: be ignored. """ cdef public: - X_BINNED_DTYPE_C [:, :] X_binned + X_BINNED_DTYPE_C [::1, :] X_binned unsigned int n_features unsigned int max_bins unsigned int [:] n_bins_per_feature - Y_DTYPE_C [:] gradients - Y_DTYPE_C [:] hessians - Y_DTYPE_C [:] ordered_gradients - Y_DTYPE_C [:] ordered_hessians + Y_DTYPE_C [::1] gradients + Y_DTYPE_C [::1] hessians + Y_DTYPE_C [::1] ordered_gradients + Y_DTYPE_C [::1] ordered_hessians Y_DTYPE_C sum_gradients Y_DTYPE_C sum_hessians unsigned char constant_hessian @@ -149,13 +149,13 @@ cdef class SplittingContext: unsigned int min_samples_leaf Y_DTYPE_C min_gain_to_split - unsigned int [:] partition - unsigned int [:] left_indices_buffer - unsigned int [:] right_indices_buffer + unsigned int [::1] partition + unsigned int [::1] left_indices_buffer + unsigned int [::1] right_indices_buffer - def __init__(self, X_BINNED_DTYPE_C [:, :] X_binned, unsigned int + def __init__(self, X_BINNED_DTYPE_C [::1, :] X_binned, unsigned int max_bins, np.ndarray[np.uint32_t] n_bins_per_feature, - Y_DTYPE_C [:] gradients, Y_DTYPE_C [:] hessians, Y_DTYPE_C + Y_DTYPE_C [::1] gradients, Y_DTYPE_C [::1] hessians, Y_DTYPE_C l2_regularization, Y_DTYPE_C min_hessian_to_split=1e-3, unsigned int min_samples_leaf=20, Y_DTYPE_C min_gain_to_split=0.): @@ -200,7 +200,7 @@ cdef class SplittingContext: def split_indices( SplittingContext context, SplitInfo split_info, - unsigned int [:] sample_indices): + unsigned int [::1] sample_indices): """Split samples into left and right arrays. The split is performed according to the best possible split (split_info). 
@@ -275,9 +275,9 @@ def split_indices( cdef: int n_samples = sample_indices.shape[0] - X_BINNED_DTYPE_C [:] X_binned = context.X_binned.T[split_info.feature_idx] - unsigned int [:] left_indices_buffer = context.left_indices_buffer - unsigned int [:] right_indices_buffer = context.right_indices_buffer + X_BINNED_DTYPE_C [::1] X_binned = context.X_binned[:, split_info.feature_idx] + unsigned int [::1] left_indices_buffer = context.left_indices_buffer + unsigned int [::1] right_indices_buffer = context.right_indices_buffer int n_threads = omp_get_max_threads() int [:] sizes = np.full(n_threads, n_samples // n_threads, dtype=np.int32) int [:] offset_in_buffers = np.zeros(n_threads, dtype=np.int32) @@ -353,8 +353,8 @@ def split_indices( def find_node_split( SplittingContext context, - unsigned int [:] sample_indices, # IN - hist_struct [:, :] histograms): # OUT + unsigned int [::1] sample_indices, # IN + hist_struct [:, ::1] histograms): # OUT """For each feature, find the best bin to split on at a given node. Returns the best split info among all features, and the histograms of @@ -441,10 +441,10 @@ def find_node_split( def find_node_split_subtraction( SplittingContext context, - unsigned int [:] sample_indices, # IN - hist_struct [:, :] parent_histograms, # IN - hist_struct [:, :] sibling_histograms, # IN - hist_struct [:, :] histograms): # OUT + unsigned int [::1] sample_indices, # IN + hist_struct [:, ::1] parent_histograms, # IN + hist_struct [:, ::1] sibling_histograms, # IN + hist_struct [:, ::1] histograms): # OUT """For each feature, find the best bin to split on at a given node. Returns the best split info among all features, and the histograms of @@ -563,8 +563,8 @@ cdef split_info_struct _find_best_feature_to_split_helper( cdef split_info_struct _find_histogram_split( SplittingContext context, unsigned int feature_idx, - unsigned int [:] sample_indices, # IN - hist_struct [:] histogram # OUT + unsigned int [::1] sample_indices, # IN + hist_struct [::1] histogram # OUT ) nogil: """Compute the histogram for a given feature @@ -573,11 +573,11 @@ cdef split_info_struct _find_histogram_split( cdef: unsigned int n_samples = sample_indices.shape[0] - X_BINNED_DTYPE_C [:] X_binned = context.X_binned.T[feature_idx] + X_BINNED_DTYPE_C [::1] X_binned = context.X_binned[:, feature_idx] unsigned int root_node = X_binned.shape[0] == n_samples - Y_DTYPE_C [:] ordered_gradients = \ + Y_DTYPE_C [::1] ordered_gradients = \ context.ordered_gradients[:n_samples] - Y_DTYPE_C [:] ordered_hessians = context.ordered_hessians[:n_samples] + Y_DTYPE_C [::1] ordered_hessians = context.ordered_hessians[:n_samples] if root_node: if context.constant_hessian: @@ -601,9 +601,9 @@ cdef split_info_struct _find_histogram_split( cdef split_info_struct _find_histogram_split_subtraction( SplittingContext context, unsigned int feature_idx, - hist_struct [:] parent_histogram, # IN - hist_struct [:] sibling_histogram, # IN - hist_struct [:] histogram, # OUT + hist_struct [::1] parent_histogram, # IN + hist_struct [::1] sibling_histogram, # IN + hist_struct [::1] histogram, # OUT unsigned int n_samples ) nogil: """Compute the histogram by substraction of parent and sibling @@ -622,7 +622,7 @@ cdef split_info_struct _find_histogram_split_subtraction( cdef split_info_struct _find_best_bin_to_split_helper( SplittingContext context, unsigned int feature_idx, - hist_struct [:] histogram, # IN + hist_struct [::1] histogram, # IN unsigned int n_samples) nogil: """Find best bin to split on, and return the corresponding SplitInfo. 
@@ -726,8 +726,8 @@ cdef inline Y_DTYPE_C negative_loss( def _find_histogram_split_wrapper( SplittingContext context, unsigned int feature_idx, - unsigned int [:] sample_indices, - hist_struct [:] histogram): + unsigned int [::1] sample_indices, + hist_struct [::1] histogram): split_info = _find_histogram_split(context, feature_idx, sample_indices, histogram) From 10520dadbc744bd4ba95de6e3dac3962075f82e8 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 16 Jan 2019 11:54:36 -0500 Subject: [PATCH 036/247] used const views where possible and used prange sum reduction --- sklearn/gbm/histogram.pxd | 63 +++++++++++++++++++++++++-------------- sklearn/gbm/histogram.pyx | 62 ++++++++++++++------------------------ sklearn/gbm/splitting.pyx | 32 +++++++++++--------- 3 files changed, 81 insertions(+), 76 deletions(-) diff --git a/sklearn/gbm/histogram.pxd b/sklearn/gbm/histogram.pxd index 622662ccc08f0..0b1b8e61bd4f0 100644 --- a/sklearn/gbm/histogram.pxd +++ b/sklearn/gbm/histogram.pxd @@ -1,3 +1,10 @@ +# cython: language_level=3 +"""This module contains njitted routines for building histograms. + +A histogram is an array with n_bins entry of type HISTOGRAM_DTYPE. Each +feature has its own histogram. A histogram contains the sum of gradients and +hessians of all the samples belonging to each bin. +""" import numpy as np cimport numpy as np @@ -6,38 +13,48 @@ from .types cimport X_BINNED_DTYPE_C from .types cimport Y_DTYPE_C from .types cimport hist_struct -# See histogram.pyx for docstrings and details - +"""compute (hist_a - hist_b) in out""" cpdef void _subtract_histograms( unsigned int n_bins, - hist_struct [::1] hist_a, - hist_struct [::1] hist_b, - hist_struct [::1] out) nogil + const hist_struct [::1] hist_a, # IN + const hist_struct [::1] hist_b, # IN + hist_struct [::1] out) nogil # OUT + +"""Return histogram for a given feature.""" cpdef void _build_histogram( unsigned int n_bins, - unsigned int [::1] sample_indices, - X_BINNED_DTYPE_C [::1] binned_feature, - Y_DTYPE_C [::1] ordered_gradients, - Y_DTYPE_C [::1] ordered_hessians, - hist_struct [::1] out) nogil + const unsigned int [::1] sample_indices, # IN + const X_BINNED_DTYPE_C [::1] binned_feature, # IN + const Y_DTYPE_C [::1] ordered_gradients, # IN + const Y_DTYPE_C [::1] ordered_hessians, # IN + hist_struct [::1] out) nogil # OUT + +"""Return histogram for a given feature, not updating hessians. +Used when the hessians of the loss are constant (tipycally LS loss).""" cpdef void _build_histogram_no_hessian( unsigned int n_bins, - unsigned int [::1] sample_indices, - X_BINNED_DTYPE_C [::1] binned_feature, - Y_DTYPE_C [::1] ordered_gradients, - hist_struct [::1] out) nogil + const unsigned int [::1] sample_indices, # IN + const X_BINNED_DTYPE_C [::1] binned_feature, # IN + const Y_DTYPE_C [::1] ordered_gradients, # IN + hist_struct [::1] out) nogil # OUT -cpdef void _build_histogram_root_no_hessian( +"""Compute histogram of the root node. +Unlike other nodes, the root node has to find the split among *all* the +samples from the training set. 
binned_feature and all_gradients / +all_hessians already have a consistent ordering.""" +cpdef void _build_histogram_root( unsigned int n_bins, - X_BINNED_DTYPE_C [::1] binned_feature, - Y_DTYPE_C [::1] all_gradients, - hist_struct [::1] out) nogil + const X_BINNED_DTYPE_C [::1] binned_feature, # IN + const Y_DTYPE_C [::1] all_gradients, # IN + const Y_DTYPE_C [::1] all_hessians, # IN + hist_struct [::1] out) nogil # OUT -cpdef void _build_histogram_root( +"""Compute histogram of the root node, not updating hessians. +Used when the hessians of the loss are constant (tipycally LS loss).""" +cpdef void _build_histogram_root_no_hessian( unsigned int n_bins, - X_BINNED_DTYPE_C [::1] binned_feature, - Y_DTYPE_C [::1] all_gradients, - Y_DTYPE_C [::1] all_hessians, - hist_struct [::1] out) nogil + const X_BINNED_DTYPE_C [::1] binned_feature, # IN + const Y_DTYPE_C [::1] all_gradients, # IN + hist_struct [::1] out) nogil # OUT diff --git a/sklearn/gbm/histogram.pyx b/sklearn/gbm/histogram.pyx index 5db553768449b..eefc0c84b6951 100644 --- a/sklearn/gbm/histogram.pyx +++ b/sklearn/gbm/histogram.pyx @@ -16,6 +16,8 @@ cimport numpy as np from .types import HISTOGRAM_DTYPE # Note: IN views are read-only, OUT views are write-only +# See histogram.pxd for docstrings and details + cpdef void _build_histogram_naive( unsigned int n_bins, @@ -46,8 +48,6 @@ cpdef void _subtract_histograms( hist_struct [::1] hist_b, # IN hist_struct [::1] out # OUT ) nogil: - """compute (hist_a - hist_b) in out""" - cdef: unsigned int i = 0 for i in range(n_bins): @@ -58,13 +58,12 @@ cpdef void _subtract_histograms( cpdef void _build_histogram( unsigned int n_bins, - unsigned int [::1] sample_indices, # IN - X_BINNED_DTYPE_C [::1] binned_feature, # IN - Y_DTYPE_C [::1] ordered_gradients, # IN - Y_DTYPE_C [::1] ordered_hessians, # IN + const unsigned int [::1] sample_indices, # IN + const X_BINNED_DTYPE_C [::1] binned_feature, # IN + const Y_DTYPE_C [::1] ordered_gradients, # IN + const Y_DTYPE_C [::1] ordered_hessians, # IN hist_struct [::1] out # OUT ) nogil: - """Return histogram for a given feature.""" cdef: unsigned int i = 0 unsigned int n_node_samples = sample_indices.shape[0] @@ -106,12 +105,11 @@ cpdef void _build_histogram( cpdef void _build_histogram_no_hessian( unsigned int n_bins, - unsigned int [::1] sample_indices, # IN - X_BINNED_DTYPE_C [::1] binned_feature, # IN - Y_DTYPE_C [::1] ordered_gradients, # OUT + const unsigned int [::1] sample_indices, # IN + const X_BINNED_DTYPE_C [::1] binned_feature, # IN + const Y_DTYPE_C [::1] ordered_gradients, # OUT hist_struct [::1] out # OUT ) nogil: - """Return histogram for a given feature.""" cdef: unsigned int i = 0 unsigned int n_node_samples = sample_indices.shape[0] @@ -145,20 +143,13 @@ cpdef void _build_histogram_no_hessian( out[bin_idx].count += 1 -cpdef void _build_histogram_root_no_hessian( +cpdef void _build_histogram_root( unsigned int n_bins, - X_BINNED_DTYPE_C [::1] binned_feature, # IN - Y_DTYPE_C [::1] all_gradients, # IN + const X_BINNED_DTYPE_C [::1] binned_feature, # IN + const Y_DTYPE_C [::1] all_gradients, # IN + const Y_DTYPE_C [::1] all_hessians, # IN hist_struct [::1] out # OUT ) nogil: - """Special case for the root node - - The root node has to find the split among all the samples from the - training set. binned_feature and all_gradients already have a consistent - ordering. 
- - Hessians are not updated (used when hessians are constant) - """ cdef: unsigned int i = 0 unsigned int n_samples = binned_feature.shape[0] @@ -171,6 +162,7 @@ cpdef void _build_histogram_root_no_hessian( unsigned int bin_idx for i in range(0, unrolled_upper, 4): + bin_0 = binned_feature[i] bin_1 = binned_feature[i + 1] bin_2 = binned_feature[i + 2] @@ -181,6 +173,11 @@ cpdef void _build_histogram_root_no_hessian( out[bin_2].sum_gradients += all_gradients[i + 2] out[bin_3].sum_gradients += all_gradients[i + 3] + out[bin_0].sum_hessians += all_hessians[i] + out[bin_1].sum_hessians += all_hessians[i + 1] + out[bin_2].sum_hessians += all_hessians[i + 2] + out[bin_3].sum_hessians += all_hessians[i + 3] + out[bin_0].count += 1 out[bin_1].count += 1 out[bin_2].count += 1 @@ -189,22 +186,16 @@ cpdef void _build_histogram_root_no_hessian( for i in range(unrolled_upper, n_samples): bin_idx = binned_feature[i] out[bin_idx].sum_gradients += all_gradients[i] + out[bin_idx].sum_hessians += all_hessians[i] out[bin_idx].count += 1 -cpdef void _build_histogram_root( +cpdef void _build_histogram_root_no_hessian( unsigned int n_bins, - X_BINNED_DTYPE_C [::1] binned_feature, # IN - Y_DTYPE_C [::1] all_gradients, # IN - Y_DTYPE_C [::1] all_hessians, # IN + const X_BINNED_DTYPE_C [::1] binned_feature, # IN + const Y_DTYPE_C [::1] all_gradients, # IN hist_struct [::1] out # OUT ) nogil: - """Special case for the root node - - The root node has to find the split among all the samples from the - training set. binned_feature and all_gradients and all_hessians already - have a consistent ordering. - """ cdef: unsigned int i = 0 unsigned int n_samples = binned_feature.shape[0] @@ -217,7 +208,6 @@ cpdef void _build_histogram_root( unsigned int bin_idx for i in range(0, unrolled_upper, 4): - bin_0 = binned_feature[i] bin_1 = binned_feature[i + 1] bin_2 = binned_feature[i + 2] @@ -228,11 +218,6 @@ cpdef void _build_histogram_root( out[bin_2].sum_gradients += all_gradients[i + 2] out[bin_3].sum_gradients += all_gradients[i + 3] - out[bin_0].sum_hessians += all_hessians[i] - out[bin_1].sum_hessians += all_hessians[i + 1] - out[bin_2].sum_hessians += all_hessians[i + 2] - out[bin_3].sum_hessians += all_hessians[i + 3] - out[bin_0].count += 1 out[bin_1].count += 1 out[bin_2].count += 1 @@ -241,5 +226,4 @@ cpdef void _build_histogram_root( for i in range(unrolled_upper, n_samples): bin_idx = binned_feature[i] out[bin_idx].sum_gradients += all_gradients[i] - out[bin_idx].sum_hessians += all_hessians[i] out[bin_idx].count += 1 diff --git a/sklearn/gbm/splitting.pyx b/sklearn/gbm/splitting.pyx index ac7d8519a4e85..4bb08e7e84bf0 100644 --- a/sklearn/gbm/splitting.pyx +++ b/sklearn/gbm/splitting.pyx @@ -132,7 +132,7 @@ cdef class SplittingContext: be ignored. 
""" cdef public: - X_BINNED_DTYPE_C [::1, :] X_binned + const X_BINNED_DTYPE_C [::1, :] X_binned unsigned int n_features unsigned int max_bins unsigned int [:] n_bins_per_feature @@ -153,7 +153,7 @@ cdef class SplittingContext: unsigned int [::1] left_indices_buffer unsigned int [::1] right_indices_buffer - def __init__(self, X_BINNED_DTYPE_C [::1, :] X_binned, unsigned int + def __init__(self, const X_BINNED_DTYPE_C [::1, :] X_binned, unsigned int max_bins, np.ndarray[np.uint32_t] n_bins_per_feature, Y_DTYPE_C [::1] gradients, Y_DTYPE_C [::1] hessians, Y_DTYPE_C l2_regularization, Y_DTYPE_C min_hessian_to_split=1e-3, @@ -275,7 +275,7 @@ def split_indices( cdef: int n_samples = sample_indices.shape[0] - X_BINNED_DTYPE_C [::1] X_binned = context.X_binned[:, split_info.feature_idx] + const X_BINNED_DTYPE_C [::1] X_binned = context.X_binned[:, split_info.feature_idx] unsigned int [::1] left_indices_buffer = context.left_indices_buffer unsigned int [::1] right_indices_buffer = context.right_indices_buffer int n_threads = omp_get_max_threads() @@ -353,7 +353,7 @@ def split_indices( def find_node_split( SplittingContext context, - unsigned int [::1] sample_indices, # IN + const unsigned int [::1] sample_indices, # IN hist_struct [:, ::1] histograms): # OUT """For each feature, find the best bin to split on at a given node. @@ -387,6 +387,9 @@ def find_node_split( unsigned int n_threads split_info_struct split_info split_info_struct * split_infos + # For some reason, we need to use local variables for prange reduction. + Y_DTYPE_C sum_gradients = 0. + Y_DTYPE_C sum_hessians = 0. with nogil: n_samples = sample_indices.shape[0] @@ -405,16 +408,17 @@ def find_node_split( context.ordered_hessians[i] = \ context.hessians[sample_indices[i]] - context.sum_gradients = 0. - for i in range(n_samples): - context.sum_gradients += context.ordered_gradients[i] + # Compute context.sum_gradients and context.sum_hessians + for i in prange(n_samples, schedule='static'): + sum_gradients += context.ordered_gradients[i] + context.sum_gradients = sum_gradients if context.constant_hessian: - context.sum_hessians = context.constant_hessian_value * n_samples + sum_hessians = context.constant_hessian_value * n_samples else: - context.sum_hessians = 0. 
- for i in range(n_samples): - context.sum_hessians += context.ordered_hessians[i] + for i in prange(n_samples, schedule='static'): + sum_hessians += context.ordered_hessians[i] + context.sum_hessians = sum_hessians # TODO: this needs to be freed at some point split_infos = malloc( @@ -563,7 +567,7 @@ cdef split_info_struct _find_best_feature_to_split_helper( cdef split_info_struct _find_histogram_split( SplittingContext context, unsigned int feature_idx, - unsigned int [::1] sample_indices, # IN + const unsigned int [::1] sample_indices, # IN hist_struct [::1] histogram # OUT ) nogil: """Compute the histogram for a given feature @@ -573,7 +577,7 @@ cdef split_info_struct _find_histogram_split( cdef: unsigned int n_samples = sample_indices.shape[0] - X_BINNED_DTYPE_C [::1] X_binned = context.X_binned[:, feature_idx] + const X_BINNED_DTYPE_C [::1] X_binned = context.X_binned[:, feature_idx] unsigned int root_node = X_binned.shape[0] == n_samples Y_DTYPE_C [::1] ordered_gradients = \ context.ordered_gradients[:n_samples] @@ -622,7 +626,7 @@ cdef split_info_struct _find_histogram_split_subtraction( cdef split_info_struct _find_best_bin_to_split_helper( SplittingContext context, unsigned int feature_idx, - hist_struct [::1] histogram, # IN + const hist_struct [::1] histogram, # IN unsigned int n_samples) nogil: """Find best bin to split on, and return the corresponding SplitInfo. From 2a80af8002beac9b24c1525c301b5299fbbf5169 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 16 Jan 2019 12:29:01 -0500 Subject: [PATCH 037/247] Directly pass sum_gradient and sum_hessians to find_node_split_subtraction --- sklearn/gbm/grower.py | 9 ++++++++- sklearn/gbm/splitting.pyx | 24 ++++-------------------- sklearn/gbm/tests/test_splitting.py | 27 ++++++++++++++------------- 3 files changed, 26 insertions(+), 34 deletions(-) diff --git a/sklearn/gbm/grower.py b/sklearn/gbm/grower.py index b62091f7034c8..c4ead962c9a77 100644 --- a/sklearn/gbm/grower.py +++ b/sklearn/gbm/grower.py @@ -313,9 +313,16 @@ def _compute_spittability(self, node, only_hist=False): histograms = np.zeros(shape=(self.n_features, self.max_bins), dtype=HISTOGRAM_DTYPE) if node.hist_subtraction: + if node is node.parent.right_child: + sum_gradients = node.parent.split_info.gradient_right + sum_hessians = node.parent.split_info.hessian_right + else: + sum_gradients = node.parent.split_info.gradient_left + sum_hessians = node.parent.split_info.hessian_left split_info = find_node_split_subtraction( self.splitting_context, node.sample_indices, - node.parent.histograms, node.sibling.histograms, histograms) + sum_gradients, sum_hessians, node.parent.histograms, + node.sibling.histograms, histograms) else: split_info = find_node_split( self.splitting_context, node.sample_indices, histograms) diff --git a/sklearn/gbm/splitting.pyx b/sklearn/gbm/splitting.pyx index 4bb08e7e84bf0..464d4a2ba6988 100644 --- a/sklearn/gbm/splitting.pyx +++ b/sklearn/gbm/splitting.pyx @@ -446,6 +446,8 @@ def find_node_split( def find_node_split_subtraction( SplittingContext context, unsigned int [::1] sample_indices, # IN + Y_DTYPE_C sum_gradients, + Y_DTYPE_C sum_hessians, hist_struct [:, ::1] parent_histograms, # IN hist_struct [:, ::1] sibling_histograms, # IN hist_struct [:, ::1] histograms): # OUT @@ -498,26 +500,8 @@ def find_node_split_subtraction( with nogil: n_samples = sample_indices.shape[0] - # TODO: maybe change this computation... we could probably store sum_g/h in - # the SplitInfo for a speed gain - # Compute sum_hessians and sum_gradients. 
- # We can pick any feature (here the first) in the histograms to - # compute the gradients: they must be the same across all features - # anyway, we have tests ensuring this. Maybe a more robust way would - # be to compute an average but it's probably not worth it. - context.sum_gradients = 0. - for i in range(context.max_bins): - context.sum_gradients += (parent_histograms[0, i].sum_gradients - - sibling_histograms[0, i].sum_gradients) - - if context.constant_hessian: - context.sum_hessians = \ - context.constant_hessian_value * n_samples - else: - context.sum_hessians = 0. - for i in range(context.max_bins): - context.sum_hessians += (parent_histograms[0, i].sum_hessians - - sibling_histograms[0, i].sum_hessians) + context.sum_gradients = sum_gradients + context.sum_hessians = sum_hessians # TODO: this needs to be freed at some point split_infos = malloc( diff --git a/sklearn/gbm/tests/test_splitting.py b/sklearn/gbm/tests/test_splitting.py index c74f3461040c1..ff37223f26ad2 100644 --- a/sklearn/gbm/tests/test_splitting.py +++ b/sklearn/gbm/tests/test_splitting.py @@ -93,10 +93,6 @@ def test_split_vs_split_subtraction(constant_hessian): l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split) - mask = rng.randint(0, 2, n_samples).astype(np.bool) - sample_indices_left = sample_indices[mask] - sample_indices_right = sample_indices[~mask] - hists_parent = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) hists_left = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) hists_right = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) @@ -104,17 +100,21 @@ def test_split_vs_split_subtraction(constant_hessian): hists_right_sub = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) # first split parent, left and right with classical method - _ = find_node_split(context, sample_indices, hists_parent) + si_parent = find_node_split(context, sample_indices, hists_parent) + sample_indices_left, sample_indices_right, _ = split_indices( + context, si_parent, sample_indices) si_left = find_node_split(context, sample_indices_left, hists_left) si_right = find_node_split(context, sample_indices_right, hists_right) # split left with subtraction method si_left_sub = find_node_split_subtraction( - context, sample_indices_left, hists_parent, hists_right, hists_left_sub) + context, sample_indices_left, si_parent.gradient_left, + si_parent.hessian_left, hists_parent, hists_right, hists_left_sub) # split right with subtraction method si_right_sub = find_node_split_subtraction( - context, sample_indices_right, hists_parent, hists_left, hists_right_sub) + context, sample_indices_right, si_parent.gradient_right, + si_parent.hessian_right, hists_parent, hists_left, hists_right_sub) # make sure histograms from classical and subtraction method are the same for hists, hists_sub in ((hists_left, hists_left_sub), @@ -179,10 +179,6 @@ def test_gradient_and_hessian_sanity(constant_hessian): l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split) - mask = rng.randint(0, 2, n_samples).astype(np.bool) - sample_indices_left = sample_indices[mask] - sample_indices_right = sample_indices[~mask] - hists_parent = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) hists_left = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) hists_right = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) @@ -191,16 +187,21 @@ def test_gradient_and_hessian_sanity(constant_hessian): # first split parent, left and right with classical method 
si_parent = find_node_split(context, sample_indices, hists_parent) + sample_indices_left, sample_indices_right, _ = split_indices( + context, si_parent, sample_indices) + si_left = find_node_split(context, sample_indices_left, hists_left) si_right = find_node_split(context, sample_indices_right, hists_right) # split left with subtraction method si_left_sub = find_node_split_subtraction( - context, sample_indices_left, hists_parent, hists_right, hists_left_sub) + context, sample_indices_left, si_parent.gradient_left, + si_parent.hessian_left, hists_parent, hists_right, hists_left_sub) # split right with subtraction method si_right_sub = find_node_split_subtraction( - context, sample_indices_right, hists_parent, hists_left, hists_right_sub) + context, sample_indices_right, si_parent.gradient_right, + si_parent.hessian_right, hists_parent, hists_left, hists_right_sub) # make sure that si.gradient_left + si.gradient_right have their expected # value, same for hessians From 6fafd85bef36d5afe7fa5b32d148250e5ca5e535 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 16 Jan 2019 13:25:46 -0500 Subject: [PATCH 038/247] local variables to avoid python interactions --- bench_find_node_split.py | 11 ++++++++--- bench_hist.py | 31 ++++++++++++++++++------------- bench_split_indices.py | 9 +++++++-- sklearn/gbm/splitting.pyx | 33 +++++++++++++++++++-------------- 4 files changed, 52 insertions(+), 32 deletions(-) diff --git a/bench_find_node_split.py b/bench_find_node_split.py index fb226fb928d35..a476d9a2790b7 100644 --- a/bench_find_node_split.py +++ b/bench_find_node_split.py @@ -4,6 +4,9 @@ import numpy as np import matplotlib.pyplot as plt from sklearn.gbm.types import HISTOGRAM_DTYPE +from sklearn.gbm.types import X_DTYPE +from sklearn.gbm.types import X_BINNED_DTYPE +from sklearn.gbm.types import Y_DTYPE from sklearn.gbm.splitting import SplittingContext from sklearn.gbm.splitting import find_node_split from pygbm.splitting import SplittingContext as SplittingContext_pygbm @@ -24,10 +27,10 @@ n_samples = 10**max_pow -X_binned_ = rng.randint(0, n_bins, size=(n_samples, n_features), dtype=np.uint8) +X_binned_ = rng.randint(0, n_bins, size=(n_samples, n_features), dtype=X_BINNED_DTYPE) sample_indices_ = np.arange(n_samples, dtype=np.uint32) -all_gradients_ = rng.randn(n_samples).astype(np.float32) -all_hessians_ = rng.lognormal(size=n_samples).astype(np.float32) +all_gradients_ = rng.randn(n_samples).astype(Y_DTYPE) +all_hessians_ = rng.lognormal(size=n_samples).astype(Y_DTYPE) def one_run(n_samples): @@ -44,6 +47,8 @@ def one_run(n_samples): all_gradients, all_hessians, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split) + all_gradients = all_gradients.astype(np.float32) + all_hessians = all_hessians.astype(np.float32) pygbm_context = SplittingContext_pygbm(X_binned, n_bins, n_bins_per_feature, all_gradients, all_hessians, diff --git a/bench_hist.py b/bench_hist.py index 66370c9282fa0..aa16ef2e13d58 100644 --- a/bench_hist.py +++ b/bench_hist.py @@ -17,26 +17,28 @@ from pygbm.histogram import _build_histogram_root_no_hessian as pygbm_build_histogram_root_no_hessian from pygbm.histogram import _subtract_histograms as pygbm_subtract_histograms -from sklearn.ensemble.gbm.histogram import _build_histogram_naive -from sklearn.ensemble.gbm.histogram import _build_histogram -from sklearn.ensemble.gbm.histogram import _build_histogram_no_hessian -from sklearn.ensemble.gbm.histogram import _build_histogram_root -from sklearn.ensemble.gbm.histogram import 
_build_histogram_root_no_hessian -from sklearn.ensemble.gbm.histogram import _subtract_histograms -from sklearn.ensemble.gbm.types import HISTOGRAM_DTYPE +from sklearn.gbm.histogram import _build_histogram_naive +from sklearn.gbm.histogram import _build_histogram +from sklearn.gbm.histogram import _build_histogram_no_hessian +from sklearn.gbm.histogram import _build_histogram_root +from sklearn.gbm.histogram import _build_histogram_root_no_hessian +from sklearn.gbm.histogram import _subtract_histograms +from sklearn.gbm.types import HISTOGRAM_DTYPE +from sklearn.gbm.types import X_DTYPE +from sklearn.gbm.types import X_BINNED_DTYPE +from sklearn.gbm.types import Y_DTYPE m = Memory(location='/tmp') @m.cache -def make_data(n_bins=256, n_samples=int(1e8), loss_dtype=np.float32, - binned_feature_dtype=np.uint8, seed=42): +def make_data(n_bins=256, n_samples=int(1e8), seed=42): rng = np.random.RandomState(seed) sample_indices = np.arange(n_samples, dtype=np.uint32) - ordered_gradients = rng.randn(n_samples).astype(loss_dtype) - ordered_hessians = rng.exponential(size=n_samples).astype(loss_dtype) - binned_feature = rng.randint(0, n_bins, size=n_samples, dtype=np.uint8) + ordered_gradients = rng.randn(n_samples).astype(Y_DTYPE) + ordered_hessians = rng.exponential(size=n_samples).astype(Y_DTYPE) + binned_feature = rng.randint(0, n_bins, size=n_samples, dtype=X_BINNED_DTYPE) return sample_indices, binned_feature, ordered_gradients, ordered_hessians @@ -63,7 +65,6 @@ def one_run(sklearn_fun, pygbm_fun): # specal case for subtract... crappy a = pygbm_build_histogram(n_bins, sample_indices, binned_feature, gradients, hessians) b = pygbm_build_histogram(n_bins, sample_indices, binned_feature, gradients, hessians) - histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) args = [n_bins, a, b] tic = time() @@ -71,7 +72,11 @@ def one_run(sklearn_fun, pygbm_fun): pygbm_duration = time() - tic print(f"pygbm: Built in {pygbm_duration:.3f}s") + a = a.astype(HISTOGRAM_DTYPE) + b = b.astype(HISTOGRAM_DTYPE) + args = [n_bins, a, b] tic = time() + histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) args.append(histogram) sklearn_fun(*args) sklearn_duration = time() - tic diff --git a/bench_split_indices.py b/bench_split_indices.py index 709f3bef2f46e..a15612a49b4a2 100644 --- a/bench_split_indices.py +++ b/bench_split_indices.py @@ -4,6 +4,9 @@ import numpy as np import matplotlib.pyplot as plt from sklearn.gbm.types import HISTOGRAM_DTYPE +from sklearn.gbm.types import X_DTYPE +from sklearn.gbm.types import X_BINNED_DTYPE +from sklearn.gbm.types import Y_DTYPE from sklearn.gbm.splitting import SplittingContext from sklearn.gbm.splitting import find_node_split from sklearn.gbm.splitting import split_indices @@ -28,8 +31,8 @@ X_binned_ = rng.randint(0, n_bins, size=(n_samples, n_features), dtype=np.uint8) sample_indices_ = np.arange(n_samples, dtype=np.uint32) -all_gradients_ = rng.randn(n_samples).astype(np.float32) -all_hessians_ = rng.lognormal(size=n_samples).astype(np.float32) +all_gradients_ = rng.randn(n_samples).astype(Y_DTYPE) +all_hessians_ = rng.lognormal(size=n_samples).astype(Y_DTYPE) def one_run(n_samples): @@ -46,6 +49,8 @@ def one_run(n_samples): all_gradients, all_hessians, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split) + all_gradients = all_gradients.astype(np.float32) + all_hessians = all_hessians.astype(np.float32) pygbm_context = SplittingContext_pygbm(X_binned, n_bins, n_bins_per_feature, all_gradients, all_hessians, diff --git a/sklearn/gbm/splitting.pyx 
b/sklearn/gbm/splitting.pyx index 464d4a2ba6988..9b06bf04fdbcf 100644 --- a/sklearn/gbm/splitting.pyx +++ b/sklearn/gbm/splitting.pyx @@ -390,6 +390,12 @@ def find_node_split( # For some reason, we need to use local variables for prange reduction. Y_DTYPE_C sum_gradients = 0. Y_DTYPE_C sum_hessians = 0. + # Also, need local views to avoid python interactions + Y_DTYPE_C [::1] ordered_gradients = context.ordered_gradients + Y_DTYPE_C [::1] gradients = context.gradients + Y_DTYPE_C [::1] ordered_hessians = context.ordered_hessians + Y_DTYPE_C [::1] hessians = context.hessians + with nogil: n_samples = sample_indices.shape[0] @@ -399,28 +405,25 @@ def find_node_split( if sample_indices.shape[0] != context.gradients.shape[0]: if context.constant_hessian: for i in prange(n_samples, schedule='static'): - context.ordered_gradients[i] = \ - context.gradients[sample_indices[i]] + ordered_gradients[i] = gradients[sample_indices[i]] else: for i in prange(n_samples, schedule='static'): - context.ordered_gradients[i] = \ - context.gradients[sample_indices[i]] - context.ordered_hessians[i] = \ - context.hessians[sample_indices[i]] + ordered_gradients[i] = gradients[sample_indices[i]] + ordered_hessians[i] = hessians[sample_indices[i]] # Compute context.sum_gradients and context.sum_hessians - for i in prange(n_samples, schedule='static'): - sum_gradients += context.ordered_gradients[i] + # for i in prange(n_samples, schedule='static'): + for i in range(n_samples): + sum_gradients += ordered_gradients[i] context.sum_gradients = sum_gradients if context.constant_hessian: sum_hessians = context.constant_hessian_value * n_samples else: for i in prange(n_samples, schedule='static'): - sum_hessians += context.ordered_hessians[i] + sum_hessians += ordered_hessians[i] context.sum_hessians = sum_hessians - # TODO: this needs to be freed at some point split_infos = malloc( context.n_features * sizeof(split_info_struct)) for feature_idx in prange(context.n_features): @@ -430,7 +433,7 @@ def find_node_split( split_info = _find_best_feature_to_split_helper(context, split_infos) - return SplitInfo( + out = SplitInfo( split_info.gain, split_info.feature_idx, split_info.bin_idx, @@ -441,7 +444,8 @@ def find_node_split( split_info.n_samples_left, split_info.n_samples_right, ) - + free(split_infos) + return out def find_node_split_subtraction( SplittingContext context, @@ -503,7 +507,6 @@ def find_node_split_subtraction( context.sum_gradients = sum_gradients context.sum_hessians = sum_hessians - # TODO: this needs to be freed at some point split_infos = malloc( context.n_features * sizeof(split_info_struct)) for feature_idx in prange(context.n_features): @@ -515,7 +518,7 @@ def find_node_split_subtraction( split_info = _find_best_feature_to_split_helper(context, split_infos) - return SplitInfo( + out = SplitInfo( split_info.gain, split_info.feature_idx, split_info.bin_idx, @@ -526,6 +529,8 @@ def find_node_split_subtraction( split_info.n_samples_left, split_info.n_samples_right, ) + free(split_infos) + return out cdef split_info_struct _find_best_feature_to_split_helper( From dac76a130b67fe837dffcdfb017e0e7988fd7803 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 16 Jan 2019 15:59:15 -0500 Subject: [PATCH 039/247] split_indices is now a method --- sklearn/gbm/grower.py | 6 +- sklearn/gbm/splitting.pyx | 300 ++++++++++++++-------------- sklearn/gbm/tests/test_splitting.py | 13 +- 3 files changed, 157 insertions(+), 162 deletions(-) diff --git a/sklearn/gbm/grower.py b/sklearn/gbm/grower.py index 
c4ead962c9a77..07f37b8436ca4 100644 --- a/sklearn/gbm/grower.py +++ b/sklearn/gbm/grower.py @@ -8,7 +8,7 @@ import numpy as np from time import time -from .splitting import (SplittingContext, split_indices, find_node_split, +from .splitting import (SplittingContext, find_node_split, find_node_split_subtraction, SplitInfo) from .predictor import TreePredictor, PREDICTOR_RECORD_DTYPE @@ -365,8 +365,8 @@ def split_next(self): node = heappop(self.splittable_nodes) tic = time() - (sample_indices_left, sample_indices_right, i) = split_indices( - self.splitting_context, node.split_info, node.sample_indices) + (sample_indices_left, sample_indices_right, i) = self.splitting_context.split_indices( + node.split_info, node.sample_indices) toc = time() node.apply_split_time = toc - tic self.total_apply_split_time += node.apply_split_time diff --git a/sklearn/gbm/splitting.pyx b/sklearn/gbm/splitting.pyx index 9b06bf04fdbcf..33b873f216ff4 100644 --- a/sklearn/gbm/splitting.pyx +++ b/sklearn/gbm/splitting.pyx @@ -197,158 +197,154 @@ cdef class SplittingContext: self.left_indices_buffer = np.empty_like(self.partition) self.right_indices_buffer = np.empty_like(self.partition) -def split_indices( - SplittingContext context, - SplitInfo split_info, - unsigned int [::1] sample_indices): - """Split samples into left and right arrays. - - The split is performed according to the best possible split (split_info). - - Ultimately, this is nothing but a partition of the sample_indices array - with a given pivot, exactly like a quicksort subroutine. - - Parameters - ---------- - context : SplittingContext - The splitting context - split_info : SplitInfo - The SplitInfo of the node to split - sample_indices : array of unsigned int - The indices of the samples at the node to split. This is a view on - context.partition, and it is modified inplace by placing the indices - of the left child at the beginning, and the indices of the right child - at the end. - - Returns - ------- - left_indices : array of int - The indices of the samples in the left child. This is a view on - context.partition. - right_indices : array of int - The indices of the samples in the right child. This is a view on - context.partition. - right_child_position : int - The position of the right child in ``sample_indices`` - """ - # This is a multi-threaded implementation inspired by lightgbm. - # Here is a quick break down. Let's suppose we want to split a node with - # 24 samples named from a to x. context.partition looks like this (the * - # are indices in other leaves that we don't care about): - # partition = [*************abcdefghijklmnopqrstuvwx****************] - # ^ ^ - # node_position node_position + node.n_samples - - # Ultimately, we want to reorder the samples inside the boundaries of the - # leaf (which becomes a node) to now represent the samples in its left and - # right child. For example: - # partition = [*************abefilmnopqrtuxcdghjksvw*****************] - # ^ ^ - # left_child_pos right_child_pos - # Note that left_child_pos always takes the value of node_position, and - # right_child_pos = left_child_pos + left_child.n_samples. The order of - # the samples inside a leaf is irrelevant. - - # 1. samples_indices is a view on this region a..x. We conceptually - # divide it into n_threads regions. Each thread will be responsible for - # its own region. Here is an example with 4 threads: - # samples_indices = [abcdef|ghijkl|mnopqr|stuvwx] - # 2. 
Each thread processes 6 = 24 // 4 entries and maps them into - # left_indices_buffer or right_indices_buffer. For example, we could - # have the following mapping ('.' denotes an undefined entry): - # - left_indices_buffer = [abef..|il....|mnopqr|tux...] - # - right_indices_buffer = [cd....|ghjk..|......|svw...] - # 3. We keep track of the start positions of the regions (the '|') in - # ``offset_in_buffers`` as well as the size of each region. We also keep - # track of the number of samples put into the left/right child by each - # thread. Concretely: - # - left_counts = [4, 2, 6, 3] - # - right_counts = [2, 4, 0, 3] - # 4. Finally, we put left/right_indices_buffer back into the - # samples_indices, without any undefined entries and the partition looks - # as expected - # partition = [*************abefilmnopqrtuxcdghjksvw*****************] - - # Note: We here show left/right_indices_buffer as being the same size as - # sample_indices for simplicity, but in reality they are of the same size - # as partition. - - cdef: - int n_samples = sample_indices.shape[0] - const X_BINNED_DTYPE_C [::1] X_binned = context.X_binned[:, split_info.feature_idx] - unsigned int [::1] left_indices_buffer = context.left_indices_buffer - unsigned int [::1] right_indices_buffer = context.right_indices_buffer - int n_threads = omp_get_max_threads() - int [:] sizes = np.full(n_threads, n_samples // n_threads, dtype=np.int32) - int [:] offset_in_buffers = np.zeros(n_threads, dtype=np.int32) - int [:] left_counts = np.empty(n_threads, dtype=np.int32) - int [:] right_counts = np.empty(n_threads, dtype=np.int32) - int left_count - int right_count - int start - int stop - int i - int thread_idx - int sample_idx - int right_child_position - int [:] left_offset = np.zeros(n_threads, dtype=np.int32) - int [:] right_offset = np.zeros(n_threads, dtype=np.int32) - - with nogil: - for thread_idx in range(n_samples % n_threads): - sizes[thread_idx] += 1 - - for thread_idx in range(1, n_threads): - offset_in_buffers[thread_idx] = \ - offset_in_buffers[thread_idx - 1] + sizes[thread_idx - 1] - - # map indices from samples_indices to left/right_indices_buffer - for thread_idx in prange(n_threads): - left_count = 0 - right_count = 0 - - start = offset_in_buffers[thread_idx] - stop = start + sizes[thread_idx] - for i in range(start, stop): - sample_idx = sample_indices[i] - if X_binned[sample_idx] <= split_info.bin_idx: - left_indices_buffer[start + left_count] = sample_idx - left_count = left_count + 1 - else: - right_indices_buffer[start + right_count] = sample_idx - right_count = right_count + 1 - - left_counts[thread_idx] = left_count - right_counts[thread_idx] = right_count - - # position of right child = just after the left child - right_child_position = 0 - for thread_idx in range(n_threads): - right_child_position += left_counts[thread_idx] - - # offset of each thread in samples_indices for left and right child, i.e. - # where each thread will start to write. - right_offset[0] = right_child_position - for thread_idx in range(1, n_threads): - left_offset[thread_idx] = \ - left_offset[thread_idx - 1] + left_counts[thread_idx - 1] - right_offset[thread_idx] = \ - right_offset[thread_idx - 1] + right_counts[thread_idx - 1] - - # map indices in left/right_indices_buffer back into samples_indices. This - # also updates context.partition since samples_indice is a view. 
- for thread_idx in prange(n_threads): - - for i in range(left_counts[thread_idx]): - sample_indices[left_offset[thread_idx] + i] = \ - left_indices_buffer[offset_in_buffers[thread_idx] + i] - for i in range(right_counts[thread_idx]): - sample_indices[right_offset[thread_idx] + i] = \ - right_indices_buffer[offset_in_buffers[thread_idx] + i] - - return (sample_indices[:right_child_position], - sample_indices[right_child_position:], - right_child_position) + def split_indices(self, SplitInfo split_info, unsigned int [::1] + sample_indices): + """Split samples into left and right arrays. + + The split is performed according to the best possible split (split_info). + + Ultimately, this is nothing but a partition of the sample_indices array + with a given pivot, exactly like a quicksort subroutine. + + Parameters + ---------- + split_info : SplitInfo + The SplitInfo of the node to split + sample_indices : array of unsigned int + The indices of the samples at the node to split. This is a view on + self.partition, and it is modified inplace by placing the indices + of the left child at the beginning, and the indices of the right child + at the end. + + Returns + ------- + left_indices : array of int + The indices of the samples in the left child. This is a view on + self.partition. + right_indices : array of int + The indices of the samples in the right child. This is a view on + self.partition. + right_child_position : int + The position of the right child in ``sample_indices`` + """ + # This is a multi-threaded implementation inspired by lightgbm. + # Here is a quick break down. Let's suppose we want to split a node with + # 24 samples named from a to x. self.partition looks like this (the * + # are indices in other leaves that we don't care about): + # partition = [*************abcdefghijklmnopqrstuvwx****************] + # ^ ^ + # node_position node_position + node.n_samples + + # Ultimately, we want to reorder the samples inside the boundaries of the + # leaf (which becomes a node) to now represent the samples in its left and + # right child. For example: + # partition = [*************abefilmnopqrtuxcdghjksvw*****************] + # ^ ^ + # left_child_pos right_child_pos + # Note that left_child_pos always takes the value of node_position, and + # right_child_pos = left_child_pos + left_child.n_samples. The order of + # the samples inside a leaf is irrelevant. + + # 1. samples_indices is a view on this region a..x. We conceptually + # divide it into n_threads regions. Each thread will be responsible for + # its own region. Here is an example with 4 threads: + # samples_indices = [abcdef|ghijkl|mnopqr|stuvwx] + # 2. Each thread processes 6 = 24 // 4 entries and maps them into + # left_indices_buffer or right_indices_buffer. For example, we could + # have the following mapping ('.' denotes an undefined entry): + # - left_indices_buffer = [abef..|il....|mnopqr|tux...] + # - right_indices_buffer = [cd....|ghjk..|......|svw...] + # 3. We keep track of the start positions of the regions (the '|') in + # ``offset_in_buffers`` as well as the size of each region. We also keep + # track of the number of samples put into the left/right child by each + # thread. Concretely: + # - left_counts = [4, 2, 6, 3] + # - right_counts = [2, 4, 0, 3] + # 4. 
Finally, we put left/right_indices_buffer back into the + # samples_indices, without any undefined entries and the partition looks + # as expected + # partition = [*************abefilmnopqrtuxcdghjksvw*****************] + + # Note: We here show left/right_indices_buffer as being the same size as + # sample_indices for simplicity, but in reality they are of the same size + # as partition. + + cdef: + int n_samples = sample_indices.shape[0] + const X_BINNED_DTYPE_C [::1] X_binned = self.X_binned[:, split_info.feature_idx] + unsigned int [::1] left_indices_buffer = self.left_indices_buffer + unsigned int [::1] right_indices_buffer = self.right_indices_buffer + int n_threads = omp_get_max_threads() + int [:] sizes = np.full(n_threads, n_samples // n_threads, dtype=np.int32) + int [:] offset_in_buffers = np.zeros(n_threads, dtype=np.int32) + int [:] left_counts = np.empty(n_threads, dtype=np.int32) + int [:] right_counts = np.empty(n_threads, dtype=np.int32) + int left_count + int right_count + int start + int stop + int i + int thread_idx + int sample_idx + int right_child_position + int [:] left_offset = np.zeros(n_threads, dtype=np.int32) + int [:] right_offset = np.zeros(n_threads, dtype=np.int32) + + with nogil: + for thread_idx in range(n_samples % n_threads): + sizes[thread_idx] += 1 + + for thread_idx in range(1, n_threads): + offset_in_buffers[thread_idx] = \ + offset_in_buffers[thread_idx - 1] + sizes[thread_idx - 1] + + # map indices from samples_indices to left/right_indices_buffer + for thread_idx in prange(n_threads): + left_count = 0 + right_count = 0 + + start = offset_in_buffers[thread_idx] + stop = start + sizes[thread_idx] + for i in range(start, stop): + sample_idx = sample_indices[i] + if X_binned[sample_idx] <= split_info.bin_idx: + left_indices_buffer[start + left_count] = sample_idx + left_count = left_count + 1 + else: + right_indices_buffer[start + right_count] = sample_idx + right_count = right_count + 1 + + left_counts[thread_idx] = left_count + right_counts[thread_idx] = right_count + + # position of right child = just after the left child + right_child_position = 0 + for thread_idx in range(n_threads): + right_child_position += left_counts[thread_idx] + + # offset of each thread in samples_indices for left and right child, i.e. + # where each thread will start to write. + right_offset[0] = right_child_position + for thread_idx in range(1, n_threads): + left_offset[thread_idx] = \ + left_offset[thread_idx - 1] + left_counts[thread_idx - 1] + right_offset[thread_idx] = \ + right_offset[thread_idx - 1] + right_counts[thread_idx - 1] + + # map indices in left/right_indices_buffer back into samples_indices. This + # also updates self.partition since samples_indice is a view. 
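A standalone sketch may help here. The following pure-NumPy function is an illustration only, not part of the patch: the function name, the use of np.array_split, and the sequential chunk loop are assumptions standing in for the OpenMP threads. It mimics the buffer-based partition described in the comments above: each chunk of sample_indices plays the role of one thread, samples are routed into per-chunk left/right buffers, and the buffers are concatenated back so that all left-child samples come before all right-child samples.

    import numpy as np

    def split_indices_sketch(X_binned_col, sample_indices, bin_idx, n_chunks=4):
        # Each chunk stands in for one thread's region of sample_indices.
        chunks = np.array_split(sample_indices, n_chunks)
        # Route each chunk into its own left/right buffer (steps 1-2 above).
        left_parts = [c[X_binned_col[c] <= bin_idx] for c in chunks]
        right_parts = [c[X_binned_col[c] > bin_idx] for c in chunks]
        # Concatenate the buffers back (steps 3-4): left-child samples first.
        left = np.concatenate(left_parts)
        right = np.concatenate(right_parts)
        return left, right, left.shape[0]

    # Example: X_binned_col = [0, 3, 1, 5, 2], sample_indices = [0..4], bin_idx = 2
    #   -> left indices [0, 2, 4], right indices [1, 3], right child position 3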
+ for thread_idx in prange(n_threads): + + for i in range(left_counts[thread_idx]): + sample_indices[left_offset[thread_idx] + i] = \ + left_indices_buffer[offset_in_buffers[thread_idx] + i] + for i in range(right_counts[thread_idx]): + sample_indices[right_offset[thread_idx] + i] = \ + right_indices_buffer[offset_in_buffers[thread_idx] + i] + + return (sample_indices[:right_child_position], + sample_indices[right_child_position:], + right_child_position) def find_node_split( diff --git a/sklearn/gbm/tests/test_splitting.py b/sklearn/gbm/tests/test_splitting.py index ff37223f26ad2..a3f0114dbca54 100644 --- a/sklearn/gbm/tests/test_splitting.py +++ b/sklearn/gbm/tests/test_splitting.py @@ -8,7 +8,6 @@ from sklearn.gbm.types import X_BINNED_DTYPE from sklearn.gbm.splitting import SplittingContext from sklearn.gbm.splitting import find_node_split -from sklearn.gbm.splitting import split_indices from sklearn.gbm.splitting import find_node_split_subtraction from sklearn.gbm.splitting import _find_histogram_split_wrapper @@ -101,8 +100,8 @@ def test_split_vs_split_subtraction(constant_hessian): # first split parent, left and right with classical method si_parent = find_node_split(context, sample_indices, hists_parent) - sample_indices_left, sample_indices_right, _ = split_indices( - context, si_parent, sample_indices) + sample_indices_left, sample_indices_right, _ = context.split_indices( + si_parent, sample_indices) si_left = find_node_split(context, sample_indices_left, hists_left) si_right = find_node_split(context, sample_indices_right, hists_right) @@ -187,8 +186,8 @@ def test_gradient_and_hessian_sanity(constant_hessian): # first split parent, left and right with classical method si_parent = find_node_split(context, sample_indices, hists_parent) - sample_indices_left, sample_indices_right, _ = split_indices( - context, si_parent, sample_indices) + sample_indices_left, sample_indices_right, _ = context.split_indices( + si_parent, sample_indices) si_left = find_node_split(context, sample_indices_left, hists_left) si_right = find_node_split(context, sample_indices_right, hists_right) @@ -291,8 +290,8 @@ def test_split_indices(): assert si_root.feature_idx == 1 assert si_root.bin_idx == 3 - samples_left, samples_right, position_right = split_indices( - context, si_root, context.partition) + samples_left, samples_right, position_right = context.split_indices( + si_root, context.partition) assert set(samples_left) == set([0, 1, 3, 4, 5, 6, 8]) assert set(samples_right) == set([2, 7, 9]) From 3614a7e9ab08789b710726ff6a655fb4f171704a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 16 Jan 2019 16:02:50 -0500 Subject: [PATCH 040/247] find_node_split is now a method --- sklearn/gbm/grower.py | 6 +- sklearn/gbm/splitting.pyx | 176 ++++++++++++++-------------- sklearn/gbm/tests/test_splitting.py | 15 ++- 3 files changed, 98 insertions(+), 99 deletions(-) diff --git a/sklearn/gbm/grower.py b/sklearn/gbm/grower.py index 07f37b8436ca4..88e00cecc00c0 100644 --- a/sklearn/gbm/grower.py +++ b/sklearn/gbm/grower.py @@ -8,7 +8,7 @@ import numpy as np from time import time -from .splitting import (SplittingContext, find_node_split, +from .splitting import (SplittingContext, find_node_split_subtraction, SplitInfo) from .predictor import TreePredictor, PREDICTOR_RECORD_DTYPE @@ -324,8 +324,8 @@ def _compute_spittability(self, node, only_hist=False): sum_gradients, sum_hessians, node.parent.histograms, node.sibling.histograms, histograms) else: - split_info = find_node_split( - self.splitting_context, 
node.sample_indices, histograms) + split_info = self.splitting_context.find_node_split( + node.sample_indices, histograms) toc = time() node.find_split_time = toc - tic self.total_find_split_time += node.find_split_time diff --git a/sklearn/gbm/splitting.pyx b/sklearn/gbm/splitting.pyx index 33b873f216ff4..b848879fcc6c9 100644 --- a/sklearn/gbm/splitting.pyx +++ b/sklearn/gbm/splitting.pyx @@ -347,101 +347,101 @@ cdef class SplittingContext: right_child_position) -def find_node_split( - SplittingContext context, - const unsigned int [::1] sample_indices, # IN - hist_struct [:, ::1] histograms): # OUT - """For each feature, find the best bin to split on at a given node. + def find_node_split( + self, + const unsigned int [::1] sample_indices, # IN + hist_struct [:, ::1] histograms): # OUT + """For each feature, find the best bin to split on at a given node. - Returns the best split info among all features, and the histograms of - all the features. The histograms are computed by scanning the whole - data. + Returns the best split info among all features, and the histograms of + all the features. The histograms are computed by scanning the whole + data. - Parameters - ---------- - context : SplittingContext - The splitting context - sample_indices : array of int - The indices of the samples at the node to split. - - Returns - ------- - best_split_info : SplitInfo - The info about the best possible split among all features. - histograms : array of HISTOGRAM_DTYPE, shape=(n_features, max_bins) - The histograms of each feature. A histogram is an array of - HISTOGRAM_DTYPE of size ``max_bins`` (only - ``n_bins_per_features[feature]`` entries are relevant). - """ - cdef: - unsigned int n_samples - int feature_idx - int i - unsigned int thread_idx - unsigned int [:] starts - unsigned int [:] ends - unsigned int n_threads - split_info_struct split_info - split_info_struct * split_infos - # For some reason, we need to use local variables for prange reduction. - Y_DTYPE_C sum_gradients = 0. - Y_DTYPE_C sum_hessians = 0. - # Also, need local views to avoid python interactions - Y_DTYPE_C [::1] ordered_gradients = context.ordered_gradients - Y_DTYPE_C [::1] gradients = context.gradients - Y_DTYPE_C [::1] ordered_hessians = context.ordered_hessians - Y_DTYPE_C [::1] hessians = context.hessians + Parameters + ---------- + self : SplittingContext + The splitting self + sample_indices : array of int + The indices of the samples at the node to split. + Returns + ------- + best_split_info : SplitInfo + The info about the best possible split among all features. + histograms : array of HISTOGRAM_DTYPE, shape=(n_features, max_bins) + The histograms of each feature. A histogram is an array of + HISTOGRAM_DTYPE of size ``max_bins`` (only + ``n_bins_per_features[feature]`` entries are relevant). + """ + cdef: + unsigned int n_samples + int feature_idx + int i + unsigned int thread_idx + unsigned int [:] starts + unsigned int [:] ends + unsigned int n_threads + split_info_struct split_info + split_info_struct * split_infos + # For some reason, we need to use local variables for prange reduction. + Y_DTYPE_C sum_gradients = 0. + Y_DTYPE_C sum_hessians = 0. 
+ # Also, need local views to avoid python interactions + Y_DTYPE_C [::1] ordered_gradients = self.ordered_gradients + Y_DTYPE_C [::1] gradients = self.gradients + Y_DTYPE_C [::1] ordered_hessians = self.ordered_hessians + Y_DTYPE_C [::1] hessians = self.hessians - with nogil: - n_samples = sample_indices.shape[0] - # Populate ordered_gradients and ordered_hessians. (Already done for root) - # Ordering the gradients and hessians helps to improve cache hit. - if sample_indices.shape[0] != context.gradients.shape[0]: - if context.constant_hessian: - for i in prange(n_samples, schedule='static'): - ordered_gradients[i] = gradients[sample_indices[i]] + with nogil: + n_samples = sample_indices.shape[0] + + # Populate ordered_gradients and ordered_hessians. (Already done for root) + # Ordering the gradients and hessians helps to improve cache hit. + if sample_indices.shape[0] != self.gradients.shape[0]: + if self.constant_hessian: + for i in prange(n_samples, schedule='static'): + ordered_gradients[i] = gradients[sample_indices[i]] + else: + for i in prange(n_samples, schedule='static'): + ordered_gradients[i] = gradients[sample_indices[i]] + ordered_hessians[i] = hessians[sample_indices[i]] + + # Compute self.sum_gradients and self.sum_hessians + # for i in prange(n_samples, schedule='static'): + for i in range(n_samples): + sum_gradients += ordered_gradients[i] + self.sum_gradients = sum_gradients + + if self.constant_hessian: + sum_hessians = self.constant_hessian_value * n_samples else: for i in prange(n_samples, schedule='static'): - ordered_gradients[i] = gradients[sample_indices[i]] - ordered_hessians[i] = hessians[sample_indices[i]] - - # Compute context.sum_gradients and context.sum_hessians - # for i in prange(n_samples, schedule='static'): - for i in range(n_samples): - sum_gradients += ordered_gradients[i] - context.sum_gradients = sum_gradients - - if context.constant_hessian: - sum_hessians = context.constant_hessian_value * n_samples - else: - for i in prange(n_samples, schedule='static'): - sum_hessians += ordered_hessians[i] - context.sum_hessians = sum_hessians - - split_infos = malloc( - context.n_features * sizeof(split_info_struct)) - for feature_idx in prange(context.n_features): - split_info = _find_histogram_split( - context, feature_idx, sample_indices, histograms[feature_idx]) - split_infos[feature_idx] = split_info - - split_info = _find_best_feature_to_split_helper(context, split_infos) - - out = SplitInfo( - split_info.gain, - split_info.feature_idx, - split_info.bin_idx, - split_info.gradient_left, - split_info.hessian_left, - split_info.gradient_right, - split_info.hessian_right, - split_info.n_samples_left, - split_info.n_samples_right, - ) - free(split_infos) - return out + sum_hessians += ordered_hessians[i] + self.sum_hessians = sum_hessians + + split_infos = malloc( + self.n_features * sizeof(split_info_struct)) + for feature_idx in prange(self.n_features): + split_info = _find_histogram_split( + self, feature_idx, sample_indices, histograms[feature_idx]) + split_infos[feature_idx] = split_info + + split_info = _find_best_feature_to_split_helper(self, split_infos) + + out = SplitInfo( + split_info.gain, + split_info.feature_idx, + split_info.bin_idx, + split_info.gradient_left, + split_info.hessian_left, + split_info.gradient_right, + split_info.hessian_right, + split_info.n_samples_left, + split_info.n_samples_right, + ) + free(split_infos) + return out def find_node_split_subtraction( SplittingContext context, diff --git a/sklearn/gbm/tests/test_splitting.py 
b/sklearn/gbm/tests/test_splitting.py index a3f0114dbca54..0db8ed6dd3f39 100644 --- a/sklearn/gbm/tests/test_splitting.py +++ b/sklearn/gbm/tests/test_splitting.py @@ -7,7 +7,6 @@ from sklearn.gbm.types import Y_DTYPE from sklearn.gbm.types import X_BINNED_DTYPE from sklearn.gbm.splitting import SplittingContext -from sklearn.gbm.splitting import find_node_split from sklearn.gbm.splitting import find_node_split_subtraction from sklearn.gbm.splitting import _find_histogram_split_wrapper @@ -99,11 +98,11 @@ def test_split_vs_split_subtraction(constant_hessian): hists_right_sub = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) # first split parent, left and right with classical method - si_parent = find_node_split(context, sample_indices, hists_parent) + si_parent = context.find_node_split(sample_indices, hists_parent) sample_indices_left, sample_indices_right, _ = context.split_indices( si_parent, sample_indices) - si_left = find_node_split(context, sample_indices_left, hists_left) - si_right = find_node_split(context, sample_indices_right, hists_right) + si_left = context.find_node_split(sample_indices_left, hists_left) + si_right = context.find_node_split(sample_indices_right, hists_right) # split left with subtraction method si_left_sub = find_node_split_subtraction( @@ -185,12 +184,12 @@ def test_gradient_and_hessian_sanity(constant_hessian): hists_right_sub = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) # first split parent, left and right with classical method - si_parent = find_node_split(context, sample_indices, hists_parent) + si_parent = context.find_node_split(sample_indices, hists_parent) sample_indices_left, sample_indices_right, _ = context.split_indices( si_parent, sample_indices) - si_left = find_node_split(context, sample_indices_left, hists_left) - si_right = find_node_split(context, sample_indices_right, hists_right) + si_left = context.find_node_split(sample_indices_left, hists_left) + si_right = context.find_node_split(sample_indices_right, hists_right) # split left with subtraction method si_left_sub = find_node_split_subtraction( @@ -284,7 +283,7 @@ def test_split_indices(): assert_array_almost_equal(sample_indices, context.partition) histograms = np.zeros(shape=(2, n_bins), dtype=HISTOGRAM_DTYPE) - si_root = find_node_split(context, sample_indices, histograms) + si_root = context.find_node_split(sample_indices, histograms) # sanity checks for best split assert si_root.feature_idx == 1 From 8e8b92703f71fc598b3f7c38837244bf0caa0ba7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 16 Jan 2019 16:08:00 -0500 Subject: [PATCH 041/247] find_node_split_subtraction is now a method --- sklearn/gbm/grower.py | 7 +- sklearn/gbm/splitting.pyx | 152 ++++++++++++++-------------- sklearn/gbm/tests/test_splitting.py | 17 ++-- 3 files changed, 87 insertions(+), 89 deletions(-) diff --git a/sklearn/gbm/grower.py b/sklearn/gbm/grower.py index 88e00cecc00c0..b4c62a6e45b41 100644 --- a/sklearn/gbm/grower.py +++ b/sklearn/gbm/grower.py @@ -8,8 +8,7 @@ import numpy as np from time import time -from .splitting import (SplittingContext, - find_node_split_subtraction, SplitInfo) +from .splitting import SplittingContext, SplitInfo from .predictor import TreePredictor, PREDICTOR_RECORD_DTYPE from .types import HISTOGRAM_DTYPE @@ -319,8 +318,8 @@ def _compute_spittability(self, node, only_hist=False): else: sum_gradients = node.parent.split_info.gradient_left sum_hessians = node.parent.split_info.hessian_left - split_info = find_node_split_subtraction( - 
self.splitting_context, node.sample_indices, + split_info = self.splitting_context.find_node_split_subtraction( + node.sample_indices, sum_gradients, sum_hessians, node.parent.histograms, node.sibling.histograms, histograms) else: diff --git a/sklearn/gbm/splitting.pyx b/sklearn/gbm/splitting.pyx index b848879fcc6c9..a17dc179fa9e6 100644 --- a/sklearn/gbm/splitting.pyx +++ b/sklearn/gbm/splitting.pyx @@ -443,94 +443,94 @@ cdef class SplittingContext: free(split_infos) return out -def find_node_split_subtraction( - SplittingContext context, - unsigned int [::1] sample_indices, # IN - Y_DTYPE_C sum_gradients, - Y_DTYPE_C sum_hessians, - hist_struct [:, ::1] parent_histograms, # IN - hist_struct [:, ::1] sibling_histograms, # IN - hist_struct [:, ::1] histograms): # OUT - """For each feature, find the best bin to split on at a given node. + def find_node_split_subtraction( + SplittingContext self, + unsigned int [::1] sample_indices, # IN + Y_DTYPE_C sum_gradients, + Y_DTYPE_C sum_hessians, + hist_struct [:, ::1] parent_histograms, # IN + hist_struct [:, ::1] sibling_histograms, # IN + hist_struct [:, ::1] histograms): # OUT + """For each feature, find the best bin to split on at a given node. - Returns the best split info among all features, and the histograms of - all the features. + Returns the best split info among all features, and the histograms of + all the features. - This does the same job as ``find_node_split()`` but uses the histograms - of the parent and sibling of the node to split. This allows to use the - identity: ``histogram(parent) = histogram(node) - histogram(sibling)``, - which is significantly faster than computing the histograms from data. + This does the same job as ``find_node_split()`` but uses the histograms + of the parent and sibling of the node to split. This allows to use the + identity: ``histogram(parent) = histogram(node) - histogram(sibling)``, + which is significantly faster than computing the histograms from data. - Returns the best SplitInfo among all features, along with all the feature - histograms that can be latter used to compute the sibling or children - histograms by substraction. + Returns the best SplitInfo among all features, along with all the feature + histograms that can be latter used to compute the sibling or children + histograms by substraction. - Parameters - ---------- - context : SplittingContext - The splitting context - sample_indices : array of int - The indices of the samples at the node to split. - parent_histograms : array of HISTOGRAM_DTYPE of shape(n_features, max_bins) - The histograms of the parent - sibling_histograms : array of HISTOGRAM_DTYPE of \ - shape(n_features, max_bins) - The histograms of the sibling - histograms : array of HISTOGRAM_DTYPE of \ - shape(n_features, max_bins) - The computed histograms - - Returns - ------- - best_split_info : SplitInfo - The info about the best possible split among all features. - histograms : array of HISTOGRAM_DTYPE, shape=(n_features, max_bins) - The histograms of each feature. A histogram is an array of - HISTOGRAM_DTYPE of size ``max_bins`` (only - ``n_bins_per_features[feature]`` entries are relevant). - """ + Parameters + ---------- + self : SplittingContext + The splitting self + sample_indices : array of int + The indices of the samples at the node to split. 
+ parent_histograms : array of HISTOGRAM_DTYPE of shape(n_features, max_bins) + The histograms of the parent + sibling_histograms : array of HISTOGRAM_DTYPE of \ + shape(n_features, max_bins) + The histograms of the sibling + histograms : array of HISTOGRAM_DTYPE of \ + shape(n_features, max_bins) + The computed histograms - cdef: - int feature_idx - unsigned int n_samples - split_info_struct split_info - split_info_struct * split_infos - int i + Returns + ------- + best_split_info : SplitInfo + The info about the best possible split among all features. + histograms : array of HISTOGRAM_DTYPE, shape=(n_features, max_bins) + The histograms of each feature. A histogram is an array of + HISTOGRAM_DTYPE of size ``max_bins`` (only + ``n_bins_per_features[feature]`` entries are relevant). + """ + + cdef: + int feature_idx + unsigned int n_samples + split_info_struct split_info + split_info_struct * split_infos + int i - with nogil: - n_samples = sample_indices.shape[0] + with nogil: + n_samples = sample_indices.shape[0] - context.sum_gradients = sum_gradients - context.sum_hessians = sum_hessians + self.sum_gradients = sum_gradients + self.sum_hessians = sum_hessians - split_infos = malloc( - context.n_features * sizeof(split_info_struct)) - for feature_idx in prange(context.n_features): - split_info = _find_histogram_split_subtraction( - context, feature_idx, parent_histograms[feature_idx], - sibling_histograms[feature_idx], histograms[feature_idx], - n_samples) - split_infos[feature_idx] = split_info + split_infos = malloc( + self.n_features * sizeof(split_info_struct)) + for feature_idx in prange(self.n_features): + split_info = _find_histogram_split_subtraction( + self, feature_idx, parent_histograms[feature_idx], + sibling_histograms[feature_idx], histograms[feature_idx], + n_samples) + split_infos[feature_idx] = split_info - split_info = _find_best_feature_to_split_helper(context, split_infos) + split_info = _find_best_feature_to_split_helper(self, split_infos) - out = SplitInfo( - split_info.gain, - split_info.feature_idx, - split_info.bin_idx, - split_info.gradient_left, - split_info.hessian_left, - split_info.gradient_right, - split_info.hessian_right, - split_info.n_samples_left, - split_info.n_samples_right, - ) - free(split_infos) - return out + out = SplitInfo( + split_info.gain, + split_info.feature_idx, + split_info.bin_idx, + split_info.gradient_left, + split_info.hessian_left, + split_info.gradient_right, + split_info.hessian_right, + split_info.n_samples_left, + split_info.n_samples_right, + ) + free(split_infos) + return out cdef split_info_struct _find_best_feature_to_split_helper( - SplittingContext context, + SplittingContext self, split_info_struct * split_infos # IN ) nogil: cdef: @@ -541,7 +541,7 @@ cdef split_info_struct _find_best_feature_to_split_helper( unsigned int feature_idx best_gain = -1. 
- for feature_idx in range(context.n_features): + for feature_idx in range(self.n_features): split_info = split_infos[feature_idx] gain = split_info.gain if best_gain == -1 or gain > best_gain: diff --git a/sklearn/gbm/tests/test_splitting.py b/sklearn/gbm/tests/test_splitting.py index 0db8ed6dd3f39..899150fdff67d 100644 --- a/sklearn/gbm/tests/test_splitting.py +++ b/sklearn/gbm/tests/test_splitting.py @@ -7,7 +7,6 @@ from sklearn.gbm.types import Y_DTYPE from sklearn.gbm.types import X_BINNED_DTYPE from sklearn.gbm.splitting import SplittingContext -from sklearn.gbm.splitting import find_node_split_subtraction from sklearn.gbm.splitting import _find_histogram_split_wrapper @@ -105,13 +104,13 @@ def test_split_vs_split_subtraction(constant_hessian): si_right = context.find_node_split(sample_indices_right, hists_right) # split left with subtraction method - si_left_sub = find_node_split_subtraction( - context, sample_indices_left, si_parent.gradient_left, + si_left_sub = context.find_node_split_subtraction( + sample_indices_left, si_parent.gradient_left, si_parent.hessian_left, hists_parent, hists_right, hists_left_sub) # split right with subtraction method - si_right_sub = find_node_split_subtraction( - context, sample_indices_right, si_parent.gradient_right, + si_right_sub = context.find_node_split_subtraction( + sample_indices_right, si_parent.gradient_right, si_parent.hessian_right, hists_parent, hists_left, hists_right_sub) # make sure histograms from classical and subtraction method are the same @@ -192,13 +191,13 @@ def test_gradient_and_hessian_sanity(constant_hessian): si_right = context.find_node_split(sample_indices_right, hists_right) # split left with subtraction method - si_left_sub = find_node_split_subtraction( - context, sample_indices_left, si_parent.gradient_left, + si_left_sub = context.find_node_split_subtraction( + sample_indices_left, si_parent.gradient_left, si_parent.hessian_left, hists_parent, hists_right, hists_left_sub) # split right with subtraction method - si_right_sub = find_node_split_subtraction( - context, sample_indices_right, si_parent.gradient_right, + si_right_sub = context.find_node_split_subtraction( + sample_indices_right, si_parent.gradient_right, si_parent.hessian_right, hists_parent, hists_left, hists_right_sub) # make sure that si.gradient_left + si.gradient_right have their expected From 1fac60a2d42d725c1a8fd568de2d41a52d863a73 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 16 Jan 2019 16:12:03 -0500 Subject: [PATCH 042/247] find_node_split_subtraction is now a method --- sklearn/gbm/splitting.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/gbm/splitting.pyx b/sklearn/gbm/splitting.pyx index a17dc179fa9e6..afc0becaa0e2e 100644 --- a/sklearn/gbm/splitting.pyx +++ b/sklearn/gbm/splitting.pyx @@ -408,8 +408,7 @@ cdef class SplittingContext: ordered_hessians[i] = hessians[sample_indices[i]] # Compute self.sum_gradients and self.sum_hessians - # for i in prange(n_samples, schedule='static'): - for i in range(n_samples): + for i in prange(n_samples, schedule='static'): sum_gradients += ordered_gradients[i] self.sum_gradients = sum_gradients From f8500a2bc1de1fbd5ee038be4dc2e5e0d5e126e7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 16 Jan 2019 17:36:02 -0500 Subject: [PATCH 043/247] Refactored SplittingContext into a proper Splitter --- gdb_test.py | 38 +-- sklearn/gbm/_gradient_boosting.pyx | 2 +- sklearn/gbm/grower.py | 32 +-- sklearn/gbm/splitting.pyx | 385 +++++++++++++--------------- 
sklearn/gbm/tests/test_splitting.py | 71 ++--- 5 files changed, 257 insertions(+), 271 deletions(-) diff --git a/gdb_test.py b/gdb_test.py index dc618de5619c3..d45c3956c3438 100644 --- a/gdb_test.py +++ b/gdb_test.py @@ -13,7 +13,7 @@ classif = False n_classes = 2 n_features = 20 -n_samples = int(1e6) +n_samples = int(1e7) max_iter = 5 if classif: @@ -28,15 +28,15 @@ PYGBM_GBM = pygbm.GradientBoostingRegressor -# pygbm_est = PYGBM_GBM( -# max_iter=max_iter, -# scoring=None, # no early stopping -# validation_split=None, -# random_state=0, -# verbose=False) -# print("compiling pygbm code") -# pygbm_est.fit(X[:1000], y[:1000]) -# print("done") +pygbm_est = PYGBM_GBM( + max_iter=max_iter, + scoring=None, # no early stopping + validation_split=None, + random_state=0, + verbose=False) +print("compiling pygbm code") +pygbm_est.fit(X[:1000], y[:1000]) +print("done") gbm = GBM( max_iter=max_iter, @@ -55,15 +55,15 @@ print(f'sklearn gbm score_duration {score_duration:.3f}s') -# pygbm_est.set_params(verbose=True) -# tic = time() -# pygbm_est.fit(X, y) -# fit_duration = time() - tic -# tic = time() -# print(f'score: {pygbm_est.score(X, y)}') -# score_duration = time() - tic -# print(f'pygbm fit_duration: {fit_duration:.3f}s') -# print(f'pygbm score_duration {score_duration:.3f}s') +pygbm_est.set_params(verbose=True) +tic = time() +pygbm_est.fit(X, y) +fit_duration = time() - tic +tic = time() +print(f'score: {pygbm_est.score(X, y)}') +score_duration = time() - tic +print(f'pygbm fit_duration: {fit_duration:.3f}s') +print(f'pygbm score_duration {score_duration:.3f}s') # cProfile.runctx("gbm.fit(X, y)", globals(), locals(), "Profile.prof") # s = pstats.Stats("Profile.prof") diff --git a/sklearn/gbm/_gradient_boosting.pyx b/sklearn/gbm/_gradient_boosting.pyx index 631fea1c6f55e..c076bc36af56e 100644 --- a/sklearn/gbm/_gradient_boosting.pyx +++ b/sklearn/gbm/_gradient_boosting.pyx @@ -25,7 +25,7 @@ def _update_raw_predictions(Y_DTYPE_C [:] raw_predictions, grower): starts = np.array([leaf.start for leaf in leaves], dtype=np.uint32) stops = np.array([leaf.stop for leaf in leaves], dtype=np.uint32) values = np.array([leaf.value for leaf in leaves], dtype=Y_DTYPE) - partition = grower.splitting_context.partition + partition = grower.splitter.partition _update_raw_predictions_helper(raw_predictions, starts, stops, partition, values) diff --git a/sklearn/gbm/grower.py b/sklearn/gbm/grower.py index b4c62a6e45b41..11c8ac4a4e9e8 100644 --- a/sklearn/gbm/grower.py +++ b/sklearn/gbm/grower.py @@ -8,7 +8,7 @@ import numpy as np from time import time -from .splitting import SplittingContext, SplitInfo +from .splitting import Splitter, SplitInfo from .predictor import TreePredictor, PREDICTOR_RECORD_DTYPE from .types import HISTOGRAM_DTYPE @@ -78,9 +78,9 @@ class TreeNode: apply_split_time = 0. hist_subtraction = False - # start and stop indices of the node in the splitting_context.partition + # start and stop indices of the node in the splitter.partition # array. Concretely, - # self.sample_indices = view(self.splitting_context.partition[start:stop]) + # self.sample_indices = view(self.splitter.partition[start:stop]) # Only used in _update_raw_prediction, because we need to iterate over the # leaves and I don't know how to efficiently store the sample_indices views # because they're all of different sizes. 
TODO: ask Olivier what he thinks @@ -188,7 +188,7 @@ def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, [n_bins_per_feature] * X_binned.shape[1], dtype=np.uint32) - self.splitting_context = SplittingContext( + self.splitter = Splitter( X_binned, max_bins, n_bins_per_feature, gradients, hessians, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split) @@ -212,7 +212,7 @@ def _validate_parameters(self, X_binned, max_leaf_nodes, max_depth, l2_regularization, min_hessian_to_split): """Validate parameters passed to __init__. - Also validate parameters passed to SplittingContext because we cannot + Also validate parameters passed to splitter because we cannot raise exceptions in a jitclass. """ if X_binned.dtype != np.uint8: @@ -250,16 +250,16 @@ def _intilialize_root(self): """Initialize root node and finalize it if needed.""" n_samples = self.X_binned.shape[0] depth = 0 - if self.splitting_context.constant_hessian: - hessian = self.splitting_context.hessians[0] * n_samples + if self.splitter.constant_hessian: + hessian = self.splitter.hessians[0] * n_samples else: - hessian = np.sum(self.splitting_context.hessians) + hessian = np.sum(self.splitter.hessians) self.root = TreeNode( depth=depth, - #sample_indices=self.splitting_context.partition.view(), - sample_indices=self.splitting_context.partition, - #sum_gradients=self.splitting_context.gradients.sum(), - sum_gradients=np.sum(self.splitting_context.gradients), + #sample_indices=self.splitter.partition.view(), + sample_indices=self.splitter.partition, + #sum_gradients=self.splitter.gradients.sum(), + sum_gradients=np.sum(self.splitter.gradients), sum_hessians=hessian ) @@ -318,12 +318,12 @@ def _compute_spittability(self, node, only_hist=False): else: sum_gradients = node.parent.split_info.gradient_left sum_hessians = node.parent.split_info.hessian_left - split_info = self.splitting_context.find_node_split_subtraction( + split_info = self.splitter.find_node_split_subtraction( node.sample_indices, sum_gradients, sum_hessians, node.parent.histograms, node.sibling.histograms, histograms) else: - split_info = self.splitting_context.find_node_split( + split_info = self.splitter.find_node_split( node.sample_indices, histograms) toc = time() node.find_split_time = toc - tic @@ -364,7 +364,7 @@ def split_next(self): node = heappop(self.splittable_nodes) tic = time() - (sample_indices_left, sample_indices_right, i) = self.splitting_context.split_indices( + (sample_indices_left, sample_indices_right, i) = self.splitter.split_indices( node.split_info, node.sample_indices) toc = time() node.apply_split_time = toc - tic @@ -436,7 +436,7 @@ def _finalize_leaf(self, node): https://arxiv.org/abs/1603.02754 """ node.value = -self.shrinkage * node.sum_gradients / ( - node.sum_hessians + self.splitting_context.l2_regularization) + node.sum_hessians + self.splitter.l2_regularization) self.finalized_leaves.append(node) def _finalize_splittable_nodes(self): diff --git a/sklearn/gbm/splitting.pyx b/sklearn/gbm/splitting.pyx index afc0becaa0e2e..ea7c60339e575 100644 --- a/sklearn/gbm/splitting.pyx +++ b/sklearn/gbm/splitting.pyx @@ -95,13 +95,13 @@ cdef class SplitInfo: @cython.final -cdef class SplittingContext: - """Pure data class defining a splitting context. +cdef class Splitter: + """Splitter used to find the best possible split at each node. - Ideally it would also have methods but numba does not support annotating - jitclasses (so we can't use parallel=True). 
This structure is - instanciated in the grower and stores all the required information to - compute the SplitInfo and histograms of each node. + The 'best' split is computed accross all features and all bins. + + The Splitter is also responsible for partitioning the samples among the + leaf nodes (see split_indices() and the partition attribute). Parameters ---------- @@ -171,8 +171,6 @@ cdef class SplittingContext: # for root node, gradients and hessians are already ordered self.ordered_gradients = gradients.copy() self.ordered_hessians = hessians.copy() - self.sum_gradients = np.sum(gradients) - self.sum_hessians = np.sum(hessians) self.constant_hessian = hessians.shape[0] == 1 self.l2_regularization = l2_regularization self.min_hessian_to_split = min_hessian_to_split @@ -346,7 +344,6 @@ cdef class SplittingContext: sample_indices[right_child_position:], right_child_position) - def find_node_split( self, const unsigned int [::1] sample_indices, # IN @@ -359,8 +356,6 @@ cdef class SplittingContext: Parameters ---------- - self : SplittingContext - The splitting self sample_indices : array of int The indices of the samples at the node to split. @@ -383,7 +378,6 @@ cdef class SplittingContext: unsigned int n_threads split_info_struct split_info split_info_struct * split_infos - # For some reason, we need to use local variables for prange reduction. Y_DTYPE_C sum_gradients = 0. Y_DTYPE_C sum_hessians = 0. # Also, need local views to avoid python interactions @@ -392,12 +386,12 @@ cdef class SplittingContext: Y_DTYPE_C [::1] ordered_hessians = self.ordered_hessians Y_DTYPE_C [::1] hessians = self.hessians - with nogil: n_samples = sample_indices.shape[0] - # Populate ordered_gradients and ordered_hessians. (Already done for root) - # Ordering the gradients and hessians helps to improve cache hit. + # Populate ordered_gradients and ordered_hessians. (Already done + # for root) Ordering the gradients and hessians helps to improve + # cache hit. 
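find_node_split builds one histogram per feature from the gathered gradients and hessians before scanning the bins. Below is a pure-Python sketch of that accumulation, assuming the three per-bin fields of HISTOGRAM_DTYPE (sum_gradients, sum_hessians, count); the Cython helpers (_build_histogram and its root/no-hessian variants) compute the same quantities:

    import numpy as np

    def build_histogram_reference(n_bins, binned_feature, ordered_gradients,
                                  ordered_hessians):
        # Accumulate per-bin statistics with a single scan over the node's
        # samples; binned_feature holds the bin index of each sample for
        # the feature at hand.
        sum_gradients = np.zeros(n_bins)
        sum_hessians = np.zeros(n_bins)
        count = np.zeros(n_bins, dtype=np.uint32)
        for b, g, h in zip(binned_feature, ordered_gradients,
                           ordered_hessians):
            sum_gradients[b] += g
            sum_hessians[b] += h
            count[b] += 1
        return sum_gradients, sum_hessians, count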
if sample_indices.shape[0] != self.gradients.shape[0]: if self.constant_hessian: for i in prange(n_samples, schedule='static'): @@ -407,26 +401,31 @@ cdef class SplittingContext: ordered_gradients[i] = gradients[sample_indices[i]] ordered_hessians[i] = hessians[sample_indices[i]] - # Compute self.sum_gradients and self.sum_hessians + # Compute sums of gradients and hessians at the node for i in prange(n_samples, schedule='static'): sum_gradients += ordered_gradients[i] - self.sum_gradients = sum_gradients - if self.constant_hessian: sum_hessians = self.constant_hessian_value * n_samples else: for i in prange(n_samples, schedule='static'): sum_hessians += ordered_hessians[i] - self.sum_hessians = sum_hessians split_infos = malloc( self.n_features * sizeof(split_info_struct)) for feature_idx in prange(self.n_features): - split_info = _find_histogram_split( - self, feature_idx, sample_indices, histograms[feature_idx]) + # Compute histogram of each feature + self._compute_histogram(feature_idx, sample_indices, + histograms[feature_idx]) + + # and get the best possible split for the feature among all + # bins + split_info = self._find_best_bin_to_split_helper( + feature_idx, histograms[feature_idx], n_samples, + sum_gradients, sum_hessians) split_infos[feature_idx] = split_info - split_info = _find_best_feature_to_split_helper(self, split_infos) + # then compute best possible split among all feature + split_info = self._find_best_feature_to_split_helper(split_infos) out = SplitInfo( split_info.gain, @@ -442,8 +441,43 @@ cdef class SplittingContext: free(split_infos) return out + cdef void _compute_histogram( + self, + unsigned int feature_idx, + const unsigned int [::1] sample_indices, # IN + hist_struct [::1] histogram # OUT + ) nogil: + """Compute the histogram for a given feature + + Returns the best SplitInfo among all the possible bins of the feature. + """ + + cdef: + unsigned int n_samples = sample_indices.shape[0] + const X_BINNED_DTYPE_C [::1] X_binned = self.X_binned[:, feature_idx] + unsigned int root_node = X_binned.shape[0] == n_samples + Y_DTYPE_C [::1] ordered_gradients = \ + self.ordered_gradients[:n_samples] + Y_DTYPE_C [::1] ordered_hessians = self.ordered_hessians[:n_samples] + + if root_node: + if self.constant_hessian: + _build_histogram_root_no_hessian(self.max_bins, X_binned, + ordered_gradients, histogram) + else: + _build_histogram_root(self.max_bins, X_binned, + ordered_gradients, + ordered_hessians, histogram) + else: + if self.constant_hessian: + _build_histogram_no_hessian(self.max_bins, sample_indices, + X_binned, ordered_gradients, histogram) + else: + _build_histogram(self.max_bins, sample_indices, X_binned, + ordered_gradients, ordered_hessians, histogram) + def find_node_split_subtraction( - SplittingContext self, + Splitter self, unsigned int [::1] sample_indices, # IN Y_DTYPE_C sum_gradients, Y_DTYPE_C sum_hessians, @@ -466,8 +500,6 @@ cdef class SplittingContext: Parameters ---------- - self : SplittingContext - The splitting self sample_indices : array of int The indices of the samples at the node to split. 
parent_histograms : array of HISTOGRAM_DTYPE of shape(n_features, max_bins) @@ -499,19 +531,23 @@ cdef class SplittingContext: with nogil: n_samples = sample_indices.shape[0] - self.sum_gradients = sum_gradients - self.sum_hessians = sum_hessians - split_infos = malloc( self.n_features * sizeof(split_info_struct)) for feature_idx in prange(self.n_features): - split_info = _find_histogram_split_subtraction( - self, feature_idx, parent_histograms[feature_idx], - sibling_histograms[feature_idx], histograms[feature_idx], - n_samples) + # Compute histogram of each feature + _subtract_histograms(self.max_bins, + parent_histograms[feature_idx], + sibling_histograms[feature_idx], + histograms[feature_idx]) + # and get the best possible split for the feature among all + # bins + split_info = self._find_best_bin_to_split_helper( + feature_idx, histograms[feature_idx], n_samples, + sum_gradients, sum_hessians) split_infos[feature_idx] = split_info - split_info = _find_best_feature_to_split_helper(self, split_infos) + # then compute best possible split among all feature + split_info = self._find_best_feature_to_split_helper(split_infos) out = SplitInfo( split_info.gain, @@ -527,157 +563,125 @@ cdef class SplittingContext: free(split_infos) return out + cdef split_info_struct _find_best_feature_to_split_helper(self, + split_info_struct * split_infos # IN + ) nogil: + cdef: + Y_DTYPE_C gain + Y_DTYPE_C best_gain + split_info_struct split_info + split_info_struct best_split_info + unsigned int feature_idx + + best_gain = -1. + for feature_idx in range(self.n_features): + split_info = split_infos[feature_idx] + gain = split_info.gain + if best_gain == -1 or gain > best_gain: + best_gain = gain + best_split_info = split_info + return best_split_info + + cdef split_info_struct _find_best_bin_to_split_helper( + self, + unsigned int feature_idx, + const hist_struct [::1] histogram, # IN + unsigned int n_samples, + Y_DTYPE_C sum_gradients, + Y_DTYPE_C sum_hessians) nogil: + """Find best bin to split on, and return the corresponding SplitInfo. -cdef split_info_struct _find_best_feature_to_split_helper( - SplittingContext self, - split_info_struct * split_infos # IN - ) nogil: - cdef: - Y_DTYPE_C gain - Y_DTYPE_C best_gain - split_info_struct split_info - split_info_struct best_split_info - unsigned int feature_idx - - best_gain = -1. - for feature_idx in range(self.n_features): - split_info = split_infos[feature_idx] - gain = split_info.gain - if best_gain == -1 or gain > best_gain: - best_gain = gain - best_split_info = split_info - return best_split_info - -cdef split_info_struct _find_histogram_split( - SplittingContext context, - unsigned int feature_idx, - const unsigned int [::1] sample_indices, # IN - hist_struct [::1] histogram # OUT - ) nogil: - """Compute the histogram for a given feature - - Returns the best SplitInfo among all the possible bins of the feature. 
- """ - - cdef: - unsigned int n_samples = sample_indices.shape[0] - const X_BINNED_DTYPE_C [::1] X_binned = context.X_binned[:, feature_idx] - unsigned int root_node = X_binned.shape[0] == n_samples - Y_DTYPE_C [::1] ordered_gradients = \ - context.ordered_gradients[:n_samples] - Y_DTYPE_C [::1] ordered_hessians = context.ordered_hessians[:n_samples] - - if root_node: - if context.constant_hessian: - _build_histogram_root_no_hessian(context.max_bins, X_binned, - ordered_gradients, histogram) - else: - _build_histogram_root(context.max_bins, X_binned, - ordered_gradients, - ordered_hessians, histogram) - else: - if context.constant_hessian: - _build_histogram_no_hessian(context.max_bins, sample_indices, - X_binned, ordered_gradients, histogram) - else: - _build_histogram(context.max_bins, sample_indices, X_binned, - ordered_gradients, ordered_hessians, histogram) - - return _find_best_bin_to_split_helper(context, feature_idx, histogram, - n_samples) - -cdef split_info_struct _find_histogram_split_subtraction( - SplittingContext context, - unsigned int feature_idx, - hist_struct [::1] parent_histogram, # IN - hist_struct [::1] sibling_histogram, # IN - hist_struct [::1] histogram, # OUT - unsigned int n_samples - ) nogil: - """Compute the histogram by substraction of parent and sibling - - Uses the identity: hist(parent) = hist(left) + hist(right). - Returns the best SplitInfo among all the possible bins of the feature. - """ - - _subtract_histograms(context.max_bins, parent_histogram, - sibling_histogram, histogram) - - return _find_best_bin_to_split_helper(context, feature_idx, histogram, - n_samples) - - -cdef split_info_struct _find_best_bin_to_split_helper( - SplittingContext context, - unsigned int feature_idx, - const hist_struct [::1] histogram, # IN - unsigned int n_samples) nogil: - """Find best bin to split on, and return the corresponding SplitInfo. - - Splits that do not satisfy the splitting constraints (min_gain_to_split, - etc.) are discarded here. If no split can satisfy the constraints, a - SplitInfo with a gain of -1 is returned. If for a given node the best - SplitInfo has a gain of -1, it is finalized into a leaf. - """ - cdef: - unsigned int bin_idx - unsigned int n_samples_left - unsigned int n_samples_right - unsigned int n_samples_ = n_samples - Y_DTYPE_C hessian_left - Y_DTYPE_C hessian_right - Y_DTYPE_C gradient_left - Y_DTYPE_C gradient_right - Y_DTYPE_C gain - split_info_struct best_split + Splits that do not satisfy the splitting constraints (min_gain_to_split, + etc.) are discarded here. If no split can satisfy the constraints, a + SplitInfo with a gain of -1 is returned. If for a given node the best + SplitInfo has a gain of -1, it is finalized into a leaf. + """ + cdef: + unsigned int bin_idx + unsigned int n_samples_left + unsigned int n_samples_right + unsigned int n_samples_ = n_samples + Y_DTYPE_C hessian_left + Y_DTYPE_C hessian_right + Y_DTYPE_C gradient_left + Y_DTYPE_C gradient_right + Y_DTYPE_C gain + split_info_struct best_split + + best_split.gain = -1. + gradient_left, hessian_left = 0., 0. + n_samples_left = 0 + + for bin_idx in range(self.n_bins_per_feature[feature_idx]): + n_samples_left += histogram[bin_idx].count + n_samples_right = n_samples_ - n_samples_left - best_split.gain = -1. - gradient_left, hessian_left = 0., 0. 
- n_samples_left = 0 + if self.constant_hessian: + hessian_left += (histogram[bin_idx].count + * self.constant_hessian_value) + else: + hessian_left += histogram[bin_idx].sum_hessians + hessian_right = sum_hessians - hessian_left + + gradient_left += histogram[bin_idx].sum_gradients + gradient_right = sum_gradients - gradient_left + + if n_samples_left < self.min_samples_leaf: + continue + if n_samples_right < self.min_samples_leaf: + # won't get any better + break + + if hessian_left < self.min_hessian_to_split: + continue + if hessian_right < self.min_hessian_to_split: + # won't get any better (hessians are > 0 since loss is convex) + break + + gain = _split_gain(gradient_left, hessian_left, + gradient_right, hessian_right, + sum_gradients, sum_hessians, + self.l2_regularization) + + if gain > best_split.gain and gain > self.min_gain_to_split: + best_split.gain = gain + best_split.feature_idx = feature_idx + best_split.bin_idx = bin_idx + best_split.gradient_left = gradient_left + best_split.gradient_right = gradient_right + best_split.hessian_left = hessian_left + best_split.hessian_right = hessian_right + best_split.n_samples_left = n_samples_left + best_split.n_samples_right = n_samples_right + + return best_split + + # Only used for tests... not great + def find_best_split_wrapper( + self, + unsigned int feature_idx, + unsigned int [::1] sample_indices, + hist_struct [::1] histogram, + Y_DTYPE_C sum_gradients, + Y_DTYPE_C sum_hessians): - for bin_idx in range(context.n_bins_per_feature[feature_idx]): - n_samples_left += histogram[bin_idx].count - n_samples_right = n_samples_ - n_samples_left + self._compute_histogram(feature_idx, sample_indices, histogram) + n_samples = sample_indices.shape[0] + split_info = self._find_best_bin_to_split_helper( + feature_idx, histogram, n_samples, + sum_gradients, sum_hessians) - if context.constant_hessian: - hessian_left += (histogram[bin_idx].count - * context.constant_hessian_value) - else: - hessian_left += histogram[bin_idx].sum_hessians - hessian_right = context.sum_hessians - hessian_left - - gradient_left += histogram[bin_idx].sum_gradients - gradient_right = context.sum_gradients - gradient_left - - if n_samples_left < context.min_samples_leaf: - continue - if n_samples_right < context.min_samples_leaf: - # won't get any better - break - - if hessian_left < context.min_hessian_to_split: - continue - if hessian_right < context.min_hessian_to_split: - # won't get any better (hessians are > 0 since loss is convex) - break - - gain = _split_gain(gradient_left, hessian_left, - gradient_right, hessian_right, - context.sum_gradients, context.sum_hessians, - context.l2_regularization) - - if gain > best_split.gain and gain > context.min_gain_to_split: - best_split.gain = gain - best_split.feature_idx = feature_idx - best_split.bin_idx = bin_idx - best_split.gradient_left = gradient_left - best_split.gradient_right = gradient_right - best_split.hessian_left = hessian_left - best_split.hessian_right = hessian_right - best_split.n_samples_left = n_samples_left - best_split.n_samples_right = n_samples_right - - return best_split + return SplitInfo( + split_info.gain, + split_info.feature_idx, + split_info.bin_idx, + split_info.gradient_left, + split_info.hessian_left, + split_info.gradient_right, + split_info.hessian_right, + split_info.n_samples_left, + split_info.n_samples_right, + ) cdef inline Y_DTYPE_C _split_gain( @@ -709,24 +713,3 @@ cdef inline Y_DTYPE_C negative_loss( Y_DTYPE_C hessian, Y_DTYPE_C l2_regularization) nogil: return (gradient * 
gradient) / (hessian + l2_regularization) - -# Only used for tests... not great -def _find_histogram_split_wrapper( - SplittingContext context, - unsigned int feature_idx, - unsigned int [::1] sample_indices, - hist_struct [::1] histogram): - - split_info = _find_histogram_split(context, feature_idx, sample_indices, - histogram) - return SplitInfo( - split_info.gain, - split_info.feature_idx, - split_info.bin_idx, - split_info.gradient_left, - split_info.hessian_left, - split_info.gradient_right, - split_info.hessian_right, - split_info.n_samples_left, - split_info.n_samples_right, - ) diff --git a/sklearn/gbm/tests/test_splitting.py b/sklearn/gbm/tests/test_splitting.py index 899150fdff67d..2d5da80e38d94 100644 --- a/sklearn/gbm/tests/test_splitting.py +++ b/sklearn/gbm/tests/test_splitting.py @@ -6,8 +6,7 @@ from sklearn.gbm.types import HISTOGRAM_DTYPE from sklearn.gbm.types import Y_DTYPE from sklearn.gbm.types import X_BINNED_DTYPE -from sklearn.gbm.splitting import SplittingContext -from sklearn.gbm.splitting import _find_histogram_split_wrapper +from sklearn.gbm.splitting import Splitter @pytest.mark.parametrize('n_bins', [3, 32, 256]) @@ -24,6 +23,7 @@ def test_histogram_split(n_bins): sample_indices = np.arange(binned_feature.shape[0], dtype=np.uint32) ordered_hessians = np.ones_like(binned_feature, dtype=Y_DTYPE) all_hessians = ordered_hessians + sum_hessians = all_hessians.sum() for true_bin in range(1, n_bins - 1): @@ -32,10 +32,11 @@ def test_histogram_split(n_bins): dtype=Y_DTYPE) ordered_gradients[binned_feature <= true_bin] *= -1 all_gradients = ordered_gradients + sum_gradients = all_gradients.sum() n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) - context = SplittingContext(X_binned, + splitter = Splitter(X_binned, n_bins, n_bins_per_feature, all_gradients, all_hessians, @@ -44,8 +45,9 @@ def test_histogram_split(n_bins): min_samples_leaf, min_gain_to_split) histogram = np.zeros(shape=(n_bins), dtype=HISTOGRAM_DTYPE) - split_info = _find_histogram_split_wrapper( - context, feature_idx, sample_indices, histogram) + split_info = splitter.find_best_split_wrapper( + feature_idx, sample_indices, histogram, sum_gradients, + sum_hessians) assert split_info.bin_idx == true_bin assert split_info.gain >= 0 @@ -84,11 +86,9 @@ def test_split_vs_split_subtraction(constant_hessian): n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) - context = SplittingContext(X_binned, n_bins, - n_bins_per_feature, - all_gradients, all_hessians, - l2_regularization, min_hessian_to_split, - min_samples_leaf, min_gain_to_split) + splitter = Splitter(X_binned, n_bins, n_bins_per_feature, all_gradients, + all_hessians, l2_regularization, min_hessian_to_split, + min_samples_leaf, min_gain_to_split) hists_parent = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) hists_left = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) @@ -97,19 +97,19 @@ def test_split_vs_split_subtraction(constant_hessian): hists_right_sub = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) # first split parent, left and right with classical method - si_parent = context.find_node_split(sample_indices, hists_parent) - sample_indices_left, sample_indices_right, _ = context.split_indices( + si_parent = splitter.find_node_split(sample_indices, hists_parent) + sample_indices_left, sample_indices_right, _ = splitter.split_indices( si_parent, sample_indices) - si_left = context.find_node_split(sample_indices_left, hists_left) - si_right = 
context.find_node_split(sample_indices_right, hists_right) + si_left = splitter.find_node_split(sample_indices_left, hists_left) + si_right = splitter.find_node_split(sample_indices_right, hists_right) # split left with subtraction method - si_left_sub = context.find_node_split_subtraction( + si_left_sub = splitter.find_node_split_subtraction( sample_indices_left, si_parent.gradient_left, si_parent.hessian_left, hists_parent, hists_right, hists_left_sub) # split right with subtraction method - si_right_sub = context.find_node_split_subtraction( + si_right_sub = splitter.find_node_split_subtraction( sample_indices_right, si_parent.gradient_right, si_parent.hessian_right, hists_parent, hists_left, hists_right_sub) @@ -170,7 +170,7 @@ def test_gradient_and_hessian_sanity(constant_hessian): n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) - context = SplittingContext(X_binned, n_bins, + splitter = Splitter(X_binned, n_bins, n_bins_per_feature, all_gradients, all_hessians, l2_regularization, min_hessian_to_split, @@ -183,20 +183,20 @@ def test_gradient_and_hessian_sanity(constant_hessian): hists_right_sub = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) # first split parent, left and right with classical method - si_parent = context.find_node_split(sample_indices, hists_parent) - sample_indices_left, sample_indices_right, _ = context.split_indices( + si_parent = splitter.find_node_split(sample_indices, hists_parent) + sample_indices_left, sample_indices_right, _ = splitter.split_indices( si_parent, sample_indices) - si_left = context.find_node_split(sample_indices_left, hists_left) - si_right = context.find_node_split(sample_indices_right, hists_right) + si_left = splitter.find_node_split(sample_indices_left, hists_left) + si_right = splitter.find_node_split(sample_indices_right, hists_right) # split left with subtraction method - si_left_sub = context.find_node_split_subtraction( + si_left_sub = splitter.find_node_split_subtraction( sample_indices_left, si_parent.gradient_left, si_parent.hessian_left, hists_parent, hists_right, hists_left_sub) # split right with subtraction method - si_right_sub = context.find_node_split_subtraction( + si_right_sub = splitter.find_node_split_subtraction( sample_indices_right, si_parent.gradient_right, si_parent.hessian_right, hists_parent, hists_left, hists_right_sub) @@ -245,7 +245,7 @@ def test_gradient_and_hessian_sanity(constant_hessian): def test_split_indices(): # Check that split_indices returns the correct splits and that - # splitting_context.partition is consistent with what is returned. + # splitter.partition is consistent with what is returned. 
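The *_sub assertions in the two tests above exercise the histogram subtraction trick: since hist(parent) = hist(left) + hist(right), a child's histogram can be obtained as hist(parent) - hist(sibling) without rescanning the data. A minimal NumPy sketch of that identity, with histograms represented as plain dicts of per-bin arrays (illustrative only, not the structured dtype used by the real code):

    def subtract_histograms_reference(hist_parent, hist_sibling):
        # hist_* map 'sum_gradients', 'sum_hessians' and 'count' to arrays
        # of shape (n_bins,); the child histogram needs no pass over X.
        return {field: hist_parent[field] - hist_sibling[field]
                for field in ('sum_gradients', 'sum_hessians', 'count')}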
rng = np.random.RandomState(421) n_bins = 5 @@ -273,30 +273,30 @@ def test_split_indices(): n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) - context = SplittingContext(X_binned, n_bins, + splitter = Splitter(X_binned, n_bins, n_bins_per_feature, all_gradients, all_hessians, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split) - assert_array_almost_equal(sample_indices, context.partition) + assert_array_almost_equal(sample_indices, splitter.partition) histograms = np.zeros(shape=(2, n_bins), dtype=HISTOGRAM_DTYPE) - si_root = context.find_node_split(sample_indices, histograms) + si_root = splitter.find_node_split(sample_indices, histograms) # sanity checks for best split assert si_root.feature_idx == 1 assert si_root.bin_idx == 3 - samples_left, samples_right, position_right = context.split_indices( - si_root, context.partition) + samples_left, samples_right, position_right = splitter.split_indices( + si_root, splitter.partition) assert set(samples_left) == set([0, 1, 3, 4, 5, 6, 8]) assert set(samples_right) == set([2, 7, 9]) assert_array_almost_equal(samples_left, - context.partition[:position_right]) + splitter.partition[:position_right]) assert_array_almost_equal(samples_right, - context.partition[position_right:]) + splitter.partition[position_right:]) # Check that the resulting split indices sizes are consistent with the # count statistics anticipated when looking for the best split. @@ -323,16 +323,19 @@ def test_min_gain_to_split(): sample_indices = np.arange(n_samples, dtype=np.uint32) all_hessians = np.ones_like(binned_feature, dtype=Y_DTYPE) all_gradients = np.ones_like(binned_feature, dtype=Y_DTYPE) + sum_gradients = all_gradients.sum() + sum_hessians = all_hessians.sum() n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) - context = SplittingContext(X_binned, n_bins, n_bins_per_feature, + splitter = Splitter(X_binned, n_bins, n_bins_per_feature, all_gradients, all_hessians, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split) histogram = np.zeros(shape=(n_bins), dtype=HISTOGRAM_DTYPE) - split_info = _find_histogram_split_wrapper(context, feature_idx, - sample_indices, histogram) + split_info = splitter.find_best_split_wrapper( + feature_idx, sample_indices, histogram, sum_gradients, + sum_hessians) assert split_info.gain == -1 From c4d00f01e611496412aff81ea8d888f6240a0006 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 16 Jan 2019 18:05:39 -0500 Subject: [PATCH 044/247] lots of cosmetics --- sklearn/gbm/grower.py | 23 +-- sklearn/gbm/loss.pyx | 5 +- sklearn/gbm/splitting.pyx | 147 ++++++++++---------- sklearn/gbm/tests/test_binning.py | 1 - sklearn/gbm/tests/test_compare_lightgbm.py | 24 ++-- sklearn/gbm/tests/test_gradient_boosting.py | 32 ++--- sklearn/gbm/tests/test_grower.py | 3 +- sklearn/gbm/tests/test_histogram.py | 7 +- sklearn/gbm/tests/test_loss.py | 1 - sklearn/gbm/tests/test_predictor.py | 1 - sklearn/gbm/tests/test_splitting.py | 50 +++---- 11 files changed, 148 insertions(+), 146 deletions(-) diff --git a/sklearn/gbm/grower.py b/sklearn/gbm/grower.py index 11c8ac4a4e9e8..7f521776306ab 100644 --- a/sklearn/gbm/grower.py +++ b/sklearn/gbm/grower.py @@ -8,7 +8,7 @@ import numpy as np from time import time -from .splitting import Splitter, SplitInfo +from .splitting import Splitter from .predictor import TreePredictor, PREDICTOR_RECORD_DTYPE from .types import HISTOGRAM_DTYPE @@ -250,17 +250,16 @@ def _intilialize_root(self): """Initialize root node and 
finalize it if needed.""" n_samples = self.X_binned.shape[0] depth = 0 + sum_gradients = np.sum(self.splitter.gradients) if self.splitter.constant_hessian: - hessian = self.splitter.hessians[0] * n_samples + sum_hessians = self.splitter.hessians[0] * n_samples else: - hessian = np.sum(self.splitter.hessians) + sum_hessians = np.sum(self.splitter.hessians) self.root = TreeNode( depth=depth, - #sample_indices=self.splitter.partition.view(), sample_indices=self.splitter.partition, - #sum_gradients=self.splitter.gradients.sum(), - sum_gradients=np.sum(self.splitter.gradients), - sum_hessians=hessian + sum_gradients=sum_gradients, + sum_hessians=sum_hessians ) self.root.start = 0 @@ -364,8 +363,10 @@ def split_next(self): node = heappop(self.splittable_nodes) tic = time() - (sample_indices_left, sample_indices_right, i) = self.splitter.split_indices( - node.split_info, node.sample_indices) + (sample_indices_left, + sample_indices_right, + right_child_pos) = self.splitter.split_indices(node.split_info, + node.sample_indices) toc = time() node.apply_split_time = toc - tic self.total_apply_split_time += node.apply_split_time @@ -391,8 +392,8 @@ def split_next(self): # set start and stop indices left_child_node.start = node.start - left_child_node.stop = node.start + i - right_child_node.start = left_child_node.stop + left_child_node.stop = node.start + right_child_pos + right_child_node.start = left_child_node.stop right_child_node.stop = node.stop self.n_nodes += 2 diff --git a/sklearn/gbm/loss.pyx b/sklearn/gbm/loss.pyx index 99b3b9dbbe4ee..54f3c949911d6 100644 --- a/sklearn/gbm/loss.pyx +++ b/sklearn/gbm/loss.pyx @@ -256,14 +256,15 @@ class CategoricalCrossEntropy(BaseLoss): cdef inline Y_DTYPE_C _logsumexp(Y_DTYPE_C [:, :] a, const int row) nogil: - # Need to pass the whole array, else prange won't work. See issue Cython + # Need to pass the whole array, else prange won't work. See Cython issue # #2798 cdef: int k Y_DTYPE_C out = 0. # Y_DTYPE_C amax - # TODO: use the numerically safer option: + # TODO: use the numerically safer option + # But I don't now how to properly write a max() # amax = max(a[i]) # for k in range(a.shape[1]): # out += exp(a[i, k] - amax) diff --git a/sklearn/gbm/splitting.pyx b/sklearn/gbm/splitting.pyx index ea7c60339e575..af3b2edbf5b11 100644 --- a/sklearn/gbm/splitting.pyx +++ b/sklearn/gbm/splitting.pyx @@ -29,8 +29,8 @@ from .types import HISTOGRAM_DTYPE cdef struct split_info_struct: - # Same as the SplitInfo class, but we need a C struct to use it in nogil - # mode. + # Same as the SplitInfo class, but we need a C struct to use it in the + # nogil sections Y_DTYPE_C gain unsigned int feature_idx unsigned int bin_idx @@ -98,10 +98,10 @@ cdef class SplitInfo: cdef class Splitter: """Splitter used to find the best possible split at each node. - The 'best' split is computed accross all features and all bins. + A split (see SplitInfo) is characterized by a feature and a bin. The Splitter is also responsible for partitioning the samples among the - leaf nodes (see split_indices() and the partition attribute). + leaves of the tree (see split_indices() and the partition attribute). 
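Putting the pieces together, a minimal usage sketch of this class as of this point in the series, mirroring how the tests in test_splitting.py drive it (array sizes and values are arbitrary):

    import numpy as np
    from sklearn.gbm.splitting import Splitter
    from sklearn.gbm.types import HISTOGRAM_DTYPE, X_BINNED_DTYPE, Y_DTYPE

    rng = np.random.RandomState(0)
    n_samples, n_features, n_bins = 100, 3, 16
    X_binned = np.asfortranarray(
        rng.randint(0, n_bins, size=(n_samples, n_features)),
        dtype=X_BINNED_DTYPE)
    gradients = rng.normal(size=n_samples).astype(Y_DTYPE)
    hessians = np.ones_like(gradients)
    n_bins_per_feature = np.array([n_bins] * n_features, dtype=np.uint32)

    splitter = Splitter(X_binned, n_bins, n_bins_per_feature, gradients,
                        hessians, 0., 1e-3, 5, 0.)
    sample_indices = np.arange(n_samples, dtype=np.uint32)
    histograms = np.zeros((n_features, n_bins), dtype=HISTOGRAM_DTYPE)
    split_info = splitter.find_node_split(sample_indices, histograms)
    left, right, right_pos = splitter.split_indices(split_info,
                                                    splitter.partition)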
Parameters ---------- @@ -155,10 +155,10 @@ cdef class Splitter: def __init__(self, const X_BINNED_DTYPE_C [::1, :] X_binned, unsigned int max_bins, np.ndarray[np.uint32_t] n_bins_per_feature, - Y_DTYPE_C [::1] gradients, Y_DTYPE_C [::1] hessians, Y_DTYPE_C - l2_regularization, Y_DTYPE_C min_hessian_to_split=1e-3, - unsigned int min_samples_leaf=20, Y_DTYPE_C - min_gain_to_split=0.): + Y_DTYPE_C [::1] gradients, Y_DTYPE_C [::1] hessians, + Y_DTYPE_C l2_regularization, Y_DTYPE_C + min_hessian_to_split=1e-3, unsigned int + min_samples_leaf=20, Y_DTYPE_C min_gain_to_split=0.): self.X_binned = X_binned self.n_features = X_binned.shape[1] @@ -199,20 +199,21 @@ cdef class Splitter: sample_indices): """Split samples into left and right arrays. - The split is performed according to the best possible split (split_info). + The split is performed according to the best possible split + (split_info). - Ultimately, this is nothing but a partition of the sample_indices array - with a given pivot, exactly like a quicksort subroutine. + Ultimately, this is nothing but a partition of the sample_indices + array with a given pivot, exactly like a quicksort subroutine. Parameters ---------- split_info : SplitInfo The SplitInfo of the node to split sample_indices : array of unsigned int - The indices of the samples at the node to split. This is a view on - self.partition, and it is modified inplace by placing the indices - of the left child at the beginning, and the indices of the right child - at the end. + The indices of the samples at the node to split. This is a view + on self.partition, and it is modified inplace by placing the + indices of the left child at the beginning, and the indices of + the right child at the end. Returns ------- @@ -225,27 +226,27 @@ cdef class Splitter: right_child_position : int The position of the right child in ``sample_indices`` """ - # This is a multi-threaded implementation inspired by lightgbm. - # Here is a quick break down. Let's suppose we want to split a node with - # 24 samples named from a to x. self.partition looks like this (the * - # are indices in other leaves that we don't care about): + # This is a multi-threaded implementation inspired by lightgbm. Here + # is a quick break down. Let's suppose we want to split a node with 24 + # samples named from a to x. self.partition looks like this (the * are + # indices in other leaves that we don't care about): # partition = [*************abcdefghijklmnopqrstuvwx****************] # ^ ^ # node_position node_position + node.n_samples - # Ultimately, we want to reorder the samples inside the boundaries of the - # leaf (which becomes a node) to now represent the samples in its left and - # right child. For example: + # Ultimately, we want to reorder the samples inside the boundaries of + # the leaf (which becomes a node) to now represent the samples in its + # left and right child. For example: # partition = [*************abefilmnopqrtuxcdghjksvw*****************] # ^ ^ # left_child_pos right_child_pos - # Note that left_child_pos always takes the value of node_position, and - # right_child_pos = left_child_pos + left_child.n_samples. The order of - # the samples inside a leaf is irrelevant. + # Note that left_child_pos always takes the value of node_position, + # and right_child_pos = left_child_pos + left_child.n_samples. The + # order of the samples inside a leaf is irrelevant. # 1. samples_indices is a view on this region a..x. We conceptually - # divide it into n_threads regions. 
Each thread will be responsible for - # its own region. Here is an example with 4 threads: + # divide it into n_threads regions. Each thread will be responsible + # for its own region. Here is an example with 4 threads: # samples_indices = [abcdef|ghijkl|mnopqr|stuvwx] # 2. Each thread processes 6 = 24 // 4 entries and maps them into # left_indices_buffer or right_indices_buffer. For example, we could @@ -253,27 +254,29 @@ cdef class Splitter: # - left_indices_buffer = [abef..|il....|mnopqr|tux...] # - right_indices_buffer = [cd....|ghjk..|......|svw...] # 3. We keep track of the start positions of the regions (the '|') in - # ``offset_in_buffers`` as well as the size of each region. We also keep - # track of the number of samples put into the left/right child by each - # thread. Concretely: + # ``offset_in_buffers`` as well as the size of each region. We also + # keep track of the number of samples put into the left/right child + # by each thread. Concretely: # - left_counts = [4, 2, 6, 3] # - right_counts = [2, 4, 0, 3] # 4. Finally, we put left/right_indices_buffer back into the - # samples_indices, without any undefined entries and the partition looks - # as expected + # samples_indices, without any undefined entries and the partition + # looks as expected # partition = [*************abefilmnopqrtuxcdghjksvw*****************] - # Note: We here show left/right_indices_buffer as being the same size as - # sample_indices for simplicity, but in reality they are of the same size - # as partition. + # Note: We here show left/right_indices_buffer as being the same size + # as sample_indices for simplicity, but in reality they are of the + # same size as partition. cdef: int n_samples = sample_indices.shape[0] - const X_BINNED_DTYPE_C [::1] X_binned = self.X_binned[:, split_info.feature_idx] + const X_BINNED_DTYPE_C [::1] X_binned = \ + self.X_binned[:, split_info.feature_idx] unsigned int [::1] left_indices_buffer = self.left_indices_buffer unsigned int [::1] right_indices_buffer = self.right_indices_buffer int n_threads = omp_get_max_threads() - int [:] sizes = np.full(n_threads, n_samples // n_threads, dtype=np.int32) + int [:] sizes = np.full(n_threads, n_samples // n_threads, + dtype=np.int32) int [:] offset_in_buffers = np.zeros(n_threads, dtype=np.int32) int [:] left_counts = np.empty(n_threads, dtype=np.int32) int [:] right_counts = np.empty(n_threads, dtype=np.int32) @@ -320,8 +323,8 @@ cdef class Splitter: for thread_idx in range(n_threads): right_child_position += left_counts[thread_idx] - # offset of each thread in samples_indices for left and right child, i.e. - # where each thread will start to write. + # offset of each thread in samples_indices for left and right + # child, i.e. where each thread will start to write. right_offset[0] = right_child_position for thread_idx in range(1, n_threads): left_offset[thread_idx] = \ @@ -329,8 +332,9 @@ cdef class Splitter: right_offset[thread_idx] = \ right_offset[thread_idx - 1] + right_counts[thread_idx - 1] - # map indices in left/right_indices_buffer back into samples_indices. This - # also updates self.partition since samples_indice is a view. + # map indices in left/right_indices_buffer back into + # samples_indices. This also updates self.partition since + # samples_indice is a view. for thread_idx in prange(n_threads): for i in range(left_counts[thread_idx]): @@ -363,10 +367,6 @@ cdef class Splitter: ------- best_split_info : SplitInfo The info about the best possible split among all features. 
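A minimal single-threaded NumPy sketch of the partition step described in the comments above, assuming a split on a single feature with threshold bin ``bin_idx``; the names X_binned_column, sample_indices and bin_idx are stand-ins for the corresponding Cython variables, and the real split_indices() obtains the same result with per-thread left/right buffers under prange rather than by allocating new arrays:

import numpy as np

def split_indices_reference(X_binned_column, sample_indices, bin_idx):
    # Samples whose binned feature value is <= bin_idx go to the left
    # child, the others go to the right child.
    goes_left = X_binned_column[sample_indices] <= bin_idx
    left = sample_indices[goes_left]
    right = sample_indices[~goes_left]
    # The parallel implementation writes `left` then `right` back into
    # the same region of self.partition, so the boundary between the two
    # children is left.shape[0] (the right_child_position return value);
    # the order of the indices inside each child is irrelevant.
    return left, right, left.shape[0]
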
- histograms : array of HISTOGRAM_DTYPE, shape=(n_features, max_bins) - The histograms of each feature. A histogram is an array of - HISTOGRAM_DTYPE of size ``max_bins`` (only - ``n_bins_per_features[feature]`` entries are relevant). """ cdef: unsigned int n_samples @@ -447,23 +447,22 @@ cdef class Splitter: const unsigned int [::1] sample_indices, # IN hist_struct [::1] histogram # OUT ) nogil: - """Compute the histogram for a given feature - - Returns the best SplitInfo among all the possible bins of the feature. - """ + """Compute the histogram for a given feature.""" cdef: unsigned int n_samples = sample_indices.shape[0] - const X_BINNED_DTYPE_C [::1] X_binned = self.X_binned[:, feature_idx] + const X_BINNED_DTYPE_C [::1] X_binned = \ + self.X_binned[:, feature_idx] unsigned int root_node = X_binned.shape[0] == n_samples Y_DTYPE_C [::1] ordered_gradients = \ self.ordered_gradients[:n_samples] - Y_DTYPE_C [::1] ordered_hessians = self.ordered_hessians[:n_samples] + Y_DTYPE_C [::1] ordered_hessians = \ + self.ordered_hessians[:n_samples] if root_node: if self.constant_hessian: _build_histogram_root_no_hessian(self.max_bins, X_binned, - ordered_gradients, histogram) + ordered_gradients, histogram) else: _build_histogram_root(self.max_bins, X_binned, ordered_gradients, @@ -471,10 +470,12 @@ cdef class Splitter: else: if self.constant_hessian: _build_histogram_no_hessian(self.max_bins, sample_indices, - X_binned, ordered_gradients, histogram) + X_binned, ordered_gradients, + histogram) else: _build_histogram(self.max_bins, sample_indices, X_binned, - ordered_gradients, ordered_hessians, histogram) + ordered_gradients, ordered_hessians, + histogram) def find_node_split_subtraction( Splitter self, @@ -489,19 +490,24 @@ cdef class Splitter: Returns the best split info among all features, and the histograms of all the features. - This does the same job as ``find_node_split()`` but uses the histograms - of the parent and sibling of the node to split. This allows to use the - identity: ``histogram(parent) = histogram(node) - histogram(sibling)``, - which is significantly faster than computing the histograms from data. + This does the same job as ``find_node_split()`` but uses the + histograms of the parent and sibling of the node to split. This + allows to use the identity: ``histogram(parent) = histogram(node) - + histogram(sibling)``, which is significantly faster than computing + the histograms from data. - Returns the best SplitInfo among all features, along with all the feature - histograms that can be latter used to compute the sibling or children - histograms by substraction. + Returns the best SplitInfo among all features, along with all the + feature histograms that can be later used to compute the sibling or + children histograms by substraction. Parameters ---------- sample_indices : array of int The indices of the samples at the node to split. + sum_gradients : float + Sum of the samples gradients at the current node + sum_hessians : float + Sum of the samples hessians at the current node parent_histograms : array of HISTOGRAM_DTYPE of shape(n_features, max_bins) The histograms of the parent sibling_histograms : array of HISTOGRAM_DTYPE of \ @@ -515,10 +521,6 @@ cdef class Splitter: ------- best_split_info : SplitInfo The info about the best possible split among all features. - histograms : array of HISTOGRAM_DTYPE, shape=(n_features, max_bins) - The histograms of each feature. 
A histogram is an array of - HISTOGRAM_DTYPE of size ``max_bins`` (only - ``n_bins_per_features[feature]`` entries are relevant). """ cdef: @@ -566,6 +568,7 @@ cdef class Splitter: cdef split_info_struct _find_best_feature_to_split_helper(self, split_info_struct * split_infos # IN ) nogil: + """Returns the best split_info among those in splits_infos.""" cdef: Y_DTYPE_C gain Y_DTYPE_C best_gain @@ -589,12 +592,13 @@ cdef class Splitter: unsigned int n_samples, Y_DTYPE_C sum_gradients, Y_DTYPE_C sum_hessians) nogil: - """Find best bin to split on, and return the corresponding SplitInfo. + """Find best bin to split on for a given feature. - Splits that do not satisfy the splitting constraints (min_gain_to_split, - etc.) are discarded here. If no split can satisfy the constraints, a - SplitInfo with a gain of -1 is returned. If for a given node the best - SplitInfo has a gain of -1, it is finalized into a leaf. + Splits that do not satisfy the splitting constraints + (min_gain_to_split, etc.) are discarded here. If no split can + satisfy the constraints, a SplitInfo with a gain of -1 is returned. + If for a given node the best SplitInfo has a gain of -1, it is + finalized into a leaf. """ cdef: unsigned int bin_idx @@ -656,7 +660,8 @@ cdef class Splitter: return best_split - # Only used for tests... not great + # Only used for tests (python code cannot use cdef functions) + # Not sure if this is a good practice... def find_best_split_wrapper( self, unsigned int feature_idx, diff --git a/sklearn/gbm/tests/test_binning.py b/sklearn/gbm/tests/test_binning.py index 3da62073e2267..3e72a15d259c9 100644 --- a/sklearn/gbm/tests/test_binning.py +++ b/sklearn/gbm/tests/test_binning.py @@ -8,7 +8,6 @@ from sklearn.gbm.types import X_DTYPE, X_BINNED_DTYPE - DATA = np.random.RandomState(42).normal( loc=[0, 10], scale=[1, 0.01], size=(int(1e6), 2) ).astype(X_DTYPE) diff --git a/sklearn/gbm/tests/test_compare_lightgbm.py b/sklearn/gbm/tests/test_compare_lightgbm.py index 78e294af59f3e..15b63febe0297 100644 --- a/sklearn/gbm/tests/test_compare_lightgbm.py +++ b/sklearn/gbm/tests/test_compare_lightgbm.py @@ -103,12 +103,12 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples, X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) est_pygbm = GBMClassifier(loss='binary_crossentropy', - max_iter=max_iter, - max_bins=max_bins, - learning_rate=1, - n_iter_no_change=None, - min_samples_leaf=min_samples_leaf, - max_leaf_nodes=max_leaf_nodes) + max_iter=max_iter, + max_bins=max_bins, + learning_rate=1, + n_iter_no_change=None, + min_samples_leaf=min_samples_leaf, + max_leaf_nodes=max_leaf_nodes) est_lightgbm = get_lightgbm_estimator(est_pygbm) est_lightgbm.fit(X_train, y_train) @@ -164,12 +164,12 @@ def test_same_predictions_multiclass_classification( X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) est_pygbm = GBMClassifier(loss='categorical_crossentropy', - max_iter=max_iter, - max_bins=max_bins, - learning_rate=lr, - n_iter_no_change=None, - min_samples_leaf=min_samples_leaf, - max_leaf_nodes=max_leaf_nodes) + max_iter=max_iter, + max_bins=max_bins, + learning_rate=lr, + n_iter_no_change=None, + min_samples_leaf=min_samples_leaf, + max_leaf_nodes=max_leaf_nodes) est_lightgbm = get_lightgbm_estimator(est_pygbm) est_lightgbm.fit(X_train, y_train) diff --git a/sklearn/gbm/tests/test_gradient_boosting.py b/sklearn/gbm/tests/test_gradient_boosting.py index e5add16269d9e..11b2f62686eb8 100644 --- a/sklearn/gbm/tests/test_gradient_boosting.py +++ 
b/sklearn/gbm/tests/test_gradient_boosting.py @@ -1,8 +1,3 @@ -import os -import warnings - -import numpy as np -from numpy.testing import assert_allclose import pytest from sklearn.utils.testing import assert_raises_regex from sklearn.datasets import make_classification, make_regression @@ -10,7 +5,6 @@ from sklearn.gbm import GBMClassifier from sklearn.gbm import GBMRegressor -from sklearn.gbm.binning import BinMapper X_classification, y_classification = make_classification(random_state=0) @@ -108,12 +102,12 @@ def test_early_stopping_regression(scoring, validation_split, X, y = make_regression(random_state=0) gb = GBMRegressor(verbose=1, # just for coverage - scoring=scoring, - tol=tol, - validation_split=validation_split, - max_iter=max_iter, - n_iter_no_change=n_iter_no_change, - random_state=0) + scoring=scoring, + tol=tol, + validation_split=validation_split, + max_iter=max_iter, + n_iter_no_change=n_iter_no_change, + random_state=0) gb.fit(X, y) if n_iter_no_change is not None: @@ -141,12 +135,12 @@ def test_early_stopping_classification(data, scoring, validation_split, X, y = data gb = GBMClassifier(verbose=1, # just for coverage - scoring=scoring, - tol=tol, - validation_split=validation_split, - max_iter=max_iter, - n_iter_no_change=n_iter_no_change, - random_state=0) + scoring=scoring, + tol=tol, + validation_split=validation_split, + max_iter=max_iter, + n_iter_no_change=n_iter_no_change, + random_state=0) gb.fit(X, y) if n_iter_no_change is not None: @@ -159,7 +153,7 @@ def test_should_stop(): def should_stop(scores, n_iter_no_change, tol): gbdt = GBMClassifier(n_iter_no_change=n_iter_no_change, - tol=tol) + tol=tol) return gbdt._should_stop(scores) # not enough iterations diff --git a/sklearn/gbm/tests/test_grower.py b/sklearn/gbm/tests/test_grower.py index 19ff05534ee74..574821fce4c58 100644 --- a/sklearn/gbm/tests/test_grower.py +++ b/sklearn/gbm/tests/test_grower.py @@ -141,7 +141,8 @@ def test_grow_tree(n_bins, constant_hessian, stopping_param, shrinkage): # Check the values of the leaves: assert grower.root.left_child.value == approx(shrinkage) assert grower.root.right_child.left_child.value == approx(shrinkage) - assert grower.root.right_child.right_child.value == approx(-shrinkage, rel=1e-3) + assert grower.root.right_child.right_child.value == approx(-shrinkage, + rel=1e-3) @pytest.mark.skip('Removed predict_binned') diff --git a/sklearn/gbm/tests/test_histogram.py b/sklearn/gbm/tests/test_histogram.py index d94c82c7ea33e..730a7e8b763a5 100644 --- a/sklearn/gbm/tests/test_histogram.py +++ b/sklearn/gbm/tests/test_histogram.py @@ -51,7 +51,8 @@ def test_histogram_sample_order_independence(): n_samples = 1000 n_bins = 256 - binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=X_BINNED_DTYPE) + binned_feature = rng.randint(0, n_bins - 1, size=n_samples, + dtype=X_BINNED_DTYPE) sample_indices = rng.choice(np.arange(n_samples, dtype=np.uint32), n_sub_samples, replace=False) ordered_gradients = rng.randn(n_sub_samples).astype(Y_DTYPE) @@ -144,8 +145,8 @@ def test_hist_subtraction(constant_hessian): _build_histogram_no_hessian(n_bins, sample_indices, binned_feature, ordered_gradients, hist_parent) else: - _build_histogram(n_bins, sample_indices, binned_feature, - ordered_gradients, ordered_hessians, hist_parent) + _build_histogram(n_bins, sample_indices, binned_feature, + ordered_gradients, ordered_hessians, hist_parent) mask = rng.randint(0, 2, n_samples).astype(np.bool) diff --git a/sklearn/gbm/tests/test_loss.py b/sklearn/gbm/tests/test_loss.py index 
8e00d63e6b384..a4bdb51aaa27b 100644 --- a/sklearn/gbm/tests/test_loss.py +++ b/sklearn/gbm/tests/test_loss.py @@ -2,7 +2,6 @@ from numpy.testing import assert_almost_equal import scipy from scipy.optimize import newton -from scipy.special import logsumexp from sklearn.utils import assert_all_finite import pytest diff --git a/sklearn/gbm/tests/test_predictor.py b/sklearn/gbm/tests/test_predictor.py index 36dcc4f9f8634..be934e52e1f9a 100644 --- a/sklearn/gbm/tests/test_predictor.py +++ b/sklearn/gbm/tests/test_predictor.py @@ -1,5 +1,4 @@ import numpy as np -from numpy.testing import assert_allclose from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split from sklearn.metrics import r2_score diff --git a/sklearn/gbm/tests/test_splitting.py b/sklearn/gbm/tests/test_splitting.py index 2d5da80e38d94..8521cb034b939 100644 --- a/sklearn/gbm/tests/test_splitting.py +++ b/sklearn/gbm/tests/test_splitting.py @@ -25,7 +25,6 @@ def test_histogram_split(n_bins): all_hessians = ordered_hessians sum_hessians = all_hessians.sum() - for true_bin in range(1, n_bins - 1): for sign in [-1, 1]: ordered_gradients = np.full_like(binned_feature, sign, @@ -37,12 +36,12 @@ def test_histogram_split(n_bins): n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) splitter = Splitter(X_binned, - n_bins, - n_bins_per_feature, - all_gradients, all_hessians, - l2_regularization, - min_hessian_to_split, - min_samples_leaf, min_gain_to_split) + n_bins, + n_bins_per_feature, + all_gradients, all_hessians, + l2_regularization, + min_hessian_to_split, + min_samples_leaf, min_gain_to_split) histogram = np.zeros(shape=(n_bins), dtype=HISTOGRAM_DTYPE) split_info = splitter.find_best_split_wrapper( @@ -93,8 +92,10 @@ def test_split_vs_split_subtraction(constant_hessian): hists_parent = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) hists_left = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) hists_right = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) - hists_left_sub = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) - hists_right_sub = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) + hists_left_sub = np.zeros(shape=(n_features, n_bins), + dtype=HISTOGRAM_DTYPE) + hists_right_sub = np.zeros(shape=(n_features, n_bins), + dtype=HISTOGRAM_DTYPE) # first split parent, left and right with classical method si_parent = splitter.find_node_split(sample_indices, hists_parent) @@ -171,17 +172,18 @@ def test_gradient_and_hessian_sanity(constant_hessian): n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) splitter = Splitter(X_binned, n_bins, - n_bins_per_feature, - all_gradients, all_hessians, - l2_regularization, min_hessian_to_split, - min_samples_leaf, min_gain_to_split) + n_bins_per_feature, + all_gradients, all_hessians, + l2_regularization, min_hessian_to_split, + min_samples_leaf, min_gain_to_split) hists_parent = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) hists_left = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) hists_right = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) - hists_left_sub = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) - hists_right_sub = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) - + hists_left_sub = np.zeros(shape=(n_features, n_bins), + dtype=HISTOGRAM_DTYPE) + hists_right_sub = np.zeros(shape=(n_features, n_bins), + dtype=HISTOGRAM_DTYPE) # first split parent, left and right with classical 
method si_parent = splitter.find_node_split(sample_indices, hists_parent) sample_indices_left, sample_indices_right, _ = splitter.split_indices( @@ -274,10 +276,10 @@ def test_split_indices(): n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) splitter = Splitter(X_binned, n_bins, - n_bins_per_feature, - all_gradients, all_hessians, - l2_regularization, min_hessian_to_split, - min_samples_leaf, min_gain_to_split) + n_bins_per_feature, + all_gradients, all_hessians, + l2_regularization, min_hessian_to_split, + min_samples_leaf, min_gain_to_split) assert_array_almost_equal(sample_indices, splitter.partition) @@ -329,10 +331,10 @@ def test_min_gain_to_split(): n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) splitter = Splitter(X_binned, n_bins, n_bins_per_feature, - all_gradients, all_hessians, - l2_regularization, - min_hessian_to_split, - min_samples_leaf, min_gain_to_split) + all_gradients, all_hessians, + l2_regularization, + min_hessian_to_split, + min_samples_leaf, min_gain_to_split) histogram = np.zeros(shape=(n_bins), dtype=HISTOGRAM_DTYPE) split_info = splitter.find_best_split_wrapper( From 628ea6148ef5cf4345990a39d7dffcc14d18beaf Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 17 Jan 2019 07:46:14 -0500 Subject: [PATCH 045/247] fixed test segfault --- sklearn/gbm/binning.pyx | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sklearn/gbm/binning.pyx b/sklearn/gbm/binning.pyx index 7abd49013a36d..c741aa9b48188 100644 --- a/sklearn/gbm/binning.pyx +++ b/sklearn/gbm/binning.pyx @@ -17,6 +17,7 @@ cimport numpy as np from cython.parallel import prange from ..utils import check_random_state, check_array +from ..utils.validation import check_is_fitted from ..base import BaseEstimator, TransformerMixin from .types import X_DTYPE, X_BINNED_DTYPE from .types cimport X_DTYPE_C, X_BINNED_DTYPE_C @@ -182,6 +183,13 @@ class BinMapper(BaseEstimator, TransformerMixin): The binned data """ X = check_array(X, dtype=[X_DTYPE]) + check_is_fitted(self, ['bin_thresholds_', 'n_bins_per_feature_']) + if X.shape[1] != self.n_bins_per_feature_.shape[0]: + raise ValueError( + 'This estimator was fitted with {} features but {} got passed ' + 'to transform()'.format(self.n_bins_per_feature_.shape[0], + X.shape[1]) + ) binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order='F') _map_to_bins(X, self.bin_thresholds_, binned) return binned From 5d8c21ad9e6efac3228df176b0cbabe8d024602d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 17 Jan 2019 08:43:33 -0500 Subject: [PATCH 046/247] init file for tests --- sklearn/gbm/tests/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 sklearn/gbm/tests/__init__.py diff --git a/sklearn/gbm/tests/__init__.py b/sklearn/gbm/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d From 35343f294d70dca0d527db984293a375ded0afe9 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 17 Jan 2019 08:49:32 -0500 Subject: [PATCH 047/247] renamed estimators --- bench_predict.py | 8 ++++---- gdb_test.py | 8 ++++---- sklearn/gbm/__init__.py | 6 +++--- sklearn/gbm/gradient_boosting.py | 13 +++++++------ sklearn/gbm/tests/test_compare_lightgbm.py | 8 ++++---- sklearn/gbm/tests/test_gradient_boosting.py | 16 ++++++++-------- sklearn/gbm/utils.py | 4 ++-- 7 files changed, 32 insertions(+), 31 deletions(-) diff --git a/bench_predict.py b/bench_predict.py index 5738678f4ab02..8bf2e776fa65a 100644 --- a/bench_predict.py +++ b/bench_predict.py @@ -10,8 +10,8 @@ import 
matplotlib.pyplot as plt from sklearn.datasets import make_regression, make_classification -from sklearn.gbm import GBMRegressor -from sklearn.gbm import GBMClassifier +from sklearn.gbm import FastGradientBoostingRegressor +from sklearn.gbm import FastGradientBoostingClassifier classif = False n_classes = 3 @@ -24,12 +24,12 @@ X, y = make_classification(n_samples=n_samples, n_features=n_features, random_state=0, n_classes=n_classes, n_clusters_per_class=1) - GBM = GBMClassifier + GBM = FastGradientBoostingClassifier PYGBM_GBM = pygbm.GradientBoostingClassifier else: X, y = make_regression(n_samples=n_samples, n_features=n_features, random_state=0) - GBM = GBMRegressor + GBM = FastGradientBoostingRegressor PYGBM_GBM = pygbm.GradientBoostingRegressor diff --git a/gdb_test.py b/gdb_test.py index d45c3956c3438..d8282433cc9bd 100644 --- a/gdb_test.py +++ b/gdb_test.py @@ -3,8 +3,8 @@ from sklearn.datasets import make_regression, make_classification from sklearn.ensemble import GradientBoostingRegressor from sklearn.ensemble import GradientBoostingClassifier -from sklearn.gbm import GBMRegressor -from sklearn.gbm import GBMClassifier +from sklearn.gbm import FastGradientBoostingRegressor +from sklearn.gbm import FastGradientBoostingClassifier import pstats import cProfile @@ -18,12 +18,12 @@ if classif: X, y = make_classification(n_samples=n_samples, n_features=n_features, random_state=0, n_classes=n_classes, n_clusters_per_class=1) - GBM = GBMClassifier + GBM = FastGradientBoostingClassifier GBDT = GradientBoostingClassifier PYGBM_GBM = pygbm.GradientBoostingClassifier else: X, y = make_regression(n_samples=n_samples, n_features=n_features, random_state=0) - GBM = GBMRegressor + GBM = FastGradientBoostingRegressor GBDT = GradientBoostingRegressor PYGBM_GBM = pygbm.GradientBoostingRegressor diff --git a/sklearn/gbm/__init__.py b/sklearn/gbm/__init__.py index d50ebe248451f..da843a6213b9b 100644 --- a/sklearn/gbm/__init__.py +++ b/sklearn/gbm/__init__.py @@ -1,4 +1,4 @@ -from .gradient_boosting import GradientBoostingClassifier as GBMClassifier -from .gradient_boosting import GradientBoostingRegressor as GBMRegressor +from .gradient_boosting import FastGradientBoostingClassifier +from .gradient_boosting import FastGradientBoostingRegressor -__all__ = ["GBMClassifier", "GBMRegressor"] \ No newline at end of file +__all__ = ["FastGradientBoostingClassifier", "FastGradientBoostingRegressor"] \ No newline at end of file diff --git a/sklearn/gbm/gradient_boosting.py b/sklearn/gbm/gradient_boosting.py index 206039500327c..98e94e25b67cd 100644 --- a/sklearn/gbm/gradient_boosting.py +++ b/sklearn/gbm/gradient_boosting.py @@ -20,8 +20,8 @@ from .loss import _LOSSES -class BaseGradientBoostingMachine(BaseEstimator, ABC): - """Base class for gradient boosting estimators.""" +class BaseFastGradientBoosting(BaseEstimator, ABC): + """Base class for fast gradient boosting estimators.""" @abstractmethod def __init__(self, loss, learning_rate, max_iter, max_leaf_nodes, @@ -401,7 +401,7 @@ def n_iter_(self): return len(self.predictors_) -class GradientBoostingRegressor(BaseGradientBoostingMachine, RegressorMixin): +class FastGradientBoostingRegressor(BaseFastGradientBoosting, RegressorMixin): """Scikit-learn compatible Gradient Boosting Tree for regression. 
Parameters @@ -479,7 +479,7 @@ def __init__(self, loss='least_squares', learning_rate=0.1, min_samples_leaf=20, l2_regularization=0., max_bins=256, scoring=None, validation_split=0.1, n_iter_no_change=5, tol=1e-7, verbose=0, random_state=None): - super(GradientBoostingRegressor, self).__init__( + super(FastGradientBoostingRegressor, self).__init__( loss=loss, learning_rate=learning_rate, max_iter=max_iter, max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, @@ -516,7 +516,8 @@ def _get_loss(self): return _LOSSES[self.loss]() -class GradientBoostingClassifier(BaseGradientBoostingMachine, ClassifierMixin): +class FastGradientBoostingClassifier(BaseFastGradientBoosting, + ClassifierMixin): """Scikit-learn compatible Gradient Boosting Tree for classification. Parameters @@ -598,7 +599,7 @@ def __init__(self, loss='auto', learning_rate=0.1, max_iter=100, l2_regularization=0., max_bins=256, scoring=None, validation_split=0.1, n_iter_no_change=5, tol=1e-7, verbose=0, random_state=None): - super(GradientBoostingClassifier, self).__init__( + super(FastGradientBoostingClassifier, self).__init__( loss=loss, learning_rate=learning_rate, max_iter=max_iter, max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, diff --git a/sklearn/gbm/tests/test_compare_lightgbm.py b/sklearn/gbm/tests/test_compare_lightgbm.py index 15b63febe0297..dbc6da9714ead 100644 --- a/sklearn/gbm/tests/test_compare_lightgbm.py +++ b/sklearn/gbm/tests/test_compare_lightgbm.py @@ -4,7 +4,7 @@ import numpy as np import pytest -from sklearn.gbm import GBMRegressor, GBMClassifier +from sklearn.gbm import FastGradientBoostingRegressor, FastGradientBoostingClassifier from sklearn.gbm.binning import BinMapper from sklearn.gbm.utils import get_lightgbm_estimator @@ -51,7 +51,7 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples, X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) - est_sklearn = GBMRegressor(max_iter=max_iter, + est_sklearn = FastGradientBoostingRegressor(max_iter=max_iter, max_bins=max_bins, learning_rate=1, n_iter_no_change=None, @@ -102,7 +102,7 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples, X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) - est_pygbm = GBMClassifier(loss='binary_crossentropy', + est_pygbm = FastGradientBoostingClassifier(loss='binary_crossentropy', max_iter=max_iter, max_bins=max_bins, learning_rate=1, @@ -163,7 +163,7 @@ def test_same_predictions_multiclass_classification( X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) - est_pygbm = GBMClassifier(loss='categorical_crossentropy', + est_pygbm = FastGradientBoostingClassifier(loss='categorical_crossentropy', max_iter=max_iter, max_bins=max_bins, learning_rate=lr, diff --git a/sklearn/gbm/tests/test_gradient_boosting.py b/sklearn/gbm/tests/test_gradient_boosting.py index 11b2f62686eb8..b44e2fdee55d3 100644 --- a/sklearn/gbm/tests/test_gradient_boosting.py +++ b/sklearn/gbm/tests/test_gradient_boosting.py @@ -3,8 +3,8 @@ from sklearn.datasets import make_classification, make_regression from sklearn.utils.estimator_checks import check_estimator -from sklearn.gbm import GBMClassifier -from sklearn.gbm import GBMRegressor +from sklearn.gbm import FastGradientBoostingClassifier +from sklearn.gbm import FastGradientBoostingRegressor X_classification, y_classification = make_classification(random_state=0) @@ -13,7 +13,7 @@ 
@pytest.mark.parametrize('GradientBoosting, X, y', [ # (GBMClassifier, X_classification, y_classification), TODO: unskip - (GBMRegressor, X_regression, y_regression) + (FastGradientBoostingRegressor, X_regression, y_regression) ]) def test_init_parameters_validation(GradientBoosting, X, y): @@ -101,7 +101,7 @@ def test_early_stopping_regression(scoring, validation_split, X, y = make_regression(random_state=0) - gb = GBMRegressor(verbose=1, # just for coverage + gb = FastGradientBoostingRegressor(verbose=1, # just for coverage scoring=scoring, tol=tol, validation_split=validation_split, @@ -134,7 +134,7 @@ def test_early_stopping_classification(data, scoring, validation_split, X, y = data - gb = GBMClassifier(verbose=1, # just for coverage + gb = FastGradientBoostingClassifier(verbose=1, # just for coverage scoring=scoring, tol=tol, validation_split=validation_split, @@ -152,7 +152,7 @@ def test_early_stopping_classification(data, scoring, validation_split, def test_should_stop(): def should_stop(scores, n_iter_no_change, tol): - gbdt = GBMClassifier(n_iter_no_change=n_iter_no_change, + gbdt = FastGradientBoostingClassifier(n_iter_no_change=n_iter_no_change, tol=tol) return gbdt._should_stop(scores) @@ -176,8 +176,8 @@ def should_stop(scores, n_iter_no_change, tol): @pytest.mark.parametrize('Estimator', ( - GBMRegressor(), - GBMClassifier(scoring=None, validation_split=None, min_samples_leaf=5), + FastGradientBoostingRegressor(), + FastGradientBoostingClassifier(scoring=None, validation_split=None, min_samples_leaf=5), )) def test_estimator_checks(Estimator): # Run the check_estimator() test suite on GBRegressor and GBClassifier. diff --git a/sklearn/gbm/utils.py b/sklearn/gbm/utils.py index 7b0239b0e22b1..ea5454dbcf397 100644 --- a/sklearn/gbm/utils.py +++ b/sklearn/gbm/utils.py @@ -12,7 +12,7 @@ def get_lightgbm_estimator(pygbm_estimator): from lightgbm import LGBMClassifier # Import here to avoid cyclic dependencies - from .gradient_boosting import GradientBoostingClassifier + from .gradient_boosting import FastGradientBoostingClassifier pygbm_params = pygbm_estimator.get_params() @@ -51,7 +51,7 @@ def get_lightgbm_estimator(pygbm_estimator): lgbm_params['min_sum_hessian_in_leaf'] *= 2 lgbm_params['learning_rate'] *= 2 - if isinstance(pygbm_estimator, GradientBoostingClassifier): + if isinstance(pygbm_estimator, FastGradientBoostingClassifier): Est = LGBMClassifier else: Est = LGBMRegressor From d0f73cd9576e65cafee6e5a5ac45f759bcdac14f Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 17 Jan 2019 09:18:59 -0500 Subject: [PATCH 048/247] made module private and estimators are available in ensemble --- bench_binning.py | 2 +- bench_find_node_split.py | 12 +++---- bench_hist.py | 20 +++++------ bench_predict.py | 4 +-- bench_split_indices.py | 14 ++++---- gdb_test.py | 4 +-- sklearn/__init__.py | 1 - .../__init__.py | 0 .../_gradient_boosting.pyx | 0 .../binning.pyx | 0 .../{gbm => _fast_gradient_boosting}/fun.py | 0 .../gradient_boosting.py | 0 .../grower.py | 0 .../histogram.pxd | 0 .../histogram.pyx | 0 .../{gbm => _fast_gradient_boosting}/loss.pyx | 0 .../playground.pyx | 0 .../predictor.pyx | 0 .../{gbm => _fast_gradient_boosting}/setup.py | 2 +- .../splitting.pyx | 0 .../tests/__init__.py | 0 .../tests/test_binning.py | 8 ++--- .../tests/test_compare_lightgbm.py | 6 ++-- .../tests/test_gradient_boosting.py | 4 +-- .../tests/test_grower.py | 8 ++--- .../tests/test_histogram.py | 18 +++++----- .../tests/test_loss.py | 4 +-- .../tests/test_predictor.py | 6 ++-- 
.../tests/test_splitting.py | 8 ++--- .../types.pxd | 0 .../types.pyx | 0 .../{gbm => _fast_gradient_boosting}/utils.py | 0 sklearn/ensemble/__init__.py | 5 ++- sklearn/ensemble/setup.py | 34 ------------------- sklearn/setup.py | 2 +- 35 files changed, 65 insertions(+), 97 deletions(-) rename sklearn/{gbm => _fast_gradient_boosting}/__init__.py (100%) rename sklearn/{gbm => _fast_gradient_boosting}/_gradient_boosting.pyx (100%) rename sklearn/{gbm => _fast_gradient_boosting}/binning.pyx (100%) rename sklearn/{gbm => _fast_gradient_boosting}/fun.py (100%) rename sklearn/{gbm => _fast_gradient_boosting}/gradient_boosting.py (100%) rename sklearn/{gbm => _fast_gradient_boosting}/grower.py (100%) rename sklearn/{gbm => _fast_gradient_boosting}/histogram.pxd (100%) rename sklearn/{gbm => _fast_gradient_boosting}/histogram.pyx (100%) rename sklearn/{gbm => _fast_gradient_boosting}/loss.pyx (100%) rename sklearn/{gbm => _fast_gradient_boosting}/playground.pyx (100%) rename sklearn/{gbm => _fast_gradient_boosting}/predictor.pyx (100%) rename sklearn/{gbm => _fast_gradient_boosting}/setup.py (96%) rename sklearn/{gbm => _fast_gradient_boosting}/splitting.pyx (100%) rename sklearn/{gbm => _fast_gradient_boosting}/tests/__init__.py (100%) rename sklearn/{gbm => _fast_gradient_boosting}/tests/test_binning.py (96%) rename sklearn/{gbm => _fast_gradient_boosting}/tests/test_compare_lightgbm.py (97%) rename sklearn/{gbm => _fast_gradient_boosting}/tests/test_gradient_boosting.py (98%) rename sklearn/{gbm => _fast_gradient_boosting}/tests/test_grower.py (97%) rename sklearn/{gbm => _fast_gradient_boosting}/tests/test_histogram.py (92%) rename sklearn/{gbm => _fast_gradient_boosting}/tests/test_loss.py (98%) rename sklearn/{gbm => _fast_gradient_boosting}/tests/test_predictor.py (86%) rename sklearn/{gbm => _fast_gradient_boosting}/tests/test_splitting.py (98%) rename sklearn/{gbm => _fast_gradient_boosting}/types.pxd (100%) rename sklearn/{gbm => _fast_gradient_boosting}/types.pyx (100%) rename sklearn/{gbm => _fast_gradient_boosting}/utils.py (100%) diff --git a/bench_binning.py b/bench_binning.py index ba74ef500138c..6748487f12e19 100644 --- a/bench_binning.py +++ b/bench_binning.py @@ -9,7 +9,7 @@ import matplotlib.pyplot as plt from sklearn.datasets import make_regression -from sklearn.ensemble.gbm.binning import BinMapper +from sklearn._fast_gradient_boosting.binning import BinMapper n_features = 5 diff --git a/bench_find_node_split.py b/bench_find_node_split.py index a476d9a2790b7..6433fa8ffddab 100644 --- a/bench_find_node_split.py +++ b/bench_find_node_split.py @@ -3,12 +3,12 @@ import numpy as np import matplotlib.pyplot as plt -from sklearn.gbm.types import HISTOGRAM_DTYPE -from sklearn.gbm.types import X_DTYPE -from sklearn.gbm.types import X_BINNED_DTYPE -from sklearn.gbm.types import Y_DTYPE -from sklearn.gbm.splitting import SplittingContext -from sklearn.gbm.splitting import find_node_split +from sklearn._fast_gradient_boosting.types import HISTOGRAM_DTYPE +from sklearn._fast_gradient_boosting.types import X_DTYPE +from sklearn._fast_gradient_boosting.types import X_BINNED_DTYPE +from sklearn._fast_gradient_boosting.types import Y_DTYPE +from sklearn._fast_gradient_boosting.splitting import SplittingContext +from sklearn._fast_gradient_boosting.splitting import find_node_split from pygbm.splitting import SplittingContext as SplittingContext_pygbm from pygbm.splitting import find_node_split as find_node_split_pygbm diff --git a/bench_hist.py b/bench_hist.py index 
aa16ef2e13d58..6156db2317e30 100644 --- a/bench_hist.py +++ b/bench_hist.py @@ -17,16 +17,16 @@ from pygbm.histogram import _build_histogram_root_no_hessian as pygbm_build_histogram_root_no_hessian from pygbm.histogram import _subtract_histograms as pygbm_subtract_histograms -from sklearn.gbm.histogram import _build_histogram_naive -from sklearn.gbm.histogram import _build_histogram -from sklearn.gbm.histogram import _build_histogram_no_hessian -from sklearn.gbm.histogram import _build_histogram_root -from sklearn.gbm.histogram import _build_histogram_root_no_hessian -from sklearn.gbm.histogram import _subtract_histograms -from sklearn.gbm.types import HISTOGRAM_DTYPE -from sklearn.gbm.types import X_DTYPE -from sklearn.gbm.types import X_BINNED_DTYPE -from sklearn.gbm.types import Y_DTYPE +from sklearn._fast_gradient_boosting.histogram import _build_histogram_naive +from sklearn._fast_gradient_boosting.histogram import _build_histogram +from sklearn._fast_gradient_boosting.histogram import _build_histogram_no_hessian +from sklearn._fast_gradient_boosting.histogram import _build_histogram_root +from sklearn._fast_gradient_boosting.histogram import _build_histogram_root_no_hessian +from sklearn._fast_gradient_boosting.histogram import _subtract_histograms +from sklearn._fast_gradient_boosting.types import HISTOGRAM_DTYPE +from sklearn._fast_gradient_boosting.types import X_DTYPE +from sklearn._fast_gradient_boosting.types import X_BINNED_DTYPE +from sklearn._fast_gradient_boosting.types import Y_DTYPE m = Memory(location='/tmp') diff --git a/bench_predict.py b/bench_predict.py index 8bf2e776fa65a..cf47d9660b17e 100644 --- a/bench_predict.py +++ b/bench_predict.py @@ -10,8 +10,8 @@ import matplotlib.pyplot as plt from sklearn.datasets import make_regression, make_classification -from sklearn.gbm import FastGradientBoostingRegressor -from sklearn.gbm import FastGradientBoostingClassifier +from sklearn._fast_gradient_boosting import FastGradientBoostingRegressor +from sklearn._fast_gradient_boosting import FastGradientBoostingClassifier classif = False n_classes = 3 diff --git a/bench_split_indices.py b/bench_split_indices.py index a15612a49b4a2..f53d69269805f 100644 --- a/bench_split_indices.py +++ b/bench_split_indices.py @@ -3,13 +3,13 @@ import numpy as np import matplotlib.pyplot as plt -from sklearn.gbm.types import HISTOGRAM_DTYPE -from sklearn.gbm.types import X_DTYPE -from sklearn.gbm.types import X_BINNED_DTYPE -from sklearn.gbm.types import Y_DTYPE -from sklearn.gbm.splitting import SplittingContext -from sklearn.gbm.splitting import find_node_split -from sklearn.gbm.splitting import split_indices +from sklearn._fast_gradient_boosting.types import HISTOGRAM_DTYPE +from sklearn._fast_gradient_boosting.types import X_DTYPE +from sklearn._fast_gradient_boosting.types import X_BINNED_DTYPE +from sklearn._fast_gradient_boosting.types import Y_DTYPE +from sklearn._fast_gradient_boosting.splitting import SplittingContext +from sklearn._fast_gradient_boosting.splitting import find_node_split +from sklearn._fast_gradient_boosting.splitting import split_indices from pygbm.splitting import SplittingContext as SplittingContext_pygbm from pygbm.splitting import find_node_split as find_node_split_pygbm from pygbm.splitting import split_indices as split_indices_pygbm diff --git a/gdb_test.py b/gdb_test.py index d8282433cc9bd..361907ea41d8e 100644 --- a/gdb_test.py +++ b/gdb_test.py @@ -3,8 +3,8 @@ from sklearn.datasets import make_regression, make_classification from sklearn.ensemble import 
GradientBoostingRegressor from sklearn.ensemble import GradientBoostingClassifier -from sklearn.gbm import FastGradientBoostingRegressor -from sklearn.gbm import FastGradientBoostingClassifier +from sklearn.ensemble import FastGradientBoostingRegressor +from sklearn.ensemble import FastGradientBoostingClassifier import pstats import cProfile diff --git a/sklearn/__init__.py b/sklearn/__init__.py index da851e6483f72..aafc8a34b2a13 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -75,7 +75,6 @@ 'naive_bayes', 'neighbors', 'neural_network', 'pipeline', 'preprocessing', 'random_projection', 'semi_supervised', 'svm', 'tree', 'discriminant_analysis', 'impute', 'compose', - 'gbm', # Non-modules: 'clone', 'get_config', 'set_config', 'config_context', 'show_versions'] diff --git a/sklearn/gbm/__init__.py b/sklearn/_fast_gradient_boosting/__init__.py similarity index 100% rename from sklearn/gbm/__init__.py rename to sklearn/_fast_gradient_boosting/__init__.py diff --git a/sklearn/gbm/_gradient_boosting.pyx b/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx similarity index 100% rename from sklearn/gbm/_gradient_boosting.pyx rename to sklearn/_fast_gradient_boosting/_gradient_boosting.pyx diff --git a/sklearn/gbm/binning.pyx b/sklearn/_fast_gradient_boosting/binning.pyx similarity index 100% rename from sklearn/gbm/binning.pyx rename to sklearn/_fast_gradient_boosting/binning.pyx diff --git a/sklearn/gbm/fun.py b/sklearn/_fast_gradient_boosting/fun.py similarity index 100% rename from sklearn/gbm/fun.py rename to sklearn/_fast_gradient_boosting/fun.py diff --git a/sklearn/gbm/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py similarity index 100% rename from sklearn/gbm/gradient_boosting.py rename to sklearn/_fast_gradient_boosting/gradient_boosting.py diff --git a/sklearn/gbm/grower.py b/sklearn/_fast_gradient_boosting/grower.py similarity index 100% rename from sklearn/gbm/grower.py rename to sklearn/_fast_gradient_boosting/grower.py diff --git a/sklearn/gbm/histogram.pxd b/sklearn/_fast_gradient_boosting/histogram.pxd similarity index 100% rename from sklearn/gbm/histogram.pxd rename to sklearn/_fast_gradient_boosting/histogram.pxd diff --git a/sklearn/gbm/histogram.pyx b/sklearn/_fast_gradient_boosting/histogram.pyx similarity index 100% rename from sklearn/gbm/histogram.pyx rename to sklearn/_fast_gradient_boosting/histogram.pyx diff --git a/sklearn/gbm/loss.pyx b/sklearn/_fast_gradient_boosting/loss.pyx similarity index 100% rename from sklearn/gbm/loss.pyx rename to sklearn/_fast_gradient_boosting/loss.pyx diff --git a/sklearn/gbm/playground.pyx b/sklearn/_fast_gradient_boosting/playground.pyx similarity index 100% rename from sklearn/gbm/playground.pyx rename to sklearn/_fast_gradient_boosting/playground.pyx diff --git a/sklearn/gbm/predictor.pyx b/sklearn/_fast_gradient_boosting/predictor.pyx similarity index 100% rename from sklearn/gbm/predictor.pyx rename to sklearn/_fast_gradient_boosting/predictor.pyx diff --git a/sklearn/gbm/setup.py b/sklearn/_fast_gradient_boosting/setup.py similarity index 96% rename from sklearn/gbm/setup.py rename to sklearn/_fast_gradient_boosting/setup.py index 1c3cd25c555be..9dba224175bc0 100644 --- a/sklearn/gbm/setup.py +++ b/sklearn/_fast_gradient_boosting/setup.py @@ -3,7 +3,7 @@ def configuration(parent_package="", top_path=None): - config = Configuration("gbm", parent_package, top_path) + config = Configuration("_fast_gradient_boosting", parent_package, top_path) config.add_extension("_gradient_boosting", 
sources=["_gradient_boosting.pyx"], diff --git a/sklearn/gbm/splitting.pyx b/sklearn/_fast_gradient_boosting/splitting.pyx similarity index 100% rename from sklearn/gbm/splitting.pyx rename to sklearn/_fast_gradient_boosting/splitting.pyx diff --git a/sklearn/gbm/tests/__init__.py b/sklearn/_fast_gradient_boosting/tests/__init__.py similarity index 100% rename from sklearn/gbm/tests/__init__.py rename to sklearn/_fast_gradient_boosting/tests/__init__.py diff --git a/sklearn/gbm/tests/test_binning.py b/sklearn/_fast_gradient_boosting/tests/test_binning.py similarity index 96% rename from sklearn/gbm/tests/test_binning.py rename to sklearn/_fast_gradient_boosting/tests/test_binning.py index 3e72a15d259c9..c543a18f16a88 100644 --- a/sklearn/gbm/tests/test_binning.py +++ b/sklearn/_fast_gradient_boosting/tests/test_binning.py @@ -2,10 +2,10 @@ from numpy.testing import assert_array_equal, assert_allclose import pytest -from sklearn.gbm.binning import BinMapper -from sklearn.gbm.binning import _find_binning_thresholds -from sklearn.gbm.binning import _map_to_bins -from sklearn.gbm.types import X_DTYPE, X_BINNED_DTYPE +from sklearn._fast_gradient_boosting.binning import BinMapper +from sklearn._fast_gradient_boosting.binning import _find_binning_thresholds +from sklearn._fast_gradient_boosting.binning import _map_to_bins +from sklearn._fast_gradient_boosting.types import X_DTYPE, X_BINNED_DTYPE DATA = np.random.RandomState(42).normal( diff --git a/sklearn/gbm/tests/test_compare_lightgbm.py b/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py similarity index 97% rename from sklearn/gbm/tests/test_compare_lightgbm.py rename to sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py index dbc6da9714ead..886f973b07ffd 100644 --- a/sklearn/gbm/tests/test_compare_lightgbm.py +++ b/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py @@ -4,9 +4,9 @@ import numpy as np import pytest -from sklearn.gbm import FastGradientBoostingRegressor, FastGradientBoostingClassifier -from sklearn.gbm.binning import BinMapper -from sklearn.gbm.utils import get_lightgbm_estimator +from sklearn.ensemble import FastGradientBoostingRegressor, FastGradientBoostingClassifier +from sklearn._fast_gradient_boosting.binning import BinMapper +from sklearn._fast_gradient_boosting.utils import get_lightgbm_estimator pytest.importorskip("lightgbm") diff --git a/sklearn/gbm/tests/test_gradient_boosting.py b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py similarity index 98% rename from sklearn/gbm/tests/test_gradient_boosting.py rename to sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py index b44e2fdee55d3..c3861e19e29fa 100644 --- a/sklearn/gbm/tests/test_gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py @@ -3,8 +3,8 @@ from sklearn.datasets import make_classification, make_regression from sklearn.utils.estimator_checks import check_estimator -from sklearn.gbm import FastGradientBoostingClassifier -from sklearn.gbm import FastGradientBoostingRegressor +from sklearn.ensemble import FastGradientBoostingClassifier +from sklearn.ensemble import FastGradientBoostingRegressor X_classification, y_classification = make_classification(random_state=0) diff --git a/sklearn/gbm/tests/test_grower.py b/sklearn/_fast_gradient_boosting/tests/test_grower.py similarity index 97% rename from sklearn/gbm/tests/test_grower.py rename to sklearn/_fast_gradient_boosting/tests/test_grower.py index 574821fce4c58..9015cbac40298 100644 --- 
a/sklearn/gbm/tests/test_grower.py +++ b/sklearn/_fast_gradient_boosting/tests/test_grower.py @@ -4,10 +4,10 @@ from pytest import approx from sklearn.utils.testing import assert_raises_regex -from sklearn.gbm.grower import TreeGrower -from sklearn.gbm.binning import BinMapper -from sklearn.gbm.types import X_BINNED_DTYPE -from sklearn.gbm.types import Y_DTYPE +from sklearn._fast_gradient_boosting.grower import TreeGrower +from sklearn._fast_gradient_boosting.binning import BinMapper +from sklearn._fast_gradient_boosting.types import X_BINNED_DTYPE +from sklearn._fast_gradient_boosting.types import Y_DTYPE def _make_training_data(n_bins=256, constant_hessian=True): diff --git a/sklearn/gbm/tests/test_histogram.py b/sklearn/_fast_gradient_boosting/tests/test_histogram.py similarity index 92% rename from sklearn/gbm/tests/test_histogram.py rename to sklearn/_fast_gradient_boosting/tests/test_histogram.py index 730a7e8b763a5..6d18c12329a66 100644 --- a/sklearn/gbm/tests/test_histogram.py +++ b/sklearn/_fast_gradient_boosting/tests/test_histogram.py @@ -4,15 +4,15 @@ from numpy.testing import assert_allclose from numpy.testing import assert_array_equal -from sklearn.gbm.histogram import _build_histogram_naive -from sklearn.gbm.histogram import _build_histogram -from sklearn.gbm.histogram import _build_histogram_no_hessian -from sklearn.gbm.histogram import _build_histogram_root_no_hessian -from sklearn.gbm.histogram import _build_histogram_root -from sklearn.gbm.histogram import _subtract_histograms -from sklearn.gbm.types import HISTOGRAM_DTYPE -from sklearn.gbm.types import Y_DTYPE -from sklearn.gbm.types import X_BINNED_DTYPE +from sklearn._fast_gradient_boosting.histogram import _build_histogram_naive +from sklearn._fast_gradient_boosting.histogram import _build_histogram +from sklearn._fast_gradient_boosting.histogram import _build_histogram_no_hessian +from sklearn._fast_gradient_boosting.histogram import _build_histogram_root_no_hessian +from sklearn._fast_gradient_boosting.histogram import _build_histogram_root +from sklearn._fast_gradient_boosting.histogram import _subtract_histograms +from sklearn._fast_gradient_boosting.types import HISTOGRAM_DTYPE +from sklearn._fast_gradient_boosting.types import Y_DTYPE +from sklearn._fast_gradient_boosting.types import X_BINNED_DTYPE @pytest.mark.parametrize( diff --git a/sklearn/gbm/tests/test_loss.py b/sklearn/_fast_gradient_boosting/tests/test_loss.py similarity index 98% rename from sklearn/gbm/tests/test_loss.py rename to sklearn/_fast_gradient_boosting/tests/test_loss.py index a4bdb51aaa27b..7750fcf999bd2 100644 --- a/sklearn/gbm/tests/test_loss.py +++ b/sklearn/_fast_gradient_boosting/tests/test_loss.py @@ -5,8 +5,8 @@ from sklearn.utils import assert_all_finite import pytest -from sklearn.gbm.loss import _LOSSES -from sklearn.gbm.types import Y_DTYPE +from sklearn._fast_gradient_boosting.loss import _LOSSES +from sklearn._fast_gradient_boosting.types import Y_DTYPE def get_derivatives_helper(loss): diff --git a/sklearn/gbm/tests/test_predictor.py b/sklearn/_fast_gradient_boosting/tests/test_predictor.py similarity index 86% rename from sklearn/gbm/tests/test_predictor.py rename to sklearn/_fast_gradient_boosting/tests/test_predictor.py index be934e52e1f9a..9ee07a2adf439 100644 --- a/sklearn/gbm/tests/test_predictor.py +++ b/sklearn/_fast_gradient_boosting/tests/test_predictor.py @@ -4,9 +4,9 @@ from sklearn.metrics import r2_score import pytest -from sklearn.gbm.binning import BinMapper -from sklearn.gbm.grower import TreeGrower -from 
sklearn.gbm.types import Y_DTYPE +from sklearn._fast_gradient_boosting.binning import BinMapper +from sklearn._fast_gradient_boosting.grower import TreeGrower +from sklearn._fast_gradient_boosting.types import Y_DTYPE @pytest.mark.parametrize('max_bins', [200, 256]) diff --git a/sklearn/gbm/tests/test_splitting.py b/sklearn/_fast_gradient_boosting/tests/test_splitting.py similarity index 98% rename from sklearn/gbm/tests/test_splitting.py rename to sklearn/_fast_gradient_boosting/tests/test_splitting.py index 8521cb034b939..f19af4e43214b 100644 --- a/sklearn/gbm/tests/test_splitting.py +++ b/sklearn/_fast_gradient_boosting/tests/test_splitting.py @@ -3,10 +3,10 @@ from numpy.testing import assert_array_almost_equal import pytest -from sklearn.gbm.types import HISTOGRAM_DTYPE -from sklearn.gbm.types import Y_DTYPE -from sklearn.gbm.types import X_BINNED_DTYPE -from sklearn.gbm.splitting import Splitter +from sklearn._fast_gradient_boosting.types import HISTOGRAM_DTYPE +from sklearn._fast_gradient_boosting.types import Y_DTYPE +from sklearn._fast_gradient_boosting.types import X_BINNED_DTYPE +from sklearn._fast_gradient_boosting.splitting import Splitter @pytest.mark.parametrize('n_bins', [3, 32, 256]) diff --git a/sklearn/gbm/types.pxd b/sklearn/_fast_gradient_boosting/types.pxd similarity index 100% rename from sklearn/gbm/types.pxd rename to sklearn/_fast_gradient_boosting/types.pxd diff --git a/sklearn/gbm/types.pyx b/sklearn/_fast_gradient_boosting/types.pyx similarity index 100% rename from sklearn/gbm/types.pyx rename to sklearn/_fast_gradient_boosting/types.pyx diff --git a/sklearn/gbm/utils.py b/sklearn/_fast_gradient_boosting/utils.py similarity index 100% rename from sklearn/gbm/utils.py rename to sklearn/_fast_gradient_boosting/utils.py diff --git a/sklearn/ensemble/__init__.py b/sklearn/ensemble/__init__.py index 5586a9e1e1fba..282f477c76679 100644 --- a/sklearn/ensemble/__init__.py +++ b/sklearn/ensemble/__init__.py @@ -17,6 +17,8 @@ from .gradient_boosting import GradientBoostingClassifier from .gradient_boosting import GradientBoostingRegressor from .voting_classifier import VotingClassifier +from .._fast_gradient_boosting import FastGradientBoostingClassifier +from .._fast_gradient_boosting import FastGradientBoostingRegressor from . import bagging from . 
import forest @@ -32,4 +34,5 @@ "GradientBoostingRegressor", "AdaBoostClassifier", "AdaBoostRegressor", "VotingClassifier", "bagging", "forest", "gradient_boosting", - "partial_dependence", "weight_boosting"] + "partial_dependence", "weight_boosting", + "FastGradientBoostingClassifier", "FastGradientBoostingRegressor"] diff --git a/sklearn/ensemble/setup.py b/sklearn/ensemble/setup.py index a7cf5789fe608..63a9f25947f91 100644 --- a/sklearn/ensemble/setup.py +++ b/sklearn/ensemble/setup.py @@ -8,40 +8,6 @@ def configuration(parent_package="", top_path=None): sources=["_gradient_boosting.pyx"], include_dirs=[numpy.get_include()]) - # config.add_extension("gbm._gradient_boosting", - # sources=["gbm/_gradient_boosting.pyx"], - # include_dirs=[numpy.get_include()], - # extra_compile_args=['-fopenmp'], - # extra_link_args=['-fopenmp']) - - # config.add_extension("gbm.histogram", - # sources=["gbm/histogram.pyx"], - # include_dirs=[numpy.get_include()]) - - # config.add_extension("gbm.splitting", - # sources=["gbm/splitting.pyx"], - # include_dirs=[numpy.get_include()]) - - # config.add_extension("gbm.binning", - # sources=["gbm/binning.pyx"], - # include_dirs=[numpy.get_include()], - # extra_compile_args=['-fopenmp'], - # extra_link_args=['-fopenmp']) - - # config.add_extension("gbm.predictor", - # sources=["gbm/predictor.pyx"], - # include_dirs=[numpy.get_include()]) - - # config.add_extension("gbm.loss", - # sources=["gbm/loss.pyx"], - # include_dirs=[numpy.get_include()], - # extra_compile_args=['-fopenmp'], - # extra_link_args=['-fopenmp']) - - # config.add_extension("gbm.playground", - # sources=["gbm/playground.pyx"], - # include_dirs=[numpy.get_include()]) - config.add_subpackage("tests") # config.add_data_files("gbm/histogram.pxd") diff --git a/sklearn/setup.py b/sklearn/setup.py index f3a028be45565..6b55407ecc2ce 100644 --- a/sklearn/setup.py +++ b/sklearn/setup.py @@ -56,7 +56,7 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('neighbors') config.add_subpackage('tree') config.add_subpackage('svm') - config.add_subpackage('gbm') + config.add_subpackage('_fast_gradient_boosting') # add cython extension module for isotonic regression config.add_extension('_isotonic', From af23becb0ee2c32344808d29198f99f8be374cd8 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 17 Jan 2019 09:27:32 -0500 Subject: [PATCH 049/247] pep8 --- sklearn/_fast_gradient_boosting/__init__.py | 7 ++- sklearn/_fast_gradient_boosting/grower.py | 2 +- sklearn/_fast_gradient_boosting/setup.py | 1 + .../tests/test_compare_lightgbm.py | 46 ++++++++++--------- .../tests/test_gradient_boosting.py | 32 +++++++------ .../tests/test_histogram.py | 6 ++- sklearn/_fast_gradient_boosting/utils.py | 1 - 7 files changed, 54 insertions(+), 41 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/__init__.py b/sklearn/_fast_gradient_boosting/__init__.py index da843a6213b9b..0318177174f98 100644 --- a/sklearn/_fast_gradient_boosting/__init__.py +++ b/sklearn/_fast_gradient_boosting/__init__.py @@ -1,4 +1,9 @@ +"""This module implements the 'fast' gradient boosting estimators. + +The implementation is a port from pygbm which is itself strongly inspired +from LightGBM. 
+""" from .gradient_boosting import FastGradientBoostingClassifier from .gradient_boosting import FastGradientBoostingRegressor -__all__ = ["FastGradientBoostingClassifier", "FastGradientBoostingRegressor"] \ No newline at end of file +__all__ = ["FastGradientBoostingClassifier", "FastGradientBoostingRegressor"] diff --git a/sklearn/_fast_gradient_boosting/grower.py b/sklearn/_fast_gradient_boosting/grower.py index 7f521776306ab..7d79f2753117b 100644 --- a/sklearn/_fast_gradient_boosting/grower.py +++ b/sklearn/_fast_gradient_boosting/grower.py @@ -393,7 +393,7 @@ def split_next(self): # set start and stop indices left_child_node.start = node.start left_child_node.stop = node.start + right_child_pos - right_child_node.start = left_child_node.stop + right_child_node.start = left_child_node.stop right_child_node.stop = node.stop self.n_nodes += 2 diff --git a/sklearn/_fast_gradient_boosting/setup.py b/sklearn/_fast_gradient_boosting/setup.py index 9dba224175bc0..d65b0f36fe74f 100644 --- a/sklearn/_fast_gradient_boosting/setup.py +++ b/sklearn/_fast_gradient_boosting/setup.py @@ -54,6 +54,7 @@ def configuration(parent_package="", top_path=None): return config + if __name__ == "__main__": from numpy.distutils.core import setup setup(**configuration().todict()) diff --git a/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py b/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py index 886f973b07ffd..05ba2d36a5e84 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py +++ b/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py @@ -4,7 +4,8 @@ import numpy as np import pytest -from sklearn.ensemble import FastGradientBoostingRegressor, FastGradientBoostingClassifier +from sklearn.ensemble import FastGradientBoostingRegressor +from sklearn.ensemble import FastGradientBoostingClassifier from sklearn._fast_gradient_boosting.binning import BinMapper from sklearn._fast_gradient_boosting.utils import get_lightgbm_estimator @@ -51,12 +52,13 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples, X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) - est_sklearn = FastGradientBoostingRegressor(max_iter=max_iter, - max_bins=max_bins, - learning_rate=1, - n_iter_no_change=None, - min_samples_leaf=min_samples_leaf, - max_leaf_nodes=max_leaf_nodes) + est_sklearn = FastGradientBoostingRegressor( + max_iter=max_iter, + max_bins=max_bins, + learning_rate=1, + n_iter_no_change=None, + min_samples_leaf=min_samples_leaf, + max_leaf_nodes=max_leaf_nodes) est_lightgbm = get_lightgbm_estimator(est_sklearn) est_lightgbm.fit(X_train, y_train) @@ -102,13 +104,14 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples, X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) - est_pygbm = FastGradientBoostingClassifier(loss='binary_crossentropy', - max_iter=max_iter, - max_bins=max_bins, - learning_rate=1, - n_iter_no_change=None, - min_samples_leaf=min_samples_leaf, - max_leaf_nodes=max_leaf_nodes) + est_pygbm = FastGradientBoostingClassifier( + loss='binary_crossentropy', + max_iter=max_iter, + max_bins=max_bins, + learning_rate=1, + n_iter_no_change=None, + min_samples_leaf=min_samples_leaf, + max_leaf_nodes=max_leaf_nodes) est_lightgbm = get_lightgbm_estimator(est_pygbm) est_lightgbm.fit(X_train, y_train) @@ -163,13 +166,14 @@ def test_same_predictions_multiclass_classification( X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) - est_pygbm = 
FastGradientBoostingClassifier(loss='categorical_crossentropy', - max_iter=max_iter, - max_bins=max_bins, - learning_rate=lr, - n_iter_no_change=None, - min_samples_leaf=min_samples_leaf, - max_leaf_nodes=max_leaf_nodes) + est_pygbm = FastGradientBoostingClassifier( + loss='categorical_crossentropy', + max_iter=max_iter, + max_bins=max_bins, + learning_rate=lr, + n_iter_no_change=None, + min_samples_leaf=min_samples_leaf, + max_leaf_nodes=max_leaf_nodes) est_lightgbm = get_lightgbm_estimator(est_pygbm) est_lightgbm.fit(X_train, y_train) diff --git a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py index c3861e19e29fa..b4ee63a5e5474 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py @@ -102,12 +102,12 @@ def test_early_stopping_regression(scoring, validation_split, X, y = make_regression(random_state=0) gb = FastGradientBoostingRegressor(verbose=1, # just for coverage - scoring=scoring, - tol=tol, - validation_split=validation_split, - max_iter=max_iter, - n_iter_no_change=n_iter_no_change, - random_state=0) + scoring=scoring, + tol=tol, + validation_split=validation_split, + max_iter=max_iter, + n_iter_no_change=n_iter_no_change, + random_state=0) gb.fit(X, y) if n_iter_no_change is not None: @@ -135,12 +135,12 @@ def test_early_stopping_classification(data, scoring, validation_split, X, y = data gb = FastGradientBoostingClassifier(verbose=1, # just for coverage - scoring=scoring, - tol=tol, - validation_split=validation_split, - max_iter=max_iter, - n_iter_no_change=n_iter_no_change, - random_state=0) + scoring=scoring, + tol=tol, + validation_split=validation_split, + max_iter=max_iter, + n_iter_no_change=n_iter_no_change, + random_state=0) gb.fit(X, y) if n_iter_no_change is not None: @@ -152,8 +152,9 @@ def test_early_stopping_classification(data, scoring, validation_split, def test_should_stop(): def should_stop(scores, n_iter_no_change, tol): - gbdt = FastGradientBoostingClassifier(n_iter_no_change=n_iter_no_change, - tol=tol) + gbdt = FastGradientBoostingClassifier( + n_iter_no_change=n_iter_no_change, + tol=tol) return gbdt._should_stop(scores) # not enough iterations @@ -177,7 +178,8 @@ def should_stop(scores, n_iter_no_change, tol): @pytest.mark.parametrize('Estimator', ( FastGradientBoostingRegressor(), - FastGradientBoostingClassifier(scoring=None, validation_split=None, min_samples_leaf=5), + FastGradientBoostingClassifier(scoring=None, validation_split=None, + min_samples_leaf=5), )) def test_estimator_checks(Estimator): # Run the check_estimator() test suite on GBRegressor and GBClassifier. 
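Note on the early-stopping rule exercised by the tests reformatted above: fitting stops when none of the last ``n_iter_no_change`` scores improves, by more than ``tol``, on the score recorded just before that window (scores are oriented so that higher is always better). What follows is a minimal standalone sketch of that documented rule, not the estimator's private ``_should_stop`` method; the ``should_stop`` helper name and the sample score lists are illustrative only.

def should_stop(scores, n_iter_no_change, tol):
    # Not enough iterations recorded yet to have a reference score.
    if len(scores) <= n_iter_no_change:
        return False
    # Reference is the score just before the window of recent scores.
    reference = scores[-(n_iter_no_change + 1)] + tol
    recent = scores[-n_iter_no_change:]
    # Higher is better: stop only if no recent score beats the reference
    # by more than the tolerance.
    return not any(score > reference for score in recent)

should_stop([1, 1, 1], n_iter_no_change=5, tol=1e-3)           # False: too few scores
should_stop([1, 1, 1, 1, 1, 1], n_iter_no_change=5, tol=1e-3)  # True: no improvement
should_stop([1, 2, 3, 4, 5, 6], n_iter_no_change=5, tol=1e-3)  # False: still improving
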
diff --git a/sklearn/_fast_gradient_boosting/tests/test_histogram.py b/sklearn/_fast_gradient_boosting/tests/test_histogram.py index 6d18c12329a66..dceb9bf22a108 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_histogram.py +++ b/sklearn/_fast_gradient_boosting/tests/test_histogram.py @@ -6,8 +6,10 @@ from sklearn._fast_gradient_boosting.histogram import _build_histogram_naive from sklearn._fast_gradient_boosting.histogram import _build_histogram -from sklearn._fast_gradient_boosting.histogram import _build_histogram_no_hessian -from sklearn._fast_gradient_boosting.histogram import _build_histogram_root_no_hessian +from sklearn._fast_gradient_boosting.histogram import \ + _build_histogram_no_hessian +from sklearn._fast_gradient_boosting.histogram import \ + _build_histogram_root_no_hessian from sklearn._fast_gradient_boosting.histogram import _build_histogram_root from sklearn._fast_gradient_boosting.histogram import _subtract_histograms from sklearn._fast_gradient_boosting.types import HISTOGRAM_DTYPE diff --git a/sklearn/_fast_gradient_boosting/utils.py b/sklearn/_fast_gradient_boosting/utils.py index ea5454dbcf397..3481cba080f8d 100644 --- a/sklearn/_fast_gradient_boosting/utils.py +++ b/sklearn/_fast_gradient_boosting/utils.py @@ -1,5 +1,4 @@ """This module contains utility routines.""" -import numpy as np def get_lightgbm_estimator(pygbm_estimator): From 2fd29e14086c392b5fa089919d1a6e62fdf3cca7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 17 Jan 2019 09:42:16 -0500 Subject: [PATCH 050/247] some comments --- sklearn/_fast_gradient_boosting/binning.pyx | 8 +++++--- sklearn/_fast_gradient_boosting/gradient_boosting.py | 3 --- sklearn/_fast_gradient_boosting/grower.py | 5 ++--- sklearn/_fast_gradient_boosting/predictor.pyx | 3 +-- .../tests/test_gradient_boosting.py | 5 +++-- sklearn/_fast_gradient_boosting/types.pxd | 1 + 6 files changed, 12 insertions(+), 13 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/binning.pyx b/sklearn/_fast_gradient_boosting/binning.pyx index c741aa9b48188..3daf590547ddb 100644 --- a/sklearn/_fast_gradient_boosting/binning.pyx +++ b/sklearn/_fast_gradient_boosting/binning.pyx @@ -42,14 +42,16 @@ def _find_binning_thresholds(data, max_bins=256, subsample=int(2e5), subset = rng.choice(np.arange(data.shape[0]), subsample) data = data[subset] - # TODO: DONT USE NEGATIVE INDEXING (see warning when compiling with cython) - percentiles = np.linspace(0, 100, num=max_bins + 1)[1:-1] + percentiles = np.linspace(0, 100, num=max_bins + 1) + end = percentiles.shape[0] # no negative indexing! + percentiles = percentiles[1:end - 1] binning_thresholds = [] for f_idx in range(data.shape[1]): col_data = np.ascontiguousarray(data[:, f_idx], dtype=X_DTYPE) distinct_values = np.unique(col_data) if len(distinct_values) <= max_bins: - midpoints = (distinct_values[:-1] + distinct_values[1:]) + end = distinct_values.shape[0] # no negative indexing! + midpoints = (distinct_values[:end - 1] + distinct_values[1:]) midpoints *= .5 else: # We sort again the data in this case. We could compute diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index 98e94e25b67cd..4fd5148555ce0 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -92,9 +92,6 @@ def fit(self, X, y): acc_apply_split_time = 0. # time spent splitting nodes # time spent predicting X for gradient and hessians update acc_prediction_time = 0. 
- # TODO: add support for mixed-typed (numerical + categorical) data - # TODO: add support for missing data - # TODO: add support for pre-binned data (pass-through)? X, y = check_X_y(X, y, dtype=[X_DTYPE]) y = self._encode_y(y) rng = check_random_state(self.random_state) diff --git a/sklearn/_fast_gradient_boosting/grower.py b/sklearn/_fast_gradient_boosting/grower.py index 7d79f2753117b..0f9fdc69b90aa 100644 --- a/sklearn/_fast_gradient_boosting/grower.py +++ b/sklearn/_fast_gradient_boosting/grower.py @@ -82,9 +82,8 @@ class TreeNode: # array. Concretely, # self.sample_indices = view(self.splitter.partition[start:stop]) # Only used in _update_raw_prediction, because we need to iterate over the - # leaves and I don't know how to efficiently store the sample_indices views - # because they're all of different sizes. TODO: ask Olivier what he thinks - # about # this + # leaves and I don't know how to efficiently store the sample_indices + # views because they're all of different sizes. start = 0 stop = 0 diff --git a/sklearn/_fast_gradient_boosting/predictor.pyx b/sklearn/_fast_gradient_boosting/predictor.pyx index 0d9e249fa45d1..a36d6ce8a0c4d 100644 --- a/sklearn/_fast_gradient_boosting/predictor.pyx +++ b/sklearn/_fast_gradient_boosting/predictor.pyx @@ -32,6 +32,7 @@ PREDICTOR_RECORD_DTYPE = np.dtype([ ('bin_threshold', X_BINNED_DTYPE), ]) + cdef packed struct node_struct: Y_DTYPE_C value unsigned int count @@ -110,7 +111,5 @@ cdef void _predict_from_numeric_data( cdef: int i - # TODO: Why does prange fail?? - # for i in range(numeric_data.shape[0]): for i in prange(numeric_data.shape[0], schedule='static'): out[i] = _predict_one_from_numeric_data(nodes, numeric_data, i) diff --git a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py index b4ee63a5e5474..a56fa0ccb0d0f 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py @@ -12,7 +12,7 @@ @pytest.mark.parametrize('GradientBoosting, X, y', [ - # (GBMClassifier, X_classification, y_classification), TODO: unskip + (FastGradientBoostingClassifier, X_classification, y_classification), (FastGradientBoostingRegressor, X_regression, y_regression) ]) def test_init_parameters_validation(GradientBoosting, X, y): @@ -184,7 +184,8 @@ def should_stop(scores, n_iter_no_change, tol): def test_estimator_checks(Estimator): # Run the check_estimator() test suite on GBRegressor and GBClassifier. - # Notes: + # Default parameters to the estimators have to be changed to pass the + # tests: # - Can't do early stopping with classifier because often # validation_split=.1 leads to test_size=2 < n_classes and # train_test_split raises an error. 
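The binning hunk in this commit replaces negative slice indices with explicit end offsets so that the threshold code compiles under Cython without warnings. Below is a small NumPy sketch of the equivalent midpoint computation used when a feature has few distinct values; the toy input array is assumed for illustration, and this is not the ``binning.pyx`` routine itself (which also subsamples large datasets and falls back to percentiles when there are many distinct values).

import numpy as np

# Toy feature column (assumed): three distinct values after sorting.
distinct_values = np.unique(np.array([3.0, 1.0, 2.0, 1.0, 3.0]))
end = distinct_values.shape[0]  # explicit end offset, no negative indexing
# Each bin threshold is the midpoint between two consecutive distinct values.
midpoints = (distinct_values[:end - 1] + distinct_values[1:]) * 0.5
print(midpoints)  # [1.5 2.5]
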
diff --git a/sklearn/_fast_gradient_boosting/types.pxd b/sklearn/_fast_gradient_boosting/types.pxd index c15dbca9dcfc7..d9470ecef62f8 100644 --- a/sklearn/_fast_gradient_boosting/types.pxd +++ b/sklearn/_fast_gradient_boosting/types.pxd @@ -1,3 +1,4 @@ +# cython: language_level=3 import numpy as np cimport numpy as np From 5a8253437faf8cb7408e6f3a8f797ca6554d89bd Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 17 Jan 2019 10:40:35 -0500 Subject: [PATCH 051/247] checkpoint before changing scoring param --- .../gradient_boosting.py | 120 +++++++++--------- sklearn/_fast_gradient_boosting/grower.py | 3 +- sklearn/_fast_gradient_boosting/histogram.pxd | 10 +- sklearn/_fast_gradient_boosting/histogram.pyx | 2 +- .../tests/test_compare_lightgbm.py | 12 +- .../tests/test_gradient_boosting.py | 42 +++--- sklearn/_fast_gradient_boosting/utils.py | 6 +- 7 files changed, 101 insertions(+), 94 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index 4fd5148555ce0..c29a2673831ca 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -24,20 +24,20 @@ class BaseFastGradientBoosting(BaseEstimator, ABC): """Base class for fast gradient boosting estimators.""" @abstractmethod - def __init__(self, loss, learning_rate, max_iter, max_leaf_nodes, + def __init__(self, loss, learning_rate, n_estimators, max_leaf_nodes, max_depth, min_samples_leaf, l2_regularization, max_bins, - scoring, validation_split, n_iter_no_change, tol, verbose, + scoring, validation_fraction, n_iter_no_change, tol, verbose, random_state): self.loss = loss self.learning_rate = learning_rate - self.max_iter = max_iter + self.n_estimators = n_estimators self.max_leaf_nodes = max_leaf_nodes self.max_depth = max_depth self.min_samples_leaf = min_samples_leaf self.l2_regularization = l2_regularization self.max_bins = max_bins self.n_iter_no_change = n_iter_no_change - self.validation_split = validation_split + self.validation_fraction = validation_fraction self.scoring = scoring self.tol = tol self.verbose = verbose @@ -58,14 +58,14 @@ def _validate_parameters(self): if self.learning_rate <= 0: raise ValueError(f'learning_rate={self.learning_rate} must ' f'be strictly positive') - if self.max_iter < 1: - raise ValueError(f'max_iter={self.max_iter} must ' + if self.n_estimators < 1: + raise ValueError(f'n_estimators={self.n_estimators} must ' f'not be smaller than 1.') if self.n_iter_no_change is not None and self.n_iter_no_change < 0: raise ValueError(f'n_iter_no_change={self.n_iter_no_change} ' f'must be positive.') - if self.validation_split is not None and self.validation_split <= 0: - raise ValueError(f'validation_split={self.validation_split} ' + if self.validation_fraction is not None and self.validation_fraction <= 0: + raise ValueError(f'validation_fraction={self.validation_fraction} ' f'must be strictly positive, or None.') if self.tol is not None and self.tol < 0: raise ValueError(f'tol={self.tol} ' @@ -116,19 +116,19 @@ def fit(self, X, y): self.do_early_stopping_ = (self.n_iter_no_change is not None and self.n_iter_no_change > 0) - if self.do_early_stopping_ and self.validation_split is not None: + if self.do_early_stopping_ and self.validation_fraction is not None: # stratify for classification stratify = y if hasattr(self.loss_, 'predict_proba') else None X_binned_train, X_binned_val, y_train, y_val = train_test_split( - X_binned, y, test_size=self.validation_split, + 
X_binned, y, test_size=self.validation_fraction, stratify=stratify, random_state=rng) if X_binned_train.size == 0 or X_binned_val.size == 0: raise ValueError( f'Not enough data (n_samples={X_binned.shape[0]}) to ' - f'perform early stopping with validation_split=' - f'{self.validation_split}. Use more training data or ' - f'adjust validation_split.' + f'perform early stopping with validation_fraction=' + f'{self.validation_fraction}. Use more training data or ' + f'adjust validation_fraction.' ) # Predicting is faster of C-contiguous arrays, training is faster # on Fortran arrays. @@ -138,15 +138,15 @@ def fit(self, X, y): X_binned_train, y_train = X_binned, y X_binned_val, y_val = None, None - # Subsample the training set for score-based monitoring. + # Subsample the training set for early stopping if self.do_early_stopping_: - subsample_size = 10000 + subsample_size = 10000 # should we expose this? indices = np.arange(X_binned_train.shape[0]) if X_binned_train.shape[0] > subsample_size: indices = rng.choice(indices, subsample_size) X_binned_small_train = X_binned_train[indices] y_small_train = y_train[indices] - # Predicting is faster of C-contiguous arrays. + # Predicting is faster on C-contiguous arrays. X_binned_small_train = np.ascontiguousarray(X_binned_small_train) if self.verbose: @@ -170,8 +170,8 @@ def fit(self, X, y): prediction_dim=self.n_trees_per_iteration_ ) - # predictors_ is a matrix of TreePredictor objects with shape - # (n_iter_, n_trees_per_iteration) + # predictors_ is a matrix (list of lists) of TreePredictor objects + # with shape (n_iter_, n_trees_per_iteration) self.predictors_ = predictors = [] # scorer_ is a callable with signature (est, X, y) and calls @@ -184,15 +184,15 @@ def fit(self, X, y): self.train_scores_.append( self._get_scores(X_binned_train, y_train)) - if self.validation_split is not None: + if self.validation_fraction is not None: self.validation_scores_.append( self._get_scores(X_binned_val, y_val)) - for iteration in range(self.max_iter): + for iteration in range(self.n_estimators): if self.verbose: iteration_start_time = time() - print(f"[{iteration + 1}/{self.max_iter}] ", end='', + print(f"[{iteration + 1}/{self.n_estimators}] ", end='', flush=True) # Update gradients and hessians, inplace @@ -277,7 +277,7 @@ def _check_early_stopping(self, X_binned_train, y_train, self.train_scores_.append( self._get_scores(X_binned_train, y_train)) - if self.validation_split is not None: + if self.validation_fraction is not None: self.validation_scores_.append( self._get_scores(X_binned_val, y_val)) return self._should_stop(self.validation_scores_) @@ -342,7 +342,7 @@ def _print_iteration_stats(self, iteration_start_time): if self.do_early_stopping_: log_msg += f"{self.scoring} train: {self.train_scores_[-1]:.5f}, " - if self.validation_split is not None: + if self.validation_fraction is not None: log_msg += (f"{self.scoring} val: " f"{self.validation_scores_[-1]:.5f}, ") @@ -357,8 +357,7 @@ def _raw_predict(self, X): Parameters ---------- X : array-like, shape=(n_samples, n_features) - The input samples. If ``X.dtype == np.uint8``, the data is assumed - to be pre-binned. + The input samples. Returns ------- @@ -409,7 +408,7 @@ class FastGradientBoostingRegressor(BaseFastGradientBoosting, RegressorMixin): The learning rate, also known as *shrinkage*. This is used as a multiplicative factor for the leaves values. Use ``1`` for no shrinkage. 
- max_iter : int, optional(default=100) + n_estimators : int, optional(default=100) The maximum number of iterations of the boosting process, i.e. the maximum number of trees. max_leaf_nodes : int or None, optional(default=None) @@ -428,25 +427,26 @@ class FastGradientBoostingRegressor(BaseFastGradientBoosting, RegressorMixin): allows for a much faster training stage. Features with a small number of unique values may use less than ``max_bins`` bins. Must be no larger than 256. - scoring : str or callable or None, \ - optional (default=None) - Scoring parameter to use for early stopping (see sklearn.metrics for - available options). If None, early stopping is check w.r.t the loss - value. - validation_split : int or float or None, optional(default=0.1) + scoring : str or callable or None, optional (default=None) + Scoring parameter to use for early stopping. It can be a single + string (see :ref:`scoring_parameter`) or a callable (see + :ref:`scoring`). If None, the estimator's default scorer (if + available) is used. If ``scoring='loss'``, early stopping is checked + w.r.t the loss value. Only used if ``n_iter_no_change`` is not None. + validation_fraction : int or float or None, optional(default=0.1) Proportion (or absolute size) of training data to set aside as validation data for early stopping. If None, early stopping is done on - the training data. + the training data. Only used if ``n_iter_no_change`` is not None. n_iter_no_change : int or None, optional (default=5) Used to determine when to "early stop". The fitting process is stopped when none of the last ``n_iter_no_change`` scores are better than the ``n_iter_no_change - 1``th-to-last one, up to some tolerance. If None or 0, no early-stopping is done. tol : float or None optional (default=1e-7) - The absolute tolerance to use when comparing scores. The higher the - tolerance, the more likely we are to early stop: higher tolerance - means that it will be harder for subsequent iterations to be - considered an improvement upon the reference score. + The absolute tolerance to use when comparing scores during early + stopping. The higher the tolerance, the more likely we are to early + stop: higher tolerance means that it will be harder for subsequent + iterations to be considered an improvement upon the reference score. verbose: int, optional (default=0) The verbosity level. If not zero, print some information about the fitting process. @@ -454,9 +454,7 @@ class FastGradientBoostingRegressor(BaseFastGradientBoosting, RegressorMixin): optional (default=None) Pseudo-random number generator to control the subsampling in the binning process, and the train/validation data split if early stopping - is enabled. See - `scikit-learn glossary - `_. + is enabled. See :term:`random_state`. 
Examples @@ -472,16 +470,16 @@ class FastGradientBoostingRegressor(BaseFastGradientBoosting, RegressorMixin): _VALID_LOSSES = ('least_squares',) def __init__(self, loss='least_squares', learning_rate=0.1, - max_iter=100, max_leaf_nodes=31, max_depth=None, + n_estimators=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0., max_bins=256, - scoring=None, validation_split=0.1, n_iter_no_change=5, + scoring=None, validation_fraction=0.1, n_iter_no_change=5, tol=1e-7, verbose=0, random_state=None): super(FastGradientBoostingRegressor, self).__init__( - loss=loss, learning_rate=learning_rate, max_iter=max_iter, + loss=loss, learning_rate=learning_rate, n_estimators=n_estimators, max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, l2_regularization=l2_regularization, max_bins=max_bins, - scoring=scoring, validation_split=validation_split, + scoring=scoring, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose, random_state=random_state) @@ -491,8 +489,7 @@ def predict(self, X): Parameters ---------- X : array-like, shape=(n_samples, n_features) - The input samples. If ``X.dtype == np.uint8``, the data is assumed - to be pre-binned. + The input samples. Returns ------- @@ -504,7 +501,7 @@ def predict(self, X): return self._raw_predict(X).ravel() def _encode_y(self, y): - # Just convert y to float32 + # Just convert y to the expected dtype self.n_trees_per_iteration_ = 1 y = y.astype(Y_DTYPE, copy=False) return y @@ -530,7 +527,7 @@ class FastGradientBoostingClassifier(BaseFastGradientBoosting, The learning rate, also known as *shrinkage*. This is used as a multiplicative factor for the leaves values. Use ``1`` for no shrinkage. - max_iter : int, optional(default=100) + n_estimators : int, optional(default=100) The maximum number of iterations of the boosting process, i.e. the maximum number of trees for binary classification. For multiclass classification, `n_classes` trees per iteration are built. @@ -551,10 +548,12 @@ class FastGradientBoostingClassifier(BaseFastGradientBoosting, number of unique values may use less than ``max_bins`` bins. Must be no larger than 256. scoring : str or callable or None, optional (default=None) - Scoring parameter to use for early stopping (see sklearn.metrics for - available options). If None, early stopping is check w.r.t the loss - value. - validation_split : int or float or None, optional(default=0.1) + Scoring parameter to use for early stopping. It can be a single + string (see :ref:`scoring_parameter`) or a callable (see + :ref:`scoring`). If None, the estimator's default scorer (if + available) is used. If ``scoring='loss'``, early stopping is checked + w.r.t the loss value. Only used if ``n_iter_no_change`` is not None. + validation_fraction : int or float or None, optional(default=0.1) Proportion (or absolute size) of training data to set aside as validation data for early stopping. If None, early stopping is done on the training data. @@ -575,8 +574,7 @@ class FastGradientBoostingClassifier(BaseFastGradientBoosting, optional(default=None) Pseudo-random number generator to control the subsampling in the binning process, and the train/validation data split if early stopping - is enabled. See `scikit-learn glossary - `_. + is enabled. See :term:`random_state`. 
Examples -------- @@ -591,17 +589,17 @@ class FastGradientBoostingClassifier(BaseFastGradientBoosting, _VALID_LOSSES = ('binary_crossentropy', 'categorical_crossentropy', 'auto') - def __init__(self, loss='auto', learning_rate=0.1, max_iter=100, + def __init__(self, loss='auto', learning_rate=0.1, n_estimators=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0., max_bins=256, scoring=None, - validation_split=0.1, n_iter_no_change=5, tol=1e-7, + validation_fraction=0.1, n_iter_no_change=5, tol=1e-7, verbose=0, random_state=None): super(FastGradientBoostingClassifier, self).__init__( - loss=loss, learning_rate=learning_rate, max_iter=max_iter, + loss=loss, learning_rate=learning_rate, n_estimators=n_estimators, max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, l2_regularization=l2_regularization, max_bins=max_bins, - scoring=scoring, validation_split=validation_split, + scoring=scoring, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose, random_state=random_state) @@ -611,8 +609,7 @@ def predict(self, X): Parameters ---------- X : array-like, shape=(n_samples, n_features) - The input samples. If ``X.dtype == np.uint8``, the data is assumed - to be pre-binned. + The input samples. Returns ------- @@ -629,8 +626,7 @@ def predict_proba(self, X): Parameters ---------- X : array-like, shape=(n_samples, n_features) - The input samples. If ``X.dtype == np.uint8``, the data is assumed - to be pre-binned. + The input samples. Returns ------- diff --git a/sklearn/_fast_gradient_boosting/grower.py b/sklearn/_fast_gradient_boosting/grower.py index 0f9fdc69b90aa..a50bb7ff715da 100644 --- a/sklearn/_fast_gradient_boosting/grower.py +++ b/sklearn/_fast_gradient_boosting/grower.py @@ -211,8 +211,7 @@ def _validate_parameters(self, X_binned, max_leaf_nodes, max_depth, l2_regularization, min_hessian_to_split): """Validate parameters passed to __init__. - Also validate parameters passed to splitter because we cannot - raise exceptions in a jitclass. + Also validate parameters passed to splitter. """ if X_binned.dtype != np.uint8: raise NotImplementedError( diff --git a/sklearn/_fast_gradient_boosting/histogram.pxd b/sklearn/_fast_gradient_boosting/histogram.pxd index 0b1b8e61bd4f0..ce9c10a48e3c1 100644 --- a/sklearn/_fast_gradient_boosting/histogram.pxd +++ b/sklearn/_fast_gradient_boosting/histogram.pxd @@ -1,9 +1,17 @@ # cython: language_level=3 -"""This module contains njitted routines for building histograms. +"""This module contains routines for building histograms. A histogram is an array with n_bins entry of type HISTOGRAM_DTYPE. Each feature has its own histogram. A histogram contains the sum of gradients and hessians of all the samples belonging to each bin. + +There are different ways to build a histogram: +- by subtraction: hist(child) = hist(parent) - hist(sibling) +- from scratch. In this case we have rountines that update the hessians or not + (not useful when hessians are constant for some losses e.g. least squares). + Also, there's a special case for the root which contains all the samples, + leading to some possible optimizations. Overall all the implementations look + the same, and are optimized for cache hit. 
""" import numpy as np cimport numpy as np diff --git a/sklearn/_fast_gradient_boosting/histogram.pyx b/sklearn/_fast_gradient_boosting/histogram.pyx index eefc0c84b6951..39176fc770daa 100644 --- a/sklearn/_fast_gradient_boosting/histogram.pyx +++ b/sklearn/_fast_gradient_boosting/histogram.pyx @@ -2,7 +2,7 @@ # cython: boundscheck=False # cython: wraparound=False # cython: language_level=3 -"""This module contains njitted routines for building histograms. +"""This module contains routines for building histograms. A histogram is an array with n_bins entry of type HISTOGRAM_DTYPE. Each feature has its own histogram. A histogram contains the sum of gradients and diff --git a/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py b/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py index 05ba2d36a5e84..887cf059dd2ff 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py +++ b/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py @@ -39,7 +39,7 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples, rng = np.random.RandomState(seed=seed) n_samples = n_samples - max_iter = 1 + n_estimators = 1 max_bins = 256 X, y = make_regression(n_samples=n_samples, n_features=5, @@ -53,7 +53,7 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples, X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) est_sklearn = FastGradientBoostingRegressor( - max_iter=max_iter, + n_estimators=n_estimators, max_bins=max_bins, learning_rate=1, n_iter_no_change=None, @@ -91,7 +91,7 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples, rng = np.random.RandomState(seed=seed) n_samples = n_samples - max_iter = 1 + n_estimators = 1 max_bins = 256 X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5, @@ -106,7 +106,7 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples, est_pygbm = FastGradientBoostingClassifier( loss='binary_crossentropy', - max_iter=max_iter, + n_estimators=n_estimators, max_bins=max_bins, learning_rate=1, n_iter_no_change=None, @@ -151,7 +151,7 @@ def test_same_predictions_multiclass_classification( rng = np.random.RandomState(seed=seed) n_samples = n_samples - max_iter = 1 + n_estimators = 1 max_bins = 256 lr = 1 @@ -168,7 +168,7 @@ def test_same_predictions_multiclass_classification( est_pygbm = FastGradientBoostingClassifier( loss='categorical_crossentropy', - max_iter=max_iter, + n_estimators=n_estimators, max_bins=max_bins, learning_rate=lr, n_iter_no_change=None, diff --git a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py index a56fa0ccb0d0f..20a2fee690f61 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py @@ -32,8 +32,8 @@ def test_init_parameters_validation(GradientBoosting, X, y): assert_raises_regex( ValueError, - f"max_iter=0 must not be smaller than 1", - GradientBoosting(max_iter=0).fit, X, y + f"n_estimators=0 must not be smaller than 1", + GradientBoosting(n_estimators=0).fit, X, y ) assert_raises_regex( @@ -73,11 +73,11 @@ def test_init_parameters_validation(GradientBoosting, X, y): GradientBoosting(n_iter_no_change=-1).fit, X, y ) - for validation_split in (-1, 0): + for validation_fraction in (-1, 0): assert_raises_regex( ValueError, - f"validation_split={validation_split} must be strictly positive", - 
GradientBoosting(validation_split=validation_split).fit, X, y + f"validation_fraction={validation_fraction} must be strictly positive", + GradientBoosting(validation_fraction=validation_fraction).fit, X, y ) assert_raises_regex( @@ -87,66 +87,66 @@ def test_init_parameters_validation(GradientBoosting, X, y): ) -@pytest.mark.parametrize('scoring, validation_split, n_iter_no_change, tol', [ +@pytest.mark.parametrize('scoring, validation_fraction, n_iter_no_change, tol', [ ('neg_mean_squared_error', .1, 5, 1e-7), # use scorer ('neg_mean_squared_error', None, 5, 1e-1), # use scorer on training data (None, .1, 5, 1e-7), # use loss (None, None, 5, 1e-1), # use loss on training data (None, None, None, None), # no early stopping ]) -def test_early_stopping_regression(scoring, validation_split, +def test_e(scoring, validation_fraction, n_iter_no_change, tol): - max_iter = 500 + n_estimators = 500 X, y = make_regression(random_state=0) gb = FastGradientBoostingRegressor(verbose=1, # just for coverage scoring=scoring, tol=tol, - validation_split=validation_split, - max_iter=max_iter, + validation_fraction=validation_fraction, + n_estimators=n_estimators, n_iter_no_change=n_iter_no_change, random_state=0) gb.fit(X, y) if n_iter_no_change is not None: - assert n_iter_no_change <= gb.n_iter_ < max_iter + assert n_iter_no_change <= gb.n_iter_ < n_estimators else: - assert gb.n_iter_ == max_iter + assert gb.n_iter_ == n_estimators @pytest.mark.parametrize('data', ( make_classification(random_state=0), make_classification(n_classes=3, n_clusters_per_class=1, random_state=0) )) -@pytest.mark.parametrize('scoring, validation_split, n_iter_no_change, tol', [ +@pytest.mark.parametrize('scoring, validation_fraction, n_iter_no_change, tol', [ ('accuracy', .1, 5, 1e-7), # use scorer ('accuracy', None, 5, 1e-1), # use scorer on training data (None, .1, 5, 1e-7), # use loss (None, None, 5, 1e-1), # use loss on training data (None, None, None, None), # no early stopping ]) -def test_early_stopping_classification(data, scoring, validation_split, +def test_early_stopping_classification(data, scoring, validation_fraction, n_iter_no_change, tol): - max_iter = 500 + n_estimators = 500 X, y = data gb = FastGradientBoostingClassifier(verbose=1, # just for coverage scoring=scoring, tol=tol, - validation_split=validation_split, - max_iter=max_iter, + validation_fraction=validation_fraction, + n_estimators=n_estimators, n_iter_no_change=n_iter_no_change, random_state=0) gb.fit(X, y) if n_iter_no_change is not None: - assert n_iter_no_change <= gb.n_iter_ < max_iter + assert n_iter_no_change <= gb.n_iter_ < n_estimators else: - assert gb.n_iter_ == max_iter + assert gb.n_iter_ == n_estimators def test_should_stop(): @@ -178,7 +178,7 @@ def should_stop(scores, n_iter_no_change, tol): @pytest.mark.parametrize('Estimator', ( FastGradientBoostingRegressor(), - FastGradientBoostingClassifier(scoring=None, validation_split=None, + FastGradientBoostingClassifier(scoring=None, validation_fraction=None, min_samples_leaf=5), )) def test_estimator_checks(Estimator): @@ -187,7 +187,7 @@ def test_estimator_checks(Estimator): # Default parameters to the estimators have to be changed to pass the # tests: # - Can't do early stopping with classifier because often - # validation_split=.1 leads to test_size=2 < n_classes and + # validation_fraction=.1 leads to test_size=2 < n_classes and # train_test_split raises an error. 
# - Also, need to set a low min_samples_leaf for # check_classifiers_classes() to pass: with only 30 samples on the diff --git a/sklearn/_fast_gradient_boosting/utils.py b/sklearn/_fast_gradient_boosting/utils.py index 3481cba080f8d..f9c9b59f42849 100644 --- a/sklearn/_fast_gradient_boosting/utils.py +++ b/sklearn/_fast_gradient_boosting/utils.py @@ -1,4 +1,5 @@ """This module contains utility routines.""" +from .binning import BinMapper def get_lightgbm_estimator(pygbm_estimator): @@ -30,7 +31,7 @@ def get_lightgbm_estimator(pygbm_estimator): lgbm_params = { 'objective': loss_mapping[pygbm_params['loss']], 'learning_rate': pygbm_params['learning_rate'], - 'n_estimators': pygbm_params['max_iter'], + 'n_estimators': pygbm_params['n_estimators'], 'num_leaves': pygbm_params['max_leaf_nodes'], 'max_depth': pygbm_params['max_depth'], 'min_data_in_leaf': pygbm_params['min_samples_leaf'], @@ -41,6 +42,9 @@ def get_lightgbm_estimator(pygbm_estimator): 'min_gain_to_split': 0, 'verbosity': 10 if pygbm_params['verbose'] else 0, 'boost_from_average': True, + 'enable_bundle': False, # also makes feature order consistent + 'min_data_in_bin': 1, + 'bin_construct_sample_cnt': BinMapper().subsample, } # TODO: change hardcoded values when / if they're arguments to the # estimator. From ae4640ed2029ef0519885fe066fc1c7d41de7243 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 17 Jan 2019 19:27:09 -0500 Subject: [PATCH 052/247] Fixed bug in update_raw_predictions --- .../_fast_gradient_boosting/_gradient_boosting.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx b/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx index c076bc36af56e..4c7c3427a2f36 100644 --- a/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx +++ b/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx @@ -32,17 +32,17 @@ def _update_raw_predictions(Y_DTYPE_C [:] raw_predictions, grower): cdef void _update_raw_predictions_helper( Y_DTYPE_C [:] raw_predictions, - unsigned int [:] starts, - unsigned int [:] stops, - unsigned int [:] partition, + const unsigned int [:] starts, + const unsigned int [:] stops, + const unsigned int [:] partition, Y_DTYPE_C [:] values) nogil: cdef: - int sample_idx + unsigned int position int leaf_idx int n_leaves n_leaves = starts.shape[0] for leaf_idx in prange(n_leaves): - for sample_idx in range(starts[leaf_idx], stops[leaf_idx]): - raw_predictions[sample_idx] += values[leaf_idx] + for position in range(starts[leaf_idx], stops[leaf_idx]): + raw_predictions[partition[position]] += values[leaf_idx] From ec5128c3f01f4dee56a91b91d77dda43eced97ce Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 17 Jan 2019 19:28:27 -0500 Subject: [PATCH 053/247] small optimization for root node splitting --- sklearn/_fast_gradient_boosting/grower.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/_fast_gradient_boosting/grower.py b/sklearn/_fast_gradient_boosting/grower.py index a50bb7ff715da..f1021996ae221 100644 --- a/sklearn/_fast_gradient_boosting/grower.py +++ b/sklearn/_fast_gradient_boosting/grower.py @@ -270,6 +270,9 @@ def _intilialize_root(self): # Do not even bother computing any splitting statistics. 
self._finalize_leaf(self.root) return + # if sum_hessians < self.splitter.min_hessian_to_split: + # self._finalize_leaf(self.root) + # return self._compute_spittability(self.root) From 565e9364f1ed2f32c9edb267d71e8dd2a7675ce2 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 17 Jan 2019 19:29:55 -0500 Subject: [PATCH 054/247] numerically stable logsumexp --- sklearn/_fast_gradient_boosting/loss.pyx | 29 +++++++++++------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/loss.pyx b/sklearn/_fast_gradient_boosting/loss.pyx index 54f3c949911d6..2cb6a4fb9077d 100644 --- a/sklearn/_fast_gradient_boosting/loss.pyx +++ b/sklearn/_fast_gradient_boosting/loss.pyx @@ -135,8 +135,8 @@ class LeastSquares(BaseLoss): cdef void _update_gradients_least_squares( Y_DTYPE_C [:] gradients, - Y_DTYPE_C [:] y_true, - Y_DTYPE_C [:] raw_predictions) nogil: + const Y_DTYPE_C [:] y_true, + const Y_DTYPE_C [:] raw_predictions) nogil: cdef: unsigned int n_samples int i @@ -199,8 +199,8 @@ class BinaryCrossEntropy(BaseLoss): cdef void _update_gradients_hessians_binary_crossentropy( Y_DTYPE_C [:] gradients, Y_DTYPE_C [:] hessians, - Y_DTYPE_C [:] y_true, - Y_DTYPE_C [:] raw_predictions) nogil: + const Y_DTYPE_C [:] y_true, + const Y_DTYPE_C [:] raw_predictions) nogil: cdef: unsigned int n_samples Y_DTYPE_C gradient_abs @@ -255,31 +255,28 @@ class CategoricalCrossEntropy(BaseLoss): logsumexp(raw_predictions, axis=1)[:, np.newaxis]) -cdef inline Y_DTYPE_C _logsumexp(Y_DTYPE_C [:, :] a, const int row) nogil: +cdef inline Y_DTYPE_C _logsumexp(const Y_DTYPE_C [:, :] a, const int row) nogil: # Need to pass the whole array, else prange won't work. See Cython issue # #2798 cdef: int k Y_DTYPE_C out = 0. - # Y_DTYPE_C amax + Y_DTYPE_C amax = a[row, 0] - # TODO: use the numerically safer option - # But I don't now how to properly write a max() - # amax = max(a[i]) - # for k in range(a.shape[1]): - # out += exp(a[i, k] - amax) - # return log(out) + amax + for k in range(1, a.shape[1]): + if amax < a[row, k]: + amax = a[row, k] for k in range(a.shape[1]): - out += exp(a[row, k]) - return log(out) + out += exp(a[row, k] - amax) + return log(out) + amax cdef void _update_gradients_hessians_categorical_crossentropy( Y_DTYPE_C [:] gradients, # shape (n_samples * prediction_dim,), OUT Y_DTYPE_C [:] hessians, # shape (n_samples * prediction_dim,), OUT - Y_DTYPE_C [:] y_true, # shape (n_samples,), IN - Y_DTYPE_C [:, :] raw_predictions # shape (n_samples, n_tree_per_iter), IN + const Y_DTYPE_C [:] y_true, # shape (n_samples,), IN + const Y_DTYPE_C [:, :] raw_predictions # shape (n_samples, n_tree_per_iter), IN ) nogil: cdef: unsigned int n_samples From 713d838b9152b00d764413972d2cf3a6b1fe8f28 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 17 Jan 2019 19:31:08 -0500 Subject: [PATCH 055/247] minimal splitter change --- sklearn/_fast_gradient_boosting/splitting.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_fast_gradient_boosting/splitting.pyx b/sklearn/_fast_gradient_boosting/splitting.pyx index af3b2edbf5b11..7099c71c3ee99 100644 --- a/sklearn/_fast_gradient_boosting/splitting.pyx +++ b/sklearn/_fast_gradient_boosting/splitting.pyx @@ -580,7 +580,7 @@ cdef class Splitter: for feature_idx in range(self.n_features): split_info = split_infos[feature_idx] gain = split_info.gain - if best_gain == -1 or gain > best_gain: + if best_gain < 0. 
or gain > best_gain: best_gain = gain best_split_info = split_info return best_split_info From 10affef7c63dd9d64a0c2405476eaba74b9f7e75 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 17 Jan 2019 19:39:57 -0500 Subject: [PATCH 056/247] more sensible early stopping --- gdb_test.py | 44 +++++++------- .../gradient_boosting.py | 60 ++++++++++++------- sklearn/_fast_gradient_boosting/predictor.pyx | 48 +++++++++++++++ .../tests/test_gradient_boosting.py | 35 +++++------ .../tests/test_grower.py | 1 - 5 files changed, 121 insertions(+), 67 deletions(-) diff --git a/gdb_test.py b/gdb_test.py index 361907ea41d8e..108bede05605c 100644 --- a/gdb_test.py +++ b/gdb_test.py @@ -13,7 +13,7 @@ classif = False n_classes = 2 n_features = 20 -n_samples = int(1e7) +n_samples = int(1e6) max_iter = 5 if classif: @@ -28,20 +28,20 @@ PYGBM_GBM = pygbm.GradientBoostingRegressor -pygbm_est = PYGBM_GBM( - max_iter=max_iter, - scoring=None, # no early stopping - validation_split=None, - random_state=0, - verbose=False) -print("compiling pygbm code") -pygbm_est.fit(X[:1000], y[:1000]) -print("done") +# pygbm_est = PYGBM_GBM( +# max_iter=max_iter, +# scoring=None, # no early stopping +# validation_split=None, +# random_state=0, +# verbose=False) +# print("compiling pygbm code") +# pygbm_est.fit(X[:1000], y[:1000]) +# print("done") gbm = GBM( - max_iter=max_iter, - scoring=None, # no early stopping - validation_split=None, + n_estimators=max_iter, + scoring=None, + validation_fraction=None, n_iter_no_change=None, random_state=0, verbose=True) @@ -55,15 +55,15 @@ print(f'sklearn gbm score_duration {score_duration:.3f}s') -pygbm_est.set_params(verbose=True) -tic = time() -pygbm_est.fit(X, y) -fit_duration = time() - tic -tic = time() -print(f'score: {pygbm_est.score(X, y)}') -score_duration = time() - tic -print(f'pygbm fit_duration: {fit_duration:.3f}s') -print(f'pygbm score_duration {score_duration:.3f}s') +# pygbm_est.set_params(verbose=True) +# tic = time() +# pygbm_est.fit(X, y) +# fit_duration = time() - tic +# tic = time() +# print(f'score: {pygbm_est.score(X, y)}') +# score_duration = time() - tic +# print(f'pygbm fit_duration: {fit_duration:.3f}s') +# print(f'pygbm score_duration {score_duration:.3f}s') # cProfile.runctx("gbm.fit(X, y)", globals(), locals(), "Profile.prof") # s = pstats.Stats("Profile.prof") diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index c29a2673831ca..2b8db41cc37bc 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -13,7 +13,7 @@ from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder from ._gradient_boosting import _update_raw_predictions -from .types import Y_DTYPE, X_DTYPE +from .types import Y_DTYPE, X_DTYPE, X_BINNED_DTYPE from .binning import BinMapper from .grower import TreeGrower @@ -87,6 +87,7 @@ def fit(self, X, y): self : object """ + self._in_fit = True # TODO: document this fit_start_time = time() acc_find_split_time = 0. # time spent finding the best splits acc_apply_split_time = 0. # time spent splitting nodes @@ -176,13 +177,16 @@ def fit(self, X, y): # scorer_ is a callable with signature (est, X, y) and calls # est.predict() or est.predict_proba() depending on its nature. 
- self.scorer_ = check_scoring(self, self.scoring) + if self.scoring != 'loss': + self.scorer_ = check_scoring(self, self.scoring) + else: + self.scorer_ = None self.train_scores_ = [] self.validation_scores_ = [] if self.do_early_stopping_: # Add predictions of the initial model (before the first tree) self.train_scores_.append( - self._get_scores(X_binned_train, y_train)) + self._get_scores(X_binned_small_train, y_small_train)) if self.validation_fraction is not None: self.validation_scores_.append( @@ -242,6 +246,11 @@ def fit(self, X, y): if self.verbose: self._print_iteration_stats(iteration_start_time) + # if the only trees we could build are stumps, stop training + if all(predictor.get_n_leaf_nodes() == 1 + for predictor in self.predictors_[-1]): + should_early_stop = True + if should_early_stop: break @@ -265,6 +274,7 @@ def fit(self, X, y): self.train_scores_ = np.asarray(self.train_scores_) self.validation_scores_ = np.asarray(self.validation_scores_) + self._in_fit = False return self def _check_early_stopping(self, X_binned_train, y_train, @@ -307,11 +317,12 @@ def _should_stop(self, scores): def _get_scores(self, X, y): """Compute scores on data X with target y. - Scores are either computed with a scorer if scoring parameter is not - None, else with the loss. As higher is always better, we return + Scores are computed with a scorer if scoring parameter is not + 'loss', else with the loss. As higher is always better, we return -loss_value. """ - if self.scoring is not None: + + if not isinstance(self.scoring, str) and self.scoring != 'loss': return self.scorer_(self, X, y) # Else, use loss @@ -364,13 +375,14 @@ def _raw_predict(self, X): raw_predictions : array, shape (n_samples * n_trees_per_iteration,) The raw predicted values. """ - X = check_array(X, dtype=X_DTYPE) + X = check_array(X, dtype=[X_DTYPE, X_BINNED_DTYPE]) check_is_fitted(self, 'predictors_') if X.shape[1] != self.n_features_: raise ValueError( f'X has {X.shape[1]} features but this estimator was ' f'trained with {self.n_features_} features.' ) + is_binned = self._in_fit and X.dtype == X_BINNED_DTYPE n_samples = X.shape[0] raw_predictions = np.zeros( shape=(n_samples, self.n_trees_per_iteration_), @@ -379,7 +391,9 @@ def _raw_predict(self, X): raw_predictions += self.baseline_prediction_ for predictors_of_ith_iteration in self.predictors_: for k, predictor in enumerate(predictors_of_ith_iteration): - raw_predictions[:, k] += predictor.predict(X) + predict = (predictor.predict_binned if is_binned + else predictor.predict) + raw_predictions[:, k] += predict(X) return raw_predictions @@ -430,14 +444,14 @@ class FastGradientBoostingRegressor(BaseFastGradientBoosting, RegressorMixin): scoring : str or callable or None, optional (default=None) Scoring parameter to use for early stopping. It can be a single string (see :ref:`scoring_parameter`) or a callable (see - :ref:`scoring`). If None, the estimator's default scorer (if - available) is used. If ``scoring='loss'``, early stopping is checked - w.r.t the loss value. Only used if ``n_iter_no_change`` is not None. + :ref:`scoring`). If None, the estimator's default scorer is used. If + ``scoring='loss'``, early stopping is checked w.r.t the loss value. + Only used if ``n_iter_no_change`` is not None. validation_fraction : int or float or None, optional(default=0.1) Proportion (or absolute size) of training data to set aside as validation data for early stopping. If None, early stopping is done on the training data. Only used if ``n_iter_no_change`` is not None. 
- n_iter_no_change : int or None, optional (default=5) + n_iter_no_change : int or None, optional (default=None) Used to determine when to "early stop". The fitting process is stopped when none of the last ``n_iter_no_change`` scores are better than the ``n_iter_no_change - 1``th-to-last one, up to some @@ -460,11 +474,11 @@ class FastGradientBoostingRegressor(BaseFastGradientBoosting, RegressorMixin): Examples -------- >>> from sklearn.datasets import load_boston - >>> from pygbm import GradientBoostingRegressor + >>> from sklearn.ensemble import FastGradientBoostingRegressor >>> X, y = load_boston(return_X_y=True) - >>> est = GradientBoostingRegressor().fit(X, y) + >>> est = FastGradientBoostingRegressor().fit(X, y) >>> est.score(X, y) - 0.92... + 0.98... """ _VALID_LOSSES = ('least_squares',) @@ -472,7 +486,7 @@ class FastGradientBoostingRegressor(BaseFastGradientBoosting, RegressorMixin): def __init__(self, loss='least_squares', learning_rate=0.1, n_estimators=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0., max_bins=256, - scoring=None, validation_fraction=0.1, n_iter_no_change=5, + scoring=None, validation_fraction=0.1, n_iter_no_change=None, tol=1e-7, verbose=0, random_state=None): super(FastGradientBoostingRegressor, self).__init__( loss=loss, learning_rate=learning_rate, n_estimators=n_estimators, @@ -550,14 +564,14 @@ class FastGradientBoostingClassifier(BaseFastGradientBoosting, scoring : str or callable or None, optional (default=None) Scoring parameter to use for early stopping. It can be a single string (see :ref:`scoring_parameter`) or a callable (see - :ref:`scoring`). If None, the estimator's default scorer (if - available) is used. If ``scoring='loss'``, early stopping is checked + :ref:`scoring`). If None, the estimator's default scorer + is used. If ``scoring='loss'``, early stopping is checked w.r.t the loss value. Only used if ``n_iter_no_change`` is not None. validation_fraction : int or float or None, optional(default=0.1) Proportion (or absolute size) of training data to set aside as validation data for early stopping. If None, early stopping is done on the training data. - n_iter_no_change : int or None, optional (default=5) + n_iter_no_change : int or None, optional (default=None) Used to determine when to "early stop". The fitting process is stopped when none of the last ``n_iter_no_change`` scores are better than the ``n_iter_no_change - 1``th-to-last one, up to some @@ -579,11 +593,11 @@ class FastGradientBoostingClassifier(BaseFastGradientBoosting, Examples -------- >>> from sklearn.datasets import load_iris - >>> from pygbm import GradientBoostingClassifier + >>> from sklearn.ensemble import FastGradientBoostingClassifier >>> X, y = load_iris(return_X_y=True) - >>> clf = GradientBoostingClassifier().fit(X, y) + >>> clf = FastGradientBoostingClassifier().fit(X, y) >>> clf.score(X, y) - 0.97... 
+ 1.0 """ _VALID_LOSSES = ('binary_crossentropy', 'categorical_crossentropy', @@ -592,7 +606,7 @@ class FastGradientBoostingClassifier(BaseFastGradientBoosting, def __init__(self, loss='auto', learning_rate=0.1, n_estimators=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0., max_bins=256, scoring=None, - validation_fraction=0.1, n_iter_no_change=5, tol=1e-7, + validation_fraction=0.1, n_iter_no_change=None, tol=1e-7, verbose=0, random_state=None): super(FastGradientBoostingClassifier, self).__init__( loss=loss, learning_rate=learning_rate, n_estimators=n_estimators, diff --git a/sklearn/_fast_gradient_boosting/predictor.pyx b/sklearn/_fast_gradient_boosting/predictor.pyx index a36d6ce8a0c4d..eff4d768bf2f5 100644 --- a/sklearn/_fast_gradient_boosting/predictor.pyx +++ b/sklearn/_fast_gradient_boosting/predictor.pyx @@ -82,6 +82,22 @@ class TreePredictor: _predict_from_numeric_data(self.nodes, X, out) return out + def predict_binned(self, X): + """Predict raw values for binned data. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_features) + The input samples. + + Returns + ------- + y : array, shape (n_samples,) + The raw predicted values. + """ + out = np.empty(X.shape[0], dtype=Y_DTYPE) + _predict_from_binned_data(self.nodes, X, out) + return out cdef inline Y_DTYPE_C _predict_one_from_numeric_data( node_struct [:] nodes, @@ -113,3 +129,35 @@ cdef void _predict_from_numeric_data( for i in prange(numeric_data.shape[0], schedule='static'): out[i] = _predict_one_from_numeric_data(nodes, numeric_data, i) + + +cdef inline Y_DTYPE_C _predict_one_from_binned_data( + node_struct [:] nodes, + const X_BINNED_DTYPE_C [:, :] binned_data, + const int row + ) nogil: + # Need to pass the whole array, else prange won't work. 
See issue Cython + # #2798 + + cdef: + node_struct node = nodes[0] + + while True: + if node.is_leaf: + return node.value + if binned_data[row, node.feature_idx] <= node.bin_threshold: + node = nodes[node.left] + else: + node = nodes[node.right] + + +cdef void _predict_from_binned_data( + node_struct [:] nodes, + const X_BINNED_DTYPE_C [:, :] binned_data, + Y_DTYPE_C [:] out) nogil: + + cdef: + int i + + for i in prange(binned_data.shape[0], schedule='static'): + out[i] = _predict_one_from_binned_data(nodes, binned_data, i) diff --git a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py index 20a2fee690f61..e7b66adc576ec 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py @@ -90,11 +90,13 @@ def test_init_parameters_validation(GradientBoosting, X, y): @pytest.mark.parametrize('scoring, validation_fraction, n_iter_no_change, tol', [ ('neg_mean_squared_error', .1, 5, 1e-7), # use scorer ('neg_mean_squared_error', None, 5, 1e-1), # use scorer on training data - (None, .1, 5, 1e-7), # use loss - (None, None, 5, 1e-1), # use loss on training data + (None, .1, 5, 1e-7), # same with default scorer + (None, None, 5, 1e-1), + ('loss', .1, 5, 1e-7), # use loss + ('loss', None, 5, 1e-1), # use loss on training data (None, None, None, None), # no early stopping ]) -def test_e(scoring, validation_fraction, +def test_early_stopping_regression(scoring, validation_fraction, n_iter_no_change, tol): n_estimators = 500 @@ -123,9 +125,10 @@ def test_e(scoring, validation_fraction, @pytest.mark.parametrize('scoring, validation_fraction, n_iter_no_change, tol', [ ('accuracy', .1, 5, 1e-7), # use scorer ('accuracy', None, 5, 1e-1), # use scorer on training data - (None, .1, 5, 1e-7), # use loss - (None, None, 5, 1e-1), # use loss on training data - (None, None, None, None), # no early stopping + (None, .1, 5, 1e-7), # same with default scorerscor + (None, None, 5, 1e-1), + ('loss', .1, 5, 1e-7), # use loss + ('loss', None, 5, 1e-1), # use loss on training data ]) def test_early_stopping_classification(data, scoring, validation_fraction, n_iter_no_change, tol): @@ -143,10 +146,7 @@ def test_early_stopping_classification(data, scoring, validation_fraction, random_state=0) gb.fit(X, y) - if n_iter_no_change is not None: - assert n_iter_no_change <= gb.n_iter_ < n_estimators - else: - assert gb.n_iter_ == n_estimators + assert n_iter_no_change <= gb.n_iter_ < n_estimators def test_should_stop(): @@ -178,19 +178,12 @@ def should_stop(scores, n_iter_no_change, tol): @pytest.mark.parametrize('Estimator', ( FastGradientBoostingRegressor(), - FastGradientBoostingClassifier(scoring=None, validation_fraction=None, - min_samples_leaf=5), + FastGradientBoostingClassifier(min_samples_leaf=5), )) def test_estimator_checks(Estimator): # Run the check_estimator() test suite on GBRegressor and GBClassifier. - # Default parameters to the estimators have to be changed to pass the - # tests: - # - Can't do early stopping with classifier because often - # validation_fraction=.1 leads to test_size=2 < n_classes and - # train_test_split raises an error. - # - Also, need to set a low min_samples_leaf for - # check_classifiers_classes() to pass: with only 30 samples on the - # dataset, the root is never split with min_samples_leaf=20 and only the - # majority class is predicted. 
+ # need to set a low min_samples_leaf for check_classifiers_classes() to + # pass: with only 30 samples on the dataset, the root is never split with + # min_samples_leaf=20 and only the majority class is predicted. check_estimator(Estimator) diff --git a/sklearn/_fast_gradient_boosting/tests/test_grower.py b/sklearn/_fast_gradient_boosting/tests/test_grower.py index 9015cbac40298..e9cc3a0a04908 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_grower.py +++ b/sklearn/_fast_gradient_boosting/tests/test_grower.py @@ -145,7 +145,6 @@ def test_grow_tree(n_bins, constant_hessian, stopping_param, shrinkage): rel=1e-3) -@pytest.mark.skip('Removed predict_binned') def test_predictor_from_grower(): # Build a tree on the toy 3-leaf dataset to extract the predictor. n_bins = 256 From 1cd23f13dfeedeed41a949766890974ea2159617 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 17 Jan 2019 20:14:37 -0500 Subject: [PATCH 057/247] changed min_sammples_leaf default to 5 --- sklearn/_fast_gradient_boosting/gradient_boosting.py | 10 +++++----- .../tests/test_gradient_boosting.py | 6 +----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index 2b8db41cc37bc..3fd2e99cbf109 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -431,7 +431,7 @@ class FastGradientBoostingRegressor(BaseFastGradientBoosting, RegressorMixin): max_depth : int or None, optional(default=None) The maximum depth of each tree. The depth of a tree is the number of nodes to go from the root to the deepest leaf. - min_samples_leaf : int, optional(default=20) + min_samples_leaf : int, optional(default=5) The minimum number of samples per leaf. l2_regularization : float, optional(default=0) The L2 regularization parameter. Use 0 for no regularization. @@ -478,14 +478,14 @@ class FastGradientBoostingRegressor(BaseFastGradientBoosting, RegressorMixin): >>> X, y = load_boston(return_X_y=True) >>> est = FastGradientBoostingRegressor().fit(X, y) >>> est.score(X, y) - 0.98... + 0.99... """ _VALID_LOSSES = ('least_squares',) def __init__(self, loss='least_squares', learning_rate=0.1, n_estimators=100, max_leaf_nodes=31, max_depth=None, - min_samples_leaf=20, l2_regularization=0., max_bins=256, + min_samples_leaf=5, l2_regularization=0., max_bins=256, scoring=None, validation_fraction=0.1, n_iter_no_change=None, tol=1e-7, verbose=0, random_state=None): super(FastGradientBoostingRegressor, self).__init__( @@ -551,7 +551,7 @@ class FastGradientBoostingClassifier(BaseFastGradientBoosting, max_depth : int or None, optional(default=None) The maximum depth of each tree. The depth of a tree is the number of nodes to go from the root to the deepest leaf. - min_samples_leaf : int, optional(default=20) + min_samples_leaf : int, optional(default=5) The minimum number of samples per leaf. l2_regularization : float, optional(default=0) The L2 regularization parameter. Use 0 for no regularization. 
@@ -604,7 +604,7 @@ class FastGradientBoostingClassifier(BaseFastGradientBoosting, 'auto') def __init__(self, loss='auto', learning_rate=0.1, n_estimators=100, - max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, + max_leaf_nodes=31, max_depth=None, min_samples_leaf=5, l2_regularization=0., max_bins=256, scoring=None, validation_fraction=0.1, n_iter_no_change=None, tol=1e-7, verbose=0, random_state=None): diff --git a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py index e7b66adc576ec..355ad5522ef1c 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py @@ -178,12 +178,8 @@ def should_stop(scores, n_iter_no_change, tol): @pytest.mark.parametrize('Estimator', ( FastGradientBoostingRegressor(), - FastGradientBoostingClassifier(min_samples_leaf=5), + FastGradientBoostingClassifier(), )) def test_estimator_checks(Estimator): # Run the check_estimator() test suite on GBRegressor and GBClassifier. - - # need to set a low min_samples_leaf for check_classifiers_classes() to - # pass: with only 30 samples on the dataset, the root is never split with - # min_samples_leaf=20 and only the majority class is predicted. check_estimator(Estimator) From 5060aee1ce9bb011d96138ee333364866ed8348b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 17 Jan 2019 22:19:15 -0500 Subject: [PATCH 058/247] pass feature_idx to histogram builders to avoid python interactions --- gdb_test.py | 38 ++--- sklearn/_fast_gradient_boosting/histogram.pxd | 20 ++- sklearn/_fast_gradient_boosting/histogram.pyx | 134 +++++++++--------- sklearn/_fast_gradient_boosting/splitting.pyx | 57 ++++---- .../tests/test_histogram.py | 84 ++++++----- .../tests/test_splitting.py | 8 +- 6 files changed, 184 insertions(+), 157 deletions(-) diff --git a/gdb_test.py b/gdb_test.py index 108bede05605c..c96a7d851dfd6 100644 --- a/gdb_test.py +++ b/gdb_test.py @@ -13,7 +13,7 @@ classif = False n_classes = 2 n_features = 20 -n_samples = int(1e6) +n_samples = int(5e6) max_iter = 5 if classif: @@ -28,15 +28,15 @@ PYGBM_GBM = pygbm.GradientBoostingRegressor -# pygbm_est = PYGBM_GBM( -# max_iter=max_iter, -# scoring=None, # no early stopping -# validation_split=None, -# random_state=0, -# verbose=False) -# print("compiling pygbm code") -# pygbm_est.fit(X[:1000], y[:1000]) -# print("done") +pygbm_est = PYGBM_GBM( + max_iter=max_iter, + scoring=None, # no early stopping + validation_split=None, + random_state=0, + verbose=False) +print("compiling pygbm code") +pygbm_est.fit(X[:1000], y[:1000]) +print("done") gbm = GBM( n_estimators=max_iter, @@ -55,15 +55,15 @@ print(f'sklearn gbm score_duration {score_duration:.3f}s') -# pygbm_est.set_params(verbose=True) -# tic = time() -# pygbm_est.fit(X, y) -# fit_duration = time() - tic -# tic = time() -# print(f'score: {pygbm_est.score(X, y)}') -# score_duration = time() - tic -# print(f'pygbm fit_duration: {fit_duration:.3f}s') -# print(f'pygbm score_duration {score_duration:.3f}s') +pygbm_est.set_params(verbose=True) +tic = time() +pygbm_est.fit(X, y) +fit_duration = time() - tic +tic = time() +print(f'score: {pygbm_est.score(X, y)}') +score_duration = time() - tic +print(f'pygbm fit_duration: {fit_duration:.3f}s') +print(f'pygbm score_duration {score_duration:.3f}s') # cProfile.runctx("gbm.fit(X, y)", globals(), locals(), "Profile.prof") # s = pstats.Stats("Profile.prof") diff --git a/sklearn/_fast_gradient_boosting/histogram.pxd 
b/sklearn/_fast_gradient_boosting/histogram.pxd index ce9c10a48e3c1..e89582d03a266 100644 --- a/sklearn/_fast_gradient_boosting/histogram.pxd +++ b/sklearn/_fast_gradient_boosting/histogram.pxd @@ -23,46 +23,52 @@ from .types cimport hist_struct """compute (hist_a - hist_b) in out""" cpdef void _subtract_histograms( + const int feature_idx, unsigned int n_bins, - const hist_struct [::1] hist_a, # IN - const hist_struct [::1] hist_b, # IN - hist_struct [::1] out) nogil # OUT + const hist_struct [:, ::1] hist_a, # IN + const hist_struct [:, ::1] hist_b, # IN + hist_struct [:, ::1] out, # OUT + ) nogil """Return histogram for a given feature.""" cpdef void _build_histogram( + const int feature_idx, unsigned int n_bins, const unsigned int [::1] sample_indices, # IN const X_BINNED_DTYPE_C [::1] binned_feature, # IN const Y_DTYPE_C [::1] ordered_gradients, # IN const Y_DTYPE_C [::1] ordered_hessians, # IN - hist_struct [::1] out) nogil # OUT + hist_struct [:, ::1] out) nogil # OUT """Return histogram for a given feature, not updating hessians. Used when the hessians of the loss are constant (tipycally LS loss).""" cpdef void _build_histogram_no_hessian( + const int feature_idx, unsigned int n_bins, const unsigned int [::1] sample_indices, # IN const X_BINNED_DTYPE_C [::1] binned_feature, # IN const Y_DTYPE_C [::1] ordered_gradients, # IN - hist_struct [::1] out) nogil # OUT + hist_struct [:, ::1] out) nogil # OUT """Compute histogram of the root node. Unlike other nodes, the root node has to find the split among *all* the samples from the training set. binned_feature and all_gradients / all_hessians already have a consistent ordering.""" cpdef void _build_histogram_root( + const int feature_idx, unsigned int n_bins, const X_BINNED_DTYPE_C [::1] binned_feature, # IN const Y_DTYPE_C [::1] all_gradients, # IN const Y_DTYPE_C [::1] all_hessians, # IN - hist_struct [::1] out) nogil # OUT + hist_struct [:, ::1] out) nogil # OUT """Compute histogram of the root node, not updating hessians. 
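The signature changes above replace the per-feature 1D histogram buffers with a single 2D buffer of shape (n_features, n_bins), indexed by the new feature_idx argument, so that the parallel loop over features can write into one pre-allocated array without any Python interaction. A small sketch of such a buffer; the field names follow the HISTOGRAM_DTYPE used in the tests, while the exact field dtypes here are an assumption:

import numpy as np

# Assumed layout of one (feature, bin) histogram entry; the real
# HISTOGRAM_DTYPE lives in the package's types module.
HISTOGRAM_DTYPE = np.dtype([
    ('sum_gradients', np.float64),
    ('sum_hessians', np.float64),
    ('count', np.uint32),
])

n_features, n_bins = 3, 256
# One contiguous buffer for all features: nogil/prange code can update
# histograms at [feature_idx, bin_idx] directly.
histograms = np.zeros((n_features, n_bins), dtype=HISTOGRAM_DTYPE)
histograms['count'][0, 5] += 1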
Used when the hessians of the loss are constant (tipycally LS loss).""" cpdef void _build_histogram_root_no_hessian( + const int feature_idx, unsigned int n_bins, const X_BINNED_DTYPE_C [::1] binned_feature, # IN const Y_DTYPE_C [::1] all_gradients, # IN - hist_struct [::1] out) nogil # OUT + hist_struct [:, ::1] out) nogil # OUT diff --git a/sklearn/_fast_gradient_boosting/histogram.pyx b/sklearn/_fast_gradient_boosting/histogram.pyx index 39176fc770daa..57e418d331560 100644 --- a/sklearn/_fast_gradient_boosting/histogram.pyx +++ b/sklearn/_fast_gradient_boosting/histogram.pyx @@ -20,12 +20,13 @@ from .types import HISTOGRAM_DTYPE cpdef void _build_histogram_naive( + const int feature_idx, unsigned int n_bins, unsigned int [:] sample_indices, # IN X_BINNED_DTYPE_C [:] binned_feature, # IN Y_DTYPE_C [:] ordered_gradients, # IN Y_DTYPE_C [:] ordered_hessians, # IN - hist_struct [:] out # OUT + hist_struct [:, :] out # OUT ) nogil: """Build histogram in a naive way, without optimizing for cache hit.""" cdef: @@ -37,32 +38,34 @@ cpdef void _build_histogram_naive( for i in range(n_samples): sample_idx = sample_indices[i] bin_idx = binned_feature[sample_idx] - out[bin_idx].sum_gradients += ordered_gradients[i] - out[bin_idx].sum_hessians += ordered_hessians[i] - out[bin_idx].count += 1 + out[feature_idx, bin_idx].sum_gradients += ordered_gradients[i] + out[feature_idx, bin_idx].sum_hessians += ordered_hessians[i] + out[feature_idx, bin_idx].count += 1 cpdef void _subtract_histograms( + const int feature_idx, unsigned int n_bins, - hist_struct [::1] hist_a, # IN - hist_struct [::1] hist_b, # IN - hist_struct [::1] out # OUT + hist_struct [:, ::1] hist_a, # IN + hist_struct [:, ::1] hist_b, # IN + hist_struct [:, ::1] out, # OUT ) nogil: cdef: unsigned int i = 0 for i in range(n_bins): - out[i].sum_gradients = hist_a[i].sum_gradients - hist_b[i].sum_gradients - out[i].sum_hessians = hist_a[i].sum_hessians - hist_b[i].sum_hessians - out[i].count = hist_a[i].count - hist_b[i].count + out[feature_idx, i].sum_gradients = hist_a[feature_idx, i].sum_gradients - hist_b[feature_idx, i].sum_gradients + out[feature_idx, i].sum_hessians = hist_a[feature_idx, i].sum_hessians - hist_b[feature_idx, i].sum_hessians + out[feature_idx, i].count = hist_a[feature_idx, i].count - hist_b[feature_idx, i].count cpdef void _build_histogram( + const int feature_idx, unsigned int n_bins, const unsigned int [::1] sample_indices, # IN const X_BINNED_DTYPE_C [::1] binned_feature, # IN const Y_DTYPE_C [::1] ordered_gradients, # IN const Y_DTYPE_C [::1] ordered_hessians, # IN - hist_struct [::1] out # OUT + hist_struct [:, ::1] out # OUT ) nogil: cdef: unsigned int i = 0 @@ -81,34 +84,35 @@ cpdef void _build_histogram( bin_2 = binned_feature[sample_indices[i + 2]] bin_3 = binned_feature[sample_indices[i + 3]] - out[bin_0].sum_gradients += ordered_gradients[i] - out[bin_1].sum_gradients += ordered_gradients[i + 1] - out[bin_2].sum_gradients += ordered_gradients[i + 2] - out[bin_3].sum_gradients += ordered_gradients[i + 3] + out[feature_idx, bin_0].sum_gradients += ordered_gradients[i] + out[feature_idx, bin_1].sum_gradients += ordered_gradients[i + 1] + out[feature_idx, bin_2].sum_gradients += ordered_gradients[i + 2] + out[feature_idx, bin_3].sum_gradients += ordered_gradients[i + 3] - out[bin_0].sum_hessians += ordered_hessians[i] - out[bin_1].sum_hessians += ordered_hessians[i + 1] - out[bin_2].sum_hessians += ordered_hessians[i + 2] - out[bin_3].sum_hessians += ordered_hessians[i + 3] + out[feature_idx, bin_0].sum_hessians 
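The unrolled builders below all perform the same accumulation as _build_histogram_naive, only with 4-way loop unrolling for better cache behaviour. A NumPy sketch of that accumulation, writing into the 2D structured buffer sketched earlier (illustrative only):

import numpy as np

def build_histogram_naive(feature_idx, sample_indices, binned_feature,
                          ordered_gradients, ordered_hessians, out):
    # Accumulate per-bin gradient/hessian sums and counts for one feature.
    # `out` is a (n_features, n_bins) structured array.
    for i, sample_idx in enumerate(sample_indices):
        bin_idx = binned_feature[sample_idx]
        out['sum_gradients'][feature_idx, bin_idx] += ordered_gradients[i]
        out['sum_hessians'][feature_idx, bin_idx] += ordered_hessians[i]
        out['count'][feature_idx, bin_idx] += 1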
+= ordered_hessians[i] + out[feature_idx, bin_1].sum_hessians += ordered_hessians[i + 1] + out[feature_idx, bin_2].sum_hessians += ordered_hessians[i + 2] + out[feature_idx, bin_3].sum_hessians += ordered_hessians[i + 3] - out[bin_0].count += 1 - out[bin_1].count += 1 - out[bin_2].count += 1 - out[bin_3].count += 1 + out[feature_idx, bin_0].count += 1 + out[feature_idx, bin_1].count += 1 + out[feature_idx, bin_2].count += 1 + out[feature_idx, bin_3].count += 1 for i in range(unrolled_upper, n_node_samples): bin_idx = binned_feature[sample_indices[i]] - out[bin_idx].sum_gradients += ordered_gradients[i] - out[bin_idx].sum_hessians += ordered_hessians[i] - out[bin_idx].count += 1 + out[feature_idx, bin_idx].sum_gradients += ordered_gradients[i] + out[feature_idx, bin_idx].sum_hessians += ordered_hessians[i] + out[feature_idx, bin_idx].count += 1 cpdef void _build_histogram_no_hessian( + const int feature_idx, unsigned int n_bins, const unsigned int [::1] sample_indices, # IN const X_BINNED_DTYPE_C [::1] binned_feature, # IN const Y_DTYPE_C [::1] ordered_gradients, # OUT - hist_struct [::1] out # OUT + hist_struct [:, ::1] out # OUT ) nogil: cdef: unsigned int i = 0 @@ -127,28 +131,29 @@ cpdef void _build_histogram_no_hessian( bin_2 = binned_feature[sample_indices[i + 2]] bin_3 = binned_feature[sample_indices[i + 3]] - out[bin_0].sum_gradients += ordered_gradients[i] - out[bin_1].sum_gradients += ordered_gradients[i + 1] - out[bin_2].sum_gradients += ordered_gradients[i + 2] - out[bin_3].sum_gradients += ordered_gradients[i + 3] + out[feature_idx, bin_0].sum_gradients += ordered_gradients[i] + out[feature_idx, bin_1].sum_gradients += ordered_gradients[i + 1] + out[feature_idx, bin_2].sum_gradients += ordered_gradients[i + 2] + out[feature_idx, bin_3].sum_gradients += ordered_gradients[i + 3] - out[bin_0].count += 1 - out[bin_1].count += 1 - out[bin_2].count += 1 - out[bin_3].count += 1 + out[feature_idx, bin_0].count += 1 + out[feature_idx, bin_1].count += 1 + out[feature_idx, bin_2].count += 1 + out[feature_idx, bin_3].count += 1 for i in range(unrolled_upper, n_node_samples): bin_idx = binned_feature[sample_indices[i]] - out[bin_idx].sum_gradients += ordered_gradients[i] - out[bin_idx].count += 1 + out[feature_idx, bin_idx].sum_gradients += ordered_gradients[i] + out[feature_idx, bin_idx].count += 1 cpdef void _build_histogram_root( + const int feature_idx, unsigned int n_bins, const X_BINNED_DTYPE_C [::1] binned_feature, # IN const Y_DTYPE_C [::1] all_gradients, # IN const Y_DTYPE_C [::1] all_hessians, # IN - hist_struct [::1] out # OUT + hist_struct [:, ::1] out # OUT ) nogil: cdef: unsigned int i = 0 @@ -168,33 +173,34 @@ cpdef void _build_histogram_root( bin_2 = binned_feature[i + 2] bin_3 = binned_feature[i + 3] - out[bin_0].sum_gradients += all_gradients[i] - out[bin_1].sum_gradients += all_gradients[i + 1] - out[bin_2].sum_gradients += all_gradients[i + 2] - out[bin_3].sum_gradients += all_gradients[i + 3] + out[feature_idx, bin_0].sum_gradients += all_gradients[i] + out[feature_idx, bin_1].sum_gradients += all_gradients[i + 1] + out[feature_idx, bin_2].sum_gradients += all_gradients[i + 2] + out[feature_idx, bin_3].sum_gradients += all_gradients[i + 3] - out[bin_0].sum_hessians += all_hessians[i] - out[bin_1].sum_hessians += all_hessians[i + 1] - out[bin_2].sum_hessians += all_hessians[i + 2] - out[bin_3].sum_hessians += all_hessians[i + 3] + out[feature_idx, bin_0].sum_hessians += all_hessians[i] + out[feature_idx, bin_1].sum_hessians += all_hessians[i + 1] + out[feature_idx, 
bin_2].sum_hessians += all_hessians[i + 2] + out[feature_idx, bin_3].sum_hessians += all_hessians[i + 3] - out[bin_0].count += 1 - out[bin_1].count += 1 - out[bin_2].count += 1 - out[bin_3].count += 1 + out[feature_idx, bin_0].count += 1 + out[feature_idx, bin_1].count += 1 + out[feature_idx, bin_2].count += 1 + out[feature_idx, bin_3].count += 1 for i in range(unrolled_upper, n_samples): bin_idx = binned_feature[i] - out[bin_idx].sum_gradients += all_gradients[i] - out[bin_idx].sum_hessians += all_hessians[i] - out[bin_idx].count += 1 + out[feature_idx, bin_idx].sum_gradients += all_gradients[i] + out[feature_idx, bin_idx].sum_hessians += all_hessians[i] + out[feature_idx, bin_idx].count += 1 cpdef void _build_histogram_root_no_hessian( + const int feature_idx, unsigned int n_bins, const X_BINNED_DTYPE_C [::1] binned_feature, # IN const Y_DTYPE_C [::1] all_gradients, # IN - hist_struct [::1] out # OUT + hist_struct [:, ::1] out # OUT ) nogil: cdef: unsigned int i = 0 @@ -213,17 +219,17 @@ cpdef void _build_histogram_root_no_hessian( bin_2 = binned_feature[i + 2] bin_3 = binned_feature[i + 3] - out[bin_0].sum_gradients += all_gradients[i] - out[bin_1].sum_gradients += all_gradients[i + 1] - out[bin_2].sum_gradients += all_gradients[i + 2] - out[bin_3].sum_gradients += all_gradients[i + 3] + out[feature_idx, bin_0].sum_gradients += all_gradients[i] + out[feature_idx, bin_1].sum_gradients += all_gradients[i + 1] + out[feature_idx, bin_2].sum_gradients += all_gradients[i + 2] + out[feature_idx, bin_3].sum_gradients += all_gradients[i + 3] - out[bin_0].count += 1 - out[bin_1].count += 1 - out[bin_2].count += 1 - out[bin_3].count += 1 + out[feature_idx, bin_0].count += 1 + out[feature_idx, bin_1].count += 1 + out[feature_idx, bin_2].count += 1 + out[feature_idx, bin_3].count += 1 for i in range(unrolled_upper, n_samples): bin_idx = binned_feature[i] - out[bin_idx].sum_gradients += all_gradients[i] - out[bin_idx].count += 1 + out[feature_idx, bin_idx].sum_gradients += all_gradients[i] + out[feature_idx, bin_idx].count += 1 diff --git a/sklearn/_fast_gradient_boosting/splitting.pyx b/sklearn/_fast_gradient_boosting/splitting.pyx index 7099c71c3ee99..0acf4b0d08b90 100644 --- a/sklearn/_fast_gradient_boosting/splitting.pyx +++ b/sklearn/_fast_gradient_boosting/splitting.pyx @@ -22,6 +22,7 @@ from .histogram cimport _build_histogram_no_hessian from .histogram cimport _build_histogram_root from .histogram cimport _build_histogram_root_no_hessian from .histogram cimport _subtract_histograms +# from .histogram cimport _subtract_histograms from .types cimport X_BINNED_DTYPE_C from .types cimport Y_DTYPE_C from .types cimport hist_struct @@ -392,7 +393,7 @@ cdef class Splitter: # Populate ordered_gradients and ordered_hessians. (Already done # for root) Ordering the gradients and hessians helps to improve # cache hit. 
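The "cache hit" comment above refers to gathering the gradients and hessians of the node's samples into a contiguous scratch buffer once, so the per-bin histogram loops then read them sequentially instead of jumping through the full gradient array via sample_indices. The NumPy equivalent of that gather step is a simple fancy-indexing copy:

import numpy as np

rng = np.random.RandomState(0)
gradients = rng.randn(1000)  # one entry per training sample
sample_indices = rng.choice(1000, 200, replace=False).astype(np.uint32)

# Contiguous buffer of this node's gradients, scanned sequentially later.
ordered_gradients = gradients[sample_indices]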
- if sample_indices.shape[0] != self.gradients.shape[0]: + if sample_indices.shape[0] != gradients.shape[0]: if self.constant_hessian: for i in prange(n_samples, schedule='static'): ordered_gradients[i] = gradients[sample_indices[i]] @@ -415,12 +416,12 @@ cdef class Splitter: for feature_idx in prange(self.n_features): # Compute histogram of each feature self._compute_histogram(feature_idx, sample_indices, - histograms[feature_idx]) + histograms) # and get the best possible split for the feature among all # bins split_info = self._find_best_bin_to_split_helper( - feature_idx, histograms[feature_idx], n_samples, + feature_idx, histograms, n_samples, sum_gradients, sum_hessians) split_infos[feature_idx] = split_info @@ -443,9 +444,9 @@ cdef class Splitter: cdef void _compute_histogram( self, - unsigned int feature_idx, + const unsigned int feature_idx, const unsigned int [::1] sample_indices, # IN - hist_struct [::1] histogram # OUT + hist_struct [:, ::1] histograms # OUT ) nogil: """Compute the histogram for a given feature.""" @@ -461,21 +462,21 @@ cdef class Splitter: if root_node: if self.constant_hessian: - _build_histogram_root_no_hessian(self.max_bins, X_binned, - ordered_gradients, histogram) + _build_histogram_root_no_hessian(feature_idx, self.max_bins, X_binned, + ordered_gradients, histograms) else: - _build_histogram_root(self.max_bins, X_binned, + _build_histogram_root(feature_idx, self.max_bins, X_binned, ordered_gradients, - ordered_hessians, histogram) + ordered_hessians, histograms) else: if self.constant_hessian: - _build_histogram_no_hessian(self.max_bins, sample_indices, + _build_histogram_no_hessian(feature_idx, self.max_bins, sample_indices, X_binned, ordered_gradients, - histogram) + histograms) else: - _build_histogram(self.max_bins, sample_indices, X_binned, + _build_histogram(feature_idx, self.max_bins, sample_indices, X_binned, ordered_gradients, ordered_hessians, - histogram) + histograms) def find_node_split_subtraction( Splitter self, @@ -537,14 +538,15 @@ cdef class Splitter: self.n_features * sizeof(split_info_struct)) for feature_idx in prange(self.n_features): # Compute histogram of each feature - _subtract_histograms(self.max_bins, - parent_histograms[feature_idx], - sibling_histograms[feature_idx], - histograms[feature_idx]) + _subtract_histograms(feature_idx, + self.max_bins, + parent_histograms, + sibling_histograms, + histograms) # and get the best possible split for the feature among all # bins split_info = self._find_best_bin_to_split_helper( - feature_idx, histograms[feature_idx], n_samples, + feature_idx, histograms, n_samples, sum_gradients, sum_hessians) split_infos[feature_idx] = split_info @@ -588,10 +590,11 @@ cdef class Splitter: cdef split_info_struct _find_best_bin_to_split_helper( self, unsigned int feature_idx, - const hist_struct [::1] histogram, # IN + const hist_struct [:, ::1] histograms, # IN unsigned int n_samples, Y_DTYPE_C sum_gradients, - Y_DTYPE_C sum_hessians) nogil: + Y_DTYPE_C sum_hessians, + ) nogil: """Find best bin to split on for a given feature. 
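find_node_split_subtraction relies on the classic histogram trick: a child's histogram equals the parent's histogram minus the sibling's, bin by bin, which avoids re-scanning the samples of the larger child. A NumPy sketch over the structured buffers from above (counts are unsigned, which is fine since the parent count is never smaller than the sibling's):

def subtract_histograms(feature_idx, n_bins, hist_parent, hist_sibling, out):
    # out[feature_idx] = hist_parent[feature_idx] - hist_sibling[feature_idx],
    # field by field, as in _subtract_histograms.
    for field in ('sum_gradients', 'sum_hessians', 'count'):
        out[field][feature_idx, :n_bins] = (
            hist_parent[field][feature_idx, :n_bins]
            - hist_sibling[field][feature_idx, :n_bins])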
Splits that do not satisfy the splitting constraints @@ -617,17 +620,17 @@ cdef class Splitter: n_samples_left = 0 for bin_idx in range(self.n_bins_per_feature[feature_idx]): - n_samples_left += histogram[bin_idx].count + n_samples_left += histograms[feature_idx, bin_idx].count n_samples_right = n_samples_ - n_samples_left if self.constant_hessian: - hessian_left += (histogram[bin_idx].count + hessian_left += (histograms[feature_idx, bin_idx].count * self.constant_hessian_value) else: - hessian_left += histogram[bin_idx].sum_hessians + hessian_left += histograms[feature_idx, bin_idx].sum_hessians hessian_right = sum_hessians - hessian_left - gradient_left += histogram[bin_idx].sum_gradients + gradient_left += histograms[feature_idx, bin_idx].sum_gradients gradient_right = sum_gradients - gradient_left if n_samples_left < self.min_samples_leaf: @@ -666,14 +669,14 @@ cdef class Splitter: self, unsigned int feature_idx, unsigned int [::1] sample_indices, - hist_struct [::1] histogram, + hist_struct [:, ::1] histograms, Y_DTYPE_C sum_gradients, Y_DTYPE_C sum_hessians): - self._compute_histogram(feature_idx, sample_indices, histogram) + self._compute_histogram(feature_idx, sample_indices, histograms) n_samples = sample_indices.shape[0] split_info = self._find_best_bin_to_split_helper( - feature_idx, histogram, n_samples, + feature_idx, histograms, n_samples, sum_gradients, sum_hessians) return SplitInfo( diff --git a/sklearn/_fast_gradient_boosting/tests/test_histogram.py b/sklearn/_fast_gradient_boosting/tests/test_histogram.py index dceb9bf22a108..e32eedc8271cb 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_histogram.py +++ b/sklearn/_fast_gradient_boosting/tests/test_histogram.py @@ -27,9 +27,10 @@ def test_build_histogram(build_func): ordered_hessians = np.array([1, 1, 2], dtype=Y_DTYPE) sample_indices = np.array([0, 2, 3], dtype=np.uint32) - hist = np.zeros(3, dtype=HISTOGRAM_DTYPE) - build_func(3, sample_indices, binned_feature, ordered_gradients, + hist = np.zeros((1, 3), dtype=HISTOGRAM_DTYPE) + build_func(0, 3, sample_indices, binned_feature, ordered_gradients, ordered_hessians, hist) + hist = hist[0] assert_array_equal(hist['count'], [2, 1, 0]) assert_allclose(hist['sum_gradients'], [1, 3, 0]) assert_allclose(hist['sum_hessians'], [2, 2, 0]) @@ -39,9 +40,10 @@ def test_build_histogram(build_func): ordered_gradients = np.array([0, 1, 3, 0, 1], dtype=Y_DTYPE) ordered_hessians = np.array([1, 1, 2, 1, 0], dtype=Y_DTYPE) - hist = np.zeros(3, dtype=HISTOGRAM_DTYPE) - build_func(3, sample_indices, binned_feature, ordered_gradients, + hist = np.zeros((1, 3), dtype=HISTOGRAM_DTYPE) + build_func(0, 3, sample_indices, binned_feature, ordered_gradients, ordered_hessians, hist) + hist = hist[0] assert_array_equal(hist['count'], [2, 2, 1]) assert_allclose(hist['sum_gradients'], [1, 4, 0]) assert_allclose(hist['sum_hessians'], [2, 2, 1]) @@ -58,26 +60,31 @@ def test_histogram_sample_order_independence(): sample_indices = rng.choice(np.arange(n_samples, dtype=np.uint32), n_sub_samples, replace=False) ordered_gradients = rng.randn(n_sub_samples).astype(Y_DTYPE) - hist_gc = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) - _build_histogram_no_hessian(n_bins, sample_indices, binned_feature, + hist_gc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + _build_histogram_no_hessian(0, n_bins, sample_indices, binned_feature, ordered_gradients, hist_gc) ordered_hessians = rng.exponential(size=n_sub_samples).astype(Y_DTYPE) - hist_ghc = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) - _build_histogram(n_bins, 
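The gain evaluated for each candidate bin (not shown in this hunk) follows the usual second-order formulation of histogram-based gradient boosting: the value of the two children minus the value of the parent, each of the form G**2 / (H + lambda). A sketch, assuming that standard formulation:

def split_gain(gradient_left, hessian_left, gradient_right, hessian_right,
               sum_gradients, sum_hessians, l2_regularization):
    # Gain of splitting a node with totals (sum_gradients, sum_hessians)
    # into (left, right) children.
    def negative_loss(g, h):
        return g * g / (h + l2_regularization)
    return (negative_loss(gradient_left, hessian_left)
            + negative_loss(gradient_right, hessian_right)
            - negative_loss(sum_gradients, sum_hessians))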
sample_indices, binned_feature, + hist_ghc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + _build_histogram(0, n_bins, sample_indices, binned_feature, ordered_gradients, ordered_hessians, hist_ghc) permutation = rng.permutation(n_sub_samples) - hist_gc_perm = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) - _build_histogram_no_hessian(n_bins, sample_indices[permutation], + hist_gc_perm = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + _build_histogram_no_hessian(0, n_bins, sample_indices[permutation], binned_feature, ordered_gradients[permutation], hist_gc_perm) - hist_ghc_perm = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) - _build_histogram(n_bins, sample_indices[permutation], binned_feature, + hist_ghc_perm = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + _build_histogram(0, n_bins, sample_indices[permutation], binned_feature, ordered_gradients[permutation], ordered_hessians[permutation], hist_ghc_perm) + hist_gc = hist_gc[0] + hist_ghc = hist_ghc[0] + hist_gc_perm = hist_gc_perm[0] + hist_ghc_perm = hist_ghc_perm[0] + assert_allclose(hist_gc['sum_gradients'], hist_gc_perm['sum_gradients']) assert_array_equal(hist_gc['count'], hist_gc_perm['count']) @@ -101,24 +108,29 @@ def test_unrolled_equivalent_to_naive(constant_hessian): else: ordered_hessians = rng.lognormal(size=n_samples).astype(Y_DTYPE) - hist_gc_root = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) - hist_ghc_root = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) - hist_gc = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) - hist_ghc = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) - hist_naive = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + hist_gc_root = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + hist_ghc_root = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + hist_gc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + hist_ghc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + hist_naive = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) - _build_histogram_root_no_hessian(n_bins, binned_feature, ordered_gradients, + _build_histogram_root_no_hessian(0, n_bins, binned_feature, ordered_gradients, hist_gc_root) - _build_histogram_root(n_bins, binned_feature, ordered_gradients, + _build_histogram_root(0, n_bins, binned_feature, ordered_gradients, ordered_hessians, hist_ghc_root) - _build_histogram_no_hessian(n_bins, sample_indices, binned_feature, + _build_histogram_no_hessian(0, n_bins, sample_indices, binned_feature, ordered_gradients, hist_gc) - _build_histogram(n_bins, sample_indices, binned_feature, + _build_histogram(0, n_bins, sample_indices, binned_feature, ordered_gradients, ordered_hessians, hist_ghc) - _build_histogram_naive(n_bins, sample_indices, binned_feature, + _build_histogram_naive(0, n_bins, sample_indices, binned_feature, ordered_gradients, ordered_hessians, hist_naive) - for hist in (hist_gc_root, hist_ghc_root, hist_gc, hist_gc, hist_ghc): + hist_naive = hist_naive[0] + hist_gc_root = hist_gc_root[0] + hist_ghc_root = hist_ghc_root[0] + hist_gc = hist_gc[0] + hist_ghc = hist_ghc[0] + for hist in (hist_gc_root, hist_ghc_root, hist_gc, hist_ghc): assert_array_equal(hist['count'], hist_naive['count']) assert_allclose(hist['sum_gradients'], hist_naive['sum_gradients']) for hist in (hist_ghc_root, hist_ghc): @@ -142,12 +154,12 @@ def test_hist_subtraction(constant_hessian): else: ordered_hessians = rng.lognormal(size=n_samples).astype(Y_DTYPE) - hist_parent = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + hist_parent = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) if constant_hessian: - _build_histogram_no_hessian(n_bins, sample_indices, binned_feature, + 
_build_histogram_no_hessian(0, n_bins, sample_indices, binned_feature, ordered_gradients, hist_parent) else: - _build_histogram(n_bins, sample_indices, binned_feature, + _build_histogram(0, n_bins, sample_indices, binned_feature, ordered_gradients, ordered_hessians, hist_parent) mask = rng.randint(0, 2, n_samples).astype(np.bool) @@ -155,33 +167,33 @@ def test_hist_subtraction(constant_hessian): sample_indices_left = sample_indices[mask] ordered_gradients_left = ordered_gradients[mask] ordered_hessians_left = ordered_hessians[mask] - hist_left = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + hist_left = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) if constant_hessian: - _build_histogram_no_hessian(n_bins, sample_indices_left, + _build_histogram_no_hessian(0, n_bins, sample_indices_left, binned_feature, ordered_gradients_left, hist_left) else: - _build_histogram(n_bins, sample_indices_left, binned_feature, + _build_histogram(0, n_bins, sample_indices_left, binned_feature, ordered_gradients_left, ordered_hessians_left, hist_left) sample_indices_right = sample_indices[~mask] ordered_gradients_right = ordered_gradients[~mask] ordered_hessians_right = ordered_hessians[~mask] - hist_right = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) + hist_right = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) if constant_hessian: - _build_histogram_no_hessian(n_bins, sample_indices_right, + _build_histogram_no_hessian(0, n_bins, sample_indices_right, binned_feature, ordered_gradients_right, hist_right) else: - _build_histogram(n_bins, sample_indices_right, binned_feature, + _build_histogram(0, n_bins, sample_indices_right, binned_feature, ordered_gradients_right, ordered_hessians_right, hist_right) - hist_left_sub = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) - hist_right_sub = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) - _subtract_histograms(n_bins, hist_parent, hist_right, hist_left_sub) - _subtract_histograms(n_bins, hist_parent, hist_left, hist_right_sub) + hist_left_sub = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + hist_right_sub = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + _subtract_histograms(0, n_bins, hist_parent, hist_right, hist_left_sub) + _subtract_histograms(0, n_bins, hist_parent, hist_left, hist_right_sub) for key in ('count', 'sum_hessians', 'sum_gradients'): assert_allclose(hist_left[key], hist_left_sub[key], rtol=1e-6) diff --git a/sklearn/_fast_gradient_boosting/tests/test_splitting.py b/sklearn/_fast_gradient_boosting/tests/test_splitting.py index f19af4e43214b..09658c71c74b7 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_splitting.py +++ b/sklearn/_fast_gradient_boosting/tests/test_splitting.py @@ -43,9 +43,9 @@ def test_histogram_split(n_bins): min_hessian_to_split, min_samples_leaf, min_gain_to_split) - histogram = np.zeros(shape=(n_bins), dtype=HISTOGRAM_DTYPE) + histograms = np.zeros(shape=(1, n_bins), dtype=HISTOGRAM_DTYPE) split_info = splitter.find_best_split_wrapper( - feature_idx, sample_indices, histogram, sum_gradients, + feature_idx, sample_indices, histograms, sum_gradients, sum_hessians) assert split_info.bin_idx == true_bin @@ -336,8 +336,8 @@ def test_min_gain_to_split(): min_hessian_to_split, min_samples_leaf, min_gain_to_split) - histogram = np.zeros(shape=(n_bins), dtype=HISTOGRAM_DTYPE) + histograms = np.zeros(shape=(1, n_bins), dtype=HISTOGRAM_DTYPE) split_info = splitter.find_best_split_wrapper( - feature_idx, sample_indices, histogram, sum_gradients, + feature_idx, sample_indices, histograms, sum_gradients, sum_hessians) assert split_info.gain == -1 From 
1bfde2c51c5c69abc211ce0f3bea259b1aef9e2c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 18 Jan 2019 09:43:11 -0500 Subject: [PATCH 059/247] some doc and attribute exposition --- doc/modules/classes.rst | 2 + .../gradient_boosting.py | 138 ++++++++++++------ sklearn/_fast_gradient_boosting/splitting.pyx | 2 +- .../tests/test_gradient_boosting.py | 14 +- .../tests/test_splitting.py | 11 +- 5 files changed, 105 insertions(+), 62 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 266e45b14bb1b..3125c5d893521 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -416,6 +416,8 @@ Samples generator ensemble.ExtraTreesRegressor ensemble.GradientBoostingClassifier ensemble.GradientBoostingRegressor + ensemble.FastGradientBoostingClassifier + ensemble.FastGradientBoostingRegressor ensemble.IsolationForest ensemble.RandomForestClassifier ensemble.RandomForestRegressor diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index 3fd2e99cbf109..291cb6aded2a7 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -87,7 +87,6 @@ def fit(self, X, y): self : object """ - self._in_fit = True # TODO: document this fit_start_time = time() acc_find_split_time = 0. # time spent finding the best splits acc_apply_split_time = 0. # time spent splitting nodes @@ -100,6 +99,13 @@ def fit(self, X, y): self._validate_parameters() self.n_features_ = X.shape[1] # used for validation in predict() + # we need this stateful variable to tell raw_predict() that it was + # called from fit(), which only passes pre-binned data to + # raw_predict() via the scorer_ attribute. predicting is faster on + # pre-binned data. + self._in_fit = True + + # bin the data if self.verbose: print(f"Binning {X.nbytes / 1e9:.3f} GB of data: ", end="", flush=True) @@ -117,6 +123,7 @@ def fit(self, X, y): self.do_early_stopping_ = (self.n_iter_no_change is not None and self.n_iter_no_change > 0) + # create validation data if needed if self.do_early_stopping_ and self.validation_fraction is not None: # stratify for classification stratify = y if hasattr(self.loss_, 'predict_proba') else None @@ -139,9 +146,9 @@ def fit(self, X, y): X_binned_train, y_train = X_binned, y X_binned_val, y_val = None, None - # Subsample the training set for early stopping + # Subsample the training set for early stopping and score monitoring if self.do_early_stopping_: - subsample_size = 10000 # should we expose this? + subsample_size = 10000 # should we expose this parameter? indices = np.arange(X_binned_train.shape[0]) if X_binned_train.shape[0] > subsample_size: indices = rng.choice(indices, subsample_size) @@ -153,27 +160,29 @@ def fit(self, X, y): if self.verbose: print("Fitting gradient boosted rounds:") + # initialize raw_predictions: those are the accumulated values + # predicted by the trees for the training data. raw_predictions has + # shape (n_samples, n_trees_per_iteration) where n_trees_per_iterations + # is n_classes in multiclass classification, else 1. n_samples = X_binned_train.shape[0] self.baseline_prediction_ = self.loss_.get_baseline_prediction( y_train, self.n_trees_per_iteration_) - # raw_predictions are the accumulated values predicted by the trees - # for the training data. 
raw_predictions = np.zeros( shape=(n_samples, self.n_trees_per_iteration_), dtype=self.baseline_prediction_.dtype ) raw_predictions += self.baseline_prediction_ - # gradients and hessians are 1D arrays of size - # n_samples * n_trees_per_iteration + # initialize gradients and hessians (empty arrays). Those 1D arrays of + # size (n_samples * n_trees_per_iteration). gradients, hessians = self.loss_.init_gradients_and_hessians( n_samples=n_samples, prediction_dim=self.n_trees_per_iteration_ ) - # predictors_ is a matrix (list of lists) of TreePredictor objects + # estimators_ is a matrix (list of lists) of TreePredictor objects # with shape (n_iter_, n_trees_per_iteration) - self.predictors_ = predictors = [] + self.estimators_ = estimators = [] # scorer_ is a callable with signature (est, X, y) and calls # est.predict() or est.predict_proba() depending on its nature. @@ -181,15 +190,15 @@ def fit(self, X, y): self.scorer_ = check_scoring(self, self.scoring) else: self.scorer_ = None - self.train_scores_ = [] - self.validation_scores_ = [] + self.train_score_ = [] + self.validation_score_ = [] if self.do_early_stopping_: # Add predictions of the initial model (before the first tree) - self.train_scores_.append( + self.train_score_.append( self._get_scores(X_binned_small_train, y_small_train)) if self.validation_fraction is not None: - self.validation_scores_.append( + self.validation_score_.append( self._get_scores(X_binned_val, y_val)) for iteration in range(self.n_estimators): @@ -203,7 +212,7 @@ def fit(self, X, y): self.loss_.update_gradients_and_hessians(gradients, hessians, y_train, raw_predictions) - predictors.append([]) + estimators.append([]) # Build `n_trees_per_iteration` trees. for k, (gradients_at_k, hessians_at_k) in enumerate(zip( @@ -228,9 +237,9 @@ def fit(self, X, y): acc_apply_split_time += grower.total_apply_split_time acc_find_split_time += grower.total_find_split_time - predictor = grower.make_predictor( + estimator = grower.make_predictor( bin_thresholds=self.bin_mapper_.bin_thresholds_) - predictors[-1].append(predictor) + estimators[-1].append(estimator) tic_pred = time() _update_raw_predictions(raw_predictions[:, k], grower) @@ -246,23 +255,19 @@ def fit(self, X, y): if self.verbose: self._print_iteration_stats(iteration_start_time) - # if the only trees we could build are stumps, stop training - if all(predictor.get_n_leaf_nodes() == 1 - for predictor in self.predictors_[-1]): - should_early_stop = True - + # maybe we could also early stop if all the trees are stumps? 
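The training loop above (update gradients and hessians from the loss, grow one tree per output, shrink its leaf values, accumulate into raw_predictions) has the same overall shape as plain least-squares boosting. A minimal runnable sketch using a standard regression tree as a stand-in for the grower, purely to illustrate the loop structure rather than the binned implementation:

import numpy as np
from sklearn.tree import DecisionTreeRegressor

def fit_boosting_sketch(X, y, n_estimators=10, learning_rate=0.1):
    # Least-squares boosting: each tree is fit on the negative gradients
    # (the residuals) and its shrunken predictions are accumulated.
    raw_predictions = np.full(y.shape, y.mean())  # baseline prediction
    trees = []
    for _ in range(n_estimators):
        gradients = raw_predictions - y           # LS gradients, hessians == 1
        tree = DecisionTreeRegressor(max_leaf_nodes=31, min_samples_leaf=5)
        tree.fit(X, -gradients)
        raw_predictions += learning_rate * tree.predict(X)
        trees.append(tree)
    return trees, raw_predictions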
if should_early_stop: break if self.verbose: duration = time() - fit_start_time n_total_leaves = sum( - predictor.get_n_leaf_nodes() - for predictors_at_ith_iteration in self.predictors_ - for predictor in predictors_at_ith_iteration) + estimator.get_n_leaf_nodes() + for predictors_at_ith_iteration in self.estimators_ + for estimator in predictors_at_ith_iteration) n_predictors = sum( len(predictors_at_ith_iteration) - for predictors_at_ith_iteration in self.predictors_) + for predictors_at_ith_iteration in self.estimators_) print(f"Fit {n_predictors} trees in {duration:.3f} s, " f"({n_total_leaves} total leaves)") print(f"{'Time spent finding best splits:':<32} " @@ -272,8 +277,8 @@ def fit(self, X, y): print(f"{'Time spent predicting:':<32} " f"{acc_prediction_time:.3f}s") - self.train_scores_ = np.asarray(self.train_scores_) - self.validation_scores_ = np.asarray(self.validation_scores_) + self.train_score_ = np.asarray(self.train_score_) + self.validation_score_ = np.asarray(self.validation_score_) self._in_fit = False return self @@ -284,15 +289,15 @@ def _check_early_stopping(self, X_binned_train, y_train, Scores are computed on validation data or on training data. """ - self.train_scores_.append( + self.train_score_.append( self._get_scores(X_binned_train, y_train)) if self.validation_fraction is not None: - self.validation_scores_.append( + self.validation_score_.append( self._get_scores(X_binned_val, y_val)) - return self._should_stop(self.validation_scores_) + return self._should_stop(self.validation_score_) - return self._should_stop(self.train_scores_) + return self._should_stop(self.train_score_) def _should_stop(self, scores): """ @@ -334,14 +339,14 @@ def _print_iteration_stats(self, iteration_start_time): log_msg = '' predictors_of_ith_iteration = [ - predictors_list for predictors_list in self.predictors_[-1] + predictors_list for predictors_list in self.estimators_[-1] if predictors_list ] n_trees = len(predictors_of_ith_iteration) - max_depth = max(predictor.get_max_depth() - for predictor in predictors_of_ith_iteration) - n_leaves = sum(predictor.get_n_leaf_nodes() - for predictor in predictors_of_ith_iteration) + max_depth = max(estimator.get_max_depth() + for estimator in predictors_of_ith_iteration) + n_leaves = sum(estimator.get_n_leaf_nodes() + for estimator in predictors_of_ith_iteration) if n_trees == 1: log_msg += (f"{n_trees} tree, {n_leaves} leaves, ") @@ -352,10 +357,10 @@ def _print_iteration_stats(self, iteration_start_time): log_msg += f"max depth = {max_depth}, " if self.do_early_stopping_: - log_msg += f"{self.scoring} train: {self.train_scores_[-1]:.5f}, " + log_msg += f"{self.scoring} train: {self.train_score_[-1]:.5f}, " if self.validation_fraction is not None: log_msg += (f"{self.scoring} val: " - f"{self.validation_scores_[-1]:.5f}, ") + f"{self.validation_score_[-1]:.5f}, ") iteration_time = time() - iteration_start_time log_msg += f"in {iteration_time:0.3f}s" @@ -376,7 +381,7 @@ def _raw_predict(self, X): The raw predicted values. 
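_should_stop implements the usual "no improvement within tol over the last n_iter_no_change iterations" rule, consistent with the should_stop helper used by test_should_stop. A plain-Python sketch, assuming scores are oriented so that higher is better (e.g. negative loss):

def should_stop(scores, n_iter_no_change, tol):
    # Stop when none of the last n_iter_no_change scores improved, by more
    # than tol, on the score observed n_iter_no_change iterations earlier.
    reference_position = n_iter_no_change + 1
    if len(scores) < reference_position:
        return False
    reference_score = scores[-reference_position] + tol
    return not any(score > reference_score
                   for score in scores[-n_iter_no_change:])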
""" X = check_array(X, dtype=[X_DTYPE, X_BINNED_DTYPE]) - check_is_fitted(self, 'predictors_') + check_is_fitted(self, 'estimators_') if X.shape[1] != self.n_features_: raise ValueError( f'X has {X.shape[1]} features but this estimator was ' @@ -389,10 +394,10 @@ def _raw_predict(self, X): dtype=self.baseline_prediction_.dtype ) raw_predictions += self.baseline_prediction_ - for predictors_of_ith_iteration in self.predictors_: - for k, predictor in enumerate(predictors_of_ith_iteration): - predict = (predictor.predict_binned if is_binned - else predictor.predict) + for predictors_of_ith_iteration in self.estimators_: + for k, estimator in enumerate(predictors_of_ith_iteration): + predict = (estimator.predict_binned if is_binned + else estimator.predict) raw_predictions[:, k] += predict(X) return raw_predictions @@ -406,13 +411,13 @@ def _encode_y(self, y=None): pass @property - def n_iter_(self): - check_is_fitted(self, 'predictors_') - return len(self.predictors_) + def n_estimators_(self): + check_is_fitted(self, 'estimators_') + return len(self.estimators_) class FastGradientBoostingRegressor(BaseFastGradientBoosting, RegressorMixin): - """Scikit-learn compatible Gradient Boosting Tree for regression. + """Fast Gradient Boosting Regression Tree. Parameters ---------- @@ -470,6 +475,24 @@ class FastGradientBoostingRegressor(BaseFastGradientBoosting, RegressorMixin): binning process, and the train/validation data split if early stopping is enabled. See :term:`random_state`. + Attributes + ---------- + n_estimators_ : int + The number of estimators as selected by early stopping (if + n_iter_no_change is not None). Otherwise it is set to n_estimators. + estimators_ : list of lists, shape=(n_estimators, n_trees_per_iteration) + The collection of fitted sub-estimators. The number of trees per + iteration is ``n_classes`` in multiclass classification, else 1. + train_score_ : array, shape=(n_estimators + 1) + The scores at each iteration on the training data. The first entry is + the score of the ensemble before the first iteration. Scores are + computed according to the ``scoring`` parameter. Empty if no early + stopping. + train_score_ : array, shape=(n_estimators + 1) + The scores at each iteration on the held-out validation data. The + first entry is the score of the ensemble before the first iteration. + Scores are computed according to the ``scoring`` parameter. Empty if + no early stopping or if ``validation_fraction`` is None. Examples -------- @@ -526,7 +549,7 @@ def _get_loss(self): class FastGradientBoostingClassifier(BaseFastGradientBoosting, ClassifierMixin): - """Scikit-learn compatible Gradient Boosting Tree for classification. + """Fast Gradient Boosting Classification Tree. Parameters ---------- @@ -590,6 +613,25 @@ class FastGradientBoostingClassifier(BaseFastGradientBoosting, binning process, and the train/validation data split if early stopping is enabled. See :term:`random_state`. + Attributes + ---------- + n_estimators_ : int + The number of estimators as selected by early stopping (if + n_iter_no_change is not None). Otherwise it is set to n_estimators. + estimators_ : list of lists, shape=(n_estimators, n_trees_per_iteration) + The collection of fitted sub-estimators. The number of trees per + iteration is ``n_classes`` in multiclass classification, else 1. + train_score_ : array, shape=(n_estimators + 1) + The scores at each iteration on the training data. The first entry is + the score of the ensemble before the first iteration. 
Scores are + computed according to the ``scoring`` parameter. Empty if no early + stopping. + train_score_ : array, shape=(n_estimators + 1) + The scores at each iteration on the held-out validation data. The + first entry is the score of the ensemble before the first iteration. + Scores are computed according to the ``scoring`` parameter. Empty if + no early stopping or if ``validation_fraction`` is None. + Examples -------- >>> from sklearn.datasets import load_iris diff --git a/sklearn/_fast_gradient_boosting/splitting.pyx b/sklearn/_fast_gradient_boosting/splitting.pyx index 0acf4b0d08b90..da5d07bdd8db6 100644 --- a/sklearn/_fast_gradient_boosting/splitting.pyx +++ b/sklearn/_fast_gradient_boosting/splitting.pyx @@ -663,7 +663,7 @@ cdef class Splitter: return best_split - # Only used for tests (python code cannot use cdef functions) + # Only used for tests (python code cannot use cdef types) # Not sure if this is a good practice... def find_best_split_wrapper( self, diff --git a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py index 355ad5522ef1c..8547df71463f4 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py @@ -99,7 +99,7 @@ def test_init_parameters_validation(GradientBoosting, X, y): def test_early_stopping_regression(scoring, validation_fraction, n_iter_no_change, tol): - n_estimators = 500 + n_estimators = 200 X, y = make_regression(random_state=0) @@ -113,9 +113,9 @@ def test_early_stopping_regression(scoring, validation_fraction, gb.fit(X, y) if n_iter_no_change is not None: - assert n_iter_no_change <= gb.n_iter_ < n_estimators + assert n_iter_no_change <= gb.n_estimators_ < n_estimators else: - assert gb.n_iter_ == n_estimators + assert gb.n_estimators_ == n_estimators @pytest.mark.parametrize('data', ( @@ -129,11 +129,12 @@ def test_early_stopping_regression(scoring, validation_fraction, (None, None, 5, 1e-1), ('loss', .1, 5, 1e-7), # use loss ('loss', None, 5, 1e-1), # use loss on training data + (None, None, None, None), # no early stopping ]) def test_early_stopping_classification(data, scoring, validation_fraction, n_iter_no_change, tol): - n_estimators = 500 + n_estimators = 50 X, y = data @@ -146,7 +147,10 @@ def test_early_stopping_classification(data, scoring, validation_fraction, random_state=0) gb.fit(X, y) - assert n_iter_no_change <= gb.n_iter_ < n_estimators + if n_iter_no_change is not None: + assert n_iter_no_change <= gb.n_estimators_ < n_estimators + else: + assert gb.n_estimators_ == n_estimators def test_should_stop(): diff --git a/sklearn/_fast_gradient_boosting/tests/test_splitting.py b/sklearn/_fast_gradient_boosting/tests/test_splitting.py index 09658c71c74b7..35bb621a94f1c 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_splitting.py +++ b/sklearn/_fast_gradient_boosting/tests/test_splitting.py @@ -312,7 +312,6 @@ def test_min_gain_to_split(): # possible gain = -1). Note: before the strict inequality comparison, this # test would fail because the node would be split with a gain of 0. 
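For reference, a usage sketch exercising the early-stopping parameters covered by the tests above (scoring='loss', validation_fraction, n_iter_no_change); the import path assumes the estimator is exposed under sklearn.ensemble as listed in classes.rst:

from sklearn.datasets import make_classification
from sklearn.ensemble import FastGradientBoostingClassifier

X, y = make_classification(n_samples=1000, random_state=0)
clf = FastGradientBoostingClassifier(
    n_estimators=200,
    scoring='loss',           # early-stop on the loss
    validation_fraction=0.1,  # held-out fraction used for that loss
    n_iter_no_change=5,
    tol=1e-7,
    random_state=0,
).fit(X, y)
print(clf.n_estimators_)      # number of iterations actually performed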
rng = np.random.RandomState(42) - feature_idx = 0 l2_regularization = 0 min_hessian_to_split = 0 min_samples_leaf = 1 @@ -320,13 +319,11 @@ def test_min_gain_to_split(): n_bins = 255 n_samples = 100 X_binned = np.asfortranarray( - rng.randint(0, n_bins, size=(n_samples, 2)), dtype=X_BINNED_DTYPE) - binned_feature = X_binned.T[feature_idx] + rng.randint(0, n_bins, size=(n_samples, 1)), dtype=X_BINNED_DTYPE) + binned_feature = X_binned[:, 0] sample_indices = np.arange(n_samples, dtype=np.uint32) all_hessians = np.ones_like(binned_feature, dtype=Y_DTYPE) all_gradients = np.ones_like(binned_feature, dtype=Y_DTYPE) - sum_gradients = all_gradients.sum() - sum_hessians = all_hessians.sum() n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) @@ -337,7 +334,5 @@ def test_min_gain_to_split(): min_samples_leaf, min_gain_to_split) histograms = np.zeros(shape=(1, n_bins), dtype=HISTOGRAM_DTYPE) - split_info = splitter.find_best_split_wrapper( - feature_idx, sample_indices, histograms, sum_gradients, - sum_hessians) + split_info = splitter.find_node_split(sample_indices, histograms) assert split_info.gain == -1 From 65ac62a02f164e4b13c1f9831bff3f48277b9355 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 18 Jan 2019 09:51:14 -0500 Subject: [PATCH 060/247] remomved constant_hessian_value --- sklearn/_fast_gradient_boosting/grower.py | 2 +- sklearn/_fast_gradient_boosting/loss.pyx | 11 +++++---- sklearn/_fast_gradient_boosting/splitting.pyx | 24 +++++++------------ 3 files changed, 17 insertions(+), 20 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/grower.py b/sklearn/_fast_gradient_boosting/grower.py index f1021996ae221..3075bd17f3b97 100644 --- a/sklearn/_fast_gradient_boosting/grower.py +++ b/sklearn/_fast_gradient_boosting/grower.py @@ -249,7 +249,7 @@ def _intilialize_root(self): n_samples = self.X_binned.shape[0] depth = 0 sum_gradients = np.sum(self.splitter.gradients) - if self.splitter.constant_hessian: + if self.splitter.hessians_are_constant: sum_hessians = self.splitter.hessians[0] * n_samples else: sum_hessians = np.sum(self.splitter.hessians) diff --git a/sklearn/_fast_gradient_boosting/loss.pyx b/sklearn/_fast_gradient_boosting/loss.pyx index 2cb6a4fb9077d..a18f556883ae1 100644 --- a/sklearn/_fast_gradient_boosting/loss.pyx +++ b/sklearn/_fast_gradient_boosting/loss.pyx @@ -52,7 +52,10 @@ class BaseLoss(ABC): """ shape = n_samples * prediction_dim gradients = np.empty(shape=shape, dtype=Y_DTYPE) - if self.hessian_is_constant: + if self.hessians_are_constant: + # if the hessians are constant, we consider they are equal to 1. + # this is correct as long as we adjust the gradients. See e.g. LS + # loss hessians = np.ones(shape=1, dtype=Y_DTYPE) else: hessians = np.empty(shape=shape, dtype=Y_DTYPE) @@ -111,7 +114,7 @@ class LeastSquares(BaseLoss): loss(x_i) = (y_true_i - raw_pred_i)**2 """ - hessian_is_constant = True + hessians_are_constant = True def __call__(self, y_true, raw_predictions, average=True): # shape (n_samples, 1) --> (n_samples,). reshape(-1) is more likely to @@ -160,7 +163,7 @@ class BinaryCrossEntropy(BaseLoss): See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman. """ - hessian_is_constant = False + hessians_are_constant = False inverse_link_function = staticmethod(expit) def __call__(self, y_true, raw_predictions, average=True): @@ -221,7 +224,7 @@ class CategoricalCrossEntropy(BaseLoss): cross-entropy to more than 2 classes. 
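For the least-squares loss the hessians are identically 1, which is why a single-element hessian array is stored and per-bin hessian sums reduce to bin counts. A sketch of the corresponding gradient update, assuming the common convention that folds the factor of 2 into the loss so the hessians stay equal to 1:

import numpy as np

def update_gradients_least_squares(gradients, y_true, raw_predictions):
    # With the 1/2 factor folded in, the gradient of the squared error is
    # simply (raw - y) and the hessian is the constant 1 for every sample.
    np.subtract(raw_predictions, y_true, out=gradients)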
""" - hessian_is_constant = False + hessians_are_constant = False def __call__(self, y_true, raw_predictions, average=True): one_hot_true = np.zeros_like(raw_predictions) diff --git a/sklearn/_fast_gradient_boosting/splitting.pyx b/sklearn/_fast_gradient_boosting/splitting.pyx index da5d07bdd8db6..44dc09bf97749 100644 --- a/sklearn/_fast_gradient_boosting/splitting.pyx +++ b/sklearn/_fast_gradient_boosting/splitting.pyx @@ -143,8 +143,7 @@ cdef class Splitter: Y_DTYPE_C [::1] ordered_hessians Y_DTYPE_C sum_gradients Y_DTYPE_C sum_hessians - unsigned char constant_hessian - Y_DTYPE_C constant_hessian_value + unsigned char hessians_are_constant Y_DTYPE_C l2_regularization Y_DTYPE_C min_hessian_to_split unsigned int min_samples_leaf @@ -172,15 +171,11 @@ cdef class Splitter: # for root node, gradients and hessians are already ordered self.ordered_gradients = gradients.copy() self.ordered_hessians = hessians.copy() - self.constant_hessian = hessians.shape[0] == 1 + self.hessians_are_constant = hessians.shape[0] == 1 self.l2_regularization = l2_regularization self.min_hessian_to_split = min_hessian_to_split self.min_samples_leaf = min_samples_leaf self.min_gain_to_split = min_gain_to_split - if self.constant_hessian: - self.constant_hessian_value = hessians[0] # 1 scalar - else: - self.constant_hessian_value = 1. # won't be used anyway # The partition array maps each sample index into the leaves of the # tree (a leaf in this context is a node that isn't splitted yet, not @@ -394,7 +389,7 @@ cdef class Splitter: # for root) Ordering the gradients and hessians helps to improve # cache hit. if sample_indices.shape[0] != gradients.shape[0]: - if self.constant_hessian: + if self.hessians_are_constant: for i in prange(n_samples, schedule='static'): ordered_gradients[i] = gradients[sample_indices[i]] else: @@ -405,8 +400,8 @@ cdef class Splitter: # Compute sums of gradients and hessians at the node for i in prange(n_samples, schedule='static'): sum_gradients += ordered_gradients[i] - if self.constant_hessian: - sum_hessians = self.constant_hessian_value * n_samples + if self.hessians_are_constant: + sum_hessians = n_samples else: for i in prange(n_samples, schedule='static'): sum_hessians += ordered_hessians[i] @@ -461,7 +456,7 @@ cdef class Splitter: self.ordered_hessians[:n_samples] if root_node: - if self.constant_hessian: + if self.hessians_are_constant: _build_histogram_root_no_hessian(feature_idx, self.max_bins, X_binned, ordered_gradients, histograms) else: @@ -469,7 +464,7 @@ cdef class Splitter: ordered_gradients, ordered_hessians, histograms) else: - if self.constant_hessian: + if self.hessians_are_constant: _build_histogram_no_hessian(feature_idx, self.max_bins, sample_indices, X_binned, ordered_gradients, histograms) @@ -623,9 +618,8 @@ cdef class Splitter: n_samples_left += histograms[feature_idx, bin_idx].count n_samples_right = n_samples_ - n_samples_left - if self.constant_hessian: - hessian_left += (histograms[feature_idx, bin_idx].count - * self.constant_hessian_value) + if self.hessians_are_constant: + hessian_left += histograms[feature_idx, bin_idx].count else: hessian_left += histograms[feature_idx, bin_idx].sum_hessians hessian_right = sum_hessians - hessian_left From 27d32d65049d084678f4fe6eca340a9fbc08e00c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 18 Jan 2019 10:18:51 -0500 Subject: [PATCH 061/247] removed f-strings --- gdb_test.py | 44 +++++------ sklearn/_fast_gradient_boosting/binning.pyx | 4 +- .../gradient_boosting.py | 76 ++++++++++--------- 
sklearn/_fast_gradient_boosting/grower.py | 32 ++++---- sklearn/_fast_gradient_boosting/splitting.pyx | 6 +- .../tests/test_gradient_boosting.py | 22 +++--- 6 files changed, 95 insertions(+), 89 deletions(-) diff --git a/gdb_test.py b/gdb_test.py index c96a7d851dfd6..a00e14e5e41c6 100644 --- a/gdb_test.py +++ b/gdb_test.py @@ -13,7 +13,7 @@ classif = False n_classes = 2 n_features = 20 -n_samples = int(5e6) +n_samples = int(5e3) max_iter = 5 if classif: @@ -28,21 +28,21 @@ PYGBM_GBM = pygbm.GradientBoostingRegressor -pygbm_est = PYGBM_GBM( - max_iter=max_iter, - scoring=None, # no early stopping - validation_split=None, - random_state=0, - verbose=False) -print("compiling pygbm code") -pygbm_est.fit(X[:1000], y[:1000]) -print("done") +# pygbm_est = PYGBM_GBM( +# max_iter=max_iter, +# scoring=None, # no early stopping +# validation_split=None, +# random_state=0, +# verbose=False) +# print("compiling pygbm code") +# pygbm_est.fit(X[:1000], y[:1000]) +# print("done") gbm = GBM( n_estimators=max_iter, - scoring=None, - validation_fraction=None, - n_iter_no_change=None, + scoring='loss', + validation_fraction=.3, + n_iter_no_change=1000, random_state=0, verbose=True) tic = time() @@ -55,15 +55,15 @@ print(f'sklearn gbm score_duration {score_duration:.3f}s') -pygbm_est.set_params(verbose=True) -tic = time() -pygbm_est.fit(X, y) -fit_duration = time() - tic -tic = time() -print(f'score: {pygbm_est.score(X, y)}') -score_duration = time() - tic -print(f'pygbm fit_duration: {fit_duration:.3f}s') -print(f'pygbm score_duration {score_duration:.3f}s') +# pygbm_est.set_params(verbose=True) +# tic = time() +# pygbm_est.fit(X, y) +# fit_duration = time() - tic +# tic = time() +# print(f'score: {pygbm_est.score(X, y)}') +# score_duration = time() - tic +# print(f'pygbm fit_duration: {fit_duration:.3f}s') +# print(f'pygbm score_duration {score_duration:.3f}s') # cProfile.runctx("gbm.fit(X, y)", globals(), locals(), "Profile.prof") # s = pstats.Stats("Profile.prof") diff --git a/sklearn/_fast_gradient_boosting/binning.pyx b/sklearn/_fast_gradient_boosting/binning.pyx index 3daf590547ddb..ff8cfb179186f 100644 --- a/sklearn/_fast_gradient_boosting/binning.pyx +++ b/sklearn/_fast_gradient_boosting/binning.pyx @@ -35,8 +35,8 @@ def _find_binning_thresholds(data, max_bins=256, subsample=int(2e5), be used to separate the bins. len(binning_thresholds) == n_features. 
""" if not (2 <= max_bins <= 256): - raise ValueError(f'max_bins={max_bins} should be no smaller than 2 ' - f'and no larger than 256.') + raise ValueError('max_bins={} should be no smaller than 2 ' + 'and no larger than 256.'.format(max_bins)) rng = check_random_state(random_state) if subsample is not None and data.shape[0] > subsample: subset = rng.choice(np.arange(data.shape[0]), subsample) diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index 291cb6aded2a7..02c3ba51b590a 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -56,20 +56,21 @@ def _validate_parameters(self): ', '.join(self._VALID_LOSSES))) if self.learning_rate <= 0: - raise ValueError(f'learning_rate={self.learning_rate} must ' - f'be strictly positive') + raise ValueError('learning_rate={} must ' + 'be strictly positive'.format(self.learning_rate)) if self.n_estimators < 1: - raise ValueError(f'n_estimators={self.n_estimators} must ' - f'not be smaller than 1.') + raise ValueError('n_estimators={} must not be smaller ' + 'than 1.'.format(self.n_estimators)) if self.n_iter_no_change is not None and self.n_iter_no_change < 0: - raise ValueError(f'n_iter_no_change={self.n_iter_no_change} ' - f'must be positive.') + raise ValueError('n_iter_no_change={} must be ' + 'positive.'.format(self.n_iter_no_change)) if self.validation_fraction is not None and self.validation_fraction <= 0: - raise ValueError(f'validation_fraction={self.validation_fraction} ' - f'must be strictly positive, or None.') + raise ValueError( + 'validation_fraction={} must be strictly ' + 'positive, or None.'.format(self.validation_fraction)) if self.tol is not None and self.tol < 0: - raise ValueError(f'tol={self.tol} ' - f'must not be smaller than 0.') + raise ValueError('tol={} ' + 'must not be smaller than 0.'.format(self.tol)) def fit(self, X, y): """Fit the gradient boosting model. @@ -107,7 +108,7 @@ def fit(self, X, y): # bin the data if self.verbose: - print(f"Binning {X.nbytes / 1e9:.3f} GB of data: ", end="", + print("Binning {:.3f} GB of data: ".format(X.nbytes / 1e9), end="", flush=True) tic = time() self.bin_mapper_ = BinMapper(max_bins=self.max_bins, random_state=rng) @@ -116,7 +117,7 @@ def fit(self, X, y): if self.verbose: duration = toc - tic troughput = X.nbytes / duration - print(f"{duration:.3f} s ({troughput / 1e6:.3f} MB/s)") + print("{:.3f} s ({:.3f} MB/s)".format(duration, troughput / 1e6)) self.loss_ = self._get_loss() @@ -133,10 +134,12 @@ def fit(self, X, y): stratify=stratify, random_state=rng) if X_binned_train.size == 0 or X_binned_val.size == 0: raise ValueError( - f'Not enough data (n_samples={X_binned.shape[0]}) to ' - f'perform early stopping with validation_fraction=' - f'{self.validation_fraction}. Use more training data or ' - f'adjust validation_fraction.' + 'Not enough data (n_samples={}) to ' + 'perform early stopping with validation_fraction=' + '{}. Use more training data or ' + 'adjust validation_fraction.'.format( + X_binned.shape[0], + self.validation_fraction) ) # Predicting is faster of C-contiguous arrays, training is faster # on Fortran arrays. 
@@ -205,8 +208,8 @@ def fit(self, X, y): if self.verbose: iteration_start_time = time() - print(f"[{iteration + 1}/{self.n_estimators}] ", end='', - flush=True) + print("[{}/{}] ".format(iteration + 1, self.n_estimators), + end='', flush=True) # Update gradients and hessians, inplace self.loss_.update_gradients_and_hessians(gradients, hessians, @@ -268,14 +271,14 @@ def fit(self, X, y): n_predictors = sum( len(predictors_at_ith_iteration) for predictors_at_ith_iteration in self.estimators_) - print(f"Fit {n_predictors} trees in {duration:.3f} s, " - f"({n_total_leaves} total leaves)") - print(f"{'Time spent finding best splits:':<32} " - f"{acc_find_split_time:.3f}s") - print(f"{'Time spent applying splits:':<32} " - f"{acc_apply_split_time:.3f}s") - print(f"{'Time spent predicting:':<32} " - f"{acc_prediction_time:.3f}s") + print("Fit {} trees in {:.3f} s, ({} total leaves)".format( + n_predictors, duration, n_total_leaves)) + print("{:<32} {:.3f}s".format('Time spent finding best splits:', + acc_find_split_time)) + print("{:<32} {:.3f}s".format('Time spent applying splits:', + acc_apply_split_time)) + print("{:<32} {:.3f}s".format('Time spent predicting:', + acc_prediction_time)) self.train_score_ = np.asarray(self.train_score_) self.validation_score_ = np.asarray(self.validation_score_) @@ -349,21 +352,22 @@ def _print_iteration_stats(self, iteration_start_time): for estimator in predictors_of_ith_iteration) if n_trees == 1: - log_msg += (f"{n_trees} tree, {n_leaves} leaves, ") + log_msg += ("{} tree, {} leaves, ".format(n_trees, n_leaves)) else: - log_msg += (f"{n_trees} trees, {n_leaves} leaves ") - log_msg += (f"({int(n_leaves / n_trees)} on avg), ") + log_msg += ("{} trees, {} leaves ".format(n_trees, n_leaves)) + log_msg += ("({} on avg), ".format(int(n_leaves / n_trees))) - log_msg += f"max depth = {max_depth}, " + log_msg += "max depth = {}, ".format(max_depth) if self.do_early_stopping_: - log_msg += f"{self.scoring} train: {self.train_score_[-1]:.5f}, " + name = 'neg-loss' if self.scoring == 'loss' else 'score' + log_msg += "train {}: {:.5f}, ".format(name, self.train_score_[-1]) if self.validation_fraction is not None: - log_msg += (f"{self.scoring} val: " - f"{self.validation_score_[-1]:.5f}, ") + log_msg += "val {}: {:.5f}, ".format(name, + self.validation_score_[-1]) iteration_time = time() - iteration_start_time - log_msg += f"in {iteration_time:0.3f}s" + log_msg += "in {:0.3f}s".format(iteration_time) print(log_msg) @@ -384,8 +388,8 @@ def _raw_predict(self, X): check_is_fitted(self, 'estimators_') if X.shape[1] != self.n_features_: raise ValueError( - f'X has {X.shape[1]} features but this estimator was ' - f'trained with {self.n_features_} features.' 
+ 'X has {} features but this estimator was trained with ' + '{} features.'.format(X.shape[1], self.n_features_) ) is_binned = self._in_fit and X.dtype == X_BINNED_DTYPE n_samples = X.shape[0] diff --git a/sklearn/_fast_gradient_boosting/grower.py b/sklearn/_fast_gradient_boosting/grower.py index 3075bd17f3b97..3a2c973b2a63a 100644 --- a/sklearn/_fast_gradient_boosting/grower.py +++ b/sklearn/_fast_gradient_boosting/grower.py @@ -98,11 +98,11 @@ def __init__(self, depth, sample_indices, sum_gradients, def __repr__(self): # To help with debugging - out = f"TreeNode: depth={self.depth}, " - out += f"samples={len(self.sample_indices)}" + out = "TreeNode: depth={}, ".format(self.depth) + out += "samples={}".format(len(self.sample_indices)) if self.split_info is not None: - out += f", feature_idx={self.split_info.feature_idx}" - out += f", bin_idx={self.split_info.bin_idx}" + out += ", feature_idx={}".format(self.split_info.feature_idx) + out += ", bin_idx={}".format(self.split_info.bin_idx) return out def __lt__(self, other_node): @@ -221,23 +221,23 @@ def _validate_parameters(self, X_binned, max_leaf_nodes, max_depth, "X_binned should be passed as Fortran contiguous " "array for maximum efficiency.") if max_leaf_nodes is not None and max_leaf_nodes < 1: - raise ValueError(f'max_leaf_nodes={max_leaf_nodes} should not be' - f' smaller than 1') + raise ValueError('max_leaf_nodes={} should not be' + ' smaller than 1'.format(max_leaf_nodes)) if max_depth is not None and max_depth < 1: - raise ValueError(f'max_depth={max_depth} should not be' - f' smaller than 1') + raise ValueError('max_depth={} should not be' + ' smaller than 1'.format(max_depth)) if min_samples_leaf < 1: - raise ValueError(f'min_samples_leaf={min_samples_leaf} should ' - f'not be smaller than 1') + raise ValueError('min_samples_leaf={} should ' + 'not be smaller than 1'.format(min_samples_leaf)) if min_gain_to_split < 0: - raise ValueError(f'min_gain_to_split={min_gain_to_split} ' - f'must be positive.') + raise ValueError('min_gain_to_split={} ' + 'must be positive.'.format(min_gain_to_split)) if l2_regularization < 0: - raise ValueError(f'l2_regularization={l2_regularization} must be ' - f'positive.') + raise ValueError('l2_regularization={} must be ' + 'positive.'.format(l2_regularization)) if min_hessian_to_split < 0: - raise ValueError(f'min_hessian_to_split={min_hessian_to_split} ' - f'must be positive.') + raise ValueError('min_hessian_to_split={} ' + 'must be positive.'.format(min_hessian_to_split)) def grow(self): """Grow the tree, from root to leaves.""" diff --git a/sklearn/_fast_gradient_boosting/splitting.pyx b/sklearn/_fast_gradient_boosting/splitting.pyx index 44dc09bf97749..28ad4ffcf9bcf 100644 --- a/sklearn/_fast_gradient_boosting/splitting.pyx +++ b/sklearn/_fast_gradient_boosting/splitting.pyx @@ -136,7 +136,7 @@ cdef class Splitter: const X_BINNED_DTYPE_C [::1, :] X_binned unsigned int n_features unsigned int max_bins - unsigned int [:] n_bins_per_feature + unsigned int [::1] n_bins_per_feature Y_DTYPE_C [::1] gradients Y_DTYPE_C [::1] hessians Y_DTYPE_C [::1] ordered_gradients @@ -376,7 +376,7 @@ cdef class Splitter: split_info_struct * split_infos Y_DTYPE_C sum_gradients = 0. Y_DTYPE_C sum_hessians = 0. 
- # Also, need local views to avoid python interactions + # need local views to avoid python interactions Y_DTYPE_C [::1] ordered_gradients = self.ordered_gradients Y_DTYPE_C [::1] gradients = self.gradients Y_DTYPE_C [::1] ordered_hessians = self.ordered_hessians @@ -596,7 +596,7 @@ cdef class Splitter: (min_gain_to_split, etc.) are discarded here. If no split can satisfy the constraints, a SplitInfo with a gain of -1 is returned. If for a given node the best SplitInfo has a gain of -1, it is - finalized into a leaf. + finalized into a leaf in the grower. """ cdef: unsigned int bin_idx diff --git a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py index 8547df71463f4..bae86eff484f4 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py @@ -26,63 +26,65 @@ def test_init_parameters_validation(GradientBoosting, X, y): for learning_rate in (-1, 0): assert_raises_regex( ValueError, - f"learning_rate={learning_rate} must be strictly positive", + "learning_rate={} must be strictly positive".format(learning_rate), GradientBoosting(learning_rate=learning_rate).fit, X, y ) assert_raises_regex( ValueError, - f"n_estimators=0 must not be smaller than 1", + "n_estimators=0 must not be smaller than 1", GradientBoosting(n_estimators=0).fit, X, y ) assert_raises_regex( ValueError, - f"max_leaf_nodes=0 should not be smaller than 1", + "max_leaf_nodes=0 should not be smaller than 1", GradientBoosting(max_leaf_nodes=0).fit, X, y ) assert_raises_regex( ValueError, - f"max_depth=0 should not be smaller than 1", + "max_depth=0 should not be smaller than 1", GradientBoosting(max_depth=0).fit, X, y ) assert_raises_regex( ValueError, - f"min_samples_leaf=0 should not be smaller than 1", + "min_samples_leaf=0 should not be smaller than 1", GradientBoosting(min_samples_leaf=0).fit, X, y ) assert_raises_regex( ValueError, - f"l2_regularization=-1 must be positive", + "l2_regularization=-1 must be positive", GradientBoosting(l2_regularization=-1).fit, X, y ) for max_bins in (1, 257): assert_raises_regex( ValueError, - f"max_bins={max_bins} should be no smaller than 2 and no larger", + "max_bins={} should be no smaller than 2 and no larger".format( + max_bins), GradientBoosting(max_bins=max_bins).fit, X, y ) assert_raises_regex( ValueError, - f"n_iter_no_change=-1 must be positive", + "n_iter_no_change=-1 must be positive", GradientBoosting(n_iter_no_change=-1).fit, X, y ) for validation_fraction in (-1, 0): assert_raises_regex( ValueError, - f"validation_fraction={validation_fraction} must be strictly positive", + "validation_fraction={} must be strictly positive".format( + validation_fraction), GradientBoosting(validation_fraction=validation_fraction).fit, X, y ) assert_raises_regex( ValueError, - f"tol=-1 must not be smaller than 0", + "tol=-1 must not be smaller than 0", GradientBoosting(tol=-1).fit, X, y ) From 59a74830855ed3f44a9221ccdfc39be579f90369 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 18 Jan 2019 10:26:53 -0500 Subject: [PATCH 062/247] removed unused files --- sklearn/_fast_gradient_boosting/fun.py | 0 .../_fast_gradient_boosting/playground.pyx | 19 ------------------- sklearn/_fast_gradient_boosting/setup.py | 7 ------- 3 files changed, 26 deletions(-) delete mode 100644 sklearn/_fast_gradient_boosting/fun.py delete mode 100644 sklearn/_fast_gradient_boosting/playground.pyx diff --git 
a/sklearn/_fast_gradient_boosting/fun.py b/sklearn/_fast_gradient_boosting/fun.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/sklearn/_fast_gradient_boosting/playground.pyx b/sklearn/_fast_gradient_boosting/playground.pyx deleted file mode 100644 index d84bc1602be68..0000000000000 --- a/sklearn/_fast_gradient_boosting/playground.pyx +++ /dev/null @@ -1,19 +0,0 @@ -import numpy as np -from cython.parallel import prange - - -def wrapper(): - print('in') - a = np.random.uniform(0, 100, size=(100, 100)).astype(np.int32) - g(a) - -cdef int f(int [:] a) nogil: - return 3 - -cdef int g(int [:, :] a) nogil: - - cdef: - int i - - for i in range(a.shape[0]): - f(a[i]) \ No newline at end of file diff --git a/sklearn/_fast_gradient_boosting/setup.py b/sklearn/_fast_gradient_boosting/setup.py index d65b0f36fe74f..398e678f2f31e 100644 --- a/sklearn/_fast_gradient_boosting/setup.py +++ b/sklearn/_fast_gradient_boosting/setup.py @@ -43,14 +43,7 @@ def configuration(parent_package="", top_path=None): sources=["types.pyx"], include_dirs=[numpy.get_include()]) - config.add_extension("playground", - sources=["playground.pyx"], - include_dirs=[numpy.get_include()], - extra_compile_args=['-fopenmp'], - extra_link_args=['-fopenmp']) - config.add_subpackage("tests") - # config.add_data_files("histogram.pxd") return config From 04a99c4aceed471ea6d9045f797d986f5145858a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 18 Jan 2019 10:30:14 -0500 Subject: [PATCH 063/247] removed benchmark files --- bench_binning.py | 85 ---------------------- bench_find_node_split.py | 96 ------------------------- bench_hist.py | 147 --------------------------------------- bench_predict.py | 90 ------------------------ bench_split_indices.py | 102 --------------------------- 5 files changed, 520 deletions(-) delete mode 100644 bench_binning.py delete mode 100644 bench_find_node_split.py delete mode 100644 bench_hist.py delete mode 100644 bench_predict.py delete mode 100644 bench_split_indices.py diff --git a/bench_binning.py b/bench_binning.py deleted file mode 100644 index 6748487f12e19..0000000000000 --- a/bench_binning.py +++ /dev/null @@ -1,85 +0,0 @@ -""" -Compare binning fitting and transform time with pygbm. 
-""" -from time import time -from collections import defaultdict - -import numpy as np -import pygbm -import matplotlib.pyplot as plt -from sklearn.datasets import make_regression - -from sklearn._fast_gradient_boosting.binning import BinMapper - - -n_features = 5 - -max_pow = 7 -n_samples = int(10**max_pow) -X, y = make_regression(n_samples=n_samples, n_features=n_features, - random_state=0) - -print("compiling pygbm") -pygbm_bm = pygbm.binning.BinMapper() -pygbm_bm.fit_transform(X[:1000]) -print('done') - -bm = BinMapper() - -n_samples_list = [10**x for x in range(2, max_pow + 1)] -n_exp = 10 - -transform_durations = defaultdict(lambda: defaultdict(list)) -fit_durations = defaultdict(lambda: defaultdict(list)) - -for n_samples in n_samples_list: - for exp in range(n_exp): - - tic = time() - tic = time() - bm.fit(X[:n_samples]) - fit_duration = time() - tic - print(f"sklearn fit duration = {fit_duration:.3f}") - tic = time() - bm.transform(X[:n_samples]) - transform_duration = time() - tic - print(f"sklearn transform duration = {transform_duration:.3f}") - - fit_durations['sklearn'][n_samples].append(fit_duration) - transform_durations['sklearn'][n_samples].append(transform_duration) - - tic = time() - pygbm_bm.fit(X[:n_samples]) - fit_duration = time() - tic - print(f"pygbm fit duration = {fit_duration:.3f}") - tic = time() - pygbm_bm.transform(X[:n_samples]) - transform_duration = time() - tic - print(f"pygbm transform duration = {transform_duration:.3f}") - fit_durations['pygbm'][n_samples].append(fit_duration) - transform_durations['pygbm'][n_samples].append(transform_duration) - -fig, axs = plt.subplots(2) - -for implem in ('sklearn', 'pygbm'): - avgs = [np.mean(fit_durations[implem][n_samples]) - for n_samples in n_samples_list] - stds = [np.std(fit_durations[implem][n_samples]) - for n_samples in n_samples_list] - axs[0].errorbar(n_samples_list, avgs, yerr=stds, label=implem) - axs[0].set_title('Fit') - -for implem in ('sklearn', 'pygbm'): - avgs = [np.mean(transform_durations[implem][n_samples]) - for n_samples in n_samples_list] - stds = [np.std(transform_durations[implem][n_samples]) - for n_samples in n_samples_list] - axs[1].errorbar(n_samples_list, avgs, yerr=stds, label=implem) - axs[1].set_title('transform') - -for ax in axs: - ax.set_xscale('log') - ax.legend(loc='best') - -fig.suptitle(f'Avg fit and transform time for binning over {n_exp} runs\nfor different sample sizes') -plt.show() diff --git a/bench_find_node_split.py b/bench_find_node_split.py deleted file mode 100644 index 6433fa8ffddab..0000000000000 --- a/bench_find_node_split.py +++ /dev/null @@ -1,96 +0,0 @@ -from collections import defaultdict -from time import time - -import numpy as np -import matplotlib.pyplot as plt -from sklearn._fast_gradient_boosting.types import HISTOGRAM_DTYPE -from sklearn._fast_gradient_boosting.types import X_DTYPE -from sklearn._fast_gradient_boosting.types import X_BINNED_DTYPE -from sklearn._fast_gradient_boosting.types import Y_DTYPE -from sklearn._fast_gradient_boosting.splitting import SplittingContext -from sklearn._fast_gradient_boosting.splitting import find_node_split -from pygbm.splitting import SplittingContext as SplittingContext_pygbm -from pygbm.splitting import find_node_split as find_node_split_pygbm - -rng = np.random.RandomState(42) - -n_bins = 255 -n_features = 20 -l2_regularization = 0. -min_hessian_to_split = 1e-3 -min_samples_leaf = 1 -min_gain_to_split = 0. 
- -max_pow = 7 -n_samples_list = [10**x for x in range(2, max_pow + 1)] -n_exp = 10 - -n_samples = 10**max_pow - -X_binned_ = rng.randint(0, n_bins, size=(n_samples, n_features), dtype=X_BINNED_DTYPE) -sample_indices_ = np.arange(n_samples, dtype=np.uint32) -all_gradients_ = rng.randn(n_samples).astype(Y_DTYPE) -all_hessians_ = rng.lognormal(size=n_samples).astype(Y_DTYPE) - -def one_run(n_samples): - - X_binned = X_binned_[:n_samples] - X_binned = np.asfortranarray(X_binned) - sample_indices = sample_indices_[:n_samples] - all_gradients = all_gradients_[:n_samples] - all_hessians = all_hessians_[:n_samples] - - n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) - - sklearn_context = SplittingContext(X_binned, n_bins, - n_bins_per_feature, - all_gradients, all_hessians, - l2_regularization, min_hessian_to_split, - min_samples_leaf, min_gain_to_split) - all_gradients = all_gradients.astype(np.float32) - all_hessians = all_hessians.astype(np.float32) - pygbm_context = SplittingContext_pygbm(X_binned, n_bins, - n_bins_per_feature, - all_gradients, all_hessians, - l2_regularization, min_hessian_to_split, - min_samples_leaf, min_gain_to_split) - - tic = time() - histograms = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) - _ = find_node_split(sklearn_context, sample_indices, histograms) - sklearn_duration = time() - tic - - tic = time() - _, _ = find_node_split_pygbm(pygbm_context, sample_indices) - pygbm_duration = time() - tic - - return sklearn_duration, pygbm_duration - -one_run(100) # compile pygbm - -durations = defaultdict(lambda: defaultdict(list)) - -for n_samples in n_samples_list: - for exp in range(n_exp): - - sklearn_duration, pygbm_duration = one_run(n_samples) - print(f"sklearn fit duration = {sklearn_duration:.3f}") - print(f"pygbm fit duration = {pygbm_duration:.3f}") - durations['sklearn'][n_samples].append(sklearn_duration) - durations['pygbm'][n_samples].append(pygbm_duration) - -fig, ax = plt.subplots(1) - -for implem in ('sklearn', 'pygbm'): - avgs = [np.mean(durations[implem][n_samples]) - for n_samples in n_samples_list] - stds = [np.std(durations[implem][n_samples]) - for n_samples in n_samples_list] - ax.errorbar(n_samples_list, avgs, yerr=stds, label=implem) - - -ax.set_xscale('log') -ax.legend(loc='best') - -fig.suptitle(f'Avg time for find_node_split {n_exp} runs\nfor different sample sizes') -plt.show() \ No newline at end of file diff --git a/bench_hist.py b/bench_hist.py deleted file mode 100644 index 6156db2317e30..0000000000000 --- a/bench_hist.py +++ /dev/null @@ -1,147 +0,0 @@ -""" -Compare histogram building function with pygbm. - -might be a bit unfair to cython code since we're calling the python versions -of the cpdef functions, which causes unnecessary conversions. 
-""" -from time import time -from collections import defaultdict - -import matplotlib.pyplot as plt -import numpy as np -from joblib import Memory -from pygbm.histogram import _build_histogram_naive as pygbm_build_histogram_naive -from pygbm.histogram import _build_histogram as pygbm_build_histogram -from pygbm.histogram import _build_histogram_no_hessian as pygbm_build_histogram_no_hessian -from pygbm.histogram import _build_histogram_root as pygbm_build_histogram_root -from pygbm.histogram import _build_histogram_root_no_hessian as pygbm_build_histogram_root_no_hessian -from pygbm.histogram import _subtract_histograms as pygbm_subtract_histograms - -from sklearn._fast_gradient_boosting.histogram import _build_histogram_naive -from sklearn._fast_gradient_boosting.histogram import _build_histogram -from sklearn._fast_gradient_boosting.histogram import _build_histogram_no_hessian -from sklearn._fast_gradient_boosting.histogram import _build_histogram_root -from sklearn._fast_gradient_boosting.histogram import _build_histogram_root_no_hessian -from sklearn._fast_gradient_boosting.histogram import _subtract_histograms -from sklearn._fast_gradient_boosting.types import HISTOGRAM_DTYPE -from sklearn._fast_gradient_boosting.types import X_DTYPE -from sklearn._fast_gradient_boosting.types import X_BINNED_DTYPE -from sklearn._fast_gradient_boosting.types import Y_DTYPE - - -m = Memory(location='/tmp') - -@m.cache -def make_data(n_bins=256, n_samples=int(1e8), seed=42): - rng = np.random.RandomState(seed) - - sample_indices = np.arange(n_samples, dtype=np.uint32) - ordered_gradients = rng.randn(n_samples).astype(Y_DTYPE) - ordered_hessians = rng.exponential(size=n_samples).astype(Y_DTYPE) - binned_feature = rng.randint(0, n_bins, size=n_samples, dtype=X_BINNED_DTYPE) - return sample_indices, binned_feature, ordered_gradients, ordered_hessians - - -n_bins = 256 -print(f"Compiling pygbm...") -sample_indices, binned_feature, gradients, hessians = make_data( - n_bins, n_samples=10) -tic = time() -a = pygbm_build_histogram_naive(n_bins, sample_indices, binned_feature, gradients, hessians) -b = pygbm_build_histogram(n_bins, sample_indices, binned_feature, gradients, hessians) -pygbm_subtract_histograms(n_bins, a, b) -pygbm_build_histogram_no_hessian(n_bins, sample_indices, binned_feature, gradients) -pygbm_build_histogram_root(n_bins, binned_feature, gradients, hessians) -pygbm_build_histogram_root_no_hessian(n_bins, binned_feature, gradients) -toc = time() -duration = toc - tic -print(f"done in {duration:.3f}s") - -def one_run(sklearn_fun, pygbm_fun): - print('-' * 10) - print(sklearn_fun.__name__) - - if 'subtract' in sklearn_fun.__name__: - # specal case for subtract... 
crappy - a = pygbm_build_histogram(n_bins, sample_indices, binned_feature, gradients, hessians) - b = pygbm_build_histogram(n_bins, sample_indices, binned_feature, gradients, hessians) - - args = [n_bins, a, b] - tic = time() - pygbm_fun(*args) - pygbm_duration = time() - tic - print(f"pygbm: Built in {pygbm_duration:.3f}s") - - a = a.astype(HISTOGRAM_DTYPE) - b = b.astype(HISTOGRAM_DTYPE) - args = [n_bins, a, b] - tic = time() - histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) - args.append(histogram) - sklearn_fun(*args) - sklearn_duration = time() - tic - print(f"sklearn: Built in {sklearn_duration:.3f}s") - - else: - args = [n_bins] - if not 'root' in sklearn_fun.__name__: - args.append(sample_indices) - args += [binned_feature, gradients, hessians] - if 'no_hessian' in sklearn_fun.__name__: - args.pop() - - tic = time() - pygbm_fun(*args) - pygbm_duration = time() - tic - print(f"pygbm: Built in {pygbm_duration:.3f}s") - - tic = time() - histogram = np.zeros(n_bins, dtype=HISTOGRAM_DTYPE) - args.append(histogram) - sklearn_fun(*args) - sklearn_duration = time() - tic - print(f"sklearn: Built in {sklearn_duration:.3f}s") - - return sklearn_duration, pygbm_duration - -n_exp = 10 -n_samples_list = [10**x for x in range(2, 9)] - - -n_rows = 3 -n_cols = 2 -fig, axs = plt.subplots(n_rows, n_cols, sharex=True) - -for i, (sklearn_fun, pygbm_fun) in enumerate(( - (_build_histogram_naive, pygbm_build_histogram_naive), - (_build_histogram, pygbm_build_histogram), - (_build_histogram_no_hessian, pygbm_build_histogram_no_hessian), - (_build_histogram_root, pygbm_build_histogram_root), - (_build_histogram_root_no_hessian, pygbm_build_histogram_root_no_hessian), - (_subtract_histograms, pygbm_subtract_histograms))): - - row = i // n_cols - col = i % n_cols - ax = axs[row][col] - - durations = defaultdict(lambda: defaultdict(list)) - for n_samples in n_samples_list: - sample_indices, binned_feature, gradients, hessians = make_data( - n_bins, n_samples) - for _ in range(n_exp): - sklearn_duration, pygbm_duration = one_run(sklearn_fun, pygbm_fun) - durations[n_samples]['sklearn'].append(sklearn_duration) - durations[n_samples]['pygbm'].append(pygbm_duration) - - sklearn_avgs = [np.mean(durations[n_samples]['sklearn']) for n_samples in n_samples_list] - sklearn_stds = [np.std(durations[n_samples]['sklearn']) for n_samples in n_samples_list] - ax.errorbar(n_samples_list, sklearn_avgs, yerr=sklearn_stds, label='PR') - - pygbm_avgs = [np.mean(durations[n_samples]['pygbm']) for n_samples in n_samples_list] - pygbm_stds = [np.std(durations[n_samples]['pygbm']) for n_samples in n_samples_list] - ax.errorbar(n_samples_list, pygbm_avgs, yerr=pygbm_stds, label='pygbm') - ax.set_xscale('log') - ax.set_title(sklearn_fun.__name__) - ax.legend() -fig.suptitle(f'Avg histogram computation time over {n_exp} runs\nfor different sample sizes') -plt.show() diff --git a/bench_predict.py b/bench_predict.py deleted file mode 100644 index cf47d9660b17e..0000000000000 --- a/bench_predict.py +++ /dev/null @@ -1,90 +0,0 @@ -""" -Compare prediction time with pygbm. 
-""" - -from time import time -from collections import defaultdict - -import pygbm -import numpy as np -import matplotlib.pyplot as plt - -from sklearn.datasets import make_regression, make_classification -from sklearn._fast_gradient_boosting import FastGradientBoostingRegressor -from sklearn._fast_gradient_boosting import FastGradientBoostingClassifier - -classif = False -n_classes = 3 -max_pow = 7 -n_samples = int(10**max_pow) -max_iter = 20 -n_features = 5 - -if classif: - X, y = make_classification(n_samples=n_samples, n_features=n_features, - random_state=0, n_classes=n_classes, - n_clusters_per_class=1) - GBM = FastGradientBoostingClassifier - PYGBM_GBM = pygbm.GradientBoostingClassifier -else: - X, y = make_regression(n_samples=n_samples, n_features=n_features, - random_state=0) - GBM = FastGradientBoostingRegressor - PYGBM_GBM = pygbm.GradientBoostingRegressor - - -sklearn_est = GBM( - max_iter=max_iter, - scoring=None, # no early stopping - validation_split=None, - n_iter_no_change=None, - random_state=0, - verbose=False) - -pygbm_est = PYGBM_GBM( - max_iter=max_iter, - scoring=None, # no early stopping - validation_split=None, - random_state=0, - verbose=False) -print("compiling pygbm code, and fit estimators") -pygbm_est.fit(X[:1000], y[:1000]) -pygbm_est.predict(X[:1000]) -sklearn_est.fit(X[:1000], y[:1000]) -print("done") - -n_samples_list = [10**x for x in range(2, max_pow + 1)] -n_exp = 3 - -predict_durations = defaultdict(lambda: defaultdict(list)) - -for n_samples in n_samples_list: - for exp in range(n_exp): - - tic = time() - sklearn_est.predict(X[:n_samples]) - predict_duration = time() - tic - print(f'sklearn_est predict_duration: {predict_duration:.3f}s') - - predict_durations['sklearn'][n_samples].append(predict_duration) - - tic = time() - pygbm_est.predict(X[:n_samples]) - predict_duration = time() - tic - print(f'pygbm_est predict_duration: {predict_duration:.3f}s\n') - predict_durations['pygbm'][n_samples].append(predict_duration) - - -fig, ax = plt.subplots(1) - -for implem in ('sklearn', 'pygbm'): - avgs = [np.mean(predict_durations[implem][n_samples]) - for n_samples in n_samples_list] - stds = [np.std(predict_durations[implem][n_samples]) - for n_samples in n_samples_list] - ax.errorbar(n_samples_list, avgs, yerr=stds, label=implem) -ax.set_xscale('log') -ax.legend(loc='best') - -fig.suptitle(f'Avg prediction time over {n_exp} runs\nfor different sample sizes') -plt.show() diff --git a/bench_split_indices.py b/bench_split_indices.py deleted file mode 100644 index f53d69269805f..0000000000000 --- a/bench_split_indices.py +++ /dev/null @@ -1,102 +0,0 @@ -from collections import defaultdict -from time import time - -import numpy as np -import matplotlib.pyplot as plt -from sklearn._fast_gradient_boosting.types import HISTOGRAM_DTYPE -from sklearn._fast_gradient_boosting.types import X_DTYPE -from sklearn._fast_gradient_boosting.types import X_BINNED_DTYPE -from sklearn._fast_gradient_boosting.types import Y_DTYPE -from sklearn._fast_gradient_boosting.splitting import SplittingContext -from sklearn._fast_gradient_boosting.splitting import find_node_split -from sklearn._fast_gradient_boosting.splitting import split_indices -from pygbm.splitting import SplittingContext as SplittingContext_pygbm -from pygbm.splitting import find_node_split as find_node_split_pygbm -from pygbm.splitting import split_indices as split_indices_pygbm - -rng = np.random.RandomState(42) - -n_bins = 255 -n_features = 20 # Number of features has huge impact, it's weird -l2_regularization = 
0. -min_hessian_to_split = 1e-3 -min_samples_leaf = 1 -min_gain_to_split = 0. - -max_pow = 7 -n_samples_list = [10**x for x in range(2, max_pow + 1)] -n_exp = 10 - -n_samples = 10**max_pow - -X_binned_ = rng.randint(0, n_bins, size=(n_samples, n_features), dtype=np.uint8) -sample_indices_ = np.arange(n_samples, dtype=np.uint32) -all_gradients_ = rng.randn(n_samples).astype(Y_DTYPE) -all_hessians_ = rng.lognormal(size=n_samples).astype(Y_DTYPE) - -def one_run(n_samples): - - X_binned = X_binned_[:n_samples] - X_binned = np.asfortranarray(X_binned) - sample_indices = sample_indices_[:n_samples] - all_gradients = all_gradients_[:n_samples] - all_hessians = all_hessians_[:n_samples] - - n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) - - sklearn_context = SplittingContext(X_binned, n_bins, - n_bins_per_feature, - all_gradients, all_hessians, - l2_regularization, min_hessian_to_split, - min_samples_leaf, min_gain_to_split) - all_gradients = all_gradients.astype(np.float32) - all_hessians = all_hessians.astype(np.float32) - pygbm_context = SplittingContext_pygbm(X_binned, n_bins, - n_bins_per_feature, - all_gradients, all_hessians, - l2_regularization, min_hessian_to_split, - min_samples_leaf, min_gain_to_split) - - sample_indices = np.arange(n_samples, dtype=np.uint32) - - histograms = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) - split_info = find_node_split(sklearn_context, sample_indices, histograms) - tic = time() - _, _, _ = split_indices(sklearn_context, split_info, sample_indices) - sklearn_duration = time() - tic - - split_info, _ = find_node_split_pygbm(pygbm_context, sample_indices) - tic = time() - _, _ = split_indices_pygbm(pygbm_context, split_info, sample_indices) - pygbm_duration = time() - tic - - return sklearn_duration, pygbm_duration - -one_run(100) # compile pygbm - -durations = defaultdict(lambda: defaultdict(list)) - -for n_samples in n_samples_list: - for exp in range(n_exp): - - sklearn_duration, pygbm_duration = one_run(n_samples) - print(f"sklearn fit duration = {sklearn_duration:.3f}") - print(f"pygbm fit duration = {pygbm_duration:.3f}") - durations['sklearn'][n_samples].append(sklearn_duration) - durations['pygbm'][n_samples].append(pygbm_duration) - -fig, ax = plt.subplots(1) - -for implem in ('sklearn', 'pygbm'): - avgs = [np.mean(durations[implem][n_samples]) - for n_samples in n_samples_list] - stds = [np.std(durations[implem][n_samples]) - for n_samples in n_samples_list] - ax.errorbar(n_samples_list, avgs, yerr=stds, label=implem) - - -ax.set_xscale('log') -ax.legend(loc='best') - -fig.suptitle(f'Avg time for split_indices over {n_exp} runs\nfor different sample sizes') -plt.show() From e4738ee84338c52358a151df1ff3899e5c6f1ce1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 18 Jan 2019 10:46:03 -0500 Subject: [PATCH 064/247] Added higgs boson benchmark and removed files --- ...bench_fast_gradient_boosting_higgsboson.py | 93 +++++++++++++++++++ push_annotated_cython.sh | 56 ----------- sklearn/tree/_tree.pyx | 4 - sklearn/tree/tree.py | 1 + 4 files changed, 94 insertions(+), 60 deletions(-) create mode 100644 benchmarks/bench_fast_gradient_boosting_higgsboson.py delete mode 100755 push_annotated_cython.sh diff --git a/benchmarks/bench_fast_gradient_boosting_higgsboson.py b/benchmarks/bench_fast_gradient_boosting_higgsboson.py new file mode 100644 index 0000000000000..4305dc378074a --- /dev/null +++ b/benchmarks/bench_fast_gradient_boosting_higgsboson.py @@ -0,0 +1,93 @@ +from urllib.request import urlretrieve 
+import os +from gzip import GzipFile +from time import time +import argparse + +import numpy as np +import pandas as pd +from joblib import Memory +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score, roc_auc_score +from sklearn.ensemble import FastGradientBoostingClassifier +from sklearn._fast_gradient_boosting.utils import get_lightgbm_estimator + + +parser = argparse.ArgumentParser() +parser.add_argument('--n-leaf-nodes', type=int, default=31) +parser.add_argument('--n-trees', type=int, default=10) +parser.add_argument('--lightgbm', action="store_true", default=False) +parser.add_argument('--learning-rate', type=float, default=1.) +parser.add_argument('--subsample', type=int, default=None) +parser.add_argument('--max-bins', type=int, default=255) +args = parser.parse_args() + +HERE = os.path.dirname(__file__) +URL = ("https://archive.ics.uci.edu/ml/machine-learning-databases/00280/" + "HIGGS.csv.gz") +m = Memory(location='/tmp', mmap_mode='r') + +n_leaf_nodes = args.n_leaf_nodes +n_trees = args.n_trees +subsample = args.subsample +lr = args.learning_rate +max_bins = args.max_bins + + +@m.cache +def load_data(): + filename = os.path.join(HERE, URL.rsplit('/', 1)[-1]) + if not os.path.exists(filename): + print(f"Downloading {URL} to {filename} (2.6 GB)...") + urlretrieve(URL, filename) + print("done.") + + print(f"Parsing {filename}...") + tic = time() + with GzipFile(filename) as f: + df = pd.read_csv(f, header=None, dtype=np.float32) + toc = time() + print(f"Loaded {df.values.nbytes / 1e9:0.3f} GB in {toc - tic:0.3f}s") + return df + + +df = load_data() +target = df.values[:, 0] +data = np.ascontiguousarray(df.values[:, 1:]) +data_train, data_test, target_train, target_test = train_test_split( + data, target, test_size=50000, random_state=0) + +if subsample is not None: + data_train, target_train = data_train[:subsample], target_train[:subsample] + +n_samples, n_features = data_train.shape +print(f"Training set with {n_samples} records with {n_features} features.") + +print("Fitting a sklearn model...") +tic = time() +est = FastGradientBoostingClassifier( + loss='binary_crossentropy', + learning_rate=lr, + n_estimators=n_trees, + max_bins=max_bins, + max_leaf_nodes=n_leaf_nodes, + n_iter_no_change=None, + random_state=0, + verbose=1) +est.fit(data_train, target_train) +toc = time() +predicted_test = est.predict(data_test) +roc_auc = roc_auc_score(target_test, predicted_test) +acc = accuracy_score(target_test, predicted_test) +print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}") + +if args.lightgbm: + print("Fitting a LightGBM model...") + tic = time() + lightgbm_est = get_lightgbm_estimator(est) + lightgbm_est.fit(data_train, target_train) + toc = time() + predicted_test = lightgbm_est.predict(data_test) + roc_auc = roc_auc_score(target_test, predicted_test) + acc = accuracy_score(target_test, predicted_test) + print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}") diff --git a/push_annotated_cython.sh b/push_annotated_cython.sh deleted file mode 100755 index 9e7424b995e81..0000000000000 --- a/push_annotated_cython.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/sh - -set -e # exit if any command fails - - -BRANCH=gbm -SOURCE_DIR=/home/nico/dev/sklearn/sklearn/ensemble/gbm -TARGET_DIR=/home/nico/dev/cython_annotations - -ORIGINAL_DIR=`pwd` - - -git co $BRANCH - -# Commits in the branch (provided it branched off master) -COMMITS=`git log master.. 
--pretty=format:"%h"` - -annotate_and_copy_files() { - # For a give commit, annotate all pyx file in SOURCE_DIR and copy the html - # files in TARGET_DIR/COMMIT_HASH/ - - git co $1 # checkout commit - rm -f $SOURCE_DIR/*.html # remove any previous file just in case - for pyx_file in `ls $SOURCE_DIR/*.pyx` - do - echo 'annotating' $1 $pyx_file - cython -a $pyx_file - done - - for html_file in `ls $SOURCE_DIR/*.html` - do - mkdir -p $TARGET_DIR/$1 - mv $html_file $TARGET_DIR/$1 - html_file_name=$(basename -- "$html_file") # without path - echo moved $html_file_name to $TARGET_DIR/$1 - done -} - -for commit in $COMMITS -do - annotate_and_copy_files $commit -done - - -# Get into target dir, commit html files and push them. -cd $TARGET_DIR -git co gh-pages -echo Generating index.html -python lol.py # generates index.html with links to each file -echo Committing and pushing files -git add . -git ci -am "Added some annotated cython files" -git push - -cd $ORIGINAL_DIR # go back where we were -git co $BRANCH # Probably useless since with checked out the last commit diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index d7ce5d195ac11..ed259c98ac850 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -604,10 +604,6 @@ cdef class Tree: def __get__(self): return self._get_value_ndarray()[:self.node_count] - property nodes: - def __get__(self): - return self._get_node_ndarray() - def __cinit__(self, int n_features, np.ndarray[SIZE_t, ndim=1] n_classes, int n_outputs): """Constructor.""" diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 1bf35f28d3d65..cd6a798291cf6 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -359,6 +359,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, self.presort) self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_) + # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise if max_leaf_nodes < 0: builder = DepthFirstTreeBuilder(splitter, min_samples_split, From 2341a04ab7297b11bf9b3c9dd78560e24747cae4 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 18 Jan 2019 12:01:57 -0500 Subject: [PATCH 065/247] Added another benchmark --- benchmarks/bench_fast_gradient_boosting.py | 160 ++++++++++++++++++ .../gradient_boosting.py | 4 +- sklearn/_fast_gradient_boosting/utils.py | 11 +- 3 files changed, 168 insertions(+), 7 deletions(-) create mode 100644 benchmarks/bench_fast_gradient_boosting.py diff --git a/benchmarks/bench_fast_gradient_boosting.py b/benchmarks/bench_fast_gradient_boosting.py new file mode 100644 index 0000000000000..aec326e735421 --- /dev/null +++ b/benchmarks/bench_fast_gradient_boosting.py @@ -0,0 +1,160 @@ +from urllib.request import urlretrieve +import os +from gzip import GzipFile +from time import time +import argparse + +import numpy as np +import pandas as pd +from joblib import Memory +import matplotlib.pyplot as plt +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score, roc_auc_score +from sklearn.ensemble import FastGradientBoostingClassifier +from sklearn.ensemble import FastGradientBoostingRegressor +from sklearn.datasets import make_classification +from sklearn.datasets import make_regression +from sklearn._fast_gradient_boosting.utils import get_lightgbm_estimator + + +parser = argparse.ArgumentParser() +parser.add_argument('--n-leaf-nodes', type=int, default=31) +parser.add_argument('--n-trees', type=int, default=10) +parser.add_argument('--lightgbm', action="store_true", default=False, + help='also plot 
lightgbm') +parser.add_argument('--learning-rate', type=float, default=1.) +parser.add_argument('--problem', type=str, default='classification', + choices=['classification', 'regression']) +parser.add_argument('--n-classes', type=int, default=2) +parser.add_argument('--n-samples-max', type=int, default=int(1e6)) +parser.add_argument('--n-features', type=int, default=20) +parser.add_argument('--max-bins', type=int, default=255) +args = parser.parse_args() + +n_leaf_nodes = args.n_leaf_nodes +n_trees = args.n_trees +lr = args.learning_rate +max_bins = args.max_bins + +def get_estimator_and_data(): + if args.problem == 'classification': + X, y = make_classification(args.n_samples_max, + n_features=args.n_features, + n_classes=args.n_classes, + n_clusters_per_class=1, + random_state=0) + return X, y, FastGradientBoostingClassifier + elif args.problem == 'regression': + X, y = make_regression(args.n_samples_max, + n_features=args.n_features, random_state=0) + return X, y, FastGradientBoostingRegressor + + +X, y, Estimator = get_estimator_and_data() +X_train_, X_test_, y_train_, y_test_ = train_test_split(X, y, random_state=0) + + +def one_run(n_samples): + X_train = X_train_[:n_samples] + X_test = X_test_[:n_samples] + y_train = y_train_[:n_samples] + y_test = y_test_[:n_samples] + + print("Fitting a sklearn model...") + tic = time() + est = Estimator(learning_rate=lr, + n_estimators=n_trees, + max_bins=max_bins, + max_leaf_nodes=n_leaf_nodes, + n_iter_no_change=None, + random_state=0, + verbose=0) + est.fit(X_train, y_train) + sklearn_fit_duration = time() - tic + tic = time() + sklearn_score = est.score(X_test, y_test) + sklearn_score_duration = time() - tic + print("score: {:.4f}".format(sklearn_score)) + print("fit duration: {:.3f}s,".format(sklearn_fit_duration)) + print("score duration: {:.3f}s,".format(sklearn_score_duration)) + + if args.lightgbm: + print("Fitting a LightGBM model...") + # get_lightgbm does not accept loss='auto' + if args.problem == 'classification': + loss = 'binary_crossentropy' if args.n_classes == 2 else \ + 'categorical_crossentropy' + est.set_params(loss=loss) + lightgbm_est = get_lightgbm_estimator(est) + + tic = time() + lightgbm_est.fit(X_train, y_train) + lightgbm_fit_duration = time() - tic + tic = time() + lightgbm_score = lightgbm_est.score(X_test, y_test) + lightgbm_score_duration = time() - tic + print("score: {:.4f}".format(lightgbm_score)) + print("fit duration: {:.3f}s,".format(lightgbm_fit_duration)) + print("score duration: {:.3f}s,".format(lightgbm_score_duration)) + + return (sklearn_score, sklearn_fit_duration, sklearn_score_duration, + lightgbm_score, lightgbm_fit_duration, + lightgbm_score_duration) + + return (sklearn_score, sklearn_fit_duration, sklearn_score_duration, + None, None, None) + +n_samples_list = [1000, 10000, 100000, 500000, 1000000, 5000000, 10000000] +n_samples_list = [n_samples for n_samples in n_samples_list + if n_samples <= args.n_samples_max] + +sklearn_scores = [] +sklearn_fit_durations = [] +sklearn_score_durations = [] +lightgbm_scores = [] +lightgbm_fit_durations = [] +lightgbm_score_durations = [] + +for n_samples in n_samples_list: + (sklearn_score, + sklearn_fit_duration, + sklearn_score_duration, + lightgbm_score, + lightgbm_fit_duration, + lightgbm_score_duration) = one_run(n_samples) + + sklearn_scores.append(sklearn_score) + sklearn_fit_durations.append(sklearn_fit_duration) + sklearn_score_durations.append(sklearn_score_duration) + lightgbm_scores.append(lightgbm_score) + 
lightgbm_fit_durations.append(lightgbm_fit_duration) + lightgbm_score_durations.append(lightgbm_score_duration) + +fig, axs = plt.subplots(3, sharex=True) + +axs[0].plot(n_samples_list, sklearn_scores, label='sklearn') +axs[1].plot(n_samples_list, sklearn_fit_durations, label='sklearn') +axs[2].plot(n_samples_list, sklearn_score_durations, label='sklearn') + +if args.lightgbm: + axs[0].plot(n_samples_list, lightgbm_scores, label='lgbm') + axs[1].plot(n_samples_list, lightgbm_fit_durations, label='lgbm') + axs[2].plot(n_samples_list, lightgbm_score_durations, label='lgbm') + +for ax in axs: + ax.set_xscale('log') + ax.legend(loc='best') + ax.set_xlabel('n_samples') + +axs[0].set_title('scores') +axs[1].set_title('fit duration (s)') +axs[2].set_title('score duration (s)') + +title = args.problem +if args.problem == 'classification': + title += ' n_classes = {}'.format(args.n_classes) +fig.suptitle(title) + + +plt.tight_layout() +plt.show() diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index 02c3ba51b590a..c4d11bf4da857 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -51,8 +51,8 @@ def _validate_parameters(self): if self.loss not in self._VALID_LOSSES: raise ValueError( - "Loss {} is not supported for {}. Accepted losses" - "are {}.".format(self.loss, self.__class__.__name__, + "Loss {} is not supported for {}. Accepted losses: " + "{}.".format(self.loss, self.__class__.__name__, ', '.join(self._VALID_LOSSES))) if self.learning_rate <= 0: diff --git a/sklearn/_fast_gradient_boosting/utils.py b/sklearn/_fast_gradient_boosting/utils.py index f9c9b59f42849..5a568f30465a3 100644 --- a/sklearn/_fast_gradient_boosting/utils.py +++ b/sklearn/_fast_gradient_boosting/utils.py @@ -34,17 +34,18 @@ def get_lightgbm_estimator(pygbm_estimator): 'n_estimators': pygbm_params['n_estimators'], 'num_leaves': pygbm_params['max_leaf_nodes'], 'max_depth': pygbm_params['max_depth'], - 'min_data_in_leaf': pygbm_params['min_samples_leaf'], - 'lambda_l2': pygbm_params['l2_regularization'], + 'min_child_samples': pygbm_params['min_samples_leaf'], + 'reg_lambda': pygbm_params['l2_regularization'], 'max_bin': pygbm_params['max_bins'], 'min_data_in_bin': 1, + 'min_child_weight': 1e-3, 'min_sum_hessian_in_leaf': 1e-3, - 'min_gain_to_split': 0, - 'verbosity': 10 if pygbm_params['verbose'] else 0, + 'min_split_gain': 0, + 'verbosity': 10 if pygbm_params['verbose'] else -10, 'boost_from_average': True, 'enable_bundle': False, # also makes feature order consistent 'min_data_in_bin': 1, - 'bin_construct_sample_cnt': BinMapper().subsample, + 'subsample_for_bin': BinMapper().subsample, } # TODO: change hardcoded values when / if they're arguments to the # estimator. 
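
[Note, not part of the patch series: the utils.py change above maps the scikit-learn parameters onto LightGBM's sklearn-style aliases (min_child_samples, reg_lambda, min_split_gain, subsample_for_bin), which is what lets the benchmark scripts fit an "equivalent" LightGBM model from the same configuration. A minimal sketch of that workflow, assuming the lightgbm package is installed and using only the estimator and helper names that appear in the diffs above:

# Minimal sketch (editorial illustration, not a file from these patches).
# Assumes lightgbm is installed; names below come from the diffs above.
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import FastGradientBoostingRegressor
from sklearn._fast_gradient_boosting.utils import get_lightgbm_estimator

X, y = make_regression(n_samples=10000, n_features=20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

est = FastGradientBoostingRegressor(n_estimators=10, learning_rate=.1,
                                    max_leaf_nodes=31, max_bins=255,
                                    n_iter_no_change=None, random_state=0)
est.fit(X_train, y_train)

# Build the LightGBM counterpart; num_leaves, reg_lambda, max_bin, etc. are
# derived from the scikit-learn parameters by the mapping shown above.
lgbm_est = get_lightgbm_estimator(est)
lgbm_est.fit(X_train, y_train)

print('sklearn  R^2: {:.4f}'.format(est.score(X_test, y_test)))
print('lightgbm R^2: {:.4f}'.format(lgbm_est.score(X_test, y_test)))

This mirrors what benchmarks/bench_fast_gradient_boosting.py does in one_run(); for classification the loss has to be set explicitly (binary_crossentropy or categorical_crossentropy) before calling the helper, since the mapping does not accept loss='auto'.]
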
From 29ffcdf7f7e65dcd67db503147f7ffd2bfa5fa0e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 18 Jan 2019 12:20:01 -0500 Subject: [PATCH 066/247] changed benchmark default learning rate --- benchmarks/bench_fast_gradient_boosting.py | 2 +- gdb_test.py | 71 ---------------------- 2 files changed, 1 insertion(+), 72 deletions(-) delete mode 100644 gdb_test.py diff --git a/benchmarks/bench_fast_gradient_boosting.py b/benchmarks/bench_fast_gradient_boosting.py index aec326e735421..7c297196225be 100644 --- a/benchmarks/bench_fast_gradient_boosting.py +++ b/benchmarks/bench_fast_gradient_boosting.py @@ -22,7 +22,7 @@ parser.add_argument('--n-trees', type=int, default=10) parser.add_argument('--lightgbm', action="store_true", default=False, help='also plot lightgbm') -parser.add_argument('--learning-rate', type=float, default=1.) +parser.add_argument('--learning-rate', type=float, default=.1) parser.add_argument('--problem', type=str, default='classification', choices=['classification', 'regression']) parser.add_argument('--n-classes', type=int, default=2) diff --git a/gdb_test.py b/gdb_test.py deleted file mode 100644 index a00e14e5e41c6..0000000000000 --- a/gdb_test.py +++ /dev/null @@ -1,71 +0,0 @@ -from time import time - -from sklearn.datasets import make_regression, make_classification -from sklearn.ensemble import GradientBoostingRegressor -from sklearn.ensemble import GradientBoostingClassifier -from sklearn.ensemble import FastGradientBoostingRegressor -from sklearn.ensemble import FastGradientBoostingClassifier - -import pstats -import cProfile -import pygbm - -classif = False -n_classes = 2 -n_features = 20 -n_samples = int(5e3) -max_iter = 5 - -if classif: - X, y = make_classification(n_samples=n_samples, n_features=n_features, random_state=0, n_classes=n_classes, n_clusters_per_class=1) - GBM = FastGradientBoostingClassifier - GBDT = GradientBoostingClassifier - PYGBM_GBM = pygbm.GradientBoostingClassifier -else: - X, y = make_regression(n_samples=n_samples, n_features=n_features, random_state=0) - GBM = FastGradientBoostingRegressor - GBDT = GradientBoostingRegressor - PYGBM_GBM = pygbm.GradientBoostingRegressor - - -# pygbm_est = PYGBM_GBM( -# max_iter=max_iter, -# scoring=None, # no early stopping -# validation_split=None, -# random_state=0, -# verbose=False) -# print("compiling pygbm code") -# pygbm_est.fit(X[:1000], y[:1000]) -# print("done") - -gbm = GBM( - n_estimators=max_iter, - scoring='loss', - validation_fraction=.3, - n_iter_no_change=1000, - random_state=0, - verbose=True) -tic = time() -gbm.fit(X, y) -fit_duration = time() - tic -tic = time() -print(f'score: {gbm.score(X, y)}') -score_duration = time() - tic -print(f'sklearn gbm fit_duration: {fit_duration:.3f}s') -print(f'sklearn gbm score_duration {score_duration:.3f}s') - - -# pygbm_est.set_params(verbose=True) -# tic = time() -# pygbm_est.fit(X, y) -# fit_duration = time() - tic -# tic = time() -# print(f'score: {pygbm_est.score(X, y)}') -# score_duration = time() - tic -# print(f'pygbm fit_duration: {fit_duration:.3f}s') -# print(f'pygbm score_duration {score_duration:.3f}s') - -# cProfile.runctx("gbm.fit(X, y)", globals(), locals(), "Profile.prof") -# s = pstats.Stats("Profile.prof") -# s.strip_dirs().sort_stats("time").print_stats(.2) - From b4ba169315cf6cf26a1a4ee77d3fb502ccb5d6c1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 18 Jan 2019 17:01:07 -0500 Subject: [PATCH 067/247] used custom expit function --- sklearn/_fast_gradient_boosting/loss.pyx | 44 ++++++++++++++---------- 1 file changed, 25 
insertions(+), 19 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/loss.pyx b/sklearn/_fast_gradient_boosting/loss.pyx index a18f556883ae1..416de1d6be2df 100644 --- a/sklearn/_fast_gradient_boosting/loss.pyx +++ b/sklearn/_fast_gradient_boosting/loss.pyx @@ -16,7 +16,6 @@ from cython.parallel import prange import numpy as np cimport numpy as np from scipy.special import expit, logsumexp -from scipy.special.cython_special cimport expit as cexpit from libc.math cimport fabs, exp, log @@ -258,23 +257,6 @@ class CategoricalCrossEntropy(BaseLoss): logsumexp(raw_predictions, axis=1)[:, np.newaxis]) -cdef inline Y_DTYPE_C _logsumexp(const Y_DTYPE_C [:, :] a, const int row) nogil: - # Need to pass the whole array, else prange won't work. See Cython issue - # #2798 - cdef: - int k - Y_DTYPE_C out = 0. - Y_DTYPE_C amax = a[row, 0] - - for k in range(1, a.shape[1]): - if amax < a[row, k]: - amax = a[row, k] - - for k in range(a.shape[1]): - out += exp(a[row, k] - amax) - return log(out) + amax - - cdef void _update_gradients_hessians_categorical_crossentropy( Y_DTYPE_C [:] gradients, # shape (n_samples * prediction_dim,), OUT Y_DTYPE_C [:] hessians, # shape (n_samples * prediction_dim,), OUT @@ -298,11 +280,35 @@ cdef void _update_gradients_hessians_categorical_crossentropy( for i in prange(n_samples, schedule='static'): # p_k is the probability that class(ith sample) == k. # This is a regular softmax. - p_k = exp(raw_predictions[i, k] - _logsumexp(raw_predictions, i)) + p_k = exp(raw_predictions[i, k] - clogsumexp(raw_predictions, i)) gradients_at_k[i] = p_k - (y_true[i] == k) hessians_at_k[i] = p_k * (1. - p_k) +cdef inline Y_DTYPE_C cexpit(const Y_DTYPE_C x) nogil: + return 1. / (1 + exp(-x)) + + +cdef inline Y_DTYPE_C clogsumexp( + const Y_DTYPE_C [:, :] a, + const int row) nogil: + # Need to pass the whole array, else prange won't work. See Cython issue + # #2798 + cdef: + int k + Y_DTYPE_C out = 0. + Y_DTYPE_C amax = a[row, 0] + + for k in range(1, a.shape[1]): + if amax < a[row, k]: + amax = a[row, k] + + for k in range(a.shape[1]): + out += exp(a[row, k] - amax) + return log(out) + amax + + + _LOSSES = { 'least_squares': LeastSquares, 'binary_crossentropy': BinaryCrossEntropy, From e66fff229cf57bfccd2e3c9c8e342192c5302ff7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 18 Jan 2019 17:01:17 -0500 Subject: [PATCH 068/247] doc --- sklearn/_fast_gradient_boosting/gradient_boosting.py | 4 ++-- sklearn/ensemble/gradient_boosting.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index c4d11bf4da857..a5973d74d6b85 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -492,7 +492,7 @@ class FastGradientBoostingRegressor(BaseFastGradientBoosting, RegressorMixin): the score of the ensemble before the first iteration. Scores are computed according to the ``scoring`` parameter. Empty if no early stopping. - train_score_ : array, shape=(n_estimators + 1) + validation_score_ : array, shape=(n_estimators + 1) The scores at each iteration on the held-out validation data. The first entry is the score of the ensemble before the first iteration. Scores are computed according to the ``scoring`` parameter. Empty if @@ -630,7 +630,7 @@ class FastGradientBoostingClassifier(BaseFastGradientBoosting, the score of the ensemble before the first iteration. 
Scores are computed according to the ``scoring`` parameter. Empty if no early stopping. - train_score_ : array, shape=(n_estimators + 1) + validation_score_ : array, shape=(n_estimators + 1) The scores at each iteration on the held-out validation data. The first entry is the score of the ensemble before the first iteration. Scores are computed according to the ``scoring`` parameter. Empty if diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index e0f3d9e4c35f7..1eafbe48b8395 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -1922,8 +1922,8 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin): See also -------- - sklearn.tree.DecisionTreeClassifier, RandomForestClassifier - AdaBoostClassifier + FastGradientBoostingClassifier, sklearn.tree.DecisionTreeClassifier, + RandomForestClassifier AdaBoostClassifier References ---------- @@ -2378,7 +2378,8 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin): See also -------- - DecisionTreeRegressor, RandomForestRegressor + FastGradientBoostingRegressor, sklearn.tree.DecisionTreeRegressor, + RandomForestRegressor References ---------- From c75acca545aa07766fff09b92d32deb68fde50de Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 18 Jan 2019 17:08:22 -0500 Subject: [PATCH 069/247] Added decision_function --- .../gradient_boosting.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index a5973d74d6b85..e183627284827 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -696,6 +696,27 @@ def predict_proba(self, X): raw_predictions = self._raw_predict(X) return self.loss_.predict_proba(raw_predictions) + def decision_function(self, X): + """Compute the decision function of X + + Parameters + ---------- + X : array-like, shape=(n_samples, n_features) + The input samples. + + Returns + ------- + decision : array, shape (n_samples,) or \ + (n_samples, n_trees_per_iteration) + The raw predicted values (i.e. the sum of the trees leaves) for + each sample. n_trees_per_iteration is equal to the number of + classes in multiclass classification. + """ + decision = self._raw_predict(X) + if decision.shape[1] == 1: + decision = decision.ravel() + return decision + def _encode_y(self, y): # encode classes into 0 ... 
n_classes - 1 and sets attributes classes_ # and n_trees_per_iteration_ From 9ff4242d091627ee4c9aa45f8e85948045cfa0bb Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 18 Jan 2019 17:31:19 -0500 Subject: [PATCH 070/247] Using openmp flags from #11950 --- build_tools/travis/install.sh | 9 +++++ setup.py | 47 +++++++++++++++++++++++- sklearn/_fast_gradient_boosting/setup.py | 20 +++------- 3 files changed, 59 insertions(+), 17 deletions(-) diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index d79f8845a3d89..d0fb0409987d9 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -25,6 +25,13 @@ then # export CCACHE_LOGFILE=/tmp/ccache.log # ~60M is used by .ccache when compiling from scratch at the time of writing ccache --max-size 100M --show-stats +elif [ $TRAVIS_OS_NAME = "osx" ] +then + # use clang installed by conda which supports OpenMP + export CC=clang + export CXX=clang + # avoid error due to multiple openmp libraries loaded simultaneously + export KMP_DUPLICATE_LIB_OK=TRUE fi make_conda() { @@ -38,6 +45,8 @@ make_conda() { if [ $TRAVIS_OS_NAME = "osx" ] then fname=Miniconda3-latest-MacOSX-x86_64.sh + # we need to install a version on clang which supports OpenMP + TO_INSTALL="$TO_INSTALL llvm-openmp clang" else fname=Miniconda3-latest-Linux-x86_64.sh fi diff --git a/setup.py b/setup.py index cce21f5883c5a..9788b3c51f9bd 100755 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ 'develop', 'release', 'bdist_egg', 'bdist_rpm', 'bdist_wininst', 'install_egg_info', 'build_sphinx', 'egg_info', 'easy_install', 'upload', 'bdist_wheel', - '--single-version-externally-managed', + '--single-version-externally-managed', 'build_ext', ]) if SETUPTOOLS_COMMANDS.intersection(sys.argv): import setuptools @@ -102,7 +102,50 @@ def run(self): shutil.rmtree(os.path.join(dirpath, dirname)) -cmdclass = {'clean': CleanCommand} +def get_openmp_flag(compiler): + if sys.platform == "win32" and ('icc' in compiler or 'icl' in compiler): + return ['/Qopenmp'] + elif sys.platform == "win32": + return ['/openmp'] + elif sys.platform == "darwin" and ('icc' in compiler or 'icl' in compiler): + return ['-openmp'] + return ['-fopenmp'] + + +OPENMP_EXTENSIONS = [ + "sklearn._fast_gradient_boosting._gradient_boosting", + "sklearn._fast_gradient_boosting.splitting", + "sklearn._fast_gradient_boosting.binning", + "sklearn._fast_gradient_boosting.predictor", + "sklearn._fast_gradient_boosting.loss", +] + + +# custom build_ext command to set OpenMP compile flags depending on os and +# compiler +# build_ext has to be imported after setuptools +from numpy.distutils.command.build_ext import build_ext # noqa + + +class build_ext_subclass(build_ext): + def build_extensions(self): + if hasattr(self.compiler, 'compiler'): + compiler = self.compiler.compiler[0] + else: + compiler = self.compiler.__class__.__name__ + + openmp_flag = get_openmp_flag(compiler) + + for e in self.extensions: + print(e.name) + if e.name in OPENMP_EXTENSIONS: + e.extra_compile_args += openmp_flag + e.extra_link_args += openmp_flag + + build_ext.build_extensions(self) + + +cmdclass = {'clean': CleanCommand, 'build_ext': build_ext_subclass} # Optional wheelhouse-uploader features # To automate release of binary packages for scikit-learn we need a tool diff --git a/sklearn/_fast_gradient_boosting/setup.py b/sklearn/_fast_gradient_boosting/setup.py index 398e678f2f31e..6dc60867f6c68 100644 --- a/sklearn/_fast_gradient_boosting/setup.py +++ b/sklearn/_fast_gradient_boosting/setup.py @@ -7,9 +7,7 @@ def 
configuration(parent_package="", top_path=None): config.add_extension("_gradient_boosting", sources=["_gradient_boosting.pyx"], - include_dirs=[numpy.get_include()], - extra_compile_args=['-fopenmp'], - extra_link_args=['-fopenmp']) + include_dirs=[numpy.get_include()]) config.add_extension("histogram", sources=["histogram.pyx"], @@ -17,27 +15,19 @@ def configuration(parent_package="", top_path=None): config.add_extension("splitting", sources=["splitting.pyx"], - include_dirs=[numpy.get_include()], - extra_compile_args=['-fopenmp'], - extra_link_args=['-fopenmp']) + include_dirs=[numpy.get_include()]) config.add_extension("binning", sources=["binning.pyx"], - include_dirs=[numpy.get_include()], - extra_compile_args=['-fopenmp'], - extra_link_args=['-fopenmp']) + include_dirs=[numpy.get_include()]) config.add_extension("predictor", sources=["predictor.pyx"], - include_dirs=[numpy.get_include()], - extra_compile_args=['-fopenmp'], - extra_link_args=['-fopenmp']) + include_dirs=[numpy.get_include()]) config.add_extension("loss", sources=["loss.pyx"], - include_dirs=[numpy.get_include()], - extra_compile_args=['-fopenmp'], - extra_link_args=['-fopenmp']) + include_dirs=[numpy.get_include()]) config.add_extension("types", sources=["types.pyx"], From d782d02239af0ff94b6a9554096e4240400056c0 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 18 Jan 2019 17:34:31 -0500 Subject: [PATCH 071/247] scipy logsumexp import from misc if error --- sklearn/_fast_gradient_boosting/loss.pyx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sklearn/_fast_gradient_boosting/loss.pyx b/sklearn/_fast_gradient_boosting/loss.pyx index 416de1d6be2df..9961a1008d692 100644 --- a/sklearn/_fast_gradient_boosting/loss.pyx +++ b/sklearn/_fast_gradient_boosting/loss.pyx @@ -15,7 +15,11 @@ cimport cython from cython.parallel import prange import numpy as np cimport numpy as np -from scipy.special import expit, logsumexp +from scipy.special import expit +try: + from scipy.special import logsumexp +except ImportError: + from scipy.misc import logsumexp from libc.math cimport fabs, exp, log From ea53299426883e13bba4ef598b843a69da5c4ed7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 18 Jan 2019 17:46:35 -0500 Subject: [PATCH 072/247] pep8 --- benchmarks/bench_fast_gradient_boosting.py | 9 +--- .../gradient_boosting.py | 9 ++-- .../tests/test_gradient_boosting.py | 53 ++++++++++--------- .../tests/test_histogram.py | 4 +- 4 files changed, 37 insertions(+), 38 deletions(-) diff --git a/benchmarks/bench_fast_gradient_boosting.py b/benchmarks/bench_fast_gradient_boosting.py index 7c297196225be..31b96182b8039 100644 --- a/benchmarks/bench_fast_gradient_boosting.py +++ b/benchmarks/bench_fast_gradient_boosting.py @@ -1,15 +1,8 @@ -from urllib.request import urlretrieve -import os -from gzip import GzipFile from time import time import argparse -import numpy as np -import pandas as pd -from joblib import Memory import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split -from sklearn.metrics import accuracy_score, roc_auc_score from sklearn.ensemble import FastGradientBoostingClassifier from sklearn.ensemble import FastGradientBoostingRegressor from sklearn.datasets import make_classification @@ -36,6 +29,7 @@ lr = args.learning_rate max_bins = args.max_bins + def get_estimator_and_data(): if args.problem == 'classification': X, y = make_classification(args.n_samples_max, @@ -104,6 +98,7 @@ def one_run(n_samples): return (sklearn_score, sklearn_fit_duration, 
sklearn_score_duration, None, None, None) + n_samples_list = [1000, 10000, 100000, 500000, 1000000, 5000000, 10000000] n_samples_list = [n_samples for n_samples in n_samples_list if n_samples <= args.n_samples_max] diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index e183627284827..fa196c8d343ba 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -53,7 +53,7 @@ def _validate_parameters(self): raise ValueError( "Loss {} is not supported for {}. Accepted losses: " "{}.".format(self.loss, self.__class__.__name__, - ', '.join(self._VALID_LOSSES))) + ', '.join(self._VALID_LOSSES))) if self.learning_rate <= 0: raise ValueError('learning_rate={} must ' @@ -64,7 +64,8 @@ def _validate_parameters(self): if self.n_iter_no_change is not None and self.n_iter_no_change < 0: raise ValueError('n_iter_no_change={} must be ' 'positive.'.format(self.n_iter_no_change)) - if self.validation_fraction is not None and self.validation_fraction <= 0: + if (self.validation_fraction is not None and + self.validation_fraction <= 0): raise ValueError( 'validation_fraction={} must be strictly ' 'positive, or None.'.format(self.validation_fraction)) @@ -363,8 +364,8 @@ def _print_iteration_stats(self, iteration_start_time): name = 'neg-loss' if self.scoring == 'loss' else 'score' log_msg += "train {}: {:.5f}, ".format(name, self.train_score_[-1]) if self.validation_fraction is not None: - log_msg += "val {}: {:.5f}, ".format(name, - self.validation_score_[-1]) + log_msg += "val {}: {:.5f}, ".format( + name, self.validation_score_[-1]) iteration_time = time() - iteration_start_time log_msg += "in {:0.3f}s".format(iteration_time) diff --git a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py index bae86eff484f4..131f1204d186e 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py @@ -89,15 +89,16 @@ def test_init_parameters_validation(GradientBoosting, X, y): ) -@pytest.mark.parametrize('scoring, validation_fraction, n_iter_no_change, tol', [ - ('neg_mean_squared_error', .1, 5, 1e-7), # use scorer - ('neg_mean_squared_error', None, 5, 1e-1), # use scorer on training data - (None, .1, 5, 1e-7), # same with default scorer - (None, None, 5, 1e-1), - ('loss', .1, 5, 1e-7), # use loss - ('loss', None, 5, 1e-1), # use loss on training data - (None, None, None, None), # no early stopping -]) +@pytest.mark.parametrize( + 'scoring, validation_fraction, n_iter_no_change, tol', [ + ('neg_mean_squared_error', .1, 5, 1e-7), # use scorer + ('neg_mean_squared_error', None, 5, 1e-1), # use scorer on train data + (None, .1, 5, 1e-7), # same with default scorer + (None, None, 5, 1e-1), + ('loss', .1, 5, 1e-7), # use loss + ('loss', None, 5, 1e-1), # use loss on training data + (None, None, None, None), # no early stopping + ]) def test_early_stopping_regression(scoring, validation_fraction, n_iter_no_change, tol): @@ -124,15 +125,16 @@ def test_early_stopping_regression(scoring, validation_fraction, make_classification(random_state=0), make_classification(n_classes=3, n_clusters_per_class=1, random_state=0) )) -@pytest.mark.parametrize('scoring, validation_fraction, n_iter_no_change, tol', [ - ('accuracy', .1, 5, 1e-7), # use scorer - ('accuracy', None, 5, 1e-1), # use scorer on training data - (None, .1, 5, 1e-7), # same 
with default scorerscor - (None, None, 5, 1e-1), - ('loss', .1, 5, 1e-7), # use loss - ('loss', None, 5, 1e-1), # use loss on training data - (None, None, None, None), # no early stopping -]) +@pytest.mark.parametrize( + 'scoring, validation_fraction, n_iter_no_change, tol', [ + ('accuracy', .1, 5, 1e-7), # use scorer + ('accuracy', None, 5, 1e-1), # use scorer on training data + (None, .1, 5, 1e-7), # same with default scorerscor + (None, None, 5, 1e-1), + ('loss', .1, 5, 1e-7), # use loss + ('loss', None, 5, 1e-1), # use loss on training data + (None, None, None, None), # no early stopping + ]) def test_early_stopping_classification(data, scoring, validation_fraction, n_iter_no_change, tol): @@ -140,13 +142,14 @@ def test_early_stopping_classification(data, scoring, validation_fraction, X, y = data - gb = FastGradientBoostingClassifier(verbose=1, # just for coverage - scoring=scoring, - tol=tol, - validation_fraction=validation_fraction, - n_estimators=n_estimators, - n_iter_no_change=n_iter_no_change, - random_state=0) + gb = FastGradientBoostingClassifier( + verbose=1, # just for coverage + scoring=scoring, + tol=tol, + validation_fraction=validation_fraction, + n_estimators=n_estimators, + n_iter_no_change=n_iter_no_change, + random_state=0) gb.fit(X, y) if n_iter_no_change is not None: diff --git a/sklearn/_fast_gradient_boosting/tests/test_histogram.py b/sklearn/_fast_gradient_boosting/tests/test_histogram.py index e32eedc8271cb..7f847a545fb38 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_histogram.py +++ b/sklearn/_fast_gradient_boosting/tests/test_histogram.py @@ -114,8 +114,8 @@ def test_unrolled_equivalent_to_naive(constant_hessian): hist_ghc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) hist_naive = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) - _build_histogram_root_no_hessian(0, n_bins, binned_feature, ordered_gradients, - hist_gc_root) + _build_histogram_root_no_hessian(0, n_bins, binned_feature, + ordered_gradients, hist_gc_root) _build_histogram_root(0, n_bins, binned_feature, ordered_gradients, ordered_hessians, hist_ghc_root) _build_histogram_no_hessian(0, n_bins, sample_indices, binned_feature, From c50f9e7065f19840b6be5ed170cbeae309e79d88 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 19 Jan 2019 10:46:34 -0500 Subject: [PATCH 073/247] fix test_loss in 3.5 --- sklearn/_fast_gradient_boosting/tests/test_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_fast_gradient_boosting/tests/test_loss.py b/sklearn/_fast_gradient_boosting/tests/test_loss.py index 7750fcf999bd2..beeccb2eb432d 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_loss.py +++ b/sklearn/_fast_gradient_boosting/tests/test_loss.py @@ -157,7 +157,7 @@ def test_baseline_binary_crossentropy(): baseline_prediction = loss.get_baseline_prediction(y_train, 1) assert_all_finite(baseline_prediction) assert_almost_equal(loss.inverse_link_function(baseline_prediction), - y_train[0]) + y_train[0], decimal=6) # Make sure baseline prediction is equal to link_function(p), where p # is the proba of the positive class. 
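    # (Illustrative aside, assuming the usual log-odds baseline:
    # ``link_function`` is the logit here, so the baseline is roughly
    # log(p / (1 - p)) for a presumably clipped p, and the inverse link
    # (expit) only recovers p up to float precision, which is why the
    # assertion above is relaxed to ``decimal=6``.)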
We want predict_proba() to return p, From 48abf289843bdae443c33b24fcce53a788d15a0a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 20 Jan 2019 14:56:30 -0500 Subject: [PATCH 074/247] truncate array before rank check in check_decision_proba_consistency (expit isn't precise enough) --- sklearn/_fast_gradient_boosting/loss.pyx | 3 +-- sklearn/utils/estimator_checks.py | 4 ++++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/loss.pyx b/sklearn/_fast_gradient_boosting/loss.pyx index 9961a1008d692..52939d837707a 100644 --- a/sklearn/_fast_gradient_boosting/loss.pyx +++ b/sklearn/_fast_gradient_boosting/loss.pyx @@ -290,7 +290,7 @@ cdef void _update_gradients_hessians_categorical_crossentropy( cdef inline Y_DTYPE_C cexpit(const Y_DTYPE_C x) nogil: - return 1. / (1 + exp(-x)) + return 1. / (1. + exp(-x)) cdef inline Y_DTYPE_C clogsumexp( @@ -312,7 +312,6 @@ cdef inline Y_DTYPE_C clogsumexp( return log(out) + amax - _LOSSES = { 'least_squares': LeastSquares, 'binary_crossentropy': BinaryCrossEntropy, diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 69850ecc5f796..7bb8e54a9d5de 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2297,6 +2297,10 @@ def check_decision_proba_consistency(name, estimator_orig): estimator.fit(X, y) a = estimator.predict_proba(X_test)[:, 1] b = estimator.decision_function(X_test) + # truncate arrays to the 10th decimal to avoid rank discrepancies that + # woulde caused by floating point precision issue + a = np.around(a, decimals=10) + b = np.around(b, decimals=10) assert_array_equal(rankdata(a), rankdata(b)) From f93e2a56a2c57c5ea7aab13ae33b74904bcebb8d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 20 Jan 2019 15:41:03 -0500 Subject: [PATCH 075/247] set random_state in second round of fit_idempotent --- sklearn/tests/test_common.py | 4 +++ sklearn/utils/estimator_checks.py | 59 +++++++++++++++++-------------- 2 files changed, 36 insertions(+), 27 deletions(-) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 267d3bb06aefc..6845a050ec80b 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -97,6 +97,10 @@ def test_non_meta_estimators(name, Estimator, check): with ignore_warnings(category=(DeprecationWarning, ConvergenceWarning, UserWarning, FutureWarning)): estimator = Estimator() + from sklearn._fast_gradient_boosting.gradient_boosting import BaseFastGradientBoosting + if not isinstance(estimator, BaseFastGradientBoosting): + return + set_checking_parameters(estimator) check(name, estimator) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 7bb8e54a9d5de..d1dc10e33c0a7 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -238,32 +238,33 @@ def _yield_outliers_checks(name, estimator): def _yield_all_checks(name, estimator): - for check in _yield_non_meta_checks(name, estimator): - yield check - if is_classifier(estimator): - for check in _yield_classifier_checks(name, estimator): - yield check - if is_regressor(estimator): - for check in _yield_regressor_checks(name, estimator): - yield check - if hasattr(estimator, 'transform'): - for check in _yield_transformer_checks(name, estimator): - yield check - if isinstance(estimator, ClusterMixin): - for check in _yield_clustering_checks(name, estimator): - yield check - if is_outlier_detector(estimator): - for check in _yield_outliers_checks(name, estimator): - 
yield check - yield check_fit2d_predict1d - yield check_methods_subset_invariance - yield check_fit2d_1sample - yield check_fit2d_1feature - yield check_fit1d - yield check_get_params_invariance - yield check_set_params - yield check_dict_unchanged - yield check_dont_overwrite_parameters + # for check in _yield_non_meta_checks(name, estimator): + # yield check + # if is_classifier(estimator): + # for check in _yield_classifier_checks(name, estimator): + # yield check + # if is_regressor(estimator): + # for check in _yield_regressor_checks(name, estimator): + # yield check + # if hasattr(estimator, 'transform'): + # for check in _yield_transformer_checks(name, estimator): + # yield check + # if isinstance(estimator, ClusterMixin): + # for check in _yield_clustering_checks(name, estimator): + # yield check + # if is_outlier_detector(estimator): + # for check in _yield_outliers_checks(name, estimator): + # yield check + # yield check_fit2d_predict1d + # yield check_methods_subset_invariance + # yield check_fit2d_1sample + # yield check_fit2d_1feature + # yield check_fit1d + # yield check_get_params_invariance + # yield check_set_params + # yield check_dict_unchanged + # yield check_dont_overwrite_parameters + yield check_fit_idempotent yield check_fit_idempotent @@ -2294,6 +2295,9 @@ def check_decision_proba_consistency(name, estimator_orig): if (hasattr(estimator, "decision_function") and hasattr(estimator, "predict_proba")): + from scipy.special import expit + from numpy.testing import assert_array_almost_equal + estimator.fit(X, y) a = estimator.predict_proba(X_test)[:, 1] b = estimator.decision_function(X_test) @@ -2353,7 +2357,7 @@ def check_fit_idempotent(name, estimator_orig): rng = np.random.RandomState(0) estimator = clone(estimator_orig) - set_random_state(estimator) + set_random_state(estimator, random_state=0) if 'warm_start' in estimator.get_params().keys(): estimator.set_params(warm_start=False) @@ -2378,6 +2382,7 @@ def check_fit_idempotent(name, estimator_orig): if hasattr(estimator, method)} # Fit again + set_random_state(estimator, random_state=0) estimator.fit(X_train, y_train) for method in check_methods: From 01098e37c00dcab220d26f595b24233e0c4f6177 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 20 Jan 2019 15:51:32 -0500 Subject: [PATCH 076/247] revert bad changes --- sklearn/tests/test_common.py | 4 --- sklearn/utils/estimator_checks.py | 56 ++++++++++++++----------------- 2 files changed, 26 insertions(+), 34 deletions(-) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 6845a050ec80b..267d3bb06aefc 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -97,10 +97,6 @@ def test_non_meta_estimators(name, Estimator, check): with ignore_warnings(category=(DeprecationWarning, ConvergenceWarning, UserWarning, FutureWarning)): estimator = Estimator() - from sklearn._fast_gradient_boosting.gradient_boosting import BaseFastGradientBoosting - if not isinstance(estimator, BaseFastGradientBoosting): - return - set_checking_parameters(estimator) check(name, estimator) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index d1dc10e33c0a7..5ba8da1859fbc 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -238,33 +238,32 @@ def _yield_outliers_checks(name, estimator): def _yield_all_checks(name, estimator): - # for check in _yield_non_meta_checks(name, estimator): - # yield check - # if is_classifier(estimator): - # for check in 
_yield_classifier_checks(name, estimator): - # yield check - # if is_regressor(estimator): - # for check in _yield_regressor_checks(name, estimator): - # yield check - # if hasattr(estimator, 'transform'): - # for check in _yield_transformer_checks(name, estimator): - # yield check - # if isinstance(estimator, ClusterMixin): - # for check in _yield_clustering_checks(name, estimator): - # yield check - # if is_outlier_detector(estimator): - # for check in _yield_outliers_checks(name, estimator): - # yield check - # yield check_fit2d_predict1d - # yield check_methods_subset_invariance - # yield check_fit2d_1sample - # yield check_fit2d_1feature - # yield check_fit1d - # yield check_get_params_invariance - # yield check_set_params - # yield check_dict_unchanged - # yield check_dont_overwrite_parameters - yield check_fit_idempotent + for check in _yield_non_meta_checks(name, estimator): + yield check + if is_classifier(estimator): + for check in _yield_classifier_checks(name, estimator): + yield check + if is_regressor(estimator): + for check in _yield_regressor_checks(name, estimator): + yield check + if hasattr(estimator, 'transform'): + for check in _yield_transformer_checks(name, estimator): + yield check + if isinstance(estimator, ClusterMixin): + for check in _yield_clustering_checks(name, estimator): + yield check + if is_outlier_detector(estimator): + for check in _yield_outliers_checks(name, estimator): + yield check + yield check_fit2d_predict1d + yield check_methods_subset_invariance + yield check_fit2d_1sample + yield check_fit2d_1feature + yield check_fit1d + yield check_get_params_invariance + yield check_set_params + yield check_dict_unchanged + yield check_dont_overwrite_parameters yield check_fit_idempotent @@ -2295,9 +2294,6 @@ def check_decision_proba_consistency(name, estimator_orig): if (hasattr(estimator, "decision_function") and hasattr(estimator, "predict_proba")): - from scipy.special import expit - from numpy.testing import assert_array_almost_equal - estimator.fit(X, y) a = estimator.predict_proba(X_test)[:, 1] b = estimator.decision_function(X_test) From 602802fe273db9ab69be3b70f6acf99573e7373e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 20 Jan 2019 17:12:06 -0500 Subject: [PATCH 077/247] probing travis --- .../tests/test_gradient_boosting.py | 6 +- sklearn/utils/estimator_checks.py | 59 +++++++++++-------- 2 files changed, 37 insertions(+), 28 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py index 131f1204d186e..1e95163307ff8 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py @@ -186,9 +186,11 @@ def should_stop(scores, n_iter_no_change, tol): @pytest.mark.parametrize('Estimator', ( - FastGradientBoostingRegressor(), + # FastGradientBoostingRegressor(), FastGradientBoostingClassifier(), )) def test_estimator_checks(Estimator): # Run the check_estimator() test suite on GBRegressor and GBClassifier. 
- check_estimator(Estimator) + for _ in range(100): + print(_) + check_estimator(Estimator) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 5ba8da1859fbc..1ee7129d5480c 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -238,32 +238,32 @@ def _yield_outliers_checks(name, estimator): def _yield_all_checks(name, estimator): - for check in _yield_non_meta_checks(name, estimator): - yield check - if is_classifier(estimator): - for check in _yield_classifier_checks(name, estimator): - yield check - if is_regressor(estimator): - for check in _yield_regressor_checks(name, estimator): - yield check - if hasattr(estimator, 'transform'): - for check in _yield_transformer_checks(name, estimator): - yield check - if isinstance(estimator, ClusterMixin): - for check in _yield_clustering_checks(name, estimator): - yield check - if is_outlier_detector(estimator): - for check in _yield_outliers_checks(name, estimator): - yield check - yield check_fit2d_predict1d - yield check_methods_subset_invariance - yield check_fit2d_1sample - yield check_fit2d_1feature - yield check_fit1d - yield check_get_params_invariance - yield check_set_params - yield check_dict_unchanged - yield check_dont_overwrite_parameters + # for check in _yield_non_meta_checks(name, estimator): + # yield check + # if is_classifier(estimator): + # for check in _yield_classifier_checks(name, estimator): + # yield check + # if is_regressor(estimator): + # for check in _yield_regressor_checks(name, estimator): + # yield check + # if hasattr(estimator, 'transform'): + # for check in _yield_transformer_checks(name, estimator): + # yield check + # if isinstance(estimator, ClusterMixin): + # for check in _yield_clustering_checks(name, estimator): + # yield check + # if is_outlier_detector(estimator): + # for check in _yield_outliers_checks(name, estimator): + # yield check + # yield check_fit2d_predict1d + # yield check_methods_subset_invariance + # yield check_fit2d_1sample + # yield check_fit2d_1feature + # yield check_fit1d + # yield check_get_params_invariance + # yield check_set_params + # yield check_dict_unchanged + # yield check_dont_overwrite_parameters yield check_fit_idempotent @@ -2371,12 +2371,19 @@ def check_fit_idempotent(name, estimator_orig): X_test, y_test = _safe_split(estimator, X, y, test, train) # Fit for the first time + print() + print(X_train) + print(y_train) + print(y_test) estimator.fit(X_train, y_train) result = {method: getattr(estimator, method)(X_test) for method in check_methods if hasattr(estimator, method)} + print(result['predict']) + print(result['predict_proba']) + # Fit again set_random_state(estimator, random_state=0) estimator.fit(X_train, y_train) From a70b15065c6fa63f7958fca0d1b38ceed0578ea4 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 20 Jan 2019 17:49:37 -0500 Subject: [PATCH 078/247] second --- sklearn/utils/estimator_checks.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 1ee7129d5480c..6fb4d0871c3bf 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2381,14 +2381,19 @@ def check_fit_idempotent(name, estimator_orig): for method in check_methods if hasattr(estimator, method)} - print(result['predict']) - print(result['predict_proba']) + + for k, v in result.items(): + print(k) + print(v) # Fit again set_random_state(estimator, random_state=0) estimator.fit(X_train, 
y_train) + print('second') for method in check_methods: if hasattr(estimator, method): new_result = getattr(estimator, method)(X_test) + print(method) + print(new_result) assert_allclose_dense_sparse(result[method], new_result) From 396b65cba9a53d4fbbf00162e16462e0bb4a1b11 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 20 Jan 2019 18:32:16 -0500 Subject: [PATCH 079/247] ... --- sklearn/utils/estimator_checks.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 6fb4d0871c3bf..f2bbb2841c3fc 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2390,10 +2390,16 @@ def check_fit_idempotent(name, estimator_orig): set_random_state(estimator, random_state=0) estimator.fit(X_train, y_train) + new_result = {method: getattr(estimator, method)(X_test) + for method in check_methods + if hasattr(estimator, method)} + print('second') + for k, v in new_result.items(): + print(k) + print(v) + for method in check_methods: if hasattr(estimator, method): - new_result = getattr(estimator, method)(X_test) print(method) - print(new_result) - assert_allclose_dense_sparse(result[method], new_result) + assert_allclose_dense_sparse(result[method], new_result[method]) From 0dbbcee6033babfed34739dba7c6cb3d0e11f2d3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 20 Jan 2019 18:55:24 -0500 Subject: [PATCH 080/247] ... --- sklearn/utils/estimator_checks.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index f2bbb2841c3fc..b484ab1b6b73e 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2372,8 +2372,13 @@ def check_fit_idempotent(name, estimator_orig): # Fit for the first time print() + print('X_train') print(X_train) + print('X_test') + print(X_train) + print('y_train') print(y_train) + print('y_test') print(y_test) estimator.fit(X_train, y_train) @@ -2394,7 +2399,16 @@ def check_fit_idempotent(name, estimator_orig): for method in check_methods if hasattr(estimator, method)} - print('second') + print('AFTER SECOND FIT') + print() + print('X_train') + print(X_train) + print('X_test') + print(X_train) + print('y_train') + print(y_train) + print('y_test') + print(y_test) for k, v in new_result.items(): print(k) print(v) @@ -2403,3 +2417,5 @@ def check_fit_idempotent(name, estimator_orig): if hasattr(estimator, method): print(method) assert_allclose_dense_sparse(result[method], new_result[method]) + + print('-' * 10) From 4614762f43f898d2431ca764b7b5bbeb452d684c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 22 Jan 2019 10:41:11 -0500 Subject: [PATCH 081/247] Revert travis probing changes --- .../tests/test_gradient_boosting.py | 6 +- sklearn/utils/estimator_checks.py | 90 ++++++------------- 2 files changed, 30 insertions(+), 66 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py index 1e95163307ff8..131f1204d186e 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py @@ -186,11 +186,9 @@ def should_stop(scores, n_iter_no_change, tol): @pytest.mark.parametrize('Estimator', ( - # FastGradientBoostingRegressor(), + FastGradientBoostingRegressor(), FastGradientBoostingClassifier(), )) def test_estimator_checks(Estimator): # Run 
the check_estimator() test suite on GBRegressor and GBClassifier. - for _ in range(100): - print(_) - check_estimator(Estimator) + check_estimator(Estimator) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index b484ab1b6b73e..5ba8da1859fbc 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -238,32 +238,32 @@ def _yield_outliers_checks(name, estimator): def _yield_all_checks(name, estimator): - # for check in _yield_non_meta_checks(name, estimator): - # yield check - # if is_classifier(estimator): - # for check in _yield_classifier_checks(name, estimator): - # yield check - # if is_regressor(estimator): - # for check in _yield_regressor_checks(name, estimator): - # yield check - # if hasattr(estimator, 'transform'): - # for check in _yield_transformer_checks(name, estimator): - # yield check - # if isinstance(estimator, ClusterMixin): - # for check in _yield_clustering_checks(name, estimator): - # yield check - # if is_outlier_detector(estimator): - # for check in _yield_outliers_checks(name, estimator): - # yield check - # yield check_fit2d_predict1d - # yield check_methods_subset_invariance - # yield check_fit2d_1sample - # yield check_fit2d_1feature - # yield check_fit1d - # yield check_get_params_invariance - # yield check_set_params - # yield check_dict_unchanged - # yield check_dont_overwrite_parameters + for check in _yield_non_meta_checks(name, estimator): + yield check + if is_classifier(estimator): + for check in _yield_classifier_checks(name, estimator): + yield check + if is_regressor(estimator): + for check in _yield_regressor_checks(name, estimator): + yield check + if hasattr(estimator, 'transform'): + for check in _yield_transformer_checks(name, estimator): + yield check + if isinstance(estimator, ClusterMixin): + for check in _yield_clustering_checks(name, estimator): + yield check + if is_outlier_detector(estimator): + for check in _yield_outliers_checks(name, estimator): + yield check + yield check_fit2d_predict1d + yield check_methods_subset_invariance + yield check_fit2d_1sample + yield check_fit2d_1feature + yield check_fit1d + yield check_get_params_invariance + yield check_set_params + yield check_dict_unchanged + yield check_dont_overwrite_parameters yield check_fit_idempotent @@ -2371,51 +2371,17 @@ def check_fit_idempotent(name, estimator_orig): X_test, y_test = _safe_split(estimator, X, y, test, train) # Fit for the first time - print() - print('X_train') - print(X_train) - print('X_test') - print(X_train) - print('y_train') - print(y_train) - print('y_test') - print(y_test) estimator.fit(X_train, y_train) result = {method: getattr(estimator, method)(X_test) for method in check_methods if hasattr(estimator, method)} - - for k, v in result.items(): - print(k) - print(v) - # Fit again set_random_state(estimator, random_state=0) estimator.fit(X_train, y_train) - new_result = {method: getattr(estimator, method)(X_test) - for method in check_methods - if hasattr(estimator, method)} - - print('AFTER SECOND FIT') - print() - print('X_train') - print(X_train) - print('X_test') - print(X_train) - print('y_train') - print(y_train) - print('y_test') - print(y_test) - for k, v in new_result.items(): - print(k) - print(v) - for method in check_methods: if hasattr(estimator, method): - print(method) - assert_allclose_dense_sparse(result[method], new_result[method]) - - print('-' * 10) + new_result = getattr(estimator, method)(X_test) + assert_allclose_dense_sparse(result[method], new_result) From 
2181495e3fa1659ed0b073f6f77700d62898fca3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 22 Jan 2019 11:07:07 -0500 Subject: [PATCH 082/247] slightly change feature splitting routine --- sklearn/_fast_gradient_boosting/splitting.pyx | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/splitting.pyx b/sklearn/_fast_gradient_boosting/splitting.pyx index 28ad4ffcf9bcf..391fd5c29d78a 100644 --- a/sklearn/_fast_gradient_boosting/splitting.pyx +++ b/sklearn/_fast_gradient_boosting/splitting.pyx @@ -367,6 +367,7 @@ cdef class Splitter: cdef: unsigned int n_samples int feature_idx + int best_feature_idx int i unsigned int thread_idx unsigned int [:] starts @@ -421,7 +422,9 @@ cdef class Splitter: split_infos[feature_idx] = split_info # then compute best possible split among all feature - split_info = self._find_best_feature_to_split_helper(split_infos) + best_feature_idx = self._find_best_feature_to_split_helper( + split_infos) + split_info = split_infos[best_feature_idx] out = SplitInfo( split_info.gain, @@ -546,7 +549,9 @@ cdef class Splitter: split_infos[feature_idx] = split_info # then compute best possible split among all feature - split_info = self._find_best_feature_to_split_helper(split_infos) + best_feature_idx = self._find_best_feature_to_split_helper( + split_infos) + split_info = split_infos[best_feature_idx] out = SplitInfo( split_info.gain, @@ -562,25 +567,18 @@ cdef class Splitter: free(split_infos) return out - cdef split_info_struct _find_best_feature_to_split_helper(self, + cdef int _find_best_feature_to_split_helper(self, split_info_struct * split_infos # IN ) nogil: """Returns the best split_info among those in splits_infos.""" cdef: - Y_DTYPE_C gain - Y_DTYPE_C best_gain - split_info_struct split_info - split_info_struct best_split_info - unsigned int feature_idx - - best_gain = -1. - for feature_idx in range(self.n_features): - split_info = split_infos[feature_idx] - gain = split_info.gain - if best_gain < 0. 
or gain > best_gain: - best_gain = gain - best_split_info = split_info - return best_split_info + int feature_idx + int best_feature_idx = 0 + + for feature_idx in range(1, self.n_features): + if split_infos[feature_idx].gain > split_infos[best_feature_idx].gain: + best_feature_idx = feature_idx + return best_feature_idx cdef split_info_struct _find_best_bin_to_split_helper( self, From cb38816f94722cb744c0a71cf2e3767bafcaa850 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 22 Jan 2019 12:10:26 -0500 Subject: [PATCH 083/247] removed unused attributes --- sklearn/_fast_gradient_boosting/splitting.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/splitting.pyx b/sklearn/_fast_gradient_boosting/splitting.pyx index 391fd5c29d78a..e00d363cdab70 100644 --- a/sklearn/_fast_gradient_boosting/splitting.pyx +++ b/sklearn/_fast_gradient_boosting/splitting.pyx @@ -141,8 +141,6 @@ cdef class Splitter: Y_DTYPE_C [::1] hessians Y_DTYPE_C [::1] ordered_gradients Y_DTYPE_C [::1] ordered_hessians - Y_DTYPE_C sum_gradients - Y_DTYPE_C sum_hessians unsigned char hessians_are_constant Y_DTYPE_C l2_regularization Y_DTYPE_C min_hessian_to_split From 8fc65f70084cbd84d8339ac14dd7ff02800752c7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 22 Jan 2019 12:10:45 -0500 Subject: [PATCH 084/247] put back small optimization for small hessians --- sklearn/_fast_gradient_boosting/grower.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/grower.py b/sklearn/_fast_gradient_boosting/grower.py index 3a2c973b2a63a..162321121120a 100644 --- a/sklearn/_fast_gradient_boosting/grower.py +++ b/sklearn/_fast_gradient_boosting/grower.py @@ -270,9 +270,9 @@ def _intilialize_root(self): # Do not even bother computing any splitting statistics. 
self._finalize_leaf(self.root) return - # if sum_hessians < self.splitter.min_hessian_to_split: - # self._finalize_leaf(self.root) - # return + if sum_hessians < self.splitter.min_hessian_to_split: + self._finalize_leaf(self.root) + return self._compute_spittability(self.root) From d703bf16afc3d610c32bc52491ad0fa844c53b9d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 22 Jan 2019 12:46:49 -0500 Subject: [PATCH 085/247] trying range instead of prange for summing gradients --- sklearn/_fast_gradient_boosting/splitting.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/_fast_gradient_boosting/splitting.pyx b/sklearn/_fast_gradient_boosting/splitting.pyx index e00d363cdab70..29d8db2194090 100644 --- a/sklearn/_fast_gradient_boosting/splitting.pyx +++ b/sklearn/_fast_gradient_boosting/splitting.pyx @@ -397,11 +397,13 @@ cdef class Splitter: ordered_hessians[i] = hessians[sample_indices[i]] # Compute sums of gradients and hessians at the node - for i in prange(n_samples, schedule='static'): + # for i in prange(n_samples, schedule='static'): + for i in range(n_samples): sum_gradients += ordered_gradients[i] if self.hessians_are_constant: sum_hessians = n_samples else: + # for i in range(n_samples): for i in prange(n_samples, schedule='static'): sum_hessians += ordered_hessians[i] From afd48ac17f78231192502d784833928e6c3be77b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 22 Jan 2019 14:57:20 -0500 Subject: [PATCH 086/247] cosmetics --- .../_gradient_boosting.pyx | 30 ++-- sklearn/_fast_gradient_boosting/binning.pyx | 1 - .../gradient_boosting.py | 2 + sklearn/_fast_gradient_boosting/grower.py | 10 +- sklearn/_fast_gradient_boosting/histogram.pyx | 97 +++++----- sklearn/_fast_gradient_boosting/loss.pyx | 36 ++-- sklearn/_fast_gradient_boosting/predictor.pyx | 38 ++-- sklearn/_fast_gradient_boosting/setup.py | 4 + sklearn/_fast_gradient_boosting/splitting.pyx | 169 ++++++++++-------- sklearn/_fast_gradient_boosting/types.pxd | 4 +- 10 files changed, 217 insertions(+), 174 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx b/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx index 4c7c3427a2f36..05be63c5ec56e 100644 --- a/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx +++ b/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx @@ -1,4 +1,3 @@ -# cython: profile=True # cython: cdivision=True # cython: boundscheck=False # cython: wraparound=False @@ -13,29 +12,36 @@ from .types import Y_DTYPE from .types cimport Y_DTYPE_C -def _update_raw_predictions(Y_DTYPE_C [:] raw_predictions, grower): +def _update_raw_predictions( + Y_DTYPE_C [:] raw_predictions, # OUT + grower): + """Update raw_predictions with the predictions of the newest tree + + This is equivalent to + raw_predictions += last_estimator.predict(X_train) + """ cdef: - unsigned int [:] starts - unsigned int [:] stops - unsigned int [:] partition - Y_DTYPE_C [:] values + unsigned int [:] starts # start of each leaf in partition + unsigned int [:] stops # end of each leaf in partition + Y_DTYPE_C [:] values # value of each leaf + const unsigned int [:] partition = grower.splitter.partition list leaves leaves = grower.finalized_leaves starts = np.array([leaf.start for leaf in leaves], dtype=np.uint32) stops = np.array([leaf.stop for leaf in leaves], dtype=np.uint32) values = np.array([leaf.value for leaf in leaves], dtype=Y_DTYPE) - partition = grower.splitter.partition _update_raw_predictions_helper(raw_predictions, starts, stops, partition, values) + cdef void 
_update_raw_predictions_helper( - Y_DTYPE_C [:] raw_predictions, - const unsigned int [:] starts, - const unsigned int [:] stops, - const unsigned int [:] partition, - Y_DTYPE_C [:] values) nogil: + Y_DTYPE_C [:] raw_predictions, # OUT + const unsigned int [:] starts, + const unsigned int [:] stops, + const unsigned int [:] partition, + const Y_DTYPE_C [:] values) nogil: cdef: unsigned int position diff --git a/sklearn/_fast_gradient_boosting/binning.pyx b/sklearn/_fast_gradient_boosting/binning.pyx index ff8cfb179186f..5361ff82b3b0a 100644 --- a/sklearn/_fast_gradient_boosting/binning.pyx +++ b/sklearn/_fast_gradient_boosting/binning.pyx @@ -1,4 +1,3 @@ -# cython: profile=True # cython: cdivision=True # cython: boundscheck=False # cython: wraparound=False diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index fa196c8d343ba..24606e16ad70b 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -245,6 +245,8 @@ def fit(self, X, y): bin_thresholds=self.bin_mapper_.bin_thresholds_) estimators[-1].append(estimator) + # Update raw_predictions with the predictions of the newly + # created tree. tic_pred = time() _update_raw_predictions(raw_predictions[:, k], grower) toc_pred = time() diff --git a/sklearn/_fast_gradient_boosting/grower.py b/sklearn/_fast_gradient_boosting/grower.py index 162321121120a..a104bbbcc13dc 100644 --- a/sklearn/_fast_gradient_boosting/grower.py +++ b/sklearn/_fast_gradient_boosting/grower.py @@ -24,7 +24,7 @@ class TreeNode: ---------- depth : int The depth of the node, i.e. its distance from the root - samples_indices : array of int + sample_indices : array of int The indices of the samples at the node sum_gradients : float The sum of the gradients of the samples at the node @@ -37,7 +37,7 @@ class TreeNode: ---------- depth : int The depth of the node, i.e. its distance from the root - samples_indices : array of int + sample_indices : array of int The indices of the samples at the node sum_gradients : float The sum of the gradients of the samples at the node @@ -61,9 +61,13 @@ class TreeNode: The Number of samples at the node divided find_split_time. apply_split_time : float The total time spent actually splitting the node, e.g. splitting - samples_indices into left and right child. + sample_indices into left and right child. hist_subtraction : bool Wheter the subtraction method was used for computing the histograms. 
+ start : int + start position of the node's sample_indices in splitter.partition + stop : int + stop position of the node's sample_indices in splitter.partition """ split_info = None diff --git a/sklearn/_fast_gradient_boosting/histogram.pyx b/sklearn/_fast_gradient_boosting/histogram.pyx index 57e418d331560..4b1f6e4c041e3 100644 --- a/sklearn/_fast_gradient_boosting/histogram.pyx +++ b/sklearn/_fast_gradient_boosting/histogram.pyx @@ -20,15 +20,16 @@ from .types import HISTOGRAM_DTYPE cpdef void _build_histogram_naive( - const int feature_idx, - unsigned int n_bins, - unsigned int [:] sample_indices, # IN - X_BINNED_DTYPE_C [:] binned_feature, # IN - Y_DTYPE_C [:] ordered_gradients, # IN - Y_DTYPE_C [:] ordered_hessians, # IN - hist_struct [:, :] out # OUT - ) nogil: - """Build histogram in a naive way, without optimizing for cache hit.""" + const int feature_idx, + unsigned int n_bins, + unsigned int [:] sample_indices, # IN + X_BINNED_DTYPE_C [:] binned_feature, # IN + Y_DTYPE_C [:] ordered_gradients, # IN + Y_DTYPE_C [:] ordered_hessians, # IN + hist_struct [:, :] out) nogil: # OUT + """Build histogram in a naive way, without optimizing for cache hit. + + Used in tests to compare with the optimized version.""" cdef: unsigned int i unsigned int n_samples = sample_indices.shape[0] @@ -44,29 +45,36 @@ cpdef void _build_histogram_naive( cpdef void _subtract_histograms( - const int feature_idx, - unsigned int n_bins, - hist_struct [:, ::1] hist_a, # IN - hist_struct [:, ::1] hist_b, # IN - hist_struct [:, ::1] out, # OUT - ) nogil: + const int feature_idx, + unsigned int n_bins, + hist_struct [:, ::1] hist_a, # IN + hist_struct [:, ::1] hist_b, # IN + hist_struct [:, ::1] out) nogil: # OUT cdef: unsigned int i = 0 for i in range(n_bins): - out[feature_idx, i].sum_gradients = hist_a[feature_idx, i].sum_gradients - hist_b[feature_idx, i].sum_gradients - out[feature_idx, i].sum_hessians = hist_a[feature_idx, i].sum_hessians - hist_b[feature_idx, i].sum_hessians - out[feature_idx, i].count = hist_a[feature_idx, i].count - hist_b[feature_idx, i].count + out[feature_idx, i].sum_gradients = ( + hist_a[feature_idx, i].sum_gradients - + hist_b[feature_idx, i].sum_gradients + ) + out[feature_idx, i].sum_hessians = ( + hist_a[feature_idx, i].sum_hessians - + hist_b[feature_idx, i].sum_hessians + ) + out[feature_idx, i].count = ( + hist_a[feature_idx, i].count - + hist_b[feature_idx, i].count + ) cpdef void _build_histogram( - const int feature_idx, - unsigned int n_bins, - const unsigned int [::1] sample_indices, # IN - const X_BINNED_DTYPE_C [::1] binned_feature, # IN - const Y_DTYPE_C [::1] ordered_gradients, # IN - const Y_DTYPE_C [::1] ordered_hessians, # IN - hist_struct [:, ::1] out # OUT - ) nogil: + const int feature_idx, + unsigned int n_bins, + const unsigned int [::1] sample_indices, # IN + const X_BINNED_DTYPE_C [::1] binned_feature, # IN + const Y_DTYPE_C [::1] ordered_gradients, # IN + const Y_DTYPE_C [::1] ordered_hessians, # IN + hist_struct [:, ::1] out) nogil: # OUT cdef: unsigned int i = 0 unsigned int n_node_samples = sample_indices.shape[0] @@ -107,13 +115,12 @@ cpdef void _build_histogram( cpdef void _build_histogram_no_hessian( - const int feature_idx, - unsigned int n_bins, - const unsigned int [::1] sample_indices, # IN - const X_BINNED_DTYPE_C [::1] binned_feature, # IN - const Y_DTYPE_C [::1] ordered_gradients, # OUT - hist_struct [:, ::1] out # OUT - ) nogil: + const int feature_idx, + unsigned int n_bins, + const unsigned int [::1] sample_indices, # IN + const 
X_BINNED_DTYPE_C [::1] binned_feature, # IN + const Y_DTYPE_C [::1] ordered_gradients, # IN + hist_struct [:, ::1] out) nogil: # OUT cdef: unsigned int i = 0 unsigned int n_node_samples = sample_indices.shape[0] @@ -148,13 +155,12 @@ cpdef void _build_histogram_no_hessian( cpdef void _build_histogram_root( - const int feature_idx, - unsigned int n_bins, - const X_BINNED_DTYPE_C [::1] binned_feature, # IN - const Y_DTYPE_C [::1] all_gradients, # IN - const Y_DTYPE_C [::1] all_hessians, # IN - hist_struct [:, ::1] out # OUT - ) nogil: + const int feature_idx, + unsigned int n_bins, + const X_BINNED_DTYPE_C [::1] binned_feature, # IN + const Y_DTYPE_C [::1] all_gradients, # IN + const Y_DTYPE_C [::1] all_hessians, # IN + hist_struct [:, ::1] out) nogil: # OUT cdef: unsigned int i = 0 unsigned int n_samples = binned_feature.shape[0] @@ -196,12 +202,11 @@ cpdef void _build_histogram_root( cpdef void _build_histogram_root_no_hessian( - const int feature_idx, - unsigned int n_bins, - const X_BINNED_DTYPE_C [::1] binned_feature, # IN - const Y_DTYPE_C [::1] all_gradients, # IN - hist_struct [:, ::1] out # OUT - ) nogil: + const int feature_idx, + unsigned int n_bins, + const X_BINNED_DTYPE_C [::1] binned_feature, # IN + const Y_DTYPE_C [::1] all_gradients, # IN + hist_struct [:, ::1] out) nogil: # OUT cdef: unsigned int i = 0 unsigned int n_samples = binned_feature.shape[0] diff --git a/sklearn/_fast_gradient_boosting/loss.pyx b/sklearn/_fast_gradient_boosting/loss.pyx index 52939d837707a..95289203e20ad 100644 --- a/sklearn/_fast_gradient_boosting/loss.pyx +++ b/sklearn/_fast_gradient_boosting/loss.pyx @@ -1,4 +1,3 @@ -# cython: profile=True # cython: cdivision=True # cython: boundscheck=False # cython: wraparound=False @@ -140,9 +139,9 @@ class LeastSquares(BaseLoss): cdef void _update_gradients_least_squares( - Y_DTYPE_C [:] gradients, - const Y_DTYPE_C [:] y_true, - const Y_DTYPE_C [:] raw_predictions) nogil: + Y_DTYPE_C [:] gradients, + const Y_DTYPE_C [:] y_true, + const Y_DTYPE_C [:] raw_predictions) nogil: cdef: unsigned int n_samples int i @@ -202,11 +201,12 @@ class BinaryCrossEntropy(BaseLoss): proba[:, 0] = 1 - proba[:, 1] return proba + cdef void _update_gradients_hessians_binary_crossentropy( - Y_DTYPE_C [:] gradients, - Y_DTYPE_C [:] hessians, - const Y_DTYPE_C [:] y_true, - const Y_DTYPE_C [:] raw_predictions) nogil: + Y_DTYPE_C [:] gradients, + Y_DTYPE_C [:] hessians, + const Y_DTYPE_C [:] y_true, + const Y_DTYPE_C [:] raw_predictions) nogil: cdef: unsigned int n_samples Y_DTYPE_C gradient_abs @@ -262,11 +262,11 @@ class CategoricalCrossEntropy(BaseLoss): cdef void _update_gradients_hessians_categorical_crossentropy( - Y_DTYPE_C [:] gradients, # shape (n_samples * prediction_dim,), OUT - Y_DTYPE_C [:] hessians, # shape (n_samples * prediction_dim,), OUT - const Y_DTYPE_C [:] y_true, # shape (n_samples,), IN - const Y_DTYPE_C [:, :] raw_predictions # shape (n_samples, n_tree_per_iter), IN - ) nogil: + Y_DTYPE_C [:] gradients, # shape (n_samples * prediction_dim,), OUT + Y_DTYPE_C [:] hessians, # shape (n_samples * prediction_dim,), OUT + const Y_DTYPE_C [:] y_true, # shape (n_samples,), IN + # shape (n_samples, n_tree_per_iter), IN + const Y_DTYPE_C [:, :] raw_predictions) nogil: cdef: unsigned int n_samples unsigned int prediction_dim @@ -290,14 +290,16 @@ cdef void _update_gradients_hessians_categorical_crossentropy( cdef inline Y_DTYPE_C cexpit(const Y_DTYPE_C x) nogil: + """Custom expit (logistic sigmoid function)""" return 1. / (1. 
+ exp(-x)) cdef inline Y_DTYPE_C clogsumexp( - const Y_DTYPE_C [:, :] a, - const int row) nogil: - # Need to pass the whole array, else prange won't work. See Cython issue - # #2798 + const Y_DTYPE_C [:, :] a, + const int row) nogil: + """Custom logsumexp, with numerical stability""" + # Need to pass the whole array and the row index, else prange won't work. + # See issue Cython #2798 cdef: int k Y_DTYPE_C out = 0. diff --git a/sklearn/_fast_gradient_boosting/predictor.pyx b/sklearn/_fast_gradient_boosting/predictor.pyx index eff4d768bf2f5..b3ef7173c3064 100644 --- a/sklearn/_fast_gradient_boosting/predictor.pyx +++ b/sklearn/_fast_gradient_boosting/predictor.pyx @@ -1,4 +1,3 @@ -# cython: profile=True # cython: cdivision=True # cython: boundscheck=False # cython: wraparound=False @@ -34,6 +33,8 @@ PREDICTOR_RECORD_DTYPE = np.dtype([ cdef packed struct node_struct: + # Equivalent struct to PREDICTOR_RECORD_DTYPE to use in memory views. It + # needs to be packed since by default numpy dtypes aren't aligned Y_DTYPE_C value unsigned int count unsigned int feature_idx @@ -99,13 +100,13 @@ class TreePredictor: _predict_from_binned_data(self.nodes, X, out) return out + cdef inline Y_DTYPE_C _predict_one_from_numeric_data( - node_struct [:] nodes, - const X_DTYPE_C [:, :] numeric_data, - const int row - ) nogil: - # Need to pass the whole array, else prange won't work. See issue Cython - # #2798 + node_struct [:] nodes, + const X_DTYPE_C [:, :] numeric_data, + const int row) nogil: + # Need to pass the whole array and the row index, else prange won't work. + # See issue Cython #2798 cdef: node_struct node = nodes[0] @@ -120,9 +121,9 @@ cdef inline Y_DTYPE_C _predict_one_from_numeric_data( cdef void _predict_from_numeric_data( - node_struct [:] nodes, - const X_DTYPE_C [:, :] numeric_data, - Y_DTYPE_C [:] out) nogil: + node_struct [:] nodes, + const X_DTYPE_C [:, :] numeric_data, + Y_DTYPE_C [:] out) nogil: cdef: int i @@ -132,12 +133,11 @@ cdef void _predict_from_numeric_data( cdef inline Y_DTYPE_C _predict_one_from_binned_data( - node_struct [:] nodes, - const X_BINNED_DTYPE_C [:, :] binned_data, - const int row - ) nogil: - # Need to pass the whole array, else prange won't work. See issue Cython - # #2798 + node_struct [:] nodes, + const X_BINNED_DTYPE_C [:, :] binned_data, + const int row) nogil: + # Need to pass the whole array and the row index, else prange won't work. 
+ # See issue Cython #2798 cdef: node_struct node = nodes[0] @@ -152,9 +152,9 @@ cdef inline Y_DTYPE_C _predict_one_from_binned_data( cdef void _predict_from_binned_data( - node_struct [:] nodes, - const X_BINNED_DTYPE_C [:, :] binned_data, - Y_DTYPE_C [:] out) nogil: + node_struct [:] nodes, + const X_BINNED_DTYPE_C [:, :] binned_data, + Y_DTYPE_C [:] out) nogil: cdef: int i diff --git a/sklearn/_fast_gradient_boosting/setup.py b/sklearn/_fast_gradient_boosting/setup.py index 6dc60867f6c68..d0ad96ba3dd7f 100644 --- a/sklearn/_fast_gradient_boosting/setup.py +++ b/sklearn/_fast_gradient_boosting/setup.py @@ -33,6 +33,10 @@ def configuration(parent_package="", top_path=None): sources=["types.pyx"], include_dirs=[numpy.get_include()]) + config.add_extension("playground", + sources=["playground.pyx"], + include_dirs=[numpy.get_include()]) + config.add_subpackage("tests") return config diff --git a/sklearn/_fast_gradient_boosting/splitting.pyx b/sklearn/_fast_gradient_boosting/splitting.pyx index 29d8db2194090..ecc8b73f0ec1b 100644 --- a/sklearn/_fast_gradient_boosting/splitting.pyx +++ b/sklearn/_fast_gradient_boosting/splitting.pyx @@ -1,4 +1,3 @@ -# cython: profile=True # cython: cdivision=True # cython: boundscheck=False # cython: wraparound=False @@ -29,11 +28,18 @@ from .types cimport hist_struct from .types import HISTOGRAM_DTYPE +# Note: in a lot of functions here we pass feature_idx and the whole 2d +# histograms arrays instead a lot just histograms[feature_idx]. This is +# because Cython generated C code will have strange Python interactions (likely +# related to the GIL release and the custom histogram dtype) when using 1d +# histogram arrays. + + cdef struct split_info_struct: # Same as the SplitInfo class, but we need a C struct to use it in the # nogil sections Y_DTYPE_C gain - unsigned int feature_idx + int feature_idx unsigned int bin_idx Y_DTYPE_C gradient_left Y_DTYPE_C gradient_right @@ -70,7 +76,7 @@ cdef class SplitInfo: """ cdef public: Y_DTYPE_C gain - unsigned int feature_idx + int feature_idx unsigned int bin_idx Y_DTYPE_C gradient_left Y_DTYPE_C gradient_right @@ -79,7 +85,7 @@ cdef class SplitInfo: unsigned int n_samples_left unsigned int n_samples_right - def __init__(self, Y_DTYPE_C gain=-1., unsigned int feature_idx=0, unsigned + def __init__(self, Y_DTYPE_C gain=-1., int feature_idx=0, unsigned int bin_idx=0, Y_DTYPE_C gradient_left=0., Y_DTYPE_C hessian_left=0., Y_DTYPE_C gradient_right=0., Y_DTYPE_C hessian_right=0., unsigned int n_samples_left=0, unsigned @@ -238,10 +244,10 @@ cdef class Splitter: # and right_child_pos = left_child_pos + left_child.n_samples. The # order of the samples inside a leaf is irrelevant. - # 1. samples_indices is a view on this region a..x. We conceptually + # 1. sample_indices is a view on this region a..x. We conceptually # divide it into n_threads regions. Each thread will be responsible # for its own region. Here is an example with 4 threads: - # samples_indices = [abcdef|ghijkl|mnopqr|stuvwx] + # sample_indices = [abcdef|ghijkl|mnopqr|stuvwx] # 2. Each thread processes 6 = 24 // 4 entries and maps them into # left_indices_buffer or right_indices_buffer. For example, we could # have the following mapping ('.' denotes an undefined entry): @@ -254,9 +260,9 @@ cdef class Splitter: # - left_counts = [4, 2, 6, 3] # - right_counts = [2, 4, 0, 3] # 4. 
Finally, we put left/right_indices_buffer back into the - # samples_indices, without any undefined entries and the partition + # sample_indices, without any undefined entries and the partition # looks as expected - # partition = [*************abefilmnopqrtuxcdghjksvw*****************] + # partition = [*************abefilmnopqrtuxcdghjksvw***************] # Note: We here show left/right_indices_buffer as being the same size # as sample_indices for simplicity, but in reality they are of the @@ -293,7 +299,7 @@ cdef class Splitter: offset_in_buffers[thread_idx] = \ offset_in_buffers[thread_idx - 1] + sizes[thread_idx - 1] - # map indices from samples_indices to left/right_indices_buffer + # map indices from sample_indices to left/right_indices_buffer for thread_idx in prange(n_threads): left_count = 0 right_count = 0 @@ -317,7 +323,7 @@ cdef class Splitter: for thread_idx in range(n_threads): right_child_position += left_counts[thread_idx] - # offset of each thread in samples_indices for left and right + # offset of each thread in sample_indices for left and right # child, i.e. where each thread will start to write. right_offset[0] = right_child_position for thread_idx in range(1, n_threads): @@ -327,8 +333,8 @@ cdef class Splitter: right_offset[thread_idx - 1] + right_counts[thread_idx - 1] # map indices in left/right_indices_buffer back into - # samples_indices. This also updates self.partition since - # samples_indice is a view. + # sample_indices. This also updates self.partition since + # sample_indices is a view. for thread_idx in prange(n_threads): for i in range(left_counts[thread_idx]): @@ -342,13 +348,12 @@ cdef class Splitter: sample_indices[right_child_position:], right_child_position) - def find_node_split( - self, - const unsigned int [::1] sample_indices, # IN - hist_struct [:, ::1] histograms): # OUT + def find_node_split(self, + const unsigned int [::1] sample_indices, # IN + hist_struct [:, ::1] histograms): # OUT """For each feature, find the best bin to split on at a given node. - Returns the best split info among all features, and the histograms of + Return the best split info among all features, and the histograms of all the features. The histograms are computed by scanning the whole data. @@ -356,6 +361,9 @@ cdef class Splitter: ---------- sample_indices : array of int The indices of the samples at the node to split. + histograms : array of HISTOGRAM_DTYPE of \ + shape(n_features, max_bins) + The histograms of the current node (to be computed) Returns ------- @@ -397,13 +405,24 @@ cdef class Splitter: ordered_hessians[i] = hessians[sample_indices[i]] # Compute sums of gradients and hessians at the node + + # TODO: ideally use: # for i in prange(n_samples, schedule='static'): + # we should be using prange here, but for some reason it + # leads to slightly incorrect values (1 out of ~100 times) and + # test check_estimator() does not pass anymore + # (check_fit_idempotent). It only seems to be a problem for + # classifiers which is very strange because the loop isn't + # classifier-specific. Maybe it has to do with the array + # population above (hessians aren't constant for classification + # losses). I tried to create a minimal reproducing example, without + # sucess. 
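Note: the partition scheme described in the comments above can be summarised as: each thread maps its slice of ``sample_indices`` into a left and a right buffer, prefix sums over the per-thread counts give each thread's write offset, and the buffers are copied back so that all left-child samples precede all right-child samples. A simplified sketch (not the actual Cython implementation) that emulates the per-thread buffers with per-chunk partitions, where ``goes_left`` is assumed to be a boolean array over the original samples, True when the sample's binned value for the split feature is at or below the split bin::

    import numpy as np

    def split_indices(sample_indices, goes_left, n_chunks=4):
        # Emulate per-thread left/right buffers with per-chunk partitions.
        chunks = np.array_split(sample_indices, n_chunks)
        left_parts = [c[goes_left[c]] for c in chunks]
        right_parts = [c[~goes_left[c]] for c in chunks]
        left = np.concatenate(left_parts)
        right = np.concatenate(right_parts)
        # Write back in place: left-child samples first, then right-child ones.
        sample_indices[:left.size] = left
        sample_indices[left.size:] = right
        return left.size  # position where the right child's samples start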
for i in range(n_samples): sum_gradients += ordered_gradients[i] if self.hessians_are_constant: sum_hessians = n_samples else: - # for i in range(n_samples): + # Using prange seems to be OK here for i in prange(n_samples, schedule='static'): sum_hessians += ordered_hessians[i] @@ -440,12 +459,11 @@ cdef class Splitter: free(split_infos) return out - cdef void _compute_histogram( - self, - const unsigned int feature_idx, - const unsigned int [::1] sample_indices, # IN - hist_struct [:, ::1] histograms # OUT - ) nogil: + cdef void _compute_histogram(self, + const int feature_idx, + const unsigned int [::1] sample_indices, # IN + hist_struct [:, ::1] histograms # OUT + ) nogil: """Compute the histogram for a given feature.""" cdef: @@ -460,33 +478,35 @@ cdef class Splitter: if root_node: if self.hessians_are_constant: - _build_histogram_root_no_hessian(feature_idx, self.max_bins, X_binned, - ordered_gradients, histograms) + _build_histogram_root_no_hessian(feature_idx, self.max_bins, + X_binned, + ordered_gradients, + histograms) else: _build_histogram_root(feature_idx, self.max_bins, X_binned, - ordered_gradients, - ordered_hessians, histograms) + ordered_gradients, ordered_hessians, + histograms) else: if self.hessians_are_constant: - _build_histogram_no_hessian(feature_idx, self.max_bins, sample_indices, - X_binned, ordered_gradients, - histograms) + _build_histogram_no_hessian(feature_idx, self.max_bins, + sample_indices, X_binned, + ordered_gradients, histograms) else: - _build_histogram(feature_idx, self.max_bins, sample_indices, X_binned, - ordered_gradients, ordered_hessians, - histograms) + _build_histogram(feature_idx, self.max_bins, sample_indices, + X_binned, ordered_gradients, + ordered_hessians, histograms) def find_node_split_subtraction( - Splitter self, - unsigned int [::1] sample_indices, # IN - Y_DTYPE_C sum_gradients, - Y_DTYPE_C sum_hessians, - hist_struct [:, ::1] parent_histograms, # IN - hist_struct [:, ::1] sibling_histograms, # IN - hist_struct [:, ::1] histograms): # OUT + Splitter self, + unsigned int [::1] sample_indices, # IN + Y_DTYPE_C sum_gradients, + Y_DTYPE_C sum_hessians, + hist_struct [:, ::1] parent_histograms, # IN + hist_struct [:, ::1] sibling_histograms, # IN + hist_struct [:, ::1] histograms): # OUT """For each feature, find the best bin to split on at a given node. - Returns the best split info among all features, and the histograms of + Return the best split info among all features, and the histograms of all the features. 
This does the same job as ``find_node_split()`` but uses the @@ -507,14 +527,15 @@ cdef class Splitter: Sum of the samples gradients at the current node sum_hessians : float Sum of the samples hessians at the current node - parent_histograms : array of HISTOGRAM_DTYPE of shape(n_features, max_bins) + parent_histograms : array of HISTOGRAM_DTYPE of \ + shape(n_features, max_bins) The histograms of the parent sibling_histograms : array of HISTOGRAM_DTYPE of \ - shape(n_features, max_bins) + shape(n_features, max_bins) The histograms of the sibling histograms : array of HISTOGRAM_DTYPE of \ - shape(n_features, max_bins) - The computed histograms + shape(n_features, max_bins) + The histograms of the current node (to be computed) Returns ------- @@ -567,27 +588,27 @@ cdef class Splitter: free(split_infos) return out - cdef int _find_best_feature_to_split_helper(self, - split_info_struct * split_infos # IN - ) nogil: - """Returns the best split_info among those in splits_infos.""" + cdef int _find_best_feature_to_split_helper( + self, + split_info_struct * split_infos) nogil: # IN + """Returns the best feature among those in splits_infos.""" cdef: int feature_idx int best_feature_idx = 0 for feature_idx in range(1, self.n_features): - if split_infos[feature_idx].gain > split_infos[best_feature_idx].gain: + if (split_infos[feature_idx].gain > + split_infos[best_feature_idx].gain): best_feature_idx = feature_idx return best_feature_idx cdef split_info_struct _find_best_bin_to_split_helper( - self, - unsigned int feature_idx, - const hist_struct [:, ::1] histograms, # IN - unsigned int n_samples, - Y_DTYPE_C sum_gradients, - Y_DTYPE_C sum_hessians, - ) nogil: + self, + unsigned int feature_idx, + const hist_struct [:, ::1] histograms, # IN + unsigned int n_samples, + Y_DTYPE_C sum_gradients, + Y_DTYPE_C sum_hessians) nogil: """Find best bin to split on for a given feature. Splits that do not satisfy the splitting constraints @@ -658,12 +679,12 @@ cdef class Splitter: # Only used for tests (python code cannot use cdef types) # Not sure if this is a good practice... 
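Note: the gain maximised by ``_find_best_bin_to_split_helper`` is the second-order loss reduction computed by ``_split_gain`` below: with G_L, H_L the sums of gradients and hessians going left, G_R, H_R those going right, and lambda the L2 regularization, the gain is G_L**2 / (H_L + lambda) + G_R**2 / (H_R + lambda) - (G_L + G_R)**2 / (H_L + H_R + lambda). A condensed Python sketch of the scan over one feature's histogram, omitting the ``min_hessian_to_split`` and ``min_gain_to_split`` checks for brevity::

    def find_best_bin(hist, sum_gradients, sum_hessians, l2, min_samples_leaf):
        # hist: structured array with 'sum_gradients', 'sum_hessians', 'count'
        def negative_loss(g, h):
            return g * g / (h + l2)

        best_gain, best_bin = -1.0, None
        g_left = h_left = 0.0
        n_left = 0
        n_samples = int(hist['count'].sum())
        for bin_idx in range(hist.shape[0]):
            g_left += hist['sum_gradients'][bin_idx]
            h_left += hist['sum_hessians'][bin_idx]
            n_left += int(hist['count'][bin_idx])
            n_right = n_samples - n_left
            if n_left < min_samples_leaf:
                continue
            if n_right < min_samples_leaf:
                break  # won't get any better
            gain = (negative_loss(g_left, h_left)
                    + negative_loss(sum_gradients - g_left,
                                    sum_hessians - h_left)
                    - negative_loss(sum_gradients, sum_hessians))
            if gain > best_gain:
                best_gain, best_bin = gain, bin_idx
        return best_bin, best_gain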
def find_best_split_wrapper( - self, - unsigned int feature_idx, - unsigned int [::1] sample_indices, - hist_struct [:, ::1] histograms, - Y_DTYPE_C sum_gradients, - Y_DTYPE_C sum_hessians): + self, + int feature_idx, + unsigned int [::1] sample_indices, + hist_struct [:, ::1] histograms, + Y_DTYPE_C sum_gradients, + Y_DTYPE_C sum_hessians): self._compute_histogram(feature_idx, sample_indices, histograms) n_samples = sample_indices.shape[0] @@ -685,13 +706,13 @@ cdef class Splitter: cdef inline Y_DTYPE_C _split_gain( - Y_DTYPE_C gradient_left, - Y_DTYPE_C hessian_left, - Y_DTYPE_C gradient_right, - Y_DTYPE_C hessian_right, - Y_DTYPE_C sum_gradients, - Y_DTYPE_C sum_hessians, - Y_DTYPE_C l2_regularization) nogil: + Y_DTYPE_C gradient_left, + Y_DTYPE_C hessian_left, + Y_DTYPE_C gradient_right, + Y_DTYPE_C hessian_right, + Y_DTYPE_C sum_gradients, + Y_DTYPE_C sum_hessians, + Y_DTYPE_C l2_regularization) nogil: """Loss reduction Compute the reduction in loss after taking a split, compared to keeping @@ -709,7 +730,7 @@ cdef inline Y_DTYPE_C _split_gain( return gain cdef inline Y_DTYPE_C negative_loss( - Y_DTYPE_C gradient, - Y_DTYPE_C hessian, - Y_DTYPE_C l2_regularization) nogil: + Y_DTYPE_C gradient, + Y_DTYPE_C hessian, + Y_DTYPE_C l2_regularization) nogil: return (gradient * gradient) / (hessian + l2_regularization) diff --git a/sklearn/_fast_gradient_boosting/types.pxd b/sklearn/_fast_gradient_boosting/types.pxd index d9470ecef62f8..d614df001bb1c 100644 --- a/sklearn/_fast_gradient_boosting/types.pxd +++ b/sklearn/_fast_gradient_boosting/types.pxd @@ -7,9 +7,9 @@ ctypedef np.npy_float64 X_DTYPE_C ctypedef np.npy_uint8 X_BINNED_DTYPE_C ctypedef np.npy_float64 Y_DTYPE_C -# Same as histogram dtype but we need a struct to declare views. It needs to be -# packed since by default numpy dtypes aren't aligned cdef packed struct hist_struct: + # Same as histogram dtype but we need a struct to declare views. It needs + # to be packed since by default numpy dtypes aren't aligned Y_DTYPE_C sum_gradients Y_DTYPE_C sum_hessians unsigned int count From 2e5bf391fe2c60f33b5bd4d8b5318dcce7683ffc Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 22 Jan 2019 15:10:04 -0500 Subject: [PATCH 087/247] revert change in setup --- sklearn/_fast_gradient_boosting/setup.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/setup.py b/sklearn/_fast_gradient_boosting/setup.py index d0ad96ba3dd7f..6dc60867f6c68 100644 --- a/sklearn/_fast_gradient_boosting/setup.py +++ b/sklearn/_fast_gradient_boosting/setup.py @@ -33,10 +33,6 @@ def configuration(parent_package="", top_path=None): sources=["types.pyx"], include_dirs=[numpy.get_include()]) - config.add_extension("playground", - sources=["playground.pyx"], - include_dirs=[numpy.get_include()]) - config.add_subpackage("tests") return config From ce5dff3bd7a89fbcc67d82c138b3f4f5de9bcfef Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 22 Jan 2019 15:12:06 -0500 Subject: [PATCH 088/247] Added note in user guide --- doc/modules/ensemble.rst | 15 +++++++++++++++ .../_fast_gradient_boosting/gradient_boosting.py | 4 +--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 4abe7789b63d3..a520fb5e8293b 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -456,6 +456,21 @@ The module :mod:`sklearn.ensemble` provides methods for both classification and regression via gradient boosted regression trees. + +.. 
note:: + :class:`FastGradientBoostingClassifier` and + :class:`FastGradientBoostingRegressor` were introduced in version 0.21 and + are considerably faster than :class:`GradientBoostingClassifier` and + :class:`GradientBoostingRegressor` when the number of samples is bigger than + ``10 000``. These fast estimators first bin the input samples `X` into + integer-valued bins (typically 256 bins) which tremendously reduces the + number of splitting points to consider. The API of these new estimators is + slightly different, and some features are not yet supported. + + The following doc focuses on :class:`GradientBoostingClassifier` and + :class:`GradientBoostingRegressor` only. + + Classification --------------- diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index 24606e16ad70b..e24a4424c2c43 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -1,6 +1,4 @@ -""" -Gradient Boosting decision trees for classification and regression. -""" +"""Fast Gradient Boosting decision trees for classification and regression.""" from abc import ABC, abstractmethod import numpy as np From 6e791ba0c13a682ca91ee29abc092da2e67718ed Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 22 Jan 2019 16:11:45 -0500 Subject: [PATCH 089/247] some docstrings --- sklearn/_fast_gradient_boosting/gradient_boosting.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index e24a4424c2c43..007bd8188f245 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -424,6 +424,12 @@ def n_estimators_(self): class FastGradientBoostingRegressor(BaseFastGradientBoosting, RegressorMixin): """Fast Gradient Boosting Regression Tree. + This estimator is much faster than + :class:`GradientBoostingRegressor` + for big datasets (n_samples >= 10 000). The input data `X` is pre-binned + into integer-valued bins, which considerably reduces the number of + splitting points to consider. + Parameters ---------- loss : {'least_squares'}, optional(default='least_squares') @@ -556,6 +562,12 @@ class FastGradientBoostingClassifier(BaseFastGradientBoosting, ClassifierMixin): """Fast Gradient Boosting Classification Tree. + This estimator is much faster than + :class:`GradientBoostingClassifier` + for big datasets (n_samples >= 10 000). The input data `X` is pre-binned + into integer-valued bins, which considerably reduces the number of + splitting points to consider. 
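Note: a minimal usage sketch for the estimators documented above, assuming they are importable from the ``sklearn._fast_gradient_boosting.gradient_boosting`` module that this patch series creates (the public import path may end up being different)::

    from sklearn.datasets import make_classification
    from sklearn._fast_gradient_boosting.gradient_boosting import (
        FastGradientBoostingClassifier)

    # Pre-binning pays off when n_samples is large (>= 10 000 or so).
    X, y = make_classification(n_samples=50000, random_state=0)
    clf = FastGradientBoostingClassifier()  # default parameters, for the sketch
    clf.fit(X, y)
    print(clf.score(X, y))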
+ Parameters ---------- loss : {'auto', 'binary_crossentropy', 'categorical_crossentropy'}, \ From f543d61825a565649225624be194a9b33584c98a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 22 Jan 2019 16:12:06 -0500 Subject: [PATCH 090/247] convert prange argument to int instead of unsigned int to avoid cython bug --- sklearn/_fast_gradient_boosting/loss.pyx | 6 +++--- sklearn/_fast_gradient_boosting/splitting.pyx | 8 +++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/loss.pyx b/sklearn/_fast_gradient_boosting/loss.pyx index 95289203e20ad..b38d0fa396abe 100644 --- a/sklearn/_fast_gradient_boosting/loss.pyx +++ b/sklearn/_fast_gradient_boosting/loss.pyx @@ -143,7 +143,7 @@ cdef void _update_gradients_least_squares( const Y_DTYPE_C [:] y_true, const Y_DTYPE_C [:] raw_predictions) nogil: cdef: - unsigned int n_samples + int n_samples int i n_samples = raw_predictions.shape[0] @@ -208,7 +208,7 @@ cdef void _update_gradients_hessians_binary_crossentropy( const Y_DTYPE_C [:] y_true, const Y_DTYPE_C [:] raw_predictions) nogil: cdef: - unsigned int n_samples + int n_samples Y_DTYPE_C gradient_abs int i @@ -268,7 +268,7 @@ cdef void _update_gradients_hessians_categorical_crossentropy( # shape (n_samples, n_tree_per_iter), IN const Y_DTYPE_C [:, :] raw_predictions) nogil: cdef: - unsigned int n_samples + int n_samples unsigned int prediction_dim unsigned int k int i diff --git a/sklearn/_fast_gradient_boosting/splitting.pyx b/sklearn/_fast_gradient_boosting/splitting.pyx index ecc8b73f0ec1b..9f7fdf55ba5ea 100644 --- a/sklearn/_fast_gradient_boosting/splitting.pyx +++ b/sklearn/_fast_gradient_boosting/splitting.pyx @@ -371,9 +371,10 @@ cdef class Splitter: The info about the best possible split among all features. """ cdef: - unsigned int n_samples + int n_samples int feature_idx int best_feature_idx + int n_features = self.n_features int i unsigned int thread_idx unsigned int [:] starts @@ -428,7 +429,7 @@ cdef class Splitter: split_infos = malloc( self.n_features * sizeof(split_info_struct)) - for feature_idx in prange(self.n_features): + for feature_idx in prange(n_features): # Compute histogram of each feature self._compute_histogram(feature_idx, sample_indices, histograms) @@ -545,6 +546,7 @@ cdef class Splitter: cdef: int feature_idx + int n_features = self.n_features unsigned int n_samples split_info_struct split_info split_info_struct * split_infos @@ -555,7 +557,7 @@ cdef class Splitter: split_infos = malloc( self.n_features * sizeof(split_info_struct)) - for feature_idx in prange(self.n_features): + for feature_idx in prange(n_features): # Compute histogram of each feature _subtract_histograms(feature_idx, self.max_bins, From 00aab5f6d05474425186f3f8cd237e83dde17be9 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 22 Jan 2019 16:37:44 -0500 Subject: [PATCH 091/247] minor comments --- sklearn/_fast_gradient_boosting/types.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/_fast_gradient_boosting/types.pyx b/sklearn/_fast_gradient_boosting/types.pyx index f5dae1d17b856..fe2345b3df994 100644 --- a/sklearn/_fast_gradient_boosting/types.pyx +++ b/sklearn/_fast_gradient_boosting/types.pyx @@ -1,8 +1,11 @@ import numpy as np +# Y_DYTPE is the dtype to which the targets y are converted to. This is also +# the dtype for gradients, hessians, leaf values, etc. because they are all +# homogeneous to a target. 
Y_DTYPE = np.float64 X_DTYPE = np.float64 -X_BINNED_DTYPE = np.uint8 +X_BINNED_DTYPE = np.uint8 # hence max_bins == 256 HISTOGRAM_DTYPE = np.dtype([ ('sum_gradients', Y_DTYPE), # sum of sample gradients in bin From ad94842f3363c927d661ebed5562c78ce9c80f20 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 22 Jan 2019 17:32:08 -0500 Subject: [PATCH 092/247] removed construction_speed --- sklearn/_fast_gradient_boosting/grower.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/grower.py b/sklearn/_fast_gradient_boosting/grower.py index a104bbbcc13dc..28a485a578889 100644 --- a/sklearn/_fast_gradient_boosting/grower.py +++ b/sklearn/_fast_gradient_boosting/grower.py @@ -57,8 +57,6 @@ class TreeNode: find_split_time : float The total time spent computing the histogram and finding the best split at the node. - construction_speed : float - The Number of samples at the node divided find_split_time. apply_split_time : float The total time spent actually splitting the node, e.g. splitting sample_indices into left and right child. @@ -78,7 +76,6 @@ class TreeNode: sibling = None parent = None find_split_time = 0. - construction_speed = 0. apply_split_time = 0. hist_subtraction = False @@ -332,7 +329,6 @@ def _compute_spittability(self, node, only_hist=False): toc = time() node.find_split_time = toc - tic self.total_find_split_time += node.find_split_time - node.construction_speed = node.n_samples / node.find_split_time node.split_info = split_info node.histograms = histograms From 468ec148c102a7d36d883a50532ffbbb1de5466f Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 22 Jan 2019 18:00:24 -0500 Subject: [PATCH 093/247] removed throughput computation --- sklearn/_fast_gradient_boosting/gradient_boosting.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index 007bd8188f245..6b784390ab42b 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -115,8 +115,7 @@ def fit(self, X, y): toc = time() if self.verbose: duration = toc - tic - troughput = X.nbytes / duration - print("{:.3f} s ({:.3f} MB/s)".format(duration, troughput / 1e6)) + print("{:.3f} s".format(duration)) self.loss_ = self._get_loss() From a53de7bf4d7db9e746d1aec4776ed12ee8feeb3b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 22 Jan 2019 18:53:53 -0500 Subject: [PATCH 094/247] lower decimal rounding for check --- sklearn/utils/estimator_checks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 5ba8da1859fbc..1048ea19561f9 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2299,8 +2299,8 @@ def check_decision_proba_consistency(name, estimator_orig): b = estimator.decision_function(X_test) # truncate arrays to the 10th decimal to avoid rank discrepancies that # woulde caused by floating point precision issue - a = np.around(a, decimals=10) - b = np.around(b, decimals=10) + a = np.around(a, decimals=6) + b = np.around(b, decimals=6) assert_array_equal(rankdata(a), rankdata(b)) From e06b9882a239dcd533e9ebd8bf87518e314104f8 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 23 Jan 2019 08:41:29 -0500 Subject: [PATCH 095/247] set random seed in test --- sklearn/utils/estimator_checks.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 1048ea19561f9..9745f8829b47f 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2285,6 +2285,7 @@ def check_decision_proba_consistency(name, estimator_orig): # Check whether an estimator having both decision_function and # predict_proba methods has outputs with perfect rank correlation. + np.random.seed(0) centers = [(2, 2), (4, 4)] X, y = make_blobs(n_samples=100, random_state=0, n_features=4, centers=centers, cluster_std=1.0, shuffle=True) @@ -2299,8 +2300,8 @@ def check_decision_proba_consistency(name, estimator_orig): b = estimator.decision_function(X_test) # truncate arrays to the 10th decimal to avoid rank discrepancies that # woulde caused by floating point precision issue - a = np.around(a, decimals=6) - b = np.around(b, decimals=6) + a = np.around(a, decimals=10) + b = np.around(b, decimals=10) assert_array_equal(rankdata(a), rankdata(b)) From a92cbbd4865ef2a793670984b733a0c8fb9ca76c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 23 Jan 2019 10:02:44 -0500 Subject: [PATCH 096/247] typo --- sklearn/utils/estimator_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 9745f8829b47f..be8138e6cad12 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2299,7 +2299,7 @@ def check_decision_proba_consistency(name, estimator_orig): a = estimator.predict_proba(X_test)[:, 1] b = estimator.decision_function(X_test) # truncate arrays to the 10th decimal to avoid rank discrepancies that - # woulde caused by floating point precision issue + # would be caused by floating point precision issue a = np.around(a, decimals=10) b = np.around(b, decimals=10) assert_array_equal(rankdata(a), rankdata(b)) From 783a39996f3815da2fffd719346d1dcb7fb6ebd0 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 23 Jan 2019 18:43:08 -0500 Subject: [PATCH 097/247] Should fix check_fit_idempotent due to prange summing instability thanks @amueller!! --- sklearn/_fast_gradient_boosting/splitting.pyx | 26 +++++++------------ 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/splitting.pyx b/sklearn/_fast_gradient_boosting/splitting.pyx index 9f7fdf55ba5ea..e7e27a95bcd7e 100644 --- a/sklearn/_fast_gradient_boosting/splitting.pyx +++ b/sklearn/_fast_gradient_boosting/splitting.pyx @@ -28,12 +28,16 @@ from .types cimport hist_struct from .types import HISTOGRAM_DTYPE -# Note: in a lot of functions here we pass feature_idx and the whole 2d +# Note: in a lot of functions here, we pass feature_idx and the whole 2d # histograms arrays instead a lot just histograms[feature_idx]. This is # because Cython generated C code will have strange Python interactions (likely # related to the GIL release and the custom histogram dtype) when using 1d # histogram arrays. 
+# epsilon for comparing gains to avoid floating precision issues that might be +# caused by the (slightly non-deterministic) parallel sums over gradients and +# hessians +cdef Y_DTYPE_C EPS = 1e-13 cdef struct split_info_struct: # Same as the SplitInfo class, but we need a C struct to use it in the @@ -406,19 +410,7 @@ cdef class Splitter: ordered_hessians[i] = hessians[sample_indices[i]] # Compute sums of gradients and hessians at the node - - # TODO: ideally use: - # for i in prange(n_samples, schedule='static'): - # we should be using prange here, but for some reason it - # leads to slightly incorrect values (1 out of ~100 times) and - # test check_estimator() does not pass anymore - # (check_fit_idempotent). It only seems to be a problem for - # classifiers which is very strange because the loop isn't - # classifier-specific. Maybe it has to do with the array - # population above (hessians aren't constant for classification - # losses). I tried to create a minimal reproducing example, without - # sucess. - for i in range(n_samples): + for i in prange(n_samples, schedule='static'): sum_gradients += ordered_gradients[i] if self.hessians_are_constant: sum_hessians = n_samples @@ -599,8 +591,8 @@ cdef class Splitter: int best_feature_idx = 0 for feature_idx in range(1, self.n_features): - if (split_infos[feature_idx].gain > - split_infos[best_feature_idx].gain): + if (split_infos[feature_idx].gain - + split_infos[best_feature_idx].gain) > EPS: best_feature_idx = feature_idx return best_feature_idx @@ -665,7 +657,7 @@ cdef class Splitter: sum_gradients, sum_hessians, self.l2_regularization) - if gain > best_split.gain and gain > self.min_gain_to_split: + if gain - best_split.gain > EPS and gain > self.min_gain_to_split: best_split.gain = gain best_split.feature_idx = feature_idx best_split.bin_idx = bin_idx From e47b7453762de18b6cae4f99bfb7144db37c4e24 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 25 Jan 2019 14:10:25 -0500 Subject: [PATCH 098/247] renamed start and stop into partition_start and partition_stop --- .../_gradient_boosting.pyx | 6 ++++-- sklearn/_fast_gradient_boosting/grower.py | 20 +++++++++---------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx b/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx index 05be63c5ec56e..2c1a3528ae409 100644 --- a/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx +++ b/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx @@ -28,8 +28,10 @@ def _update_raw_predictions( list leaves leaves = grower.finalized_leaves - starts = np.array([leaf.start for leaf in leaves], dtype=np.uint32) - stops = np.array([leaf.stop for leaf in leaves], dtype=np.uint32) + starts = np.array([leaf.partition_start for leaf in leaves], + dtype=np.uint32) + stops = np.array([leaf.partition_stop for leaf in leaves], + dtype=np.uint32) values = np.array([leaf.value for leaf in leaves], dtype=Y_DTYPE) _update_raw_predictions_helper(raw_predictions, starts, stops, partition, diff --git a/sklearn/_fast_gradient_boosting/grower.py b/sklearn/_fast_gradient_boosting/grower.py index 28a485a578889..ce9a1706ce668 100644 --- a/sklearn/_fast_gradient_boosting/grower.py +++ b/sklearn/_fast_gradient_boosting/grower.py @@ -62,9 +62,9 @@ class TreeNode: sample_indices into left and right child. hist_subtraction : bool Wheter the subtraction method was used for computing the histograms. 
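Note: the ``EPS`` tolerance introduced above is needed because a parallel (chunked) sum of floats is not bit-for-bit identical to a sequential one: floating point addition is not associative, so two splits with mathematically equal gains can differ by a tiny amount from run to run, which is what made ``check_fit_idempotent`` flaky. A small illustration of the effect, using chunked summation as a stand-in for ``prange``::

    import numpy as np

    rng = np.random.RandomState(0)
    values = rng.randn(100000)

    sequential = sum(values.tolist())
    chunked = sum(sum(chunk.tolist()) for chunk in np.array_split(values, 4))

    print(sequential == chunked)             # often False
    print(abs(sequential - chunked))         # tiny, on the order of 1e-12
    print(abs(sequential - chunked) < 1e-9)  # True: compare with a tolerance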
- start : int + partition_start : int start position of the node's sample_indices in splitter.partition - stop : int + partition_stop : int stop position of the node's sample_indices in splitter.partition """ @@ -85,8 +85,8 @@ class TreeNode: # Only used in _update_raw_prediction, because we need to iterate over the # leaves and I don't know how to efficiently store the sample_indices # views because they're all of different sizes. - start = 0 - stop = 0 + partition_start = 0 + partition_stop = 0 def __init__(self, depth, sample_indices, sum_gradients, sum_hessians, parent=None): @@ -261,8 +261,8 @@ def _intilialize_root(self): sum_hessians=sum_hessians ) - self.root.start = 0 - self.root.stop = n_samples + self.root.partition_start = 0 + self.root.partition_stop = n_samples if (self.max_leaf_nodes is not None and self.max_leaf_nodes == 1): self._finalize_leaf(self.root) @@ -392,10 +392,10 @@ def split_next(self): node.left_child = left_child_node # set start and stop indices - left_child_node.start = node.start - left_child_node.stop = node.start + right_child_pos - right_child_node.start = left_child_node.stop - right_child_node.stop = node.stop + left_child_node.partition_start = node.partition_start + left_child_node.partition_stop = node.partition_start + right_child_pos + right_child_node.partition_start = left_child_node.partition_stop + right_child_node.partition_stop = node.partition_stop self.n_nodes += 2 From 39d803095d37091fcaa15d8de8f61e8ca7064f84 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 27 Jan 2019 11:29:05 -0500 Subject: [PATCH 099/247] Parallelized root gradient and hessians sums --- sklearn/_fast_gradient_boosting/grower.py | 5 +++-- sklearn/_fast_gradient_boosting/setup.py | 4 ++++ .../{utils.py => utils.pyx} | 21 +++++++++++++++++++ 3 files changed, 28 insertions(+), 2 deletions(-) rename sklearn/_fast_gradient_boosting/{utils.py => utils.pyx} (85%) diff --git a/sklearn/_fast_gradient_boosting/grower.py b/sklearn/_fast_gradient_boosting/grower.py index ce9a1706ce668..80b3802fd6bdc 100644 --- a/sklearn/_fast_gradient_boosting/grower.py +++ b/sklearn/_fast_gradient_boosting/grower.py @@ -10,7 +10,7 @@ from .splitting import Splitter from .predictor import TreePredictor, PREDICTOR_RECORD_DTYPE - +from .utils import sum_parallel from .types import HISTOGRAM_DTYPE @@ -249,7 +249,8 @@ def _intilialize_root(self): """Initialize root node and finalize it if needed.""" n_samples = self.X_binned.shape[0] depth = 0 - sum_gradients = np.sum(self.splitter.gradients) + # sum_gradients = np.sum(self.splitter.gradients) + sum_gradients = sum_parallel(self.splitter.gradients) if self.splitter.hessians_are_constant: sum_hessians = self.splitter.hessians[0] * n_samples else: diff --git a/sklearn/_fast_gradient_boosting/setup.py b/sklearn/_fast_gradient_boosting/setup.py index 6dc60867f6c68..a64ea2f92b3a0 100644 --- a/sklearn/_fast_gradient_boosting/setup.py +++ b/sklearn/_fast_gradient_boosting/setup.py @@ -33,6 +33,10 @@ def configuration(parent_package="", top_path=None): sources=["types.pyx"], include_dirs=[numpy.get_include()]) + config.add_extension("utils", + sources=["utils.pyx"], + include_dirs=[numpy.get_include()]) + config.add_subpackage("tests") return config diff --git a/sklearn/_fast_gradient_boosting/utils.py b/sklearn/_fast_gradient_boosting/utils.pyx similarity index 85% rename from sklearn/_fast_gradient_boosting/utils.py rename to sklearn/_fast_gradient_boosting/utils.pyx index 5a568f30465a3..9b594c5beec06 100644 --- 
a/sklearn/_fast_gradient_boosting/utils.py +++ b/sklearn/_fast_gradient_boosting/utils.pyx @@ -1,5 +1,13 @@ +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False +# cython: language_level=3 """This module contains utility routines.""" + +from cython.parallel import prange + from .binning import BinMapper +from .types cimport Y_DTYPE_C def get_lightgbm_estimator(pygbm_estimator): @@ -61,3 +69,16 @@ def get_lightgbm_estimator(pygbm_estimator): Est = LGBMRegressor return Est(**lgbm_params) + + +def sum_parallel(Y_DTYPE_C [:] array): + + cdef: + Y_DTYPE_C out = 0. + int i = 0 + + with nogil: + for i in prange(array.shape[0], schedule='static'): + out += array[i] + + return out From 14c7d47ba7f6c9201ae1168a3896b0f224e451a8 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 30 Jan 2019 11:01:35 -0500 Subject: [PATCH 100/247] Used floats instead of doubles for gradients and hessians arrays --- sklearn/_fast_gradient_boosting/histogram.pxd | 13 +++++----- sklearn/_fast_gradient_boosting/histogram.pyx | 18 ++++++------- sklearn/_fast_gradient_boosting/loss.pyx | 24 +++++++++-------- sklearn/_fast_gradient_boosting/splitting.pyx | 23 ++++++++-------- .../tests/test_grower.py | 13 +++++----- .../tests/test_histogram.py | 26 +++++++++---------- .../tests/test_loss.py | 9 ++++--- .../tests/test_predictor.py | 6 ++--- .../tests/test_splitting.py | 26 +++++++++---------- sklearn/_fast_gradient_boosting/types.pxd | 1 + sklearn/_fast_gradient_boosting/types.pyx | 6 +++-- sklearn/_fast_gradient_boosting/utils.pyx | 3 ++- 12 files changed, 88 insertions(+), 80 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/histogram.pxd b/sklearn/_fast_gradient_boosting/histogram.pxd index e89582d03a266..70487ade70a8e 100644 --- a/sklearn/_fast_gradient_boosting/histogram.pxd +++ b/sklearn/_fast_gradient_boosting/histogram.pxd @@ -19,6 +19,7 @@ cimport numpy as np from .types import HISTOGRAM_DTYPE from .types cimport X_BINNED_DTYPE_C from .types cimport Y_DTYPE_C +from .types cimport G_H_DTYPE_C from .types cimport hist_struct """compute (hist_a - hist_b) in out""" @@ -37,8 +38,8 @@ cpdef void _build_histogram( unsigned int n_bins, const unsigned int [::1] sample_indices, # IN const X_BINNED_DTYPE_C [::1] binned_feature, # IN - const Y_DTYPE_C [::1] ordered_gradients, # IN - const Y_DTYPE_C [::1] ordered_hessians, # IN + const G_H_DTYPE_C [::1] ordered_gradients, # IN + const G_H_DTYPE_C [::1] ordered_hessians, # IN hist_struct [:, ::1] out) nogil # OUT @@ -49,7 +50,7 @@ cpdef void _build_histogram_no_hessian( unsigned int n_bins, const unsigned int [::1] sample_indices, # IN const X_BINNED_DTYPE_C [::1] binned_feature, # IN - const Y_DTYPE_C [::1] ordered_gradients, # IN + const G_H_DTYPE_C [::1] ordered_gradients, # IN hist_struct [:, ::1] out) nogil # OUT """Compute histogram of the root node. @@ -60,8 +61,8 @@ cpdef void _build_histogram_root( const int feature_idx, unsigned int n_bins, const X_BINNED_DTYPE_C [::1] binned_feature, # IN - const Y_DTYPE_C [::1] all_gradients, # IN - const Y_DTYPE_C [::1] all_hessians, # IN + const G_H_DTYPE_C [::1] all_gradients, # IN + const G_H_DTYPE_C [::1] all_hessians, # IN hist_struct [:, ::1] out) nogil # OUT """Compute histogram of the root node, not updating hessians. 
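Note: the ``_build_histogram*`` routines declared above accumulate, for a single feature, the sum of gradients, the sum of hessians and the sample count of each bin. A NumPy equivalent of the generic (non-root) variant, ignoring the manual loop unrolling done in the Cython code::

    import numpy as np

    def build_histogram(n_bins, sample_indices, binned_feature,
                        ordered_gradients, ordered_hessians):
        hist = {'sum_gradients': np.zeros(n_bins),
                'sum_hessians': np.zeros(n_bins),
                'count': np.zeros(n_bins, dtype=np.uint32)}
        # ordered_gradients[i] is the gradient of sample_indices[i]
        bins = binned_feature[sample_indices]
        np.add.at(hist['sum_gradients'], bins, ordered_gradients)
        np.add.at(hist['sum_hessians'], bins, ordered_hessians)
        np.add.at(hist['count'], bins, 1)
        return hist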
@@ -70,5 +71,5 @@ cpdef void _build_histogram_root_no_hessian( const int feature_idx, unsigned int n_bins, const X_BINNED_DTYPE_C [::1] binned_feature, # IN - const Y_DTYPE_C [::1] all_gradients, # IN + const G_H_DTYPE_C [::1] all_gradients, # IN hist_struct [:, ::1] out) nogil # OUT diff --git a/sklearn/_fast_gradient_boosting/histogram.pyx b/sklearn/_fast_gradient_boosting/histogram.pyx index 4b1f6e4c041e3..4335980b2ec4a 100644 --- a/sklearn/_fast_gradient_boosting/histogram.pyx +++ b/sklearn/_fast_gradient_boosting/histogram.pyx @@ -13,8 +13,6 @@ cimport cython import numpy as np cimport numpy as np -from .types import HISTOGRAM_DTYPE - # Note: IN views are read-only, OUT views are write-only # See histogram.pxd for docstrings and details @@ -24,8 +22,8 @@ cpdef void _build_histogram_naive( unsigned int n_bins, unsigned int [:] sample_indices, # IN X_BINNED_DTYPE_C [:] binned_feature, # IN - Y_DTYPE_C [:] ordered_gradients, # IN - Y_DTYPE_C [:] ordered_hessians, # IN + G_H_DTYPE_C [:] ordered_gradients, # IN + G_H_DTYPE_C [:] ordered_hessians, # IN hist_struct [:, :] out) nogil: # OUT """Build histogram in a naive way, without optimizing for cache hit. @@ -72,8 +70,8 @@ cpdef void _build_histogram( unsigned int n_bins, const unsigned int [::1] sample_indices, # IN const X_BINNED_DTYPE_C [::1] binned_feature, # IN - const Y_DTYPE_C [::1] ordered_gradients, # IN - const Y_DTYPE_C [::1] ordered_hessians, # IN + const G_H_DTYPE_C [::1] ordered_gradients, # IN + const G_H_DTYPE_C [::1] ordered_hessians, # IN hist_struct [:, ::1] out) nogil: # OUT cdef: unsigned int i = 0 @@ -119,7 +117,7 @@ cpdef void _build_histogram_no_hessian( unsigned int n_bins, const unsigned int [::1] sample_indices, # IN const X_BINNED_DTYPE_C [::1] binned_feature, # IN - const Y_DTYPE_C [::1] ordered_gradients, # IN + const G_H_DTYPE_C [::1] ordered_gradients, # IN hist_struct [:, ::1] out) nogil: # OUT cdef: unsigned int i = 0 @@ -158,8 +156,8 @@ cpdef void _build_histogram_root( const int feature_idx, unsigned int n_bins, const X_BINNED_DTYPE_C [::1] binned_feature, # IN - const Y_DTYPE_C [::1] all_gradients, # IN - const Y_DTYPE_C [::1] all_hessians, # IN + const G_H_DTYPE_C [::1] all_gradients, # IN + const G_H_DTYPE_C [::1] all_hessians, # IN hist_struct [:, ::1] out) nogil: # OUT cdef: unsigned int i = 0 @@ -205,7 +203,7 @@ cpdef void _build_histogram_root_no_hessian( const int feature_idx, unsigned int n_bins, const X_BINNED_DTYPE_C [::1] binned_feature, # IN - const Y_DTYPE_C [::1] all_gradients, # IN + const G_H_DTYPE_C [::1] all_gradients, # IN hist_struct [:, ::1] out) nogil: # OUT cdef: unsigned int i = 0 diff --git a/sklearn/_fast_gradient_boosting/loss.pyx b/sklearn/_fast_gradient_boosting/loss.pyx index b38d0fa396abe..a4ebf3e01f986 100644 --- a/sklearn/_fast_gradient_boosting/loss.pyx +++ b/sklearn/_fast_gradient_boosting/loss.pyx @@ -24,6 +24,8 @@ from libc.math cimport fabs, exp, log from .types import Y_DTYPE from .types cimport Y_DTYPE_C +from .types import G_H_DTYPE +from .types cimport G_H_DTYPE_C class BaseLoss(ABC): @@ -53,14 +55,14 @@ class BaseLoss(ABC): is (1,) and the array is initialized to ``1``. """ shape = n_samples * prediction_dim - gradients = np.empty(shape=shape, dtype=Y_DTYPE) + gradients = np.empty(shape=shape, dtype=G_H_DTYPE) if self.hessians_are_constant: # if the hessians are constant, we consider they are equal to 1. # this is correct as long as we adjust the gradients. See e.g. 
LS # loss - hessians = np.ones(shape=1, dtype=Y_DTYPE) + hessians = np.ones(shape=1, dtype=G_H_DTYPE) else: - hessians = np.empty(shape=shape, dtype=Y_DTYPE) + hessians = np.empty(shape=shape, dtype=G_H_DTYPE) return gradients, hessians @@ -139,7 +141,7 @@ class LeastSquares(BaseLoss): cdef void _update_gradients_least_squares( - Y_DTYPE_C [:] gradients, + G_H_DTYPE_C [:] gradients, const Y_DTYPE_C [:] y_true, const Y_DTYPE_C [:] raw_predictions) nogil: cdef: @@ -203,13 +205,13 @@ class BinaryCrossEntropy(BaseLoss): cdef void _update_gradients_hessians_binary_crossentropy( - Y_DTYPE_C [:] gradients, - Y_DTYPE_C [:] hessians, + G_H_DTYPE_C [:] gradients, + G_H_DTYPE_C [:] hessians, const Y_DTYPE_C [:] y_true, const Y_DTYPE_C [:] raw_predictions) nogil: cdef: int n_samples - Y_DTYPE_C gradient_abs + G_H_DTYPE_C gradient_abs int i n_samples = raw_predictions.shape[0] @@ -262,8 +264,8 @@ class CategoricalCrossEntropy(BaseLoss): cdef void _update_gradients_hessians_categorical_crossentropy( - Y_DTYPE_C [:] gradients, # shape (n_samples * prediction_dim,), OUT - Y_DTYPE_C [:] hessians, # shape (n_samples * prediction_dim,), OUT + G_H_DTYPE_C [:] gradients, # shape (n_samples * prediction_dim,), OUT + G_H_DTYPE_C [:] hessians, # shape (n_samples * prediction_dim,), OUT const Y_DTYPE_C [:] y_true, # shape (n_samples,), IN # shape (n_samples, n_tree_per_iter), IN const Y_DTYPE_C [:, :] raw_predictions) nogil: @@ -273,8 +275,8 @@ cdef void _update_gradients_hessians_categorical_crossentropy( unsigned int k int i Y_DTYPE_C p_k - Y_DTYPE_C [:] gradients_at_k, - Y_DTYPE_C [:] hessians_at_k, + G_H_DTYPE_C [:] gradients_at_k, + G_H_DTYPE_C [:] hessians_at_k, n_samples = raw_predictions.shape[0] prediction_dim = raw_predictions.shape[1] diff --git a/sklearn/_fast_gradient_boosting/splitting.pyx b/sklearn/_fast_gradient_boosting/splitting.pyx index e7e27a95bcd7e..75f1a17f8c5b9 100644 --- a/sklearn/_fast_gradient_boosting/splitting.pyx +++ b/sklearn/_fast_gradient_boosting/splitting.pyx @@ -24,6 +24,7 @@ from .histogram cimport _subtract_histograms # from .histogram cimport _subtract_histograms from .types cimport X_BINNED_DTYPE_C from .types cimport Y_DTYPE_C +from .types cimport G_H_DTYPE_C from .types cimport hist_struct from .types import HISTOGRAM_DTYPE @@ -147,10 +148,10 @@ cdef class Splitter: unsigned int n_features unsigned int max_bins unsigned int [::1] n_bins_per_feature - Y_DTYPE_C [::1] gradients - Y_DTYPE_C [::1] hessians - Y_DTYPE_C [::1] ordered_gradients - Y_DTYPE_C [::1] ordered_hessians + G_H_DTYPE_C [::1] gradients + G_H_DTYPE_C [::1] hessians + G_H_DTYPE_C [::1] ordered_gradients + G_H_DTYPE_C [::1] ordered_hessians unsigned char hessians_are_constant Y_DTYPE_C l2_regularization Y_DTYPE_C min_hessian_to_split @@ -163,7 +164,7 @@ cdef class Splitter: def __init__(self, const X_BINNED_DTYPE_C [::1, :] X_binned, unsigned int max_bins, np.ndarray[np.uint32_t] n_bins_per_feature, - Y_DTYPE_C [::1] gradients, Y_DTYPE_C [::1] hessians, + G_H_DTYPE_C [::1] gradients, G_H_DTYPE_C [::1] hessians, Y_DTYPE_C l2_regularization, Y_DTYPE_C min_hessian_to_split=1e-3, unsigned int min_samples_leaf=20, Y_DTYPE_C min_gain_to_split=0.): @@ -389,10 +390,10 @@ cdef class Splitter: Y_DTYPE_C sum_gradients = 0. Y_DTYPE_C sum_hessians = 0. 
# need local views to avoid python interactions - Y_DTYPE_C [::1] ordered_gradients = self.ordered_gradients - Y_DTYPE_C [::1] gradients = self.gradients - Y_DTYPE_C [::1] ordered_hessians = self.ordered_hessians - Y_DTYPE_C [::1] hessians = self.hessians + G_H_DTYPE_C [::1] ordered_gradients = self.ordered_gradients + G_H_DTYPE_C [::1] gradients = self.gradients + G_H_DTYPE_C [::1] ordered_hessians = self.ordered_hessians + G_H_DTYPE_C [::1] hessians = self.hessians with nogil: n_samples = sample_indices.shape[0] @@ -464,9 +465,9 @@ cdef class Splitter: const X_BINNED_DTYPE_C [::1] X_binned = \ self.X_binned[:, feature_idx] unsigned int root_node = X_binned.shape[0] == n_samples - Y_DTYPE_C [::1] ordered_gradients = \ + G_H_DTYPE_C [::1] ordered_gradients = \ self.ordered_gradients[:n_samples] - Y_DTYPE_C [::1] ordered_hessians = \ + G_H_DTYPE_C [::1] ordered_hessians = \ self.ordered_hessians[:n_samples] if root_node: diff --git a/sklearn/_fast_gradient_boosting/tests/test_grower.py b/sklearn/_fast_gradient_boosting/tests/test_grower.py index e9cc3a0a04908..0432598478a21 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_grower.py +++ b/sklearn/_fast_gradient_boosting/tests/test_grower.py @@ -8,6 +8,7 @@ from sklearn._fast_gradient_boosting.binning import BinMapper from sklearn._fast_gradient_boosting.types import X_BINNED_DTYPE from sklearn._fast_gradient_boosting.types import Y_DTYPE +from sklearn._fast_gradient_boosting.types import G_H_DTYPE def _make_training_data(n_bins=256, constant_hessian=True): @@ -40,9 +41,9 @@ def true_decision_function(input_features): # Assume a square loss applied to an initial model that always predicts 0 # (hardcoded for this test): - all_gradients = target + all_gradients = target.astype(G_H_DTYPE) if constant_hessian: - all_hessians = np.ones(shape=1, dtype=Y_DTYPE) + all_hessians = np.ones(shape=1, dtype=G_H_DTYPE) else: all_hessians = np.ones_like(all_gradients) return X_binned, all_gradients, all_hessians @@ -209,9 +210,9 @@ def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins, mapper = BinMapper(max_bins=n_bins) X = mapper.fit_transform(X) - all_gradients = y.astype(Y_DTYPE) + all_gradients = y.astype(G_H_DTYPE) if constant_hessian: - all_hessians = np.ones(shape=1, dtype=Y_DTYPE) + all_hessians = np.ones(shape=1, dtype=G_H_DTYPE) else: all_hessians = np.ones_like(all_gradients) grower = TreeGrower(X, all_gradients, all_hessians, @@ -248,8 +249,8 @@ def test_min_samples_leaf_root(n_samples, min_samples_leaf): mapper = BinMapper(max_bins=max_bins) X = mapper.fit_transform(X) - all_gradients = y.astype(Y_DTYPE) - all_hessians = np.ones(shape=1, dtype=Y_DTYPE) + all_gradients = y.astype(G_H_DTYPE) + all_hessians = np.ones(shape=1, dtype=G_H_DTYPE) grower = TreeGrower(X, all_gradients, all_hessians, max_bins=max_bins, shrinkage=1., min_samples_leaf=min_samples_leaf, diff --git a/sklearn/_fast_gradient_boosting/tests/test_histogram.py b/sklearn/_fast_gradient_boosting/tests/test_histogram.py index 7f847a545fb38..b432e2639c7f3 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_histogram.py +++ b/sklearn/_fast_gradient_boosting/tests/test_histogram.py @@ -13,7 +13,7 @@ from sklearn._fast_gradient_boosting.histogram import _build_histogram_root from sklearn._fast_gradient_boosting.histogram import _subtract_histograms from sklearn._fast_gradient_boosting.types import HISTOGRAM_DTYPE -from sklearn._fast_gradient_boosting.types import Y_DTYPE +from sklearn._fast_gradient_boosting.types import G_H_DTYPE from 
sklearn._fast_gradient_boosting.types import X_BINNED_DTYPE @@ -23,8 +23,8 @@ def test_build_histogram(build_func): binned_feature = np.array([0, 2, 0, 1, 2, 0, 2, 1], dtype=X_BINNED_DTYPE) # Small sample_indices (below unrolling threshold) - ordered_gradients = np.array([0, 1, 3], dtype=Y_DTYPE) - ordered_hessians = np.array([1, 1, 2], dtype=Y_DTYPE) + ordered_gradients = np.array([0, 1, 3], dtype=G_H_DTYPE) + ordered_hessians = np.array([1, 1, 2], dtype=G_H_DTYPE) sample_indices = np.array([0, 2, 3], dtype=np.uint32) hist = np.zeros((1, 3), dtype=HISTOGRAM_DTYPE) @@ -37,8 +37,8 @@ def test_build_histogram(build_func): # Larger sample_indices (above unrolling threshold) sample_indices = np.array([0, 2, 3, 6, 7], dtype=np.uint32) - ordered_gradients = np.array([0, 1, 3, 0, 1], dtype=Y_DTYPE) - ordered_hessians = np.array([1, 1, 2, 1, 0], dtype=Y_DTYPE) + ordered_gradients = np.array([0, 1, 3, 0, 1], dtype=G_H_DTYPE) + ordered_hessians = np.array([1, 1, 2, 1, 0], dtype=G_H_DTYPE) hist = np.zeros((1, 3), dtype=HISTOGRAM_DTYPE) build_func(0, 3, sample_indices, binned_feature, ordered_gradients, @@ -59,12 +59,12 @@ def test_histogram_sample_order_independence(): dtype=X_BINNED_DTYPE) sample_indices = rng.choice(np.arange(n_samples, dtype=np.uint32), n_sub_samples, replace=False) - ordered_gradients = rng.randn(n_sub_samples).astype(Y_DTYPE) + ordered_gradients = rng.randn(n_sub_samples).astype(G_H_DTYPE) hist_gc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) _build_histogram_no_hessian(0, n_bins, sample_indices, binned_feature, ordered_gradients, hist_gc) - ordered_hessians = rng.exponential(size=n_sub_samples).astype(Y_DTYPE) + ordered_hessians = rng.exponential(size=n_sub_samples).astype(G_H_DTYPE) hist_ghc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) _build_histogram(0, n_bins, sample_indices, binned_feature, ordered_gradients, ordered_hessians, hist_ghc) @@ -102,11 +102,11 @@ def test_unrolled_equivalent_to_naive(constant_hessian): n_bins = 5 sample_indices = np.arange(n_samples).astype(np.uint32) binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=np.uint8) - ordered_gradients = rng.randn(n_samples).astype(Y_DTYPE) + ordered_gradients = rng.randn(n_samples).astype(G_H_DTYPE) if constant_hessian: - ordered_hessians = np.ones(n_samples, dtype=Y_DTYPE) + ordered_hessians = np.ones(n_samples, dtype=G_H_DTYPE) else: - ordered_hessians = rng.lognormal(size=n_samples).astype(Y_DTYPE) + ordered_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE) hist_gc_root = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) hist_ghc_root = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) @@ -148,11 +148,11 @@ def test_hist_subtraction(constant_hessian): n_bins = 5 sample_indices = np.arange(n_samples).astype(np.uint32) binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=np.uint8) - ordered_gradients = rng.randn(n_samples).astype(Y_DTYPE) + ordered_gradients = rng.randn(n_samples).astype(G_H_DTYPE) if constant_hessian: - ordered_hessians = np.ones(n_samples, dtype=Y_DTYPE) + ordered_hessians = np.ones(n_samples, dtype=G_H_DTYPE) else: - ordered_hessians = rng.lognormal(size=n_samples).astype(Y_DTYPE) + ordered_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE) hist_parent = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) if constant_hessian: diff --git a/sklearn/_fast_gradient_boosting/tests/test_loss.py b/sklearn/_fast_gradient_boosting/tests/test_loss.py index beeccb2eb432d..4034328454578 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_loss.py +++ 
b/sklearn/_fast_gradient_boosting/tests/test_loss.py @@ -7,6 +7,7 @@ from sklearn._fast_gradient_boosting.loss import _LOSSES from sklearn._fast_gradient_boosting.types import Y_DTYPE +from sklearn._fast_gradient_boosting.types import G_H_DTYPE def get_derivatives_helper(loss): @@ -16,8 +17,8 @@ def get_derivatives_helper(loss): def get_gradients(y_true, raw_predictions): # create gradients and hessians array, update inplace, and return shape = raw_predictions.shape[0] * raw_predictions.shape[1] - gradients = np.empty(shape=shape, dtype=Y_DTYPE) - hessians = np.empty(shape=shape, dtype=Y_DTYPE) + gradients = np.empty(shape=shape, dtype=G_H_DTYPE) + hessians = np.empty(shape=shape, dtype=G_H_DTYPE) loss.update_gradients_and_hessians(gradients, hessians, y_true, raw_predictions) @@ -29,8 +30,8 @@ def get_gradients(y_true, raw_predictions): def get_hessians(y_true, raw_predictions): # create gradients and hessians array, update inplace, and return shape = raw_predictions.shape[0] * raw_predictions.shape[1] - gradients = np.empty(shape=shape, dtype=Y_DTYPE) - hessians = np.empty(shape=shape, dtype=Y_DTYPE) + gradients = np.empty(shape=shape, dtype=G_H_DTYPE) + hessians = np.empty(shape=shape, dtype=G_H_DTYPE) loss.update_gradients_and_hessians(gradients, hessians, y_true, raw_predictions) diff --git a/sklearn/_fast_gradient_boosting/tests/test_predictor.py b/sklearn/_fast_gradient_boosting/tests/test_predictor.py index 9ee07a2adf439..e31c639c09dbe 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_predictor.py +++ b/sklearn/_fast_gradient_boosting/tests/test_predictor.py @@ -6,7 +6,7 @@ from sklearn._fast_gradient_boosting.binning import BinMapper from sklearn._fast_gradient_boosting.grower import TreeGrower -from sklearn._fast_gradient_boosting.types import Y_DTYPE +from sklearn._fast_gradient_boosting.types import G_H_DTYPE @pytest.mark.parametrize('max_bins', [200, 256]) @@ -19,8 +19,8 @@ def test_boston_dataset(max_bins): X_train_binned = mapper.fit_transform(X_train) # Init gradients and hessians to that of least squares loss - gradients = -y_train.astype(Y_DTYPE) - hessians = np.ones(1, dtype=Y_DTYPE) + gradients = -y_train.astype(G_H_DTYPE) + hessians = np.ones(1, dtype=G_H_DTYPE) min_samples_leaf = 8 max_leaf_nodes = 31 diff --git a/sklearn/_fast_gradient_boosting/tests/test_splitting.py b/sklearn/_fast_gradient_boosting/tests/test_splitting.py index 35bb621a94f1c..8307475db415a 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_splitting.py +++ b/sklearn/_fast_gradient_boosting/tests/test_splitting.py @@ -4,7 +4,7 @@ import pytest from sklearn._fast_gradient_boosting.types import HISTOGRAM_DTYPE -from sklearn._fast_gradient_boosting.types import Y_DTYPE +from sklearn._fast_gradient_boosting.types import G_H_DTYPE from sklearn._fast_gradient_boosting.types import X_BINNED_DTYPE from sklearn._fast_gradient_boosting.splitting import Splitter @@ -21,14 +21,14 @@ def test_histogram_split(n_bins): rng.randint(0, n_bins, size=(int(1e4), 2)), dtype=X_BINNED_DTYPE) binned_feature = X_binned.T[feature_idx] sample_indices = np.arange(binned_feature.shape[0], dtype=np.uint32) - ordered_hessians = np.ones_like(binned_feature, dtype=Y_DTYPE) + ordered_hessians = np.ones_like(binned_feature, dtype=G_H_DTYPE) all_hessians = ordered_hessians sum_hessians = all_hessians.sum() for true_bin in range(1, n_bins - 1): for sign in [-1, 1]: ordered_gradients = np.full_like(binned_feature, sign, - dtype=Y_DTYPE) + dtype=G_H_DTYPE) ordered_gradients[binned_feature <= true_bin] *= -1 all_gradients = 
ordered_gradients sum_gradients = all_gradients.sum() @@ -77,11 +77,11 @@ def test_split_vs_split_subtraction(constant_hessian): dtype=X_BINNED_DTYPE) X_binned = np.asfortranarray(X_binned) sample_indices = np.arange(n_samples, dtype=np.uint32) - all_gradients = rng.randn(n_samples).astype(Y_DTYPE) + all_gradients = rng.randn(n_samples).astype(G_H_DTYPE) if constant_hessian: - all_hessians = np.ones(1, dtype=Y_DTYPE) + all_hessians = np.ones(1, dtype=G_H_DTYPE) else: - all_hessians = rng.lognormal(size=n_samples).astype(Y_DTYPE) + all_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE) n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) @@ -163,11 +163,11 @@ def test_gradient_and_hessian_sanity(constant_hessian): dtype=X_BINNED_DTYPE) X_binned = np.asfortranarray(X_binned) sample_indices = np.arange(n_samples, dtype=np.uint32) - all_gradients = rng.randn(n_samples).astype(Y_DTYPE) + all_gradients = rng.randn(n_samples).astype(G_H_DTYPE) if constant_hessian: - all_hessians = np.ones(1, dtype=Y_DTYPE) + all_hessians = np.ones(1, dtype=G_H_DTYPE) else: - all_hessians = rng.lognormal(size=n_samples).astype(Y_DTYPE) + all_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE) n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) @@ -270,8 +270,8 @@ def test_split_indices(): [0, 4]] X_binned = np.asfortranarray(X_binned, dtype=X_BINNED_DTYPE) sample_indices = np.arange(n_samples, dtype=np.uint32) - all_gradients = rng.randn(n_samples).astype(Y_DTYPE) - all_hessians = np.ones(1, dtype=Y_DTYPE) + all_gradients = rng.randn(n_samples).astype(G_H_DTYPE) + all_hessians = np.ones(1, dtype=G_H_DTYPE) n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) @@ -322,8 +322,8 @@ def test_min_gain_to_split(): rng.randint(0, n_bins, size=(n_samples, 1)), dtype=X_BINNED_DTYPE) binned_feature = X_binned[:, 0] sample_indices = np.arange(n_samples, dtype=np.uint32) - all_hessians = np.ones_like(binned_feature, dtype=Y_DTYPE) - all_gradients = np.ones_like(binned_feature, dtype=Y_DTYPE) + all_hessians = np.ones_like(binned_feature, dtype=G_H_DTYPE) + all_gradients = np.ones_like(binned_feature, dtype=G_H_DTYPE) n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) diff --git a/sklearn/_fast_gradient_boosting/types.pxd b/sklearn/_fast_gradient_boosting/types.pxd index d614df001bb1c..1dd1fbee4273c 100644 --- a/sklearn/_fast_gradient_boosting/types.pxd +++ b/sklearn/_fast_gradient_boosting/types.pxd @@ -6,6 +6,7 @@ cimport numpy as np ctypedef np.npy_float64 X_DTYPE_C ctypedef np.npy_uint8 X_BINNED_DTYPE_C ctypedef np.npy_float64 Y_DTYPE_C +ctypedef np.npy_float32 G_H_DTYPE_C cdef packed struct hist_struct: # Same as histogram dtype but we need a struct to declare views. It needs diff --git a/sklearn/_fast_gradient_boosting/types.pyx b/sklearn/_fast_gradient_boosting/types.pyx index fe2345b3df994..e13b5320bad32 100644 --- a/sklearn/_fast_gradient_boosting/types.pyx +++ b/sklearn/_fast_gradient_boosting/types.pyx @@ -1,11 +1,13 @@ import numpy as np # Y_DYTPE is the dtype to which the targets y are converted to. This is also -# the dtype for gradients, hessians, leaf values, etc. because they are all -# homogeneous to a target. +# dtype for leaf values, gains, and sums of gradients / hessians. The gradients +# and hessians arrays are stored as floats to avoid using too much memory. 
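Note: storing gradients and hessians as ``float32`` (``G_H_DTYPE``) while keeping sums, gains and leaf values in ``float64`` (``Y_DTYPE``) halves the memory taken by the two largest per-sample arrays. For a rough sense of the saving, with 10 million samples and a single-output loss::

    import numpy as np

    n_samples = 10 ** 7
    for dtype in (np.float32, np.float64):
        mb = 2 * n_samples * np.dtype(dtype).itemsize / 1e6
        print(dtype.__name__, mb, 'MB')  # ~80 MB vs ~160 MB for both arrays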
Y_DTYPE = np.float64 X_DTYPE = np.float64 X_BINNED_DTYPE = np.uint8 # hence max_bins == 256 +# dtypes for gradients and hessians arrays +G_H_DTYPE = np.float32 HISTOGRAM_DTYPE = np.dtype([ ('sum_gradients', Y_DTYPE), # sum of sample gradients in bin diff --git a/sklearn/_fast_gradient_boosting/utils.pyx b/sklearn/_fast_gradient_boosting/utils.pyx index 9b594c5beec06..2c0bd4c865c78 100644 --- a/sklearn/_fast_gradient_boosting/utils.pyx +++ b/sklearn/_fast_gradient_boosting/utils.pyx @@ -7,6 +7,7 @@ from cython.parallel import prange from .binning import BinMapper +from .types cimport G_H_DTYPE_C from .types cimport Y_DTYPE_C @@ -71,7 +72,7 @@ def get_lightgbm_estimator(pygbm_estimator): return Est(**lgbm_params) -def sum_parallel(Y_DTYPE_C [:] array): +def sum_parallel(G_H_DTYPE_C [:] array): cdef: Y_DTYPE_C out = 0. From 92dfe9df0abb1c79b92133061e376d7213cfc1bd Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 30 Jan 2019 11:14:51 -0500 Subject: [PATCH 101/247] More explicit names for sums of gradients and hessians in SplitInfo --- sklearn/_fast_gradient_boosting/grower.py | 16 +-- sklearn/_fast_gradient_boosting/splitting.pyx | 115 +++++++++--------- .../tests/test_splitting.py | 41 ++++--- 3 files changed, 89 insertions(+), 83 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/grower.py b/sklearn/_fast_gradient_boosting/grower.py index 80b3802fd6bdc..21c52a05376d9 100644 --- a/sklearn/_fast_gradient_boosting/grower.py +++ b/sklearn/_fast_gradient_boosting/grower.py @@ -315,11 +315,11 @@ def _compute_spittability(self, node, only_hist=False): dtype=HISTOGRAM_DTYPE) if node.hist_subtraction: if node is node.parent.right_child: - sum_gradients = node.parent.split_info.gradient_right - sum_hessians = node.parent.split_info.hessian_right + sum_gradients = node.parent.split_info.sum_gradient_right + sum_hessians = node.parent.split_info.sum_hessian_right else: - sum_gradients = node.parent.split_info.gradient_left - sum_hessians = node.parent.split_info.hessian_left + sum_gradients = node.parent.split_info.sum_gradient_left + sum_hessians = node.parent.split_info.sum_hessian_left split_info = self.splitter.find_node_split_subtraction( node.sample_indices, sum_gradients, sum_hessians, node.parent.histograms, @@ -379,13 +379,13 @@ def split_next(self): left_child_node = TreeNode(depth, sample_indices_left, - node.split_info.gradient_left, - node.split_info.hessian_left, + node.split_info.sum_gradient_left, + node.split_info.sum_hessian_left, parent=node) right_child_node = TreeNode(depth, sample_indices_right, - node.split_info.gradient_right, - node.split_info.hessian_right, + node.split_info.sum_gradient_right, + node.split_info.sum_hessian_right, parent=node) left_child_node.sibling = right_child_node right_child_node.sibling = left_child_node diff --git a/sklearn/_fast_gradient_boosting/splitting.pyx b/sklearn/_fast_gradient_boosting/splitting.pyx index 75f1a17f8c5b9..1c91d80e1a490 100644 --- a/sklearn/_fast_gradient_boosting/splitting.pyx +++ b/sklearn/_fast_gradient_boosting/splitting.pyx @@ -46,10 +46,10 @@ cdef struct split_info_struct: Y_DTYPE_C gain int feature_idx unsigned int bin_idx - Y_DTYPE_C gradient_left - Y_DTYPE_C gradient_right - Y_DTYPE_C hessian_left - Y_DTYPE_C hessian_right + Y_DTYPE_C sum_gradient_left + Y_DTYPE_C sum_gradient_right + Y_DTYPE_C sum_hessian_left + Y_DTYPE_C sum_hessian_right unsigned int n_samples_left unsigned int n_samples_right @@ -66,13 +66,13 @@ cdef class SplitInfo: The index of the feature to be split bin_idx : int The index of the bin 
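Note on ``find_node_split_subtraction`` and ``_subtract_histograms``, used throughout this series: since every sample of a node goes to exactly one of its two children, a child's histogram can be obtained by subtracting its sibling's histogram from the parent's, bin by bin and field by field, instead of re-scanning the data. A sketch with plain NumPy structured arrays, using the histogram fields defined in ``types.pyx``::

    import numpy as np

    HISTOGRAM_DTYPE = np.dtype([('sum_gradients', np.float64),
                                ('sum_hessians', np.float64),
                                ('count', np.uint32)])

    def subtract_histograms(parent, sibling):
        # hist(child) = hist(parent) - hist(sibling), computed per field.
        child = np.empty_like(parent)
        for field in ('sum_gradients', 'sum_hessians', 'count'):
            child[field] = parent[field] - sibling[field]
        return child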
on which the split is made - gradient_left : float + sum_gradient_left : float The sum of the gradients of all the samples in the left child - hessian_left : float + sum_hessian_left : float The sum of the hessians of all the samples in the left child - gradient_right : float + sum_gradient_right : float The sum of the gradients of all the samples in the right child - hessian_right : float + sum_hessian_right : float The sum of the hessians of all the samples in the right child n_samples_left : int The number of samples in the left child @@ -83,25 +83,25 @@ cdef class SplitInfo: Y_DTYPE_C gain int feature_idx unsigned int bin_idx - Y_DTYPE_C gradient_left - Y_DTYPE_C gradient_right - Y_DTYPE_C hessian_left - Y_DTYPE_C hessian_right + Y_DTYPE_C sum_gradient_left + Y_DTYPE_C sum_gradient_right + Y_DTYPE_C sum_hessian_left + Y_DTYPE_C sum_hessian_right unsigned int n_samples_left unsigned int n_samples_right def __init__(self, Y_DTYPE_C gain=-1., int feature_idx=0, unsigned - int bin_idx=0, Y_DTYPE_C gradient_left=0., Y_DTYPE_C - hessian_left=0., Y_DTYPE_C gradient_right=0., Y_DTYPE_C - hessian_right=0., unsigned int n_samples_left=0, unsigned - int n_samples_right=0): + int bin_idx=0, Y_DTYPE_C sum_gradient_left=0., Y_DTYPE_C + sum_hessian_left=0., Y_DTYPE_C sum_gradient_right=0., + Y_DTYPE_C sum_hessian_right=0., unsigned int + n_samples_left=0, unsigned int n_samples_right=0): self.gain = gain self.feature_idx = feature_idx self.bin_idx = bin_idx - self.gradient_left = gradient_left - self.hessian_left = hessian_left - self.gradient_right = gradient_right - self.hessian_right = hessian_right + self.sum_gradient_left = sum_gradient_left + self.sum_hessian_left = sum_hessian_left + self.sum_gradient_right = sum_gradient_right + self.sum_hessian_right = sum_hessian_right self.n_samples_left = n_samples_left self.n_samples_right = n_samples_right @@ -443,10 +443,10 @@ cdef class Splitter: split_info.gain, split_info.feature_idx, split_info.bin_idx, - split_info.gradient_left, - split_info.hessian_left, - split_info.gradient_right, - split_info.hessian_right, + split_info.sum_gradient_left, + split_info.sum_hessian_left, + split_info.sum_gradient_right, + split_info.sum_hessian_right, split_info.n_samples_left, split_info.n_samples_right, ) @@ -573,10 +573,10 @@ cdef class Splitter: split_info.gain, split_info.feature_idx, split_info.bin_idx, - split_info.gradient_left, - split_info.hessian_left, - split_info.gradient_right, - split_info.hessian_right, + split_info.sum_gradient_left, + split_info.sum_hessian_left, + split_info.sum_gradient_right, + split_info.sum_hessian_right, split_info.n_samples_left, split_info.n_samples_right, ) @@ -617,15 +617,15 @@ cdef class Splitter: unsigned int n_samples_left unsigned int n_samples_right unsigned int n_samples_ = n_samples - Y_DTYPE_C hessian_left - Y_DTYPE_C hessian_right - Y_DTYPE_C gradient_left - Y_DTYPE_C gradient_right + Y_DTYPE_C sum_hessian_left + Y_DTYPE_C sum_hessian_right + Y_DTYPE_C sum_gradient_left + Y_DTYPE_C sum_gradient_right Y_DTYPE_C gain split_info_struct best_split best_split.gain = -1. - gradient_left, hessian_left = 0., 0. + sum_gradient_left, sum_hessian_left = 0., 0. 
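For reference, a pure-Python sketch of the scan that the Cython loop below performs for one feature: bins are swept left to right, the left-child sums are accumulated from the histogram, the right-child sums are obtained by subtraction, and the gain of each candidate split is compared. Illustrative only: negative_loss is assumed here to be the usual sum_gradients**2 / (sum_hessians + l2) term, and the min_samples_leaf / min_hessian_to_split constraints are reduced to a single early exit.

    import numpy as np

    def negative_loss(sum_gradients, sum_hessians, l2):
        # assumed form of the helper called by _split_gain
        return sum_gradients ** 2 / (sum_hessians + l2)

    def find_best_bin(hist_gradients, hist_hessians, sum_gradients, sum_hessians, l2):
        best_gain, best_bin_idx = -1., None
        sum_gradient_left = sum_hessian_left = 0.
        for bin_idx in range(len(hist_gradients)):
            sum_gradient_left += hist_gradients[bin_idx]
            sum_hessian_left += hist_hessians[bin_idx]
            sum_gradient_right = sum_gradients - sum_gradient_left
            sum_hessian_right = sum_hessians - sum_hessian_left
            if sum_hessian_right <= 0:
                break  # nothing left on the right side, stop early
            gain = (negative_loss(sum_gradient_left, sum_hessian_left, l2)
                    + negative_loss(sum_gradient_right, sum_hessian_right, l2)
                    - negative_loss(sum_gradients, sum_hessians, l2))
            if gain > best_gain:
                best_gain, best_bin_idx = gain, bin_idx
        return best_bin_idx, best_gain

    hist_g = np.array([1.5, -2., .5])
    hist_h = np.array([2., 3., 1.])
    print(find_best_bin(hist_g, hist_h, hist_g.sum(), hist_h.sum(), l2=0.01))  # (0, ...)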
n_samples_left = 0 for bin_idx in range(self.n_bins_per_feature[feature_idx]): @@ -633,13 +633,14 @@ cdef class Splitter: n_samples_right = n_samples_ - n_samples_left if self.hessians_are_constant: - hessian_left += histograms[feature_idx, bin_idx].count + sum_hessian_left += histograms[feature_idx, bin_idx].count else: - hessian_left += histograms[feature_idx, bin_idx].sum_hessians - hessian_right = sum_hessians - hessian_left + sum_hessian_left += \ + histograms[feature_idx, bin_idx].sum_hessians + sum_hessian_right = sum_hessians - sum_hessian_left - gradient_left += histograms[feature_idx, bin_idx].sum_gradients - gradient_right = sum_gradients - gradient_left + sum_gradient_left += histograms[feature_idx, bin_idx].sum_gradients + sum_gradient_right = sum_gradients - sum_gradient_left if n_samples_left < self.min_samples_leaf: continue @@ -647,14 +648,14 @@ cdef class Splitter: # won't get any better break - if hessian_left < self.min_hessian_to_split: + if sum_hessian_left < self.min_hessian_to_split: continue - if hessian_right < self.min_hessian_to_split: + if sum_hessian_right < self.min_hessian_to_split: # won't get any better (hessians are > 0 since loss is convex) break - gain = _split_gain(gradient_left, hessian_left, - gradient_right, hessian_right, + gain = _split_gain(sum_gradient_left, sum_hessian_left, + sum_gradient_right, sum_hessian_right, sum_gradients, sum_hessians, self.l2_regularization) @@ -662,10 +663,10 @@ cdef class Splitter: best_split.gain = gain best_split.feature_idx = feature_idx best_split.bin_idx = bin_idx - best_split.gradient_left = gradient_left - best_split.gradient_right = gradient_right - best_split.hessian_left = hessian_left - best_split.hessian_right = hessian_right + best_split.sum_gradient_left = sum_gradient_left + best_split.sum_gradient_right = sum_gradient_right + best_split.sum_hessian_left = sum_hessian_left + best_split.sum_hessian_right = sum_hessian_right best_split.n_samples_left = n_samples_left best_split.n_samples_right = n_samples_right @@ -691,20 +692,20 @@ cdef class Splitter: split_info.gain, split_info.feature_idx, split_info.bin_idx, - split_info.gradient_left, - split_info.hessian_left, - split_info.gradient_right, - split_info.hessian_right, + split_info.sum_gradient_left, + split_info.sum_hessian_left, + split_info.sum_gradient_right, + split_info.sum_hessian_right, split_info.n_samples_left, split_info.n_samples_right, ) cdef inline Y_DTYPE_C _split_gain( - Y_DTYPE_C gradient_left, - Y_DTYPE_C hessian_left, - Y_DTYPE_C gradient_right, - Y_DTYPE_C hessian_right, + Y_DTYPE_C sum_gradient_left, + Y_DTYPE_C sum_hessian_left, + Y_DTYPE_C sum_gradient_right, + Y_DTYPE_C sum_hessian_right, Y_DTYPE_C sum_gradients, Y_DTYPE_C sum_hessians, Y_DTYPE_C l2_regularization) nogil: @@ -719,8 +720,10 @@ cdef inline Y_DTYPE_C _split_gain( """ cdef: Y_DTYPE_C gain - gain = negative_loss(gradient_left, hessian_left, l2_regularization) - gain += negative_loss(gradient_right, hessian_right, l2_regularization) + gain = negative_loss(sum_gradient_left, sum_hessian_left, + l2_regularization) + gain += negative_loss(sum_gradient_right, sum_hessian_right, + l2_regularization) gain -= negative_loss(sum_gradients, sum_hessians, l2_regularization) return gain diff --git a/sklearn/_fast_gradient_boosting/tests/test_splitting.py b/sklearn/_fast_gradient_boosting/tests/test_splitting.py index 8307475db415a..d03a49c51b4c4 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_splitting.py +++ b/sklearn/_fast_gradient_boosting/tests/test_splitting.py @@ 
-54,7 +54,7 @@ def test_histogram_split(n_bins): assert (split_info.n_samples_left + split_info.n_samples_right == sample_indices.shape[0]) # Constant hessian: 1. per sample. - assert split_info.n_samples_left == split_info.hessian_left + assert split_info.n_samples_left == split_info.sum_hessian_left @pytest.mark.parametrize('constant_hessian', [True, False]) @@ -106,13 +106,13 @@ def test_split_vs_split_subtraction(constant_hessian): # split left with subtraction method si_left_sub = splitter.find_node_split_subtraction( - sample_indices_left, si_parent.gradient_left, - si_parent.hessian_left, hists_parent, hists_right, hists_left_sub) + sample_indices_left, si_parent.sum_gradient_left, + si_parent.sum_hessian_left, hists_parent, hists_right, hists_left_sub) # split right with subtraction method si_right_sub = splitter.find_node_split_subtraction( - sample_indices_right, si_parent.gradient_right, - si_parent.hessian_right, hists_parent, hists_left, hists_right_sub) + sample_indices_right, si_parent.sum_gradient_right, + si_parent.sum_hessian_right, hists_parent, hists_left, hists_right_sub) # make sure histograms from classical and subtraction method are the same for hists, hists_sub in ((hists_left, hists_left_sub), @@ -125,19 +125,22 @@ def test_split_vs_split_subtraction(constant_hessian): for si, si_sub in ((si_left, si_left_sub), (si_right, si_right_sub)): assert_almost_equal(si.gain, si_sub.gain, decimal=3) assert_almost_equal(si.feature_idx, si_sub.feature_idx, decimal=3) - assert_almost_equal(si.gradient_left, si_sub.gradient_left, decimal=3) - assert_almost_equal(si.gradient_right, si_sub.gradient_right, + assert_almost_equal(si.sum_gradient_left, si_sub.sum_gradient_left, + decimal=3) + assert_almost_equal(si.sum_gradient_right, si_sub.sum_gradient_right, + decimal=3) + assert_almost_equal(si.sum_hessian_right, si_sub.sum_hessian_right, + decimal=3) + assert_almost_equal(si.sum_hessian_left, si_sub.sum_hessian_left, decimal=3) - assert_almost_equal(si.hessian_right, si_sub.hessian_right, decimal=3) - assert_almost_equal(si.hessian_left, si_sub.hessian_left, decimal=3) @pytest.mark.parametrize('constant_hessian', [True, False]) def test_gradient_and_hessian_sanity(constant_hessian): # This test checks that the values of gradients and hessians are # consistent in different places: - # - in split_info: si.gradient_left + si.gradient_right must be equal to - # the gradient at the node. Same for hessians. + # - in split_info: si.sum_gradient_left + si.sum_gradient_right must be + # equal to the gradient at the node. Same for hessians. # - in the histograms: summing 'sum_gradients' over the bins must be # constant across all features, and those sums must be equal to the # node's gradient. Same for hessians. 
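The first invariant above is straightforward to state in plain NumPy; a tiny standalone illustration with random data and a hypothetical split mask:

    import numpy as np

    rng = np.random.RandomState(42)
    all_gradients = rng.randn(1000).astype(np.float32)
    in_left_child = rng.rand(1000) < .3   # hypothetical split
    sum_left = all_gradients[in_left_child].sum(dtype=np.float64)
    sum_right = all_gradients[~in_left_child].sum(dtype=np.float64)
    assert np.isclose(sum_left + sum_right, all_gradients.sum(dtype=np.float64))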
@@ -194,25 +197,25 @@ def test_gradient_and_hessian_sanity(constant_hessian): # split left with subtraction method si_left_sub = splitter.find_node_split_subtraction( - sample_indices_left, si_parent.gradient_left, - si_parent.hessian_left, hists_parent, hists_right, hists_left_sub) + sample_indices_left, si_parent.sum_gradient_left, + si_parent.sum_hessian_left, hists_parent, hists_right, hists_left_sub) # split right with subtraction method si_right_sub = splitter.find_node_split_subtraction( - sample_indices_right, si_parent.gradient_right, - si_parent.hessian_right, hists_parent, hists_left, hists_right_sub) + sample_indices_right, si_parent.sum_gradient_right, + si_parent.sum_hessian_right, hists_parent, hists_left, hists_right_sub) - # make sure that si.gradient_left + si.gradient_right have their expected - # value, same for hessians + # make sure that si.sum_gradient_left + si.sum_gradient_right have their + # expected value, same for hessians for si, indices in ( (si_parent, sample_indices), (si_left, sample_indices_left), (si_left_sub, sample_indices_left), (si_right, sample_indices_right), (si_right_sub, sample_indices_right)): - gradient = si.gradient_right + si.gradient_left + gradient = si.sum_gradient_right + si.sum_gradient_left expected_gradient = all_gradients[indices].sum() - hessian = si.hessian_right + si.hessian_left + hessian = si.sum_hessian_right + si.sum_hessian_left if constant_hessian: expected_hessian = indices.shape[0] * all_hessians[0] else: From 170a5e1f7d21c5a0c329bcb8ac6f1e654bbc1777 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 4 Feb 2019 13:37:25 -0500 Subject: [PATCH 102/247] first round of comments --- .../gradient_boosting.py | 2 +- sklearn/_fast_gradient_boosting/loss.pyx | 1 + sklearn/_fast_gradient_boosting/splitting.pyx | 2 +- .../tests/test_gradient_boosting.py | 113 ++++++++---------- .../tests/test_grower.py | 43 +++---- .../tests/test_splitting.py | 2 +- 6 files changed, 73 insertions(+), 90 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index 6b784390ab42b..72180f0374d8c 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -330,7 +330,7 @@ def _get_scores(self, X, y): -loss_value. """ - if not isinstance(self.scoring, str) and self.scoring != 'loss': + if self.scoring != 'loss': return self.scorer_(self, X, y) # Else, use loss diff --git a/sklearn/_fast_gradient_boosting/loss.pyx b/sklearn/_fast_gradient_boosting/loss.pyx index a4ebf3e01f986..8e6509046255e 100644 --- a/sklearn/_fast_gradient_boosting/loss.pyx +++ b/sklearn/_fast_gradient_boosting/loss.pyx @@ -130,6 +130,7 @@ class LeastSquares(BaseLoss): def get_baseline_prediction(self, y_train, prediction_dim): return np.mean(y_train).astype(Y_DTYPE) + @staticmethod def inverse_link_function(self, raw_predictions): return raw_predictions diff --git a/sklearn/_fast_gradient_boosting/splitting.pyx b/sklearn/_fast_gradient_boosting/splitting.pyx index 1c91d80e1a490..e699bc5d5b461 100644 --- a/sklearn/_fast_gradient_boosting/splitting.pyx +++ b/sklearn/_fast_gradient_boosting/splitting.pyx @@ -674,7 +674,7 @@ cdef class Splitter: # Only used for tests (python code cannot use cdef types) # Not sure if this is a good practice... 
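The test updates further below replace assert_raises_regex with the pytest.raises context manager; the general pattern, on a generic example unrelated to these estimators:

    import pytest

    def set_learning_rate(learning_rate):
        if learning_rate <= 0:
            raise ValueError('learning_rate={} must be strictly positive'
                             .format(learning_rate))

    with pytest.raises(ValueError, match='must be strictly positive'):
        set_learning_rate(0)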
- def find_best_split_wrapper( + def _find_best_split_wrapper( self, int feature_idx, unsigned int [::1] sample_indices, diff --git a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py index 131f1204d186e..c7d39d9c72816 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py @@ -1,5 +1,4 @@ import pytest -from sklearn.utils.testing import assert_raises_regex from sklearn.datasets import make_classification, make_regression from sklearn.utils.estimator_checks import check_estimator @@ -17,76 +16,66 @@ ]) def test_init_parameters_validation(GradientBoosting, X, y): - assert_raises_regex( - ValueError, - "Loss blah is not supported for", - GradientBoosting(loss='blah').fit, X, y - ) + with pytest.raises( + ValueError, + match="Loss blah is not supported for"): + GradientBoosting(loss='blah').fit(X, y) for learning_rate in (-1, 0): - assert_raises_regex( + with pytest.raises( + ValueError, + match="learning_rate={} must be strictly positive".format( + learning_rate)): + GradientBoosting(learning_rate=learning_rate).fit(X, y) + + with pytest.raises( ValueError, - "learning_rate={} must be strictly positive".format(learning_rate), - GradientBoosting(learning_rate=learning_rate).fit, X, y - ) - - assert_raises_regex( - ValueError, - "n_estimators=0 must not be smaller than 1", - GradientBoosting(n_estimators=0).fit, X, y - ) - - assert_raises_regex( - ValueError, - "max_leaf_nodes=0 should not be smaller than 1", - GradientBoosting(max_leaf_nodes=0).fit, X, y - ) - - assert_raises_regex( - ValueError, - "max_depth=0 should not be smaller than 1", - GradientBoosting(max_depth=0).fit, X, y - ) - - assert_raises_regex( - ValueError, - "min_samples_leaf=0 should not be smaller than 1", - GradientBoosting(min_samples_leaf=0).fit, X, y - ) - - assert_raises_regex( - ValueError, - "l2_regularization=-1 must be positive", - GradientBoosting(l2_regularization=-1).fit, X, y - ) + match="n_estimators=0 must not be smaller than 1"): + GradientBoosting(n_estimators=0).fit(X, y) - for max_bins in (1, 257): - assert_raises_regex( + with pytest.raises( + ValueError, + match="max_leaf_nodes=0 should not be smaller than 1"): + GradientBoosting(max_leaf_nodes=0).fit(X, y) + + with pytest.raises( + ValueError, + match="max_depth=0 should not be smaller than 1"): + GradientBoosting(max_depth=0).fit(X, y) + + with pytest.raises( ValueError, - "max_bins={} should be no smaller than 2 and no larger".format( - max_bins), - GradientBoosting(max_bins=max_bins).fit, X, y - ) + match="min_samples_leaf=0 should not be smaller than 1"): + GradientBoosting(min_samples_leaf=0).fit(X, y) - assert_raises_regex( - ValueError, - "n_iter_no_change=-1 must be positive", - GradientBoosting(n_iter_no_change=-1).fit, X, y - ) + with pytest.raises( + ValueError, + match="l2_regularization=-1 must be positive"): + GradientBoosting(l2_regularization=-1).fit(X, y) + + for max_bins in (1, 257): + with pytest.raises( + ValueError, + match="max_bins={} should be no smaller than 2 and " + "no larger".format(max_bins)): + GradientBoosting(max_bins=max_bins).fit(X, y) + + with pytest.raises( + ValueError, + match="n_iter_no_change=-1 must be positive"): + GradientBoosting(n_iter_no_change=-1).fit(X, y) for validation_fraction in (-1, 0): - assert_raises_regex( + with pytest.raises( + ValueError, + match="validation_fraction={} must be strictly positive".format( + 
validation_fraction)): + GradientBoosting(validation_fraction=validation_fraction).fit(X, y) + + with pytest.raises( ValueError, - "validation_fraction={} must be strictly positive".format( - validation_fraction), - GradientBoosting(validation_fraction=validation_fraction).fit, X, y - ) - - assert_raises_regex( - ValueError, - "tol=-1 must not be smaller than 0", - GradientBoosting(tol=-1).fit, X, y - ) + match="tol=-1 must not be smaller than 0"): + GradientBoosting(tol=-1).fit(X, y) @pytest.mark.parametrize( diff --git a/sklearn/_fast_gradient_boosting/tests/test_grower.py b/sklearn/_fast_gradient_boosting/tests/test_grower.py index 0432598478a21..f5024e3bb6594 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_grower.py +++ b/sklearn/_fast_gradient_boosting/tests/test_grower.py @@ -3,7 +3,6 @@ import pytest from pytest import approx -from sklearn.utils.testing import assert_raises_regex from sklearn._fast_gradient_boosting.grower import TreeGrower from sklearn._fast_gradient_boosting.binning import BinMapper from sklearn._fast_gradient_boosting.types import X_BINNED_DTYPE @@ -267,29 +266,23 @@ def test_init_parameters_validation(): X_binned, all_gradients, all_hessians = _make_training_data() X_binned_float = X_binned.astype(np.float32) - assert_raises_regex( - NotImplementedError, - "Explicit feature binning required for now", - TreeGrower, X_binned_float, all_gradients, all_hessians - ) + with pytest.raises(NotImplementedError, + match="Explicit feature binning required for now"): + TreeGrower(X_binned_float, all_gradients, all_hessians) X_binned_C_array = np.ascontiguousarray(X_binned) - assert_raises_regex( - ValueError, - "X_binned should be passed as Fortran contiguous array", - TreeGrower, X_binned_C_array, all_gradients, all_hessians - ) - - assert_raises_regex( - ValueError, - "min_gain_to_split=-1 must be positive", - TreeGrower, X_binned, all_gradients, all_hessians, - min_gain_to_split=-1 - ) - - assert_raises_regex( - ValueError, - "min_hessian_to_split=-1 must be positive", - TreeGrower, X_binned, all_gradients, all_hessians, - min_hessian_to_split=-1 - ) + with pytest.raises( + ValueError, + match="X_binned should be passed as Fortran contiguous array"): + TreeGrower(X_binned_C_array, all_gradients, all_hessians) + + with pytest.raises(ValueError, + match="min_gain_to_split=-1 must be positive"): + + TreeGrower(X_binned, all_gradients, all_hessians, + min_gain_to_split=-1) + + with pytest.raises(ValueError, + match="min_hessian_to_split=-1 must be positive"): + TreeGrower(X_binned, all_gradients, all_hessians, + min_hessian_to_split=-1) diff --git a/sklearn/_fast_gradient_boosting/tests/test_splitting.py b/sklearn/_fast_gradient_boosting/tests/test_splitting.py index d03a49c51b4c4..a2ba8f1daa85f 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_splitting.py +++ b/sklearn/_fast_gradient_boosting/tests/test_splitting.py @@ -44,7 +44,7 @@ def test_histogram_split(n_bins): min_samples_leaf, min_gain_to_split) histograms = np.zeros(shape=(1, n_bins), dtype=HISTOGRAM_DTYPE) - split_info = splitter.find_best_split_wrapper( + split_info = splitter._find_best_split_wrapper( feature_idx, sample_indices, histograms, sum_gradients, sum_hessians) From d16ecff893b9e360933160d5f1ce727d27d1a68a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 4 Feb 2019 14:35:49 -0500 Subject: [PATCH 103/247] made raw_predictions a C-contiguous on the n_samples dimension --- .../_gradient_boosting.pyx | 4 +- .../gradient_boosting.py | 12 ++-- sklearn/_fast_gradient_boosting/loss.pyx | 67 
++++++++++--------- .../tests/test_loss.py | 10 +-- 4 files changed, 47 insertions(+), 46 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx b/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx index 2c1a3528ae409..786dcfc19aabd 100644 --- a/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx +++ b/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx @@ -13,7 +13,7 @@ from .types cimport Y_DTYPE_C def _update_raw_predictions( - Y_DTYPE_C [:] raw_predictions, # OUT + Y_DTYPE_C [::1] raw_predictions, # OUT grower): """Update raw_predictions with the predictions of the newest tree @@ -39,7 +39,7 @@ def _update_raw_predictions( cdef void _update_raw_predictions_helper( - Y_DTYPE_C [:] raw_predictions, # OUT + Y_DTYPE_C [::1] raw_predictions, # OUT const unsigned int [:] starts, const unsigned int [:] stops, const unsigned int [:] partition, diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index 72180f0374d8c..8e2f3c0c91ee8 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -169,7 +169,7 @@ def fit(self, X, y): self.baseline_prediction_ = self.loss_.get_baseline_prediction( y_train, self.n_trees_per_iteration_) raw_predictions = np.zeros( - shape=(n_samples, self.n_trees_per_iteration_), + shape=(self.n_trees_per_iteration_, n_samples), dtype=self.baseline_prediction_.dtype ) raw_predictions += self.baseline_prediction_ @@ -245,7 +245,7 @@ def fit(self, X, y): # Update raw_predictions with the predictions of the newly # created tree. tic_pred = time() - _update_raw_predictions(raw_predictions[:, k], grower) + _update_raw_predictions(raw_predictions[k, :], grower) toc_pred = time() acc_prediction_time += toc_pred - tic_pred @@ -394,7 +394,7 @@ def _raw_predict(self, X): is_binned = self._in_fit and X.dtype == X_BINNED_DTYPE n_samples = X.shape[0] raw_predictions = np.zeros( - shape=(n_samples, self.n_trees_per_iteration_), + shape=(self.n_trees_per_iteration_, n_samples), dtype=self.baseline_prediction_.dtype ) raw_predictions += self.baseline_prediction_ @@ -402,7 +402,7 @@ def _raw_predict(self, X): for k, estimator in enumerate(predictors_of_ith_iteration): predict = (estimator.predict_binned if is_binned else estimator.predict) - raw_predictions[:, k] += predict(X) + raw_predictions[k, :] += predict(X) return raw_predictions @@ -725,9 +725,9 @@ def decision_function(self, X): classes in multiclass classification. """ decision = self._raw_predict(X) - if decision.shape[1] == 1: + if decision.shape[0] == 1: decision = decision.ravel() - return decision + return decision.T def _encode_y(self, y): # encode classes into 0 ... n_classes - 1 and sets attributes classes_ diff --git a/sklearn/_fast_gradient_boosting/loss.pyx b/sklearn/_fast_gradient_boosting/loss.pyx index 8e6509046255e..f56db09bfb3ce 100644 --- a/sklearn/_fast_gradient_boosting/loss.pyx +++ b/sklearn/_fast_gradient_boosting/loss.pyx @@ -80,7 +80,7 @@ class BaseLoss(ABC): Returns ------- - baseline_prediction: float or array of shape (1, prediction_dim) + baseline_prediction: float or array of shape (prediction_dim, 1) The baseline prediction. 
""" pass @@ -131,7 +131,7 @@ class LeastSquares(BaseLoss): return np.mean(y_train).astype(Y_DTYPE) @staticmethod - def inverse_link_function(self, raw_predictions): + def inverse_link_function(raw_predictions): return raw_predictions def update_gradients_and_hessians(self, gradients, hessians, y_true, @@ -142,9 +142,9 @@ class LeastSquares(BaseLoss): cdef void _update_gradients_least_squares( - G_H_DTYPE_C [:] gradients, - const Y_DTYPE_C [:] y_true, - const Y_DTYPE_C [:] raw_predictions) nogil: + G_H_DTYPE_C [::1] gradients, + const Y_DTYPE_C [::1] y_true, + const Y_DTYPE_C [::1] raw_predictions) nogil: cdef: int n_samples int i @@ -206,10 +206,10 @@ class BinaryCrossEntropy(BaseLoss): cdef void _update_gradients_hessians_binary_crossentropy( - G_H_DTYPE_C [:] gradients, - G_H_DTYPE_C [:] hessians, - const Y_DTYPE_C [:] y_true, - const Y_DTYPE_C [:] raw_predictions) nogil: + G_H_DTYPE_C [::1] gradients, + G_H_DTYPE_C [::1] hessians, + const Y_DTYPE_C [::1] y_true, + const Y_DTYPE_C [::1] raw_predictions) nogil: cdef: int n_samples G_H_DTYPE_C gradient_abs @@ -234,21 +234,21 @@ class CategoricalCrossEntropy(BaseLoss): def __call__(self, y_true, raw_predictions, average=True): one_hot_true = np.zeros_like(raw_predictions) - prediction_dim = raw_predictions.shape[1] + prediction_dim = raw_predictions.shape[0] for k in range(prediction_dim): - one_hot_true[:, k] = (y_true == k) + one_hot_true[k, :] = (y_true == k) - loss = (logsumexp(raw_predictions, axis=1) - - (one_hot_true * raw_predictions).sum(axis=1)) + loss = (logsumexp(raw_predictions, axis=0) - + (one_hot_true * raw_predictions).sum(axis=0)) return loss.mean() if average else loss def get_baseline_prediction(self, y_train, prediction_dim): - init_value = np.zeros(shape=(1, prediction_dim), dtype=Y_DTYPE) + init_value = np.zeros(shape=(prediction_dim, 1), dtype=Y_DTYPE) eps = np.finfo(y_train.dtype).eps for k in range(prediction_dim): proba_kth_class = np.mean(y_train == k) proba_kth_class = np.clip(proba_kth_class, eps, 1 - eps) - init_value[:, k] += np.log(proba_kth_class) + init_value[k, :] += np.log(proba_kth_class) return init_value @@ -260,34 +260,35 @@ class CategoricalCrossEntropy(BaseLoss): def predict_proba(self, raw_predictions): # TODO: This could be done in parallel # compute softmax (using exp(log(softmax))) - return np.exp(raw_predictions - - logsumexp(raw_predictions, axis=1)[:, np.newaxis]) + proba = np.exp(raw_predictions - + logsumexp(raw_predictions, axis=0)[np.newaxis, :]) + return proba.T cdef void _update_gradients_hessians_categorical_crossentropy( - G_H_DTYPE_C [:] gradients, # shape (n_samples * prediction_dim,), OUT - G_H_DTYPE_C [:] hessians, # shape (n_samples * prediction_dim,), OUT - const Y_DTYPE_C [:] y_true, # shape (n_samples,), IN + G_H_DTYPE_C [::1] gradients, # shape (n_samples * prediction_dim,), OUT + G_H_DTYPE_C [::1] hessians, # shape (n_samples * prediction_dim,), OUT + const Y_DTYPE_C [::1] y_true, # shape (n_samples,), IN # shape (n_samples, n_tree_per_iter), IN - const Y_DTYPE_C [:, :] raw_predictions) nogil: + const Y_DTYPE_C [:, ::1] raw_predictions) nogil: cdef: int n_samples unsigned int prediction_dim unsigned int k int i Y_DTYPE_C p_k - G_H_DTYPE_C [:] gradients_at_k, - G_H_DTYPE_C [:] hessians_at_k, + G_H_DTYPE_C [::1] gradients_at_k, + G_H_DTYPE_C [::1] hessians_at_k, - n_samples = raw_predictions.shape[0] - prediction_dim = raw_predictions.shape[1] + prediction_dim = raw_predictions.shape[0] + n_samples = raw_predictions.shape[1] for k in range(prediction_dim): gradients_at_k = 
gradients[n_samples * k:n_samples * (k + 1)] hessians_at_k = hessians[n_samples * k:n_samples * (k + 1)] for i in prange(n_samples, schedule='static'): # p_k is the probability that class(ith sample) == k. # This is a regular softmax. - p_k = exp(raw_predictions[i, k] - clogsumexp(raw_predictions, i)) + p_k = exp(raw_predictions[k, i] - clogsumexp(raw_predictions, i)) gradients_at_k[i] = p_k - (y_true[i] == k) hessians_at_k[i] = p_k * (1. - p_k) @@ -298,7 +299,7 @@ cdef inline Y_DTYPE_C cexpit(const Y_DTYPE_C x) nogil: cdef inline Y_DTYPE_C clogsumexp( - const Y_DTYPE_C [:, :] a, + const Y_DTYPE_C [:, ::1] a, const int row) nogil: """Custom logsumexp, with numerical stability""" # Need to pass the whole array and the row index, else prange won't work. @@ -306,14 +307,14 @@ cdef inline Y_DTYPE_C clogsumexp( cdef: int k Y_DTYPE_C out = 0. - Y_DTYPE_C amax = a[row, 0] + Y_DTYPE_C amax = a[0, row] - for k in range(1, a.shape[1]): - if amax < a[row, k]: - amax = a[row, k] + for k in range(1, a.shape[0]): + if amax < a[k, row]: + amax = a[k, row] - for k in range(a.shape[1]): - out += exp(a[row, k] - amax) + for k in range(a.shape[0]): + out += exp(a[k, row] - amax) return log(out) + amax diff --git a/sklearn/_fast_gradient_boosting/tests/test_loss.py b/sklearn/_fast_gradient_boosting/tests/test_loss.py index 4034328454578..60d8f6be183e4 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_loss.py +++ b/sklearn/_fast_gradient_boosting/tests/test_loss.py @@ -102,7 +102,7 @@ def test_numerical_gradients(loss, n_classes, prediction_dim): else: y_true = rng.randint(0, n_classes, size=n_samples).astype(Y_DTYPE) raw_predictions = rng.normal( - size=(n_samples, prediction_dim) + size=(prediction_dim, n_samples) ).astype(Y_DTYPE) loss = _LOSSES[loss]() get_gradients, get_hessians = get_derivatives_helper(loss) @@ -118,14 +118,14 @@ def test_numerical_gradients(loss, n_classes, prediction_dim): # have no effect on the probabilities, and thus on the loss eps = 1e-9 offset = np.zeros_like(raw_predictions) - offset[:, 0] = eps + offset[0, :] = eps f_plus_eps = loss(y_true, raw_predictions + offset / 2, average=False) f_minus_eps = loss(y_true, raw_predictions - offset / 2, average=False) numerical_gradient = (f_plus_eps - f_minus_eps) / eps # Approximate hessians eps = 1e-4 # need big enough eps as we divide by its square - offset[:, 0] = eps + offset[0, :] = eps f_plus_eps = loss(y_true, raw_predictions + offset, average=False) f_minus_eps = loss(y_true, raw_predictions - offset, average=False) f = loss(y_true, raw_predictions, average=False) @@ -187,7 +187,7 @@ def test_baseline_categorical_crossentropy(): # link_function = log y_train = rng.randint(0, prediction_dim + 1, size=100).astype(np.float32) baseline_prediction = loss.get_baseline_prediction(y_train, prediction_dim) - assert baseline_prediction.shape == (1, prediction_dim) + assert baseline_prediction.shape == (prediction_dim, 1) for k in range(prediction_dim): p = (y_train == k).mean() - assert_almost_equal(baseline_prediction[:, k], np.log(p)) + assert_almost_equal(baseline_prediction[k, :], np.log(p)) From 9e68984f8066085bdf801ccd0b5edca968709b5f Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 4 Feb 2019 15:49:48 -0500 Subject: [PATCH 104/247] optimized gradient update for multiclass loss --- sklearn/_fast_gradient_boosting/loss.pyx | 73 +++++++++++++----------- 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/loss.pyx b/sklearn/_fast_gradient_boosting/loss.pyx index 
f56db09bfb3ce..c1cb863a6a878 100644 --- a/sklearn/_fast_gradient_boosting/loss.pyx +++ b/sklearn/_fast_gradient_boosting/loss.pyx @@ -12,6 +12,7 @@ from abc import ABC, abstractmethod cimport cython from cython.parallel import prange +from libc.stdlib cimport malloc, free import numpy as np cimport numpy as np from scipy.special import expit @@ -266,56 +267,62 @@ class CategoricalCrossEntropy(BaseLoss): cdef void _update_gradients_hessians_categorical_crossentropy( - G_H_DTYPE_C [::1] gradients, # shape (n_samples * prediction_dim,), OUT - G_H_DTYPE_C [::1] hessians, # shape (n_samples * prediction_dim,), OUT + G_H_DTYPE_C [::1] gradients, # shape (n_samples * pred_dim,), OUT + G_H_DTYPE_C [::1] hessians, # shape (n_samples * pred_dim,), OUT const Y_DTYPE_C [::1] y_true, # shape (n_samples,), IN # shape (n_samples, n_tree_per_iter), IN const Y_DTYPE_C [:, ::1] raw_predictions) nogil: cdef: - int n_samples - unsigned int prediction_dim + unsigned int prediction_dim = raw_predictions.shape[0] + int n_samples = raw_predictions.shape[1] unsigned int k int i + Y_DTYPE_C * p = malloc(sizeof(Y_DTYPE_C) * + (prediction_dim * n_samples)) Y_DTYPE_C p_k G_H_DTYPE_C [::1] gradients_at_k, G_H_DTYPE_C [::1] hessians_at_k, - prediction_dim = raw_predictions.shape[0] - n_samples = raw_predictions.shape[1] - for k in range(prediction_dim): - gradients_at_k = gradients[n_samples * k:n_samples * (k + 1)] - hessians_at_k = hessians[n_samples * k:n_samples * (k + 1)] - for i in prange(n_samples, schedule='static'): + for i in prange(n_samples, schedule='static'): + # first compute softmaxes of sample i for each class + for k in range(prediction_dim): + p[i * prediction_dim + k] = raw_predictions[k, i] + compute_softmax(p + (i * prediction_dim), prediction_dim) + # then update gradients and hessians + for k in range(prediction_dim): # p_k is the probability that class(ith sample) == k. - # This is a regular softmax. - p_k = exp(raw_predictions[k, i] - clogsumexp(raw_predictions, i)) - gradients_at_k[i] = p_k - (y_true[i] == k) - hessians_at_k[i] = p_k * (1. - p_k) + p_k = p[i * prediction_dim + k] + gradients[n_samples * k + i] = p_k - (y_true[i] == k) + hessians[n_samples * k + i] = p_k * (1. - p_k) + free(p) -cdef inline Y_DTYPE_C cexpit(const Y_DTYPE_C x) nogil: - """Custom expit (logistic sigmoid function)""" - return 1. / (1. + exp(-x)) - +cdef inline void compute_softmax( + Y_DTYPE_C * p, # IN OUT, treated as array with entries + const unsigned int prediction_dim) nogil: + """Compute softmaxes of values in p.""" -cdef inline Y_DTYPE_C clogsumexp( - const Y_DTYPE_C [:, ::1] a, - const int row) nogil: - """Custom logsumexp, with numerical stability""" - # Need to pass the whole array and the row index, else prange won't work. - # See issue Cython #2798 cdef: - int k - Y_DTYPE_C out = 0. - Y_DTYPE_C amax = a[0, row] + Y_DTYPE_C max_value = p[0] + Y_DTYPE_C sum_exps = 0. + unsigned int k + + # Compute max value of array for numerical stability + for k in range(1, prediction_dim): + if max_value < p[k]: + max_value = p[k] - for k in range(1, a.shape[0]): - if amax < a[k, row]: - amax = a[k, row] + for k in range(prediction_dim): + p[k] = exp(p[k] - max_value) + sum_exps += p[k] + + for k in range(prediction_dim): + p[k] /= sum_exps - for k in range(a.shape[0]): - out += exp(a[k, row] - amax) - return log(out) + amax + +cdef inline Y_DTYPE_C cexpit(const Y_DTYPE_C x) nogil: + """Custom expit (logistic sigmoid function)""" + return 1. / (1. 
+ exp(-x)) _LOSSES = { From 96d9ea67f719a3f7a2b77c8fccc6e97441fedef8 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 4 Feb 2019 16:25:52 -0500 Subject: [PATCH 105/247] used 2d arrays instead of 1d for gradients and hessians --- .../_gradient_boosting.pyx | 19 ++++--- .../gradient_boosting.py | 19 +++---- sklearn/_fast_gradient_boosting/loss.pyx | 49 ++++++++++--------- .../tests/test_loss.py | 18 +++---- 4 files changed, 50 insertions(+), 55 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx b/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx index 786dcfc19aabd..47fca23a6348e 100644 --- a/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx +++ b/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx @@ -21,10 +21,10 @@ def _update_raw_predictions( raw_predictions += last_estimator.predict(X_train) """ cdef: - unsigned int [:] starts # start of each leaf in partition - unsigned int [:] stops # end of each leaf in partition - Y_DTYPE_C [:] values # value of each leaf - const unsigned int [:] partition = grower.splitter.partition + unsigned int [::1] starts # start of each leaf in partition + unsigned int [::1] stops # end of each leaf in partition + Y_DTYPE_C [::1] values # value of each leaf + const unsigned int [::1] partition = grower.splitter.partition list leaves leaves = grower.finalized_leaves @@ -40,17 +40,16 @@ def _update_raw_predictions( cdef void _update_raw_predictions_helper( Y_DTYPE_C [::1] raw_predictions, # OUT - const unsigned int [:] starts, - const unsigned int [:] stops, - const unsigned int [:] partition, - const Y_DTYPE_C [:] values) nogil: + const unsigned int [::1] starts, + const unsigned int [::1] stops, + const unsigned int [::1] partition, + const Y_DTYPE_C [::1] values) nogil: cdef: unsigned int position int leaf_idx - int n_leaves + int n_leaves = starts.shape[0] - n_leaves = starts.shape[0] for leaf_idx in prange(n_leaves): for position in range(starts[leaf_idx], stops[leaf_idx]): raw_predictions[partition[position]] += values[leaf_idx] diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index 8e2f3c0c91ee8..17abb49464620 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -163,8 +163,9 @@ def fit(self, X, y): # initialize raw_predictions: those are the accumulated values # predicted by the trees for the training data. raw_predictions has - # shape (n_samples, n_trees_per_iteration) where n_trees_per_iterations - # is n_classes in multiclass classification, else 1. + # shape (n_trees_per_iteration, n_samples) where + # n_trees_per_iterations is n_classes in multiclass classification, + # else 1. n_samples = X_binned_train.shape[0] self.baseline_prediction_ = self.loss_.get_baseline_prediction( y_train, self.n_trees_per_iteration_) @@ -174,8 +175,8 @@ def fit(self, X, y): ) raw_predictions += self.baseline_prediction_ - # initialize gradients and hessians (empty arrays). Those 1D arrays of - # size (n_samples * n_trees_per_iteration). + # initialize gradients and hessians (empty arrays). + # shape = (n_trees_per_iteration, n_samples). gradients, hessians = self.loss_.init_gradients_and_hessians( n_samples=n_samples, prediction_dim=self.n_trees_per_iteration_ @@ -216,16 +217,10 @@ def fit(self, X, y): estimators.append([]) # Build `n_trees_per_iteration` trees. 
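With gradients and hessians laid out as (n_trees_per_iteration, n_samples) C-contiguous arrays, each per-class grower in the hunk below can simply be handed a row slice. A quick standalone check (made-up shapes) that such a slice is a contiguous view rather than a copy, which is what the [::1] memoryviews in the Cython code require:

    import numpy as np

    gradients = np.zeros((3, 1000), dtype=np.float32)  # (n_trees_per_iteration, n_samples)
    row = gradients[0, :]
    assert row.base is gradients     # basic slicing: a view, no data is copied
    assert row.flags.c_contiguous    # each row of a C-contiguous 2D array is contiguous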
- for k, (gradients_at_k, hessians_at_k) in enumerate(zip( - np.array_split(gradients, self.n_trees_per_iteration_), - np.array_split(hessians, self.n_trees_per_iteration_))): - # the xxxx_at_k arrays are **views** on the original arrays. - # Note that for binary classif and regressions, - # n_trees_per_iteration is 1 and xxxx_at_k is equivalent to the - # whole array. + for k in range(self.n_trees_per_iteration_): grower = TreeGrower( - X_binned_train, gradients_at_k, hessians_at_k, + X_binned_train, gradients[k, :], hessians[k, :], max_bins=self.max_bins, n_bins_per_feature=self.bin_mapper_.n_bins_per_feature_, max_leaf_nodes=self.max_leaf_nodes, diff --git a/sklearn/_fast_gradient_boosting/loss.pyx b/sklearn/_fast_gradient_boosting/loss.pyx index c1cb863a6a878..995df8e06aa42 100644 --- a/sklearn/_fast_gradient_boosting/loss.pyx +++ b/sklearn/_fast_gradient_boosting/loss.pyx @@ -50,18 +50,18 @@ class BaseLoss(ABC): Returns ------- - gradients : array-like, shape=(n_samples * prediction_dim) - hessians : array-like, shape=(n_samples * prediction_dim). - If hessians are constant (e.g. for ``LeastSquares`` loss, shape - is (1,) and the array is initialized to ``1``. + gradients : array-like, shape=(prediction_dim, n_samples) + hessians : array-like, shape=(prediction_dim, n_samples). + If hessians are constant (e.g. for ``LeastSquares`` loss, the + array is initialized to ``1``. """ - shape = n_samples * prediction_dim + shape = (prediction_dim, n_samples) gradients = np.empty(shape=shape, dtype=G_H_DTYPE) if self.hessians_are_constant: # if the hessians are constant, we consider they are equal to 1. # this is correct as long as we adjust the gradients. See e.g. LS # loss - hessians = np.ones(shape=1, dtype=G_H_DTYPE) + hessians = np.ones(shape=shape, dtype=G_H_DTYPE) else: hessians = np.empty(shape=shape, dtype=G_H_DTYPE) @@ -81,7 +81,7 @@ class BaseLoss(ABC): Returns ------- - baseline_prediction: float or array of shape (prediction_dim, 1) + baseline_prediction: float or array of shape (1, prediction_dim) The baseline prediction. """ pass @@ -97,14 +97,14 @@ class BaseLoss(ABC): Parameters ---------- - gradients : array-like, shape=(n_samples * prediction_dim) + gradients : array-like, shape=(prediction_dim, n_samples) The gradients (treated as OUT array). - hessians : array-like, shape=(n_samples * prediction_dim) or \ + hessians : array-like, shape=(prediction_dim, n_samples) or \ (1,) The hessians (treated as OUT array). y_true : array-like, shape=(n_samples,) The true target values or each training sample. - raw_predictions : array-like, shape=(n_samples, prediction_dim) + raw_predictions : array-like, shape=(prediction_dim, n_samples) The raw_predictions (i.e. values from the trees) of the tree ensemble at iteration ``i - 1``. """ @@ -122,7 +122,7 @@ class LeastSquares(BaseLoss): hessians_are_constant = True def __call__(self, y_true, raw_predictions, average=True): - # shape (n_samples, 1) --> (n_samples,). reshape(-1) is more likely to + # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. raw_predictions = raw_predictions.reshape(-1) loss = np.power(y_true - raw_predictions, 2) @@ -137,7 +137,10 @@ class LeastSquares(BaseLoss): def update_gradients_and_hessians(self, gradients, hessians, y_true, raw_predictions): + # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to + # return a view. 
raw_predictions = raw_predictions.reshape(-1) + gradients = gradients.reshape(-1) return _update_gradients_least_squares(gradients, y_true, raw_predictions) @@ -173,7 +176,7 @@ class BinaryCrossEntropy(BaseLoss): inverse_link_function = staticmethod(expit) def __call__(self, y_true, raw_predictions, average=True): - # shape (n_samples, 1) --> (n_samples,). reshape(-1) is more likely to + # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. raw_predictions = raw_predictions.reshape(-1) # logaddexp(0, x) = log(1 + exp(x)) @@ -190,14 +193,16 @@ class BinaryCrossEntropy(BaseLoss): def update_gradients_and_hessians(self, gradients, hessians, y_true, raw_predictions): - # shape (n_samples, 1) --> (n_samples,). reshape(-1) is more likely to + # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. raw_predictions = raw_predictions.reshape(-1) + gradients = gradients.reshape(-1) + hessians = hessians.reshape(-1) return _update_gradients_hessians_binary_crossentropy( gradients, hessians, y_true, raw_predictions) def predict_proba(self, raw_predictions): - # shape (n_samples, 1) --> (n_samples,). reshape(-1) is more likely to + # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. raw_predictions = raw_predictions.reshape(-1) proba = np.empty((raw_predictions.shape[0], 2), dtype=Y_DTYPE) @@ -267,21 +272,19 @@ class CategoricalCrossEntropy(BaseLoss): cdef void _update_gradients_hessians_categorical_crossentropy( - G_H_DTYPE_C [::1] gradients, # shape (n_samples * pred_dim,), OUT - G_H_DTYPE_C [::1] hessians, # shape (n_samples * pred_dim,), OUT + G_H_DTYPE_C [:, ::1] gradients, # shape (pred_dim, n_samples), OUT + G_H_DTYPE_C [:, ::1] hessians, # shape (pred_dim, n_samples), OUT const Y_DTYPE_C [::1] y_true, # shape (n_samples,), IN - # shape (n_samples, n_tree_per_iter), IN + # shape (pred_dim, n_samples), IN const Y_DTYPE_C [:, ::1] raw_predictions) nogil: cdef: - unsigned int prediction_dim = raw_predictions.shape[0] + int prediction_dim = raw_predictions.shape[0] int n_samples = raw_predictions.shape[1] - unsigned int k + int k int i Y_DTYPE_C * p = malloc(sizeof(Y_DTYPE_C) * (prediction_dim * n_samples)) Y_DTYPE_C p_k - G_H_DTYPE_C [::1] gradients_at_k, - G_H_DTYPE_C [::1] hessians_at_k, for i in prange(n_samples, schedule='static'): # first compute softmaxes of sample i for each class @@ -292,8 +295,8 @@ cdef void _update_gradients_hessians_categorical_crossentropy( for k in range(prediction_dim): # p_k is the probability that class(ith sample) == k. p_k = p[i * prediction_dim + k] - gradients[n_samples * k + i] = p_k - (y_true[i] == k) - hessians[n_samples * k + i] = p_k * (1. - p_k) + gradients[k, i] = p_k - (y_true[i] == k) + hessians[k, i] = p_k * (1. 
- p_k) free(p) diff --git a/sklearn/_fast_gradient_boosting/tests/test_loss.py b/sklearn/_fast_gradient_boosting/tests/test_loss.py index 60d8f6be183e4..c6bd7056eae1c 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_loss.py +++ b/sklearn/_fast_gradient_boosting/tests/test_loss.py @@ -16,9 +16,8 @@ def get_derivatives_helper(loss): def get_gradients(y_true, raw_predictions): # create gradients and hessians array, update inplace, and return - shape = raw_predictions.shape[0] * raw_predictions.shape[1] - gradients = np.empty(shape=shape, dtype=G_H_DTYPE) - hessians = np.empty(shape=shape, dtype=G_H_DTYPE) + gradients = np.empty_like(raw_predictions, dtype=G_H_DTYPE) + hessians = np.empty_like(raw_predictions, dtype=G_H_DTYPE) loss.update_gradients_and_hessians(gradients, hessians, y_true, raw_predictions) @@ -29,15 +28,14 @@ def get_gradients(y_true, raw_predictions): def get_hessians(y_true, raw_predictions): # create gradients and hessians array, update inplace, and return - shape = raw_predictions.shape[0] * raw_predictions.shape[1] - gradients = np.empty(shape=shape, dtype=G_H_DTYPE) - hessians = np.empty(shape=shape, dtype=G_H_DTYPE) + gradients = np.empty_like(raw_predictions, dtype=G_H_DTYPE) + hessians = np.empty_like(raw_predictions, dtype=G_H_DTYPE) loss.update_gradients_and_hessians(gradients, hessians, y_true, raw_predictions) if loss.__class__ is _LOSSES['least_squares']: # hessians aren't updated because they're constant - hessians = np.full_like(y_true, fill_value=2) + hessians = np.full_like(raw_predictions, fill_value=2) return hessians @@ -107,9 +105,9 @@ def test_numerical_gradients(loss, n_classes, prediction_dim): loss = _LOSSES[loss]() get_gradients, get_hessians = get_derivatives_helper(loss) - # [:n_samples] to only take gradients and hessians of first tree. - gradients = get_gradients(y_true, raw_predictions)[:n_samples] - hessians = get_hessians(y_true, raw_predictions)[:n_samples] + # only take gradients and hessians of first tree / class. + gradients = get_gradients(y_true, raw_predictions)[0, :].ravel() + hessians = get_hessians(y_true, raw_predictions)[0, :].ravel() # Approximate gradients # For multiclass loss, we should only change the predictions of one tree From cbd9d153793227646229da45c87476eb34dcb87e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 4 Feb 2019 17:10:01 -0500 Subject: [PATCH 106/247] added comment about tests that should be removed --- sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py index c7d39d9c72816..e6a116d78d53e 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py @@ -180,4 +180,6 @@ def should_stop(scores, n_iter_no_change, tol): )) def test_estimator_checks(Estimator): # Run the check_estimator() test suite on GBRegressor and GBClassifier. 
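test_numerical_gradients (touched above) checks the analytic gradients against central finite differences of the loss; the idea boiled down to a scalar least-squares example (illustrative only, not the estimator's code path):

    import numpy as np

    def loss(raw_prediction, y_true=3.):     # least-squares on a single sample
        return (y_true - raw_prediction) ** 2

    raw_prediction, eps = .5, 1e-7
    numerical = (loss(raw_prediction + eps / 2) - loss(raw_prediction - eps / 2)) / eps
    analytic = 2 * (raw_prediction - 3.)     # derivative of (y_true - x)**2 w.r.t. x
    assert np.isclose(numerical, analytic, rtol=1e-5)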
+ # Just here for convenience, must be removed before merging since these + # tests are run in test_common anyways check_estimator(Estimator) From e160d555d9a6f7d472c2703306a9e8dd0b1835cd Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 4 Feb 2019 20:58:03 -0500 Subject: [PATCH 107/247] Addressed Joels comments --- sklearn/_fast_gradient_boosting/grower.py | 75 ++++++++++++----------- sklearn/tree/tree.py | 1 - 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/grower.py b/sklearn/_fast_gradient_boosting/grower.py index 21c52a05376d9..d43bad3563640 100644 --- a/sklearn/_fast_gradient_boosting/grower.py +++ b/sklearn/_fast_gradient_boosting/grower.py @@ -464,43 +464,44 @@ def make_predictor(self, bin_thresholds=None): A TreePredictor object. """ predictor_nodes = np.zeros(self.n_nodes, dtype=PREDICTOR_RECORD_DTYPE) - self._fill_predictor_node_array(predictor_nodes, self.root, - bin_thresholds=bin_thresholds) + _fill_predictor_node_array(predictor_nodes, self.root, + bin_thresholds=bin_thresholds) return TreePredictor(predictor_nodes) - def _fill_predictor_node_array(self, predictor_nodes, grower_node, - bin_thresholds=None, next_free_idx=0): - """Helper used in make_predictor to set the TreePredictor fields.""" - node = predictor_nodes[next_free_idx] - node['count'] = grower_node.n_samples - node['depth'] = grower_node.depth - if grower_node.split_info is not None: - node['gain'] = grower_node.split_info.gain - else: - node['gain'] = -1 - if grower_node.value is not None: - # Leaf node - node['is_leaf'] = True - node['value'] = grower_node.value - return next_free_idx + 1 - else: - # Decision node - split_info = grower_node.split_info - feature_idx, bin_idx = split_info.feature_idx, split_info.bin_idx - node['feature_idx'] = feature_idx - node['bin_threshold'] = bin_idx - if bin_thresholds is not None: - threshold = bin_thresholds[feature_idx][bin_idx] - node['threshold'] = threshold - next_free_idx += 1 - - node['left'] = next_free_idx - next_free_idx = self._fill_predictor_node_array( - predictor_nodes, grower_node.left_child, - bin_thresholds=bin_thresholds, next_free_idx=next_free_idx) - - node['right'] = next_free_idx - return self._fill_predictor_node_array( - predictor_nodes, grower_node.right_child, - bin_thresholds=bin_thresholds, next_free_idx=next_free_idx) +def _fill_predictor_node_array(predictor_nodes, grower_node, + bin_thresholds=None, next_free_idx=0): + """Helper used in make_predictor to set the TreePredictor fields.""" + node = predictor_nodes[next_free_idx] + node['count'] = grower_node.n_samples + node['depth'] = grower_node.depth + if grower_node.split_info is not None: + node['gain'] = grower_node.split_info.gain + else: + node['gain'] = -1 + + if grower_node.value is not None: + # Leaf node + node['is_leaf'] = True + node['value'] = grower_node.value + return next_free_idx + 1 + else: + # Decision node + split_info = grower_node.split_info + feature_idx, bin_idx = split_info.feature_idx, split_info.bin_idx + node['feature_idx'] = feature_idx + node['bin_threshold'] = bin_idx + if bin_thresholds is not None: + threshold = bin_thresholds[feature_idx][bin_idx] + node['threshold'] = threshold + next_free_idx += 1 + + node['left'] = next_free_idx + next_free_idx = _fill_predictor_node_array( + predictor_nodes, grower_node.left_child, + bin_thresholds=bin_thresholds, next_free_idx=next_free_idx) + + node['right'] = next_free_idx + return _fill_predictor_node_array( + predictor_nodes, grower_node.right_child, + 
bin_thresholds=bin_thresholds, next_free_idx=next_free_idx) diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 36e2683e6a575..973d7f9d1d715 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -378,7 +378,6 @@ def fit(self, X, y, sample_weight=None, check_input=True, builder.build(self.tree_, X, y, sample_weight, X_idx_sorted) - if self.n_outputs_ == 1: self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] From 3cb197ed3ed92d507f18f07c5cca8bf6d3d56eff Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 5 Feb 2019 08:28:24 -0500 Subject: [PATCH 108/247] slightly more detailed doc about when not to use new estimators --- doc/modules/ensemble.rst | 7 +++++-- sklearn/_fast_gradient_boosting/gradient_boosting.py | 12 ++++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index a520fb5e8293b..674dad4821dc4 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -464,11 +464,14 @@ trees. :class:`GradientBoostingRegressor` when the number of samples is bigger than ``10 000``. These fast estimators first bin the input samples `X` into integer-valued bins (typically 256 bins) which tremendously reduces the - number of splitting points to consider. The API of these new estimators is + number of splitting points to consider, and allow the algorithm to leverage + integer-based data structures. The API of these new estimators is slightly different, and some features are not yet supported. The following doc focuses on :class:`GradientBoostingClassifier` and - :class:`GradientBoostingRegressor` only. + :class:`GradientBoostingRegressor` only, which might be prefered for small + sample sizes since binning may lead to split points that are too approximate + in this setting. Classification diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index 17abb49464620..93d5eb3872090 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -422,7 +422,11 @@ class FastGradientBoostingRegressor(BaseFastGradientBoosting, RegressorMixin): :class:`GradientBoostingRegressor` for big datasets (n_samples >= 10 000). The input data `X` is pre-binned into integer-valued bins, which considerably reduces the number of - splitting points to consider. + splitting points to consider, and allows the algorithm to leverage + integer-based data structures. For small sample sizes, + :class:`GradientBoostingRegressor` + might be prefered since binning may lead to split points that are too + approximate in this setting. Parameters ---------- @@ -560,7 +564,11 @@ class FastGradientBoostingClassifier(BaseFastGradientBoosting, :class:`GradientBoostingClassifier` for big datasets (n_samples >= 10 000). The input data `X` is pre-binned into integer-valued bins, which considerably reduces the number of - splitting points to consider. + splitting points to consider, and allows the algorithm to leverage + integer-based data structures. For small sample sizes, + :class:`GradientBoostingClassifier` + might be prefered since binning may lead to split points that are too + approximate in this setting. 
Parameters ---------- From d653a549400846f35de902f6b30dddcfdfd731e8 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 5 Feb 2019 13:09:44 -0500 Subject: [PATCH 109/247] p is now a 2d numpy array instead of malloc'ed buffer --- sklearn/_fast_gradient_boosting/loss.pyx | 45 ++++++++++++------------ 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/loss.pyx b/sklearn/_fast_gradient_boosting/loss.pyx index 995df8e06aa42..b2dd98b81546f 100644 --- a/sklearn/_fast_gradient_boosting/loss.pyx +++ b/sklearn/_fast_gradient_boosting/loss.pyx @@ -12,7 +12,6 @@ from abc import ABC, abstractmethod cimport cython from cython.parallel import prange -from libc.stdlib cimport malloc, free import numpy as np cimport numpy as np from scipy.special import expit @@ -276,51 +275,51 @@ cdef void _update_gradients_hessians_categorical_crossentropy( G_H_DTYPE_C [:, ::1] hessians, # shape (pred_dim, n_samples), OUT const Y_DTYPE_C [::1] y_true, # shape (n_samples,), IN # shape (pred_dim, n_samples), IN - const Y_DTYPE_C [:, ::1] raw_predictions) nogil: + const Y_DTYPE_C [:, ::1] raw_predictions): cdef: int prediction_dim = raw_predictions.shape[0] int n_samples = raw_predictions.shape[1] int k int i - Y_DTYPE_C * p = malloc(sizeof(Y_DTYPE_C) * - (prediction_dim * n_samples)) - Y_DTYPE_C p_k + # p[i, k] is the probability that class(ith sample) == k. + # It's the softmax of the raw predictions + Y_DTYPE_C [:, ::1] p = np.empty(shape=(n_samples, prediction_dim)) + Y_DTYPE_C p_i_k - for i in prange(n_samples, schedule='static'): + for i in prange(n_samples, schedule='static', nogil=True): # first compute softmaxes of sample i for each class for k in range(prediction_dim): - p[i * prediction_dim + k] = raw_predictions[k, i] - compute_softmax(p + (i * prediction_dim), prediction_dim) + p[i, k] = raw_predictions[k, i] # prepare softmax + compute_softmax(p, i) # then update gradients and hessians for k in range(prediction_dim): - # p_k is the probability that class(ith sample) == k. - p_k = p[i * prediction_dim + k] - gradients[k, i] = p_k - (y_true[i] == k) - hessians[k, i] = p_k * (1. - p_k) - free(p) + p_i_k = p[i, k] + gradients[k, i] = p_i_k - (y_true[i] == k) + hessians[k, i] = p_i_k * (1. - p_i_k) -cdef inline void compute_softmax( - Y_DTYPE_C * p, # IN OUT, treated as array with entries - const unsigned int prediction_dim) nogil: - """Compute softmaxes of values in p.""" +cdef inline void compute_softmax(Y_DTYPE_C [:, ::1] p, const int i) nogil: + """Compute softmaxes of values in p[i, :].""" + # i needs to be passed (and stays constant) because otherwise Cython does + # not generate optimal code cdef: - Y_DTYPE_C max_value = p[0] + Y_DTYPE_C max_value = p[i, 0] Y_DTYPE_C sum_exps = 0. 
unsigned int k + unsigned prediction_dim = p.shape[1] # Compute max value of array for numerical stability for k in range(1, prediction_dim): - if max_value < p[k]: - max_value = p[k] + if max_value < p[i, k]: + max_value = p[i, k] for k in range(prediction_dim): - p[k] = exp(p[k] - max_value) - sum_exps += p[k] + p[i, k] = exp(p[i, k] - max_value) + sum_exps += p[i, k] for k in range(prediction_dim): - p[k] /= sum_exps + p[i, k] /= sum_exps cdef inline Y_DTYPE_C cexpit(const Y_DTYPE_C x) nogil: From c3e43400139f373798b28e2772c94bcae7d51f7b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 5 Feb 2019 13:10:08 -0500 Subject: [PATCH 110/247] typo --- sklearn/_fast_gradient_boosting/grower.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_fast_gradient_boosting/grower.py b/sklearn/_fast_gradient_boosting/grower.py index d43bad3563640..4595b468289a3 100644 --- a/sklearn/_fast_gradient_boosting/grower.py +++ b/sklearn/_fast_gradient_boosting/grower.py @@ -281,7 +281,7 @@ def _intilialize_root(self): def _compute_spittability(self, node, only_hist=False): """Compute histograms and best possible split of a node. - If the best possible gain is 0 of if the constraints aren't met + If the best possible gain is 0 or if the constraints aren't met (min_samples_leaf, min_hessian_to_split, min_gain_to_split) then the node is finalized (transformed into a leaf), else it is pushed on the splittable node heap. From b1784b0688f31aa18338361a97923b0e0913fa2d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 5 Feb 2019 13:47:13 -0500 Subject: [PATCH 111/247] used memcpy in splitter instead of loop --- sklearn/_fast_gradient_boosting/splitting.pyx | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/splitting.pyx b/sklearn/_fast_gradient_boosting/splitting.pyx index e699bc5d5b461..f12d63c91f664 100644 --- a/sklearn/_fast_gradient_boosting/splitting.pyx +++ b/sklearn/_fast_gradient_boosting/splitting.pyx @@ -15,6 +15,7 @@ import numpy as np cimport numpy as np from openmp cimport omp_get_max_threads from libc.stdlib cimport malloc, free +from libc.string cimport memcpy from .histogram cimport _build_histogram from .histogram cimport _build_histogram_no_hessian @@ -341,13 +342,16 @@ cdef class Splitter: # sample_indices. This also updates self.partition since # sample_indices is a view. 
for thread_idx in prange(n_threads): - - for i in range(left_counts[thread_idx]): - sample_indices[left_offset[thread_idx] + i] = \ - left_indices_buffer[offset_in_buffers[thread_idx] + i] - for i in range(right_counts[thread_idx]): - sample_indices[right_offset[thread_idx] + i] = \ - right_indices_buffer[offset_in_buffers[thread_idx] + i] + memcpy( + &sample_indices[left_offset[thread_idx]], + &left_indices_buffer[offset_in_buffers[thread_idx]], + sizeof(unsigned int) * left_counts[thread_idx] + ) + memcpy( + &sample_indices[right_offset[thread_idx]], + &right_indices_buffer[offset_in_buffers[thread_idx]], + sizeof(unsigned int) * right_counts[thread_idx] + ) return (sample_indices[:right_child_position], sample_indices[right_child_position:], From 2004615caaeb1e4e02f2f1af2864acfd7e4248ac Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 6 Feb 2019 08:42:06 -0500 Subject: [PATCH 112/247] removed unused n_bins parameter to histogram routines --- sklearn/_fast_gradient_boosting/histogram.pxd | 8 ++--- sklearn/_fast_gradient_boosting/histogram.pyx | 5 --- sklearn/_fast_gradient_boosting/splitting.pyx | 9 +++-- .../tests/test_histogram.py | 34 +++++++++---------- 4 files changed, 23 insertions(+), 33 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/histogram.pxd b/sklearn/_fast_gradient_boosting/histogram.pxd index 70487ade70a8e..582abc88f1fd4 100644 --- a/sklearn/_fast_gradient_boosting/histogram.pxd +++ b/sklearn/_fast_gradient_boosting/histogram.pxd @@ -35,7 +35,6 @@ cpdef void _subtract_histograms( """Return histogram for a given feature.""" cpdef void _build_histogram( const int feature_idx, - unsigned int n_bins, const unsigned int [::1] sample_indices, # IN const X_BINNED_DTYPE_C [::1] binned_feature, # IN const G_H_DTYPE_C [::1] ordered_gradients, # IN @@ -44,10 +43,9 @@ cpdef void _build_histogram( """Return histogram for a given feature, not updating hessians. -Used when the hessians of the loss are constant (tipycally LS loss).""" +Used when the hessians of the loss are constant (typically LS loss).""" cpdef void _build_histogram_no_hessian( const int feature_idx, - unsigned int n_bins, const unsigned int [::1] sample_indices, # IN const X_BINNED_DTYPE_C [::1] binned_feature, # IN const G_H_DTYPE_C [::1] ordered_gradients, # IN @@ -59,17 +57,15 @@ samples from the training set. binned_feature and all_gradients / all_hessians already have a consistent ordering.""" cpdef void _build_histogram_root( const int feature_idx, - unsigned int n_bins, const X_BINNED_DTYPE_C [::1] binned_feature, # IN const G_H_DTYPE_C [::1] all_gradients, # IN const G_H_DTYPE_C [::1] all_hessians, # IN hist_struct [:, ::1] out) nogil # OUT """Compute histogram of the root node, not updating hessians. 
-Used when the hessians of the loss are constant (tipycally LS loss).""" +Used when the hessians of the loss are constant (typically LS loss).""" cpdef void _build_histogram_root_no_hessian( const int feature_idx, - unsigned int n_bins, const X_BINNED_DTYPE_C [::1] binned_feature, # IN const G_H_DTYPE_C [::1] all_gradients, # IN hist_struct [:, ::1] out) nogil # OUT diff --git a/sklearn/_fast_gradient_boosting/histogram.pyx b/sklearn/_fast_gradient_boosting/histogram.pyx index 4335980b2ec4a..e0a6d6841dcff 100644 --- a/sklearn/_fast_gradient_boosting/histogram.pyx +++ b/sklearn/_fast_gradient_boosting/histogram.pyx @@ -19,7 +19,6 @@ cimport numpy as np cpdef void _build_histogram_naive( const int feature_idx, - unsigned int n_bins, unsigned int [:] sample_indices, # IN X_BINNED_DTYPE_C [:] binned_feature, # IN G_H_DTYPE_C [:] ordered_gradients, # IN @@ -67,7 +66,6 @@ cpdef void _subtract_histograms( cpdef void _build_histogram( const int feature_idx, - unsigned int n_bins, const unsigned int [::1] sample_indices, # IN const X_BINNED_DTYPE_C [::1] binned_feature, # IN const G_H_DTYPE_C [::1] ordered_gradients, # IN @@ -114,7 +112,6 @@ cpdef void _build_histogram( cpdef void _build_histogram_no_hessian( const int feature_idx, - unsigned int n_bins, const unsigned int [::1] sample_indices, # IN const X_BINNED_DTYPE_C [::1] binned_feature, # IN const G_H_DTYPE_C [::1] ordered_gradients, # IN @@ -154,7 +151,6 @@ cpdef void _build_histogram_no_hessian( cpdef void _build_histogram_root( const int feature_idx, - unsigned int n_bins, const X_BINNED_DTYPE_C [::1] binned_feature, # IN const G_H_DTYPE_C [::1] all_gradients, # IN const G_H_DTYPE_C [::1] all_hessians, # IN @@ -201,7 +197,6 @@ cpdef void _build_histogram_root( cpdef void _build_histogram_root_no_hessian( const int feature_idx, - unsigned int n_bins, const X_BINNED_DTYPE_C [::1] binned_feature, # IN const G_H_DTYPE_C [::1] all_gradients, # IN hist_struct [:, ::1] out) nogil: # OUT diff --git a/sklearn/_fast_gradient_boosting/splitting.pyx b/sklearn/_fast_gradient_boosting/splitting.pyx index f12d63c91f664..cb51d8fdbfc7e 100644 --- a/sklearn/_fast_gradient_boosting/splitting.pyx +++ b/sklearn/_fast_gradient_boosting/splitting.pyx @@ -476,21 +476,20 @@ cdef class Splitter: if root_node: if self.hessians_are_constant: - _build_histogram_root_no_hessian(feature_idx, self.max_bins, - X_binned, + _build_histogram_root_no_hessian(feature_idx, X_binned, ordered_gradients, histograms) else: - _build_histogram_root(feature_idx, self.max_bins, X_binned, + _build_histogram_root(feature_idx, X_binned, ordered_gradients, ordered_hessians, histograms) else: if self.hessians_are_constant: - _build_histogram_no_hessian(feature_idx, self.max_bins, + _build_histogram_no_hessian(feature_idx, sample_indices, X_binned, ordered_gradients, histograms) else: - _build_histogram(feature_idx, self.max_bins, sample_indices, + _build_histogram(feature_idx, sample_indices, X_binned, ordered_gradients, ordered_hessians, histograms) diff --git a/sklearn/_fast_gradient_boosting/tests/test_histogram.py b/sklearn/_fast_gradient_boosting/tests/test_histogram.py index b432e2639c7f3..6cb58e01f1469 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_histogram.py +++ b/sklearn/_fast_gradient_boosting/tests/test_histogram.py @@ -28,7 +28,7 @@ def test_build_histogram(build_func): sample_indices = np.array([0, 2, 3], dtype=np.uint32) hist = np.zeros((1, 3), dtype=HISTOGRAM_DTYPE) - build_func(0, 3, sample_indices, binned_feature, ordered_gradients, + build_func(0, 
sample_indices, binned_feature, ordered_gradients, ordered_hessians, hist) hist = hist[0] assert_array_equal(hist['count'], [2, 1, 0]) @@ -41,7 +41,7 @@ def test_build_histogram(build_func): ordered_hessians = np.array([1, 1, 2, 1, 0], dtype=G_H_DTYPE) hist = np.zeros((1, 3), dtype=HISTOGRAM_DTYPE) - build_func(0, 3, sample_indices, binned_feature, ordered_gradients, + build_func(0, sample_indices, binned_feature, ordered_gradients, ordered_hessians, hist) hist = hist[0] assert_array_equal(hist['count'], [2, 2, 1]) @@ -61,22 +61,22 @@ def test_histogram_sample_order_independence(): n_sub_samples, replace=False) ordered_gradients = rng.randn(n_sub_samples).astype(G_H_DTYPE) hist_gc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) - _build_histogram_no_hessian(0, n_bins, sample_indices, binned_feature, + _build_histogram_no_hessian(0, sample_indices, binned_feature, ordered_gradients, hist_gc) ordered_hessians = rng.exponential(size=n_sub_samples).astype(G_H_DTYPE) hist_ghc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) - _build_histogram(0, n_bins, sample_indices, binned_feature, + _build_histogram(0, sample_indices, binned_feature, ordered_gradients, ordered_hessians, hist_ghc) permutation = rng.permutation(n_sub_samples) hist_gc_perm = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) - _build_histogram_no_hessian(0, n_bins, sample_indices[permutation], + _build_histogram_no_hessian(0, sample_indices[permutation], binned_feature, ordered_gradients[permutation], hist_gc_perm) hist_ghc_perm = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) - _build_histogram(0, n_bins, sample_indices[permutation], binned_feature, + _build_histogram(0, sample_indices[permutation], binned_feature, ordered_gradients[permutation], ordered_hessians[permutation], hist_ghc_perm) @@ -114,15 +114,15 @@ def test_unrolled_equivalent_to_naive(constant_hessian): hist_ghc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) hist_naive = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) - _build_histogram_root_no_hessian(0, n_bins, binned_feature, + _build_histogram_root_no_hessian(0, binned_feature, ordered_gradients, hist_gc_root) - _build_histogram_root(0, n_bins, binned_feature, ordered_gradients, + _build_histogram_root(0, binned_feature, ordered_gradients, ordered_hessians, hist_ghc_root) - _build_histogram_no_hessian(0, n_bins, sample_indices, binned_feature, + _build_histogram_no_hessian(0, sample_indices, binned_feature, ordered_gradients, hist_gc) - _build_histogram(0, n_bins, sample_indices, binned_feature, + _build_histogram(0, sample_indices, binned_feature, ordered_gradients, ordered_hessians, hist_ghc) - _build_histogram_naive(0, n_bins, sample_indices, binned_feature, + _build_histogram_naive(0, sample_indices, binned_feature, ordered_gradients, ordered_hessians, hist_naive) hist_naive = hist_naive[0] @@ -156,10 +156,10 @@ def test_hist_subtraction(constant_hessian): hist_parent = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) if constant_hessian: - _build_histogram_no_hessian(0, n_bins, sample_indices, binned_feature, + _build_histogram_no_hessian(0, sample_indices, binned_feature, ordered_gradients, hist_parent) else: - _build_histogram(0, n_bins, sample_indices, binned_feature, + _build_histogram(0, sample_indices, binned_feature, ordered_gradients, ordered_hessians, hist_parent) mask = rng.randint(0, 2, n_samples).astype(np.bool) @@ -169,11 +169,11 @@ def test_hist_subtraction(constant_hessian): ordered_hessians_left = ordered_hessians[mask] hist_left = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) if constant_hessian: - 
_build_histogram_no_hessian(0, n_bins, sample_indices_left, + _build_histogram_no_hessian(0, sample_indices_left, binned_feature, ordered_gradients_left, hist_left) else: - _build_histogram(0, n_bins, sample_indices_left, binned_feature, + _build_histogram(0, sample_indices_left, binned_feature, ordered_gradients_left, ordered_hessians_left, hist_left) @@ -182,11 +182,11 @@ def test_hist_subtraction(constant_hessian): ordered_hessians_right = ordered_hessians[~mask] hist_right = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) if constant_hessian: - _build_histogram_no_hessian(0, n_bins, sample_indices_right, + _build_histogram_no_hessian(0, sample_indices_right, binned_feature, ordered_gradients_right, hist_right) else: - _build_histogram(0, n_bins, sample_indices_right, binned_feature, + _build_histogram(0, sample_indices_right, binned_feature, ordered_gradients_right, ordered_hessians_right, hist_right) From 483a7443b72897a1915a63768f01769889a63f8e Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Thu, 14 Feb 2019 16:44:12 -0500 Subject: [PATCH 113/247] Apply suggestions from code review Co-Authored-By: NicolasHug --- sklearn/_fast_gradient_boosting/_gradient_boosting.pyx | 2 +- sklearn/_fast_gradient_boosting/binning.pyx | 2 +- sklearn/_fast_gradient_boosting/loss.pyx | 6 +++--- sklearn/_fast_gradient_boosting/predictor.pyx | 6 +++--- sklearn/_fast_gradient_boosting/utils.pyx | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx b/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx index 47fca23a6348e..1cefe2418c3ca 100644 --- a/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx +++ b/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx @@ -50,6 +50,6 @@ cdef void _update_raw_predictions_helper( int leaf_idx int n_leaves = starts.shape[0] - for leaf_idx in prange(n_leaves): + for leaf_idx in prange(n_leaves, nogil=True): for position in range(starts[leaf_idx], stops[leaf_idx]): raw_predictions[partition[position]] += values[leaf_idx] diff --git a/sklearn/_fast_gradient_boosting/binning.pyx b/sklearn/_fast_gradient_boosting/binning.pyx index 5361ff82b3b0a..9b92c41ac69e2 100644 --- a/sklearn/_fast_gradient_boosting/binning.pyx +++ b/sklearn/_fast_gradient_boosting/binning.pyx @@ -102,7 +102,7 @@ cpdef void _map_num_col_to_bins(const X_DTYPE_C [:] data, int right int middle - for i in prange(data.shape[0], schedule='static'): + for i in prange(data.shape[0], schedule='static', nogil=True): left, right = 0, binning_thresholds.shape[0] while left < right: middle = (right + left - 1) // 2 diff --git a/sklearn/_fast_gradient_boosting/loss.pyx b/sklearn/_fast_gradient_boosting/loss.pyx index b2dd98b81546f..78e9226a101d4 100644 --- a/sklearn/_fast_gradient_boosting/loss.pyx +++ b/sklearn/_fast_gradient_boosting/loss.pyx @@ -153,7 +153,7 @@ cdef void _update_gradients_least_squares( int i n_samples = raw_predictions.shape[0] - for i in prange(n_samples, schedule='static'): + for i in prange(n_samples, schedule='static', nogil=True): # Note: a more correct exp is 2 * (raw_predictions - y_true) but # since we use 1 for the constant hessian value (and not 2) this # is strictly equivalent for the leaves values. 
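The note above about the "more correct" gradient 2 * (raw_predictions - y_true) is easy to verify: dropping the factor 2 from both the gradients and the constant hessians leaves the Newton leaf value -sum(gradients) / sum(hessians) unchanged. A small illustrative check, not part of the patch:

    import numpy as np

    rng = np.random.RandomState(0)
    y_true = rng.normal(size=100)
    raw_predictions = rng.normal(size=100)

    # "exact" gradients and hessians of the squared error ...
    g_exact = 2 * (raw_predictions - y_true)
    h_exact = np.full_like(y_true, 2.)
    # ... versus the convention used here, with the factor 2 dropped from both
    g = raw_predictions - y_true
    h = np.full_like(y_true, 1.)

    # the Newton leaf value is the same either way
    assert np.isclose(-g_exact.sum() / h_exact.sum(), -g.sum() / h.sum())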
@@ -214,14 +214,14 @@ cdef void _update_gradients_hessians_binary_crossentropy( G_H_DTYPE_C [::1] gradients, G_H_DTYPE_C [::1] hessians, const Y_DTYPE_C [::1] y_true, - const Y_DTYPE_C [::1] raw_predictions) nogil: + const Y_DTYPE_C [::1] raw_predictions): cdef: int n_samples G_H_DTYPE_C gradient_abs int i n_samples = raw_predictions.shape[0] - for i in prange(n_samples, schedule='static'): + for i in prange(n_samples, schedule='static', nogil=True): gradients[i] = cexpit(raw_predictions[i]) - y_true[i] gradient_abs = fabs(gradients[i]) hessians[i] = gradient_abs * (1. - gradient_abs) diff --git a/sklearn/_fast_gradient_boosting/predictor.pyx b/sklearn/_fast_gradient_boosting/predictor.pyx index b3ef7173c3064..ddd6ae0225081 100644 --- a/sklearn/_fast_gradient_boosting/predictor.pyx +++ b/sklearn/_fast_gradient_boosting/predictor.pyx @@ -123,7 +123,7 @@ cdef inline Y_DTYPE_C _predict_one_from_numeric_data( cdef void _predict_from_numeric_data( node_struct [:] nodes, const X_DTYPE_C [:, :] numeric_data, - Y_DTYPE_C [:] out) nogil: + Y_DTYPE_C [:] out): cdef: int i @@ -154,10 +154,10 @@ cdef inline Y_DTYPE_C _predict_one_from_binned_data( cdef void _predict_from_binned_data( node_struct [:] nodes, const X_BINNED_DTYPE_C [:, :] binned_data, - Y_DTYPE_C [:] out) nogil: + Y_DTYPE_C [:] out): cdef: int i - for i in prange(binned_data.shape[0], schedule='static'): + for i in prange(binned_data.shape[0], schedule='static', nogil=True): out[i] = _predict_one_from_binned_data(nodes, binned_data, i) diff --git a/sklearn/_fast_gradient_boosting/utils.pyx b/sklearn/_fast_gradient_boosting/utils.pyx index 2c0bd4c865c78..c2720b0fd0fac 100644 --- a/sklearn/_fast_gradient_boosting/utils.pyx +++ b/sklearn/_fast_gradient_boosting/utils.pyx @@ -79,7 +79,7 @@ def sum_parallel(G_H_DTYPE_C [:] array): int i = 0 with nogil: - for i in prange(array.shape[0], schedule='static'): + for i in prange(array.shape[0], schedule='static', nogil=True): out += array[i] return out From c5ccae78558e6e30c9a9a578c9abbdba7bf693c7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 14 Feb 2019 16:58:04 -0500 Subject: [PATCH 114/247] removed useless nogil in function def --- sklearn/_fast_gradient_boosting/_gradient_boosting.pyx | 2 +- sklearn/_fast_gradient_boosting/binning.pyx | 2 +- sklearn/_fast_gradient_boosting/loss.pyx | 2 +- sklearn/_fast_gradient_boosting/predictor.pyx | 2 +- sklearn/_fast_gradient_boosting/utils.pyx | 5 ++--- 5 files changed, 6 insertions(+), 7 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx b/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx index 1cefe2418c3ca..3c2d35314468a 100644 --- a/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx +++ b/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx @@ -43,7 +43,7 @@ cdef void _update_raw_predictions_helper( const unsigned int [::1] starts, const unsigned int [::1] stops, const unsigned int [::1] partition, - const Y_DTYPE_C [::1] values) nogil: + const Y_DTYPE_C [::1] values): cdef: unsigned int position diff --git a/sklearn/_fast_gradient_boosting/binning.pyx b/sklearn/_fast_gradient_boosting/binning.pyx index 9b92c41ac69e2..13edb19fb8bab 100644 --- a/sklearn/_fast_gradient_boosting/binning.pyx +++ b/sklearn/_fast_gradient_boosting/binning.pyx @@ -94,7 +94,7 @@ cpdef _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds, cpdef void _map_num_col_to_bins(const X_DTYPE_C [:] data, const X_DTYPE_C [:] binning_thresholds, - X_BINNED_DTYPE_C [:] binned) nogil: + X_BINNED_DTYPE_C [:] binned): """Binary 
search to the find the bin index for each value in data.""" cdef: int i diff --git a/sklearn/_fast_gradient_boosting/loss.pyx b/sklearn/_fast_gradient_boosting/loss.pyx index 78e9226a101d4..e64dc841f5a24 100644 --- a/sklearn/_fast_gradient_boosting/loss.pyx +++ b/sklearn/_fast_gradient_boosting/loss.pyx @@ -147,7 +147,7 @@ class LeastSquares(BaseLoss): cdef void _update_gradients_least_squares( G_H_DTYPE_C [::1] gradients, const Y_DTYPE_C [::1] y_true, - const Y_DTYPE_C [::1] raw_predictions) nogil: + const Y_DTYPE_C [::1] raw_predictions): cdef: int n_samples int i diff --git a/sklearn/_fast_gradient_boosting/predictor.pyx b/sklearn/_fast_gradient_boosting/predictor.pyx index ddd6ae0225081..6c8aa850a8d5f 100644 --- a/sklearn/_fast_gradient_boosting/predictor.pyx +++ b/sklearn/_fast_gradient_boosting/predictor.pyx @@ -128,7 +128,7 @@ cdef void _predict_from_numeric_data( cdef: int i - for i in prange(numeric_data.shape[0], schedule='static'): + for i in prange(numeric_data.shape[0], schedule='static', nogil=True): out[i] = _predict_one_from_numeric_data(nodes, numeric_data, i) diff --git a/sklearn/_fast_gradient_boosting/utils.pyx b/sklearn/_fast_gradient_boosting/utils.pyx index c2720b0fd0fac..98687ad20791b 100644 --- a/sklearn/_fast_gradient_boosting/utils.pyx +++ b/sklearn/_fast_gradient_boosting/utils.pyx @@ -78,8 +78,7 @@ def sum_parallel(G_H_DTYPE_C [:] array): Y_DTYPE_C out = 0. int i = 0 - with nogil: - for i in prange(array.shape[0], schedule='static', nogil=True): - out += array[i] + for i in prange(array.shape[0], schedule='static', nogil=True): + out += array[i] return out From 23f1d4fd2b41cd36e86a7777d472926a3aaf563b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 14 Feb 2019 17:10:09 -0500 Subject: [PATCH 115/247] Used timeit.default_timer instead of time.time --- sklearn/_fast_gradient_boosting/grower.py | 2 +- sklearn/ensemble/gradient_boosting.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/grower.py b/sklearn/_fast_gradient_boosting/grower.py index 4595b468289a3..8efacde5d2b8b 100644 --- a/sklearn/_fast_gradient_boosting/grower.py +++ b/sklearn/_fast_gradient_boosting/grower.py @@ -6,7 +6,7 @@ """ from heapq import heappush, heappop import numpy as np -from time import time +from timeit import default_timer as time from .splitting import Splitter from .predictor import TreePredictor, PREDICTOR_RECORD_DTYPE diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index cb312e2070dbf..2308628965292 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -39,7 +39,7 @@ from scipy.sparse import issparse from scipy.special import expit -from time import time +from timeit import default_timer as time from ..model_selection import train_test_split from ..tree.tree import DecisionTreeRegressor from ..tree._tree import DTYPE From f9357612a87c193ffaa5d1b477713b72a781ee1e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 14 Feb 2019 17:12:04 -0500 Subject: [PATCH 116/247] reverted unwanted change --- sklearn/_fast_gradient_boosting/gradient_boosting.py | 2 +- sklearn/ensemble/gradient_boosting.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index 93d5eb3872090..881986c53382b 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -2,7 +2,7 @@ 
from abc import ABC, abstractmethod import numpy as np -from time import time +from timeit import default_timer as time from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin from sklearn.utils import check_X_y, check_random_state, check_array from sklearn.utils.validation import check_is_fitted diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index 2308628965292..cb312e2070dbf 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -39,7 +39,7 @@ from scipy.sparse import issparse from scipy.special import expit -from timeit import default_timer as time +from time import time from ..model_selection import train_test_split from ..tree.tree import DecisionTreeRegressor from ..tree._tree import DTYPE From 9ec5a49d18f87b95c3290981d27e08631d46b4a1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 14 Feb 2019 17:14:56 -0500 Subject: [PATCH 117/247] made n_trees_per_iter and baseline_pred private attributes --- .../gradient_boosting.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index 881986c53382b..e5ed4b6ec90b7 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -167,19 +167,19 @@ def fit(self, X, y): # n_trees_per_iterations is n_classes in multiclass classification, # else 1. n_samples = X_binned_train.shape[0] - self.baseline_prediction_ = self.loss_.get_baseline_prediction( - y_train, self.n_trees_per_iteration_) + self._baseline_prediction = self.loss_.get_baseline_prediction( + y_train, self._n_trees_per_iteration) raw_predictions = np.zeros( - shape=(self.n_trees_per_iteration_, n_samples), - dtype=self.baseline_prediction_.dtype + shape=(self._n_trees_per_iteration, n_samples), + dtype=self._baseline_prediction.dtype ) - raw_predictions += self.baseline_prediction_ + raw_predictions += self._baseline_prediction # initialize gradients and hessians (empty arrays). # shape = (n_trees_per_iteration, n_samples). gradients, hessians = self.loss_.init_gradients_and_hessians( n_samples=n_samples, - prediction_dim=self.n_trees_per_iteration_ + prediction_dim=self._n_trees_per_iteration ) # estimators_ is a matrix (list of lists) of TreePredictor objects @@ -217,7 +217,7 @@ def fit(self, X, y): estimators.append([]) # Build `n_trees_per_iteration` trees. 
- for k in range(self.n_trees_per_iteration_): + for k in range(self._n_trees_per_iteration): grower = TreeGrower( X_binned_train, gradients[k, :], hessians[k, :], @@ -389,10 +389,10 @@ def _raw_predict(self, X): is_binned = self._in_fit and X.dtype == X_BINNED_DTYPE n_samples = X.shape[0] raw_predictions = np.zeros( - shape=(self.n_trees_per_iteration_, n_samples), - dtype=self.baseline_prediction_.dtype + shape=(self._n_trees_per_iteration, n_samples), + dtype=self._baseline_prediction.dtype ) - raw_predictions += self.baseline_prediction_ + raw_predictions += self._baseline_prediction for predictors_of_ith_iteration in self.estimators_: for k, estimator in enumerate(predictors_of_ith_iteration): predict = (estimator.predict_binned if is_binned @@ -548,7 +548,7 @@ def predict(self, X): def _encode_y(self, y): # Just convert y to the expected dtype - self.n_trees_per_iteration_ = 1 + self._n_trees_per_iteration = 1 y = y.astype(Y_DTYPE, copy=False) return y @@ -734,7 +734,7 @@ def decision_function(self, X): def _encode_y(self, y): # encode classes into 0 ... n_classes - 1 and sets attributes classes_ - # and n_trees_per_iteration_ + # and _n_trees_per_iteration check_classification_targets(y) label_encoder = LabelEncoder() @@ -743,13 +743,13 @@ def _encode_y(self, y): n_classes = self.classes_.shape[0] # only 1 tree for binary classification. For multiclass classification, # we build 1 tree per class. - self.n_trees_per_iteration_ = 1 if n_classes <= 2 else n_classes + self._n_trees_per_iteration = 1 if n_classes <= 2 else n_classes encoded_y = encoded_y.astype(Y_DTYPE, copy=False) return encoded_y def _get_loss(self): if self.loss == 'auto': - if self.n_trees_per_iteration_ == 1: + if self._n_trees_per_iteration == 1: return _LOSSES['binary_crossentropy']() else: return _LOSSES['categorical_crossentropy']() From 1364f43dccd727479a1e3d85eae7541c0cd91b65 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 17 Feb 2019 07:14:27 -0500 Subject: [PATCH 118/247] Slightly changed logisitic loss gradient computation --- sklearn/_fast_gradient_boosting/loss.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/loss.pyx b/sklearn/_fast_gradient_boosting/loss.pyx index e64dc841f5a24..106ddc909ff3f 100644 --- a/sklearn/_fast_gradient_boosting/loss.pyx +++ b/sklearn/_fast_gradient_boosting/loss.pyx @@ -217,14 +217,14 @@ cdef void _update_gradients_hessians_binary_crossentropy( const Y_DTYPE_C [::1] raw_predictions): cdef: int n_samples - G_H_DTYPE_C gradient_abs + Y_DTYPE_C p_i # proba that ith sample belongs to positive class int i n_samples = raw_predictions.shape[0] for i in prange(n_samples, schedule='static', nogil=True): - gradients[i] = cexpit(raw_predictions[i]) - y_true[i] - gradient_abs = fabs(gradients[i]) - hessians[i] = gradient_abs * (1. - gradient_abs) + p_i = cexpit(raw_predictions[i]) + gradients[i] = p_i - y_true[i] + hessians[i] = p_i * (1. - p_i) class CategoricalCrossEntropy(BaseLoss): @@ -279,8 +279,8 @@ cdef void _update_gradients_hessians_categorical_crossentropy( cdef: int prediction_dim = raw_predictions.shape[0] int n_samples = raw_predictions.shape[1] - int k - int i + int k # class index + int i # sample index # p[i, k] is the probability that class(ith sample) == k. 
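PATCH 118 above rewrites the binary cross-entropy update in terms of the predicted probability of the positive class instead of the absolute gradient. In NumPy terms the new update is equivalent to the sketch below (illustrative only; the actual code is the prange loop in loss.pyx):

    import numpy as np
    from scipy.special import expit

    def update_gradients_hessians_binary(gradients, hessians, y_true,
                                         raw_predictions):
        # p[i] is the predicted probability that sample i is in the positive class
        p = expit(raw_predictions)
        gradients[:] = p - y_true
        hessians[:] = p * (1. - p)

For y_true in {0, 1} this matches the previous fabs-based formulation, since the absolute gradient is p when y_true is 0 and 1 - p when y_true is 1; only the readability changes.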
# It's the softmax of the raw predictions Y_DTYPE_C [:, ::1] p = np.empty(shape=(n_samples, prediction_dim)) From e512799f5b8ffdb9e98f21434d3ef2302ab9bcbe Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 17 Feb 2019 08:21:33 -0500 Subject: [PATCH 119/247] Addressed comments --- sklearn/_fast_gradient_boosting/binning.pyx | 9 ++------- .../tests/test_binning.py | 17 ++++++++--------- .../_fast_gradient_boosting/tests/test_loss.py | 2 +- .../tests/test_splitting.py | 2 -- 4 files changed, 11 insertions(+), 19 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/binning.pyx b/sklearn/_fast_gradient_boosting/binning.pyx index 13edb19fb8bab..83ed001a19e8e 100644 --- a/sklearn/_fast_gradient_boosting/binning.pyx +++ b/sklearn/_fast_gradient_boosting/binning.pyx @@ -75,13 +75,8 @@ cpdef _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds, binning_thresholds : tuple of arrays For each feature, stores the increasing numeric values that are used to separate the bins. - out : array-like - If not None, write result inplace in out. - - Returns - ------- - binned_data : array of int, shape=data.shape - The binned data. + binned : array-like, shape=(n_samples, n_features) + Output array, must be fortran aligned. """ cdef: int feature_idx diff --git a/sklearn/_fast_gradient_boosting/tests/test_binning.py b/sklearn/_fast_gradient_boosting/tests/test_binning.py index c543a18f16a88..53d0feb8ab6e1 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_binning.py +++ b/sklearn/_fast_gradient_boosting/tests/test_binning.py @@ -17,9 +17,11 @@ def test_find_binning_thresholds_regular_data(): data = np.linspace(0, 10, 1001).reshape(-1, 1) bin_thresholds = _find_binning_thresholds(data, max_bins=10) assert_allclose(bin_thresholds[0], [1, 2, 3, 4, 5, 6, 7, 8, 9]) + assert len(bin_thresholds) == 1 bin_thresholds = _find_binning_thresholds(data, max_bins=5) assert_allclose(bin_thresholds[0], [2, 4, 6, 8]) + assert len(bin_thresholds) == 1 def test_find_binning_thresholds_small_regular_data(): @@ -100,9 +102,9 @@ def test_bin_mapper_random_data(n_bins): assert_array_equal(binned.min(axis=0), np.array([0, 0])) assert_array_equal(binned.max(axis=0), np.array([n_bins - 1, n_bins - 1])) assert len(mapper.bin_thresholds_) == n_features - for i in range(len(mapper.bin_thresholds_)): - assert mapper.bin_thresholds_[i].shape == (n_bins - 1,) - assert mapper.bin_thresholds_[i].dtype == DATA.dtype + for bin_thresholds_feature in mapper.bin_thresholds_: + assert bin_thresholds_feature.shape == (n_bins - 1,) + assert bin_thresholds_feature.dtype == DATA.dtype assert np.all(mapper.n_bins_per_feature_ == n_bins) # Check that the binned data is approximately balanced across bins. 
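As a side note on the binning tests touched here: the bin lookup they exercise (`_map_num_col_to_bins` in binning.pyx) is a per-value binary search for the first threshold greater than or equal to the value, which is the same index `np.searchsorted` returns with side='left'. An illustrative check, not part of the patch:

    import numpy as np

    rng = np.random.RandomState(42)
    values = rng.uniform(0, 10, size=1000)
    thresholds = np.array([2.5, 5.0, 7.5])

    # index of the first threshold >= value; values larger than every
    # threshold fall into the last bin (index == len(thresholds))
    binned = np.searchsorted(thresholds, values, side='left').astype(np.uint8)

    assert binned.min() == 0
    assert binned.max() == len(thresholds)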
@@ -216,9 +218,6 @@ def test_subsample(): mapper_subsample = BinMapper(subsample=256, random_state=0).fit(DATA) for feature in range(DATA.shape[1]): - with pytest.raises(AssertionError): - np.testing.assert_array_almost_equal( - mapper_no_subsample.bin_thresholds_[feature], - mapper_subsample.bin_thresholds_[feature], - decimal=3 - ) + assert not np.allclose(mapper_no_subsample.bin_thresholds_[feature], + mapper_subsample.bin_thresholds_[feature], + rtol=1e-4) diff --git a/sklearn/_fast_gradient_boosting/tests/test_loss.py b/sklearn/_fast_gradient_boosting/tests/test_loss.py index c6bd7056eae1c..56a90166dbe9a 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_loss.py +++ b/sklearn/_fast_gradient_boosting/tests/test_loss.py @@ -57,7 +57,7 @@ def get_hessians(y_true, raw_predictions): reason='Newton internally uses float64 != Y_DTYPE') def test_derivatives(loss, x0, y_true): # Check that gradients are zero when the loss is minimized on 1D array - # using the Newton-Raphson and the first and second order derivatives + # using Halley's method with the first and second order derivatives # computed by the Loss instance. loss = _LOSSES[loss]() diff --git a/sklearn/_fast_gradient_boosting/tests/test_splitting.py b/sklearn/_fast_gradient_boosting/tests/test_splitting.py index a2ba8f1daa85f..5ea2a876e8e81 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_splitting.py +++ b/sklearn/_fast_gradient_boosting/tests/test_splitting.py @@ -61,8 +61,6 @@ def test_histogram_split(n_bins): def test_split_vs_split_subtraction(constant_hessian): # Make sure find_node_split and find_node_split_subtraction return the # same results. - # Should we add a test about computation time to make sure - # time(subtraction) < time(regular)? rng = np.random.RandomState(42) n_bins = 10 From 6266c6d038bba931744a468a16dc304388df8498 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 17 Feb 2019 09:29:12 -0500 Subject: [PATCH 120/247] Removed unused import in loss.pyx --- sklearn/_fast_gradient_boosting/loss.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_fast_gradient_boosting/loss.pyx b/sklearn/_fast_gradient_boosting/loss.pyx index 106ddc909ff3f..3a4fb5bb82fe7 100644 --- a/sklearn/_fast_gradient_boosting/loss.pyx +++ b/sklearn/_fast_gradient_boosting/loss.pyx @@ -20,7 +20,7 @@ try: except ImportError: from scipy.misc import logsumexp -from libc.math cimport fabs, exp, log +from libc.math cimport exp from .types import Y_DTYPE from .types cimport Y_DTYPE_C From e818f006ac49af9eea43458da8a778e7b3878a9c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 18 Feb 2019 12:49:18 -0500 Subject: [PATCH 121/247] use check_early_stopping insteaf of get_scores --- sklearn/_fast_gradient_boosting/gradient_boosting.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index e5ed4b6ec90b7..4b6b471a610c1 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -195,13 +195,11 @@ def fit(self, X, y): self.train_score_ = [] self.validation_score_ = [] if self.do_early_stopping_: - # Add predictions of the initial model (before the first tree) - self.train_score_.append( - self._get_scores(X_binned_small_train, y_small_train)) + # populate train_score and validation_score with the predictions + # of the initial model (before the first tree) + self._check_early_stopping(X_binned_small_train, 
y_small_train, + X_binned_val, y_val) - if self.validation_fraction is not None: - self.validation_score_.append( - self._get_scores(X_binned_val, y_val)) for iteration in range(self.n_estimators): From 2d76ad351cb5ccc253b88003cfc6687a6bb07f10 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 18 Feb 2019 17:14:34 -0500 Subject: [PATCH 122/247] Added XGBoost and CatBoost estimators in benchmarks --- benchmarks/bench_fast_gradient_boosting.py | 107 +++++++++++++-- ...bench_fast_gradient_boosting_higgsboson.py | 30 ++++- .../gradient_boosting.py | 1 - sklearn/_fast_gradient_boosting/utils.pyx | 124 ++++++++++++++---- 4 files changed, 215 insertions(+), 47 deletions(-) diff --git a/benchmarks/bench_fast_gradient_boosting.py b/benchmarks/bench_fast_gradient_boosting.py index 31b96182b8039..8faa72df32119 100644 --- a/benchmarks/bench_fast_gradient_boosting.py +++ b/benchmarks/bench_fast_gradient_boosting.py @@ -7,7 +7,7 @@ from sklearn.ensemble import FastGradientBoostingRegressor from sklearn.datasets import make_classification from sklearn.datasets import make_regression -from sklearn._fast_gradient_boosting.utils import get_lightgbm_estimator +from sklearn._fast_gradient_boosting.utils import get_equivalent_estimator parser = argparse.ArgumentParser() @@ -15,6 +15,10 @@ parser.add_argument('--n-trees', type=int, default=10) parser.add_argument('--lightgbm', action="store_true", default=False, help='also plot lightgbm') +parser.add_argument('--xgboost', action="store_true", default=False, + help='also plot xgboost') +parser.add_argument('--catboost', action="store_true", default=False, + help='also plot catboost') parser.add_argument('--learning-rate', type=float, default=.1) parser.add_argument('--problem', type=str, default='classification', choices=['classification', 'regression']) @@ -72,6 +76,9 @@ def one_run(n_samples): print("fit duration: {:.3f}s,".format(sklearn_fit_duration)) print("score duration: {:.3f}s,".format(sklearn_score_duration)) + lightgbm_score = None + lightgbm_fit_duration = None + lightgbm_score_duration = None if args.lightgbm: print("Fitting a LightGBM model...") # get_lightgbm does not accept loss='auto' @@ -79,7 +86,7 @@ def one_run(n_samples): loss = 'binary_crossentropy' if args.n_classes == 2 else \ 'categorical_crossentropy' est.set_params(loss=loss) - lightgbm_est = get_lightgbm_estimator(est) + lightgbm_est = get_equivalent_estimator(est, lib='lgbm') tic = time() lightgbm_est.fit(X_train, y_train) @@ -91,12 +98,54 @@ def one_run(n_samples): print("fit duration: {:.3f}s,".format(lightgbm_fit_duration)) print("score duration: {:.3f}s,".format(lightgbm_score_duration)) - return (sklearn_score, sklearn_fit_duration, sklearn_score_duration, - lightgbm_score, lightgbm_fit_duration, - lightgbm_score_duration) + xgb_score = None + xgb_fit_duration = None + xgb_score_duration = None + if args.xgboost: + print("Fitting an XGBoost model...") + # get_xgb does not accept loss='auto' + if args.problem == 'classification': + loss = 'binary_crossentropy' if args.n_classes == 2 else \ + 'categorical_crossentropy' + est.set_params(loss=loss) + xgb_est = get_equivalent_estimator(est, lib='xgb') + + tic = time() + xgb_est.fit(X_train, y_train) + xgb_fit_duration = time() - tic + tic = time() + xgb_score = xgb_est.score(X_test, y_test) + xgb_score_duration = time() - tic + print("score: {:.4f}".format(xgb_score)) + print("fit duration: {:.3f}s,".format(xgb_fit_duration)) + print("score duration: {:.3f}s,".format(xgb_score_duration)) + + cat_score = None + cat_fit_duration = 
None + cat_score_duration = None + if args.catboost: + print("Fitting a CatBoost model...") + # get_cat does not accept loss='auto' + if args.problem == 'classification': + loss = 'binary_crossentropy' if args.n_classes == 2 else \ + 'categorical_crossentropy' + est.set_params(loss=loss) + cat_est = get_equivalent_estimator(est, lib='cat') + + tic = time() + cat_est.fit(X_train, y_train) + cat_fit_duration = time() - tic + tic = time() + cat_score = cat_est.score(X_test, y_test) + cat_score_duration = time() - tic + print("score: {:.4f}".format(cat_score)) + print("fit duration: {:.3f}s,".format(cat_fit_duration)) + print("score duration: {:.3f}s,".format(cat_score_duration)) return (sklearn_score, sklearn_fit_duration, sklearn_score_duration, - None, None, None) + lightgbm_score, lightgbm_fit_duration, lightgbm_score_duration, + xgb_score, xgb_fit_duration, xgb_score_duration, + cat_score, cat_fit_duration, cat_score_duration) n_samples_list = [1000, 10000, 100000, 500000, 1000000, 5000000, 10000000] @@ -109,6 +158,12 @@ def one_run(n_samples): lightgbm_scores = [] lightgbm_fit_durations = [] lightgbm_score_durations = [] +xgb_scores = [] +xgb_fit_durations = [] +xgb_score_durations = [] +cat_scores = [] +cat_fit_durations = [] +cat_score_durations = [] for n_samples in n_samples_list: (sklearn_score, @@ -116,14 +171,28 @@ def one_run(n_samples): sklearn_score_duration, lightgbm_score, lightgbm_fit_duration, - lightgbm_score_duration) = one_run(n_samples) - - sklearn_scores.append(sklearn_score) - sklearn_fit_durations.append(sklearn_fit_duration) - sklearn_score_durations.append(sklearn_score_duration) - lightgbm_scores.append(lightgbm_score) - lightgbm_fit_durations.append(lightgbm_fit_duration) - lightgbm_score_durations.append(lightgbm_score_duration) + lightgbm_score_duration, + xgb_score, + xgb_fit_duration, + xgb_score_duration, + cat_score, + cat_fit_duration, + cat_score_duration) = one_run(n_samples) + + for scores, score in ( + (sklearn_scores, sklearn_score), + (sklearn_fit_durations, sklearn_fit_duration), + (sklearn_score_durations, sklearn_score_duration), + (lightgbm_scores, lightgbm_score), + (lightgbm_fit_durations, lightgbm_fit_duration), + (lightgbm_score_durations, lightgbm_score_duration), + (xgb_scores, xgb_score), + (xgb_fit_durations, xgb_fit_duration), + (xgb_score_durations, xgb_score_duration), + (cat_scores, cat_score), + (cat_fit_durations, cat_fit_duration), + (cat_score_durations, cat_score_duration)): + scores.append(score) fig, axs = plt.subplots(3, sharex=True) @@ -136,6 +205,16 @@ def one_run(n_samples): axs[1].plot(n_samples_list, lightgbm_fit_durations, label='lgbm') axs[2].plot(n_samples_list, lightgbm_score_durations, label='lgbm') +if args.xgboost: + axs[0].plot(n_samples_list, xgb_scores, label='XGBoost') + axs[1].plot(n_samples_list, xgb_fit_durations, label='XGBoost') + axs[2].plot(n_samples_list, xgb_score_durations, label='XGBoost') + +if args.catboost: + axs[0].plot(n_samples_list, cat_scores, label='CatBoost') + axs[1].plot(n_samples_list, cat_fit_durations, label='CatBoost') + axs[2].plot(n_samples_list, cat_score_durations, label='CatBoost') + for ax in axs: ax.set_xscale('log') ax.legend(loc='best') diff --git a/benchmarks/bench_fast_gradient_boosting_higgsboson.py b/benchmarks/bench_fast_gradient_boosting_higgsboson.py index 4305dc378074a..3e44cc8be570c 100644 --- a/benchmarks/bench_fast_gradient_boosting_higgsboson.py +++ b/benchmarks/bench_fast_gradient_boosting_higgsboson.py @@ -10,13 +10,15 @@ from sklearn.model_selection import 
train_test_split from sklearn.metrics import accuracy_score, roc_auc_score from sklearn.ensemble import FastGradientBoostingClassifier -from sklearn._fast_gradient_boosting.utils import get_lightgbm_estimator +from sklearn._fast_gradient_boosting.utils import get_equivalent_estimator parser = argparse.ArgumentParser() parser.add_argument('--n-leaf-nodes', type=int, default=31) parser.add_argument('--n-trees', type=int, default=10) parser.add_argument('--lightgbm', action="store_true", default=False) +parser.add_argument('--xgboost', action="store_true", default=False) +parser.add_argument('--catboost', action="store_true", default=False) parser.add_argument('--learning-rate', type=float, default=1.) parser.add_argument('--subsample', type=int, default=None) parser.add_argument('--max-bins', type=int, default=255) @@ -55,7 +57,7 @@ def load_data(): target = df.values[:, 0] data = np.ascontiguousarray(df.values[:, 1:]) data_train, data_test, target_train, target_test = train_test_split( - data, target, test_size=50000, random_state=0) + data, target, test_size=.2, random_state=0) if subsample is not None: data_train, target_train = data_train[:subsample], target_train[:subsample] @@ -84,10 +86,32 @@ def load_data(): if args.lightgbm: print("Fitting a LightGBM model...") tic = time() - lightgbm_est = get_lightgbm_estimator(est) + lightgbm_est = get_equivalent_estimator(est, lib='lgbm') lightgbm_est.fit(data_train, target_train) toc = time() predicted_test = lightgbm_est.predict(data_test) roc_auc = roc_auc_score(target_test, predicted_test) acc = accuracy_score(target_test, predicted_test) print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}") + +if args.xgboost: + print("Fitting an XGBoost model...") + tic = time() + xgboost_est = get_equivalent_estimator(est, lib='xgb') + xgboost_est.fit(data_train, target_train) + toc = time() + predicted_test = xgboost_est.predict(data_test) + roc_auc = roc_auc_score(target_test, predicted_test) + acc = accuracy_score(target_test, predicted_test) + print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}") + +if args.catboost: + print("Fitting a Catboost model...") + tic = time() + catboost_est = get_equivalent_estimator(est, lib='cat') + catboost_est.fit(data_train, target_train) + toc = time() + predicted_test = catboost_est.predict(data_test) + roc_auc = roc_auc_score(target_test, predicted_test) + acc = accuracy_score(target_test, predicted_test) + print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}") diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index 4b6b471a610c1..04c0be0882823 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -200,7 +200,6 @@ def fit(self, X, y): self._check_early_stopping(X_binned_small_train, y_small_train, X_binned_val, y_val) - for iteration in range(self.n_estimators): if self.verbose: diff --git a/sklearn/_fast_gradient_boosting/utils.pyx b/sklearn/_fast_gradient_boosting/utils.pyx index 98687ad20791b..96d874153fa66 100644 --- a/sklearn/_fast_gradient_boosting/utils.pyx +++ b/sklearn/_fast_gradient_boosting/utils.pyx @@ -9,67 +9,133 @@ from cython.parallel import prange from .binning import BinMapper from .types cimport G_H_DTYPE_C from .types cimport Y_DTYPE_C +from ..base import is_classifier -def get_lightgbm_estimator(pygbm_estimator): - """Return an unfitted LightGBM estimator with matching hyperparams. 
+def get_equivalent_estimator(estimator, lib='lgbm'): + """Return an unfitted estimator from another lib with matching hyperparams. - This utility function takes care of renaming the PyGBM parameters into - their LightGBM equivalent parameters. + This utility function takes care of renaming the sklearn parameters into + their LightGBM, XGBoost or CatBoost equivalent parameters. """ - from lightgbm import LGBMRegressor - from lightgbm import LGBMClassifier + # unmapped XGB parameters: + # - min_samples_leaf + # - min_data_in_bin + # - min_split_gain (there is min_split_loss though?) - # Import here to avoid cyclic dependencies - from .gradient_boosting import FastGradientBoostingClassifier + # unmapped Catboost parameters: + # max_leaves + # min_* - pygbm_params = pygbm_estimator.get_params() + if lib not in ('lgbm', 'xgb', 'cat'): + raise ValueError('accepted libs are lgbm, xgb, and cat. got ' + '{}'.format(lib)) - if pygbm_params['loss'] == 'auto': + sklearn_params = estimator.get_params() + + if sklearn_params['loss'] == 'auto': raise ValueError('auto loss is not accepted. We need to know if ' 'the problem is binary or multiclass classification.') - if pygbm_params['n_iter_no_change'] is not None: + if sklearn_params['n_iter_no_change'] is not None: raise NotImplementedError('Early stopping should be deactivated.') - loss_mapping = { + # LGBM + lgbm_loss_mapping = { 'least_squares': 'regression_l2', 'binary_crossentropy': 'binary', 'categorical_crossentropy': 'multiclass' } lgbm_params = { - 'objective': loss_mapping[pygbm_params['loss']], - 'learning_rate': pygbm_params['learning_rate'], - 'n_estimators': pygbm_params['n_estimators'], - 'num_leaves': pygbm_params['max_leaf_nodes'], - 'max_depth': pygbm_params['max_depth'], - 'min_child_samples': pygbm_params['min_samples_leaf'], - 'reg_lambda': pygbm_params['l2_regularization'], - 'max_bin': pygbm_params['max_bins'], + 'objective': lgbm_loss_mapping[sklearn_params['loss']], + 'learning_rate': sklearn_params['learning_rate'], + 'n_estimators': sklearn_params['n_estimators'], + 'num_leaves': sklearn_params['max_leaf_nodes'], + 'max_depth': sklearn_params['max_depth'], + 'min_child_samples': sklearn_params['min_samples_leaf'], + 'reg_lambda': sklearn_params['l2_regularization'], + 'max_bin': sklearn_params['max_bins'], 'min_data_in_bin': 1, 'min_child_weight': 1e-3, 'min_sum_hessian_in_leaf': 1e-3, 'min_split_gain': 0, - 'verbosity': 10 if pygbm_params['verbose'] else -10, + 'verbosity': 10 if sklearn_params['verbose'] else -10, 'boost_from_average': True, 'enable_bundle': False, # also makes feature order consistent 'min_data_in_bin': 1, 'subsample_for_bin': BinMapper().subsample, } - # TODO: change hardcoded values when / if they're arguments to the - # estimator. - if pygbm_params['loss'] == 'categorical_crossentropy': + if sklearn_params['loss'] == 'categorical_crossentropy': # LGBM multiplies hessians by 2 in multiclass loss. 
lgbm_params['min_sum_hessian_in_leaf'] *= 2 lgbm_params['learning_rate'] *= 2 - if isinstance(pygbm_estimator, FastGradientBoostingClassifier): - Est = LGBMClassifier - else: - Est = LGBMRegressor + # XGB + xgb_loss_mapping = { + 'least_squares': 'reg:linear', + 'binary_crossentropy': 'reg:logistic', + 'categorical_crossentropy': 'multi:softmax' + } + + xgb_params = { + 'tree_method': 'hist', + 'grow_policy': 'lossguide', # so that we can set max_leaves + 'objective': xgb_loss_mapping[sklearn_params['loss']], + 'learning_rate': sklearn_params['learning_rate'], + 'n_estimators': sklearn_params['n_estimators'], + 'max_leaves': sklearn_params['max_leaf_nodes'], + 'max_depth': sklearn_params['max_depth'] or 0, + 'lambda': sklearn_params['l2_regularization'], + 'max_bin': sklearn_params['max_bins'], + 'min_child_weight': 1e-3, + 'verbosity': 2 if sklearn_params['verbose'] else 0, + 'silent': sklearn_params['verbose'] == 0, + 'n_jobs': -1, + } - return Est(**lgbm_params) + # Catboost + cat_loss_mapping = { + 'least_squares': 'RMSE', + 'binary_crossentropy': 'Logloss', + 'categorical_crossentropy': 'MultiClass' + } + + cat_params = { + 'loss_function': cat_loss_mapping[sklearn_params['loss']], + 'learning_rate': sklearn_params['learning_rate'], + 'iterations': sklearn_params['n_estimators'], + 'depth': sklearn_params['max_depth'], + 'reg_lambda': sklearn_params['l2_regularization'], + 'max_bin': sklearn_params['max_bins'], + 'feature_border_type': 'Median', + 'leaf_estimation_method': 'Newton', + 'verbose': bool(sklearn_params['verbose']), + } + + if lib == 'lgbm': + from lightgbm import LGBMRegressor + from lightgbm import LGBMClassifier + if is_classifier(estimator): + return LGBMClassifier(**lgbm_params) + else: + return LGBMRegressor(**lgbm_params) + + elif lib == 'xgb': + from xgboost import XGBRegressor + from xgboost import XGBClassifier + if is_classifier(estimator): + return XGBClassifier(**xgb_params) + else: + return XGBRegressor(**xgb_params) + + else: + from catboost import CatBoostRegressor + from catboost import CatBoostClassifier + if is_classifier(estimator): + return CatBoostClassifier(**cat_params) + else: + return CatBoostRegressor(**cat_params) def sum_parallel(G_H_DTYPE_C [:] array): From 9717834a0bea04d62300a4010a0e3c4df0a3bed6 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 18 Feb 2019 19:42:34 -0500 Subject: [PATCH 123/247] Should fix tests --- .../tests/test_compare_lightgbm.py | 60 +++++++++---------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py b/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py index 887cf059dd2ff..8f5a821ff8b08 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py +++ b/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py @@ -7,7 +7,7 @@ from sklearn.ensemble import FastGradientBoostingRegressor from sklearn.ensemble import FastGradientBoostingClassifier from sklearn._fast_gradient_boosting.binning import BinMapper -from sklearn._fast_gradient_boosting.utils import get_lightgbm_estimator +from sklearn._fast_gradient_boosting.utils import get_equivalent_estimator pytest.importorskip("lightgbm") @@ -21,11 +21,11 @@ ]) def test_same_predictions_regression(seed, min_samples_leaf, n_samples, max_leaf_nodes): - # Make sure pygbm has the same predictions as LGBM for easy targets. + # Make sure sklearn has the same predictions as LGBM for easy targets. 
# # In particular when the size of the trees are bound and the number of # samples is large enough, the structure of the prediction trees found by - # LightGBM and PyGBM should be exactly identical. + # LightGBM and sklearn should be exactly identical. # # Notes: # - Several candidate splits may have equal gains when the number of @@ -59,7 +59,7 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples, n_iter_no_change=None, min_samples_leaf=min_samples_leaf, max_leaf_nodes=max_leaf_nodes) - est_lightgbm = get_lightgbm_estimator(est_sklearn) + est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lgbm') est_lightgbm.fit(X_train, y_train) est_sklearn.fit(X_train, y_train) @@ -104,7 +104,7 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples, X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) - est_pygbm = FastGradientBoostingClassifier( + est_sklearn = FastGradientBoostingClassifier( loss='binary_crossentropy', n_estimators=n_estimators, max_bins=max_bins, @@ -112,31 +112,31 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples, n_iter_no_change=None, min_samples_leaf=min_samples_leaf, max_leaf_nodes=max_leaf_nodes) - est_lightgbm = get_lightgbm_estimator(est_pygbm) + est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lgbm') est_lightgbm.fit(X_train, y_train) - est_pygbm.fit(X_train, y_train) + est_sklearn.fit(X_train, y_train) # We need X to be treated an numerical data, not pre-binned data. X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32) pred_lightgbm = est_lightgbm.predict(X_train) - pred_pygbm = est_pygbm.predict(X_train) - assert np.mean(pred_pygbm == pred_lightgbm) > .89 + pred_sklearn = est_sklearn.predict(X_train) + assert np.mean(pred_sklearn == pred_lightgbm) > .89 acc_lgbm = accuracy_score(y_train, pred_lightgbm) - acc_pygbm = accuracy_score(y_train, pred_pygbm) - np.testing.assert_almost_equal(acc_lgbm, acc_pygbm) + acc_sklearn = accuracy_score(y_train, pred_sklearn) + np.testing.assert_almost_equal(acc_lgbm, acc_sklearn) if max_leaf_nodes < 10 and n_samples >= 1000: pred_lightgbm = est_lightgbm.predict(X_test) - pred_pygbm = est_pygbm.predict(X_test) - assert np.mean(pred_pygbm == pred_lightgbm) > .89 + pred_sklearn = est_sklearn.predict(X_test) + assert np.mean(pred_sklearn == pred_lightgbm) > .89 acc_lgbm = accuracy_score(y_test, pred_lightgbm) - acc_pygbm = accuracy_score(y_test, pred_pygbm) - np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=2) + acc_sklearn = accuracy_score(y_test, pred_sklearn) + np.testing.assert_almost_equal(acc_lgbm, acc_sklearn, decimal=2) @pytest.mark.parametrize('seed', range(5)) @@ -166,7 +166,7 @@ def test_same_predictions_multiclass_classification( X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) - est_pygbm = FastGradientBoostingClassifier( + est_sklearn = FastGradientBoostingClassifier( loss='categorical_crossentropy', n_estimators=n_estimators, max_bins=max_bins, @@ -174,40 +174,40 @@ def test_same_predictions_multiclass_classification( n_iter_no_change=None, min_samples_leaf=min_samples_leaf, max_leaf_nodes=max_leaf_nodes) - est_lightgbm = get_lightgbm_estimator(est_pygbm) + est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lgbm') est_lightgbm.fit(X_train, y_train) - est_pygbm.fit(X_train, y_train) + est_sklearn.fit(X_train, y_train) # We need X to be treated an numerical data, not pre-binned data. 
X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32) pred_lightgbm = est_lightgbm.predict(X_train) - pred_pygbm = est_pygbm.predict(X_train) - assert np.mean(pred_pygbm == pred_lightgbm) > .89 + pred_sklearn = est_sklearn.predict(X_train) + assert np.mean(pred_sklearn == pred_lightgbm) > .89 proba_lightgbm = est_lightgbm.predict_proba(X_train) - proba_pygbm = est_pygbm.predict_proba(X_train) + proba_sklearn = est_sklearn.predict_proba(X_train) # assert more than 75% of the predicted probabilities are the same up to # the second decimal - assert np.mean(np.abs(proba_lightgbm - proba_pygbm) < 1e-2) > .75 + assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > .75 acc_lgbm = accuracy_score(y_train, pred_lightgbm) - acc_pygbm = accuracy_score(y_train, pred_pygbm) - np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=2) + acc_sklearn = accuracy_score(y_train, pred_sklearn) + np.testing.assert_almost_equal(acc_lgbm, acc_sklearn, decimal=2) if max_leaf_nodes < 10 and n_samples >= 1000: pred_lightgbm = est_lightgbm.predict(X_test) - pred_pygbm = est_pygbm.predict(X_test) - assert np.mean(pred_pygbm == pred_lightgbm) > .89 + pred_sklearn = est_sklearn.predict(X_test) + assert np.mean(pred_sklearn == pred_lightgbm) > .89 proba_lightgbm = est_lightgbm.predict_proba(X_train) - proba_pygbm = est_pygbm.predict_proba(X_train) + proba_sklearn = est_sklearn.predict_proba(X_train) # assert more than 75% of the predicted probabilities are the same up # to the second decimal - assert np.mean(np.abs(proba_lightgbm - proba_pygbm) < 1e-2) > .75 + assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > .75 acc_lgbm = accuracy_score(y_test, pred_lightgbm) - acc_pygbm = accuracy_score(y_test, pred_pygbm) - np.testing.assert_almost_equal(acc_lgbm, acc_pygbm, decimal=2) + acc_sklearn = accuracy_score(y_test, pred_sklearn) + np.testing.assert_almost_equal(acc_lgbm, acc_sklearn, decimal=2) From a83225e2c38553b9a3bc425877c43335cbdc1e20 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 20 Feb 2019 08:31:55 -0500 Subject: [PATCH 124/247] used lightgbm xgboost catboost full names --- benchmarks/bench_fast_gradient_boosting.py | 6 +++--- .../bench_fast_gradient_boosting_higgsboson.py | 6 +++--- .../tests/test_compare_lightgbm.py | 6 +++--- sklearn/_fast_gradient_boosting/utils.pyx | 12 ++++++------ 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/benchmarks/bench_fast_gradient_boosting.py b/benchmarks/bench_fast_gradient_boosting.py index 8faa72df32119..1f0898aa76365 100644 --- a/benchmarks/bench_fast_gradient_boosting.py +++ b/benchmarks/bench_fast_gradient_boosting.py @@ -86,7 +86,7 @@ def one_run(n_samples): loss = 'binary_crossentropy' if args.n_classes == 2 else \ 'categorical_crossentropy' est.set_params(loss=loss) - lightgbm_est = get_equivalent_estimator(est, lib='lgbm') + lightgbm_est = get_equivalent_estimator(est, lib='lightgbm') tic = time() lightgbm_est.fit(X_train, y_train) @@ -108,7 +108,7 @@ def one_run(n_samples): loss = 'binary_crossentropy' if args.n_classes == 2 else \ 'categorical_crossentropy' est.set_params(loss=loss) - xgb_est = get_equivalent_estimator(est, lib='xgb') + xgb_est = get_equivalent_estimator(est, lib='xgboost') tic = time() xgb_est.fit(X_train, y_train) @@ -130,7 +130,7 @@ def one_run(n_samples): loss = 'binary_crossentropy' if args.n_classes == 2 else \ 'categorical_crossentropy' est.set_params(loss=loss) - cat_est = get_equivalent_estimator(est, lib='cat') + cat_est = get_equivalent_estimator(est, lib='catboost') tic = 
time() cat_est.fit(X_train, y_train) diff --git a/benchmarks/bench_fast_gradient_boosting_higgsboson.py b/benchmarks/bench_fast_gradient_boosting_higgsboson.py index 3e44cc8be570c..e37341d208078 100644 --- a/benchmarks/bench_fast_gradient_boosting_higgsboson.py +++ b/benchmarks/bench_fast_gradient_boosting_higgsboson.py @@ -86,7 +86,7 @@ def load_data(): if args.lightgbm: print("Fitting a LightGBM model...") tic = time() - lightgbm_est = get_equivalent_estimator(est, lib='lgbm') + lightgbm_est = get_equivalent_estimator(est, lib='lightgbm') lightgbm_est.fit(data_train, target_train) toc = time() predicted_test = lightgbm_est.predict(data_test) @@ -97,7 +97,7 @@ def load_data(): if args.xgboost: print("Fitting an XGBoost model...") tic = time() - xgboost_est = get_equivalent_estimator(est, lib='xgb') + xgboost_est = get_equivalent_estimator(est, lib='xgboost') xgboost_est.fit(data_train, target_train) toc = time() predicted_test = xgboost_est.predict(data_test) @@ -108,7 +108,7 @@ def load_data(): if args.catboost: print("Fitting a Catboost model...") tic = time() - catboost_est = get_equivalent_estimator(est, lib='cat') + catboost_est = get_equivalent_estimator(est, lib='catboost') catboost_est.fit(data_train, target_train) toc = time() predicted_test = catboost_est.predict(data_test) diff --git a/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py b/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py index 8f5a821ff8b08..8faa1e2b46780 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py +++ b/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py @@ -59,7 +59,7 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples, n_iter_no_change=None, min_samples_leaf=min_samples_leaf, max_leaf_nodes=max_leaf_nodes) - est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lgbm') + est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm') est_lightgbm.fit(X_train, y_train) est_sklearn.fit(X_train, y_train) @@ -112,7 +112,7 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples, n_iter_no_change=None, min_samples_leaf=min_samples_leaf, max_leaf_nodes=max_leaf_nodes) - est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lgbm') + est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm') est_lightgbm.fit(X_train, y_train) est_sklearn.fit(X_train, y_train) @@ -174,7 +174,7 @@ def test_same_predictions_multiclass_classification( n_iter_no_change=None, min_samples_leaf=min_samples_leaf, max_leaf_nodes=max_leaf_nodes) - est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lgbm') + est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm') est_lightgbm.fit(X_train, y_train) est_sklearn.fit(X_train, y_train) diff --git a/sklearn/_fast_gradient_boosting/utils.pyx b/sklearn/_fast_gradient_boosting/utils.pyx index 96d874153fa66..e7aa0c1b7ba61 100644 --- a/sklearn/_fast_gradient_boosting/utils.pyx +++ b/sklearn/_fast_gradient_boosting/utils.pyx @@ -12,7 +12,7 @@ from .types cimport Y_DTYPE_C from ..base import is_classifier -def get_equivalent_estimator(estimator, lib='lgbm'): +def get_equivalent_estimator(estimator, lib='lightgbm'): """Return an unfitted estimator from another lib with matching hyperparams. This utility function takes care of renaming the sklearn parameters into @@ -27,9 +27,9 @@ def get_equivalent_estimator(estimator, lib='lgbm'): # max_leaves # min_* - if lib not in ('lgbm', 'xgb', 'cat'): - raise ValueError('accepted libs are lgbm, xgb, and cat. 
got ' - '{}'.format(lib)) + if lib not in ('lightgbm', 'xgboost', 'catboost'): + raise ValueError('accepted libs are lightgbm, xgboost, and catboost. ' + ' got {}'.format(lib)) sklearn_params = estimator.get_params() @@ -113,7 +113,7 @@ def get_equivalent_estimator(estimator, lib='lgbm'): 'verbose': bool(sklearn_params['verbose']), } - if lib == 'lgbm': + if lib == 'lightgbm': from lightgbm import LGBMRegressor from lightgbm import LGBMClassifier if is_classifier(estimator): @@ -121,7 +121,7 @@ def get_equivalent_estimator(estimator, lib='lgbm'): else: return LGBMRegressor(**lgbm_params) - elif lib == 'xgb': + elif lib == 'xgboost': from xgboost import XGBRegressor from xgboost import XGBClassifier if is_classifier(estimator): From b9a151a6c5657e2c25fca5147143f2a5ae45cd26 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 26 Feb 2019 04:50:19 -0500 Subject: [PATCH 125/247] Addressed Adrin's comments: - added author name - split loss, binning and predictor into .py and .pyx files (requires creating dummy wrappers that can be called from python and that just call the cdef parallel code) --- build_tools/travis/install.sh | 2 - setup.py | 6 +- sklearn/_fast_gradient_boosting/_binning.pyx | 58 ++++++++ .../_gradient_boosting.pyx | 2 + sklearn/_fast_gradient_boosting/_loss.pyx | 139 ++++++++++++++++++ .../{predictor.pyx => _predictor.pyx} | 113 ++++---------- .../{binning.pyx => binning.py} | 69 ++------- .../gradient_boosting.py | 2 + sklearn/_fast_gradient_boosting/grower.py | 2 + sklearn/_fast_gradient_boosting/histogram.pyx | 2 + .../{loss.pyx => loss.py} | 114 ++------------ sklearn/_fast_gradient_boosting/predictor.py | 80 ++++++++++ sklearn/_fast_gradient_boosting/setup.py | 12 +- sklearn/_fast_gradient_boosting/splitting.pyx | 2 + sklearn/_fast_gradient_boosting/utils.pyx | 1 + 15 files changed, 340 insertions(+), 264 deletions(-) create mode 100644 sklearn/_fast_gradient_boosting/_binning.pyx create mode 100644 sklearn/_fast_gradient_boosting/_loss.pyx rename sklearn/_fast_gradient_boosting/{predictor.pyx => _predictor.pyx} (55%) rename sklearn/_fast_gradient_boosting/{binning.pyx => binning.py} (71%) rename sklearn/_fast_gradient_boosting/{loss.pyx => loss.py} (69%) create mode 100644 sklearn/_fast_gradient_boosting/predictor.py diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index a4f1734b3f90b..110a8661ed7c0 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -54,8 +54,6 @@ make_conda() { if [ $TRAVIS_OS_NAME = "osx" ] then fname=Miniconda3-latest-MacOSX-x86_64.sh - # we need to install a version on clang which supports OpenMP - TO_INSTALL="$TO_INSTALL llvm-openmp clang" else fname=Miniconda3-latest-Linux-x86_64.sh fi diff --git a/setup.py b/setup.py index a7646b53aceec..645db95120637 100755 --- a/setup.py +++ b/setup.py @@ -128,9 +128,9 @@ def get_openmp_flag(compiler): OPENMP_EXTENSIONS = [ "sklearn._fast_gradient_boosting._gradient_boosting", "sklearn._fast_gradient_boosting.splitting", - "sklearn._fast_gradient_boosting.binning", - "sklearn._fast_gradient_boosting.predictor", - "sklearn._fast_gradient_boosting.loss", + "sklearn._fast_gradient_boosting._binning", + "sklearn._fast_gradient_boosting._predictor", + "sklearn._fast_gradient_boosting._loss", ] diff --git a/sklearn/_fast_gradient_boosting/_binning.pyx b/sklearn/_fast_gradient_boosting/_binning.pyx new file mode 100644 index 0000000000000..711cdf99697a9 --- /dev/null +++ b/sklearn/_fast_gradient_boosting/_binning.pyx @@ -0,0 +1,58 @@ +# cython: cdivision=True +# 
cython: boundscheck=False +# cython: wraparound=False +# cython: nonecheck=False +# cython: language_level=3 + +# Author: Nicolas Hug + +cimport cython + +import numpy as np +cimport numpy as np +from cython.parallel import prange + +from .types cimport X_DTYPE_C, X_BINNED_DTYPE_C + +cpdef _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds, + X_BINNED_DTYPE_C [::1, :] binned): + """Bin numerical values to discrete integer-coded levels. + + Parameters + ---------- + data : array-like, shape=(n_samples, n_features) + The numerical data to bin. + binning_thresholds : tuple of arrays + For each feature, stores the increasing numeric values that are + used to separate the bins. + binned : array-like, shape=(n_samples, n_features) + Output array, must be fortran aligned. + """ + cdef: + int feature_idx + + for feature_idx in range(data.shape[1]): + _map_num_col_to_bins(data[:, feature_idx], + binning_thresholds[feature_idx], + binned[:, feature_idx]) + + +cpdef void _map_num_col_to_bins(const X_DTYPE_C [:] data, + const X_DTYPE_C [:] binning_thresholds, + X_BINNED_DTYPE_C [:] binned): + """Binary search to the find the bin index for each value in data.""" + cdef: + int i + int left + int right + int middle + + for i in prange(data.shape[0], schedule='static', nogil=True): + left, right = 0, binning_thresholds.shape[0] + while left < right: + middle = (right + left - 1) // 2 + if data[i] <= binning_thresholds[middle]: + right = middle + else: + left = middle + 1 + binned[i] = left diff --git a/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx b/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx index 3c2d35314468a..ed4e85344e697 100644 --- a/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx +++ b/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx @@ -3,6 +3,8 @@ # cython: wraparound=False # cython: language_level=3 +# Author: Nicolas Hug + cimport cython from cython.parallel import prange import numpy as np diff --git a/sklearn/_fast_gradient_boosting/_loss.pyx b/sklearn/_fast_gradient_boosting/_loss.pyx new file mode 100644 index 0000000000000..eb8ef530a610c --- /dev/null +++ b/sklearn/_fast_gradient_boosting/_loss.pyx @@ -0,0 +1,139 @@ +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False +# cython: language_level=3 + +# Author: Nicolas Hug + +cimport cython +from cython.parallel import prange +import numpy as np +cimport numpy as np +from scipy.special import expit +try: + from scipy.special import logsumexp +except ImportError: + from scipy.misc import logsumexp + +from libc.math cimport exp + +from .types cimport Y_DTYPE_C +from .types cimport G_H_DTYPE_C + + +def _update_gradients_least_squares( + G_H_DTYPE_C [::1] gradients, + const Y_DTYPE_C [::1] y_true, + const Y_DTYPE_C [::1] raw_predictions): + + _update_gradients_least_squares_parallel( + gradients, y_true, raw_predictions) + + +def _update_gradients_hessians_binary_crossentropy( + G_H_DTYPE_C [::1] gradients, + G_H_DTYPE_C [::1] hessians, + const Y_DTYPE_C [::1] y_true, + const Y_DTYPE_C [::1] raw_predictions): + + _update_gradients_hessians_binary_crossentropy_parallel( + gradients, hessians, y_true, raw_predictions) + + +def _update_gradients_hessians_categorical_crossentropy( + G_H_DTYPE_C [:, ::1] gradients, + G_H_DTYPE_C [:, ::1] hessians, + const Y_DTYPE_C [::1] y_true, + const Y_DTYPE_C [:, ::1] raw_predictions): + _update_gradients_hessians_categorical_crossentropy_parallel( + gradients, hessians, y_true, raw_predictions) + + +cdef void 
_update_gradients_least_squares_parallel( + G_H_DTYPE_C [::1] gradients, + const Y_DTYPE_C [::1] y_true, + const Y_DTYPE_C [::1] raw_predictions): + cdef: + int n_samples + int i + + n_samples = raw_predictions.shape[0] + for i in prange(n_samples, schedule='static', nogil=True): + # Note: a more correct exp is 2 * (raw_predictions - y_true) but + # since we use 1 for the constant hessian value (and not 2) this + # is strictly equivalent for the leaves values. + gradients[i] = raw_predictions[i] - y_true[i] + + +cdef void _update_gradients_hessians_binary_crossentropy_parallel( + G_H_DTYPE_C [::1] gradients, + G_H_DTYPE_C [::1] hessians, + const Y_DTYPE_C [::1] y_true, + const Y_DTYPE_C [::1] raw_predictions): + cdef: + int n_samples + Y_DTYPE_C p_i # proba that ith sample belongs to positive class + int i + + n_samples = raw_predictions.shape[0] + for i in prange(n_samples, schedule='static', nogil=True): + p_i = cexpit(raw_predictions[i]) + gradients[i] = p_i - y_true[i] + hessians[i] = p_i * (1. - p_i) + + +cdef void _update_gradients_hessians_categorical_crossentropy_parallel( + G_H_DTYPE_C [:, ::1] gradients, # shape (pred_dim, n_samples), OUT + G_H_DTYPE_C [:, ::1] hessians, # shape (pred_dim, n_samples), OUT + const Y_DTYPE_C [::1] y_true, # shape (n_samples,), IN + # shape (pred_dim, n_samples), IN + const Y_DTYPE_C [:, ::1] raw_predictions): + cdef: + int prediction_dim = raw_predictions.shape[0] + int n_samples = raw_predictions.shape[1] + int k # class index + int i # sample index + # p[i, k] is the probability that class(ith sample) == k. + # It's the softmax of the raw predictions + Y_DTYPE_C [:, ::1] p = np.empty(shape=(n_samples, prediction_dim)) + Y_DTYPE_C p_i_k + + for i in prange(n_samples, schedule='static', nogil=True): + # first compute softmaxes of sample i for each class + for k in range(prediction_dim): + p[i, k] = raw_predictions[k, i] # prepare softmax + compute_softmax(p, i) + # then update gradients and hessians + for k in range(prediction_dim): + p_i_k = p[i, k] + gradients[k, i] = p_i_k - (y_true[i] == k) + hessians[k, i] = p_i_k * (1. - p_i_k) + + +cdef inline void compute_softmax(Y_DTYPE_C [:, ::1] p, const int i) nogil: + """Compute softmaxes of values in p[i, :].""" + # i needs to be passed (and stays constant) because otherwise Cython does + # not generate optimal code + + cdef: + Y_DTYPE_C max_value = p[i, 0] + Y_DTYPE_C sum_exps = 0. + unsigned int k + unsigned prediction_dim = p.shape[1] + + # Compute max value of array for numerical stability + for k in range(1, prediction_dim): + if max_value < p[i, k]: + max_value = p[i, k] + + for k in range(prediction_dim): + p[i, k] = exp(p[i, k] - max_value) + sum_exps += p[i, k] + + for k in range(prediction_dim): + p[i, k] /= sum_exps + + +cdef inline Y_DTYPE_C cexpit(const Y_DTYPE_C x) nogil: + """Custom expit (logistic sigmoid function)""" + return 1. / (1. + exp(-x)) diff --git a/sklearn/_fast_gradient_boosting/predictor.pyx b/sklearn/_fast_gradient_boosting/_predictor.pyx similarity index 55% rename from sklearn/_fast_gradient_boosting/predictor.pyx rename to sklearn/_fast_gradient_boosting/_predictor.pyx index 6c8aa850a8d5f..45ba70095c3c7 100644 --- a/sklearn/_fast_gradient_boosting/predictor.pyx +++ b/sklearn/_fast_gradient_boosting/_predictor.pyx @@ -2,36 +2,19 @@ # cython: boundscheck=False # cython: wraparound=False # cython: language_level=3 -""" -This module contains the TreePredictor class which is used for prediction. 
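[Editor's aside, not part of the patch: the gradient/hessian kernels just above reduce to a few lines of NumPy/SciPy. The sketch below shows the binary cross-entropy update (p = expit(raw), g = p - y, h = p(1 - p)) and the numerically stable softmax that compute_softmax performs per sample; the toy arrays are made up.]

    import numpy as np
    from scipy.special import expit   # cexpit() is the same logistic sigmoid

    rng = np.random.RandomState(0)
    y_true = rng.randint(0, 2, size=8).astype(np.float64)
    raw_predictions = rng.normal(size=8)

    # Binary cross-entropy: p is the predicted probability of the positive class
    p = expit(raw_predictions)
    gradients = p - y_true
    hessians = p * (1. - p)

    # Stable softmax over one sample's raw predictions, as compute_softmax does
    raw_one_sample = np.array([1.5, -0.2, 0.3])        # 3 classes, made-up values
    shifted = raw_one_sample - raw_one_sample.max()    # subtract max for stability
    p_multiclass = np.exp(shifted) / np.exp(shifted).sum()
    assert np.isclose(p_multiclass.sum(), 1.0)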
-""" + +# Author: Nicolas Hug + cimport cython from cython.parallel import prange import numpy as np cimport numpy as np -from .types import X_DTYPE from .types cimport X_DTYPE_C -from .types import Y_DTYPE from .types cimport Y_DTYPE_C -from .types import X_BINNED_DTYPE from .types cimport X_BINNED_DTYPE_C -PREDICTOR_RECORD_DTYPE = np.dtype([ - ('value', Y_DTYPE), - ('count', np.uint32), - ('feature_idx', np.uint32), - ('threshold', X_DTYPE), - ('left', np.uint32), - ('right', np.uint32), - ('gain', Y_DTYPE), - ('depth', np.uint32), - ('is_leaf', np.uint8), - ('bin_threshold', X_BINNED_DTYPE), -]) - - cdef packed struct node_struct: # Equivalent struct to PREDICTOR_RECORD_DTYPE to use in memory views. It # needs to be packed since by default numpy dtypes aren't aligned @@ -47,58 +30,24 @@ cdef packed struct node_struct: X_BINNED_DTYPE_C bin_threshold -class TreePredictor: - """Tree class used for predictions. - - Parameters - ---------- - nodes : list of PREDICTOR_RECORD_DTYPE. - The nodes of the tree. - """ - def __init__(self, nodes): - self.nodes = nodes - - def get_n_leaf_nodes(self): - """Return number of leaves.""" - return int(self.nodes['is_leaf'].sum()) - - def get_max_depth(self): - """Return maximum depth among all leaves.""" - return int(self.nodes['depth'].max()) - - def predict(self, X): - """Predict raw values for non-binned data. - - Parameters - ---------- - X : array-like, shape=(n_samples, n_features) - The input samples. - - Returns - ------- - y : array, shape (n_samples,) - The raw predicted values. - """ - out = np.empty(X.shape[0], dtype=Y_DTYPE) - _predict_from_numeric_data(self.nodes, X, out) - return out - - def predict_binned(self, X): - """Predict raw values for binned data. - - Parameters - ---------- - X : array-like, shape=(n_samples, n_features) - The input samples. - - Returns - ------- - y : array, shape (n_samples,) - The raw predicted values. 
- """ - out = np.empty(X.shape[0], dtype=Y_DTYPE) - _predict_from_binned_data(self.nodes, X, out) - return out +def _predict_from_numeric_data(nodes, numeric_data, out): + _predict_from_numeric_data_parallel(nodes, numeric_data, out) + + +def _predict_from_binned_data(nodes, binned_data, out): + _predict_from_binned_data_parallel(nodes, binned_data, out) + + +cdef void _predict_from_numeric_data_parallel( + node_struct [:] nodes, + const X_DTYPE_C [:, :] numeric_data, + Y_DTYPE_C [:] out): + + cdef: + int i + + for i in prange(numeric_data.shape[0], schedule='static', nogil=True): + out[i] = _predict_one_from_numeric_data(nodes, numeric_data, i) cdef inline Y_DTYPE_C _predict_one_from_numeric_data( @@ -120,16 +69,16 @@ cdef inline Y_DTYPE_C _predict_one_from_numeric_data( node = nodes[node.right] -cdef void _predict_from_numeric_data( +cdef void _predict_from_binned_data_parallel( node_struct [:] nodes, - const X_DTYPE_C [:, :] numeric_data, + const X_BINNED_DTYPE_C [:, :] binned_data, Y_DTYPE_C [:] out): cdef: int i - for i in prange(numeric_data.shape[0], schedule='static', nogil=True): - out[i] = _predict_one_from_numeric_data(nodes, numeric_data, i) + for i in prange(binned_data.shape[0], schedule='static', nogil=True): + out[i] = _predict_one_from_binned_data(nodes, binned_data, i) cdef inline Y_DTYPE_C _predict_one_from_binned_data( @@ -149,15 +98,3 @@ cdef inline Y_DTYPE_C _predict_one_from_binned_data( node = nodes[node.left] else: node = nodes[node.right] - - -cdef void _predict_from_binned_data( - node_struct [:] nodes, - const X_BINNED_DTYPE_C [:, :] binned_data, - Y_DTYPE_C [:] out): - - cdef: - int i - - for i in prange(binned_data.shape[0], schedule='static', nogil=True): - out[i] = _predict_one_from_binned_data(nodes, binned_data, i) diff --git a/sklearn/_fast_gradient_boosting/binning.pyx b/sklearn/_fast_gradient_boosting/binning.py similarity index 71% rename from sklearn/_fast_gradient_boosting/binning.pyx rename to sklearn/_fast_gradient_boosting/binning.py index 83ed001a19e8e..d200bf9210208 100644 --- a/sklearn/_fast_gradient_boosting/binning.pyx +++ b/sklearn/_fast_gradient_boosting/binning.py @@ -1,31 +1,24 @@ -# cython: cdivision=True -# cython: boundscheck=False -# cython: wraparound=False -# cython: nonecheck=False -# cython: language_level=3 """ This module contains the BinMapper class. -BinMapper is used for mapping a real-valued dataset into integer-valued bins -with equally-spaced thresholds. +BinMapper is used for mapping a real-valued dataset into integer-valued bins. +Bin thresholds are computed with the quantiles so that each bin contains +approximately the same number of samples. """ -cimport cython +# Author: Nicolas Hug import numpy as np -cimport numpy as np -from cython.parallel import prange from ..utils import check_random_state, check_array -from ..utils.validation import check_is_fitted from ..base import BaseEstimator, TransformerMixin +from ..utils.validation import check_is_fitted +from ._binning import _map_to_bins from .types import X_DTYPE, X_BINNED_DTYPE -from .types cimport X_DTYPE_C, X_BINNED_DTYPE_C def _find_binning_thresholds(data, max_bins=256, subsample=int(2e5), random_state=None): - """Extract feature-wise equally-spaced quantiles from numerical data - + """Extract feature-wise quantiles from numerical data. 
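[Editor's aside, not part of the patch: a rough NumPy equivalent of what _find_binning_thresholds plus the binary search in _binning.pyx compute, ignoring subsampling, dtype handling and the small-cardinality midpoint path; the toy column `col` is made up.]

    import numpy as np

    max_bins = 256
    rng = np.random.RandomState(0)
    col = rng.normal(size=1000)                     # toy feature column

    # Thresholds at the (max_bins - 1) inner quantiles of the column
    percentiles = np.linspace(0, 100, num=max_bins + 1)[1:-1]
    thresholds = np.percentile(col, percentiles, interpolation='midpoint')

    # The Cython kernel does a per-value binary search; for increasing
    # thresholds this is equivalent to a left-sided searchsorted.
    binned = np.searchsorted(thresholds, col, side='left').astype(np.uint8)
    assert binned.max() <= max_bins - 1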
Return ------ @@ -64,55 +57,11 @@ def _find_binning_thresholds(data, max_bins=256, subsample=int(2e5), return binning_thresholds -cpdef _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds, - X_BINNED_DTYPE_C [::1, :] binned): - """Bin numerical values to discrete integer-coded levels. - - Parameters - ---------- - data : array-like, shape=(n_samples, n_features) - The numerical data to bin. - binning_thresholds : tuple of arrays - For each feature, stores the increasing numeric values that are - used to separate the bins. - binned : array-like, shape=(n_samples, n_features) - Output array, must be fortran aligned. - """ - cdef: - int feature_idx - - for feature_idx in range(data.shape[1]): - _map_num_col_to_bins(data[:, feature_idx], - binning_thresholds[feature_idx], - binned[:, feature_idx]) - - -cpdef void _map_num_col_to_bins(const X_DTYPE_C [:] data, - const X_DTYPE_C [:] binning_thresholds, - X_BINNED_DTYPE_C [:] binned): - """Binary search to the find the bin index for each value in data.""" - cdef: - int i - int left - int right - int middle - - for i in prange(data.shape[0], schedule='static', nogil=True): - left, right = 0, binning_thresholds.shape[0] - while left < right: - middle = (right + left - 1) // 2 - if data[i] <= binning_thresholds[middle]: - right = middle - else: - left = middle + 1 - binned[i] = left - - class BinMapper(BaseEstimator, TransformerMixin): """Transformer that maps a dataset into integer-valued bins. - The bins are created in a feature-wise fashion, with equally-spaced - quantiles. + The bins are created in a feature-wise fashion, using quantiles so that + each bins contains approximately the same number of samples. Large datasets are subsampled, but the feature-wise quantiles should remain stable. diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index 04c0be0882823..66e68b1c00523 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -1,4 +1,6 @@ """Fast Gradient Boosting decision trees for classification and regression.""" +# Author: Nicolas Hug + from abc import ABC, abstractmethod import numpy as np diff --git a/sklearn/_fast_gradient_boosting/grower.py b/sklearn/_fast_gradient_boosting/grower.py index 8efacde5d2b8b..9e97fcfd46fff 100644 --- a/sklearn/_fast_gradient_boosting/grower.py +++ b/sklearn/_fast_gradient_boosting/grower.py @@ -4,6 +4,8 @@ TreeGrowee builds a regression tree fitting a Newton-Raphson step, based on the gradients and hessians of the training data. """ +# Author: Nicolas Hug + from heapq import heappush, heappop import numpy as np from timeit import default_timer as time diff --git a/sklearn/_fast_gradient_boosting/histogram.pyx b/sklearn/_fast_gradient_boosting/histogram.pyx index e0a6d6841dcff..3768b2738f256 100644 --- a/sklearn/_fast_gradient_boosting/histogram.pyx +++ b/sklearn/_fast_gradient_boosting/histogram.pyx @@ -8,6 +8,8 @@ A histogram is an array with n_bins entry of type HISTOGRAM_DTYPE. Each feature has its own histogram. A histogram contains the sum of gradients and hessians of all the samples belonging to each bin. 
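[Editor's aside, not part of the patch: schematically, the per-feature histograms described in histogram.pyx can be pictured with the NumPy sketch below. It is an illustration only; the real kernels are parallel Cython code operating on HISTOGRAM_DTYPE records. The toy arrays `binned_col`, `gradients` and `hessians` are made up.]

    import numpy as np

    n_bins = 256
    rng = np.random.RandomState(0)
    binned_col = rng.randint(0, n_bins, size=1000).astype(np.uint8)  # one binned feature
    gradients = rng.normal(size=1000).astype(np.float32)
    hessians = np.ones(1000, dtype=np.float32)

    # For each bin: sum of gradients, sum of hessians, sample count
    sum_gradients = np.bincount(binned_col, weights=gradients, minlength=n_bins)
    sum_hessians = np.bincount(binned_col, weights=hessians, minlength=n_bins)
    counts = np.bincount(binned_col, minlength=n_bins)
    assert counts.sum() == binned_col.shape[0]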
""" +# Author: Nicolas Hug + cimport cython import numpy as np diff --git a/sklearn/_fast_gradient_boosting/loss.pyx b/sklearn/_fast_gradient_boosting/loss.py similarity index 69% rename from sklearn/_fast_gradient_boosting/loss.pyx rename to sklearn/_fast_gradient_boosting/loss.py index 3a4fb5bb82fe7..7f7334ae141ed 100644 --- a/sklearn/_fast_gradient_boosting/loss.pyx +++ b/sklearn/_fast_gradient_boosting/loss.py @@ -1,31 +1,25 @@ -# cython: cdivision=True -# cython: boundscheck=False -# cython: wraparound=False -# cython: language_level=3 """ This module contains the loss classes. Specific losses are used for regression, binary classification or multiclass classification. """ +# Author: Nicolas Hug + from abc import ABC, abstractmethod -cimport cython -from cython.parallel import prange import numpy as np -cimport numpy as np from scipy.special import expit try: from scipy.special import logsumexp except ImportError: from scipy.misc import logsumexp -from libc.math cimport exp - from .types import Y_DTYPE -from .types cimport Y_DTYPE_C from .types import G_H_DTYPE -from .types cimport G_H_DTYPE_C +from ._loss import _update_gradients_least_squares +from ._loss import _update_gradients_hessians_binary_crossentropy +from ._loss import _update_gradients_hessians_categorical_crossentropy class BaseLoss(ABC): @@ -140,24 +134,8 @@ def update_gradients_and_hessians(self, gradients, hessians, y_true, # return a view. raw_predictions = raw_predictions.reshape(-1) gradients = gradients.reshape(-1) - return _update_gradients_least_squares(gradients, y_true, - raw_predictions) - - -cdef void _update_gradients_least_squares( - G_H_DTYPE_C [::1] gradients, - const Y_DTYPE_C [::1] y_true, - const Y_DTYPE_C [::1] raw_predictions): - cdef: - int n_samples - int i - - n_samples = raw_predictions.shape[0] - for i in prange(n_samples, schedule='static', nogil=True): - # Note: a more correct exp is 2 * (raw_predictions - y_true) but - # since we use 1 for the constant hessian value (and not 2) this - # is strictly equivalent for the leaves values. - gradients[i] = raw_predictions[i] - y_true[i] + _update_gradients_least_squares(gradients, y_true, + raw_predictions) class BinaryCrossEntropy(BaseLoss): @@ -197,7 +175,7 @@ def update_gradients_and_hessians(self, gradients, hessians, y_true, raw_predictions = raw_predictions.reshape(-1) gradients = gradients.reshape(-1) hessians = hessians.reshape(-1) - return _update_gradients_hessians_binary_crossentropy( + _update_gradients_hessians_binary_crossentropy( gradients, hessians, y_true, raw_predictions) def predict_proba(self, raw_predictions): @@ -210,23 +188,6 @@ def predict_proba(self, raw_predictions): return proba -cdef void _update_gradients_hessians_binary_crossentropy( - G_H_DTYPE_C [::1] gradients, - G_H_DTYPE_C [::1] hessians, - const Y_DTYPE_C [::1] y_true, - const Y_DTYPE_C [::1] raw_predictions): - cdef: - int n_samples - Y_DTYPE_C p_i # proba that ith sample belongs to positive class - int i - - n_samples = raw_predictions.shape[0] - for i in prange(n_samples, schedule='static', nogil=True): - p_i = cexpit(raw_predictions[i]) - gradients[i] = p_i - y_true[i] - hessians[i] = p_i * (1. - p_i) - - class CategoricalCrossEntropy(BaseLoss): """Categorical cross-entropy loss, for multiclass classification. 
@@ -259,7 +220,7 @@ def get_baseline_prediction(self, y_train, prediction_dim): def update_gradients_and_hessians(self, gradients, hessians, y_true, raw_predictions): - return _update_gradients_hessians_categorical_crossentropy( + _update_gradients_hessians_categorical_crossentropy( gradients, hessians, y_true, raw_predictions) def predict_proba(self, raw_predictions): @@ -270,63 +231,6 @@ def predict_proba(self, raw_predictions): return proba.T -cdef void _update_gradients_hessians_categorical_crossentropy( - G_H_DTYPE_C [:, ::1] gradients, # shape (pred_dim, n_samples), OUT - G_H_DTYPE_C [:, ::1] hessians, # shape (pred_dim, n_samples), OUT - const Y_DTYPE_C [::1] y_true, # shape (n_samples,), IN - # shape (pred_dim, n_samples), IN - const Y_DTYPE_C [:, ::1] raw_predictions): - cdef: - int prediction_dim = raw_predictions.shape[0] - int n_samples = raw_predictions.shape[1] - int k # class index - int i # sample index - # p[i, k] is the probability that class(ith sample) == k. - # It's the softmax of the raw predictions - Y_DTYPE_C [:, ::1] p = np.empty(shape=(n_samples, prediction_dim)) - Y_DTYPE_C p_i_k - - for i in prange(n_samples, schedule='static', nogil=True): - # first compute softmaxes of sample i for each class - for k in range(prediction_dim): - p[i, k] = raw_predictions[k, i] # prepare softmax - compute_softmax(p, i) - # then update gradients and hessians - for k in range(prediction_dim): - p_i_k = p[i, k] - gradients[k, i] = p_i_k - (y_true[i] == k) - hessians[k, i] = p_i_k * (1. - p_i_k) - - -cdef inline void compute_softmax(Y_DTYPE_C [:, ::1] p, const int i) nogil: - """Compute softmaxes of values in p[i, :].""" - # i needs to be passed (and stays constant) because otherwise Cython does - # not generate optimal code - - cdef: - Y_DTYPE_C max_value = p[i, 0] - Y_DTYPE_C sum_exps = 0. - unsigned int k - unsigned prediction_dim = p.shape[1] - - # Compute max value of array for numerical stability - for k in range(1, prediction_dim): - if max_value < p[i, k]: - max_value = p[i, k] - - for k in range(prediction_dim): - p[i, k] = exp(p[i, k] - max_value) - sum_exps += p[i, k] - - for k in range(prediction_dim): - p[i, k] /= sum_exps - - -cdef inline Y_DTYPE_C cexpit(const Y_DTYPE_C x) nogil: - """Custom expit (logistic sigmoid function)""" - return 1. / (1. + exp(-x)) - - _LOSSES = { 'least_squares': LeastSquares, 'binary_crossentropy': BinaryCrossEntropy, diff --git a/sklearn/_fast_gradient_boosting/predictor.py b/sklearn/_fast_gradient_boosting/predictor.py new file mode 100644 index 0000000000000..71d5b44796d50 --- /dev/null +++ b/sklearn/_fast_gradient_boosting/predictor.py @@ -0,0 +1,80 @@ +""" +This module contains the TreePredictor class which is used for prediction. +""" +# Author: Nicolas Hug + +import numpy as np + +from .types import X_DTYPE +from .types import Y_DTYPE +from .types import X_BINNED_DTYPE +from ._predictor import _predict_from_numeric_data +from ._predictor import _predict_from_binned_data + + +PREDICTOR_RECORD_DTYPE = np.dtype([ + ('value', Y_DTYPE), + ('count', np.uint32), + ('feature_idx', np.uint32), + ('threshold', X_DTYPE), + ('left', np.uint32), + ('right', np.uint32), + ('gain', Y_DTYPE), + ('depth', np.uint32), + ('is_leaf', np.uint8), + ('bin_threshold', X_BINNED_DTYPE), +]) + + +class TreePredictor: + """Tree class used for predictions. + + Parameters + ---------- + nodes : list of PREDICTOR_RECORD_DTYPE. + The nodes of the tree. 
+ """ + def __init__(self, nodes): + self.nodes = nodes + + def get_n_leaf_nodes(self): + """Return number of leaves.""" + return int(self.nodes['is_leaf'].sum()) + + def get_max_depth(self): + """Return maximum depth among all leaves.""" + return int(self.nodes['depth'].max()) + + def predict(self, X): + """Predict raw values for non-binned data. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_features) + The input samples. + + Returns + ------- + y : array, shape (n_samples,) + The raw predicted values. + """ + out = np.empty(X.shape[0], dtype=Y_DTYPE) + _predict_from_numeric_data(self.nodes, X, out) + return out + + def predict_binned(self, X): + """Predict raw values for binned data. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_features) + The input samples. + + Returns + ------- + y : array, shape (n_samples,) + The raw predicted values. + """ + out = np.empty(X.shape[0], dtype=Y_DTYPE) + _predict_from_binned_data(self.nodes, X, out) + return out diff --git a/sklearn/_fast_gradient_boosting/setup.py b/sklearn/_fast_gradient_boosting/setup.py index a64ea2f92b3a0..48952619c10e2 100644 --- a/sklearn/_fast_gradient_boosting/setup.py +++ b/sklearn/_fast_gradient_boosting/setup.py @@ -17,16 +17,16 @@ def configuration(parent_package="", top_path=None): sources=["splitting.pyx"], include_dirs=[numpy.get_include()]) - config.add_extension("binning", - sources=["binning.pyx"], + config.add_extension("_binning", + sources=["_binning.pyx"], include_dirs=[numpy.get_include()]) - config.add_extension("predictor", - sources=["predictor.pyx"], + config.add_extension("_predictor", + sources=["_predictor.pyx"], include_dirs=[numpy.get_include()]) - config.add_extension("loss", - sources=["loss.pyx"], + config.add_extension("_loss", + sources=["_loss.pyx"], include_dirs=[numpy.get_include()]) config.add_extension("types", diff --git a/sklearn/_fast_gradient_boosting/splitting.pyx b/sklearn/_fast_gradient_boosting/splitting.pyx index cb51d8fdbfc7e..c97bcea025b35 100644 --- a/sklearn/_fast_gradient_boosting/splitting.pyx +++ b/sklearn/_fast_gradient_boosting/splitting.pyx @@ -9,6 +9,8 @@ - Apply a split to a node, i.e. split the indices of the samples at the node into the newly created left and right childs. 
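[Editor's aside, not part of the patch: the flat `nodes` record array held by TreePredictor above is walked as in this pure-Python sketch of _predict_one_from_numeric_data. Field names come from PREDICTOR_RECORD_DTYPE; the sketch assumes the root is stored at index 0, and the real loop is compiled and parallelized over samples.]

    def predict_one(nodes, x):
        """Follow one sample down the flat node array to a leaf value."""
        node = nodes[0]                      # assumed: root stored first
        while True:
            if node['is_leaf']:
                return node['value']
            if x[node['feature_idx']] <= node['threshold']:
                node = nodes[node['left']]
            else:
                node = nodes[node['right']]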
""" +# Author: Nicolas Hug + cimport cython from cython.parallel import prange import numpy as np diff --git a/sklearn/_fast_gradient_boosting/utils.pyx b/sklearn/_fast_gradient_boosting/utils.pyx index e7aa0c1b7ba61..b4c307d41cb15 100644 --- a/sklearn/_fast_gradient_boosting/utils.pyx +++ b/sklearn/_fast_gradient_boosting/utils.pyx @@ -3,6 +3,7 @@ # cython: wraparound=False # cython: language_level=3 """This module contains utility routines.""" +# Author: Nicolas Hug from cython.parallel import prange From b7cf145a4e1762408b49ca17cc46d89d59828969 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 27 Feb 2019 04:34:57 -0500 Subject: [PATCH 126/247] better use of _in_fit attribute --- .../gradient_boosting.py | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index 66e68b1c00523..200dd977b7969 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -102,9 +102,13 @@ def fit(self, X, y): self.n_features_ = X.shape[1] # used for validation in predict() # we need this stateful variable to tell raw_predict() that it was - # called from fit(), which only passes pre-binned data to - # raw_predict() via the scorer_ attribute. predicting is faster on - # pre-binned data. + # called from fit() (this current method), and that the data it has + # received is pre-binned. + # predicting is faster on pre-binned data, so we want early stopping + # predictions to be made on pre-binned data. Unfortunately the scorer_ + # can only call predict() or predict_proba(), not raw_predict(), and + # there's no way to tell the scorer that it needs to predict binned + # data. self._in_fit = True # bin the data @@ -276,7 +280,7 @@ def fit(self, X, y): self.train_score_ = np.asarray(self.train_score_) self.validation_score_ = np.asarray(self.validation_score_) - self._in_fit = False + del self._in_fit # hard delete so we're sure it can't be used anymore return self def _check_early_stopping(self, X_binned_train, y_train, @@ -316,8 +320,8 @@ def _should_stop(self, scores): for score in recent_scores] return not any(recent_improvements) - def _get_scores(self, X, y): - """Compute scores on data X with target y. + def _get_scores(self, X_binned, y): + """Compute scores on data X_binned with target y. Scores are computed with a scorer if scoring parameter is not 'loss', else with the loss. 
As higher is always better, we return @@ -325,10 +329,10 @@ def _get_scores(self, X, y): """ if self.scoring != 'loss': - return self.scorer_(self, X, y) + return self.scorer_(self, X_binned, y) # Else, use loss - raw_predictions = self._raw_predict(X) + raw_predictions = self._raw_predict(X_binned) return -self.loss_(y, raw_predictions) def _print_iteration_stats(self, iteration_start_time): @@ -385,7 +389,7 @@ def _raw_predict(self, X): 'X has {} features but this estimator was trained with ' '{} features.'.format(X.shape[1], self.n_features_) ) - is_binned = self._in_fit and X.dtype == X_BINNED_DTYPE + is_binned = getattr(self, '_in_fit', False) n_samples = X.shape[0] raw_predictions = np.zeros( shape=(self._n_trees_per_iteration, n_samples), From 82f4ce1f1d8eeff1985f484c80d4c456f2b38e99 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 27 Feb 2019 04:47:10 -0500 Subject: [PATCH 127/247] changed use of estimators for predictors and iterations --- .../gradient_boosting.py | 94 +++++++++---------- .../tests/test_compare_lightgbm.py | 12 +-- .../tests/test_gradient_boosting.py | 20 ++-- sklearn/_fast_gradient_boosting/utils.pyx | 6 +- 4 files changed, 63 insertions(+), 69 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index 200dd977b7969..378db96c8588a 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -24,13 +24,13 @@ class BaseFastGradientBoosting(BaseEstimator, ABC): """Base class for fast gradient boosting estimators.""" @abstractmethod - def __init__(self, loss, learning_rate, n_estimators, max_leaf_nodes, + def __init__(self, loss, learning_rate, max_iter, max_leaf_nodes, max_depth, min_samples_leaf, l2_regularization, max_bins, scoring, validation_fraction, n_iter_no_change, tol, verbose, random_state): self.loss = loss self.learning_rate = learning_rate - self.n_estimators = n_estimators + self.max_iter = max_iter self.max_leaf_nodes = max_leaf_nodes self.max_depth = max_depth self.min_samples_leaf = min_samples_leaf @@ -58,9 +58,9 @@ def _validate_parameters(self): if self.learning_rate <= 0: raise ValueError('learning_rate={} must ' 'be strictly positive'.format(self.learning_rate)) - if self.n_estimators < 1: - raise ValueError('n_estimators={} must not be smaller ' - 'than 1.'.format(self.n_estimators)) + if self.max_iter < 1: + raise ValueError('max_iter={} must not be smaller ' + 'than 1.'.format(self.max_iter)) if self.n_iter_no_change is not None and self.n_iter_no_change < 0: raise ValueError('n_iter_no_change={} must be ' 'positive.'.format(self.n_iter_no_change)) @@ -188,9 +188,9 @@ def fit(self, X, y): prediction_dim=self._n_trees_per_iteration ) - # estimators_ is a matrix (list of lists) of TreePredictor objects + # predictors is a matrix (list of lists) of TreePredictor objects # with shape (n_iter_, n_trees_per_iteration) - self.estimators_ = estimators = [] + self._predictors = predictors = [] # scorer_ is a callable with signature (est, X, y) and calls # est.predict() or est.predict_proba() depending on its nature. 
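[Editor's aside, not part of the patch: with the renaming in this commit (n_estimators -> max_iter, n_estimators_ -> n_iter_), a typical early-stopping run looks like the hypothetical snippet below. It mirrors what test_early_stopping_regression exercises; the dataset and parameter values are made up.]

    from sklearn.datasets import make_regression
    from sklearn.ensemble import FastGradientBoostingRegressor

    X, y = make_regression(random_state=0)
    gb = FastGradientBoostingRegressor(max_iter=200, scoring='loss',
                                       n_iter_no_change=5, tol=1e-7,
                                       validation_fraction=0.1, random_state=0)
    gb.fit(X, y)
    # With early stopping enabled, n_iter_ reports how many iterations actually ran
    assert gb.n_iter_ <= 200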
@@ -206,18 +206,18 @@ def fit(self, X, y): self._check_early_stopping(X_binned_small_train, y_small_train, X_binned_val, y_val) - for iteration in range(self.n_estimators): + for iteration in range(self.max_iter): if self.verbose: iteration_start_time = time() - print("[{}/{}] ".format(iteration + 1, self.n_estimators), + print("[{}/{}] ".format(iteration + 1, self.max_iter), end='', flush=True) # Update gradients and hessians, inplace self.loss_.update_gradients_and_hessians(gradients, hessians, y_train, raw_predictions) - estimators.append([]) + predictors.append([]) # Build `n_trees_per_iteration` trees. for k in range(self._n_trees_per_iteration): @@ -236,9 +236,9 @@ def fit(self, X, y): acc_apply_split_time += grower.total_apply_split_time acc_find_split_time += grower.total_find_split_time - estimator = grower.make_predictor( + predictor = grower.make_predictor( bin_thresholds=self.bin_mapper_.bin_thresholds_) - estimators[-1].append(estimator) + predictors[-1].append(predictor) # Update raw_predictions with the predictions of the newly # created tree. @@ -263,12 +263,12 @@ def fit(self, X, y): if self.verbose: duration = time() - fit_start_time n_total_leaves = sum( - estimator.get_n_leaf_nodes() - for predictors_at_ith_iteration in self.estimators_ - for estimator in predictors_at_ith_iteration) + predictor.get_n_leaf_nodes() + for predictors_at_ith_iteration in self._predictors + for predictor in predictors_at_ith_iteration) n_predictors = sum( len(predictors_at_ith_iteration) - for predictors_at_ith_iteration in self.estimators_) + for predictors_at_ith_iteration in self._predictors) print("Fit {} trees in {:.3f} s, ({} total leaves)".format( n_predictors, duration, n_total_leaves)) print("{:<32} {:.3f}s".format('Time spent finding best splits:', @@ -340,14 +340,14 @@ def _print_iteration_stats(self, iteration_start_time): log_msg = '' predictors_of_ith_iteration = [ - predictors_list for predictors_list in self.estimators_[-1] + predictors_list for predictors_list in self._predictors[-1] if predictors_list ] n_trees = len(predictors_of_ith_iteration) - max_depth = max(estimator.get_max_depth() - for estimator in predictors_of_ith_iteration) - n_leaves = sum(estimator.get_n_leaf_nodes() - for estimator in predictors_of_ith_iteration) + max_depth = max(predictor.get_max_depth() + for predictor in predictors_of_ith_iteration) + n_leaves = sum(predictor.get_n_leaf_nodes() + for predictor in predictors_of_ith_iteration) if n_trees == 1: log_msg += ("{} tree, {} leaves, ".format(n_trees, n_leaves)) @@ -383,7 +383,7 @@ def _raw_predict(self, X): The raw predicted values. 
""" X = check_array(X, dtype=[X_DTYPE, X_BINNED_DTYPE]) - check_is_fitted(self, 'estimators_') + check_is_fitted(self, '_predictors') if X.shape[1] != self.n_features_: raise ValueError( 'X has {} features but this estimator was trained with ' @@ -396,10 +396,10 @@ def _raw_predict(self, X): dtype=self._baseline_prediction.dtype ) raw_predictions += self._baseline_prediction - for predictors_of_ith_iteration in self.estimators_: - for k, estimator in enumerate(predictors_of_ith_iteration): - predict = (estimator.predict_binned if is_binned - else estimator.predict) + for predictors_of_ith_iteration in self._predictors: + for k, predictor in enumerate(predictors_of_ith_iteration): + predict = (predictor.predict_binned if is_binned + else predictor.predict) raw_predictions[k, :] += predict(X) return raw_predictions @@ -413,9 +413,9 @@ def _encode_y(self, y=None): pass @property - def n_estimators_(self): - check_is_fitted(self, 'estimators_') - return len(self.estimators_) + def n_iter_(self): + check_is_fitted(self, '_predictors') + return len(self._predictors) class FastGradientBoostingRegressor(BaseFastGradientBoosting, RegressorMixin): @@ -439,7 +439,7 @@ class FastGradientBoostingRegressor(BaseFastGradientBoosting, RegressorMixin): The learning rate, also known as *shrinkage*. This is used as a multiplicative factor for the leaves values. Use ``1`` for no shrinkage. - n_estimators : int, optional(default=100) + max_iter : int, optional(default=100) The maximum number of iterations of the boosting process, i.e. the maximum number of trees. max_leaf_nodes : int or None, optional(default=None) @@ -489,18 +489,15 @@ class FastGradientBoostingRegressor(BaseFastGradientBoosting, RegressorMixin): Attributes ---------- - n_estimators_ : int - The number of estimators as selected by early stopping (if - n_iter_no_change is not None). Otherwise it is set to n_estimators. - estimators_ : list of lists, shape=(n_estimators, n_trees_per_iteration) - The collection of fitted sub-estimators. The number of trees per - iteration is ``n_classes`` in multiclass classification, else 1. - train_score_ : array, shape=(n_estimators + 1) + n_iter_ : int + The number of iterations as selected by early stopping (if + n_iter_no_change is not None). Otherwise it corresponds to max_iter. + train_score_ : array, shape=(max_iter + 1) The scores at each iteration on the training data. The first entry is the score of the ensemble before the first iteration. Scores are computed according to the ``scoring`` parameter. Empty if no early stopping. - validation_score_ : array, shape=(n_estimators + 1) + validation_score_ : array, shape=(max_iter + 1) The scores at each iteration on the held-out validation data. The first entry is the score of the ensemble before the first iteration. Scores are computed according to the ``scoring`` parameter. 
Empty if @@ -519,12 +516,12 @@ class FastGradientBoostingRegressor(BaseFastGradientBoosting, RegressorMixin): _VALID_LOSSES = ('least_squares',) def __init__(self, loss='least_squares', learning_rate=0.1, - n_estimators=100, max_leaf_nodes=31, max_depth=None, + max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=5, l2_regularization=0., max_bins=256, scoring=None, validation_fraction=0.1, n_iter_no_change=None, tol=1e-7, verbose=0, random_state=None): super(FastGradientBoostingRegressor, self).__init__( - loss=loss, learning_rate=learning_rate, n_estimators=n_estimators, + loss=loss, learning_rate=learning_rate, max_iter=max_iter, max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, l2_regularization=l2_regularization, max_bins=max_bins, @@ -586,7 +583,7 @@ class FastGradientBoostingClassifier(BaseFastGradientBoosting, The learning rate, also known as *shrinkage*. This is used as a multiplicative factor for the leaves values. Use ``1`` for no shrinkage. - n_estimators : int, optional(default=100) + max_iter : int, optional(default=100) The maximum number of iterations of the boosting process, i.e. the maximum number of trees for binary classification. For multiclass classification, `n_classes` trees per iteration are built. @@ -637,18 +634,15 @@ class FastGradientBoostingClassifier(BaseFastGradientBoosting, Attributes ---------- - n_estimators_ : int + n_iter_ : int The number of estimators as selected by early stopping (if - n_iter_no_change is not None). Otherwise it is set to n_estimators. - estimators_ : list of lists, shape=(n_estimators, n_trees_per_iteration) - The collection of fitted sub-estimators. The number of trees per - iteration is ``n_classes`` in multiclass classification, else 1. - train_score_ : array, shape=(n_estimators + 1) + n_iter_no_change is not None). Otherwise it corresponds to max_iter. + train_score_ : array, shape=(max_iter + 1) The scores at each iteration on the training data. The first entry is the score of the ensemble before the first iteration. Scores are computed according to the ``scoring`` parameter. Empty if no early stopping. - validation_score_ : array, shape=(n_estimators + 1) + validation_score_ : array, shape=(max_iter + 1) The scores at each iteration on the held-out validation data. The first entry is the score of the ensemble before the first iteration. Scores are computed according to the ``scoring`` parameter. 
Empty if @@ -667,13 +661,13 @@ class FastGradientBoostingClassifier(BaseFastGradientBoosting, _VALID_LOSSES = ('binary_crossentropy', 'categorical_crossentropy', 'auto') - def __init__(self, loss='auto', learning_rate=0.1, n_estimators=100, + def __init__(self, loss='auto', learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=5, l2_regularization=0., max_bins=256, scoring=None, validation_fraction=0.1, n_iter_no_change=None, tol=1e-7, verbose=0, random_state=None): super(FastGradientBoostingClassifier, self).__init__( - loss=loss, learning_rate=learning_rate, n_estimators=n_estimators, + loss=loss, learning_rate=learning_rate, max_iter=max_iter, max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, l2_regularization=l2_regularization, max_bins=max_bins, diff --git a/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py b/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py index 8faa1e2b46780..5265975936b56 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py +++ b/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py @@ -39,7 +39,7 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples, rng = np.random.RandomState(seed=seed) n_samples = n_samples - n_estimators = 1 + max_iter = 1 max_bins = 256 X, y = make_regression(n_samples=n_samples, n_features=5, @@ -53,7 +53,7 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples, X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) est_sklearn = FastGradientBoostingRegressor( - n_estimators=n_estimators, + max_iter=max_iter, max_bins=max_bins, learning_rate=1, n_iter_no_change=None, @@ -91,7 +91,7 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples, rng = np.random.RandomState(seed=seed) n_samples = n_samples - n_estimators = 1 + max_iter = 1 max_bins = 256 X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5, @@ -106,7 +106,7 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples, est_sklearn = FastGradientBoostingClassifier( loss='binary_crossentropy', - n_estimators=n_estimators, + max_iter=max_iter, max_bins=max_bins, learning_rate=1, n_iter_no_change=None, @@ -151,7 +151,7 @@ def test_same_predictions_multiclass_classification( rng = np.random.RandomState(seed=seed) n_samples = n_samples - n_estimators = 1 + max_iter = 1 max_bins = 256 lr = 1 @@ -168,7 +168,7 @@ def test_same_predictions_multiclass_classification( est_sklearn = FastGradientBoostingClassifier( loss='categorical_crossentropy', - n_estimators=n_estimators, + max_iter=max_iter, max_bins=max_bins, learning_rate=lr, n_iter_no_change=None, diff --git a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py index e6a116d78d53e..ada99d03aa973 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py @@ -30,8 +30,8 @@ def test_init_parameters_validation(GradientBoosting, X, y): with pytest.raises( ValueError, - match="n_estimators=0 must not be smaller than 1"): - GradientBoosting(n_estimators=0).fit(X, y) + match="max_iter=0 must not be smaller than 1"): + GradientBoosting(max_iter=0).fit(X, y) with pytest.raises( ValueError, @@ -91,7 +91,7 @@ def test_init_parameters_validation(GradientBoosting, X, y): def test_early_stopping_regression(scoring, validation_fraction, 
n_iter_no_change, tol): - n_estimators = 200 + max_iter = 200 X, y = make_regression(random_state=0) @@ -99,15 +99,15 @@ def test_early_stopping_regression(scoring, validation_fraction, scoring=scoring, tol=tol, validation_fraction=validation_fraction, - n_estimators=n_estimators, + max_iter=max_iter, n_iter_no_change=n_iter_no_change, random_state=0) gb.fit(X, y) if n_iter_no_change is not None: - assert n_iter_no_change <= gb.n_estimators_ < n_estimators + assert n_iter_no_change <= gb.n_iter_ < max_iter else: - assert gb.n_estimators_ == n_estimators + assert gb.n_iter_ == max_iter @pytest.mark.parametrize('data', ( @@ -127,7 +127,7 @@ def test_early_stopping_regression(scoring, validation_fraction, def test_early_stopping_classification(data, scoring, validation_fraction, n_iter_no_change, tol): - n_estimators = 50 + max_iter = 50 X, y = data @@ -136,15 +136,15 @@ def test_early_stopping_classification(data, scoring, validation_fraction, scoring=scoring, tol=tol, validation_fraction=validation_fraction, - n_estimators=n_estimators, + max_iter=max_iter, n_iter_no_change=n_iter_no_change, random_state=0) gb.fit(X, y) if n_iter_no_change is not None: - assert n_iter_no_change <= gb.n_estimators_ < n_estimators + assert n_iter_no_change <= gb.n_iter_ < max_iter else: - assert gb.n_estimators_ == n_estimators + assert gb.n_iter_ == max_iter def test_should_stop(): diff --git a/sklearn/_fast_gradient_boosting/utils.pyx b/sklearn/_fast_gradient_boosting/utils.pyx index b4c307d41cb15..0f81a42d3f44a 100644 --- a/sklearn/_fast_gradient_boosting/utils.pyx +++ b/sklearn/_fast_gradient_boosting/utils.pyx @@ -50,7 +50,7 @@ def get_equivalent_estimator(estimator, lib='lightgbm'): lgbm_params = { 'objective': lgbm_loss_mapping[sklearn_params['loss']], 'learning_rate': sklearn_params['learning_rate'], - 'n_estimators': sklearn_params['n_estimators'], + 'n_estimators': sklearn_params['max_iter'], 'num_leaves': sklearn_params['max_leaf_nodes'], 'max_depth': sklearn_params['max_depth'], 'min_child_samples': sklearn_params['min_samples_leaf'], @@ -84,7 +84,7 @@ def get_equivalent_estimator(estimator, lib='lightgbm'): 'grow_policy': 'lossguide', # so that we can set max_leaves 'objective': xgb_loss_mapping[sklearn_params['loss']], 'learning_rate': sklearn_params['learning_rate'], - 'n_estimators': sklearn_params['n_estimators'], + 'n_estimators': sklearn_params['max_iter'], 'max_leaves': sklearn_params['max_leaf_nodes'], 'max_depth': sklearn_params['max_depth'] or 0, 'lambda': sklearn_params['l2_regularization'], @@ -105,7 +105,7 @@ def get_equivalent_estimator(estimator, lib='lightgbm'): cat_params = { 'loss_function': cat_loss_mapping[sklearn_params['loss']], 'learning_rate': sklearn_params['learning_rate'], - 'iterations': sklearn_params['n_estimators'], + 'iterations': sklearn_params['max_iter'], 'depth': sklearn_params['max_depth'], 'reg_lambda': sklearn_params['l2_regularization'], 'max_bin': sklearn_params['max_bins'], From 5d53e5bccc3607ad4ffb43c042bf3b9cd1e3bf88 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 27 Feb 2019 05:05:06 -0500 Subject: [PATCH 128/247] BinMapper now private --- sklearn/_fast_gradient_boosting/binning.py | 2 +- .../gradient_boosting.py | 4 ++-- .../tests/test_binning.py | 24 +++++++++---------- .../tests/test_compare_lightgbm.py | 8 +++---- .../tests/test_grower.py | 6 ++--- .../tests/test_predictor.py | 4 ++-- sklearn/_fast_gradient_boosting/utils.pyx | 4 ++-- 7 files changed, 26 insertions(+), 26 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/binning.py 
b/sklearn/_fast_gradient_boosting/binning.py index d200bf9210208..a7738d6607161 100644 --- a/sklearn/_fast_gradient_boosting/binning.py +++ b/sklearn/_fast_gradient_boosting/binning.py @@ -57,7 +57,7 @@ def _find_binning_thresholds(data, max_bins=256, subsample=int(2e5), return binning_thresholds -class BinMapper(BaseEstimator, TransformerMixin): +class _BinMapper(BaseEstimator, TransformerMixin): """Transformer that maps a dataset into integer-valued bins. The bins are created in a feature-wise fashion, using quantiles so that diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index 378db96c8588a..394f05d8bbd16 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -15,7 +15,7 @@ from ._gradient_boosting import _update_raw_predictions from .types import Y_DTYPE, X_DTYPE, X_BINNED_DTYPE -from .binning import BinMapper +from .binning import _BinMapper from .grower import TreeGrower from .loss import _LOSSES @@ -116,7 +116,7 @@ def fit(self, X, y): print("Binning {:.3f} GB of data: ".format(X.nbytes / 1e9), end="", flush=True) tic = time() - self.bin_mapper_ = BinMapper(max_bins=self.max_bins, random_state=rng) + self.bin_mapper_ = _BinMapper(max_bins=self.max_bins, random_state=rng) X_binned = self.bin_mapper_.fit_transform(X) toc = time() if self.verbose: diff --git a/sklearn/_fast_gradient_boosting/tests/test_binning.py b/sklearn/_fast_gradient_boosting/tests/test_binning.py index 53d0feb8ab6e1..71eb5513e668b 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_binning.py +++ b/sklearn/_fast_gradient_boosting/tests/test_binning.py @@ -2,7 +2,7 @@ from numpy.testing import assert_array_equal, assert_allclose import pytest -from sklearn._fast_gradient_boosting.binning import BinMapper +from sklearn._fast_gradient_boosting.binning import _BinMapper from sklearn._fast_gradient_boosting.binning import _find_binning_thresholds from sklearn._fast_gradient_boosting.binning import _map_to_bins from sklearn._fast_gradient_boosting.types import X_DTYPE, X_BINNED_DTYPE @@ -94,7 +94,7 @@ def test_bin_mapper_random_data(n_bins): expected_count_per_bin = n_samples // n_bins tol = int(0.05 * expected_count_per_bin) - mapper = BinMapper(max_bins=n_bins, random_state=42).fit(DATA) + mapper = _BinMapper(max_bins=n_bins, random_state=42).fit(DATA) binned = mapper.transform(DATA) assert binned.shape == (n_samples, n_features) @@ -124,7 +124,7 @@ def test_bin_mapper_small_random_data(n_samples, n_bins): data = np.random.RandomState(42).normal(size=n_samples).reshape(-1, 1) assert len(np.unique(data)) == n_samples - mapper = BinMapper(max_bins=n_bins, random_state=42) + mapper = _BinMapper(max_bins=n_bins, random_state=42) binned = mapper.fit_transform(data) assert binned.shape == data.shape @@ -140,7 +140,7 @@ def test_bin_mapper_small_random_data(n_samples, n_bins): ]) def test_bin_mapper_identity_repeated_values(n_bins, n_distinct, multiplier): data = np.array(list(range(n_distinct)) * multiplier).reshape(-1, 1) - binned = BinMapper(max_bins=n_bins).fit_transform(data) + binned = _BinMapper(max_bins=n_bins).fit_transform(data) assert_array_equal(data, binned) @@ -157,12 +157,12 @@ def test_bin_mapper_repeated_values_invariance(n_distinct): data = data.reshape(-1, 1) - mapper_1 = BinMapper(max_bins=n_distinct) + mapper_1 = _BinMapper(max_bins=n_distinct) binned_1 = mapper_1.fit_transform(data) assert_array_equal(np.unique(binned_1[:, 0]), np.arange(n_distinct)) # 
Adding more bins to the mapper yields the same results (same thresholds) - mapper_2 = BinMapper(max_bins=min(256, n_distinct * 3)) + mapper_2 = _BinMapper(max_bins=min(256, n_distinct * 3)) binned_2 = mapper_2.fit_transform(data) assert_allclose(mapper_1.bin_thresholds_[0], mapper_2.bin_thresholds_[0]) @@ -176,7 +176,7 @@ def test_bin_mapper_repeated_values_invariance(n_distinct): ]) def test_bin_mapper_identity_small(n_bins, scale, offset): data = np.arange(n_bins).reshape(-1, 1) * scale + offset - binned = BinMapper(max_bins=n_bins).fit_transform(data) + binned = _BinMapper(max_bins=n_bins).fit_transform(data) assert_array_equal(binned, np.arange(n_bins).reshape(-1, 1)) @@ -192,8 +192,8 @@ def test_bin_mapper_identity_small(n_bins, scale, offset): def test_bin_mapper_idempotence(n_bins_small, n_bins_large): assert n_bins_large >= n_bins_small data = np.random.RandomState(42).normal(size=30000).reshape(-1, 1) - mapper_small = BinMapper(max_bins=n_bins_small) - mapper_large = BinMapper(max_bins=n_bins_large) + mapper_small = _BinMapper(max_bins=n_bins_small) + mapper_large = _BinMapper(max_bins=n_bins_large) binned_small = mapper_small.fit_transform(data) binned_large = mapper_large.fit_transform(binned_small) assert_array_equal(binned_small, binned_large) @@ -208,14 +208,14 @@ def test_n_bins_per_feature(max_bins, diff): n_unique_values = max_bins + diff X = list(range(n_unique_values)) * 2 X = np.array(X).reshape(-1, 1) - mapper = BinMapper(max_bins=max_bins).fit(X) + mapper = _BinMapper(max_bins=max_bins).fit(X) assert np.all(mapper.n_bins_per_feature_ == min(max_bins, n_unique_values)) def test_subsample(): # Make sure bin thresholds are different when applying subsampling - mapper_no_subsample = BinMapper(subsample=None, random_state=0).fit(DATA) - mapper_subsample = BinMapper(subsample=256, random_state=0).fit(DATA) + mapper_no_subsample = _BinMapper(subsample=None, random_state=0).fit(DATA) + mapper_subsample = _BinMapper(subsample=256, random_state=0).fit(DATA) for feature in range(DATA.shape[1]): assert not np.allclose(mapper_no_subsample.bin_thresholds_[feature], diff --git a/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py b/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py index 5265975936b56..38769b8dfd8ca 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py +++ b/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py @@ -6,7 +6,7 @@ from sklearn.ensemble import FastGradientBoostingRegressor from sklearn.ensemble import FastGradientBoostingClassifier -from sklearn._fast_gradient_boosting.binning import BinMapper +from sklearn._fast_gradient_boosting.binning import _BinMapper from sklearn._fast_gradient_boosting.utils import get_equivalent_estimator @@ -48,7 +48,7 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples, if n_samples > 255: # bin data and convert it to float32 so that the estimator doesn't # treat it as pre-binned - X = BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32) + X = _BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) @@ -100,7 +100,7 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples, if n_samples > 255: # bin data and convert it to float32 so that the estimator doesn't # treat it as pre-binned - X = BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32) + X = _BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32) X_train, 
X_test, y_train, y_test = train_test_split(X, y, random_state=rng) @@ -162,7 +162,7 @@ def test_same_predictions_multiclass_classification( if n_samples > 255: # bin data and convert it to float32 so that the estimator doesn't # treat it as pre-binned - X = BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32) + X = _BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) diff --git a/sklearn/_fast_gradient_boosting/tests/test_grower.py b/sklearn/_fast_gradient_boosting/tests/test_grower.py index f5024e3bb6594..f662056c26b6d 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_grower.py +++ b/sklearn/_fast_gradient_boosting/tests/test_grower.py @@ -4,7 +4,7 @@ from pytest import approx from sklearn._fast_gradient_boosting.grower import TreeGrower -from sklearn._fast_gradient_boosting.binning import BinMapper +from sklearn._fast_gradient_boosting.binning import _BinMapper from sklearn._fast_gradient_boosting.types import X_BINNED_DTYPE from sklearn._fast_gradient_boosting.types import Y_DTYPE from sklearn._fast_gradient_boosting.types import G_H_DTYPE @@ -206,7 +206,7 @@ def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins, if noise: y_scale = y.std() y += rng.normal(scale=noise, size=n_samples) * y_scale - mapper = BinMapper(max_bins=n_bins) + mapper = _BinMapper(max_bins=n_bins) X = mapper.fit_transform(X) all_gradients = y.astype(G_H_DTYPE) @@ -245,7 +245,7 @@ def test_min_samples_leaf_root(n_samples, min_samples_leaf): # data = linear target, 3 features, 1 irrelevant. X = rng.normal(size=(n_samples, 3)) y = X[:, 0] - X[:, 1] - mapper = BinMapper(max_bins=max_bins) + mapper = _BinMapper(max_bins=max_bins) X = mapper.fit_transform(X) all_gradients = y.astype(G_H_DTYPE) diff --git a/sklearn/_fast_gradient_boosting/tests/test_predictor.py b/sklearn/_fast_gradient_boosting/tests/test_predictor.py index e31c639c09dbe..724a238dabcfb 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_predictor.py +++ b/sklearn/_fast_gradient_boosting/tests/test_predictor.py @@ -4,7 +4,7 @@ from sklearn.metrics import r2_score import pytest -from sklearn._fast_gradient_boosting.binning import BinMapper +from sklearn._fast_gradient_boosting.binning import _BinMapper from sklearn._fast_gradient_boosting.grower import TreeGrower from sklearn._fast_gradient_boosting.types import G_H_DTYPE @@ -15,7 +15,7 @@ def test_boston_dataset(max_bins): X_train, X_test, y_train, y_test = train_test_split( boston.data, boston.target, random_state=42) - mapper = BinMapper(max_bins=max_bins, random_state=42) + mapper = _BinMapper(max_bins=max_bins, random_state=42) X_train_binned = mapper.fit_transform(X_train) # Init gradients and hessians to that of least squares loss diff --git a/sklearn/_fast_gradient_boosting/utils.pyx b/sklearn/_fast_gradient_boosting/utils.pyx index 0f81a42d3f44a..cdbf6ee032c93 100644 --- a/sklearn/_fast_gradient_boosting/utils.pyx +++ b/sklearn/_fast_gradient_boosting/utils.pyx @@ -7,7 +7,7 @@ from cython.parallel import prange -from .binning import BinMapper +from .binning import _BinMapper from .types cimport G_H_DTYPE_C from .types cimport Y_DTYPE_C from ..base import is_classifier @@ -64,7 +64,7 @@ def get_equivalent_estimator(estimator, lib='lightgbm'): 'boost_from_average': True, 'enable_bundle': False, # also makes feature order consistent 'min_data_in_bin': 1, - 'subsample_for_bin': BinMapper().subsample, + 'subsample_for_bin': _BinMapper().subsample, } if sklearn_params['loss'] == 
'categorical_crossentropy': From d79d636030a1fa9fe8a40db4b91cb439449d1c2e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 27 Feb 2019 05:21:57 -0500 Subject: [PATCH 129/247] renamed estimators from Fastblahblah to Histblahblah --- benchmarks/bench_fast_gradient_boosting.py | 8 +++---- ...bench_fast_gradient_boosting_higgsboson.py | 4 ++-- doc/modules/classes.rst | 4 ++-- doc/modules/ensemble.rst | 4 ++-- sklearn/_fast_gradient_boosting/__init__.py | 6 ++--- .../gradient_boosting.py | 24 +++++++++---------- .../tests/test_compare_lightgbm.py | 10 ++++---- .../tests/test_gradient_boosting.py | 18 +++++++------- sklearn/ensemble/__init__.py | 6 ++--- sklearn/ensemble/gradient_boosting.py | 4 ++-- 10 files changed, 44 insertions(+), 44 deletions(-) diff --git a/benchmarks/bench_fast_gradient_boosting.py b/benchmarks/bench_fast_gradient_boosting.py index 1f0898aa76365..24f3aac450955 100644 --- a/benchmarks/bench_fast_gradient_boosting.py +++ b/benchmarks/bench_fast_gradient_boosting.py @@ -3,8 +3,8 @@ import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split -from sklearn.ensemble import FastGradientBoostingClassifier -from sklearn.ensemble import FastGradientBoostingRegressor +from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.datasets import make_classification from sklearn.datasets import make_regression from sklearn._fast_gradient_boosting.utils import get_equivalent_estimator @@ -41,11 +41,11 @@ def get_estimator_and_data(): n_classes=args.n_classes, n_clusters_per_class=1, random_state=0) - return X, y, FastGradientBoostingClassifier + return X, y, HistGradientBoostingClassifier elif args.problem == 'regression': X, y = make_regression(args.n_samples_max, n_features=args.n_features, random_state=0) - return X, y, FastGradientBoostingRegressor + return X, y, HistGradientBoostingRegressor X, y, Estimator = get_estimator_and_data() diff --git a/benchmarks/bench_fast_gradient_boosting_higgsboson.py b/benchmarks/bench_fast_gradient_boosting_higgsboson.py index e37341d208078..3ddc03fd75619 100644 --- a/benchmarks/bench_fast_gradient_boosting_higgsboson.py +++ b/benchmarks/bench_fast_gradient_boosting_higgsboson.py @@ -9,7 +9,7 @@ from joblib import Memory from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, roc_auc_score -from sklearn.ensemble import FastGradientBoostingClassifier +from sklearn.ensemble import HistGradientBoostingClassifier from sklearn._fast_gradient_boosting.utils import get_equivalent_estimator @@ -67,7 +67,7 @@ def load_data(): print("Fitting a sklearn model...") tic = time() -est = FastGradientBoostingClassifier( +est = HistGradientBoostingClassifier( loss='binary_crossentropy', learning_rate=lr, n_estimators=n_trees, diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 3933c3a46dd11..39365b12bbafb 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -415,8 +415,8 @@ Samples generator ensemble.ExtraTreesRegressor ensemble.GradientBoostingClassifier ensemble.GradientBoostingRegressor - ensemble.FastGradientBoostingClassifier - ensemble.FastGradientBoostingRegressor + ensemble.HistGradientBoostingClassifier + ensemble.HistGradientBoostingRegressor ensemble.IsolationForest ensemble.RandomForestClassifier ensemble.RandomForestRegressor diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 674dad4821dc4..e92f75ddccdbb 100644 --- a/doc/modules/ensemble.rst +++ 
b/doc/modules/ensemble.rst @@ -458,8 +458,8 @@ trees. .. note:: - :class:`FastGradientBoostingClassifier` and - :class:`FastGradientBoostingRegressor` were introduced in version 0.21 and + :class:`HistGradientBoostingClassifier` and + :class:`HistGradientBoostingRegressor` were introduced in version 0.21 and are considerably faster than :class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor` when the number of samples is bigger than ``10 000``. These fast estimators first bin the input samples `X` into diff --git a/sklearn/_fast_gradient_boosting/__init__.py b/sklearn/_fast_gradient_boosting/__init__.py index 0318177174f98..46b26b56263a8 100644 --- a/sklearn/_fast_gradient_boosting/__init__.py +++ b/sklearn/_fast_gradient_boosting/__init__.py @@ -3,7 +3,7 @@ The implementation is a port from pygbm which is itself strongly inspired from LightGBM. """ -from .gradient_boosting import FastGradientBoostingClassifier -from .gradient_boosting import FastGradientBoostingRegressor +from .gradient_boosting import HistGradientBoostingClassifier +from .gradient_boosting import HistGradientBoostingRegressor -__all__ = ["FastGradientBoostingClassifier", "FastGradientBoostingRegressor"] +__all__ = ["HistGradientBoostingClassifier", "HistGradientBoostingRegressor"] diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index 394f05d8bbd16..78d7aac7951e5 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -20,8 +20,8 @@ from .loss import _LOSSES -class BaseFastGradientBoosting(BaseEstimator, ABC): - """Base class for fast gradient boosting estimators.""" +class BaseHistGradientBoosting(BaseEstimator, ABC): + """Base class for histogram-based gradient boosting estimators.""" @abstractmethod def __init__(self, loss, learning_rate, max_iter, max_leaf_nodes, @@ -418,8 +418,8 @@ def n_iter_(self): return len(self._predictors) -class FastGradientBoostingRegressor(BaseFastGradientBoosting, RegressorMixin): - """Fast Gradient Boosting Regression Tree. +class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): + """Histogram-based Gradient Boosting Regression Tree. This estimator is much faster than :class:`GradientBoostingRegressor` @@ -506,9 +506,9 @@ class FastGradientBoostingRegressor(BaseFastGradientBoosting, RegressorMixin): Examples -------- >>> from sklearn.datasets import load_boston - >>> from sklearn.ensemble import FastGradientBoostingRegressor + >>> from sklearn.ensemble import HistGradientBoostingRegressor >>> X, y = load_boston(return_X_y=True) - >>> est = FastGradientBoostingRegressor().fit(X, y) + >>> est = HistGradientBoostingRegressor().fit(X, y) >>> est.score(X, y) 0.99... 
""" @@ -520,7 +520,7 @@ def __init__(self, loss='least_squares', learning_rate=0.1, min_samples_leaf=5, l2_regularization=0., max_bins=256, scoring=None, validation_fraction=0.1, n_iter_no_change=None, tol=1e-7, verbose=0, random_state=None): - super(FastGradientBoostingRegressor, self).__init__( + super(HistGradientBoostingRegressor, self).__init__( loss=loss, learning_rate=learning_rate, max_iter=max_iter, max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, @@ -556,9 +556,9 @@ def _get_loss(self): return _LOSSES[self.loss]() -class FastGradientBoostingClassifier(BaseFastGradientBoosting, +class HistGradientBoostingClassifier(BaseHistGradientBoosting, ClassifierMixin): - """Fast Gradient Boosting Classification Tree. + """Histogram-based Gradient Boosting Classification Tree. This estimator is much faster than :class:`GradientBoostingClassifier` @@ -651,9 +651,9 @@ class FastGradientBoostingClassifier(BaseFastGradientBoosting, Examples -------- >>> from sklearn.datasets import load_iris - >>> from sklearn.ensemble import FastGradientBoostingClassifier + >>> from sklearn.ensemble import HistGradientBoostingClassifier >>> X, y = load_iris(return_X_y=True) - >>> clf = FastGradientBoostingClassifier().fit(X, y) + >>> clf = HistGradientBoostingClassifier().fit(X, y) >>> clf.score(X, y) 1.0 """ @@ -666,7 +666,7 @@ def __init__(self, loss='auto', learning_rate=0.1, max_iter=100, l2_regularization=0., max_bins=256, scoring=None, validation_fraction=0.1, n_iter_no_change=None, tol=1e-7, verbose=0, random_state=None): - super(FastGradientBoostingClassifier, self).__init__( + super(HistGradientBoostingClassifier, self).__init__( loss=loss, learning_rate=learning_rate, max_iter=max_iter, max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, diff --git a/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py b/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py index 38769b8dfd8ca..5ebabb473def0 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py +++ b/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py @@ -4,8 +4,8 @@ import numpy as np import pytest -from sklearn.ensemble import FastGradientBoostingRegressor -from sklearn.ensemble import FastGradientBoostingClassifier +from sklearn.ensemble import HistGradientBoostingRegressor +from sklearn.ensemble import HistGradientBoostingClassifier from sklearn._fast_gradient_boosting.binning import _BinMapper from sklearn._fast_gradient_boosting.utils import get_equivalent_estimator @@ -52,7 +52,7 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples, X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) - est_sklearn = FastGradientBoostingRegressor( + est_sklearn = HistGradientBoostingRegressor( max_iter=max_iter, max_bins=max_bins, learning_rate=1, @@ -104,7 +104,7 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples, X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) - est_sklearn = FastGradientBoostingClassifier( + est_sklearn = HistGradientBoostingClassifier( loss='binary_crossentropy', max_iter=max_iter, max_bins=max_bins, @@ -166,7 +166,7 @@ def test_same_predictions_multiclass_classification( X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) - est_sklearn = FastGradientBoostingClassifier( + est_sklearn = HistGradientBoostingClassifier( loss='categorical_crossentropy', max_iter=max_iter, max_bins=max_bins, diff --git 
a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py index ada99d03aa973..5e28e54cefb54 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py @@ -2,8 +2,8 @@ from sklearn.datasets import make_classification, make_regression from sklearn.utils.estimator_checks import check_estimator -from sklearn.ensemble import FastGradientBoostingClassifier -from sklearn.ensemble import FastGradientBoostingRegressor +from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.ensemble import HistGradientBoostingRegressor X_classification, y_classification = make_classification(random_state=0) @@ -11,8 +11,8 @@ @pytest.mark.parametrize('GradientBoosting, X, y', [ - (FastGradientBoostingClassifier, X_classification, y_classification), - (FastGradientBoostingRegressor, X_regression, y_regression) + (HistGradientBoostingClassifier, X_classification, y_classification), + (HistGradientBoostingRegressor, X_regression, y_regression) ]) def test_init_parameters_validation(GradientBoosting, X, y): @@ -95,7 +95,7 @@ def test_early_stopping_regression(scoring, validation_fraction, X, y = make_regression(random_state=0) - gb = FastGradientBoostingRegressor(verbose=1, # just for coverage + gb = HistGradientBoostingRegressor(verbose=1, # just for coverage scoring=scoring, tol=tol, validation_fraction=validation_fraction, @@ -131,7 +131,7 @@ def test_early_stopping_classification(data, scoring, validation_fraction, X, y = data - gb = FastGradientBoostingClassifier( + gb = HistGradientBoostingClassifier( verbose=1, # just for coverage scoring=scoring, tol=tol, @@ -150,7 +150,7 @@ def test_early_stopping_classification(data, scoring, validation_fraction, def test_should_stop(): def should_stop(scores, n_iter_no_change, tol): - gbdt = FastGradientBoostingClassifier( + gbdt = HistGradientBoostingClassifier( n_iter_no_change=n_iter_no_change, tol=tol) return gbdt._should_stop(scores) @@ -175,8 +175,8 @@ def should_stop(scores, n_iter_no_change, tol): @pytest.mark.parametrize('Estimator', ( - FastGradientBoostingRegressor(), - FastGradientBoostingClassifier(), + HistGradientBoostingRegressor(), + HistGradientBoostingClassifier(), )) def test_estimator_checks(Estimator): # Run the check_estimator() test suite on GBRegressor and GBClassifier. diff --git a/sklearn/ensemble/__init__.py b/sklearn/ensemble/__init__.py index 282f477c76679..2a20dbc7b88c1 100644 --- a/sklearn/ensemble/__init__.py +++ b/sklearn/ensemble/__init__.py @@ -17,8 +17,8 @@ from .gradient_boosting import GradientBoostingClassifier from .gradient_boosting import GradientBoostingRegressor from .voting_classifier import VotingClassifier -from .._fast_gradient_boosting import FastGradientBoostingClassifier -from .._fast_gradient_boosting import FastGradientBoostingRegressor +from .._fast_gradient_boosting import HistGradientBoostingClassifier +from .._fast_gradient_boosting import HistGradientBoostingRegressor from . import bagging from . 
import forest @@ -35,4 +35,4 @@ "AdaBoostRegressor", "VotingClassifier", "bagging", "forest", "gradient_boosting", "partial_dependence", "weight_boosting", - "FastGradientBoostingClassifier", "FastGradientBoostingRegressor"] + "HistGradientBoostingClassifier", "HistGradientBoostingRegressor"] diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index 13635f710bce2..f227fe80a4f81 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -1918,7 +1918,7 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin): See also -------- - FastGradientBoostingClassifier, sklearn.tree.DecisionTreeClassifier, + HistGradientBoostingClassifier, sklearn.tree.DecisionTreeClassifier, RandomForestClassifier AdaBoostClassifier References @@ -2372,7 +2372,7 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin): See also -------- - FastGradientBoostingRegressor, sklearn.tree.DecisionTreeRegressor, + HistGradientBoostingRegressor, sklearn.tree.DecisionTreeRegressor, RandomForestRegressor References From 0204a5d719d4fbc4473648a1588bf71c24f25bd1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 27 Feb 2019 07:02:49 -0500 Subject: [PATCH 130/247] Created experimental module --- ...ing.py => bench_hist_gradient_boosting.py} | 6 +++--- ...ench_hist_gradient_boosting_higgsboson.py} | 4 ++-- doc/modules/classes.rst | 20 +++++++++++++++++-- doc/modules/ensemble.rst | 19 +++++++++--------- .../gradient_boosting.py | 4 ++-- .../tests/test_compare_lightgbm.py | 4 ++-- .../tests/test_gradient_boosting.py | 4 ++-- sklearn/ensemble/__init__.py | 5 +---- sklearn/ensemble/gradient_boosting.py | 9 +++++---- 9 files changed, 45 insertions(+), 30 deletions(-) rename benchmarks/{bench_fast_gradient_boosting.py => bench_hist_gradient_boosting.py} (98%) rename benchmarks/{bench_fast_gradient_boosting_higgsboson.py => bench_hist_gradient_boosting_higgsboson.py} (97%) diff --git a/benchmarks/bench_fast_gradient_boosting.py b/benchmarks/bench_hist_gradient_boosting.py similarity index 98% rename from benchmarks/bench_fast_gradient_boosting.py rename to benchmarks/bench_hist_gradient_boosting.py index 24f3aac450955..eb3024ec24713 100644 --- a/benchmarks/bench_fast_gradient_boosting.py +++ b/benchmarks/bench_hist_gradient_boosting.py @@ -3,8 +3,8 @@ import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split -from sklearn.ensemble import HistGradientBoostingClassifier -from sklearn.ensemble import HistGradientBoostingRegressor +from sklearn.experimental import HistGradientBoostingClassifier +from sklearn.experimental import HistGradientBoostingRegressor from sklearn.datasets import make_classification from sklearn.datasets import make_regression from sklearn._fast_gradient_boosting.utils import get_equivalent_estimator @@ -61,7 +61,7 @@ def one_run(n_samples): print("Fitting a sklearn model...") tic = time() est = Estimator(learning_rate=lr, - n_estimators=n_trees, + max_iter=n_trees, max_bins=max_bins, max_leaf_nodes=n_leaf_nodes, n_iter_no_change=None, diff --git a/benchmarks/bench_fast_gradient_boosting_higgsboson.py b/benchmarks/bench_hist_gradient_boosting_higgsboson.py similarity index 97% rename from benchmarks/bench_fast_gradient_boosting_higgsboson.py rename to benchmarks/bench_hist_gradient_boosting_higgsboson.py index 3ddc03fd75619..90ca122d68dbc 100644 --- a/benchmarks/bench_fast_gradient_boosting_higgsboson.py +++ b/benchmarks/bench_hist_gradient_boosting_higgsboson.py @@ -9,7 +9,7 @@ 
from joblib import Memory from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, roc_auc_score -from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.experimental import HistGradientBoostingClassifier from sklearn._fast_gradient_boosting.utils import get_equivalent_estimator @@ -70,7 +70,7 @@ def load_data(): est = HistGradientBoostingClassifier( loss='binary_crossentropy', learning_rate=lr, - n_estimators=n_trees, + max_iter=n_trees, max_bins=max_bins, max_leaf_nodes=n_leaf_nodes, n_iter_no_change=None, diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 39365b12bbafb..0632fba8a97c9 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -415,8 +415,6 @@ Samples generator ensemble.ExtraTreesRegressor ensemble.GradientBoostingClassifier ensemble.GradientBoostingRegressor - ensemble.HistGradientBoostingClassifier - ensemble.HistGradientBoostingRegressor ensemble.IsolationForest ensemble.RandomForestClassifier ensemble.RandomForestRegressor @@ -1489,6 +1487,24 @@ Utilities from joblib: utils.parallel_backend utils.register_parallel_backend +.. _experimental_ref: + +Experimental +============ + +.. automodule:: sklearn.experimental + :no-members: + :no-inherited-members: + +.. currentmodule:: sklearn + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + experimental.HistGradientBoostingRegressor + experimental.HistGradientBoostingClassifier + Recently deprecated =================== diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index e92f75ddccdbb..ef0ed6be2daba 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -458,15 +458,16 @@ trees. .. note:: - :class:`HistGradientBoostingClassifier` and - :class:`HistGradientBoostingRegressor` were introduced in version 0.21 and - are considerably faster than :class:`GradientBoostingClassifier` and - :class:`GradientBoostingRegressor` when the number of samples is bigger than - ``10 000``. These fast estimators first bin the input samples `X` into - integer-valued bins (typically 256 bins) which tremendously reduces the - number of splitting points to consider, and allow the algorithm to leverage - integer-based data structures. The API of these new estimators is - slightly different, and some features are not yet supported. + :class:`sklearn.experimental.HistGradientBoostingClassifier` and + :class:`sklearn.experimental.HistGradientBoostingRegressor` were introduced + in version 0.21 and are considerably faster than + :class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor` + when the number of samples is bigger than ``10 000``. These fast estimators + first bin the input samples `X` into integer-valued bins (typically 256 bins) + which tremendously reduces the number of splitting points to consider, and + allow the algorithm to leverage integer-based data structures. The API of + these new estimators is slightly different, and some features are not yet + supported. 
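As a rough usage sketch of what the note above describes (assuming the ``sklearn.experimental`` entry point introduced by this patch; the exact API of these experimental estimators may still change), fitting one of the histogram-based estimators looks like:

    from sklearn.datasets import make_classification
    from sklearn.experimental import HistGradientBoostingClassifier

    # Toy data; the speed-up over GradientBoostingClassifier only shows for
    # sample sizes in the tens of thousands or more.
    X, y = make_classification(n_samples=20000, random_state=0)

    # Note the API difference: max_iter instead of n_estimators.
    clf = HistGradientBoostingClassifier(max_iter=100, max_bins=256)
    clf.fit(X, y)
    print(clf.score(X, y))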
The following doc focuses on :class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor` only, which might be prefered for small diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index 78d7aac7951e5..3adb0507b496b 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -506,7 +506,7 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): Examples -------- >>> from sklearn.datasets import load_boston - >>> from sklearn.ensemble import HistGradientBoostingRegressor + >>> from sklearn.experimental import HistGradientBoostingRegressor >>> X, y = load_boston(return_X_y=True) >>> est = HistGradientBoostingRegressor().fit(X, y) >>> est.score(X, y) @@ -651,7 +651,7 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, Examples -------- >>> from sklearn.datasets import load_iris - >>> from sklearn.ensemble import HistGradientBoostingClassifier + >>> from sklearn.experimental import HistGradientBoostingClassifier >>> X, y = load_iris(return_X_y=True) >>> clf = HistGradientBoostingClassifier().fit(X, y) >>> clf.score(X, y) diff --git a/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py b/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py index 5ebabb473def0..23b395450a0df 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py +++ b/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py @@ -4,8 +4,8 @@ import numpy as np import pytest -from sklearn.ensemble import HistGradientBoostingRegressor -from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.experimental import HistGradientBoostingRegressor +from sklearn.experimental import HistGradientBoostingClassifier from sklearn._fast_gradient_boosting.binning import _BinMapper from sklearn._fast_gradient_boosting.utils import get_equivalent_estimator diff --git a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py index 5e28e54cefb54..e47aee7abb62f 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py @@ -2,8 +2,8 @@ from sklearn.datasets import make_classification, make_regression from sklearn.utils.estimator_checks import check_estimator -from sklearn.ensemble import HistGradientBoostingClassifier -from sklearn.ensemble import HistGradientBoostingRegressor +from sklearn.experimental import HistGradientBoostingClassifier +from sklearn.experimental import HistGradientBoostingRegressor X_classification, y_classification = make_classification(random_state=0) diff --git a/sklearn/ensemble/__init__.py b/sklearn/ensemble/__init__.py index 2a20dbc7b88c1..5586a9e1e1fba 100644 --- a/sklearn/ensemble/__init__.py +++ b/sklearn/ensemble/__init__.py @@ -17,8 +17,6 @@ from .gradient_boosting import GradientBoostingClassifier from .gradient_boosting import GradientBoostingRegressor from .voting_classifier import VotingClassifier -from .._fast_gradient_boosting import HistGradientBoostingClassifier -from .._fast_gradient_boosting import HistGradientBoostingRegressor from . import bagging from . 
import forest @@ -34,5 +32,4 @@ "GradientBoostingRegressor", "AdaBoostClassifier", "AdaBoostRegressor", "VotingClassifier", "bagging", "forest", "gradient_boosting", - "partial_dependence", "weight_boosting", - "HistGradientBoostingClassifier", "HistGradientBoostingRegressor"] + "partial_dependence", "weight_boosting"] diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index f227fe80a4f81..22dce632bafa5 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -1918,8 +1918,9 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin): See also -------- - HistGradientBoostingClassifier, sklearn.tree.DecisionTreeClassifier, - RandomForestClassifier AdaBoostClassifier + sklearn.experimental.HistGradientBoostingClassifier, + sklearn.tree.DecisionTreeClassifier, RandomForestClassifier + AdaBoostClassifier References ---------- @@ -2372,8 +2373,8 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin): See also -------- - HistGradientBoostingRegressor, sklearn.tree.DecisionTreeRegressor, - RandomForestRegressor + sklearn.experimental.HistGradientBoostingRegressor, + sklearn.tree.DecisionTreeRegressor, RandomForestRegressor References ---------- From 8045eb908a7d97389dd70de20640a331e92ea61a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 27 Feb 2019 07:16:26 -0500 Subject: [PATCH 131/247] add subpackage --- sklearn/setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/setup.py b/sklearn/setup.py index 247a62e9662a7..860a8da096dba 100644 --- a/sklearn/setup.py +++ b/sklearn/setup.py @@ -59,6 +59,7 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('svm') config.add_subpackage('_fast_gradient_boosting') config.add_subpackage('linear_model') + config.add_subpackage('experimental') # add cython extension module for isotonic regression config.add_extension('_isotonic', From b3d32bafaf681ffa5ca2a47a2596eb89555c3eaa Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 27 Feb 2019 08:04:29 -0500 Subject: [PATCH 132/247] hmmm --- sklearn/experimental/__init__.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 sklearn/experimental/__init__.py diff --git a/sklearn/experimental/__init__.py b/sklearn/experimental/__init__.py new file mode 100644 index 0000000000000..2f4438ba273d2 --- /dev/null +++ b/sklearn/experimental/__init__.py @@ -0,0 +1,10 @@ +""" +The :mod:`sklearn.experimetal` module includes estimator and tools whose API +and behaviour might change without a deprecation cycle. 
+""" + +from .._fast_gradient_boosting import HistGradientBoostingClassifier +from .._fast_gradient_boosting import HistGradientBoostingRegressor + +__all__ = ['HistGradientBoostingRegressor', 'HistGradientBoostingClassifier'] + From de051a9391b6f2b410cb257a809f0c6d7e6df1d7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 27 Feb 2019 08:30:40 -0500 Subject: [PATCH 133/247] added experimental in sklearn.__init__.__all__ --- sklearn/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/__init__.py b/sklearn/__init__.py index aafc8a34b2a13..24f35e2f2ab14 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -75,6 +75,7 @@ 'naive_bayes', 'neighbors', 'neural_network', 'pipeline', 'preprocessing', 'random_projection', 'semi_supervised', 'svm', 'tree', 'discriminant_analysis', 'impute', 'compose', + 'experimental', # Non-modules: 'clone', 'get_config', 'set_config', 'config_context', 'show_versions'] From 431920de0a7ec8c457a1d9856e9c4bb8d6051d53 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 27 Feb 2019 08:58:44 -0500 Subject: [PATCH 134/247] added empty test folder --- sklearn/experimental/tests/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 sklearn/experimental/tests/__init__.py diff --git a/sklearn/experimental/tests/__init__.py b/sklearn/experimental/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d From 404f3ae5e6485e34e5de2a2059f713822d3cab71 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 27 Feb 2019 11:08:04 -0500 Subject: [PATCH 135/247] test --- sklearn/experimental/__init__.py | 1 - sklearn/setup.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/experimental/__init__.py b/sklearn/experimental/__init__.py index 2f4438ba273d2..c0465f98d06e5 100644 --- a/sklearn/experimental/__init__.py +++ b/sklearn/experimental/__init__.py @@ -7,4 +7,3 @@ from .._fast_gradient_boosting import HistGradientBoostingRegressor __all__ = ['HistGradientBoostingRegressor', 'HistGradientBoostingClassifier'] - diff --git a/sklearn/setup.py b/sklearn/setup.py index 860a8da096dba..960f6bc0c1da9 100644 --- a/sklearn/setup.py +++ b/sklearn/setup.py @@ -43,6 +43,8 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('preprocessing/tests') config.add_subpackage('semi_supervised') config.add_subpackage('semi_supervised/tests') + config.add_subpackage('experimental') + config.add_subpackage('experimental/tests') # submodules which have their own setup.py config.add_subpackage('cluster') @@ -59,7 +61,6 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('svm') config.add_subpackage('_fast_gradient_boosting') config.add_subpackage('linear_model') - config.add_subpackage('experimental') # add cython extension module for isotonic regression config.add_extension('_isotonic', From fb8603049e4d412bc7e4f7b023f1c0209bc6d5c4 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 28 Feb 2019 12:12:10 -0500 Subject: [PATCH 136/247] Biggish refactoring of splitting: - histogram computation is decoupled from finding the best split - avoided redundant computations of the gradients / hessians sums - dispatching logic for histogram computation (brute or histogram subtraction trick is now more straightforward) --- sklearn/_fast_gradient_boosting/grower.py | 129 +++---- sklearn/_fast_gradient_boosting/splitting.pyx | 345 +++++++----------- .../tests/test_splitting.py | 145 ++------ 3 files changed, 234 insertions(+), 385 deletions(-) diff --git 
a/sklearn/_fast_gradient_boosting/grower.py b/sklearn/_fast_gradient_boosting/grower.py index 9e97fcfd46fff..93277f76039b3 100644 --- a/sklearn/_fast_gradient_boosting/grower.py +++ b/sklearn/_fast_gradient_boosting/grower.py @@ -278,67 +278,18 @@ def _intilialize_root(self): self._finalize_leaf(self.root) return - self._compute_spittability(self.root) + # self._compute_spittability(self.root) + self._compute_histograms_brute(self.root) + self._compute_best_split_and_push(self.root) - def _compute_spittability(self, node, only_hist=False): - """Compute histograms and best possible split of a node. + def _compute_best_split_and_push(self, node): + """Compute the best possible split (SplitInfo) of a given node. - If the best possible gain is 0 or if the constraints aren't met - (min_samples_leaf, min_hessian_to_split, min_gain_to_split) then the - node is finalized (transformed into a leaf), else it is pushed on - the splittable node heap. + Also push it in the heap of splittable nodes if gain isn't zero.""" - Parameters - ---------- - node : TreeNode - The node to evaluate. - only_hist : bool, optional (default=False) - Whether to only compute the histograms and the SplitInfo. It is - set to ``True`` when ``_compute_spittability`` was called by a - sibling node: we only want to compute the histograms (which also - computes the ``SplitInfo``), not finalize or push the node. If - ``_compute_spittability`` is called again by the grower on this - same node, the histograms won't be computed again. - """ - # Compute split_info and histograms if not already done - if node.split_info is None and node.histograms is None: - # If the sibling has less samples, compute its hist first (with - # the regular method) and use the subtraction method for the - # current node - if node.sibling is not None: # root has no sibling - if node.sibling.n_samples < node.n_samples: - self._compute_spittability(node.sibling, only_hist=True) - # As hist of sibling is now computed we'll use the hist - # subtraction method for the current node. - node.hist_subtraction = True - - tic = time() - histograms = np.zeros(shape=(self.n_features, self.max_bins), - dtype=HISTOGRAM_DTYPE) - if node.hist_subtraction: - if node is node.parent.right_child: - sum_gradients = node.parent.split_info.sum_gradient_right - sum_hessians = node.parent.split_info.sum_hessian_right - else: - sum_gradients = node.parent.split_info.sum_gradient_left - sum_hessians = node.parent.split_info.sum_hessian_left - split_info = self.splitter.find_node_split_subtraction( - node.sample_indices, - sum_gradients, sum_hessians, node.parent.histograms, - node.sibling.histograms, histograms) - else: - split_info = self.splitter.find_node_split( - node.sample_indices, histograms) - toc = time() - node.find_split_time = toc - tic - self.total_find_split_time += node.find_split_time - node.split_info = split_info - node.histograms = histograms - - if only_hist: - # _compute_spittability was called by a sibling. We only needed to - # compute the histogram. 
- return + node.split_info = self.splitter.find_node_split( + node.sample_indices, node.histograms, node.sum_gradients, + node.sum_hessians) if node.split_info.gain <= 0: # no valid split # Note: this condition is reached if either all the leaves are @@ -346,7 +297,6 @@ def _compute_spittability(self, node, only_hist=False): # constraints, (min_hessians_to_split, min_gain_to_split, # min_samples_leaf) self._finalize_leaf(node) - else: heappush(self.splittable_nodes, node) @@ -416,16 +366,67 @@ def split_next(self): if left_child_node.n_samples < self.min_samples_leaf * 2: self._finalize_leaf(left_child_node) - else: - self._compute_spittability(left_child_node) - if right_child_node.n_samples < self.min_samples_leaf * 2: self._finalize_leaf(right_child_node) - else: - self._compute_spittability(right_child_node) + + # Compute histograms of childs, and compute their best possible split + # (if needed) + should_split_left = left_child_node.value is None # node isn't a leaf + should_split_right = right_child_node.value is None + if should_split_left or should_split_right: + + # We will compute the histograms of both nodes even if one of them + # is a leaf, since computing the second histogram is very cheap + # (using histogram subtraction). + n_samples_left = left_child_node.sample_indices.shape[0] + n_samples_right = right_child_node.sample_indices.shape[0] + if n_samples_left < n_samples_right: + smallest_child = left_child_node + largest_child = right_child_node + else: + smallest_child = right_child_node + largest_child = left_child_node + + self._compute_histograms_brute(smallest_child) + self._compute_histograms_subtraction(largest_child) + + if should_split_left: + self._compute_best_split_and_push(left_child_node) + if should_split_right: + self._compute_best_split_and_push(right_child_node) return left_child_node, right_child_node + def _compute_histograms_brute(self, node): + """Compute the histograms of the node by scanning through all the data. + + For a given feature, the complexity is O(n_samples) + """ + node.histograms = np.zeros(shape=(self.n_features, self.max_bins), + dtype=HISTOGRAM_DTYPE) + self.splitter.compute_histograms_brute(node.sample_indices, + node.histograms) + + def _compute_histograms_subtraction(self, node): + """Compute the histograms of the node using the subtraction trick. + + hist(parent) = hist(left_child) + hist(right_child) + + For a given feature, the complexity is O(n_bins). This is much more + efficient than compute_histograms_brute, but it's only possible for one + of the siblings. + """ + node.histograms = np.zeros(shape=(self.n_features, self.max_bins), + dtype=HISTOGRAM_DTYPE) + + if node.parent.left_child is node: + sibling = node.parent.right_child + else: + sibling = node.parent.left_child + self.splitter.compute_histograms_subtraction(node.parent.histograms, + sibling.histograms, + node.histograms) + def can_split_further(self): """Return True if there are still nodes to split.""" return len(self.splittable_nodes) >= 1 diff --git a/sklearn/_fast_gradient_boosting/splitting.pyx b/sklearn/_fast_gradient_boosting/splitting.pyx index c97bcea025b35..752635dd4cba6 100644 --- a/sklearn/_fast_gradient_boosting/splitting.pyx +++ b/sklearn/_fast_gradient_boosting/splitting.pyx @@ -38,10 +38,6 @@ from .types import HISTOGRAM_DTYPE # related to the GIL release and the custom histogram dtype) when using 1d # histogram arrays. 
-# epsilon for comparing gains to avoid floating precision issues that might be -# caused by the (slightly non-deterministic) parallel sums over gradients and -# hessians -cdef Y_DTYPE_C EPS = 1e-13 cdef struct split_info_struct: # Same as the SplitInfo class, but we need a C struct to use it in the @@ -203,7 +199,7 @@ cdef class Splitter: self.left_indices_buffer = np.empty_like(self.partition) self.right_indices_buffer = np.empty_like(self.partition) - def split_indices(self, SplitInfo split_info, unsigned int [::1] + def split_indices(Splitter self, SplitInfo split_info, unsigned int [::1] sample_indices): """Split samples into left and right arrays. @@ -359,14 +355,15 @@ cdef class Splitter: sample_indices[right_child_position:], right_child_position) - def find_node_split(self, - const unsigned int [::1] sample_indices, # IN - hist_struct [:, ::1] histograms): # OUT + def find_node_split( + Splitter self, + const unsigned int [::1] sample_indices, # IN + hist_struct [:, ::1] histograms, # IN + const Y_DTYPE_C sum_gradients, + const Y_DTYPE_C sum_hessians): """For each feature, find the best bin to split on at a given node. - Return the best split info among all features, and the histograms of - all the features. The histograms are computed by scanning the whole - data. + Return the best split info among all features. Parameters ---------- @@ -374,7 +371,11 @@ cdef class Splitter: The indices of the samples at the node to split. histograms : array of HISTOGRAM_DTYPE of \ shape(n_features, max_bins) - The histograms of the current node (to be computed) + The histograms of the current node. + sum_gradients : float + The sum of the gradients for each sample at the node + sum_hessians : float + The sum of the hessians for each sample at the node Returns ------- @@ -386,184 +387,17 @@ cdef class Splitter: int feature_idx int best_feature_idx int n_features = self.n_features - int i - unsigned int thread_idx - unsigned int [:] starts - unsigned int [:] ends - unsigned int n_threads split_info_struct split_info split_info_struct * split_infos - Y_DTYPE_C sum_gradients = 0. - Y_DTYPE_C sum_hessians = 0. - # need local views to avoid python interactions - G_H_DTYPE_C [::1] ordered_gradients = self.ordered_gradients - G_H_DTYPE_C [::1] gradients = self.gradients - G_H_DTYPE_C [::1] ordered_hessians = self.ordered_hessians - G_H_DTYPE_C [::1] hessians = self.hessians with nogil: n_samples = sample_indices.shape[0] - # Populate ordered_gradients and ordered_hessians. (Already done - # for root) Ordering the gradients and hessians helps to improve - # cache hit. 
- if sample_indices.shape[0] != gradients.shape[0]: - if self.hessians_are_constant: - for i in prange(n_samples, schedule='static'): - ordered_gradients[i] = gradients[sample_indices[i]] - else: - for i in prange(n_samples, schedule='static'): - ordered_gradients[i] = gradients[sample_indices[i]] - ordered_hessians[i] = hessians[sample_indices[i]] - - # Compute sums of gradients and hessians at the node - for i in prange(n_samples, schedule='static'): - sum_gradients += ordered_gradients[i] - if self.hessians_are_constant: - sum_hessians = n_samples - else: - # Using prange seems to be OK here - for i in prange(n_samples, schedule='static'): - sum_hessians += ordered_hessians[i] - split_infos = malloc( self.n_features * sizeof(split_info_struct)) - for feature_idx in prange(n_features): - # Compute histogram of each feature - self._compute_histogram(feature_idx, sample_indices, - histograms) - - # and get the best possible split for the feature among all - # bins - split_info = self._find_best_bin_to_split_helper( - feature_idx, histograms, n_samples, - sum_gradients, sum_hessians) - split_infos[feature_idx] = split_info - - # then compute best possible split among all feature - best_feature_idx = self._find_best_feature_to_split_helper( - split_infos) - split_info = split_infos[best_feature_idx] - - out = SplitInfo( - split_info.gain, - split_info.feature_idx, - split_info.bin_idx, - split_info.sum_gradient_left, - split_info.sum_hessian_left, - split_info.sum_gradient_right, - split_info.sum_hessian_right, - split_info.n_samples_left, - split_info.n_samples_right, - ) - free(split_infos) - return out - - cdef void _compute_histogram(self, - const int feature_idx, - const unsigned int [::1] sample_indices, # IN - hist_struct [:, ::1] histograms # OUT - ) nogil: - """Compute the histogram for a given feature.""" - - cdef: - unsigned int n_samples = sample_indices.shape[0] - const X_BINNED_DTYPE_C [::1] X_binned = \ - self.X_binned[:, feature_idx] - unsigned int root_node = X_binned.shape[0] == n_samples - G_H_DTYPE_C [::1] ordered_gradients = \ - self.ordered_gradients[:n_samples] - G_H_DTYPE_C [::1] ordered_hessians = \ - self.ordered_hessians[:n_samples] - - if root_node: - if self.hessians_are_constant: - _build_histogram_root_no_hessian(feature_idx, X_binned, - ordered_gradients, - histograms) - else: - _build_histogram_root(feature_idx, X_binned, - ordered_gradients, ordered_hessians, - histograms) - else: - if self.hessians_are_constant: - _build_histogram_no_hessian(feature_idx, - sample_indices, X_binned, - ordered_gradients, histograms) - else: - _build_histogram(feature_idx, sample_indices, - X_binned, ordered_gradients, - ordered_hessians, histograms) - - def find_node_split_subtraction( - Splitter self, - unsigned int [::1] sample_indices, # IN - Y_DTYPE_C sum_gradients, - Y_DTYPE_C sum_hessians, - hist_struct [:, ::1] parent_histograms, # IN - hist_struct [:, ::1] sibling_histograms, # IN - hist_struct [:, ::1] histograms): # OUT - """For each feature, find the best bin to split on at a given node. - - Return the best split info among all features, and the histograms of - all the features. - - This does the same job as ``find_node_split()`` but uses the - histograms of the parent and sibling of the node to split. This - allows to use the identity: ``histogram(parent) = histogram(node) - - histogram(sibling)``, which is significantly faster than computing - the histograms from data. 
- - Returns the best SplitInfo among all features, along with all the - feature histograms that can be later used to compute the sibling or - children histograms by substraction. - - Parameters - ---------- - sample_indices : array of int - The indices of the samples at the node to split. - sum_gradients : float - Sum of the samples gradients at the current node - sum_hessians : float - Sum of the samples hessians at the current node - parent_histograms : array of HISTOGRAM_DTYPE of \ - shape(n_features, max_bins) - The histograms of the parent - sibling_histograms : array of HISTOGRAM_DTYPE of \ - shape(n_features, max_bins) - The histograms of the sibling - histograms : array of HISTOGRAM_DTYPE of \ - shape(n_features, max_bins) - The histograms of the current node (to be computed) - - Returns - ------- - best_split_info : SplitInfo - The info about the best possible split among all features. - """ - - cdef: - int feature_idx - int n_features = self.n_features - unsigned int n_samples - split_info_struct split_info - split_info_struct * split_infos - int i - - with nogil: - n_samples = sample_indices.shape[0] - split_infos = malloc( - self.n_features * sizeof(split_info_struct)) for feature_idx in prange(n_features): - # Compute histogram of each feature - _subtract_histograms(feature_idx, - self.max_bins, - parent_histograms, - sibling_histograms, - histograms) - # and get the best possible split for the feature among all - # bins + # For each feature, find best bin to split on split_info = self._find_best_bin_to_split_helper( feature_idx, histograms, n_samples, sum_gradients, sum_hessians) @@ -597,8 +431,8 @@ cdef class Splitter: int best_feature_idx = 0 for feature_idx in range(1, self.n_features): - if (split_infos[feature_idx].gain - - split_infos[best_feature_idx].gain) > EPS: + if (split_infos[feature_idx].gain > + split_infos[best_feature_idx].gain): best_feature_idx = feature_idx return best_feature_idx @@ -664,7 +498,7 @@ cdef class Splitter: sum_gradients, sum_hessians, self.l2_regularization) - if gain - best_split.gain > EPS and gain > self.min_gain_to_split: + if gain > best_split.gain and gain > self.min_gain_to_split: best_split.gain = gain best_split.feature_idx = feature_idx best_split.bin_idx = bin_idx @@ -677,33 +511,128 @@ cdef class Splitter: return best_split - # Only used for tests (python code cannot use cdef types) - # Not sure if this is a good practice... - def _find_best_split_wrapper( - self, - int feature_idx, - unsigned int [::1] sample_indices, - hist_struct [:, ::1] histograms, - Y_DTYPE_C sum_gradients, - Y_DTYPE_C sum_hessians): + def compute_histograms_brute( + Splitter self, + const unsigned int [::1] sample_indices, # IN + hist_struct [:, ::1] histograms): # OUT + """Compute the histograms of the node by scanning through all the data. - self._compute_histogram(feature_idx, sample_indices, histograms) - n_samples = sample_indices.shape[0] - split_info = self._find_best_bin_to_split_helper( - feature_idx, histograms, n_samples, - sum_gradients, sum_hessians) + For a given feature, the complexity is O(n_samples) - return SplitInfo( - split_info.gain, - split_info.feature_idx, - split_info.bin_idx, - split_info.sum_gradient_left, - split_info.sum_hessian_left, - split_info.sum_gradient_right, - split_info.sum_hessian_right, - split_info.n_samples_left, - split_info.n_samples_right, - ) + Parameters + ---------- + sample_indices : array of int + The indices of the samples at the node to split. 
+ histograms : array of HISTOGRAM_DTYPE of \ + shape(n_features, max_bins) + The histograms of the current node (to be computed) + """ + cdef: + int n_samples + int feature_idx + int n_features = self.n_features + int i + # need local views to avoid python interactions + G_H_DTYPE_C [::1] ordered_gradients = self.ordered_gradients + G_H_DTYPE_C [::1] gradients = self.gradients + G_H_DTYPE_C [::1] ordered_hessians = self.ordered_hessians + G_H_DTYPE_C [::1] hessians = self.hessians + + with nogil: + n_samples = sample_indices.shape[0] + + # Populate ordered_gradients and ordered_hessians. (Already done + # for root) Ordering the gradients and hessians helps to improve + # cache hit. + if sample_indices.shape[0] != gradients.shape[0]: + if self.hessians_are_constant: + for i in prange(n_samples, schedule='static'): + ordered_gradients[i] = gradients[sample_indices[i]] + else: + for i in prange(n_samples, schedule='static'): + ordered_gradients[i] = gradients[sample_indices[i]] + ordered_hessians[i] = hessians[sample_indices[i]] + + for feature_idx in prange(n_features): + # Compute histogram of each feature + self._compute_histogram_single_feature( + feature_idx, sample_indices, histograms) + + cdef void _compute_histogram_single_feature( + Splitter self, + const int feature_idx, + const unsigned int [::1] sample_indices, # IN + hist_struct [:, ::1] histograms) nogil: # OUT + """Compute the histogram for a given feature.""" + + cdef: + unsigned int n_samples = sample_indices.shape[0] + const X_BINNED_DTYPE_C [::1] X_binned = \ + self.X_binned[:, feature_idx] + unsigned int root_node = X_binned.shape[0] == n_samples + G_H_DTYPE_C [::1] ordered_gradients = \ + self.ordered_gradients[:n_samples] + G_H_DTYPE_C [::1] ordered_hessians = \ + self.ordered_hessians[:n_samples] + + if root_node: + if self.hessians_are_constant: + _build_histogram_root_no_hessian(feature_idx, X_binned, + ordered_gradients, + histograms) + else: + _build_histogram_root(feature_idx, X_binned, + ordered_gradients, ordered_hessians, + histograms) + else: + if self.hessians_are_constant: + _build_histogram_no_hessian(feature_idx, + sample_indices, X_binned, + ordered_gradients, histograms) + else: + _build_histogram(feature_idx, sample_indices, + X_binned, ordered_gradients, + ordered_hessians, histograms) + + def compute_histograms_subtraction( + Splitter self, + hist_struct [:, ::1] parent_histograms, # IN + hist_struct [:, ::1] sibling_histograms, # IN + hist_struct [:, ::1] histograms): # OUT + """Compute the histograms of the node using the subtraction trick. + + hist(parent) = hist(left_child) + hist(right_child) + + For a given feature, the complexity is O(n_bins). This is much more + efficient than compute_histograms_brute, but it's only possible for one + of the siblings. + + Parameters + ---------- + sample_indices : array of int + The indices of the samples at the node to split. 
+ parent_histograms : array of HISTOGRAM_DTYPE of \ + shape(n_features, max_bins) + The histograms of the parent + sibling_histograms : array of HISTOGRAM_DTYPE of \ + shape(n_features, max_bins) + The histograms of the sibling + histograms : array of HISTOGRAM_DTYPE of \ + shape(n_features, max_bins) + The histograms of the current node (to be computed) + """ + + cdef: + int feature_idx + int n_features = self.n_features + + for feature_idx in prange(n_features, nogil=True): + # Compute histogram of each feature + _subtract_histograms(feature_idx, + self.max_bins, + parent_histograms, + sibling_histograms, + histograms) cdef inline Y_DTYPE_C _split_gain( diff --git a/sklearn/_fast_gradient_boosting/tests/test_splitting.py b/sklearn/_fast_gradient_boosting/tests/test_splitting.py index 5ea2a876e8e81..87ba71f56044c 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_splitting.py +++ b/sklearn/_fast_gradient_boosting/tests/test_splitting.py @@ -18,7 +18,7 @@ def test_histogram_split(n_bins): min_samples_leaf = 1 min_gain_to_split = 0. X_binned = np.asfortranarray( - rng.randint(0, n_bins, size=(int(1e4), 2)), dtype=X_BINNED_DTYPE) + rng.randint(0, n_bins, size=(int(1e4), 1)), dtype=X_BINNED_DTYPE) binned_feature = X_binned.T[feature_idx] sample_indices = np.arange(binned_feature.shape[0], dtype=np.uint32) ordered_hessians = np.ones_like(binned_feature, dtype=G_H_DTYPE) @@ -44,8 +44,9 @@ def test_histogram_split(n_bins): min_samples_leaf, min_gain_to_split) histograms = np.zeros(shape=(1, n_bins), dtype=HISTOGRAM_DTYPE) - split_info = splitter._find_best_split_wrapper( - feature_idx, sample_indices, histograms, sum_gradients, + splitter.compute_histograms_brute(sample_indices, histograms) + split_info = splitter.find_node_split( + sample_indices, histograms, sum_gradients, sum_hessians) assert split_info.bin_idx == true_bin @@ -57,82 +58,6 @@ def test_histogram_split(n_bins): assert split_info.n_samples_left == split_info.sum_hessian_left -@pytest.mark.parametrize('constant_hessian', [True, False]) -def test_split_vs_split_subtraction(constant_hessian): - # Make sure find_node_split and find_node_split_subtraction return the - # same results. - rng = np.random.RandomState(42) - - n_bins = 10 - n_features = 20 - n_samples = 500 - l2_regularization = 0. - min_hessian_to_split = 1e-3 - min_samples_leaf = 1 - min_gain_to_split = 0. 
- - X_binned = rng.randint(0, n_bins, size=(n_samples, n_features), - dtype=X_BINNED_DTYPE) - X_binned = np.asfortranarray(X_binned) - sample_indices = np.arange(n_samples, dtype=np.uint32) - all_gradients = rng.randn(n_samples).astype(G_H_DTYPE) - if constant_hessian: - all_hessians = np.ones(1, dtype=G_H_DTYPE) - else: - all_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE) - - n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], - dtype=np.uint32) - splitter = Splitter(X_binned, n_bins, n_bins_per_feature, all_gradients, - all_hessians, l2_regularization, min_hessian_to_split, - min_samples_leaf, min_gain_to_split) - - hists_parent = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) - hists_left = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) - hists_right = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) - hists_left_sub = np.zeros(shape=(n_features, n_bins), - dtype=HISTOGRAM_DTYPE) - hists_right_sub = np.zeros(shape=(n_features, n_bins), - dtype=HISTOGRAM_DTYPE) - - # first split parent, left and right with classical method - si_parent = splitter.find_node_split(sample_indices, hists_parent) - sample_indices_left, sample_indices_right, _ = splitter.split_indices( - si_parent, sample_indices) - si_left = splitter.find_node_split(sample_indices_left, hists_left) - si_right = splitter.find_node_split(sample_indices_right, hists_right) - - # split left with subtraction method - si_left_sub = splitter.find_node_split_subtraction( - sample_indices_left, si_parent.sum_gradient_left, - si_parent.sum_hessian_left, hists_parent, hists_right, hists_left_sub) - - # split right with subtraction method - si_right_sub = splitter.find_node_split_subtraction( - sample_indices_right, si_parent.sum_gradient_right, - si_parent.sum_hessian_right, hists_parent, hists_left, hists_right_sub) - - # make sure histograms from classical and subtraction method are the same - for hists, hists_sub in ((hists_left, hists_left_sub), - (hists_right, hists_right_sub)): - for hist, hist_sub in zip(hists, hists_sub): - for key in ('count', 'sum_hessians', 'sum_gradients'): - assert_array_almost_equal(hist[key], hist_sub[key], decimal=4) - - # make sure split_infos from classical and subtraction method are the same - for si, si_sub in ((si_left, si_left_sub), (si_right, si_right_sub)): - assert_almost_equal(si.gain, si_sub.gain, decimal=3) - assert_almost_equal(si.feature_idx, si_sub.feature_idx, decimal=3) - assert_almost_equal(si.sum_gradient_left, si_sub.sum_gradient_left, - decimal=3) - assert_almost_equal(si.sum_gradient_right, si_sub.sum_gradient_right, - decimal=3) - assert_almost_equal(si.sum_hessian_right, si_sub.sum_hessian_right, - decimal=3) - assert_almost_equal(si.sum_hessian_left, si_sub.sum_hessian_left, - decimal=3) - - @pytest.mark.parametrize('constant_hessian', [True, False]) def test_gradient_and_hessian_sanity(constant_hessian): # This test checks that the values of gradients and hessians are @@ -142,13 +67,6 @@ def test_gradient_and_hessian_sanity(constant_hessian): # - in the histograms: summing 'sum_gradients' over the bins must be # constant across all features, and those sums must be equal to the # node's gradient. Same for hessians. - # - # These checks are carried out for split_info and histograms resulting - # from both find_node_split() and find_node_split_subtraction(). 
- # - # The structure of this test is exactly the same as in - # test_split_vs_split_subtraction() but it's probably best to keep them - # separate because they're not checking the same things. rng = np.random.RandomState(42) @@ -165,10 +83,13 @@ def test_gradient_and_hessian_sanity(constant_hessian): X_binned = np.asfortranarray(X_binned) sample_indices = np.arange(n_samples, dtype=np.uint32) all_gradients = rng.randn(n_samples).astype(G_H_DTYPE) + sum_gradients = all_gradients.sum() if constant_hessian: all_hessians = np.ones(1, dtype=G_H_DTYPE) + sum_hessians = 1 * n_samples else: all_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE) + sum_hessians = all_hessians.sum() n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) @@ -181,36 +102,28 @@ def test_gradient_and_hessian_sanity(constant_hessian): hists_parent = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) hists_left = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) hists_right = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) - hists_left_sub = np.zeros(shape=(n_features, n_bins), - dtype=HISTOGRAM_DTYPE) - hists_right_sub = np.zeros(shape=(n_features, n_bins), - dtype=HISTOGRAM_DTYPE) - # first split parent, left and right with classical method - si_parent = splitter.find_node_split(sample_indices, hists_parent) + + splitter.compute_histograms_brute(sample_indices, hists_parent) + si_parent = splitter.find_node_split(sample_indices, hists_parent, + sum_gradients, sum_hessians) sample_indices_left, sample_indices_right, _ = splitter.split_indices( si_parent, sample_indices) - si_left = splitter.find_node_split(sample_indices_left, hists_left) - si_right = splitter.find_node_split(sample_indices_right, hists_right) - - # split left with subtraction method - si_left_sub = splitter.find_node_split_subtraction( - sample_indices_left, si_parent.sum_gradient_left, - si_parent.sum_hessian_left, hists_parent, hists_right, hists_left_sub) - - # split right with subtraction method - si_right_sub = splitter.find_node_split_subtraction( - sample_indices_right, si_parent.sum_gradient_right, - si_parent.sum_hessian_right, hists_parent, hists_left, hists_right_sub) + splitter.compute_histograms_brute(sample_indices_left, hists_left) + splitter.compute_histograms_brute(sample_indices_right, hists_right) + si_left = splitter.find_node_split(sample_indices_left, hists_left, + si_parent.sum_gradient_left, + si_parent.sum_hessian_left) + si_right = splitter.find_node_split(sample_indices_right, hists_right, + si_parent.sum_gradient_right, + si_parent.sum_hessian_right) # make sure that si.sum_gradient_left + si.sum_gradient_right have their # expected value, same for hessians for si, indices in ( (si_parent, sample_indices), (si_left, sample_indices_left), - (si_left_sub, sample_indices_left), - (si_right, sample_indices_right), - (si_right_sub, sample_indices_right)): + (si_right, sample_indices_right)): gradient = si.sum_gradient_right + si.sum_gradient_left expected_gradient = all_gradients[indices].sum() hessian = si.sum_hessian_right + si.sum_hessian_left @@ -227,12 +140,10 @@ def test_gradient_and_hessian_sanity(constant_hessian): for hists, indices in ( (hists_parent, sample_indices), (hists_left, sample_indices_left), - (hists_left_sub, sample_indices_left), - (hists_right, sample_indices_right), - (hists_right_sub, sample_indices_right)): + (hists_right, sample_indices_right)): # note: gradients and hessians have shape (n_features,), # we're comparing them to *scalars*. 
This has the benefit of also - # making sure that all the entries are equal. + # making sure that all the entries are equal across features. gradients = hists['sum_gradients'].sum(axis=1) # shape = (n_features,) expected_gradient = all_gradients[indices].sum() # scalar hessians = hists['sum_hessians'].sum(axis=1) @@ -273,6 +184,8 @@ def test_split_indices(): sample_indices = np.arange(n_samples, dtype=np.uint32) all_gradients = rng.randn(n_samples).astype(G_H_DTYPE) all_hessians = np.ones(1, dtype=G_H_DTYPE) + sum_gradients = all_gradients.sum() + sum_hessians = 1 * n_samples n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) @@ -285,7 +198,9 @@ def test_split_indices(): assert_array_almost_equal(sample_indices, splitter.partition) histograms = np.zeros(shape=(2, n_bins), dtype=HISTOGRAM_DTYPE) - si_root = splitter.find_node_split(sample_indices, histograms) + splitter.compute_histograms_brute(sample_indices, histograms) + si_root = splitter.find_node_split(sample_indices, histograms, + sum_gradients, sum_hessians) # sanity checks for best split assert si_root.feature_idx == 1 @@ -325,6 +240,8 @@ def test_min_gain_to_split(): sample_indices = np.arange(n_samples, dtype=np.uint32) all_hessians = np.ones_like(binned_feature, dtype=G_H_DTYPE) all_gradients = np.ones_like(binned_feature, dtype=G_H_DTYPE) + sum_gradients = all_gradients.sum() + sum_hessians = all_hessians.sum() n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) @@ -335,5 +252,7 @@ def test_min_gain_to_split(): min_samples_leaf, min_gain_to_split) histograms = np.zeros(shape=(1, n_bins), dtype=HISTOGRAM_DTYPE) - split_info = splitter.find_node_split(sample_indices, histograms) + splitter.compute_histograms_brute(sample_indices, histograms) + split_info = splitter.find_node_split(sample_indices, histograms, + sum_gradients, sum_hessians) assert split_info.gain == -1 From 69f6c4b04c225c94f3ce371f9b4716f68ed6e230 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 28 Feb 2019 12:15:13 -0500 Subject: [PATCH 137/247] typo --- doc/modules/ensemble.rst | 2 +- sklearn/_fast_gradient_boosting/gradient_boosting.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index ef0ed6be2daba..3a365a7242939 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -470,7 +470,7 @@ trees. supported. The following doc focuses on :class:`GradientBoostingClassifier` and - :class:`GradientBoostingRegressor` only, which might be prefered for small + :class:`GradientBoostingRegressor` only, which might be preferred for small sample sizes since binning may lead to split points that are too approximate in this setting. diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index 3adb0507b496b..8081cc813632f 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -428,7 +428,7 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): splitting points to consider, and allows the algorithm to leverage integer-based data structures. For small sample sizes, :class:`GradientBoostingRegressor` - might be prefered since binning may lead to split points that are too + might be preferred since binning may lead to split points that are too approximate in this setting. 
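As a rough standalone illustration of the binning argument above (NumPy only, not code from this patch): binning a continuous feature into at most 256 integer levels caps the number of candidate split thresholds the tree has to consider.

import numpy as np

rng = np.random.RandomState(0)
x = rng.randn(100000)                       # one continuous feature
print(np.unique(x).size)                    # ~100000 candidate thresholds
quantiles = np.percentile(x, np.linspace(0, 100, 257)[1:-1])
x_binned = np.searchsorted(quantiles, x).astype(np.uint8)
print(np.unique(x_binned).size)             # at most 256 candidate thresholds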
Parameters @@ -567,7 +567,7 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, splitting points to consider, and allows the algorithm to leverage integer-based data structures. For small sample sizes, :class:`GradientBoostingClassifier` - might be prefered since binning may lead to split points that are too + might be preferred since binning may lead to split points that are too approximate in this setting. Parameters From 6f5c93f5c849b3d81e0e73ae149133cabe579f91 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 28 Feb 2019 15:03:55 -0500 Subject: [PATCH 138/247] histogram are returned, not passed as OUT variables --- sklearn/_fast_gradient_boosting/grower.py | 45 +++++-------------- sklearn/_fast_gradient_boosting/splitting.pyx | 27 ++++++++--- .../tests/test_splitting.py | 22 ++++----- 3 files changed, 41 insertions(+), 53 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/grower.py b/sklearn/_fast_gradient_boosting/grower.py index 93277f76039b3..8da3653d58dca 100644 --- a/sklearn/_fast_gradient_boosting/grower.py +++ b/sklearn/_fast_gradient_boosting/grower.py @@ -13,7 +13,6 @@ from .splitting import Splitter from .predictor import TreePredictor, PREDICTOR_RECORD_DTYPE from .utils import sum_parallel -from .types import HISTOGRAM_DTYPE class TreeNode: @@ -279,7 +278,8 @@ def _intilialize_root(self): return # self._compute_spittability(self.root) - self._compute_histograms_brute(self.root) + self.root.histograms = self.splitter.compute_histograms_brute( + self.root.sample_indices) self._compute_best_split_and_push(self.root) def _compute_best_split_and_push(self, node): @@ -387,8 +387,15 @@ def split_next(self): smallest_child = right_child_node largest_child = left_child_node - self._compute_histograms_brute(smallest_child) - self._compute_histograms_subtraction(largest_child) + # We use the brute O(n_samples) method on the child that has the + # smallest number of samples, and the subtraction trick O(n_bins) + # on the other one. + smallest_child.histograms = \ + self.splitter.compute_histograms_brute( + smallest_child.sample_indices) + largest_child.histograms = \ + self.splitter.compute_histograms_subtraction( + node.histograms, smallest_child.histograms) if should_split_left: self._compute_best_split_and_push(left_child_node) @@ -397,36 +404,6 @@ def split_next(self): return left_child_node, right_child_node - def _compute_histograms_brute(self, node): - """Compute the histograms of the node by scanning through all the data. - - For a given feature, the complexity is O(n_samples) - """ - node.histograms = np.zeros(shape=(self.n_features, self.max_bins), - dtype=HISTOGRAM_DTYPE) - self.splitter.compute_histograms_brute(node.sample_indices, - node.histograms) - - def _compute_histograms_subtraction(self, node): - """Compute the histograms of the node using the subtraction trick. - - hist(parent) = hist(left_child) + hist(right_child) - - For a given feature, the complexity is O(n_bins). This is much more - efficient than compute_histograms_brute, but it's only possible for one - of the siblings. 
- """ - node.histograms = np.zeros(shape=(self.n_features, self.max_bins), - dtype=HISTOGRAM_DTYPE) - - if node.parent.left_child is node: - sibling = node.parent.right_child - else: - sibling = node.parent.left_child - self.splitter.compute_histograms_subtraction(node.parent.histograms, - sibling.histograms, - node.histograms) - def can_split_further(self): """Return True if there are still nodes to split.""" return len(self.splittable_nodes) >= 1 diff --git a/sklearn/_fast_gradient_boosting/splitting.pyx b/sklearn/_fast_gradient_boosting/splitting.pyx index 752635dd4cba6..f2978c48749da 100644 --- a/sklearn/_fast_gradient_boosting/splitting.pyx +++ b/sklearn/_fast_gradient_boosting/splitting.pyx @@ -513,8 +513,7 @@ cdef class Splitter: def compute_histograms_brute( Splitter self, - const unsigned int [::1] sample_indices, # IN - hist_struct [:, ::1] histograms): # OUT + const unsigned int [::1] sample_indices): # IN """Compute the histograms of the node by scanning through all the data. For a given feature, the complexity is O(n_samples) @@ -523,9 +522,12 @@ cdef class Splitter: ---------- sample_indices : array of int The indices of the samples at the node to split. + + Returns + ------- histograms : array of HISTOGRAM_DTYPE of \ shape(n_features, max_bins) - The histograms of the current node (to be computed) + The histograms of the current node """ cdef: int n_samples @@ -537,6 +539,10 @@ cdef class Splitter: G_H_DTYPE_C [::1] gradients = self.gradients G_H_DTYPE_C [::1] ordered_hessians = self.ordered_hessians G_H_DTYPE_C [::1] hessians = self.hessians + hist_struct [:, ::1] histograms = np.zeros( + shape=(self.n_features, self.max_bins), + dtype=HISTOGRAM_DTYPE + ) with nogil: n_samples = sample_indices.shape[0] @@ -558,6 +564,8 @@ cdef class Splitter: self._compute_histogram_single_feature( feature_idx, sample_indices, histograms) + return histograms + cdef void _compute_histogram_single_feature( Splitter self, const int feature_idx, @@ -597,8 +605,7 @@ cdef class Splitter: def compute_histograms_subtraction( Splitter self, hist_struct [:, ::1] parent_histograms, # IN - hist_struct [:, ::1] sibling_histograms, # IN - hist_struct [:, ::1] histograms): # OUT + hist_struct [:, ::1] sibling_histograms): # IN """Compute the histograms of the node using the subtraction trick. 
hist(parent) = hist(left_child) + hist(right_child) @@ -617,14 +624,21 @@ cdef class Splitter: sibling_histograms : array of HISTOGRAM_DTYPE of \ shape(n_features, max_bins) The histograms of the sibling + + Returns + ------- histograms : array of HISTOGRAM_DTYPE of \ shape(n_features, max_bins) - The histograms of the current node (to be computed) + The histograms of the current node """ cdef: int feature_idx int n_features = self.n_features + hist_struct [:, ::1] histograms = np.zeros( + shape=(self.n_features, self.max_bins), + dtype=HISTOGRAM_DTYPE + ) for feature_idx in prange(n_features, nogil=True): # Compute histogram of each feature @@ -633,6 +647,7 @@ cdef class Splitter: parent_histograms, sibling_histograms, histograms) + return histograms cdef inline Y_DTYPE_C _split_gain( diff --git a/sklearn/_fast_gradient_boosting/tests/test_splitting.py b/sklearn/_fast_gradient_boosting/tests/test_splitting.py index 87ba71f56044c..61ef115aa18a5 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_splitting.py +++ b/sklearn/_fast_gradient_boosting/tests/test_splitting.py @@ -43,8 +43,7 @@ def test_histogram_split(n_bins): min_hessian_to_split, min_samples_leaf, min_gain_to_split) - histograms = np.zeros(shape=(1, n_bins), dtype=HISTOGRAM_DTYPE) - splitter.compute_histograms_brute(sample_indices, histograms) + histograms = splitter.compute_histograms_brute(sample_indices) split_info = splitter.find_node_split( sample_indices, histograms, sum_gradients, sum_hessians) @@ -99,18 +98,14 @@ def test_gradient_and_hessian_sanity(constant_hessian): l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split) - hists_parent = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) - hists_left = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) - hists_right = np.zeros(shape=(n_features, n_bins), dtype=HISTOGRAM_DTYPE) - - splitter.compute_histograms_brute(sample_indices, hists_parent) + hists_parent = splitter.compute_histograms_brute(sample_indices) si_parent = splitter.find_node_split(sample_indices, hists_parent, sum_gradients, sum_hessians) sample_indices_left, sample_indices_right, _ = splitter.split_indices( si_parent, sample_indices) - splitter.compute_histograms_brute(sample_indices_left, hists_left) - splitter.compute_histograms_brute(sample_indices_right, hists_right) + hists_left = splitter.compute_histograms_brute(sample_indices_left) + hists_right = splitter.compute_histograms_brute(sample_indices_right) si_left = splitter.find_node_split(sample_indices_left, hists_left, si_parent.sum_gradient_left, si_parent.sum_hessian_left) @@ -137,6 +132,9 @@ def test_gradient_and_hessian_sanity(constant_hessian): # make sure sum of gradients in histograms are the same for all features, # and make sure they're equal to their expected value + hists_parent = np.asarray(hists_parent, dtype=HISTOGRAM_DTYPE) + hists_left = np.asarray(hists_left, dtype=HISTOGRAM_DTYPE) + hists_right = np.asarray(hists_right, dtype=HISTOGRAM_DTYPE) for hists, indices in ( (hists_parent, sample_indices), (hists_left, sample_indices_left), @@ -197,8 +195,7 @@ def test_split_indices(): assert_array_almost_equal(sample_indices, splitter.partition) - histograms = np.zeros(shape=(2, n_bins), dtype=HISTOGRAM_DTYPE) - splitter.compute_histograms_brute(sample_indices, histograms) + histograms = splitter.compute_histograms_brute(sample_indices) si_root = splitter.find_node_split(sample_indices, histograms, sum_gradients, sum_hessians) @@ -251,8 +248,7 @@ def test_min_gain_to_split(): 
min_hessian_to_split, min_samples_leaf, min_gain_to_split) - histograms = np.zeros(shape=(1, n_bins), dtype=HISTOGRAM_DTYPE) - splitter.compute_histograms_brute(sample_indices, histograms) + histograms = splitter.compute_histograms_brute(sample_indices) split_info = splitter.find_node_split(sample_indices, histograms, sum_gradients, sum_hessians) assert split_info.gain == -1 From 796183f27c381abff714b00fa881cae0960afc0f Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 1 Mar 2019 05:07:19 -0500 Subject: [PATCH 139/247] renaming and comments --- sklearn/_fast_gradient_boosting/_loss.pyx | 8 ++++---- sklearn/_fast_gradient_boosting/splitting.pyx | 12 ++++-------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/_loss.pyx b/sklearn/_fast_gradient_boosting/_loss.pyx index eb8ef530a610c..5f275181a8272 100644 --- a/sklearn/_fast_gradient_boosting/_loss.pyx +++ b/sklearn/_fast_gradient_boosting/_loss.pyx @@ -77,7 +77,7 @@ cdef void _update_gradients_hessians_binary_crossentropy_parallel( n_samples = raw_predictions.shape[0] for i in prange(n_samples, schedule='static', nogil=True): - p_i = cexpit(raw_predictions[i]) + p_i = _cexpit(raw_predictions[i]) gradients[i] = p_i - y_true[i] hessians[i] = p_i * (1. - p_i) @@ -102,7 +102,7 @@ cdef void _update_gradients_hessians_categorical_crossentropy_parallel( # first compute softmaxes of sample i for each class for k in range(prediction_dim): p[i, k] = raw_predictions[k, i] # prepare softmax - compute_softmax(p, i) + _compute_softmax(p, i) # then update gradients and hessians for k in range(prediction_dim): p_i_k = p[i, k] @@ -110,7 +110,7 @@ cdef void _update_gradients_hessians_categorical_crossentropy_parallel( hessians[k, i] = p_i_k * (1. - p_i_k) -cdef inline void compute_softmax(Y_DTYPE_C [:, ::1] p, const int i) nogil: +cdef inline void _compute_softmax(Y_DTYPE_C [:, ::1] p, const int i) nogil: """Compute softmaxes of values in p[i, :].""" # i needs to be passed (and stays constant) because otherwise Cython does # not generate optimal code @@ -134,6 +134,6 @@ cdef inline void compute_softmax(Y_DTYPE_C [:, ::1] p, const int i) nogil: p[i, k] /= sum_exps -cdef inline Y_DTYPE_C cexpit(const Y_DTYPE_C x) nogil: +cdef inline Y_DTYPE_C _cexpit(const Y_DTYPE_C x) nogil: """Custom expit (logistic sigmoid function)""" return 1. / (1. + exp(-x)) diff --git a/sklearn/_fast_gradient_boosting/splitting.pyx b/sklearn/_fast_gradient_boosting/splitting.pyx index f2978c48749da..fb036c35b267e 100644 --- a/sklearn/_fast_gradient_boosting/splitting.pyx +++ b/sklearn/_fast_gradient_boosting/splitting.pyx @@ -525,9 +525,8 @@ cdef class Splitter: Returns ------- - histograms : array of HISTOGRAM_DTYPE of \ - shape(n_features, max_bins) - The histograms of the current node + histograms : array of HISTOGRAM_DTYPE of shape(n_features, max_bins) + The computed histograms of the current node """ cdef: int n_samples @@ -616,8 +615,6 @@ cdef class Splitter: Parameters ---------- - sample_indices : array of int - The indices of the samples at the node to split. 
parent_histograms : array of HISTOGRAM_DTYPE of \ shape(n_features, max_bins) The histograms of the parent @@ -627,9 +624,8 @@ cdef class Splitter: Returns ------- - histograms : array of HISTOGRAM_DTYPE of \ - shape(n_features, max_bins) - The histograms of the current node + histograms : array of HISTOGRAM_DTYPE of shape(n_features, max_bins) + The computed histograms of the current node """ cdef: From f04f4d83756264963975fb7b0a1a42f08e230fa9 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 1 Mar 2019 05:19:49 -0500 Subject: [PATCH 140/247] use regular class instead of cdef class for SplitInfo --- sklearn/_fast_gradient_boosting/splitting.pyx | 31 ++++++------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/splitting.pyx b/sklearn/_fast_gradient_boosting/splitting.pyx index fb036c35b267e..414f05daa1c14 100644 --- a/sklearn/_fast_gradient_boosting/splitting.pyx +++ b/sklearn/_fast_gradient_boosting/splitting.pyx @@ -53,8 +53,7 @@ cdef struct split_info_struct: unsigned int n_samples_right -@cython.final -cdef class SplitInfo: +class SplitInfo: """Pure data class to store information about a potential split. Parameters @@ -78,22 +77,10 @@ cdef class SplitInfo: n_samples_right : int The number of samples in the right child """ - cdef public: - Y_DTYPE_C gain - int feature_idx - unsigned int bin_idx - Y_DTYPE_C sum_gradient_left - Y_DTYPE_C sum_gradient_right - Y_DTYPE_C sum_hessian_left - Y_DTYPE_C sum_hessian_right - unsigned int n_samples_left - unsigned int n_samples_right - - def __init__(self, Y_DTYPE_C gain=-1., int feature_idx=0, unsigned - int bin_idx=0, Y_DTYPE_C sum_gradient_left=0., Y_DTYPE_C - sum_hessian_left=0., Y_DTYPE_C sum_gradient_right=0., - Y_DTYPE_C sum_hessian_right=0., unsigned int - n_samples_left=0, unsigned int n_samples_right=0): + def __init__(self, gain=-1., feature_idx=0, bin_idx=0, + sum_gradient_left=0., sum_hessian_left=0., + sum_gradient_right=0., sum_hessian_right=0., + n_samples_left=0, n_samples_right=0): self.gain = gain self.feature_idx = feature_idx self.bin_idx = bin_idx @@ -199,7 +186,7 @@ cdef class Splitter: self.left_indices_buffer = np.empty_like(self.partition) self.right_indices_buffer = np.empty_like(self.partition) - def split_indices(Splitter self, SplitInfo split_info, unsigned int [::1] + def split_indices(Splitter self, split_info, unsigned int [::1] sample_indices): """Split samples into left and right arrays. 
@@ -274,8 +261,10 @@ cdef class Splitter: cdef: int n_samples = sample_indices.shape[0] + X_BINNED_DTYPE_C bin_idx = split_info.bin_idx + int feature_idx = split_info.feature_idx const X_BINNED_DTYPE_C [::1] X_binned = \ - self.X_binned[:, split_info.feature_idx] + self.X_binned[:, feature_idx] unsigned int [::1] left_indices_buffer = self.left_indices_buffer unsigned int [::1] right_indices_buffer = self.right_indices_buffer int n_threads = omp_get_max_threads() @@ -312,7 +301,7 @@ cdef class Splitter: stop = start + sizes[thread_idx] for i in range(start, stop): sample_idx = sample_indices[i] - if X_binned[sample_idx] <= split_info.bin_idx: + if X_binned[sample_idx] <= bin_idx: left_indices_buffer[start + left_count] = sample_idx left_count = left_count + 1 else: From 7fcf760798942f3ac839847534e834bd807e0d58 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 1 Mar 2019 06:23:31 -0500 Subject: [PATCH 141/247] Created HistogramBuilder class --- sklearn/_fast_gradient_boosting/grower.py | 29 ++- sklearn/_fast_gradient_boosting/histogram.pxd | 71 ------ sklearn/_fast_gradient_boosting/histogram.pyx | 230 +++++++++++++++++- sklearn/_fast_gradient_boosting/splitting.pyx | 161 +----------- .../tests/test_splitting.py | 52 ++-- 5 files changed, 275 insertions(+), 268 deletions(-) delete mode 100644 sklearn/_fast_gradient_boosting/histogram.pxd diff --git a/sklearn/_fast_gradient_boosting/grower.py b/sklearn/_fast_gradient_boosting/grower.py index 8da3653d58dca..3ba6b3a3b5031 100644 --- a/sklearn/_fast_gradient_boosting/grower.py +++ b/sklearn/_fast_gradient_boosting/grower.py @@ -11,6 +11,7 @@ from timeit import default_timer as time from .splitting import Splitter +from .histogram import HistogramBuilder from .predictor import TreePredictor, PREDICTOR_RECORD_DTYPE from .utils import sum_parallel @@ -189,10 +190,13 @@ def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, [n_bins_per_feature] * X_binned.shape[1], dtype=np.uint32) + hessians_are_constant = hessians.shape[0] == 1 + self.histogram_builder = HistogramBuilder( + X_binned, max_bins, gradients, hessians, hessians_are_constant) self.splitter = Splitter( - X_binned, max_bins, n_bins_per_feature, gradients, - hessians, l2_regularization, min_hessian_to_split, - min_samples_leaf, min_gain_to_split) + X_binned, max_bins, n_bins_per_feature, l2_regularization, + min_hessian_to_split, min_samples_leaf, min_gain_to_split, + hessians_are_constant) self.max_leaf_nodes = max_leaf_nodes self.max_bins = max_bins self.n_features = X_binned.shape[1] @@ -205,7 +209,7 @@ def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, self.finalized_leaves = [] self.total_find_split_time = 0. # time spent finding the best splits self.total_apply_split_time = 0. 
# time spent splitting nodes - self._intilialize_root() + self._intilialize_root(gradients, hessians, hessians_are_constant) self.n_nodes = 1 def _validate_parameters(self, X_binned, max_leaf_nodes, max_depth, @@ -246,16 +250,15 @@ def grow(self): while self.can_split_further(): self.split_next() - def _intilialize_root(self): + def _intilialize_root(self, gradients, hessians, hessians_are_constant): """Initialize root node and finalize it if needed.""" n_samples = self.X_binned.shape[0] depth = 0 - # sum_gradients = np.sum(self.splitter.gradients) - sum_gradients = sum_parallel(self.splitter.gradients) - if self.splitter.hessians_are_constant: - sum_hessians = self.splitter.hessians[0] * n_samples + sum_gradients = sum_parallel(gradients) + if self.histogram_builder.hessians_are_constant: + sum_hessians = hessians[0] * n_samples else: - sum_hessians = np.sum(self.splitter.hessians) + sum_hessians = sum_parallel(hessians) self.root = TreeNode( depth=depth, sample_indices=self.splitter.partition, @@ -278,7 +281,7 @@ def _intilialize_root(self): return # self._compute_spittability(self.root) - self.root.histograms = self.splitter.compute_histograms_brute( + self.root.histograms = self.histogram_builder.compute_histograms_brute( self.root.sample_indices) self._compute_best_split_and_push(self.root) @@ -391,10 +394,10 @@ def split_next(self): # smallest number of samples, and the subtraction trick O(n_bins) # on the other one. smallest_child.histograms = \ - self.splitter.compute_histograms_brute( + self.histogram_builder.compute_histograms_brute( smallest_child.sample_indices) largest_child.histograms = \ - self.splitter.compute_histograms_subtraction( + self.histogram_builder.compute_histograms_subtraction( node.histograms, smallest_child.histograms) if should_split_left: diff --git a/sklearn/_fast_gradient_boosting/histogram.pxd b/sklearn/_fast_gradient_boosting/histogram.pxd deleted file mode 100644 index 582abc88f1fd4..0000000000000 --- a/sklearn/_fast_gradient_boosting/histogram.pxd +++ /dev/null @@ -1,71 +0,0 @@ -# cython: language_level=3 -"""This module contains routines for building histograms. - -A histogram is an array with n_bins entry of type HISTOGRAM_DTYPE. Each -feature has its own histogram. A histogram contains the sum of gradients and -hessians of all the samples belonging to each bin. - -There are different ways to build a histogram: -- by subtraction: hist(child) = hist(parent) - hist(sibling) -- from scratch. In this case we have rountines that update the hessians or not - (not useful when hessians are constant for some losses e.g. least squares). - Also, there's a special case for the root which contains all the samples, - leading to some possible optimizations. Overall all the implementations look - the same, and are optimized for cache hit. 
-""" -import numpy as np -cimport numpy as np - -from .types import HISTOGRAM_DTYPE -from .types cimport X_BINNED_DTYPE_C -from .types cimport Y_DTYPE_C -from .types cimport G_H_DTYPE_C -from .types cimport hist_struct - -"""compute (hist_a - hist_b) in out""" -cpdef void _subtract_histograms( - const int feature_idx, - unsigned int n_bins, - const hist_struct [:, ::1] hist_a, # IN - const hist_struct [:, ::1] hist_b, # IN - hist_struct [:, ::1] out, # OUT - ) nogil - - -"""Return histogram for a given feature.""" -cpdef void _build_histogram( - const int feature_idx, - const unsigned int [::1] sample_indices, # IN - const X_BINNED_DTYPE_C [::1] binned_feature, # IN - const G_H_DTYPE_C [::1] ordered_gradients, # IN - const G_H_DTYPE_C [::1] ordered_hessians, # IN - hist_struct [:, ::1] out) nogil # OUT - - -"""Return histogram for a given feature, not updating hessians. -Used when the hessians of the loss are constant (typically LS loss).""" -cpdef void _build_histogram_no_hessian( - const int feature_idx, - const unsigned int [::1] sample_indices, # IN - const X_BINNED_DTYPE_C [::1] binned_feature, # IN - const G_H_DTYPE_C [::1] ordered_gradients, # IN - hist_struct [:, ::1] out) nogil # OUT - -"""Compute histogram of the root node. -Unlike other nodes, the root node has to find the split among *all* the -samples from the training set. binned_feature and all_gradients / -all_hessians already have a consistent ordering.""" -cpdef void _build_histogram_root( - const int feature_idx, - const X_BINNED_DTYPE_C [::1] binned_feature, # IN - const G_H_DTYPE_C [::1] all_gradients, # IN - const G_H_DTYPE_C [::1] all_hessians, # IN - hist_struct [:, ::1] out) nogil # OUT - -"""Compute histogram of the root node, not updating hessians. -Used when the hessians of the loss are constant (typically LS loss).""" -cpdef void _build_histogram_root_no_hessian( - const int feature_idx, - const X_BINNED_DTYPE_C [::1] binned_feature, # IN - const G_H_DTYPE_C [::1] all_gradients, # IN - hist_struct [:, ::1] out) nogil # OUT diff --git a/sklearn/_fast_gradient_boosting/histogram.pyx b/sklearn/_fast_gradient_boosting/histogram.pyx index 3768b2738f256..dc6545d04161e 100644 --- a/sklearn/_fast_gradient_boosting/histogram.pyx +++ b/sklearn/_fast_gradient_boosting/histogram.pyx @@ -2,21 +2,220 @@ # cython: boundscheck=False # cython: wraparound=False # cython: language_level=3 -"""This module contains routines for building histograms. +"""This module contains routines for building histograms.""" -A histogram is an array with n_bins entry of type HISTOGRAM_DTYPE. Each -feature has its own histogram. A histogram contains the sum of gradients and -hessians of all the samples belonging to each bin. -""" # Author: Nicolas Hug cimport cython +from cython.parallel import prange import numpy as np cimport numpy as np +from .types import HISTOGRAM_DTYPE +from .types cimport hist_struct +from .types cimport X_BINNED_DTYPE_C +from .types cimport G_H_DTYPE_C +from .types cimport hist_struct + # Note: IN views are read-only, OUT views are write-only -# See histogram.pxd for docstrings and details + + +@cython.final +cdef class HistogramBuilder: + """A Histogram builder... used to build histograms. + + A histogram is an array with n_bins entry of type HISTOGRAM_DTYPE. Each + feature has its own histogram. A histogram contains the sum of gradients + and hessians of all the samples belonging to each bin. + + There are different ways to build a histogram: + - by subtraction: hist(child) = hist(parent) - hist(sibling) + - from scratch. 
In this case we have rountines that update the hessians + or not (not useful when hessians are constant for some losses e.g. + least squares). Also, there's a special case for the root which + contains all the samples, leading to some possible optimizations. + Overall all the implementations look the same, and are optimized for + cache hit. + + Parameters + ---------- + X_binned : array of int + The binned input samples. Must be Fortran-aligned. + max_bins : int, optional(default=256) + The maximum number of bins. Used to define the shape of the + histograms. + gradients : array-like, shape=(n_samples,) + The gradients of each training sample. Those are the gradients of the + loss w.r.t the predictions, evaluated at iteration i - 1. + hessians : array-like, shape=(n_samples,) + The hessians of each training sample. Those are the hessians of the + loss w.r.t the predictions, evaluated at iteration i - 1. + """ + cdef public: + const X_BINNED_DTYPE_C [::1, :] X_binned + unsigned int n_features + unsigned int max_bins + G_H_DTYPE_C [::1] gradients + G_H_DTYPE_C [::1] hessians + G_H_DTYPE_C [::1] ordered_gradients + G_H_DTYPE_C [::1] ordered_hessians + unsigned char hessians_are_constant + + def __init__(self, const X_BINNED_DTYPE_C [::1, :] X_binned, unsigned int + max_bins, G_H_DTYPE_C [::1] gradients, + G_H_DTYPE_C [::1] hessians, + unsigned char hessians_are_constant): + + self.X_binned = X_binned + self.n_features = X_binned.shape[1] + # Note: all histograms will have bins, but some of the + # last bins may be unused if n_bins_per_feature[f] < max_bins + self.max_bins = max_bins + self.gradients = gradients + self.hessians = hessians + # for root node, gradients and hessians are already ordered + self.ordered_gradients = gradients.copy() + self.ordered_hessians = hessians.copy() + self.hessians_are_constant = hessians_are_constant + + def compute_histograms_brute( + HistogramBuilder self, + const unsigned int [::1] sample_indices): # IN + """Compute the histograms of the node by scanning through all the data. + + For a given feature, the complexity is O(n_samples) + + Parameters + ---------- + sample_indices : array of int + The indices of the samples at the node to split. + + Returns + ------- + histograms : array of HISTOGRAM_DTYPE of shape(n_features, max_bins) + The computed histograms of the current node + """ + cdef: + int n_samples + int feature_idx + int i + # need local views to avoid python interactions + unsigned char hessians_are_constant = \ + self.hessians_are_constant + int n_features = self.n_features + G_H_DTYPE_C [::1] ordered_gradients = self.ordered_gradients + G_H_DTYPE_C [::1] gradients = self.gradients + G_H_DTYPE_C [::1] ordered_hessians = self.ordered_hessians + G_H_DTYPE_C [::1] hessians = self.hessians + hist_struct [:, ::1] histograms = np.zeros( + shape=(self.n_features, self.max_bins), + dtype=HISTOGRAM_DTYPE + ) + + with nogil: + n_samples = sample_indices.shape[0] + + # Populate ordered_gradients and ordered_hessians. (Already done + # for root) Ordering the gradients and hessians helps to improve + # cache hit. 
+ if sample_indices.shape[0] != gradients.shape[0]: + if hessians_are_constant: + for i in prange(n_samples, schedule='static'): + ordered_gradients[i] = gradients[sample_indices[i]] + else: + for i in prange(n_samples, schedule='static'): + ordered_gradients[i] = gradients[sample_indices[i]] + ordered_hessians[i] = hessians[sample_indices[i]] + + for feature_idx in prange(n_features): + # Compute histogram of each feature + self._compute_histogram_brute_single_feature( + feature_idx, sample_indices, histograms) + + return histograms + + cdef void _compute_histogram_brute_single_feature( + HistogramBuilder self, + const int feature_idx, + const unsigned int [::1] sample_indices, # IN + hist_struct [:, ::1] histograms) nogil: # OUT + """Compute the histogram for a given feature.""" + + cdef: + unsigned int n_samples = sample_indices.shape[0] + const X_BINNED_DTYPE_C [::1] X_binned = \ + self.X_binned[:, feature_idx] + unsigned int root_node = X_binned.shape[0] == n_samples + G_H_DTYPE_C [::1] ordered_gradients = \ + self.ordered_gradients[:n_samples] + G_H_DTYPE_C [::1] ordered_hessians = \ + self.ordered_hessians[:n_samples] + unsigned char hessians_are_constant = \ + self.hessians_are_constant + + if root_node: + if hessians_are_constant: + _build_histogram_root_no_hessian(feature_idx, X_binned, + ordered_gradients, + histograms) + else: + _build_histogram_root(feature_idx, X_binned, + ordered_gradients, ordered_hessians, + histograms) + else: + if hessians_are_constant: + _build_histogram_no_hessian(feature_idx, + sample_indices, X_binned, + ordered_gradients, histograms) + else: + _build_histogram(feature_idx, sample_indices, + X_binned, ordered_gradients, + ordered_hessians, histograms) + + def compute_histograms_subtraction( + HistogramBuilder self, + hist_struct [:, ::1] parent_histograms, # IN + hist_struct [:, ::1] sibling_histograms): # IN + """Compute the histograms of the node using the subtraction trick. + + hist(parent) = hist(left_child) + hist(right_child) + + For a given feature, the complexity is O(n_bins). This is much more + efficient than compute_histograms_brute, but it's only possible for one + of the siblings. 
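A small self-contained NumPy sketch of the subtraction trick described above (illustrative only, reduced to a single feature and gradient sums; not code from this patch):

import numpy as np

rng = np.random.RandomState(0)
n_bins, n_samples = 8, 1000
bins = rng.randint(0, n_bins, size=n_samples)        # binned feature values
gradients = rng.randn(n_samples)
goes_left = rng.rand(n_samples) < 0.3                # arbitrary split of the node

def brute_histogram(mask):
    # O(n_samples in the child): scan every sample of the child
    return np.bincount(bins[mask], weights=gradients[mask], minlength=n_bins)

hist_parent = brute_histogram(np.ones(n_samples, dtype=bool))
hist_left = brute_histogram(goes_left)               # brute on one child
hist_right = hist_parent - hist_left                 # O(n_bins) for its sibling
assert np.allclose(hist_right, brute_histogram(~goes_left))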
+ + Parameters + ---------- + parent_histograms : array of HISTOGRAM_DTYPE of \ + shape(n_features, max_bins) + The histograms of the parent + sibling_histograms : array of HISTOGRAM_DTYPE of \ + shape(n_features, max_bins) + The histograms of the sibling + + Returns + ------- + histograms : array of HISTOGRAM_DTYPE of shape(n_features, max_bins) + The computed histograms of the current node + """ + + cdef: + int feature_idx + int n_features = self.n_features + hist_struct [:, ::1] histograms = np.zeros( + shape=(self.n_features, self.max_bins), + dtype=HISTOGRAM_DTYPE + ) + + for feature_idx in prange(n_features, nogil=True): + # Compute histogram of each feature + _subtract_histograms(feature_idx, + self.max_bins, + parent_histograms, + sibling_histograms, + histograms) + return histograms cpdef void _build_histogram_naive( @@ -49,6 +248,7 @@ cpdef void _subtract_histograms( hist_struct [:, ::1] hist_a, # IN hist_struct [:, ::1] hist_b, # IN hist_struct [:, ::1] out) nogil: # OUT + """compute (hist_a - hist_b) in out""" cdef: unsigned int i = 0 for i in range(n_bins): @@ -73,6 +273,7 @@ cpdef void _build_histogram( const G_H_DTYPE_C [::1] ordered_gradients, # IN const G_H_DTYPE_C [::1] ordered_hessians, # IN hist_struct [:, ::1] out) nogil: # OUT + """Return histogram for a given feature.""" cdef: unsigned int i = 0 unsigned int n_node_samples = sample_indices.shape[0] @@ -118,6 +319,11 @@ cpdef void _build_histogram_no_hessian( const X_BINNED_DTYPE_C [::1] binned_feature, # IN const G_H_DTYPE_C [::1] ordered_gradients, # IN hist_struct [:, ::1] out) nogil: # OUT + """Return histogram for a given feature, not updating hessians. + + Used when the hessians of the loss are constant (typically LS loss). + """ + cdef: unsigned int i = 0 unsigned int n_node_samples = sample_indices.shape[0] @@ -157,6 +363,13 @@ cpdef void _build_histogram_root( const G_H_DTYPE_C [::1] all_gradients, # IN const G_H_DTYPE_C [::1] all_hessians, # IN hist_struct [:, ::1] out) nogil: # OUT + """Compute histogram of the root node. + + Unlike other nodes, the root node has to find the split among *all* the + samples from the training set. binned_feature and all_gradients / + all_hessians already have a consistent ordering. + """ + cdef: unsigned int i = 0 unsigned int n_samples = binned_feature.shape[0] @@ -202,6 +415,11 @@ cpdef void _build_histogram_root_no_hessian( const X_BINNED_DTYPE_C [::1] binned_feature, # IN const G_H_DTYPE_C [::1] all_gradients, # IN hist_struct [:, ::1] out) nogil: # OUT + """Compute histogram of the root node, not updating hessians. + + Used when the hessians of the loss are constant (typically LS loss). 
+ """ + cdef: unsigned int i = 0 unsigned int n_samples = binned_feature.shape[0] diff --git a/sklearn/_fast_gradient_boosting/splitting.pyx b/sklearn/_fast_gradient_boosting/splitting.pyx index 414f05daa1c14..5aa9e0ffa86c8 100644 --- a/sklearn/_fast_gradient_boosting/splitting.pyx +++ b/sklearn/_fast_gradient_boosting/splitting.pyx @@ -19,12 +19,6 @@ from openmp cimport omp_get_max_threads from libc.stdlib cimport malloc, free from libc.string cimport memcpy -from .histogram cimport _build_histogram -from .histogram cimport _build_histogram_no_hessian -from .histogram cimport _build_histogram_root -from .histogram cimport _build_histogram_root_no_hessian -from .histogram cimport _subtract_histograms -# from .histogram cimport _subtract_histograms from .types cimport X_BINNED_DTYPE_C from .types cimport Y_DTYPE_C from .types cimport G_H_DTYPE_C @@ -111,12 +105,6 @@ cdef class Splitter: n_bins_per_feature : array-like of int The actual number of bins needed for each feature, which is lower or equal to max_bins. - gradients : array-like, shape=(n_samples,) - The gradients of each training sample. Those are the gradients of the - loss w.r.t the predictions, evaluated at iteration i - 1. - hessians : array-like, shape=(n_samples,) - The hessians of each training sample. Those are the hessians of the - loss w.r.t the predictions, evaluated at iteration i - 1. l2_regularization : float The L2 regularization parameter. min_hessian_to_split : float @@ -134,10 +122,6 @@ cdef class Splitter: unsigned int n_features unsigned int max_bins unsigned int [::1] n_bins_per_feature - G_H_DTYPE_C [::1] gradients - G_H_DTYPE_C [::1] hessians - G_H_DTYPE_C [::1] ordered_gradients - G_H_DTYPE_C [::1] ordered_hessians unsigned char hessians_are_constant Y_DTYPE_C l2_regularization Y_DTYPE_C min_hessian_to_split @@ -150,10 +134,10 @@ cdef class Splitter: def __init__(self, const X_BINNED_DTYPE_C [::1, :] X_binned, unsigned int max_bins, np.ndarray[np.uint32_t] n_bins_per_feature, - G_H_DTYPE_C [::1] gradients, G_H_DTYPE_C [::1] hessians, Y_DTYPE_C l2_regularization, Y_DTYPE_C min_hessian_to_split=1e-3, unsigned int - min_samples_leaf=20, Y_DTYPE_C min_gain_to_split=0.): + min_samples_leaf=20, Y_DTYPE_C min_gain_to_split=0., + unsigned char hessians_are_constant=False): self.X_binned = X_binned self.n_features = X_binned.shape[1] @@ -161,16 +145,11 @@ cdef class Splitter: # last bins may be unused if n_bins_per_feature[f] < max_bins self.max_bins = max_bins self.n_bins_per_feature = n_bins_per_feature - self.gradients = gradients - self.hessians = hessians - # for root node, gradients and hessians are already ordered - self.ordered_gradients = gradients.copy() - self.ordered_hessians = hessians.copy() - self.hessians_are_constant = hessians.shape[0] == 1 self.l2_regularization = l2_regularization self.min_hessian_to_split = min_hessian_to_split self.min_samples_leaf = min_samples_leaf self.min_gain_to_split = min_gain_to_split + self.hessians_are_constant = hessians_are_constant # The partition array maps each sample index into the leaves of the # tree (a leaf in this context is a node that isn't splitted yet, not @@ -500,140 +479,6 @@ cdef class Splitter: return best_split - def compute_histograms_brute( - Splitter self, - const unsigned int [::1] sample_indices): # IN - """Compute the histograms of the node by scanning through all the data. - - For a given feature, the complexity is O(n_samples) - - Parameters - ---------- - sample_indices : array of int - The indices of the samples at the node to split. 
- - Returns - ------- - histograms : array of HISTOGRAM_DTYPE of shape(n_features, max_bins) - The computed histograms of the current node - """ - cdef: - int n_samples - int feature_idx - int n_features = self.n_features - int i - # need local views to avoid python interactions - G_H_DTYPE_C [::1] ordered_gradients = self.ordered_gradients - G_H_DTYPE_C [::1] gradients = self.gradients - G_H_DTYPE_C [::1] ordered_hessians = self.ordered_hessians - G_H_DTYPE_C [::1] hessians = self.hessians - hist_struct [:, ::1] histograms = np.zeros( - shape=(self.n_features, self.max_bins), - dtype=HISTOGRAM_DTYPE - ) - - with nogil: - n_samples = sample_indices.shape[0] - - # Populate ordered_gradients and ordered_hessians. (Already done - # for root) Ordering the gradients and hessians helps to improve - # cache hit. - if sample_indices.shape[0] != gradients.shape[0]: - if self.hessians_are_constant: - for i in prange(n_samples, schedule='static'): - ordered_gradients[i] = gradients[sample_indices[i]] - else: - for i in prange(n_samples, schedule='static'): - ordered_gradients[i] = gradients[sample_indices[i]] - ordered_hessians[i] = hessians[sample_indices[i]] - - for feature_idx in prange(n_features): - # Compute histogram of each feature - self._compute_histogram_single_feature( - feature_idx, sample_indices, histograms) - - return histograms - - cdef void _compute_histogram_single_feature( - Splitter self, - const int feature_idx, - const unsigned int [::1] sample_indices, # IN - hist_struct [:, ::1] histograms) nogil: # OUT - """Compute the histogram for a given feature.""" - - cdef: - unsigned int n_samples = sample_indices.shape[0] - const X_BINNED_DTYPE_C [::1] X_binned = \ - self.X_binned[:, feature_idx] - unsigned int root_node = X_binned.shape[0] == n_samples - G_H_DTYPE_C [::1] ordered_gradients = \ - self.ordered_gradients[:n_samples] - G_H_DTYPE_C [::1] ordered_hessians = \ - self.ordered_hessians[:n_samples] - - if root_node: - if self.hessians_are_constant: - _build_histogram_root_no_hessian(feature_idx, X_binned, - ordered_gradients, - histograms) - else: - _build_histogram_root(feature_idx, X_binned, - ordered_gradients, ordered_hessians, - histograms) - else: - if self.hessians_are_constant: - _build_histogram_no_hessian(feature_idx, - sample_indices, X_binned, - ordered_gradients, histograms) - else: - _build_histogram(feature_idx, sample_indices, - X_binned, ordered_gradients, - ordered_hessians, histograms) - - def compute_histograms_subtraction( - Splitter self, - hist_struct [:, ::1] parent_histograms, # IN - hist_struct [:, ::1] sibling_histograms): # IN - """Compute the histograms of the node using the subtraction trick. - - hist(parent) = hist(left_child) + hist(right_child) - - For a given feature, the complexity is O(n_bins). This is much more - efficient than compute_histograms_brute, but it's only possible for one - of the siblings. 
- - Parameters - ---------- - parent_histograms : array of HISTOGRAM_DTYPE of \ - shape(n_features, max_bins) - The histograms of the parent - sibling_histograms : array of HISTOGRAM_DTYPE of \ - shape(n_features, max_bins) - The histograms of the sibling - - Returns - ------- - histograms : array of HISTOGRAM_DTYPE of shape(n_features, max_bins) - The computed histograms of the current node - """ - - cdef: - int feature_idx - int n_features = self.n_features - hist_struct [:, ::1] histograms = np.zeros( - shape=(self.n_features, self.max_bins), - dtype=HISTOGRAM_DTYPE - ) - - for feature_idx in prange(n_features, nogil=True): - # Compute histogram of each feature - _subtract_histograms(feature_idx, - self.max_bins, - parent_histograms, - sibling_histograms, - histograms) - return histograms - cdef inline Y_DTYPE_C _split_gain( Y_DTYPE_C sum_gradient_left, diff --git a/sklearn/_fast_gradient_boosting/tests/test_splitting.py b/sklearn/_fast_gradient_boosting/tests/test_splitting.py index 61ef115aa18a5..2e9d37c12da02 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_splitting.py +++ b/sklearn/_fast_gradient_boosting/tests/test_splitting.py @@ -7,6 +7,7 @@ from sklearn._fast_gradient_boosting.types import G_H_DTYPE from sklearn._fast_gradient_boosting.types import X_BINNED_DTYPE from sklearn._fast_gradient_boosting.splitting import Splitter +from sklearn._fast_gradient_boosting.histogram import HistogramBuilder @pytest.mark.parametrize('n_bins', [3, 32, 256]) @@ -24,6 +25,7 @@ def test_histogram_split(n_bins): ordered_hessians = np.ones_like(binned_feature, dtype=G_H_DTYPE) all_hessians = ordered_hessians sum_hessians = all_hessians.sum() + hessians_are_constant = False for true_bin in range(1, n_bins - 1): for sign in [-1, 1]: @@ -35,15 +37,20 @@ def test_histogram_split(n_bins): n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) + builder = HistogramBuilder(X_binned, + n_bins, + all_gradients, + all_hessians, + hessians_are_constant) splitter = Splitter(X_binned, n_bins, n_bins_per_feature, - all_gradients, all_hessians, l2_regularization, min_hessian_to_split, - min_samples_leaf, min_gain_to_split) + min_samples_leaf, min_gain_to_split, + hessians_are_constant) - histograms = splitter.compute_histograms_brute(sample_indices) + histograms = builder.compute_histograms_brute(sample_indices) split_info = splitter.find_node_split( sample_indices, histograms, sum_gradients, sum_hessians) @@ -92,20 +99,20 @@ def test_gradient_and_hessian_sanity(constant_hessian): n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) - splitter = Splitter(X_binned, n_bins, - n_bins_per_feature, - all_gradients, all_hessians, + builder = HistogramBuilder(X_binned, n_bins, all_gradients, + all_hessians, constant_hessian) + splitter = Splitter(X_binned, n_bins, n_bins_per_feature, l2_regularization, min_hessian_to_split, - min_samples_leaf, min_gain_to_split) + min_samples_leaf, min_gain_to_split, constant_hessian) - hists_parent = splitter.compute_histograms_brute(sample_indices) + hists_parent = builder.compute_histograms_brute(sample_indices) si_parent = splitter.find_node_split(sample_indices, hists_parent, sum_gradients, sum_hessians) sample_indices_left, sample_indices_right, _ = splitter.split_indices( si_parent, sample_indices) - hists_left = splitter.compute_histograms_brute(sample_indices_left) - hists_right = splitter.compute_histograms_brute(sample_indices_right) + hists_left = builder.compute_histograms_brute(sample_indices_left) + hists_right = 
builder.compute_histograms_brute(sample_indices_right) si_left = splitter.find_node_split(sample_indices_left, hists_left, si_parent.sum_gradient_left, si_parent.sum_hessian_left) @@ -184,18 +191,21 @@ def test_split_indices(): all_hessians = np.ones(1, dtype=G_H_DTYPE) sum_gradients = all_gradients.sum() sum_hessians = 1 * n_samples + hessians_are_constant = True n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) - splitter = Splitter(X_binned, n_bins, - n_bins_per_feature, - all_gradients, all_hessians, + builder = HistogramBuilder(X_binned, n_bins, + all_gradients, all_hessians, + hessians_are_constant) + splitter = Splitter(X_binned, n_bins, n_bins_per_feature, l2_regularization, min_hessian_to_split, - min_samples_leaf, min_gain_to_split) + min_samples_leaf, min_gain_to_split, + hessians_are_constant) assert_array_almost_equal(sample_indices, splitter.partition) - histograms = splitter.compute_histograms_brute(sample_indices) + histograms = builder.compute_histograms_brute(sample_indices) si_root = splitter.find_node_split(sample_indices, histograms, sum_gradients, sum_hessians) @@ -239,16 +249,18 @@ def test_min_gain_to_split(): all_gradients = np.ones_like(binned_feature, dtype=G_H_DTYPE) sum_gradients = all_gradients.sum() sum_hessians = all_hessians.sum() + hessians_are_constant = False n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) + builder = HistogramBuilder(X_binned, n_bins, all_gradients, + all_hessians, hessians_are_constant) splitter = Splitter(X_binned, n_bins, n_bins_per_feature, - all_gradients, all_hessians, - l2_regularization, - min_hessian_to_split, - min_samples_leaf, min_gain_to_split) + l2_regularization, min_hessian_to_split, + min_samples_leaf, min_gain_to_split, + hessians_are_constant) - histograms = splitter.compute_histograms_brute(sample_indices) + histograms = builder.compute_histograms_brute(sample_indices) split_info = splitter.find_node_split(sample_indices, histograms, sum_gradients, sum_hessians) assert split_info.gain == -1 From 8de4e4f53950382fb13176dc41849402debe91dc Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 1 Mar 2019 07:10:09 -0500 Subject: [PATCH 142/247] Added compute_hist_time for verbose output --- .../gradient_boosting.py | 4 ++++ sklearn/_fast_gradient_boosting/grower.py | 20 ++++++------------- sklearn/_fast_gradient_boosting/splitting.pyx | 1 - 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index 8081cc813632f..ca28309be2779 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -92,6 +92,7 @@ def fit(self, X, y): fit_start_time = time() acc_find_split_time = 0. # time spent finding the best splits acc_apply_split_time = 0. # time spent splitting nodes + acc_compute_hist_time = 0. # time spent computing histograms # time spent predicting X for gradient and hessians update acc_prediction_time = 0. 
X, y = check_X_y(X, y, dtype=[X_DTYPE]) @@ -235,6 +236,7 @@ def fit(self, X, y): acc_apply_split_time += grower.total_apply_split_time acc_find_split_time += grower.total_find_split_time + acc_compute_hist_time += grower.total_compute_hist_time predictor = grower.make_predictor( bin_thresholds=self.bin_mapper_.bin_thresholds_) @@ -271,6 +273,8 @@ def fit(self, X, y): for predictors_at_ith_iteration in self._predictors) print("Fit {} trees in {:.3f} s, ({} total leaves)".format( n_predictors, duration, n_total_leaves)) + print("{:<32} {:.3f}s".format('Time spent computing histograms:', + acc_compute_hist_time)) print("{:<32} {:.3f}s".format('Time spent finding best splits:', acc_find_split_time)) print("{:<32} {:.3f}s".format('Time spent applying splits:', diff --git a/sklearn/_fast_gradient_boosting/grower.py b/sklearn/_fast_gradient_boosting/grower.py index 3ba6b3a3b5031..dc62c9b250559 100644 --- a/sklearn/_fast_gradient_boosting/grower.py +++ b/sklearn/_fast_gradient_boosting/grower.py @@ -56,14 +56,6 @@ class TreeNode: value : float or None The value of the leaf, as computed in finalize_leaf(). None for non-leaf nodes - find_split_time : float - The total time spent computing the histogram and finding the best - split at the node. - apply_split_time : float - The total time spent actually splitting the node, e.g. splitting - sample_indices into left and right child. - hist_subtraction : bool - Wheter the subtraction method was used for computing the histograms. partition_start : int start position of the node's sample_indices in splitter.partition partition_stop : int @@ -77,9 +69,6 @@ class TreeNode: histograms = None sibling = None parent = None - find_split_time = 0. - apply_split_time = 0. - hist_subtraction = False # start and stop indices of the node in the splitter.partition # array. Concretely, @@ -208,6 +197,7 @@ def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, self.splittable_nodes = [] self.finalized_leaves = [] self.total_find_split_time = 0. # time spent finding the best splits + self.total_compute_hist_time = 0. # time spent computing histograms self.total_apply_split_time = 0. # time spent splitting nodes self._intilialize_root(gradients, hessians, hessians_are_constant) self.n_nodes = 1 @@ -324,9 +314,7 @@ def split_next(self): sample_indices_right, right_child_pos) = self.splitter.split_indices(node.split_info, node.sample_indices) - toc = time() - node.apply_split_time = toc - tic - self.total_apply_split_time += node.apply_split_time + self.total_apply_split_time += time() - tic depth = node.depth + 1 n_leaf_nodes = len(self.finalized_leaves) + len(self.splittable_nodes) @@ -393,17 +381,21 @@ def split_next(self): # We use the brute O(n_samples) method on the child that has the # smallest number of samples, and the subtraction trick O(n_bins) # on the other one. 
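        # (Illustrative arithmetic, not part of the patch: if a node with
        # 1,000,000 samples is split into children of 10,000 and 990,000
        # samples, the brute method only has to scan the 10,000 samples of
        # the small child, and the large child's histograms then cost about
        # max_bins (e.g. 256) subtractions per feature, instead of a scan
        # over the 990,000 samples of the large child.)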
+ tic = time() smallest_child.histograms = \ self.histogram_builder.compute_histograms_brute( smallest_child.sample_indices) largest_child.histograms = \ self.histogram_builder.compute_histograms_subtraction( node.histograms, smallest_child.histograms) + self.total_compute_hist_time += time() - tic + tic = time() if should_split_left: self._compute_best_split_and_push(left_child_node) if should_split_right: self._compute_best_split_and_push(right_child_node) + self.total_find_split_time += time() - tic return left_child_node, right_child_node diff --git a/sklearn/_fast_gradient_boosting/splitting.pyx b/sklearn/_fast_gradient_boosting/splitting.pyx index 5aa9e0ffa86c8..4cf1465e12759 100644 --- a/sklearn/_fast_gradient_boosting/splitting.pyx +++ b/sklearn/_fast_gradient_boosting/splitting.pyx @@ -21,7 +21,6 @@ from libc.string cimport memcpy from .types cimport X_BINNED_DTYPE_C from .types cimport Y_DTYPE_C -from .types cimport G_H_DTYPE_C from .types cimport hist_struct from .types import HISTOGRAM_DTYPE From c76dcd4f75bc09ac75528ccdcab2e5273ecc1f03 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 1 Mar 2019 07:39:36 -0500 Subject: [PATCH 143/247] some cleaning --- sklearn/_fast_gradient_boosting/_loss.pyx | 5 ----- sklearn/_fast_gradient_boosting/grower.py | 1 - sklearn/_fast_gradient_boosting/histogram.pyx | 8 +++++++- sklearn/_fast_gradient_boosting/splitting.pyx | 9 +-------- 4 files changed, 8 insertions(+), 15 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/_loss.pyx b/sklearn/_fast_gradient_boosting/_loss.pyx index 5f275181a8272..dbb67829894b2 100644 --- a/sklearn/_fast_gradient_boosting/_loss.pyx +++ b/sklearn/_fast_gradient_boosting/_loss.pyx @@ -9,11 +9,6 @@ cimport cython from cython.parallel import prange import numpy as np cimport numpy as np -from scipy.special import expit -try: - from scipy.special import logsumexp -except ImportError: - from scipy.misc import logsumexp from libc.math cimport exp diff --git a/sklearn/_fast_gradient_boosting/grower.py b/sklearn/_fast_gradient_boosting/grower.py index dc62c9b250559..d2732570cb74a 100644 --- a/sklearn/_fast_gradient_boosting/grower.py +++ b/sklearn/_fast_gradient_boosting/grower.py @@ -270,7 +270,6 @@ def _intilialize_root(self, gradients, hessians, hessians_are_constant): self._finalize_leaf(self.root) return - # self._compute_spittability(self.root) self.root.histograms = self.histogram_builder.compute_histograms_brute( self.root.sample_indices) self._compute_best_split_and_push(self.root) diff --git a/sklearn/_fast_gradient_boosting/histogram.pyx b/sklearn/_fast_gradient_boosting/histogram.pyx index dc6545d04161e..70478eca57ecb 100644 --- a/sklearn/_fast_gradient_boosting/histogram.pyx +++ b/sklearn/_fast_gradient_boosting/histogram.pyx @@ -20,12 +20,18 @@ from .types cimport hist_struct # Note: IN views are read-only, OUT views are write-only +# Note: in a lot of functions here, we pass feature_idx and the whole 2d +# histograms arrays instead a lot just histograms[feature_idx]. This is +# because Cython generated C code will have strange Python interactions (likely +# related to the GIL release and the custom histogram dtype) when using 1d +# histogram arrays. + @cython.final cdef class HistogramBuilder: """A Histogram builder... used to build histograms. - A histogram is an array with n_bins entry of type HISTOGRAM_DTYPE. Each + A histogram is an array with n_bins entries of type HISTOGRAM_DTYPE. Each feature has its own histogram. 
A histogram contains the sum of gradients and hessians of all the samples belonging to each bin. diff --git a/sklearn/_fast_gradient_boosting/splitting.pyx b/sklearn/_fast_gradient_boosting/splitting.pyx index 4cf1465e12759..2aa9a77644300 100644 --- a/sklearn/_fast_gradient_boosting/splitting.pyx +++ b/sklearn/_fast_gradient_boosting/splitting.pyx @@ -25,16 +25,9 @@ from .types cimport hist_struct from .types import HISTOGRAM_DTYPE -# Note: in a lot of functions here, we pass feature_idx and the whole 2d -# histograms arrays instead a lot just histograms[feature_idx]. This is -# because Cython generated C code will have strange Python interactions (likely -# related to the GIL release and the custom histogram dtype) when using 1d -# histogram arrays. - - cdef struct split_info_struct: # Same as the SplitInfo class, but we need a C struct to use it in the - # nogil sections + # nogil sections and to use in arrays. Y_DTYPE_C gain int feature_idx unsigned int bin_idx From ee96ac3e3b4c215e9565958fc389c9f6127cc1ae Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 1 Mar 2019 07:45:14 -0500 Subject: [PATCH 144/247] Fixed constant hessian issue --- sklearn/_fast_gradient_boosting/loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_fast_gradient_boosting/loss.py b/sklearn/_fast_gradient_boosting/loss.py index 7f7334ae141ed..f35aa1c72c091 100644 --- a/sklearn/_fast_gradient_boosting/loss.py +++ b/sklearn/_fast_gradient_boosting/loss.py @@ -54,7 +54,7 @@ def init_gradients_and_hessians(self, n_samples, prediction_dim): # if the hessians are constant, we consider they are equal to 1. # this is correct as long as we adjust the gradients. See e.g. LS # loss - hessians = np.ones(shape=shape, dtype=G_H_DTYPE) + hessians = np.ones(shape=(1, 1), dtype=G_H_DTYPE) else: hessians = np.empty(shape=shape, dtype=G_H_DTYPE) From c08ca89b9b52a41b26e6a9ebf308eb95a22e2f91 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Fri, 15 Mar 2019 12:08:15 +0100 Subject: [PATCH 145/247] Update sklearn/_fast_gradient_boosting/_binning.pyx typo Co-Authored-By: NicolasHug --- sklearn/_fast_gradient_boosting/_binning.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_fast_gradient_boosting/_binning.pyx b/sklearn/_fast_gradient_boosting/_binning.pyx index 711cdf99697a9..2019f7fd0955a 100644 --- a/sklearn/_fast_gradient_boosting/_binning.pyx +++ b/sklearn/_fast_gradient_boosting/_binning.pyx @@ -40,7 +40,7 @@ cpdef _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds, cpdef void _map_num_col_to_bins(const X_DTYPE_C [:] data, const X_DTYPE_C [:] binning_thresholds, X_BINNED_DTYPE_C [:] binned): - """Binary search to the find the bin index for each value in data.""" + """Binary search to find the bin index for each value in the data.""" cdef: int i int left From bc0d805855fa0dc2020b64e509bc0022e6e72fe2 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 15 Mar 2019 08:30:34 -0400 Subject: [PATCH 146/247] Removed wrapper functions in loss updates --- sklearn/_fast_gradient_boosting/_loss.pyx | 54 ++++++----------------- 1 file changed, 13 insertions(+), 41 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/_loss.pyx b/sklearn/_fast_gradient_boosting/_loss.pyx index dbb67829894b2..91c3e53101ed6 100644 --- a/sklearn/_fast_gradient_boosting/_loss.pyx +++ b/sklearn/_fast_gradient_boosting/_loss.pyx @@ -17,37 +17,10 @@ from .types cimport G_H_DTYPE_C def _update_gradients_least_squares( - G_H_DTYPE_C [::1] gradients, - const Y_DTYPE_C [::1] y_true, - const 
Y_DTYPE_C [::1] raw_predictions): + G_H_DTYPE_C [::1] gradients, # OUT + const Y_DTYPE_C [::1] y_true, # IN + const Y_DTYPE_C [::1] raw_predictions): # IN - _update_gradients_least_squares_parallel( - gradients, y_true, raw_predictions) - - -def _update_gradients_hessians_binary_crossentropy( - G_H_DTYPE_C [::1] gradients, - G_H_DTYPE_C [::1] hessians, - const Y_DTYPE_C [::1] y_true, - const Y_DTYPE_C [::1] raw_predictions): - - _update_gradients_hessians_binary_crossentropy_parallel( - gradients, hessians, y_true, raw_predictions) - - -def _update_gradients_hessians_categorical_crossentropy( - G_H_DTYPE_C [:, ::1] gradients, - G_H_DTYPE_C [:, ::1] hessians, - const Y_DTYPE_C [::1] y_true, - const Y_DTYPE_C [:, ::1] raw_predictions): - _update_gradients_hessians_categorical_crossentropy_parallel( - gradients, hessians, y_true, raw_predictions) - - -cdef void _update_gradients_least_squares_parallel( - G_H_DTYPE_C [::1] gradients, - const Y_DTYPE_C [::1] y_true, - const Y_DTYPE_C [::1] raw_predictions): cdef: int n_samples int i @@ -60,11 +33,11 @@ cdef void _update_gradients_least_squares_parallel( gradients[i] = raw_predictions[i] - y_true[i] -cdef void _update_gradients_hessians_binary_crossentropy_parallel( - G_H_DTYPE_C [::1] gradients, - G_H_DTYPE_C [::1] hessians, - const Y_DTYPE_C [::1] y_true, - const Y_DTYPE_C [::1] raw_predictions): +def _update_gradients_hessians_binary_crossentropy( + G_H_DTYPE_C [::1] gradients, # OUT + G_H_DTYPE_C [::1] hessians, # OUT + const Y_DTYPE_C [::1] y_true, # IN + const Y_DTYPE_C [::1] raw_predictions): # IN cdef: int n_samples Y_DTYPE_C p_i # proba that ith sample belongs to positive class @@ -77,12 +50,11 @@ cdef void _update_gradients_hessians_binary_crossentropy_parallel( hessians[i] = p_i * (1. - p_i) -cdef void _update_gradients_hessians_categorical_crossentropy_parallel( - G_H_DTYPE_C [:, ::1] gradients, # shape (pred_dim, n_samples), OUT - G_H_DTYPE_C [:, ::1] hessians, # shape (pred_dim, n_samples), OUT - const Y_DTYPE_C [::1] y_true, # shape (n_samples,), IN - # shape (pred_dim, n_samples), IN - const Y_DTYPE_C [:, ::1] raw_predictions): +def _update_gradients_hessians_categorical_crossentropy( + G_H_DTYPE_C [:, ::1] gradients, # OUT + G_H_DTYPE_C [:, ::1] hessians, # OUT + const Y_DTYPE_C [::1] y_true, # IN + const Y_DTYPE_C [:, ::1] raw_predictions): # IN cdef: int prediction_dim = raw_predictions.shape[0] int n_samples = raw_predictions.shape[1] From fcfbf6473f9a715850c92db08c8c30689b595e20 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 15 Mar 2019 09:40:20 -0400 Subject: [PATCH 147/247] Addressed comments from Adrin --- doc/modules/ensemble.rst | 11 ++++---- sklearn/_fast_gradient_boosting/__init__.py | 2 +- .../_gradient_boosting.pyx | 2 +- sklearn/_fast_gradient_boosting/binning.py | 28 +++++++++++++++---- sklearn/_fast_gradient_boosting/histogram.pyx | 6 ++-- sklearn/_fast_gradient_boosting/loss.py | 7 +++-- .../_fast_gradient_boosting/tests/__init__.py | 0 .../tests/test_binning.py | 11 +++++++- 8 files changed, 47 insertions(+), 20 deletions(-) delete mode 100644 sklearn/_fast_gradient_boosting/tests/__init__.py diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 3a365a7242939..eabc707b84a81 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -463,11 +463,12 @@ trees. in version 0.21 and are considerably faster than :class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor` when the number of samples is bigger than ``10 000``. 
These fast estimators - first bin the input samples `X` into integer-valued bins (typically 256 bins) - which tremendously reduces the number of splitting points to consider, and - allow the algorithm to leverage integer-based data structures. The API of - these new estimators is slightly different, and some features are not yet - supported. + first bin the input samples ``X`` into integer-valued bins (typically 256 + bins) which tremendously reduces the number of splitting points to + consider, and allow the algorithm to leverage integer-based data + structures. The API of these new estimators is slightly different, and + some of the features from :class:`GradientBoostingClassifier` and + :class:`GradientBoostingRegressor` are not yet supported. The following doc focuses on :class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor` only, which might be preferred for small diff --git a/sklearn/_fast_gradient_boosting/__init__.py b/sklearn/_fast_gradient_boosting/__init__.py index 46b26b56263a8..1a0e0b67e35f7 100644 --- a/sklearn/_fast_gradient_boosting/__init__.py +++ b/sklearn/_fast_gradient_boosting/__init__.py @@ -1,4 +1,4 @@ -"""This module implements the 'fast' gradient boosting estimators. +"""This module implements histogram-based gradient boosting estimators. The implementation is a port from pygbm which is itself strongly inspired from LightGBM. diff --git a/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx b/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx index ed4e85344e697..d13e463e3f29b 100644 --- a/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx +++ b/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx @@ -40,7 +40,7 @@ def _update_raw_predictions( values) -cdef void _update_raw_predictions_helper( +cdef inline void _update_raw_predictions_helper( Y_DTYPE_C [::1] raw_predictions, # OUT const unsigned int [::1] starts, const unsigned int [::1] stops, diff --git a/sklearn/_fast_gradient_boosting/binning.py b/sklearn/_fast_gradient_boosting/binning.py index a7738d6607161..5fd03d3d7b7cb 100644 --- a/sklearn/_fast_gradient_boosting/binning.py +++ b/sklearn/_fast_gradient_boosting/binning.py @@ -16,10 +16,25 @@ from .types import X_DTYPE, X_BINNED_DTYPE -def _find_binning_thresholds(data, max_bins=256, subsample=int(2e5), - random_state=None): +def _find_binning_thresholds(data, max_bins, subsample, random_state): """Extract feature-wise quantiles from numerical data. + Parameters + ---------- + data: array-like + The data to bin + max_bins : int + The maximum number of bins to use. If for a given feature the number of + unique values is less than ``max_bins``, then those unique values + will be used to compute the bin thresholds, instead of the quantiles. + subsample : int or None + If ``n_samples > subsample``, then ``sub_samples`` samples will be + randomly choosen to compute the quantiles. If ``None``, the whole data + is used. + random_state: int or numpy.random.RandomState or None + Pseudo-random number generator to control the random sub-sampling. + See :term:`random_state`. + Return ------ binning_thresholds: tuple of arrays @@ -76,17 +91,16 @@ class _BinMapper(BaseEstimator, TransformerMixin): The maximum number of bins to use. If for a given feature the number of unique values is less than ``max_bins``, then those unique values will be used to compute the bin thresholds, instead of the quantiles. 
- subsample : int or None, optional (default=1e5) + subsample : int or None, optional (default=2e5) If ``n_samples > subsample``, then ``sub_samples`` samples will be randomly choosen to compute the quantiles. If ``None``, the whole data is used. random_state: int or numpy.random.RandomState or None, \ optional (default=None) Pseudo-random number generator to control the random sub-sampling. - See `scikit-learn glossary - `_. + See :term:`random_state`. """ - def __init__(self, max_bins=256, subsample=int(1e5), random_state=None): + def __init__(self, max_bins=256, subsample=int(2e5), random_state=None): self.max_bins = max_bins self.subsample = subsample self.random_state = random_state @@ -98,6 +112,8 @@ def fit(self, X, y=None): ---------- X: array-like The data to bin + y: None + Ignored Returns ------- diff --git a/sklearn/_fast_gradient_boosting/histogram.pyx b/sklearn/_fast_gradient_boosting/histogram.pyx index 70478eca57ecb..1376be8666df3 100644 --- a/sklearn/_fast_gradient_boosting/histogram.pyx +++ b/sklearn/_fast_gradient_boosting/histogram.pyx @@ -21,10 +21,10 @@ from .types cimport hist_struct # Note: IN views are read-only, OUT views are write-only # Note: in a lot of functions here, we pass feature_idx and the whole 2d -# histograms arrays instead a lot just histograms[feature_idx]. This is -# because Cython generated C code will have strange Python interactions (likely +# histograms arrays instead of just histograms[feature_idx]. This is because +# Cython generated C code will have strange Python interactions (likely # related to the GIL release and the custom histogram dtype) when using 1d -# histogram arrays. +# histogram arrays that come from 2d arrays. @cython.final diff --git a/sklearn/_fast_gradient_boosting/loss.py b/sklearn/_fast_gradient_boosting/loss.py index f35aa1c72c091..dcdc067017bd6 100644 --- a/sklearn/_fast_gradient_boosting/loss.py +++ b/sklearn/_fast_gradient_boosting/loss.py @@ -10,7 +10,7 @@ import numpy as np from scipy.special import expit -try: +try: # logsumexp was moved from mist to special in 0.19 from scipy.special import logsumexp except ImportError: from scipy.misc import logsumexp @@ -45,7 +45,7 @@ def init_gradients_and_hessians(self, n_samples, prediction_dim): ------- gradients : array-like, shape=(prediction_dim, n_samples) hessians : array-like, shape=(prediction_dim, n_samples). - If hessians are constant (e.g. for ``LeastSquares`` loss, the + If hessians are constant (e.g. for `LeastSquares` loss, the array is initialized to ``1``. """ shape = (prediction_dim, n_samples) @@ -146,7 +146,8 @@ class BinaryCrossEntropy(BaseLoss): loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i - See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman. + See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman, + section 4.4.1 (about logistic regression). 
""" hessians_are_constant = False diff --git a/sklearn/_fast_gradient_boosting/tests/__init__.py b/sklearn/_fast_gradient_boosting/tests/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/sklearn/_fast_gradient_boosting/tests/test_binning.py b/sklearn/_fast_gradient_boosting/tests/test_binning.py index 71eb5513e668b..41bb655223a2f 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_binning.py +++ b/sklearn/_fast_gradient_boosting/tests/test_binning.py @@ -3,7 +3,8 @@ import pytest from sklearn._fast_gradient_boosting.binning import _BinMapper -from sklearn._fast_gradient_boosting.binning import _find_binning_thresholds +from sklearn._fast_gradient_boosting.binning import ( + _find_binning_thresholds as _find_binning_thresholds_orig) from sklearn._fast_gradient_boosting.binning import _map_to_bins from sklearn._fast_gradient_boosting.types import X_DTYPE, X_BINNED_DTYPE @@ -13,6 +14,14 @@ ).astype(X_DTYPE) +def _find_binning_thresholds(data, max_bins=256, subsample=int(2e5), + random_state=None): + # Just a redef to avoid having to pass arguments all the time (as the + # function is private we don't use default values for parameters) + return _find_binning_thresholds_orig(data, max_bins, subsample, + random_state) + + def test_find_binning_thresholds_regular_data(): data = np.linspace(0, 10, 1001).reshape(-1, 1) bin_thresholds = _find_binning_thresholds(data, max_bins=10) From 2d2c081ceda3d8ef4cfb424b4072e73dd11b3519 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 15 Mar 2019 10:17:21 -0400 Subject: [PATCH 148/247] removed __all__ from _fast.../__init__.py --- sklearn/_fast_gradient_boosting/__init__.py | 4 ---- sklearn/experimental/__init__.py | 6 ++++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/__init__.py b/sklearn/_fast_gradient_boosting/__init__.py index 1a0e0b67e35f7..879fae1189f87 100644 --- a/sklearn/_fast_gradient_boosting/__init__.py +++ b/sklearn/_fast_gradient_boosting/__init__.py @@ -3,7 +3,3 @@ The implementation is a port from pygbm which is itself strongly inspired from LightGBM. """ -from .gradient_boosting import HistGradientBoostingClassifier -from .gradient_boosting import HistGradientBoostingRegressor - -__all__ = ["HistGradientBoostingClassifier", "HistGradientBoostingRegressor"] diff --git a/sklearn/experimental/__init__.py b/sklearn/experimental/__init__.py index c0465f98d06e5..225b1145c741d 100644 --- a/sklearn/experimental/__init__.py +++ b/sklearn/experimental/__init__.py @@ -3,7 +3,9 @@ and behaviour might change without a deprecation cycle. 
""" -from .._fast_gradient_boosting import HistGradientBoostingClassifier -from .._fast_gradient_boosting import HistGradientBoostingRegressor +from .._fast_gradient_boosting.gradient_boosting import ( + HistGradientBoostingClassifier, + HistGradientBoostingRegressor +) __all__ = ['HistGradientBoostingRegressor', 'HistGradientBoostingClassifier'] From 2af250411d6a67f67f7fb0c5a2520dae12555ada Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 16 Mar 2019 13:35:46 -0400 Subject: [PATCH 149/247] optional ( instead of optional( --- .../gradient_boosting.py | 40 +++++++++---------- sklearn/_fast_gradient_boosting/grower.py | 22 +++++----- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/_fast_gradient_boosting/gradient_boosting.py index ca28309be2779..edafe059590fc 100644 --- a/sklearn/_fast_gradient_boosting/gradient_boosting.py +++ b/sklearn/_fast_gradient_boosting/gradient_boosting.py @@ -437,26 +437,26 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): Parameters ---------- - loss : {'least_squares'}, optional(default='least_squares') + loss : {'least_squares'}, optional (default='least_squares') The loss function to use in the boosting process. - learning_rate : float, optional(default=0.1) + learning_rate : float, optional (default=0.1) The learning rate, also known as *shrinkage*. This is used as a multiplicative factor for the leaves values. Use ``1`` for no shrinkage. - max_iter : int, optional(default=100) + max_iter : int, optional (default=100) The maximum number of iterations of the boosting process, i.e. the maximum number of trees. - max_leaf_nodes : int or None, optional(default=None) + max_leaf_nodes : int or None, optional (default=None) The maximum number of leaves for each tree. If None, there is no maximum limit. - max_depth : int or None, optional(default=None) + max_depth : int or None, optional (default=None) The maximum depth of each tree. The depth of a tree is the number of nodes to go from the root to the deepest leaf. - min_samples_leaf : int, optional(default=5) + min_samples_leaf : int, optional (default=5) The minimum number of samples per leaf. - l2_regularization : float, optional(default=0) + l2_regularization : float, optional (default=0) The L2 regularization parameter. Use 0 for no regularization. - max_bins : int, optional(default=256) + max_bins : int, optional (default=256) The maximum number of bins to use. Before training, each feature of the input array ``X`` is binned into at most ``max_bins`` bins, which allows for a much faster training stage. Features with a small @@ -468,7 +468,7 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): :ref:`scoring`). If None, the estimator's default scorer is used. If ``scoring='loss'``, early stopping is checked w.r.t the loss value. Only used if ``n_iter_no_change`` is not None. - validation_fraction : int or float or None, optional(default=0.1) + validation_fraction : int or float or None, optional (default=0.1) Proportion (or absolute size) of training data to set aside as validation data for early stopping. If None, early stopping is done on the training data. Only used if ``n_iter_no_change`` is not None. 
@@ -577,31 +577,31 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, Parameters ---------- loss : {'auto', 'binary_crossentropy', 'categorical_crossentropy'}, \ - optional(default='auto') + optional (default='auto') The loss function to use in the boosting process. 'binary_crossentropy' (also known as logistic loss) is used for binary classification and generalizes to 'categorical_crossentropy' for multiclass classification. 'auto' will automatically choose either loss depending on the nature of the problem. - learning_rate : float, optional(default=1) + learning_rate : float, optional (default=1) The learning rate, also known as *shrinkage*. This is used as a multiplicative factor for the leaves values. Use ``1`` for no shrinkage. - max_iter : int, optional(default=100) + max_iter : int, optional (default=100) The maximum number of iterations of the boosting process, i.e. the maximum number of trees for binary classification. For multiclass classification, `n_classes` trees per iteration are built. - max_leaf_nodes : int or None, optional(default=None) + max_leaf_nodes : int or None, optional (default=None) The maximum number of leaves for each tree. If None, there is no maximum limit. - max_depth : int or None, optional(default=None) + max_depth : int or None, optional (default=None) The maximum depth of each tree. The depth of a tree is the number of nodes to go from the root to the deepest leaf. - min_samples_leaf : int, optional(default=5) + min_samples_leaf : int, optional (default=5) The minimum number of samples per leaf. - l2_regularization : float, optional(default=0) + l2_regularization : float, optional (default=0) The L2 regularization parameter. Use 0 for no regularization. - max_bins : int, optional(default=256) + max_bins : int, optional (default=256) The maximum number of bins to use. Before training, each feature of the input array ``X`` is binned into at most ``max_bins`` bins, which allows for a much faster training stage. Features with a small @@ -613,7 +613,7 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, :ref:`scoring`). If None, the estimator's default scorer is used. If ``scoring='loss'``, early stopping is checked w.r.t the loss value. Only used if ``n_iter_no_change`` is not None. - validation_fraction : int or float or None, optional(default=0.1) + validation_fraction : int or float or None, optional (default=0.1) Proportion (or absolute size) of training data to set aside as validation data for early stopping. If None, early stopping is done on the training data. @@ -627,11 +627,11 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, tolerance, the more likely we are to early stop: higher tolerance means that it will be harder for subsequent iterations to be considered an improvement upon the reference score. - verbose: int, optional(default=0) + verbose: int, optional (default=0) The verbosity level. If not zero, print some information about the fitting process. random_state : int, np.random.RandomStateInstance or None, \ - optional(default=None) + optional (default=None) Pseudo-random number generator to control the subsampling in the binning process, and the train/validation data split if early stopping is enabled. See :term:`random_state`. 
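The docstrings touched in the hunks above describe the experimental histogram-based estimators that this series exposes through ``sklearn.experimental``. As a minimal, illustrative sketch only: it assumes the import path and the documented parameters shown in these diffs (``max_iter``, ``max_bins``, ``learning_rate``), and the toy dataset below is not part of the patch::

    import numpy as np
    from sklearn.experimental import HistGradientBoostingClassifier

    rng = np.random.RandomState(0)
    X = rng.randn(1000, 5)
    y = (X[:, 0] + X[:, 1] > 0).astype(int)  # toy binary target, illustrative only

    # Each feature of X is binned into at most ``max_bins`` integer-valued bins
    # before the trees are grown, which is what makes the training stage fast.
    clf = HistGradientBoostingClassifier(max_iter=100, max_bins=256,
                                         learning_rate=0.1)
    clf.fit(X, y)
    print(clf.score(X, y))

The same pattern applies to ``HistGradientBoostingRegressor`` with the ``least_squares`` loss documented in the regressor hunk above.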
diff --git a/sklearn/_fast_gradient_boosting/grower.py b/sklearn/_fast_gradient_boosting/grower.py index d2732570cb74a..247c17dd1f142 100644 --- a/sklearn/_fast_gradient_boosting/grower.py +++ b/sklearn/_fast_gradient_boosting/grower.py @@ -32,7 +32,7 @@ class TreeNode: The sum of the gradients of the samples at the node sum_hessians : float The sum of the hessians of the samples at the node - parent : TreeNode or None, optional(default=None) + parent : TreeNode or None, optional (default=None) The parent of the node. None for root. Attributes @@ -45,7 +45,7 @@ class TreeNode: The sum of the gradients of the samples at the node sum_hessians : float The sum of the hessians of the samples at the node - parent : TreeNode or None, optional(default=None) + parent : TreeNode or None, optional (default=None) The parent of the node. None for root. split_info : SplitInfo or None The result of the split evaluation @@ -133,32 +133,32 @@ class TreeGrower: hessians : array-like, shape=(n_samples,) The hessians of each training sample. Those are the hessians of the loss w.r.t the predictions, evaluated at iteration ``i - 1``. - max_leaf_nodes : int or None, optional(default=None) + max_leaf_nodes : int or None, optional (default=None) The maximum number of leaves for each tree. If None, there is no maximum limit. - max_depth : int or None, optional(default=None) + max_depth : int or None, optional (default=None) The maximum depth of each tree. The depth of a tree is the number of nodes to go from the root to the deepest leaf. - min_samples_leaf : int, optional(default=20) + min_samples_leaf : int, optional (default=20) The minimum number of samples per leaf. - min_gain_to_split : float, optional(default=0.) + min_gain_to_split : float, optional (default=0.) The minimum gain needed to split a node. Splits with lower gain will be ignored. - max_bins : int, optional(default=256) + max_bins : int, optional (default=256) The maximum number of bins. Used to define the shape of the histograms. - n_bins_per_feature : array-like of int or int, optional(default=None) + n_bins_per_feature : array-like of int or int, optional (default=None) The actual number of bins needed for each feature, which is lower or equal to ``max_bins``. If it's an int, all features are considered to have the same number of bins. If None, all features are considered to have ``max_bins`` bins. - l2_regularization : float, optional(default=0) + l2_regularization : float, optional (default=0) The L2 regularization parameter. - min_hessian_to_split : float, optional(default=1e-3) + min_hessian_to_split : float, optional (default=1e-3) The minimum sum of hessians needed in each node. Splits that result in at least one child having a sum of hessians less than min_hessian_to_split are discarded. - shrinkage : float, optional(default=1) + shrinkage : float, optional (default=1) The shrinkage parameter to apply to the leaves values, also known as learning rate. """ From cec180e1d36c365390a3c6e97229403fc3fd2e91 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 16 Mar 2019 14:11:20 -0400 Subject: [PATCH 150/247] moved _fast_.. 
into sklearn/ensemble/ and renamed *fast* into *hist* --- benchmarks/bench_hist_gradient_boosting.py | 3 +- ...bench_hist_gradient_boosting_higgsboson.py | 3 +- sklearn/_fast_gradient_boosting/setup.py | 47 ------------------- .../_hist_gradient_boosting}/__init__.py | 0 .../_hist_gradient_boosting}/_binning.pyx | 0 .../_gradient_boosting.pyx | 0 .../_hist_gradient_boosting}/_loss.pyx | 0 .../_hist_gradient_boosting}/_predictor.pyx | 0 .../_hist_gradient_boosting}/binning.py | 6 +-- .../gradient_boosting.py | 0 .../_hist_gradient_boosting}/grower.py | 0 .../_hist_gradient_boosting}/histogram.pyx | 0 .../_hist_gradient_boosting}/loss.py | 0 .../_hist_gradient_boosting}/predictor.py | 0 .../_hist_gradient_boosting}/splitting.pyx | 0 .../tests/test_binning.py | 12 +++-- .../tests/test_compare_lightgbm.py | 5 +- .../tests/test_gradient_boosting.py | 0 .../tests/test_grower.py | 10 ++-- .../tests/test_histogram.py | 22 ++++----- .../tests/test_loss.py | 6 +-- .../tests/test_predictor.py | 6 +-- .../tests/test_splitting.py | 10 ++-- .../_hist_gradient_boosting}/types.pxd | 0 .../_hist_gradient_boosting}/types.pyx | 0 .../_hist_gradient_boosting}/utils.pyx | 2 +- sklearn/ensemble/setup.py | 38 ++++++++++++++- sklearn/experimental/__init__.py | 4 +- sklearn/setup.py | 1 - 29 files changed, 84 insertions(+), 91 deletions(-) delete mode 100644 sklearn/_fast_gradient_boosting/setup.py rename sklearn/{_fast_gradient_boosting => ensemble/_hist_gradient_boosting}/__init__.py (100%) rename sklearn/{_fast_gradient_boosting => ensemble/_hist_gradient_boosting}/_binning.pyx (100%) rename sklearn/{_fast_gradient_boosting => ensemble/_hist_gradient_boosting}/_gradient_boosting.pyx (100%) rename sklearn/{_fast_gradient_boosting => ensemble/_hist_gradient_boosting}/_loss.pyx (100%) rename sklearn/{_fast_gradient_boosting => ensemble/_hist_gradient_boosting}/_predictor.pyx (100%) rename sklearn/{_fast_gradient_boosting => ensemble/_hist_gradient_boosting}/binning.py (97%) rename sklearn/{_fast_gradient_boosting => ensemble/_hist_gradient_boosting}/gradient_boosting.py (100%) rename sklearn/{_fast_gradient_boosting => ensemble/_hist_gradient_boosting}/grower.py (100%) rename sklearn/{_fast_gradient_boosting => ensemble/_hist_gradient_boosting}/histogram.pyx (100%) rename sklearn/{_fast_gradient_boosting => ensemble/_hist_gradient_boosting}/loss.py (100%) rename sklearn/{_fast_gradient_boosting => ensemble/_hist_gradient_boosting}/predictor.py (100%) rename sklearn/{_fast_gradient_boosting => ensemble/_hist_gradient_boosting}/splitting.pyx (100%) rename sklearn/{_fast_gradient_boosting => ensemble/_hist_gradient_boosting}/tests/test_binning.py (96%) rename sklearn/{_fast_gradient_boosting => ensemble/_hist_gradient_boosting}/tests/test_compare_lightgbm.py (98%) rename sklearn/{_fast_gradient_boosting => ensemble/_hist_gradient_boosting}/tests/test_gradient_boosting.py (100%) rename sklearn/{_fast_gradient_boosting => ensemble/_hist_gradient_boosting}/tests/test_grower.py (96%) rename sklearn/{_fast_gradient_boosting => ensemble/_hist_gradient_boosting}/tests/test_histogram.py (92%) rename sklearn/{_fast_gradient_boosting => ensemble/_hist_gradient_boosting}/tests/test_loss.py (97%) rename sklearn/{_fast_gradient_boosting => ensemble/_hist_gradient_boosting}/tests/test_predictor.py (84%) rename sklearn/{_fast_gradient_boosting => ensemble/_hist_gradient_boosting}/tests/test_splitting.py (96%) rename sklearn/{_fast_gradient_boosting => ensemble/_hist_gradient_boosting}/types.pxd (100%) rename 
sklearn/{_fast_gradient_boosting => ensemble/_hist_gradient_boosting}/types.pyx (100%) rename sklearn/{_fast_gradient_boosting => ensemble/_hist_gradient_boosting}/utils.pyx (99%) diff --git a/benchmarks/bench_hist_gradient_boosting.py b/benchmarks/bench_hist_gradient_boosting.py index eb3024ec24713..028954741f973 100644 --- a/benchmarks/bench_hist_gradient_boosting.py +++ b/benchmarks/bench_hist_gradient_boosting.py @@ -7,7 +7,8 @@ from sklearn.experimental import HistGradientBoostingRegressor from sklearn.datasets import make_classification from sklearn.datasets import make_regression -from sklearn._fast_gradient_boosting.utils import get_equivalent_estimator +from sklearn.ensemble._hist_gradient_boosting.utils import ( + get_equivalent_estimator) parser = argparse.ArgumentParser() diff --git a/benchmarks/bench_hist_gradient_boosting_higgsboson.py b/benchmarks/bench_hist_gradient_boosting_higgsboson.py index 90ca122d68dbc..fd793f61d3a8c 100644 --- a/benchmarks/bench_hist_gradient_boosting_higgsboson.py +++ b/benchmarks/bench_hist_gradient_boosting_higgsboson.py @@ -10,7 +10,8 @@ from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, roc_auc_score from sklearn.experimental import HistGradientBoostingClassifier -from sklearn._fast_gradient_boosting.utils import get_equivalent_estimator +from sklearn.ensemble._hist_gradient_boosting.utils import ( + get_equivalent_estimator) parser = argparse.ArgumentParser() diff --git a/sklearn/_fast_gradient_boosting/setup.py b/sklearn/_fast_gradient_boosting/setup.py deleted file mode 100644 index 48952619c10e2..0000000000000 --- a/sklearn/_fast_gradient_boosting/setup.py +++ /dev/null @@ -1,47 +0,0 @@ -import numpy -from numpy.distutils.misc_util import Configuration - - -def configuration(parent_package="", top_path=None): - config = Configuration("_fast_gradient_boosting", parent_package, top_path) - - config.add_extension("_gradient_boosting", - sources=["_gradient_boosting.pyx"], - include_dirs=[numpy.get_include()]) - - config.add_extension("histogram", - sources=["histogram.pyx"], - include_dirs=[numpy.get_include()]) - - config.add_extension("splitting", - sources=["splitting.pyx"], - include_dirs=[numpy.get_include()]) - - config.add_extension("_binning", - sources=["_binning.pyx"], - include_dirs=[numpy.get_include()]) - - config.add_extension("_predictor", - sources=["_predictor.pyx"], - include_dirs=[numpy.get_include()]) - - config.add_extension("_loss", - sources=["_loss.pyx"], - include_dirs=[numpy.get_include()]) - - config.add_extension("types", - sources=["types.pyx"], - include_dirs=[numpy.get_include()]) - - config.add_extension("utils", - sources=["utils.pyx"], - include_dirs=[numpy.get_include()]) - - config.add_subpackage("tests") - - return config - - -if __name__ == "__main__": - from numpy.distutils.core import setup - setup(**configuration().todict()) diff --git a/sklearn/_fast_gradient_boosting/__init__.py b/sklearn/ensemble/_hist_gradient_boosting/__init__.py similarity index 100% rename from sklearn/_fast_gradient_boosting/__init__.py rename to sklearn/ensemble/_hist_gradient_boosting/__init__.py diff --git a/sklearn/_fast_gradient_boosting/_binning.pyx b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx similarity index 100% rename from sklearn/_fast_gradient_boosting/_binning.pyx rename to sklearn/ensemble/_hist_gradient_boosting/_binning.pyx diff --git a/sklearn/_fast_gradient_boosting/_gradient_boosting.pyx b/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx 
similarity index 100% rename from sklearn/_fast_gradient_boosting/_gradient_boosting.pyx rename to sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx diff --git a/sklearn/_fast_gradient_boosting/_loss.pyx b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx similarity index 100% rename from sklearn/_fast_gradient_boosting/_loss.pyx rename to sklearn/ensemble/_hist_gradient_boosting/_loss.pyx diff --git a/sklearn/_fast_gradient_boosting/_predictor.pyx b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx similarity index 100% rename from sklearn/_fast_gradient_boosting/_predictor.pyx rename to sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx diff --git a/sklearn/_fast_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py similarity index 97% rename from sklearn/_fast_gradient_boosting/binning.py rename to sklearn/ensemble/_hist_gradient_boosting/binning.py index 5fd03d3d7b7cb..3c98de2e7b01f 100644 --- a/sklearn/_fast_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -9,9 +9,9 @@ import numpy as np -from ..utils import check_random_state, check_array -from ..base import BaseEstimator, TransformerMixin -from ..utils.validation import check_is_fitted +from ...utils import check_random_state, check_array +from ...base import BaseEstimator, TransformerMixin +from ...utils.validation import check_is_fitted from ._binning import _map_to_bins from .types import X_DTYPE, X_BINNED_DTYPE diff --git a/sklearn/_fast_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py similarity index 100% rename from sklearn/_fast_gradient_boosting/gradient_boosting.py rename to sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py diff --git a/sklearn/_fast_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py similarity index 100% rename from sklearn/_fast_gradient_boosting/grower.py rename to sklearn/ensemble/_hist_gradient_boosting/grower.py diff --git a/sklearn/_fast_gradient_boosting/histogram.pyx b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx similarity index 100% rename from sklearn/_fast_gradient_boosting/histogram.pyx rename to sklearn/ensemble/_hist_gradient_boosting/histogram.pyx diff --git a/sklearn/_fast_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py similarity index 100% rename from sklearn/_fast_gradient_boosting/loss.py rename to sklearn/ensemble/_hist_gradient_boosting/loss.py diff --git a/sklearn/_fast_gradient_boosting/predictor.py b/sklearn/ensemble/_hist_gradient_boosting/predictor.py similarity index 100% rename from sklearn/_fast_gradient_boosting/predictor.py rename to sklearn/ensemble/_hist_gradient_boosting/predictor.py diff --git a/sklearn/_fast_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx similarity index 100% rename from sklearn/_fast_gradient_boosting/splitting.pyx rename to sklearn/ensemble/_hist_gradient_boosting/splitting.pyx diff --git a/sklearn/_fast_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py similarity index 96% rename from sklearn/_fast_gradient_boosting/tests/test_binning.py rename to sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index 41bb655223a2f..aac8b0977363e 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -2,11 +2,13 @@ from numpy.testing import 
assert_array_equal, assert_allclose import pytest -from sklearn._fast_gradient_boosting.binning import _BinMapper -from sklearn._fast_gradient_boosting.binning import ( - _find_binning_thresholds as _find_binning_thresholds_orig) -from sklearn._fast_gradient_boosting.binning import _map_to_bins -from sklearn._fast_gradient_boosting.types import X_DTYPE, X_BINNED_DTYPE +from sklearn.ensemble._hist_gradient_boosting.binning import ( + _BinMapper, + _find_binning_thresholds as _find_binning_thresholds_orig, + _map_to_bins +) +from sklearn.ensemble._hist_gradient_boosting.types import X_DTYPE +from sklearn.ensemble._hist_gradient_boosting.types import X_BINNED_DTYPE DATA = np.random.RandomState(42).normal( diff --git a/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py similarity index 98% rename from sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py rename to sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py index 23b395450a0df..03592405ecf9c 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_compare_lightgbm.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py @@ -6,8 +6,9 @@ from sklearn.experimental import HistGradientBoostingRegressor from sklearn.experimental import HistGradientBoostingClassifier -from sklearn._fast_gradient_boosting.binning import _BinMapper -from sklearn._fast_gradient_boosting.utils import get_equivalent_estimator +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper +from sklearn.ensemble._hist_gradient_boosting.utils import ( + get_equivalent_estimator) pytest.importorskip("lightgbm") diff --git a/sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py similarity index 100% rename from sklearn/_fast_gradient_boosting/tests/test_gradient_boosting.py rename to sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py diff --git a/sklearn/_fast_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py similarity index 96% rename from sklearn/_fast_gradient_boosting/tests/test_grower.py rename to sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index f662056c26b6d..ac4ab3c77b696 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -3,11 +3,11 @@ import pytest from pytest import approx -from sklearn._fast_gradient_boosting.grower import TreeGrower -from sklearn._fast_gradient_boosting.binning import _BinMapper -from sklearn._fast_gradient_boosting.types import X_BINNED_DTYPE -from sklearn._fast_gradient_boosting.types import Y_DTYPE -from sklearn._fast_gradient_boosting.types import G_H_DTYPE +from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper +from sklearn.ensemble._hist_gradient_boosting.types import X_BINNED_DTYPE +from sklearn.ensemble._hist_gradient_boosting.types import Y_DTYPE +from sklearn.ensemble._hist_gradient_boosting.types import G_H_DTYPE def _make_training_data(n_bins=256, constant_hessian=True): diff --git a/sklearn/_fast_gradient_boosting/tests/test_histogram.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py similarity index 92% rename from sklearn/_fast_gradient_boosting/tests/test_histogram.py rename to 
sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py index 6cb58e01f1469..20a04c46d4d99 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_histogram.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py @@ -4,17 +4,17 @@ from numpy.testing import assert_allclose from numpy.testing import assert_array_equal -from sklearn._fast_gradient_boosting.histogram import _build_histogram_naive -from sklearn._fast_gradient_boosting.histogram import _build_histogram -from sklearn._fast_gradient_boosting.histogram import \ - _build_histogram_no_hessian -from sklearn._fast_gradient_boosting.histogram import \ - _build_histogram_root_no_hessian -from sklearn._fast_gradient_boosting.histogram import _build_histogram_root -from sklearn._fast_gradient_boosting.histogram import _subtract_histograms -from sklearn._fast_gradient_boosting.types import HISTOGRAM_DTYPE -from sklearn._fast_gradient_boosting.types import G_H_DTYPE -from sklearn._fast_gradient_boosting.types import X_BINNED_DTYPE +from sklearn.ensemble._hist_gradient_boosting.histogram import ( + _build_histogram_naive, + _build_histogram, + _build_histogram_no_hessian, + _build_histogram_root_no_hessian, + _build_histogram_root, + _subtract_histograms +) +from sklearn.ensemble._hist_gradient_boosting.types import HISTOGRAM_DTYPE +from sklearn.ensemble._hist_gradient_boosting.types import G_H_DTYPE +from sklearn.ensemble._hist_gradient_boosting.types import X_BINNED_DTYPE @pytest.mark.parametrize( diff --git a/sklearn/_fast_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py similarity index 97% rename from sklearn/_fast_gradient_boosting/tests/test_loss.py rename to sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index 56a90166dbe9a..408a3582a3670 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -5,9 +5,9 @@ from sklearn.utils import assert_all_finite import pytest -from sklearn._fast_gradient_boosting.loss import _LOSSES -from sklearn._fast_gradient_boosting.types import Y_DTYPE -from sklearn._fast_gradient_boosting.types import G_H_DTYPE +from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES +from sklearn.ensemble._hist_gradient_boosting.types import Y_DTYPE +from sklearn.ensemble._hist_gradient_boosting.types import G_H_DTYPE def get_derivatives_helper(loss): diff --git a/sklearn/_fast_gradient_boosting/tests/test_predictor.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py similarity index 84% rename from sklearn/_fast_gradient_boosting/tests/test_predictor.py rename to sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py index 724a238dabcfb..4a33f5ac68b1f 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_predictor.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py @@ -4,9 +4,9 @@ from sklearn.metrics import r2_score import pytest -from sklearn._fast_gradient_boosting.binning import _BinMapper -from sklearn._fast_gradient_boosting.grower import TreeGrower -from sklearn._fast_gradient_boosting.types import G_H_DTYPE +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper +from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower +from sklearn.ensemble._hist_gradient_boosting.types import G_H_DTYPE @pytest.mark.parametrize('max_bins', [200, 256]) diff --git a/sklearn/_fast_gradient_boosting/tests/test_splitting.py 
b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py similarity index 96% rename from sklearn/_fast_gradient_boosting/tests/test_splitting.py rename to sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index 2e9d37c12da02..567bbb917e868 100644 --- a/sklearn/_fast_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -3,11 +3,11 @@ from numpy.testing import assert_array_almost_equal import pytest -from sklearn._fast_gradient_boosting.types import HISTOGRAM_DTYPE -from sklearn._fast_gradient_boosting.types import G_H_DTYPE -from sklearn._fast_gradient_boosting.types import X_BINNED_DTYPE -from sklearn._fast_gradient_boosting.splitting import Splitter -from sklearn._fast_gradient_boosting.histogram import HistogramBuilder +from sklearn.ensemble._hist_gradient_boosting.types import HISTOGRAM_DTYPE +from sklearn.ensemble._hist_gradient_boosting.types import G_H_DTYPE +from sklearn.ensemble._hist_gradient_boosting.types import X_BINNED_DTYPE +from sklearn.ensemble._hist_gradient_boosting.splitting import Splitter +from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder @pytest.mark.parametrize('n_bins', [3, 32, 256]) diff --git a/sklearn/_fast_gradient_boosting/types.pxd b/sklearn/ensemble/_hist_gradient_boosting/types.pxd similarity index 100% rename from sklearn/_fast_gradient_boosting/types.pxd rename to sklearn/ensemble/_hist_gradient_boosting/types.pxd diff --git a/sklearn/_fast_gradient_boosting/types.pyx b/sklearn/ensemble/_hist_gradient_boosting/types.pyx similarity index 100% rename from sklearn/_fast_gradient_boosting/types.pyx rename to sklearn/ensemble/_hist_gradient_boosting/types.pyx diff --git a/sklearn/_fast_gradient_boosting/utils.pyx b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx similarity index 99% rename from sklearn/_fast_gradient_boosting/utils.pyx rename to sklearn/ensemble/_hist_gradient_boosting/utils.pyx index cdbf6ee032c93..7f64dd6128fe9 100644 --- a/sklearn/_fast_gradient_boosting/utils.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx @@ -7,10 +7,10 @@ from cython.parallel import prange +from ...base import is_classifier from .binning import _BinMapper from .types cimport G_H_DTYPE_C from .types cimport Y_DTYPE_C -from ..base import is_classifier def get_equivalent_estimator(estimator, lib='lightgbm'): diff --git a/sklearn/ensemble/setup.py b/sklearn/ensemble/setup.py index 63a9f25947f91..88e1b2e32d98d 100644 --- a/sklearn/ensemble/setup.py +++ b/sklearn/ensemble/setup.py @@ -4,12 +4,48 @@ def configuration(parent_package="", top_path=None): config = Configuration("ensemble", parent_package, top_path) + config.add_extension("_gradient_boosting", sources=["_gradient_boosting.pyx"], include_dirs=[numpy.get_include()]) config.add_subpackage("tests") - # config.add_data_files("gbm/histogram.pxd") + + # Histogram-based gradient boosting files + config.add_extension( + "_hist_gradient_boosting._gradient_boosting", + sources=["_hist_gradient_boosting/_gradient_boosting.pyx"], + include_dirs=[numpy.get_include()]) + + config.add_extension("_hist_gradient_boosting.histogram", + sources=["_hist_gradient_boosting/histogram.pyx"], + include_dirs=[numpy.get_include()]) + + config.add_extension("_hist_gradient_boosting.splitting", + sources=["_hist_gradient_boosting/splitting.pyx"], + include_dirs=[numpy.get_include()]) + + config.add_extension("_hist_gradient_boosting._binning", + sources=["_hist_gradient_boosting/_binning.pyx"], + 
include_dirs=[numpy.get_include()]) + + config.add_extension("_hist_gradient_boosting._predictor", + sources=["_hist_gradient_boosting/_predictor.pyx"], + include_dirs=[numpy.get_include()]) + + config.add_extension("_hist_gradient_boosting._loss", + sources=["_hist_gradient_boosting/_loss.pyx"], + include_dirs=[numpy.get_include()]) + + config.add_extension("_hist_gradient_boosting.types", + sources=["_hist_gradient_boosting/types.pyx"], + include_dirs=[numpy.get_include()]) + + config.add_extension("_hist_gradient_boosting.utils", + sources=["_hist_gradient_boosting/utils.pyx"], + include_dirs=[numpy.get_include()]) + + config.add_subpackage("_hist_gradient_boosting.tests") return config diff --git a/sklearn/experimental/__init__.py b/sklearn/experimental/__init__.py index 225b1145c741d..402499b01735d 100644 --- a/sklearn/experimental/__init__.py +++ b/sklearn/experimental/__init__.py @@ -1,9 +1,9 @@ """ -The :mod:`sklearn.experimetal` module includes estimator and tools whose API +The :mod:`sklearn.experimental` module includes estimator and tools whose API and behaviour might change without a deprecation cycle. """ -from .._fast_gradient_boosting.gradient_boosting import ( +from ..ensemble._hist_gradient_boosting.gradient_boosting import ( HistGradientBoostingClassifier, HistGradientBoostingRegressor ) diff --git a/sklearn/setup.py b/sklearn/setup.py index 960f6bc0c1da9..5f3699a6c96c2 100644 --- a/sklearn/setup.py +++ b/sklearn/setup.py @@ -59,7 +59,6 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('tree') config.add_subpackage('utils') config.add_subpackage('svm') - config.add_subpackage('_fast_gradient_boosting') config.add_subpackage('linear_model') # add cython extension module for isotonic regression From f79763e97c0e318717cdd255b424053116d8e158 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 16 Mar 2019 14:18:36 -0400 Subject: [PATCH 151/247] typo --- sklearn/experimental/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/experimental/__init__.py b/sklearn/experimental/__init__.py index 402499b01735d..269a850dd5321 100644 --- a/sklearn/experimental/__init__.py +++ b/sklearn/experimental/__init__.py @@ -1,5 +1,5 @@ """ -The :mod:`sklearn.experimental` module includes estimator and tools whose API +The :mod:`sklearn.experimental` module includes estimators and tools whose API and behaviour might change without a deprecation cycle. """ From 930c4d6b770b3cea79f492bdb1e6de5111257e3d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 16 Mar 2019 14:19:05 -0400 Subject: [PATCH 152/247] removed unnecessary estimator check change? --- sklearn/utils/estimator_checks.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index f4d69f3d959f0..570f8ff160687 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2366,7 +2366,6 @@ def check_decision_proba_consistency(name, estimator_orig): # Check whether an estimator having both decision_function and # predict_proba methods has outputs with perfect rank correlation. 
- np.random.seed(0) centers = [(2, 2), (4, 4)] X, y = make_blobs(n_samples=100, random_state=0, n_features=4, centers=centers, cluster_std=1.0, shuffle=True) @@ -2379,10 +2378,6 @@ def check_decision_proba_consistency(name, estimator_orig): estimator.fit(X, y) a = estimator.predict_proba(X_test)[:, 1] b = estimator.decision_function(X_test) - # truncate arrays to the 10th decimal to avoid rank discrepancies that - # would be caused by floating point precision issue - a = np.around(a, decimals=10) - b = np.around(b, decimals=10) assert_array_equal(rankdata(a), rankdata(b)) @@ -2446,7 +2441,7 @@ def check_fit_idempotent(name, estimator_orig): rng = np.random.RandomState(0) estimator = clone(estimator_orig) - set_random_state(estimator, random_state=0) + set_random_state(estimator) if 'warm_start' in estimator.get_params().keys(): estimator.set_params(warm_start=False) @@ -2471,7 +2466,7 @@ def check_fit_idempotent(name, estimator_orig): if hasattr(estimator, method)} # Fit again - set_random_state(estimator, random_state=0) + set_random_state(estimator) estimator.fit(X_train, y_train) for method in check_methods: From 8df021ef32de27368ae8304f88be4601240fa65d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 16 Mar 2019 14:54:34 -0400 Subject: [PATCH 153/247] windows fix? --- sklearn/setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/setup.py b/sklearn/setup.py index 5f3699a6c96c2..482732412eb93 100644 --- a/sklearn/setup.py +++ b/sklearn/setup.py @@ -45,6 +45,8 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('semi_supervised/tests') config.add_subpackage('experimental') config.add_subpackage('experimental/tests') + config.add_subpackage('ensemble/_hist_gradient_boosting') + config.add_subpackage('ensemble/_hist_gradient_boosting/tests') # submodules which have their own setup.py config.add_subpackage('cluster') From d6df35f7c9d5f13f2e89c7043683138e37bb3085 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 25 Mar 2019 10:20:56 -0400 Subject: [PATCH 154/247] Addressing comments --- .../_hist_gradient_boosting/grower.py | 33 ++++++++----------- .../_hist_gradient_boosting/histogram.pyx | 4 +-- 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 247c17dd1f142..b6b402ac137ae 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -73,9 +73,11 @@ class TreeNode: # start and stop indices of the node in the splitter.partition # array. Concretely, # self.sample_indices = view(self.splitter.partition[start:stop]) - # Only used in _update_raw_prediction, because we need to iterate over the - # leaves and I don't know how to efficiently store the sample_indices - # views because they're all of different sizes. + # Please see the comments about splitter.partition and + # splitter.split_indices for more info about this design. + # These 2 attributes are only used in _update_raw_prediction, because we + # need to iterate over the leaves and I don't know how to efficiently + # store the sample_indices views because they're all of different sizes. 
partition_start = 0 partition_stop = 0 @@ -88,15 +90,6 @@ def __init__(self, depth, sample_indices, sum_gradients, self.sum_hessians = sum_hessians self.parent = parent - def __repr__(self): - # To help with debugging - out = "TreeNode: depth={}, ".format(self.depth) - out += "samples={}".format(len(self.sample_indices)) - if self.split_info is not None: - out += ", feature_idx={}".format(self.split_info.feature_idx) - out += ", bin_idx={}".format(self.split_info.bin_idx) - return out - def __lt__(self, other_node): """Comparison for priority queue. @@ -112,7 +105,7 @@ def __lt__(self, other_node): The node to compare with. """ if self.split_info is None or other_node.split_info is None: - raise ValueError("Cannot compare nodes with split_info") + raise ValueError("Cannot compare nodes without split_info") return self.split_info.gain > other_node.split_info.gain @@ -157,7 +150,7 @@ class TreeGrower: min_hessian_to_split : float, optional (default=1e-3) The minimum sum of hessians needed in each node. Splits that result in at least one child having a sum of hessians less than - min_hessian_to_split are discarded. + ``min_hessian_to_split`` are discarded. shrinkage : float, optional (default=1) The shrinkage parameter to apply to the leaves values, also known as learning rate. @@ -277,17 +270,17 @@ def _intilialize_root(self, gradients, hessians, hessians_are_constant): def _compute_best_split_and_push(self, node): """Compute the best possible split (SplitInfo) of a given node. - Also push it in the heap of splittable nodes if gain isn't zero.""" + Also push it in the heap of splittable nodes if gain isn't zero. + The gain of a node is 0 if either all the leaves are pure + (best gain = 0), or if no split would satisfy the constraints, + (min_hessians_to_split, min_gain_to_split, min_samples_leaf) + """ node.split_info = self.splitter.find_node_split( node.sample_indices, node.histograms, node.sum_gradients, node.sum_hessians) if node.split_info.gain <= 0: # no valid split - # Note: this condition is reached if either all the leaves are - # pure (best gain = 0), or if no split would satisfy the - # constraints, (min_hessians_to_split, min_gain_to_split, - # min_samples_leaf) self._finalize_leaf(node) else: heappush(self.splittable_nodes, node) @@ -444,7 +437,7 @@ def make_predictor(self, bin_thresholds=None): def _fill_predictor_node_array(predictor_nodes, grower_node, - bin_thresholds=None, next_free_idx=0): + bin_thresholds, next_free_idx=0): """Helper used in make_predictor to set the TreePredictor fields.""" node = predictor_nodes[next_free_idx] node['count'] = grower_node.n_samples diff --git a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx index 1376be8666df3..317d8e268f56f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx @@ -68,8 +68,8 @@ cdef class HistogramBuilder: G_H_DTYPE_C [::1] ordered_hessians unsigned char hessians_are_constant - def __init__(self, const X_BINNED_DTYPE_C [::1, :] X_binned, unsigned int - max_bins, G_H_DTYPE_C [::1] gradients, + def __init__(self, const X_BINNED_DTYPE_C [::1, :] X_binned, + unsigned int max_bins, G_H_DTYPE_C [::1] gradients, G_H_DTYPE_C [::1] hessians, unsigned char hessians_are_constant): From e8d35549e7a937a8381b445f9a7bd1ebff7773e8 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 25 Mar 2019 10:27:17 -0400 Subject: [PATCH 155/247] more addressing --- 
.../ensemble/_hist_gradient_boosting/gradient_boosting.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index edafe059590fc..8a227f8ebeb84 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -427,7 +427,7 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): This estimator is much faster than :class:`GradientBoostingRegressor` - for big datasets (n_samples >= 10 000). The input data `X` is pre-binned + for big datasets (n_samples >= 10 000). The input data ``X`` is pre-binned into integer-valued bins, which considerably reduces the number of splitting points to consider, and allows the algorithm to leverage integer-based data structures. For small sample sizes, @@ -477,7 +477,7 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): stopped when none of the last ``n_iter_no_change`` scores are better than the ``n_iter_no_change - 1``th-to-last one, up to some tolerance. If None or 0, no early-stopping is done. - tol : float or None optional (default=1e-7) + tol : float or None, optional (default=1e-7) The absolute tolerance to use when comparing scores during early stopping. The higher the tolerance, the more likely we are to early stop: higher tolerance means that it will be harder for subsequent @@ -566,7 +566,7 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, This estimator is much faster than :class:`GradientBoostingClassifier` - for big datasets (n_samples >= 10 000). The input data `X` is pre-binned + for big datasets (n_samples >= 10 000). The input data ``X`` is pre-binned into integer-valued bins, which considerably reduces the number of splitting points to consider, and allows the algorithm to leverage integer-based data structures. For small sample sizes, @@ -622,7 +622,7 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, stopped when none of the last ``n_iter_no_change`` scores are better than the ``n_iter_no_change - 1``th-to-last one, up to some tolerance. If None or 0, no early-stopping is done. - tol : float or None optional (default=1e-7) + tol : float or None, optional (default=1e-7) The absolute tolerance to use when comparing scores. The higher the tolerance, the more likely we are to early stop: higher tolerance means that it will be harder for subsequent iterations to be From 33e83740dd8ff6a4a6e7dc736e431474b2ab37bf Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 25 Mar 2019 10:44:49 -0400 Subject: [PATCH 156/247] added notes about unwrapping --- .../_hist_gradient_boosting/histogram.pyx | 29 ++++++++++++++----- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx index 317d8e268f56f..ce10422e5a114 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx @@ -18,13 +18,28 @@ from .types cimport X_BINNED_DTYPE_C from .types cimport G_H_DTYPE_C from .types cimport hist_struct -# Note: IN views are read-only, OUT views are write-only - -# Note: in a lot of functions here, we pass feature_idx and the whole 2d -# histograms arrays instead of just histograms[feature_idx]. 
This is because -# Cython generated C code will have strange Python interactions (likely -# related to the GIL release and the custom histogram dtype) when using 1d -# histogram arrays that come from 2d arrays. +# Notes: +# - IN views are read-only, OUT views are write-only +# - In a lot of functions here, we pass feature_idx and the whole 2d +# histograms arrays instead of just histograms[feature_idx]. This is because +# Cython generated C code will have strange Python interactions (likely +# related to the GIL release and the custom histogram dtype) when using 1d +# histogram arrays that come from 2d arrays. +# - The for loops are un-wrapped, for example: +# +# for i in range(n): +# array[i] = i +# +# will become +# +# for i in range(n // 4): +# array[i] = i +# array[i + 1] = i + 1 +# array[i + 2] = i + 2 +# array[i + 3] = i + 3 +# +# This is to hint gcc that it can auto-vectorize these 4 operations and +# perform them all at once. @cython.final From fa38f0210c516ac1c9c06fc5b64fe4eacc40b0c2 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 25 Mar 2019 10:57:59 -0400 Subject: [PATCH 157/247] renamed n_bins_per_feature to actual_n_bins --- .../_hist_gradient_boosting/binning.py | 8 +++---- .../gradient_boosting.py | 2 +- .../_hist_gradient_boosting/grower.py | 16 ++++++------- .../_hist_gradient_boosting/histogram.pyx | 2 +- .../_hist_gradient_boosting/splitting.pyx | 12 +++++----- .../tests/test_binning.py | 8 +++---- .../tests/test_predictor.py | 2 +- .../tests/test_splitting.py | 24 +++++++++---------- 8 files changed, 37 insertions(+), 37 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 3c98de2e7b01f..9d75c442be9c2 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -124,7 +124,7 @@ def fit(self, X, y=None): X, self.max_bins, subsample=self.subsample, random_state=self.random_state) - self.n_bins_per_feature_ = np.array( + self.actual_n_bins_ = np.array( [thresholds.shape[0] + 1 for thresholds in self.bin_thresholds_], dtype=np.uint32) @@ -144,11 +144,11 @@ def transform(self, X): The binned data """ X = check_array(X, dtype=[X_DTYPE]) - check_is_fitted(self, ['bin_thresholds_', 'n_bins_per_feature_']) - if X.shape[1] != self.n_bins_per_feature_.shape[0]: + check_is_fitted(self, ['bin_thresholds_', 'actual_n_bins_']) + if X.shape[1] != self.actual_n_bins_.shape[0]: raise ValueError( 'This estimator was fitted with {} features but {} got passed ' - 'to transform()'.format(self.n_bins_per_feature_.shape[0], + 'to transform()'.format(self.actual_n_bins_.shape[0], X.shape[1]) ) binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order='F') diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 8a227f8ebeb84..23c372d8f002e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -226,7 +226,7 @@ def fit(self, X, y): grower = TreeGrower( X_binned_train, gradients[k, :], hessians[k, :], max_bins=self.max_bins, - n_bins_per_feature=self.bin_mapper_.n_bins_per_feature_, + actual_n_bins=self.bin_mapper_.actual_n_bins_, max_leaf_nodes=self.max_leaf_nodes, max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf, diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 
b6b402ac137ae..4f66b5d28ceaf 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -140,7 +140,7 @@ class TreeGrower: max_bins : int, optional (default=256) The maximum number of bins. Used to define the shape of the histograms. - n_bins_per_feature : array-like of int or int, optional (default=None) + actual_n_bins : array-like of int or int, optional (default=None) The actual number of bins needed for each feature, which is lower or equal to ``max_bins``. If it's an int, all features are considered to have the same number of bins. If None, all features are considered to @@ -157,26 +157,26 @@ class TreeGrower: """ def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, max_depth=None, min_samples_leaf=20, min_gain_to_split=0., - max_bins=256, n_bins_per_feature=None, l2_regularization=0., + max_bins=256, actual_n_bins=None, l2_regularization=0., min_hessian_to_split=1e-3, shrinkage=1.): self._validate_parameters(X_binned, max_leaf_nodes, max_depth, min_samples_leaf, min_gain_to_split, l2_regularization, min_hessian_to_split) - if n_bins_per_feature is None: - n_bins_per_feature = max_bins + if actual_n_bins is None: + actual_n_bins = max_bins - if isinstance(n_bins_per_feature, int): - n_bins_per_feature = np.array( - [n_bins_per_feature] * X_binned.shape[1], + if isinstance(actual_n_bins, int): + actual_n_bins = np.array( + [actual_n_bins] * X_binned.shape[1], dtype=np.uint32) hessians_are_constant = hessians.shape[0] == 1 self.histogram_builder = HistogramBuilder( X_binned, max_bins, gradients, hessians, hessians_are_constant) self.splitter = Splitter( - X_binned, max_bins, n_bins_per_feature, l2_regularization, + X_binned, max_bins, actual_n_bins, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, hessians_are_constant) self.max_leaf_nodes = max_leaf_nodes diff --git a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx index ce10422e5a114..35676632b795d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx @@ -91,7 +91,7 @@ cdef class HistogramBuilder: self.X_binned = X_binned self.n_features = X_binned.shape[1] # Note: all histograms will have bins, but some of the - # last bins may be unused if n_bins_per_feature[f] < max_bins + # last bins may be unused if actual_n_bins[f] < max_bins self.max_bins = max_bins self.gradients = gradients self.hessians = hessians diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 2aa9a77644300..456042db782eb 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -94,7 +94,7 @@ cdef class Splitter: max_bins : int, optional(default=256) The maximum number of bins. Used to define the shape of the histograms. - n_bins_per_feature : array-like of int + actual_n_bins : array-like of int The actual number of bins needed for each feature, which is lower or equal to max_bins. 
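A minimal sketch, not part of the patch above, of how ``actual_n_bins`` relates to the fitted bin thresholds (mirroring the ``_BinMapper.fit`` change in this commit); the threshold values are made up for illustration:

    import numpy as np

    # One array of thresholds per feature: k thresholds delimit k + 1 bins.
    bin_thresholds = [np.array([0.5, 1.5, 2.5]),   # feature 0 -> 4 bins
                      np.array([10.0])]            # feature 1 -> 2 bins
    actual_n_bins = np.array([t.shape[0] + 1 for t in bin_thresholds],
                             dtype=np.uint32)
    # array([4, 2], dtype=uint32); each entry is <= max_bins
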
l2_regularization : float @@ -113,7 +113,7 @@ cdef class Splitter: const X_BINNED_DTYPE_C [::1, :] X_binned unsigned int n_features unsigned int max_bins - unsigned int [::1] n_bins_per_feature + unsigned int [::1] actual_n_bins unsigned char hessians_are_constant Y_DTYPE_C l2_regularization Y_DTYPE_C min_hessian_to_split @@ -125,7 +125,7 @@ cdef class Splitter: unsigned int [::1] right_indices_buffer def __init__(self, const X_BINNED_DTYPE_C [::1, :] X_binned, unsigned int - max_bins, np.ndarray[np.uint32_t] n_bins_per_feature, + max_bins, np.ndarray[np.uint32_t] actual_n_bins, Y_DTYPE_C l2_regularization, Y_DTYPE_C min_hessian_to_split=1e-3, unsigned int min_samples_leaf=20, Y_DTYPE_C min_gain_to_split=0., @@ -134,9 +134,9 @@ cdef class Splitter: self.X_binned = X_binned self.n_features = X_binned.shape[1] # Note: all histograms will have bins, but some of the - # last bins may be unused if n_bins_per_feature[f] < max_bins + # last bins may be unused if actual_n_bins[f] < max_bins self.max_bins = max_bins - self.n_bins_per_feature = n_bins_per_feature + self.actual_n_bins = actual_n_bins self.l2_regularization = l2_regularization self.min_hessian_to_split = min_hessian_to_split self.min_samples_leaf = min_samples_leaf @@ -427,7 +427,7 @@ cdef class Splitter: sum_gradient_left, sum_hessian_left = 0., 0. n_samples_left = 0 - for bin_idx in range(self.n_bins_per_feature[feature_idx]): + for bin_idx in range(self.actual_n_bins[feature_idx]): n_samples_left += histograms[feature_idx, bin_idx].count n_samples_right = n_samples_ - n_samples_left diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index aac8b0977363e..86572cd359a70 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -116,7 +116,7 @@ def test_bin_mapper_random_data(n_bins): for bin_thresholds_feature in mapper.bin_thresholds_: assert bin_thresholds_feature.shape == (n_bins - 1,) assert bin_thresholds_feature.dtype == DATA.dtype - assert np.all(mapper.n_bins_per_feature_ == n_bins) + assert np.all(mapper.actual_n_bins_ == n_bins) # Check that the binned data is approximately balanced across bins. for feature_idx in range(n_features): @@ -212,15 +212,15 @@ def test_bin_mapper_idempotence(n_bins_small, n_bins_large): @pytest.mark.parametrize('max_bins', [10, 100, 256]) @pytest.mark.parametrize('diff', [-5, 0, 5]) -def test_n_bins_per_feature(max_bins, diff): - # Check that n_bins_per_feature is n_unique_values when +def test_actual_n_bins(max_bins, diff): + # Check that actual_n_bins is n_unique_values when # n_unique_values <= max_bins, else max_bins. 
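A hedged numerical illustration of the behaviour the surrounding test checks: when a feature has fewer unique values than ``max_bins``, those unique values are used, so ``actual_n_bins_`` equals ``min(max_bins, n_unique_values)``. The import path is an assumption based on the package layout used in this branch:

    import numpy as np
    from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper

    X = np.array([0.0, 1.0, 2.0, 0.0, 1.0, 2.0]).reshape(-1, 1)   # 3 unique values
    mapper = _BinMapper(max_bins=8).fit(X)
    # mapper.actual_n_bins_ == array([3], dtype=uint32)
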
n_unique_values = max_bins + diff X = list(range(n_unique_values)) * 2 X = np.array(X).reshape(-1, 1) mapper = _BinMapper(max_bins=max_bins).fit(X) - assert np.all(mapper.n_bins_per_feature_ == min(max_bins, n_unique_values)) + assert np.all(mapper.actual_n_bins_ == min(max_bins, n_unique_values)) def test_subsample(): diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py index 4a33f5ac68b1f..80a56bfe78ded 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py @@ -27,7 +27,7 @@ def test_boston_dataset(max_bins): grower = TreeGrower(X_train_binned, gradients, hessians, min_samples_leaf=min_samples_leaf, max_leaf_nodes=max_leaf_nodes, max_bins=max_bins, - n_bins_per_feature=mapper.n_bins_per_feature_) + actual_n_bins=mapper.actual_n_bins_) grower.grow() predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index 567bbb917e868..92b1ea7262853 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -35,8 +35,8 @@ def test_histogram_split(n_bins): all_gradients = ordered_gradients sum_gradients = all_gradients.sum() - n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], - dtype=np.uint32) + actual_n_bins = np.array([n_bins] * X_binned.shape[1], + dtype=np.uint32) builder = HistogramBuilder(X_binned, n_bins, all_gradients, @@ -44,7 +44,7 @@ def test_histogram_split(n_bins): hessians_are_constant) splitter = Splitter(X_binned, n_bins, - n_bins_per_feature, + actual_n_bins, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, @@ -97,11 +97,11 @@ def test_gradient_and_hessian_sanity(constant_hessian): all_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE) sum_hessians = all_hessians.sum() - n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], - dtype=np.uint32) + actual_n_bins = np.array([n_bins] * X_binned.shape[1], + dtype=np.uint32) builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, constant_hessian) - splitter = Splitter(X_binned, n_bins, n_bins_per_feature, + splitter = Splitter(X_binned, n_bins, actual_n_bins, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, constant_hessian) @@ -193,12 +193,12 @@ def test_split_indices(): sum_hessians = 1 * n_samples hessians_are_constant = True - n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], - dtype=np.uint32) + actual_n_bins = np.array([n_bins] * X_binned.shape[1], + dtype=np.uint32) builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant) - splitter = Splitter(X_binned, n_bins, n_bins_per_feature, + splitter = Splitter(X_binned, n_bins, actual_n_bins, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, hessians_are_constant) @@ -251,11 +251,11 @@ def test_min_gain_to_split(): sum_hessians = all_hessians.sum() hessians_are_constant = False - n_bins_per_feature = np.array([n_bins] * X_binned.shape[1], - dtype=np.uint32) + actual_n_bins = np.array([n_bins] * X_binned.shape[1], + dtype=np.uint32) builder = HistogramBuilder(X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant) - splitter = Splitter(X_binned, n_bins, 
n_bins_per_feature, + splitter = Splitter(X_binned, n_bins, actual_n_bins, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, hessians_are_constant) From 27f648131e75174b920432ab506e741695be357e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 31 Mar 2019 11:20:48 -0400 Subject: [PATCH 158/247] more pythonic empty list checking --- sklearn/ensemble/_hist_gradient_boosting/grower.py | 8 ++------ .../ensemble/_hist_gradient_boosting/tests/test_grower.py | 5 ++--- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 4f66b5d28ceaf..69500c2eb5eda 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -230,7 +230,7 @@ def _validate_parameters(self, X_binned, max_leaf_nodes, max_depth, def grow(self): """Grow the tree, from root to leaves.""" - while self.can_split_further(): + while self.splittable_nodes: self.split_next() def _intilialize_root(self, gradients, hessians, hessians_are_constant): @@ -295,7 +295,7 @@ def split_next(self): right : TreeNode The resulting right child. """ - if len(self.splittable_nodes) == 0: + if not self.splittable_nodes: raise StopIteration("No more splittable nodes") # Consider the node with the highest loss reduction (a.k.a. gain) @@ -391,10 +391,6 @@ def split_next(self): return left_child_node, right_child_node - def can_split_further(self): - """Return True if there are still nodes to split.""" - return len(self.splittable_nodes) >= 1 - def _finalize_leaf(self, node): """Compute the prediction value that minimizes the objective function. diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index ac4ab3c77b696..f4bd4e196de03 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -103,7 +103,6 @@ def test_grow_tree(n_bins, constant_hessian, stopping_param, shrinkage): # Calling split next applies the next split and computes the best split # for each of the two newly introduced children nodes. - assert grower.can_split_further() left_node, right_node = grower.split_next() # All training samples have ben splitted in the two nodes, approximately @@ -126,7 +125,7 @@ def test_grow_tree(n_bins, constant_hessian, stopping_param, shrinkage): assert right_node.right_child is None # The right split has not been applied yet. 
Let's do it now: - assert grower.can_split_further() + assert len(grower.splittable_nodes) == 1 right_left_node, right_right_node = grower.split_next() _check_children_consistency(right_node, right_left_node, right_right_node) assert len(right_left_node.sample_indices) > 0.1 * n_samples @@ -136,7 +135,7 @@ def test_grow_tree(n_bins, constant_hessian, stopping_param, shrinkage): assert len(right_right_node.sample_indices) < 0.4 * n_samples # All the leafs are pure, it is not possible to split any further: - assert not grower.can_split_further() + assert not grower.splittable_nodes # Check the values of the leaves: assert grower.root.left_child.value == approx(shrinkage) From e4d67f7aba509376b35ab02509bff0534b62e2a0 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 4 Apr 2019 11:14:37 -0400 Subject: [PATCH 159/247] Benchmark now using AUC from predict_proba --- .../bench_hist_gradient_boosting_higgsboson.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/benchmarks/bench_hist_gradient_boosting_higgsboson.py b/benchmarks/bench_hist_gradient_boosting_higgsboson.py index fd793f61d3a8c..8832d0c7c786c 100644 --- a/benchmarks/bench_hist_gradient_boosting_higgsboson.py +++ b/benchmarks/bench_hist_gradient_boosting_higgsboson.py @@ -80,7 +80,8 @@ def load_data(): est.fit(data_train, target_train) toc = time() predicted_test = est.predict(data_test) -roc_auc = roc_auc_score(target_test, predicted_test) +predicted_proba_test = est.predict_proba(data_test) +roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1]) acc = accuracy_score(target_test, predicted_test) print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}") @@ -91,7 +92,8 @@ def load_data(): lightgbm_est.fit(data_train, target_train) toc = time() predicted_test = lightgbm_est.predict(data_test) - roc_auc = roc_auc_score(target_test, predicted_test) + predicted_proba_test = lightgbm_est.predict_proba(data_test) + roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1]) acc = accuracy_score(target_test, predicted_test) print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}") @@ -102,7 +104,8 @@ def load_data(): xgboost_est.fit(data_train, target_train) toc = time() predicted_test = xgboost_est.predict(data_test) - roc_auc = roc_auc_score(target_test, predicted_test) + predicted_proba_test = xgboost_est.predict_proba(data_test) + roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1]) acc = accuracy_score(target_test, predicted_test) print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}") @@ -113,6 +116,7 @@ def load_data(): catboost_est.fit(data_train, target_train) toc = time() predicted_test = catboost_est.predict(data_test) - roc_auc = roc_auc_score(target_test, predicted_test) + predicted_proba_test = catboost_est.predict_proba(data_test) + roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1]) acc = accuracy_score(target_test, predicted_test) print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}") From 3f94a32a94dd437af2247644378e02bc04d20047 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 4 Apr 2019 11:18:45 -0400 Subject: [PATCH 160/247] lgbm -> lightgbm, xgb -> xgboost, etc. 
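A toy illustration, with made-up numbers, of why PATCH 159 above feeds ``predict_proba`` rather than hard ``predict`` labels to ``roc_auc_score``: AUC measures ranking quality, which is lost once predictions are thresholded.

    import numpy as np
    from sklearn.metrics import roc_auc_score

    y_true = np.array([0, 0, 1, 1])
    proba = np.array([0.1, 0.6, 0.7, 0.8])      # est.predict_proba(X)[:, 1]
    labels = (proba >= 0.5).astype(int)         # est.predict(X) -> [0, 1, 1, 1]

    roc_auc_score(y_true, proba)    # 1.0: every positive ranks above every negative
    roc_auc_score(y_true, labels)   # 0.75: the 0/1 labels discard the ranking information
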
--- .../_hist_gradient_boosting/utils.pyx | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx index 7f64dd6128fe9..6dd541eb7ff33 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx @@ -41,14 +41,14 @@ def get_equivalent_estimator(estimator, lib='lightgbm'): raise NotImplementedError('Early stopping should be deactivated.') # LGBM - lgbm_loss_mapping = { + lightgbm_loss_mapping = { 'least_squares': 'regression_l2', 'binary_crossentropy': 'binary', 'categorical_crossentropy': 'multiclass' } - lgbm_params = { - 'objective': lgbm_loss_mapping[sklearn_params['loss']], + lightgbm_params = { + 'objective': lightgbm_loss_mapping[sklearn_params['loss']], 'learning_rate': sklearn_params['learning_rate'], 'n_estimators': sklearn_params['max_iter'], 'num_leaves': sklearn_params['max_leaf_nodes'], @@ -69,20 +69,20 @@ def get_equivalent_estimator(estimator, lib='lightgbm'): if sklearn_params['loss'] == 'categorical_crossentropy': # LGBM multiplies hessians by 2 in multiclass loss. - lgbm_params['min_sum_hessian_in_leaf'] *= 2 - lgbm_params['learning_rate'] *= 2 + lightgbm_params['min_sum_hessian_in_leaf'] *= 2 + lightgbm_params['learning_rate'] *= 2 # XGB - xgb_loss_mapping = { + xgboost_loss_mapping = { 'least_squares': 'reg:linear', 'binary_crossentropy': 'reg:logistic', 'categorical_crossentropy': 'multi:softmax' } - xgb_params = { + xgboost_params = { 'tree_method': 'hist', 'grow_policy': 'lossguide', # so that we can set max_leaves - 'objective': xgb_loss_mapping[sklearn_params['loss']], + 'objective': xgboost_loss_mapping[sklearn_params['loss']], 'learning_rate': sklearn_params['learning_rate'], 'n_estimators': sklearn_params['max_iter'], 'max_leaves': sklearn_params['max_leaf_nodes'], @@ -96,14 +96,14 @@ def get_equivalent_estimator(estimator, lib='lightgbm'): } # Catboost - cat_loss_mapping = { + catboost_loss_mapping = { 'least_squares': 'RMSE', 'binary_crossentropy': 'Logloss', 'categorical_crossentropy': 'MultiClass' } - cat_params = { - 'loss_function': cat_loss_mapping[sklearn_params['loss']], + catboost_params = { + 'loss_function': catboost_loss_mapping[sklearn_params['loss']], 'learning_rate': sklearn_params['learning_rate'], 'iterations': sklearn_params['max_iter'], 'depth': sklearn_params['max_depth'], @@ -118,25 +118,25 @@ def get_equivalent_estimator(estimator, lib='lightgbm'): from lightgbm import LGBMRegressor from lightgbm import LGBMClassifier if is_classifier(estimator): - return LGBMClassifier(**lgbm_params) + return LGBMClassifier(**lightgbm_params) else: - return LGBMRegressor(**lgbm_params) + return LGBMRegressor(**lightgbm_params) elif lib == 'xgboost': from xgboost import XGBRegressor from xgboost import XGBClassifier if is_classifier(estimator): - return XGBClassifier(**xgb_params) + return XGBClassifier(**xgboost_params) else: - return XGBRegressor(**xgb_params) + return XGBRegressor(**xgboost_params) else: from catboost import CatBoostRegressor from catboost import CatBoostClassifier if is_classifier(estimator): - return CatBoostClassifier(**cat_params) + return CatBoostClassifier(**catboost_params) else: - return CatBoostRegressor(**cat_params) + return CatBoostRegressor(**catboost_params) def sum_parallel(G_H_DTYPE_C [:] array): From a4d5c9b45b019bd5e71a5b31f0f21ae6658e3b84 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 4 Apr 2019 13:47:43 -0400 
Subject: [PATCH 161/247] Apply suggestions from code review Co-Authored-By: NicolasHug --- .../_hist_gradient_boosting/binning.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 9d75c442be9c2..82fd4bd3d1bbc 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -21,8 +21,8 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): Parameters ---------- - data: array-like - The data to bin + data : array-like, shape (n_samples, n_features) + The data to bin. max_bins : int The maximum number of bins to use. If for a given feature the number of unique values is less than ``max_bins``, then those unique values @@ -37,9 +37,9 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): Return ------ - binning_thresholds: tuple of arrays + binning_thresholds: list of arrays For each feature, stores the increasing numeric values that can - be used to separate the bins. len(binning_thresholds) == n_features. + be used to separate the bins. Thus `len(binning_thresholds) == n_features`. """ if not (2 <= max_bins <= 256): raise ValueError('max_bins={} should be no smaller than 2 ' @@ -47,7 +47,7 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): rng = check_random_state(random_state) if subsample is not None and data.shape[0] > subsample: subset = rng.choice(np.arange(data.shape[0]), subsample) - data = data[subset] + data = data.take(subset, axis=0) percentiles = np.linspace(0, 100, num=max_bins + 1) end = percentiles.shape[0] # no negative indexing! @@ -58,7 +58,7 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): distinct_values = np.unique(col_data) if len(distinct_values) <= max_bins: end = distinct_values.shape[0] # no negative indexing! - midpoints = (distinct_values[:end - 1] + distinct_values[1:]) + midpoints = distinct_values[:end - 1] + distinct_values[1:] midpoints *= .5 else: # We sort again the data in this case. We could compute @@ -110,10 +110,10 @@ def fit(self, X, y=None): Parameters ---------- - X: array-like - The data to bin + X : array-like, shape (n_samples, n_features) + The data to bin. y: None - Ignored + Ignored. Returns ------- @@ -135,13 +135,13 @@ def transform(self, X): Parameters ---------- - X: array-like - The data to bin + X : array-like, shape (n_samples, n_features) + The data to bin. Returns ------- - X_binned : array-like - The binned data + X_binned : array-like, shape (n_samples, n_features) + The binned data. 
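A small self-contained sketch of the quantile computation that ``_find_binning_thresholds`` performs for a column with more distinct values than ``max_bins``; the data and the ``max_bins`` value are illustrative only:

    import numpy as np

    col_data = np.arange(1000, dtype=np.float64)   # more than max_bins distinct values
    max_bins = 4
    percentiles = np.linspace(0, 100, num=max_bins + 1)[1:-1]          # [25., 50., 75.]
    midpoints = np.percentile(col_data, percentiles, interpolation='midpoint')
    # array([249.5, 499.5, 749.5]): max_bins - 1 thresholds, i.e. max_bins bins
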
""" X = check_array(X, dtype=[X_DTYPE]) check_is_fitted(self, ['bin_thresholds_', 'actual_n_bins_']) From da1174c060ff69ac929184054ee616d15df2239c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 4 Apr 2019 13:55:18 -0400 Subject: [PATCH 162/247] Addressed comments --- sklearn/ensemble/_hist_gradient_boosting/_binning.pyx | 6 +++--- sklearn/ensemble/_hist_gradient_boosting/binning.py | 10 ++++------ sklearn/ensemble/_hist_gradient_boosting/utils.pyx | 3 ++- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx index 2019f7fd0955a..52c1e51dd5045 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx @@ -20,12 +20,12 @@ cpdef _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds, Parameters ---------- - data : array-like, shape=(n_samples, n_features) + data : ndarray, shape=(n_samples, n_features) The numerical data to bin. - binning_thresholds : tuple of arrays + binning_thresholds : list of arrays For each feature, stores the increasing numeric values that are used to separate the bins. - binned : array-like, shape=(n_samples, n_features) + binned : ndarray, shape=(n_samples, n_features) Output array, must be fortran aligned. """ cdef: diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 9d75c442be9c2..5802cbf2c2367 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -50,15 +50,13 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): data = data[subset] percentiles = np.linspace(0, 100, num=max_bins + 1) - end = percentiles.shape[0] # no negative indexing! - percentiles = percentiles[1:end - 1] + percentiles = percentiles[1:-1] binning_thresholds = [] for f_idx in range(data.shape[1]): col_data = np.ascontiguousarray(data[:, f_idx], dtype=X_DTYPE) distinct_values = np.unique(col_data) if len(distinct_values) <= max_bins: - end = distinct_values.shape[0] # no negative indexing! - midpoints = (distinct_values[:end - 1] + distinct_values[1:]) + midpoints = distinct_values[:-1] + distinct_values[1:] midpoints *= .5 else: # We sort again the data in this case. We could compute @@ -78,8 +76,8 @@ class _BinMapper(BaseEstimator, TransformerMixin): The bins are created in a feature-wise fashion, using quantiles so that each bins contains approximately the same number of samples. - Large datasets are subsampled, but the feature-wise quantiles should - remain stable. + For large datasets, quantiles are computed on a subset of the data to + speed-up the binning, but the quantiles should remain stable. If the number of unique values for a given feature is less than ``max_bins``, then the unique values of this feature are used instead of diff --git a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx index 6dd541eb7ff33..35db124ad0da2 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx @@ -18,7 +18,7 @@ def get_equivalent_estimator(estimator, lib='lightgbm'): This utility function takes care of renaming the sklearn parameters into their LightGBM, XGBoost or CatBoost equivalent parameters. 
- """ + # unmapped XGB parameters: # - min_samples_leaf # - min_data_in_bin @@ -27,6 +27,7 @@ def get_equivalent_estimator(estimator, lib='lightgbm'): # unmapped Catboost parameters: # max_leaves # min_* + """ if lib not in ('lightgbm', 'xgboost', 'catboost'): raise ValueError('accepted libs are lightgbm, xgboost, and catboost. ' From e2319beb8c9e53dbdfe6dd65c728ca9cd574afe9 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 4 Apr 2019 14:02:33 -0400 Subject: [PATCH 163/247] Flake8 --- sklearn/ensemble/_hist_gradient_boosting/binning.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 0433095056bff..117d6af2eb2e5 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -39,7 +39,8 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): ------ binning_thresholds: list of arrays For each feature, stores the increasing numeric values that can - be used to separate the bins. Thus `len(binning_thresholds) == n_features`. + be used to separate the bins. Thus ``len(binning_thresholds) == + n_features``. """ if not (2 <= max_bins <= 256): raise ValueError('max_bins={} should be no smaller than 2 ' From 1fc79af482c135f9c9759857f70474c9a235b17b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 4 Apr 2019 14:06:27 -0400 Subject: [PATCH 164/247] subsampling without replacement --- sklearn/ensemble/_hist_gradient_boosting/binning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 117d6af2eb2e5..075ed4f175ac3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -47,7 +47,7 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): 'and no larger than 256.'.format(max_bins)) rng = check_random_state(random_state) if subsample is not None and data.shape[0] > subsample: - subset = rng.choice(np.arange(data.shape[0]), subsample) + subset = rng.choice(np.arange(data.shape[0]), subsample, replace=False) data = data.take(subset, axis=0) percentiles = np.linspace(0, 100, num=max_bins + 1) From 86a8496a42e86f4c005785ed0c9655a5fd8adc2d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 5 Apr 2019 08:30:05 -0400 Subject: [PATCH 165/247] Apply suggestions from code review Co-Authored-By: NicolasHug --- .../ensemble/_hist_gradient_boosting/_binning.pyx | 4 ++-- .../_hist_gradient_boosting/gradient_boosting.py | 15 +++++++++------ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx index 52c1e51dd5045..be958948bec6a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx @@ -20,12 +20,12 @@ cpdef _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds, Parameters ---------- - data : ndarray, shape=(n_samples, n_features) + data : ndarray, shape (n_samples, n_features) The numerical data to bin. binning_thresholds : list of arrays For each feature, stores the increasing numeric values that are used to separate the bins. - binned : ndarray, shape=(n_samples, n_features) + binned : ndarray, shape (n_samples, n_features) Output array, must be fortran aligned. 
""" cdef: diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 23c372d8f002e..f0cc362c0c1ab 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -175,7 +175,8 @@ def fit(self, X, y): # else 1. n_samples = X_binned_train.shape[0] self._baseline_prediction = self.loss_.get_baseline_prediction( - y_train, self._n_trees_per_iteration) + y_train, self._n_trees_per_iteration + ) raw_predictions = np.zeros( shape=(self._n_trees_per_iteration, n_samples), dtype=self._baseline_prediction.dtype @@ -239,7 +240,8 @@ def fit(self, X, y): acc_compute_hist_time += grower.total_compute_hist_time predictor = grower.make_predictor( - bin_thresholds=self.bin_mapper_.bin_thresholds_) + bin_thresholds=self.bin_mapper_.bin_thresholds_ + ) predictors[-1].append(predictor) # Update raw_predictions with the predictions of the newly @@ -253,7 +255,8 @@ def fit(self, X, y): if self.do_early_stopping_: should_early_stop = self._check_early_stopping( X_binned_small_train, y_small_train, - X_binned_val, y_val) + X_binned_val, y_val + ) if self.verbose: self._print_iteration_stats(iteration_start_time) @@ -455,7 +458,7 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): min_samples_leaf : int, optional (default=5) The minimum number of samples per leaf. l2_regularization : float, optional (default=0) - The L2 regularization parameter. Use 0 for no regularization. + The L2 regularization parameter. Use ``0`` for no regularization (default). max_bins : int, optional (default=256) The maximum number of bins to use. Before training, each feature of the input array ``X`` is binned into at most ``max_bins`` bins, which @@ -496,12 +499,12 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): n_iter_ : int The number of iterations as selected by early stopping (if n_iter_no_change is not None). Otherwise it corresponds to max_iter. - train_score_ : array, shape=(max_iter + 1) + train_score_ : ndarray, shape (max_iter + 1,) The scores at each iteration on the training data. The first entry is the score of the ensemble before the first iteration. Scores are computed according to the ``scoring`` parameter. Empty if no early stopping. - validation_score_ : array, shape=(max_iter + 1) + validation_score_ : ndarray, shape (max_iter + 1,) The scores at each iteration on the held-out validation data. The first entry is the score of the ensemble before the first iteration. Scores are computed according to the ``scoring`` parameter. Empty if From 3c5f9229c6b51383ed78ff9f080a5e4ce86f0cb0 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 5 Apr 2019 09:20:37 -0400 Subject: [PATCH 166/247] Addressed comments --- .../gradient_boosting.py | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index f0cc362c0c1ab..6b1f770267126 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -159,7 +159,8 @@ def fit(self, X, y): subsample_size = 10000 # should we expose this parameter? 
indices = np.arange(X_binned_train.shape[0]) if X_binned_train.shape[0] > subsample_size: - indices = rng.choice(indices, subsample_size) + # TODO: not critical but stratify using resample(stratify=y) + indices = rng.choice(indices, subsample_size, replace=False) X_binned_small_train = X_binned_train[indices] y_small_train = y_train[indices] # Predicting is faster on C-contiguous arrays. @@ -219,6 +220,7 @@ def fit(self, X, y): self.loss_.update_gradients_and_hessians(gradients, hessians, y_train, raw_predictions) + # Append a list since there may be more than 1 predictor per iter predictors.append([]) # Build `n_trees_per_iteration` trees. @@ -449,16 +451,18 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): max_iter : int, optional (default=100) The maximum number of iterations of the boosting process, i.e. the maximum number of trees. - max_leaf_nodes : int or None, optional (default=None) + max_leaf_nodes : int or None, optional (default=31) The maximum number of leaves for each tree. If None, there is no maximum limit. max_depth : int or None, optional (default=None) The maximum depth of each tree. The depth of a tree is the number of - nodes to go from the root to the deepest leaf. + nodes to go from the root to the deepest leaf. Depth isn't constrained + by default. min_samples_leaf : int, optional (default=5) The minimum number of samples per leaf. l2_regularization : float, optional (default=0) - The L2 regularization parameter. Use ``0`` for no regularization (default). + The L2 regularization parameter. Use ``0`` for no regularization + (default). max_bins : int, optional (default=256) The maximum number of bins to use. Before training, each feature of the input array ``X`` is binned into at most ``max_bins`` bins, which @@ -502,8 +506,8 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): train_score_ : ndarray, shape (max_iter + 1,) The scores at each iteration on the training data. The first entry is the score of the ensemble before the first iteration. Scores are - computed according to the ``scoring`` parameter. Empty if no early - stopping. + computed according to the ``scoring`` parameter. Scores are computed on + a subset of at most 10 000 samples. Empty if no early stopping. validation_score_ : ndarray, shape (max_iter + 1,) The scores at each iteration on the held-out validation data. The first entry is the score of the ensemble before the first iteration. @@ -580,7 +584,7 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, Parameters ---------- loss : {'auto', 'binary_crossentropy', 'categorical_crossentropy'}, \ - optional (default='auto') + optional (default='auto') The loss function to use in the boosting process. 'binary_crossentropy' (also known as logistic loss) is used for binary classification and generalizes to 'categorical_crossentropy' for multiclass @@ -594,12 +598,13 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, The maximum number of iterations of the boosting process, i.e. the maximum number of trees for binary classification. For multiclass classification, `n_classes` trees per iteration are built. - max_leaf_nodes : int or None, optional (default=None) + max_leaf_nodes : int or None, optional (default=31) The maximum number of leaves for each tree. If None, there is no maximum limit. max_depth : int or None, optional (default=None) The maximum depth of each tree. The depth of a tree is the number of - nodes to go from the root to the deepest leaf. 
+ nodes to go from the root to the deepest leaf. Depth isn't constrained + by default. min_samples_leaf : int, optional (default=5) The minimum number of samples per leaf. l2_regularization : float, optional (default=0) @@ -644,12 +649,12 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, n_iter_ : int The number of estimators as selected by early stopping (if n_iter_no_change is not None). Otherwise it corresponds to max_iter. - train_score_ : array, shape=(max_iter + 1) + train_score_ : array, shape (max_iter + 1,) The scores at each iteration on the training data. The first entry is the score of the ensemble before the first iteration. Scores are - computed according to the ``scoring`` parameter. Empty if no early - stopping. - validation_score_ : array, shape=(max_iter + 1) + computed according to the ``scoring`` parameter. Scores are computed on + a subset of at most 10 000 samples. Empty if no early stopping. + validation_score_ : array, shape (max_iter + 1,) The scores at each iteration on the held-out validation data. The first entry is the score of the ensemble before the first iteration. Scores are computed according to the ``scoring`` parameter. Empty if From 491e14c071e58391e373b47c0bc7c3fa3609dee1 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 5 Apr 2019 15:46:39 +0200 Subject: [PATCH 167/247] Make sure score time runs on n_samples --- benchmarks/bench_hist_gradient_boosting.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/benchmarks/bench_hist_gradient_boosting.py b/benchmarks/bench_hist_gradient_boosting.py index 028954741f973..396d159563f27 100644 --- a/benchmarks/bench_hist_gradient_boosting.py +++ b/benchmarks/bench_hist_gradient_boosting.py @@ -37,20 +37,21 @@ def get_estimator_and_data(): if args.problem == 'classification': - X, y = make_classification(args.n_samples_max, + X, y = make_classification(args.n_samples_max * 2, n_features=args.n_features, n_classes=args.n_classes, n_clusters_per_class=1, random_state=0) return X, y, HistGradientBoostingClassifier elif args.problem == 'regression': - X, y = make_regression(args.n_samples_max, + X, y = make_regression(args.n_samples_max * 2, n_features=args.n_features, random_state=0) return X, y, HistGradientBoostingRegressor X, y, Estimator = get_estimator_and_data() -X_train_, X_test_, y_train_, y_test_ = train_test_split(X, y, random_state=0) +X_train_, X_test_, y_train_, y_test_ = train_test_split( + X, y, test_size=0.5, random_state=0) def one_run(n_samples): @@ -58,6 +59,8 @@ def one_run(n_samples): X_test = X_test_[:n_samples] y_train = y_train_[:n_samples] y_test = y_test_[:n_samples] + assert X_train.shape[0] == n_samples + assert X_test.shape[0] == n_samples print("Fitting a sklearn model...") tic = time() From 04f0e8655b67ba52a8d16ba7453399e763ed8ddb Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 5 Apr 2019 16:22:07 +0200 Subject: [PATCH 168/247] Small improvement to benchmark script --- benchmarks/bench_hist_gradient_boosting.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/benchmarks/bench_hist_gradient_boosting.py b/benchmarks/bench_hist_gradient_boosting.py index 396d159563f27..570ee1b6adef7 100644 --- a/benchmarks/bench_hist_gradient_boosting.py +++ b/benchmarks/bench_hist_gradient_boosting.py @@ -61,7 +61,8 @@ def one_run(n_samples): y_test = y_test_[:n_samples] assert X_train.shape[0] == n_samples assert X_test.shape[0] == n_samples - + print("Data size: %d samples train, %d samples test." 
+ % (n_samples, n_samples)) print("Fitting a sklearn model...") tic = time() est = Estimator(learning_rate=lr, @@ -205,9 +206,9 @@ def one_run(n_samples): axs[2].plot(n_samples_list, sklearn_score_durations, label='sklearn') if args.lightgbm: - axs[0].plot(n_samples_list, lightgbm_scores, label='lgbm') - axs[1].plot(n_samples_list, lightgbm_fit_durations, label='lgbm') - axs[2].plot(n_samples_list, lightgbm_score_durations, label='lgbm') + axs[0].plot(n_samples_list, lightgbm_scores, label='lightgbm') + axs[1].plot(n_samples_list, lightgbm_fit_durations, label='lightgbm') + axs[2].plot(n_samples_list, lightgbm_score_durations, label='lightgbm') if args.xgboost: axs[0].plot(n_samples_list, xgb_scores, label='XGBoost') From bdfacb1ddfe660921048c1403c74d97a5de4fffe Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 5 Apr 2019 16:35:49 +0200 Subject: [PATCH 169/247] scipy/scipy#9608 seems to be fixed in 1.2.1 --- sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index 408a3582a3670..4bdc86cca7c5e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -51,7 +51,7 @@ def get_hessians(y_true, raw_predictions): ('binary_crossentropy', -12, 1), ('binary_crossentropy', 30, 1), ]) -@pytest.mark.skipif(scipy.__version__.split('.')[:2] == ['1', '2'], +@pytest.mark.skipif(scipy.__version__.split('.')[:3] == ['1', '2', '0'], reason='bug in scipy 1.2.0, see scipy issue #9608') @pytest.mark.skipif(Y_DTYPE != np.float64, reason='Newton internally uses float64 != Y_DTYPE') From 2644cb39860c1b9994dcc30b6b91f41265fbe147 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 5 Apr 2019 18:44:20 +0200 Subject: [PATCH 170/247] Better coverage and error message for binary_crossentropy on multiclass data --- sklearn/ensemble/_hist_gradient_boosting/loss.py | 5 +++++ .../tests/test_gradient_boosting.py | 11 +++++++++++ 2 files changed, 16 insertions(+) diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index dcdc067017bd6..09d17d8181894 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -162,6 +162,11 @@ def __call__(self, y_true, raw_predictions, average=True): return loss.mean() if average else loss def get_baseline_prediction(self, y_train, prediction_dim): + if prediction_dim > 2: + raise ValueError( + "loss='binary_crossentropy' is not defined for multiclass" + " classification with n_classes=%d, use" + " loss='categorical_crossentropy' instead" % prediction_dim) proba_positive_class = np.mean(y_train) eps = np.finfo(y_train.dtype).eps proba_positive_class = np.clip(proba_positive_class, eps, 1 - eps) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index e47aee7abb62f..325943b7e61d9 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -1,3 +1,4 @@ +import numpy as np import pytest from sklearn.datasets import make_classification, make_regression from sklearn.utils.estimator_checks import check_estimator @@ -78,6 +79,16 @@ def 
test_init_parameters_validation(GradientBoosting, X, y): GradientBoosting(tol=-1).fit(X, y) +def test_invalid_classification_loss(): + binary_clf = HistGradientBoostingClassifier(loss="binary_crossentropy") + with pytest.raises( + ValueError, + match="loss='binary_crossentropy' is not defined for multiclass" + " classification with n_classes=3, use" + " loss='categorical_crossentropy' instead"): + binary_clf.fit(np.zeros(shape=(3, 2)), np.arange(3)) + + @pytest.mark.parametrize( 'scoring, validation_fraction, n_iter_no_change, tol', [ ('neg_mean_squared_error', .1, 5, 1e-7), # use scorer From 2416cb77ad9db3d30bc6b5ff752c3c3a41f20c5b Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 5 Apr 2019 19:19:56 +0200 Subject: [PATCH 171/247] Cosmetic --- .../tests/test_compare_lightgbm.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py index 03592405ecf9c..3380511afd418 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py @@ -22,7 +22,7 @@ ]) def test_same_predictions_regression(seed, min_samples_leaf, n_samples, max_leaf_nodes): - # Make sure sklearn has the same predictions as LGBM for easy targets. + # Make sure sklearn has the same predictions as lightgbm for easy targets. # # In particular when the size of the trees are bound and the number of # samples is large enough, the structure of the prediction trees found by @@ -68,16 +68,16 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples, # We need X to be treated an numerical data, not pre-binned data. 
X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32) - pred_lgbm = est_lightgbm.predict(X_train) + pred_lightgbm = est_lightgbm.predict(X_train) pred_sklearn = est_sklearn.predict(X_train) # less than 1% of the predictions are different up to the 3rd decimal - assert np.mean(abs(pred_lgbm - pred_sklearn) > 1e-3) < .011 + assert np.mean(abs(pred_lightgbm - pred_sklearn) > 1e-3) < .011 if max_leaf_nodes < 10 and n_samples >= 1000: - pred_lgbm = est_lightgbm.predict(X_test) + pred_lightgbm = est_lightgbm.predict(X_test) pred_sklearn = est_sklearn.predict(X_test) # less than 1% of the predictions are different up to the 4th decimal - assert np.mean(abs(pred_lgbm - pred_sklearn) > 1e-4) < .01 + assert np.mean(abs(pred_lightgbm - pred_sklearn) > 1e-4) < .01 @pytest.mark.parametrize('seed', range(5)) @@ -125,9 +125,9 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples, pred_sklearn = est_sklearn.predict(X_train) assert np.mean(pred_sklearn == pred_lightgbm) > .89 - acc_lgbm = accuracy_score(y_train, pred_lightgbm) + acc_lightgbm = accuracy_score(y_train, pred_lightgbm) acc_sklearn = accuracy_score(y_train, pred_sklearn) - np.testing.assert_almost_equal(acc_lgbm, acc_sklearn) + np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn) if max_leaf_nodes < 10 and n_samples >= 1000: @@ -135,9 +135,9 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples, pred_sklearn = est_sklearn.predict(X_test) assert np.mean(pred_sklearn == pred_lightgbm) > .89 - acc_lgbm = accuracy_score(y_test, pred_lightgbm) + acc_lightgbm = accuracy_score(y_test, pred_lightgbm) acc_sklearn = accuracy_score(y_test, pred_sklearn) - np.testing.assert_almost_equal(acc_lgbm, acc_sklearn, decimal=2) + np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2) @pytest.mark.parametrize('seed', range(5)) @@ -193,9 +193,9 @@ def test_same_predictions_multiclass_classification( # the second decimal assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > .75 - acc_lgbm = accuracy_score(y_train, pred_lightgbm) + acc_lightgbm = accuracy_score(y_train, pred_lightgbm) acc_sklearn = accuracy_score(y_train, pred_sklearn) - np.testing.assert_almost_equal(acc_lgbm, acc_sklearn, decimal=2) + np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2) if max_leaf_nodes < 10 and n_samples >= 1000: @@ -209,6 +209,6 @@ def test_same_predictions_multiclass_classification( # to the second decimal assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > .75 - acc_lgbm = accuracy_score(y_test, pred_lightgbm) + acc_lightgbm = accuracy_score(y_test, pred_lightgbm) acc_sklearn = accuracy_score(y_test, pred_sklearn) - np.testing.assert_almost_equal(acc_lgbm, acc_sklearn, decimal=2) + np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2) From b0ba1d65f58abc4b83178673b6b92a284a2a67ce Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 5 Apr 2019 19:21:49 +0200 Subject: [PATCH 172/247] Cosmetic --- sklearn/ensemble/_hist_gradient_boosting/utils.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx index 35db124ad0da2..fa9556ef9efb5 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx @@ -41,7 +41,6 @@ def get_equivalent_estimator(estimator, lib='lightgbm'): if sklearn_params['n_iter_no_change'] is not None: raise NotImplementedError('Early stopping should 
be deactivated.') - # LGBM lightgbm_loss_mapping = { 'least_squares': 'regression_l2', 'binary_crossentropy': 'binary', @@ -69,7 +68,7 @@ def get_equivalent_estimator(estimator, lib='lightgbm'): } if sklearn_params['loss'] == 'categorical_crossentropy': - # LGBM multiplies hessians by 2 in multiclass loss. + # LightGBM multiplies hessians by 2 in multiclass loss. lightgbm_params['min_sum_hessian_in_leaf'] *= 2 lightgbm_params['learning_rate'] *= 2 From ae0d10173769fee5e9b52cda06fecf71a17da6b4 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 5 Apr 2019 19:33:56 +0200 Subject: [PATCH 173/247] Make the least squares loss slightly less surprising --- .../_hist_gradient_boosting/gradient_boosting.py | 4 +++- sklearn/ensemble/_hist_gradient_boosting/loss.py | 11 +++++++---- .../_hist_gradient_boosting/tests/test_loss.py | 10 ++++------ 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 6b1f770267126..f666034f40d9a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -443,7 +443,9 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): Parameters ---------- loss : {'least_squares'}, optional (default='least_squares') - The loss function to use in the boosting process. + The loss function to use in the boosting process. Note that the + "least squares" loss actually implements an "half least squares loss" + to simplify the computation of the gradient. learning_rate : float, optional (default=0.1) The learning rate, also known as *shrinkage*. This is used as a multiplicative factor for the leaves values. Use ``1`` for no diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index 09d17d8181894..b06808a01197d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -109,7 +109,11 @@ class LeastSquares(BaseLoss): For a given sample x_i, least squares loss is defined as:: - loss(x_i) = (y_true_i - raw_pred_i)**2 + loss(x_i) = 0.5 * (y_true_i - raw_pred_i)**2 + + This actually computes the half least squares loss to optimize simplify + the computation of the gradients and get a unit hessian (and be consistent + with what is done in LightGBM). """ hessians_are_constant = True @@ -118,7 +122,7 @@ def __call__(self, y_true, raw_predictions, average=True): # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to # return a view. raw_predictions = raw_predictions.reshape(-1) - loss = np.power(y_true - raw_predictions, 2) + loss = 0.5 * np.power(y_true - raw_predictions, 2) return loss.mean() if average else loss def get_baseline_prediction(self, y_train, prediction_dim): @@ -134,8 +138,7 @@ def update_gradients_and_hessians(self, gradients, hessians, y_true, # return a view. 
raw_predictions = raw_predictions.reshape(-1) gradients = gradients.reshape(-1) - _update_gradients_least_squares(gradients, y_true, - raw_predictions) + _update_gradients_least_squares(gradients, y_true, raw_predictions) class BinaryCrossEntropy(BaseLoss): diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index 4bdc86cca7c5e..b8e871cc80cca 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -20,10 +20,6 @@ def get_gradients(y_true, raw_predictions): hessians = np.empty_like(raw_predictions, dtype=G_H_DTYPE) loss.update_gradients_and_hessians(gradients, hessians, y_true, raw_predictions) - - if loss.__class__ is _LOSSES['least_squares']: - gradients *= 2 # ommitted a factor of 2 to be consistent with LGBM - return gradients def get_hessians(y_true, raw_predictions): @@ -34,8 +30,10 @@ def get_hessians(y_true, raw_predictions): raw_predictions) if loss.__class__ is _LOSSES['least_squares']: - # hessians aren't updated because they're constant - hessians = np.full_like(raw_predictions, fill_value=2) + # hessians aren't updated because they're constant: + # the value is 1 because the loss is actually an half + # least squares loss. + hessians = np.full_like(raw_predictions, fill_value=1) return hessians From 9c3c45046ee9d09bb8c2efc3ee9f86b7072786d5 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 5 Apr 2019 15:27:38 -0400 Subject: [PATCH 174/247] update Note text --- doc/modules/ensemble.rst | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index eabc707b84a81..ef333395d1832 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -458,17 +458,21 @@ trees. .. note:: + + Scikit-learn 0.21 introduces two new experimental implementation of + gradient boosting trees, namely :class:`sklearn.experimental.HistGradientBoostingClassifier` and - :class:`sklearn.experimental.HistGradientBoostingRegressor` were introduced - in version 0.21 and are considerably faster than - :class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor` - when the number of samples is bigger than ``10 000``. These fast estimators - first bin the input samples ``X`` into integer-valued bins (typically 256 - bins) which tremendously reduces the number of splitting points to - consider, and allow the algorithm to leverage integer-based data - structures. The API of these new estimators is slightly different, and - some of the features from :class:`GradientBoostingClassifier` and - :class:`GradientBoostingRegressor` are not yet supported. + :class:`sklearn.experimental.HistGradientBoostingRegressor`. These fast + estimators first bin the input samples X into integer-valued bins + (typically 256 bins) which tremendously reduces the number of splitting + points to consider, and allow the algorithm to leverage integer-based data + structures (histograms) instead of relying on sorted continuous values. + + The new histogram-based estimators can be orders of magnitude faster than + their continuous counterparts when the number of samples is larger than + tens of thousands of samples. The API of these new estimators is slightly + different, and some of the features from :class:`GradientBoostingClassifier` + and :class:`GradientBoostingRegressor` are not yet supported. 
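Regarding the "half least squares" loss introduced in PATCH 173 above, a short numerical check (with arbitrary values) of why the gradient is simply ``raw_prediction - y_true`` and the hessian is the constant 1:

    # l(p) = 0.5 * (y - p) ** 2  =>  dl/dp = p - y,  d2l/dp2 = 1
    y, p, eps = 3.0, 2.5, 1e-6
    loss = lambda q: 0.5 * (y - q) ** 2
    grad = (loss(p + eps) - loss(p - eps)) / (2 * eps)                 # ~ p - y = -0.5
    hess = (loss(p + eps) - 2 * loss(p) + loss(p - eps)) / eps ** 2    # ~ 1.0
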
The following doc focuses on :class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor` only, which might be preferred for small From 5b40ffd9ee15176e2f48dd084858788c320c8542 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 5 Apr 2019 15:27:49 -0400 Subject: [PATCH 175/247] print loss instead of neg loss --- .../_hist_gradient_boosting/gradient_boosting.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index f666034f40d9a..b8cf95ebdbdb0 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -367,11 +367,17 @@ def _print_iteration_stats(self, iteration_start_time): log_msg += "max depth = {}, ".format(max_depth) if self.do_early_stopping_: - name = 'neg-loss' if self.scoring == 'loss' else 'score' - log_msg += "train {}: {:.5f}, ".format(name, self.train_score_[-1]) + if self.scoring == 'loss': + factor = -1 # score_ arrays contain the negative loss + name = 'loss' + else: + factor = 1 + name = 'score' + log_msg += "train {}: {:.5f}, ".format(name, factor * + self.train_score_[-1]) if self.validation_fraction is not None: log_msg += "val {}: {:.5f}, ".format( - name, self.validation_score_[-1]) + name, factor * self.validation_score_[-1]) iteration_time = time() - iteration_start_time log_msg += "in {:0.3f}s".format(iteration_time) From 47a72da32734cdfcfada83daa104a6ecad3e8700 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 5 Apr 2019 15:36:05 -0400 Subject: [PATCH 176/247] n_trees_per_iteration_ is now a public attribute --- .../gradient_boosting.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index b8cf95ebdbdb0..91576c2e68bc7 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -176,10 +176,10 @@ def fit(self, X, y): # else 1. n_samples = X_binned_train.shape[0] self._baseline_prediction = self.loss_.get_baseline_prediction( - y_train, self._n_trees_per_iteration + y_train, self.n_trees_per_iteration_ ) raw_predictions = np.zeros( - shape=(self._n_trees_per_iteration, n_samples), + shape=(self.n_trees_per_iteration_, n_samples), dtype=self._baseline_prediction.dtype ) raw_predictions += self._baseline_prediction @@ -188,7 +188,7 @@ def fit(self, X, y): # shape = (n_trees_per_iteration, n_samples). gradients, hessians = self.loss_.init_gradients_and_hessians( n_samples=n_samples, - prediction_dim=self._n_trees_per_iteration + prediction_dim=self.n_trees_per_iteration_ ) # predictors is a matrix (list of lists) of TreePredictor objects @@ -224,7 +224,7 @@ def fit(self, X, y): predictors.append([]) # Build `n_trees_per_iteration` trees. 
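    # Illustration only, not part of the diff: what the new public attribute
    # n_trees_per_iteration_ exposes after fitting, per the docstrings added in
    # this commit. With a 3-class target the multiclass loss builds one tree
    # per class and per boosting iteration:
    #     clf = HistGradientBoostingClassifier(max_iter=10).fit(X, y)  # y has 3 classes
    #     clf.n_trees_per_iteration_   # 3
    # For regressors and binary classification it is always 1:
    #     reg = HistGradientBoostingRegressor().fit(X, y_continuous)
    #     reg.n_trees_per_iteration_   # 1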
- for k in range(self._n_trees_per_iteration): + for k in range(self.n_trees_per_iteration_): grower = TreeGrower( X_binned_train, gradients[k, :], hessians[k, :], @@ -407,7 +407,7 @@ def _raw_predict(self, X): is_binned = getattr(self, '_in_fit', False) n_samples = X.shape[0] raw_predictions = np.zeros( - shape=(self._n_trees_per_iteration, n_samples), + shape=(self.n_trees_per_iteration_, n_samples), dtype=self._baseline_prediction.dtype ) raw_predictions += self._baseline_prediction @@ -511,6 +511,9 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): n_iter_ : int The number of iterations as selected by early stopping (if n_iter_no_change is not None). Otherwise it corresponds to max_iter. + n_trees_per_iteration_ : int + The number of tree that are built at each iteration. For regressors, + this is always 1. train_score_ : ndarray, shape (max_iter + 1,) The scores at each iteration on the training data. The first entry is the score of the ensemble before the first iteration. Scores are @@ -567,7 +570,7 @@ def predict(self, X): def _encode_y(self, y): # Just convert y to the expected dtype - self._n_trees_per_iteration = 1 + self.n_trees_per_iteration_ = 1 y = y.astype(Y_DTYPE, copy=False) return y @@ -657,6 +660,10 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, n_iter_ : int The number of estimators as selected by early stopping (if n_iter_no_change is not None). Otherwise it corresponds to max_iter. + n_trees_per_iteration_ : int + The number of tree that are built at each iteration. This is equal to 1 + for binary classification, and to ``n_classes`` for multiclass + classification. train_score_ : array, shape (max_iter + 1,) The scores at each iteration on the training data. The first entry is the score of the ensemble before the first iteration. Scores are @@ -751,7 +758,7 @@ def decision_function(self, X): def _encode_y(self, y): # encode classes into 0 ... n_classes - 1 and sets attributes classes_ - # and _n_trees_per_iteration + # and n_trees_per_iteration_ check_classification_targets(y) label_encoder = LabelEncoder() @@ -760,13 +767,13 @@ def _encode_y(self, y): n_classes = self.classes_.shape[0] # only 1 tree for binary classification. For multiclass classification, # we build 1 tree per class. - self._n_trees_per_iteration = 1 if n_classes <= 2 else n_classes + self.n_trees_per_iteration_ = 1 if n_classes <= 2 else n_classes encoded_y = encoded_y.astype(Y_DTYPE, copy=False) return encoded_y def _get_loss(self): if self.loss == 'auto': - if self._n_trees_per_iteration == 1: + if self.n_trees_per_iteration_ == 1: return _LOSSES['binary_crossentropy']() else: return _LOSSES['categorical_crossentropy']() From 01ec7d649a9bb34a2af3e94748b051c2a91c417a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 8 Apr 2019 08:59:58 -0400 Subject: [PATCH 177/247] Optimized early stopping when computed on the loss --- .../_gradient_boosting.pyx | 3 + .../gradient_boosting.py | 73 +++++++++++++++---- 2 files changed, 62 insertions(+), 14 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx b/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx index d13e463e3f29b..ab0efe3832bd0 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx @@ -21,6 +21,9 @@ def _update_raw_predictions( This is equivalent to raw_predictions += last_estimator.predict(X_train) + + and it's much faster. 
It's only possible for data X_train that is used to + train the trees (it isn't usable for e.g. X_val or X_small_train) """ cdef: unsigned int [::1] starts # start of each leaf in partition diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 91576c2e68bc7..7ca057ee24472 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -195,19 +195,48 @@ def fit(self, X, y): # with shape (n_iter_, n_trees_per_iteration) self._predictors = predictors = [] - # scorer_ is a callable with signature (est, X, y) and calls - # est.predict() or est.predict_proba() depending on its nature. + self.scorer_ = None # set if scoring != loss + raw_predictions_binned_small_train = None # set if scoring == loss + raw_predictions_binned_val = None # set if scoring == loss and val if self.scoring != 'loss': + # scorer_ is a callable with signature (est, X, y) and calls + # est.predict() or est.predict_proba() depending on its nature. self.scorer_ = check_scoring(self, self.scoring) else: - self.scorer_ = None + # we're going to compute scoring w.r.t the loss. As losses take + # raw predictions as input (unlike the scorers), we can optimize a + # bit and avoid repeating computing the predictions of the + # previous trees by storing the raw predictions of the small train + # and validation sets. This way at each iteration, we only need to + # compute the raw predictions of the newest tree(s). + init_value = self.loss_.get_baseline_prediction( + y_small_train, self.n_trees_per_iteration_) + raw_predictions_binned_small_train = np.zeros( + shape=(self.n_trees_per_iteration_, + X_binned_small_train.shape[0]), + dtype=init_value.dtype + ) + raw_predictions_binned_small_train += init_value + + if self.validation_fraction is not None: + init_value = self.loss_.get_baseline_prediction( + y_val, self.n_trees_per_iteration_) + raw_predictions_binned_val = np.zeros( + shape=(self.n_trees_per_iteration_, + X_binned_val.shape[0]), + dtype=init_value.dtype + ) + raw_predictions_binned_val += init_value + self.train_score_ = [] self.validation_score_ = [] if self.do_early_stopping_: # populate train_score and validation_score with the predictions # of the initial model (before the first tree) self._check_early_stopping(X_binned_small_train, y_small_train, - X_binned_val, y_val) + X_binned_val, y_val, + raw_predictions_binned_small_train, + raw_predictions_binned_val) for iteration in range(self.max_iter): @@ -255,9 +284,19 @@ def fit(self, X, y): should_early_stop = False if self.do_early_stopping_: + if self.scoring == 'loss': + # Need to update raw_predicitons_binned_small_train and + # maybe raw_predictions_binned_val too + for k, pred in enumerate(self._predictors[-1]): + raw_predictions_binned_small_train[k, :] += pred.predict_binned(X_binned_small_train) + if self.validation_fraction is not None: + raw_predictions_binned_val[k, :] += pred.predict_binned(X_binned_val) + should_early_stop = self._check_early_stopping( X_binned_small_train, y_small_train, - X_binned_val, y_val + X_binned_val, y_val, + raw_predictions_binned_small_train, + raw_predictions_binned_val ) if self.verbose: @@ -292,22 +331,28 @@ def fit(self, X, y): del self._in_fit # hard delete so we're sure it can't be used anymore return self - def _check_early_stopping(self, X_binned_train, y_train, - X_binned_val, y_val): + def _check_early_stopping(self, X_binned_small_train, 
y_small_train, + X_binned_val, y_val, + raw_predictions_binned_small_train, + raw_predictions_binned_val): """Check if fitting should be early-stopped. Scores are computed on validation data or on training data. """ self.train_score_.append( - self._get_scores(X_binned_train, y_train)) + self._get_scores(X_binned_small_train, y_small_train, + raw_predictions_binned_small_train) + ) if self.validation_fraction is not None: self.validation_score_.append( - self._get_scores(X_binned_val, y_val)) + self._get_scores(X_binned_val, y_val, + raw_predictions_binned_val) + ) return self._should_stop(self.validation_score_) - - return self._should_stop(self.train_score_) + else: + return self._should_stop(self.train_score_) def _should_stop(self, scores): """ @@ -329,7 +374,7 @@ def _should_stop(self, scores): for score in recent_scores] return not any(recent_improvements) - def _get_scores(self, X_binned, y): + def _get_scores(self, X_binned, y, raw_predictions): """Compute scores on data X_binned with target y. Scores are computed with a scorer if scoring parameter is not @@ -338,10 +383,10 @@ def _get_scores(self, X_binned, y): """ if self.scoring != 'loss': + # use scorer on X_binned and y return self.scorer_(self, X_binned, y) - # Else, use loss - raw_predictions = self._raw_predict(X_binned) + # Else, use loss on raw_predictions. return -self.loss_(y, raw_predictions) def _print_iteration_stats(self, iteration_start_time): From 7813e96f95e6654c55b92f0874543ba95d83dff8 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 8 Apr 2019 10:14:53 -0400 Subject: [PATCH 178/247] forgot to ammend changes --- .../_gradient_boosting.pyx | 8 +- .../gradient_boosting.py | 201 +++++++++--------- 2 files changed, 108 insertions(+), 101 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx b/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx index ab0efe3832bd0..1e41f55d6a437 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx @@ -19,11 +19,11 @@ def _update_raw_predictions( grower): """Update raw_predictions with the predictions of the newest tree - This is equivalent to - raw_predictions += last_estimator.predict(X_train) + This is equivalent to (and much faster than): + raw_predictions += last_estimator.predict(X_train) - and it's much faster. It's only possible for data X_train that is used to - train the trees (it isn't usable for e.g. X_val or X_small_train) + It's only possible for data X_train that is used to train the trees (it + isn't usable for e.g. 
X_val) """ cdef: unsigned int [::1] starts # start of each leaf in partition diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 7ca057ee24472..15208283a74ca 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -130,7 +130,8 @@ def fit(self, X, y): self.n_iter_no_change > 0) # create validation data if needed - if self.do_early_stopping_ and self.validation_fraction is not None: + self._use_validation_data = self.validation_fraction is not None + if self.do_early_stopping_ and self._use_validation_data: # stratify for classification stratify = y if hasattr(self.loss_, 'predict_proba') else None @@ -154,18 +155,6 @@ def fit(self, X, y): X_binned_train, y_train = X_binned, y X_binned_val, y_val = None, None - # Subsample the training set for early stopping and score monitoring - if self.do_early_stopping_: - subsample_size = 10000 # should we expose this parameter? - indices = np.arange(X_binned_train.shape[0]) - if X_binned_train.shape[0] > subsample_size: - # TODO: not critical but stratify using resample(stratify=y) - indices = rng.choice(indices, subsample_size, replace=False) - X_binned_small_train = X_binned_train[indices] - y_small_train = y_train[indices] - # Predicting is faster on C-contiguous arrays. - X_binned_small_train = np.ascontiguousarray(X_binned_small_train) - if self.verbose: print("Fitting gradient boosted rounds:") @@ -195,48 +184,58 @@ def fit(self, X, y): # with shape (n_iter_, n_trees_per_iteration) self._predictors = predictors = [] + # Initialize structures and attributes related to early stopping self.scorer_ = None # set if scoring != loss - raw_predictions_binned_small_train = None # set if scoring == loss - raw_predictions_binned_val = None # set if scoring == loss and val - if self.scoring != 'loss': - # scorer_ is a callable with signature (est, X, y) and calls - # est.predict() or est.predict_proba() depending on its nature. - self.scorer_ = check_scoring(self, self.scoring) - else: - # we're going to compute scoring w.r.t the loss. As losses take - # raw predictions as input (unlike the scorers), we can optimize a - # bit and avoid repeating computing the predictions of the - # previous trees by storing the raw predictions of the small train - # and validation sets. This way at each iteration, we only need to - # compute the raw predictions of the newest tree(s). 
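            # A rough sketch of the caching idea described in the comment
            # above (illustration only; `fit_one_iteration` is a made-up
            # helper, not this module's API):
            #
            #     raw_val = np.zeros((n_trees_per_iteration, n_val)) + baseline
            #     for iteration in range(max_iter):
            #         trees = fit_one_iteration(...)
            #         for k, tree in enumerate(trees):
            #             raw_val[k, :] += tree.predict_binned(X_binned_val)
            #         validation_score = -loss(y_val, raw_val)
            #
            # i.e. each iteration only costs one pass of the newest tree(s)
            # over the validation data, instead of re-predicting with every
            # tree built so far.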
- init_value = self.loss_.get_baseline_prediction( - y_small_train, self.n_trees_per_iteration_) - raw_predictions_binned_small_train = np.zeros( - shape=(self.n_trees_per_iteration_, - X_binned_small_train.shape[0]), - dtype=init_value.dtype - ) - raw_predictions_binned_small_train += init_value - - if self.validation_fraction is not None: - init_value = self.loss_.get_baseline_prediction( - y_val, self.n_trees_per_iteration_) - raw_predictions_binned_val = np.zeros( - shape=(self.n_trees_per_iteration_, - X_binned_val.shape[0]), - dtype=init_value.dtype - ) - raw_predictions_binned_val += init_value - + raw_predictions_val = None # set if scoring == loss and use val self.train_score_ = [] self.validation_score_ = [] if self.do_early_stopping_: # populate train_score and validation_score with the predictions # of the initial model (before the first tree) - self._check_early_stopping(X_binned_small_train, y_small_train, - X_binned_val, y_val, - raw_predictions_binned_small_train, - raw_predictions_binned_val) + + if self.scoring == 'loss': + # we're going to compute scoring w.r.t the loss. As losses + # take raw predictions as input (unlike the scorers), we can + # optimize a bit and avoid repeating computing the predictions + # of the previous trees. We'll re-use raw_predictions (as it's + # needed for training anyway) for evaluating the training + # loss, and create raw_predictions_val for storing the + # raw predictions of the validation data. + + if self._use_validation_data: + raw_predictions_val = np.zeros( + shape=(self.n_trees_per_iteration_, + X_binned_val.shape[0]), + dtype=self._baseline_prediction.dtype + ) + + raw_predictions_val += self._baseline_prediction + + self._check_early_stopping_loss(raw_predictions, y_train, + raw_predictions_val, y_val) + else: + self.scorer_ = check_scoring(self, self.scoring) + # scorer_ is a callable with signature (est, X, y) and calls + # est.predict() or est.predict_proba() depending on its nature. + # Unfortunately, each call to scorer_() will compute + # the predictions of all the trees. So we use a subset of the + # training set to compute train scores. + subsample_size = 10000 # should we expose this parameter? + indices = np.arange(X_binned_train.shape[0]) + if X_binned_train.shape[0] > subsample_size: + # TODO: not critical but stratify using resample() + indices = rng.choice(indices, subsample_size, + replace=False) + X_binned_small_train = X_binned_train[indices] + y_small_train = y_train[indices] + # Predicting is faster on C-contiguous arrays. 
+ X_binned_small_train = np.ascontiguousarray( + X_binned_small_train) + + self._check_early_stopping_scorer( + X_binned_small_train, y_small_train, + X_binned_val, y_val, + ) for iteration in range(self.max_iter): @@ -285,19 +284,22 @@ def fit(self, X, y): should_early_stop = False if self.do_early_stopping_: if self.scoring == 'loss': - # Need to update raw_predicitons_binned_small_train and - # maybe raw_predictions_binned_val too - for k, pred in enumerate(self._predictors[-1]): - raw_predictions_binned_small_train[k, :] += pred.predict_binned(X_binned_small_train) - if self.validation_fraction is not None: - raw_predictions_binned_val[k, :] += pred.predict_binned(X_binned_val) - - should_early_stop = self._check_early_stopping( - X_binned_small_train, y_small_train, - X_binned_val, y_val, - raw_predictions_binned_small_train, - raw_predictions_binned_val - ) + # Update raw_predictions_val with the newest tree(s) + if self._use_validation_data: + for k, pred in enumerate(self._predictors[-1]): + raw_predictions_val[k, :] += ( + pred.predict_binned(X_binned_val)) + + should_early_stop = self._check_early_stopping_loss( + raw_predictions, y_train, + raw_predictions_val, y_val + ) + + else: + should_early_stop = self._check_early_stopping_scorer( + X_binned_small_train, y_small_train, + X_binned_val, y_val, + ) if self.verbose: self._print_iteration_stats(iteration_start_time) @@ -331,24 +333,42 @@ def fit(self, X, y): del self._in_fit # hard delete so we're sure it can't be used anymore return self - def _check_early_stopping(self, X_binned_small_train, y_small_train, - X_binned_val, y_val, - raw_predictions_binned_small_train, - raw_predictions_binned_val): - """Check if fitting should be early-stopped. + def _check_early_stopping_scorer(self, X_binned_small_train, y_small_train, + X_binned_val, y_val): + """Check if fitting should be early-stopped based on scorer. + + Scores are computed on validation data or on training data. + """ + + self.train_score_.append( + self.scorer_(self, X_binned_small_train, y_small_train) + ) + + if self._use_validation_data: + self.validation_score_.append( + self.scorer_(self, X_binned_val, y_val) + ) + return self._should_stop(self.validation_score_) + else: + return self._should_stop(self.train_score_) + + def _check_early_stopping_loss(self, + raw_predictions, + y_train, + raw_predictions_val, + y_val): + """Check if fitting should be early-stopped based on loss. Scores are computed on validation data or on training data. """ self.train_score_.append( - self._get_scores(X_binned_small_train, y_small_train, - raw_predictions_binned_small_train) + -self.loss_(y_train, raw_predictions) ) - if self.validation_fraction is not None: + if self._use_validation_data: self.validation_score_.append( - self._get_scores(X_binned_val, y_val, - raw_predictions_binned_val) + -self.loss_(y_val, raw_predictions_val) ) return self._should_stop(self.validation_score_) else: @@ -374,21 +394,6 @@ def _should_stop(self, scores): for score in recent_scores] return not any(recent_improvements) - def _get_scores(self, X_binned, y, raw_predictions): - """Compute scores on data X_binned with target y. - - Scores are computed with a scorer if scoring parameter is not - 'loss', else with the loss. As higher is always better, we return - -loss_value. - """ - - if self.scoring != 'loss': - # use scorer on X_binned and y - return self.scorer_(self, X_binned, y) - - # Else, use loss on raw_predictions. 
- return -self.loss_(y, raw_predictions) - def _print_iteration_stats(self, iteration_start_time): """Print info about the current fitting iteration.""" log_msg = '' @@ -420,7 +425,7 @@ def _print_iteration_stats(self, iteration_start_time): name = 'score' log_msg += "train {}: {:.5f}, ".format(name, factor * self.train_score_[-1]) - if self.validation_fraction is not None: + if self._use_validation_data: log_msg += "val {}: {:.5f}, ".format( name, factor * self.validation_score_[-1]) @@ -560,10 +565,11 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): The number of tree that are built at each iteration. For regressors, this is always 1. train_score_ : ndarray, shape (max_iter + 1,) - The scores at each iteration on the training data. The first entry is - the score of the ensemble before the first iteration. Scores are - computed according to the ``scoring`` parameter. Scores are computed on - a subset of at most 10 000 samples. Empty if no early stopping. + The scores at each iteration on the training data. The first entry + is the score of the ensemble before the first iteration. Scores are + computed according to the ``scoring`` parameter. If ``scoring`` is + not 'loss', scores are computed on a subset of at most 10 000 + samples. Empty if no early stopping. validation_score_ : ndarray, shape (max_iter + 1,) The scores at each iteration on the held-out validation data. The first entry is the score of the ensemble before the first iteration. @@ -709,11 +715,12 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, The number of tree that are built at each iteration. This is equal to 1 for binary classification, and to ``n_classes`` for multiclass classification. - train_score_ : array, shape (max_iter + 1,) - The scores at each iteration on the training data. The first entry is - the score of the ensemble before the first iteration. Scores are - computed according to the ``scoring`` parameter. Scores are computed on - a subset of at most 10 000 samples. Empty if no early stopping. + train_score_ : ndarray, shape (max_iter + 1,) + The scores at each iteration on the training data. The first entry + is the score of the ensemble before the first iteration. Scores are + computed according to the ``scoring`` parameter. If ``scoring`` is + not 'loss', scores are computed on a subset of at most 10 000 + samples. Empty if no early stopping. validation_score_ : array, shape (max_iter + 1,) The scores at each iteration on the held-out validation data. The first entry is the score of the ensemble before the first iteration. From f1c1c3dc19965ecdf16d32ab77af3d0e10fbfdbc Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 11 Apr 2019 10:49:52 -0400 Subject: [PATCH 179/247] Apply suggestions from code review Co-Authored-By: NicolasHug --- sklearn/ensemble/_hist_gradient_boosting/grower.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 69500c2eb5eda..4fdf7030c0633 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -118,12 +118,12 @@ class TreeGrower: Parameters ---------- - X_binned : array-like of int, shape=(n_samples, n_features) + X_binned : ndarray of int, shape (n_samples, n_features) The binned input samples. Must be Fortran-aligned. 
- gradients : array-like, shape=(n_samples,) + gradients : ndarray, shape (n_samples,) The gradients of each training sample. Those are the gradients of the loss w.r.t the predictions, evaluated at iteration ``i - 1``. - hessians : array-like, shape=(n_samples,) + hessians : ndarray, shape (n_samples,) The hessians of each training sample. Those are the hessians of the loss w.r.t the predictions, evaluated at iteration ``i - 1``. max_leaf_nodes : int or None, optional (default=None) From c56397792153cd2ebec24b121acdf28aa46e451f Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 11 Apr 2019 10:57:10 -0400 Subject: [PATCH 180/247] Addressed Guillaume's comments --- sklearn/ensemble/_hist_gradient_boosting/grower.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 4fdf7030c0633..68f8aa26b396b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -9,6 +9,7 @@ from heapq import heappush, heappop import numpy as np from timeit import default_timer as time +import numbers from .splitting import Splitter from .histogram import HistogramBuilder @@ -140,7 +141,7 @@ class TreeGrower: max_bins : int, optional (default=256) The maximum number of bins. Used to define the shape of the histograms. - actual_n_bins : array-like of int or int, optional (default=None) + actual_n_bins : ndarray of int or int, optional (default=None) The actual number of bins needed for each feature, which is lower or equal to ``max_bins``. If it's an int, all features are considered to have the same number of bins. If None, all features are considered to @@ -167,10 +168,12 @@ def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, if actual_n_bins is None: actual_n_bins = max_bins - if isinstance(actual_n_bins, int): + if isinstance(actual_n_bins, numbers.Integral): actual_n_bins = np.array( [actual_n_bins] * X_binned.shape[1], dtype=np.uint32) + else: + actual_n_bins = np.asarray(actual_n_bins, dtype=np.uint32) hessians_are_constant = hessians.shape[0] == 1 self.histogram_builder = HistogramBuilder( From 6d1b6069455363195f106e8c0337bf3924e127b8 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 11 Apr 2019 11:00:51 -0400 Subject: [PATCH 181/247] Apply suggestions from code review Co-Authored-By: NicolasHug --- sklearn/ensemble/_hist_gradient_boosting/splitting.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 456042db782eb..aba2a01be9aac 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -89,12 +89,12 @@ cdef class Splitter: Parameters ---------- - X_binned : array of int + X_binned : ndarray of int, shape(n_samples, n_features) The binned input samples. Must be Fortran-aligned. - max_bins : int, optional(default=256) + max_bins : int, optional (default=256) The maximum number of bins. Used to define the shape of the histograms. - actual_n_bins : array-like of int + actual_n_bins : ndarray, shape (n_features,) The actual number of bins needed for each feature, which is lower or equal to max_bins. 
l2_regularization : float From b33ebadc6733967ebc3f050549aff722a59f76f3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 11 Apr 2019 11:06:11 -0400 Subject: [PATCH 182/247] Addressed comments --- sklearn/ensemble/_hist_gradient_boosting/histogram.pyx | 2 ++ sklearn/ensemble/_hist_gradient_boosting/splitting.pyx | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx index 35676632b795d..d61c9d12c1016 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx @@ -72,6 +72,8 @@ cdef class HistogramBuilder: hessians : array-like, shape=(n_samples,) The hessians of each training sample. Those are the hessians of the loss w.r.t the predictions, evaluated at iteration i - 1. + hessians_are_constant: bool + Whether hessians are constant. """ cdef public: const X_BINNED_DTYPE_C [::1, :] X_binned diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index aba2a01be9aac..cdc3c3a1a1dc6 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -105,9 +105,11 @@ cdef class Splitter: min_hessian_to_split are discarded. min_samples_leaf : int The minimum number of samples per leaf. - min_gain_to_split : float, optional(default=0.) + min_gain_to_split : float The minimum gain needed to split a node. Splits with lower gain will be ignored. + hessians_are_constant: bool + Whether hessians are constant. """ cdef public: const X_BINNED_DTYPE_C [::1, :] X_binned @@ -171,7 +173,7 @@ cdef class Splitter: ---------- split_info : SplitInfo The SplitInfo of the node to split - sample_indices : array of unsigned int + sample_indices : ndarray of unsigned int, shape (n_samples_at_node,) The indices of the samples at the node to split. This is a view on self.partition, and it is modified inplace by placing the indices of the left child at the beginning, and the indices of From 946823fd7b226ce129632b4191c9d51dcb1c9bfa Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 11 Apr 2019 11:31:21 -0400 Subject: [PATCH 183/247] Apply suggestions from code review Co-Authored-By: NicolasHug --- .../_gradient_boosting.pyx | 4 +-- .../gradient_boosting.py | 22 ++++++++-------- .../_hist_gradient_boosting/grower.py | 26 +++++++++---------- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx b/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx index 1e41f55d6a437..eb7517139beec 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx @@ -17,13 +17,13 @@ from .types cimport Y_DTYPE_C def _update_raw_predictions( Y_DTYPE_C [::1] raw_predictions, # OUT grower): - """Update raw_predictions with the predictions of the newest tree + """Update raw_predictions with the predictions of the newest tree. This is equivalent to (and much faster than): raw_predictions += last_estimator.predict(X_train) It's only possible for data X_train that is used to train the trees (it - isn't usable for e.g. X_val) + isn't usable for e.g. X_val). 
""" cdef: unsigned int [::1] starts # start of each leaf in partition diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 15208283a74ca..155365e34f92c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -607,12 +607,12 @@ def predict(self, X): Parameters ---------- - X : array-like, shape=(n_samples, n_features) + X : array-like, shape (n_samples, n_features) The input samples. Returns ------- - y : array, shape (n_samples,) + y : ndarray, shape (n_samples,) The predicted values. """ # Return raw predictions after converting shape @@ -721,7 +721,7 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, computed according to the ``scoring`` parameter. If ``scoring`` is not 'loss', scores are computed on a subset of at most 10 000 samples. Empty if no early stopping. - validation_score_ : array, shape (max_iter + 1,) + validation_score_ : ndarray, shape (max_iter + 1,) The scores at each iteration on the held-out validation data. The first entry is the score of the ensemble before the first iteration. Scores are computed according to the ``scoring`` parameter. Empty if @@ -759,15 +759,15 @@ def predict(self, X): Parameters ---------- - X : array-like, shape=(n_samples, n_features) + X : array-like, shape (n_samples, n_features) The input samples. Returns ------- - y : array, shape (n_samples,) + y : ndarray, shape (n_samples,) The predicted classes. """ - # This could be done in parallel + # TODO: This could be done in parallel encoded_classes = np.argmax(self.predict_proba(X), axis=1) return self.classes_[encoded_classes] @@ -776,28 +776,28 @@ def predict_proba(self, X): Parameters ---------- - X : array-like, shape=(n_samples, n_features) + X : array-like, shape (n_samples, n_features) The input samples. Returns ------- - p : array, shape (n_samples, n_classes) + p : ndarray, shape (n_samples, n_classes) The class probabilities of the input samples. """ raw_predictions = self._raw_predict(X) return self.loss_.predict_proba(raw_predictions) def decision_function(self, X): - """Compute the decision function of X + """Compute the decision function of X. Parameters ---------- - X : array-like, shape=(n_samples, n_features) + X : array-like, shape (n_samples, n_features) The input samples. Returns ------- - decision : array, shape (n_samples,) or \ + decision : ndarray, shape (n_samples,) or \ (n_samples, n_trees_per_iteration) The raw predicted values (i.e. the sum of the trees leaves) for each sample. n_trees_per_iteration is equal to the number of diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 68f8aa26b396b..8a2c0e6b5185e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -26,41 +26,41 @@ class TreeNode: Parameters ---------- depth : int - The depth of the node, i.e. its distance from the root + The depth of the node, i.e. its distance from the root. sample_indices : array of int - The indices of the samples at the node + The indices of the samples at the node. sum_gradients : float - The sum of the gradients of the samples at the node + The sum of the gradients of the samples at the node. sum_hessians : float - The sum of the hessians of the samples at the node + The sum of the hessians of the samples at the node. 
parent : TreeNode or None, optional (default=None) The parent of the node. None for root. Attributes ---------- depth : int - The depth of the node, i.e. its distance from the root + The depth of the node, i.e. its distance from the root. sample_indices : array of int - The indices of the samples at the node + The indices of the samples at the node. sum_gradients : float - The sum of the gradients of the samples at the node + The sum of the gradients of the samples at the node. sum_hessians : float - The sum of the hessians of the samples at the node - parent : TreeNode or None, optional (default=None) + The sum of the hessians of the samples at the node. + parent : TreeNode or None The parent of the node. None for root. split_info : SplitInfo or None - The result of the split evaluation + The result of the split evaluation. left_child : TreeNode or None The left child of the node. None for leaves. right_child : TreeNode or None The right child of the node. None for leaves. value : float or None The value of the leaf, as computed in finalize_leaf(). None for - non-leaf nodes + non-leaf nodes. partition_start : int - start position of the node's sample_indices in splitter.partition + start position of the node's sample_indices in splitter.partition. partition_stop : int - stop position of the node's sample_indices in splitter.partition + stop position of the node's sample_indices in splitter.partition. """ split_info = None From 1934c56a7f38b23101844db74b6a00e897bb1bb0 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 11 Apr 2019 11:33:36 -0400 Subject: [PATCH 184/247] Added shape for samples_indices --- sklearn/ensemble/_hist_gradient_boosting/grower.py | 4 ++-- sklearn/ensemble/_hist_gradient_boosting/splitting.pyx | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 8a2c0e6b5185e..b635592cee910 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -27,7 +27,7 @@ class TreeNode: ---------- depth : int The depth of the node, i.e. its distance from the root. - sample_indices : array of int + sample_indices : ndarray of unsigned int, shape (n_samples_at_node,) The indices of the samples at the node. sum_gradients : float The sum of the gradients of the samples at the node. @@ -40,7 +40,7 @@ class TreeNode: ---------- depth : int The depth of the node, i.e. its distance from the root. - sample_indices : array of int + sample_indices : ndarray of unsigned int, shape (n_samples_at_node,) The indices of the samples at the node. sum_gradients : float The sum of the gradients of the samples at the node. diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index cdc3c3a1a1dc6..31be276c1e889 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -329,7 +329,7 @@ cdef class Splitter: Parameters ---------- - sample_indices : array of int + sample_indices : ndarray of unsigned int, shape (n_samples_at_node,) The indices of the samples at the node to split. 
histograms : array of HISTOGRAM_DTYPE of \ shape(n_features, max_bins) From 81a51c963bca2e7e748e925a33e77523bf71e5cb Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 15 Apr 2019 08:38:00 -0400 Subject: [PATCH 185/247] Update sklearn/ensemble/_hist_gradient_boosting/grower.py Co-Authored-By: NicolasHug --- sklearn/ensemble/_hist_gradient_boosting/grower.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index b635592cee910..c6d8870bcbb5b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -255,7 +255,7 @@ def _intilialize_root(self, gradients, hessians, hessians_are_constant): self.root.partition_start = 0 self.root.partition_stop = n_samples - if (self.max_leaf_nodes is not None and self.max_leaf_nodes == 1): + if self.max_leaf_nodes is not None and self.max_leaf_nodes == 1: self._finalize_leaf(self.root) return if self.root.n_samples < 2 * self.min_samples_leaf: From 2e86b3c3c410e641bd20b7771eac9b5a794b8cb1 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 15 Apr 2019 08:38:10 -0400 Subject: [PATCH 186/247] Update sklearn/ensemble/_hist_gradient_boosting/splitting.pyx Co-Authored-By: NicolasHug --- sklearn/ensemble/_hist_gradient_boosting/splitting.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 31be276c1e889..7b1089c20ed86 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -337,7 +337,7 @@ cdef class Splitter: sum_gradients : float The sum of the gradients for each sample at the node sum_hessians : float - The sum of the hessians for each sample at the node + The sum of the hessians for each sample at the node. Returns ------- From ccde6661d7fda63a5d17c28af8deadd7b18558e6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 15 Apr 2019 08:38:51 -0400 Subject: [PATCH 187/247] Update sklearn/ensemble/_hist_gradient_boosting/splitting.pyx Co-Authored-By: NicolasHug --- sklearn/ensemble/_hist_gradient_boosting/splitting.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 7b1089c20ed86..038b12d08fbed 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -335,7 +335,7 @@ cdef class Splitter: shape(n_features, max_bins) The histograms of the current node. sum_gradients : float - The sum of the gradients for each sample at the node + The sum of the gradients for each sample at the node. sum_hessians : float The sum of the hessians for each sample at the node. 
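To make the ``find_node_split`` docstrings above more concrete, here is a rough pure-Python sketch of the per-feature scan performed on a node's histogram. It is a simplification, not the Cython implementation: single feature only, no ``min_hessian_to_split`` or ``min_gain_to_split`` handling, a small epsilon added purely to avoid division by zero, and the gain is the usual second-order gain (squared gradient sums over regularized hessian sums), whose exact constants may differ from the library's:

    import numpy as np

    def best_split_for_feature(hist_grad, hist_hess, hist_count,
                               l2_regularization=0., min_samples_leaf=20):
        """Return (best_gain, best_bin) for one feature of one node.

        hist_grad, hist_hess and hist_count are the per-bin sums of
        gradients, hessians and sample counts at the node.
        """
        sum_g, sum_h, n = hist_grad.sum(), hist_hess.sum(), hist_count.sum()

        def leaf_term(g, h):
            # contribution of a leaf with gradient sum g and hessian sum h
            return g * g / (h + l2_regularization + 1e-15)

        best_gain, best_bin = -1., None
        g_left = h_left = 0.
        n_left = 0
        for bin_idx in range(len(hist_grad) - 1):  # last bin can't be split on
            g_left += hist_grad[bin_idx]
            h_left += hist_hess[bin_idx]
            n_left += hist_count[bin_idx]
            if n_left < min_samples_leaf or n - n_left < min_samples_leaf:
                continue
            gain = (leaf_term(g_left, h_left)
                    + leaf_term(sum_g - g_left, sum_h - h_left)
                    - leaf_term(sum_g, sum_h))
            if gain > best_gain:
                best_gain, best_bin = gain, bin_idx
        return best_gain, best_bin

In the actual code the per-bin sums come from ``HistogramBuilder``, and a child node's histogram can be obtained cheaply by subtracting the sibling's histogram from the parent's, so typically only one of the two children needs a full brute-force histogram pass.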
From 903b522b9af1ca08ccf3d119381f0bbc2857774a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 15 Apr 2019 09:27:15 -0400 Subject: [PATCH 188/247] Added explicit scheduling and chunksizes for prange --- sklearn/ensemble/_hist_gradient_boosting/histogram.pyx | 4 ++-- sklearn/ensemble/_hist_gradient_boosting/splitting.pyx | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx index d61c9d12c1016..dd8ee046c7773 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx @@ -151,7 +151,7 @@ cdef class HistogramBuilder: ordered_gradients[i] = gradients[sample_indices[i]] ordered_hessians[i] = hessians[sample_indices[i]] - for feature_idx in prange(n_features): + for feature_idx in prange(n_features, schedule='static'): # Compute histogram of each feature self._compute_histogram_brute_single_feature( feature_idx, sample_indices, histograms) @@ -231,7 +231,7 @@ cdef class HistogramBuilder: dtype=HISTOGRAM_DTYPE ) - for feature_idx in prange(n_features, nogil=True): + for feature_idx in prange(n_features, schedule='static', nogil=True): # Compute histogram of each feature _subtract_histograms(feature_idx, self.max_bins, diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 038b12d08fbed..3e1e8a1a6ce54 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -266,7 +266,8 @@ cdef class Splitter: offset_in_buffers[thread_idx - 1] + sizes[thread_idx - 1] # map indices from sample_indices to left/right_indices_buffer - for thread_idx in prange(n_threads): + for thread_idx in prange(n_threads, schedule='static', + chunksize=1): left_count = 0 right_count = 0 @@ -301,7 +302,8 @@ cdef class Splitter: # map indices in left/right_indices_buffer back into # sample_indices. This also updates self.partition since # sample_indices is a view. 
- for thread_idx in prange(n_threads): + for thread_idx in prange(n_threads, schedule='static', + chunksize=1): memcpy( &sample_indices[left_offset[thread_idx]], &left_indices_buffer[offset_in_buffers[thread_idx]], @@ -358,7 +360,7 @@ cdef class Splitter: split_infos = malloc( self.n_features * sizeof(split_info_struct)) - for feature_idx in prange(n_features): + for feature_idx in prange(n_features, schedule='static'): # For each feature, find best bin to split on split_info = self._find_best_bin_to_split_helper( feature_idx, histograms, n_samples, From a120db28f904bdfd8e8d69b7abbe2c65788262cc Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 15 Apr 2019 10:26:47 -0400 Subject: [PATCH 189/247] assert baseline_prediction has the same dtype has y_train --- .../ensemble/_hist_gradient_boosting/tests/test_loss.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index b8e871cc80cca..8430e084775bf 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -141,6 +141,7 @@ def test_baseline_least_squares(): y_train = rng.normal(size=100) baseline_prediction = loss.get_baseline_prediction(y_train, 1) assert baseline_prediction.shape == tuple() # scalar + assert baseline_prediction.dtype == y_train.dtype # Make sure baseline prediction is the mean of all targets assert_almost_equal(baseline_prediction, y_train.mean()) @@ -150,7 +151,7 @@ def test_baseline_binary_crossentropy(): loss = _LOSSES['binary_crossentropy']() for y_train in (np.zeros(shape=100), np.ones(shape=100)): - y_train = y_train.astype(np.float32) + y_train = y_train.astype(np.float64) baseline_prediction = loss.get_baseline_prediction(y_train, 1) assert_all_finite(baseline_prediction) assert_almost_equal(loss.inverse_link_function(baseline_prediction), @@ -161,9 +162,10 @@ def test_baseline_binary_crossentropy(): # and by definition # p = inverse_link_function(raw_prediction) = sigmoid(raw_prediction) # So we want raw_prediction = link_function(p) = log(p / (1 - p)) - y_train = rng.randint(0, 2, size=100).astype(np.float32) + y_train = rng.randint(0, 2, size=100).astype(np.float64) baseline_prediction = loss.get_baseline_prediction(y_train, 1) assert baseline_prediction.shape == tuple() # scalar + assert baseline_prediction.dtype == y_train.dtype p = y_train.mean() assert_almost_equal(baseline_prediction, np.log(p / (1 - p))) @@ -174,9 +176,10 @@ def test_baseline_categorical_crossentropy(): prediction_dim = 4 loss = _LOSSES['categorical_crossentropy']() for y_train in (np.zeros(shape=100), np.ones(shape=100)): - y_train = y_train.astype(np.float32) + y_train = y_train.astype(np.float64) baseline_prediction = loss.get_baseline_prediction(y_train, prediction_dim) + assert baseline_prediction.dtype == y_train.dtype assert_all_finite(baseline_prediction) # Same logic as for above test. 
Here inverse_link_function = softmax and From 4c4a05aea808c988601d0143c2cbe8a0296c86a2 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 15 Apr 2019 10:35:36 -0400 Subject: [PATCH 190/247] removed default values for SplitInfo --- sklearn/ensemble/_hist_gradient_boosting/splitting.pyx | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 3e1e8a1a6ce54..1ed1101d5d054 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -63,10 +63,9 @@ class SplitInfo: n_samples_right : int The number of samples in the right child """ - def __init__(self, gain=-1., feature_idx=0, bin_idx=0, - sum_gradient_left=0., sum_hessian_left=0., - sum_gradient_right=0., sum_hessian_right=0., - n_samples_left=0, n_samples_right=0): + def __init__(self, gain, feature_idx, bin_idx, sum_gradient_left, + sum_hessian_left, sum_gradient_right, sum_hessian_right, + n_samples_left, n_samples_right): self.gain = gain self.feature_idx = feature_idx self.bin_idx = bin_idx From 8b70c5defbde5bceb0a04c24ba2a6f7d00debbc1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 15 Apr 2019 10:37:48 -0400 Subject: [PATCH 191/247] removed check_estimators --- .../tests/test_gradient_boosting.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 325943b7e61d9..0f9199fb2ceb6 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -1,7 +1,6 @@ import numpy as np import pytest from sklearn.datasets import make_classification, make_regression -from sklearn.utils.estimator_checks import check_estimator from sklearn.experimental import HistGradientBoostingClassifier from sklearn.experimental import HistGradientBoostingRegressor @@ -183,14 +182,3 @@ def should_stop(scores, n_iter_no_change, tol): assert should_stop([1] * 6, n_iter_no_change=5, tol=0.) assert should_stop([1] * 6, n_iter_no_change=5, tol=0.001) assert should_stop([1, 2, 3, 4, 5, 6], n_iter_no_change=5, tol=5) - - -@pytest.mark.parametrize('Estimator', ( - HistGradientBoostingRegressor(), - HistGradientBoostingClassifier(), - )) -def test_estimator_checks(Estimator): - # Run the check_estimator() test suite on GBRegressor and GBClassifier. 
- # Just here for convenience, must be removed before merging since these - # tests are run in test_common anyways - check_estimator(Estimator) From 82428f07e5e46c826525fabfc7e4d801141890df Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 15 Apr 2019 10:40:12 -0400 Subject: [PATCH 192/247] Apply suggestions from code review Co-Authored-By: NicolasHug --- .../gradient_boosting.py | 3 +- .../_hist_gradient_boosting/histogram.pyx | 30 ++++++------ .../ensemble/_hist_gradient_boosting/loss.py | 24 +++++----- .../_hist_gradient_boosting/predictor.py | 10 ++-- .../_hist_gradient_boosting/splitting.pyx | 46 +++++++++---------- 5 files changed, 58 insertions(+), 55 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 155365e34f92c..29ceea288ab22 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -313,7 +313,8 @@ def fit(self, X, y): n_total_leaves = sum( predictor.get_n_leaf_nodes() for predictors_at_ith_iteration in self._predictors - for predictor in predictors_at_ith_iteration) + for predictor in predictors_at_ith_iteration + ) n_predictors = sum( len(predictors_at_ith_iteration) for predictors_at_ith_iteration in self._predictors) diff --git a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx index dd8ee046c7773..1c4f47851c47a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx @@ -61,18 +61,18 @@ cdef class HistogramBuilder: Parameters ---------- - X_binned : array of int + X_binned : ndarray of int, shape (n_samples, n_features) The binned input samples. Must be Fortran-aligned. - max_bins : int, optional(default=256) + max_bins : int The maximum number of bins. Used to define the shape of the histograms. - gradients : array-like, shape=(n_samples,) + gradients : ndarray, shape (n_samples,) The gradients of each training sample. Those are the gradients of the loss w.r.t the predictions, evaluated at iteration i - 1. - hessians : array-like, shape=(n_samples,) + hessians : ndarray, shape (n_samples,) The hessians of each training sample. Those are the hessians of the loss w.r.t the predictions, evaluated at iteration i - 1. - hessians_are_constant: bool + hessians_are_constant : bool Whether hessians are constant. """ cdef public: @@ -116,8 +116,8 @@ cdef class HistogramBuilder: Returns ------- - histograms : array of HISTOGRAM_DTYPE of shape(n_features, max_bins) - The computed histograms of the current node + histograms : ndarray of HISTOGRAM_DTYPE, shape (n_features, max_bins) + The computed histograms of the current node. """ cdef: int n_samples @@ -210,17 +210,17 @@ cdef class HistogramBuilder: Parameters ---------- - parent_histograms : array of HISTOGRAM_DTYPE of \ - shape(n_features, max_bins) - The histograms of the parent - sibling_histograms : array of HISTOGRAM_DTYPE of \ - shape(n_features, max_bins) - The histograms of the sibling + parent_histograms : ndarray of HISTOGRAM_DTYPE, \ + shape (n_features, max_bins) + The histograms of the parent. + sibling_histograms : ndarray of HISTOGRAM_DTYPE, \ + shape (n_features, max_bins) + The histograms of the sibling. 
Returns ------- - histograms : array of HISTOGRAM_DTYPE of shape(n_features, max_bins) - The computed histograms of the current node + histograms : ndarray of HISTOGRAM_DTYPE, shape(n_features, max_bins) + The computed histograms of the current node. """ cdef: diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index b06808a01197d..aef7aa67d566c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -34,7 +34,7 @@ def init_gradients_and_hessians(self, n_samples, prediction_dim): Parameters ---------- n_samples : int - The number of samples passed to `fit()` + The number of samples passed to `fit()`. prediction_dim : int The dimension of a raw prediction, i.e. the number of trees built at each iteration. Equals 1 for regression and binary @@ -43,10 +43,12 @@ def init_gradients_and_hessians(self, n_samples, prediction_dim): Returns ------- - gradients : array-like, shape=(prediction_dim, n_samples) - hessians : array-like, shape=(prediction_dim, n_samples). + gradients : ndarray, shape (prediction_dim, n_samples) + The initial gradients. Note that the array as not been zero-initialized. + hessians : ndarray, shape (prediction_dim, n_samples) If hessians are constant (e.g. for `LeastSquares` loss, the - array is initialized to ``1``. + array is initialized to ``1``. Otherwise, the array is allocated without + being zero-initialized. """ shape = (prediction_dim, n_samples) gradients = np.empty(shape=shape, dtype=G_H_DTYPE) @@ -66,7 +68,7 @@ def get_baseline_prediction(self, y_train, prediction_dim): Parameters ---------- - y_train : array-like, shape=(n_samples,) + y_train : ndarray, shape (n_samples,) The target training values. prediction_dim : int The dimension of one prediction: 1 for binary classification and @@ -74,7 +76,7 @@ def get_baseline_prediction(self, y_train, prediction_dim): Returns ------- - baseline_prediction: float or array of shape (1, prediction_dim) + baseline_prediction : float or ndarray, shape (1, prediction_dim) The baseline prediction. """ pass @@ -90,14 +92,14 @@ def update_gradients_and_hessians(self, gradients, hessians, y_true, Parameters ---------- - gradients : array-like, shape=(prediction_dim, n_samples) + gradients : ndarray, shape (prediction_dim, n_samples) The gradients (treated as OUT array). - hessians : array-like, shape=(prediction_dim, n_samples) or \ + hessians : ndarray, shape (prediction_dim, n_samples) or \ (1,) The hessians (treated as OUT array). - y_true : array-like, shape=(n_samples,) + y_true : ndarray, shape (n_samples,) The true target values or each training sample. - raw_predictions : array-like, shape=(prediction_dim, n_samples) + raw_predictions : ndarray, shape (prediction_dim, n_samples) The raw_predictions (i.e. values from the trees) of the tree ensemble at iteration ``i - 1``. 
""" @@ -126,7 +128,7 @@ def __call__(self, y_true, raw_predictions, average=True): return loss.mean() if average else loss def get_baseline_prediction(self, y_train, prediction_dim): - return np.mean(y_train).astype(Y_DTYPE) + return np.mean(y_train).astype(Y_DTYPE, copy=False) @staticmethod def inverse_link_function(raw_predictions): diff --git a/sklearn/ensemble/_hist_gradient_boosting/predictor.py b/sklearn/ensemble/_hist_gradient_boosting/predictor.py index 71d5b44796d50..5b18048cc24e2 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/predictor.py +++ b/sklearn/ensemble/_hist_gradient_boosting/predictor.py @@ -31,7 +31,7 @@ class TreePredictor: Parameters ---------- - nodes : list of PREDICTOR_RECORD_DTYPE. + nodes : list of PREDICTOR_RECORD_DTYPE The nodes of the tree. """ def __init__(self, nodes): @@ -50,12 +50,12 @@ def predict(self, X): Parameters ---------- - X : array-like, shape=(n_samples, n_features) + X : ndarray, shape (n_samples, n_features) The input samples. Returns ------- - y : array, shape (n_samples,) + y : ndarray, shape (n_samples,) The raw predicted values. """ out = np.empty(X.shape[0], dtype=Y_DTYPE) @@ -67,12 +67,12 @@ def predict_binned(self, X): Parameters ---------- - X : array-like, shape=(n_samples, n_features) + X : ndarray, shape (n_samples, n_features) The input samples. Returns ------- - y : array, shape (n_samples,) + y : ndarray, shape (n_samples,) The raw predicted values. """ out = np.empty(X.shape[0], dtype=Y_DTYPE) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 3e1e8a1a6ce54..4ac3ab0f69153 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -45,23 +45,23 @@ class SplitInfo: Parameters ---------- gain : float - The gain of the split + The gain of the split. feature_idx : int - The index of the feature to be split + The index of the feature to be split. bin_idx : int - The index of the bin on which the split is made + The index of the bin on which the split is made. sum_gradient_left : float - The sum of the gradients of all the samples in the left child + The sum of the gradients of all the samples in the left child. sum_hessian_left : float - The sum of the hessians of all the samples in the left child + The sum of the hessians of all the samples in the left child. sum_gradient_right : float - The sum of the gradients of all the samples in the right child + The sum of the gradients of all the samples in the right child. sum_hessian_right : float - The sum of the hessians of all the samples in the right child - n_samples_left : int - The number of samples in the left child + The sum of the hessians of all the samples in the right child. + n_samples_left : int, default=0 + The number of samples in the left child. n_samples_right : int - The number of samples in the right child + The number of samples in the right child. """ def __init__(self, gain=-1., feature_idx=0, bin_idx=0, sum_gradient_left=0., sum_hessian_left=0., @@ -89,9 +89,9 @@ cdef class Splitter: Parameters ---------- - X_binned : ndarray of int, shape(n_samples, n_features) + X_binned : ndarray of int, shape (n_samples, n_features) The binned input samples. Must be Fortran-aligned. - max_bins : int, optional (default=256) + max_bins : int The maximum number of bins. Used to define the shape of the histograms. actual_n_bins : ndarray, shape (n_features,) @@ -99,16 +99,16 @@ cdef class Splitter: equal to max_bins. 
l2_regularization : float The L2 regularization parameter. - min_hessian_to_split : float + min_hessian_to_split : float, default=1e-3 The minimum sum of hessians needed in each node. Splits that result in at least one child having a sum of hessians less than min_hessian_to_split are discarded. - min_samples_leaf : int + min_samples_leaf : int, default=20 The minimum number of samples per leaf. - min_gain_to_split : float + min_gain_to_split : float, default=0.0 The minimum gain needed to split a node. Splits with lower gain will be ignored. - hessians_are_constant: bool + hessians_are_constant: bool, default is False Whether hessians are constant. """ cdef public: @@ -172,7 +172,7 @@ cdef class Splitter: Parameters ---------- split_info : SplitInfo - The SplitInfo of the node to split + The SplitInfo of the node to split. sample_indices : ndarray of unsigned int, shape (n_samples_at_node,) The indices of the samples at the node to split. This is a view on self.partition, and it is modified inplace by placing the @@ -181,14 +181,14 @@ cdef class Splitter: Returns ------- - left_indices : array of int + left_indices : ndarray of int, shape (n_left_samples,) The indices of the samples in the left child. This is a view on self.partition. - right_indices : array of int + right_indices : ndarray of int, shape (n_right_samples,) The indices of the samples in the right child. This is a view on self.partition. right_child_position : int - The position of the right child in ``sample_indices`` + The position of the right child in ``sample_indices``. """ # This is a multi-threaded implementation inspired by lightgbm. Here # is a quick break down. Let's suppose we want to split a node with 24 @@ -333,8 +333,8 @@ cdef class Splitter: ---------- sample_indices : ndarray of unsigned int, shape (n_samples_at_node,) The indices of the samples at the node to split. - histograms : array of HISTOGRAM_DTYPE of \ - shape(n_features, max_bins) + histograms : ndarray of HISTOGRAM_DTYPE of \ + shape (n_features, max_bins) The histograms of the current node. sum_gradients : float The sum of the gradients for each sample at the node. @@ -367,7 +367,7 @@ cdef class Splitter: sum_gradients, sum_hessians) split_infos[feature_idx] = split_info - # then compute best possible split among all feature + # then compute best possible split among all features best_feature_idx = self._find_best_feature_to_split_helper( split_infos) split_info = split_infos[best_feature_idx] From bd72a4b8497484d146e6713e550890803aa0b750 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 15 Apr 2019 10:40:19 -0400 Subject: [PATCH 193/247] pep8 --- sklearn/ensemble/_hist_gradient_boosting/loss.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index aef7aa67d566c..aede995978bce 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -44,11 +44,11 @@ def init_gradients_and_hessians(self, n_samples, prediction_dim): Returns ------- gradients : ndarray, shape (prediction_dim, n_samples) - The initial gradients. Note that the array as not been zero-initialized. + The initial gradients. The array is not initialized. hessians : ndarray, shape (prediction_dim, n_samples) If hessians are constant (e.g. for `LeastSquares` loss, the - array is initialized to ``1``. Otherwise, the array is allocated without - being zero-initialized. + array is initialized to ``1``. 
Otherwise, the array is allocated + without being initialized. """ shape = (prediction_dim, n_samples) gradients = np.empty(shape=shape, dtype=G_H_DTYPE) From 2e24b71b42d70decebfd50ad6f3aee446f04a4ac Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 15 Apr 2019 10:42:00 -0400 Subject: [PATCH 194/247] minor docstring --- sklearn/ensemble/_hist_gradient_boosting/histogram.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx index 1c4f47851c47a..cf7d0fd7a7607 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx @@ -111,7 +111,7 @@ cdef class HistogramBuilder: Parameters ---------- - sample_indices : array of int + sample_indices : array of int, shape (n_samples_at_node,) The indices of the samples at the node to split. Returns From a7766faf6f4583f7e5c0a6581096047c85992224 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 15 Apr 2019 11:38:55 -0400 Subject: [PATCH 195/247] removed explicit type conversion and copy=False not supported in all versions --- sklearn/ensemble/_hist_gradient_boosting/loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index aede995978bce..88f4f1f7a08a4 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -128,7 +128,7 @@ def __call__(self, y_true, raw_predictions, average=True): return loss.mean() if average else loss def get_baseline_prediction(self, y_train, prediction_dim): - return np.mean(y_train).astype(Y_DTYPE, copy=False) + return np.mean(y_train) @staticmethod def inverse_link_function(raw_predictions): From 1536120ac6ef7e59bbc785ff485b9c18a9034b01 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 15 Apr 2019 13:59:21 -0400 Subject: [PATCH 196/247] Update sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py Co-Authored-By: NicolasHug --- sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index 86572cd359a70..76499f12e6e8e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -75,7 +75,8 @@ def test_find_binning_thresholds_low_n_bins(): def test_find_binning_thresholds_invalid_n_bins(): - with pytest.raises(ValueError): + err_msg = 'no smaller than 2 and no larger than 256' + with pytest.raises(ValueError, match=err_msg): _find_binning_thresholds(DATA, max_bins=1024) From 7e4a88bad4a242130fa104926a6cf5a8f0adfca8 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 15 Apr 2019 14:23:18 -0400 Subject: [PATCH 197/247] changed min_samples_leaf default back to 20, and updated set_checking_parameters accordingly --- .../ensemble/_hist_gradient_boosting/gradient_boosting.py | 8 ++++---- sklearn/utils/estimator_checks.py | 4 ++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 29ceea288ab22..50547a266db2d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ 
b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -517,7 +517,7 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): The maximum depth of each tree. The depth of a tree is the number of nodes to go from the root to the deepest leaf. Depth isn't constrained by default. - min_samples_leaf : int, optional (default=5) + min_samples_leaf : int, optional (default=20) The minimum number of samples per leaf. l2_regularization : float, optional (default=0) The L2 regularization parameter. Use ``0`` for no regularization @@ -591,7 +591,7 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): def __init__(self, loss='least_squares', learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, - min_samples_leaf=5, l2_regularization=0., max_bins=256, + min_samples_leaf=20, l2_regularization=0., max_bins=256, scoring=None, validation_fraction=0.1, n_iter_no_change=None, tol=1e-7, verbose=0, random_state=None): super(HistGradientBoostingRegressor, self).__init__( @@ -668,7 +668,7 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, The maximum depth of each tree. The depth of a tree is the number of nodes to go from the root to the deepest leaf. Depth isn't constrained by default. - min_samples_leaf : int, optional (default=5) + min_samples_leaf : int, optional (default=20) The minimum number of samples per leaf. l2_regularization : float, optional (default=0) The L2 regularization parameter. Use 0 for no regularization. @@ -742,7 +742,7 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, 'auto') def __init__(self, loss='auto', learning_rate=0.1, max_iter=100, - max_leaf_nodes=31, max_depth=None, min_samples_leaf=5, + max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0., max_bins=256, scoring=None, validation_fraction=0.1, n_iter_no_change=None, tol=1e-7, verbose=0, random_state=None): diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index fc2e6b9a1c1bc..fccbe695eb17c 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -396,6 +396,10 @@ def set_checking_parameters(estimator): # which is more feature than we have in most case. 
estimator.set_params(k=1) + if name in ('HistGradientBoostingClassifier', + 'HistGradientBoostingRegressor'): + estimator.set_params(min_samples_leaf=5) + class NotAnArray: """An object that is convertible to an array From b4ce89096de7d4fed249c36d4bea9a8726df9994 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 15 Apr 2019 18:53:13 -0400 Subject: [PATCH 198/247] added check for bin mapper for wrong n_features at transform --- .../ensemble/_hist_gradient_boosting/tests/test_binning.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index 76499f12e6e8e..4f4def6199411 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -80,6 +80,13 @@ def test_find_binning_thresholds_invalid_n_bins(): _find_binning_thresholds(DATA, max_bins=1024) +def test_bin_mapper_n_features_transform(): + mapper = _BinMapper(max_bins=42, random_state=42).fit(DATA) + err_msg = 'This estimator was fitted with 2 features but 4 got passed' + with pytest.raises(ValueError, match=err_msg): + mapper.transform(np.repeat(DATA, 2, axis=1)) + + @pytest.mark.parametrize('n_bins', [16, 128, 256]) def test_map_to_bins(n_bins): bin_thresholds = _find_binning_thresholds(DATA, max_bins=n_bins, From c272fd0a657a09f69491c22202c24518f9806a50 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 15 Apr 2019 19:00:11 -0400 Subject: [PATCH 199/247] Adjusted early stopping tests now taht min_samples_leaf default has changed --- .../gradient_boosting.py | 2 +- .../tests/test_gradient_boosting.py | 21 ++++++++++++------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 50547a266db2d..a56acc8f2a9a1 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -584,7 +584,7 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): >>> X, y = load_boston(return_X_y=True) >>> est = HistGradientBoostingRegressor().fit(X, y) >>> est.score(X, y) - 0.99... + 0.98... 
""" _VALID_LOSSES = ('least_squares',) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 0f9199fb2ceb6..200f1d91f4bfe 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -105,13 +105,16 @@ def test_early_stopping_regression(scoring, validation_fraction, X, y = make_regression(random_state=0) - gb = HistGradientBoostingRegressor(verbose=1, # just for coverage - scoring=scoring, - tol=tol, - validation_fraction=validation_fraction, - max_iter=max_iter, - n_iter_no_change=n_iter_no_change, - random_state=0) + gb = HistGradientBoostingRegressor( + verbose=1, # just for coverage + min_samples_leaf=5, # easier to overfit fast + scoring=scoring, + tol=tol, + validation_fraction=validation_fraction, + max_iter=max_iter, + n_iter_no_change=n_iter_no_change, + random_state=0 + ) gb.fit(X, y) if n_iter_no_change is not None: @@ -143,12 +146,14 @@ def test_early_stopping_classification(data, scoring, validation_fraction, gb = HistGradientBoostingClassifier( verbose=1, # just for coverage + min_samples_leaf=5, # easier to overfit fast scoring=scoring, tol=tol, validation_fraction=validation_fraction, max_iter=max_iter, n_iter_no_change=n_iter_no_change, - random_state=0) + random_state=0 + ) gb.fit(X, y) if n_iter_no_change is not None: From 22ce4fa201847c41e6cddd0b9d14fb3cf0958239 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 15 Apr 2019 19:00:34 -0400 Subject: [PATCH 200/247] changed confusing should_stop test --- .../_hist_gradient_boosting/tests/test_gradient_boosting.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 200f1d91f4bfe..93d4d866617b7 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -180,10 +180,8 @@ def should_stop(scores, n_iter_no_change, tol): assert not should_stop([1, 2, 3, 4, 5, 6], n_iter_no_change=5, tol=0.001) assert not should_stop([1, 2, 3, 4, 5, 6], n_iter_no_change=5, tol=0.) assert not should_stop([1, 2, 3, 4, 5, 6], n_iter_no_change=5, tol=0.999) - assert not should_stop([1, 2, 3, 4, 5, 6], n_iter_no_change=5, - tol=5 - 1e-5) # no significant progress according to tol assert should_stop([1] * 6, n_iter_no_change=5, tol=0.) 
assert should_stop([1] * 6, n_iter_no_change=5, tol=0.001) - assert should_stop([1, 2, 3, 4, 5, 6], n_iter_no_change=5, tol=5) + assert should_stop([1, 2, 3, 4, 5, 6], n_iter_no_change=5, tol=1.001) From 11f55739ca1c74b4d109638dff898741607e9200 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 16 Apr 2019 06:57:58 -0400 Subject: [PATCH 201/247] fixed again should_stop test --- .../_hist_gradient_boosting/tests/test_gradient_boosting.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 93d4d866617b7..d5280fa211c96 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -179,9 +179,10 @@ def should_stop(scores, n_iter_no_change, tol): # still making significant progress up to tol assert not should_stop([1, 2, 3, 4, 5, 6], n_iter_no_change=5, tol=0.001) assert not should_stop([1, 2, 3, 4, 5, 6], n_iter_no_change=5, tol=0.) - assert not should_stop([1, 2, 3, 4, 5, 6], n_iter_no_change=5, tol=0.999) + assert not should_stop([1, 2, 3, 4, 5, 6], n_iter_no_change=5, + tol=5 - 1e-5) # no significant progress according to tol assert should_stop([1] * 6, n_iter_no_change=5, tol=0.) assert should_stop([1] * 6, n_iter_no_change=5, tol=0.001) - assert should_stop([1, 2, 3, 4, 5, 6], n_iter_no_change=5, tol=1.001) + assert should_stop([1, 2, 3, 4, 5, 6], n_iter_no_change=5, tol=5) From 0bb5a9fab13694072e24b57dbf3f0655c1adaa33 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 16 Apr 2019 09:50:52 -0400 Subject: [PATCH 202/247] Addressed comments --- .../gradient_boosting.py | 10 +- .../tests/test_gradient_boosting.py | 137 ++++++------------ 2 files changed, 47 insertions(+), 100 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index a56acc8f2a9a1..06d40b59f0957 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -138,15 +138,7 @@ def fit(self, X, y): X_binned_train, X_binned_val, y_train, y_val = train_test_split( X_binned, y, test_size=self.validation_fraction, stratify=stratify, random_state=rng) - if X_binned_train.size == 0 or X_binned_val.size == 0: - raise ValueError( - 'Not enough data (n_samples={}) to ' - 'perform early stopping with validation_fraction=' - '{}. Use more training data or ' - 'adjust validation_fraction.'.format( - X_binned.shape[0], - self.validation_fraction) - ) + # Predicting is faster of C-contiguous arrays, training is faster # on Fortran arrays. 
X_binned_val = np.ascontiguousarray(X_binned_val) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index d5280fa211c96..7bd5cf835c32d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -14,77 +14,35 @@ (HistGradientBoostingClassifier, X_classification, y_classification), (HistGradientBoostingRegressor, X_regression, y_regression) ]) -def test_init_parameters_validation(GradientBoosting, X, y): - - with pytest.raises( - ValueError, - match="Loss blah is not supported for"): - GradientBoosting(loss='blah').fit(X, y) - - for learning_rate in (-1, 0): - with pytest.raises( - ValueError, - match="learning_rate={} must be strictly positive".format( - learning_rate)): - GradientBoosting(learning_rate=learning_rate).fit(X, y) - - with pytest.raises( - ValueError, - match="max_iter=0 must not be smaller than 1"): - GradientBoosting(max_iter=0).fit(X, y) - - with pytest.raises( - ValueError, - match="max_leaf_nodes=0 should not be smaller than 1"): - GradientBoosting(max_leaf_nodes=0).fit(X, y) - - with pytest.raises( - ValueError, - match="max_depth=0 should not be smaller than 1"): - GradientBoosting(max_depth=0).fit(X, y) - - with pytest.raises( - ValueError, - match="min_samples_leaf=0 should not be smaller than 1"): - GradientBoosting(min_samples_leaf=0).fit(X, y) - - with pytest.raises( - ValueError, - match="l2_regularization=-1 must be positive"): - GradientBoosting(l2_regularization=-1).fit(X, y) - - for max_bins in (1, 257): - with pytest.raises( - ValueError, - match="max_bins={} should be no smaller than 2 and " - "no larger".format(max_bins)): - GradientBoosting(max_bins=max_bins).fit(X, y) - - with pytest.raises( - ValueError, - match="n_iter_no_change=-1 must be positive"): - GradientBoosting(n_iter_no_change=-1).fit(X, y) - - for validation_fraction in (-1, 0): - with pytest.raises( - ValueError, - match="validation_fraction={} must be strictly positive".format( - validation_fraction)): - GradientBoosting(validation_fraction=validation_fraction).fit(X, y) - - with pytest.raises( - ValueError, - match="tol=-1 must not be smaller than 0"): - GradientBoosting(tol=-1).fit(X, y) +@pytest.mark.parametrize( + 'params, err_msg', + [({'loss': 'blah'}, 'Loss blah is not supported for'), + ({'learning_rate': 0}, 'learning_rate=0 must be strictly positive'), + ({'learning_rate': -1}, 'learning_rate=-1 must be strictly positive'), + ({'max_iter': 0}, 'max_iter=0 must not be smaller than 1'), + ({'max_leaf_nodes': 0}, 'max_leaf_nodes=0 should not be smaller than 1'), + ({'max_depth': 0}, 'max_depth=0 should not be smaller than 1'), + ({'min_samples_leaf': 0}, 'min_samples_leaf=0 should not be smaller'), + ({'l2_regularization': -1}, 'l2_regularization=-1 must be positive'), + ({'max_bins': 1}, 'max_bins=1 should be no smaller than 2 and no larger'), + ({'max_bins': 257}, 'max_bins=257 should be no smaller than 2 and no'), + ({'n_iter_no_change': -1}, 'n_iter_no_change=-1 must be positive'), + ({'validation_fraction': -1}, 'validation_fraction=-1 must be strictly'), + ({'validation_fraction': 0}, 'validation_fraction=0 must be strictly'), + ({'tol': -1}, 'tol=-1 must not be smaller than 0')] +) +def test_init_parameters_validation(GradientBoosting, X, y, params, err_msg): + + with pytest.raises(ValueError, match=err_msg): + GradientBoosting(**params).fit(X, y) def 
test_invalid_classification_loss(): binary_clf = HistGradientBoostingClassifier(loss="binary_crossentropy") - with pytest.raises( - ValueError, - match="loss='binary_crossentropy' is not defined for multiclass" - " classification with n_classes=3, use" - " loss='categorical_crossentropy' instead"): + err_msg = ("loss='binary_crossentropy' is not defined for multiclass " + "classification with n_classes=3, use " + "loss='categorical_crossentropy' instead") + with pytest.raises(ValueError, match=err_msg): binary_clf.fit(np.zeros(shape=(3, 2)), np.arange(3)) @@ -162,27 +120,24 @@ def test_early_stopping_classification(data, scoring, validation_fraction, assert gb.n_iter_ == max_iter -def test_should_stop(): - - def should_stop(scores, n_iter_no_change, tol): - gbdt = HistGradientBoostingClassifier( - n_iter_no_change=n_iter_no_change, - tol=tol) - return gbdt._should_stop(scores) - - # not enough iterations - assert not should_stop([], n_iter_no_change=1, tol=0.001) - - assert not should_stop([1, 1, 1], n_iter_no_change=5, tol=0.001) - assert not should_stop([1] * 5, n_iter_no_change=5, tol=0.001) - - # still making significant progress up to tol - assert not should_stop([1, 2, 3, 4, 5, 6], n_iter_no_change=5, tol=0.001) - assert not should_stop([1, 2, 3, 4, 5, 6], n_iter_no_change=5, tol=0.) - assert not should_stop([1, 2, 3, 4, 5, 6], n_iter_no_change=5, - tol=5 - 1e-5) - - # no significant progress according to tol - assert should_stop([1] * 6, n_iter_no_change=5, tol=0.) - assert should_stop([1] * 6, n_iter_no_change=5, tol=0.001) - assert should_stop([1, 2, 3, 4, 5, 6], n_iter_no_change=5, tol=5) +@pytest.mark.parametrize( + 'scores, n_iter_no_change, tol, stopping', + [ + ([], 1, 0.001, False), # not enough iterations + ([1, 1, 1], 5, 0.001, False), # not enough iterations + ([1, 1, 1, 1, 1], 5, 0.001, False), # not enough iterations + ([1, 2, 3, 4, 5, 6], 5, 0.001, False), # significant improvement + ([1, 2, 3, 4, 5, 6], 5, 0., False), # significant improvement + ([1, 2, 3, 4, 5, 6], 5, 0.999, False), # significant improvement + ([1, 2, 3, 4, 5, 6], 5, 5 - 1e-5, False), # significant improvement + ([1] * 6, 5, 0., True), # no significant improvement + ([1] * 6, 5, 0.001, True), # no significant improvement + ([1] * 6, 5, 5, True), # no significant improvement + ] +) +def test_should_stop(scores, n_iter_no_change, tol, stopping): + + gbdt = HistGradientBoostingClassifier( + n_iter_no_change=n_iter_no_change, tol=tol + ) + assert gbdt._should_stop(scores) == stopping From 2c461d65a44e8128f453354f0e46d103ecafba46 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 16 Apr 2019 10:07:08 -0400 Subject: [PATCH 203/247] forces max_depth and max_leaf_nodes >= 2 and added max_depth test --- .../gradient_boosting.py | 16 ++++++------- .../_hist_gradient_boosting/grower.py | 16 ++++--------- .../tests/test_gradient_boosting.py | 6 +++-- .../tests/test_grower.py | 23 +++++++++++++++++++ 4 files changed, 39 insertions(+), 22 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 06d40b59f0957..fa5fc0c992a48 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -503,12 +503,12 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): The maximum number of iterations of the boosting process, i.e. the maximum number of trees. 
max_leaf_nodes : int or None, optional (default=31) - The maximum number of leaves for each tree. If None, there is no - maximum limit. + The maximum number of leaves for each tree. Must be strictly greater + than 1. If None, there is no maximum limit. max_depth : int or None, optional (default=None) The maximum depth of each tree. The depth of a tree is the number of - nodes to go from the root to the deepest leaf. Depth isn't constrained - by default. + nodes to go from the root to the deepest leaf. Must be strictly greater + than 1. Depth isn't constrained by default. min_samples_leaf : int, optional (default=20) The minimum number of samples per leaf. l2_regularization : float, optional (default=0) @@ -654,12 +654,12 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, maximum number of trees for binary classification. For multiclass classification, `n_classes` trees per iteration are built. max_leaf_nodes : int or None, optional (default=31) - The maximum number of leaves for each tree. If None, there is no - maximum limit. + The maximum number of leaves for each tree. Must be strictly greater + than 1. If None, there is no maximum limit. max_depth : int or None, optional (default=None) The maximum depth of each tree. The depth of a tree is the number of - nodes to go from the root to the deepest leaf. Depth isn't constrained - by default. + nodes to go from the root to the deepest leaf. Must be strictly greater + than 1. Depth isn't constrained by default. min_samples_leaf : int, optional (default=20) The minimum number of samples per leaf. l2_regularization : float, optional (default=0) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index c6d8870bcbb5b..7a4fe78bc74c6 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -105,8 +105,6 @@ def __lt__(self, other_node): other_node : TreeNode The node to compare with. """ - if self.split_info is None or other_node.split_info is None: - raise ValueError("Cannot compare nodes without split_info") return self.split_info.gain > other_node.split_info.gain @@ -212,12 +210,12 @@ def _validate_parameters(self, X_binned, max_leaf_nodes, max_depth, raise ValueError( "X_binned should be passed as Fortran contiguous " "array for maximum efficiency.") - if max_leaf_nodes is not None and max_leaf_nodes < 1: + if max_leaf_nodes is not None and max_leaf_nodes <= 1: raise ValueError('max_leaf_nodes={} should not be' - ' smaller than 1'.format(max_leaf_nodes)) - if max_depth is not None and max_depth < 1: + ' smaller than 2'.format(max_leaf_nodes)) + if max_depth is not None and max_depth <= 1: raise ValueError('max_depth={} should not be' - ' smaller than 1'.format(max_depth)) + ' smaller than 2'.format(max_depth)) if min_samples_leaf < 1: raise ValueError('min_samples_leaf={} should ' 'not be smaller than 1'.format(min_samples_leaf)) @@ -255,9 +253,6 @@ def _intilialize_root(self, gradients, hessians, hessians_are_constant): self.root.partition_start = 0 self.root.partition_stop = n_samples - if self.max_leaf_nodes is not None and self.max_leaf_nodes == 1: - self._finalize_leaf(self.root) - return if self.root.n_samples < 2 * self.min_samples_leaf: # Do not even bother computing any splitting statistics. self._finalize_leaf(self.root) @@ -298,9 +293,6 @@ def split_next(self): right : TreeNode The resulting right child. 
""" - if not self.splittable_nodes: - raise StopIteration("No more splittable nodes") - # Consider the node with the highest loss reduction (a.k.a. gain) node = heappop(self.splittable_nodes) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 7bd5cf835c32d..12ef2ea7a4cae 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -20,8 +20,10 @@ ({'learning_rate': 0}, 'learning_rate=0 must be strictly positive'), ({'learning_rate': -1}, 'learning_rate=-1 must be strictly positive'), ({'max_iter': 0}, 'max_iter=0 must not be smaller than 1'), - ({'max_leaf_nodes': 0}, 'max_leaf_nodes=0 should not be smaller than 1'), - ({'max_depth': 0}, 'max_depth=0 should not be smaller than 1'), + ({'max_leaf_nodes': 0}, 'max_leaf_nodes=0 should not be smaller than 2'), + ({'max_leaf_nodes': 1}, 'max_leaf_nodes=1 should not be smaller than 2'), + ({'max_depth': 0}, 'max_depth=0 should not be smaller than 2'), + ({'max_depth': 1}, 'max_depth=1 should not be smaller than 2'), ({'min_samples_leaf': 0}, 'min_samples_leaf=0 should not be smaller'), ({'l2_regularization': -1}, 'l2_regularization=-1 must be positive'), ({'max_bins': 1}, 'max_bins=1 should be no smaller than 2 and no larger'), diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index f4bd4e196de03..30570fa828bad 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -260,6 +260,29 @@ def test_min_samples_leaf_root(n_samples, min_samples_leaf): assert len(grower.finalized_leaves) == 1 +@pytest.mark.parametrize('max_depth', [2, 3]) +def test_max_depth(max_depth): + # Make sure max_depth parameter works as expected + rng = np.random.RandomState(seed=0) + + max_bins = 255 + n_samples = 1000 + + # data = linear target, 3 features, 1 irrelevant. 
+ X = rng.normal(size=(n_samples, 3)) + y = X[:, 0] - X[:, 1] + mapper = _BinMapper(max_bins=max_bins) + X = mapper.fit_transform(X) + + all_gradients = y.astype(G_H_DTYPE) + all_hessians = np.ones(shape=1, dtype=G_H_DTYPE) + grower = TreeGrower(X, all_gradients, all_hessians, max_depth=max_depth) + grower.grow() + + depth = max(leaf.depth for leaf in grower.finalized_leaves) + assert depth == max_depth + + def test_init_parameters_validation(): X_binned, all_gradients, all_hessians = _make_training_data() From 45a1d050d2a6195c261016abe774e0775d8f60eb Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 16 Apr 2019 10:36:15 -0400 Subject: [PATCH 204/247] addressed comments --- .../_hist_gradient_boosting/grower.py | 2 +- .../tests/test_grower.py | 33 +++++++++---------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 7a4fe78bc74c6..ce7ac7116030a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -205,7 +205,7 @@ def _validate_parameters(self, X_binned, max_leaf_nodes, max_depth, """ if X_binned.dtype != np.uint8: raise NotImplementedError( - "Explicit feature binning required for now") + "X_binned must be of type uint8.") if not X_binned.flags.f_contiguous: raise ValueError( "X_binned should be passed as Fortran contiguous " diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index 30570fa828bad..49b19ce2778dd 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -1,5 +1,4 @@ import numpy as np -from numpy.testing import assert_array_almost_equal import pytest from pytest import approx @@ -30,10 +29,7 @@ def true_decision_function(input_features): if input_features[0] <= n_bins // 2: return -1 else: - if input_features[1] <= n_bins // 3: - return -1 - else: - return 1 + return -1 if input_features[1] <= n_bins // 3 else 1 target = np.array([true_decision_function(x) for x in X_binned], dtype=Y_DTYPE) @@ -41,14 +37,15 @@ def true_decision_function(input_features): # Assume a square loss applied to an initial model that always predicts 0 # (hardcoded for this test): all_gradients = target.astype(G_H_DTYPE) - if constant_hessian: - all_hessians = np.ones(shape=1, dtype=G_H_DTYPE) - else: - all_hessians = np.ones_like(all_gradients) + shape_hessians = 1 if constant_hessian else all_gradients.shape + all_hessians = np.ones(shape=shape_hessians, dtype=G_H_DTYPE) + return X_binned, all_gradients, all_hessians def _check_children_consistency(parent, left, right): + # Make sure the samples are correctly dispatched from a parent to its + # children assert parent.left_child is left assert parent.right_child is right @@ -162,6 +159,7 @@ def test_predictor_from_grower(): assert predictor.nodes['is_leaf'].sum() == 3 # Probe some predictions for each leaf of the tree + # each group of 3 samples corresponds to a condition in _make_training_data input_data = np.array([ [0, 0], [42, 99], @@ -177,11 +175,11 @@ def test_predictor_from_grower(): ], dtype=np.uint8) predictions = predictor.predict_binned(input_data) expected_targets = [1, 1, 1, 1, 1, 1, -1, -1, -1] - assert_array_almost_equal(predictions, expected_targets, decimal=5) + assert np.allclose(predictions, expected_targets) # Check that training set can be recovered exactly: 
predictions = predictor.predict_binned(X_binned) - assert_array_almost_equal(predictions, -all_gradients, decimal=5) + assert np.allclose(predictions, -all_gradients) @pytest.mark.parametrize( @@ -209,10 +207,8 @@ def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins, X = mapper.fit_transform(X) all_gradients = y.astype(G_H_DTYPE) - if constant_hessian: - all_hessians = np.ones(shape=1, dtype=G_H_DTYPE) - else: - all_hessians = np.ones_like(all_gradients) + shape_hessian = 1 if constant_hessian else all_gradients.shape + all_hessians = np.ones(shape=shape_hessian, dtype=G_H_DTYPE) grower = TreeGrower(X, all_gradients, all_hessians, max_bins=n_bins, shrinkage=1., min_samples_leaf=min_samples_leaf, @@ -283,13 +279,13 @@ def test_max_depth(max_depth): assert depth == max_depth -def test_init_parameters_validation(): +def test_input_validation(): X_binned, all_gradients, all_hessians = _make_training_data() X_binned_float = X_binned.astype(np.float32) with pytest.raises(NotImplementedError, - match="Explicit feature binning required for now"): + match="X_binned must be of type uint8"): TreeGrower(X_binned_float, all_gradients, all_hessians) X_binned_C_array = np.ascontiguousarray(X_binned) @@ -298,6 +294,9 @@ def test_init_parameters_validation(): match="X_binned should be passed as Fortran contiguous array"): TreeGrower(X_binned_C_array, all_gradients, all_hessians) + +def test_init_parameters_validation(): + X_binned, all_gradients, all_hessians = _make_training_data() with pytest.raises(ValueError, match="min_gain_to_split=-1 must be positive"): From dcce26b04286fb8713dec0df8c322cd20a056dbc Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 16 Apr 2019 12:42:41 -0400 Subject: [PATCH 205/247] Addressed comments --- .../tests/test_histogram.py | 2 ++ .../tests/test_loss.py | 20 +++++++++---------- .../tests/test_splitting.py | 18 +++++++---------- 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py index 20a04c46d4d99..c425a0389a789 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py @@ -50,6 +50,8 @@ def test_build_histogram(build_func): def test_histogram_sample_order_independence(): + # Make sure the order of the samples has no impact on the histogram + # computations rng = np.random.RandomState(42) n_sub_samples = 100 n_samples = 1000 diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index 8430e084775bf..575095beb4883 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -1,8 +1,8 @@ import numpy as np from numpy.testing import assert_almost_equal -import scipy from scipy.optimize import newton from sklearn.utils import assert_all_finite +from sklearn.utils.fixes import sp_version import pytest from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES @@ -29,7 +29,7 @@ def get_hessians(y_true, raw_predictions): loss.update_gradients_and_hessians(gradients, hessians, y_true, raw_predictions) - if loss.__class__ is _LOSSES['least_squares']: + if loss.__class__.__name__ == 'LeastSquares': # hessians aren't updated because they're constant: # the value is 1 because the loss is actually an half # least squares loss. 
@@ -49,7 +49,7 @@ def get_hessians(y_true, raw_predictions): ('binary_crossentropy', -12, 1), ('binary_crossentropy', 30, 1), ]) -@pytest.mark.skipif(scipy.__version__.split('.')[:3] == ['1', '2', '0'], +@pytest.mark.skipif(sp_version == (1, 2, 0), reason='bug in scipy 1.2.0, see scipy issue #9608') @pytest.mark.skipif(Y_DTYPE != np.float64, reason='Newton internally uses float64 != Y_DTYPE') @@ -117,7 +117,7 @@ def test_numerical_gradients(loss, n_classes, prediction_dim): offset[0, :] = eps f_plus_eps = loss(y_true, raw_predictions + offset / 2, average=False) f_minus_eps = loss(y_true, raw_predictions - offset / 2, average=False) - numerical_gradient = (f_plus_eps - f_minus_eps) / eps + numerical_gradients = (f_plus_eps - f_minus_eps) / eps # Approximate hessians eps = 1e-4 # need big enough eps as we divide by its square @@ -130,8 +130,8 @@ def test_numerical_gradients(loss, n_classes, prediction_dim): def relative_error(a, b): return np.abs(a - b) / np.maximum(np.abs(a), np.abs(b)) - assert np.all(relative_error(numerical_gradient, gradients) < 1e-5) - assert np.all(relative_error(numerical_hessians, hessians) < 1e-5) + assert np.allclose(numerical_gradients, gradients, rtol=1e-5) + assert np.allclose(numerical_hessians, hessians, rtol=1e-5) def test_baseline_least_squares(): @@ -154,8 +154,8 @@ def test_baseline_binary_crossentropy(): y_train = y_train.astype(np.float64) baseline_prediction = loss.get_baseline_prediction(y_train, 1) assert_all_finite(baseline_prediction) - assert_almost_equal(loss.inverse_link_function(baseline_prediction), - y_train[0], decimal=6) + assert np.allclose(loss.inverse_link_function(baseline_prediction), + y_train[0]) # Make sure baseline prediction is equal to link_function(p), where p # is the proba of the positive class. 
We want predict_proba() to return p, @@ -167,7 +167,7 @@ def test_baseline_binary_crossentropy(): assert baseline_prediction.shape == tuple() # scalar assert baseline_prediction.dtype == y_train.dtype p = y_train.mean() - assert_almost_equal(baseline_prediction, np.log(p / (1 - p))) + assert np.allclose(baseline_prediction, np.log(p / (1 - p))) def test_baseline_categorical_crossentropy(): @@ -189,4 +189,4 @@ def test_baseline_categorical_crossentropy(): assert baseline_prediction.shape == (prediction_dim, 1) for k in range(prediction_dim): p = (y_train == k).mean() - assert_almost_equal(baseline_prediction[k, :], np.log(p)) + assert np.allclose(baseline_prediction[k, :], np.log(p)) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index 92b1ea7262853..d34f5ef064137 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -1,6 +1,4 @@ import numpy as np -from numpy.testing import assert_almost_equal -from numpy.testing import assert_array_almost_equal import pytest from sklearn.ensemble._hist_gradient_boosting.types import HISTOGRAM_DTYPE @@ -134,8 +132,8 @@ def test_gradient_and_hessian_sanity(constant_hessian): else: expected_hessian = all_hessians[indices].sum() - assert_almost_equal(gradient, expected_gradient, decimal=3) - assert_almost_equal(hessian, expected_hessian, decimal=3) + assert np.isclose(gradient, expected_gradient) + assert np.isclose(hessian, expected_hessian) # make sure sum of gradients in histograms are the same for all features, # and make sure they're equal to their expected value @@ -158,8 +156,8 @@ def test_gradient_and_hessian_sanity(constant_hessian): else: expected_hessian = all_hessians[indices].sum() - assert_almost_equal(gradients, expected_gradient, decimal=4) - assert_almost_equal(hessians, expected_hessian, decimal=4) + assert np.allclose(gradients, expected_gradient) + assert np.allclose(hessians, expected_hessian) def test_split_indices(): @@ -203,7 +201,7 @@ def test_split_indices(): min_samples_leaf, min_gain_to_split, hessians_are_constant) - assert_array_almost_equal(sample_indices, splitter.partition) + assert np.all(sample_indices == splitter.partition) histograms = builder.compute_histograms_brute(sample_indices) si_root = splitter.find_node_split(sample_indices, histograms, @@ -218,10 +216,8 @@ def test_split_indices(): assert set(samples_left) == set([0, 1, 3, 4, 5, 6, 8]) assert set(samples_right) == set([2, 7, 9]) - assert_array_almost_equal(samples_left, - splitter.partition[:position_right]) - assert_array_almost_equal(samples_right, - splitter.partition[position_right:]) + assert list(samples_left) == list(splitter.partition[:position_right]) + assert list(samples_right) == list(splitter.partition[position_right:]) # Check that the resulting split indices sizes are consistent with the # count statistics anticipated when looking for the best split. 
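
The numerical gradient checks touched above compare the analytical gradients returned by the loss objects against central finite differences of the loss values, with a relative tolerance of 1e-5. As a minimal, self-contained sketch of that idea — this is not the scikit-learn test itself; the half least squares loss, the random data and the epsilon used below are only illustrative — the check amounts to::

    import numpy as np

    def half_least_squares(y_true, raw_predictions):
        # 0.5 * (y - p) ** 2 per sample, matching the "half" least squares
        # convention mentioned in the diff above
        return 0.5 * (y_true - raw_predictions) ** 2

    rng = np.random.RandomState(0)
    y_true = rng.normal(size=100)
    raw_predictions = rng.normal(size=100)

    # analytical gradient of the half least squares loss w.r.t. p: p - y
    gradients = raw_predictions - y_true

    # central finite differences: (f(p + eps/2) - f(p - eps/2)) / eps
    eps = 1e-6
    f_plus = half_least_squares(y_true, raw_predictions + eps / 2)
    f_minus = half_least_squares(y_true, raw_predictions - eps / 2)
    numerical_gradients = (f_plus - f_minus) / eps

    assert np.allclose(numerical_gradients, gradients, rtol=1e-5)

Central differences are used because their truncation error shrinks like eps**2, so a moderately small eps already gives estimates accurate enough for a comparison at rtol=1e-5.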
From f4ac9292bbd28d391d9fd8f191f561e60ad63c12 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 17 Apr 2019 12:38:42 -0400 Subject: [PATCH 206/247] use from sklearn.experimental import enable_hist_gradient_boosting --- benchmarks/bench_hist_gradient_boosting.py | 6 ++- ...bench_hist_gradient_boosting_higgsboson.py | 4 +- doc/conf.py | 5 ++ doc/modules/classes.rst | 36 ++++++++------- doc/modules/ensemble.rst | 23 +++++++--- doc/whats_new/v0.21.rst | 23 ++++++++++ .../gradient_boosting.py | 37 ++++++++++++++- .../tests/test_compare_lightgbm.py | 6 ++- .../tests/test_gradient_boosting.py | 6 ++- sklearn/ensemble/gradient_boosting.py | 4 +- sklearn/experimental/__init__.py | 14 ++---- .../enable_hist_gradient_boosting.py | 32 +++++++++++++ .../test_enable_hist_gradient_boosting.py | 46 +++++++++++++++++++ 13 files changed, 198 insertions(+), 44 deletions(-) create mode 100644 sklearn/experimental/enable_hist_gradient_boosting.py create mode 100644 sklearn/experimental/tests/test_enable_hist_gradient_boosting.py diff --git a/benchmarks/bench_hist_gradient_boosting.py b/benchmarks/bench_hist_gradient_boosting.py index 570ee1b6adef7..8d055b22c2252 100644 --- a/benchmarks/bench_hist_gradient_boosting.py +++ b/benchmarks/bench_hist_gradient_boosting.py @@ -3,8 +3,10 @@ import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split -from sklearn.experimental import HistGradientBoostingClassifier -from sklearn.experimental import HistGradientBoostingRegressor +# To use this experimental feature, we need to explicitly ask for it: +from sklearn.experimental import enable_hist_gradient_boosting # noqa +from sklearn.ensemble import HistGradientBoostingRegressor +from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.datasets import make_classification from sklearn.datasets import make_regression from sklearn.ensemble._hist_gradient_boosting.utils import ( diff --git a/benchmarks/bench_hist_gradient_boosting_higgsboson.py b/benchmarks/bench_hist_gradient_boosting_higgsboson.py index 8832d0c7c786c..23d0e16194cc0 100644 --- a/benchmarks/bench_hist_gradient_boosting_higgsboson.py +++ b/benchmarks/bench_hist_gradient_boosting_higgsboson.py @@ -9,7 +9,9 @@ from joblib import Memory from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, roc_auc_score -from sklearn.experimental import HistGradientBoostingClassifier +# To use this experimental feature, we need to explicitly ask for it: +from sklearn.experimental import enable_hist_gradient_boosting # noqa +from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble._hist_gradient_boosting.utils import ( get_equivalent_estimator) diff --git a/doc/conf.py b/doc/conf.py index 7b8a7d19414fc..e2e4f50d9f41d 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -263,6 +263,11 @@ 'sphx_glr_plot_compare_methods_001.png': 349} +# enable experimental module so that the new GBDTs estimators can be +# discovered properly by sphinx +from sklearn.experimental import enable_hist_gradient_boosting + + def make_carousel_thumbs(app, exception): """produces the final resized carousel images""" if exception is not None: diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 9a4ed491c72dd..1740730c46fcb 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -422,6 +422,9 @@ Samples generator ensemble.RandomTreesEmbedding ensemble.VotingClassifier ensemble.VotingRegressor + ensemble.HistGradientBoostingRegressor + ensemble.HistGradientBoostingClassifier + .. 
autosummary:: :toctree: generated/ @@ -470,6 +473,22 @@ partial dependence exceptions.NonBLASDotWarning exceptions.UndefinedMetricWarning + +:mod:`sklearn.experimental`: Experimental +========================================= + +.. automodule:: sklearn.experimental + :no-members: + :no-inherited-members: + +.. currentmodule:: sklearn + +.. autosummary:: + :toctree: generated/ + + experimental.enable_hist_gradient_boosting + + .. _feature_extraction_ref: :mod:`sklearn.feature_extraction`: Feature Extraction @@ -1486,23 +1505,6 @@ Utilities from joblib: utils.parallel_backend utils.register_parallel_backend -.. _experimental_ref: - -Experimental -============ - -.. automodule:: sklearn.experimental - :no-members: - :no-inherited-members: - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - experimental.HistGradientBoostingRegressor - experimental.HistGradientBoostingClassifier Recently deprecated =================== diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 9348fe43705a2..dcf629a0ca50d 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -460,12 +460,12 @@ trees. .. note:: Scikit-learn 0.21 introduces two new experimental implementation of - gradient boosting trees, namely - :class:`sklearn.experimental.HistGradientBoostingClassifier` and - :class:`sklearn.experimental.HistGradientBoostingRegressor`. These fast - estimators first bin the input samples X into integer-valued bins - (typically 256 bins) which tremendously reduces the number of splitting - points to consider, and allow the algorithm to leverage integer-based data + gradient boosting trees, namely :class:`HistGradientBoostingClassifier` + and :class:`HistGradientBoostingRegressor`, inspired by + `LightGBM `_. These fast estimators + first bin the input samples ``X`` into integer-valued bins (typically 256 + bins) which tremendously reduces the number of splitting points to + consider, and allow the algorithm to leverage integer-based data structures (histograms) instead of relying on sorted continuous values. The new histogram-based estimators can be orders of magnitude faster than @@ -474,7 +474,16 @@ trees. different, and some of the features from :class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor` are not yet supported. - The following doc focuses on :class:`GradientBoostingClassifier` and + These new estimators are still **experimental** for now: their predictions + and their API might change without any deprecation cycle. To use them, you + need to explicitly import ``enable_hist_gradient_boosting``:: + + >>> # explicitly require this experimental feature + >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa + >>> # now you can import normally from ensemble + >>> from sklearn.ensemble import HistGradientBoostingClassifier + + The following guide focuses on :class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor` only, which might be preferred for small sample sizes since binning may lead to split points that are too approximate in this setting. diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 3175fca4747f6..e485c08608808 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -240,6 +240,29 @@ Support for Python 3.4 and below has been officially dropped. :issue:`12513` by :user:`Ramil Nugmanov ` and :user:`Mohamed Ali Jamaoui `. 
+- |MajorFeature| Add two new implementations of + gradient boosting trees: :class:`ensemble.HistGradientBoostingClassifier` + and :class:`ensemble.HistGradientBoostingRegressor`. The implementation of + these estimators is inspired by + `LightGBM `_ and can be orders of + magnitude faster than :class:`ensemble.GradientBoostingRegressor` and + :class:`ensemble.GradientBoostingClassifier` when the number of samples is + larger than tens of thousands of samples. The API of these new estimators + is slightly different, and some of the features from + :class:`ensemble.GradientBoostingClassifier` and + :class:`ensemble.GradientBoostingRegressor` are not yet supported. + + These new estimators are experimental, which means that their results or + their API might change without any deprecation cycle. To use them, you + need to explicitly import ``enable_hist_gradient_boosting``:: + + >>> # explicitly require this experimental feature + >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa + >>> # now you can import normally from ensemble + >>> from sklearn.ensemble import HistGradientBoostingClassifier + + :issue:`12807` by :user:`Nicolas Hug`. + :mod:`sklearn.externals` ........................ diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index fa5fc0c992a48..760738417ad1c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -489,6 +489,21 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): might be preferred since binning may lead to split points that are too approximate in this setting. + This implementation is inspired by + `LightGBM `_. + + .. note:: + + This estimator is still **experimental** for now: the predictions + and the API might change without any deprecation cycle. To use it, + you need to explicitly import ``enable_hist_gradient_boosting``:: + + >>> # explicitly require this experimental feature + >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa + >>> # now you can import normally from ensemble + >>> from sklearn.ensemble import HistGradientBoostingClassifier + + Parameters ---------- loss : {'least_squares'}, optional (default='least_squares') @@ -571,8 +586,10 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): Examples -------- + >>> # To use this experimental feature, we need to explicitly ask for it: + >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa + >>> from sklearn.ensemble import HistGradientBoostingRegressor >>> from sklearn.datasets import load_boston - >>> from sklearn.experimental import HistGradientBoostingRegressor >>> X, y = load_boston(return_X_y=True) >>> est = HistGradientBoostingRegressor().fit(X, y) >>> est.score(X, y) @@ -636,6 +653,20 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, might be preferred since binning may lead to split points that are too approximate in this setting. + This implementation is inspired by + `LightGBM `_. + + .. note:: + + This estimator is still **experimental** for now: the predictions + and the API might change without any deprecation cycle. 
To use it, + you need to explicitly import ``enable_hist_gradient_boosting``:: + + >>> # explicitly require this experimental feature + >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa + >>> # now you can import normally from ensemble + >>> from sklearn.ensemble import HistGradientBoostingClassifier + Parameters ---------- loss : {'auto', 'binary_crossentropy', 'categorical_crossentropy'}, \ @@ -722,8 +753,10 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, Examples -------- + >>> # To use this experimental feature, we need to explicitly ask for it: + >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa + >>> from sklearn.ensemble import HistGradientBoostingRegressor >>> from sklearn.datasets import load_iris - >>> from sklearn.experimental import HistGradientBoostingClassifier >>> X, y = load_iris(return_X_y=True) >>> clf = HistGradientBoostingClassifier().fit(X, y) >>> clf.score(X, y) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py index 3380511afd418..95672a60e5c40 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py @@ -4,8 +4,10 @@ import numpy as np import pytest -from sklearn.experimental import HistGradientBoostingRegressor -from sklearn.experimental import HistGradientBoostingClassifier +# To use this experimental feature, we need to explicitly ask for it: +from sklearn.experimental import enable_hist_gradient_boosting # noqa +from sklearn.ensemble import HistGradientBoostingRegressor +from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper from sklearn.ensemble._hist_gradient_boosting.utils import ( get_equivalent_estimator) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 12ef2ea7a4cae..790597b07fa15 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -2,8 +2,10 @@ import pytest from sklearn.datasets import make_classification, make_regression -from sklearn.experimental import HistGradientBoostingClassifier -from sklearn.experimental import HistGradientBoostingRegressor +# To use this experimental feature, we need to explicitly ask for it: +from sklearn.experimental import enable_hist_gradient_boosting # noqa +from sklearn.ensemble import HistGradientBoostingRegressor +from sklearn.ensemble import HistGradientBoostingClassifier X_classification, y_classification = make_classification(random_state=0) diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index 77c00e4055d15..49d187083d8a3 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -2003,7 +2003,7 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin): See also -------- - sklearn.experimental.HistGradientBoostingClassifier, + sklearn.ensemble.HistGradientBoostingClassifier, sklearn.tree.DecisionTreeClassifier, RandomForestClassifier AdaBoostClassifier @@ -2464,7 +2464,7 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin): See also -------- - sklearn.experimental.HistGradientBoostingRegressor, + 
sklearn.ensemble.HistGradientBoostingRegressor, sklearn.tree.DecisionTreeRegressor, RandomForestRegressor References diff --git a/sklearn/experimental/__init__.py b/sklearn/experimental/__init__.py index 269a850dd5321..0effaf5b05fa0 100644 --- a/sklearn/experimental/__init__.py +++ b/sklearn/experimental/__init__.py @@ -1,11 +1,7 @@ """ -The :mod:`sklearn.experimental` module includes estimators and tools whose API -and behaviour might change without a deprecation cycle. -""" - -from ..ensemble._hist_gradient_boosting.gradient_boosting import ( - HistGradientBoostingClassifier, - HistGradientBoostingRegressor -) +The :mod:`sklearn.experimental` module provides importable modules that enable +the use of experimental features or estimators. -__all__ = ['HistGradientBoostingRegressor', 'HistGradientBoostingClassifier'] +The features and estimators that are experimental aren't subject to +deprecation cycles. Use them at your own risks! +""" diff --git a/sklearn/experimental/enable_hist_gradient_boosting.py b/sklearn/experimental/enable_hist_gradient_boosting.py new file mode 100644 index 0000000000000..2e008489ae17d --- /dev/null +++ b/sklearn/experimental/enable_hist_gradient_boosting.py @@ -0,0 +1,32 @@ +"""Enables histogram-based gradient boosting estimators. + +The API and results of these estimators might change without any deprecation +cycle. + +Importing this file dynamically sets the +:class:`sklearn.ensemble.HistGradientBoostingClassifier` and +:class:`sklearn.ensemble.HistGradientBoostingRegressor` as attributes of the +ensemble module:: + + >>> # explicitly require this experimental feature + >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa + >>> # now you can import normally from ensemble + >>> from sklearn.ensemble import HistGradientBoostingClassifier + >>> from sklearn.ensemble import HistGradientBoostingRegressor + + +The ``# noqa`` comment comment can be removed: it just tells linters like +flake8 to ignore the import, which appears as unused. +""" + +from ..ensemble._hist_gradient_boosting.gradient_boosting import ( + HistGradientBoostingClassifier, + HistGradientBoostingRegressor +) + +from .. import ensemble + +ensemble.HistGradientBoostingClassifier = HistGradientBoostingClassifier +ensemble.HistGradientBoostingRegressor = HistGradientBoostingRegressor +ensemble.__all__ += ['HistGradientBoostingClassifier', + 'HistGradientBoostingRegressor'] \ No newline at end of file diff --git a/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py b/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py new file mode 100644 index 0000000000000..6c51b34b44aa0 --- /dev/null +++ b/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py @@ -0,0 +1,46 @@ +import pytest +import sys + + +@pytest.fixture +def clean_imports(): + # Removes the relevant scikit-learn related imports (also removes from the + # cache). This is needed to keep the individual tests functions + # independent. 
+ modules_to_delete = ( + 'experimental', + 'enable_hist_gradient_boosting', + 'ensemble', + ) + modules = list(sys.modules.keys()) + for module in modules: + if any(mod_to_delete in module for mod_to_delete in modules_to_delete): + del sys.modules[module] + + +def test_valid_import(clean_imports): + # recommended way + from sklearn.experimental import enable_hist_gradient_boosting # noqa + from sklearn.ensemble import HistGradientBoostingClassifier + + +def test_valid_import_2(clean_imports): + # recommended way, making sure ensemble can be imported before + import sklearn.ensemble + from sklearn.experimental import enable_hist_gradient_boosting # noqa + from sklearn.ensemble import HistGradientBoostingClassifier + + +def test_import_failure(clean_imports): + # missing enable_hist_gradient_boosting + + with pytest.raises(ImportError): + from sklearn.ensemble import HistGradientBoostingClassifier + + with pytest.raises(ImportError): + from sklearn.ensemble._hist_gradient_boosting import ( + HistGradientBoostingClassifier) + + import sklearn.experimental + with pytest.raises(ImportError): + from sklearn.ensemble import HistGradientBoostingClassifier From 062ec7505f3cb6b3977d936243de6a706de0aa61 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 17 Apr 2019 12:45:33 -0400 Subject: [PATCH 207/247] noqa for whole file test_enable_hist_gradient_boosting.py --- .../tests/test_enable_hist_gradient_boosting.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py b/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py index 6c51b34b44aa0..8492c1bd908f3 100644 --- a/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py +++ b/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py @@ -2,6 +2,10 @@ import sys +# Ignore flake8 (import not at top of file, etc.) 
+# flake8: noqa + + @pytest.fixture def clean_imports(): # Removes the relevant scikit-learn related imports (also removes from the @@ -20,14 +24,14 @@ def clean_imports(): def test_valid_import(clean_imports): # recommended way - from sklearn.experimental import enable_hist_gradient_boosting # noqa + from sklearn.experimental import enable_hist_gradient_boosting from sklearn.ensemble import HistGradientBoostingClassifier def test_valid_import_2(clean_imports): # recommended way, making sure ensemble can be imported before import sklearn.ensemble - from sklearn.experimental import enable_hist_gradient_boosting # noqa + from sklearn.experimental import enable_hist_gradient_boosting from sklearn.ensemble import HistGradientBoostingClassifier From 72d48b9f4c2f7d18e7c9f11c3592f3ad71a5c6bd Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 17 Apr 2019 12:47:55 -0400 Subject: [PATCH 208/247] flake8 --- doc/conf.py | 2 +- sklearn/experimental/enable_hist_gradient_boosting.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index e2e4f50d9f41d..0616f1ef832a2 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -265,7 +265,7 @@ # enable experimental module so that the new GBDTs estimators can be # discovered properly by sphinx -from sklearn.experimental import enable_hist_gradient_boosting +from sklearn.experimental import enable_hist_gradient_boosting # noqa def make_carousel_thumbs(app, exception): diff --git a/sklearn/experimental/enable_hist_gradient_boosting.py b/sklearn/experimental/enable_hist_gradient_boosting.py index 2e008489ae17d..6b0a6ad8a28bb 100644 --- a/sklearn/experimental/enable_hist_gradient_boosting.py +++ b/sklearn/experimental/enable_hist_gradient_boosting.py @@ -29,4 +29,4 @@ ensemble.HistGradientBoostingClassifier = HistGradientBoostingClassifier ensemble.HistGradientBoostingRegressor = HistGradientBoostingRegressor ensemble.__all__ += ['HistGradientBoostingClassifier', - 'HistGradientBoostingRegressor'] \ No newline at end of file + 'HistGradientBoostingRegressor'] From 505d409ff186997b3e6a0fe72b5201dd4f02efc2 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 17 Apr 2019 13:52:06 -0400 Subject: [PATCH 209/247] protected omp_get_max_threads() --- sklearn/ensemble/_hist_gradient_boosting/splitting.pyx | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 2c78ed9750e0b..2f7c7d3453326 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -15,7 +15,8 @@ cimport cython from cython.parallel import prange import numpy as np cimport numpy as np -from openmp cimport omp_get_max_threads +IF SKLEARN_OPENMP_SUPPORTED: + from openmp cimport omp_get_max_threads from libc.stdlib cimport malloc, free from libc.string cimport memcpy @@ -239,7 +240,12 @@ cdef class Splitter: self.X_binned[:, feature_idx] unsigned int [::1] left_indices_buffer = self.left_indices_buffer unsigned int [::1] right_indices_buffer = self.right_indices_buffer - int n_threads = omp_get_max_threads() + + IF SKLEARN_OPENMP_SUPPORTED: + int n_threads = omp_get_max_threads() + ELSE: + int n_threads = 1 + int [:] sizes = np.full(n_threads, n_samples // n_threads, dtype=np.int32) int [:] offset_in_buffers = np.zeros(n_threads, dtype=np.int32) From b8b73e67e3bc64554bf1e7823a075d3455ccc2a7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 17 Apr 2019 14:06:56 
-0400 Subject: [PATCH 210/247] trying without module deletion hack --- .../test_enable_hist_gradient_boosting.py | 84 +++++++++---------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py b/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py index 8492c1bd908f3..26f90ea39ab9f 100644 --- a/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py +++ b/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py @@ -6,45 +6,45 @@ # flake8: noqa -@pytest.fixture -def clean_imports(): - # Removes the relevant scikit-learn related imports (also removes from the - # cache). This is needed to keep the individual tests functions - # independent. - modules_to_delete = ( - 'experimental', - 'enable_hist_gradient_boosting', - 'ensemble', - ) - modules = list(sys.modules.keys()) - for module in modules: - if any(mod_to_delete in module for mod_to_delete in modules_to_delete): - del sys.modules[module] - - -def test_valid_import(clean_imports): - # recommended way - from sklearn.experimental import enable_hist_gradient_boosting - from sklearn.ensemble import HistGradientBoostingClassifier - - -def test_valid_import_2(clean_imports): - # recommended way, making sure ensemble can be imported before - import sklearn.ensemble - from sklearn.experimental import enable_hist_gradient_boosting - from sklearn.ensemble import HistGradientBoostingClassifier - - -def test_import_failure(clean_imports): - # missing enable_hist_gradient_boosting - - with pytest.raises(ImportError): - from sklearn.ensemble import HistGradientBoostingClassifier - - with pytest.raises(ImportError): - from sklearn.ensemble._hist_gradient_boosting import ( - HistGradientBoostingClassifier) - - import sklearn.experimental - with pytest.raises(ImportError): - from sklearn.ensemble import HistGradientBoostingClassifier +# @pytest.fixture +# def clean_imports(): +# # Removes the relevant scikit-learn related imports (also removes from the +# # cache). This is needed to keep the individual tests functions +# # independent. 
+# modules_to_delete = ( +# 'experimental', +# 'enable_hist_gradient_boosting', +# 'ensemble', +# ) +# modules = list(sys.modules.keys()) +# for module in modules: +# if any(mod_to_delete in module for mod_to_delete in modules_to_delete): +# del sys.modules[module] + + +# def test_valid_import(clean_imports): +# # recommended way +# from sklearn.experimental import enable_hist_gradient_boosting +# from sklearn.ensemble import HistGradientBoostingClassifier + + +# def test_valid_import_2(clean_imports): +# # recommended way, making sure ensemble can be imported before +# import sklearn.ensemble +# from sklearn.experimental import enable_hist_gradient_boosting +# from sklearn.ensemble import HistGradientBoostingClassifier + + +# def test_import_failure(clean_imports): +# # missing enable_hist_gradient_boosting + +# with pytest.raises(ImportError): +# from sklearn.ensemble import HistGradientBoostingClassifier + +# with pytest.raises(ImportError): +# from sklearn.ensemble._hist_gradient_boosting import ( +# HistGradientBoostingClassifier) + +# import sklearn.experimental +# with pytest.raises(ImportError): +# from sklearn.ensemble import HistGradientBoostingClassifier From acfcce5db2fe92f05e4bf5ee6636dae753c015a4 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 17 Apr 2019 14:13:58 -0400 Subject: [PATCH 211/247] deleted test_enable file: impossible to do properly --- .../test_enable_hist_gradient_boosting.py | 50 ------------------- 1 file changed, 50 deletions(-) delete mode 100644 sklearn/experimental/tests/test_enable_hist_gradient_boosting.py diff --git a/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py b/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py deleted file mode 100644 index 26f90ea39ab9f..0000000000000 --- a/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py +++ /dev/null @@ -1,50 +0,0 @@ -import pytest -import sys - - -# Ignore flake8 (import not at top of file, etc.) -# flake8: noqa - - -# @pytest.fixture -# def clean_imports(): -# # Removes the relevant scikit-learn related imports (also removes from the -# # cache). This is needed to keep the individual tests functions -# # independent. 
-# modules_to_delete = ( -# 'experimental', -# 'enable_hist_gradient_boosting', -# 'ensemble', -# ) -# modules = list(sys.modules.keys()) -# for module in modules: -# if any(mod_to_delete in module for mod_to_delete in modules_to_delete): -# del sys.modules[module] - - -# def test_valid_import(clean_imports): -# # recommended way -# from sklearn.experimental import enable_hist_gradient_boosting -# from sklearn.ensemble import HistGradientBoostingClassifier - - -# def test_valid_import_2(clean_imports): -# # recommended way, making sure ensemble can be imported before -# import sklearn.ensemble -# from sklearn.experimental import enable_hist_gradient_boosting -# from sklearn.ensemble import HistGradientBoostingClassifier - - -# def test_import_failure(clean_imports): -# # missing enable_hist_gradient_boosting - -# with pytest.raises(ImportError): -# from sklearn.ensemble import HistGradientBoostingClassifier - -# with pytest.raises(ImportError): -# from sklearn.ensemble._hist_gradient_boosting import ( -# HistGradientBoostingClassifier) - -# import sklearn.experimental -# with pytest.raises(ImportError): -# from sklearn.ensemble import HistGradientBoostingClassifier From 8b1f603aa17ab3d0e918f8826dcc3f325c35ef9d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 18 Apr 2019 11:58:20 -0400 Subject: [PATCH 212/247] test enable_experimental with assert_run_python_script from cloud_pickle --- .../test_enable_hist_gradient_boosting.py | 103 ++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 sklearn/experimental/tests/test_enable_hist_gradient_boosting.py diff --git a/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py b/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py new file mode 100644 index 0000000000000..7a8ff6a349e6b --- /dev/null +++ b/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py @@ -0,0 +1,103 @@ +import tempfile +from subprocess import check_output, STDOUT, CalledProcessError +import os +import os.path as op +import sys +import textwrap +from subprocess import TimeoutExpired + + +TIMEOUT = 60 + + +def _make_cwd_env(): + """Helper to prepare environment for the child processes""" + # This is a port from cloudpickle https://github.com/cloudpipe/cloudpickle + cloudpickle_repo_folder = op.normpath( + op.join(op.dirname(__file__), '..')) + env = os.environ.copy() + pythonpath = "{src}{sep}tests{pathsep}{src}".format( + src=cloudpickle_repo_folder, sep=os.sep, pathsep=os.pathsep) + env['PYTHONPATH'] = pythonpath + return cloudpickle_repo_folder, env + + +def assert_run_python_script(source_code, timeout=TIMEOUT): + """Utility to help check pickleability of objects defined in __main__ + + The script provided in the source code should return 0 and not print + anything on stderr or stdout. 
+ + This is a port from cloudpickle https://github.com/cloudpipe/cloudpickle + """ + fd, source_file = tempfile.mkstemp(suffix='_src_test_sklearn.py') + os.close(fd) + try: + with open(source_file, 'wb') as f: + f.write(source_code.encode('utf-8')) + cmd = [sys.executable, source_file] + cwd, env = _make_cwd_env() + kwargs = { + 'cwd': cwd, + 'stderr': STDOUT, + 'env': env, + } + # If coverage is running, pass the config file to the subprocess + coverage_rc = os.environ.get("COVERAGE_PROCESS_START") + if coverage_rc: + kwargs['env']['COVERAGE_PROCESS_START'] = coverage_rc + + kwargs['timeout'] = timeout + try: + try: + out = check_output(cmd, **kwargs) + except CalledProcessError as e: + raise RuntimeError(u"script errored with output:\n%s" + % e.output.decode('utf-8')) + if out != b"": + raise AssertionError(out.decode('utf-8')) + except TimeoutExpired as e: + raise RuntimeError(u"script timeout, output so far:\n%s" + % e.output.decode('utf-8')) + finally: + os.unlink(source_file) + + +def test_imports_strategies(): + # Make sure different import strategies work or fail as expected. + + # Since Python caches the imported modules, we need to run a child process + # for every test case. Else, the tests would not be independent + # (manually removing the imports from the cache (sys.modules) is not + # recommended and can lead to many complications). + + good_import = """ + from sklearn.experimental import enable_hist_gradient_boosting + from sklearn.ensemble import GradientBoostingClassifier + from sklearn.ensemble import GradientBoostingRegressor + """ + assert_run_python_script(textwrap.dedent(good_import)) + + good_import_with_ensemble_first = """ + import sklearn.ensemble + from sklearn.experimental import enable_hist_gradient_boosting + from sklearn.ensemble import GradientBoostingClassifier + from sklearn.ensemble import GradientBoostingRegressor + """ + assert_run_python_script(textwrap.dedent(good_import_with_ensemble_first)) + + bad_imports = """ + import pytest + + with pytest.raises(ImportError): + from sklearn.ensemble import HistGradientBoostingClassifier + + with pytest.raises(ImportError): + from sklearn.ensemble._hist_gradient_boosting import ( + HistGradientBoostingClassifier) + + import sklearn.experimental + with pytest.raises(ImportError): + from sklearn.ensemble import HistGradientBoostingClassifier + """ + assert_run_python_script(textwrap.dedent(bad_imports)) From ea14a84e77d39a4f06932d8db7ac3c08f6059db2 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 18 Apr 2019 14:22:23 -0400 Subject: [PATCH 213/247] Addressed comments --- .../tests/test_enable_hist_gradient_boosting.py | 8 ++++---- sklearn/utils/estimator_checks.py | 5 +++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py b/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py index 7a8ff6a349e6b..05106b2f6c20d 100644 --- a/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py +++ b/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py @@ -13,13 +13,13 @@ def _make_cwd_env(): """Helper to prepare environment for the child processes""" # This is a port from cloudpickle https://github.com/cloudpipe/cloudpickle - cloudpickle_repo_folder = op.normpath( - op.join(op.dirname(__file__), '..')) + sklearn_repo_folder = op.normpath( + op.join(op.dirname(__file__), '../..')) env = os.environ.copy() pythonpath = "{src}{sep}tests{pathsep}{src}".format( - src=cloudpickle_repo_folder, sep=os.sep, 
pathsep=os.pathsep) + src=sklearn_repo_folder, sep=os.sep, pathsep=os.pathsep) env['PYTHONPATH'] = pythonpath - return cloudpickle_repo_folder, env + return sklearn_repo_folder, env def assert_run_python_script(source_code, timeout=TIMEOUT): diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 32eea7bd61841..a3353628e5caf 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -396,8 +396,9 @@ def set_checking_parameters(estimator): # which is more feature than we have in most case. estimator.set_params(k=1) - if name in ('HistGradientBoostingClassifier', - 'HistGradientBoostingRegressor'): + if name == 'HistGradientBoostingClassifier': + # The default min_samples_leaf (20) isn't appropriate for small + # datasets (only very shallow trees are built) that the checks use. estimator.set_params(min_samples_leaf=5) From 69f127c7bccde25515fca74025b5f26fe29c9c1d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 18 Apr 2019 14:49:35 -0400 Subject: [PATCH 214/247] put back min_samples_leaf=5 for checks of HistGradientBoostingRegressor --- sklearn/utils/estimator_checks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index a3353628e5caf..d5d59a041fdf4 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -396,7 +396,8 @@ def set_checking_parameters(estimator): # which is more feature than we have in most case. estimator.set_params(k=1) - if name == 'HistGradientBoostingClassifier': + if name in ('HistGradientBoostingClassifier', + 'HistGradientBoostingRegressor'): # The default min_samples_leaf (20) isn't appropriate for small # datasets (only very shallow trees are built) that the checks use. 
estimator.set_params(min_samples_leaf=5) From 83bc17a760b911db02c7af292fbebf6adf96cc59 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 18 Apr 2019 15:09:25 -0400 Subject: [PATCH 215/247] removed one line so that the PR is 5555 lines --- .../bench_hist_gradient_boosting_higgsboson.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/benchmarks/bench_hist_gradient_boosting_higgsboson.py b/benchmarks/bench_hist_gradient_boosting_higgsboson.py index 23d0e16194cc0..ec75760cd39f7 100644 --- a/benchmarks/bench_hist_gradient_boosting_higgsboson.py +++ b/benchmarks/bench_hist_gradient_boosting_higgsboson.py @@ -70,15 +70,14 @@ def load_data(): print("Fitting a sklearn model...") tic = time() -est = HistGradientBoostingClassifier( - loss='binary_crossentropy', - learning_rate=lr, - max_iter=n_trees, - max_bins=max_bins, - max_leaf_nodes=n_leaf_nodes, - n_iter_no_change=None, - random_state=0, - verbose=1) +est = HistGradientBoostingClassifier(loss='binary_crossentropy', + learning_rate=lr, + max_iter=n_trees, + max_bins=max_bins, + max_leaf_nodes=n_leaf_nodes, + n_iter_no_change=None, + random_state=0, + verbose=1) est.fit(data_train, target_train) toc = time() predicted_test = est.predict(data_test) From c4b22bf99629e7301392d34eac03963494f092f6 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 19 Apr 2019 11:05:49 -0400 Subject: [PATCH 216/247] Moved utility into utils.testing and updated docstring --- .../test_enable_hist_gradient_boosting.py | 64 +------------------ sklearn/utils/testing.py | 58 +++++++++++++++++ 2 files changed, 61 insertions(+), 61 deletions(-) diff --git a/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py b/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py index 05106b2f6c20d..eff4f53d810a9 100644 --- a/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py +++ b/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py @@ -1,66 +1,8 @@ -import tempfile -from subprocess import check_output, STDOUT, CalledProcessError -import os -import os.path as op -import sys -import textwrap -from subprocess import TimeoutExpired - - -TIMEOUT = 60 - - -def _make_cwd_env(): - """Helper to prepare environment for the child processes""" - # This is a port from cloudpickle https://github.com/cloudpipe/cloudpickle - sklearn_repo_folder = op.normpath( - op.join(op.dirname(__file__), '../..')) - env = os.environ.copy() - pythonpath = "{src}{sep}tests{pathsep}{src}".format( - src=sklearn_repo_folder, sep=os.sep, pathsep=os.pathsep) - env['PYTHONPATH'] = pythonpath - return sklearn_repo_folder, env +"""Tests for making sure experimental imports work as expected.""" +import textwrap -def assert_run_python_script(source_code, timeout=TIMEOUT): - """Utility to help check pickleability of objects defined in __main__ - - The script provided in the source code should return 0 and not print - anything on stderr or stdout. 
- - This is a port from cloudpickle https://github.com/cloudpipe/cloudpickle - """ - fd, source_file = tempfile.mkstemp(suffix='_src_test_sklearn.py') - os.close(fd) - try: - with open(source_file, 'wb') as f: - f.write(source_code.encode('utf-8')) - cmd = [sys.executable, source_file] - cwd, env = _make_cwd_env() - kwargs = { - 'cwd': cwd, - 'stderr': STDOUT, - 'env': env, - } - # If coverage is running, pass the config file to the subprocess - coverage_rc = os.environ.get("COVERAGE_PROCESS_START") - if coverage_rc: - kwargs['env']['COVERAGE_PROCESS_START'] = coverage_rc - - kwargs['timeout'] = timeout - try: - try: - out = check_output(cmd, **kwargs) - except CalledProcessError as e: - raise RuntimeError(u"script errored with output:\n%s" - % e.output.decode('utf-8')) - if out != b"": - raise AssertionError(out.decode('utf-8')) - except TimeoutExpired as e: - raise RuntimeError(u"script timeout, output so far:\n%s" - % e.output.decode('utf-8')) - finally: - os.unlink(source_file) +from sklearn.utils.testing import assert_run_python_script def test_imports_strategies(): diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 1662294189690..2ca91cc23a712 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -11,11 +11,17 @@ # Thierry Guillemot # License: BSD 3 clause import os +import os.path as op import inspect import pkgutil import warnings import sys import functools +import tempfile +from subprocess import check_output, STDOUT, CalledProcessError +import textwrap +from subprocess import TimeoutExpired + import scipy as sp import scipy.io @@ -970,3 +976,55 @@ def check_docstring_parameters(func, doc=None, ignore=None, class_name=None): if n1 != n2: incorrect += [func_name + ' ' + n1 + ' != ' + n2] return incorrect + + +def assert_run_python_script(source_code, timeout=60): + """Utility to check assertions in an independent Python subprocess. + + The script provided in the source code should return 0 and not print + anything on stderr or stdout. 
+ + This is a port from cloudpickle https://github.com/cloudpipe/cloudpickle + """ + def _make_cwd_env(): + """Helper to prepare environment for the child processes""" + sklearn_repo_folder = op.normpath( + op.join(op.dirname(__file__), '../..')) + env = os.environ.copy() + pythonpath = "{src}{sep}tests{pathsep}{src}".format( + src=sklearn_repo_folder, sep=os.sep, pathsep=os.pathsep) + env['PYTHONPATH'] = pythonpath + return sklearn_repo_folder, env + + + fd, source_file = tempfile.mkstemp(suffix='_src_test_sklearn.py') + os.close(fd) + try: + with open(source_file, 'wb') as f: + f.write(source_code.encode('utf-8')) + cmd = [sys.executable, source_file] + cwd, env = _make_cwd_env() + kwargs = { + 'cwd': cwd, + 'stderr': STDOUT, + 'env': env, + } + # If coverage is running, pass the config file to the subprocess + coverage_rc = os.environ.get("COVERAGE_PROCESS_START") + if coverage_rc: + kwargs['env']['COVERAGE_PROCESS_START'] = coverage_rc + + kwargs['timeout'] = timeout + try: + try: + out = check_output(cmd, **kwargs) + except CalledProcessError as e: + raise RuntimeError(u"script errored with output:\n%s" + % e.output.decode('utf-8')) + if out != b"": + raise AssertionError(out.decode('utf-8')) + except TimeoutExpired as e: + raise RuntimeError(u"script timeout, output so far:\n%s" + % e.output.decode('utf-8')) + finally: + os.unlink(source_file) From 6553f72d4bc095f1b801d2e43ac98f6c1032dd83 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 19 Apr 2019 11:21:50 -0400 Subject: [PATCH 217/247] pep8 --- sklearn/utils/testing.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 2ca91cc23a712..d11193b44c3cf 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -19,7 +19,6 @@ import functools import tempfile from subprocess import check_output, STDOUT, CalledProcessError -import textwrap from subprocess import TimeoutExpired @@ -996,7 +995,6 @@ def _make_cwd_env(): env['PYTHONPATH'] = pythonpath return sklearn_repo_folder, env - fd, source_file = tempfile.mkstemp(suffix='_src_test_sklearn.py') os.close(fd) try: From 4cb5da4d0a9a9a96cee88e90d2f425c62e606fe7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 19 Apr 2019 11:32:58 -0400 Subject: [PATCH 218/247] added comment for min_samples_leaf --- .../ensemble/_hist_gradient_boosting/gradient_boosting.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 760738417ad1c..b73af18dba3b1 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -525,7 +525,9 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): nodes to go from the root to the deepest leaf. Must be strictly greater than 1. Depth isn't constrained by default. min_samples_leaf : int, optional (default=20) - The minimum number of samples per leaf. + The minimum number of samples per leaf. For small datasets with less + than a few hundred samples, it is recommended to lower this value since + only very shallow trees would be built. l2_regularization : float, optional (default=0) The L2 regularization parameter. Use ``0`` for no regularization (default). @@ -692,7 +694,9 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, nodes to go from the root to the deepest leaf. Must be strictly greater than 1. 
Depth isn't constrained by default. min_samples_leaf : int, optional (default=20) - The minimum number of samples per leaf. + The minimum number of samples per leaf. For small datasets with less + than a few hundred samples, it is recommended to lower this value since + only very shallow trees would be built. l2_regularization : float, optional (default=0) The L2 regularization parameter. Use 0 for no regularization. max_bins : int, optional (default=256) From a8a4ce0656712d740999923105d9a17d6dbfe7e1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 19 Apr 2019 15:06:23 -0400 Subject: [PATCH 219/247] doc --- .../ensemble/_hist_gradient_boosting/gradient_boosting.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index b73af18dba3b1..719756061f896 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -526,8 +526,8 @@ class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin): than 1. Depth isn't constrained by default. min_samples_leaf : int, optional (default=20) The minimum number of samples per leaf. For small datasets with less - than a few hundred samples, it is recommended to lower this value since - only very shallow trees would be built. + than a few hundred samples, it is recommended to lower this value + since only very shallow trees would be built. l2_regularization : float, optional (default=0) The L2 regularization parameter. Use ``0`` for no regularization (default). @@ -695,8 +695,8 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, than 1. Depth isn't constrained by default. min_samples_leaf : int, optional (default=20) The minimum number of samples per leaf. For small datasets with less - than a few hundred samples, it is recommended to lower this value since - only very shallow trees would be built. + than a few hundred samples, it is recommended to lower this value + since only very shallow trees would be built. l2_regularization : float, optional (default=0) The L2 regularization parameter. Use 0 for no regularization. max_bins : int, optional (default=256) From 6109620259d9235ec4fc09143c40ed55b3bee3c1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 20 Apr 2019 08:04:35 -0400 Subject: [PATCH 220/247] docstring params --- sklearn/utils/testing.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index d11193b44c3cf..4987567d6a161 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -984,6 +984,13 @@ def assert_run_python_script(source_code, timeout=60): anything on stderr or stdout. This is a port from cloudpickle https://github.com/cloudpipe/cloudpickle + + Parameters + ---------- + source_code : str + The Python source code to execute. + timeout : int + Time in seconds before timeout. """ def _make_cwd_env(): """Helper to prepare environment for the child processes""" From 3ef02123c55fa30e190bc7ceb57fe59294b97c01 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 20 Apr 2019 08:20:24 -0400 Subject: [PATCH 221/247] no idea whats going on? 
--- sklearn/utils/testing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 4987567d6a161..564499602053b 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -1026,8 +1026,8 @@ def _make_cwd_env(): except CalledProcessError as e: raise RuntimeError(u"script errored with output:\n%s" % e.output.decode('utf-8')) - if out != b"": - raise AssertionError(out.decode('utf-8')) + # if out != b"": + # raise AssertionError(out.decode('utf-8')) except TimeoutExpired as e: raise RuntimeError(u"script timeout, output so far:\n%s" % e.output.decode('utf-8')) From d493fe4c8c63f21daeae6eb52b56e9a9e241db72 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 20 Apr 2019 08:52:40 -0400 Subject: [PATCH 222/247] remove coverage? --- sklearn/utils/testing.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 564499602053b..e194e79f7db54 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -1014,11 +1014,6 @@ def _make_cwd_env(): 'stderr': STDOUT, 'env': env, } - # If coverage is running, pass the config file to the subprocess - coverage_rc = os.environ.get("COVERAGE_PROCESS_START") - if coverage_rc: - kwargs['env']['COVERAGE_PROCESS_START'] = coverage_rc - kwargs['timeout'] = timeout try: try: @@ -1026,8 +1021,8 @@ def _make_cwd_env(): except CalledProcessError as e: raise RuntimeError(u"script errored with output:\n%s" % e.output.decode('utf-8')) - # if out != b"": - # raise AssertionError(out.decode('utf-8')) + if out != b"": + raise AssertionError(out.decode('utf-8')) except TimeoutExpired as e: raise RuntimeError(u"script timeout, output so far:\n%s" % e.output.decode('utf-8')) From 1da9941364b9c6844e5ec2fcba6c0d172f68f5a1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 20 Apr 2019 09:27:52 -0400 Subject: [PATCH 223/247] put back helper in experimental/test_ :/ --- .../test_enable_hist_gradient_boosting.py | 64 ++++++++++++++++++- sklearn/utils/testing.py | 57 ----------------- 2 files changed, 63 insertions(+), 58 deletions(-) diff --git a/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py b/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py index eff4f53d810a9..d09a9a9695b2d 100644 --- a/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py +++ b/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py @@ -1,8 +1,70 @@ """Tests for making sure experimental imports work as expected.""" +import sys +import os +import os.path as op import textwrap +import tempfile +from subprocess import check_output, STDOUT, CalledProcessError +from subprocess import TimeoutExpired -from sklearn.utils.testing import assert_run_python_script + +def assert_run_python_script(source_code, timeout=60): + """Utility to check assertions in an independent Python subprocess. + + The script provided in the source code should return 0 and not print + anything on stderr or stdout. + + This is a port from cloudpickle https://github.com/cloudpipe/cloudpickle + + Parameters + ---------- + source_code : str + The Python source code to execute. + timeout : int + Time in seconds before timeout. 
+ """ + def _make_cwd_env(): + """Helper to prepare environment for the child processes""" + sklearn_repo_folder = op.normpath( + op.join(op.dirname(__file__), '../..')) + env = os.environ.copy() + pythonpath = "{src}{sep}tests{pathsep}{src}".format( + src=sklearn_repo_folder, sep=os.sep, pathsep=os.pathsep) + env['PYTHONPATH'] = pythonpath + return sklearn_repo_folder, env + + fd, source_file = tempfile.mkstemp(suffix='_src_test_sklearn.py') + os.close(fd) + try: + with open(source_file, 'wb') as f: + f.write(source_code.encode('utf-8')) + cmd = [sys.executable, source_file] + cwd, env = _make_cwd_env() + kwargs = { + 'cwd': cwd, + 'stderr': STDOUT, + 'env': env, + } + # If coverage is running, pass the config file to the subprocess + coverage_rc = os.environ.get("COVERAGE_PROCESS_START") + if coverage_rc: + kwargs['env']['COVERAGE_PROCESS_START'] = coverage_rc + + kwargs['timeout'] = timeout + try: + try: + out = check_output(cmd, **kwargs) + except CalledProcessError as e: + raise RuntimeError(u"script errored with output:\n%s" + % e.output.decode('utf-8')) + if out != b"": + raise AssertionError(out.decode('utf-8')) + except TimeoutExpired as e: + raise RuntimeError(u"script timeout, output so far:\n%s" + % e.output.decode('utf-8')) + finally: + os.unlink(source_file) def test_imports_strategies(): diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index e194e79f7db54..2d1439b4dc443 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -17,9 +17,6 @@ import warnings import sys import functools -import tempfile -from subprocess import check_output, STDOUT, CalledProcessError -from subprocess import TimeoutExpired import scipy as sp @@ -32,7 +29,6 @@ import tempfile import shutil -import os.path as op import atexit import unittest @@ -975,56 +971,3 @@ def check_docstring_parameters(func, doc=None, ignore=None, class_name=None): if n1 != n2: incorrect += [func_name + ' ' + n1 + ' != ' + n2] return incorrect - - -def assert_run_python_script(source_code, timeout=60): - """Utility to check assertions in an independent Python subprocess. - - The script provided in the source code should return 0 and not print - anything on stderr or stdout. - - This is a port from cloudpickle https://github.com/cloudpipe/cloudpickle - - Parameters - ---------- - source_code : str - The Python source code to execute. - timeout : int - Time in seconds before timeout. 
- """ - def _make_cwd_env(): - """Helper to prepare environment for the child processes""" - sklearn_repo_folder = op.normpath( - op.join(op.dirname(__file__), '../..')) - env = os.environ.copy() - pythonpath = "{src}{sep}tests{pathsep}{src}".format( - src=sklearn_repo_folder, sep=os.sep, pathsep=os.pathsep) - env['PYTHONPATH'] = pythonpath - return sklearn_repo_folder, env - - fd, source_file = tempfile.mkstemp(suffix='_src_test_sklearn.py') - os.close(fd) - try: - with open(source_file, 'wb') as f: - f.write(source_code.encode('utf-8')) - cmd = [sys.executable, source_file] - cwd, env = _make_cwd_env() - kwargs = { - 'cwd': cwd, - 'stderr': STDOUT, - 'env': env, - } - kwargs['timeout'] = timeout - try: - try: - out = check_output(cmd, **kwargs) - except CalledProcessError as e: - raise RuntimeError(u"script errored with output:\n%s" - % e.output.decode('utf-8')) - if out != b"": - raise AssertionError(out.decode('utf-8')) - except TimeoutExpired as e: - raise RuntimeError(u"script timeout, output so far:\n%s" - % e.output.decode('utf-8')) - finally: - os.unlink(source_file) From 5623288ac70d5fcd049d27f1840616e1f4c86926 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 20 Apr 2019 10:14:04 -0400 Subject: [PATCH 224/247] hmm --- .../test_enable_hist_gradient_boosting.py | 64 +------------------ sklearn/utils/testing.py | 64 ++++++++++++++++++- 2 files changed, 64 insertions(+), 64 deletions(-) diff --git a/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py b/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py index d09a9a9695b2d..eff4f53d810a9 100644 --- a/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py +++ b/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py @@ -1,70 +1,8 @@ """Tests for making sure experimental imports work as expected.""" -import sys -import os -import os.path as op import textwrap -import tempfile -from subprocess import check_output, STDOUT, CalledProcessError -from subprocess import TimeoutExpired - -def assert_run_python_script(source_code, timeout=60): - """Utility to check assertions in an independent Python subprocess. - - The script provided in the source code should return 0 and not print - anything on stderr or stdout. - - This is a port from cloudpickle https://github.com/cloudpipe/cloudpickle - - Parameters - ---------- - source_code : str - The Python source code to execute. - timeout : int - Time in seconds before timeout. 
- """ - def _make_cwd_env(): - """Helper to prepare environment for the child processes""" - sklearn_repo_folder = op.normpath( - op.join(op.dirname(__file__), '../..')) - env = os.environ.copy() - pythonpath = "{src}{sep}tests{pathsep}{src}".format( - src=sklearn_repo_folder, sep=os.sep, pathsep=os.pathsep) - env['PYTHONPATH'] = pythonpath - return sklearn_repo_folder, env - - fd, source_file = tempfile.mkstemp(suffix='_src_test_sklearn.py') - os.close(fd) - try: - with open(source_file, 'wb') as f: - f.write(source_code.encode('utf-8')) - cmd = [sys.executable, source_file] - cwd, env = _make_cwd_env() - kwargs = { - 'cwd': cwd, - 'stderr': STDOUT, - 'env': env, - } - # If coverage is running, pass the config file to the subprocess - coverage_rc = os.environ.get("COVERAGE_PROCESS_START") - if coverage_rc: - kwargs['env']['COVERAGE_PROCESS_START'] = coverage_rc - - kwargs['timeout'] = timeout - try: - try: - out = check_output(cmd, **kwargs) - except CalledProcessError as e: - raise RuntimeError(u"script errored with output:\n%s" - % e.output.decode('utf-8')) - if out != b"": - raise AssertionError(out.decode('utf-8')) - except TimeoutExpired as e: - raise RuntimeError(u"script timeout, output so far:\n%s" - % e.output.decode('utf-8')) - finally: - os.unlink(source_file) +from sklearn.utils.testing import assert_run_python_script def test_imports_strategies(): diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 2d1439b4dc443..ed11eacb663b4 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -17,6 +17,9 @@ import warnings import sys import functools +import tempfile +from subprocess import check_output, STDOUT, CalledProcessError +from subprocess import TimeoutExpired import scipy as sp @@ -83,7 +86,8 @@ "assert_array_almost_equal", "assert_array_less", "assert_less", "assert_less_equal", "assert_greater", "assert_greater_equal", - "assert_approx_equal", "assert_allclose", "SkipTest"] + "assert_approx_equal", "assert_allclose", + "assert_run_python_script", "SkipTest"] __all__.extend(additional_names_in_all) _dummy = TestCase('__init__') @@ -971,3 +975,61 @@ def check_docstring_parameters(func, doc=None, ignore=None, class_name=None): if n1 != n2: incorrect += [func_name + ' ' + n1 + ' != ' + n2] return incorrect + + +def assert_run_python_script(source_code, timeout=60): + """Utility to check assertions in an independent Python subprocess. + + The script provided in the source code should return 0 and not print + anything on stderr or stdout. + + This is a port from cloudpickle https://github.com/cloudpipe/cloudpickle + + Parameters + ---------- + source_code : str + The Python source code to execute. + timeout : int + Time in seconds before timeout. 
+ """ + def _make_cwd_env(): + """Helper to prepare environment for the child processes""" + sklearn_repo_folder = op.normpath( + op.join(op.dirname(__file__), '../..')) + env = os.environ.copy() + pythonpath = "{src}{sep}tests{pathsep}{src}".format( + src=sklearn_repo_folder, sep=os.sep, pathsep=os.pathsep) + env['PYTHONPATH'] = pythonpath + return sklearn_repo_folder, env + + fd, source_file = tempfile.mkstemp(suffix='_src_test_sklearn.py') + os.close(fd) + try: + with open(source_file, 'wb') as f: + f.write(source_code.encode('utf-8')) + cmd = [sys.executable, source_file] + cwd, env = _make_cwd_env() + kwargs = { + 'cwd': cwd, + 'stderr': STDOUT, + 'env': env, + } + # If coverage is running, pass the config file to the subprocess + coverage_rc = os.environ.get("COVERAGE_PROCESS_START") + if coverage_rc: + kwargs['env']['COVERAGE_PROCESS_START'] = coverage_rc + + kwargs['timeout'] = timeout + try: + try: + out = check_output(cmd, **kwargs) + except CalledProcessError as e: + raise RuntimeError(u"script errored with output:\n%s" + % e.output.decode('utf-8')) + if out != b"": + raise AssertionError(out.decode('utf-8')) + except TimeoutExpired as e: + raise RuntimeError(u"script timeout, output so far:\n%s" + % e.output.decode('utf-8')) + finally: + os.unlink(source_file) From 442593a627e3a77a44e5369449fe1e38aebfb44d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 20 Apr 2019 11:11:59 -0400 Subject: [PATCH 225/247] changed cwd and env --- sklearn/utils/testing.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index ed11eacb663b4..9c23cb3f02fa3 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -992,23 +992,14 @@ def assert_run_python_script(source_code, timeout=60): timeout : int Time in seconds before timeout. 
""" - def _make_cwd_env(): - """Helper to prepare environment for the child processes""" - sklearn_repo_folder = op.normpath( - op.join(op.dirname(__file__), '../..')) - env = os.environ.copy() - pythonpath = "{src}{sep}tests{pathsep}{src}".format( - src=sklearn_repo_folder, sep=os.sep, pathsep=os.pathsep) - env['PYTHONPATH'] = pythonpath - return sklearn_repo_folder, env - fd, source_file = tempfile.mkstemp(suffix='_src_test_sklearn.py') os.close(fd) try: with open(source_file, 'wb') as f: f.write(source_code.encode('utf-8')) cmd = [sys.executable, source_file] - cwd, env = _make_cwd_env() + cwd = op.normpath(op.join(op.dirname(sklearn.__file__), '..')) + env = os.environ.copy() kwargs = { 'cwd': cwd, 'stderr': STDOUT, From 4755ba7346fc69a83271be68ef406cd9ca7141d8 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 22 Apr 2019 08:17:04 -0400 Subject: [PATCH 226/247] specify --cov-file --- build_tools/azure/test_script.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh index c720f6e387c87..bc05b059dbd9f 100755 --- a/build_tools/azure/test_script.sh +++ b/build_tools/azure/test_script.sh @@ -24,7 +24,8 @@ pip list TEST_CMD="python -m pytest --showlocals --durations=20 --junitxml=$JUNITXML --pyargs" if [[ "$COVERAGE" == "true" ]]; then - TEST_CMD="$TEST_CMD --cov sklearn" + COVERAGE_PROCESS_START="$BUILD_SOURCESDIRECTORY/.coveragerc" + TEST_CMD="$TEST_CMD --cov sklearn --cov-file=$BUILD_SOURCESDIRECTORY/.coveragerc" fi if [[ -n "$CHECK_WARNINGS" ]]; then From 058ae9436daa5f709fc66b1ebd6b481b6facb2bc Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 22 Apr 2019 08:31:17 -0400 Subject: [PATCH 227/247] rcfile instead of -cov-file --- build_tools/azure/test_script.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh index bc05b059dbd9f..6b241bbf55a99 100755 --- a/build_tools/azure/test_script.sh +++ b/build_tools/azure/test_script.sh @@ -25,7 +25,7 @@ TEST_CMD="python -m pytest --showlocals --durations=20 --junitxml=$JUNITXML --py if [[ "$COVERAGE" == "true" ]]; then COVERAGE_PROCESS_START="$BUILD_SOURCESDIRECTORY/.coveragerc" - TEST_CMD="$TEST_CMD --cov sklearn --cov-file=$BUILD_SOURCESDIRECTORY/.coveragerc" + TEST_CMD="$TEST_CMD --cov sklearn --rcfile=$BUILD_SOURCESDIRECTORY/.coveragerc" fi if [[ -n "$CHECK_WARNINGS" ]]; then From 5cbabf82ef24ff83fbaa4c56d97560262f2187d0 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 22 Apr 2019 08:40:20 -0400 Subject: [PATCH 228/247] noideawatimdoing --- build_tools/azure/test_script.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh index 6b241bbf55a99..ff72dc03f5529 100755 --- a/build_tools/azure/test_script.sh +++ b/build_tools/azure/test_script.sh @@ -25,7 +25,7 @@ TEST_CMD="python -m pytest --showlocals --durations=20 --junitxml=$JUNITXML --py if [[ "$COVERAGE" == "true" ]]; then COVERAGE_PROCESS_START="$BUILD_SOURCESDIRECTORY/.coveragerc" - TEST_CMD="$TEST_CMD --cov sklearn --rcfile=$BUILD_SOURCESDIRECTORY/.coveragerc" + TEST_CMD="$TEST_CMD --rcfile=$BUILD_SOURCESDIRECTORY/.coveragerc" fi if [[ -n "$CHECK_WARNINGS" ]]; then From 42dda67786a3fa25b7e9172c2221175355b12486 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 22 Apr 2019 08:47:49 -0400 Subject: [PATCH 229/247] revert --- build_tools/azure/test_script.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git 
a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh index ff72dc03f5529..c720f6e387c87 100755 --- a/build_tools/azure/test_script.sh +++ b/build_tools/azure/test_script.sh @@ -24,8 +24,7 @@ pip list TEST_CMD="python -m pytest --showlocals --durations=20 --junitxml=$JUNITXML --pyargs" if [[ "$COVERAGE" == "true" ]]; then - COVERAGE_PROCESS_START="$BUILD_SOURCESDIRECTORY/.coveragerc" - TEST_CMD="$TEST_CMD --rcfile=$BUILD_SOURCESDIRECTORY/.coveragerc" + TEST_CMD="$TEST_CMD --cov sklearn" fi if [[ -n "$CHECK_WARNINGS" ]]; then From 6c9f03eed2344c7647f351e686b92b76d7e88f52 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 24 Apr 2019 07:42:59 -0400 Subject: [PATCH 230/247] Trying with parallel = True in coveragerc --- .coveragerc | 1 + sklearn/utils/testing.py | 1 + 2 files changed, 2 insertions(+) diff --git a/.coveragerc b/.coveragerc index 6d76a5bca8235..5e9b307cca251 100644 --- a/.coveragerc +++ b/.coveragerc @@ -2,6 +2,7 @@ branch = True source = sklearn include = */sklearn/* +parallel = True omit = */sklearn/externals/* */benchmarks/* diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 9c23cb3f02fa3..695f38aaaa7c3 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -999,6 +999,7 @@ def assert_run_python_script(source_code, timeout=60): f.write(source_code.encode('utf-8')) cmd = [sys.executable, source_file] cwd = op.normpath(op.join(op.dirname(sklearn.__file__), '..')) + print(cwd) env = os.environ.copy() kwargs = { 'cwd': cwd, From cc980a7667d1577a2980e46d3282a3b4e69cdcab Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 24 Apr 2019 08:04:55 -0400 Subject: [PATCH 231/247] using --cov-config?? --- build_tools/azure/test_script.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh index c720f6e387c87..0ab037993499c 100755 --- a/build_tools/azure/test_script.sh +++ b/build_tools/azure/test_script.sh @@ -24,7 +24,8 @@ pip list TEST_CMD="python -m pytest --showlocals --durations=20 --junitxml=$JUNITXML --pyargs" if [[ "$COVERAGE" == "true" ]]; then - TEST_CMD="$TEST_CMD --cov sklearn" + export COVERAGE_PROCESS_START="$BUILD_SOURCESDIRECTORY/.coveragerc" + TEST_CMD="$TEST_CMD --cov-config=$BUILD_SOURCESDIRECTORY/.coveragerc --cov sklearn" fi if [[ -n "$CHECK_WARNINGS" ]]; then From 66852448ee97f42999044e5be084a67cf9c2906f Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 24 Apr 2019 14:27:31 +0200 Subject: [PATCH 232/247] Small improvements to coverage config --- .coveragerc | 2 -- build_tools/azure/test_script.sh | 6 +++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.coveragerc b/.coveragerc index 5e9b307cca251..1ce5846a34299 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,8 +1,6 @@ [run] branch = True source = sklearn -include = */sklearn/* -parallel = True omit = */sklearn/externals/* */benchmarks/* diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh index 0ab037993499c..4fd3e70da7362 100755 --- a/build_tools/azure/test_script.sh +++ b/build_tools/azure/test_script.sh @@ -21,11 +21,11 @@ except ImportError: python -c "import multiprocessing as mp; print('%d CPUs' % mp.cpu_count())" pip list -TEST_CMD="python -m pytest --showlocals --durations=20 --junitxml=$JUNITXML --pyargs" +TEST_CMD="python -m pytest --showlocals --durations=20 --junitxml=$JUNITXML" if [[ "$COVERAGE" == "true" ]]; then export COVERAGE_PROCESS_START="$BUILD_SOURCESDIRECTORY/.coveragerc" - TEST_CMD="$TEST_CMD 
--cov-config=$BUILD_SOURCESDIRECTORY/.coveragerc --cov sklearn" + TEST_CMD="$TEST_CMD --cov-config=$COVERAGE_PROCESS_START --cov sklearn" fi if [[ -n "$CHECK_WARNINGS" ]]; then @@ -37,5 +37,5 @@ cp setup.cfg $TEST_DIR cd $TEST_DIR set -x -$TEST_CMD sklearn +$TEST_CMD --pyargs sklearn set +x From 49ca47104f992507c8e32381081e1e59443fe63c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 24 Apr 2019 08:28:46 -0400 Subject: [PATCH 233/247] removed include to avoid warning --- .coveragerc | 1 - sklearn/utils/testing.py | 1 - 2 files changed, 2 deletions(-) diff --git a/.coveragerc b/.coveragerc index 5e9b307cca251..1133065a5b248 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,7 +1,6 @@ [run] branch = True source = sklearn -include = */sklearn/* parallel = True omit = */sklearn/externals/* diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 695f38aaaa7c3..9c23cb3f02fa3 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -999,7 +999,6 @@ def assert_run_python_script(source_code, timeout=60): f.write(source_code.encode('utf-8')) cmd = [sys.executable, source_file] cwd = op.normpath(op.join(op.dirname(sklearn.__file__), '..')) - print(cwd) env = os.environ.copy() kwargs = { 'cwd': cwd, From 8bffe2c38ff071688161c47a7a830c476b062ace Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 24 Apr 2019 08:59:50 -0400 Subject: [PATCH 234/247] put back parallel = True --- .coveragerc | 1 + 1 file changed, 1 insertion(+) diff --git a/.coveragerc b/.coveragerc index 1ce5846a34299..1133065a5b248 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,6 +1,7 @@ [run] branch = True source = sklearn +parallel = True omit = */sklearn/externals/* */benchmarks/* From e1deb05b337ad11216fedd83089d41ace077f5fa Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 24 Apr 2019 09:24:22 -0400 Subject: [PATCH 235/247] trying to pass --rcfile to coverage --- build_tools/azure/test_pytest_soft_dependency.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/azure/test_pytest_soft_dependency.sh b/build_tools/azure/test_pytest_soft_dependency.sh index 7fd522cf4b1c5..3ae3dac149a14 100755 --- a/build_tools/azure/test_pytest_soft_dependency.sh +++ b/build_tools/azure/test_pytest_soft_dependency.sh @@ -9,7 +9,7 @@ conda remove -y py pytest || pip uninstall -y py pytest if [[ "$COVERAGE" == "true" ]]; then # Need to append the coverage to the existing .coverage generated by # running the tests - CMD="coverage run --append" + CMD="coverage run --append --rcfile=.coveragerc" else CMD="python" fi From dfbea1d41c6fbfbe4cb1dd697538829bae1f7a27 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 24 Apr 2019 10:11:43 -0400 Subject: [PATCH 236/247] magic --- build_tools/azure/test_pytest_soft_dependency.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/azure/test_pytest_soft_dependency.sh b/build_tools/azure/test_pytest_soft_dependency.sh index 3ae3dac149a14..d478dc53a97c5 100755 --- a/build_tools/azure/test_pytest_soft_dependency.sh +++ b/build_tools/azure/test_pytest_soft_dependency.sh @@ -9,7 +9,7 @@ conda remove -y py pytest || pip uninstall -y py pytest if [[ "$COVERAGE" == "true" ]]; then # Need to append the coverage to the existing .coverage generated by # running the tests - CMD="coverage run --append --rcfile=.coveragerc" + CMD="coverage run --append --rcfile=$TEST_DIR/.coveragerc" else CMD="python" fi From 94b814cf102b0a6e9bae1c25a5318ae64f8b6571 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 24 Apr 2019 10:35:30 -0400 
Subject: [PATCH 237/247] revert magic --- build_tools/azure/test_pytest_soft_dependency.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/azure/test_pytest_soft_dependency.sh b/build_tools/azure/test_pytest_soft_dependency.sh index d478dc53a97c5..3ae3dac149a14 100755 --- a/build_tools/azure/test_pytest_soft_dependency.sh +++ b/build_tools/azure/test_pytest_soft_dependency.sh @@ -9,7 +9,7 @@ conda remove -y py pytest || pip uninstall -y py pytest if [[ "$COVERAGE" == "true" ]]; then # Need to append the coverage to the existing .coverage generated by # running the tests - CMD="coverage run --append --rcfile=$TEST_DIR/.coveragerc" + CMD="coverage run --append --rcfile=.coveragerc" else CMD="python" fi From 66d137681b30f534eee2d9714bc7697e1a088bee Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 25 Apr 2019 09:02:22 -0400 Subject: [PATCH 238/247] magic again --- build_tools/azure/test_pytest_soft_dependency.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/build_tools/azure/test_pytest_soft_dependency.sh b/build_tools/azure/test_pytest_soft_dependency.sh index 3ae3dac149a14..88fa2c71cbcdc 100755 --- a/build_tools/azure/test_pytest_soft_dependency.sh +++ b/build_tools/azure/test_pytest_soft_dependency.sh @@ -9,6 +9,7 @@ conda remove -y py pytest || pip uninstall -y py pytest if [[ "$COVERAGE" == "true" ]]; then # Need to append the coverage to the existing .coverage generated by # running the tests + echo -e "[run]\nbranch = True" > .coveragerc CMD="coverage run --append --rcfile=.coveragerc" else CMD="python" From 7bc7f6e852732926d65cb6b3f0bb773465a8442e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 25 Apr 2019 16:00:29 +0200 Subject: [PATCH 239/247] Update test_pytest_soft_dependency.sh --- build_tools/azure/test_pytest_soft_dependency.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/azure/test_pytest_soft_dependency.sh b/build_tools/azure/test_pytest_soft_dependency.sh index 88fa2c71cbcdc..99a3e93778960 100755 --- a/build_tools/azure/test_pytest_soft_dependency.sh +++ b/build_tools/azure/test_pytest_soft_dependency.sh @@ -9,7 +9,6 @@ conda remove -y py pytest || pip uninstall -y py pytest if [[ "$COVERAGE" == "true" ]]; then # Need to append the coverage to the existing .coverage generated by # running the tests - echo -e "[run]\nbranch = True" > .coveragerc CMD="coverage run --append --rcfile=.coveragerc" else CMD="python" @@ -17,5 +16,6 @@ fi # .coverage from running the tests is in TEST_DIR pushd $TEST_DIR +echo -e "[run]\nbranch = True" > .coveragerc $CMD -m sklearn.utils.tests.test_estimator_checks popd From 6f6fa519978f5ba8aafc522582e9512231545395 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 25 Apr 2019 10:04:03 -0400 Subject: [PATCH 240/247] Trigger CI?? 
From 962c5e4842463e185b3f8b7450f61ab9b95342c8 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 26 Apr 2019 18:03:43 +0200 Subject: [PATCH 241/247] MAINT coverage config for test_pytest_soft_dependency.sh --- build_tools/azure/test_pytest_soft_dependency.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/build_tools/azure/test_pytest_soft_dependency.sh b/build_tools/azure/test_pytest_soft_dependency.sh index 99a3e93778960..3dbb431d4d425 100755 --- a/build_tools/azure/test_pytest_soft_dependency.sh +++ b/build_tools/azure/test_pytest_soft_dependency.sh @@ -8,14 +8,15 @@ conda remove -y py pytest || pip uninstall -y py pytest if [[ "$COVERAGE" == "true" ]]; then # Need to append the coverage to the existing .coverage generated by - # running the tests - CMD="coverage run --append --rcfile=.coveragerc" + # running the tests. Make sure to reuse the same coverage + # configuration as the one used by the main pytest run to be + # able to combine the results. + CMD="coverage run --append --rcfile=../.coveragerc" else CMD="python" fi # .coverage from running the tests is in TEST_DIR pushd $TEST_DIR -echo -e "[run]\nbranch = True" > .coveragerc $CMD -m sklearn.utils.tests.test_estimator_checks popd From 10cb5be8c6d1a08d11c0f28dd8e3fb2fe7a48c4c Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 26 Apr 2019 18:04:03 +0200 Subject: [PATCH 242/247] Try to omit any setup.py file from the coverage report --- .coveragerc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.coveragerc b/.coveragerc index 1133065a5b248..7f1b3b706cace 100644 --- a/.coveragerc +++ b/.coveragerc @@ -5,4 +5,4 @@ parallel = True omit = */sklearn/externals/* */benchmarks/* - */setup.py + **/setup.py From 8adb9f013604f5a7e3ca3a5f1983f710d87e3c1c Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 26 Apr 2019 18:34:56 +0200 Subject: [PATCH 243/247] TEST_DIR is not a subfolder of BUILD_SOURCESDIRECTORY --- build_tools/azure/test_pytest_soft_dependency.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/azure/test_pytest_soft_dependency.sh b/build_tools/azure/test_pytest_soft_dependency.sh index 3dbb431d4d425..ce9906436413e 100755 --- a/build_tools/azure/test_pytest_soft_dependency.sh +++ b/build_tools/azure/test_pytest_soft_dependency.sh @@ -11,7 +11,7 @@ if [[ "$COVERAGE" == "true" ]]; then # running the tests. Make sure to reuse the same coverage # configuration as the one used by the main pytest run to be # able to combine the results. - CMD="coverage run --append --rcfile=../.coveragerc" + CMD="coverage run --append --rcfile=$BUILD_SOURCESDIRECTORY/.coveragerc" else CMD="python" fi From 406cec1e6cc028684f6080a6504a25579d4e2590 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 26 Apr 2019 19:24:18 +0200 Subject: [PATCH 244/247] One more try --- build_tools/azure/test_pytest_soft_dependency.sh | 2 +- build_tools/azure/upload_codecov.sh | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/build_tools/azure/test_pytest_soft_dependency.sh b/build_tools/azure/test_pytest_soft_dependency.sh index ce9906436413e..28eacacc27d42 100755 --- a/build_tools/azure/test_pytest_soft_dependency.sh +++ b/build_tools/azure/test_pytest_soft_dependency.sh @@ -11,7 +11,7 @@ if [[ "$COVERAGE" == "true" ]]; then # running the tests. Make sure to reuse the same coverage # configuration as the one used by the main pytest run to be # able to combine the results. 
- CMD="coverage run --append --rcfile=$BUILD_SOURCESDIRECTORY/.coveragerc" + CMD="coverage run --rcfile=$BUILD_SOURCESDIRECTORY/.coveragerc" else CMD="python" fi diff --git a/build_tools/azure/upload_codecov.sh b/build_tools/azure/upload_codecov.sh index e9f801b3be5f5..1099efd4b1b86 100755 --- a/build_tools/azure/upload_codecov.sh +++ b/build_tools/azure/upload_codecov.sh @@ -8,6 +8,7 @@ source activate $VIRTUALENV # Need to run codecov from a git checkout, so we copy .coverage # from TEST_DIR where pytest has been run +coverage combine cp $TEST_DIR/.coverage $BUILD_REPOSITORY_LOCALPATH codecov --root $BUILD_REPOSITORY_LOCALPATH -t $CODECOV_TOKEN || echo "codecov upload failed" From 9d8269aeba72705adcaaf237951f8fa5aa6191eb Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 26 Apr 2019 19:48:45 +0200 Subject: [PATCH 245/247] coverage combine in TEST_DIR --- build_tools/azure/upload_codecov.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/build_tools/azure/upload_codecov.sh b/build_tools/azure/upload_codecov.sh index 1099efd4b1b86..ab6c14082ea7a 100755 --- a/build_tools/azure/upload_codecov.sh +++ b/build_tools/azure/upload_codecov.sh @@ -8,7 +8,9 @@ source activate $VIRTUALENV # Need to run codecov from a git checkout, so we copy .coverage # from TEST_DIR where pytest has been run +pushd $TEST_DIR coverage combine +popd cp $TEST_DIR/.coverage $BUILD_REPOSITORY_LOCALPATH codecov --root $BUILD_REPOSITORY_LOCALPATH -t $CODECOV_TOKEN || echo "codecov upload failed" From d63d9db34a8dac5cb682050f486ee137fa3c1d88 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 26 Apr 2019 20:22:17 +0200 Subject: [PATCH 246/247] remove useless pass --- sklearn/ensemble/_hist_gradient_boosting/loss.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index 88f4f1f7a08a4..5d7c68ea0b38f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -79,7 +79,6 @@ def get_baseline_prediction(self, y_train, prediction_dim): baseline_prediction : float or ndarray, shape (1, prediction_dim) The baseline prediction. """ - pass @abstractmethod def update_gradients_and_hessians(self, gradients, hessians, y_true, @@ -103,7 +102,6 @@ def update_gradients_and_hessians(self, gradients, hessians, y_true, The raw_predictions (i.e. values from the trees) of the tree ensemble at iteration ``i - 1``. """ - pass class LeastSquares(BaseLoss): From 280c487a2a50fca313c60d53d3b7bd2eac95631d Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 26 Apr 2019 20:22:36 +0200 Subject: [PATCH 247/247] omit */setup.py --- .coveragerc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.coveragerc b/.coveragerc index 7f1b3b706cace..1133065a5b248 100644 --- a/.coveragerc +++ b/.coveragerc @@ -5,4 +5,4 @@ parallel = True omit = */sklearn/externals/* */benchmarks/* - **/setup.py + */setup.py