Commit 02c0029: ENH gbt sparse support
Parent: c82ad6d

16 files changed: +3745, -4859 lines

doc/modules/tree.rst

Lines changed: 6 additions & 3 deletions
@@ -312,10 +312,13 @@ total cost over the entire trees (by summing the cost at each node) of
 Scikit-learn offers a more efficient implementation for the construction of
 decision trees. A naive implementation (as above) would recompute the class
 label histograms (for classification) or the means (for regression) at for each
-new split point along a given feature. By presorting the feature over all
-relevant samples, and retaining a running label count, we reduce the complexity
+new split point along a given feature. Presorting the feature over all
+relevant samples, and retaining a running label count, will reduce the complexity
 at each node to :math:`O(n_{features}\log(n_{samples}))`, which results in a
-total cost of :math:`O(n_{features}n_{samples}\log(n_{samples}))`.
+total cost of :math:`O(n_{features}n_{samples}\log(n_{samples}))`. This is an option
+for all tree based algorithms. By default it is turned on for gradient boosting,
+where in general it makes training faster, but turned off for all other algorithms as
+it tends to slow down training when training deep trees.
 
 
 Tips on practical use
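
The complexity claim above is easy to see in miniature. Below is a toy, pure-NumPy sketch of the presorting idea (the real splitter is implemented in Cython, and every name here is illustrative): sort each feature once up front, then scan any node's candidate splits in order while maintaining a running label count, instead of re-sorting at every node.

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(100, 5)                 # n_samples x n_features
y = (X[:, 0] > 0.5).astype(int)

# Presort once: for each feature, the sample indices in sorted order.
# This is the O(n_features * n_samples * log(n_samples)) cost paid up front.
X_idx_sorted = np.argsort(X, axis=0)

# At any node, scanning candidate splits for a feature is now a single
# ordered pass with a running label count; no re-sorting is required.
feature = 0
order = X_idx_sorted[:, feature]
pos_left = 0                         # running count of positive labels on the left
for rank, i in enumerate(order[:-1]):
    pos_left += y[i]
    threshold = (X[i, feature] + X[order[rank + 1], feature]) / 2.0
    # ...evaluate the impurity of splitting at `threshold` using pos_left...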

doc/whats_new.rst

Lines changed: 5 additions & 0 deletions
@@ -202,6 +202,11 @@ Enhancements
    - Added ``sample_weight`` support to :class:`linear_model.LogisticRegression` for
      the ``lbfgs``, ``newton-cg``, and ``sag`` solvers. By `Valentin Stolbunov`_.
 
+   - Added optional parameter ``presort`` to :class:`ensemble.GradientBoostingRegressor`
+     and :class:`ensemble.GradientBoostingClassifier`, keeping default behavior
+     the same. This allows gradient boosters to turn off presorting when building
+     deep trees or using sparse data. By `Jacob Schreiber`_.
+
 Bug fixes
 .........
 
sklearn/ensemble/_gradient_boosting.c

Lines changed: 18 additions & 3 deletions
Some generated files are not rendered by default.

sklearn/ensemble/gradient_boosting.py

Lines changed: 96 additions & 45 deletions
@@ -22,36 +22,48 @@
 
 from __future__ import print_function
 from __future__ import division
-from abc import ABCMeta, abstractmethod
-from time import time
-
-import numbers
-import numpy as np
 
-from scipy import stats
+from abc import ABCMeta
+from abc import abstractmethod
 
 from .base import BaseEnsemble
 from ..base import BaseEstimator
 from ..base import ClassifierMixin
 from ..base import RegressorMixin
-from ..utils import check_random_state, check_array, check_X_y, column_or_1d
-from ..utils import check_consistent_length, deprecated
-from ..utils.extmath import logsumexp
-from ..utils.fixes import expit, bincount
-from ..utils.stats import _weighted_percentile
-from ..utils.validation import check_is_fitted, NotFittedError
+
 from ..externals import six
 from ..feature_selection.from_model import _LearntSelectorMixin
 
-from ..tree.tree import DecisionTreeRegressor
-from ..tree._tree import DTYPE, TREE_LEAF
-from ..tree._splitter import PresortBestSplitter
-from ..tree._criterion import FriedmanMSE
-
 from ._gradient_boosting import predict_stages
 from ._gradient_boosting import predict_stage
 from ._gradient_boosting import _random_sample_mask
 
+import numbers
+import numpy as np
+
+from scipy import stats
+from scipy.sparse import csc_matrix
+from scipy.sparse import csr_matrix
+from scipy.sparse import issparse
+
+from time import time
+from ..tree.tree import DecisionTreeRegressor
+from ..tree._tree import DTYPE
+from ..tree._tree import TREE_LEAF
+
+from ..utils import check_random_state
+from ..utils import check_array
+from ..utils import check_X_y
+from ..utils import column_or_1d
+from ..utils import check_consistent_length
+from ..utils import deprecated
+from ..utils.extmath import logsumexp
+from ..utils.fixes import expit
+from ..utils.fixes import bincount
+from ..utils.stats import _weighted_percentile
+from ..utils.validation import check_is_fitted
+from ..utils.validation import NotFittedError
+
 
 class QuantileEstimator(BaseEstimator):
     """An estimator predicting the alpha-quantile of the training targets."""
@@ -711,7 +723,7 @@ def __init__(self, loss, learning_rate, n_estimators, min_samples_split,
                  min_samples_leaf, min_weight_fraction_leaf,
                  max_depth, init, subsample, max_features,
                  random_state, alpha=0.9, verbose=0, max_leaf_nodes=None,
-                 warm_start=False):
+                 warm_start=False, presort='auto'):
 
         self.n_estimators = n_estimators
         self.learning_rate = learning_rate
@@ -728,11 +740,12 @@ def __init__(self, loss, learning_rate, n_estimators, min_samples_split,
         self.verbose = verbose
         self.max_leaf_nodes = max_leaf_nodes
         self.warm_start = warm_start
+        self.presort = presort
 
         self.estimators_ = np.empty((0, 0), dtype=np.object)
 
     def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask,
-                   criterion, splitter, random_state):
+                   random_state, X_idx_sorted, X_csc=None, X_csr=None):
         """Fit another stage of ``n_classes_`` trees to the boosting model. """
 
         assert sample_mask.dtype == np.bool
@@ -748,27 +761,37 @@ def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask,
 
             # induce regression tree on residuals
             tree = DecisionTreeRegressor(
-                criterion=criterion,
-                splitter=splitter,
+                criterion='friedman_mse',
+                splitter='best',
                 max_depth=self.max_depth,
                 min_samples_split=self.min_samples_split,
                 min_samples_leaf=self.min_samples_leaf,
                 min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                 max_features=self.max_features,
                 max_leaf_nodes=self.max_leaf_nodes,
-                random_state=random_state)
+                random_state=random_state,
+                presort=self.presort)
 
             if self.subsample < 1.0:
                 # no inplace multiplication!
                 sample_weight = sample_weight * sample_mask.astype(np.float64)
 
-            tree.fit(X, residual, sample_weight=sample_weight,
-                     check_input=False)
+            if X_csc is not None:
+                tree.fit(X_csc, residual, sample_weight=sample_weight,
+                         check_input=False, X_idx_sorted=X_idx_sorted)
+            else:
+                tree.fit(X, residual, sample_weight=sample_weight,
+                         check_input=False, X_idx_sorted=X_idx_sorted)
 
             # update tree leaves
-            loss.update_terminal_regions(tree.tree_, X, y, residual, y_pred,
-                                         sample_weight, sample_mask,
-                                         self.learning_rate, k=k)
+            if X_csr is not None:
+                loss.update_terminal_regions(tree.tree_, X_csr, y, residual, y_pred,
+                                             sample_weight, sample_mask,
+                                             self.learning_rate, k=k)
+            else:
+                loss.update_terminal_regions(tree.tree_, X, y, residual, y_pred,
+                                             sample_weight, sample_mask,
+                                             self.learning_rate, k=k)
 
             # add tree to ensemble
             self.estimators_[i, k] = tree
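
For orientation, here is a minimal dense sketch of what one `_fit_stage` call amounts to under least-squares loss. `fit_one_stage` is a hypothetical standalone helper, not library code; it omits subsampling, the multi-class loop over `k`, and the sparse `X_csc`/`X_csr` branches shown above.

from sklearn.tree import DecisionTreeRegressor

def fit_one_stage(X, y, y_pred, learning_rate=0.1, max_depth=3):
    # residuals are the negative gradient of squared-error loss
    residual = y - y_pred
    tree = DecisionTreeRegressor(criterion='friedman_mse', splitter='best',
                                 max_depth=max_depth)
    tree.fit(X, residual)
    # shrink the new tree's contribution by the learning rate
    return tree, y_pred + learning_rate * tree.predict(X)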
@@ -944,7 +967,7 @@ def fit(self, X, y, sample_weight=None, monitor=None):
             self._clear_state()
 
         # Check input
-        X, y = check_X_y(X, y, dtype=DTYPE)
+        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=DTYPE)
         n_samples, self.n_features = X.shape
         if sample_weight is None:
             sample_weight = np.ones(n_samples, dtype=np.float32)
@@ -981,9 +1004,25 @@ def fit(self, X, y, sample_weight=None, monitor=None):
             y_pred = self._decision_function(X)
             self._resize_state()
 
+        X_idx_sorted = None
+        presort = self.presort
+        # Allow presort to be 'auto', which means True if the dataset is dense,
+        # otherwise it will be False.
+        if presort == 'auto' and issparse(X):
+            presort = False
+        elif presort == 'auto':
+            presort = True
+
+        if presort == True:
+            if issparse(X):
+                raise ValueError("Presorting is not supported for sparse matrices.")
+            else:
+                X_idx_sorted = np.asfortranarray(np.argsort(X, axis=0),
+                                                 dtype=np.int32)
+
         # fit the boosting stages
         n_stages = self._fit_stages(X, y, y_pred, sample_weight, random_state,
-                                    begin_at_stage, monitor)
+                                    begin_at_stage, monitor, X_idx_sorted)
         # change shape of arrays after fit (early-stopping or additional ests)
         if n_stages != self.estimators_.shape[0]:
             self.estimators_ = self.estimators_[:n_stages]
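
Restated outside the class, the 'auto' resolution above reduces to the following decision rule (`resolve_presort` is a hypothetical helper with the same semantics as the diff):

import numpy as np
from scipy.sparse import issparse

def resolve_presort(presort, X):
    # 'auto' means: presort iff the input is dense.
    if presort == 'auto':
        presort = not issparse(X)
    if presort and issparse(X):
        raise ValueError("Presorting is not supported for sparse matrices.")
    X_idx_sorted = None
    if presort:
        # Fortran order matches the column-wise access pattern of the splitter.
        X_idx_sorted = np.asfortranarray(np.argsort(X, axis=0), dtype=np.int32)
    return X_idx_sorted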
@@ -994,7 +1033,7 @@ def fit(self, X, y, sample_weight=None, monitor=None):
         return self
 
     def _fit_stages(self, X, y, y_pred, sample_weight, random_state,
-                    begin_at_stage=0, monitor=None):
+                    begin_at_stage=0, monitor=None, X_idx_sorted=None):
         """Iteratively fits the stages.
 
         For each stage it computes the progress (OOB, train score)
@@ -1015,18 +1054,13 @@ def _fit_stages(self, X, y, y_pred, sample_weight, random_state,
         else:
             min_weight_leaf = 0.
 
-        # init criterion and splitter
-        criterion = FriedmanMSE(1)
-        splitter = PresortBestSplitter(criterion,
-                                       self.max_features_,
-                                       self.min_samples_leaf,
-                                       min_weight_leaf,
-                                       random_state)
-
         if self.verbose:
             verbose_reporter = VerboseReporter(self.verbose)
             verbose_reporter.init(self, begin_at_stage)
 
+        X_csc = csc_matrix(X) if issparse(X) else None
+        X_csr = csr_matrix(X) if issparse(X) else None
+
         # perform boosting iterations
         i = begin_at_stage
         for i in range(begin_at_stage, self.n_estimators):
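
Converting once, before the boosting loop, is deliberate: CSC supports the column-wise scans used in split search, CSR the row-wise access used when updating per-sample predictions, and a single conversion is amortized over all `n_estimators` stages. A small sketch of the access-pattern difference:

from scipy.sparse import random as sparse_random

X = sparse_random(10000, 100, density=0.01, format='coo', random_state=0)
X_csc = X.tocsc()      # efficient column slicing: X_csc[:, j]
X_csr = X.tocsr()      # efficient row slicing:    X_csr.getrow(i)

col = X_csc[:, 3]      # cheap on CSC, would be costly on CSR
row = X_csr.getrow(5)  # cheap on CSR, would be costly on CSC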
@@ -1042,8 +1076,8 @@ def _fit_stages(self, X, y, y_pred, sample_weight, random_state,
 
             # fit next stage of trees
             y_pred = self._fit_stage(i, X, y, y_pred, sample_weight,
-                                     sample_mask, criterion, splitter,
-                                     random_state)
+                                     sample_mask, random_state, X_idx_sorted,
+                                     X_csc, X_csr)
 
             # track deviance (= loss)
             if do_oob:
@@ -1074,6 +1108,7 @@ def _make_estimator(self, append=True):
     def _init_decision_function(self, X):
         """Check input and compute prediction of ``init``. """
         self._check_initialized()
+        X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True)
         if X.shape[1] != self.n_features:
             raise ValueError("X.shape[1] should be {0:d}, not {1:d}.".format(
                 self.n_features, X.shape[1]))
@@ -1104,7 +1139,9 @@ def decision_function(self, X):
             Regression and binary classification produce an array of shape
             [n_samples].
         """
-        X = check_array(X, dtype=DTYPE, order="C")
+
+        self._check_initialized()
+        X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True)
         score = self._decision_function(X)
         if score.shape[1] == 1:
             return score.ravel()
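
Routing prediction-time validation through the first tree's `_validate_X_predict`, together with the sparse-aware `fit` path, is what lets a fitted booster accept sparse input end to end. An illustrative usage sketch on synthetic data (arbitrary settings, not taken from the commit):

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.ensemble import GradientBoostingRegressor

rng = np.random.RandomState(0)
dense = rng.rand(200, 20) * (rng.rand(200, 20) > 0.8)  # mostly zeros
X, y = csr_matrix(dense), rng.rand(200)

model = GradientBoostingRegressor(n_estimators=10)  # presort='auto' resolves to False
model.fit(X, y)
pred = model.predict(X)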
@@ -1318,6 +1355,12 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin):
         If None, the random number generator is the RandomState instance used
         by `np.random`.
 
+    presort : bool or 'auto', optional (default='auto')
+        Whether to presort the data to speed up the finding of best splits in
+        fitting. Auto mode by default will use presorting on dense data and
+        default to normal sorting on sparse data. Setting presort to true on
+        sparse data will raise an error.
+
     Attributes
     ----------
     feature_importances_ : array, shape = [n_features]
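
For reference, the three settings the docstring above admits (illustrative only):

from sklearn.ensemble import GradientBoostingClassifier

clf_auto = GradientBoostingClassifier(presort='auto')  # presort iff X is dense
clf_on = GradientBoostingClassifier(presort=True)      # dense input only; sparse raises
clf_off = GradientBoostingClassifier(presort=False)    # e.g. when growing deep trees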
@@ -1369,7 +1412,8 @@ def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100,
                  min_samples_leaf=1, min_weight_fraction_leaf=0.,
                  max_depth=3, init=None, random_state=None,
                  max_features=None, verbose=0,
-                 max_leaf_nodes=None, warm_start=False):
+                 max_leaf_nodes=None, warm_start=False,
+                 presort='auto'):
 
         super(GradientBoostingClassifier, self).__init__(
             loss=loss, learning_rate=learning_rate, n_estimators=n_estimators,
@@ -1379,7 +1423,8 @@ def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100,
             max_depth=max_depth, init=init, subsample=subsample,
             max_features=max_features,
             random_state=random_state, verbose=verbose,
-            max_leaf_nodes=max_leaf_nodes, warm_start=warm_start)
+            max_leaf_nodes=max_leaf_nodes, warm_start=warm_start,
+            presort=presort)
 
     def _validate_y(self, y):
         self.classes_, y = np.unique(y, return_inverse=True)
@@ -1644,6 +1689,11 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin):
         If None, the random number generator is the RandomState instance used
         by `np.random`.
 
+    presort : bool or 'auto', optional (default='auto')
+        Whether to presort the data to speed up the finding of best splits in
+        fitting. Auto mode by default will use presorting on dense data and
+        default to normal sorting on sparse data. Setting presort to true on
+        sparse data will raise an error.
 
     Attributes
     ----------
@@ -1693,7 +1743,7 @@ def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100,
                  min_samples_leaf=1, min_weight_fraction_leaf=0.,
                  max_depth=3, init=None, random_state=None,
                  max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None,
-                 warm_start=False):
+                 warm_start=False, presort='auto'):
 
         super(GradientBoostingRegressor, self).__init__(
             loss=loss, learning_rate=learning_rate, n_estimators=n_estimators,
@@ -1703,7 +1753,8 @@ def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100,
             max_depth=max_depth, init=init, subsample=subsample,
             max_features=max_features,
             random_state=random_state, alpha=alpha, verbose=verbose,
-            max_leaf_nodes=max_leaf_nodes, warm_start=warm_start)
+            max_leaf_nodes=max_leaf_nodes, warm_start=warm_start,
+            presort=presort)
 
     def predict(self, X):
         """Predict regression target for X.
