
Commit af9c1cb

WIP refactoring tree criterion evaluation & other optimizations
1 parent e2e5fbd commit af9c1cb

File tree

11 files changed: +31156 additions, -25079 deletions

sklearn/ensemble/_gradient_boosting.c

Lines changed: 591 additions & 388 deletions
Some generated files are not rendered by default.

sklearn/ensemble/forest.py

Lines changed: 21 additions & 11 deletions
@@ -40,6 +40,7 @@ class calls the ``fit`` method of each sub-estimator on random samples
 # License: BSD 3 clause

 from __future__ import division
+import sys

 import warnings
 from warnings import warn
@@ -89,7 +90,7 @@ def _generate_unsampled_indices(random_state, n_samples):
     return unsampled_indices


 def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
-                          verbose=0, class_weight=None):
+                          presort, X_idx_sorted, verbose=0, class_weight=None):
     """Private function used to fit a single tree in parallel."""
     if verbose > 1:
         print("building tree %d of %d" % (tree_idx + 1, n_trees))
@@ -111,10 +112,11 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
             curr_sample_weight *= compute_sample_weight('auto', y, indices)
         elif class_weight == 'balanced_subsample':
             curr_sample_weight *= compute_sample_weight('balanced', y, indices)
-
-        tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
+        tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False,
+                 presort=presort, X_idx_sorted=X_idx_sorted)
     else:
-        tree.fit(X, y, sample_weight=sample_weight, check_input=False)
+        tree.fit(X, y, sample_weight=sample_weight, check_input=False,
+                 presort=presort, X_idx_sorted=X_idx_sorted)

     return tree

@@ -181,7 +183,7 @@ def apply(self, X):

         return np.array(results).T

-    def fit(self, X, y, sample_weight=None):
+    def fit(self, X, y, sample_weight=None, presort=False):
         """Build a forest of trees from the training set (X, y).

         Parameters
@@ -202,6 +204,11 @@ def fit(self, X, y, sample_weight=None):
             classification, splits are also ignored if they would result in any
             single class carrying a negative weight in either child node.

+        presort : boolean (default=False)
+            Presort the dataset. Presorting works well with small trees and
+            small datasets, but can take significantly longer with bigger
+            datasets or deep trees.
+
         Returns
         -------
         self : object
@@ -257,6 +264,12 @@ def fit(self, X, y, sample_weight=None):

         n_more_estimators = self.n_estimators - len(self.estimators_)

+        if presort:
+            X_idx_sorted = np.asfortranarray(np.argsort(X, axis=0),
+                                             dtype=np.int32)
+        else:
+            X_idx_sorted = None
+
         if n_more_estimators < 0:
             raise ValueError('n_estimators=%d must be larger or equal to '
                              'len(estimators_)=%d when warm_start==True'
@@ -284,8 +297,8 @@ def fit(self, X, y, sample_weight=None):
             trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                              backend="threading")(
                 delayed(_parallel_build_trees)(
-                    t, self, X, y, sample_weight, i, len(trees),
-                    verbose=self.verbose, class_weight=self.class_weight)
+                    t, self, X, y, sample_weight, i, len(trees), presort,
+                    X_idx_sorted, self.verbose, self.class_weight)
                 for i, t in enumerate(trees))

             # Collect newly grown trees
@@ -491,20 +504,17 @@ def predict(self, X):
         y : array of shape = [n_samples] or [n_samples, n_outputs]
             The predicted classes.
         """
-        proba = self.predict_proba(X)

+        proba = self.predict_proba(X)
         if self.n_outputs_ == 1:
             return self.classes_.take(np.argmax(proba, axis=1), axis=0)
-
         else:
             n_samples = proba[0].shape[0]
             predictions = np.zeros((n_samples, self.n_outputs_))
-
             for k in range(self.n_outputs_):
                 predictions[:, k] = self.classes_[k].take(np.argmax(proba[k],
                                                                     axis=1),
                                                           axis=0)
-
             return predictions

     def predict_proba(self, X):
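
Aside (not part of the commit): a minimal sketch of the presorting step the forest.py changes add to BaseForest.fit. When presort=True, fit argsorts each feature column once and hands the same Fortran-ordered int32 index array to every tree via _parallel_build_trees. The snippet below only reproduces that construction with plain NumPy; the toy matrix X is made up for illustration.

    import numpy as np

    # Same construction the patch adds to BaseForest.fit (see diff above).
    # Column j of X_idx_sorted lists the row indices that put feature j in
    # ascending order; int32 dtype and Fortran order follow the diff.
    X = np.random.RandomState(0).rand(6, 3)   # toy data: 6 samples, 3 features
    X_idx_sorted = np.asfortranarray(np.argsort(X, axis=0), dtype=np.int32)

    assert X_idx_sorted.shape == X.shape
    assert X_idx_sorted.flags['F_CONTIGUOUS']
    # reading feature 0 in presorted order gives a non-decreasing column
    assert np.all(np.diff(X[X_idx_sorted[:, 0], 0]) >= 0)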

sklearn/ensemble/gradient_boosting.py

Lines changed: 13 additions & 21 deletions
@@ -45,8 +45,6 @@

 from ..tree.tree import DecisionTreeRegressor
 from ..tree._tree import DTYPE, TREE_LEAF
-from ..tree._tree import PresortBestSplitter
-from ..tree._tree import FriedmanMSE

 from ._gradient_boosting import predict_stages
 from ._gradient_boosting import predict_stage
@@ -731,8 +729,8 @@ def __init__(self, loss, learning_rate, n_estimators, min_samples_split,

         self.estimators_ = np.empty((0, 0), dtype=np.object)

-    def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask,
-                   criterion, splitter, random_state):
+    def _fit_stage(self, i, X, X_idx_sorted, y, y_pred, sample_weight, sample_mask,
+                   random_state):
         """Fit another stage of ``n_classes_`` trees to the boosting model. """

         assert sample_mask.dtype == np.bool
@@ -748,8 +746,8 @@ def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask,

             # induce regression tree on residuals
             tree = DecisionTreeRegressor(
-                criterion=criterion,
-                splitter=splitter,
+                criterion='mse',
+                splitter='best',
                 max_depth=self.max_depth,
                 min_samples_split=self.min_samples_split,
                 min_samples_leaf=self.min_samples_leaf,
@@ -763,7 +761,7 @@ def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask,
                 sample_weight = sample_weight * sample_mask.astype(np.float64)

             tree.fit(X, residual, sample_weight=sample_weight,
-                     check_input=False)
+                     check_input=False, presort=True, X_idx_sorted=X_idx_sorted)

             # update tree leaves
             loss.update_terminal_regions(tree.tree_, X, y, residual, y_pred,
@@ -975,9 +973,12 @@ def fit(self, X, y, sample_weight=None, monitor=None):
             y_pred = self._decision_function(X)
             self._resize_state()

+        X_idx_sorted = np.asfortranarray(np.argsort(X, axis=0),
+                                         dtype=np.int32)
+
         # fit the boosting stages
-        n_stages = self._fit_stages(X, y, y_pred, sample_weight, random_state,
-                                    begin_at_stage, monitor)
+        n_stages = self._fit_stages(X, X_idx_sorted, y, y_pred, sample_weight,
+                                    random_state, begin_at_stage, monitor)
         # change shape of arrays after fit (early-stopping or additional ests)
         if n_stages != self.estimators_.shape[0]:
             self.estimators_ = self.estimators_[:n_stages]
@@ -987,7 +988,7 @@ def fit(self, X, y, sample_weight=None, monitor=None):

         return self

-    def _fit_stages(self, X, y, y_pred, sample_weight, random_state,
+    def _fit_stages(self, X, X_idx_sorted, y, y_pred, sample_weight, random_state,
                     begin_at_stage=0, monitor=None):
         """Iteratively fits the stages.

@@ -1009,14 +1010,6 @@ def _fit_stages(self, X, y, y_pred, sample_weight, random_state,
         else:
             min_weight_leaf = 0.

-        # init criterion and splitter
-        criterion = FriedmanMSE(1)
-        splitter = PresortBestSplitter(criterion,
-                                       self.max_features_,
-                                       self.min_samples_leaf,
-                                       min_weight_leaf,
-                                       random_state)
-
         if self.verbose:
             verbose_reporter = VerboseReporter(self.verbose)
             verbose_reporter.init(self, begin_at_stage)
@@ -1035,9 +1028,8 @@ def _fit_stages(self, X, y, y_pred, sample_weight, random_state,
                                                  sample_weight[~sample_mask])

             # fit next stage of trees
-            y_pred = self._fit_stage(i, X, y, y_pred, sample_weight,
-                                     sample_mask, criterion, splitter,
-                                     random_state)
+            y_pred = self._fit_stage(i, X, X_idx_sorted, y, y_pred, sample_weight,
+                                     sample_mask, random_state)

             # track deviance (= loss)
             if do_oob:
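
For context, a hedged sketch of the pattern the gradient_boosting.py changes move to: fit() argsorts the training matrix once, and every boosting stage reuses that X_idx_sorted when fitting its regression tree with presort=True, instead of rebuilding a PresortBestSplitter/FriedmanMSE pair inside _fit_stages. Only the argsort construction comes from the diff; build_stage below is a hypothetical stand-in for the per-stage tree fit.

    import numpy as np

    def fit_stages_sketch(X, n_stages, build_stage):
        # sort each feature once per fit() call, as the diff above does
        X_idx_sorted = np.asfortranarray(np.argsort(X, axis=0), dtype=np.int32)
        for i in range(n_stages):
            # every stage receives the same precomputed index array
            build_stage(i, X, X_idx_sorted)

    # toy usage with a stand-in stage builder (hypothetical, for illustration)
    X = np.random.RandomState(0).rand(8, 4)
    fit_stages_sketch(X, n_stages=3,
                      build_stage=lambda i, X, idx: print("stage", i, idx.shape))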