from __future__ import print_function
from __future__ import division

- from abc import ABCMeta, abstractmethod
- from time import time
-
- import numbers
- import numpy as np

- from scipy import stats
+ from abc import ABCMeta
+ from abc import abstractmethod

from .base import BaseEnsemble
from ..base import BaseEstimator
from ..base import ClassifierMixin
from ..base import RegressorMixin
- from ..utils import check_random_state, check_array, check_X_y, column_or_1d
- from ..utils import check_consistent_length, deprecated
- from ..utils.extmath import logsumexp
- from ..utils.fixes import expit, bincount
- from ..utils.stats import _weighted_percentile
- from ..utils.validation import check_is_fitted, NotFittedError
+
from ..externals import six
from ..feature_selection.from_model import _LearntSelectorMixin

- from ..tree.tree import DecisionTreeRegressor
- from ..tree._tree import DTYPE, TREE_LEAF
- from ..tree._splitter import PresortBestSplitter
- from ..tree._criterion import FriedmanMSE
-
from ._gradient_boosting import predict_stages
from ._gradient_boosting import predict_stage
from ._gradient_boosting import _random_sample_mask

+ import numbers
+ import numpy as np
+
+ from scipy import stats
+ from scipy.sparse import csc_matrix
+ from scipy.sparse import csr_matrix
+ from scipy.sparse import issparse
+
+ from time import time
+ from ..tree.tree import DecisionTreeRegressor
+ from ..tree._tree import DTYPE
+ from ..tree._tree import TREE_LEAF
+
+ from ..utils import check_random_state
+ from ..utils import check_array
+ from ..utils import check_X_y
+ from ..utils import column_or_1d
+ from ..utils import check_consistent_length
+ from ..utils import deprecated
+ from ..utils.extmath import logsumexp
+ from ..utils.fixes import expit
+ from ..utils.fixes import bincount
+ from ..utils.stats import _weighted_percentile
+ from ..utils.validation import check_is_fitted
+ from ..utils.validation import NotFittedError
+


class QuantileEstimator(BaseEstimator):
    """An estimator predicting the alpha-quantile of the training targets."""
@@ -711,7 +723,7 @@ def __init__(self, loss, learning_rate, n_estimators, min_samples_split,
                 min_samples_leaf, min_weight_fraction_leaf,
                 max_depth, init, subsample, max_features,
                 random_state, alpha=0.9, verbose=0, max_leaf_nodes=None,
-                warm_start=False):
+                warm_start=False, presort='auto'):

        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
@@ -728,11 +740,12 @@ def __init__(self, loss, learning_rate, n_estimators, min_samples_split,
        self.verbose = verbose
        self.max_leaf_nodes = max_leaf_nodes
        self.warm_start = warm_start
+       self.presort = presort

        self.estimators_ = np.empty((0, 0), dtype=np.object)

    def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask,
-                  criterion, splitter, random_state):
+                  random_state, X_idx_sorted, X_csc=None, X_csr=None):
        """Fit another stage of ``n_classes_`` trees to the boosting model."""

        assert sample_mask.dtype == np.bool
@@ -748,27 +761,37 @@ def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask,

            # induce regression tree on residuals
            tree = DecisionTreeRegressor(
-               criterion=criterion,
-               splitter=splitter,
+               criterion='friedman_mse',
+               splitter='best',
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                max_features=self.max_features,
                max_leaf_nodes=self.max_leaf_nodes,
-               random_state=random_state)
+               random_state=random_state,
+               presort=self.presort)

            if self.subsample < 1.0:
                # no inplace multiplication!
                sample_weight = sample_weight * sample_mask.astype(np.float64)

-           tree.fit(X, residual, sample_weight=sample_weight,
-                    check_input=False)
+           if X_csc is not None:
+               tree.fit(X_csc, residual, sample_weight=sample_weight,
+                        check_input=False, X_idx_sorted=X_idx_sorted)
+           else:
+               tree.fit(X, residual, sample_weight=sample_weight,
+                        check_input=False, X_idx_sorted=X_idx_sorted)

            # update tree leaves
-           loss.update_terminal_regions(tree.tree_, X, y, residual, y_pred,
-                                        sample_weight, sample_mask,
-                                        self.learning_rate, k=k)
+           if X_csr is not None:
+               loss.update_terminal_regions(tree.tree_, X_csr, y, residual, y_pred,
+                                            sample_weight, sample_mask,
+                                            self.learning_rate, k=k)
+           else:
+               loss.update_terminal_regions(tree.tree_, X, y, residual, y_pred,
+                                            sample_weight, sample_mask,
+                                            self.learning_rate, k=k)

            # add tree to ensemble
            self.estimators_[i, k] = tree
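
An editor's note on the two sparse views used above: each tree is fit on a CSC view of `X`, while the leaf updates walk a CSR view. This mirrors scipy's access costs: CSC slices columns cheaply (split finding scans one feature at a time), CSR slices rows cheaply (leaf assignment scans one sample at a time). A minimal sketch, assuming nothing beyond `numpy` and `scipy`:

```python
import numpy as np
from scipy.sparse import csc_matrix, csr_matrix

X = np.array([[0., 1.],
              [2., 0.],
              [0., 3.]])
X_csc = csc_matrix(X)  # column-major storage: X_csc[:, j] is cheap
X_csr = csr_matrix(X)  # row-major storage:    X_csr[i, :] is cheap

# split finding scans one feature (column) at a time
col = X_csc[:, 1].toarray().ravel()
# leaf assignment scans one sample (row) at a time
row = X_csr[1, :].toarray().ravel()
print(col)  # [1. 0. 3.]
print(row)  # [2. 0.]
```

Both conversions happen once per `fit` call (see the `_fit_stages` hunk further down), so the per-stage boosting loop pays no conversion cost.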
@@ -944,7 +967,7 @@ def fit(self, X, y, sample_weight=None, monitor=None):
        self._clear_state()

        # Check input
-       X, y = check_X_y(X, y, dtype=DTYPE)
+       X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=DTYPE)
        n_samples, self.n_features = X.shape
        if sample_weight is None:
            sample_weight = np.ones(n_samples, dtype=np.float32)
@@ -981,9 +1004,25 @@ def fit(self, X, y, sample_weight=None, monitor=None):
        y_pred = self._decision_function(X)
        self._resize_state()

+       X_idx_sorted = None
+       presort = self.presort
+       # Allow presort to be 'auto', which means True if the dataset is dense,
+       # otherwise it will be False.
+       if presort == 'auto' and issparse(X):
+           presort = False
+       elif presort == 'auto':
+           presort = True
+
+       if presort == True:
+           if issparse(X):
+               raise ValueError("Presorting is not supported for sparse matrices.")
+           else:
+               X_idx_sorted = np.asfortranarray(np.argsort(X, axis=0),
+                                                dtype=np.int32)
+
        # fit the boosting stages
        n_stages = self._fit_stages(X, y, y_pred, sample_weight, random_state,
-                                   begin_at_stage, monitor)
+                                   begin_at_stage, monitor, X_idx_sorted)
        # change shape of arrays after fit (early-stopping or additional ests)
        if n_stages != self.estimators_.shape[0]:
            self.estimators_ = self.estimators_[:n_stages]
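
The hunk above resolves `presort='auto'` per input type (testing the resolved local `presort`, not `self.presort`, so that 'auto' actually enables presorting on dense data) and rejects an explicit `presort=True` on sparse input. A hedged usage sketch of the resulting behaviour, assuming this patch is applied:

```python
from scipy.sparse import csr_matrix
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor

X, y = make_regression(n_samples=100, n_features=4, random_state=0)
X_sparse = csr_matrix(X)

# 'auto' silently falls back to non-presorted splitting on sparse input
GradientBoostingRegressor(n_estimators=5, presort='auto').fit(X_sparse, y)

# an explicit presort=True on sparse input is rejected
try:
    GradientBoostingRegressor(n_estimators=5, presort=True).fit(X_sparse, y)
except ValueError as exc:
    print(exc)  # Presorting is not supported for sparse matrices.
```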
@@ -994,7 +1033,7 @@ def fit(self, X, y, sample_weight=None, monitor=None):
        return self

    def _fit_stages(self, X, y, y_pred, sample_weight, random_state,
-                   begin_at_stage=0, monitor=None):
+                   begin_at_stage=0, monitor=None, X_idx_sorted=None):
        """Iteratively fits the stages.

        For each stage it computes the progress (OOB, train score)
@@ -1015,18 +1054,13 @@ def _fit_stages(self, X, y, y_pred, sample_weight, random_state,
        else:
            min_weight_leaf = 0.

-       # init criterion and splitter
-       criterion = FriedmanMSE(1)
-       splitter = PresortBestSplitter(criterion,
-                                      self.max_features_,
-                                      self.min_samples_leaf,
-                                      min_weight_leaf,
-                                      random_state)
-
        if self.verbose:
            verbose_reporter = VerboseReporter(self.verbose)
            verbose_reporter.init(self, begin_at_stage)

+       X_csc = csc_matrix(X) if issparse(X) else None
+       X_csr = csr_matrix(X) if issparse(X) else None
+
        # perform boosting iterations
        i = begin_at_stage
        for i in range(begin_at_stage, self.n_estimators):
@@ -1042,8 +1076,8 @@ def _fit_stages(self, X, y, y_pred, sample_weight, random_state,

            # fit next stage of trees
            y_pred = self._fit_stage(i, X, y, y_pred, sample_weight,
-                                    sample_mask, criterion, splitter,
-                                    random_state)
+                                    sample_mask, random_state, X_idx_sorted,
+                                    X_csc, X_csr)

            # track deviance (= loss)
            if do_oob:
@@ -1074,6 +1108,7 @@ def _make_estimator(self, append=True):
    def _init_decision_function(self, X):
        """Check input and compute prediction of ``init``."""
        self._check_initialized()
+       X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True)
        if X.shape[1] != self.n_features:
            raise ValueError("X.shape[1] should be {0:d}, not {1:d}.".format(
                self.n_features, X.shape[1]))
@@ -1104,7 +1139,9 @@ def decision_function(self, X):
        Regression and binary classification produce an array of shape
        [n_samples].
        """
-       X = check_array(X, dtype=DTYPE, order="C")
+
+       self._check_initialized()
+       X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True)
        score = self._decision_function(X)
        if score.shape[1] == 1:
            return score.ravel()
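
Both prediction-side hunks above swap the bare `check_array` call for the first fitted tree's `_validate_X_predict`, so inputs at prediction time pass through the same sparse-aware validation as at fit time. A short editor's sketch of what this enables, assuming the patch is applied:

```python
from scipy.sparse import csr_matrix
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_hastie_10_2(n_samples=500, random_state=0)
clf = GradientBoostingClassifier(n_estimators=10).fit(X, y)

# sparse input is now accepted at prediction time as well
print(clf.decision_function(csr_matrix(X))[:3])
```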
@@ -1318,6 +1355,12 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin):
        If None, the random number generator is the RandomState instance used
        by `np.random`.

+   presort : bool or 'auto', optional (default='auto')
+       Whether to presort the data to speed up the finding of best splits in
+       fitting. Auto mode by default will use presorting on dense data and
+       default to normal sorting on sparse data. Setting presort to true on
+       sparse data will raise an error.
+
    Attributes
    ----------
    feature_importances_ : array, shape = [n_features]
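
A hedged usage sketch of the `presort` parameter documented above (editor's example, not part of the patch):

```python
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_hastie_10_2(n_samples=2000, random_state=0)

# the default 'auto' presorts dense X; presort=False opts out, which the
# docstring suggests only as an explicit choice (it never raises on dense data)
clf = GradientBoostingClassifier(n_estimators=50, presort='auto').fit(X, y)
print(clf.score(X, y))
```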
@@ -1369,7 +1412,8 @@ def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100,
                 min_samples_leaf=1, min_weight_fraction_leaf=0.,
                 max_depth=3, init=None, random_state=None,
                 max_features=None, verbose=0,
-                max_leaf_nodes=None, warm_start=False):
+                max_leaf_nodes=None, warm_start=False,
+                presort='auto'):

        super(GradientBoostingClassifier, self).__init__(
            loss=loss, learning_rate=learning_rate, n_estimators=n_estimators,
@@ -1379,7 +1423,8 @@ def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100,
            max_depth=max_depth, init=init, subsample=subsample,
            max_features=max_features,
            random_state=random_state, verbose=verbose,
-           max_leaf_nodes=max_leaf_nodes, warm_start=warm_start)
+           max_leaf_nodes=max_leaf_nodes, warm_start=warm_start,
+           presort=presort)

    def _validate_y(self, y):
        self.classes_, y = np.unique(y, return_inverse=True)
@@ -1644,6 +1689,11 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin):
        If None, the random number generator is the RandomState instance used
        by `np.random`.

+   presort : bool or 'auto', optional (default='auto')
+       Whether to presort the data to speed up the finding of best splits in
+       fitting. Auto mode by default will use presorting on dense data and
+       default to normal sorting on sparse data. Setting presort to true on
+       sparse data will raise an error.

    Attributes
    ----------
@@ -1693,7 +1743,7 @@ def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100,
                 min_samples_leaf=1, min_weight_fraction_leaf=0.,
                 max_depth=3, init=None, random_state=None,
                 max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None,
-                warm_start=False):
+                warm_start=False, presort='auto'):

        super(GradientBoostingRegressor, self).__init__(
            loss=loss, learning_rate=learning_rate, n_estimators=n_estimators,
@@ -1703,7 +1753,8 @@ def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100,
            max_depth=max_depth, init=init, subsample=subsample,
            max_features=max_features,
            random_state=random_state, alpha=alpha, verbose=verbose,
-           max_leaf_nodes=max_leaf_nodes, warm_start=warm_start)
+           max_leaf_nodes=max_leaf_nodes, warm_start=warm_start,
+           presort=presort)

    def predict(self, X):
        """Predict regression target for X.