Commit 9412f50
ENH Fast PDP for DecisionTreeRegressor and RandomForestRegressor (#15864)
1 parent 244d118 commit 9412f50

File tree

7 files changed, +168 -14 lines changed


doc/whats_new/v0.23.rst

Lines changed: 10 additions & 0 deletions
```diff
@@ -205,6 +205,16 @@ Changelog
   ``max_value`` and ``min_value``. Array-like inputs allow a different max and min to be specified
   for each feature. :pr:`16403` by :user:`Narendra Mukherjee <narendramukherjee>`.
 
+:mod:`sklearn.inspection`
+.........................
+
+- |Feature| :func:`inspection.partial_dependence` and
+  :func:`inspection.plot_partial_dependence` now support the fast 'recursion'
+  method for :class:`ensemble.RandomForestRegressor` and
+  :class:`tree.DecisionTreeRegressor`. :pr:`15864` by
+  `Nicolas Hug`_.
+
+
 :mod:`sklearn.linear_model`
 ...........................
```
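For reference, a minimal usage sketch of the feature described in this entry (a sketch assuming the 0.23 API shown in this commit, where `partial_dependence` returns `(averaged_predictions, values)`; dataset and parameter values are illustrative):

```python
# Sketch: with this commit, method='recursion' -- which walks the fitted
# trees instead of calling predict() once per grid point -- also covers
# RandomForestRegressor and DecisionTreeRegressor.
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import partial_dependence

X, y = make_regression(n_samples=200, n_features=5, random_state=0)
forest = RandomForestRegressor(n_estimators=10, random_state=0).fit(X, y)

averaged_predictions, values = partial_dependence(
    forest, X, features=[0], method='recursion')
print(averaged_predictions.shape)  # (1, n_grid_points); 100 by default
```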

sklearn/ensemble/_forest.py

Lines changed: 30 additions & 0 deletions
```diff
@@ -846,6 +846,36 @@ def _set_oob_score(self, X, y):
 
         self.oob_score_ /= self.n_outputs_
 
+    def _compute_partial_dependence_recursion(self, grid, target_features):
+        """Fast partial dependence computation.
+
+        Parameters
+        ----------
+        grid : ndarray of shape (n_samples, n_target_features)
+            The grid points on which the partial dependence should be
+            evaluated.
+        target_features : ndarray of shape (n_target_features)
+            The set of target features for which the partial dependence
+            should be evaluated.
+
+        Returns
+        -------
+        averaged_predictions : ndarray of shape (n_samples,)
+            The value of the partial dependence function on each grid point.
+        """
+        grid = np.asarray(grid, dtype=DTYPE, order='C')
+        averaged_predictions = np.zeros(shape=grid.shape[0],
+                                        dtype=np.float64, order='C')
+
+        for tree in self.estimators_:
+            # Note: we don't sum in parallel because the GIL isn't released in
+            # the fast method.
+            tree.tree_.compute_partial_dependence(
+                grid, target_features, averaged_predictions)
+        # Average over the forest
+        averaged_predictions /= len(self.estimators_)
+
+        return averaged_predictions
 
 
 class RandomForestClassifier(ForestClassifier):
     """
```

sklearn/ensemble/_gb.py

Lines changed: 0 additions & 2 deletions
```diff
@@ -708,8 +708,6 @@ def _compute_partial_dependence_recursion(self, grid, target_features):
             (n_trees_per_iteration, n_samples)
             The value of the partial dependence function on each grid point.
         """
-        check_is_fitted(self,
-                        msg="'estimator' parameter must be a fitted estimator")
         if self.init is not None:
             warnings.warn(
                 'Using recursion method with a non-constant init predictor '
```
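The deleted `check_is_fitted` call was redundant with validation performed by the public entry point (`partial_dependence` imports `check_is_fitted`, as the next file shows). A hedged sketch of the behaviour callers still get, assuming that top-level check:

```python
# Sketch (assumption: partial_dependence validates fitting itself, which is
# why the per-estimator check above could be dropped): an unfitted estimator
# fails before any private PD method is reached.
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.exceptions import NotFittedError
from sklearn.inspection import partial_dependence

X = np.random.RandomState(0).randn(10, 2)
try:
    partial_dependence(GradientBoostingRegressor(), X, features=[0])
except NotFittedError as exc:
    print(exc)  # "...is not fitted yet. Call 'fit'..."
```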

sklearn/inspection/_partial_dependence.py

Lines changed: 22 additions & 6 deletions
```diff
@@ -20,6 +20,8 @@
 from ..utils import _determine_key_type
 from ..utils import _get_column_indices
 from ..utils.validation import check_is_fitted
+from ..tree import DecisionTreeRegressor
+from ..ensemble import RandomForestRegressor
 from ..exceptions import NotFittedError
 from ..ensemble._gb import BaseGradientBoosting
 from sklearn.ensemble._hist_gradient_boosting.gradient_boosting import (
@@ -100,7 +102,14 @@ def _grid_from_X(X, percentiles, grid_resolution):
 
 
 def _partial_dependence_recursion(est, grid, features):
-    return est._compute_partial_dependence_recursion(grid, features)
+    averaged_predictions = est._compute_partial_dependence_recursion(grid,
+                                                                     features)
+    if averaged_predictions.ndim == 1:
+        # reshape to (1, n_points) for consistency with
+        # _partial_dependence_brute
+        averaged_predictions = averaged_predictions.reshape(1, -1)
+
+    return averaged_predictions
 
 
 def _partial_dependence_brute(est, grid, features, X, response_method):
@@ -242,7 +251,10 @@ def partial_dependence(estimator, X, features, response_method='auto',
         :class:`~sklearn.ensemble.GradientBoostingClassifier`,
         :class:`~sklearn.ensemble.GradientBoostingRegressor`,
         :class:`~sklearn.ensemble.HistGradientBoostingClassifier`,
-        :class:`~sklearn.ensemble.HistGradientBoostingRegressor`)
+        :class:`~sklearn.ensemble.HistGradientBoostingRegressor`,
+        :class:`~sklearn.tree.DecisionTreeRegressor`,
+        :class:`~sklearn.ensemble.RandomForestRegressor`,
+        )
         but is more efficient in terms of speed.
         With this method, the target response of a
         classifier is always the decision function, not the predicted
@@ -339,19 +351,25 @@ def partial_dependence(estimator, X, features, response_method='auto',
         if (isinstance(estimator, BaseGradientBoosting) and
                 estimator.init is None):
             method = 'recursion'
-        elif isinstance(estimator, BaseHistGradientBoosting):
+        elif isinstance(estimator, (BaseHistGradientBoosting,
+                                    DecisionTreeRegressor,
+                                    RandomForestRegressor)):
            method = 'recursion'
         else:
             method = 'brute'
 
     if method == 'recursion':
         if not isinstance(estimator,
-                          (BaseGradientBoosting, BaseHistGradientBoosting)):
+                          (BaseGradientBoosting, BaseHistGradientBoosting,
+                           DecisionTreeRegressor, RandomForestRegressor)):
             supported_classes_recursion = (
                 'GradientBoostingClassifier',
                 'GradientBoostingRegressor',
                 'HistGradientBoostingClassifier',
                 'HistGradientBoostingRegressor',
+                'HistGradientBoostingRegressor',
+                'DecisionTreeRegressor',
+                'RandomForestRegressor',
             )
             raise ValueError(
                 "Only the following estimators support the 'recursion' "
@@ -399,5 +417,3 @@ def partial_dependence(estimator, X, features, response_method='auto',
             -1, *[val.shape[0] for val in values])
 
     return averaged_predictions, values
-
-
```
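With the new reshape, `_partial_dependence_recursion` returns the same `(n_outputs, n_points)` layout as `_partial_dependence_brute`, so the two methods are directly comparable. A hedged sketch (illustrative data; as the test note further down explains, brute and recursion need not agree exactly):

```python
# Sketch: both methods now return shape (n_outputs, n_grid_points).
import numpy as np
from sklearn.datasets import make_regression
from sklearn.inspection import partial_dependence
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=500, n_features=3, random_state=0)
tree = DecisionTreeRegressor(max_depth=4, random_state=0).fit(X, y)

pd_rec, _ = partial_dependence(tree, X, [1], method='recursion')
pd_brute, _ = partial_dependence(tree, X, [1], method='brute')

assert pd_rec.shape == pd_brute.shape == (1, 100)
print(np.abs(pd_rec - pd_brute).max())  # close, but not exactly equal
```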

sklearn/inspection/_plot/partial_dependence.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -126,7 +126,9 @@ def plot_partial_dependence(estimator, X, features, feature_names=None,
         :class:`~sklearn.ensemble.GradientBoostingClassifier`,
         :class:`~sklearn.ensemble.GradientBoostingRegressor`,
         :class:`~sklearn.ensemble.HistGradientBoostingClassifier`,
-        :class:`~sklearn.ensemble.HistGradientBoostingRegressor`)
+        :class:`~sklearn.ensemble.HistGradientBoostingRegressor`,
+        :class:`~sklearn.tree.DecisionTreeRegressor`,
+        :class:`~sklearn.ensemble.RandomForestRegressor`
         but is more efficient in terms of speed.
         With this method, the target response of a
         classifier is always the decision function, not the predicted
@@ -201,9 +203,7 @@ def plot_partial_dependence(estimator, X, features, feature_names=None,
     from matplotlib.ticker import ScalarFormatter  # noqa
 
     # set target_idx for multi-class estimators
-    if (is_classifier(estimator) and
-            hasattr(estimator, 'classes_') and
-            np.size(estimator.classes_) > 2):
+    if hasattr(estimator, 'classes_') and np.size(estimator.classes_) > 2:
         if target is None:
             raise ValueError('target must be specified for multi-class')
         target_idx = np.searchsorted(estimator.classes_, target)
```
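A hedged usage sketch for the plotting entry point with a newly supported estimator (illustrative data; requires matplotlib):

```python
# Sketch: method='auto' now resolves to the fast 'recursion' path for
# RandomForestRegressor (and DecisionTreeRegressor) when plotting.
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import plot_partial_dependence

X, y = make_regression(n_samples=200, n_features=4, random_state=0)
forest = RandomForestRegressor(n_estimators=10, random_state=0).fit(X, y)

# one-way PDPs for features 0 and 1, plus a two-way interaction plot
plot_partial_dependence(forest, X, features=[0, 1, (0, 1)])
plt.show()
```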

sklearn/inspection/tests/test_partial_dependence.py

Lines changed: 77 additions & 2 deletions
```diff
@@ -14,6 +14,7 @@
 )
 from sklearn.ensemble import GradientBoostingClassifier
 from sklearn.ensemble import GradientBoostingRegressor
+from sklearn.ensemble import RandomForestRegressor
 from sklearn.experimental import enable_hist_gradient_boosting  # noqa
 from sklearn.ensemble import HistGradientBoostingClassifier
 from sklearn.ensemble import HistGradientBoostingRegressor
@@ -36,6 +37,9 @@
 from sklearn.utils._testing import assert_allclose
 from sklearn.utils._testing import assert_array_equal
 from sklearn.utils._testing import ignore_warnings
+from sklearn.utils import _IS_32BIT
+from sklearn.utils.validation import check_random_state
+from sklearn.tree.tests.test_tree import assert_is_subtree
 
 
 # toy sample
@@ -174,6 +178,11 @@ def test_partial_dependence_helpers(est, method, target_feature):
     # samples.
     # This also checks that the brute and recursion methods give the same
     # output.
+    # Note that even on the trainset, the brute and the recursion methods
+    # aren't always strictly equivalent, in particular when the slow method
+    # generates unrealistic samples that have low mass in the joint
+    # distribution of the input features, and when some of the features are
+    # dependent. Hence the high tolerance on the checks.
 
     X, y = make_regression(random_state=0, n_features=5, n_informative=5)
     # The 'init' estimator for GBDT (here the average prediction) isn't taken
@@ -206,6 +215,71 @@ def test_partial_dependence_helpers(est, method, target_feature):
     assert np.allclose(pdp, mean_predictions, rtol=rtol)
 
 
+@pytest.mark.parametrize('seed', range(1))
+def test_recursion_decision_tree_vs_forest_and_gbdt(seed):
+    # Make sure that the recursion method gives the same results on a
+    # DecisionTreeRegressor and a GradientBoostingRegressor or a
+    # RandomForestRegressor with 1 tree and equivalent parameters.
+
+    rng = np.random.RandomState(seed)
+
+    # Purely random dataset to avoid correlated features
+    n_samples = 1000
+    n_features = 5
+    X = rng.randn(n_samples, n_features)
+    y = rng.randn(n_samples) * 10
+
+    # The 'init' estimator for GBDT (here the average prediction) isn't taken
+    # into account with the recursion method, for technical reasons. We set
+    # the mean to 0 so that this 'bug' doesn't have any effect.
+    y = y - y.mean()
+
+    # set max_depth not too high to avoid splits with same gain but different
+    # features
+    max_depth = 5
+
+    tree_seed = 0
+    forest = RandomForestRegressor(n_estimators=1, max_features=None,
+                                   bootstrap=False, max_depth=max_depth,
+                                   random_state=tree_seed)
+    # The forest will use ensemble.base._set_random_states to set the
+    # random_state of the tree sub-estimator. We simulate this here to have
+    # equivalent estimators.
+    equiv_random_state = check_random_state(tree_seed).randint(
+        np.iinfo(np.int32).max)
+    gbdt = GradientBoostingRegressor(n_estimators=1, learning_rate=1,
+                                     criterion='mse', max_depth=max_depth,
+                                     random_state=equiv_random_state)
+    tree = DecisionTreeRegressor(max_depth=max_depth,
+                                 random_state=equiv_random_state)
+
+    forest.fit(X, y)
+    gbdt.fit(X, y)
+    tree.fit(X, y)
+
+    # sanity check: if the trees aren't the same, the PD values won't be equal
+    try:
+        assert_is_subtree(tree.tree_, gbdt[0, 0].tree_)
+        assert_is_subtree(tree.tree_, forest[0].tree_)
+    except AssertionError:
+        # For some reason the trees aren't exactly equal on 32bits, so the PDs
+        # cannot be equal either. See
+        # https://github.com/scikit-learn/scikit-learn/issues/8853
+        assert _IS_32BIT, "this should only fail on 32 bit platforms"
+        return
+
+    grid = rng.randn(50).reshape(-1, 1)
+    for f in range(n_features):
+        features = np.array([f], dtype=np.int32)
+
+        pdp_forest = _partial_dependence_recursion(forest, grid, features)
+        pdp_gbdt = _partial_dependence_recursion(gbdt, grid, features)
+        pdp_tree = _partial_dependence_recursion(tree, grid, features)
+
+        np.testing.assert_allclose(pdp_gbdt, pdp_tree)
+        np.testing.assert_allclose(pdp_forest, pdp_tree)
+
+
 @pytest.mark.parametrize('est', (
     GradientBoostingClassifier(random_state=0),
     HistGradientBoostingClassifier(random_state=0),
@@ -236,8 +310,9 @@ def test_recursion_decision_function(est, target_feature):
     LinearRegression(),
     GradientBoostingRegressor(random_state=0),
     HistGradientBoostingRegressor(random_state=0, min_samples_leaf=1,
-                                  max_leaf_nodes=None, max_iter=1))
-)
+                                  max_leaf_nodes=None, max_iter=1),
+    DecisionTreeRegressor(random_state=0),
+))
 @pytest.mark.parametrize('power', (1, 2))
 def test_partial_dependence_easy_target(est, power):
     # If the target y only depends on one feature in an obvious way (linear or
```
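The `equiv_random_state` trick in the test above mirrors `ensemble._base._set_random_states`, which seeds each sub-estimator by drawing from the ensemble's RNG. A hedged sketch of why the seeds line up (relies on that private seeding behaviour, as the test's own comment states):

```python
# Sketch: the forest's single sub-tree gets a random_state drawn from
# check_random_state(tree_seed) -- exactly what the test reproduces.
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.utils import check_random_state

tree_seed = 0
rng = np.random.RandomState(42)
X, y = rng.randn(50, 3), rng.randn(50)

forest = RandomForestRegressor(n_estimators=1, bootstrap=False,
                               max_features=None,
                               random_state=tree_seed).fit(X, y)
derived = check_random_state(tree_seed).randint(np.iinfo(np.int32).max)
assert forest.estimators_[0].random_state == derived
```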

sklearn/tree/_classes.py

Lines changed: 25 additions & 0 deletions
```diff
@@ -1252,6 +1252,31 @@ def n_classes_(self):
         warnings.warn(msg, FutureWarning)
         return np.array([1] * self.n_outputs_, dtype=np.intp)
 
+    def _compute_partial_dependence_recursion(self, grid, target_features):
+        """Fast partial dependence computation.
+
+        Parameters
+        ----------
+        grid : ndarray of shape (n_samples, n_target_features)
+            The grid points on which the partial dependence should be
+            evaluated.
+        target_features : ndarray of shape (n_target_features)
+            The set of target features for which the partial dependence
+            should be evaluated.
+
+        Returns
+        -------
+        averaged_predictions : ndarray of shape (n_samples,)
+            The value of the partial dependence function on each grid point.
+        """
+        grid = np.asarray(grid, dtype=DTYPE, order='C')
+        averaged_predictions = np.zeros(shape=grid.shape[0],
+                                        dtype=np.float64, order='C')
+
+        self.tree_.compute_partial_dependence(
+            grid, target_features, averaged_predictions)
+        return averaged_predictions
+
 
 class ExtraTreeClassifier(DecisionTreeClassifier):
     """An extremely randomized tree classifier.
```
