Commit b099a59

Merge pull request #5261 from glouppe/check-importances

[MRG+1] Stronger tests for variable importances

2 parents 8424c48 + 3575db6, commit b099a59

2 files changed: +143 -35 lines


sklearn/ensemble/tests/test_forest.py

Lines changed: 142 additions & 34 deletions
@@ -10,10 +10,14 @@
 
 import pickle
 from collections import defaultdict
+from itertools import combinations
 from itertools import product
 
 import numpy as np
-from scipy.sparse import csr_matrix, csc_matrix, coo_matrix
+from scipy.misc import comb
+from scipy.sparse import csr_matrix
+from scipy.sparse import csc_matrix
+from scipy.sparse import coo_matrix
 
 from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_array_almost_equal
@@ -35,6 +39,7 @@
 from sklearn.ensemble import RandomTreesEmbedding
 from sklearn.grid_search import GridSearchCV
 from sklearn.svm import LinearSVC
+from sklearn.utils.fixes import bincount
 from sklearn.utils.validation import check_random_state
 
 from sklearn.tree.tree import SPARSE_SPLITTERS
@@ -186,44 +191,146 @@ def test_probability():
         yield check_probability, name
 
 
-def check_importances(name, X, y):
-    # Check variable importances.
-
-    ForestClassifier = FOREST_CLASSIFIERS[name]
-    for n_jobs in [1, 2]:
-        clf = ForestClassifier(n_estimators=10, n_jobs=n_jobs)
-        clf.fit(X, y)
-        importances = clf.feature_importances_
-        n_important = np.sum(importances > 0.1)
-        assert_equal(importances.shape[0], 10)
-        assert_equal(n_important, 3)
-
-        X_new = clf.transform(X, threshold="mean")
-        assert_less(0 < X_new.shape[1], X.shape[1])
-
-        # Check with sample weights
-        sample_weight = np.ones(y.shape)
-        sample_weight[y == 1] *= 100
-
-        clf = ForestClassifier(n_estimators=50, n_jobs=n_jobs, random_state=0)
-        clf.fit(X, y, sample_weight=sample_weight)
-        importances = clf.feature_importances_
-        assert_true(np.all(importances >= 0.0))
+def check_importances(X, y, name, criterion):
+    ForestEstimator = FOREST_ESTIMATORS[name]
 
-        clf = ForestClassifier(n_estimators=50, n_jobs=n_jobs, random_state=0)
-        clf.fit(X, y, sample_weight=3 * sample_weight)
-        importances_bis = clf.feature_importances_
-        assert_almost_equal(importances, importances_bis)
+    est = ForestEstimator(n_estimators=20, criterion=criterion,
+                          random_state=0)
+    est.fit(X, y)
+    importances = est.feature_importances_
+    n_important = np.sum(importances > 0.1)
+    assert_equal(importances.shape[0], 10)
+    assert_equal(n_important, 3)
+
+    X_new = est.transform(X, threshold="mean")
+    assert_less(X_new.shape[1], X.shape[1])
+
+    # Check with parallel
+    importances = est.feature_importances_
+    est.set_params(n_jobs=2)
+    importances_parrallel = est.feature_importances_
+    assert_array_almost_equal(importances, importances_parrallel)
+
+    # Check with sample weights
+    sample_weight = check_random_state(0).randint(1, 10, len(X))
+    est = ForestEstimator(n_estimators=20, random_state=0,
+                          criterion=criterion)
+    est.fit(X, y, sample_weight=sample_weight)
+    importances = est.feature_importances_
+    assert_true(np.all(importances >= 0.0))
+
+    for scale in [0.5, 10, 100]:
+        est = ForestEstimator(n_estimators=20, random_state=0,
+                              criterion=criterion)
+        est.fit(X, y, sample_weight=scale * sample_weight)
+        importances_bis = est.feature_importances_
+        assert_less(np.abs(importances - importances_bis).mean(), 0.001)
 
 
 def test_importances():
-    X, y = datasets.make_classification(n_samples=1000, n_features=10,
+    X, y = datasets.make_classification(n_samples=500, n_features=10,
                                         n_informative=3, n_redundant=0,
                                         n_repeated=0, shuffle=False,
                                         random_state=0)
 
-    for name in FOREST_CLASSIFIERS:
-        yield check_importances, name, X, y
+    for name, criterion in product(FOREST_CLASSIFIERS, ["gini", "entropy"]):
+        yield check_importances, X, y, name, criterion
+
+    for name, criterion in product(FOREST_REGRESSORS, ["mse", "friedman_mse"]):
+        yield check_importances, X, y, name, criterion
+
+
+def test_importances_asymptotic():
+    # Check whether variable importances of totally randomized trees
+    # converge towards their theoretical values (See Louppe et al,
+    # Understanding variable importances in forests of randomized trees, 2013).
+
+    def binomial(k, n):
+        return 0 if k < 0 or k > n else comb(int(n), int(k), exact=True)
+
+    def entropy(samples):
+        n_samples = len(samples)
+        entropy = 0.
+
+        for count in bincount(samples):
+            p = 1. * count / n_samples
+            if p > 0:
+                entropy -= p * np.log2(p)
+
+        return entropy
+
+    def mdi_importance(X_m, X, y):
+        n_samples, n_features = X.shape
+
+        features = list(range(n_features))
+        features.pop(X_m)
+        values = [np.unique(X[:, i]) for i in range(n_features)]
+
+        imp = 0.
+
+        for k in range(n_features):
+            # Weight of each B of size k
+            coef = 1. / (binomial(k, n_features) * (n_features - k))
+
+            # For all B of size k
+            for B in combinations(features, k):
+                # For all values B=b
+                for b in product(*[values[B[j]] for j in range(k)]):
+                    mask_b = np.ones(n_samples, dtype=np.bool)
+
+                    for j in range(k):
+                        mask_b &= X[:, B[j]] == b[j]
+
+                    X_, y_ = X[mask_b, :], y[mask_b]
+                    n_samples_b = len(X_)
+
+                    if n_samples_b > 0:
+                        children = []
+
+                        for xi in values[X_m]:
+                            mask_xi = X_[:, X_m] == xi
+                            children.append(y_[mask_xi])
+
+                        imp += (coef
+                                * (1. * n_samples_b / n_samples)  # P(B=b)
+                                * (entropy(y_) -
+                                   sum([entropy(c) * len(c) / n_samples_b
+                                        for c in children])))
+
+        return imp
+
+    data = np.array([[0, 0, 1, 0, 0, 1, 0, 1],
+                     [1, 0, 1, 1, 1, 0, 1, 2],
+                     [1, 0, 1, 1, 0, 1, 1, 3],
+                     [0, 1, 1, 1, 0, 1, 0, 4],
+                     [1, 1, 0, 1, 0, 1, 1, 5],
+                     [1, 1, 0, 1, 1, 1, 1, 6],
+                     [1, 0, 1, 0, 0, 1, 0, 7],
+                     [1, 1, 1, 1, 1, 1, 1, 8],
+                     [1, 1, 1, 1, 0, 1, 1, 9],
+                     [1, 1, 1, 0, 1, 1, 1, 0]])
+
+    X, y = np.array(data[:, :7], dtype=np.bool), data[:, 7]
+    n_features = X.shape[1]
+
+    # Compute true importances
+    true_importances = np.zeros(n_features)
+
+    for i in range(n_features):
+        true_importances[i] = mdi_importance(i, X, y)
+
+    # Estimate importances with totally randomized trees
+    clf = ExtraTreesClassifier(n_estimators=500,
+                               max_features=1,
+                               criterion="entropy",
+                               random_state=0).fit(X, y)
+
+    importances = sum(tree.tree_.compute_feature_importances(normalize=False)
+                      for tree in clf.estimators_) / clf.n_estimators
+
+    # Check correctness
+    assert_almost_equal(entropy(y), sum(importances))
+    assert_less(np.abs(true_importances - importances).mean(), 0.01)
 
 
 def check_unfitted_feature_importances(name):
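
The new test_importances_asymptotic above checks a result from Louppe et al. (2013): for totally randomized trees (max_features=1) grown on discrete inputs, the averaged unnormalized MDI importances sum to the entropy of the output and converge to the values computed by mdi_importance. A minimal standalone sketch of the same sanity check; the toy data and the entropy helper below are assumptions for illustration, not part of the commit:

import numpy as np
from sklearn.ensemble import ExtraTreesClassifier

# Tiny binary toy problem (assumed): only features 0 and 1 carry information.
rng = np.random.RandomState(0)
X = rng.randint(0, 2, size=(200, 3))
y = X[:, 0] ^ X[:, 1]

def entropy(labels):
    # Empirical Shannon entropy in bits.
    counts = np.bincount(labels)
    p = counts[counts > 0] / float(len(labels))
    return -np.sum(p * np.log2(p))

# Totally randomized trees: a single candidate feature per split.
clf = ExtraTreesClassifier(n_estimators=500, max_features=1,
                           criterion="entropy", random_state=0).fit(X, y)

# Average of per-tree unnormalized MDI importances, as in the test above.
importances = sum(tree.tree_.compute_feature_importances(normalize=False)
                  for tree in clf.estimators_) / clf.n_estimators

print(importances.sum(), entropy(y))  # the two values should nearly coincide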
@@ -239,6 +346,7 @@ def test_unfitted_feature_importances():
 def check_oob_score(name, X, y, n_estimators=20):
     # Check that oob prediction is a good estimation of the generalization
     # error.
+
     # Proper behavior
     est = FOREST_ESTIMATORS[name](oob_score=True, random_state=0,
                                   n_estimators=n_estimators, bootstrap=True)
@@ -583,7 +691,7 @@ def check_min_samples_leaf(name, X, y):
                           random_state=0)
     est.fit(X, y)
     out = est.estimators_[0].tree_.apply(X)
-    node_counts = np.bincount(out)
+    node_counts = bincount(out)
     # drop inner nodes
     leaf_count = node_counts[node_counts != 0]
    assert_greater(np.min(leaf_count), 4,
@@ -617,7 +725,7 @@ def check_min_weight_fraction_leaf(name, X, y):
                 est.bootstrap = False
             est.fit(X, y, sample_weight=weights)
             out = est.estimators_[0].tree_.apply(X)
-            node_weights = np.bincount(out, weights=weights)
+            node_weights = bincount(out, weights=weights)
             # drop inner nodes
             leaf_weights = node_weights[node_weights != 0]
             assert_greater_equal(
@@ -663,7 +771,7 @@ def check_sparse_input(name, X, X_sparse, y):
 
 def test_sparse_input():
     X, y = datasets.make_multilabel_classification(random_state=0,
-                                                   n_samples=40)
+                                                   n_samples=50)
 
     for name, sparse_matrix in product(FOREST_ESTIMATORS,
                                        (csr_matrix, csc_matrix, coo_matrix)):
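
Besides the asymptotic test, the rewritten check_importances asserts a robustness property: multiplying sample_weight by a constant should leave the learned importances essentially unchanged. A minimal standalone sketch of that property, assuming a RandomForestClassifier on the same make_classification data the test uses:

import numpy as np
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier

X, y = datasets.make_classification(n_samples=500, n_features=10,
                                     n_informative=3, n_redundant=0,
                                     n_repeated=0, shuffle=False,
                                     random_state=0)

sample_weight = np.random.RandomState(0).randint(1, 10, len(X))

def importances_with(weights):
    # Same forest, same seed, different (rescaled) sample weights.
    est = RandomForestClassifier(n_estimators=20, random_state=0)
    est.fit(X, y, sample_weight=weights)
    return est.feature_importances_

reference = importances_with(sample_weight)
for scale in [0.5, 10, 100]:
    rescaled = importances_with(scale * sample_weight)
    # Importances should be (nearly) invariant under a global rescaling.
    assert np.abs(reference - rescaled).mean() < 1e-3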

sklearn/tree/tests/test_tree.py

Lines changed: 1 addition & 1 deletion
@@ -820,7 +820,7 @@ def test_sample_weight():
     X = iris.data
     y = iris.target
 
-    duplicates = rng.randint(0, X.shape[0], 200)
+    duplicates = rng.randint(0, X.shape[0], 100)
 
     clf = DecisionTreeClassifier(random_state=1)
     clf.fit(X[duplicates], y[duplicates])
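
The test_tree.py change only reduces the number of duplicated rows drawn by test_sample_weight from 200 to 100. The surrounding test (not shown in this diff) relies on the idea that repeating a sample should be equivalent to giving it an integer sample weight; a minimal sketch of that kind of check on the iris data, offered as an illustration rather than the test's actual body:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
X, y = iris.data, iris.target

rng = np.random.RandomState(0)
duplicates = rng.randint(0, X.shape[0], 100)

# Fit one tree on explicitly duplicated rows...
clf_dup = DecisionTreeClassifier(random_state=1)
clf_dup.fit(X[duplicates], y[duplicates])

# ...and another on the original rows, weighting each by its duplication count.
weights = np.bincount(duplicates, minlength=X.shape[0])
clf_w = DecisionTreeClassifier(random_state=1)
clf_w.fit(X, y, sample_weight=weights)

# Both trees should predict the duplicated samples identically.
assert np.array_equal(clf_dup.predict(X[duplicates]),
                      clf_w.predict(X[duplicates]))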
