 import pickle
 from collections import defaultdict
+from itertools import combinations
 from itertools import product

 import numpy as np
-from scipy.sparse import csr_matrix, csc_matrix, coo_matrix
+from scipy.misc import comb
+from scipy.sparse import csr_matrix
+from scipy.sparse import csc_matrix
+from scipy.sparse import coo_matrix

 from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_array_almost_equal
...
 from sklearn.ensemble import RandomTreesEmbedding
 from sklearn.grid_search import GridSearchCV
 from sklearn.svm import LinearSVC
+from sklearn.utils.fixes import bincount
 from sklearn.utils.validation import check_random_state

 from sklearn.tree.tree import SPARSE_SPLITTERS
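Note on the import changes above: `combinations` and `comb` are only needed by the new `test_importances_asymptotic` test added further down, and `bincount` from `sklearn.utils.fixes` replaces `np.bincount` in two later hunks. As a small aside (my own illustration, not part of the patch), the new test wraps `comb` with `exact=True` because `scipy.misc.comb` returns a floating-point approximation by default but an exact integer when asked to:

    # Illustration only -- scipy.misc.comb moved to scipy.special in later
    # SciPy releases, so adjust the import if you run this elsewhere.
    from scipy.misc import comb

    print(comb(10, 3))              # float approximation (~120.0)
    print(comb(10, 3, exact=True))  # exact integer: 120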
@@ -186,44 +191,146 @@ def test_probability():
         yield check_probability, name


-def check_importances(name, X, y):
-    # Check variable importances.
-
-    ForestClassifier = FOREST_CLASSIFIERS[name]
-    for n_jobs in [1, 2]:
-        clf = ForestClassifier(n_estimators=10, n_jobs=n_jobs)
-        clf.fit(X, y)
-        importances = clf.feature_importances_
-        n_important = np.sum(importances > 0.1)
-        assert_equal(importances.shape[0], 10)
-        assert_equal(n_important, 3)
-
-        X_new = clf.transform(X, threshold="mean")
-        assert_less(0 < X_new.shape[1], X.shape[1])
-
-        # Check with sample weights
-        sample_weight = np.ones(y.shape)
-        sample_weight[y == 1] *= 100
-
-        clf = ForestClassifier(n_estimators=50, n_jobs=n_jobs, random_state=0)
-        clf.fit(X, y, sample_weight=sample_weight)
-        importances = clf.feature_importances_
-        assert_true(np.all(importances >= 0.0))
+def check_importances(X, y, name, criterion):
+    ForestEstimator = FOREST_ESTIMATORS[name]

-        clf = ForestClassifier(n_estimators=50, n_jobs=n_jobs, random_state=0)
-        clf.fit(X, y, sample_weight=3 * sample_weight)
-        importances_bis = clf.feature_importances_
-        assert_almost_equal(importances, importances_bis)
+    est = ForestEstimator(n_estimators=20, criterion=criterion,
+                          random_state=0)
+    est.fit(X, y)
+    importances = est.feature_importances_
+    n_important = np.sum(importances > 0.1)
+    assert_equal(importances.shape[0], 10)
+    assert_equal(n_important, 3)
+
+    X_new = est.transform(X, threshold="mean")
+    assert_less(X_new.shape[1], X.shape[1])
+
+    # Check with parallel
+    importances = est.feature_importances_
+    est.set_params(n_jobs=2)
+    importances_parallel = est.feature_importances_
+    assert_array_almost_equal(importances, importances_parallel)
+
+    # Check with sample weights
+    sample_weight = check_random_state(0).randint(1, 10, len(X))
+    est = ForestEstimator(n_estimators=20, random_state=0,
+                          criterion=criterion)
+    est.fit(X, y, sample_weight=sample_weight)
+    importances = est.feature_importances_
+    assert_true(np.all(importances >= 0.0))
+
+    for scale in [0.5, 10, 100]:
+        est = ForestEstimator(n_estimators=20, random_state=0,
+                              criterion=criterion)
+        est.fit(X, y, sample_weight=scale * sample_weight)
+        importances_bis = est.feature_importances_
+        assert_less(np.abs(importances - importances_bis).mean(), 0.001)


 def test_importances():
-    X, y = datasets.make_classification(n_samples=1000, n_features=10,
+    X, y = datasets.make_classification(n_samples=500, n_features=10,
                                         n_informative=3, n_redundant=0,
                                         n_repeated=0, shuffle=False,
                                         random_state=0)

-    for name in FOREST_CLASSIFIERS:
-        yield check_importances, name, X, y
+    for name, criterion in product(FOREST_CLASSIFIERS, ["gini", "entropy"]):
+        yield check_importances, X, y, name, criterion
+
+    for name, criterion in product(FOREST_REGRESSORS, ["mse", "friedman_mse"]):
+        yield check_importances, X, y, name, criterion
+
+
+def test_importances_asymptotic():
+    # Check whether variable importances of totally randomized trees
+    # converge towards their theoretical values (See Louppe et al,
+    # Understanding variable importances in forests of randomized trees, 2013).
+
+    def binomial(k, n):
+        return 0 if k < 0 or k > n else comb(int(n), int(k), exact=True)
+
+    def entropy(samples):
+        n_samples = len(samples)
+        entropy = 0.
+
+        for count in bincount(samples):
+            p = 1. * count / n_samples
+            if p > 0:
+                entropy -= p * np.log2(p)
+
+        return entropy
+
+    def mdi_importance(X_m, X, y):
+        n_samples, n_features = X.shape
+
+        features = list(range(n_features))
+        features.pop(X_m)
+        values = [np.unique(X[:, i]) for i in range(n_features)]
+
+        imp = 0.
+
+        for k in range(n_features):
+            # Weight of each B of size k
+            coef = 1. / (binomial(k, n_features) * (n_features - k))
+
+            # For all B of size k
+            for B in combinations(features, k):
+                # For all values B=b
+                for b in product(*[values[B[j]] for j in range(k)]):
+                    mask_b = np.ones(n_samples, dtype=np.bool)
+
+                    for j in range(k):
+                        mask_b &= X[:, B[j]] == b[j]
+
+                    X_, y_ = X[mask_b, :], y[mask_b]
+                    n_samples_b = len(X_)
+
+                    if n_samples_b > 0:
+                        children = []
+
+                        for xi in values[X_m]:
+                            mask_xi = X_[:, X_m] == xi
+                            children.append(y_[mask_xi])
+
+                        imp += (coef
+                                * (1. * n_samples_b / n_samples)  # P(B=b)
+                                * (entropy(y_) -
+                                   sum([entropy(c) * len(c) / n_samples_b
+                                        for c in children])))
+
+        return imp
+
+    data = np.array([[0, 0, 1, 0, 0, 1, 0, 1],
+                     [1, 0, 1, 1, 1, 0, 1, 2],
+                     [1, 0, 1, 1, 0, 1, 1, 3],
+                     [0, 1, 1, 1, 0, 1, 0, 4],
+                     [1, 1, 0, 1, 0, 1, 1, 5],
+                     [1, 1, 0, 1, 1, 1, 1, 6],
+                     [1, 0, 1, 0, 0, 1, 0, 7],
+                     [1, 1, 1, 1, 1, 1, 1, 8],
+                     [1, 1, 1, 1, 0, 1, 1, 9],
+                     [1, 1, 1, 0, 1, 1, 1, 0]])
+
+    X, y = np.array(data[:, :7], dtype=np.bool), data[:, 7]
+    n_features = X.shape[1]
+
+    # Compute true importances
+    true_importances = np.zeros(n_features)
+
+    for i in range(n_features):
+        true_importances[i] = mdi_importance(i, X, y)
+
+    # Estimate importances with totally randomized trees
+    clf = ExtraTreesClassifier(n_estimators=500,
+                               max_features=1,
+                               criterion="entropy",
+                               random_state=0).fit(X, y)
+
+    importances = sum(tree.tree_.compute_feature_importances(normalize=False)
+                      for tree in clf.estimators_) / clf.n_estimators
+
+    # Check correctness
+    assert_almost_equal(entropy(y), sum(importances))
+    assert_less(np.abs(true_importances - importances).mean(), 0.01)


 def check_unfitted_feature_importances(name):
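A note on the new `test_importances_asymptotic` in the hunk above: the nested loops in `mdi_importance` enumerate every conditioning set B built from the other features and weight each one by `1 / (binomial(k, n_features) * (n_features - k))`, following Louppe et al. (2013), so the MDI of a feature comes out as a weighted sum of conditional mutual informations I(X_m; Y | B). A minimal sketch (my own check, not part of the patch) showing that these weights add up to one over all conditioning sets, which makes the result a proper weighted average:

    from scipy.misc import comb  # scipy.special.comb in later SciPy releases


    def binomial(k, n):
        # same helper as in the test
        return 0 if k < 0 or k > n else comb(int(n), int(k), exact=True)


    p = 7   # number of input features in the toy dataset used by the test
    total = 0.
    for k in range(p):
        # C(p - 1, k) subsets B of size k can be built without X_m, and each
        # one receives weight 1 / (C(p, k) * (p - k)) in mdi_importance()
        total += binomial(k, p - 1) * 1. / (binomial(k, p) * (p - k))

    print(total)  # 1.0 up to floating-point rounding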
@@ -239,6 +346,7 @@ def test_unfitted_feature_importances():
 def check_oob_score(name, X, y, n_estimators=20):
     # Check that oob prediction is a good estimation of the generalization
     # error.
+
     # Proper behavior
     est = FOREST_ESTIMATORS[name](oob_score=True, random_state=0,
                                   n_estimators=n_estimators, bootstrap=True)
@@ -583,7 +691,7 @@ def check_min_samples_leaf(name, X, y):
                               random_state=0)
         est.fit(X, y)
         out = est.estimators_[0].tree_.apply(X)
-        node_counts = np.bincount(out)
+        node_counts = bincount(out)
         # drop inner nodes
         leaf_count = node_counts[node_counts != 0]
         assert_greater(np.min(leaf_count), 4,
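For readers unfamiliar with the check touched here: `tree_.apply(X)` returns the id of the leaf each sample lands in, so bin-counting those ids gives per-node sample counts, and the non-zero entries are exactly the leaves whose size `min_samples_leaf` constrains. A tiny self-contained sketch (hypothetical node ids, not taken from the patch):

    import numpy as np

    out = np.array([3, 3, 5, 5, 5, 6])    # node ids returned by tree_.apply(X)
    node_counts = np.bincount(out)        # [0, 0, 0, 2, 0, 3, 1]
    leaf_count = node_counts[node_counts != 0]
    print(leaf_count.min())               # smallest populated leaf holds 1 sample

The hunk itself only swaps `np.bincount` for the `bincount` imported from `sklearn.utils.fixes` at the top of the file; the call signature is unchanged, so it reads as a drop-in compatibility wrapper (my inference, the patch does not say why).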
@@ -617,7 +725,7 @@ def check_min_weight_fraction_leaf(name, X, y):
             est.bootstrap = False
         est.fit(X, y, sample_weight=weights)
         out = est.estimators_[0].tree_.apply(X)
-        node_weights = np.bincount(out, weights=weights)
+        node_weights = bincount(out, weights=weights)
         # drop inner nodes
         leaf_weights = node_weights[node_weights != 0]
         assert_greater_equal(
@@ -663,7 +771,7 @@ def check_sparse_input(name, X, X_sparse, y):

 def test_sparse_input():
     X, y = datasets.make_multilabel_classification(random_state=0,
-                                                   n_samples=40)
+                                                   n_samples=50)

     for name, sparse_matrix in product(FOREST_ESTIMATORS,
                                        (csr_matrix, csc_matrix, coo_matrix)):
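Closing note: the headline addition in this patch is `test_importances_asymptotic`, which checks the MDI importances of totally randomized trees against their theoretical values from Louppe et al. (2013). One of its assertions is that the unnormalized importances of fully grown, totally randomized trees sum to the entropy of the output. A standalone sketch of that property (my own example with made-up data; it mirrors the test's `assert_almost_equal(entropy(y), sum(importances))` but is not part of the patch):

    import numpy as np
    from sklearn.ensemble import ExtraTreesClassifier

    rng = np.random.RandomState(0)
    # 100 distinct binary input vectors, so fully grown trees reach pure leaves
    X = np.array([[int(b) for b in np.binary_repr(i, width=8)]
                  for i in range(100)])
    y = rng.randint(0, 3, size=100)

    # totally randomized trees: one random feature and a random cut per split
    clf = ExtraTreesClassifier(n_estimators=100, max_features=1,
                               criterion="entropy", random_state=0).fit(X, y)
    mdi = sum(t.tree_.compute_feature_importances(normalize=False)
              for t in clf.estimators_) / clf.n_estimators

    p = np.bincount(y) / float(len(y))
    entropy_y = -np.sum(p[p > 0] * np.log2(p[p > 0]))
    print(sum(mdi), entropy_y)  # the two values should agree closely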