Commit 710fe97

ENH Add number of features used at each step to RFECV.cv_results_ (#28670)
Co-authored-by: Jérémie du Boisberranger <[email protected]>
1 parent 52138e2 · commit 710fe97

4 files changed: +82 -39 lines

doc/whats_new/v1.5.rst (+6 -1)

@@ -188,6 +188,11 @@ Changelog
   :pr:`28085` by :user:`Neto Menoci <netomenoci>` and
   :user:`Florin Andrei <FlorinAndrei>`.
 
+- |Enhancement| The `cv_results_` attribute of :class:`feature_selection.RFECV` has
+  a new key, `n_features`, containing an array with the number of features selected
+  at each step.
+  :pr:`28670` by :user:`Miguel Silva <miguelcsilva>`.
+
 :mod:`sklearn.impute`
 .....................
 

@@ -298,7 +303,7 @@ Changelog
   :func:`preprocessing.quantile_transform` now supports disabling
   subsampling explicitly.
   :pr:`27636` by :user:`Ralph Urlus <rurlus>`.
-
+
 :mod:`sklearn.tree`
 ...................
 
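To make the enhancement concrete, here is a minimal sketch of reading the new key after a fit; the dataset, estimator, and parameters below are illustrative, not taken from the commit:

```python
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC

# Illustrative data: 8 features, 4 of them informative.
X, y = make_classification(
    n_samples=100, n_features=8, n_informative=4, random_state=0
)

rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, min_features_to_select=1)
rfecv.fit(X, y)

# New key added by this commit: the subset size evaluated at each step,
# aligned with mean_test_score and std_test_score.
print(rfecv.cv_results_["n_features"])       # array([1, 2, ..., 8]) with step=1
print(rfecv.cv_results_["mean_test_score"])  # one mean score per subset size
```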

examples/feature_selection/plot_rfe_with_cross_validation.py (+5 -4)

@@ -66,15 +66,16 @@
 # ---------------------------------------------------
 
 import matplotlib.pyplot as plt
+import pandas as pd
 
-n_scores = len(rfecv.cv_results_["mean_test_score"])
+cv_results = pd.DataFrame(rfecv.cv_results_)
 plt.figure()
 plt.xlabel("Number of features selected")
 plt.ylabel("Mean test accuracy")
 plt.errorbar(
-    range(min_features_to_select, n_scores + min_features_to_select),
-    rfecv.cv_results_["mean_test_score"],
-    yerr=rfecv.cv_results_["std_test_score"],
+    x=cv_results["n_features"],
+    y=cv_results["mean_test_score"],
+    yerr=cv_results["std_test_score"],
 )
 plt.title("Recursive Feature Elimination \nwith correlated features")
 plt.show()
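Plotting against `cv_results["n_features"]` removes the need to reconstruct the x-axis from `min_features_to_select` and the score count, and it stays correct when `step > 1`, where the evaluated subset sizes are not evenly spaced (for example `[1, 3, 5]` for five features with `step=2`, per the new tests below).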

sklearn/feature_selection/_rfe.py (+31 -30)

@@ -28,11 +28,12 @@
 
 def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer):
     """
-    Return the score for a fit across one fold.
+    Return the score and n_features per step for a fit across one fold.
     """
     X_train, y_train = _safe_split(estimator, X, y, train)
     X_test, y_test = _safe_split(estimator, X, y, test, train)
-    return rfe._fit(
+
+    rfe._fit(
         X_train,
         y_train,
         lambda estimator, features: _score(

@@ -43,7 +44,9 @@ def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer):
             scorer,
             score_params=None,
         ),
-    ).scores_
+    )
+
+    return rfe.step_scores_, rfe.step_n_features_
 
 
 def _estimator_has(attr):
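Each fold now returns a `(step_scores_, step_n_features_)` pair rather than a bare score list. A minimal sketch, with made-up fold results, of how `RFECV.fit` later separates the pairs:

```python
import numpy as np

# Hypothetical results from three folds: (scores per step, n_features per step).
# Elimination runs from many features down to few, hence the descending counts.
scores_features = [
    ([0.70, 0.80, 0.75], [5, 3, 1]),
    ([0.65, 0.85, 0.70], [5, 3, 1]),
    ([0.72, 0.78, 0.74], [5, 3, 1]),
]

# zip(*...) transposes the list of pairs into two tuples,
# mirroring how RFECV.fit consumes the parallel per-fold results.
scores, step_n_features = zip(*scores_features)
scores = np.array(scores)   # shape (n_folds, n_steps)
print(step_n_features[0])   # [5, 3, 1] -- identical for every fold
```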
@@ -264,10 +267,9 @@ def fit(self, X, y, **fit_params):
         return self._fit(X, y, **fit_params)
 
     def _fit(self, X, y, step_score=None, **fit_params):
-        # Parameter step_score controls the calculation of self.scores_
-        # step_score is not exposed to users
-        # and is used when implementing RFECV
-        # self.scores_ will not be calculated when calling _fit through fit
+        # Parameter step_score controls the calculation of self.step_scores_
+        # step_score is not exposed to users and is used when implementing RFECV
+        # self.step_scores_ will not be calculated when calling _fit through fit
 
         X, y = self._validate_data(
             X,

@@ -296,7 +298,8 @@ def _fit(self, X, y, step_score=None, **fit_params):
         ranking_ = np.ones(n_features, dtype=int)
 
         if step_score:
-            self.scores_ = []
+            self.step_n_features_ = []
+            self.step_scores_ = []
 
         # Elimination
         while np.sum(support_) > n_features_to_select:

@@ -328,7 +331,8 @@ def _fit(self, X, y, step_score=None, **fit_params):
             # because 'estimator' must use features
             # that have not been eliminated yet
             if step_score:
-                self.scores_.append(step_score(estimator, features))
+                self.step_n_features_.append(len(features))
+                self.step_scores_.append(step_score(estimator, features))
             support_[features[ranks][:threshold]] = False
             ranking_[np.logical_not(support_)] += 1

@@ -339,7 +343,8 @@ def _fit(self, X, y, step_score=None, **fit_params):
 
         # Compute step score when only n_features_to_select features left
         if step_score:
-            self.scores_.append(step_score(self.estimator_, features))
+            self.step_n_features_.append(len(features))
+            self.step_scores_.append(step_score(self.estimator_, features))
         self.n_features_ = support_.sum()
         self.support_ = support_
         self.ranking_ = ranking_

@@ -581,6 +586,9 @@ class RFECV(RFE):
         std_test_score : ndarray of shape (n_subsets_of_features,)
             Standard deviation of scores over the folds.
 
+        n_features : ndarray of shape (n_subsets_of_features,)
+            Number of features used at each step.
+
         .. versionadded:: 1.0
 
     n_features_ : int

@@ -718,12 +726,6 @@ def fit(self, X, y, groups=None):
         # Initialization
         cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))
         scorer = check_scoring(self.estimator, scoring=self.scoring)
-        n_features = X.shape[1]
-
-        if 0.0 < self.step < 1.0:
-            step = int(max(1, self.step * n_features))
-        else:
-            step = int(self.step)
 
         # Build an RFE object, which will evaluate and score each possible
         # feature count, down to self.min_features_to_select

@@ -753,18 +755,18 @@ def fit(self, X, y, groups=None):
         parallel = Parallel(n_jobs=self.n_jobs)
         func = delayed(_rfe_single_fit)
 
-        scores = parallel(
+        scores_features = parallel(
             func(rfe, self.estimator, X, y, train, test, scorer)
             for train, test in cv.split(X, y, groups)
         )
+        scores, step_n_features = zip(*scores_features)
 
+        step_n_features_rev = np.array(step_n_features[0])[::-1]
         scores = np.array(scores)
-        scores_sum = np.sum(scores, axis=0)
-        scores_sum_rev = scores_sum[::-1]
-        argmax_idx = len(scores_sum) - np.argmax(scores_sum_rev) - 1
-        n_features_to_select = max(
-            n_features - (argmax_idx * step), self.min_features_to_select
-        )
+
+        # Reverse order such that lowest number of features is selected in case of tie.
+        scores_sum_rev = np.sum(scores, axis=0)[::-1]
+        n_features_to_select = step_n_features_rev[np.argmax(scores_sum_rev)]
 
         # Re-execute an elimination with best_k over the whole set
         rfe = RFE(
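The selection rule now reads directly off the recorded feature counts: the summed scores are reversed so that `np.argmax`, which returns the first maximum, lands on the smallest subset size in case of a tie. A worked example with made-up numbers:

```python
import numpy as np

# Hypothetical per-step feature counts (elimination order: many -> few)
step_n_features = [5, 3, 1]
# Hypothetical summed CV scores, tied between 5 and 3 features
scores_sum = np.array([2.4, 2.4, 1.9])  # aligned with [5, 3, 1]

step_n_features_rev = np.array(step_n_features)[::-1]  # [1, 3, 5]
scores_sum_rev = scores_sum[::-1]                      # [1.9, 2.4, 2.4]

# argmax returns the first of the tied maxima, i.e. the fewest features.
print(step_n_features_rev[np.argmax(scores_sum_rev)])  # 3
```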
@@ -786,11 +788,10 @@ def fit(self, X, y, groups=None):
 
         # reverse to stay consistent with before
         scores_rev = scores[:, ::-1]
-        self.cv_results_ = {}
-        self.cv_results_["mean_test_score"] = np.mean(scores_rev, axis=0)
-        self.cv_results_["std_test_score"] = np.std(scores_rev, axis=0)
-
-        for i in range(scores.shape[0]):
-            self.cv_results_[f"split{i}_test_score"] = scores_rev[i]
-
+        self.cv_results_ = {
+            "mean_test_score": np.mean(scores_rev, axis=0),
+            "std_test_score": np.std(scores_rev, axis=0),
+            **{f"split{i}_test_score": scores_rev[i] for i in range(scores.shape[0])},
+            "n_features": step_n_features_rev,
+        }
         return self
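For intuition, the rebuilt dict keeps every array aligned along the subset-size axis; with made-up numbers for two folds and three subset sizes it would look like the following sketch:

```python
import numpy as np

scores_rev = np.array([[0.6, 0.8, 0.7],    # fold 0, ascending n_features
                       [0.5, 0.9, 0.7]])   # fold 1
step_n_features_rev = np.array([1, 3, 5])

cv_results_ = {
    "mean_test_score": np.mean(scores_rev, axis=0),  # array([0.55, 0.85, 0.7])
    "std_test_score": np.std(scores_rev, axis=0),
    **{f"split{i}_test_score": scores_rev[i] for i in range(scores_rev.shape[0])},
    "n_features": step_n_features_rev,
}
# Every value has length 3 == len(cv_results_["n_features"])
```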

sklearn/feature_selection/tests/test_rfe.py (+40 -4)

@@ -11,7 +11,7 @@
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.compose import TransformedTargetRegressor
 from sklearn.cross_decomposition import CCA, PLSCanonical, PLSRegression
-from sklearn.datasets import load_iris, make_friedman1
+from sklearn.datasets import load_iris, make_classification, make_friedman1
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.feature_selection import RFE, RFECV
 from sklearn.impute import SimpleImputer

@@ -537,9 +537,7 @@ def test_rfecv_std_and_mean(global_random_seed):
 
     rfecv = RFECV(estimator=SVC(kernel="linear"))
     rfecv.fit(X, y)
-    n_split_keys = len(rfecv.cv_results_) - 2
-    split_keys = [f"split{i}_test_score" for i in range(n_split_keys)]
-
+    split_keys = [key for key in rfecv.cv_results_.keys() if "split" in key]
     cv_scores = np.asarray([rfecv.cv_results_[key] for key in split_keys])
     expected_mean = np.mean(cv_scores, axis=0)
     expected_std = np.std(cv_scores, axis=0)

@@ -548,6 +546,44 @@ def test_rfecv_std_and_mean(global_random_seed):
     assert_allclose(rfecv.cv_results_["std_test_score"], expected_std)
 
 
+@pytest.mark.parametrize(
+    ["min_features_to_select", "n_features", "step", "cv_results_n_features"],
+    [
+        [1, 4, 1, np.array([1, 2, 3, 4])],
+        [1, 5, 1, np.array([1, 2, 3, 4, 5])],
+        [1, 4, 2, np.array([1, 2, 4])],
+        [1, 5, 2, np.array([1, 3, 5])],
+        [1, 4, 3, np.array([1, 4])],
+        [1, 5, 3, np.array([1, 2, 5])],
+        [1, 4, 4, np.array([1, 4])],
+        [1, 5, 4, np.array([1, 5])],
+        [4, 4, 2, np.array([4])],
+        [4, 5, 1, np.array([4, 5])],
+        [4, 5, 2, np.array([4, 5])],
+    ],
+)
+def test_rfecv_cv_results_n_features(
+    min_features_to_select,
+    n_features,
+    step,
+    cv_results_n_features,
+):
+    X, y = make_classification(
+        n_samples=20, n_features=n_features, n_informative=n_features, n_redundant=0
+    )
+    rfecv = RFECV(
+        estimator=SVC(kernel="linear"),
+        step=step,
+        min_features_to_select=min_features_to_select,
+    )
+    rfecv.fit(X, y)
+    assert_array_equal(rfecv.cv_results_["n_features"], cv_results_n_features)
+    assert all(
+        len(value) == len(rfecv.cv_results_["n_features"])
+        for value in rfecv.cv_results_.values()
+    )
+
+
 @pytest.mark.parametrize("ClsRFE", [RFE, RFECV])
 def test_multioutput(ClsRFE):
     X = np.random.normal(size=(10, 3))
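The expected arrays in the parametrization follow from RFE's elimination schedule: `step` features are dropped per iteration, with the final drop clamped so the count never falls below `min_features_to_select`. A small sketch reproducing the expectations (the helper name is ours, not part of the test file):

```python
def expected_n_features(n_features, step, min_features_to_select):
    """Feature counts RFE visits, reported in ascending order."""
    counts = [n_features]
    while counts[-1] > min_features_to_select:
        # The last step is clamped so we never drop below the minimum.
        counts.append(max(counts[-1] - step, min_features_to_select))
    return counts[::-1]  # cv_results_["n_features"] is ascending

assert expected_n_features(5, 2, 1) == [1, 3, 5]
assert expected_n_features(5, 3, 1) == [1, 2, 5]
assert expected_n_features(4, 4, 1) == [1, 4]
```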
