
Add number of features used at each step to RFECV.cv_results_ #28670


Merged: 10 commits, Mar 27, 2024
7 changes: 6 additions & 1 deletion doc/whats_new/v1.5.rst
@@ -188,6 +188,11 @@ Changelog
   :pr:`28085` by :user:`Neto Menoci <netomenoci>` and
   :user:`Florin Andrei <FlorinAndrei>`.
 
+- |Enhancement| The `cv_results_` attribute of :class:`feature_selection.RFECV` has
+  a new key, `n_features`, containing an array with the number of features selected
+  at each step.
+  :pr:`28670` by :user:`Miguel Silva <miguelcsilva>`.
+
 :mod:`sklearn.impute`
 .....................
 
@@ -298,7 +303,7 @@ Changelog
   :func:`preprocessing.quantile_transform` now supports disabling
   subsampling explicitly.
   :pr:`27636` by :user:`Ralph Urlus <rurlus>`.
 
 :mod:`sklearn.tree`
 ...................
 
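To make the changelog entry above concrete, a minimal usage sketch of the new key (illustrative data; the scores themselves depend on the random classification problem, but the n_features array follows directly from step=1 and six features):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC

X, y = make_classification(n_samples=60, n_features=6, random_state=0)
rfecv = RFECV(SVC(kernel="linear"), step=1, min_features_to_select=1).fit(X, y)
# One entry per evaluated subset size, aligned with mean_test_score etc.
print(rfecv.cv_results_["n_features"])             # array([1, 2, 3, 4, 5, 6])
print(rfecv.cv_results_["mean_test_score"].shape)  # (6,)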
9 changes: 5 additions & 4 deletions examples/feature_selection/plot_rfe_with_cross_validation.py
@@ -66,15 +66,16 @@
 # ---------------------------------------------------
 
 import matplotlib.pyplot as plt
+import pandas as pd
 
-n_scores = len(rfecv.cv_results_["mean_test_score"])
+cv_results = pd.DataFrame(rfecv.cv_results_)
 plt.figure()
 plt.xlabel("Number of features selected")
 plt.ylabel("Mean test accuracy")
 plt.errorbar(
-    range(min_features_to_select, n_scores + min_features_to_select),
-    rfecv.cv_results_["mean_test_score"],
-    yerr=rfecv.cv_results_["std_test_score"],
+    x=cv_results["n_features"],
+    y=cv_results["mean_test_score"],
+    yerr=cv_results["std_test_score"],
 )
 plt.title("Recursive Feature Elimination \nwith correlated features")
 plt.show()
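Worth noting why this is more than cosmetic: the old x-axis reconstruction assumed that scored subset sizes increase by one per step, which only holds for step=1. A small sketch of the mismatch, with assumed numbers (5 total features, step=2):

# RFECV with 5 features, step=2, min_features_to_select=1 scores subsets of
# size 5 -> 3 -> 1, so cv_results_["n_features"] reports [1, 3, 5].
min_features_to_select, n_scores = 1, 3
old_x = list(range(min_features_to_select, n_scores + min_features_to_select))
print(old_x)      # [1, 2, 3] -- would mislabel the x-axis
print([1, 3, 5])  # what the new n_features key provides instead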
61 changes: 31 additions & 30 deletions sklearn/feature_selection/_rfe.py
@@ -28,11 +28,12 @@

 def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer):
     """
-    Return the score for a fit across one fold.
+    Return the score and n_features per step for a fit across one fold.
     """
     X_train, y_train = _safe_split(estimator, X, y, train)
     X_test, y_test = _safe_split(estimator, X, y, test, train)
-    return rfe._fit(
+
+    rfe._fit(
         X_train,
         y_train,
         lambda estimator, features: _score(
@@ -43,7 +44,9 @@ def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer):
             scorer,
             score_params=None,
         ),
-    ).scores_
+    )
+
+    return rfe.step_scores_, rfe.step_n_features_
 
 
 def _estimator_has(attr):
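The contract change here: each fold's worker now returns a pair instead of a bare score list. A sketch of the consuming side, written serially for clarity (the real RFECV.fit uses joblib.Parallel, as shown further down):

# Each call yields (step_scores_, step_n_features_) for one train/test split.
results = [
    _rfe_single_fit(rfe, estimator, X, y, train, test, scorer)
    for train, test in cv.split(X, y, groups)
]
scores, step_n_features = zip(*results)
# step_n_features is identical across folds: the elimination schedule depends
# only on the total feature count, step, and min_features_to_select, which is
# why fit() below can safely read step_n_features[0].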
@@ -264,10 +267,9 @@ def fit(self, X, y, **fit_params):
         return self._fit(X, y, **fit_params)
 
     def _fit(self, X, y, step_score=None, **fit_params):
-        # Parameter step_score controls the calculation of self.scores_
-        # step_score is not exposed to users
-        # and is used when implementing RFECV
-        # self.scores_ will not be calculated when calling _fit through fit
+        # Parameter step_score controls the calculation of self.step_scores_
+        # step_score is not exposed to users and is used when implementing RFECV
+        # self.step_scores_ will not be calculated when calling _fit through fit
 
         X, y = self._validate_data(
             X,
@@ -296,7 +298,8 @@ def _fit(self, X, y, step_score=None, **fit_params):
         ranking_ = np.ones(n_features, dtype=int)
 
         if step_score:
-            self.scores_ = []
+            self.step_n_features_ = []
+            self.step_scores_ = []
 
         # Elimination
         while np.sum(support_) > n_features_to_select:
@@ -328,7 +331,8 @@ def _fit(self, X, y, step_score=None, **fit_params):
             # because 'estimator' must use features
             # that have not been eliminated yet
             if step_score:
-                self.scores_.append(step_score(estimator, features))
+                self.step_n_features_.append(len(features))
+                self.step_scores_.append(step_score(estimator, features))
             support_[features[ranks][:threshold]] = False
             ranking_[np.logical_not(support_)] += 1
 
@@ -339,7 +343,8 @@

         # Compute step score when only n_features_to_select features left
         if step_score:
-            self.scores_.append(step_score(self.estimator_, features))
+            self.step_n_features_.append(len(features))
+            self.step_scores_.append(step_score(self.estimator_, features))
         self.n_features_ = support_.sum()
         self.support_ = support_
         self.ranking_ = ranking_
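Taken together, step_n_features_ records the subset size at every scoring point. A minimal standalone sketch of that schedule (hypothetical helper, mirroring the while-loop above and RFE's clamp that never eliminates past the minimum):

def elimination_schedule(n_features, step, min_features_to_select):
    """Feature-subset sizes scored by RFE, largest first."""
    sizes, remaining = [], n_features
    while remaining > min_features_to_select:
        sizes.append(remaining)
        # each step removes at most `step` features, clamped at the minimum
        remaining -= min(step, remaining - min_features_to_select)
    sizes.append(remaining)  # the final min_features_to_select model
    return sizes

print(elimination_schedule(5, 2, 1))  # [5, 3, 1]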
@@ -581,6 +586,9 @@ class RFECV(RFE):
         std_test_score : ndarray of shape (n_subsets_of_features,)
             Standard deviation of scores over the folds.
 
+        n_features : ndarray of shape (n_subsets_of_features,)
+            Number of features used at each step.
+
         .. versionadded:: 1.0
 
     n_features_ : int
@@ -718,12 +726,6 @@ def fit(self, X, y, groups=None):
         # Initialization
         cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))
         scorer = check_scoring(self.estimator, scoring=self.scoring)
-        n_features = X.shape[1]
-
-        if 0.0 < self.step < 1.0:
-            step = int(max(1, self.step * n_features))
-        else:
-            step = int(self.step)
 
         # Build an RFE object, which will evaluate and score each possible
         # feature count, down to self.min_features_to_select
@@ -753,18 +755,18 @@
         parallel = Parallel(n_jobs=self.n_jobs)
         func = delayed(_rfe_single_fit)
 
-        scores = parallel(
+        scores_features = parallel(
             func(rfe, self.estimator, X, y, train, test, scorer)
             for train, test in cv.split(X, y, groups)
         )
+        scores, step_n_features = zip(*scores_features)
 
+        step_n_features_rev = np.array(step_n_features[0])[::-1]
         scores = np.array(scores)
-        scores_sum = np.sum(scores, axis=0)
-        scores_sum_rev = scores_sum[::-1]
-        argmax_idx = len(scores_sum) - np.argmax(scores_sum_rev) - 1
-        n_features_to_select = max(
-            n_features - (argmax_idx * step), self.min_features_to_select
-        )
+
+        # Reverse order such that lowest number of features is selected in case of tie.
+        scores_sum_rev = np.sum(scores, axis=0)[::-1]
+        n_features_to_select = step_n_features_rev[np.argmax(scores_sum_rev)]
 
         # Re-execute an elimination with best_k over the whole set
         rfe = RFE(
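A worked example of the tie-break, with assumed toy numbers: after the reversal both arrays ascend in subset size, and np.argmax returns the first maximum, so ties resolve toward the smaller model:

import numpy as np

step_n_features_rev = np.array([1, 3, 5])   # subset sizes, ascending
scores_sum_rev = np.array([1.6, 1.8, 1.8])  # CV scores summed over folds
# First maximum is at index 1, so the 3-feature model wins the tie with 5.
print(step_n_features_rev[np.argmax(scores_sum_rev)])  # 3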
@@ -786,11 +788,10 @@

         # reverse to stay consistent with before
         scores_rev = scores[:, ::-1]
-        self.cv_results_ = {}
-        self.cv_results_["mean_test_score"] = np.mean(scores_rev, axis=0)
-        self.cv_results_["std_test_score"] = np.std(scores_rev, axis=0)
-
-        for i in range(scores.shape[0]):
-            self.cv_results_[f"split{i}_test_score"] = scores_rev[i]
-
+        self.cv_results_ = {
+            "mean_test_score": np.mean(scores_rev, axis=0),
+            "std_test_score": np.std(scores_rev, axis=0),
+            **{f"split{i}_test_score": scores_rev[i] for i in range(scores.shape[0])},
+            "n_features": step_n_features_rev,
+        }
         return self
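For intuition, the same construction run on assumed toy data (2 folds, three subset sizes), showing that every value ends up with one entry per subset size:

import numpy as np

scores_rev = np.array([[0.70, 0.90, 0.85],   # fold 0, ascending n_features
                       [0.60, 0.80, 0.85]])  # fold 1
step_n_features_rev = np.array([1, 3, 5])
cv_results_ = {
    "mean_test_score": np.mean(scores_rev, axis=0),
    "std_test_score": np.std(scores_rev, axis=0),
    **{f"split{i}_test_score": scores_rev[i] for i in range(scores_rev.shape[0])},
    "n_features": step_n_features_rev,
}
assert all(len(v) == 3 for v in cv_results_.values())
print(sorted(cv_results_))
# ['mean_test_score', 'n_features', 'split0_test_score', 'split1_test_score', 'std_test_score']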
44 changes: 40 additions & 4 deletions sklearn/feature_selection/tests/test_rfe.py
@@ -11,7 +11,7 @@
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.compose import TransformedTargetRegressor
 from sklearn.cross_decomposition import CCA, PLSCanonical, PLSRegression
-from sklearn.datasets import load_iris, make_friedman1
+from sklearn.datasets import load_iris, make_classification, make_friedman1
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.feature_selection import RFE, RFECV
 from sklearn.impute import SimpleImputer
@@ -537,9 +537,7 @@ def test_rfecv_std_and_mean(global_random_seed):

     rfecv = RFECV(estimator=SVC(kernel="linear"))
     rfecv.fit(X, y)
-    n_split_keys = len(rfecv.cv_results_) - 2
-    split_keys = [f"split{i}_test_score" for i in range(n_split_keys)]
-
+    split_keys = [key for key in rfecv.cv_results_.keys() if "split" in key]
     cv_scores = np.asarray([rfecv.cv_results_[key] for key in split_keys])
     expected_mean = np.mean(cv_scores, axis=0)
     expected_std = np.std(cv_scores, axis=0)
@@ -548,6 +546,44 @@
     assert_allclose(rfecv.cv_results_["std_test_score"], expected_std)
 
 
+@pytest.mark.parametrize(
+    ["min_features_to_select", "n_features", "step", "cv_results_n_features"],
+    [
+        [1, 4, 1, np.array([1, 2, 3, 4])],
+        [1, 5, 1, np.array([1, 2, 3, 4, 5])],
+        [1, 4, 2, np.array([1, 2, 4])],
+        [1, 5, 2, np.array([1, 3, 5])],
+        [1, 4, 3, np.array([1, 4])],
+        [1, 5, 3, np.array([1, 2, 5])],
+        [1, 4, 4, np.array([1, 4])],
+        [1, 5, 4, np.array([1, 5])],
+        [4, 4, 2, np.array([4])],
+        [4, 5, 1, np.array([4, 5])],
+        [4, 5, 2, np.array([4, 5])],
+    ],
+)
+def test_rfecv_cv_results_n_features(
+    min_features_to_select,
+    n_features,
+    step,
+    cv_results_n_features,
+):
+    X, y = make_classification(
+        n_samples=20, n_features=n_features, n_informative=n_features, n_redundant=0
+    )
+    rfecv = RFECV(
+        estimator=SVC(kernel="linear"),
+        step=step,
+        min_features_to_select=min_features_to_select,
+    )
+    rfecv.fit(X, y)
+    assert_array_equal(rfecv.cv_results_["n_features"], cv_results_n_features)
+    assert all(
+        len(value) == len(rfecv.cv_results_["n_features"])
+        for value in rfecv.cv_results_.values()
+    )
+
+
 @pytest.mark.parametrize("ClsRFE", [RFE, RFECV])
 def test_multioutput(ClsRFE):
     X = np.random.normal(size=(10, 3))
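As a hand-check of one row above (reasoning from RFE's elimination clamp, not new test logic): with n_features=5, step=3, min_features_to_select=1, RFE scores 5 features, eliminates min(3, 5 - 1) = 3 leaving 2, then min(3, 2 - 1) = 1 leaving 1, so the scored sizes [5, 2, 1] come back reversed as [1, 2, 5]. The final assert also explains why the old len(cv_results_) - 2 trick in test_rfecv_std_and_mean had to go: every cv_results_ array still has one entry per scored size, but with n_features added the dict no longer holds exactly two non-split keys.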