diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst
index bd03cc743f76e..32a6fc280d531 100644
--- a/doc/whats_new/v1.5.rst
+++ b/doc/whats_new/v1.5.rst
@@ -188,6 +188,11 @@ Changelog
   :pr:`28085` by :user:`Neto Menoci` and :user:`Florin Andrei`.
 
+- |Enhancement| The `cv_results_` attribute of :class:`feature_selection.RFECV` has
+  a new key, `n_features`, containing an array with the number of features selected
+  at each step.
+  :pr:`28670` by :user:`Miguel Silva`.
+
 :mod:`sklearn.impute`
 .....................
@@ -298,7 +303,7 @@ Changelog
   :func:`preprocessing.quantile_transform` now supports disabling subsampling
   explicitly.
   :pr:`27636` by :user:`Ralph Urlus`.
- 
+
 :mod:`sklearn.tree`
 ...................
""" X_train, y_train = _safe_split(estimator, X, y, train) X_test, y_test = _safe_split(estimator, X, y, test, train) - return rfe._fit( + + rfe._fit( X_train, y_train, lambda estimator, features: _score( @@ -43,7 +44,9 @@ def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer): scorer, score_params=None, ), - ).scores_ + ) + + return rfe.step_scores_, rfe.step_n_features_ def _estimator_has(attr): @@ -264,10 +267,9 @@ def fit(self, X, y, **fit_params): return self._fit(X, y, **fit_params) def _fit(self, X, y, step_score=None, **fit_params): - # Parameter step_score controls the calculation of self.scores_ - # step_score is not exposed to users - # and is used when implementing RFECV - # self.scores_ will not be calculated when calling _fit through fit + # Parameter step_score controls the calculation of self.step_scores_ + # step_score is not exposed to users and is used when implementing RFECV + # self.step_scores_ will not be calculated when calling _fit through fit X, y = self._validate_data( X, @@ -296,7 +298,8 @@ def _fit(self, X, y, step_score=None, **fit_params): ranking_ = np.ones(n_features, dtype=int) if step_score: - self.scores_ = [] + self.step_n_features_ = [] + self.step_scores_ = [] # Elimination while np.sum(support_) > n_features_to_select: @@ -328,7 +331,8 @@ def _fit(self, X, y, step_score=None, **fit_params): # because 'estimator' must use features # that have not been eliminated yet if step_score: - self.scores_.append(step_score(estimator, features)) + self.step_n_features_.append(len(features)) + self.step_scores_.append(step_score(estimator, features)) support_[features[ranks][:threshold]] = False ranking_[np.logical_not(support_)] += 1 @@ -339,7 +343,8 @@ def _fit(self, X, y, step_score=None, **fit_params): # Compute step score when only n_features_to_select features left if step_score: - self.scores_.append(step_score(self.estimator_, features)) + self.step_n_features_.append(len(features)) + self.step_scores_.append(step_score(self.estimator_, features)) self.n_features_ = support_.sum() self.support_ = support_ self.ranking_ = ranking_ @@ -581,6 +586,9 @@ class RFECV(RFE): std_test_score : ndarray of shape (n_subsets_of_features,) Standard deviation of scores over the folds. + n_features : ndarray of shape (n_subsets_of_features,) + Number of features used at each step. + .. 
diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py
index e85dc8f623596..44764655e988d 100644
--- a/sklearn/feature_selection/_rfe.py
+++ b/sklearn/feature_selection/_rfe.py
@@ -28,11 +28,12 @@
 
 def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer):
     """
-    Return the score for a fit across one fold.
+    Return the score and n_features per step for a fit across one fold.
     """
     X_train, y_train = _safe_split(estimator, X, y, train)
     X_test, y_test = _safe_split(estimator, X, y, test, train)
-    return rfe._fit(
+
+    rfe._fit(
         X_train,
         y_train,
         lambda estimator, features: _score(
@@ -43,7 +44,9 @@ def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer):
             scorer,
             score_params=None,
         ),
-    ).scores_
+    )
+
+    return rfe.step_scores_, rfe.step_n_features_
 
 
 def _estimator_has(attr):
@@ -264,10 +267,9 @@ def fit(self, X, y, **fit_params):
         return self._fit(X, y, **fit_params)
 
     def _fit(self, X, y, step_score=None, **fit_params):
-        # Parameter step_score controls the calculation of self.scores_
-        # step_score is not exposed to users
-        # and is used when implementing RFECV
-        # self.scores_ will not be calculated when calling _fit through fit
+        # Parameter step_score controls the calculation of self.step_scores_
+        # step_score is not exposed to users and is used when implementing RFECV
+        # self.step_scores_ will not be calculated when calling _fit through fit
 
         X, y = self._validate_data(
             X,
@@ -296,7 +298,8 @@ def _fit(self, X, y, step_score=None, **fit_params):
         ranking_ = np.ones(n_features, dtype=int)
 
         if step_score:
-            self.scores_ = []
+            self.step_n_features_ = []
+            self.step_scores_ = []
 
         # Elimination
         while np.sum(support_) > n_features_to_select:
@@ -328,7 +331,8 @@ def _fit(self, X, y, step_score=None, **fit_params):
             # Compute step score on the previous selection iteration
             # because 'estimator' must use features
             # that have not been eliminated yet
             if step_score:
-                self.scores_.append(step_score(estimator, features))
+                self.step_n_features_.append(len(features))
+                self.step_scores_.append(step_score(estimator, features))
             support_[features[ranks][:threshold]] = False
             ranking_[np.logical_not(support_)] += 1
@@ -339,7 +343,8 @@ def _fit(self, X, y, step_score=None, **fit_params):
 
         # Compute step score when only n_features_to_select features left
        if step_score:
-            self.scores_.append(step_score(self.estimator_, features))
+            self.step_n_features_.append(len(features))
+            self.step_scores_.append(step_score(self.estimator_, features))
         self.n_features_ = support_.sum()
         self.support_ = support_
         self.ranking_ = ranking_
@@ -581,6 +586,9 @@ class RFECV(RFE):
         std_test_score : ndarray of shape (n_subsets_of_features,)
             Standard deviation of scores over the folds.
 
+        n_features : ndarray of shape (n_subsets_of_features,)
+            Number of features used at each step.
+
         .. versionadded:: 1.0
 
     n_features_ : int
@@ -718,12 +726,6 @@ def fit(self, X, y, groups=None):
         # Initialization
         cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))
         scorer = check_scoring(self.estimator, scoring=self.scoring)
-        n_features = X.shape[1]
-
-        if 0.0 < self.step < 1.0:
-            step = int(max(1, self.step * n_features))
-        else:
-            step = int(self.step)
 
         # Build an RFE object, which will evaluate and score each possible
         # feature count, down to self.min_features_to_select
@@ -753,18 +755,18 @@ def fit(self, X, y, groups=None):
         parallel = Parallel(n_jobs=self.n_jobs)
         func = delayed(_rfe_single_fit)
 
-        scores = parallel(
+        scores_features = parallel(
             func(rfe, self.estimator, X, y, train, test, scorer)
             for train, test in cv.split(X, y, groups)
         )
+        scores, step_n_features = zip(*scores_features)
 
+        step_n_features_rev = np.array(step_n_features[0])[::-1]
         scores = np.array(scores)
-        scores_sum = np.sum(scores, axis=0)
-        scores_sum_rev = scores_sum[::-1]
-        argmax_idx = len(scores_sum) - np.argmax(scores_sum_rev) - 1
-        n_features_to_select = max(
-            n_features - (argmax_idx * step), self.min_features_to_select
-        )
+
+        # Reverse order such that lowest number of features is selected in case of tie.
+        scores_sum_rev = np.sum(scores, axis=0)[::-1]
+        n_features_to_select = step_n_features_rev[np.argmax(scores_sum_rev)]
 
         # Re-execute an elimination with best_k over the whole set
         rfe = RFE(
@@ -786,11 +788,10 @@ def fit(self, X, y, groups=None):
 
         # reverse to stay consistent with before
         scores_rev = scores[:, ::-1]
-        self.cv_results_ = {}
-        self.cv_results_["mean_test_score"] = np.mean(scores_rev, axis=0)
-        self.cv_results_["std_test_score"] = np.std(scores_rev, axis=0)
-
-        for i in range(scores.shape[0]):
-            self.cv_results_[f"split{i}_test_score"] = scores_rev[i]
-
+        self.cv_results_ = {
+            "mean_test_score": np.mean(scores_rev, axis=0),
+            "std_test_score": np.std(scores_rev, axis=0),
+            **{f"split{i}_test_score": scores_rev[i] for i in range(scores.shape[0])},
+            "n_features": step_n_features_rev,
+        }
         return self
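Reviewer note on the tie-breaking logic above: `np.argmax` returns the index of the first maximum, and the per-step arrays are collected in elimination order (largest feature subset first), so reversing both arrays before the lookup resolves ties toward the smallest subset. A minimal standalone sketch of the same indexing trick, with made-up score values:

import numpy as np

# Hypothetical fold-summed scores in elimination order: the first entry
# belongs to the largest feature subset, the last to the smallest.
scores_sum = np.array([0.85, 0.90, 0.90, 0.80])  # n_features = 4, 3, 2, 1
step_n_features = np.array([4, 3, 2, 1])

# After reversal the smallest subset comes first; np.argmax picks the first
# maximum it sees, so the tie between 2 and 3 features resolves to 2.
scores_sum_rev = scores_sum[::-1]            # [0.80, 0.90, 0.90, 0.85]
step_n_features_rev = step_n_features[::-1]  # [1, 2, 3, 4]
print(step_n_features_rev[np.argmax(scores_sum_rev)])  # prints 2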
diff --git a/sklearn/feature_selection/tests/test_rfe.py b/sklearn/feature_selection/tests/test_rfe.py
index e3edb0e7b5d21..01c6194493ab6 100644
--- a/sklearn/feature_selection/tests/test_rfe.py
+++ b/sklearn/feature_selection/tests/test_rfe.py
@@ -11,7 +11,7 @@
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.compose import TransformedTargetRegressor
 from sklearn.cross_decomposition import CCA, PLSCanonical, PLSRegression
-from sklearn.datasets import load_iris, make_friedman1
+from sklearn.datasets import load_iris, make_classification, make_friedman1
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.feature_selection import RFE, RFECV
 from sklearn.impute import SimpleImputer
@@ -537,9 +537,7 @@ def test_rfecv_std_and_mean(global_random_seed):
     rfecv = RFECV(estimator=SVC(kernel="linear"))
     rfecv.fit(X, y)
 
-    n_split_keys = len(rfecv.cv_results_) - 2
-    split_keys = [f"split{i}_test_score" for i in range(n_split_keys)]
-
+    split_keys = [key for key in rfecv.cv_results_.keys() if "split" in key]
     cv_scores = np.asarray([rfecv.cv_results_[key] for key in split_keys])
     expected_mean = np.mean(cv_scores, axis=0)
     expected_std = np.std(cv_scores, axis=0)
@@ -548,6 +546,44 @@
     assert_allclose(rfecv.cv_results_["std_test_score"], expected_std)
 
 
+@pytest.mark.parametrize(
+    ["min_features_to_select", "n_features", "step", "cv_results_n_features"],
+    [
+        [1, 4, 1, np.array([1, 2, 3, 4])],
+        [1, 5, 1, np.array([1, 2, 3, 4, 5])],
+        [1, 4, 2, np.array([1, 2, 4])],
+        [1, 5, 2, np.array([1, 3, 5])],
+        [1, 4, 3, np.array([1, 4])],
+        [1, 5, 3, np.array([1, 2, 5])],
+        [1, 4, 4, np.array([1, 4])],
+        [1, 5, 4, np.array([1, 5])],
+        [4, 4, 2, np.array([4])],
+        [4, 5, 1, np.array([4, 5])],
+        [4, 5, 2, np.array([4, 5])],
+    ],
+)
+def test_rfecv_cv_results_n_features(
+    min_features_to_select,
+    n_features,
+    step,
+    cv_results_n_features,
+):
+    X, y = make_classification(
+        n_samples=20, n_features=n_features, n_informative=n_features, n_redundant=0
+    )
+    rfecv = RFECV(
+        estimator=SVC(kernel="linear"),
+        step=step,
+        min_features_to_select=min_features_to_select,
+    )
+    rfecv.fit(X, y)
+    assert_array_equal(rfecv.cv_results_["n_features"], cv_results_n_features)
+    assert all(
+        len(value) == len(rfecv.cv_results_["n_features"])
+        for value in rfecv.cv_results_.values()
+    )
+
+
 @pytest.mark.parametrize("ClsRFE", [RFE, RFECV])
 def test_multioutput(ClsRFE):
     X = np.random.normal(size=(10, 3))
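For reference, a minimal usage sketch of the new key on a fitted estimator (the synthetic dataset and linear-SVC estimator are illustrative assumptions, mirroring the tests above):

from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC

X, y = make_classification(n_samples=100, n_features=8, random_state=0)
rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, min_features_to_select=1)
rfecv.fit(X, y)

# Every cv_results_ entry is aligned with the new "n_features" key, so each
# score can be read against the feature-subset size it was computed for.
for n, mean, std in zip(
    rfecv.cv_results_["n_features"],
    rfecv.cv_results_["mean_test_score"],
    rfecv.cv_results_["std_test_score"],
):
    print(f"{n} features: mean test accuracy {mean:.3f} +/- {std:.3f}")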