Commit 710fe97

ENH Add number of features used at each step to RFECV.cv_results_ (#28670)
Co-authored-by: Jérémie du Boisberranger <[email protected]>
1 parent 52138e2 · commit 710fe97

4 files changed: +82 -39 lines

doc/whats_new/v1.5.rst (+6 -1)

@@ -188,6 +188,11 @@ Changelog
   :pr:`28085` by :user:`Neto Menoci <netomenoci>` and
   :user:`Florin Andrei <FlorinAndrei>`.
 
+- |Enhancement| The `cv_results_` attribute of :class:`feature_selection.RFECV` has
+  a new key, `n_features`, containing an array with the number of features selected
+  at each step.
+  :pr:`28670` by :user:`Miguel Silva <miguelcsilva>`.
+
 :mod:`sklearn.impute`
 .....................
 

@@ -298,7 +303,7 @@ Changelog
   :func:`preprocessing.quantile_transform` now supports disabling
   subsampling explicitly.
   :pr:`27636` by :user:`Ralph Urlus <rurlus>`.
-
+
 :mod:`sklearn.tree`
 ...................
 
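To make the enhancement concrete, here is a minimal sketch of reading the new key after a fit; the dataset, estimator, and parameters below are illustrative, not taken from the commit:

```python
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC

# Illustrative data: 8 features, 4 of them informative.
X, y = make_classification(
    n_samples=100, n_features=8, n_informative=4, random_state=0
)

rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, min_features_to_select=1)
rfecv.fit(X, y)

# New key added by this commit: the subset size evaluated at each step,
# aligned with mean_test_score and std_test_score.
print(rfecv.cv_results_["n_features"])       # array([1, 2, ..., 8]) with step=1
print(rfecv.cv_results_["mean_test_score"])  # one mean score per subset size
```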

examples/feature_selection/plot_rfe_with_cross_validation.py (+5 -4)

@@ -66,15 +66,16 @@
 # ---------------------------------------------------
 
 import matplotlib.pyplot as plt
+import pandas as pd
 
-n_scores = len(rfecv.cv_results_["mean_test_score"])
+cv_results = pd.DataFrame(rfecv.cv_results_)
 plt.figure()
 plt.xlabel("Number of features selected")
 plt.ylabel("Mean test accuracy")
 plt.errorbar(
-    range(min_features_to_select, n_scores + min_features_to_select),
-    rfecv.cv_results_["mean_test_score"],
-    yerr=rfecv.cv_results_["std_test_score"],
+    x=cv_results["n_features"],
+    y=cv_results["mean_test_score"],
+    yerr=cv_results["std_test_score"],
 )
 plt.title("Recursive Feature Elimination \nwith correlated features")
 plt.show()
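Plotting against `cv_results["n_features"]` removes the need to reconstruct the x-axis from `min_features_to_select` and the score count, and it stays correct when `step > 1`, where the evaluated subset sizes are not evenly spaced (for example `[1, 3, 5]` for five features with `step=2`, per the new tests below).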

sklearn/feature_selection/_rfe.py (+31 -30)

@@ -28,11 +28,12 @@
 
 def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer):
     """
-    Return the score for a fit across one fold.
+    Return the score and n_features per step for a fit across one fold.
     """
     X_train, y_train = _safe_split(estimator, X, y, train)
     X_test, y_test = _safe_split(estimator, X, y, test, train)
-    return rfe._fit(
+
+    rfe._fit(
         X_train,
         y_train,
         lambda estimator, features: _score(

@@ -43,7 +44,9 @@ def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer):
             scorer,
             score_params=None,
         ),
-    ).scores_
+    )
+
+    return rfe.step_scores_, rfe.step_n_features_
 
 
 def _estimator_has(attr):
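Each fold now returns a `(step_scores_, step_n_features_)` pair rather than a bare score list. A minimal sketch, with made-up fold results, of how `RFECV.fit` later separates the pairs:

```python
import numpy as np

# Hypothetical results from three folds: (scores per step, n_features per step).
# Elimination runs from many features down to few, hence the descending counts.
scores_features = [
    ([0.70, 0.80, 0.75], [5, 3, 1]),
    ([0.65, 0.85, 0.70], [5, 3, 1]),
    ([0.72, 0.78, 0.74], [5, 3, 1]),
]

# zip(*...) transposes the list of pairs into two tuples,
# mirroring how RFECV.fit consumes the parallel per-fold results.
scores, step_n_features = zip(*scores_features)
scores = np.array(scores)   # shape (n_folds, n_steps)
print(step_n_features[0])   # [5, 3, 1] -- identical for every fold
```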
@@ -264,10 +267,9 @@ def fit(self, X, y, **fit_params):
         return self._fit(X, y, **fit_params)
 
     def _fit(self, X, y, step_score=None, **fit_params):
-        # Parameter step_score controls the calculation of self.scores_
-        # step_score is not exposed to users
-        # and is used when implementing RFECV
-        # self.scores_ will not be calculated when calling _fit through fit
+        # Parameter step_score controls the calculation of self.step_scores_
+        # step_score is not exposed to users and is used when implementing RFECV
+        # self.step_scores_ will not be calculated when calling _fit through fit
 
         X, y = self._validate_data(
             X,

@@ -296,7 +298,8 @@ def _fit(self, X, y, step_score=None, **fit_params):
         ranking_ = np.ones(n_features, dtype=int)
 
         if step_score:
-            self.scores_ = []
+            self.step_n_features_ = []
+            self.step_scores_ = []
 
         # Elimination
         while np.sum(support_) > n_features_to_select:

@@ -328,7 +331,8 @@ def _fit(self, X, y, step_score=None, **fit_params):
             # because 'estimator' must use features
             # that have not been eliminated yet
             if step_score:
-                self.scores_.append(step_score(estimator, features))
+                self.step_n_features_.append(len(features))
+                self.step_scores_.append(step_score(estimator, features))
             support_[features[ranks][:threshold]] = False
             ranking_[np.logical_not(support_)] += 1

@@ -339,7 +343,8 @@ def _fit(self, X, y, step_score=None, **fit_params):
 
         # Compute step score when only n_features_to_select features left
         if step_score:
-            self.scores_.append(step_score(self.estimator_, features))
+            self.step_n_features_.append(len(features))
+            self.step_scores_.append(step_score(self.estimator_, features))
         self.n_features_ = support_.sum()
         self.support_ = support_
         self.ranking_ = ranking_

@@ -581,6 +586,9 @@ class RFECV(RFE):
         std_test_score : ndarray of shape (n_subsets_of_features,)
             Standard deviation of scores over the folds.
 
+        n_features : ndarray of shape (n_subsets_of_features,)
+            Number of features used at each step.
+
         .. versionadded:: 1.0
 
     n_features_ : int

@@ -718,12 +726,6 @@ def fit(self, X, y, groups=None):
         # Initialization
         cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))
         scorer = check_scoring(self.estimator, scoring=self.scoring)
-        n_features = X.shape[1]
-
-        if 0.0 < self.step < 1.0:
-            step = int(max(1, self.step * n_features))
-        else:
-            step = int(self.step)
 
         # Build an RFE object, which will evaluate and score each possible
         # feature count, down to self.min_features_to_select

@@ -753,18 +755,18 @@ def fit(self, X, y, groups=None):
         parallel = Parallel(n_jobs=self.n_jobs)
         func = delayed(_rfe_single_fit)
 
-        scores = parallel(
+        scores_features = parallel(
             func(rfe, self.estimator, X, y, train, test, scorer)
             for train, test in cv.split(X, y, groups)
         )
+        scores, step_n_features = zip(*scores_features)
 
+        step_n_features_rev = np.array(step_n_features[0])[::-1]
         scores = np.array(scores)
-        scores_sum = np.sum(scores, axis=0)
-        scores_sum_rev = scores_sum[::-1]
-        argmax_idx = len(scores_sum) - np.argmax(scores_sum_rev) - 1
-        n_features_to_select = max(
-            n_features - (argmax_idx * step), self.min_features_to_select
-        )
+
+        # Reverse order such that lowest number of features is selected in case of tie.
+        scores_sum_rev = np.sum(scores, axis=0)[::-1]
+        n_features_to_select = step_n_features_rev[np.argmax(scores_sum_rev)]
 
         # Re-execute an elimination with best_k over the whole set
         rfe = RFE(
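The selection rule now reads directly off the recorded feature counts: the summed scores are reversed so that `np.argmax`, which returns the first maximum, lands on the smallest subset size in case of a tie. A worked example with made-up numbers:

```python
import numpy as np

# Hypothetical per-step feature counts (elimination order: many -> few)
step_n_features = [5, 3, 1]
# Hypothetical summed CV scores, tied between 5 and 3 features
scores_sum = np.array([2.4, 2.4, 1.9])  # aligned with [5, 3, 1]

step_n_features_rev = np.array(step_n_features)[::-1]  # [1, 3, 5]
scores_sum_rev = scores_sum[::-1]                      # [1.9, 2.4, 2.4]

# argmax returns the first of the tied maxima, i.e. the fewest features.
print(step_n_features_rev[np.argmax(scores_sum_rev)])  # 3
```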
@@ -786,11 +788,10 @@ def fit(self, X, y, groups=None):
 
         # reverse to stay consistent with before
         scores_rev = scores[:, ::-1]
-        self.cv_results_ = {}
-        self.cv_results_["mean_test_score"] = np.mean(scores_rev, axis=0)
-        self.cv_results_["std_test_score"] = np.std(scores_rev, axis=0)
-
-        for i in range(scores.shape[0]):
-            self.cv_results_[f"split{i}_test_score"] = scores_rev[i]
-
+        self.cv_results_ = {
+            "mean_test_score": np.mean(scores_rev, axis=0),
+            "std_test_score": np.std(scores_rev, axis=0),
+            **{f"split{i}_test_score": scores_rev[i] for i in range(scores.shape[0])},
+            "n_features": step_n_features_rev,
+        }
         return self
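For intuition, the rebuilt dict keeps every array aligned along the subset-size axis; with made-up numbers for two folds and three subset sizes it would look like the following sketch:

```python
import numpy as np

scores_rev = np.array([[0.6, 0.8, 0.7],    # fold 0, ascending n_features
                       [0.5, 0.9, 0.7]])   # fold 1
step_n_features_rev = np.array([1, 3, 5])

cv_results_ = {
    "mean_test_score": np.mean(scores_rev, axis=0),  # array([0.55, 0.85, 0.7])
    "std_test_score": np.std(scores_rev, axis=0),
    **{f"split{i}_test_score": scores_rev[i] for i in range(scores_rev.shape[0])},
    "n_features": step_n_features_rev,
}
# Every value has length 3 == len(cv_results_["n_features"])
```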

sklearn/feature_selection/tests/test_rfe.py (+40 -4)

@@ -11,7 +11,7 @@
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.compose import TransformedTargetRegressor
 from sklearn.cross_decomposition import CCA, PLSCanonical, PLSRegression
-from sklearn.datasets import load_iris, make_friedman1
+from sklearn.datasets import load_iris, make_classification, make_friedman1
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.feature_selection import RFE, RFECV
 from sklearn.impute import SimpleImputer

@@ -537,9 +537,7 @@ def test_rfecv_std_and_mean(global_random_seed):
 
     rfecv = RFECV(estimator=SVC(kernel="linear"))
     rfecv.fit(X, y)
-    n_split_keys = len(rfecv.cv_results_) - 2
-    split_keys = [f"split{i}_test_score" for i in range(n_split_keys)]
-
+    split_keys = [key for key in rfecv.cv_results_.keys() if "split" in key]
     cv_scores = np.asarray([rfecv.cv_results_[key] for key in split_keys])
     expected_mean = np.mean(cv_scores, axis=0)
     expected_std = np.std(cv_scores, axis=0)

@@ -548,6 +546,44 @@ def test_rfecv_std_and_mean(global_random_seed):
     assert_allclose(rfecv.cv_results_["std_test_score"], expected_std)
 
 
+@pytest.mark.parametrize(
+    ["min_features_to_select", "n_features", "step", "cv_results_n_features"],
+    [
+        [1, 4, 1, np.array([1, 2, 3, 4])],
+        [1, 5, 1, np.array([1, 2, 3, 4, 5])],
+        [1, 4, 2, np.array([1, 2, 4])],
+        [1, 5, 2, np.array([1, 3, 5])],
+        [1, 4, 3, np.array([1, 4])],
+        [1, 5, 3, np.array([1, 2, 5])],
+        [1, 4, 4, np.array([1, 4])],
+        [1, 5, 4, np.array([1, 5])],
+        [4, 4, 2, np.array([4])],
+        [4, 5, 1, np.array([4, 5])],
+        [4, 5, 2, np.array([4, 5])],
+    ],
+)
+def test_rfecv_cv_results_n_features(
+    min_features_to_select,
+    n_features,
+    step,
+    cv_results_n_features,
+):
+    X, y = make_classification(
+        n_samples=20, n_features=n_features, n_informative=n_features, n_redundant=0
+    )
+    rfecv = RFECV(
+        estimator=SVC(kernel="linear"),
+        step=step,
+        min_features_to_select=min_features_to_select,
+    )
+    rfecv.fit(X, y)
+    assert_array_equal(rfecv.cv_results_["n_features"], cv_results_n_features)
+    assert all(
+        len(value) == len(rfecv.cv_results_["n_features"])
+        for value in rfecv.cv_results_.values()
+    )
+
+
 @pytest.mark.parametrize("ClsRFE", [RFE, RFECV])
 def test_multioutput(ClsRFE):
     X = np.random.normal(size=(10, 3))
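The expected arrays in the parametrization follow from RFE's elimination schedule: `step` features are dropped per iteration, with the final drop clamped so the count never falls below `min_features_to_select`. A small sketch reproducing the expectations (the helper name is ours, not part of the test file):

```python
def expected_n_features(n_features, step, min_features_to_select):
    """Feature counts RFE visits, reported in ascending order."""
    counts = [n_features]
    while counts[-1] > min_features_to_select:
        # The last step is clamped so we never drop below the minimum.
        counts.append(max(counts[-1] - step, min_features_to_select))
    return counts[::-1]  # cv_results_["n_features"] is ascending

assert expected_n_features(5, 2, 1) == [1, 3, 5]
assert expected_n_features(5, 3, 1) == [1, 2, 5]
assert expected_n_features(4, 4, 1) == [1, 4]
```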
