diff --git a/doc/whats_new/upcoming_changes/sklearn.feature_selection/32251.feature.rst b/doc/whats_new/upcoming_changes/sklearn.feature_selection/32251.feature.rst
new file mode 100644
index 0000000000000..7fb6e8806ac51
--- /dev/null
+++ b/doc/whats_new/upcoming_changes/sklearn.feature_selection/32251.feature.rst
@@ -0,0 +1,9 @@
+- :class:`feature_selection.RFE` and :class:`feature_selection.RFECV` now support
+  using :func:`~sklearn.inspection.permutation_importance` as the `importance_getter`.
+  When `importance_getter` is a callable whose signature includes a `feature_indices`
+  argument, it is passed, along with the fitted estimator, the indices of the
+  features from the full dataset that have not been eliminated yet.
+  This allows importance measures that need a test set, like
+  :func:`~sklearn.inspection.permutation_importance`, to know which features to use
+  in their predictions.
+  By :user:`Gaétan de Castellane `.
diff --git a/examples/feature_selection/plot_rfe_with_cross_validation.py b/examples/feature_selection/plot_rfe_with_cross_validation.py
index 307707c5aa069..13e92598a0116 100644
--- a/examples/feature_selection/plot_rfe_with_cross_validation.py
+++ b/examples/feature_selection/plot_rfe_with_cross_validation.py
@@ -21,12 +21,13 @@
 # features are non-informative as they are drawn at random.
 
 from sklearn.datasets import make_classification
+from sklearn.model_selection import train_test_split
 
 n_features = 15
 feat_names = [f"feature_{i}" for i in range(15)]
 
 X, y = make_classification(
-    n_samples=500,
+    n_samples=1_000,
     n_features=n_features,
     n_informative=3,
     n_redundant=2,
@@ -36,6 +37,9 @@
     class_sep=0.8,
     random_state=0,
 )
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, train_size=0.5, shuffle=False, random_state=0
+)
 
 # %%
 # Model training and selection
@@ -60,7 +64,7 @@
     min_features_to_select=min_features_to_select,
     n_jobs=2,
 )
-rfecv.fit(X, y)
+rfecv.fit(X_train, y_train)
 
 print(f"Optimal number of features: {rfecv.n_features_}")
 
@@ -113,3 +117,69 @@
 # In the five folds, the selected features are consistent. This is good news,
 # it means that the selection is stable across folds, and it confirms that
 # these features are the most informative ones.
+
+# %%
+# Using `permutation_importance` to select features
+# -------------------------------------------------
+# By default, the `importance_getter` parameter in RFE and RFECV uses the `coef_`
+# (e.g. in linear models) or `feature_importances_` attribute of an estimator to
+# derive feature importances. These importance measures are used to choose which
+# features to eliminate first.
+#
+# We show here how to use a callable that computes `permutation_importance` instead.
+# This callable accepts the fitted model and an array containing the indices of the
+# features that remain after elimination.
+
+# %%
+from sklearn.inspection import permutation_importance
+
+
+# Use `feature_indices` to extract from the test set the features that have not been
+# eliminated yet.
+def permutation_importance_getter(model, feature_indices, X_test, y_test, random_state):
+    return permutation_importance(
+        model,
+        X_test[:, feature_indices],
+        y_test,
+        n_repeats=10,
+        n_jobs=2,
+        random_state=random_state,
+    ).importances_mean
+
+
+rfecv = RFECV(
+    estimator=clf,
+    step=1,
+    cv=cv,
+    scoring="accuracy",
+    min_features_to_select=min_features_to_select,
+    n_jobs=2,
+    importance_getter=lambda model, feature_indices: permutation_importance_getter(
+        model, feature_indices, X_test, y_test, random_state=0
+    ),
+)
+rfecv.fit(X_train, y_train)
+
+print(f"Optimal number of features: {rfecv.n_features_}")
+
+# %%
+data = {
+    key: value
+    for key, value in rfecv.cv_results_.items()
+    if key in ["n_features", "mean_test_score", "std_test_score"]
+}
+cv_results = pd.DataFrame(data)
+plt.figure()
+plt.xlabel("Number of features selected")
+plt.ylabel("Mean test accuracy")
+plt.errorbar(
+    x=cv_results["n_features"],
+    y=cv_results["mean_test_score"],
+    yerr=cv_results["std_test_score"],
+)
+plt.title("Recursive Feature Elimination \nwith correlated features")
+plt.show()
+
+# %%
+# We see that we obtain very similar results with this model-agnostic feature
+# importance method.
diff --git a/sklearn/feature_selection/_base.py b/sklearn/feature_selection/_base.py
index 3c12cd035d5c8..63eeaf532af4d 100644
--- a/sklearn/feature_selection/_base.py
+++ b/sklearn/feature_selection/_base.py
@@ -3,6 +3,7 @@
 # Authors: The scikit-learn developers
 # SPDX-License-Identifier: BSD-3-Clause
 
+import inspect
 import warnings
 from abc import ABCMeta, abstractmethod
 from operator import attrgetter
@@ -196,7 +197,9 @@ def get_feature_names_out(self, input_features=None):
         return input_features[self.get_support()]
 
 
-def _get_feature_importances(estimator, getter, transform_func=None, norm_order=1):
+def _get_feature_importances(
+    estimator, getter, feature_indices=None, transform_func=None, norm_order=1
+):
     """
     Retrieve and aggregate (ndim > 1) the feature importances from an estimator.
     Also optionally applies transformation.
@@ -215,6 +218,10 @@ def _get_feature_importances(estimator, getter, transform_func=None, norm_order=
         The transform to apply to the feature importances. By default (`None`) no
         transformation is applied.
 
+    feature_indices : ndarray of shape (n_features,), default=None
+        The indices of the features from the full dataset whose importance is
+        currently being evaluated. These are passed to `getter` when it accepts them.
+
     norm_order : int, default=1
         The norm order to apply when `transform_func="norm"`. Only applied when
         `importances.ndim > 1`.
@@ -243,7 +250,14 @@ def _get_feature_importances(estimator, getter, transform_func=None, norm_order=
     elif not callable(getter):
         raise ValueError("`importance_getter` has to be a string or `callable`")
 
-    importances = getter(estimator)
+    if isinstance(getter, attrgetter):
+        importances = getter(estimator)
+    else:
+        param_names = list(inspect.signature(getter).parameters.keys())
+        if "feature_indices" in param_names:
+            importances = getter(estimator, feature_indices)
+        else:
+            importances = getter(estimator)
 
     if transform_func is None:
         return importances
diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py
index 056bb0203b187..053ff76e67840 100644
--- a/sklearn/feature_selection/_rfe.py
+++ b/sklearn/feature_selection/_rfe.py
@@ -120,14 +120,24 @@ class RFE(SelectorMixin, MetaEstimatorMixin, BaseEstimator):
         For example, give `regressor_.coef_` in case of
         :class:`~sklearn.compose.TransformedTargetRegressor` or
         `named_steps.clf.feature_importances_` in case of
-        class:`~sklearn.pipeline.Pipeline` with its last step named `clf`.
+        :class:`~sklearn.pipeline.Pipeline` with its last step named `clf`.
 
         If `callable`, overrides the default feature importance getter.
         The callable is passed with the fitted estimator and it should
-        return importance for each feature.
+        return importance for each feature. When the callable also accepts
+        `feature_indices` in its signature, it is passed the indices of the features
+        of the full dataset that remain after previous elimination iterations.
+
+        `feature_indices` allows `RFE` to be used with permutation importance, as
+        shown with `RFECV` at the end of
+        :ref:`sphx_glr_auto_examples_feature_selection_plot_rfe_with_cross_validation.py`.
 
         .. versionadded:: 0.24
 
+        .. versionchanged:: 1.8
+            Added support for passing `feature_indices` to the callable when it is
+            part of its signature.
+
     Attributes
     ----------
     classes_ : ndarray of shape (n_classes,)
@@ -346,6 +356,7 @@ def _fit(self, X, y, step_score=None, **fit_params):
             importances = _get_feature_importances(
                 estimator,
                 self.importance_getter,
+                features,
                 transform_func="square",
             )
             ranks = np.argsort(importances)
@@ -646,10 +657,20 @@ class RFECV(RFE):
 
         If `callable`, overrides the default feature importance getter.
         The callable is passed with the fitted estimator and it should
-        return importance for each feature.
+        return importance for each feature. When the callable also accepts
+        `feature_indices` in its signature, it is passed the indices of the features
+        of the full dataset that remain after previous elimination iterations.
+
+        `feature_indices` allows `RFECV` to be used with permutation importance, as
+        shown at the end of
+        :ref:`sphx_glr_auto_examples_feature_selection_plot_rfe_with_cross_validation.py`.
 
         .. versionadded:: 0.24
 
+        .. versionchanged:: 1.8
+            Added support for passing `feature_indices` to the callable when it is
+            part of its signature.
+
     Attributes
     ----------
     classes_ : ndarray of shape (n_classes,)
diff --git a/sklearn/feature_selection/tests/test_rfe.py b/sklearn/feature_selection/tests/test_rfe.py
index 1f5672545874c..d5a5dd3cf9c99 100644
--- a/sklearn/feature_selection/tests/test_rfe.py
+++ b/sklearn/feature_selection/tests/test_rfe.py
@@ -14,12 +14,13 @@
 from sklearn.compose import TransformedTargetRegressor
 from sklearn.cross_decomposition import CCA, PLSCanonical, PLSRegression
 from sklearn.datasets import load_iris, make_classification, make_friedman1
-from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
 from sklearn.feature_selection import RFE, RFECV
 from sklearn.impute import SimpleImputer
+from sklearn.inspection import permutation_importance
 from sklearn.linear_model import LinearRegression, LogisticRegression
 from sklearn.metrics import get_scorer, make_scorer, zero_one_loss
-from sklearn.model_selection import GroupKFold, cross_val_score
+from sklearn.model_selection import GroupKFold, cross_val_score, train_test_split
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler
 from sklearn.svm import SVC, SVR, LinearSVR
@@ -753,3 +754,75 @@ def test_results_per_cv_in_rfecv(global_random_seed):
     assert len(rfecv.cv_results_["split1_ranking"]) == len(
         rfecv.cv_results_["split2_ranking"]
     )
+
+
+def test_rfe_with_permutation_importance(global_random_seed):
+    """
+    Ensure that using permutation_importance as an importance_getter selects the
+    number of features set with `n_features_to_select`.
+    """
+    X, y = make_friedman1(n_samples=100, n_features=7, random_state=global_random_seed)
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.5, random_state=global_random_seed
+    )
+
+    reg = RandomForestRegressor(random_state=global_random_seed, n_estimators=2)
+
+    def permutation_importance_getter(
+        model, feature_indices, X_test, y_test, random_state
+    ):
+        return permutation_importance(
+            model,
+            X_test[:, feature_indices],
+            y_test,
+            n_repeats=2,
+            random_state=random_state,
+        ).importances_mean
+
+    rfe = RFE(
+        estimator=reg,
+        importance_getter=lambda model, feature_indices: permutation_importance_getter(
+            model, feature_indices, X_test, y_test, global_random_seed
+        ),
+        n_features_to_select=5,
+    ).fit(X_train, y_train)
+
+    assert rfe.n_features_ == 5
+
+
+def test_rfecv_with_permutation_importance(global_random_seed):
+    """
+    Ensure that using permutation_importance as an importance_getter in `RFECV`
+    selects exactly the five informative features of `make_friedman1`.
+    """
+    X, y = make_friedman1(
+        n_samples=1_000, n_features=7, random_state=global_random_seed
+    )
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.5, random_state=global_random_seed
+    )
+
+    reg = RandomForestRegressor(random_state=global_random_seed, n_estimators=15)
+
+    def permutation_importance_getter(
+        model, feature_indices, X_test, y_test, random_state
+    ):
+        return permutation_importance(
+            model,
+            X_test[:, feature_indices],
+            y_test,
+            n_repeats=2,
+            random_state=random_state,
+        ).importances_mean
+
+    rfecv = RFECV(
+        estimator=reg,
+        importance_getter=lambda model, feature_indices: permutation_importance_getter(
+            model, feature_indices, X_test, y_test, global_random_seed
+        ),
+    ).fit(X_train, y_train)
+
+    assert rfecv.n_features_ == 5
+    assert_array_equal(rfecv.support_, np.array(([True] * 5) + ([False] * 2)))
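A minimal end-to-end sketch of the workflow this patch enables (it assumes the `feature_indices` support added above; the dataset, estimator, and hyperparameters below are illustrative only, not part of the patch). The `importance_getter` callable receives the fitted estimator and the indices of the features that survived previous elimination rounds, and slices a held-out test set to those columns before computing permutation importances:

from sklearn.datasets import make_friedman1
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

# Hold out a test set so permutation importance is not measured on training data.
X, y = make_friedman1(n_samples=200, n_features=7, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)


def importance_getter(model, feature_indices):
    # `feature_indices` are the full-dataset column indices of the not-yet-eliminated
    # features, so slice the held-out test set to those columns.
    return permutation_importance(
        model, X_test[:, feature_indices], y_test, n_repeats=5, random_state=0
    ).importances_mean


rfe = RFE(
    estimator=RandomForestRegressor(n_estimators=10, random_state=0),
    n_features_to_select=5,
    importance_getter=importance_getter,
)
rfe.fit(X_train, y_train)
print(rfe.support_)  # boolean mask of the retained features

Because the getter closes over `X_test` and `y_test`, only the fitted sub-estimator and the surviving feature indices need to travel through the RFE machinery.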