@@ -0,0 +1,9 @@
- :class:`feature_selection.RFE` and :class:`feature_selection.RFECV`
  now support the use of :func:`~sklearn.inspection.permutation_importance` as an
  `importance_getter`. When `importance_getter` is a callable, it is, when possible,
  passed `feature_indices` along with the fitted estimator. The `feature_indices`
  array holds the indices of the features from the full dataset that have not been
  eliminated yet. This lets methods that need a test set, such as
  :func:`~sklearn.inspection.permutation_importance`, know which features to use in
  their predictions.
  By :user:`Gaétan de Castellane <GaetandeCast>`.
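
To make the contract concrete, here is a minimal, self-contained sketch (not part of the diff) of an `importance_getter` callable that accepts `feature_indices`; the dataset, estimator, and helper name are arbitrary, and a fuller version appears in the example and tests changed below.

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

X, y = make_classification(
    n_samples=200, n_features=8, n_informative=3, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)


def getter(estimator, feature_indices):
    # `feature_indices` holds the columns of the full dataset that RFE has not
    # eliminated yet; slice the held-out data accordingly before scoring.
    return permutation_importance(
        estimator, X_val[:, feature_indices], y_val, n_repeats=5, random_state=0
    ).importances_mean


rfe = RFE(
    estimator=RandomForestClassifier(random_state=0),
    n_features_to_select=3,
    importance_getter=getter,
).fit(X_train, y_train)
print(rfe.support_)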
74 changes: 72 additions & 2 deletions examples/feature_selection/plot_rfe_with_cross_validation.py
@@ -21,12 +21,13 @@
# features are non-informative as they are drawn at random.

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

n_features = 15
feat_names = [f"feature_{i}" for i in range(15)]

X, y = make_classification(
    n_samples=500,
    n_samples=1_000,
    n_features=n_features,
    n_informative=3,
    n_redundant=2,
@@ -36,6 +37,9 @@
    class_sep=0.8,
    random_state=0,
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.5, shuffle=False, random_state=0
)

# %%
# Model training and selection
@@ -60,7 +64,7 @@
    min_features_to_select=min_features_to_select,
    n_jobs=2,
)
rfecv.fit(X, y)
rfecv.fit(X_train, y_train)

print(f"Optimal number of features: {rfecv.n_features_}")

@@ -113,3 +117,69 @@
# In the five folds, the selected features are consistent. This is good news,
# it means that the selection is stable across folds, and it confirms that
# these features are the most informative ones.

# %%
# Using `permutation_importance` to select features
# -------------------------------------------------
# By default, the `importance_getter` parameter of RFE and RFECV derives feature
# importance from the `coef_` (e.g. in linear models) or `feature_importances_`
# attribute of the estimator. These importance measures determine which features are
# eliminated first.
#
# Here we show how to use a callable that computes `permutation_importance` instead.
# This callable accepts a fitted model and an array containing the indices of the
# features that remain after elimination.

# %%
from sklearn.inspection import permutation_importance


# Use `feature_indices` to select, from the test set, the features that have not been
# eliminated yet.
def permutation_importance_getter(model, feature_indices, X_test, y_test, random_state):
    return permutation_importance(
        model,
        X_test[:, feature_indices],
        y_test,
        n_repeats=10,
        n_jobs=2,
        random_state=random_state,
    ).importances_mean


rfecv = RFECV(
    estimator=clf,
    step=1,
    cv=cv,
    scoring="accuracy",
    min_features_to_select=min_features_to_select,
    n_jobs=2,
    importance_getter=lambda model, feature_indices: permutation_importance_getter(
        model, feature_indices, X_test, y_test, random_state=0
    ),
)
rfecv.fit(X_train, y_train)

print(f"Optimal number of features: {rfecv.n_features_}")

# %%
data = {
    key: value
    for key, value in rfecv.cv_results_.items()
    if key in ["n_features", "mean_test_score", "std_test_score"]
}
cv_results = pd.DataFrame(data)
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Mean test accuracy")
plt.errorbar(
    x=cv_results["n_features"],
    y=cv_results["mean_test_score"],
    yerr=cv_results["std_test_score"],
)
plt.title("Recursive Feature Elimination \nwith correlated features")
plt.show()

# %%
# We see that we obtain very similar results with this model-agnostic feature
# importance method.
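
# %%
# As a short addition (not part of the original diff), the cell below sketches one
# way to list the features kept by the permutation-importance-based `RFECV` above,
# reusing `feat_names` and `rfecv` defined earlier in this example.
selected = [name for name, kept in zip(feat_names, rfecv.support_) if kept]
print(f"Selected features: {selected}")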
18 changes: 16 additions & 2 deletions sklearn/feature_selection/_base.py
@@ -3,6 +3,7 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import inspect
import warnings
from abc import ABCMeta, abstractmethod
from operator import attrgetter
@@ -196,7 +197,9 @@ def get_feature_names_out(self, input_features=None):
        return input_features[self.get_support()]


def _get_feature_importances(estimator, getter, transform_func=None, norm_order=1):
def _get_feature_importances(
    estimator, getter, feature_indices=None, transform_func=None, norm_order=1
):
    """
    Retrieve and aggregate (ndim > 1) the feature importances
    from an estimator. Also optionally applies transformation.
@@ -215,6 +218,10 @@ def _get_feature_importances(estimator, getter, transform_func=None, norm_order=
        The transform to apply to the feature importances. By default (`None`)
        no transformation is applied.

    feature_indices : ndarray of shape (n_features,), default=None
        The indices of the features from the full dataset whose importances are
        currently evaluated. These are passed to `getter` when it can accept them.

    norm_order : int, default=1
        The norm order to apply when `transform_func="norm"`. Only applied
        when `importances.ndim > 1`.
@@ -243,7 +250,14 @@ def _get_feature_importances(estimator, getter, transform_func=None, norm_order=
    elif not callable(getter):
        raise ValueError("`importance_getter` has to be a string or `callable`")

    importances = getter(estimator)
    if isinstance(getter, attrgetter):
        importances = getter(estimator)
    else:
        param_names = list(inspect.signature(getter).parameters.keys())
        if "feature_indices" in param_names:
            importances = getter(estimator, feature_indices)
        else:
            importances = getter(estimator)

    if transform_func is None:
        return importances
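
For illustration only (not part of the diff): a small sketch of how the `inspect.signature` check above distinguishes a callable that accepts `feature_indices` from one that does not; both getters here are hypothetical.

import inspect


def getter_plain(estimator):
    return estimator.feature_importances_


def getter_with_indices(estimator, feature_indices):
    # A real getter would typically use `feature_indices` to slice a held-out set.
    return estimator.feature_importances_


for getter in (getter_plain, getter_with_indices):
    accepts_indices = "feature_indices" in inspect.signature(getter).parameters
    print(getter.__name__, accepts_indices)
# getter_plain False
# getter_with_indices True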
27 changes: 24 additions & 3 deletions sklearn/feature_selection/_rfe.py
@@ -120,14 +120,24 @@ class RFE(SelectorMixin, MetaEstimatorMixin, BaseEstimator):
        For example, give `regressor_.coef_` in case of
        :class:`~sklearn.compose.TransformedTargetRegressor` or
        `named_steps.clf.feature_importances_` in case of
        class:`~sklearn.pipeline.Pipeline` with its last step named `clf`.
        :class:`~sklearn.pipeline.Pipeline` with its last step named `clf`.

        If `callable`, overrides the default feature importance getter.
        The callable is passed with the fitted estimator and it should
        return importance for each feature.
        return importance for each feature. When the callable also accepts
        `feature_indices` in its signature, it will be passed the indices of the
        features of the full dataset that remain after elimination in previous
        iterations.

        `feature_indices` allows `RFE` to be used with permutation importance, as
        shown with `RFECV` at the end of
        :ref:`sphx_glr_auto_examples_feature_selection_plot_rfe_with_cross_validation.py`.

        .. versionadded:: 0.24

        .. versionchanged:: 1.8
            Added support for passing `feature_indices` to the callable when it is
            part of its signature.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
@@ -346,6 +356,7 @@ def _fit(self, X, y, step_score=None, **fit_params):
            importances = _get_feature_importances(
                estimator,
                self.importance_getter,
                features,
                transform_func="square",
            )
            ranks = np.argsort(importances)
@@ -646,10 +657,20 @@ class RFECV(RFE):

        If `callable`, overrides the default feature importance getter.
        The callable is passed with the fitted estimator and it should
        return importance for each feature.
        return importance for each feature. When the callable also accepts
        `feature_indices` in its signature, it will be passed the indices of the
        features of the full dataset that remain after elimination in previous
        iterations.

        `feature_indices` allows `RFECV` to be used with permutation importance, as
        shown at the end of
        :ref:`sphx_glr_auto_examples_feature_selection_plot_rfe_with_cross_validation.py`.

        .. versionadded:: 0.24
Reviewer comment: I think we should add a ``.. versionchanged:: 1.8`` and mention
support for passing `feature_indices` to the callable when part of its signature.
And similarly for the docstring of the other class.


        .. versionchanged:: 1.8
            Added support for passing `feature_indices` to the callable when it is
            part of its signature.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
77 changes: 75 additions & 2 deletions sklearn/feature_selection/tests/test_rfe.py
@@ -14,12 +14,13 @@
from sklearn.compose import TransformedTargetRegressor
from sklearn.cross_decomposition import CCA, PLSCanonical, PLSRegression
from sklearn.datasets import load_iris, make_classification, make_friedman1
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import RFE, RFECV
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import get_scorer, make_scorer, zero_one_loss
from sklearn.model_selection import GroupKFold, cross_val_score
from sklearn.model_selection import GroupKFold, cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, SVR, LinearSVR
@@ -753,3 +754,75 @@ def test_results_per_cv_in_rfecv(global_random_seed):
    assert len(rfecv.cv_results_["split1_ranking"]) == len(
        rfecv.cv_results_["split2_ranking"]
    )


def test_rfe_with_permutation_importance(global_random_seed):
    """
    Ensure that using `permutation_importance` as an `importance_getter` selects the
    number of features set with `n_features_to_select`.
    """
    X, y = make_friedman1(n_samples=100, n_features=7, random_state=global_random_seed)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=global_random_seed
    )

    reg = RandomForestRegressor(random_state=global_random_seed, n_estimators=2)

    def permutation_importance_getter(
        model, feature_indices, X_test, y_test, random_state
    ):
        return permutation_importance(
            model,
            X_test[:, feature_indices],
            y_test,
            n_repeats=2,
            random_state=random_state,
        ).importances_mean

    rfe = RFE(
        estimator=reg,
        importance_getter=lambda model, feature_indices: permutation_importance_getter(
            model, feature_indices, X_test, y_test, global_random_seed
        ),
        n_features_to_select=5,
    ).fit(X_train, y_train)

    assert rfe.n_features_ == 5


def test_rfecv_with_permutation_importance(global_random_seed):
    """
    Ensure that using `permutation_importance` as an `importance_getter` in `RFECV`
    selects exactly the five informative features of `make_friedman1`.
    """
    X, y = make_friedman1(
        n_samples=1_000, n_features=7, random_state=global_random_seed
    )

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=global_random_seed
    )

    reg = RandomForestRegressor(random_state=global_random_seed, n_estimators=15)

    def permutation_importance_getter(
        model, feature_indices, X_test, y_test, random_state
    ):
        return permutation_importance(
            model,
            X_test[:, feature_indices],
            y_test,
            n_repeats=2,
            random_state=random_state,
        ).importances_mean

    rfecv = RFECV(
        estimator=reg,
        importance_getter=lambda model, feature_indices: permutation_importance_getter(
            model, feature_indices, X_test, y_test, global_random_seed
        ),
    ).fit(X_train, y_train)

    assert rfecv.n_features_ == 5
    assert_array_equal(rfecv.support_, np.array(([True] * 5) + ([False] * 2)))