diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
index 39235625093bc..8e0686ca77b31 100644
--- a/doc/whats_new/v0.22.rst
+++ b/doc/whats_new/v0.22.rst
@@ -69,7 +69,6 @@ Changelog
    :pr:`123456` by :user:`Joe Bloggs <joeongithub>`.
    where 123456 is the *pull request* number, not the issue number.
 
-
 :mod:`sklearn.base`
 ...................
 
@@ -311,6 +310,15 @@ Changelog
 :mod:`sklearn.feature_selection`
 ................................
 
+- |Enhancement| Updated the following :mod:`sklearn.feature_selection`
+  estimators to allow NaN/Inf values in ``transform`` and ``fit``:
+  :class:`feature_selection.RFE`, :class:`feature_selection.RFECV`,
+  :class:`feature_selection.SelectFromModel`,
+  and :class:`feature_selection.VarianceThreshold`. Note that if the
+  underlying estimator of the feature selector does not allow NaN/Inf
+  then it will still error, but the feature selectors themselves no
+  longer reject such values up front. :pr:`11635` by :user:`Alec Peters <adpeters>`.
+
 - |Fix| Fixed a bug where :class:`feature_selection.VarianceThreshold` with
   `threshold=0` did not remove constant features due to numerical instability,
   by using range rather than variance in this case.
diff --git a/sklearn/feature_selection/_base.py b/sklearn/feature_selection/_base.py
index bcd9834189f60..20a54c41a358b 100644
--- a/sklearn/feature_selection/_base.py
+++ b/sklearn/feature_selection/_base.py
@@ -71,7 +71,9 @@ def transform(self, X):
         X_r : array of shape [n_samples, n_selected_features]
             The input samples with only the selected features.
         """
-        X = check_array(X, dtype=None, accept_sparse='csr')
+        tags = self._get_tags()
+        X = check_array(X, dtype=None, accept_sparse='csr',
+                        force_all_finite=not tags.get('allow_nan', True))
         mask = self.get_support()
         if not mask.any():
             warn("No features were selected: either the data is"
diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py
index 3e324fbec5535..674127f06acd7 100644
--- a/sklearn/feature_selection/_from_model.py
+++ b/sklearn/feature_selection/_from_model.py
@@ -131,6 +131,10 @@ class SelectFromModel(MetaEstimatorMixin, SelectorMixin, BaseEstimator):
     threshold_ : float
         The threshold value used for feature selection.
 
+    Notes
+    -----
+    Allows NaN/Inf in the input if the underlying estimator does as well.
+
     Examples
     --------
     >>> from sklearn.feature_selection import SelectFromModel
@@ -249,3 +253,7 @@ def partial_fit(self, X, y=None, **fit_params):
             self.estimator_ = clone(self.estimator)
         self.estimator_.partial_fit(X, y, **fit_params)
         return self
+
+    def _more_tags(self):
+        estimator_tags = self.estimator._get_tags()
+        return {'allow_nan': estimator_tags.get('allow_nan', True)}
diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py
index 86362f27ef181..12e99175c9d61 100644
--- a/sklearn/feature_selection/_rfe.py
+++ b/sklearn/feature_selection/_rfe.py
@@ -103,6 +103,10 @@ class RFE(SelectorMixin, MetaEstimatorMixin, BaseEstimator):
     >>> selector.ranking_
     array([1, 1, 1, 1, 1, 6, 4, 3, 2, 5])
 
+    Notes
+    -----
+    Allows NaN/Inf in the input if the underlying estimator does as well.
+
     See also
     --------
     RFECV : Recursive feature elimination with built-in cross-validated
@@ -150,7 +154,9 @@ def _fit(self, X, y, step_score=None):
         # and is used when implementing RFECV
         # self.scores_ will not be calculated when calling _fit through fit
 
-        X, y = check_X_y(X, y, "csc", ensure_min_features=2)
+        tags = self._get_tags()
+        X, y = check_X_y(X, y, "csc", ensure_min_features=2,
+                         force_all_finite=not tags.get('allow_nan', True))
         # Initialization
         n_features = X.shape[1]
         if self.n_features_to_select is None:
@@ -326,7 +332,9 @@ def predict_log_proba(self, X):
         return self.estimator_.predict_log_proba(self.transform(X))
 
     def _more_tags(self):
-        return {'poor_score': True}
+        estimator_tags = self.estimator._get_tags()
+        return {'poor_score': True,
+                'allow_nan': estimator_tags.get('allow_nan', True)}
 
 
 class RFECV(RFE):
@@ -421,6 +429,8 @@ class RFECV(RFE):
         ``ceil((n_features - min_features_to_select) / step) + 1``,
         where step is the number of features removed at each iteration.
 
+    Allows NaN/Inf in the input if the underlying estimator does as well.
+
     Examples
     --------
     The following example shows how to retrieve the a-priori not known 5
@@ -479,7 +489,8 @@ def fit(self, X, y, groups=None):
            train/test set. Only used in conjunction with a "Group" :term:`cv`
            instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).
         """
-        X, y = check_X_y(X, y, "csr", ensure_min_features=2)
+        X, y = check_X_y(X, y, "csr", ensure_min_features=2,
+                         force_all_finite=False)
 
         # Initialization
         cv = check_cv(self.cv, y, is_classifier(self.estimator))
diff --git a/sklearn/feature_selection/_variance_threshold.py b/sklearn/feature_selection/_variance_threshold.py
index 15576fe31025c..4f9d720b762b9 100644
--- a/sklearn/feature_selection/_variance_threshold.py
+++ b/sklearn/feature_selection/_variance_threshold.py
@@ -29,6 +29,10 @@ class VarianceThreshold(SelectorMixin, BaseEstimator):
     variances_ : array, shape (n_features,)
         Variances of individual features.
 
+    Notes
+    -----
+    Allows NaN in the input.
+
     Examples
     --------
     The following dataset has integer features, two of which are the same
@@ -61,7 +65,8 @@ def fit(self, X, y=None):
         -------
         self
         """
-        X = check_array(X, ('csr', 'csc'), dtype=np.float64)
+        X = check_array(X, ('csr', 'csc'), dtype=np.float64,
+                        force_all_finite='allow-nan')
 
         if hasattr(X, "toarray"):   # sparse matrix
             _, self.variances_ = mean_variance_axis(X, axis=0)
@@ -69,16 +74,18 @@ def fit(self, X, y=None):
                 mins, maxes = min_max_axis(X, axis=0)
                 peak_to_peaks = maxes - mins
         else:
-            self.variances_ = np.var(X, axis=0)
+            self.variances_ = np.nanvar(X, axis=0)
             if self.threshold == 0:
                 peak_to_peaks = np.ptp(X, axis=0)
 
         if self.threshold == 0:
             # Use peak-to-peak to avoid numeric precision issues
             # for constant features
-            self.variances_ = np.minimum(self.variances_, peak_to_peaks)
+            compare_arr = np.array([self.variances_, peak_to_peaks])
+            self.variances_ = np.nanmin(compare_arr, axis=0)
 
-        if np.all(self.variances_ <= self.threshold):
+        if np.all(~np.isfinite(self.variances_) |
+                  (self.variances_ <= self.threshold)):
             msg = "No feature in X meets the variance threshold {0:.5f}"
             if X.shape[0] == 1:
                 msg += " (X contains only one sample)"
@@ -90,3 +97,6 @@ def _get_support_mask(self):
         check_is_fitted(self)
 
         return self.variances_ > self.threshold
+
+    def _more_tags(self):
+        return {'allow_nan': True}
diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py
index a1f6a9d970117..57bd88a30eb0e 100644
--- a/sklearn/feature_selection/tests/test_from_model.py
+++ b/sklearn/feature_selection/tests/test_from_model.py
@@ -10,10 +10,28 @@
 from sklearn.linear_model import LogisticRegression, SGDClassifier, Lasso
 from sklearn.svm import LinearSVC
 from sklearn.feature_selection import SelectFromModel
-from sklearn.ensemble import RandomForestClassifier
+from sklearn.experimental import enable_hist_gradient_boosting  # noqa
+from sklearn.ensemble import (RandomForestClassifier,
+                              HistGradientBoostingClassifier)
 from sklearn.linear_model import PassiveAggressiveClassifier
 from sklearn.base import BaseEstimator
 
+
+class NaNTag(BaseEstimator):
+    def _more_tags(self):
+        return {'allow_nan': True}
+
+
+class NoNaNTag(BaseEstimator):
+    def _more_tags(self):
+        return {'allow_nan': False}
+
+
+class NaNTagRandomForest(RandomForestClassifier):
+    def _more_tags(self):
+        return {'allow_nan': True}
+
+
 iris = datasets.load_iris()
 data, y = iris.data, iris.target
 rng = np.random.RandomState(0)
@@ -320,3 +338,40 @@ def test_threshold_without_refitting():
     # Set a higher threshold to filter out more features.
     model.threshold = "1.0 * mean"
     assert X_transform.shape[1] > model.transform(data).shape[1]
+
+
+def test_fit_accepts_nan_inf():
+    # Test that fit doesn't check for np.inf and np.nan values.
+    clf = HistGradientBoostingClassifier(random_state=0)
+
+    model = SelectFromModel(estimator=clf)
+
+    nan_data = data.copy()
+    nan_data[0] = np.NaN
+    nan_data[1] = np.Inf
+
+    model.fit(nan_data, y)
+
+
+def test_transform_accepts_nan_inf():
+    # Test that transform doesn't check for np.inf and np.nan values.
+    clf = NaNTagRandomForest(n_estimators=100, random_state=0)
+    nan_data = data.copy()
+
+    model = SelectFromModel(estimator=clf)
+    model.fit(nan_data, y)
+
+    nan_data[0] = np.NaN
+    nan_data[1] = np.Inf
+
+    model.transform(nan_data)
+
+
+def test_allow_nan_tag_comes_from_estimator():
+    allow_nan_est = NaNTag()
+    model = SelectFromModel(estimator=allow_nan_est)
+    assert model._get_tags()['allow_nan'] is True
+
+    no_nan_est = NoNaNTag()
+    model = SelectFromModel(estimator=no_nan_est)
+    assert model._get_tags()['allow_nan'] is False
diff --git a/sklearn/feature_selection/tests/test_rfe.py b/sklearn/feature_selection/tests/test_rfe.py
index 724c749ee636b..ccd3c0a1b0e83 100644
--- a/sklearn/feature_selection/tests/test_rfe.py
+++ b/sklearn/feature_selection/tests/test_rfe.py
@@ -2,6 +2,7 @@
 Testing Recursive feature elimination
 """
 
+import pytest
 import numpy as np
 from numpy.testing import assert_array_almost_equal, assert_array_equal
 from scipy import sparse
@@ -54,6 +55,9 @@ def get_params(self, deep=True):
     def set_params(self, **params):
         return self
 
+    def _get_tags(self):
+        return {}
+
 
 def test_rfe_features_importance():
     generator = check_random_state(0)
@@ -369,3 +373,25 @@ def test_rfe_cv_groups():
     )
     est_groups.fit(X, y, groups=groups)
     assert est_groups.n_features_ > 0
+
+
+@pytest.mark.parametrize("cv", [
+    None,
+    5
+])
+def test_rfe_allow_nan_inf_in_x(cv):
+    iris = load_iris()
+    X = iris.data
+    y = iris.target
+
+    # add nan and inf values to X
+    X[0][0] = np.NaN
+    X[0][1] = np.Inf
+
+    clf = MockClassifier()
+    if cv is not None:
+        rfe = RFECV(estimator=clf, cv=cv)
+    else:
+        rfe = RFE(estimator=clf)
+    rfe.fit(X, y)
+    rfe.transform(X)
diff --git a/sklearn/feature_selection/tests/test_variance_threshold.py b/sklearn/feature_selection/tests/test_variance_threshold.py
index 9dc7effd3d1a5..77d9c9445bc71 100644
--- a/sklearn/feature_selection/tests/test_variance_threshold.py
+++ b/sklearn/feature_selection/tests/test_variance_threshold.py
@@ -46,3 +46,15 @@ def test_zero_variance_floating_point_error():
     msg = "No feature in X meets the variance threshold 0.00000"
     with pytest.raises(ValueError, match=msg):
         VarianceThreshold().fit(X)
+
+
+def test_variance_nan():
+    arr = np.array(data, dtype=np.float64)
+    # add single NaN and feature should still be included
+    arr[0, 0] = np.NaN
+    # make all values in feature NaN and feature should be rejected
+    arr[:, 1] = np.NaN
+
+    for X in [arr, csr_matrix(arr), csc_matrix(arr), bsr_matrix(arr)]:
+        sel = VarianceThreshold().fit(X)
+        assert_array_equal([0, 3, 4], sel.get_support(indices=True))
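
The sketches below are illustrative only and not part of the patch.

A minimal sketch of the headline behavior, assuming a scikit-learn build (0.22 or later) that includes the changes above: SelectFromModel no longer runs its own finiteness check in ``fit``, so an estimator with native missing-value support, such as HistGradientBoostingClassifier, can be fit through the selector on data containing NaN.

import numpy as np
from sklearn.datasets import load_iris
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel

X, y = load_iris(return_X_y=True)
X[0, 0] = np.nan  # NaN passes through the selector; validation is left to HGB

model = SelectFromModel(estimator=HistGradientBoostingClassifier(random_state=0))
model.fit(X, y)  # the selector raises no "Input contains NaN" error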
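A sketch of the tag plumbing, under the same assumptions: SelectFromModel (and likewise RFE/RFECV) copies the wrapped estimator's 'allow_nan' tag, and SelectorMixin.transform only enforces finiteness when that tag is False, so a NaN-tolerant estimator makes the whole selector NaN-tolerant. The NaNCapableEstimator below is a hypothetical toy written for illustration, not a scikit-learn class.

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.feature_selection import SelectFromModel


class NaNCapableEstimator(BaseEstimator):
    """Hypothetical NaN-tolerant estimator exposing coef_ for the selector."""

    def fit(self, X, y=None):
        # NaN-aware per-feature "importance": mean absolute value.
        self.coef_ = np.nanmean(np.abs(X), axis=0)
        return self

    def _more_tags(self):
        return {'allow_nan': True}


X = np.array([[1.0, 100.0, np.nan],
              [2.0, 110.0, 0.5],
              [3.0, 120.0, 0.6]])

model = SelectFromModel(estimator=NaNCapableEstimator())
assert model._get_tags()['allow_nan'] is True  # tag copied from the estimator
X_reduced = model.fit_transform(X)             # no finiteness error is raised
print(X_reduced.shape)  # (3, 1): only the high-importance second column is kept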
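And the new VarianceThreshold semantics, mirroring test_variance_nan above: variances are computed with np.nanvar, so isolated NaNs are ignored, while an all-NaN feature has an undefined (NaN) variance and fails the ``variances_ > threshold`` support test, as does a constant feature. NumPy may emit RuntimeWarnings for the all-NaN column.

import numpy as np
from sklearn.feature_selection import VarianceThreshold

X = np.array([[1.0, np.nan, 7.0],
              [2.0, np.nan, 7.0],
              [np.nan, np.nan, 7.0]])

sel = VarianceThreshold().fit(X)
# Column 0 keeps its NaN-ignoring variance (0.25), column 1 is all NaN,
# and column 2 is constant, so only column 0 is selected.
print(sel.get_support(indices=True))  # -> [0]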