From 03eb7696dcac68c1fb25df289f59a3fb436f2fd0 Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Thu, 19 Jul 2018 09:41:07 -0700 Subject: [PATCH 01/41] removed check for nan/inf in SelectorMixin.tranform, RFE.fit, and RFECV.fit --- sklearn/feature_selection/base.py | 2 +- sklearn/feature_selection/rfe.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/feature_selection/base.py b/sklearn/feature_selection/base.py index 5bb0b3ea890c3..30902eca34e22 100644 --- a/sklearn/feature_selection/base.py +++ b/sklearn/feature_selection/base.py @@ -72,7 +72,7 @@ def transform(self, X): X_r : array of shape [n_samples, n_selected_features] The input samples with only the selected features. """ - X = check_array(X, dtype=None, accept_sparse='csr') + X = check_array(X, dtype=None, accept_sparse='csr', force_all_finite=False) mask = self.get_support() if not mask.any(): warn("No features were selected: either the data is" diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py index 31c0133b2c694..f7d8a1476de9a 100644 --- a/sklearn/feature_selection/rfe.py +++ b/sklearn/feature_selection/rfe.py @@ -7,7 +7,7 @@ """Recursive feature elimination for feature ranking""" import numpy as np -from ..utils import check_X_y, safe_sqr +from ..utils import check_array, safe_sqr from ..utils.metaestimators import if_delegate_has_method from ..utils.metaestimators import _safe_split from ..utils.validation import check_is_fitted @@ -145,7 +145,8 @@ def _fit(self, X, y, step_score=None): # and is used when implementing RFECV # self.scores_ will not be calculated when calling _fit through fit - X, y = check_X_y(X, y, "csc") + X = check_array(X, "csc", force_all_finite=False) + # Initialization n_features = X.shape[1] if self.n_features_to_select is None: @@ -409,7 +410,7 @@ def fit(self, X, y, groups=None): Group labels for the samples used while splitting the dataset into train/test set. """ - X, y = check_X_y(X, y, "csr") + X = check_array(X, "csr", force_all_finite=False) # Initialization cv = check_cv(self.cv, y, is_classifier(self.estimator)) From 03042a2055380848a9a311760b95bf094b1dae7f Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Thu, 19 Jul 2018 09:45:31 -0700 Subject: [PATCH 02/41] removed extra blank line --- sklearn/feature_selection/rfe.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py index f7d8a1476de9a..ed9ea2a574ec3 100644 --- a/sklearn/feature_selection/rfe.py +++ b/sklearn/feature_selection/rfe.py @@ -146,7 +146,6 @@ def _fit(self, X, y, step_score=None): # self.scores_ will not be calculated when calling _fit through fit X = check_array(X, "csc", force_all_finite=False) - # Initialization n_features = X.shape[1] if self.n_features_to_select is None: From d6ffba72f5d71a658336b9a2c559fc3dbe8a37f0 Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Thu, 19 Jul 2018 10:39:18 -0700 Subject: [PATCH 03/41] added tests for changes to RFE/RFECV and SelectorMixin --- sklearn/feature_selection/rfe.py | 2 +- sklearn/feature_selection/tests/test_rfe.py | 31 +++++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py index ed9ea2a574ec3..e1c3c069217f0 100644 --- a/sklearn/feature_selection/rfe.py +++ b/sklearn/feature_selection/rfe.py @@ -409,7 +409,7 @@ def fit(self, X, y, groups=None): Group labels for the samples used while splitting the dataset into train/test set. """ - X = check_array(X, "csr", force_all_finite=False) + X = check_array(X, "csr", force_all_finite=False) # Initialization cv = check_cv(self.cv, y, is_classifier(self.estimator)) diff --git a/sklearn/feature_selection/tests/test_rfe.py b/sklearn/feature_selection/tests/test_rfe.py index e8533d808dafd..a5110e1256746 100644 --- a/sklearn/feature_selection/tests/test_rfe.py +++ b/sklearn/feature_selection/tests/test_rfe.py @@ -354,3 +354,34 @@ def test_rfe_cv_groups(): ) est_groups.fit(X, y, groups=groups) assert est_groups.n_features_ > 0 + +def test_rfe_allow_nan_inf_in_x(): + generator = check_random_state(0) + iris = load_iris() + X = iris.data + y = iris.target + + # add nan and inf value to X + X[0][0] = np.NaN + X[0][1] = np.Inf + + clf = MockClassifier() + rfe = RFE(estimator=clf) + rfe.fit(X, y) + rfe.transform(X) + +def test_rfecv_allow_nan_inf_in_x(): + generator = check_random_state(0) + iris = load_iris() + X = iris.data + y = iris.target + + # add nan and inf value to X + X[0][0] = np.NaN + X[0][1] = np.Inf + + clf = MockClassifier() + rfecv = RFECV(estimator=clf) + rfecv.fit(X, y) + rfecv.transform(X) + From da4c5f49472d2e9113b1bce344fd377714b8aa64 Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Tue, 8 Jan 2019 11:40:33 -0800 Subject: [PATCH 04/41] pep8 changes and pytest.mark.parametrize for nan_inf tests --- sklearn/feature_selection/tests/test_rfe.py | 26 ++++++--------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/sklearn/feature_selection/tests/test_rfe.py b/sklearn/feature_selection/tests/test_rfe.py index a5110e1256746..bb27c8945ce4e 100644 --- a/sklearn/feature_selection/tests/test_rfe.py +++ b/sklearn/feature_selection/tests/test_rfe.py @@ -355,8 +355,12 @@ def test_rfe_cv_groups(): est_groups.fit(X, y, groups=groups) assert est_groups.n_features_ > 0 -def test_rfe_allow_nan_inf_in_x(): - generator = check_random_state(0) + +@pytest.mark.parametrize("rfe_cls", [ + RFE, + RFECV +]) +def test_rfe_allow_nan_inf_in_x(rfe_cls): iris = load_iris() X = iris.data y = iris.target @@ -366,22 +370,6 @@ def test_rfe_allow_nan_inf_in_x(): X[0][1] = np.Inf clf = MockClassifier() - rfe = RFE(estimator=clf) + rfe = rfe_cls(estimator=clf) rfe.fit(X, y) rfe.transform(X) - -def test_rfecv_allow_nan_inf_in_x(): - generator = check_random_state(0) - iris = load_iris() - X = iris.data - y = iris.target - - # add nan and inf value to X - X[0][0] = np.NaN - X[0][1] = np.Inf - - clf = MockClassifier() - rfecv = RFECV(estimator=clf) - rfecv.fit(X, y) - rfecv.transform(X) - From d3601f468c76b5a47c9786e0d11a51a5fd16c94f Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Tue, 8 Jan 2019 12:27:11 -0800 Subject: [PATCH 05/41] added force_all_finite=False for univariate feature selection fit --- sklearn/feature_selection/univariate_selection.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/feature_selection/univariate_selection.py b/sklearn/feature_selection/univariate_selection.py index 612f61028e2a4..831d308bbd6f6 100644 --- a/sklearn/feature_selection/univariate_selection.py +++ b/sklearn/feature_selection/univariate_selection.py @@ -337,7 +337,8 @@ def fit(self, X, y): ------- self : object """ - X, y = check_X_y(X, y, ['csr', 'csc'], multi_output=True) + X, y = check_X_y(X, y, ['csr', 'csc'], force_all_finite=False, + multi_output=True) if not callable(self.score_func): raise TypeError("The score function should be a callable, %s (%s) " From 27e1399b1ac4d44ee938bb34c2259c4b9599e717 Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Tue, 8 Jan 2019 13:54:55 -0800 Subject: [PATCH 06/41] added tests for nan and inf being allowed in univariate select --- .../tests/test_feature_select.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index 14e621473090a..c1a0b756e96a8 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -669,3 +669,27 @@ def test_mutual_info_regression(): gtruth = np.zeros(10) gtruth[:2] = 1 assert_array_equal(support, gtruth) + + +def test_univariate_nan_inf_allowed_in_transform(): + X, y = make_regression(n_samples=100, n_features=10, n_informative=2, + shuffle=False, random_state=0, noise=10) + + univariate_filter = GenericUnivariateSelect(f_regression, mode='percentile') + univariate_filter.fit(X, y) + X[0] = np.NaN + X[1] = np.Inf + univariate_filter.transform(X) + + +def test_univariate_nan_inf_allowed_in_fit(): + X, y = make_regression(n_samples=100, n_features=10, n_informative=2, + shuffle=False, random_state=0, noise=10) + + # fake scorer to ensure GenericUnivariateSelect.fit allows nan if the scorer does + fake_scorer = lambda x, y: x + + univariate_filter = GenericUnivariateSelect(fake_scorer, mode='percentile') + X[0] = np.NaN + X[1] = np.Inf + univariate_filter.fit(X, y) From 2b06f2a3633f569ec596e8cf0254dfd4bb6ea225 Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Tue, 8 Jan 2019 13:56:19 -0800 Subject: [PATCH 07/41] added test for nan and inf being allowed in SelectorMixin transform --- sklearn/feature_selection/tests/test_from_model.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py index e6bb76c5e19a9..843350d8c26b0 100644 --- a/sklearn/feature_selection/tests/test_from_model.py +++ b/sklearn/feature_selection/tests/test_from_model.py @@ -319,3 +319,14 @@ def test_threshold_without_refitting(): # Set a higher threshold to filter out more features. model.threshold = "1.0 * mean" assert_greater(X_transform.shape[1], model.transform(data).shape[1]) + +def test_transform_accepts_nan_inf(): + # Test that transform doesn't check for np.inf and np.nan values. + clf = RandomForestClassifier(n_estimators=100, random_state=0) + + model = SelectFromModel(estimator=clf) + model.fit(data, y) + + data[0] = np.NaN + data[1] = np.Inf + model.transform(data) From fb7d7d62ccfc9d95d1318498bf43a5448cdaa45a Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Tue, 8 Jan 2019 13:56:55 -0800 Subject: [PATCH 08/41] updated checks to not run check_estimators_nan_inf on feature selectors --- sklearn/utils/estimator_checks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 3db9865aad2c0..502210dbb1858 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -48,6 +48,7 @@ from sklearn.random_projection import BaseRandomProjection from sklearn.feature_selection import SelectKBest +from sklearn.feature_selection.base import SelectorMixin from sklearn.svm.base import BaseLibSVM from sklearn.linear_model.stochastic_gradient import BaseSGD from sklearn.pipeline import make_pipeline @@ -103,7 +104,7 @@ def _yield_non_meta_checks(name, estimator): # cross-decomposition's "transform" returns X and Y yield check_pipeline_consistency - if name not in ALLOW_NAN: + if name not in ALLOW_NAN and not isinstance(estimator, SelectorMixin): # Test that all estimators check their input for NaN's and infs yield check_estimators_nan_inf From 4d4662cfc1d439c83996d838574a555ad1c31b9d Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Tue, 8 Jan 2019 14:07:01 -0800 Subject: [PATCH 09/41] pep8 corrections --- sklearn/feature_selection/base.py | 3 ++- sklearn/feature_selection/tests/test_from_model.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/feature_selection/base.py b/sklearn/feature_selection/base.py index 30902eca34e22..b28335082e335 100644 --- a/sklearn/feature_selection/base.py +++ b/sklearn/feature_selection/base.py @@ -72,7 +72,8 @@ def transform(self, X): X_r : array of shape [n_samples, n_selected_features] The input samples with only the selected features. """ - X = check_array(X, dtype=None, accept_sparse='csr', force_all_finite=False) + X = check_array(X, dtype=None, accept_sparse='csr', + force_all_finite=False) mask = self.get_support() if not mask.any(): warn("No features were selected: either the data is" diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py index 843350d8c26b0..8d40cbe1a654b 100644 --- a/sklearn/feature_selection/tests/test_from_model.py +++ b/sklearn/feature_selection/tests/test_from_model.py @@ -320,6 +320,7 @@ def test_threshold_without_refitting(): model.threshold = "1.0 * mean" assert_greater(X_transform.shape[1], model.transform(data).shape[1]) + def test_transform_accepts_nan_inf(): # Test that transform doesn't check for np.inf and np.nan values. clf = RandomForestClassifier(n_estimators=100, random_state=0) From eadb8049a68442cf64f5e8fd0c774bff536ded5c Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Tue, 8 Jan 2019 14:07:27 -0800 Subject: [PATCH 10/41] created dummy_score function for use by all tests needing a fake scorer --- .../tests/test_feature_select.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index c1a0b756e96a8..8a53a9e3f7c70 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -31,6 +31,12 @@ ############################################################################## + +# dummy scorer to test other functionality +def dummy_score(X, y): + return X[0], X[0] + + # Test the score functions def test_f_oneway_vs_scipy_stats(): @@ -476,7 +482,6 @@ def test_selectkbest_tiebreaking(): # Prior to 0.11, SelectKBest would return more features than requested. Xs = [[0, 1, 1], [0, 0, 1], [1, 0, 0], [1, 1, 0]] y = [1] - dummy_score = lambda X, y: (X[0], X[0]) for X in Xs: sel = SelectKBest(dummy_score, k=1) X1 = ignore_warnings(sel.fit_transform)([X], y) @@ -493,7 +498,6 @@ def test_selectpercentile_tiebreaking(): # Test if SelectPercentile selects the right n_features in case of ties. Xs = [[0, 1, 1], [0, 0, 1], [1, 0, 0], [1, 1, 0]] y = [1] - dummy_score = lambda X, y: (X[0], X[0]) for X in Xs: sel = SelectPercentile(dummy_score, percentile=34) X1 = ignore_warnings(sel.fit_transform)([X], y) @@ -675,7 +679,8 @@ def test_univariate_nan_inf_allowed_in_transform(): X, y = make_regression(n_samples=100, n_features=10, n_informative=2, shuffle=False, random_state=0, noise=10) - univariate_filter = GenericUnivariateSelect(f_regression, mode='percentile') + univariate_filter = GenericUnivariateSelect(f_regression, + mode='percentile') univariate_filter.fit(X, y) X[0] = np.NaN X[1] = np.Inf @@ -686,10 +691,8 @@ def test_univariate_nan_inf_allowed_in_fit(): X, y = make_regression(n_samples=100, n_features=10, n_informative=2, shuffle=False, random_state=0, noise=10) - # fake scorer to ensure GenericUnivariateSelect.fit allows nan if the scorer does - fake_scorer = lambda x, y: x - - univariate_filter = GenericUnivariateSelect(fake_scorer, mode='percentile') + univariate_filter = GenericUnivariateSelect(dummy_score, + mode='percentile') X[0] = np.NaN X[1] = np.Inf univariate_filter.fit(X, y) From da866eb7f9fd684e40e1b1f8b287c14d5a440d9d Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Tue, 8 Jan 2019 14:51:52 -0800 Subject: [PATCH 11/41] fixed travis test error with RFECV not having cv specified --- sklearn/feature_selection/tests/test_rfe.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sklearn/feature_selection/tests/test_rfe.py b/sklearn/feature_selection/tests/test_rfe.py index bb27c8945ce4e..c6907c75a98c4 100644 --- a/sklearn/feature_selection/tests/test_rfe.py +++ b/sklearn/feature_selection/tests/test_rfe.py @@ -356,11 +356,11 @@ def test_rfe_cv_groups(): assert est_groups.n_features_ > 0 -@pytest.mark.parametrize("rfe_cls", [ - RFE, - RFECV +@pytest.mark.parametrize("cv", [ + None, + 5 ]) -def test_rfe_allow_nan_inf_in_x(rfe_cls): +def test_rfe_allow_nan_inf_in_x(cv): iris = load_iris() X = iris.data y = iris.target @@ -370,6 +370,9 @@ def test_rfe_allow_nan_inf_in_x(rfe_cls): X[0][1] = np.Inf clf = MockClassifier() - rfe = rfe_cls(estimator=clf) + if cv is not None: + rfe = RFECV(estimator=clf, cv=cv) + else: + rfe = RFE(estimator=clf) rfe.fit(X, y) rfe.transform(X) From f7c2fb87180dbf4e5527bb34f34daf097a06656c Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Wed, 9 Jan 2019 12:28:27 -0800 Subject: [PATCH 12/41] allowed no NaN in pickle check for estimators in ALLOW_NAN --- sklearn/utils/estimator_checks.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index fa17b4de71ccb..1f6067d3a4aef 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -46,7 +46,6 @@ from sklearn.random_projection import BaseRandomProjection from sklearn.feature_selection import SelectKBest -from sklearn.feature_selection.base import SelectorMixin from sklearn.svm.base import BaseLibSVM from sklearn.linear_model.stochastic_gradient import BaseSGD from sklearn.pipeline import make_pipeline @@ -78,7 +77,10 @@ ALLOW_NAN = ['Imputer', 'SimpleImputer', 'MissingIndicator', 'MaxAbsScaler', 'MinMaxScaler', 'RobustScaler', 'StandardScaler', - 'PowerTransformer', 'QuantileTransformer'] + 'PowerTransformer', 'QuantileTransformer', + 'GenericUnivariateSelect', 'RFE', 'RFECV', 'SelectFdr', + 'SelectFpr', 'SelectFwe', 'SelectKBest', 'SelectFromModel', + 'SelectPercentile', 'VarianceThreshold'] def _yield_non_meta_checks(name, estimator): @@ -102,7 +104,7 @@ def _yield_non_meta_checks(name, estimator): # cross-decomposition's "transform" returns X and Y yield check_pipeline_consistency - if name not in ALLOW_NAN and not isinstance(estimator, SelectorMixin): + if name not in ALLOW_NAN: # Test that all estimators check their input for NaN's and infs yield check_estimators_nan_inf @@ -1219,7 +1221,18 @@ def check_estimators_pickle(name, estimator_orig): y = multioutput_estimator_convert_y_2d(estimator, y) set_random_state(estimator) - estimator.fit(X, y) + + try: + estimator.fit(X, y) + except ValueError as e: + if 'inf' not in repr(e) and 'NaN' not in repr(e): + raise e + else: + # Some feature selection estimators don't allow nan/inf with + # their default parameters, even though they are allowed in + # general. Remove the nan in these cases. + X = np.nan_to_num(X) + estimator.fit(X, y) result = dict() for method in check_methods: From 288cecc31c37e79f7ef5cf0ba802648e0e3368c2 Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Fri, 11 Jan 2019 11:42:02 -0800 Subject: [PATCH 13/41] added support for nan/inf in VarianceThreshold.fit, added tests for it --- .../tests/test_variance_threshold.py | 11 +++++++++++ sklearn/feature_selection/variance_threshold.py | 8 +++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/sklearn/feature_selection/tests/test_variance_threshold.py b/sklearn/feature_selection/tests/test_variance_threshold.py index a40491302f350..7bb49934ce49a 100644 --- a/sklearn/feature_selection/tests/test_variance_threshold.py +++ b/sklearn/feature_selection/tests/test_variance_threshold.py @@ -2,6 +2,7 @@ assert_raises) from scipy.sparse import bsr_matrix, csc_matrix, csr_matrix +import numpy as np from sklearn.feature_selection import VarianceThreshold @@ -26,3 +27,13 @@ def test_variance_threshold(): for X in [data, csr_matrix(data)]: X = VarianceThreshold(threshold=.4).fit_transform(X) assert_equal((len(data), 1), X.shape) + + +def test_variance_nan_inf(): + arr = np.array(data, dtype=np.float64) + arr[0, 0] = np.NaN + arr[0, 1] = np.Inf + + for X in [arr, csr_matrix(arr), csc_matrix(arr), bsr_matrix(arr)]: + sel = VarianceThreshold().fit(X) + assert_array_equal([0, 3, 4], sel.get_support(indices=True)) diff --git a/sklearn/feature_selection/variance_threshold.py b/sklearn/feature_selection/variance_threshold.py index c9e018d94a84e..aff84ebd7b058 100644 --- a/sklearn/feature_selection/variance_threshold.py +++ b/sklearn/feature_selection/variance_threshold.py @@ -61,14 +61,16 @@ def fit(self, X, y=None): ------- self """ - X = check_array(X, ('csr', 'csc'), dtype=np.float64) + X = check_array(X, ('csr', 'csc'), dtype=np.float64, + force_all_finite=False) if hasattr(X, "toarray"): # sparse matrix _, self.variances_ = mean_variance_axis(X, axis=0) else: - self.variances_ = np.var(X, axis=0) + self.variances_ = np.nanvar(X, axis=0) - if np.all(self.variances_ <= self.threshold): + if np.all(~np.isfinite(self.variances_) | + (self.variances_ <= self.threshold)): msg = "No feature in X meets the variance threshold {0:.5f}" if X.shape[0] == 1: msg += " (X contains only one sample)" From 67fa9e7e57a38a04730b206c0be573eea3ce8856 Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Thu, 17 Jan 2019 11:10:00 -0800 Subject: [PATCH 14/41] added whats new documentation for feature selection allowing NaN/Inf --- doc/whats_new/v0.21.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 506904f0808ee..0037d663e6bb0 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -94,6 +94,21 @@ Support for Python 3.4 and below has been officially dropped. with the document and the caller functions. :issue:`6463` by :user:`movelikeriver `. +:mod:`sklearn.feature_selection` +........................... + +- |Enhancement| Updated all :mod:`feature_selection` estimators to allow + NaN/Inf values in ``transform`` and ``fit``. This includes + :class:`feature_selection.GenericUnivariateSelect`, :class:`feature_selection.RFE`, + :class:`feature_selection.RFECV`, :class:`feature_selection.SelectFdr`, + :class:`feature_selection.SelectFpr`, :class:`feature_selection.SelectFromModel`, + :class:`feature_selection.SelectFwe`, :class:`feature_selection.SelectKBest`, + :class:`feature_selection.SelectPercentile`, + and :class:`feature_selection.VarianceThreshold`. Note that if the underlying + estimator of the feature selector does not allow NaN/Inf then it will still + error, but the feature selectors themselves no longer enforce this restriction + unnecessarily. :issue:`11635` by :user:`Alec Peters `. + :mod:`sklearn.linear_model` ........................... From 9be38702e1ae8180f6008c773b9994fdaa91f02c Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Thu, 17 Jan 2019 11:39:53 -0800 Subject: [PATCH 15/41] added Notes for modified feature selectors about allowing NaN/Inf --- sklearn/feature_selection/from_model.py | 4 ++++ sklearn/feature_selection/rfe.py | 6 ++++++ .../feature_selection/univariate_selection.py | 20 +++++++++++++++++++ .../feature_selection/variance_threshold.py | 4 ++++ 4 files changed, 34 insertions(+) diff --git a/sklearn/feature_selection/from_model.py b/sklearn/feature_selection/from_model.py index fb26f9d685688..25bfe71fc172c 100644 --- a/sklearn/feature_selection/from_model.py +++ b/sklearn/feature_selection/from_model.py @@ -130,6 +130,10 @@ class SelectFromModel(BaseEstimator, SelectorMixin, MetaEstimatorMixin): threshold_ : float The threshold value used for feature selection. + + Notes + ----- + Allows NaN/Inf in the input if the underlying estimator does as well. """ def __init__(self, estimator, threshold=None, prefit=False, norm_order=1, max_features=None): diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py index c0c25d3e63869..eff0bfac7024f 100644 --- a/sklearn/feature_selection/rfe.py +++ b/sklearn/feature_selection/rfe.py @@ -102,6 +102,10 @@ class RFE(BaseEstimator, MetaEstimatorMixin, SelectorMixin): >>> selector.ranking_ array([1, 1, 1, 1, 1, 6, 4, 3, 2, 5]) + Notes + ----- + Allows NaN/Inf in the input if the underlying estimator does as well. + See also -------- RFECV : Recursive feature elimination with built-in cross-validated @@ -414,6 +418,8 @@ class RFECV(RFE, MetaEstimatorMixin): ``ceil((n_features - min_features_to_select) / step) + 1``, where step is the number of features removed at each iteration. + Allows NaN/Inf in the input if the underlying estimator does as well. + Examples -------- The following example shows how to retrieve the a-priori not known 5 diff --git a/sklearn/feature_selection/univariate_selection.py b/sklearn/feature_selection/univariate_selection.py index fbf7261bcf4c9..b7a059117fd4b 100644 --- a/sklearn/feature_selection/univariate_selection.py +++ b/sklearn/feature_selection/univariate_selection.py @@ -406,6 +406,8 @@ class SelectPercentile(_BaseFilter): Ties between features with equal scores will be broken in an unspecified way. + Allows NaN/Inf in the input if the underlying score_func does as well. + See also -------- f_classif: ANOVA F-value between label/feature for classification tasks. @@ -490,6 +492,8 @@ class SelectKBest(_BaseFilter): Ties between features with equal scores will be broken in an unspecified way. + Allows NaN/Inf in the input if the underlying score_func does as well. + See also -------- f_classif: ANOVA F-value between label/feature for classification tasks. @@ -569,6 +573,10 @@ class SelectFpr(_BaseFilter): >>> X_new.shape (569, 16) + Notes + ----- + Allows NaN/Inf in the input if the underlying score_func does as well. + See also -------- f_classif: ANOVA F-value between label/feature for classification tasks. @@ -635,6 +643,10 @@ class SelectFdr(_BaseFilter): ---------- https://en.wikipedia.org/wiki/False_discovery_rate + Notes + ----- + Allows NaN/Inf in the input if the underlying score_func does as well. + See also -------- f_classif: ANOVA F-value between label/feature for classification tasks. @@ -700,6 +712,10 @@ class SelectFwe(_BaseFilter): pvalues_ : array-like, shape=(n_features,) p-values of feature scores. + Notes + ----- + Allows NaN/Inf in the input if the underlying score_func does as well. + See also -------- f_classif: ANOVA F-value between label/feature for classification tasks. @@ -766,6 +782,10 @@ class GenericUnivariateSelect(_BaseFilter): >>> X_new.shape (569, 20) + Notes + ----- + Allows NaN/Inf in the input if the underlying score_func does as well. + See also -------- f_classif: ANOVA F-value between label/feature for classification tasks. diff --git a/sklearn/feature_selection/variance_threshold.py b/sklearn/feature_selection/variance_threshold.py index aff84ebd7b058..d0a53b006dd86 100644 --- a/sklearn/feature_selection/variance_threshold.py +++ b/sklearn/feature_selection/variance_threshold.py @@ -29,6 +29,10 @@ class VarianceThreshold(BaseEstimator, SelectorMixin): variances_ : array, shape (n_features,) Variances of individual features. + Notes + ----- + Allows NaN/Inf in the input. + Examples -------- The following dataset has integer features, two of which are the same From c9e8fcc75d5fee6d5060bf524969ab97e5530217 Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Thu, 17 Jan 2019 11:46:20 -0800 Subject: [PATCH 16/41] fixed pep8 error --- sklearn/feature_selection/univariate_selection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_selection/univariate_selection.py b/sklearn/feature_selection/univariate_selection.py index b7a059117fd4b..4e9663013b64b 100644 --- a/sklearn/feature_selection/univariate_selection.py +++ b/sklearn/feature_selection/univariate_selection.py @@ -785,7 +785,7 @@ class GenericUnivariateSelect(_BaseFilter): Notes ----- Allows NaN/Inf in the input if the underlying score_func does as well. - + See also -------- f_classif: ANOVA F-value between label/feature for classification tasks. From 34e271070c3c8dfdbe6a04e6de77d33af920ca47 Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Thu, 17 Jan 2019 12:00:19 -0800 Subject: [PATCH 17/41] shortened line length for whats new documentation --- doc/whats_new/v0.21.rst | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 0037d663e6bb0..5d04e043cfe85 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -98,16 +98,17 @@ Support for Python 3.4 and below has been officially dropped. ........................... - |Enhancement| Updated all :mod:`feature_selection` estimators to allow - NaN/Inf values in ``transform`` and ``fit``. This includes - :class:`feature_selection.GenericUnivariateSelect`, :class:`feature_selection.RFE`, - :class:`feature_selection.RFECV`, :class:`feature_selection.SelectFdr`, - :class:`feature_selection.SelectFpr`, :class:`feature_selection.SelectFromModel`, + NaN/Inf values in ``transform`` and ``fit``. This includes + :class:`feature_selection.GenericUnivariateSelect`, + :class:`feature_selection.RFE`, :class:`feature_selection.RFECV`, + :class:`feature_selection.SelectFdr`, :class:`feature_selection.SelectFpr`, + :class:`feature_selection.SelectFromModel`, :class:`feature_selection.SelectFwe`, :class:`feature_selection.SelectKBest`, - :class:`feature_selection.SelectPercentile`, + :class:`feature_selection.SelectPercentile`, and :class:`feature_selection.VarianceThreshold`. Note that if the underlying estimator of the feature selector does not allow NaN/Inf then it will still - error, but the feature selectors themselves no longer enforce this restriction - unnecessarily. :issue:`11635` by :user:`Alec Peters `. + error, but the feature selectors themselves no longer enforce this + restriction unnecessarily. :issue:`11635` by :user:`Alec Peters `. :mod:`sklearn.linear_model` ........................... From 816b0a33ace13750f93e6126ba3632dab67a4bba Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Wed, 23 Jan 2019 11:24:13 -0800 Subject: [PATCH 18/41] changed VarianceThreshold to only allow nan, not inf. Updated tests for it --- sklearn/feature_selection/tests/test_variance_threshold.py | 5 ++--- sklearn/feature_selection/variance_threshold.py | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/sklearn/feature_selection/tests/test_variance_threshold.py b/sklearn/feature_selection/tests/test_variance_threshold.py index 7bb49934ce49a..fa6635003e66a 100644 --- a/sklearn/feature_selection/tests/test_variance_threshold.py +++ b/sklearn/feature_selection/tests/test_variance_threshold.py @@ -29,11 +29,10 @@ def test_variance_threshold(): assert_equal((len(data), 1), X.shape) -def test_variance_nan_inf(): +def test_variance_nan(): arr = np.array(data, dtype=np.float64) arr[0, 0] = np.NaN - arr[0, 1] = np.Inf for X in [arr, csr_matrix(arr), csc_matrix(arr), bsr_matrix(arr)]: sel = VarianceThreshold().fit(X) - assert_array_equal([0, 3, 4], sel.get_support(indices=True)) + assert_array_equal([0, 1, 3, 4], sel.get_support(indices=True)) diff --git a/sklearn/feature_selection/variance_threshold.py b/sklearn/feature_selection/variance_threshold.py index d0a53b006dd86..978af67b93f2c 100644 --- a/sklearn/feature_selection/variance_threshold.py +++ b/sklearn/feature_selection/variance_threshold.py @@ -31,7 +31,7 @@ class VarianceThreshold(BaseEstimator, SelectorMixin): Notes ----- - Allows NaN/Inf in the input. + Allows NaN in the input. Examples -------- @@ -66,7 +66,7 @@ def fit(self, X, y=None): self """ X = check_array(X, ('csr', 'csc'), dtype=np.float64, - force_all_finite=False) + force_all_finite='allow-nan') if hasattr(X, "toarray"): # sparse matrix _, self.variances_ = mean_variance_axis(X, axis=0) From c4619eb31ef48c8b2cb0df9af83932aaa161901f Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Wed, 23 Jan 2019 11:25:40 -0800 Subject: [PATCH 19/41] fixed phrasing in whats_new for #11635 --- doc/whats_new/v0.21.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 5d04e043cfe85..65ae5387f9c73 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -106,7 +106,7 @@ Support for Python 3.4 and below has been officially dropped. :class:`feature_selection.SelectFwe`, :class:`feature_selection.SelectKBest`, :class:`feature_selection.SelectPercentile`, and :class:`feature_selection.VarianceThreshold`. Note that if the underlying - estimator of the feature selector does not allow NaN/Inf then it will still + estimator or score function of the feature selector does not allow NaN/Inf then it will still error, but the feature selectors themselves no longer enforce this restriction unnecessarily. :issue:`11635` by :user:`Alec Peters `. From 195ed139b3a5ccad7b70eb5cd4fb89acdb2968f1 Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Thu, 24 Jan 2019 09:26:09 -0800 Subject: [PATCH 20/41] added test for all-NaN feature in VarianceThreshold --- sklearn/feature_selection/tests/test_variance_threshold.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/feature_selection/tests/test_variance_threshold.py b/sklearn/feature_selection/tests/test_variance_threshold.py index fa6635003e66a..44af99ff2a9e8 100644 --- a/sklearn/feature_selection/tests/test_variance_threshold.py +++ b/sklearn/feature_selection/tests/test_variance_threshold.py @@ -31,8 +31,11 @@ def test_variance_threshold(): def test_variance_nan(): arr = np.array(data, dtype=np.float64) + # add single NaN and feature should still be included arr[0, 0] = np.NaN + # make all values in feature NaN and feature should be rejected + arr[:, 1] = np.NaN for X in [arr, csr_matrix(arr), csc_matrix(arr), bsr_matrix(arr)]: sel = VarianceThreshold().fit(X) - assert_array_equal([0, 1, 3, 4], sel.get_support(indices=True)) + assert_array_equal([0, 3, 4], sel.get_support(indices=True)) From cca7acc7ec898809eb48a95e505a7568a89f8f15 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 24 Apr 2019 21:18:06 +1000 Subject: [PATCH 21/41] Fix use of check_array --- sklearn/feature_selection/rfe.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py index a871b4e7e6f12..4e2aeebe33966 100644 --- a/sklearn/feature_selection/rfe.py +++ b/sklearn/feature_selection/rfe.py @@ -153,8 +153,8 @@ def _fit(self, X, y, step_score=None): # and is used when implementing RFECV # self.scores_ will not be calculated when calling _fit through fit - X, y = check_X_y(X, y, "csc", ensure_min_features=2, - force_all_finite=False) + X = check_array(X, "csc", ensure_min_features=2, + force_all_finite=False) # Initialization n_features = X.shape[1] if self.n_features_to_select is None: @@ -485,8 +485,8 @@ def fit(self, X, y, groups=None): Group labels for the samples used while splitting the dataset into train/test set. """ - X, y = check_X_y(X, y, "csr", ensure_min_features=2, - force_all_finite=False) + X = check_array(X, "csr", ensure_min_features=2, + force_all_finite=False) # Initialization cv = check_cv(self.cv, y, is_classifier(self.estimator)) From 017a0db2699ae07ce7562e27146720d97585633b Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 24 Apr 2019 22:01:11 +1000 Subject: [PATCH 22/41] Use check_X_y --- sklearn/feature_selection/rfe.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py index 4e2aeebe33966..4bdd60aaebb05 100644 --- a/sklearn/feature_selection/rfe.py +++ b/sklearn/feature_selection/rfe.py @@ -7,7 +7,7 @@ """Recursive feature elimination for feature ranking""" import numpy as np -from ..utils import check_array, safe_sqr +from ..utils import check_X_y, safe_sqr from ..utils.metaestimators import if_delegate_has_method from ..utils.metaestimators import _safe_split from ..utils.validation import check_is_fitted @@ -153,8 +153,8 @@ def _fit(self, X, y, step_score=None): # and is used when implementing RFECV # self.scores_ will not be calculated when calling _fit through fit - X = check_array(X, "csc", ensure_min_features=2, - force_all_finite=False) + X, y = check_X_y(X, y, "csc", ensure_min_features=2, + force_all_finite=False) # Initialization n_features = X.shape[1] if self.n_features_to_select is None: @@ -485,8 +485,8 @@ def fit(self, X, y, groups=None): Group labels for the samples used while splitting the dataset into train/test set. """ - X = check_array(X, "csr", ensure_min_features=2, - force_all_finite=False) + X, y = check_X_y(X, y, "csr", ensure_min_features=2, + force_all_finite=False) # Initialization cv = check_cv(self.cv, y, is_classifier(self.estimator)) From 0efc7f00cf68c650d89e69ed1257b0aed65292d2 Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Thu, 23 May 2019 13:44:45 -0700 Subject: [PATCH 23/41] updated SelectFromModel allow_nan tag to inherit from its estimator --- sklearn/feature_selection/base.py | 2 +- sklearn/feature_selection/from_model.py | 3 ++- .../tests/test_from_model.py | 24 +++++++++++++++++++ 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/sklearn/feature_selection/base.py b/sklearn/feature_selection/base.py index 38cca8c11e9a5..54e53590e793c 100644 --- a/sklearn/feature_selection/base.py +++ b/sklearn/feature_selection/base.py @@ -72,7 +72,7 @@ def transform(self, X): The input samples with only the selected features. """ X = check_array(X, dtype=None, accept_sparse='csr', - force_all_finite=False) + force_all_finite=not self._get_tags().get('allow_nan', True)) mask = self.get_support() if not mask.any(): warn("No features were selected: either the data is" diff --git a/sklearn/feature_selection/from_model.py b/sklearn/feature_selection/from_model.py index af3ec23a12425..2ffb2efcabbb1 100644 --- a/sklearn/feature_selection/from_model.py +++ b/sklearn/feature_selection/from_model.py @@ -233,4 +233,5 @@ def partial_fit(self, X, y=None, **fit_params): return self def _more_tags(self): - return {'allow_nan': True} + estimator_tags = self.estimator._get_tags() + return {'allow_nan': estimator_tags.get('allow_nan', True)} diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py index f33ada57334e1..d44dee4604adb 100644 --- a/sklearn/feature_selection/tests/test_from_model.py +++ b/sklearn/feature_selection/tests/test_from_model.py @@ -18,6 +18,17 @@ from sklearn.linear_model import PassiveAggressiveClassifier from sklearn.base import BaseEstimator + +class NaNTag(BaseEstimator): + def _more_tags(self): + return {'allow_nan': True} + + +class NoNaNTag(BaseEstimator): + def _more_tags(self): + return {'allow_nan': False} + + iris = datasets.load_iris() data, y = iris.data, iris.target rng = np.random.RandomState(0) @@ -340,3 +351,16 @@ def test_transform_accepts_nan_inf(): data[0] = np.NaN data[1] = np.Inf model.transform(data) + +def test_allow_nan_tag_comes_from_estimator(): + allow_nan_est = NaNTag() + model = SelectFromModel(estimator=allow_nan_est) + assert_equal(model._get_tags()['allow_nan'], True) + + no_nan_est = NoNaNTag() + model = SelectFromModel(estimator=no_nan_est) + assert_equal(model._get_tags()['allow_nan'], False) + + + + From 4119fcd12f0331fa942bc23ee97ad264707b40c9 Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Thu, 23 May 2019 13:49:40 -0700 Subject: [PATCH 24/41] moved documentation for #11635 to v0.22 --- doc/whats_new/v0.21.rst | 16 ---------------- doc/whats_new/v0.22.rst | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 4a59912dcdf4a..d2b6f1f9a500c 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -429,22 +429,6 @@ Support for Python 3.4 and below has been officially dropped. passing the file name(s) or the file object(s) to the analyzer. :pr:`13641` by `Adrin Jalali`_. -:mod:`sklearn.feature_selection` -........................... - -- |Enhancement| Updated all :mod:`feature_selection` estimators to allow - NaN/Inf values in ``transform`` and ``fit``. This includes - :class:`feature_selection.GenericUnivariateSelect`, - :class:`feature_selection.RFE`, :class:`feature_selection.RFECV`, - :class:`feature_selection.SelectFdr`, :class:`feature_selection.SelectFpr`, - :class:`feature_selection.SelectFromModel`, - :class:`feature_selection.SelectFwe`, :class:`feature_selection.SelectKBest`, - :class:`feature_selection.SelectPercentile`, - and :class:`feature_selection.VarianceThreshold`. Note that if the underlying - estimator or score function of the feature selector does not allow NaN/Inf then it will still - error, but the feature selectors themselves no longer enforce this - restriction unnecessarily. :issue:`11635` by :user:`Alec Peters `. - :mod:`sklearn.impute` ..................... diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 0518d6c9e0de4..b1c55020ac9d6 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -39,6 +39,22 @@ Changelog :pr:`123456` by :user:`Joe Bloggs `. where 123456 is the *pull request* number, not the issue number. +:mod:`sklearn.feature_selection` +........................... + +- |Enhancement| Updated all :mod:`feature_selection` estimators to allow + NaN/Inf values in ``transform`` and ``fit``. This includes + :class:`feature_selection.GenericUnivariateSelect`, + :class:`feature_selection.RFE`, :class:`feature_selection.RFECV`, + :class:`feature_selection.SelectFdr`, :class:`feature_selection.SelectFpr`, + :class:`feature_selection.SelectFromModel`, + :class:`feature_selection.SelectFwe`, :class:`feature_selection.SelectKBest`, + :class:`feature_selection.SelectPercentile`, + and :class:`feature_selection.VarianceThreshold`. Note that if the underlying + estimator or score function of the feature selector does not allow NaN/Inf then it will still + error, but the feature selectors themselves no longer enforce this + restriction unnecessarily. :issue:`11635` by :user:`Alec Peters `. + :mod:`sklearn.linear_model` .................. From 9f209855ee05b08c77cefe569eb29397c9c46b30 Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Thu, 23 May 2019 13:59:09 -0700 Subject: [PATCH 25/41] fixed formatting --- sklearn/feature_selection/base.py | 3 ++- sklearn/feature_selection/tests/test_from_model.py | 5 +---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/sklearn/feature_selection/base.py b/sklearn/feature_selection/base.py index 54e53590e793c..1c1f9053bec7c 100644 --- a/sklearn/feature_selection/base.py +++ b/sklearn/feature_selection/base.py @@ -71,8 +71,9 @@ def transform(self, X): X_r : array of shape [n_samples, n_selected_features] The input samples with only the selected features. """ + tags = self._get_tags() X = check_array(X, dtype=None, accept_sparse='csr', - force_all_finite=not self._get_tags().get('allow_nan', True)) + force_all_finite=not tags.get('allow_nan', True)) mask = self.get_support() if not mask.any(): warn("No features were selected: either the data is" diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py index ed6c676dc6b38..5a82ec153da1d 100644 --- a/sklearn/feature_selection/tests/test_from_model.py +++ b/sklearn/feature_selection/tests/test_from_model.py @@ -346,6 +346,7 @@ def test_transform_accepts_nan_inf(): data[1] = np.Inf model.transform(data) + def test_allow_nan_tag_comes_from_estimator(): allow_nan_est = NaNTag() model = SelectFromModel(estimator=allow_nan_est) @@ -354,7 +355,3 @@ def test_allow_nan_tag_comes_from_estimator(): no_nan_est = NoNaNTag() model = SelectFromModel(estimator=no_nan_est) assert_equal(model._get_tags()['allow_nan'], False) - - - - From fc1a975ea92fd5b014a895edf15cd33909e98e9f Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Thu, 23 May 2019 14:34:26 -0700 Subject: [PATCH 26/41] removed test_transform_accepts_nan_inf from test_from_model.py since its not true anymore --- sklearn/feature_selection/tests/test_from_model.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py index 5a82ec153da1d..144aa09abcd8c 100644 --- a/sklearn/feature_selection/tests/test_from_model.py +++ b/sklearn/feature_selection/tests/test_from_model.py @@ -335,18 +335,6 @@ def test_threshold_without_refitting(): assert_greater(X_transform.shape[1], model.transform(data).shape[1]) -def test_transform_accepts_nan_inf(): - # Test that transform doesn't check for np.inf and np.nan values. - clf = RandomForestClassifier(n_estimators=100, random_state=0) - - model = SelectFromModel(estimator=clf) - model.fit(data, y) - - data[0] = np.NaN - data[1] = np.Inf - model.transform(data) - - def test_allow_nan_tag_comes_from_estimator(): allow_nan_est = NaNTag() model = SelectFromModel(estimator=allow_nan_est) From f71040889798a59e32873124457fc6f5fc0f8323 Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Thu, 23 May 2019 14:42:09 -0700 Subject: [PATCH 27/41] updated test_transform_accepts_nan_inf to handle NaN tag --- .../feature_selection/tests/test_from_model.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py index 144aa09abcd8c..1c5f77091d41f 100644 --- a/sklearn/feature_selection/tests/test_from_model.py +++ b/sklearn/feature_selection/tests/test_from_model.py @@ -29,6 +29,11 @@ def _more_tags(self): return {'allow_nan': False} +class NaNTagRandomForest(RandomForestClassifier): + def _more_tags(self): + return {'allow_nan': True} + + iris = datasets.load_iris() data, y = iris.data, iris.target rng = np.random.RandomState(0) @@ -335,6 +340,18 @@ def test_threshold_without_refitting(): assert_greater(X_transform.shape[1], model.transform(data).shape[1]) +def test_transform_accepts_nan_inf(): + # Test that transform doesn't check for np.inf and np.nan values. + clf = NaNTagRandomForest(n_estimators=100, random_state=0) + + model = SelectFromModel(estimator=clf) + model.fit(data, y) + + data[0] = np.NaN + data[1] = np.Inf + model.transform(data) + + def test_allow_nan_tag_comes_from_estimator(): allow_nan_est = NaNTag() model = SelectFromModel(estimator=allow_nan_est) From 2b55934ff5c055108f37371f36495abce96a7faa Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Thu, 23 May 2019 15:02:32 -0700 Subject: [PATCH 28/41] fixed formatting --- sklearn/feature_selection/tests/test_from_model.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py index 1c5f77091d41f..6bd8f3e130469 100644 --- a/sklearn/feature_selection/tests/test_from_model.py +++ b/sklearn/feature_selection/tests/test_from_model.py @@ -341,15 +341,15 @@ def test_threshold_without_refitting(): def test_transform_accepts_nan_inf(): - # Test that transform doesn't check for np.inf and np.nan values. - clf = NaNTagRandomForest(n_estimators=100, random_state=0) + # Test that transform doesn't check for np.inf and np.nan values. + clf = NaNTagRandomForest(n_estimators=100, random_state=0) - model = SelectFromModel(estimator=clf) - model.fit(data, y) + model = SelectFromModel(estimator=clf) + model.fit(data, y) - data[0] = np.NaN - data[1] = np.Inf - model.transform(data) + data[0] = np.NaN + data[1] = np.Inf + model.transform(data) def test_allow_nan_tag_comes_from_estimator(): From 1adc65c7d179cf8ff26c633e56a165652b1908ec Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Thu, 23 May 2019 15:10:47 -0700 Subject: [PATCH 29/41] fixed formatting --- sklearn/feature_selection/tests/test_from_model.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py index 6bd8f3e130469..857c1fe1fd0e2 100644 --- a/sklearn/feature_selection/tests/test_from_model.py +++ b/sklearn/feature_selection/tests/test_from_model.py @@ -341,15 +341,15 @@ def test_threshold_without_refitting(): def test_transform_accepts_nan_inf(): - # Test that transform doesn't check for np.inf and np.nan values. - clf = NaNTagRandomForest(n_estimators=100, random_state=0) + # Test that transform doesn't check for np.inf and np.nan values. + clf = NaNTagRandomForest(n_estimators=100, random_state=0) - model = SelectFromModel(estimator=clf) - model.fit(data, y) + model = SelectFromModel(estimator=clf) + model.fit(data, y) - data[0] = np.NaN - data[1] = np.Inf - model.transform(data) + data[0] = np.NaN + data[1] = np.Inf + model.transform(data) def test_allow_nan_tag_comes_from_estimator(): From 055d4ea79e3f86e4ad2741e94b0bb3ff3cc79882 Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Tue, 29 Oct 2019 16:08:18 -0700 Subject: [PATCH 30/41] set allow_nan tag to false for univariate feature selectors since the default score_funcs do not currently allow nans --- sklearn/feature_selection/univariate_selection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_selection/univariate_selection.py b/sklearn/feature_selection/univariate_selection.py index a93eec34fc0e4..3a70f0690270a 100644 --- a/sklearn/feature_selection/univariate_selection.py +++ b/sklearn/feature_selection/univariate_selection.py @@ -363,7 +363,7 @@ def _check_params(self, X, y): pass def _more_tags(self): - return {'allow_nan': True} + return {'allow_nan': False} ###################################################################### From 4499960fe032f59da973d2999e312baa4aee61e5 Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Tue, 29 Oct 2019 17:35:31 -0700 Subject: [PATCH 31/41] updated VarianceThreshold.fit() to handle nan when threshold is 0 --- sklearn/feature_selection/_variance_threshold.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/feature_selection/_variance_threshold.py b/sklearn/feature_selection/_variance_threshold.py index 5f7ce3e3549af..4f9d720b762b9 100644 --- a/sklearn/feature_selection/_variance_threshold.py +++ b/sklearn/feature_selection/_variance_threshold.py @@ -81,7 +81,8 @@ def fit(self, X, y=None): if self.threshold == 0: # Use peak-to-peak to avoid numeric precision issues # for constant features - self.variances_ = np.minimum(self.variances_, peak_to_peaks) + compare_arr = np.array([self.variances_, peak_to_peaks]) + self.variances_ = np.nanmin(compare_arr, axis=0) if np.all(~np.isfinite(self.variances_) | (self.variances_ <= self.threshold)): From 92bc14fc2bc2532921821ddd8f3b358004be3c8e Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Tue, 29 Oct 2019 17:36:08 -0700 Subject: [PATCH 32/41] removed test that no longer applies now that allow_nan tag is False for univariate selectors --- .../feature_selection/tests/test_feature_select.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index c77a14b0459f2..ff24467b389fd 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -673,18 +673,6 @@ def test_mutual_info_regression(): assert_array_equal(support, gtruth) -def test_univariate_nan_inf_allowed_in_transform(): - X, y = make_regression(n_samples=100, n_features=10, n_informative=2, - shuffle=False, random_state=0, noise=10) - - univariate_filter = GenericUnivariateSelect(f_regression, - mode='percentile') - univariate_filter.fit(X, y) - X[0] = np.NaN - X[1] = np.Inf - univariate_filter.transform(X) - - def test_univariate_nan_inf_allowed_in_fit(): X, y = make_regression(n_samples=100, n_features=10, n_informative=2, shuffle=False, random_state=0, noise=10) From e1add7f6b1c25720d5ff15052bbff1609f5e0fa8 Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Tue, 29 Oct 2019 17:36:36 -0700 Subject: [PATCH 33/41] added pytest import since it was accidentally removed --- sklearn/feature_selection/tests/test_rfe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/feature_selection/tests/test_rfe.py b/sklearn/feature_selection/tests/test_rfe.py index e4c1a91f0bc48..a86999d2b1e44 100644 --- a/sklearn/feature_selection/tests/test_rfe.py +++ b/sklearn/feature_selection/tests/test_rfe.py @@ -2,6 +2,7 @@ Testing Recursive feature elimination """ +import pytest import numpy as np from numpy.testing import assert_array_almost_equal, assert_array_equal from scipy import sparse From 4921954fb8b362040f27646ff0eaddd4ddde844d Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Tue, 29 Oct 2019 17:37:42 -0700 Subject: [PATCH 34/41] removed duplicate numpy import --- sklearn/feature_selection/tests/test_variance_threshold.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/feature_selection/tests/test_variance_threshold.py b/sklearn/feature_selection/tests/test_variance_threshold.py index 7dc3be4a9f69a..77d9c9445bc71 100644 --- a/sklearn/feature_selection/tests/test_variance_threshold.py +++ b/sklearn/feature_selection/tests/test_variance_threshold.py @@ -4,7 +4,6 @@ from sklearn.utils._testing import assert_array_equal from scipy.sparse import bsr_matrix, csc_matrix, csr_matrix -import numpy as np from sklearn.feature_selection import VarianceThreshold From 6783182725555037631d13a5e8ecf370ccb6379c Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Tue, 29 Oct 2019 17:37:59 -0700 Subject: [PATCH 35/41] updated assert_equals to assert --- sklearn/feature_selection/tests/test_from_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py index 313cc72d5cec4..40fafe4429896 100644 --- a/sklearn/feature_selection/tests/test_from_model.py +++ b/sklearn/feature_selection/tests/test_from_model.py @@ -353,8 +353,8 @@ def test_transform_accepts_nan_inf(): def test_allow_nan_tag_comes_from_estimator(): allow_nan_est = NaNTag() model = SelectFromModel(estimator=allow_nan_est) - assert_equal(model._get_tags()['allow_nan'], True) + assert model._get_tags()['allow_nan'] is True no_nan_est = NoNaNTag() model = SelectFromModel(estimator=no_nan_est) - assert_equal(model._get_tags()['allow_nan'], False) + assert model._get_tags()['allow_nan'] is False From 8a9164f49794873d6b088ff8c81da7fd89a91c7f Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Fri, 1 Nov 2019 11:27:54 -0700 Subject: [PATCH 36/41] reverted changes to univariate feature selectors to break those out to separate PR --- .../_univariate_selection.py | 26 +------------------ .../tests/test_feature_select.py | 19 ++------------ sklearn/utils/estimator_checks.py | 13 +--------- 3 files changed, 4 insertions(+), 54 deletions(-) diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 6bf414a938a93..21990bb3a8167 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -338,8 +338,7 @@ def fit(self, X, y): ------- self : object """ - X, y = check_X_y(X, y, ['csr', 'csc'], force_all_finite=False, - multi_output=True) + X, y = check_X_y(X, y, ['csr', 'csc'], multi_output=True) if not callable(self.score_func): raise TypeError("The score function should be a callable, %s (%s) " @@ -362,9 +361,6 @@ def fit(self, X, y): def _check_params(self, X, y): pass - def _more_tags(self): - return {'allow_nan': False} - ###################################################################### # Specific filters @@ -409,8 +405,6 @@ class SelectPercentile(_BaseFilter): Ties between features with equal scores will be broken in an unspecified way. - Allows NaN/Inf in the input if the underlying score_func does as well. - See also -------- f_classif: ANOVA F-value between label/feature for classification tasks. @@ -495,8 +489,6 @@ class SelectKBest(_BaseFilter): Ties between features with equal scores will be broken in an unspecified way. - Allows NaN/Inf in the input if the underlying score_func does as well. - See also -------- f_classif: ANOVA F-value between label/feature for classification tasks. @@ -576,10 +568,6 @@ class SelectFpr(_BaseFilter): >>> X_new.shape (569, 16) - Notes - ----- - Allows NaN/Inf in the input if the underlying score_func does as well. - See also -------- f_classif: ANOVA F-value between label/feature for classification tasks. @@ -646,10 +634,6 @@ class SelectFdr(_BaseFilter): ---------- https://en.wikipedia.org/wiki/False_discovery_rate - Notes - ----- - Allows NaN/Inf in the input if the underlying score_func does as well. - See also -------- f_classif: ANOVA F-value between label/feature for classification tasks. @@ -715,10 +699,6 @@ class SelectFwe(_BaseFilter): pvalues_ : array-like of shape (n_features,) p-values of feature scores. - Notes - ----- - Allows NaN/Inf in the input if the underlying score_func does as well. - See also -------- f_classif: ANOVA F-value between label/feature for classification tasks. @@ -785,10 +765,6 @@ class GenericUnivariateSelect(_BaseFilter): >>> X_new.shape (569, 20) - Notes - ----- - Allows NaN/Inf in the input if the underlying score_func does as well. - See also -------- f_classif: ANOVA F-value between label/feature for classification tasks. diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index ff24467b389fd..abb11fdc7b8da 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -24,12 +24,6 @@ ############################################################################## - -# dummy scorer to test other functionality -def dummy_score(X, y): - return X[0], X[0] - - # Test the score functions def test_f_oneway_vs_scipy_stats(): @@ -477,6 +471,7 @@ def test_selectkbest_tiebreaking(): # Prior to 0.11, SelectKBest would return more features than requested. Xs = [[0, 1, 1], [0, 0, 1], [1, 0, 0], [1, 1, 0]] y = [1] + dummy_score = lambda X, y: (X[0], X[0]) for X in Xs: sel = SelectKBest(dummy_score, k=1) X1 = ignore_warnings(sel.fit_transform)([X], y) @@ -493,6 +488,7 @@ def test_selectpercentile_tiebreaking(): # Test if SelectPercentile selects the right n_features in case of ties. Xs = [[0, 1, 1], [0, 0, 1], [1, 0, 0], [1, 1, 0]] y = [1] + dummy_score = lambda X, y: (X[0], X[0]) for X in Xs: sel = SelectPercentile(dummy_score, percentile=34) X1 = ignore_warnings(sel.fit_transform)([X], y) @@ -671,14 +667,3 @@ def test_mutual_info_regression(): gtruth = np.zeros(10) gtruth[:2] = 1 assert_array_equal(support, gtruth) - - -def test_univariate_nan_inf_allowed_in_fit(): - X, y = make_regression(n_samples=100, n_features=10, n_informative=2, - shuffle=False, random_state=0, noise=10) - - univariate_filter = GenericUnivariateSelect(dummy_score, - mode='percentile') - X[0] = np.NaN - X[1] = np.Inf - univariate_filter.fit(X, y) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index f3e3b997bf678..30c668237b371 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1475,18 +1475,7 @@ def check_estimators_pickle(name, estimator_orig): y = _enforce_estimator_tags_y(estimator, y) set_random_state(estimator) - - try: - estimator.fit(X, y) - except ValueError as e: - if 'inf' not in repr(e) and 'NaN' not in repr(e): - raise e - else: - # Some feature selection estimators don't allow nan/inf with - # their default parameters, even though they are allowed in - # general. Remove the nan in these cases. - X = np.nan_to_num(X) - estimator.fit(X, y) + estimator.fit(X, y) result = dict() for method in check_methods: From 8167671b8b1cb7f39707a1760104ce833ad867f5 Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Fri, 1 Nov 2019 11:39:05 -0700 Subject: [PATCH 37/41] updated documentation to reflect removal of univariate changes --- doc/whats_new/v0.22.rst | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 1678a52b00c89..8e0686ca77b31 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -310,16 +310,12 @@ Changelog :mod:`sklearn.feature_selection` ................................ -- |Enhancement| Updated all :mod:`feature_selection` estimators to allow - NaN/Inf values in ``transform`` and ``fit``. This includes - :class:`feature_selection.GenericUnivariateSelect`, +- |Enhancement| Updated the following :mod:`feature_selection` estimators to allow + NaN/Inf values in ``transform`` and ``fit``: :class:`feature_selection.RFE`, :class:`feature_selection.RFECV`, - :class:`feature_selection.SelectFdr`, :class:`feature_selection.SelectFpr`, :class:`feature_selection.SelectFromModel`, - :class:`feature_selection.SelectFwe`, :class:`feature_selection.SelectKBest`, - :class:`feature_selection.SelectPercentile`, and :class:`feature_selection.VarianceThreshold`. Note that if the underlying - estimator or score function of the feature selector does not allow NaN/Inf then it will still + estimator of the feature selector does not allow NaN/Inf then it will still error, but the feature selectors themselves no longer enforce this restriction unnecessarily. :issue:`11635` by :user:`Alec Peters `. From 6176538cc9fe2763622820f6683462f93eac18d0 Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Fri, 1 Nov 2019 12:40:49 -0700 Subject: [PATCH 38/41] fixed rfe handling of tags so they are properly inherited from the estimator --- sklearn/feature_selection/_rfe.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index a204ac3742ca5..12e99175c9d61 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -154,8 +154,9 @@ def _fit(self, X, y, step_score=None): # and is used when implementing RFECV # self.scores_ will not be calculated when calling _fit through fit + tags = self._get_tags() X, y = check_X_y(X, y, "csc", ensure_min_features=2, - force_all_finite=False) + force_all_finite=not tags.get('allow_nan', True)) # Initialization n_features = X.shape[1] if self.n_features_to_select is None: @@ -331,7 +332,9 @@ def predict_log_proba(self, X): return self.estimator_.predict_log_proba(self.transform(X)) def _more_tags(self): - return {'poor_score': True, 'allow_nan': True} + estimator_tags = self.estimator._get_tags() + return {'poor_score': True, + 'allow_nan': estimator_tags.get('allow_nan', True)} class RFECV(RFE): From e43d8f868be144a8abcc277b1df19b6120ee53ec Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Fri, 1 Nov 2019 13:26:58 -0700 Subject: [PATCH 39/41] added get_tags to mockclassifier for rfe tests --- sklearn/feature_selection/tests/test_rfe.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/feature_selection/tests/test_rfe.py b/sklearn/feature_selection/tests/test_rfe.py index a86999d2b1e44..ccd3c0a1b0e83 100644 --- a/sklearn/feature_selection/tests/test_rfe.py +++ b/sklearn/feature_selection/tests/test_rfe.py @@ -55,6 +55,9 @@ def get_params(self, deep=True): def set_params(self, **params): return self + def _get_tags(self): + return {} + def test_rfe_features_importance(): generator = check_random_state(0) From 6950892db621991e514f51028510d56f2fb3a4e2 Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Mon, 4 Nov 2019 09:15:29 -0800 Subject: [PATCH 40/41] added test for nan/inf in fit of SelectFromModel --- .../tests/test_from_model.py | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py index 40fafe4429896..d590f42024ac7 100644 --- a/sklearn/feature_selection/tests/test_from_model.py +++ b/sklearn/feature_selection/tests/test_from_model.py @@ -10,7 +10,9 @@ from sklearn.linear_model import LogisticRegression, SGDClassifier, Lasso from sklearn.svm import LinearSVC from sklearn.feature_selection import SelectFromModel -from sklearn.ensemble import RandomForestClassifier +from sklearn.experimental import enable_hist_gradient_boosting +from sklearn.ensemble import (RandomForestClassifier, + HistGradientBoostingClassifier) from sklearn.linear_model import PassiveAggressiveClassifier from sklearn.base import BaseEstimator @@ -338,16 +340,31 @@ def test_threshold_without_refitting(): assert X_transform.shape[1] > model.transform(data).shape[1] +def test_fit_accepts_nan_inf(): + # Test that fit doesn't check for np.inf and np.nan values. + clf = HistGradientBoostingClassifier(random_state=0) + + model = SelectFromModel(estimator=clf) + + nan_data = data.copy() + nan_data[0] = np.NaN + nan_data[1] = np.Inf + + model.fit(data, y) + + def test_transform_accepts_nan_inf(): # Test that transform doesn't check for np.inf and np.nan values. clf = NaNTagRandomForest(n_estimators=100, random_state=0) + nan_data = data.copy() model = SelectFromModel(estimator=clf) - model.fit(data, y) + model.fit(nan_data, y) + + nan_data[0] = np.NaN + nan_data[1] = np.Inf - data[0] = np.NaN - data[1] = np.Inf - model.transform(data) + model.transform(nan_data) def test_allow_nan_tag_comes_from_estimator(): From 20f2426b7f441f13316a7df1b01802ca29f4d532 Mon Sep 17 00:00:00 2001 From: Alec Peters Date: Mon, 4 Nov 2019 09:19:49 -0800 Subject: [PATCH 41/41] added noqa tag to import to avoid flake errors --- sklearn/feature_selection/tests/test_from_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py index d590f42024ac7..57bd88a30eb0e 100644 --- a/sklearn/feature_selection/tests/test_from_model.py +++ b/sklearn/feature_selection/tests/test_from_model.py @@ -10,7 +10,7 @@ from sklearn.linear_model import LogisticRegression, SGDClassifier, Lasso from sklearn.svm import LinearSVC from sklearn.feature_selection import SelectFromModel -from sklearn.experimental import enable_hist_gradient_boosting +from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import (RandomForestClassifier, HistGradientBoostingClassifier) from sklearn.linear_model import PassiveAggressiveClassifier