diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 712c078e28dfb..11ee7e700df8c 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -50,6 +50,12 @@ Enhancements
 Bug fixes
 .........

+    - Fix a bug where :class:`sklearn.feature_selection.SelectFdr` did not
+      exactly implement the Benjamini-Hochberg procedure: it could select
+      fewer features than it should.
+      (`#7490 <https://github.com/scikit-learn/scikit-learn/pull/7490>`_) by
+      `Peng Meng`_.
+
     - :class:`sklearn.manifold.LocallyLinearEmbedding` now correctly handles
       integer inputs
       (`#6282 <https://github.com/scikit-learn/scikit-learn/pull/6282>`_) by
@@ -4873,3 +4879,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.

 .. _Eugene Chen: https://github.com/eyc88
 .. _Narine Kokhlikyan: https://github.com/NarineK
+
+.. _Peng Meng: https://github.com/mpjlu
diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py
index d2a73334299d5..842147c3dd870 100644
--- a/sklearn/feature_selection/tests/test_feature_select.py
+++ b/sklearn/feature_selection/tests/test_feature_select.py
@@ -371,6 +371,40 @@ def test_select_heuristics_regression():
     assert_less(np.sum(support[5:] == 1), 3)


+def test_boundary_case_ch2():
+    # Boundary case: each selector should select exactly one feature.
+    X = np.array([[10, 20], [20, 20], [20, 30]])
+    y = np.array([[1], [0], [0]])
+    scores, pvalues = chi2(X, y)
+    assert_array_almost_equal(scores, np.array([4., 0.71428571]))
+    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))
+
+    filter_fdr = SelectFdr(chi2, alpha=0.1)
+    filter_fdr.fit(X, y)
+    support_fdr = filter_fdr.get_support()
+    assert_array_equal(support_fdr, np.array([True, False]))
+
+    filter_kbest = SelectKBest(chi2, k=1)
+    filter_kbest.fit(X, y)
+    support_kbest = filter_kbest.get_support()
+    assert_array_equal(support_kbest, np.array([True, False]))
+
+    filter_percentile = SelectPercentile(chi2, percentile=50)
+    filter_percentile.fit(X, y)
+    support_percentile = filter_percentile.get_support()
+    assert_array_equal(support_percentile, np.array([True, False]))
+
+    filter_fpr = SelectFpr(chi2, alpha=0.1)
+    filter_fpr.fit(X, y)
+    support_fpr = filter_fpr.get_support()
+    assert_array_equal(support_fpr, np.array([True, False]))
+
+    filter_fwe = SelectFwe(chi2, alpha=0.1)
+    filter_fwe.fit(X, y)
+    support_fwe = filter_fwe.get_support()
+    assert_array_equal(support_fwe, np.array([True, False]))
+
+
 def test_select_fdr_regression():
     # Test that fdr heuristic actually has low FDR.
     def single_fdr(alpha, n_informative, random_state):
@@ -404,7 +438,7 @@ def single_fdr(alpha, n_informative, random_state):
         # FDR = E(FP / (TP + FP)) <= alpha
         false_discovery_rate = np.mean([single_fdr(alpha, n_informative,
                                                    random_state) for
-                                        random_state in range(30)])
+                                        random_state in range(100)])
         assert_greater_equal(alpha, false_discovery_rate)

     # Make sure that the empirical false discovery rate increases
diff --git a/sklearn/feature_selection/univariate_selection.py b/sklearn/feature_selection/univariate_selection.py
index 16002101a37f8..2ab7756df7eeb 100644
--- a/sklearn/feature_selection/univariate_selection.py
+++ b/sklearn/feature_selection/univariate_selection.py
@@ -596,8 +596,8 @@ def _get_support_mask(self):

         n_features = len(self.pvalues_)
         sv = np.sort(self.pvalues_)
-        selected = sv[sv <= float(self.alpha) / n_features
-                      * np.arange(n_features)]
+        selected = sv[sv <= float(self.alpha) / n_features *
+                      np.arange(1, n_features + 1)]
         if selected.size == 0:
             return np.zeros_like(self.pvalues_, dtype=bool)
         return self.pvalues_ <= selected.max()
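
Reviewer note: below is a minimal standalone sketch of the corrected
Benjamini-Hochberg rule, for checking the `_get_support_mask` change above.
The helper name `bh_support` is illustrative only, not part of
scikit-learn's API; the shipped logic stays inside `SelectFdr`. The point
of the fix is the rank vector: BH compares the k-th smallest p-value
(1-based) against `alpha * k / n_features`, while the old
`np.arange(n_features)` produced 0-based ranks, making every threshold one
step too small (the first being 0), so fewer features could be selected.

    import numpy as np

    def bh_support(pvalues, alpha):
        # Compare the k-th smallest p-value (1-based) against
        # alpha * k / n, then keep every feature whose p-value is at
        # most the largest p-value that passed its threshold.
        pvalues = np.asarray(pvalues, dtype=float)
        n_features = len(pvalues)
        sv = np.sort(pvalues)
        selected = sv[sv <= alpha / n_features *
                      np.arange(1, n_features + 1)]
        if selected.size == 0:
            return np.zeros_like(pvalues, dtype=bool)
        return pvalues <= selected.max()

    # The p-values from test_boundary_case_ch2, with alpha=0.1: the
    # thresholds are [0.05, 0.1] and 0.0455 <= 0.05, so feature 0 is kept.
    print(bh_support([0.04550026, 0.39802472], alpha=0.1))  # [ True False]
    # With 0-based ranks the thresholds were [0.0, 0.05]; neither p-value
    # passed, and SelectFdr wrongly selected nothing.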
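
For the hard-coded expectations in `test_boundary_case_ch2`, here is a hand
check of the chi2 statistics, assuming scipy is available. This mirrors the
contingency-table computation that `sklearn.feature_selection.chi2` performs
for non-negative features (observed per-class column sums against expected
counts from class frequencies); it is a sketch for verification, not the
library code.

    import numpy as np
    from scipy.stats import chi2 as chi2_dist

    X = np.array([[10., 20.], [20., 20.], [20., 30.]])
    y = np.array([1, 0, 0])
    classes = np.unique(y)

    # Observed: per-class column sums of X.
    observed = np.array([X[y == c].sum(axis=0) for c in classes])
    # Expected: class frequency times the overall column sums.
    expected = np.outer([(y == c).mean() for c in classes], X.sum(axis=0))

    scores = ((observed - expected) ** 2 / expected).sum(axis=0)
    pvalues = chi2_dist.sf(scores, df=len(classes) - 1)

    print(scores)   # [4.         0.71428571]
    print(pvalues)  # [0.04550026 0.39802472]

With alpha=0.1 these p-values sit exactly on the interesting side of the
boundary: 0.0455 clears the first BH threshold (0.05) only because the ranks
are now 1-based, which is what the test pins down.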