From a71eb91734ec694c9b4146982f840c9a30e9fb56 Mon Sep 17 00:00:00 2001 From: Dmytro S Lituiev Date: Mon, 13 Feb 2017 11:21:39 -0800 Subject: [PATCH 01/43] r_regression and abs_r_regression added --- .../feature_selection/univariate_selection.py | 77 ++++++++++++++++--- 1 file changed, 65 insertions(+), 12 deletions(-) diff --git a/sklearn/feature_selection/univariate_selection.py b/sklearn/feature_selection/univariate_selection.py index f1d6047f0b55e..bf07970ea55c6 100644 --- a/sklearn/feature_selection/univariate_selection.py +++ b/sklearn/feature_selection/univariate_selection.py @@ -227,18 +227,15 @@ def chi2(X, y): return _chisquare(observed, expected) -def f_regression(X, y, center=True): - """Univariate linear regression tests. +def r_regression(X, y, center=True): + """Univariate linear regression tests returning Pearson R. Quick linear model for testing the effect of a single regressor, sequentially for many regressors. - This is done in 2 steps: - - 1. The cross correlation between each regressor and the target is computed, + The cross correlation between each regressor and the target is computed, that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) * std(y)). - 2. It is converted to an F score then to a p-value. Read more in the :ref:`User Guide `. @@ -255,14 +252,12 @@ def f_regression(X, y, center=True): Returns ------- - F : array, shape=(n_features,) - F values of features. - - pval : array, shape=(n_features,) - p-values of F-scores. + corr : array, shape=(n_features,) + Pearson R correlation coefficients of features. See also -------- + f_regression: Univariate linear regression tests returning f-statistic and p-values f_classif: ANOVA F-value between label/feature for classification tasks. chi2: Chi-squared stats of non-negative features for classification tasks. """ @@ -288,14 +283,70 @@ def f_regression(X, y, center=True): corr = safe_sparse_dot(y, X) corr /= X_norms corr /= norm(y) + return corr - # convert to p-value +def f_regression(X, y, center=True): + """Univariate linear regression tests returning F-statistic and p-values. + + Quick linear model for testing the effect of a single regressor, + sequentially for many regressors. + + This is done in 2 steps: + + 1. The cross correlation between each regressor and the target is computed, + that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) * + std(y)) using r_regression function. + 2. It is converted to an F score and then to a p-value. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} shape = (n_samples, n_features) + The set of regressors that will be tested sequentially. + + y : array of shape(n_samples). + The data matrix + + center : True, bool, + If true, X and y will be centered. + + Returns + ------- + F : array, shape=(n_features,) + F values of features. + + pval : array, shape=(n_features,) + p-values of F-scores. + + See also + -------- + r_regression: Univariate linear regression tests returning Pearson R. + f_classif: ANOVA F-value between label/feature for classification tasks. + chi2: Chi-squared stats of non-negative features for classification tasks. 
+ """ + + # compute the correlation + corr = r_regression(X, y, center=center) degrees_of_freedom = y.size - (2 if center else 1) + # convert to p-value F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom pv = stats.f.sf(F, 1, degrees_of_freedom) return F, pv +def abs_r_regression(X, y, center=True): + """Univariate linear regression tests returning absolute value of Pearson R. + + This convenience wrapper is to be used with SelectKBest and other models + that require a statistic which is increases with significance of association. + + see r_regression for details. + """ + # compute the correlation + corr = r_regression(X, y, center=center) + return abs(corr) + ###################################################################### # Base classes @@ -464,6 +515,8 @@ class SelectKBest(_BaseFilter): f_classif: ANOVA F-value between label/feature for classification tasks. mutual_info_classif: Mutual information for a discrete target. chi2: Chi-squared stats of non-negative features for classification tasks. + abs_r_regression: absolute value of Pearson R between label/feature for + regression tasks. f_regression: F-value between label/feature for regression tasks. mutual_info_regression: Mutual information for a continious target. SelectPercentile: Select features based on percentile of the highest scores. From 88ade5322bb2f5752fd44de3e470776d6693e6ec Mon Sep 17 00:00:00 2001 From: Dmytro S Lituiev Date: Mon, 13 Feb 2017 11:21:39 -0800 Subject: [PATCH 02/43] r_regression and abs_r_regression added --- .../feature_selection/univariate_selection.py | 77 ++++++++++++++++--- 1 file changed, 65 insertions(+), 12 deletions(-) diff --git a/sklearn/feature_selection/univariate_selection.py b/sklearn/feature_selection/univariate_selection.py index f1d6047f0b55e..9430fbee7c687 100644 --- a/sklearn/feature_selection/univariate_selection.py +++ b/sklearn/feature_selection/univariate_selection.py @@ -227,18 +227,15 @@ def chi2(X, y): return _chisquare(observed, expected) -def f_regression(X, y, center=True): - """Univariate linear regression tests. +def r_regression(X, y, center=True): + """Univariate linear regression tests returning Pearson R. Quick linear model for testing the effect of a single regressor, sequentially for many regressors. - This is done in 2 steps: - - 1. The cross correlation between each regressor and the target is computed, + The cross correlation between each regressor and the target is computed, that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) * std(y)). - 2. It is converted to an F score then to a p-value. Read more in the :ref:`User Guide `. @@ -255,14 +252,12 @@ def f_regression(X, y, center=True): Returns ------- - F : array, shape=(n_features,) - F values of features. - - pval : array, shape=(n_features,) - p-values of F-scores. + corr : array, shape=(n_features,) + Pearson R correlation coefficients of features. See also -------- + f_regression: Univariate linear regression tests returning f-statistic and p-values f_classif: ANOVA F-value between label/feature for classification tasks. chi2: Chi-squared stats of non-negative features for classification tasks. """ @@ -288,14 +283,70 @@ def f_regression(X, y, center=True): corr = safe_sparse_dot(y, X) corr /= X_norms corr /= norm(y) + return corr - # convert to p-value +def f_regression(X, y, center=True): + """Univariate linear regression tests returning F-statistic and p-values. + + Quick linear model for testing the effect of a single regressor, + sequentially for many regressors. 
+ + This is done in 2 steps: + + 1. The cross correlation between each regressor and the target is computed, + that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) * + std(y)) using r_regression function. + 2. It is converted to an F score and then to a p-value. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} shape = (n_samples, n_features) + The set of regressors that will be tested sequentially. + + y : array of shape(n_samples). + The data matrix + + center : True, bool, + If true, X and y will be centered. + + Returns + ------- + F : array, shape=(n_features,) + F values of features. + + pval : array, shape=(n_features,) + p-values of F-scores. + + See also + -------- + r_regression: Univariate linear regression tests returning Pearson R. + f_classif: ANOVA F-value between label/feature for classification tasks. + chi2: Chi-squared stats of non-negative features for classification tasks. + """ + + # compute the correlation + corr = r_regression(X, y, center=center) degrees_of_freedom = y.size - (2 if center else 1) + # convert to p-value F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom pv = stats.f.sf(F, 1, degrees_of_freedom) return F, pv +def abs_r_regression(X, y, center=True): + """Univariate linear regression tests returning absolute value of Pearson R. + + This convenience wrapper is to be used with SelectKBest and other models + that require a statistic which is increases with significance of association. + + see r_regression for details. + """ + # compute the correlation + corr = r_regression(X, y, center=center) + return abs(corr) + ###################################################################### # Base classes @@ -464,6 +515,8 @@ class SelectKBest(_BaseFilter): f_classif: ANOVA F-value between label/feature for classification tasks. mutual_info_classif: Mutual information for a discrete target. chi2: Chi-squared stats of non-negative features for classification tasks. + abs_r_regression: absolute value of Pearson R between label/feature for + regression tasks. f_regression: F-value between label/feature for regression tasks. mutual_info_regression: Mutual information for a continious target. SelectPercentile: Select features based on percentile of the highest scores. From fbe3f97ee48c57631d267c1c9318033246149bb7 Mon Sep 17 00:00:00 2001 From: Dmytro S Lituiev Date: Wed, 15 Feb 2017 10:23:35 -0800 Subject: [PATCH 03/43] whitespace fix --- sklearn/feature_selection/univariate_selection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_selection/univariate_selection.py b/sklearn/feature_selection/univariate_selection.py index 9430fbee7c687..28a270e0dc2ef 100644 --- a/sklearn/feature_selection/univariate_selection.py +++ b/sklearn/feature_selection/univariate_selection.py @@ -515,7 +515,7 @@ class SelectKBest(_BaseFilter): f_classif: ANOVA F-value between label/feature for classification tasks. mutual_info_classif: Mutual information for a discrete target. chi2: Chi-squared stats of non-negative features for classification tasks. - abs_r_regression: absolute value of Pearson R between label/feature for + abs_r_regression: absolute value of Pearson R between label/feature for regression tasks. f_regression: F-value between label/feature for regression tasks. mutual_info_regression: Mutual information for a continious target. 
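[Series note] A quick sanity check of the scorers added in the patches above — a minimal sketch with synthetic data; the dataset parameters are illustrative and simply mirror the `make_regression` setup used in this series' tests, they are not part of any patch:

    import numpy as np

    from sklearn.datasets import make_regression
    from sklearn.feature_selection import abs_r_regression, r_regression

    # Synthetic problem with 5 informative features out of 20.
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)

    r = r_regression(X, y)          # signed Pearson R, one value per feature
    abs_r = abs_r_regression(X, y)  # |R|, a score that grows with association

    # r lies in (-1, 1); abs_r is its element-wise absolute value.
    assert np.all((r > -1) & (r < 1))
    assert np.allclose(abs_r, np.abs(r))
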
From f4f6a1abd1888910c0c917a4a44c657e1218bc18 Mon Sep 17 00:00:00 2001 From: Dmytro S Lituiev Date: Wed, 15 Feb 2017 10:46:16 -0800 Subject: [PATCH 04/43] import and tests --- sklearn/feature_selection/__init__.py | 2 + .../tests/test_feature_select.py | 48 +++++++++++++++++-- 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/sklearn/feature_selection/__init__.py b/sklearn/feature_selection/__init__.py index ffa392b5b26db..038883ce5ed8e 100644 --- a/sklearn/feature_selection/__init__.py +++ b/sklearn/feature_selection/__init__.py @@ -8,6 +8,8 @@ from .univariate_selection import f_classif from .univariate_selection import f_oneway from .univariate_selection import f_regression +from .univariate_selection import r_regression +from .univariate_selection import abs_r_regression from .univariate_selection import SelectPercentile from .univariate_selection import SelectKBest from .univariate_selection import SelectFpr diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index 6567cc3d16493..1d0e7b8d0d6dd 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -26,9 +26,10 @@ from sklearn.datasets.samples_generator import (make_classification, make_regression) from sklearn.feature_selection import ( - chi2, f_classif, f_oneway, f_regression, mutual_info_classif, - mutual_info_regression, SelectPercentile, SelectKBest, SelectFpr, - SelectFdr, SelectFwe, GenericUnivariateSelect) + chi2, f_classif, f_oneway, f_regression, abs_r_regression, + mutual_info_classif, mutual_info_regression, SelectPercentile, + SelectKBest, SelectFpr, SelectFdr, SelectFwe, + GenericUnivariateSelect) ############################################################################## @@ -79,6 +80,28 @@ def test_f_classif(): assert_array_almost_equal(pv_sparse, pv) +def test_abs_r_regression(): + # Test whether the F test yields meaningful results + # on a simple simulated regression problem + X, y = make_regression(n_samples=200, n_features=20, n_informative=5, + shuffle=False, random_state=0) + + abs_pearson_r = abs_r_regression(X, y) + assert_true((abs_pearson_r < 1).all()) + assert_true((abs_pearson_r[:5] > 0.1).all()) + assert_true((abs_pearson_r[5:] < 0.2 ).all()) + + # with centering, compare with sparse + abs_pearson_r = f_regression(X, y, center=True) + abs_pearson_r_sparse = f_regression(sparse.csr_matrix(X), y, center=True) + assert_array_almost_equal(abs_pearson_r_sparse, abs_pearson_r) + + # again without centering, compare with sparse + abs_pearson_r = f_regression(X, y, center=False) + abs_pearson_r_sparse = f_regression(sparse.csr_matrix(X), y, center=False) + assert_array_almost_equal(abs_pearson_r_sparse, abs_pearson_r) + + def test_f_regression(): # Test whether the F test yields meaningful results # on a simple simulated regression problem @@ -357,6 +380,25 @@ def test_select_kbest_regression(): assert_array_equal(support, gtruth) +def test_select_kbest_abs_r_regression(): + # Test whether the relative univariate feature selection + # gets the correct items in a simple regression problem + # with the k best heuristic + X, y = make_regression(n_samples=200, n_features=20, n_informative=5, + shuffle=False, random_state=0, noise=10) + + univariate_filter = SelectKBest(abs_r_regression, k=5) + X_r = univariate_filter.fit(X, y).transform(X) + assert_best_scores_kept(univariate_filter) + X_r2 = GenericUnivariateSelect( + f_regression, mode='k_best', 
param=5).fit(X, y).transform(X) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + gtruth = np.zeros(20) + gtruth[:5] = 1 + assert_array_equal(support, gtruth) + + def test_select_heuristics_regression(): # Test whether the relative univariate feature selection # gets the correct items in a simple regression problem From 0fdfa134917037a1714308b653f18dece9a011f3 Mon Sep 17 00:00:00 2001 From: Dmytro S Lituiev Date: Wed, 15 Feb 2017 11:00:41 -0800 Subject: [PATCH 05/43] indentation --- .../feature_selection/univariate_selection.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/feature_selection/univariate_selection.py b/sklearn/feature_selection/univariate_selection.py index 28a270e0dc2ef..03377e071858d 100644 --- a/sklearn/feature_selection/univariate_selection.py +++ b/sklearn/feature_selection/univariate_selection.py @@ -336,16 +336,16 @@ def f_regression(X, y, center=True): def abs_r_regression(X, y, center=True): - """Univariate linear regression tests returning absolute value of Pearson R. + """Univariate linear regression tests returning absolute value of Pearson R. - This convenience wrapper is to be used with SelectKBest and other models - that require a statistic which is increases with significance of association. + This convenience wrapper is to be used with SelectKBest and other models + that require a statistic which is increases with significance of association. - see r_regression for details. - """ - # compute the correlation - corr = r_regression(X, y, center=center) - return abs(corr) + see r_regression for details. + """ + # compute the correlation + corr = r_regression(X, y, center=center) + return abs(corr) ###################################################################### # Base classes From 3b5498fd9e79ac4701240d35db48c8a9415634b0 Mon Sep 17 00:00:00 2001 From: Dmytro S Lituiev Date: Wed, 15 Feb 2017 11:50:00 -0800 Subject: [PATCH 06/43] code style --- sklearn/feature_selection/tests/test_feature_select.py | 2 +- sklearn/feature_selection/univariate_selection.py | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index 1d0e7b8d0d6dd..dd12aac480654 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -89,7 +89,7 @@ def test_abs_r_regression(): abs_pearson_r = abs_r_regression(X, y) assert_true((abs_pearson_r < 1).all()) assert_true((abs_pearson_r[:5] > 0.1).all()) - assert_true((abs_pearson_r[5:] < 0.2 ).all()) + assert_true((abs_pearson_r[5:] < 0.2).all()) # with centering, compare with sparse abs_pearson_r = f_regression(X, y, center=True) diff --git a/sklearn/feature_selection/univariate_selection.py b/sklearn/feature_selection/univariate_selection.py index 03377e071858d..fe4782899e6d9 100644 --- a/sklearn/feature_selection/univariate_selection.py +++ b/sklearn/feature_selection/univariate_selection.py @@ -257,7 +257,8 @@ def r_regression(X, y, center=True): See also -------- - f_regression: Univariate linear regression tests returning f-statistic and p-values + f_regression: Univariate linear regression tests returning f-statistic + and p-values f_classif: ANOVA F-value between label/feature for classification tasks. chi2: Chi-squared stats of non-negative features for classification tasks. 
""" @@ -285,6 +286,7 @@ def r_regression(X, y, center=True): corr /= norm(y) return corr + def f_regression(X, y, center=True): """Univariate linear regression tests returning F-statistic and p-values. @@ -336,10 +338,11 @@ def f_regression(X, y, center=True): def abs_r_regression(X, y, center=True): - """Univariate linear regression tests returning absolute value of Pearson R. + """Absolute value of Pearson R from univariate linear regressions. This convenience wrapper is to be used with SelectKBest and other models - that require a statistic which is increases with significance of association. + that require a statistic which is increases with significance of + association. see r_regression for details. """ From fa5dfe3161a2e365efed0237ee8d7a82d7444191 Mon Sep 17 00:00:00 2001 From: Dmytro S Lituiev Date: Wed, 15 Feb 2017 11:52:24 -0800 Subject: [PATCH 07/43] init fix --- sklearn/feature_selection/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/feature_selection/__init__.py b/sklearn/feature_selection/__init__.py index 038883ce5ed8e..5584eafc831c2 100644 --- a/sklearn/feature_selection/__init__.py +++ b/sklearn/feature_selection/__init__.py @@ -41,5 +41,7 @@ 'f_classif', 'f_oneway', 'f_regression', + 'r_regression', + 'abs_r_regression', 'mutual_info_classif', 'mutual_info_regression'] From f6dcf523fca8dbade01d97f3e871adbe3ad08336 Mon Sep 17 00:00:00 2001 From: "Julien Jerphanion (@jjerphan)" Date: Sat, 9 May 2020 16:15:21 +0200 Subject: [PATCH 08/43] Change assert_true for assert --- sklearn/feature_selection/tests/test_feature_select.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index 923ff14321770..70094e8748af3 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -79,9 +79,9 @@ def test_abs_r_regression(): shuffle=False, random_state=0) abs_pearson_r = abs_r_regression(X, y) - assert_true((abs_pearson_r < 1).all()) - assert_true((abs_pearson_r[:5] > 0.1).all()) - assert_true((abs_pearson_r[5:] < 0.2).all()) + assert ((abs_pearson_r < 1).all()) + assert ((abs_pearson_r[:5] > 0.1).all()) + assert ((abs_pearson_r[5:] < 0.2).all()) # with centering, compare with sparse abs_pearson_r = f_regression(X, y, center=True) From 63d6fd759bcf3acadbc5cb1d6a144523a7d60e09 Mon Sep 17 00:00:00 2001 From: "Julien Jerphanion (@jjerphan)" Date: Sat, 9 May 2020 16:19:05 +0200 Subject: [PATCH 09/43] Change module for import to match current one I made this change to have the test ran as I previously got: ``` sklearn/feature_selection/tests/test_feature_select.py:None (sklearn/feature_selection/tests/test_feature_select.py) /home/jsquared/.virtualenvs/sk/lib64/python3.8/site-packages/py/_path/local.py:701: in pyimport __import__(modname) ../__init__.py:27: in from ._mutual_info import mutual_info_regression, mutual_info_classif ../_mutual_info.py:9: in from ..neighbors import NearestNeighbors ../../neighbors/__init__.py:17: in from ._nca import NeighborhoodComponentsAnalysis ../../neighbors/_nca.py:22: in from ..decomposition import PCA ../../decomposition/__init__.py:17: in from .dict_learning import dict_learning E ModuleNotFoundError: No module named 'sklearn.decomposition.dict_learning' ``` It seems that there are reason for this special handling to exist according to the comment above. This might need to be reverted. 
--- sklearn/decomposition/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/__init__.py b/sklearn/decomposition/__init__.py index 42f661171eafe..cd4ec5b3c8668 100644 --- a/sklearn/decomposition/__init__.py +++ b/sklearn/decomposition/__init__.py @@ -14,7 +14,7 @@ import warnings with warnings.catch_warnings(): warnings.simplefilter("ignore", category=FutureWarning) - from .dict_learning import dict_learning + from ._dict_learning import dict_learning from ._nmf import NMF, non_negative_factorization # noqa From 1c9d5125be1d9eff482f12c02c363798fda933b8 Mon Sep 17 00:00:00 2001 From: "Julien Jerphanion (@jjerphan)" Date: Sat, 9 May 2020 16:35:02 +0200 Subject: [PATCH 10/43] Add reference in doc comment --- sklearn/feature_selection/_univariate_selection.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index c2290880f712a..762c0f9c948f5 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -356,7 +356,9 @@ def abs_r_regression(X, y, center=True): that require a statistic which is increases with significance of association. - see r_regression for details. + See also + -------- + r_regression: Univariate linear regression tests returning Pearson R. """ # compute the correlation corr = r_regression(X, y, center=center) From 45514b2a79a53ec78d32047e09caa56477b22378 Mon Sep 17 00:00:00 2001 From: "Julien Jerphanion (@jjerphan)" Date: Sat, 9 May 2020 17:07:15 +0200 Subject: [PATCH 11/43] Revert "Change module for import to match current one" This reverts commit 63d6fd759bcf3acadbc5cb1d6a144523a7d60e09. --- sklearn/decomposition/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/__init__.py b/sklearn/decomposition/__init__.py index cd4ec5b3c8668..42f661171eafe 100644 --- a/sklearn/decomposition/__init__.py +++ b/sklearn/decomposition/__init__.py @@ -14,7 +14,7 @@ import warnings with warnings.catch_warnings(): warnings.simplefilter("ignore", category=FutureWarning) - from ._dict_learning import dict_learning + from .dict_learning import dict_learning from ._nmf import NMF, non_negative_factorization # noqa From 6c120e3cec437e3962088008a374b56388393da7 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 9 Feb 2021 18:37:54 +0100 Subject: [PATCH 12/43] Merge branch 'main' into r_regression --- .../_univariate_selection.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 4259c24c14777..f4339efd1fc81 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -141,7 +141,7 @@ def f_classif(X, y): pval : array, shape = [n_features,] The set of p-values. - See Also + See also -------- chi2 : Chi-squared stats of non-negative features for classification tasks. f_regression : F-value between label/feature for regression tasks. @@ -204,7 +204,7 @@ def chi2(X, y): ----- Complexity of this algorithm is O(n_classes * n_features). - See Also + See also -------- f_classif : ANOVA F-value between label/feature for classification tasks. f_regression : F-value between label/feature for regression tasks. 
@@ -259,7 +259,7 @@ def r_regression(X, y, *, center=True): corr : array, shape=(n_features,) Pearson R correlation coefficients of features. - See Also + See also -------- f_regression: Univariate linear regression tests returning f-statistic and p-values @@ -438,7 +438,7 @@ class SelectPercentile(_BaseFilter): score_func : callable, default=f_classif Function taking two arrays X and y, and returning a pair of arrays (scores, pvalues) or a single array with scores. - Default is f_classif (see below "See Also"). The default function only + Default is f_classif (see below "See also"). The default function only works with classification tasks. .. versionadded:: 0.18 @@ -470,7 +470,7 @@ class SelectPercentile(_BaseFilter): Ties between features with equal scores will be broken in an unspecified way. - See Also + See also -------- f_classif : ANOVA F-value between label/feature for classification tasks. mutual_info_classif : Mutual information for a discrete target. @@ -524,7 +524,7 @@ class SelectKBest(_BaseFilter): score_func : callable, default=f_classif Function taking two arrays X and y, and returning a pair of arrays (scores, pvalues) or a single array with scores. - Default is f_classif (see below "See Also"). The default function only + Default is f_classif (see below "See also"). The default function only works with classification tasks. .. versionadded:: 0.18 @@ -557,7 +557,7 @@ class SelectKBest(_BaseFilter): Ties between features with equal scores will be broken in an unspecified way. - See Also + See also -------- f_classif: ANOVA F-value between label/feature for classification tasks. mutual_info_classif: Mutual information for a discrete target. @@ -615,7 +615,7 @@ class SelectFpr(_BaseFilter): score_func : callable, default=f_classif Function taking two arrays X and y, and returning a pair of arrays (scores, pvalues). - Default is f_classif (see below "See Also"). The default function only + Default is f_classif (see below "See also"). The default function only works with classification tasks. alpha : float, default=5e-2 @@ -640,7 +640,7 @@ class SelectFpr(_BaseFilter): >>> X_new.shape (569, 16) - See Also + See also -------- f_classif : ANOVA F-value between label/feature for classification tasks. chi2 : Chi-squared stats of non-negative features for classification tasks. @@ -679,7 +679,7 @@ class SelectFdr(_BaseFilter): score_func : callable, default=f_classif Function taking two arrays X and y, and returning a pair of arrays (scores, pvalues). - Default is f_classif (see below "See Also"). The default function only + Default is f_classif (see below "See also"). The default function only works with classification tasks. alpha : float, default=5e-2 @@ -708,7 +708,7 @@ class SelectFdr(_BaseFilter): ---------- https://en.wikipedia.org/wiki/False_discovery_rate - See Also + See also -------- f_classif : ANOVA F-value between label/feature for classification tasks. mutual_info_classif : Mutual information for a discrete target. @@ -750,7 +750,7 @@ class SelectFwe(_BaseFilter): score_func : callable, default=f_classif Function taking two arrays X and y, and returning a pair of arrays (scores, pvalues). - Default is f_classif (see below "See Also"). The default function only + Default is f_classif (see below "See also"). The default function only works with classification tasks. alpha : float, default=5e-2 @@ -775,7 +775,7 @@ class SelectFwe(_BaseFilter): pvalues_ : array-like of shape (n_features,) p-values of feature scores. 
- See Also + See also -------- f_classif : ANOVA F-value between label/feature for classification tasks. chi2 : Chi-squared stats of non-negative features for classification tasks. @@ -843,7 +843,7 @@ class GenericUnivariateSelect(_BaseFilter): >>> X_new.shape (569, 20) - See Also + See also -------- f_classif : ANOVA F-value between label/feature for classification tasks. mutual_info_classif : Mutual information for a discrete target. From 98383aa22a413c3e1a8d168149aa38af54223123 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 15 Feb 2021 16:39:53 +0100 Subject: [PATCH 13/43] Add documentation reference for abs_r_regression and r_regression --- doc/modules/classes.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 65d555f978df0..33126a7ed259a 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -557,9 +557,11 @@ From text :toctree: generated/ :template: function.rst + feature_selection.abs_r_regression feature_selection.chi2 feature_selection.f_classif feature_selection.f_regression + feature_selection.r_regression feature_selection.mutual_info_classif feature_selection.mutual_info_regression From 17838def33d17084da41b6484946ff4d8f7fbe0a Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 15 Feb 2021 17:01:08 +0100 Subject: [PATCH 14/43] Complete test to include r_regression --- .../tests/test_feature_select.py | 25 ++++++++----------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index 2ec73cff20b7a..120e8aec0bc1e 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -18,7 +18,7 @@ from sklearn.datasets import make_classification, make_regression from sklearn.feature_selection import ( - chi2, f_classif, f_oneway, f_regression, abs_r_regression, + chi2, f_classif, f_oneway, f_regression, abs_r_regression, r_regression, mutual_info_classif, mutual_info_regression, SelectPercentile, SelectKBest, SelectFpr, SelectFdr, SelectFwe, GenericUnivariateSelect) @@ -72,26 +72,23 @@ def test_f_classif(): assert_array_almost_equal(pv_sparse, pv) -def test_abs_r_regression(): - # Test whether the F test yields meaningful results - # on a simple simulated regression problem +@pytest.mark.parametrize("coeff", [abs_r_regression, r_regression]) +def test_r_regression(coeff): X, y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0) - abs_pearson_r = abs_r_regression(X, y) - assert ((abs_pearson_r < 1).all()) - assert ((abs_pearson_r[:5] > 0.1).all()) - assert ((abs_pearson_r[5:] < 0.2).all()) + pearson_r = coeff(X, y) + assert ((pearson_r < 1).all()) # with centering, compare with sparse - abs_pearson_r = f_regression(X, y, center=True) - abs_pearson_r_sparse = f_regression(sparse.csr_matrix(X), y, center=True) - assert_array_almost_equal(abs_pearson_r_sparse, abs_pearson_r) + pearson_r = coeff(X, y, center=True) + pearson_r_sparse = coeff(sparse.csr_matrix(X), y, center=True) + assert_array_almost_equal(pearson_r_sparse, pearson_r) # again without centering, compare with sparse - abs_pearson_r = f_regression(X, y, center=False) - abs_pearson_r_sparse = f_regression(sparse.csr_matrix(X), y, center=False) - assert_array_almost_equal(abs_pearson_r_sparse, abs_pearson_r) + pearson_r = coeff(X, y, center=False) + pearson_r_sparse = coeff(sparse.csr_matrix(X), y, 
center=False) + assert_array_almost_equal(pearson_r_sparse, pearson_r) def test_f_regression(): From e1a43e8468a45125b202d72b5e8033aa2da302aa Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 16 Feb 2021 08:38:21 +0100 Subject: [PATCH 15/43] Remove Sphinx warning on indentation Co-authored-by: Chiara Marmo --- sklearn/feature_selection/_univariate_selection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index f4339efd1fc81..66b0c368d7437 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -239,7 +239,7 @@ def r_regression(X, y, *, center=True): The cross correlation between each regressor and the target is computed, that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) * - std(y)). + std(y)). For more on usage see the :ref:`User Guide `. From e5aa83144af3fd5014c545bd50a5c2d6f7ac634f Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 16 Feb 2021 15:10:36 +0100 Subject: [PATCH 16/43] Use 'See Also' over 'See Also' See Guidelines for writing documentation: https://scikit-learn.org/stable/developers/contributing.html#guidelines-for-writing-documentation Co-authored-by: Olivier Grisel --- .../_univariate_selection.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 66b0c368d7437..1e5ac53868303 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -141,7 +141,7 @@ def f_classif(X, y): pval : array, shape = [n_features,] The set of p-values. - See also + See Also -------- chi2 : Chi-squared stats of non-negative features for classification tasks. f_regression : F-value between label/feature for regression tasks. @@ -204,7 +204,7 @@ def chi2(X, y): ----- Complexity of this algorithm is O(n_classes * n_features). - See also + See Also -------- f_classif : ANOVA F-value between label/feature for classification tasks. f_regression : F-value between label/feature for regression tasks. @@ -259,7 +259,7 @@ def r_regression(X, y, *, center=True): corr : array, shape=(n_features,) Pearson R correlation coefficients of features. - See also + See Also -------- f_regression: Univariate linear regression tests returning f-statistic and p-values @@ -334,7 +334,7 @@ def f_regression(X, y, *, center=True): pval : array, shape=(n_features,) p-values of F-scores. - See also + See Also -------- r_regression: Univariate linear regression tests returning Pearson R. f_classif: ANOVA F-value between label/feature for classification tasks. @@ -357,7 +357,7 @@ def abs_r_regression(X, y, center=True): that require a statistic which is increases with significance of association. - See also + See Also -------- r_regression: Univariate linear regression tests returning Pearson R. """ @@ -438,7 +438,7 @@ class SelectPercentile(_BaseFilter): score_func : callable, default=f_classif Function taking two arrays X and y, and returning a pair of arrays (scores, pvalues) or a single array with scores. - Default is f_classif (see below "See also"). The default function only + Default is f_classif (see below "See Also"). The default function only works with classification tasks. .. 
versionadded:: 0.18 @@ -470,7 +470,7 @@ class SelectPercentile(_BaseFilter): Ties between features with equal scores will be broken in an unspecified way. - See also + See Also -------- f_classif : ANOVA F-value between label/feature for classification tasks. mutual_info_classif : Mutual information for a discrete target. @@ -524,7 +524,7 @@ class SelectKBest(_BaseFilter): score_func : callable, default=f_classif Function taking two arrays X and y, and returning a pair of arrays (scores, pvalues) or a single array with scores. - Default is f_classif (see below "See also"). The default function only + Default is f_classif (see below "See Also"). The default function only works with classification tasks. .. versionadded:: 0.18 @@ -557,7 +557,7 @@ class SelectKBest(_BaseFilter): Ties between features with equal scores will be broken in an unspecified way. - See also + See Also -------- f_classif: ANOVA F-value between label/feature for classification tasks. mutual_info_classif: Mutual information for a discrete target. @@ -615,7 +615,7 @@ class SelectFpr(_BaseFilter): score_func : callable, default=f_classif Function taking two arrays X and y, and returning a pair of arrays (scores, pvalues). - Default is f_classif (see below "See also"). The default function only + Default is f_classif (see below "See Also"). The default function only works with classification tasks. alpha : float, default=5e-2 @@ -640,7 +640,7 @@ class SelectFpr(_BaseFilter): >>> X_new.shape (569, 16) - See also + See Also -------- f_classif : ANOVA F-value between label/feature for classification tasks. chi2 : Chi-squared stats of non-negative features for classification tasks. @@ -679,7 +679,7 @@ class SelectFdr(_BaseFilter): score_func : callable, default=f_classif Function taking two arrays X and y, and returning a pair of arrays (scores, pvalues). - Default is f_classif (see below "See also"). The default function only + Default is f_classif (see below "See Also"). The default function only works with classification tasks. alpha : float, default=5e-2 @@ -708,7 +708,7 @@ class SelectFdr(_BaseFilter): ---------- https://en.wikipedia.org/wiki/False_discovery_rate - See also + See Also -------- f_classif : ANOVA F-value between label/feature for classification tasks. mutual_info_classif : Mutual information for a discrete target. @@ -750,7 +750,7 @@ class SelectFwe(_BaseFilter): score_func : callable, default=f_classif Function taking two arrays X and y, and returning a pair of arrays (scores, pvalues). - Default is f_classif (see below "See also"). The default function only + Default is f_classif (see below "See Also"). The default function only works with classification tasks. alpha : float, default=5e-2 @@ -775,7 +775,7 @@ class SelectFwe(_BaseFilter): pvalues_ : array-like of shape (n_features,) p-values of feature scores. - See also + See Also -------- f_classif : ANOVA F-value between label/feature for classification tasks. chi2 : Chi-squared stats of non-negative features for classification tasks. @@ -843,7 +843,7 @@ class GenericUnivariateSelect(_BaseFilter): >>> X_new.shape (569, 20) - See also + See Also -------- f_classif : ANOVA F-value between label/feature for classification tasks. mutual_info_classif : Mutual information for a discrete target. 
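[Series note] The two-step computation that f_regression's docstring describes — Pearson R first, then conversion to an F score and a p-value — can be reproduced by hand. A sketch under the assumption that scipy and the helpers from this branch are importable (data and sizes are illustrative):

    import numpy as np
    from scipy import stats

    from sklearn.datasets import make_regression
    from sklearn.feature_selection import f_regression, r_regression

    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)

    F, pv = f_regression(X, y, center=True)

    # Step 1: Pearson R per feature; step 2: convert R to an F score, then
    # the F score to a p-value with 1 and (n_samples - 2) degrees of freedom.
    r = r_regression(X, y, center=True)
    dof = y.size - 2
    F_manual = r ** 2 / (1 - r ** 2) * dof
    pv_manual = stats.f.sf(F_manual, 1, dof)

    assert np.allclose(F, F_manual)
    assert np.allclose(pv, pv_manual)
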
From bcaae35b4fa71bbd048c81db9eff2633cc7ee62e Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 16 Feb 2021 15:40:09 +0100 Subject: [PATCH 17/43] Clarify docstrings --- .../_univariate_selection.py | 33 +++++++++++++------ 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 1e5ac53868303..8bd379b15988f 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -231,7 +231,7 @@ def chi2(X, y): @_deprecate_positional_args def r_regression(X, y, *, center=True): - """Univariate linear regression tests returning Pearson R. + """Compute Pearson R correlation coefficients of features. Linear model for testing the individual effect of each of many regressors. This is a scoring function to be used in a feature selection procedure, not @@ -261,17 +261,13 @@ def r_regression(X, y, *, center=True): See Also -------- + abs_r_regression: Absolute value of Pearson R between label and features + for regression tasks. f_regression: Univariate linear regression tests returning f-statistic and p-values mutual_info_regression: Mutual information for a continuous target. f_classif: ANOVA F-value between label/feature for classification tasks. chi2: Chi-squared stats of non-negative features for classification tasks. - SelectKBest: Select features based on the k highest scores. - SelectFpr: Select features based on a false positive rate test. - SelectFdr: Select features based on an estimated false discovery rate. - SelectFwe: Select features based on family-wise error rate. - SelectPercentile: Select features based on percentile of the highest - scores. """ X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=np.float64) @@ -313,6 +309,14 @@ def f_regression(X, y, *, center=True): std(y)) using r_regression function. 2. It is converted to an F score and then to a p-value. + If p-values are not needed, r_regression can be used to rank features as + a slightly cheaper alternative to f_regression. Note however that + contrary to f_regression, r_regression values lie in [-1, 1] and can thus + be negative. + + Alternatively, `abs_r_regression` can be used to rank features by + correlation magnitude instead. + Read more in the :ref:`User Guide `. Parameters @@ -336,9 +340,17 @@ def f_regression(X, y, *, center=True): See Also -------- - r_regression: Univariate linear regression tests returning Pearson R. + abs_r_regression: Absolute value of Pearson R between label and features + for regression tasks. + r_regression: Pearson R between label/feature for regression tasks. f_classif: ANOVA F-value between label/feature for classification tasks. chi2: Chi-squared stats of non-negative features for classification tasks. + SelectKBest: Select features based on the k highest scores. + SelectFpr: Select features based on a false positive rate test. + SelectFdr: Select features based on an estimated false discovery rate. + SelectFwe: Select features based on family-wise error rate. + SelectPercentile: Select features based on percentile of the highest + scores. """ # compute the correlation @@ -360,6 +372,7 @@ def abs_r_regression(X, y, center=True): See Also -------- r_regression: Univariate linear regression tests returning Pearson R. + SelectKBest: Select features based on the k highest scores. 
""" # compute the correlation corr = r_regression(X, y, center=center) @@ -562,8 +575,8 @@ class SelectKBest(_BaseFilter): f_classif: ANOVA F-value between label/feature for classification tasks. mutual_info_classif: Mutual information for a discrete target. chi2: Chi-squared stats of non-negative features for classification tasks. - abs_r_regression: absolute value of Pearson R between label/feature for - regression tasks. + abs_r_regression: Absolute value of Pearson R between label and features + for regression tasks. f_regression: F-value between label/feature for regression tasks. mutual_info_regression: Mutual information for a continuous target. SelectPercentile: Select features based on percentile of the highest From 8a803ea11a9a2cad53ce0d5e8250b97194a81f73 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 16 Feb 2021 15:48:57 +0100 Subject: [PATCH 18/43] Add consistency test --- .../tests/test_feature_select.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index 120e8aec0bc1e..b0d71a3ba3f36 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -4,6 +4,7 @@ import itertools import warnings import numpy as np +from numpy.testing import assert_allclose from scipy import stats, sparse import pytest @@ -117,6 +118,20 @@ def test_f_regression(): assert_array_almost_equal(pv_sparse, pv) +def test_f_regression_r_regression_consistency(): + # Test the equivalence of f_regression and abs_r_regression for variable + # selection using the returned values ordering + X, y = make_regression(n_samples=200, n_features=1000, + shuffle=False, random_state=0) + + Fs, _ = f_regression(X, y) + + assert_array_equal(Fs.argsort(), abs_r_regression(X, y).argsort()) + + # Test consistency of definition + assert_allclose(abs_r_regression(X, y), np.abs(r_regression(X, y))) + + def test_f_regression_input_dtype(): # Test whether f_regression returns the same value # for any numeric data_type From 47b2ea904818301784bb96f6408795b943f6f860 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 16 Feb 2021 15:52:09 +0100 Subject: [PATCH 19/43] Test for Pearson R correct support Also use a better wording. 
--- .../tests/test_feature_select.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index b0d71a3ba3f36..cbadd80809caf 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -78,18 +78,20 @@ def test_r_regression(coeff): X, y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0) - pearson_r = coeff(X, y) - assert ((pearson_r < 1).all()) + correlation_coeffs = coeff(X, y) + if coeff == r_regression: + assert ((correlation_coeffs > -1).all()) + assert ((correlation_coeffs < 1).all()) # with centering, compare with sparse - pearson_r = coeff(X, y, center=True) - pearson_r_sparse = coeff(sparse.csr_matrix(X), y, center=True) - assert_array_almost_equal(pearson_r_sparse, pearson_r) + correlation_coeffs = coeff(X, y, center=True) + correlation_coeffs_sparse = coeff(sparse.csr_matrix(X), y, center=True) + assert_array_almost_equal(correlation_coeffs_sparse, correlation_coeffs) # again without centering, compare with sparse - pearson_r = coeff(X, y, center=False) - pearson_r_sparse = coeff(sparse.csr_matrix(X), y, center=False) - assert_array_almost_equal(pearson_r_sparse, pearson_r) + correlation_coeffs = coeff(X, y, center=False) + correlation_coeffs_sparse = coeff(sparse.csr_matrix(X), y, center=False) + assert_array_almost_equal(correlation_coeffs_sparse, correlation_coeffs) def test_f_regression(): From da964772dff07e90e8501b1825400b4e511616c3 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 16 Feb 2021 16:28:28 +0100 Subject: [PATCH 20/43] fixup! Clarify docstrings --- sklearn/feature_selection/_univariate_selection.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 8bd379b15988f..32bdd085a8935 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -231,7 +231,8 @@ def chi2(X, y): @_deprecate_positional_args def r_regression(X, y, *, center=True): - """Compute Pearson R correlation coefficients of features. + """Compute Pearson R correlation coefficients between the features and + the target Linear model for testing the individual effect of each of many regressors. This is a scoring function to be used in a feature selection procedure, not @@ -366,7 +367,7 @@ def abs_r_regression(X, y, center=True): """Absolute value of Pearson R from univariate linear regressions. This convenience wrapper is to be used with SelectKBest and other models - that require a statistic which is increases with significance of + that require a statistic which is increased with significance of association. See Also From 4a548b5227c3840281f733844c6b056cb4af251a Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 12 Mar 2021 21:56:39 +0100 Subject: [PATCH 21/43] Add docstring directives for additions in version 1.0 Co-authored-by: Chiara Marmo --- doc/whats_new/v1.0.rst | 11 +++++++++++ sklearn/feature_selection/_univariate_selection.py | 4 ++++ 2 files changed, 15 insertions(+) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index a566d03ae1bbc..3732b5a6d8412 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -103,6 +103,17 @@ Changelog input strings would result in negative indices in the transformed data. 
:pr:`19035` by :user:`Liu Yu `. +:mod:`sklearn.feature_selection` +................................ + +- |Feature| :func:`feature_selection.abs_r_regression` is new criterion + which can be used with :class:`feature_selection.SelectKBest` to select + variables. It is the absolution values of + :func:`feature_selection.r_regression` which computes Pearson R correlation + coefficients between the features and the target. + :pr:`17169` by `Dmytro Lituiev ` + and `Julien Jerphanion `. + :mod:`sklearn.inspection` ......................... diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 32bdd085a8935..a78fe309dce89 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -234,6 +234,8 @@ def r_regression(X, y, *, center=True): """Compute Pearson R correlation coefficients between the features and the target + .. versionadded:: 1.0 + Linear model for testing the individual effect of each of many regressors. This is a scoring function to be used in a feature selection procedure, not a free standing feature selection procedure. @@ -366,6 +368,8 @@ def f_regression(X, y, *, center=True): def abs_r_regression(X, y, center=True): """Absolute value of Pearson R from univariate linear regressions. + .. versionadded:: 1.0 + This convenience wrapper is to be used with SelectKBest and other models that require a statistic which is increased with significance of association. From 711544c24e568403f840042d41fd1a3ff47f2b00 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Apr 2021 08:05:30 +0200 Subject: [PATCH 22/43] Fix typo in the whats_new entry Co-authored-by: Guillaume Lemaitre --- doc/whats_new/v1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 3732b5a6d8412..d146df8dff303 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -106,7 +106,7 @@ Changelog :mod:`sklearn.feature_selection` ................................ -- |Feature| :func:`feature_selection.abs_r_regression` is new criterion +- |Feature| :func:`feature_selection.abs_r_regression` is a new criterion which can be used with :class:`feature_selection.SelectKBest` to select variables. It is the absolution values of :func:`feature_selection.r_regression` which computes Pearson R correlation From f7fa09d407877841ce9a35b9cf77c1e4f37738d7 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Apr 2021 08:08:19 +0200 Subject: [PATCH 23/43] End sentence with a full stop. Co-authored-by: Guillaume Lemaitre --- sklearn/feature_selection/_univariate_selection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index a78fe309dce89..6e855f755dee7 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -232,7 +232,7 @@ def chi2(X, y): @_deprecate_positional_args def r_regression(X, y, *, center=True): """Compute Pearson R correlation coefficients between the features and - the target + the target. .. versionadded:: 1.0 From cfe2299903c186dde0d305f5b043448ac5dba30b Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Apr 2021 08:08:43 +0200 Subject: [PATCH 24/43] Correct typo in docstring. 
Co-authored-by: Guillaume Lemaitre --- sklearn/feature_selection/_univariate_selection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 6e855f755dee7..825f2b9b20732 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -231,7 +231,7 @@ def chi2(X, y): @_deprecate_positional_args def r_regression(X, y, *, center=True): - """Compute Pearson R correlation coefficients between the features and + """Compute Pearson's R correlation coefficients between the features and the target. .. versionadded:: 1.0 From 92c389ddb44741f764ab6bf91050534649694d38 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Apr 2021 08:09:27 +0200 Subject: [PATCH 25/43] Correct typo in the whats_new entry. Co-authored-by: Guillaume Lemaitre --- doc/whats_new/v1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index d146df8dff303..b0e8ca63e78f8 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -109,7 +109,7 @@ Changelog - |Feature| :func:`feature_selection.abs_r_regression` is a new criterion which can be used with :class:`feature_selection.SelectKBest` to select variables. It is the absolution values of - :func:`feature_selection.r_regression` which computes Pearson R correlation + :func:`feature_selection.r_regression` which computes Pearson's R correlation coefficients between the features and the target. :pr:`17169` by `Dmytro Lituiev ` and `Julien Jerphanion `. From 29bf1d06a34ccd6bf797b2dab13f49b7b99e1c48 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Apr 2021 08:11:10 +0200 Subject: [PATCH 26/43] Fix syntax in r_regression's docstring. Co-authored-by: Guillaume Lemaitre --- sklearn/feature_selection/_univariate_selection.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 825f2b9b20732..0c55ea1adcb56 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -240,9 +240,8 @@ def r_regression(X, y, *, center=True): This is a scoring function to be used in a feature selection procedure, not a free standing feature selection procedure. - The cross correlation between each regressor and the target is computed, - that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) * - std(y)). + The cross correlation between each regressor and the target is computed + as ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) * std(y)). For more on usage see the :ref:`User Guide `. 
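[Series note] The cross-correlation formula as reworded in the patch above can be checked directly against r_regression. A minimal sketch (synthetic data; nothing here is part of the patch itself):

    import numpy as np

    from sklearn.datasets import make_regression
    from sklearn.feature_selection import r_regression

    X, y = make_regression(n_samples=50, n_features=3, random_state=0)

    # The docstring's definition, written out directly: center X and y,
    # then divide the cross product by the product of the norms (the 1/n
    # factors inside the standard deviations cancel).
    Xc = X - X.mean(axis=0)
    yc = y - y.mean()
    manual = (Xc * yc[:, None]).sum(axis=0) / (
        np.linalg.norm(Xc, axis=0) * np.linalg.norm(yc))

    assert np.allclose(r_regression(X, y, center=True), manual)
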
From bbf4179631d34147e2263d2f949db641a945412a Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Apr 2021 08:21:59 +0200 Subject: [PATCH 27/43] Sort imports alphabetically Co-authored-by: Guillaume Lemaitre --- sklearn/feature_selection/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_selection/__init__.py b/sklearn/feature_selection/__init__.py index a72719e7a6878..eae46cf21006d 100644 --- a/sklearn/feature_selection/__init__.py +++ b/sklearn/feature_selection/__init__.py @@ -42,12 +42,12 @@ 'SelectFromModel', 'SelectPercentile', 'VarianceThreshold', + 'abs_r_regression', 'chi2', 'f_classif', 'f_oneway', 'f_regression', 'r_regression', - 'abs_r_regression', 'mutual_info_classif', 'mutual_info_regression', 'SelectorMixin'] From af31e7244d137e63e316a43bce9a4adbfec69a92 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Apr 2021 08:22:58 +0200 Subject: [PATCH 28/43] Remove useless decorator Co-authored-by: Guillaume Lemaitre --- sklearn/feature_selection/_univariate_selection.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 0c55ea1adcb56..6a976555e8d5f 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -229,7 +229,6 @@ def chi2(X, y): return _chisquare(observed, expected) -@_deprecate_positional_args def r_regression(X, y, *, center=True): """Compute Pearson's R correlation coefficients between the features and the target. From 69e638ce2757d05f33e4251e3878229818675d77 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Apr 2021 08:25:23 +0200 Subject: [PATCH 29/43] Use verbose name for the correlation coefficient Co-authored-by: Guillaume Lemaitre --- sklearn/feature_selection/_univariate_selection.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 6a976555e8d5f..968169b024a8d 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -257,7 +257,7 @@ def r_regression(X, y, *, center=True): Returns ------- - corr : array, shape=(n_features,) + correlation_coefficient : ndarray of shape (n_features,) Pearson R correlation coefficients of features. See Also @@ -290,10 +290,10 @@ def r_regression(X, y, *, center=True): X_norms = row_norms(X.T) # compute the correlation - corr = safe_sparse_dot(y, X) - corr /= X_norms - corr /= np.linalg.norm(y) - return corr + correlation_coefficient = safe_sparse_dot(y, X) + correlation_coefficient /= X_norms + correlation_coefficient /= np.linalg.norm(y) + return correlation_coefficient @_deprecate_positional_args From f5aa51354eb8699fe42df5c001baae4ecd10319c Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Apr 2021 08:30:56 +0200 Subject: [PATCH 30/43] Improve r_regression and r_regression docstring Use accurate descriptions for the parameters. 
Co-authored-by: Guillaume Lemaitre --- .../_univariate_selection.py | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 968169b024a8d..f0d0b9b145fea 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -246,14 +246,15 @@ def r_regression(X, y, *, center=True): Parameters ---------- - X : {array-like, sparse matrix} shape = (n_samples, n_features) - The set of regressors that will be tested sequentially. + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data matrix. - y : array of shape(n_samples). - The data matrix + y : array-like of shape (n_samples,) + The target vector. center : bool, default=True - If true, X and y will be centered. + Whether or not to center the data matrix `X` and the target vector `y`. + By default, `X` and `y` will be centered. Returns ------- @@ -322,14 +323,15 @@ def f_regression(X, y, *, center=True): Parameters ---------- - X : {array-like, sparse matrix} shape = (n_samples, n_features) - The set of regressors that will be tested sequentially. + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data matrix. - y : array of shape(n_samples). - The data matrix + y : array-like of shape (n_samples,) + The target vector. - center : True, bool, - If true, X and y will be centered. + center : bool, default=True + Whether or not to center the data matrix `X` and the target vector `y`. + By default, `X` and `y` will be centered. Returns ------- From dcd94633e75196ed26a17fa859bb5ff7a8783d30 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Apr 2021 08:36:02 +0200 Subject: [PATCH 31/43] Improve code comments Co-authored-by: Guillaume Lemaitre --- sklearn/feature_selection/_univariate_selection.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index f0d0b9b145fea..70c425be3080e 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -275,8 +275,8 @@ def r_regression(X, y, *, center=True): dtype=np.float64) n_samples = X.shape[0] - # compute centered values - # note that E[(x - mean(x))*(y - mean(y))] = E[x*(y - mean(y))], so we + # Compute centered values + # Note that E[(x - mean(x))*(y - mean(y))] = E[x*(y - mean(y))], so we # need not center X if center: y = y - np.mean(y) @@ -284,13 +284,12 @@ def r_regression(X, y, *, center=True): X_means = X.mean(axis=0).getA1() else: X_means = X.mean(axis=0) - # compute the scaled standard deviations via moments + # Compute the scaled standard deviations via moments X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples * X_means ** 2) else: X_norms = row_norms(X.T) - # compute the correlation correlation_coefficient = safe_sparse_dot(y, X) correlation_coefficient /= X_norms correlation_coefficient /= np.linalg.norm(y) @@ -355,11 +354,9 @@ def f_regression(X, y, *, center=True): SelectPercentile: Select features based on percentile of the highest scores. 
""" - - # compute the correlation corr = r_regression(X, y, center=center) degrees_of_freedom = y.size - (2 if center else 1) - # convert to p-value + # Compute the test's statistics and its p-values F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom pv = stats.f.sf(F, 1, degrees_of_freedom) return F, pv @@ -379,7 +376,6 @@ def abs_r_regression(X, y, center=True): r_regression: Univariate linear regression tests returning Pearson R. SelectKBest: Select features based on the k highest scores. """ - # compute the correlation corr = r_regression(X, y, center=center) return abs(corr) From a16df67950731147c1d0ded8e45eedfc740697df Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Apr 2021 08:39:27 +0200 Subject: [PATCH 32/43] Add Sphinx domains in f_regression's docstring Co-authored-by: Guillaume Lemaitre --- sklearn/feature_selection/_univariate_selection.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 70c425be3080e..dff2f86c07a9c 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -310,12 +310,12 @@ def f_regression(X, y, *, center=True): std(y)) using r_regression function. 2. It is converted to an F score and then to a p-value. - If p-values are not needed, r_regression can be used to rank features as - a slightly cheaper alternative to f_regression. Note however that - contrary to f_regression, r_regression values lie in [-1, 1] and can thus - be negative. + If p-values are not needed, :func:`r_regression` can be used to rank + features as a slightly cheaper alternative to :func:`f_regression`. + Note however that contrary to :func:`f_regression`, :func:`r_regression` + values lie in [-1, 1] and can thus be negative. - Alternatively, `abs_r_regression` can be used to rank features by + Alternatively, :func:`abs_r_regression` can be used to rank features by correlation magnitude instead. Read more in the :ref:`User Guide `. From 62b76ad16ca31147c8a6736744a5b688c56b2567 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Apr 2021 08:47:42 +0200 Subject: [PATCH 33/43] Make f statistics' and their p-values' computations clearer Co-authored-by: Guillaume Lemaitre --- .../_univariate_selection.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index dff2f86c07a9c..6ecd488cb4b13 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -334,11 +334,11 @@ def f_regression(X, y, *, center=True): Returns ------- - F : array, shape=(n_features,) - F values of features. + f_statistic : ndarray of shape (n_features,) + F-statistic for each feature. - pval : array, shape=(n_features,) - p-values of F-scores. + p_values : ndarray of shape (n_features,) + P-values associated with the F-statistic. See Also -------- @@ -354,12 +354,13 @@ def f_regression(X, y, *, center=True): SelectPercentile: Select features based on percentile of the highest scores. 
""" - corr = r_regression(X, y, center=center) - degrees_of_freedom = y.size - (2 if center else 1) - # Compute the test's statistics and its p-values - F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom - pv = stats.f.sf(F, 1, degrees_of_freedom) - return F, pv + correlation_coefficient = r_regression(X, y, center=center) + deg_of_freedom = y.size - (2 if center else 1) + + corr_coef_squared = correlation_coefficient ** 2 + f_statistic = corr_coef_squared / (1 - corr_coef_squared) * deg_of_freedom + p_values = stats.f.sf(f_statistic, 1, deg_of_freedom) + return f_statistic, p_values def abs_r_regression(X, y, center=True): From 107534e13b57754bce52dce926ba76a3f27796bd Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Apr 2021 08:51:37 +0200 Subject: [PATCH 34/43] Improve wording abs_r_regression Co-authored-by: Guillaume Lemaitre --- .../_univariate_selection.py | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 6ecd488cb4b13..d554ff5300ab2 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -263,8 +263,8 @@ def r_regression(X, y, *, center=True): See Also -------- - abs_r_regression: Absolute value of Pearson R between label and features - for regression tasks. + abs_r_regression: Absolute value of Pearson's R correlation coefficients + between label and features for regression tasks. f_regression: Univariate linear regression tests returning f-statistic and p-values mutual_info_regression: Mutual information for a continuous target. @@ -342,8 +342,8 @@ def f_regression(X, y, *, center=True): See Also -------- - abs_r_regression: Absolute value of Pearson R between label and features - for regression tasks. + abs_r_regression: Absolute value of Pearson's R correlation coefficients + between label and features for regression tasks. r_regression: Pearson R between label/feature for regression tasks. f_classif: ANOVA F-value between label/feature for classification tasks. chi2: Chi-squared stats of non-negative features for classification tasks. @@ -364,21 +364,20 @@ def f_regression(X, y, *, center=True): def abs_r_regression(X, y, center=True): - """Absolute value of Pearson R from univariate linear regressions. + """Absolute value of Pearson's R from univariate linear regressions. .. versionadded:: 1.0 - This convenience wrapper is to be used with SelectKBest and other models - that require a statistic which is increased with significance of - association. + This convenience wrapper is to be used with + :class:`~sklearn.feature_selection.SelectKBest`. See Also -------- - r_regression: Univariate linear regression tests returning Pearson R. + r_regression: Univariate linear regression tests returning Pearson's R + correlation coefficient. SelectKBest: Select features based on the k highest scores. """ - corr = r_regression(X, y, center=center) - return abs(corr) + return np.abs(r_regression(X, y, center=center)) ###################################################################### # Base classes @@ -577,7 +576,7 @@ class SelectKBest(_BaseFilter): f_classif: ANOVA F-value between label/feature for classification tasks. mutual_info_classif: Mutual information for a discrete target. chi2: Chi-squared stats of non-negative features for classification tasks. 
- abs_r_regression: Absolute value of Pearson R between label and features + abs_r_regression: Absolute value of Pearson's R between label and features for regression tasks. f_regression: F-value between label/feature for regression tasks. mutual_info_regression: Mutual information for a continuous target. From ce64e78669238eec2562e2c6d3e5fbfcd3d7ca30 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Apr 2021 09:02:16 +0200 Subject: [PATCH 35/43] Use black formatting style for imports Co-authored-by: Guillaume Lemaitre --- .../tests/test_feature_select.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index cbadd80809caf..23a5920a92b3d 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -19,10 +19,21 @@ from sklearn.datasets import make_classification, make_regression from sklearn.feature_selection import ( - chi2, f_classif, f_oneway, f_regression, abs_r_regression, r_regression, - mutual_info_classif, mutual_info_regression, SelectPercentile, - SelectKBest, SelectFpr, SelectFdr, SelectFwe, - GenericUnivariateSelect) + abs_r_regression, + chi2, + f_classif, + f_oneway, + f_regression, + GenericUnivariateSelect, + mutual_info_classif, + mutual_info_regression, + r_regression, + SelectPercentile, + SelectKBest, + SelectFpr, + SelectFdr, + SelectFwe, +) ############################################################################## From 537de9668a736fb835061a952bf8ea61b9b44eaf Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Apr 2021 09:03:21 +0200 Subject: [PATCH 36/43] Remove useless comments Co-authored-by: Guillaume Lemaitre --- sklearn/feature_selection/tests/test_feature_select.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index 23a5920a92b3d..42627376dc2f1 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -141,7 +141,6 @@ def test_f_regression_r_regression_consistency(): assert_array_equal(Fs.argsort(), abs_r_regression(X, y).argsort()) - # Test consistency of definition assert_allclose(abs_r_regression(X, y), np.abs(r_regression(X, y))) From df1ab5b589ef9393e36b3b2b1a11797a706f3611 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Apr 2021 09:05:02 +0200 Subject: [PATCH 37/43] Prefer assert_allclose over assert_array_almost_equal Co-authored-by: Guillaume Lemaitre --- .../tests/test_feature_select.py | 35 +++++++++---------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index 42627376dc2f1..ee34009352e8d 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -11,7 +11,6 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_warns from sklearn.utils._testing import ignore_warnings from sklearn.utils._testing import assert_warns_message @@ -60,8 +59,8 @@ def test_f_oneway_ints(): # test that is gives the same result as with float f, p = f_oneway(X.astype(float), 
                    y)
-    assert_array_almost_equal(f, fint, decimal=4)
-    assert_array_almost_equal(p, pint, decimal=4)
+    assert_allclose(f, fint)
+    assert_allclose(p, pint)
 
 
 def test_f_classif():
@@ -80,8 +79,8 @@ def test_f_classif():
     assert (pv < 1).all()
     assert (pv[:5] < 0.05).all()
     assert (pv[5:] > 1.e-4).all()
-    assert_array_almost_equal(F_sparse, F)
-    assert_array_almost_equal(pv_sparse, pv)
+    assert_allclose(F_sparse, F)
+    assert_allclose(pv_sparse, pv)
 
 
 @pytest.mark.parametrize("coeff", [abs_r_regression, r_regression])
@@ -97,12 +96,12 @@ def test_r_regression(coeff):
     # with centering, compare with sparse
     correlation_coeffs = coeff(X, y, center=True)
     correlation_coeffs_sparse = coeff(sparse.csr_matrix(X), y, center=True)
-    assert_array_almost_equal(correlation_coeffs_sparse, correlation_coeffs)
+    assert_allclose(correlation_coeffs_sparse, correlation_coeffs)
 
     # again without centering, compare with sparse
     correlation_coeffs = coeff(X, y, center=False)
     correlation_coeffs_sparse = coeff(sparse.csr_matrix(X), y, center=False)
-    assert_array_almost_equal(correlation_coeffs_sparse, correlation_coeffs)
+    assert_allclose(correlation_coeffs_sparse, correlation_coeffs)
 
 
 def test_f_regression():
@@ -121,14 +120,14 @@ def test_f_regression():
     # with centering, compare with sparse
     F, pv = f_regression(X, y, center=True)
     F_sparse, pv_sparse = f_regression(sparse.csr_matrix(X), y, center=True)
-    assert_array_almost_equal(F_sparse, F)
-    assert_array_almost_equal(pv_sparse, pv)
+    assert_allclose(F_sparse, F)
+    assert_allclose(pv_sparse, pv)
 
     # again without centering, compare with sparse
     F, pv = f_regression(X, y, center=False)
     F_sparse, pv_sparse = f_regression(sparse.csr_matrix(X), y, center=False)
-    assert_array_almost_equal(F_sparse, F)
-    assert_array_almost_equal(pv_sparse, pv)
+    assert_allclose(F_sparse, F)
+    assert_allclose(pv_sparse, pv)
 
 
 def test_f_regression_r_regression_consistency():
@@ -153,8 +152,8 @@ def test_f_regression_input_dtype():
     F1, pv1 = f_regression(X, y)
     F2, pv2 = f_regression(X, y.astype(float))
-    assert_array_almost_equal(F1, F2, 5)
-    assert_array_almost_equal(pv1, pv2, 5)
+    assert_allclose(F1, F2, rtol=1e-5)
+    assert_allclose(pv1, pv2, rtol=1e-5)
 
 
 def test_f_regression_center():
@@ -170,7 +169,7 @@ def test_f_regression_center():
     F1, _ = f_regression(X, Y, center=True)
     F2, _ = f_regression(X, Y, center=False)
 
-    assert_array_almost_equal(F1 * (n_samples - 1.) / (n_samples - 2.), F2)
+    assert_allclose(F1 * (n_samples - 1.)
/ (n_samples - 2.), F2) assert_almost_equal(F2[0], 0.232558139) # value from statsmodels OLS @@ -309,7 +308,7 @@ def test_select_heuristics_classif(): f_classif, mode=mode, param=0.01).fit(X, y).transform(X) assert_array_equal(X_r, X_r2) support = univariate_filter.get_support() - assert_array_almost_equal(support, gtruth) + assert_allclose(support, gtruth) ############################################################################## @@ -319,7 +318,7 @@ def test_select_heuristics_classif(): def assert_best_scores_kept(score_filter): scores = score_filter.scores_ support = score_filter.get_support() - assert_array_almost_equal(np.sort(scores[support]), + assert_allclose(np.sort(scores[support]), np.sort(scores)[-support.sum():]) @@ -442,8 +441,8 @@ def test_boundary_case_ch2(): X = np.array([[10, 20], [20, 20], [20, 30]]) y = np.array([[1], [0], [0]]) scores, pvalues = chi2(X, y) - assert_array_almost_equal(scores, np.array([4., 0.71428571])) - assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472])) + assert_allclose(scores, np.array([4., 0.71428571])) + assert_allclose(pvalues, np.array([0.04550026, 0.39802472])) filter_fdr = SelectFdr(chi2, alpha=0.1) filter_fdr.fit(X, y) From 6bda200de5bc5c26d9d8ee70f80b9e68772921d8 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Apr 2021 09:49:29 +0200 Subject: [PATCH 38/43] fixup! Prefer assert_allclose over assert_array_almost_equal Revert changes for unrelated tests. --- .../feature_selection/tests/test_feature_select.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index ee34009352e8d..dedbf57a89417 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -11,6 +11,7 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_equal +from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_warns from sklearn.utils._testing import ignore_warnings from sklearn.utils._testing import assert_warns_message @@ -59,8 +60,8 @@ def test_f_oneway_ints(): # test that is gives the same result as with float f, p = f_oneway(X.astype(float), y) - assert_allclose(f, fint) - assert_allclose(p, pint) + assert_array_almost_equal(f, fint, decimal=4) + assert_array_almost_equal(p, pint, decimal=4) def test_f_classif(): @@ -79,8 +80,8 @@ def test_f_classif(): assert (pv < 1).all() assert (pv[:5] < 0.05).all() assert (pv[5:] > 1.e-4).all() - assert_allclose(F_sparse, F) - assert_allclose(pv_sparse, pv) + assert_array_almost_equal(F_sparse, F) + assert_array_almost_equal(pv_sparse, pv) @pytest.mark.parametrize("coeff", [abs_r_regression, r_regression]) @@ -441,8 +442,8 @@ def test_boundary_case_ch2(): X = np.array([[10, 20], [20, 20], [20, 30]]) y = np.array([[1], [0], [0]]) scores, pvalues = chi2(X, y) - assert_allclose(scores, np.array([4., 0.71428571])) - assert_allclose(pvalues, np.array([0.04550026, 0.39802472])) + assert_array_almost_equal(scores, np.array([4., 0.71428571])) + assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472])) filter_fdr = SelectFdr(chi2, alpha=0.1) filter_fdr.fit(X, y) From 76830bb0f6c646546219e418b7a392f9fa86cb7b Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 12 Apr 2021 16:34:37 +0200 Subject: [PATCH 39/43] Split test into several Also check against numpy's implementation of the 
correlation coefficient. Co-authored-by: Guillaume Lemaitre --- .../tests/test_feature_select.py | 42 ++++++++++++------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index dedbf57a89417..ad51f1dc5da25 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -9,7 +9,7 @@ import pytest -from sklearn.utils._testing import assert_almost_equal +from sklearn.utils._testing import assert_almost_equal, _convert_container from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_warns @@ -84,25 +84,35 @@ def test_f_classif(): assert_array_almost_equal(pv_sparse, pv) -@pytest.mark.parametrize("coeff", [abs_r_regression, r_regression]) -def test_r_regression(coeff): - X, y = make_regression(n_samples=200, n_features=20, n_informative=5, +@pytest.mark.parametrize("center", [True, False]) +def test_r_regression(center): + X, y = make_regression(n_samples=2000, n_features=20, n_informative=5, shuffle=False, random_state=0) - correlation_coeffs = coeff(X, y) - if coeff == r_regression: - assert ((correlation_coeffs > -1).all()) - assert ((correlation_coeffs < 1).all()) + corr_coeffs = r_regression(X, y, center=center) + assert ((-1 < corr_coeffs).all()) + assert ((corr_coeffs < 1).all()) - # with centering, compare with sparse - correlation_coeffs = coeff(X, y, center=True) - correlation_coeffs_sparse = coeff(sparse.csr_matrix(X), y, center=True) - assert_allclose(correlation_coeffs_sparse, correlation_coeffs) + sparse_X = _convert_container(X, "sparse") - # again without centering, compare with sparse - correlation_coeffs = coeff(X, y, center=False) - correlation_coeffs_sparse = coeff(sparse.csr_matrix(X), y, center=False) - assert_allclose(correlation_coeffs_sparse, correlation_coeffs) + sparse_corr_coeffs = r_regression(sparse_X, y, center=center) + assert_allclose(sparse_corr_coeffs, corr_coeffs) + + # Testing against numpy for reference + Z = np.hstack((X, y[:, np.newaxis])) + correlation_matrix = np.corrcoef(Z, rowvar=False) + np_corr_coeffs = correlation_matrix[:-1, -1] + assert_array_almost_equal(np_corr_coeffs, corr_coeffs, decimal=3) + + +@pytest.mark.parametrize("array_like", ["array", "sparse_csr", "sparse_csc"]) +def test_abs_r_regression(array_like): + X, y = make_regression(n_samples=200, n_features=20, n_informative=5, + shuffle=False, random_state=0) + + X = _convert_container(X, array_like) + + assert_allclose(abs_r_regression(X, y), np.abs(r_regression(X, y))) def test_f_regression(): From c83161be4f4c82e837eede2ac9cbc0f880d3946d Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 14 Apr 2021 11:16:44 +0200 Subject: [PATCH 40/43] fixup! Improve wording abs_r_regression --- sklearn/feature_selection/_univariate_selection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index d554ff5300ab2..ba5b9e8438fac 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -259,7 +259,7 @@ def r_regression(X, y, *, center=True): Returns ------- correlation_coefficient : ndarray of shape (n_features,) - Pearson R correlation coefficients of features. + Pearson's R correlation coefficients of features. 
See Also -------- @@ -344,7 +344,7 @@ def f_regression(X, y, *, center=True): -------- abs_r_regression: Absolute value of Pearson's R correlation coefficients between label and features for regression tasks. - r_regression: Pearson R between label/feature for regression tasks. + r_regression: Pearson's R between label/feature for regression tasks. f_classif: ANOVA F-value between label/feature for classification tasks. chi2: Chi-squared stats of non-negative features for classification tasks. SelectKBest: Select features based on the k highest scores. From 3e4743c29362be51c208f8511906cbfadf2db49b Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 16 Apr 2021 10:26:27 +0200 Subject: [PATCH 41/43] Drop `feature_selection.abs_r_regression` See discussions: https://github.com/scikit-learn/scikit-learn/pull/17169#issuecomment-820997349 --- doc/modules/classes.rst | 1 - doc/whats_new/v1.0.rst | 7 +-- sklearn/feature_selection/__init__.py | 2 - .../_univariate_selection.py | 25 ----------- .../tests/test_feature_select.py | 43 ------------------- 5 files changed, 2 insertions(+), 76 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 195d63efa1def..d56914f874b42 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -557,7 +557,6 @@ From text :toctree: generated/ :template: function.rst - feature_selection.abs_r_regression feature_selection.chi2 feature_selection.f_classif feature_selection.f_regression diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index b0e8ca63e78f8..eaf02942cf316 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -106,11 +106,8 @@ Changelog :mod:`sklearn.feature_selection` ................................ -- |Feature| :func:`feature_selection.abs_r_regression` is a new criterion - which can be used with :class:`feature_selection.SelectKBest` to select - variables. It is the absolution values of - :func:`feature_selection.r_regression` which computes Pearson's R correlation - coefficients between the features and the target. +- |Feature| :func:`feature_selection.r_regression` computes Pearson's R + correlation coefficients between the features and the target. :pr:`17169` by `Dmytro Lituiev ` and `Julien Jerphanion `. diff --git a/sklearn/feature_selection/__init__.py b/sklearn/feature_selection/__init__.py index eae46cf21006d..ef894b40065de 100644 --- a/sklearn/feature_selection/__init__.py +++ b/sklearn/feature_selection/__init__.py @@ -9,7 +9,6 @@ from ._univariate_selection import f_oneway from ._univariate_selection import f_regression from ._univariate_selection import r_regression -from ._univariate_selection import abs_r_regression from ._univariate_selection import SelectPercentile from ._univariate_selection import SelectKBest from ._univariate_selection import SelectFpr @@ -42,7 +41,6 @@ 'SelectFromModel', 'SelectPercentile', 'VarianceThreshold', - 'abs_r_regression', 'chi2', 'f_classif', 'f_oneway', diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index ba5b9e8438fac..4cbe4500ffa16 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -263,8 +263,6 @@ def r_regression(X, y, *, center=True): See Also -------- - abs_r_regression: Absolute value of Pearson's R correlation coefficients - between label and features for regression tasks. 
f_regression: Univariate linear regression tests returning f-statistic and p-values mutual_info_regression: Mutual information for a continuous target. @@ -315,9 +313,6 @@ def f_regression(X, y, *, center=True): Note however that contrary to :func:`f_regression`, :func:`r_regression` values lie in [-1, 1] and can thus be negative. - Alternatively, :func:`abs_r_regression` can be used to rank features by - correlation magnitude instead. - Read more in the :ref:`User Guide `. Parameters @@ -342,8 +337,6 @@ def f_regression(X, y, *, center=True): See Also -------- - abs_r_regression: Absolute value of Pearson's R correlation coefficients - between label and features for regression tasks. r_regression: Pearson's R between label/feature for regression tasks. f_classif: ANOVA F-value between label/feature for classification tasks. chi2: Chi-squared stats of non-negative features for classification tasks. @@ -363,22 +356,6 @@ def f_regression(X, y, *, center=True): return f_statistic, p_values -def abs_r_regression(X, y, center=True): - """Absolute value of Pearson's R from univariate linear regressions. - - .. versionadded:: 1.0 - - This convenience wrapper is to be used with - :class:`~sklearn.feature_selection.SelectKBest`. - - See Also - -------- - r_regression: Univariate linear regression tests returning Pearson's R - correlation coefficient. - SelectKBest: Select features based on the k highest scores. - """ - return np.abs(r_regression(X, y, center=center)) - ###################################################################### # Base classes @@ -576,8 +553,6 @@ class SelectKBest(_BaseFilter): f_classif: ANOVA F-value between label/feature for classification tasks. mutual_info_classif: Mutual information for a discrete target. chi2: Chi-squared stats of non-negative features for classification tasks. - abs_r_regression: Absolute value of Pearson's R between label and features - for regression tasks. f_regression: F-value between label/feature for regression tasks. mutual_info_regression: Mutual information for a continuous target. 
SelectPercentile: Select features based on percentile of the highest diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index ad51f1dc5da25..852c8228b2a76 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -19,7 +19,6 @@ from sklearn.datasets import make_classification, make_regression from sklearn.feature_selection import ( - abs_r_regression, chi2, f_classif, f_oneway, @@ -105,16 +104,6 @@ def test_r_regression(center): assert_array_almost_equal(np_corr_coeffs, corr_coeffs, decimal=3) -@pytest.mark.parametrize("array_like", ["array", "sparse_csr", "sparse_csc"]) -def test_abs_r_regression(array_like): - X, y = make_regression(n_samples=200, n_features=20, n_informative=5, - shuffle=False, random_state=0) - - X = _convert_container(X, array_like) - - assert_allclose(abs_r_regression(X, y), np.abs(r_regression(X, y))) - - def test_f_regression(): # Test whether the F test yields meaningful results # on a simple simulated regression problem @@ -141,19 +130,6 @@ def test_f_regression(): assert_allclose(pv_sparse, pv) -def test_f_regression_r_regression_consistency(): - # Test the equivalence of f_regression and abs_r_regression for variable - # selection using the returned values ordering - X, y = make_regression(n_samples=200, n_features=1000, - shuffle=False, random_state=0) - - Fs, _ = f_regression(X, y) - - assert_array_equal(Fs.argsort(), abs_r_regression(X, y).argsort()) - - assert_allclose(abs_r_regression(X, y), np.abs(r_regression(X, y))) - - def test_f_regression_input_dtype(): # Test whether f_regression returns the same value # for any numeric data_type @@ -408,25 +384,6 @@ def test_select_kbest_regression(): assert_array_equal(support, gtruth) -def test_select_kbest_abs_r_regression(): - # Test whether the relative univariate feature selection - # gets the correct items in a simple regression problem - # with the k best heuristic - X, y = make_regression(n_samples=200, n_features=20, n_informative=5, - shuffle=False, random_state=0, noise=10) - - univariate_filter = SelectKBest(abs_r_regression, k=5) - X_r = univariate_filter.fit(X, y).transform(X) - assert_best_scores_kept(univariate_filter) - X_r2 = GenericUnivariateSelect( - f_regression, mode='k_best', param=5).fit(X, y).transform(X) - assert_array_equal(X_r, X_r2) - support = univariate_filter.get_support() - gtruth = np.zeros(20) - gtruth[:5] = 1 - assert_array_equal(support, gtruth) - - def test_select_heuristics_regression(): # Test whether the relative univariate feature selection # gets the correct items in a simple regression problem From fc57caa5eb6f7cfe9d27cfa4c3cc2be07feaa08e Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 20 Apr 2021 18:54:40 +0200 Subject: [PATCH 42/43] Use multi-line docstring (PEP 257) for better integration with IDEs Co-authored-by: Olivier Grisel --- sklearn/feature_selection/_univariate_selection.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 4cbe4500ffa16..fd7f7ac19cc29 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -230,8 +230,9 @@ def chi2(X, y): def r_regression(X, y, *, center=True): - """Compute Pearson's R correlation coefficients between the features and - the target. 
+ """Compute Pearson's r for each features and the target. + + Pearson's r is also known as the Pearson correlation coefficient. .. versionadded:: 1.0 From d153dda1fab7af9017b007e68769e4642978260b Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 20 Apr 2021 18:55:32 +0200 Subject: [PATCH 43/43] Rephrase docstring Co-authored-by: Olivier Grisel --- sklearn/feature_selection/_univariate_selection.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index fd7f7ac19cc29..7fc69a4b13cf2 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -309,10 +309,18 @@ def f_regression(X, y, *, center=True): std(y)) using r_regression function. 2. It is converted to an F score and then to a p-value. - If p-values are not needed, :func:`r_regression` can be used to rank - features as a slightly cheaper alternative to :func:`f_regression`. + :func:`f_regression` is derived from :func:`r_regression` and will rank + features in the same order if all the features are positively correlated + with the target. + Note however that contrary to :func:`f_regression`, :func:`r_regression` - values lie in [-1, 1] and can thus be negative. + values lie in [-1, 1] and can thus be negative. :func:`f_regression` is + therefore recommended as a feature selection criterion to identify + potentially predictive feature for a downstream classifier, irrespective of + the sign of the association with the target variable. + + Furthermore :func:`f_regression` returns p-values while + :func:`r_regression` does not. Read more in the :ref:`User Guide `.
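A minimal usage sketch of the API this patch series converges on (a sketch assuming scikit-learn >= 1.0, where `r_regression` landed with `center` keyword-only and `abs_r_regression` was dropped in PATCH 41; the `abs_r_scores` helper below is an illustrative stand-in, not a scikit-learn function):

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.feature_selection import SelectKBest, f_regression, r_regression

    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           random_state=0)

    # Signed Pearson correlation coefficient per feature, each in [-1, 1].
    corr = r_regression(X, y)

    # F-statistic and p-values derived from the same correlations.
    f_statistic, p_values = f_regression(X, y)

    # Hypothetical replacement for the dropped abs_r_regression: a small
    # score_func wrapper that ranks features by correlation magnitude.
    def abs_r_scores(X, y):
        return np.abs(r_regression(X, y))

    X_top5 = SelectKBest(score_func=abs_r_scores, k=5).fit_transform(X, y)
    print(X_top5.shape)  # (200, 5)

Passing `r_regression` directly to `SelectKBest` would keep only the most positively correlated features, since the selector ranks raw scores; when the sign of the association does not matter, a magnitude-based wrapper such as the one above, or `f_regression` itself (whose F-statistic grows monotonically with the squared correlation), is the appropriate scorer.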