Merged
51 commits
a71eb91
r_regression and abs_r_regression added
DSLituiev Feb 13, 2017
88ade53
r_regression and abs_r_regression added
DSLituiev Feb 13, 2017
3a26693
indentation fix
DSLituiev Feb 14, 2017
fbe3f97
whitespace fix
DSLituiev Feb 15, 2017
f4f6a1a
import and tests
DSLituiev Feb 15, 2017
0fdfa13
indentation
DSLituiev Feb 15, 2017
3b5498f
code style
DSLituiev Feb 15, 2017
fa5dfe3
init fix
DSLituiev Feb 15, 2017
5158d3a
Merge branch 'master' into r_regression
DSLituiev Mar 27, 2017
040e877
Merge branch 'master' into r_regression
DSLituiev Mar 30, 2018
76427d8
Merge branch 'master' into r_regression
jjerphan May 9, 2020
f6dcf52
Change assert_true for assert
jjerphan May 9, 2020
63d6fd7
Change module for import to match current one
jjerphan May 9, 2020
1c9d512
Add reference in doc comment
jjerphan May 9, 2020
45514b2
Revert "Change module for import to match current one"
jjerphan May 9, 2020
f7f0bcc
Merge branch 'master' into r_regression
jjerphan May 19, 2020
677953d
Merge branch 'master' into r_regression
jjerphan Jul 25, 2020
c7cbfa1
Merge branch 'main' into r_regression
jjerphan Feb 9, 2021
6c120e3
Merge branch 'main' into r_regression
jjerphan Feb 9, 2021
98383aa
Add documentation reference for abs_r_regression and r_regression
jjerphan Feb 15, 2021
17838de
Complete test to include r_regression
jjerphan Feb 15, 2021
e1a43e8
Remove Sphinx warning on indentation
jjerphan Feb 16, 2021
e5aa831
Use 'See Also' over 'See Also'
jjerphan Feb 16, 2021
bcaae35
Clarify docstrings
jjerphan Feb 16, 2021
8a803ea
Add consistency test
jjerphan Feb 16, 2021
47b2ea9
Test for Pearson R correct support
jjerphan Feb 16, 2021
da96477
fixup! Clarify docstrings
jjerphan Feb 16, 2021
0d5d13a
Merge branch 'main' into r_regression
jjerphan Mar 10, 2021
4a548b5
Add docstring directives for additions in version 1.0
jjerphan Mar 12, 2021
711544c
Fix typo in the whats_new entry
jjerphan Apr 12, 2021
f7fa09d
End sentence with a full stop.
jjerphan Apr 12, 2021
cfe2299
Correct typo in docstring.
jjerphan Apr 12, 2021
92c389d
Correct typo in the whats_new entry.
jjerphan Apr 12, 2021
29bf1d0
Fix syntax in r_regression's docstring.
jjerphan Apr 12, 2021
bbf4179
Sort imports alphabetically
jjerphan Apr 12, 2021
af31e72
Remove useless decorator
jjerphan Apr 12, 2021
69e638c
Use verbose name for the correlation coefficient
jjerphan Apr 12, 2021
f5aa513
Improve r_regression and r_regression docstring
jjerphan Apr 12, 2021
dcd9463
Improve code comments
jjerphan Apr 12, 2021
a16df67
Add Sphinx domains in f_regression's docstring
jjerphan Apr 12, 2021
62b76ad
Make f statistics' and their p-values' computations clearer
jjerphan Apr 12, 2021
107534e
Improve wording abs_r_regression
jjerphan Apr 12, 2021
ce64e78
Use black formatting style for imports
jjerphan Apr 12, 2021
537de96
Remove useless comments
jjerphan Apr 12, 2021
df1ab5b
Prefer assert_allclose over assert_array_almost_equal
jjerphan Apr 12, 2021
6bda200
fixup! Prefer assert_allclose over assert_array_almost_equal
jjerphan Apr 12, 2021
76830bb
Split test into several
jjerphan Apr 12, 2021
c83161b
fixup! Improve wording abs_r_regression
jjerphan Apr 14, 2021
3e4743c
Drop `feature_selection.abs_r_regression`
jjerphan Apr 16, 2021
fc57caa
Use multi-line docstring (PEP 257) for better integration with IDEs
jjerphan Apr 20, 2021
d153dda
Rephrase docstring
jjerphan Apr 20, 2021
1 change: 1 addition & 0 deletions doc/modules/classes.rst
@@ -560,6 +560,7 @@ From text
feature_selection.chi2
feature_selection.f_classif
feature_selection.f_regression
feature_selection.r_regression
feature_selection.mutual_info_classif
feature_selection.mutual_info_regression

8 changes: 8 additions & 0 deletions doc/whats_new/v1.0.rst
@@ -103,6 +103,14 @@ Changelog
input strings would result in negative indices in the transformed data.
:pr:`19035` by :user:`Liu Yu <ly648499246>`.

:mod:`sklearn.feature_selection`
................................

- |Feature| :func:`feature_selection.r_regression` computes Pearson's R
correlation coefficients between the features and the target.
:pr:`17169` by `Dmytro Lituiev <DSLituiev>`
and `Julien Jerphanion <jjerphan>`.

:mod:`sklearn.inspection`
.........................

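A minimal usage sketch of the feature announced in the entry above — toy data only, and it assumes a build of scikit-learn that includes this PR:

from sklearn.datasets import make_regression
from sklearn.feature_selection import r_regression

# Toy regression problem: 5 of the 20 features are informative.
X, y = make_regression(n_samples=500, n_features=20, n_informative=5,
                       random_state=0)

# One Pearson correlation coefficient per feature, each within [-1, 1].
corr_coeffs = r_regression(X, y)
print(corr_coeffs.shape)  # (20,)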
2 changes: 2 additions & 0 deletions sklearn/feature_selection/__init__.py
@@ -8,6 +8,7 @@
from ._univariate_selection import f_classif
from ._univariate_selection import f_oneway
from ._univariate_selection import f_regression
from ._univariate_selection import r_regression
from ._univariate_selection import SelectPercentile
from ._univariate_selection import SelectKBest
from ._univariate_selection import SelectFpr
@@ -44,6 +45,7 @@
'f_classif',
'f_oneway',
'f_regression',
'r_regression',
'mutual_info_classif',
'mutual_info_regression',
'SelectorMixin']
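With the name exported above, the scorer can be plugged into the selectors from the same module. A sketch, not part of this diff; note that, as the f_regression docstring below explains, f_regression is usually preferable as a selection criterion because r_regression scores can be negative:

from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectKBest, r_regression

X, y = make_regression(n_samples=300, n_features=8, n_informative=2,
                       random_state=0)

# SelectKBest keeps the k features with the highest scores; with
# r_regression, strongly negatively correlated features would rank last.
selector = SelectKBest(score_func=r_regression, k=2).fit(X, y)
print(selector.transform(X).shape)  # (300, 2)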
149 changes: 103 additions & 46 deletions sklearn/feature_selection/_univariate_selection.py
@@ -229,83 +229,140 @@ def chi2(X, y):
return _chisquare(observed, expected)


@_deprecate_positional_args
def f_regression(X, y, *, center=True):
"""Univariate linear regression tests.
def r_regression(X, y, *, center=True):
"""Compute Pearson's r for each features and the target.

Pearson's r is also known as the Pearson correlation coefficient.

.. versionadded:: 1.0

Linear model for testing the individual effect of each of many regressors.
This is a scoring function to be used in a feature selection procedure, not
a free standing feature selection procedure.

This is done in 2 steps:

1. The correlation between each regressor and the target is computed,
that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) *
std(y)).
2. It is converted to an F score then to a p-value.
The cross correlation between each regressor and the target is computed
as ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) * std(y)).

For more on usage see the :ref:`User Guide <univariate_feature_selection>`.

Parameters
----------
X : {array-like, sparse matrix} shape = (n_samples, n_features)
The set of regressors that will be tested sequentially.
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The data matrix.

y : array of shape(n_samples).
The data matrix
y : array-like of shape (n_samples,)
The target vector.

center : bool, default=True
If true, X and y will be centered.
Whether or not to center the data matrix `X` and the target vector `y`.
By default, `X` and `y` will be centered.

Returns
-------
F : array, shape=(n_features,)
F values of features.

pval : array, shape=(n_features,)
p-values of F-scores.
correlation_coefficient : ndarray of shape (n_features,)
Pearson's R correlation coefficients of features.

See Also
--------
mutual_info_regression : Mutual information for a continuous target.
f_classif : ANOVA F-value between label/feature for classification tasks.
chi2 : Chi-squared stats of non-negative features for classification tasks.
SelectKBest : Select features based on the k highest scores.
SelectFpr : Select features based on a false positive rate test.
SelectFdr : Select features based on an estimated false discovery rate.
SelectFwe : Select features based on family-wise error rate.
SelectPercentile : Select features based on percentile of the highest
scores.
f_regression: Univariate linear regression tests returning F-statistic
and p-values.
mutual_info_regression: Mutual information for a continuous target.
f_classif: ANOVA F-value between label/feature for classification tasks.
chi2: Chi-squared stats of non-negative features for classification tasks.
"""
X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
dtype=np.float64)
n_samples = X.shape[0]

# compute centered values
# note that E[(x - mean(x))*(y - mean(y))] = E[x*(y - mean(y))], so we
# Compute centered values
# Note that E[(x - mean(x))*(y - mean(y))] = E[x*(y - mean(y))], so we
# need not center X
if center:
y = y - np.mean(y)
if issparse(X):
X_means = X.mean(axis=0).getA1()
else:
X_means = X.mean(axis=0)
# compute the scaled standard deviations via moments
# Compute the scaled standard deviations via moments
X_norms = np.sqrt(row_norms(X.T, squared=True) -
n_samples * X_means ** 2)
else:
X_norms = row_norms(X.T)

# compute the correlation
corr = safe_sparse_dot(y, X)
corr /= X_norms
corr /= np.linalg.norm(y)
correlation_coefficient = safe_sparse_dot(y, X)
correlation_coefficient /= X_norms
correlation_coefficient /= np.linalg.norm(y)
return correlation_coefficient


@_deprecate_positional_args
def f_regression(X, y, *, center=True):
"""Univariate linear regression tests returning F-statistic and p-values.

Quick linear model for testing the effect of a single regressor,
sequentially for many regressors.

This is done in 2 steps:

1. The cross correlation between each regressor and the target is computed,
that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) *
std(y)) using the r_regression function.
2. It is converted to an F score and then to a p-value.

:func:`f_regression` is derived from :func:`r_regression` and will rank
features in the same order if all the features are positively correlated
with the target.

Note however that contrary to :func:`f_regression`, :func:`r_regression`
values lie in [-1, 1] and can thus be negative. :func:`f_regression` is
therefore recommended as a feature selection criterion to identify
potentially predictive features for a downstream classifier, irrespective of
the sign of the association with the target variable.

Furthermore :func:`f_regression` returns p-values while
:func:`r_regression` does not.

Read more in the :ref:`User Guide <univariate_feature_selection>`.

Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The data matrix.

y : array-like of shape (n_samples,)
The target vector.

center : bool, default=True
Whether or not to center the data matrix `X` and the target vector `y`.
By default, `X` and `y` will be centered.

Returns
-------
f_statistic : ndarray of shape (n_features,)
F-statistic for each feature.

p_values : ndarray of shape (n_features,)
P-values associated with the F-statistic.

See Also
--------
r_regression: Pearson's R between label/feature for regression tasks.
f_classif: ANOVA F-value between label/feature for classification tasks.
chi2: Chi-squared stats of non-negative features for classification tasks.
SelectKBest: Select features based on the k highest scores.
SelectFpr: Select features based on a false positive rate test.
SelectFdr: Select features based on an estimated false discovery rate.
SelectFwe: Select features based on family-wise error rate.
SelectPercentile: Select features based on percentile of the highest
scores.
"""
correlation_coefficient = r_regression(X, y, center=center)
deg_of_freedom = y.size - (2 if center else 1)

# convert to p-value
degrees_of_freedom = y.size - (2 if center else 1)
F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
pv = stats.f.sf(F, 1, degrees_of_freedom)
return F, pv
corr_coef_squared = correlation_coefficient ** 2
f_statistic = corr_coef_squared / (1 - corr_coef_squared) * deg_of_freedom
p_values = stats.f.sf(f_statistic, 1, deg_of_freedom)
return f_statistic, p_values


######################################################################
@@ -502,12 +559,12 @@ class SelectKBest(_BaseFilter):

See Also
--------
f_classif : ANOVA F-value between label/feature for classification tasks.
mutual_info_classif : Mutual information for a discrete target.
chi2 : Chi-squared stats of non-negative features for classification tasks.
f_regression : F-value between label/feature for regression tasks.
mutual_info_regression : Mutual information for a continuous target.
SelectPercentile : Select features based on percentile of the highest
f_classif: ANOVA F-value between label/feature for classification tasks.
mutual_info_classif: Mutual information for a discrete target.
chi2: Chi-squared stats of non-negative features for classification tasks.
f_regression: F-value between label/feature for regression tasks.
mutual_info_regression: Mutual information for a continuous target.
SelectPercentile: Select features based on percentile of the highest
scores.
SelectFpr : Select features based on a false positive rate test.
SelectFdr : Select features based on an estimated false discovery rate.
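The F-statistic and p-value computation added above can be reproduced by hand. A sketch under the default center=True convention (two estimated parameters, hence y.size - 2 degrees of freedom), again assuming this branch is installed:

import numpy as np
from scipy import stats
from sklearn.datasets import make_regression
from sklearn.feature_selection import f_regression, r_regression

X, y = make_regression(n_samples=200, n_features=5, random_state=0)

# Step 1: Pearson correlation between each feature and the target.
r = r_regression(X, y)

# Step 2: squared correlation -> F-statistic -> p-value.
dof = y.size - 2  # center=True estimates a mean for both X and y
f_stat = r ** 2 / (1 - r ** 2) * dof
p_vals = stats.f.sf(f_stat, 1, dof)

# f_regression performs exactly these two steps internally.
f_ref, p_ref = f_regression(X, y)
np.testing.assert_allclose(f_stat, f_ref)
np.testing.assert_allclose(p_vals, p_ref)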
59 changes: 46 additions & 13 deletions sklearn/feature_selection/tests/test_feature_select.py
@@ -4,11 +4,12 @@
import itertools
import warnings
import numpy as np
from numpy.testing import assert_allclose
from scipy import stats, sparse

import pytest

from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_almost_equal, _convert_container
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_warns
@@ -18,9 +19,20 @@

from sklearn.datasets import make_classification, make_regression
from sklearn.feature_selection import (
chi2, f_classif, f_oneway, f_regression, mutual_info_classif,
mutual_info_regression, SelectPercentile, SelectKBest, SelectFpr,
SelectFdr, SelectFwe, GenericUnivariateSelect)
chi2,
f_classif,
f_oneway,
f_regression,
GenericUnivariateSelect,
mutual_info_classif,
mutual_info_regression,
r_regression,
SelectPercentile,
SelectKBest,
SelectFpr,
SelectFdr,
SelectFwe,
)


##############################################################################
@@ -71,6 +83,27 @@ def test_f_classif():
assert_array_almost_equal(pv_sparse, pv)


@pytest.mark.parametrize("center", [True, False])
def test_r_regression(center):
X, y = make_regression(n_samples=2000, n_features=20, n_informative=5,
shuffle=False, random_state=0)

corr_coeffs = r_regression(X, y, center=center)
assert ((-1 < corr_coeffs).all())
assert ((corr_coeffs < 1).all())

sparse_X = _convert_container(X, "sparse")

sparse_corr_coeffs = r_regression(sparse_X, y, center=center)
assert_allclose(sparse_corr_coeffs, corr_coeffs)

# Testing against numpy for reference
Z = np.hstack((X, y[:, np.newaxis]))
correlation_matrix = np.corrcoef(Z, rowvar=False)
np_corr_coeffs = correlation_matrix[:-1, -1]
assert_array_almost_equal(np_corr_coeffs, corr_coeffs, decimal=3)


def test_f_regression():
# Test whether the F test yields meaningful results
# on a simple simulated regression problem
@@ -87,14 +120,14 @@ def test_f_regression():
# with centering, compare with sparse
F, pv = f_regression(X, y, center=True)
F_sparse, pv_sparse = f_regression(sparse.csr_matrix(X), y, center=True)
assert_array_almost_equal(F_sparse, F)
assert_array_almost_equal(pv_sparse, pv)
assert_allclose(F_sparse, F)
assert_allclose(pv_sparse, pv)

# again without centering, compare with sparse
F, pv = f_regression(X, y, center=False)
F_sparse, pv_sparse = f_regression(sparse.csr_matrix(X), y, center=False)
assert_array_almost_equal(F_sparse, F)
assert_array_almost_equal(pv_sparse, pv)
assert_allclose(F_sparse, F)
assert_allclose(pv_sparse, pv)


def test_f_regression_input_dtype():
@@ -106,8 +139,8 @@ def test_f_regression_input_dtype():

F1, pv1 = f_regression(X, y)
F2, pv2 = f_regression(X, y.astype(float))
assert_array_almost_equal(F1, F2, 5)
assert_array_almost_equal(pv1, pv2, 5)
assert_allclose(F1, F2, 5)
assert_allclose(pv1, pv2, 5)


def test_f_regression_center():
@@ -123,7 +156,7 @@ def test_f_regression_center():

F1, _ = f_regression(X, Y, center=True)
F2, _ = f_regression(X, Y, center=False)
assert_array_almost_equal(F1 * (n_samples - 1.) / (n_samples - 2.), F2)
assert_allclose(F1 * (n_samples - 1.) / (n_samples - 2.), F2)
assert_almost_equal(F2[0], 0.232558139) # value from statsmodels OLS


@@ -262,7 +295,7 @@ def test_select_heuristics_classif():
f_classif, mode=mode, param=0.01).fit(X, y).transform(X)
assert_array_equal(X_r, X_r2)
support = univariate_filter.get_support()
assert_array_almost_equal(support, gtruth)
assert_allclose(support, gtruth)


##############################################################################
@@ -272,7 +305,7 @@ def test_select_heuristics_classif():
def assert_best_scores_kept(score_filter):
scores = score_filter.scores_
support = score_filter.get_support()
assert_array_almost_equal(np.sort(scores[support]),
assert_allclose(np.sort(scores[support]),
np.sort(scores)[-support.sum():])

