From 2c603ae7fb19fbc85a9fe1967e3abd25ef6cc3bd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 24 Oct 2022 20:10:17 +0200 Subject: [PATCH 1/7] FIX always scale continuous features to unit variance in mutual info --- doc/whats_new/v1.2.rst | 6 +++++ sklearn/feature_selection/_mutual_info.py | 7 +++--- .../tests/test_mutual_info.py | 23 +++++++++++++++++++ 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 68a5d1cfbe61d..49ba276d6593e 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -329,6 +329,12 @@ Changelog :mod:`sklearn.feature_selection` ................................ +- |Fix| fix a bug in :func:`feature_selection.mutual_info_regression` and + :func:`feature_selction.mutual_info_classif`, where the continuous features + in `X` should be scaled to a unit variance independtly if the target `y` is + continuous or discrete. + :pr:`xxx` by :user:`Guillaume Lemaitre ` + :mod:`sklearn.gaussian_process` ............................... diff --git a/sklearn/feature_selection/_mutual_info.py b/sklearn/feature_selection/_mutual_info.py index 3d036d8ee7e0b..2a03eb7dfd2fe 100644 --- a/sklearn/feature_selection/_mutual_info.py +++ b/sklearn/feature_selection/_mutual_info.py @@ -280,10 +280,9 @@ def _estimate_mi( if copy: X = X.copy() - if not discrete_target: - X[:, continuous_mask] = scale( - X[:, continuous_mask], with_mean=False, copy=False - ) + X[:, continuous_mask] = scale( + X[:, continuous_mask], with_mean=False, copy=False + ) # Add small noise to continuous features as advised in Kraskov et. al. X = X.astype(np.float64, copy=False) diff --git a/sklearn/feature_selection/tests/test_mutual_info.py b/sklearn/feature_selection/tests/test_mutual_info.py index af2b733efd62d..dfc49d6f32eee 100644 --- a/sklearn/feature_selection/tests/test_mutual_info.py +++ b/sklearn/feature_selection/tests/test_mutual_info.py @@ -207,3 +207,26 @@ def test_mutual_info_options(global_dtype): assert_allclose(mi_5, mi_6) assert not np.allclose(mi_1, mi_3) + + +def test_mutual_information_symmetry_classif_regression(): + """Check that `mutual_info_classif` and `mutual_info_regression` are + symmetric by switching the target `y` as `feature` in `X` and vice + versa. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/23720 + """ + rng = np.random.RandomState(0) + n = 100 + d, c = rng.randint(10, size=n), rng.normal(0, 1, size=n) + + mi_classif = mutual_info_classif( + c[:, None], d, discrete_features=[False], random_state=123 + ) + + mi_regression = mutual_info_regression( + d[:, None], c, discrete_features=[True], random_state=123 + ) + + assert mi_classif == pytest.approx(mi_regression) From 31d200b178efb95efbda89488d58e391edf08e17 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 24 Oct 2022 20:14:37 +0200 Subject: [PATCH 2/7] DOC update pr number --- doc/whats_new/v1.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 49ba276d6593e..8efadff2b0a7e 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -333,7 +333,7 @@ Changelog :func:`feature_selction.mutual_info_classif`, where the continuous features in `X` should be scaled to a unit variance independtly if the target `y` is continuous or discrete. - :pr:`xxx` by :user:`Guillaume Lemaitre ` + :pr:`24747` by :user:`Guillaume Lemaitre ` :mod:`sklearn.gaussian_process` ............................... From 8fc68247ab3c10f44dd69f3666b11d84f0ed4c2e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 25 Oct 2022 10:19:56 +0200 Subject: [PATCH 3/7] Apply suggestions from code review Co-authored-by: Tim Head --- doc/whats_new/v1.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 8efadff2b0a7e..704f189a9a684 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -331,7 +331,7 @@ Changelog - |Fix| fix a bug in :func:`feature_selection.mutual_info_regression` and :func:`feature_selction.mutual_info_classif`, where the continuous features - in `X` should be scaled to a unit variance independtly if the target `y` is + in `X` should be scaled to a unit variance independently if the target `y` is continuous or discrete. :pr:`24747` by :user:`Guillaume Lemaitre ` From 8d18d3a521a200d1e5e64aa00fea19f0346f5c7f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 25 Oct 2022 10:39:59 +0200 Subject: [PATCH 4/7] address Tim comments --- .../tests/test_mutual_info.py | 25 +++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/sklearn/feature_selection/tests/test_mutual_info.py b/sklearn/feature_selection/tests/test_mutual_info.py index dfc49d6f32eee..95cb3c1bd5615 100644 --- a/sklearn/feature_selection/tests/test_mutual_info.py +++ b/sklearn/feature_selection/tests/test_mutual_info.py @@ -209,7 +209,7 @@ def test_mutual_info_options(global_dtype): assert not np.allclose(mi_1, mi_3) -def test_mutual_information_symmetry_classif_regression(): +def test_mutual_information_symmetry_classif_regression(global_random_seed): """Check that `mutual_info_classif` and `mutual_info_regression` are symmetric by switching the target `y` as `feature` in `X` and vice versa. @@ -217,7 +217,7 @@ def test_mutual_information_symmetry_classif_regression(): Non-regression test for: https://github.com/scikit-learn/scikit-learn/issues/23720 """ - rng = np.random.RandomState(0) + rng = np.random.RandomState(global_random_seed) n = 100 d, c = rng.randint(10, size=n), rng.normal(0, 1, size=n) @@ -230,3 +230,24 @@ def test_mutual_information_symmetry_classif_regression(): ) assert mi_classif == pytest.approx(mi_regression) + + +def test_mutual_info_symmetry_classif_regression_correlated(): + """Check that `mutual_info_classif` and `mutual_info_regression` are + symmetric by switching the target `y` as `feature` in `X` and vice + versa and `X` and `y` are correlated.""" + + rng = np.random.RandomState(0) + n = 100 + d = rng.randint(10, size=n) + c = d.astype(np.float64) + + mi_classif = mutual_info_classif( + c[:, None], d, discrete_features=[False], random_state=123 + ) + + mi_regression = mutual_info_regression( + d[:, None], c, discrete_features=[True], random_state=123 + ) + + assert mi_classif == pytest.approx(mi_regression) From 0965bc25dcb45e86fa914c1725bf916112df2147 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 Oct 2022 11:21:26 +0200 Subject: [PATCH 5/7] Update doc/whats_new/v1.2.rst Co-authored-by: Thomas J. Fan --- doc/whats_new/v1.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 704f189a9a684..9fc85672c331f 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -329,7 +329,7 @@ Changelog :mod:`sklearn.feature_selection` ................................ -- |Fix| fix a bug in :func:`feature_selection.mutual_info_regression` and +- |Fix| Fix a bug in :func:`feature_selection.mutual_info_regression` and :func:`feature_selction.mutual_info_classif`, where the continuous features in `X` should be scaled to a unit variance independently if the target `y` is continuous or discrete. From 0037d2a11f7313c28bdc485c3fd685f880eea0d8 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 16 Nov 2022 10:58:59 +0100 Subject: [PATCH 6/7] address review comments --- .../tests/test_mutual_info.py | 33 +++++-------------- 1 file changed, 9 insertions(+), 24 deletions(-) diff --git a/sklearn/feature_selection/tests/test_mutual_info.py b/sklearn/feature_selection/tests/test_mutual_info.py index 95cb3c1bd5615..f39e4a5738b21 100644 --- a/sklearn/feature_selection/tests/test_mutual_info.py +++ b/sklearn/feature_selection/tests/test_mutual_info.py @@ -209,7 +209,8 @@ def test_mutual_info_options(global_dtype): assert not np.allclose(mi_1, mi_3) -def test_mutual_information_symmetry_classif_regression(global_random_seed): +@pytest.mark.parametrize("correlated", [True, False]) +def test_mutual_information_symmetry_classif_regression(correlated, global_random_seed): """Check that `mutual_info_classif` and `mutual_info_regression` are symmetric by switching the target `y` as `feature` in `X` and vice versa. @@ -219,35 +220,19 @@ def test_mutual_information_symmetry_classif_regression(global_random_seed): """ rng = np.random.RandomState(global_random_seed) n = 100 - d, c = rng.randint(10, size=n), rng.normal(0, 1, size=n) - - mi_classif = mutual_info_classif( - c[:, None], d, discrete_features=[False], random_state=123 - ) - - mi_regression = mutual_info_regression( - d[:, None], c, discrete_features=[True], random_state=123 - ) - - assert mi_classif == pytest.approx(mi_regression) - - -def test_mutual_info_symmetry_classif_regression_correlated(): - """Check that `mutual_info_classif` and `mutual_info_regression` are - symmetric by switching the target `y` as `feature` in `X` and vice - versa and `X` and `y` are correlated.""" - - rng = np.random.RandomState(0) - n = 100 d = rng.randint(10, size=n) - c = d.astype(np.float64) + + if correlated: + c = d.astype(np.float64) + else: + c = rng.normal(0, 1, size=n) mi_classif = mutual_info_classif( - c[:, None], d, discrete_features=[False], random_state=123 + c[:, None], d, discrete_features=[False], random_state=global_random_seed ) mi_regression = mutual_info_regression( - d[:, None], c, discrete_features=[True], random_state=123 + d[:, None], c, discrete_features=[True], random_state=global_random_seed ) assert mi_classif == pytest.approx(mi_regression) From 2fb8d62b5a4ae459e843af5ea909355e6d754541 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 16 Nov 2022 11:05:17 +0100 Subject: [PATCH 7/7] trigger ci ?