From 44604d9442c6fcc72e8caa131c059e378e057c5b Mon Sep 17 00:00:00 2001 From: Surya Prakash Date: Sun, 20 Dec 2020 14:05:19 +0530 Subject: [PATCH 01/15] FIX- Incorrect warning when clustering boolean data scikit-learn#18996 --- sklearn/cluster/_optics.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index 8998963704562..b6785693ebee7 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -14,6 +14,8 @@ import warnings import numpy as np +from ..exceptions import DataConversionWarning +from ..metrics.pairwise import PAIRWISE_BOOLEAN_FUNCTIONS from ..utils import gen_batches, get_chunk_n_rows from ..utils.validation import _deprecate_positional_args from ..neighbors import NearestNeighbors @@ -243,7 +245,13 @@ def fit(self, X, y=None): self : instance of OPTICS The instance. """ - X = self._validate_data(X, dtype=float) + + dtype = bool if self.metric in PAIRWISE_BOOLEAN_FUNCTIONS else float + if dtype == bool and X.dtype != bool: + msg = "Data was converted to boolean for metric %s" % self.metric + warnings.warn(msg, DataConversionWarning) + + X = self._validate_data(X, dtype=dtype) if self.cluster_method not in ['dbscan', 'xi']: raise ValueError("cluster_method should be one of" From 1c19aa2edfb2611c4c6313dc1b23c5a6b4e47bf3 Mon Sep 17 00:00:00 2001 From: prakashsur <46415184+prakashsur@users.noreply.github.com> Date: Sun, 20 Dec 2020 16:11:44 +0530 Subject: [PATCH 02/15] Update warning message as part of fixing #18996 Co-authored-by: Nicolas Hug --- sklearn/cluster/_optics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index b6785693ebee7..645f826ed83ab 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -248,7 +248,7 @@ def fit(self, X, y=None): dtype = bool if self.metric in PAIRWISE_BOOLEAN_FUNCTIONS else float if dtype == bool and X.dtype != bool: - msg = 
"Data was converted to boolean for metric %s" % self.metric + msg = "Data will be converted to boolean for metric %s" % self.metric warnings.warn(msg, DataConversionWarning) X = self._validate_data(X, dtype=dtype) From c77b192fe3a5d7cad40ef439d152b48b47e88c6f Mon Sep 17 00:00:00 2001 From: Surya Prakash Date: Sun, 20 Dec 2020 21:25:25 +0530 Subject: [PATCH 03/15] FIX- Incorrect warning when clustering boolean data scikit-learn#18996 --- sklearn/cluster/_optics.py | 3 ++- sklearn/cluster/tests/test_optics.py | 34 ++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index 645f826ed83ab..d819a17fa81f5 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -248,7 +248,8 @@ def fit(self, X, y=None): dtype = bool if self.metric in PAIRWISE_BOOLEAN_FUNCTIONS else float if dtype == bool and X.dtype != bool: - msg = "Data will be converted to boolean for metric %s" % self.metric + msg = f"Data will be converted to boolean for metric {self.metric}," \ + f" to avoid this warning, you may convert the data prior to calling fit." 
warnings.warn(msg, DataConversionWarning) X = self._validate_data(X, dtype=dtype) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 03ca4995c0446..66130eb85f71b 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -8,6 +8,7 @@ from sklearn.datasets import make_blobs from sklearn.cluster import OPTICS from sklearn.cluster._optics import _extend_region, _extract_xi_labels +from sklearn.exceptions import DataConversionWarning from sklearn.metrics.cluster import contingency_matrix from sklearn.metrics.pairwise import pairwise_distances from sklearn.cluster import DBSCAN @@ -209,6 +210,39 @@ def test_bad_reachability(): clust.fit(X) +def test_nowarn_if_metric_bool_data_bool(): + pairwise_metric = 'rogerstanimoto' + X = np.random.randint(2, size=(5, 2), dtype=np.bool) + + with pytest.warns(None) as warn_record: + OPTICS(metric=pairwise_metric).fit(X) + assert len(warn_record) == 0 + + +def test_warn_if_metric_bool_data_no_bool(): + pairwise_metric = 'rogerstanimoto' + X = np.random.randint(2, size=(5, 2), dtype=np.int) + msg = f"Data will be converted to boolean for metric {pairwise_metric}," \ + " to avoid this warning, you may convert the data prior to calling fit." 
+ + with pytest.warns(DataConversionWarning, match=msg) as warn_record: + OPTICS(metric=pairwise_metric).fit(X) + assert len(warn_record) == 1 + + +def test_nowarn_if_metric_no_bool(): + pairwise_metric = 'minkowski' + X_bool = np.random.randint(2, size=(5, 2), dtype=np.bool) + X_num = np.random.randint(2, size=(5, 2), dtype=np.int) + + with pytest.warns(None) as warn_record: + # fit boolean data + OPTICS(metric=pairwise_metric).fit(X_bool) + # fit numeric data + OPTICS(metric=pairwise_metric).fit(X_num) + assert len(warn_record) == 0 + + def test_close_extract(): # Test extract where extraction eps is close to scaled max_eps From 0d782c72f95640ea3d26e0d9a4e42d8d97ca2c84 Mon Sep 17 00:00:00 2001 From: Surya Prakash Date: Sun, 20 Dec 2020 21:44:12 +0530 Subject: [PATCH 04/15] FIX- (Update) Incorrect warning when clustering boolean data scikit-learn#18996 --- sklearn/cluster/_optics.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index d819a17fa81f5..e75f52dfa4567 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -248,8 +248,9 @@ def fit(self, X, y=None): dtype = bool if self.metric in PAIRWISE_BOOLEAN_FUNCTIONS else float if dtype == bool and X.dtype != bool: - msg = f"Data will be converted to boolean for metric {self.metric}," \ - f" to avoid this warning, you may convert the data prior to calling fit." + msg = f"Data will be converted to boolean for" \ + f" metric {self.metric}, to avoid this warning," \ + f" you may convert the data prior to calling fit." 
warnings.warn(msg, DataConversionWarning) X = self._validate_data(X, dtype=dtype) From ad65ce5ba075d508aeafff2b433e6a75eea0eafa Mon Sep 17 00:00:00 2001 From: Surya Prakash Date: Sun, 20 Dec 2020 21:50:10 +0530 Subject: [PATCH 05/15] FIX- (Update) Incorrect warning when clustering boolean data scikit-learn#18996 --- sklearn/cluster/tests/test_optics.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 66130eb85f71b..892e7c0d9144b 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -222,8 +222,9 @@ def test_nowarn_if_metric_bool_data_bool(): def test_warn_if_metric_bool_data_no_bool(): pairwise_metric = 'rogerstanimoto' X = np.random.randint(2, size=(5, 2), dtype=np.int) - msg = f"Data will be converted to boolean for metric {pairwise_metric}," \ - " to avoid this warning, you may convert the data prior to calling fit." + msg = f"Data will be converted to boolean for" \ + f" metric {pairwise_metric}, to avoid this warning" \ + f", you may convert the data prior to calling fit." with pytest.warns(DataConversionWarning, match=msg) as warn_record: OPTICS(metric=pairwise_metric).fit(X) From 21f110cc37459a8c986784a2a5195ca39a2b5fe5 Mon Sep 17 00:00:00 2001 From: Surya Prakash Date: Mon, 21 Dec 2020 22:41:44 +0530 Subject: [PATCH 06/15] DOC minor edit to documentation of plot_randomized_search.py --- examples/model_selection/plot_randomized_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/model_selection/plot_randomized_search.py b/examples/model_selection/plot_randomized_search.py index ff5b51837ed2a..79422cbbc1e6e 100644 --- a/examples/model_selection/plot_randomized_search.py +++ b/examples/model_selection/plot_randomized_search.py @@ -12,7 +12,7 @@ parameters. The result in parameter settings is quite similar, while the run time for randomized search is drastically lower. 
-The performance is may slightly worse for the randomized search, and is likely +The performance may be slightly worse for the randomized search, and is likely due to a noise effect and would not carry over to a held-out test set. Note that in practice, one would not search over this many different parameters From d03a52db280abf984545cdb9b47f28f3c33cedac Mon Sep 17 00:00:00 2001 From: Surya Prakash Date: Mon, 21 Dec 2020 22:58:08 +0530 Subject: [PATCH 07/15] Revert "DOC minor edit to documentation of plot_randomized_search.py" This reverts commit 21f110cc37459a8c986784a2a5195ca39a2b5fe5. --- examples/model_selection/plot_randomized_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/model_selection/plot_randomized_search.py b/examples/model_selection/plot_randomized_search.py index 79422cbbc1e6e..ff5b51837ed2a 100644 --- a/examples/model_selection/plot_randomized_search.py +++ b/examples/model_selection/plot_randomized_search.py @@ -12,7 +12,7 @@ parameters. The result in parameter settings is quite similar, while the run time for randomized search is drastically lower. -The performance may be slightly worse for the randomized search, and is likely +The performance is may slightly worse for the randomized search, and is likely due to a noise effect and would not carry over to a held-out test set. 
Note that in practice, one would not search over this many different parameters From 18fa4472fb37299fefa4af97d7b1156ee8371f2d Mon Sep 17 00:00:00 2001 From: Surya Prakash <46415184+jdsurya@users.noreply.github.com> Date: Mon, 18 Jan 2021 11:51:09 +0530 Subject: [PATCH 08/15] Update sklearn/cluster/_optics.py Co-authored-by: Nicolas Hug --- sklearn/cluster/_optics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index e75f52dfa4567..11893dbd70520 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -248,9 +248,9 @@ def fit(self, X, y=None): dtype = bool if self.metric in PAIRWISE_BOOLEAN_FUNCTIONS else float if dtype == bool and X.dtype != bool: - msg = f"Data will be converted to boolean for" \ - f" metric {self.metric}, to avoid this warning," \ - f" you may convert the data prior to calling fit." + msg = (f"Data will be converted to boolean for" + f" metric {self.metric}, to avoid this warning," + f" you may convert the data prior to calling fit.") warnings.warn(msg, DataConversionWarning) X = self._validate_data(X, dtype=dtype) From 6d9a6daa3da4ee0edf87bef25e1f218c2930bd74 Mon Sep 17 00:00:00 2001 From: Surya Prakash <46415184+jdsurya@users.noreply.github.com> Date: Mon, 18 Jan 2021 11:51:25 +0530 Subject: [PATCH 09/15] Update sklearn/cluster/tests/test_optics.py Co-authored-by: Nicolas Hug --- sklearn/cluster/tests/test_optics.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 892e7c0d9144b..38f1eba941959 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -232,6 +232,8 @@ def test_warn_if_metric_bool_data_no_bool(): def test_nowarn_if_metric_no_bool(): + # make sure no conversion warning is raised if + # metric isn't boolean, no matter what the data type is pairwise_metric = 'minkowski' X_bool = np.random.randint(2, size=(5, 2), 
dtype=np.bool) X_num = np.random.randint(2, size=(5, 2), dtype=np.int) From 76f0858cf98eeacfb42bf09c2e9e4fbe74dc25bf Mon Sep 17 00:00:00 2001 From: Surya Prakash <46415184+jdsurya@users.noreply.github.com> Date: Mon, 18 Jan 2021 11:51:49 +0530 Subject: [PATCH 10/15] Update sklearn/cluster/tests/test_optics.py Co-authored-by: Nicolas Hug --- sklearn/cluster/tests/test_optics.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 38f1eba941959..f5b466506d58e 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -220,6 +220,9 @@ def test_nowarn_if_metric_bool_data_bool(): def test_warn_if_metric_bool_data_no_bool(): + # make sure a *single* conversion warning is raised if metric is boolean + # but data isn't + # non regression test for https://github.com/scikit-learn/scikit-learn/issues/18996 pairwise_metric = 'rogerstanimoto' X = np.random.randint(2, size=(5, 2), dtype=np.int) msg = f"Data will be converted to boolean for" \ From a17b40eaf86562f53660837b34b97c4990dca790 Mon Sep 17 00:00:00 2001 From: Surya Prakash <46415184+jdsurya@users.noreply.github.com> Date: Mon, 18 Jan 2021 11:52:37 +0530 Subject: [PATCH 11/15] Update sklearn/cluster/tests/test_optics.py Co-authored-by: Nicolas Hug --- sklearn/cluster/tests/test_optics.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index f5b466506d58e..0965bfb43c2ab 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -211,6 +211,8 @@ def test_bad_reachability(): def test_nowarn_if_metric_bool_data_bool(): + # make sure no warning is raised if metric and data are both boolean + # non-regression test for https://github.com/scikit-learn/scikit-learn/issues/18996 pairwise_metric = 'rogerstanimoto' X = np.random.randint(2, size=(5, 2), dtype=np.bool) From 
ddb8436a9a5e517dea46e615be029728ed6301e9 Mon Sep 17 00:00:00 2001 From: Surya Prakash Date: Mon, 18 Jan 2021 12:52:42 +0530 Subject: [PATCH 12/15] FIX- (Update) Incorrect warning when clustering boolean data scikit-learn#18996 --- sklearn/cluster/tests/test_optics.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 0965bfb43c2ab..aaf1f9e9585eb 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -222,14 +222,12 @@ def test_nowarn_if_metric_bool_data_bool(): def test_warn_if_metric_bool_data_no_bool(): - # make sure a *single* conversion warning is raised if metric is boolean - # but data isn't - # non regression test for https://github.com/scikit-learn/scikit-learn/issues/18996 + # make sure a *single* conversion warning is raised if metric is boolean + # but data isn't + # non regression test for https://github.com/scikit-learn/scikit-learn/issues/18996 pairwise_metric = 'rogerstanimoto' X = np.random.randint(2, size=(5, 2), dtype=np.int) - msg = f"Data will be converted to boolean for" \ - f" metric {pairwise_metric}, to avoid this warning" \ - f", you may convert the data prior to calling fit." 
+ msg = f"Data will be converted to boolean for metric {pairwise_metric}" with pytest.warns(DataConversionWarning, match=msg) as warn_record: OPTICS(metric=pairwise_metric).fit(X) @@ -237,8 +235,8 @@ def test_warn_if_metric_bool_data_no_bool(): def test_nowarn_if_metric_no_bool(): - # make sure no conversion warning is raised if - # metric isn't boolean, no matter what the data type is + # make sure no conversion warning is raised if + # metric isn't boolean, no matter what the data type is pairwise_metric = 'minkowski' X_bool = np.random.randint(2, size=(5, 2), dtype=np.bool) X_num = np.random.randint(2, size=(5, 2), dtype=np.int) From 112a480b992d64dab985f43566b9452ea04f9e1d Mon Sep 17 00:00:00 2001 From: Surya Prakash Date: Mon, 18 Jan 2021 15:05:21 +0530 Subject: [PATCH 13/15] FIX- (Update) Incorrect warning when clustering boolean data #18996 --- sklearn/cluster/tests/test_optics.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index aaf1f9e9585eb..128af98d08497 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -212,7 +212,9 @@ def test_bad_reachability(): def test_nowarn_if_metric_bool_data_bool(): # make sure no warning is raised if metric and data are both boolean - # non-regression test for https://github.com/scikit-learn/scikit-learn/issues/18996 + # non-regression test for + # https://github.com/scikit-learn/scikit-learn/issues/18996 + pairwise_metric = 'rogerstanimoto' X = np.random.randint(2, size=(5, 2), dtype=np.bool) @@ -224,7 +226,9 @@ def test_nowarn_if_metric_bool_data_bool(): def test_warn_if_metric_bool_data_no_bool(): # make sure a *single* conversion warning is raised if metric is boolean # but data isn't - # non regression test for https://github.com/scikit-learn/scikit-learn/issues/18996 + # non-regression test for + # https://github.com/scikit-learn/scikit-learn/issues/18996 + pairwise_metric 
= 'rogerstanimoto' X = np.random.randint(2, size=(5, 2), dtype=np.int) msg = f"Data will be converted to boolean for metric {pairwise_metric}" From 2faec75500b4eac5ffe881a845b53cca0dbdc2a4 Mon Sep 17 00:00:00 2001 From: Surya Prakash Date: Thu, 21 Jan 2021 22:44:47 +0530 Subject: [PATCH 14/15] DOC add entry in whats new for fix of incorrect multiple data-conversion warnings --- doc/whats_new/v1.0.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 154c32617c4ba..d133e75c43878 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -70,6 +70,10 @@ Changelog in multicore settings. :pr:`19052` by :user:`Yusuke Nagasaka <YusukeNagasaka>`. +- |Fix| Fixes incorrect multiple data-conversion warnings when clustering + boolean data. :pr:`19046` by :user:`Surya Prakash <jdsurya>` and + :user:`Nicolas Hug <NicolasHug>`. + :mod:`sklearn.linear_model` ........................... From 478306db4bf89b8c1ba10c285ca78418b9515be3 Mon Sep 17 00:00:00 2001 From: Surya Prakash <46415184+jdsurya@users.noreply.github.com> Date: Thu, 21 Jan 2021 23:00:40 +0530 Subject: [PATCH 15/15] Update doc/whats_new/v1.0.rst Co-authored-by: Nicolas Hug --- doc/whats_new/v1.0.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index d133e75c43878..78dc95026c45e 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -71,8 +71,7 @@ Changelog :user:`Yusuke Nagasaka <YusukeNagasaka>`. - |Fix| Fixes incorrect multiple data-conversion warnings when clustering - boolean data. :pr:`19046` by :user:`Surya Prakash <jdsurya>` and - :user:`Nicolas Hug <NicolasHug>`. + boolean data. :pr:`19046` by :user:`Surya Prakash <jdsurya>`. :mod:`sklearn.linear_model` ...........................