From 94c1a794b87cbf934068aaa990f30d777c6fc542 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Mon, 24 Jul 2023 13:20:50 -0400 Subject: [PATCH 1/5] MNT Improve robustness of sparse test in `HDBSCAN` --- sklearn/cluster/_hdbscan/hdbscan.py | 1 + sklearn/cluster/tests/test_hdbscan.py | 31 ++++++++++++++++++++------- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 57de8962250b1..0c92f251373d2 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -732,6 +732,7 @@ def fit(self, X, y=None): X = X[finite_index] elif issparse(X): # Handle sparse precomputed distance matrices separately + print(f"\nDEBUG *** {X.data=}") X = self._validate_data( X, accept_sparse=["csr", "lil"], diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index c0c281ce31475..47b31d8480f50 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -288,21 +288,36 @@ def test_hdbscan_sparse(): """ Tests that HDBSCAN works correctly when passing sparse feature data. """ - sparse_X = sparse.csr_matrix(X) - labels = HDBSCAN().fit(sparse_X).labels_ - n_clusters = len(set(labels) - OUTLIER_SET) + dense_labels = HDBSCAN().fit(X).labels_ + n_clusters = len(set(dense_labels) - OUTLIER_SET) assert n_clusters == 3 - sparse_X_nan = sparse_X.copy() - sparse_X_nan[0, 0] = np.nan - labels = HDBSCAN().fit(sparse_X_nan).labels_ - n_clusters = len(set(labels) - OUTLIER_SET) + X_sparse = sparse.csr_matrix(X) + sparse_labels = HDBSCAN().fit(X_sparse).labels_ + n_clusters = len(set(sparse_labels) - OUTLIER_SET) assert n_clusters == 3 + assert_array_equal(dense_labels, sparse_labels) + + for outlier_val, outlier_type in zip((np.inf, np.nan), ("infinite", "missing")): + _X = X.copy() + _X[0, 0] = outlier_val + dense_labels = HDBSCAN().fit(_X).labels_ + n_clusters = len(set(dense_labels) - OUTLIER_SET) + assert n_clusters == 3 + assert dense_labels[0] == _OUTLIER_ENCODING[outlier_type]["label"] + + _X_sparse = X_sparse.copy() + _X_sparse[0, 0] = outlier_val + sparse_labels = HDBSCAN().fit(_X_sparse).labels_ + n_clusters = len(set(sparse_labels) - OUTLIER_SET) + assert n_clusters == 3 + assert sparse_labels[0] == _OUTLIER_ENCODING[outlier_type]["label"] + assert_array_equal(dense_labels, sparse_labels) msg = "Sparse data matrices only support algorithm `brute`." with pytest.raises(ValueError, match=msg): - HDBSCAN(metric="euclidean", algorithm="balltree").fit(sparse_X) + HDBSCAN(metric="euclidean", algorithm="balltree").fit(_X_sparse) @pytest.mark.parametrize("algorithm", ALGORITHMS) From b3be4ce7dab6f01b0e24207fa00eb712bbf3ebba Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Mon, 24 Jul 2023 13:37:57 -0400 Subject: [PATCH 2/5] Removed old debug and changed variable name --- sklearn/cluster/_hdbscan/hdbscan.py | 1 - sklearn/cluster/tests/test_hdbscan.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 0c92f251373d2..57de8962250b1 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -732,7 +732,6 @@ def fit(self, X, y=None): X = X[finite_index] elif issparse(X): # Handle sparse precomputed distance matrices separately - print(f"\nDEBUG *** {X.data=}") X = self._validate_data( X, accept_sparse=["csr", "lil"], diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index 47b31d8480f50..2f6146e48c909 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -317,7 +317,7 @@ def test_hdbscan_sparse(): msg = "Sparse data matrices only support algorithm `brute`." with pytest.raises(ValueError, match=msg): - HDBSCAN(metric="euclidean", algorithm="balltree").fit(_X_sparse) + HDBSCAN(metric="euclidean", algorithm="balltree").fit(X_sparse) @pytest.mark.parametrize("algorithm", ALGORITHMS) From c690c19021c371097be0d9bc2e640b8ce11b33a4 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 25 Jul 2023 12:00:56 -0400 Subject: [PATCH 3/5] Updated docstring and added inline comment --- sklearn/cluster/tests/test_hdbscan.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index 2f6146e48c909..a0c468c0edd7b 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -287,29 +287,33 @@ def test_hdbscan_precomputed_non_brute(tree): def test_hdbscan_sparse(): """ Tests that HDBSCAN works correctly when passing sparse feature data. + Evaluates correctness by comparing against the same data passed as a dense + array. """ dense_labels = HDBSCAN().fit(X).labels_ n_clusters = len(set(dense_labels) - OUTLIER_SET) assert n_clusters == 3 - X_sparse = sparse.csr_matrix(X) + _X_sparse = sparse.csr_matrix(X) + X_sparse = _X_sparse.copy() sparse_labels = HDBSCAN().fit(X_sparse).labels_ n_clusters = len(set(sparse_labels) - OUTLIER_SET) assert n_clusters == 3 assert_array_equal(dense_labels, sparse_labels) + # Compare that the sparse and dense non-precomputed routines return the same labels for outlier_val, outlier_type in zip((np.inf, np.nan), ("infinite", "missing")): - _X = X.copy() - _X[0, 0] = outlier_val - dense_labels = HDBSCAN().fit(_X).labels_ + X_dense = X.copy() + X_dense[0, 0] = outlier_val + dense_labels = HDBSCAN().fit(X_dense).labels_ n_clusters = len(set(dense_labels) - OUTLIER_SET) assert n_clusters == 3 assert dense_labels[0] == _OUTLIER_ENCODING[outlier_type]["label"] - _X_sparse = X_sparse.copy() - _X_sparse[0, 0] = outlier_val - sparse_labels = HDBSCAN().fit(_X_sparse).labels_ + X_sparse = _X_sparse.copy() + X_sparse[0, 0] = outlier_val + sparse_labels = HDBSCAN().fit(X_sparse).labels_ n_clusters = len(set(sparse_labels) - OUTLIER_SET) assert n_clusters == 3 assert sparse_labels[0] == _OUTLIER_ENCODING[outlier_type]["label"] From 941a6d8558001d10bc62ac55d84310a8c474fcf9 Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Tue, 25 Jul 2023 12:22:25 -0400 Subject: [PATCH 4/5] Update sklearn/cluster/tests/test_hdbscan.py Co-authored-by: Thomas J. Fan --- sklearn/cluster/tests/test_hdbscan.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index a0c468c0edd7b..2098449044c26 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -303,6 +303,7 @@ def test_hdbscan_sparse(): assert_array_equal(dense_labels, sparse_labels) # Compare that the sparse and dense non-precomputed routines return the same labels + # where the 0th observation contains the outlier. for outlier_val, outlier_type in zip((np.inf, np.nan), ("infinite", "missing")): X_dense = X.copy() X_dense[0, 0] = outlier_val From 06fbab597f3c19438b40e8afe8cc734b7a091b34 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 26 Jul 2023 08:21:36 -0400 Subject: [PATCH 5/5] Simplified test --- sklearn/cluster/tests/test_hdbscan.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index a0c468c0edd7b..35835ebaf268b 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -298,12 +298,10 @@ def test_hdbscan_sparse(): _X_sparse = sparse.csr_matrix(X) X_sparse = _X_sparse.copy() sparse_labels = HDBSCAN().fit(X_sparse).labels_ - n_clusters = len(set(sparse_labels) - OUTLIER_SET) - assert n_clusters == 3 assert_array_equal(dense_labels, sparse_labels) # Compare that the sparse and dense non-precomputed routines return the same labels - for outlier_val, outlier_type in zip((np.inf, np.nan), ("infinite", "missing")): + for outlier_val, outlier_type in ((np.inf, "infinite"), (np.nan, "missing")): X_dense = X.copy() X_dense[0, 0] = outlier_val dense_labels = HDBSCAN().fit(X_dense).labels_ @@ -314,9 +312,6 @@ def test_hdbscan_sparse(): X_sparse = _X_sparse.copy() X_sparse[0, 0] = outlier_val sparse_labels = HDBSCAN().fit(X_sparse).labels_ - n_clusters = len(set(sparse_labels) - OUTLIER_SET) - assert n_clusters == 3 - assert sparse_labels[0] == _OUTLIER_ENCODING[outlier_type]["label"] assert_array_equal(dense_labels, sparse_labels) msg = "Sparse data matrices only support algorithm `brute`."