From 94c1a794b87cbf934068aaa990f30d777c6fc542 Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Mon, 24 Jul 2023 13:20:50 -0400
Subject: [PATCH 1/5] MNT Improve robustness of sparse test in `HDBSCAN`

---
 sklearn/cluster/_hdbscan/hdbscan.py   |  1 +
 sklearn/cluster/tests/test_hdbscan.py | 31 ++++++++++++++++++++-------
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py
index 57de8962250b1..0c92f251373d2 100644
--- a/sklearn/cluster/_hdbscan/hdbscan.py
+++ b/sklearn/cluster/_hdbscan/hdbscan.py
@@ -732,6 +732,7 @@ def fit(self, X, y=None):
                 X = X[finite_index]
         elif issparse(X):
             # Handle sparse precomputed distance matrices separately
+            print(f"\nDEBUG *** {X.data=}")
             X = self._validate_data(
                 X,
                 accept_sparse=["csr", "lil"],
diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py
index c0c281ce31475..47b31d8480f50 100644
--- a/sklearn/cluster/tests/test_hdbscan.py
+++ b/sklearn/cluster/tests/test_hdbscan.py
@@ -288,21 +288,36 @@ def test_hdbscan_sparse():
     """
     Tests that HDBSCAN works correctly when passing sparse feature data.
     """
-    sparse_X = sparse.csr_matrix(X)
 
-    labels = HDBSCAN().fit(sparse_X).labels_
-    n_clusters = len(set(labels) - OUTLIER_SET)
+    dense_labels = HDBSCAN().fit(X).labels_
+    n_clusters = len(set(dense_labels) - OUTLIER_SET)
     assert n_clusters == 3
 
-    sparse_X_nan = sparse_X.copy()
-    sparse_X_nan[0, 0] = np.nan
-    labels = HDBSCAN().fit(sparse_X_nan).labels_
-    n_clusters = len(set(labels) - OUTLIER_SET)
+    X_sparse = sparse.csr_matrix(X)
+    sparse_labels = HDBSCAN().fit(X_sparse).labels_
+    n_clusters = len(set(sparse_labels) - OUTLIER_SET)
     assert n_clusters == 3
+    assert_array_equal(dense_labels, sparse_labels)
+
+    for outlier_val, outlier_type in zip((np.inf, np.nan), ("infinite", "missing")):
+        _X = X.copy()
+        _X[0, 0] = outlier_val
+        dense_labels = HDBSCAN().fit(_X).labels_
+        n_clusters = len(set(dense_labels) - OUTLIER_SET)
+        assert n_clusters == 3
+        assert dense_labels[0] == _OUTLIER_ENCODING[outlier_type]["label"]
+
+        _X_sparse = X_sparse.copy()
+        _X_sparse[0, 0] = outlier_val
+        sparse_labels = HDBSCAN().fit(_X_sparse).labels_
+        n_clusters = len(set(sparse_labels) - OUTLIER_SET)
+        assert n_clusters == 3
+        assert sparse_labels[0] == _OUTLIER_ENCODING[outlier_type]["label"]
+        assert_array_equal(dense_labels, sparse_labels)
 
     msg = "Sparse data matrices only support algorithm `brute`."
     with pytest.raises(ValueError, match=msg):
-        HDBSCAN(metric="euclidean", algorithm="balltree").fit(sparse_X)
+        HDBSCAN(metric="euclidean", algorithm="balltree").fit(_X_sparse)
 
 
 @pytest.mark.parametrize("algorithm", ALGORITHMS)

From b3be4ce7dab6f01b0e24207fa00eb712bbf3ebba Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Mon, 24 Jul 2023 13:37:57 -0400
Subject: [PATCH 2/5] Remove debug print and use `X_sparse` in algorithm check

---
 sklearn/cluster/_hdbscan/hdbscan.py   | 1 -
 sklearn/cluster/tests/test_hdbscan.py | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py
index 0c92f251373d2..57de8962250b1 100644
--- a/sklearn/cluster/_hdbscan/hdbscan.py
+++ b/sklearn/cluster/_hdbscan/hdbscan.py
@@ -732,7 +732,6 @@ def fit(self, X, y=None):
                 X = X[finite_index]
         elif issparse(X):
             # Handle sparse precomputed distance matrices separately
-            print(f"\nDEBUG *** {X.data=}")
             X = self._validate_data(
                 X,
                 accept_sparse=["csr", "lil"],
diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py
index 47b31d8480f50..2f6146e48c909 100644
--- a/sklearn/cluster/tests/test_hdbscan.py
+++ b/sklearn/cluster/tests/test_hdbscan.py
@@ -317,7 +317,7 @@ def test_hdbscan_sparse():
 
     msg = "Sparse data matrices only support algorithm `brute`."
     with pytest.raises(ValueError, match=msg):
-        HDBSCAN(metric="euclidean", algorithm="balltree").fit(_X_sparse)
+        HDBSCAN(metric="euclidean", algorithm="balltree").fit(X_sparse)
 
 
 @pytest.mark.parametrize("algorithm", ALGORITHMS)

From c690c19021c371097be0d9bc2e640b8ce11b33a4 Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Tue, 25 Jul 2023 12:00:56 -0400
Subject: [PATCH 3/5] Updated docstring and added inline comment

---
 sklearn/cluster/tests/test_hdbscan.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py
index 2f6146e48c909..a0c468c0edd7b 100644
--- a/sklearn/cluster/tests/test_hdbscan.py
+++ b/sklearn/cluster/tests/test_hdbscan.py
@@ -287,29 +287,33 @@ def test_hdbscan_precomputed_non_brute(tree):
 def test_hdbscan_sparse():
     """
     Tests that HDBSCAN works correctly when passing sparse feature data.
+    Evaluates correctness by comparing against the same data passed as a dense
+    array.
     """
 
     dense_labels = HDBSCAN().fit(X).labels_
     n_clusters = len(set(dense_labels) - OUTLIER_SET)
     assert n_clusters == 3
 
-    X_sparse = sparse.csr_matrix(X)
+    _X_sparse = sparse.csr_matrix(X)
+    X_sparse = _X_sparse.copy()
     sparse_labels = HDBSCAN().fit(X_sparse).labels_
     n_clusters = len(set(sparse_labels) - OUTLIER_SET)
     assert n_clusters == 3
     assert_array_equal(dense_labels, sparse_labels)
 
+    # Compare that the sparse and dense non-precomputed routines return the same labels
     for outlier_val, outlier_type in zip((np.inf, np.nan), ("infinite", "missing")):
-        _X = X.copy()
-        _X[0, 0] = outlier_val
-        dense_labels = HDBSCAN().fit(_X).labels_
+        X_dense = X.copy()
+        X_dense[0, 0] = outlier_val
+        dense_labels = HDBSCAN().fit(X_dense).labels_
         n_clusters = len(set(dense_labels) - OUTLIER_SET)
         assert n_clusters == 3
         assert dense_labels[0] == _OUTLIER_ENCODING[outlier_type]["label"]
 
-        _X_sparse = X_sparse.copy()
-        _X_sparse[0, 0] = outlier_val
-        sparse_labels = HDBSCAN().fit(_X_sparse).labels_
+        X_sparse = _X_sparse.copy()
+        X_sparse[0, 0] = outlier_val
+        sparse_labels = HDBSCAN().fit(X_sparse).labels_
         n_clusters = len(set(sparse_labels) - OUTLIER_SET)
         assert n_clusters == 3
         assert sparse_labels[0] == _OUTLIER_ENCODING[outlier_type]["label"]

From 941a6d8558001d10bc62ac55d84310a8c474fcf9 Mon Sep 17 00:00:00 2001
From: Meekail Zain <34613774+Micky774@users.noreply.github.com>
Date: Tue, 25 Jul 2023 12:22:25 -0400
Subject: [PATCH 4/5] Clarify test comment: 0th observation contains the outlier

Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com>
---
 sklearn/cluster/tests/test_hdbscan.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py
index a0c468c0edd7b..2098449044c26 100644
--- a/sklearn/cluster/tests/test_hdbscan.py
+++ b/sklearn/cluster/tests/test_hdbscan.py
@@ -303,6 +303,7 @@ def test_hdbscan_sparse():
     assert_array_equal(dense_labels, sparse_labels)
 
     # Compare that the sparse and dense non-precomputed routines return the same labels
+    # where the 0th observation contains the outlier.
     for outlier_val, outlier_type in zip((np.inf, np.nan), ("infinite", "missing")):
         X_dense = X.copy()
         X_dense[0, 0] = outlier_val

From 06fbab597f3c19438b40e8afe8cc734b7a091b34 Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Wed, 26 Jul 2023 08:21:36 -0400
Subject: [PATCH 5/5] Simplified test

---
 sklearn/cluster/tests/test_hdbscan.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py
index a0c468c0edd7b..35835ebaf268b 100644
--- a/sklearn/cluster/tests/test_hdbscan.py
+++ b/sklearn/cluster/tests/test_hdbscan.py
@@ -298,12 +298,10 @@ def test_hdbscan_sparse():
     _X_sparse = sparse.csr_matrix(X)
     X_sparse = _X_sparse.copy()
     sparse_labels = HDBSCAN().fit(X_sparse).labels_
-    n_clusters = len(set(sparse_labels) - OUTLIER_SET)
-    assert n_clusters == 3
     assert_array_equal(dense_labels, sparse_labels)
 
     # Compare that the sparse and dense non-precomputed routines return the same labels
-    for outlier_val, outlier_type in zip((np.inf, np.nan), ("infinite", "missing")):
+    for outlier_val, outlier_type in ((np.inf, "infinite"), (np.nan, "missing")):
         X_dense = X.copy()
         X_dense[0, 0] = outlier_val
         dense_labels = HDBSCAN().fit(X_dense).labels_
@@ -314,9 +312,6 @@ def test_hdbscan_sparse():
         X_sparse = _X_sparse.copy()
         X_sparse[0, 0] = outlier_val
         sparse_labels = HDBSCAN().fit(X_sparse).labels_
-        n_clusters = len(set(sparse_labels) - OUTLIER_SET)
-        assert n_clusters == 3
-        assert sparse_labels[0] == _OUTLIER_ENCODING[outlier_type]["label"]
         assert_array_equal(dense_labels, sparse_labels)
 
     msg = "Sparse data matrices only support algorithm `brute`."