From 92f5c84ddf9d99421604fabc2a271b20b9fe4b9a Mon Sep 17 00:00:00 2001
From: Meekail Zain
Date: Wed, 5 Apr 2023 10:50:28 -0400
Subject: [PATCH 1/4] Improved thresholding check

---
 sklearn/cluster/_hdbscan/_tree.pyx | 47 ++++++++++++++----------
 1 file changed, 22 insertions(+), 25 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/_tree.pyx b/sklearn/cluster/_hdbscan/_tree.pyx
index dcea00cbc8487..cb514cbb67a70 100644
--- a/sklearn/cluster/_hdbscan/_tree.pyx
+++ b/sklearn/cluster/_hdbscan/_tree.pyx
@@ -432,7 +432,7 @@ cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labelling_at_cut(
 
 
 cdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] do_labelling(
-    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] hierarchy,
+    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
     set clusters,
     dict cluster_label_map,
     cnp.intp_t allow_single_cluster,
@@ -442,20 +442,21 @@ cdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] do_labelling(
     cdef:
         cnp.intp_t root_cluster
         cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] result
-        cnp.intp_t[:] parent_array, child_array
-        cnp.float64_t[:] lambda_array
+        cnp.ndarray[cnp.intp_t, ndim=1] parent_array, child_array
+        cnp.ndarray[cnp.float64_t, ndim=1] lambda_array
         TreeUnionFind union_find
         cnp.intp_t n, parent, child, cluster
+        cnp.float64_t threshold
 
-    child_array = hierarchy['child']
-    parent_array = hierarchy['parent']
-    lambda_array = hierarchy['value']
+    child_array = condensed_tree['child']
+    parent_array = condensed_tree['parent']
+    lambda_array = condensed_tree['value']
 
     root_cluster = np.min(parent_array)
     result = np.empty(root_cluster, dtype=np.intp)
     union_find = TreeUnionFind(np.max(parent_array) + 1)
 
-    for n in range(hierarchy.shape[0]):
+    for n in range(condensed_tree.shape[0]):
         child = child_array[n]
         parent = parent_array[n]
         if child not in clusters:
@@ -463,24 +464,20 @@ cdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] do_labelling(
 
     for n in range(root_cluster):
         cluster = union_find.find(n)
-        if cluster < root_cluster:
-            result[n] = NOISE
-        elif cluster == root_cluster:
-            if len(clusters) == 1 and allow_single_cluster:
-                if cluster_selection_epsilon != 0.0:
-                    if hierarchy['value'][hierarchy['child'] == n] >= 1 / cluster_selection_epsilon :
-                        result[n] = cluster_label_map[cluster]
-                    else:
-                        result[n] = NOISE
-                elif hierarchy['value'][hierarchy['child'] == n] >= \
-                     hierarchy['value'][hierarchy['parent'] == cluster].max():
-                    result[n] = cluster_label_map[cluster]
-                else:
-                    result[n] = NOISE
-            else:
-                result[n] = NOISE
-        else:
-            result[n] = cluster_label_map[cluster]
+        label = NOISE
+        if cluster != root_cluster:
+            label = cluster_label_map[cluster]
+        elif len(clusters) == 1 and allow_single_cluster:
+            parent_lambda = lambda_array[child_array == n]
+            max_child_lambda = lambda_array[parent_array == cluster].max()
+            threshold = (
+                1 / cluster_selection_epsilon if cluster_selection_epsilon != 0.0
+                else max_child_lambda
+            )
+            if parent_lambda >= threshold:
+                label = cluster_label_map[cluster]
+
+        result[n] = label
 
     return result
 

From f701811975f7e46326df47db54794a4ac1b73084 Mon Sep 17 00:00:00 2001
From: Meekail Zain
Date: Wed, 5 Apr 2023 14:01:34 -0400
Subject: [PATCH 2/4] Updated threshold calculation

---
 sklearn/cluster/_hdbscan/_tree.pyx | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/_tree.pyx b/sklearn/cluster/_hdbscan/_tree.pyx
index c273706c21e31..807e17a02d2e1 100644
--- a/sklearn/cluster/_hdbscan/_tree.pyx
+++ b/sklearn/cluster/_hdbscan/_tree.pyx
@@ -474,12 +474,12 @@ cdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] do_labelling(
         if cluster != root_cluster:
             label = cluster_label_map[cluster]
         elif len(clusters) == 1 and allow_single_cluster:
+            # There can only be one parent
             parent_lambda = lambda_array[child_array == n]
-            max_child_lambda = lambda_array[parent_array == cluster].max()
-            threshold = (
-                1 / cluster_selection_epsilon if cluster_selection_epsilon != 0.0
-                else max_child_lambda
-            )
+            if cluster_selection_epsilon != 0.0:
+                threshold = 1 / cluster_selection_epsilon
+            else:
+                threshold = lambda_array[parent_array == cluster].max()
             if parent_lambda >= threshold:
                 label = cluster_label_map[cluster]
 

From cbf2055c20199142e5826a8c9e8d12068a5c29d7 Mon Sep 17 00:00:00 2001
From: Meekail Zain
Date: Tue, 2 May 2023 20:39:21 -0400
Subject: [PATCH 3/4] Added tests

---
 sklearn/cluster/_hdbscan/_tree.pyx    | 38 +++++++++++--
 sklearn/cluster/tests/test_hdbscan.py | 77 +++++++++++++++++++++++++++
 2 files changed, 111 insertions(+), 4 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/_tree.pyx b/sklearn/cluster/_hdbscan/_tree.pyx
index 58b7926f32c9d..25865efd4cf79 100644
--- a/sklearn/cluster/_hdbscan/_tree.pyx
+++ b/sklearn/cluster/_hdbscan/_tree.pyx
@@ -87,7 +87,7 @@ cdef list bfs_from_hierarchy(
     return result
 
 
-cdef cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] _condense_tree(
+cpdef cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] _condense_tree(
     const HIERARCHY_t[::1] hierarchy,
     cnp.intp_t min_cluster_size=10
 ):
@@ -402,13 +402,41 @@ cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labelling_at_cut(
     return result
 
 
-cdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] do_labelling(
+cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] _do_labelling(
     cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
     set clusters,
     dict cluster_label_map,
     cnp.intp_t allow_single_cluster,
     cnp.float64_t cluster_selection_epsilon
 ):
+    """Given a condensed tree, clusters and a labeling map for the clusters,
+    return an array containing the labels of each point based on cluster
+    membership. Note that this is where points may be marked as noisy
+    outliers. The determination of some points as noise in large,
+    single-cluster datasets is controlled by the `allow_single_cluster` and
+    `cluster_selection_epsilon` parameters.
+
+    Parameters
+    ----------
+    condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype
+        Effectively an edgelist encoding a parent/child pair, along with a
+        value and the corresponding cluster_size in each row providing a tree
+        structure.
+
+    clusters : set
+        The set of nodes corresponding to identified clusters. These node
+        values should be the same as those present in `condensed_tree`.
+
+    cluster_label_map : dict
+        A mapping from the node values present in `clusters` to the labels
+        which will be returned.
+
+    Returns
+    -------
+    labels : ndarray of shape (n_samples,)
+        The cluster labels for each point in the data set;
+        a label of -1 denotes a noise assignment.
+    """
 
     cdef:
         cnp.intp_t root_cluster
@@ -444,6 +472,8 @@ cdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] do_labelling(
         if cluster_selection_epsilon != 0.0:
             threshold = 1 / cluster_selection_epsilon
         else:
+            # The threshold should be calculated per-sample based on the
+            # largest lambda of any sibling node.
             threshold = lambda_array[parent_array == cluster].max()
         if parent_lambda >= threshold:
             label = cluster_label_map[cluster]
@@ -533,7 +563,7 @@ cdef cnp.intp_t traverse_upwards(
         else:
             return leaf #return node closest to root
 
-    parent_eps = 1 / cluster_tree[cluster_tree['child'] == parent]['value']
+    parent_eps = 1 / cluster_tree[cluster_tree['child'] == parent]['value']
     if parent_eps > cluster_selection_epsilon:
         return parent
     else:
@@ -726,7 +756,7 @@ cdef tuple _get_clusters(
     cluster_map = {c: n for n, c in enumerate(sorted(list(clusters)))}
     reverse_cluster_map = {n: c for c, n in cluster_map.items()}
 
-    labels = do_labelling(
+    labels = _do_labelling(
         condensed_tree,
         clusters,
         cluster_map,
diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py
index a394c9c2f8a42..c3f6a9f7b0d23 100644
--- a/sklearn/cluster/tests/test_hdbscan.py
+++ b/sklearn/cluster/tests/test_hdbscan.py
@@ -16,6 +16,11 @@
 from sklearn.utils import shuffle
 from sklearn.utils._testing import assert_allclose, assert_array_equal
 from sklearn.cluster._hdbscan.hdbscan import _OUTLIER_ENCODING
+from sklearn.cluster._hdbscan._tree import (
+    _do_labelling,
+    _condense_tree,
+    CONDENSED_dtype,
+)
 
 n_clusters_true = 3
 X, y = make_blobs(n_samples=200, random_state=10)
@@ -380,3 +385,75 @@ def test_hdbscan_precomputed_dense_nan():
     hdb = HDBSCAN(metric="precomputed")
     with pytest.raises(ValueError, match=msg):
         hdb.fit(X_nan)
+
+
+@pytest.mark.parametrize("allow_single_cluster", [True, False])
+@pytest.mark.parametrize("epsilon", [0, 0.1])
+def test_labelling_distinct(global_random_seed, allow_single_cluster, epsilon):
+    n_samples = 48
+    X, y = make_blobs(
+        n_samples,
+        random_state=global_random_seed,
+        # Ensure the clusters are distinct with no overlap
+        centers=[
+            [0, 0],
+            [10, 0],
+            [0, 10],
+        ],
+    )
+
+    est = HDBSCAN().fit(X)
+    condensed_tree = _condense_tree(
+        est._single_linkage_tree_, min_cluster_size=est.min_cluster_size
+    )
+    clusters = {n_samples + 2, n_samples + 3, n_samples + 4}
+    cluster_label_map = {n_samples + 2: 0, n_samples + 3: 1, n_samples + 4: 2}
+    labels = _do_labelling(
+        condensed_tree=condensed_tree,
+        clusters=clusters,
+        cluster_label_map=cluster_label_map,
+        allow_single_cluster=allow_single_cluster,
+        cluster_selection_epsilon=epsilon,
+    )
+
+    first_with_label = {_y: np.where(y == _y)[0][0] for _y in list(set(y))}
+    y_to_labels = {_y: labels[first_with_label[_y]] for _y in list(set(y))}
+    aligned_target = np.vectorize(y_to_labels.get)(y)
+    assert_array_equal(labels, aligned_target)
+
+
+def test_labelling_thresholding():
+    n_samples = 5
+    MAX_LAMBDA = 1.5
+    condensed_tree = np.array(
+        [
+            (5, 2, MAX_LAMBDA, 1),
+            (5, 1, 0.1, 1),
+            (5, 0, MAX_LAMBDA, 1),
+            (5, 3, 0.2, 1),
+            (5, 4, 0.3, 1),
+        ],
+        dtype=CONDENSED_dtype,
+    )
+    labels = _do_labelling(
+        condensed_tree=condensed_tree,
+        clusters={n_samples},
+        cluster_label_map={n_samples: 0, n_samples + 1: 1},
+        allow_single_cluster=True,
+        cluster_selection_epsilon=1,
+    )
+    num_noise = condensed_tree["value"] < 1
+    assert sum(num_noise) == sum(labels == -1)
+
+    labels = _do_labelling(
+        condensed_tree=condensed_tree,
+        clusters={n_samples},
+        cluster_label_map={n_samples: 0, n_samples + 1: 1},
+        allow_single_cluster=True,
+        cluster_selection_epsilon=0,
+    )
+    # The threshold should be calculated per-sample based on the largest
+    # lambda of any sibling node. In this case, all points are siblings
+    # and the largest value is exactly MAX_LAMBDA.
+    num_noise = condensed_tree["value"] < MAX_LAMBDA
+    assert sum(num_noise) == sum(labels == -1)

From 0dbb65aeb6d65fac22ddda461b8037ad645f1abf Mon Sep 17 00:00:00 2001
From: Meekail Zain
Date: Tue, 2 May 2023 20:43:29 -0400
Subject: [PATCH 4/4] Clarified inline comment

---
 sklearn/cluster/_hdbscan/_tree.pyx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sklearn/cluster/_hdbscan/_tree.pyx b/sklearn/cluster/_hdbscan/_tree.pyx
index 25865efd4cf79..a1bc539a5db41 100644
--- a/sklearn/cluster/_hdbscan/_tree.pyx
+++ b/sklearn/cluster/_hdbscan/_tree.pyx
@@ -467,7 +467,8 @@ cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] _do_labelling(
         if cluster != root_cluster:
             label = cluster_label_map[cluster]
         elif len(clusters) == 1 and allow_single_cluster:
-            # There can only be one parent
+            # There can only be one edge with this particular child, hence this
+            # expression extracts a unique, scalar lambda value.
             parent_lambda = lambda_array[child_array == n]
             if cluster_selection_epsilon != 0.0:
                 threshold = 1 / cluster_selection_epsilon
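
For readers following the thresholding change above, the sketch below restates in plain NumPy the per-sample rule that these patches converge on for the single-cluster case. It is an illustration only and not part of the patch series: the structured field names and the -1 noise label mirror CONDENSED_dtype and NOISE from _tree.pyx, and the helper name label_single_cluster is made up for this example.

import numpy as np

# Toy condensed tree: every point (children 0-4) hangs off the root cluster
# (node 5), mirroring the fixture used in test_labelling_thresholding above.
condensed_tree = np.array(
    [(5, 2, 1.5, 1), (5, 1, 0.1, 1), (5, 0, 1.5, 1), (5, 3, 0.2, 1), (5, 4, 0.3, 1)],
    dtype=[
        ("parent", np.intp),
        ("child", np.intp),
        ("value", np.float64),
        ("cluster_size", np.intp),
    ],
)

def label_single_cluster(tree, cluster, cluster_selection_epsilon):
    """Keep a point in the lone cluster only if its lambda meets the
    threshold, otherwise mark it as noise (-1)."""
    labels = np.full(5, -1, dtype=np.intp)
    for n in range(5):
        # There is exactly one edge per child, so this is a scalar.
        parent_lambda = tree["value"][tree["child"] == n][0]
        if cluster_selection_epsilon != 0.0:
            threshold = 1 / cluster_selection_epsilon
        else:
            # Largest lambda among the siblings attached to this cluster.
            threshold = tree["value"][tree["parent"] == cluster].max()
        if parent_lambda >= threshold:
            labels[n] = 0
    return labels

print(label_single_cluster(condensed_tree, cluster=5, cluster_selection_epsilon=1.0))
# -> [ 0 -1  0 -1 -1]   (threshold is 1 / epsilon = 1.0)
print(label_single_cluster(condensed_tree, cluster=5, cluster_selection_epsilon=0.0))
# -> [ 0 -1  0 -1 -1]   (threshold is the max sibling lambda, 1.5)

Points whose lambda ("value") falls below 1 / cluster_selection_epsilon, or below the largest sibling lambda when epsilon is 0, are labelled noise, which is exactly what test_labelling_thresholding asserts.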