From 92f5c84ddf9d99421604fabc2a271b20b9fe4b9a Mon Sep 17 00:00:00 2001
From: Meekail Zain
Date: Wed, 5 Apr 2023 10:50:28 -0400
Subject: [PATCH 1/4] Improved thresholding check

---
 sklearn/cluster/_hdbscan/_tree.pyx | 47 ++++++++++++++----------
 1 file changed, 22 insertions(+), 25 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/_tree.pyx b/sklearn/cluster/_hdbscan/_tree.pyx
index dcea00cbc8487..cb514cbb67a70 100644
--- a/sklearn/cluster/_hdbscan/_tree.pyx
+++ b/sklearn/cluster/_hdbscan/_tree.pyx
@@ -432,7 +432,7 @@ cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labelling_at_cut(
 
 
 cdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] do_labelling(
-    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] hierarchy,
+    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
     set clusters,
     dict cluster_label_map,
     cnp.intp_t allow_single_cluster,
@@ -442,20 +442,21 @@ cdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] do_labelling(
     cdef:
         cnp.intp_t root_cluster
         cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] result
-        cnp.intp_t[:] parent_array, child_array
-        cnp.float64_t[:] lambda_array
+        cnp.ndarray[cnp.intp_t, ndim=1] parent_array, child_array
+        cnp.ndarray[cnp.float64_t, ndim=1] lambda_array
         TreeUnionFind union_find
         cnp.intp_t n, parent, child, cluster
+        cnp.float64_t threshold
 
-    child_array = hierarchy['child']
-    parent_array = hierarchy['parent']
-    lambda_array = hierarchy['value']
+    child_array = condensed_tree['child']
+    parent_array = condensed_tree['parent']
+    lambda_array = condensed_tree['value']
 
     root_cluster = np.min(parent_array)
     result = np.empty(root_cluster, dtype=np.intp)
     union_find = TreeUnionFind(np.max(parent_array) + 1)
 
-    for n in range(hierarchy.shape[0]):
+    for n in range(condensed_tree.shape[0]):
         child = child_array[n]
         parent = parent_array[n]
         if child not in clusters:
@@ -463,24 +464,20 @@ cdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] do_labelling(
 
     for n in range(root_cluster):
         cluster = union_find.find(n)
-        if cluster < root_cluster:
-            result[n] = NOISE
-        elif cluster == root_cluster:
-            if len(clusters) == 1 and allow_single_cluster:
-                if cluster_selection_epsilon != 0.0:
-                    if hierarchy['value'][hierarchy['child'] == n] >= 1 / cluster_selection_epsilon :
-                        result[n] = cluster_label_map[cluster]
-                    else:
-                        result[n] = NOISE
-                elif hierarchy['value'][hierarchy['child'] == n] >= \
-                     hierarchy['value'][hierarchy['parent'] == cluster].max():
-                    result[n] = cluster_label_map[cluster]
-                else:
-                    result[n] = NOISE
-            else:
-                result[n] = NOISE
-        else:
-            result[n] = cluster_label_map[cluster]
+        label = NOISE
+        if cluster != root_cluster:
+            label = cluster_label_map[cluster]
+        elif len(clusters) == 1 and allow_single_cluster:
+            parent_lambda = lambda_array[child_array == n]
+            max_child_lambda = lambda_array[parent_array == cluster].max()
+            threshold = (
+                1 / cluster_selection_epsilon if cluster_selection_epsilon != 0.0
+                else max_child_lambda
+            )
+            if parent_lambda >= threshold:
+                label = cluster_label_map[cluster]
+
+        result[n] = label
 
     return result
 

From f701811975f7e46326df47db54794a4ac1b73084 Mon Sep 17 00:00:00 2001
From: Meekail Zain
Date: Wed, 5 Apr 2023 14:01:34 -0400
Subject: [PATCH 2/4] Updated threshold calculation

---
 sklearn/cluster/_hdbscan/_tree.pyx | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/_tree.pyx b/sklearn/cluster/_hdbscan/_tree.pyx
index c273706c21e31..807e17a02d2e1 100644
--- a/sklearn/cluster/_hdbscan/_tree.pyx
+++ b/sklearn/cluster/_hdbscan/_tree.pyx
@@ -474,12 +474,12 @@ cdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] do_labelling(
         if cluster != root_cluster:
             label = cluster_label_map[cluster]
         elif len(clusters) == 1 and allow_single_cluster:
+            # There can only be one parent
             parent_lambda = lambda_array[child_array == n]
-            max_child_lambda = lambda_array[parent_array == cluster].max()
-            threshold = (
-                1 / cluster_selection_epsilon if cluster_selection_epsilon != 0.0
-                else max_child_lambda
-            )
+            if cluster_selection_epsilon != 0.0:
+                threshold = 1 / cluster_selection_epsilon
+            else:
+                threshold = lambda_array[parent_array == cluster].max()
             if parent_lambda >= threshold:
                 label = cluster_label_map[cluster]
 

From cbf2055c20199142e5826a8c9e8d12068a5c29d7 Mon Sep 17 00:00:00 2001
From: Meekail Zain
Date: Tue, 2 May 2023 20:39:21 -0400
Subject: [PATCH 3/4] Added tests

---
 sklearn/cluster/_hdbscan/_tree.pyx    | 38 +++++++++++--
 sklearn/cluster/tests/test_hdbscan.py | 77 +++++++++++++++++++++++++++
 2 files changed, 111 insertions(+), 4 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/_tree.pyx b/sklearn/cluster/_hdbscan/_tree.pyx
index 58b7926f32c9d..25865efd4cf79 100644
--- a/sklearn/cluster/_hdbscan/_tree.pyx
+++ b/sklearn/cluster/_hdbscan/_tree.pyx
@@ -87,7 +87,7 @@ cdef list bfs_from_hierarchy(
     return result
 
 
-cdef cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] _condense_tree(
+cpdef cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] _condense_tree(
     const HIERARCHY_t[::1] hierarchy,
     cnp.intp_t min_cluster_size=10
 ):
@@ -402,13 +402,41 @@ cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labelling_at_cut(
     return result
 
 
-cdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] do_labelling(
+cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] _do_labelling(
     cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
     set clusters,
     dict cluster_label_map,
     cnp.intp_t allow_single_cluster,
     cnp.float64_t cluster_selection_epsilon
 ):
+    """Given a condensed tree, clusters and a labeling map for the clusters,
+    return an array containing the labels of each point based on cluster
+    membership. Note that this is where points may be marked as noisy
+    outliers. The determination of some points as noise in large,
+    single-cluster datasets is controlled by the `allow_single_cluster` and
+    `cluster_selection_epsilon` parameters.
+
+    Parameters
+    ----------
+    condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype
+        Effectively an edgelist encoding a parent/child pair, along with a
+        value and the corresponding cluster_size in each row providing a tree
+        structure.
+
+    clusters : set
+        The set of nodes corresponding to identified clusters. These node
+        values should be the same as those present in `condensed_tree`.
+
+    cluster_label_map : dict
+        A mapping from the node values present in `clusters` to the labels
+        which will be returned.
+
+    Returns
+    -------
+    labels : ndarray of shape (n_samples,)
+        The cluster labels for each point in the data set;
+        a label of -1 denotes a noise assignment.
+    """
 
     cdef:
         cnp.intp_t root_cluster
@@ -444,6 +472,8 @@ cdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] do_labelling(
         if cluster_selection_epsilon != 0.0:
             threshold = 1 / cluster_selection_epsilon
         else:
+            # The threshold should be calculated per-sample based on the
+            # largest lambda of any sibling node.
             threshold = lambda_array[parent_array == cluster].max()
         if parent_lambda >= threshold:
             label = cluster_label_map[cluster]
@@ -533,7 +563,7 @@ cdef cnp.intp_t traverse_upwards(
         else:
             return leaf #return node closest to root
 
-    parent_eps = 1 / cluster_tree[cluster_tree['child'] == parent]['value']
+    parent_eps = 1 / cluster_tree[cluster_tree['child'] == parent]['value']
     if parent_eps > cluster_selection_epsilon:
         return parent
     else:
@@ -726,7 +756,7 @@ cdef tuple _get_clusters(
     cluster_map = {c: n for n, c in enumerate(sorted(list(clusters)))}
     reverse_cluster_map = {n: c for c, n in cluster_map.items()}
 
-    labels = do_labelling(
+    labels = _do_labelling(
         condensed_tree,
         clusters,
         cluster_map,
diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py
index a394c9c2f8a42..c3f6a9f7b0d23 100644
--- a/sklearn/cluster/tests/test_hdbscan.py
+++ b/sklearn/cluster/tests/test_hdbscan.py
@@ -16,6 +16,11 @@
 from sklearn.utils import shuffle
 from sklearn.utils._testing import assert_allclose, assert_array_equal
 from sklearn.cluster._hdbscan.hdbscan import _OUTLIER_ENCODING
+from sklearn.cluster._hdbscan._tree import (
+    _do_labelling,
+    _condense_tree,
+    CONDENSED_dtype,
+)
 
 n_clusters_true = 3
 X, y = make_blobs(n_samples=200, random_state=10)
@@ -380,3 +385,75 @@ def test_hdbscan_precomputed_dense_nan():
     hdb = HDBSCAN(metric="precomputed")
     with pytest.raises(ValueError, match=msg):
         hdb.fit(X_nan)
+
+
+@pytest.mark.parametrize("allow_single_cluster", [True, False])
+@pytest.mark.parametrize("epsilon", [0, 0.1])
+def test_labelling_distinct(global_random_seed, allow_single_cluster, epsilon):
+    n_samples = 48
+    X, y = make_blobs(
+        n_samples,
+        random_state=global_random_seed,
+        # Ensure the clusters are distinct with no overlap
+        centers=[
+            [0, 0],
+            [10, 0],
+            [0, 10],
+        ],
+    )
+
+    est = HDBSCAN().fit(X)
+    condensed_tree = _condense_tree(
+        est._single_linkage_tree_, min_cluster_size=est.min_cluster_size
+    )
+    clusters = {n_samples + 2, n_samples + 3, n_samples + 4}
+    cluster_label_map = {n_samples + 2: 0, n_samples + 3: 1, n_samples + 4: 2}
+    labels = _do_labelling(
+        condensed_tree=condensed_tree,
+        clusters=clusters,
+        cluster_label_map=cluster_label_map,
+        allow_single_cluster=allow_single_cluster,
+        cluster_selection_epsilon=epsilon,
+    )
+
+    first_with_label = {_y: np.where(y == _y)[0][0] for _y in list(set(y))}
+    y_to_labels = {_y: labels[first_with_label[_y]] for _y in list(set(y))}
+    aligned_target = np.vectorize(y_to_labels.get)(y)
+    assert_array_equal(labels, aligned_target)
+
+
+def test_labelling_thresholding():
+    n_samples = 5
+    MAX_LAMBDA = 1.5
+    condensed_tree = np.array(
+        [
+            (5, 2, MAX_LAMBDA, 1),
+            (5, 1, 0.1, 1),
+            (5, 0, MAX_LAMBDA, 1),
+            (5, 3, 0.2, 1),
+            (5, 4, 0.3, 1),
+        ],
+        dtype=CONDENSED_dtype,
+    )
+    labels = _do_labelling(
+        condensed_tree=condensed_tree,
+        clusters={n_samples},
+        cluster_label_map={n_samples: 0, n_samples + 1: 1},
+        allow_single_cluster=True,
+        cluster_selection_epsilon=1,
+    )
+    num_noise = condensed_tree["value"] < 1
+    assert sum(num_noise) == sum(labels == -1)
+
+    labels = _do_labelling(
+        condensed_tree=condensed_tree,
+        clusters={n_samples},
+        cluster_label_map={n_samples: 0, n_samples + 1: 1},
+        allow_single_cluster=True,
+        cluster_selection_epsilon=0,
+    )
+    # The threshold should be calculated per-sample based on the largest
+    # lambda of any sibling node. In this case, all points are siblings
+    # and the largest value is exactly MAX_LAMBDA.
+    num_noise = condensed_tree["value"] < MAX_LAMBDA
+    assert sum(num_noise) == sum(labels == -1)

From 0dbb65aeb6d65fac22ddda461b8037ad645f1abf Mon Sep 17 00:00:00 2001
From: Meekail Zain
Date: Tue, 2 May 2023 20:43:29 -0400
Subject: [PATCH 4/4] Clarified inline comment

---
 sklearn/cluster/_hdbscan/_tree.pyx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sklearn/cluster/_hdbscan/_tree.pyx b/sklearn/cluster/_hdbscan/_tree.pyx
index 25865efd4cf79..a1bc539a5db41 100644
--- a/sklearn/cluster/_hdbscan/_tree.pyx
+++ b/sklearn/cluster/_hdbscan/_tree.pyx
@@ -467,7 +467,8 @@ cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] _do_labelling(
         if cluster != root_cluster:
             label = cluster_label_map[cluster]
         elif len(clusters) == 1 and allow_single_cluster:
-            # There can only be one parent
+            # There can only be one edge with this particular child, hence this
+            # expression extracts a unique, scalar lambda value.
             parent_lambda = lambda_array[child_array == n]
             if cluster_selection_epsilon != 0.0:
                 threshold = 1 / cluster_selection_epsilon
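
For readers following the thresholding change above, the sketch below restates in plain NumPy the per-sample rule that these patches converge on for the single-cluster case. It is an illustration only and not part of the patch series: the structured field names and the -1 noise label mirror CONDENSED_dtype and NOISE from _tree.pyx, and the helper name label_single_cluster is made up for this example.

import numpy as np

# Toy condensed tree: every point (children 0-4) hangs off the root cluster
# (node 5), mirroring the fixture used in test_labelling_thresholding above.
condensed_tree = np.array(
    [(5, 2, 1.5, 1), (5, 1, 0.1, 1), (5, 0, 1.5, 1), (5, 3, 0.2, 1), (5, 4, 0.3, 1)],
    dtype=[
        ("parent", np.intp),
        ("child", np.intp),
        ("value", np.float64),
        ("cluster_size", np.intp),
    ],
)

def label_single_cluster(tree, cluster, cluster_selection_epsilon):
    """Keep a point in the lone cluster only if its lambda meets the
    threshold, otherwise mark it as noise (-1)."""
    labels = np.full(5, -1, dtype=np.intp)
    for n in range(5):
        # There is exactly one edge per child, so this is a scalar.
        parent_lambda = tree["value"][tree["child"] == n][0]
        if cluster_selection_epsilon != 0.0:
            threshold = 1 / cluster_selection_epsilon
        else:
            # Largest lambda among the siblings attached to this cluster.
            threshold = tree["value"][tree["parent"] == cluster].max()
        if parent_lambda >= threshold:
            labels[n] = 0
    return labels

print(label_single_cluster(condensed_tree, cluster=5, cluster_selection_epsilon=1.0))
# -> [ 0 -1  0 -1 -1]   (threshold is 1 / epsilon = 1.0)
print(label_single_cluster(condensed_tree, cluster=5, cluster_selection_epsilon=0.0))
# -> [ 0 -1  0 -1 -1]   (threshold is the max sibling lambda, 1.5)

Points whose lambda ("value") falls below 1 / cluster_selection_epsilon, or below the largest sibling lambda when epsilon is 0, are labelled noise, which is exactly what test_labelling_thresholding asserts.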