diff --git a/sklearn/cluster/_hdbscan/_tree.pyx b/sklearn/cluster/_hdbscan/_tree.pyx index cd79e41677c28..a1bc539a5db41 100644 --- a/sklearn/cluster/_hdbscan/_tree.pyx +++ b/sklearn/cluster/_hdbscan/_tree.pyx @@ -87,7 +87,7 @@ cdef list bfs_from_hierarchy( return result -cdef cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] _condense_tree( +cpdef cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] _condense_tree( const HIERARCHY_t[::1] hierarchy, cnp.intp_t min_cluster_size=10 ): @@ -402,31 +402,60 @@ cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labelling_at_cut( return result -cdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] do_labelling( - cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] hierarchy, +cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] _do_labelling( + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree, set clusters, dict cluster_label_map, cnp.intp_t allow_single_cluster, cnp.float64_t cluster_selection_epsilon ): + """Given a condensed tree, clusters and a labeling map for the clusters, + return an array containing the labels of each point based on cluster + membership. Note that this is where points may be marked as noisy + outliers. The determination of some points as noise is in large, single- + cluster datasets is controlled by the `allow_single_cluster` and + `cluster_selection_epsilon` parameters. + + Parameters + ---------- + condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype + Effectively an edgelist encoding a parent/child pair, along with a + value and the corresponding cluster_size in each row providing a tree + structure. + + clusters : set + The set of nodes corresponding to identified clusters. These node + values should be the same as those present in `condensed_tree`. + + cluster_label_map : dict + A mapping from the node values present in `clusters` to the labels + which will be returned. + + Returns + ------- + labels : ndarray of shape (n_samples,) + The cluster labels for each point in the data set; + a label of -1 denotes a noise assignment. + """ cdef: cnp.intp_t root_cluster cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] result - cnp.intp_t[:] parent_array, child_array - cnp.float64_t[:] lambda_array + cnp.ndarray[cnp.intp_t, ndim=1] parent_array, child_array + cnp.ndarray[cnp.float64_t, ndim=1] lambda_array TreeUnionFind union_find cnp.intp_t n, parent, child, cluster + cnp.float64_t threshold - child_array = hierarchy['child'] - parent_array = hierarchy['parent'] - lambda_array = hierarchy['value'] + child_array = condensed_tree['child'] + parent_array = condensed_tree['parent'] + lambda_array = condensed_tree['value'] root_cluster = np.min(parent_array) result = np.empty(root_cluster, dtype=np.intp) union_find = TreeUnionFind(np.max(parent_array) + 1) - for n in range(hierarchy.shape[0]): + for n in range(condensed_tree.shape[0]): child = child_array[n] parent = parent_array[n] if child not in clusters: @@ -434,24 +463,23 @@ cdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] do_labelling( for n in range(root_cluster): cluster = union_find.find(n) - if cluster < root_cluster: - result[n] = NOISE - elif cluster == root_cluster: - if len(clusters) == 1 and allow_single_cluster: - if cluster_selection_epsilon != 0.0: - if hierarchy['value'][hierarchy['child'] == n] >= 1 / cluster_selection_epsilon : - result[n] = cluster_label_map[cluster] - else: - result[n] = NOISE - elif hierarchy['value'][hierarchy['child'] == n] >= \ - hierarchy['value'][hierarchy['parent'] == cluster].max(): - result[n] = cluster_label_map[cluster] - else: - result[n] = NOISE + label = NOISE + if cluster != root_cluster: + label = cluster_label_map[cluster] + elif len(clusters) == 1 and allow_single_cluster: + # There can only be one edge with this particular child hence this + # expression extracts a unique, scalar lambda value. + parent_lambda = lambda_array[child_array == n] + if cluster_selection_epsilon != 0.0: + threshold = 1 / cluster_selection_epsilon else: - result[n] = NOISE - else: - result[n] = cluster_label_map[cluster] + # The threshold should be calculated per-sample based on the + # largest lambda of any simbling node. + threshold = lambda_array[parent_array == cluster].max() + if parent_lambda >= threshold: + label = cluster_label_map[cluster] + + result[n] = label return result @@ -536,7 +564,7 @@ cdef cnp.intp_t traverse_upwards( else: return leaf #return node closest to root - parent_eps = 1 / cluster_tree[cluster_tree['child'] == parent]['value'] + parent_eps = 1 / cluster_tree[cluster_tree['child'] == parent]['value'] if parent_eps > cluster_selection_epsilon: return parent else: @@ -729,7 +757,7 @@ cdef tuple _get_clusters( cluster_map = {c: n for n, c in enumerate(sorted(list(clusters)))} reverse_cluster_map = {n: c for c, n in cluster_map.items()} - labels = do_labelling( + labels = _do_labelling( condensed_tree, clusters, cluster_map, diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index a394c9c2f8a42..c3f6a9f7b0d23 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -16,6 +16,11 @@ from sklearn.utils import shuffle from sklearn.utils._testing import assert_allclose, assert_array_equal from sklearn.cluster._hdbscan.hdbscan import _OUTLIER_ENCODING +from sklearn.cluster._hdbscan._tree import ( + _do_labelling, + _condense_tree, + CONDENSED_dtype, +) n_clusters_true = 3 X, y = make_blobs(n_samples=200, random_state=10) @@ -380,3 +385,75 @@ def test_hdbscan_precomputed_dense_nan(): hdb = HDBSCAN(metric="precomputed") with pytest.raises(ValueError, match=msg): hdb.fit(X_nan) + + +@pytest.mark.parametrize("allow_single_cluster", [True, False]) +@pytest.mark.parametrize("epsilon", [0, 0.1]) +def test_labelling_distinct(global_random_seed, allow_single_cluster, epsilon): + n_samples = 48 + X, y = make_blobs( + n_samples, + random_state=global_random_seed, + # Ensure the clusters are distinct with no overlap + centers=[ + [0, 0], + [10, 0], + [0, 10], + ], + ) + + est = HDBSCAN().fit(X) + condensed_tree = _condense_tree( + est._single_linkage_tree_, min_cluster_size=est.min_cluster_size + ) + clusters = {n_samples + 2, n_samples + 3, n_samples + 4} + cluster_label_map = {n_samples + 2: 0, n_samples + 3: 1, n_samples + 4: 2} + labels = _do_labelling( + condensed_tree=condensed_tree, + clusters=clusters, + cluster_label_map=cluster_label_map, + allow_single_cluster=allow_single_cluster, + cluster_selection_epsilon=epsilon, + ) + + first_with_label = {_y: np.where(y == _y)[0][0] for _y in list(set(y))} + y_to_labels = {_y: labels[first_with_label[_y]] for _y in list(set(y))} + aligned_target = np.vectorize(y_to_labels.get)(y) + assert_array_equal(labels, aligned_target) + + +def test_labelling_thresholding(): + n_samples = 5 + MAX_LAMBDA = 1.5 + condensed_tree = np.array( + [ + (5, 2, MAX_LAMBDA, 1), + (5, 1, 0.1, 1), + (5, 0, MAX_LAMBDA, 1), + (5, 3, 0.2, 1), + (5, 4, 0.3, 1), + ], + dtype=CONDENSED_dtype, + ) + labels = _do_labelling( + condensed_tree=condensed_tree, + clusters={n_samples}, + cluster_label_map={n_samples: 0, n_samples + 1: 1}, + allow_single_cluster=True, + cluster_selection_epsilon=1, + ) + num_noise = condensed_tree["value"] < 1 + assert sum(num_noise) == sum(labels == -1) + + labels = _do_labelling( + condensed_tree=condensed_tree, + clusters={n_samples}, + cluster_label_map={n_samples: 0, n_samples + 1: 1}, + allow_single_cluster=True, + cluster_selection_epsilon=0, + ) + # The threshold should be calculated per-sample based on the largest + # lambda of any simbling node. In this case, all points are siblings + # and the largest value is exactly MAX_LAMBDA. + num_noise = condensed_tree["value"] < MAX_LAMBDA + assert sum(num_noise) == sum(labels == -1)