diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx index fd9888ac4da82..0f15f4eedbecb 100644 --- a/sklearn/cluster/_hdbscan/_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_linkage.pyx @@ -10,6 +10,8 @@ from libc.float cimport DBL_MAX import numpy as np from ...metrics._dist_metrics cimport DistanceMetric from ...cluster._hierarchical_fast cimport UnionFind +from ...cluster._hdbscan._tree cimport HIERARCHY_t +from ...cluster._hdbscan._tree import HIERARCHY_dtype from ...utils._typedefs cimport ITYPE_t, DTYPE_t from ...utils._typedefs import ITYPE, DTYPE @@ -188,7 +190,7 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix( return mst -cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] make_single_linkage(const MST_edge_t[::1] mst): +cpdef cnp.ndarray[HIERARCHY_t, ndim=1, mode="c"] make_single_linkage(const MST_edge_t[::1] mst): """Construct a single-linkage tree from an MST. Parameters @@ -209,16 +211,16 @@ cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] make_single_linkage(const MST - new cluster size """ cdef: - cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] single_linkage + cnp.ndarray[HIERARCHY_t, ndim=1, mode="c"] single_linkage # Note mst.shape[0] is one fewer than the number of samples cnp.int64_t n_samples = mst.shape[0] + 1 - cnp.int64_t current_node_cluster, next_node_cluster + cnp.intp_t current_node_cluster, next_node_cluster cnp.int64_t current_node, next_node, index cnp.float64_t distance UnionFind U = UnionFind(n_samples) - single_linkage = np.zeros((n_samples - 1, 4), dtype=np.float64) + single_linkage = np.zeros(n_samples - 1, dtype=HIERARCHY_dtype) for i in range(n_samples - 1): @@ -229,12 +231,10 @@ cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] make_single_linkage(const MST current_node_cluster = U.fast_find(current_node) next_node_cluster = U.fast_find(next_node) - # TODO: Update this to an array of structs (AoS). - # Should be done simultaneously in _tree.pyx to ensure compatability. 
- single_linkage[i][0] = current_node_cluster - single_linkage[i][1] = next_node_cluster - single_linkage[i][2] = distance - single_linkage[i][3] = U.size[current_node_cluster] + U.size[next_node_cluster] + single_linkage[i].left_node = current_node_cluster + single_linkage[i].right_node = next_node_cluster + single_linkage[i].value = distance + single_linkage[i].cluster_size = U.size[current_node_cluster] + U.size[next_node_cluster] U.union(current_node_cluster, next_node_cluster) diff --git a/sklearn/cluster/_hdbscan/_tree.pxd b/sklearn/cluster/_hdbscan/_tree.pxd new file mode 100644 index 0000000000000..f0ea4bdb3d899 --- /dev/null +++ b/sklearn/cluster/_hdbscan/_tree.pxd @@ -0,0 +1,16 @@ +cimport numpy as cnp + +# This corresponds to the scipy.cluster.hierarchy format +ctypedef packed struct HIERARCHY_t: + cnp.intp_t left_node + cnp.intp_t right_node + cnp.float64_t value + cnp.intp_t cluster_size + +# Effectively an edgelist encoding a parent/child pair, along with a value and +# the corresponding cluster_size in each row providing a tree structure. 
+ctypedef packed struct CONDENSED_t: + cnp.intp_t parent + cnp.intp_t child + cnp.float64_t value + cnp.intp_t cluster_size diff --git a/sklearn/cluster/_hdbscan/_tree.pyx b/sklearn/cluster/_hdbscan/_tree.pyx index 6e4df6cf12592..dcea00cbc8487 100644 --- a/sklearn/cluster/_hdbscan/_tree.pyx +++ b/sklearn/cluster/_hdbscan/_tree.pyx @@ -2,26 +2,64 @@ # Authors: Leland McInnes # License: 3-clause BSD -import numpy as np cimport numpy as cnp - +from libc.math cimport isinf import cython +import numpy as np cdef cnp.float64_t INFTY = np.inf cdef cnp.intp_t NOISE = -1 +HIERARCHY_dtype = np.dtype([ + ("left_node", np.intp), + ("right_node", np.intp), + ("value", np.float64), + ("cluster_size", np.intp), +]) + +CONDENSED_dtype = np.dtype([ + ("parent", np.intp), + ("child", np.intp), + ("value", np.float64), + ("cluster_size", np.intp), +]) + +cpdef tuple tree_to_labels( + const HIERARCHY_t[::1] single_linkage_tree, + cnp.intp_t min_cluster_size=10, + cluster_selection_method="eom", + bint allow_single_cluster=False, + cnp.float64_t cluster_selection_epsilon=0.0, + max_cluster_size=None, +): + cdef: + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree + cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labels + cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] probabilities + + condensed_tree = _condense_tree(single_linkage_tree, min_cluster_size) + labels, probabilities = _get_clusters( + condensed_tree, + _compute_stability(condensed_tree), + cluster_selection_method, + allow_single_cluster, + cluster_selection_epsilon, + max_cluster_size, + ) + + return (labels, probabilities) cdef list bfs_from_hierarchy( - cnp.ndarray[cnp.float64_t, ndim=2] hierarchy, + const HIERARCHY_t[::1] hierarchy, cnp.intp_t bfs_root ): """ Perform a breadth first search on a tree in scipy hclust format. 
""" - cdef list process_queue, next_queue + cdef list process_queue, next_queue, result cdef cnp.intp_t n_samples = hierarchy.shape[0] + 1 cdef cnp.intp_t node process_queue = [bfs_root] @@ -29,24 +67,28 @@ cdef list bfs_from_hierarchy( while process_queue: result.extend(process_queue) + # By construction, node i is formed by the union of nodes + # hierarchy[i - n_samples].left_node and hierarchy[i - n_samples].right_node process_queue = [ x - n_samples for x in process_queue if x >= n_samples ] if process_queue: - process_queue = ( - hierarchy[process_queue, :2] - .flatten() - .astype(np.intp) - .tolist() - ) - + next_queue = [] + for node in process_queue: + next_queue.extend( + [ + hierarchy[node].left_node, + hierarchy[node].right_node, + ] + ) + process_queue = next_queue return result -cpdef cnp.ndarray condense_tree( - cnp.ndarray[cnp.float64_t, ndim=2] hierarchy, +cdef cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] _condense_tree( + const HIERARCHY_t[::1] hierarchy, cnp.intp_t min_cluster_size=10 ): """Condense a tree according to a minimum cluster size. This is akin @@ -57,7 +99,7 @@ cpdef cnp.ndarray condense_tree( Parameters ---------- - hierarchy : ndarray (n_samples - 1, 4) + hierarchy : ndarray of shape (n_samples - 1,), dtype=HIERARCHY_dtype A single linkage hierarchy in scipy.cluster.hierarchy format. min_cluster_size : int, optional (default 10) @@ -66,9 +108,10 @@ cpdef cnp.ndarray condense_tree( Returns ------- - condensed_tree : numpy recarray - Effectively an edgelist with a parent, child, lambda_val - and child_size in each row providing a tree structure. + condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype + Effectively an edgelist encoding a parent/child pair, along with a + value and the corresponding cluster_size in each row providing a tree + structure. 
""" cdef: @@ -83,6 +126,8 @@ cpdef cnp.ndarray condense_tree( cnp.intp_t node, sub_node, left, right cnp.float64_t lambda_value, distance cnp.intp_t left_count, right_count + HIERARCHY_t children + relabel = np.empty(root + 1, dtype=np.intp) relabel[root] = n_samples result_list = [] @@ -93,21 +138,21 @@ cpdef cnp.ndarray condense_tree( continue children = hierarchy[node - n_samples] - left = children[0] - right = children[1] - distance = children[2] + left = children.left_node + right = children.right_node + distance = children.value if distance > 0.0: lambda_value = 1.0 / distance else: lambda_value = INFTY if left >= n_samples: - left_count = hierarchy[left - n_samples][3] + left_count = hierarchy[left - n_samples].cluster_size else: left_count = 1 if right >= n_samples: - right_count = hierarchy[right - n_samples][3] + right_count = hierarchy[right - n_samples].cluster_size else: right_count = 1 @@ -157,30 +202,30 @@ cpdef cnp.ndarray condense_tree( ) ignore[sub_node] = True - return np.array(result_list, dtype=[('parent', np.intp), - ('child', np.intp), - ('lambda_val', np.float64), - ('child_size', np.intp)]) + return np.array(result_list, dtype=CONDENSED_dtype) -cpdef dict compute_stability(cnp.ndarray condensed_tree): +cdef dict _compute_stability( + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree +): cdef: cnp.float64_t[::1] result, births - cnp.ndarray condensed_node cnp.intp_t[:] parents = condensed_tree['parent'] - cnp.float64_t[:] lambdas = condensed_tree['lambda_val'] - cnp.intp_t[:] sizes = condensed_tree['child_size'] + cnp.float64_t[:] lambdas = condensed_tree['value'] + cnp.intp_t[:] sizes = condensed_tree['cluster_size'] cnp.intp_t parent, cluster_size, result_index - cnp.float64_t lambda_val + cnp.float64_t lambda_val, child_size cnp.float64_t[:, :] result_pre_dict cnp.intp_t largest_child = condensed_tree['child'].max() cnp.intp_t smallest_cluster = np.min(parents) cnp.intp_t num_clusters = np.max(parents) - smallest_cluster + 1 - 
cnp.ndarray sorted_child_data = np.sort(condensed_tree[['child', 'lambda_val']], axis=0) + cnp.ndarray sorted_child_data = np.sort(condensed_tree[['child', 'value']], axis=0) cnp.intp_t[:] sorted_children = sorted_child_data['child'].copy() - cnp.float64_t[:] sorted_lambdas = sorted_child_data['lambda_val'].copy() + cnp.float64_t[:] sorted_lambdas = sorted_child_data['value'].copy() + cnp.intp_t child, current_child = -1 + cnp.float64_t min_lambda = 0 largest_child = max(largest_child, smallest_cluster) births = np.full(largest_child + 1, np.nan, dtype=np.float64) @@ -188,9 +233,7 @@ cpdef dict compute_stability(cnp.ndarray condensed_tree): if largest_child < smallest_cluster: largest_child = smallest_cluster - births = np.nan * np.ones(largest_child + 1, dtype=np.float64) - current_child = -1 - min_lambda = 0 + births = np.full(largest_child + 1, np.nan, dtype=np.float64) for idx in range(condensed_tree.shape[0]): child = sorted_children[idx] lambda_val = sorted_lambdas[idx] @@ -229,10 +272,10 @@ cpdef dict compute_stability(cnp.ndarray condensed_tree): return dict(result_pre_dict) -cdef list bfs_from_cluster_tree(cnp.ndarray hierarchy, cnp.intp_t bfs_root): +cdef list bfs_from_cluster_tree(cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] hierarchy, cnp.intp_t bfs_root): cdef list result - cdef cnp.ndarray[cnp.intp_t, ndim=1] to_process + cdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] to_process result = [] to_process = np.array([bfs_root], dtype=np.intp) @@ -244,20 +287,21 @@ cdef list bfs_from_cluster_tree(cnp.ndarray hierarchy, cnp.intp_t bfs_root): return result -cdef max_lambdas(cnp.ndarray hierarchy): +cdef cnp.float64_t[::1] max_lambdas(cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] hierarchy): cdef: cnp.ndarray sorted_parent_data cnp.intp_t[:] sorted_parents - cnp.float64_t[:] sorted_lambdas, deaths + cnp.float64_t[:] sorted_lambdas + cnp.float64_t[::1] deaths cnp.intp_t parent, current_parent cnp.float64_t lambda_val, max_lambda cnp.intp_t largest_parent = 
hierarchy['parent'].max() - sorted_parent_data = np.sort(hierarchy[['parent', 'lambda_val']], axis=0) + sorted_parent_data = np.sort(hierarchy[['parent', 'value']], axis=0) deaths = np.zeros(largest_parent + 1, dtype=np.float64) sorted_parents = sorted_parent_data['parent'] - sorted_lambdas = sorted_parent_data['lambda_val'] + sorted_lambdas = sorted_parent_data['value'] current_parent = -1 max_lambda = 0 @@ -315,8 +359,8 @@ cdef class TreeUnionFind: return self.data[x, 0] -cpdef cnp.ndarray[cnp.intp_t, ndim=1] labelling_at_cut( - cnp.ndarray linkage, +cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labelling_at_cut( + const HIERARCHY_t[::1] linkage, cnp.float64_t cut, cnp.intp_t min_cluster_size ): @@ -327,7 +371,7 @@ cpdef cnp.ndarray[cnp.intp_t, ndim=1] labelling_at_cut( Parameters ---------- - linkage : ndarray (n_samples - 1, 4) + linkage : ndarray of shape (n_samples - 1,), dtype=HIERARCHY_dtype The single linkage tree in scipy.cluster.hierarchy format. cut : double @@ -345,21 +389,23 @@ cpdef cnp.ndarray[cnp.intp_t, ndim=1] labelling_at_cut( """ cdef: - cnp.intp_t n, cluster, cluster_id, root, n_samples - cnp.ndarray[cnp.intp_t, ndim=1] result - cnp.intp_t[:] unique_labels, cluster_size + cnp.intp_t n, cluster, cluster_id, root, n_samples, cluster_label + cnp.intp_t[::1] unique_labels, cluster_size + cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] result TreeUnionFind union_find + dict cluster_label_map + HIERARCHY_t node root = 2 * linkage.shape[0] n_samples = root // 2 + 1 result = np.empty(n_samples, dtype=np.intp) - union_find = TreeUnionFind( root + 1) + union_find = TreeUnionFind(root + 1) cluster = n_samples - for row in linkage: - if row[2] < cut: - union_find.union( row[0], cluster) - union_find.union( row[1], cluster) + for node in linkage: + if node.value < cut: + union_find.union(node.left_node, cluster) + union_find.union(node.right_node, cluster) cluster += 1 cluster_size = np.zeros(cluster, dtype=np.intp) @@ -385,8 +431,8 @@ cpdef 
cnp.ndarray[cnp.intp_t, ndim=1] labelling_at_cut( return result -cdef cnp.ndarray[cnp.intp_t, ndim=1] do_labelling( - cnp.ndarray hierarchy, +cdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] do_labelling( + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] hierarchy, set clusters, dict cluster_label_map, cnp.intp_t allow_single_cluster, @@ -395,7 +441,7 @@ cdef cnp.ndarray[cnp.intp_t, ndim=1] do_labelling( cdef: cnp.intp_t root_cluster - cnp.ndarray[cnp.intp_t, ndim=1] result + cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] result cnp.intp_t[:] parent_array, child_array cnp.float64_t[:] lambda_array TreeUnionFind union_find @@ -403,7 +449,7 @@ cdef cnp.ndarray[cnp.intp_t, ndim=1] do_labelling( child_array = hierarchy['child'] parent_array = hierarchy['parent'] - lambda_array = hierarchy['lambda_val'] + lambda_array = hierarchy['value'] root_cluster = np.min(parent_array) result = np.empty(root_cluster, dtype=np.intp) @@ -422,12 +468,12 @@ cdef cnp.ndarray[cnp.intp_t, ndim=1] do_labelling( elif cluster == root_cluster: if len(clusters) == 1 and allow_single_cluster: if cluster_selection_epsilon != 0.0: - if hierarchy['lambda_val'][hierarchy['child'] == n] >= 1 / cluster_selection_epsilon : + if hierarchy['value'][hierarchy['child'] == n] >= 1 / cluster_selection_epsilon : result[n] = cluster_label_map[cluster] else: result[n] = NOISE - elif hierarchy['lambda_val'][hierarchy['child'] == n] >= \ - hierarchy['lambda_val'][hierarchy['parent'] == cluster].max(): + elif hierarchy['value'][hierarchy['child'] == n] >= \ + hierarchy['value'][hierarchy['parent'] == cluster].max(): result[n] = cluster_label_map[cluster] else: result[n] = NOISE @@ -439,25 +485,29 @@ cdef cnp.ndarray[cnp.intp_t, ndim=1] do_labelling( return result -cdef get_probabilities(cnp.ndarray hierarchy, dict cluster_map, cnp.ndarray labels): +cdef cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] get_probabilities( + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree, + dict cluster_map, + cnp.intp_t[::1] labels +): 
cdef: - cnp.ndarray[cnp.float64_t, ndim=1] result + cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] result cnp.float64_t[:] lambda_array cnp.float64_t[::1] deaths cnp.intp_t[:] child_array, parent_array cnp.intp_t root_cluster, n, point, cluster_num, cluster cnp.float64_t max_lambda, lambda_val - child_array = hierarchy['child'] - parent_array = hierarchy['parent'] - lambda_array = hierarchy['lambda_val'] + child_array = condensed_tree['child'] + parent_array = condensed_tree['parent'] + lambda_array = condensed_tree['value'] result = np.zeros(labels.shape[0]) - deaths = max_lambdas(hierarchy) + deaths = max_lambdas(condensed_tree) root_cluster = np.min(parent_array) - for n in range(hierarchy.shape[0]): + for n in range(condensed_tree.shape[0]): point = child_array[n] if point >= root_cluster: continue @@ -468,7 +518,7 @@ cdef get_probabilities(cnp.ndarray hierarchy, dict cluster_map, cnp.ndarray labe cluster = cluster_map[cluster_num] max_lambda = deaths[cluster] - if max_lambda == 0.0 or not np.isfinite(lambda_array[n]): + if max_lambda == 0.0 or isinf(lambda_array[n]): result[point] = 1.0 else: lambda_val = min(lambda_array[n], max_lambda) @@ -477,7 +527,10 @@ cdef get_probabilities(cnp.ndarray hierarchy, dict cluster_map, cnp.ndarray labe return result -cpdef list recurse_leaf_dfs(cnp.ndarray cluster_tree, cnp.intp_t current_node): +cpdef list recurse_leaf_dfs( + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree, + cnp.intp_t current_node +): cdef cnp.intp_t[:] children cdef cnp.intp_t child @@ -488,7 +541,7 @@ cpdef list recurse_leaf_dfs(cnp.ndarray cluster_tree, cnp.intp_t current_node): return sum([recurse_leaf_dfs(cluster_tree, child) for child in children], []) -cpdef list get_cluster_tree_leaves(cnp.ndarray cluster_tree): +cpdef list get_cluster_tree_leaves(cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree): cdef cnp.intp_t root if cluster_tree.shape[0] == 0: return [] @@ -496,7 +549,7 @@ cpdef list get_cluster_tree_leaves(cnp.ndarray 
cluster_tree): return recurse_leaf_dfs(cluster_tree, root) cdef cnp.intp_t traverse_upwards( - cnp.ndarray cluster_tree, + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree, cnp.float64_t cluster_selection_epsilon, cnp.intp_t leaf, cnp.intp_t allow_single_cluster @@ -512,7 +565,7 @@ cdef cnp.intp_t traverse_upwards( else: return leaf #return node closest to root - parent_eps = 1/cluster_tree[cluster_tree['child'] == parent]['lambda_val'] + parent_eps = 1 / cluster_tree[cluster_tree['child'] == parent]['value'] if parent_eps > cluster_selection_epsilon: return parent else: @@ -525,7 +578,7 @@ cdef cnp.intp_t traverse_upwards( cdef set epsilon_search( set leaves, - cnp.ndarray cluster_tree, + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree, cnp.float64_t cluster_selection_epsilon, cnp.intp_t allow_single_cluster ): @@ -534,9 +587,13 @@ cdef set epsilon_search( list processed = list() cnp.intp_t leaf, epsilon_child, sub_node cnp.float64_t eps + cnp.uint8_t[:] leaf_nodes + cnp.ndarray[cnp.intp_t, ndim=1] children = cluster_tree['child'] + cnp.ndarray[cnp.float64_t, ndim=1] distances = cluster_tree['value'] for leaf in leaves: - eps = 1/cluster_tree['lambda_val'][cluster_tree['child'] == leaf][0] + leaf_nodes = children == leaf + eps = 1 / distances[leaf_nodes][0] if eps < cluster_selection_epsilon: if leaf not in processed: epsilon_child = traverse_upwards( @@ -556,8 +613,8 @@ cdef set epsilon_search( return set(selected_clusters) @cython.wraparound(True) -cpdef tuple get_clusters( - cnp.ndarray hierarchy, +cdef tuple _get_clusters( + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree, dict stability, cluster_selection_method='eom', cnp.uint8_t allow_single_cluster=False, @@ -570,8 +627,10 @@ cpdef tuple get_clusters( Parameters ---------- - tree : numpy recarray - The condensed tree to extract flat clusters from + condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype + Effectively an edgelist encoding a parent/child pair, along 
with a + value and the corresponding cluster_size in each row providing a tree + structure. stability : dict A dictionary mapping cluster_ids to stability values @@ -606,13 +665,13 @@ cpdef tuple get_clusters( """ cdef: list node_list - cnp.ndarray cluster_tree - cnp.uint8_t[:] child_selection - cnp.ndarray[cnp.intp_t, ndim=1] labels + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree + cnp.uint8_t[::1] child_selection + cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labels dict is_cluster, cluster_sizes cnp.float64_t subtree_stability, max_lambda cnp.intp_t node, sub_node, cluster, n_samples - cnp.ndarray[cnp.float64_t, ndim=1] probs + cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] probs # Assume clusters are ordered by numeric id equivalent to # a topological sort of the tree; This is valid given the @@ -624,19 +683,19 @@ cpdef tuple get_clusters( node_list = sorted(stability.keys(), reverse=True)[:-1] # (exclude root) - cluster_tree = hierarchy[hierarchy['child_size'] > 1] + cluster_tree = condensed_tree[condensed_tree['cluster_size'] > 1] is_cluster = {cluster: True for cluster in node_list} - n_samples = np.max(hierarchy[hierarchy['child_size'] == 1]['child']) + 1 - max_lambda = np.max(hierarchy['lambda_val']) + n_samples = np.max(condensed_tree[condensed_tree['cluster_size'] == 1]['child']) + 1 + max_lambda = np.max(condensed_tree['value']) if max_cluster_size is None: max_cluster_size = n_samples + 1 # Set to a value that will never be triggered - cluster_sizes = {child: child_size for child, child_size - in zip(cluster_tree['child'], cluster_tree['child_size'])} + cluster_sizes = {child: cluster_size for child, cluster_size + in zip(cluster_tree['child'], cluster_tree['cluster_size'])} if allow_single_cluster: # Compute cluster size for the root node cluster_sizes[node_list[-1]] = np.sum( - cluster_tree[cluster_tree['parent'] == node_list[-1]]['child_size']) + cluster_tree[cluster_tree['parent'] == node_list[-1]]['cluster_size']) if cluster_selection_method 
== 'eom': for node in node_list: @@ -677,7 +736,7 @@ cpdef tuple get_clusters( if len(leaves) == 0: for c in is_cluster: is_cluster[c] = False - is_cluster[hierarchy['parent'].min()] = True + is_cluster[condensed_tree['parent'].min()] = True if cluster_selection_epsilon != 0.0: selected_clusters = epsilon_search( @@ -700,12 +759,12 @@ cpdef tuple get_clusters( reverse_cluster_map = {n: c for c, n in cluster_map.items()} labels = do_labelling( - hierarchy, + condensed_tree, clusters, cluster_map, allow_single_cluster, cluster_selection_epsilon ) - probs = get_probabilities(hierarchy, reverse_cluster_map, labels) + probs = get_probabilities(condensed_tree, reverse_cluster_map, labels) return (labels, probs) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 947ae918c93a8..c55f8913024ae 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -28,7 +28,8 @@ mst_from_data_matrix, MST_edge_dtype, ) -from ._tree import compute_stability, condense_tree, get_clusters, labelling_at_cut +from ._tree import tree_to_labels, labelling_at_cut +from ._tree import HIERARCHY_dtype FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics @@ -83,31 +84,6 @@ def _brute_mst(mutual_reachability, min_samples): return mst -def _tree_to_labels( - single_linkage_tree, - min_cluster_size=10, - cluster_selection_method="eom", - allow_single_cluster=False, - cluster_selection_epsilon=0.0, - max_cluster_size=None, -): - """Converts a pretrained tree and cluster size into a - set of labels and probabilities. 
- """ - condensed_tree = condense_tree(single_linkage_tree, min_cluster_size) - stability_dict = compute_stability(condensed_tree) - labels, probabilities = get_clusters( - condensed_tree, - stability_dict, - cluster_selection_method, - allow_single_cluster, - cluster_selection_epsilon, - max_cluster_size, - ) - - return (labels, probabilities, single_linkage_tree) - - def _process_mst(min_spanning_tree): # Sort edges of the min_spanning_tree by weight row_order = np.argsort(min_spanning_tree["distance"]) @@ -221,24 +197,29 @@ def remap_single_linkage_tree(tree, internal_to_raw, non_finite): finite_count = len(internal_to_raw) outlier_count = len(non_finite) - for i, (left, right, *_) in enumerate(tree): + for i, _ in enumerate(tree): + left = tree[i]["left_node"] + right = tree[i]["right_node"] + if left < finite_count: - tree[i, 0] = internal_to_raw[left] + tree[i]["left_node"] = internal_to_raw[left] else: - tree[i, 0] = left + outlier_count + tree[i]["left_node"] = left + outlier_count if right < finite_count: - tree[i, 1] = internal_to_raw[right] + tree[i]["right_node"] = internal_to_raw[right] else: - tree[i, 1] = right + outlier_count + tree[i]["right_node"] = right + outlier_count - outlier_tree = np.zeros((len(non_finite), 4)) - last_cluster_id = tree[tree.shape[0] - 1][0:2].max() - last_cluster_size = tree[tree.shape[0] - 1][3] + outlier_tree = np.zeros(len(non_finite), dtype=HIERARCHY_dtype) + last_cluster_id = max( + tree[tree.shape[0] - 1]["left_node"], tree[tree.shape[0] - 1]["right_node"] + ) + last_cluster_size = tree[tree.shape[0] - 1]["cluster_size"] for i, outlier in enumerate(non_finite): outlier_tree[i] = (outlier, last_cluster_id + 1, np.inf, last_cluster_size + 1) last_cluster_id += 1 last_cluster_size += 1 - tree = np.vstack([tree, outlier_tree]) + tree = np.concatenate([tree, outlier_tree]) return tree @@ -658,14 +639,10 @@ def fit(self, X, y=None): kwargs["algo"] = "ball_tree" kwargs["leaf_size"] = self.leaf_size - single_linkage_tree = 
mst_func(**kwargs) + self._single_linkage_tree_ = mst_func(**kwargs) - ( - self.labels_, - self.probabilities_, + self.labels_, self.probabilities_ = tree_to_labels( self._single_linkage_tree_, - ) = _tree_to_labels( - single_linkage_tree, self.min_cluster_size, self.cluster_selection_method, self.allow_single_cluster,