From e8a56a28b05fce5981ac00cbb6975c3e58c7dead Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Sun, 12 Mar 2023 10:45:17 -0400 Subject: [PATCH 1/8] Updated hdbscan submodule with new HIERARCHY types --- sklearn/cluster/_hdbscan/_linkage.pyx | 18 ++- sklearn/cluster/_hdbscan/_tree.pxd | 9 ++ sklearn/cluster/_hdbscan/_tree.pyx | 222 ++++++++++++++------------ sklearn/cluster/_hdbscan/hdbscan.py | 19 ++- 4 files changed, 152 insertions(+), 116 deletions(-) create mode 100644 sklearn/cluster/_hdbscan/_tree.pxd diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx index fd9888ac4da82..85a00e7dd27be 100644 --- a/sklearn/cluster/_hdbscan/_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_linkage.pyx @@ -10,6 +10,8 @@ from libc.float cimport DBL_MAX import numpy as np from ...metrics._dist_metrics cimport DistanceMetric from ...cluster._hierarchical_fast cimport UnionFind +from ...cluster._hdbscan._tree cimport HIERARCHY_t +from ...cluster._hdbscan._tree import HIERARCHY_dtype from ...utils._typedefs cimport ITYPE_t, DTYPE_t from ...utils._typedefs import ITYPE, DTYPE @@ -188,7 +190,7 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix( return mst -cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] make_single_linkage(const MST_edge_t[::1] mst): +cpdef cnp.ndarray[HIERARCHY_t, ndim=1] make_single_linkage(const MST_edge_t[::1] mst): """Construct a single-linkage tree from an MST. Parameters @@ -209,16 +211,16 @@ cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] make_single_linkage(const MST - new cluster size """ cdef: - cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] single_linkage + cnp.ndarray[HIERARCHY_t, ndim=1] single_linkage # Note mst.shape[0] is one fewer than the number of samples cnp.int64_t n_samples = mst.shape[0] + 1 - cnp.int64_t current_node_cluster, next_node_cluster cnp.int64_t current_node, next_node, index + cnp.intp_t current_node_cluster, next_node_cluster cnp.float64_t distance UnionFind U = UnionFind(n_samples) - single_linkage = np.zeros((n_samples - 1, 4), dtype=np.float64) + single_linkage = np.zeros(n_samples - 1, dtype=HIERARCHY_dtype) for i in range(n_samples - 1): @@ -231,10 +233,10 @@ cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] make_single_linkage(const MST # TODO: Update this to an array of structs (AoS). # Should be done simultaneously in _tree.pyx to ensure compatability. - single_linkage[i][0] = current_node_cluster - single_linkage[i][1] = next_node_cluster - single_linkage[i][2] = distance - single_linkage[i][3] = U.size[current_node_cluster] + U.size[next_node_cluster] + single_linkage[i].left_node = current_node_cluster + single_linkage[i].right_node = next_node_cluster + single_linkage[i].value = distance + single_linkage[i].cluster_size = U.size[current_node_cluster] + U.size[next_node_cluster] U.union(current_node_cluster, next_node_cluster) diff --git a/sklearn/cluster/_hdbscan/_tree.pxd b/sklearn/cluster/_hdbscan/_tree.pxd new file mode 100644 index 0000000000000..83d5b38cb99fb --- /dev/null +++ b/sklearn/cluster/_hdbscan/_tree.pxd @@ -0,0 +1,9 @@ +cimport numpy as cnp +import numpy as np + + +ctypedef packed struct HIERARCHY_t: + cnp.intp_t left_node + cnp.intp_t right_node + cnp.float64_t value + cnp.intp_t cluster_size diff --git a/sklearn/cluster/_hdbscan/_tree.pyx b/sklearn/cluster/_hdbscan/_tree.pyx index 0e493f28379eb..a956c622cbbbc 100644 --- a/sklearn/cluster/_hdbscan/_tree.pyx +++ b/sklearn/cluster/_hdbscan/_tree.pyx @@ -2,19 +2,25 @@ # Authors: Leland McInnes # License: 3-clause BSD -import numpy as np cimport numpy as cnp +from libc.math cimport isinf +from cython import wraparound -import cython - +import numpy as np cdef cnp.float64_t INFTY = np.inf cdef cnp.intp_t NOISE = -1 +HIERARCHY_dtype = np.dtype([ + ("left_node", np.intp), + ("right_node", np.intp), + ("value", np.float64), + ("cluster_size", np.intp), +]) cdef list bfs_from_hierarchy( - cnp.ndarray[cnp.float64_t, ndim=2] hierarchy, + cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] hierarchy, cnp.intp_t bfs_root ): """ @@ -29,24 +35,28 @@ cdef list bfs_from_hierarchy( while process_queue: result.extend(process_queue) + # By construction, node i is formed by the union of nodes + # hierarchy[i - n_samples, 0] and hierarchy[i - n_samples, 1] process_queue = [ x - n_samples for x in process_queue if x >= n_samples ] if process_queue: - process_queue = ( - hierarchy[process_queue, :2] - .flatten() - .astype(np.intp) - .tolist() - ) - + next_queue = [] + for node in process_queue: + next_queue.extend( + [ + hierarchy[node].left_node, + hierarchy[node].right_node, + ] + ) + process_queue = next_queue return result -cpdef cnp.ndarray condense_tree( - cnp.ndarray[cnp.float64_t, ndim=2] hierarchy, +cpdef cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] condense_tree( + cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] hierarchy, cnp.intp_t min_cluster_size=10 ): """Condense a tree according to a minimum cluster size. This is akin @@ -57,7 +67,7 @@ cpdef cnp.ndarray condense_tree( Parameters ---------- - hierarchy : ndarray (n_samples - 1, 4) + hierarchy : ndarray of shape (n_samples,), dtype=HIERARCHY_dtype A single linkage hierarchy in scipy.cluster.hierarchy format. min_cluster_size : int, optional (default 10) @@ -66,9 +76,9 @@ cpdef cnp.ndarray condense_tree( Returns ------- - condensed_tree : numpy recarray + condensed_tree : ndarray of shape (n_samples,), dtype=HIERARCHY_dtype Effectively an edgelist with a parent, child, lambda_val - and child_size in each row providing a tree structure. + and cluster_size in each row providing a tree structure. """ cdef: @@ -83,6 +93,8 @@ cpdef cnp.ndarray condense_tree( cnp.intp_t node, sub_node, left, right cnp.float64_t lambda_value, distance cnp.intp_t left_count, right_count + HIERARCHY_t children + relabel = np.empty(root + 1, dtype=np.intp) relabel[root] = n_samples result_list = [] @@ -93,21 +105,21 @@ cpdef cnp.ndarray condense_tree( continue children = hierarchy[node - n_samples] - left = children[0] - right = children[1] - distance = children[2] + left = children.left_node + right = children.right_node + distance = children.value if distance > 0.0: lambda_value = 1.0 / distance else: lambda_value = INFTY if left >= n_samples: - left_count = hierarchy[left - n_samples][3] + left_count = hierarchy[left - n_samples].cluster_size else: left_count = 1 if right >= n_samples: - right_count = hierarchy[right - n_samples][3] + right_count = hierarchy[right - n_samples].cluster_size else: right_count = 1 @@ -157,30 +169,28 @@ cpdef cnp.ndarray condense_tree( ) ignore[sub_node] = True - return np.array(result_list, dtype=[('parent', np.intp), - ('child', np.intp), - ('lambda_val', np.float64), - ('child_size', np.intp)]) + return np.array(result_list, dtype=HIERARCHY_dtype) -cpdef dict compute_stability(cnp.ndarray condensed_tree): +cpdef dict compute_stability(cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] condensed_tree): cdef: cnp.float64_t[::1] result, births - cnp.ndarray condensed_node - cnp.intp_t[:] parents = condensed_tree['parent'] - cnp.float64_t[:] lambdas = condensed_tree['lambda_val'] - cnp.intp_t[:] sizes = condensed_tree['child_size'] + cnp.intp_t[:] parents = condensed_tree['left_node'] + cnp.float64_t[:] lambdas = condensed_tree['value'] + cnp.intp_t[:] sizes = condensed_tree['cluster_size'] cnp.intp_t parent, cluster_size, result_index - cnp.float64_t lambda_val + cnp.float64_t lambda_val, child_size cnp.float64_t[:, :] result_pre_dict - cnp.intp_t largest_child = condensed_tree['child'].max() + cnp.intp_t largest_child = condensed_tree['right_node'].max() cnp.intp_t smallest_cluster = np.min(parents) cnp.intp_t num_clusters = np.max(parents) - smallest_cluster + 1 - cnp.ndarray sorted_child_data = np.sort(condensed_tree[['child', 'lambda_val']], axis=0) - cnp.intp_t[:] sorted_children = sorted_child_data['child'].copy() - cnp.float64_t[:] sorted_lambdas = sorted_child_data['lambda_val'].copy() + cnp.ndarray sorted_child_data = np.sort(condensed_tree[['right_node', 'value']], axis=0) + cnp.intp_t[:] sorted_children = sorted_child_data['right_node'].copy() + cnp.float64_t[:] sorted_lambdas = sorted_child_data['value'].copy() + cnp.intp_t child, current_child = -1 + cnp.float64_t min_lambda = 0 largest_child = max(largest_child, smallest_cluster) births = np.full(largest_child + 1, np.nan, dtype=np.float64) @@ -188,9 +198,7 @@ cpdef dict compute_stability(cnp.ndarray condensed_tree): if largest_child < smallest_cluster: largest_child = smallest_cluster - births = np.nan * np.ones(largest_child + 1, dtype=np.float64) - current_child = -1 - min_lambda = 0 + births = np.full(largest_child + 1, np.nan, dtype=np.float64) for idx in range(condensed_tree.shape[0]): child = sorted_children[idx] lambda_val = sorted_lambdas[idx] @@ -229,35 +237,36 @@ cpdef dict compute_stability(cnp.ndarray condensed_tree): return dict(result_pre_dict) -cdef list bfs_from_cluster_tree(cnp.ndarray hierarchy, cnp.intp_t bfs_root): +cdef list bfs_from_cluster_tree(cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] hierarchy, cnp.intp_t bfs_root): cdef list result - cdef cnp.ndarray[cnp.intp_t, ndim=1] to_process + cdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] to_process result = [] to_process = np.array([bfs_root], dtype=np.intp) while to_process.shape[0] > 0: result.extend(to_process.tolist()) - to_process = hierarchy['child'][np.in1d(hierarchy['parent'], to_process)] + to_process = hierarchy['right_node'][np.in1d(hierarchy['left_node'], to_process)] return result -cdef max_lambdas(cnp.ndarray hierarchy): +cdef cnp.float64_t[::1] max_lambdas(cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] hierarchy): cdef: cnp.ndarray sorted_parent_data cnp.intp_t[:] sorted_parents - cnp.float64_t[:] sorted_lambdas, deaths + cnp.float64_t[:] sorted_lambdas + cnp.float64_t[::1] deaths cnp.intp_t parent, current_parent cnp.float64_t lambda_val, max_lambda - cnp.intp_t largest_parent = hierarchy['parent'].max() + cnp.intp_t largest_parent = hierarchy['left_node'].max() - sorted_parent_data = np.sort(hierarchy[['parent', 'lambda_val']], axis=0) + sorted_parent_data = np.sort(hierarchy[['left_node', 'value']], axis=0) deaths = np.zeros(largest_parent + 1, dtype=np.float64) - sorted_parents = sorted_parent_data['parent'] - sorted_lambdas = sorted_parent_data['lambda_val'] + sorted_parents = sorted_parent_data['left_node'] + sorted_lambdas = sorted_parent_data['value'] current_parent = -1 max_lambda = 0 @@ -314,12 +323,12 @@ cdef class TreeUnionFind (object): self.is_component[x] = False return self._data[x, 0] - cdef cnp.ndarray[cnp.intp_t, ndim=1] components(self): + cdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] components(self): return self.is_component.nonzero()[0] -cpdef cnp.ndarray[cnp.intp_t, ndim=1] labelling_at_cut( - cnp.ndarray linkage, +cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labelling_at_cut( + HIERARCHY_t[::1] linkage, cnp.float64_t cut, cnp.intp_t min_cluster_size ): @@ -330,7 +339,7 @@ cpdef cnp.ndarray[cnp.intp_t, ndim=1] labelling_at_cut( Parameters ---------- - linkage : ndarray (n_samples - 1, 4) + linkage : ndarray of shape (n_samples,), dtype=HIERARCHY_dtype The single linkage tree in scipy.cluster.hierarchy format. cut : double @@ -348,21 +357,23 @@ cpdef cnp.ndarray[cnp.intp_t, ndim=1] labelling_at_cut( """ cdef: - cnp.intp_t n, cluster, cluster_id, root, n_samples - cnp.ndarray[cnp.intp_t, ndim=1] result - cnp.intp_t[:] unique_labels, cluster_size + cnp.intp_t n, cluster, cluster_id, root, n_samples, cluster_label + cnp.intp_t[::1] unique_labels, cluster_size + cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] result TreeUnionFind union_find + dict cluster_label_map + HIERARCHY_t node root = 2 * linkage.shape[0] n_samples = root // 2 + 1 result = np.empty(n_samples, dtype=np.intp) - union_find = TreeUnionFind( root + 1) + union_find = TreeUnionFind(root + 1) cluster = n_samples - for row in linkage: - if row[2] < cut: - union_find.union_( row[0], cluster) - union_find.union_( row[1], cluster) + for node in linkage: + if node.value < cut: + union_find.union_(node.left_node, cluster) + union_find.union_(node.right_node, cluster) cluster += 1 cluster_size = np.zeros(cluster, dtype=np.intp) @@ -388,8 +399,8 @@ cpdef cnp.ndarray[cnp.intp_t, ndim=1] labelling_at_cut( return result -cdef cnp.ndarray[cnp.intp_t, ndim=1] do_labelling( - cnp.ndarray hierarchy, +cdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] do_labelling( + cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] hierarchy, set clusters, dict cluster_label_map, cnp.intp_t allow_single_cluster, @@ -398,15 +409,15 @@ cdef cnp.ndarray[cnp.intp_t, ndim=1] do_labelling( cdef: cnp.intp_t root_cluster - cnp.ndarray[cnp.intp_t, ndim=1] result + cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] result cnp.intp_t[:] parent_array, child_array cnp.float64_t[:] lambda_array TreeUnionFind union_find cnp.intp_t n, parent, child, cluster - child_array = hierarchy['child'] - parent_array = hierarchy['parent'] - lambda_array = hierarchy['lambda_val'] + child_array = hierarchy['right_node'] + parent_array = hierarchy['left_node'] + lambda_array = hierarchy['value'] root_cluster = np.min(parent_array) result = np.empty(root_cluster, dtype=np.intp) @@ -425,12 +436,12 @@ cdef cnp.ndarray[cnp.intp_t, ndim=1] do_labelling( elif cluster == root_cluster: if len(clusters) == 1 and allow_single_cluster: if cluster_selection_epsilon != 0.0: - if hierarchy['lambda_val'][hierarchy['child'] == n] >= 1 / cluster_selection_epsilon : + if hierarchy['value'][hierarchy['right_node'] == n] >= 1 / cluster_selection_epsilon : result[n] = cluster_label_map[cluster] else: result[n] = NOISE - elif hierarchy['lambda_val'][hierarchy['child'] == n] >= \ - hierarchy['lambda_val'][hierarchy['parent'] == cluster].max(): + elif hierarchy['value'][hierarchy['right_node'] == n] >= \ + hierarchy['value'][hierarchy['left_node'] == cluster].max(): result[n] = cluster_label_map[cluster] else: result[n] = NOISE @@ -442,19 +453,23 @@ cdef cnp.ndarray[cnp.intp_t, ndim=1] do_labelling( return result -cdef get_probabilities(cnp.ndarray hierarchy, dict cluster_map, cnp.ndarray labels): +cdef cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] get_probabilities( + cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] hierarchy, + dict cluster_map, + cnp.intp_t[::1] labels +): cdef: - cnp.ndarray[cnp.float64_t, ndim=1] result + cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] result cnp.float64_t[:] lambda_array cnp.float64_t[::1] deaths cnp.intp_t[:] child_array, parent_array cnp.intp_t root_cluster, n, point, cluster_num, cluster cnp.float64_t max_lambda, lambda_val - child_array = hierarchy['child'] - parent_array = hierarchy['parent'] - lambda_array = hierarchy['lambda_val'] + child_array = hierarchy['right_node'] + parent_array = hierarchy['left_node'] + lambda_array = hierarchy['value'] result = np.zeros(labels.shape[0]) deaths = max_lambdas(hierarchy) @@ -471,7 +486,7 @@ cdef get_probabilities(cnp.ndarray hierarchy, dict cluster_map, cnp.ndarray labe cluster = cluster_map[cluster_num] max_lambda = deaths[cluster] - if max_lambda == 0.0 or not np.isfinite(lambda_array[n]): + if max_lambda == 0.0 or isinf(lambda_array[n]): result[point] = 1.0 else: lambda_val = min(lambda_array[n], max_lambda) @@ -480,26 +495,29 @@ cdef get_probabilities(cnp.ndarray hierarchy, dict cluster_map, cnp.ndarray labe return result -cpdef list recurse_leaf_dfs(cnp.ndarray cluster_tree, cnp.intp_t current_node): +cpdef list recurse_leaf_dfs( + cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] cluster_tree, + cnp.intp_t current_node +): cdef cnp.intp_t[:] children cdef cnp.intp_t child - children = cluster_tree[cluster_tree['parent'] == current_node]['child'] + children = cluster_tree[cluster_tree['left_node'] == current_node]['right_node'] if len(children) == 0: return [current_node,] else: return sum([recurse_leaf_dfs(cluster_tree, child) for child in children], []) -cpdef list get_cluster_tree_leaves(cnp.ndarray cluster_tree): +cpdef list get_cluster_tree_leaves(cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] cluster_tree): cdef cnp.intp_t root if cluster_tree.shape[0] == 0: return [] - root = cluster_tree['parent'].min() + root = cluster_tree['left_node'].min() return recurse_leaf_dfs(cluster_tree, root) cdef cnp.intp_t traverse_upwards( - cnp.ndarray cluster_tree, + cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] cluster_tree, cnp.float64_t cluster_selection_epsilon, cnp.intp_t leaf, cnp.intp_t allow_single_cluster @@ -507,15 +525,15 @@ cdef cnp.intp_t traverse_upwards( cdef cnp.intp_t root, parent cdef cnp.float64_t parent_eps - root = cluster_tree['parent'].min() - parent = cluster_tree[cluster_tree['child'] == leaf]['parent'] + root = cluster_tree['left_node'].min() + parent = cluster_tree[cluster_tree['right_node'] == leaf]['left_node'] if parent == root: if allow_single_cluster: return parent else: return leaf #return node closest to root - parent_eps = 1/cluster_tree[cluster_tree['child'] == parent]['lambda_val'] + parent_eps = 1 / cluster_tree[cluster_tree['right_node'] == parent]['value'] if parent_eps > cluster_selection_epsilon: return parent else: @@ -528,7 +546,7 @@ cdef cnp.intp_t traverse_upwards( cdef set epsilon_search( set leaves, - cnp.ndarray cluster_tree, + cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] cluster_tree, cnp.float64_t cluster_selection_epsilon, cnp.intp_t allow_single_cluster ): @@ -537,9 +555,13 @@ cdef set epsilon_search( list processed = list() cnp.intp_t leaf, epsilon_child, sub_node cnp.float64_t eps + cnp.uint8_t[:] leaf_nodes + cnp.ndarray[cnp.intp_t, ndim=1] children = cluster_tree['right_node'] + cnp.ndarray[cnp.float64_t, ndim=1] distances = cluster_tree['value'] for leaf in leaves: - eps = 1/cluster_tree['lambda_val'][cluster_tree['child'] == leaf][0] + leaf_nodes = children == leaf + eps = 1 / distances[leaf_nodes][0] if eps < cluster_selection_epsilon: if leaf not in processed: epsilon_child = traverse_upwards( @@ -558,9 +580,9 @@ cdef set epsilon_search( return set(selected_clusters) -@cython.wraparound(True) +@wraparound(True) cpdef tuple get_clusters( - cnp.ndarray hierarchy, + cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] hierarchy, dict stability, cluster_selection_method='eom', cnp.uint8_t allow_single_cluster=False, @@ -609,13 +631,13 @@ cpdef tuple get_clusters( """ cdef: list node_list - cnp.ndarray cluster_tree - cnp.uint8_t[:] child_selection - cnp.ndarray[cnp.intp_t, ndim=1] labels + cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] cluster_tree + cnp.uint8_t[::1] child_selection + cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labels dict is_cluster, cluster_sizes cnp.float64_t subtree_stability, max_lambda cnp.intp_t node, sub_node, cluster, n_samples - cnp.ndarray[cnp.float64_t, ndim=1] probs + cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] probs # Assume clusters are ordered by numeric id equivalent to # a topological sort of the tree; This is valid given the @@ -627,26 +649,26 @@ cpdef tuple get_clusters( node_list = sorted(stability.keys(), reverse=True)[:-1] # (exclude root) - cluster_tree = hierarchy[hierarchy['child_size'] > 1] + cluster_tree = hierarchy[hierarchy['cluster_size'] > 1] is_cluster = {cluster: True for cluster in node_list} - n_samples = np.max(hierarchy[hierarchy['child_size'] == 1]['child']) + 1 - max_lambda = np.max(hierarchy['lambda_val']) + n_samples = np.max(hierarchy[hierarchy['cluster_size'] == 1]['right_node']) + 1 + max_lambda = np.max(hierarchy['value']) if max_cluster_size is None: max_cluster_size = n_samples + 1 # Set to a value that will never be triggered - cluster_sizes = {child: child_size for child, child_size - in zip(cluster_tree['child'], cluster_tree['child_size'])} + cluster_sizes = {child: cluster_size for child, cluster_size + in zip(cluster_tree['right_node'], cluster_tree['cluster_size'])} if allow_single_cluster: # Compute cluster size for the root node cluster_sizes[node_list[-1]] = np.sum( - cluster_tree[cluster_tree['parent'] == node_list[-1]]['child_size']) + cluster_tree[cluster_tree['left_node'] == node_list[-1]]['cluster_size']) if cluster_selection_method == 'eom': for node in node_list: - child_selection = (cluster_tree['parent'] == node) + child_selection = (cluster_tree['left_node'] == node) subtree_stability = np.sum([ stability[child] for - child in cluster_tree['child'][child_selection]]) + child in cluster_tree['right_node'][child_selection]]) if subtree_stability > stability[node] or cluster_sizes[node] > max_cluster_size: is_cluster[node] = False stability[node] = subtree_stability @@ -659,7 +681,7 @@ cpdef tuple get_clusters( eom_clusters = [c for c in is_cluster if is_cluster[c]] selected_clusters = [] # first check if eom_clusters only has root node, which skips epsilon check. - if (len(eom_clusters) == 1 and eom_clusters[0] == cluster_tree['parent'].min()): + if (len(eom_clusters) == 1 and eom_clusters[0] == cluster_tree['left_node'].min()): if allow_single_cluster: selected_clusters = eom_clusters else: @@ -680,7 +702,7 @@ cpdef tuple get_clusters( if len(leaves) == 0: for c in is_cluster: is_cluster[c] = False - is_cluster[hierarchy['parent'].min()] = True + is_cluster[hierarchy['left_node'].min()] = True if cluster_selection_epsilon != 0.0: selected_clusters = epsilon_search( diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 947ae918c93a8..f93f67f008cda 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -29,6 +29,7 @@ MST_edge_dtype, ) from ._tree import compute_stability, condense_tree, get_clusters, labelling_at_cut +from ._tree import HIERARCHY_dtype FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics @@ -223,22 +224,24 @@ def remap_single_linkage_tree(tree, internal_to_raw, non_finite): outlier_count = len(non_finite) for i, (left, right, *_) in enumerate(tree): if left < finite_count: - tree[i, 0] = internal_to_raw[left] + tree[i]["left_node"] = internal_to_raw[left] else: - tree[i, 0] = left + outlier_count + tree[i]["left_node"] = left + outlier_count if right < finite_count: - tree[i, 1] = internal_to_raw[right] + tree[i]["right_node"] = internal_to_raw[right] else: - tree[i, 1] = right + outlier_count + tree[i]["right_node"] = right + outlier_count - outlier_tree = np.zeros((len(non_finite), 4)) - last_cluster_id = tree[tree.shape[0] - 1][0:2].max() - last_cluster_size = tree[tree.shape[0] - 1][3] + outlier_tree = np.zeros(len(non_finite), dtype=HIERARCHY_dtype) + last_cluster_id = max( + tree[tree.shape[0] - 1]["left_node"], tree[tree.shape[0] - 1]["right_node"] + ) + last_cluster_size = tree[tree.shape[0] - 1]["value"] for i, outlier in enumerate(non_finite): outlier_tree[i] = (outlier, last_cluster_id + 1, np.inf, last_cluster_size + 1) last_cluster_id += 1 last_cluster_size += 1 - tree = np.vstack([tree, outlier_tree]) + tree = np.concatenate([tree, outlier_tree]) return tree From 8d538947946adc87f92f265af89115d9d4f46a3b Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Sun, 12 Mar 2023 10:53:52 -0400 Subject: [PATCH 2/8] Reverted change to decorator --- sklearn/cluster/_hdbscan/_tree.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_tree.pyx b/sklearn/cluster/_hdbscan/_tree.pyx index a956c622cbbbc..99e9fdbfd6161 100644 --- a/sklearn/cluster/_hdbscan/_tree.pyx +++ b/sklearn/cluster/_hdbscan/_tree.pyx @@ -5,7 +5,7 @@ cimport numpy as cnp from libc.math cimport isinf -from cython import wraparound +import cython import numpy as np @@ -580,7 +580,7 @@ cdef set epsilon_search( return set(selected_clusters) -@wraparound(True) +@cython.wraparound(True) cpdef tuple get_clusters( cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] hierarchy, dict stability, From 728f9ddc19353ae74ae748b3b42c5e26d674553b Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 22 Mar 2023 19:38:12 -0400 Subject: [PATCH 3/8] Refactor based on feedback --- sklearn/cluster/_hdbscan/_linkage.pyx | 2 +- sklearn/cluster/_hdbscan/_tree.pyx | 39 +++++++++++++++++++++---- sklearn/cluster/_hdbscan/hdbscan.py | 42 +++++---------------------- 3 files changed, 42 insertions(+), 41 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx index 85a00e7dd27be..ae1f3c992aa3a 100644 --- a/sklearn/cluster/_hdbscan/_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_linkage.pyx @@ -190,7 +190,7 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix( return mst -cpdef cnp.ndarray[HIERARCHY_t, ndim=1] make_single_linkage(const MST_edge_t[::1] mst): +cpdef cnp.ndarray[HIERARCHY_t, ndim=1, mode="c"] make_single_linkage(const MST_edge_t[::1] mst): """Construct a single-linkage tree from an MST. Parameters diff --git a/sklearn/cluster/_hdbscan/_tree.pyx b/sklearn/cluster/_hdbscan/_tree.pyx index 99e9fdbfd6161..105e12b333911 100644 --- a/sklearn/cluster/_hdbscan/_tree.pyx +++ b/sklearn/cluster/_hdbscan/_tree.pyx @@ -19,15 +19,40 @@ HIERARCHY_dtype = np.dtype([ ("cluster_size", np.intp), ]) +cpdef tuple tree_to_labels( + const HIERARCHY_t[::1] single_linkage_tree, + cnp.intp_t min_cluster_size=10, + cluster_selection_method="eom", + bint allow_single_cluster=False, + cnp.float64_t cluster_selection_epsilon=0.0, + max_cluster_size=None, +): + cdef: + cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] condensed_tree + cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labels + cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] probabilities + + condensed_tree = _condense_tree(single_linkage_tree, min_cluster_size) + labels, probabilities = _get_clusters( + condensed_tree, + _compute_stability(condensed_tree), + cluster_selection_method, + allow_single_cluster, + cluster_selection_epsilon, + max_cluster_size, + ) + + return (labels, probabilities) + cdef list bfs_from_hierarchy( - cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] hierarchy, + const HIERARCHY_t[::1] hierarchy, cnp.intp_t bfs_root ): """ Perform a breadth first search on a tree in scipy hclust format. """ - cdef list process_queue, next_queue + cdef list process_queue, next_queue, result cdef cnp.intp_t n_samples = hierarchy.shape[0] + 1 cdef cnp.intp_t node process_queue = [bfs_root] @@ -55,8 +80,8 @@ cdef list bfs_from_hierarchy( return result -cpdef cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] condense_tree( - cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] hierarchy, +cdef cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] _condense_tree( + const HIERARCHY_t[::1] hierarchy, cnp.intp_t min_cluster_size=10 ): """Condense a tree according to a minimum cluster size. This is akin @@ -172,7 +197,9 @@ cpdef cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] condense_tree( return np.array(result_list, dtype=HIERARCHY_dtype) -cpdef dict compute_stability(cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] condensed_tree): +cdef dict _compute_stability( + cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] condensed_tree +): cdef: cnp.float64_t[::1] result, births @@ -581,7 +608,7 @@ cdef set epsilon_search( return set(selected_clusters) @cython.wraparound(True) -cpdef tuple get_clusters( +cdef tuple _get_clusters( cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] hierarchy, dict stability, cluster_selection_method='eom', diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index f93f67f008cda..c55f8913024ae 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -28,7 +28,7 @@ mst_from_data_matrix, MST_edge_dtype, ) -from ._tree import compute_stability, condense_tree, get_clusters, labelling_at_cut +from ._tree import tree_to_labels, labelling_at_cut from ._tree import HIERARCHY_dtype FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics @@ -84,31 +84,6 @@ def _brute_mst(mutual_reachability, min_samples): return mst -def _tree_to_labels( - single_linkage_tree, - min_cluster_size=10, - cluster_selection_method="eom", - allow_single_cluster=False, - cluster_selection_epsilon=0.0, - max_cluster_size=None, -): - """Converts a pretrained tree and cluster size into a - set of labels and probabilities. - """ - condensed_tree = condense_tree(single_linkage_tree, min_cluster_size) - stability_dict = compute_stability(condensed_tree) - labels, probabilities = get_clusters( - condensed_tree, - stability_dict, - cluster_selection_method, - allow_single_cluster, - cluster_selection_epsilon, - max_cluster_size, - ) - - return (labels, probabilities, single_linkage_tree) - - def _process_mst(min_spanning_tree): # Sort edges of the min_spanning_tree by weight row_order = np.argsort(min_spanning_tree["distance"]) @@ -222,7 +197,10 @@ def remap_single_linkage_tree(tree, internal_to_raw, non_finite): finite_count = len(internal_to_raw) outlier_count = len(non_finite) - for i, (left, right, *_) in enumerate(tree): + for i, _ in enumerate(tree): + left = tree[i]["left_node"] + right = tree[i]["right_node"] + if left < finite_count: tree[i]["left_node"] = internal_to_raw[left] else: @@ -236,7 +214,7 @@ def remap_single_linkage_tree(tree, internal_to_raw, non_finite): last_cluster_id = max( tree[tree.shape[0] - 1]["left_node"], tree[tree.shape[0] - 1]["right_node"] ) - last_cluster_size = tree[tree.shape[0] - 1]["value"] + last_cluster_size = tree[tree.shape[0] - 1]["cluster_size"] for i, outlier in enumerate(non_finite): outlier_tree[i] = (outlier, last_cluster_id + 1, np.inf, last_cluster_size + 1) last_cluster_id += 1 @@ -661,14 +639,10 @@ def fit(self, X, y=None): kwargs["algo"] = "ball_tree" kwargs["leaf_size"] = self.leaf_size - single_linkage_tree = mst_func(**kwargs) + self._single_linkage_tree_ = mst_func(**kwargs) - ( - self.labels_, - self.probabilities_, + self.labels_, self.probabilities_ = tree_to_labels( self._single_linkage_tree_, - ) = _tree_to_labels( - single_linkage_tree, self.min_cluster_size, self.cluster_selection_method, self.allow_single_cluster, From f1c0d3e6204408e99e65baa953da203021b5fbc2 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 22 Mar 2023 20:14:09 -0400 Subject: [PATCH 4/8] Created second dtype for clarification of tree semantics --- sklearn/cluster/_hdbscan/_linkage.pyx | 2 - sklearn/cluster/_hdbscan/_tree.pxd | 8 +- sklearn/cluster/_hdbscan/_tree.pyx | 109 ++++++++++++++------------ 3 files changed, 65 insertions(+), 54 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx index ae1f3c992aa3a..56ac84fb47bbc 100644 --- a/sklearn/cluster/_hdbscan/_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_linkage.pyx @@ -231,8 +231,6 @@ cpdef cnp.ndarray[HIERARCHY_t, ndim=1, mode="c"] make_single_linkage(const MST_e current_node_cluster = U.fast_find(current_node) next_node_cluster = U.fast_find(next_node) - # TODO: Update this to an array of structs (AoS). - # Should be done simultaneously in _tree.pyx to ensure compatability. single_linkage[i].left_node = current_node_cluster single_linkage[i].right_node = next_node_cluster single_linkage[i].value = distance diff --git a/sklearn/cluster/_hdbscan/_tree.pxd b/sklearn/cluster/_hdbscan/_tree.pxd index 83d5b38cb99fb..bb267a9b6223a 100644 --- a/sklearn/cluster/_hdbscan/_tree.pxd +++ b/sklearn/cluster/_hdbscan/_tree.pxd @@ -1,9 +1,15 @@ cimport numpy as cnp import numpy as np - +# This corresponds to the scipy.cluster.hierarchy format ctypedef packed struct HIERARCHY_t: cnp.intp_t left_node cnp.intp_t right_node cnp.float64_t value cnp.intp_t cluster_size + +ctypedef packed struct CONDENSED_t: + cnp.intp_t parent + cnp.intp_t child + cnp.float64_t value + cnp.intp_t cluster_size diff --git a/sklearn/cluster/_hdbscan/_tree.pyx b/sklearn/cluster/_hdbscan/_tree.pyx index 105e12b333911..b859136346383 100644 --- a/sklearn/cluster/_hdbscan/_tree.pyx +++ b/sklearn/cluster/_hdbscan/_tree.pyx @@ -19,6 +19,13 @@ HIERARCHY_dtype = np.dtype([ ("cluster_size", np.intp), ]) +CONDENSED_dtype = np.dtype([ + ("parent", np.intp), + ("child", np.intp), + ("value", np.float64), + ("cluster_size", np.intp), +]) + cpdef tuple tree_to_labels( const HIERARCHY_t[::1] single_linkage_tree, cnp.intp_t min_cluster_size=10, @@ -28,7 +35,7 @@ cpdef tuple tree_to_labels( max_cluster_size=None, ): cdef: - cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] condensed_tree + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labels cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] probabilities @@ -80,7 +87,7 @@ cdef list bfs_from_hierarchy( return result -cdef cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] _condense_tree( +cdef cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] _condense_tree( const HIERARCHY_t[::1] hierarchy, cnp.intp_t min_cluster_size=10 ): @@ -194,27 +201,27 @@ cdef cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] _condense_tree( ) ignore[sub_node] = True - return np.array(result_list, dtype=HIERARCHY_dtype) + return np.array(result_list, dtype=CONDENSED_dtype) cdef dict _compute_stability( - cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] condensed_tree + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree ): cdef: cnp.float64_t[::1] result, births - cnp.intp_t[:] parents = condensed_tree['left_node'] + cnp.intp_t[:] parents = condensed_tree['parent'] cnp.float64_t[:] lambdas = condensed_tree['value'] cnp.intp_t[:] sizes = condensed_tree['cluster_size'] cnp.intp_t parent, cluster_size, result_index cnp.float64_t lambda_val, child_size cnp.float64_t[:, :] result_pre_dict - cnp.intp_t largest_child = condensed_tree['right_node'].max() + cnp.intp_t largest_child = condensed_tree['child'].max() cnp.intp_t smallest_cluster = np.min(parents) cnp.intp_t num_clusters = np.max(parents) - smallest_cluster + 1 - cnp.ndarray sorted_child_data = np.sort(condensed_tree[['right_node', 'value']], axis=0) - cnp.intp_t[:] sorted_children = sorted_child_data['right_node'].copy() + cnp.ndarray sorted_child_data = np.sort(condensed_tree[['child', 'value']], axis=0) + cnp.intp_t[:] sorted_children = sorted_child_data['child'].copy() cnp.float64_t[:] sorted_lambdas = sorted_child_data['value'].copy() cnp.intp_t child, current_child = -1 cnp.float64_t min_lambda = 0 @@ -264,7 +271,7 @@ cdef dict _compute_stability( return dict(result_pre_dict) -cdef list bfs_from_cluster_tree(cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] hierarchy, cnp.intp_t bfs_root): +cdef list bfs_from_cluster_tree(cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] hierarchy, cnp.intp_t bfs_root): cdef list result cdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] to_process @@ -274,12 +281,12 @@ cdef list bfs_from_cluster_tree(cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] hiera while to_process.shape[0] > 0: result.extend(to_process.tolist()) - to_process = hierarchy['right_node'][np.in1d(hierarchy['left_node'], to_process)] + to_process = hierarchy['child'][np.in1d(hierarchy['parent'], to_process)] return result -cdef cnp.float64_t[::1] max_lambdas(cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] hierarchy): +cdef cnp.float64_t[::1] max_lambdas(cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] hierarchy): cdef: cnp.ndarray sorted_parent_data @@ -288,11 +295,11 @@ cdef cnp.float64_t[::1] max_lambdas(cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] h cnp.float64_t[::1] deaths cnp.intp_t parent, current_parent cnp.float64_t lambda_val, max_lambda - cnp.intp_t largest_parent = hierarchy['left_node'].max() + cnp.intp_t largest_parent = hierarchy['parent'].max() - sorted_parent_data = np.sort(hierarchy[['left_node', 'value']], axis=0) + sorted_parent_data = np.sort(hierarchy[['parent', 'value']], axis=0) deaths = np.zeros(largest_parent + 1, dtype=np.float64) - sorted_parents = sorted_parent_data['left_node'] + sorted_parents = sorted_parent_data['parent'] sorted_lambdas = sorted_parent_data['value'] current_parent = -1 @@ -355,7 +362,7 @@ cdef class TreeUnionFind (object): cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labelling_at_cut( - HIERARCHY_t[::1] linkage, + const HIERARCHY_t[::1] linkage, cnp.float64_t cut, cnp.intp_t min_cluster_size ): @@ -427,7 +434,7 @@ cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labelling_at_cut( cdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] do_labelling( - cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] hierarchy, + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] hierarchy, set clusters, dict cluster_label_map, cnp.intp_t allow_single_cluster, @@ -442,8 +449,8 @@ cdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] do_labelling( TreeUnionFind union_find cnp.intp_t n, parent, child, cluster - child_array = hierarchy['right_node'] - parent_array = hierarchy['left_node'] + child_array = hierarchy['child'] + parent_array = hierarchy['parent'] lambda_array = hierarchy['value'] root_cluster = np.min(parent_array) @@ -463,12 +470,12 @@ cdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] do_labelling( elif cluster == root_cluster: if len(clusters) == 1 and allow_single_cluster: if cluster_selection_epsilon != 0.0: - if hierarchy['value'][hierarchy['right_node'] == n] >= 1 / cluster_selection_epsilon : + if hierarchy['value'][hierarchy['child'] == n] >= 1 / cluster_selection_epsilon : result[n] = cluster_label_map[cluster] else: result[n] = NOISE - elif hierarchy['value'][hierarchy['right_node'] == n] >= \ - hierarchy['value'][hierarchy['left_node'] == cluster].max(): + elif hierarchy['value'][hierarchy['child'] == n] >= \ + hierarchy['value'][hierarchy['parent'] == cluster].max(): result[n] = cluster_label_map[cluster] else: result[n] = NOISE @@ -481,7 +488,7 @@ cdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] do_labelling( cdef cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] get_probabilities( - cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] hierarchy, + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree, dict cluster_map, cnp.intp_t[::1] labels ): @@ -494,15 +501,15 @@ cdef cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] get_probabilities( cnp.intp_t root_cluster, n, point, cluster_num, cluster cnp.float64_t max_lambda, lambda_val - child_array = hierarchy['right_node'] - parent_array = hierarchy['left_node'] - lambda_array = hierarchy['value'] + child_array = condensed_tree['child'] + parent_array = condensed_tree['parent'] + lambda_array = condensed_tree['value'] result = np.zeros(labels.shape[0]) - deaths = max_lambdas(hierarchy) + deaths = max_lambdas(condensed_tree) root_cluster = np.min(parent_array) - for n in range(hierarchy.shape[0]): + for n in range(condensed_tree.shape[0]): point = child_array[n] if point >= root_cluster: continue @@ -523,28 +530,28 @@ cdef cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] get_probabilities( cpdef list recurse_leaf_dfs( - cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] cluster_tree, + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree, cnp.intp_t current_node ): cdef cnp.intp_t[:] children cdef cnp.intp_t child - children = cluster_tree[cluster_tree['left_node'] == current_node]['right_node'] + children = cluster_tree[cluster_tree['parent'] == current_node]['child'] if len(children) == 0: return [current_node,] else: return sum([recurse_leaf_dfs(cluster_tree, child) for child in children], []) -cpdef list get_cluster_tree_leaves(cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] cluster_tree): +cpdef list get_cluster_tree_leaves(cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree): cdef cnp.intp_t root if cluster_tree.shape[0] == 0: return [] - root = cluster_tree['left_node'].min() + root = cluster_tree['parent'].min() return recurse_leaf_dfs(cluster_tree, root) cdef cnp.intp_t traverse_upwards( - cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] cluster_tree, + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree, cnp.float64_t cluster_selection_epsilon, cnp.intp_t leaf, cnp.intp_t allow_single_cluster @@ -552,15 +559,15 @@ cdef cnp.intp_t traverse_upwards( cdef cnp.intp_t root, parent cdef cnp.float64_t parent_eps - root = cluster_tree['left_node'].min() - parent = cluster_tree[cluster_tree['right_node'] == leaf]['left_node'] + root = cluster_tree['parent'].min() + parent = cluster_tree[cluster_tree['child'] == leaf]['parent'] if parent == root: if allow_single_cluster: return parent else: return leaf #return node closest to root - parent_eps = 1 / cluster_tree[cluster_tree['right_node'] == parent]['value'] + parent_eps = 1 / cluster_tree[cluster_tree['child'] == parent]['value'] if parent_eps > cluster_selection_epsilon: return parent else: @@ -573,7 +580,7 @@ cdef cnp.intp_t traverse_upwards( cdef set epsilon_search( set leaves, - cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] cluster_tree, + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree, cnp.float64_t cluster_selection_epsilon, cnp.intp_t allow_single_cluster ): @@ -583,7 +590,7 @@ cdef set epsilon_search( cnp.intp_t leaf, epsilon_child, sub_node cnp.float64_t eps cnp.uint8_t[:] leaf_nodes - cnp.ndarray[cnp.intp_t, ndim=1] children = cluster_tree['right_node'] + cnp.ndarray[cnp.intp_t, ndim=1] children = cluster_tree['child'] cnp.ndarray[cnp.float64_t, ndim=1] distances = cluster_tree['value'] for leaf in leaves: @@ -609,7 +616,7 @@ cdef set epsilon_search( @cython.wraparound(True) cdef tuple _get_clusters( - cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] hierarchy, + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree, dict stability, cluster_selection_method='eom', cnp.uint8_t allow_single_cluster=False, @@ -622,7 +629,7 @@ cdef tuple _get_clusters( Parameters ---------- - tree : numpy recarray + condensed_tree : numpy recarray The condensed tree to extract flat clusters from stability : dict @@ -658,7 +665,7 @@ cdef tuple _get_clusters( """ cdef: list node_list - cnp.ndarray[HIERARCHY_t, ndim=1, mode='c'] cluster_tree + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree cnp.uint8_t[::1] child_selection cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labels dict is_cluster, cluster_sizes @@ -676,26 +683,26 @@ cdef tuple _get_clusters( node_list = sorted(stability.keys(), reverse=True)[:-1] # (exclude root) - cluster_tree = hierarchy[hierarchy['cluster_size'] > 1] + cluster_tree = condensed_tree[condensed_tree['cluster_size'] > 1] is_cluster = {cluster: True for cluster in node_list} - n_samples = np.max(hierarchy[hierarchy['cluster_size'] == 1]['right_node']) + 1 - max_lambda = np.max(hierarchy['value']) + n_samples = np.max(condensed_tree[condensed_tree['cluster_size'] == 1]['child']) + 1 + max_lambda = np.max(condensed_tree['value']) if max_cluster_size is None: max_cluster_size = n_samples + 1 # Set to a value that will never be triggered cluster_sizes = {child: cluster_size for child, cluster_size - in zip(cluster_tree['right_node'], cluster_tree['cluster_size'])} + in zip(cluster_tree['child'], cluster_tree['cluster_size'])} if allow_single_cluster: # Compute cluster size for the root node cluster_sizes[node_list[-1]] = np.sum( - cluster_tree[cluster_tree['left_node'] == node_list[-1]]['cluster_size']) + cluster_tree[cluster_tree['parent'] == node_list[-1]]['cluster_size']) if cluster_selection_method == 'eom': for node in node_list: - child_selection = (cluster_tree['left_node'] == node) + child_selection = (cluster_tree['parent'] == node) subtree_stability = np.sum([ stability[child] for - child in cluster_tree['right_node'][child_selection]]) + child in cluster_tree['child'][child_selection]]) if subtree_stability > stability[node] or cluster_sizes[node] > max_cluster_size: is_cluster[node] = False stability[node] = subtree_stability @@ -708,7 +715,7 @@ cdef tuple _get_clusters( eom_clusters = [c for c in is_cluster if is_cluster[c]] selected_clusters = [] # first check if eom_clusters only has root node, which skips epsilon check. - if (len(eom_clusters) == 1 and eom_clusters[0] == cluster_tree['left_node'].min()): + if (len(eom_clusters) == 1 and eom_clusters[0] == cluster_tree['parent'].min()): if allow_single_cluster: selected_clusters = eom_clusters else: @@ -729,7 +736,7 @@ cdef tuple _get_clusters( if len(leaves) == 0: for c in is_cluster: is_cluster[c] = False - is_cluster[hierarchy['left_node'].min()] = True + is_cluster[condensed_tree['parent'].min()] = True if cluster_selection_epsilon != 0.0: selected_clusters = epsilon_search( @@ -752,12 +759,12 @@ cdef tuple _get_clusters( reverse_cluster_map = {n: c for c, n in cluster_map.items()} labels = do_labelling( - hierarchy, + condensed_tree, clusters, cluster_map, allow_single_cluster, cluster_selection_epsilon ) - probs = get_probabilities(hierarchy, reverse_cluster_map, labels) + probs = get_probabilities(condensed_tree, reverse_cluster_map, labels) return (labels, probs) From 5cac761c2114b07d9ae9d59f5543beb88d927a77 Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Wed, 22 Mar 2023 20:16:28 -0400 Subject: [PATCH 5/8] Update sklearn/cluster/_hdbscan/_tree.pxd Co-authored-by: Julien Jerphanion --- sklearn/cluster/_hdbscan/_tree.pxd | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/cluster/_hdbscan/_tree.pxd b/sklearn/cluster/_hdbscan/_tree.pxd index bb267a9b6223a..14c78dc6b1a2d 100644 --- a/sklearn/cluster/_hdbscan/_tree.pxd +++ b/sklearn/cluster/_hdbscan/_tree.pxd @@ -1,5 +1,4 @@ cimport numpy as cnp -import numpy as np # This corresponds to the scipy.cluster.hierarchy format ctypedef packed struct HIERARCHY_t: From 83d9e481fe48dfac5e114514e0744d09d258a574 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Fri, 24 Mar 2023 18:40:19 -0400 Subject: [PATCH 6/8] Improved documentation for condensed dtype --- sklearn/cluster/_hdbscan/_tree.pxd | 2 ++ sklearn/cluster/_hdbscan/_tree.pyx | 13 ++++++++----- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_tree.pxd b/sklearn/cluster/_hdbscan/_tree.pxd index 14c78dc6b1a2d..f0ea4bdb3d899 100644 --- a/sklearn/cluster/_hdbscan/_tree.pxd +++ b/sklearn/cluster/_hdbscan/_tree.pxd @@ -7,6 +7,8 @@ ctypedef packed struct HIERARCHY_t: cnp.float64_t value cnp.intp_t cluster_size +# Effectively an edgelist encoding a parent/child pair, along with a value and +# the corresponding cluster_size in each row providing a tree structure. ctypedef packed struct CONDENSED_t: cnp.intp_t parent cnp.intp_t child diff --git a/sklearn/cluster/_hdbscan/_tree.pyx b/sklearn/cluster/_hdbscan/_tree.pyx index e1176a0fbe61e..dcea00cbc8487 100644 --- a/sklearn/cluster/_hdbscan/_tree.pyx +++ b/sklearn/cluster/_hdbscan/_tree.pyx @@ -108,9 +108,10 @@ cdef cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] _condense_tree( Returns ------- - condensed_tree : ndarray of shape (n_samples,), dtype=HIERARCHY_dtype - Effectively an edgelist with a parent, child, lambda_val - and cluster_size in each row providing a tree structure. + condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype + Effectively an edgelist encoding a parent/child pair, along with a + value and the corresponding cluster_size in each row providing a tree + structure. """ cdef: @@ -626,8 +627,10 @@ cdef tuple _get_clusters( Parameters ---------- - condensed_tree : numpy recarray - The condensed tree to extract flat clusters from + condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype + Effectively an edgelist encoding a parent/child pair, along with a + value and the corresponding cluster_size in each row providing a tree + structure. stability : dict A dictionary mapping cluster_ids to stability values From 5e056442b77339b2f3b64a39f64edc4d4469decc Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Mon, 27 Mar 2023 09:23:41 -0400 Subject: [PATCH 7/8] Update sklearn/cluster/_hdbscan/_linkage.pyx Co-authored-by: Julien Jerphanion --- sklearn/cluster/_hdbscan/_linkage.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx index 56ac84fb47bbc..f39b1d03cf7ce 100644 --- a/sklearn/cluster/_hdbscan/_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_linkage.pyx @@ -215,8 +215,8 @@ cpdef cnp.ndarray[HIERARCHY_t, ndim=1, mode="c"] make_single_linkage(const MST_e # Note mst.shape[0] is one fewer than the number of samples cnp.int64_t n_samples = mst.shape[0] + 1 - cnp.int64_t current_node, next_node, index cnp.intp_t current_node_cluster, next_node_cluster + cnp.int64_t current_node, next_node, index cnp.float64_t distance UnionFind U = UnionFind(n_samples) From 4fdf2be2e604ead0f81e7928f2cc3c201eb4f073 Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Mon, 27 Mar 2023 09:23:48 -0400 Subject: [PATCH 8/8] Update sklearn/cluster/_hdbscan/_linkage.pyx Co-authored-by: Julien Jerphanion --- sklearn/cluster/_hdbscan/_linkage.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx index f39b1d03cf7ce..0f15f4eedbecb 100644 --- a/sklearn/cluster/_hdbscan/_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_linkage.pyx @@ -211,7 +211,7 @@ cpdef cnp.ndarray[HIERARCHY_t, ndim=1, mode="c"] make_single_linkage(const MST_e - new cluster size """ cdef: - cnp.ndarray[HIERARCHY_t, ndim=1] single_linkage + cnp.ndarray[HIERARCHY_t, ndim=1, mode="c"] single_linkage # Note mst.shape[0] is one fewer than the number of samples cnp.int64_t n_samples = mst.shape[0] + 1