diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst
index 8881aa10fe185..e1e95205af57a 100644
--- a/doc/whats_new/v1.4.rst
+++ b/doc/whats_new/v1.4.rst
@@ -217,6 +217,19 @@ Changelog
   `kdtree` and `balltree` values will be removed in 1.6.
   :pr:`26744` by :user:`Shreesha Kumar Bhat `.
 
+- |Enhancement| :class:`cluster.HDBSCAN` gains a new `mst_algorithm` argument,
+  allowing the user to select between
+  `{"auto", "brute", "prims", "boruvka_exact", "boruvka_approx"}`.
+  Setting `mst_algorithm="prims"` recovers the behavior prior to this change,
+  except when `algorithm="brute"`, in which case both `"auto"` and `"brute"`
+  recover the previous behavior. The new `"boruvka_exact"` and
+  `"boruvka_approx"` options are both faster MST-building algorithms
+  than the existing `"prims"`. :pr:`27572` by :user:`Meekail Zain `.
+
+  This implementation is adapted from the original implementation of HDBSCAN in
+  `scikit-learn-contrib/hdbscan `_,
+  by :user:`Leland McInnes ` et al.
+
 :mod:`sklearn.compose`
 ......................
diff --git a/setup.py b/setup.py
index 14242d60c3f79..64bf7b47112a5 100755
--- a/setup.py
+++ b/setup.py
@@ -209,6 +209,7 @@ def check_package_status(package, min_version):
         {"sources": ["_k_means_minibatch.pyx"], "include_np": True},
     ],
     "cluster._hdbscan": [
+        {"sources": ["_boruvka.pyx"], "include_np": True},
         {"sources": ["_linkage.pyx"], "include_np": True},
         {"sources": ["_reachability.pyx"], "include_np": True},
         {"sources": ["_tree.pyx"], "include_np": True},
diff --git a/sklearn/cluster/_hdbscan/_boruvka.pyx b/sklearn/cluster/_hdbscan/_boruvka.pyx
new file mode 100644
index 0000000000000..056077715f7c9
--- /dev/null
+++ b/sklearn/cluster/_hdbscan/_boruvka.pyx
@@ -0,0 +1,760 @@
+# Minimum spanning tree single linkage implementation for hdbscan
+# Authors: Leland McInnes
+# License: 3-clause BSD
+
+# Code to implement a Dual Tree Boruvka Minimum Spanning Tree computation
+# The algorithm is largely tree independent, but some fine details still
+# depend on the particular choice of tree.
+#
+# The core idea of the algorithm is to do repeated sweeps through the dataset,
+# adding edges to the tree with each sweep until a full tree is formed.
+# To do this, start with each node (or point) existing in its own component.
+# On each sweep find all the edges of minimum weight (in this instance
+# of minimal mutual reachability distance) that join separate components.
+# Add all these edges to the list of edges in the spanning tree, and then
+# combine together all the components joined by edges. Begin the next sweep ...
+#
+# Eventually we end up with only one component, and all the edges we added
+# form the minimum spanning tree. The key insight is that each sweep is
+# essentially akin to a nearest neighbor search (with the caveat about being
+# in separate components), and so can be performed very efficiently using
+# a space tree such as a kdtree or ball tree. By using a dual tree formalism
+# with a query tree and reference tree we can prune when all points in the
+# query node are in the same component, as are all the points of the reference
+# node. This allows for rapid pruning in the dual tree traversal in later
+# stages. Importantly, we can construct the full tree in O(log N) sweeps
+# and since each sweep has complexity equal to that of an all points
+# nearest neighbor query within the tree structure we are using we end
+# up with sub-quadratic complexity at worst.
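+#
+# As an illustrative sketch only (the helper names here are hypothetical and
+# do not appear below), a plain single-tree Boruvka sweep over a union find
+# looks roughly like:
+#
+#     while n_components > 1:
+#         best = {}  # component -> (point, neighbor, distance)
+#         for p in points:
+#             q, d = nearest_point_in_other_component(p)
+#             c = find(p)
+#             if c not in best or d < best[c][2]:
+#                 best[c] = (p, q, d)
+#         for p, q, d in best.values():
+#             edges.append((p, q, d))
+#             union(p, q)
+#         n_components = count_components()
+#
+# The dual tree algorithm below replaces the inner nearest neighbor loop
+# with a single pruned traversal over (query node, reference node) pairs.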
+#
+# This code is based on the papers:
+#
+# Fast Euclidean Minimum Spanning Tree: Algorithm, analysis, and applications
+# William B. March, Parikshit Ram, Alexander Gray
+# Conference: Proceedings of the 16th ACM SIGKDD International Conference on
+#  Knowledge Discovery and Data Mining
+# 2010
+#
+# Tree-Independent Dual-Tree Algorithms
+# Ryan R. Curtin, William B. March, Parikshit Ram, David V. Anderson,
+# Alexander G. Gray, Charles L. Isbell Jr
+# 2013, arXiv 1304.4327
+#
+# As per the sklearn BallTree and KDTree implementations we make use of
+# the rdist for KDTree, which is a faster-to-compute notion of distance
+# (for example in the euclidean case it is the distance squared).
+#
+# To combine together components in between sweeps we make use of
+# a union find data structure. This is a separate implementation
+# from that used in the labelling of the single linkage tree as
+# we can perform more specific optimizations here for what
+# is a simpler version of the structure.
+
+import numpy as np
+
+cimport numpy as cnp
+from libc.float cimport DBL_MAX
+from libc.math cimport fabs, pow
+
+from sklearn.neighbors import KDTree
+
+from ...metrics._dist_metrics cimport DistanceMetric, DistanceMetric64
+from ...utils._typedefs cimport intp_t, float64_t, uint8_t, int8_t
+from ...neighbors._binary_tree cimport NodeData_t
+from ._linkage cimport MST_edge_t
+from ._linkage import MST_edge_dtype
+from joblib import effective_n_jobs
+
+cdef float64_t INF = np.inf
+
+# Define a function giving the minimum distance between two
+# nodes of a ball tree
+cdef inline float64_t ball_tree_min_dist_dual(
+    float64_t radius1,
+    float64_t radius2,
+    intp_t node1,
+    intp_t node2,
+    float64_t[:, ::1] centroid_dist
+) noexcept nogil:
+
+    cdef float64_t dist_pt = centroid_dist[node1, node2]
+    return max(0, (dist_pt - radius1 - radius2))
+
+
+# Define a function giving the minimum distance between two
+# nodes of a kd-tree
+cdef inline float64_t kd_tree_min_dist_dual(
+    DistanceMetric64 metric,
+    intp_t node1,
+    intp_t node2,
+    float64_t[:, :, ::1] node_bounds,
+    intp_t num_features
+) noexcept nogil:
+
+    cdef float64_t d, d1, d2, rdist = 0.0
+    cdef intp_t j
+
+    if metric.p == INF:
+        for j in range(num_features):
+            d1 = node_bounds[0, node1, j] - node_bounds[1, node2, j]
+            d2 = node_bounds[0, node2, j] - node_bounds[1, node1, j]
+            d = (d1 + fabs(d1)) + (d2 + fabs(d2))
+
+            rdist = max(rdist, 0.5 * d)
+    else:
+        # here we'll use the fact that x + abs(x) = 2 * max(x, 0)
+        for j in range(num_features):
+            d1 = node_bounds[0, node1, j] - node_bounds[1, node2, j]
+            d2 = node_bounds[0, node2, j] - node_bounds[1, node1, j]
+            d = (d1 + fabs(d1)) + (d2 + fabs(d2))
+
+            rdist += pow(0.5 * d, metric.p)
+
+    return metric._rdist_to_dist(rdist)
+
+
+cdef class BoruvkaUnionFind:
+    """
+    A union find implementation which avoids virtual nodes in order to keep track
+    of exact correspondence between initial elements and components.
+
+    Parameters
+    ----------
+
+    size : int
+        The total size of the set of objects to
+        track via the union find structure.
+
+    Attributes
+    ----------
+
+    is_component : array of bool; shape (size,)
+        Array specifying whether each element of the
+        set is the root node, or identifier for
+        a component.
+ """ + + cdef intp_t[::1] _parent + cdef uint8_t[::1] _rank + cdef uint8_t[::1] is_component + + def __init__(self, size): + self._parent = np.arange(size, dtype=np.intp) + self._rank = np.zeros(size, dtype=np.uint8) + self.is_component = np.ones(size, dtype=np.uint8) + + cdef int union_(self, intp_t x, intp_t y) noexcept nogil: + """Union together elements x and y""" + cdef intp_t x_root = self.find(x) + cdef intp_t y_root = self.find(y) + + if x_root == y_root: + return 0 + + if self._rank[x_root] < self._rank[y_root]: + self._parent[x_root] = y_root + self.is_component[x_root] = 0 + elif self._rank[x_root] > self._rank[y_root]: + self._parent[y_root] = x_root + self.is_component[y_root] = 0 + else: + self._rank[x_root] += 1 + self._parent[y_root] = x_root + self.is_component[y_root] = 0 + + return 0 + + cdef intp_t find(self, intp_t x) noexcept nogil: + """Find the root or identifier for the component that x is in""" + cdef intp_t x_parent + cdef intp_t x_grandparent + + x_parent = self._parent[x] + while True: + if x_parent == x: + return x + x_grandparent = self._parent[x_parent] + self._parent[x] = x_grandparent + x = x_parent + x_parent = x_grandparent + + cdef cnp.ndarray[intp_t, ndim=1] components(self): + """Return an array of all component roots/identifiers""" + return np.array(self.is_component).nonzero()[0] + + +cdef class BoruvkaAlgorithm: + """A Dual Tree Boruvka Algorithm implemented for the sklearn + KDTree space tree implementation. + + Parameters + ---------- + + tree : KDTree + The kd-tree to run Dual Tree Boruvka over. + + min_samples : int, optional (default= 5) + The min_samples parameter of HDBSCAN used to + determine core distances. + + metric : string, optional (default='euclidean') + The metric used to compute distances for the tree + + leaf_size : int, optional (default=20) + The Boruvka algorithm benefits from a smaller leaf size than + standard kd-tree nearest neighbor searches. The tree passed in + is used for a kNN search for core distance. A second tree is + constructed with a smaller leaf size for Boruvka; this is that + leaf size. + + alpha : float, optional (default=1.0) + The alpha distance scaling parameter as per Robust Single Linkage. + + approx_min_span_tree : bool, optional (default=False) + Take shortcuts and only approximate the min spanning tree. + This is considerably faster but does not return a true + minimal spanning tree. + + n_jobs : int, optional (default=4) + The number of parallel jobs used to compute core distances. + + **kwargs : + Keyword args passed to the metric. 
+ """ + + cdef: + object tree + DistanceMetric64 dist + readonly const float64_t[:, ::1] raw_data + float64_t[:, :, ::1] node_bounds + float64_t alpha + int8_t approx_min_span_tree + intp_t n_jobs, min_samples + intp_t num_points, num_nodes, num_features + bint has_KDTree + + float64_t[::1] core_distance + float64_t[::1] bounds + intp_t[::1] components + intp_t[::1] component_of_point + intp_t[::1] component_of_node + intp_t[::1] candidate_neighbor + intp_t[::1] candidate_point + float64_t[::1] candidate_distance + float64_t[:, ::1] centroid_distances + intp_t[::1] idx_array + NodeData_t[::1] node_data + BoruvkaUnionFind component_union_find + MST_edge_t[::1] edges + intp_t num_edges + + def __init__( + self, + tree, + min_samples=5, + metric='euclidean', + leaf_size=20, + alpha=1.0, + approx_min_span_tree=False, + n_jobs=None, + **kwargs + ): + + self.tree =tree + self.has_KDTree = isinstance(tree, KDTree) + self.raw_data = self.tree.data + self.node_bounds = self.tree.node_bounds + self.alpha = alpha + self.approx_min_span_tree = approx_min_span_tree + self.n_jobs = effective_n_jobs(n_jobs) + self.min_samples = min_samples + + self.num_points = self.tree.data.shape[0] + self.num_features = self.tree.data.shape[1] + self.num_nodes = self.tree.node_data.shape[0] + + self.dist = DistanceMetric.get_metric(metric, **kwargs) + + self.components = np.arange(self.num_points, dtype=np.intp) + self.bounds = np.empty(self.num_nodes, np.float64) + self.component_of_point = np.empty(self.num_points, dtype=np.intp) + self.component_of_node = np.empty(self.num_nodes, dtype=np.intp) + self.candidate_neighbor = np.empty(self.num_points, dtype=np.intp) + self.candidate_point = np.empty(self.num_points, dtype=np.intp) + self.candidate_distance = np.empty(self.num_points, dtype=np.float64) + self.component_union_find = BoruvkaUnionFind(self.num_points) + + self.edges = np.empty((self.num_points - 1,), dtype=MST_edge_dtype) + self.num_edges = 0 + + self.idx_array = self.tree.idx_array + self.node_data = self.tree.node_data + + if not self.has_KDTree: + # Compute centroids for BallTree + self.centroid_distances = self.dist.pairwise(self.tree.node_bounds[0]) + + self._initialize_components() + self._compute_bounds() + + cdef _compute_bounds(self): + """Initialize core distances""" + + cdef intp_t i, n, m + + cdef cnp.ndarray[float64_t, ndim=2] knn_dist + cdef cnp.ndarray[intp_t, ndim=2] knn_indices + + # TODO: Evaluate query-parallelization featured in original HDBSCAN + # implementation. Removed for now for simplicity. + knn_dist, knn_indices = self.tree.query( + self.tree.data, + k=self.min_samples, + dualtree=True, + breadth_first=True + ) + + self.core_distance = knn_dist[:, self.min_samples - 1].copy() + + # Since we already computed NN distances for the min_samples closest + # points we can use this to do the first round of boruvka -- we won't + # get every point due to core_distance/mutual reachability distance + # issues, but we'll get quite a few, and they are the hard ones to + # get, so fill in any we can and then run update components. 
+        for n in range(self.num_points):
+            for i in range(0, self.min_samples):
+                m = knn_indices[n, i]
+                if n == m:
+                    continue
+                if self.core_distance[m] <= self.core_distance[n]:
+                    self.candidate_point[n] = n
+                    self.candidate_neighbor[n] = m
+                    self.candidate_distance[n] = self.core_distance[n]
+                    break
+
+        self.update_components()
+
+        for n in range(self.num_nodes):
+            self.bounds[n] = DBL_MAX
+
+    cdef _initialize_components(self):
+        """Initialize components of the min spanning tree (eventually there
+        is only one component; initially each point is its own component)"""
+
+        cdef intp_t n
+
+        for n in range(self.num_points):
+            self.component_of_point[n] = n
+            self.candidate_neighbor[n] = -1
+            self.candidate_point[n] = -1
+            self.candidate_distance[n] = DBL_MAX
+
+        for n in range(self.num_nodes):
+            self.component_of_node[n] = -(n+1)
+
+    cdef int update_components(self) noexcept nogil:
+        """Having found the nearest neighbor not in the same component for
+        each current component (via tree traversal), run through adding
+        edges to the min spanning tree and recomputing components via
+        union find."""
+
+        cdef:
+            intp_t sink, source, c, component, n, i, p
+            intp_t current_component, current_source_component
+            intp_t current_sink_component
+            intp_t child1, child2
+            NodeData_t node_info
+
+        # For each component there should be a:
+        #   - candidate point (a point in the component)
+        #   - candidate neighbor (the point to join with)
+        #   - candidate_distance (the distance from point to neighbor)
+        #
+        # We will go through and add an edge to the edge list
+        # for each of these, and then union the two points
+        # together in the union find structure
+
+        for c in range(self.components.shape[0]):
+            component = self.components[c]
+            source = self.candidate_point[component]
+            sink = self.candidate_neighbor[component]
+            if source == -1 or sink == -1:
+                continue
+            current_source_component = self.component_union_find.find(source)
+            current_sink_component = self.component_union_find.find(sink)
+            if current_source_component == current_sink_component:
+                # We've already joined these, so ignore this edge
+                self.candidate_point[component] = -1
+                self.candidate_neighbor[component] = -1
+                self.candidate_distance[component] = DBL_MAX
+                continue
+
+            self.edges[self.num_edges].current_node = source
+            self.edges[self.num_edges].next_node = sink
+            self.edges[self.num_edges].distance = self.candidate_distance[component]
+            self.num_edges += 1
+
+            self.component_union_find.union_(source, sink)
+
+            # Reset everything, and check if we're done
+            self.candidate_distance[component] = DBL_MAX
+            if self.num_edges == self.num_points - 1:
+                with gil:
+                    self.components = self.component_union_find.components()
+                return self.components.shape[0]
+
+        # After having joined everything in the union find data
+        # structure, we need to go through and determine the components
+        # of each point for easy lookup.
+        #
+        # Having done that, we then go through and set the component
+        # of each node, as this provides fast pruning in later
+        # tree traversals.
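+        # Illustrative note: iterating from the last node index down to 0
+        # visits every child before its parent, since in the implicit binary
+        # tree layout the children of node n are 2 * n + 1 and 2 * n + 2;
+        # child components are therefore already resolved when the parent is
+        # examined below.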
+        for n in range(self.num_points):
+            self.component_of_point[n] = self.component_union_find.find(n)
+
+        for n in range(self.num_nodes - 1, -1, -1):
+            node_info = self.node_data[n]
+            # Case 1:
+            #    If the node is a leaf we need to check that every point
+            #    in the node is of the same component
+            if node_info.is_leaf:
+                current_component = self.component_of_point[
+                    self.idx_array[node_info.idx_start]
+                ]
+                for i in range(node_info.idx_start + 1, node_info.idx_end):
+                    p = self.idx_array[i]
+                    if self.component_of_point[p] != current_component:
+                        break
+                else:
+                    self.component_of_node[n] = current_component
+            # Case 2:
+            #    If the node is not a leaf we only need to check
+            #    that both child nodes are in the same component
+            else:
+                child1 = 2 * n + 1
+                child2 = 2 * n + 2
+                if self.component_of_node[child1] == self.component_of_node[child2]:
+                    self.component_of_node[n] = self.component_of_node[child1]
+
+        # Since we're working with mutual reachability distance we often have
+        # ties or near ties; because of that we can benefit by not resetting
+        # the bounds unless we get stuck (don't join any components). Thus
+        # we check for that, and only reset bounds in the case where we have
+        # the same number of components as we did going in. This doesn't
+        # produce a true min spanning tree, but only an approximation, so we
+        # only do this if the caller is willing to accept such.
+        if self.approx_min_span_tree:
+            last_num_components = self.components.shape[0]
+            with gil:
+                self.components = self.component_union_find.components()
+
+            if self.components.shape[0] == last_num_components:
+                # Reset bounds
+                for n in range(self.num_nodes):
+                    self.bounds[n] = DBL_MAX
+        else:
+            with gil:
+                self.components = self.component_union_find.components()
+
+            for n in range(self.num_nodes):
+                self.bounds[n] = DBL_MAX
+
+        return self.components.shape[0]
+
+    cdef int dual_tree_traversal(
+        self,
+        intp_t node1,
+        intp_t node2
+    ) noexcept nogil:
+        """Perform a dual tree traversal, pruning wherever possible, to find
+        the nearest neighbor not in the same component for each component.
+        This is akin to a standard dual tree NN search, but we also prune
+        whenever all points in query and reference nodes are in the same
+        component."""
+
+        cdef intp_t[::1] point_indices1, point_indices2
+
+        cdef intp_t i, j, p, q
+        cdef intp_t parent, component1, component2
+
+        cdef NodeData_t node1_info = self.node_data[node1]
+        cdef NodeData_t node2_info = self.node_data[node2]
+        cdef NodeData_t parent_info, left_info, right_info
+
+        cdef float64_t d, mr_dist, _radius, node_dist
+        cdef float64_t new_bound, new_upper_bound, new_lower_bound
+        cdef float64_t bound_max, bound_min
+
+        cdef intp_t left, right
+        cdef float64_t left_dist, right_dist
+
+        # Compute the distance between the query and reference nodes
+        if self.has_KDTree:
+            node_dist = kd_tree_min_dist_dual(
+                self.dist,
+                node1, node2, self.node_bounds,
+                self.num_features
+            )
+        else:
+            node_dist = ball_tree_min_dist_dual(
+                node1_info.radius,
+                node2_info.radius,
+                node1, node2,
+                self.centroid_distances
+            )
+
+        # If the distance between the nodes is less than the current bound for
+        # the query and the nodes are not in the same component, continue;
+        # otherwise we get to prune this branch and return early.
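+        # Illustrative restatement: the early returns below cover the two
+        # prune cases:
+        #   - node_dist >= bounds[node1]: no pair drawn from these two nodes
+        #     can improve the candidate of any component in the query node;
+        #   - both nodes lie entirely within one component (component id
+        #     >= 0): no edge between them can join two distinct components.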
+        if node_dist < self.bounds[node1]:
+            if (
+                self.component_of_node[node1] ==
+                self.component_of_node[node2] and
+                self.component_of_node[node1] >= 0
+            ):
+                return 0
+        else:
+            return 0
+
+        # Case 1: Both nodes are leaves
+        #    for each pair of points in node1 x node2 we need
+        #    to compute the distance and see if it's better than
+        #    the current nearest neighbor for the component of
+        #    the point in the query node.
+        #
+        #    We get to take some shortcuts:
+        #       - if the core distance for a point is larger than
+        #         the distance to the nearest neighbor of the
+        #         component of the point ... then we can't get
+        #         a better mutual reachability distance and we
+        #         can skip computing anything for that point
+        #       - if the points are in the same component we
+        #         don't have to compute the distance.
+        #
+        #    We also have some catches:
+        #       - we need to compute mutual reachability distance
+        #         not just the ordinary distance; this involves
+        #         fiddling with core distances.
+        #       - We need to scale distances according to alpha,
+        #         but don't want to lose performance in the case
+        #         that alpha is 1.0.
+        #
+        #    Finally we can compute new bounds for the query node
+        #    based on the distances found here, so do that and
+        #    propagate the results up the tree.
+        if node1_info.is_leaf and node2_info.is_leaf:
+
+            new_upper_bound = 0.0
+            new_lower_bound = DBL_MAX
+
+            point_indices1 = self.idx_array[
+                node1_info.idx_start:node1_info.idx_end
+            ]
+            point_indices2 = self.idx_array[
+                node2_info.idx_start:node2_info.idx_end
+            ]
+
+            for i in range(point_indices1.shape[0]):
+
+                p = point_indices1[i]
+                component1 = self.component_of_point[p]
+
+                if self.core_distance[p] > self.candidate_distance[component1]:
+                    continue
+
+                for j in range(point_indices2.shape[0]):
+
+                    q = point_indices2[j]
+                    component2 = self.component_of_point[q]
+
+                    if self.core_distance[q] > self.candidate_distance[component1]:
+                        continue
+
+                    if component1 != component2:
+                        d = self.dist.dist(
+                            &self.raw_data[p][0],
+                            &self.raw_data[q][0],
+                            self.num_features
+                        )
+                        if self.alpha != 1.0:
+                            mr_dist = max(
+                                d / self.alpha,
+                                self.core_distance[p],
+                                self.core_distance[q]
+                            )
+                        else:
+                            mr_dist = max(
+                                d, self.core_distance[p],
+                                self.core_distance[q]
+                            )
+                        if mr_dist < self.candidate_distance[component1]:
+                            self.candidate_distance[component1] = mr_dist
+                            self.candidate_neighbor[component1] = q
+                            self.candidate_point[component1] = p
+
+                new_upper_bound = max(
+                    new_upper_bound,
+                    self.candidate_distance[component1]
+                )
+                new_lower_bound = min(
+                    new_lower_bound,
+                    self.candidate_distance[component1]
+                )
+
+            # Compute new bounds for the query node, and
+            # then propagate the results of that computation
+            # up the tree.
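+            # Sketch of the reasoning (after March et al. 2010, not a comment
+            # from the original code): every component with a point in this
+            # node now has a candidate within new_upper_bound, and any two
+            # points in the node are within 2 * radius of one another, so
+            # new_lower_bound + 2 * radius also limits how far any component
+            # here still needs to search; the tighter of the two is a valid
+            # bound for the whole node.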
+ _radius = node1_info.radius + new_bound = min( + new_upper_bound, + new_lower_bound + 2 * _radius + ) + if new_bound < self.bounds[node1]: + self.bounds[node1] = new_bound + + # Propagate bounds up the tree + while node1 > 0: + parent = (node1 - 1) // 2 + left = 2 * parent + 1 + right = 2 * parent + 2 + + parent_info = self.node_data[parent] + left_info = self.node_data[left] + right_info = self.node_data[right] + + bound_max = max( + self.bounds[left], + self.bounds[right] + ) + + if self.has_KDTree: + new_bound = bound_max + else: + bound_min = min( + self.bounds[left] + 2 * (parent_info.radius - left_info.radius), + self.bounds[right] + 2 * (parent_info.radius - right_info.radius) + ) + + if bound_min > 0: + new_bound = min(bound_max, bound_min) + else: + new_bound = bound_max + if new_bound < self.bounds[parent]: + self.bounds[parent] = new_bound + node1 = parent + else: + break + + # Case 2a: The query node is a leaf, or is smaller than + # the reference node. + # + # We descend in the reference tree. We first + # compute distances between nodes to determine + # whether we should prioritise the left or + # right branch in the reference tree. + elif ( + node1_info.is_leaf or + ( + not node2_info.is_leaf and + node2_info.radius > node1_info.radius + ) + ): + left = 2 * node2 + 1 + right = 2 * node2 + 2 + + if self.has_KDTree: + left_dist = kd_tree_min_dist_dual( + self.dist, + node1, left, + self.node_bounds, + self.num_features + ) + right_dist = kd_tree_min_dist_dual( + self.dist, + node1, right, + self.node_bounds, + self.num_features + ) + else: + node2_info = self.node_data[left] + left_dist = ball_tree_min_dist_dual( + node1_info.radius, + node2_info.radius, + node1, left, + self.centroid_distances + ) + node2_info = self.node_data[right] + right_dist = ball_tree_min_dist_dual( + node1_info.radius, + node2_info.radius, + node1, right, + self.centroid_distances + ) + + if left_dist < right_dist: + self.dual_tree_traversal(node1, left) + self.dual_tree_traversal(node1, right) + else: + self.dual_tree_traversal(node1, right) + self.dual_tree_traversal(node1, left) + + # Case 2b: The reference node is a leaf, or is smaller than + # the query node. + # + # We descend in the query tree. We first + # compute distances between nodes to determine + # whether we should prioritise the left or + # right branch in the query tree. 
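+        # Illustrative note: descending into the closer child first is a
+        # heuristic; it tends to tighten candidate distances and node bounds
+        # early, which makes it more likely that the second, farther
+        # recursive call can be pruned outright.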
+ else: + left = 2 * node1 + 1 + right = 2 * node1 + 2 + if self.has_KDTree: + left_dist = kd_tree_min_dist_dual( + self.dist, + left, node2, + self.node_bounds, + self.num_features + ) + right_dist = kd_tree_min_dist_dual( + self.dist, + right, node2, + self.node_bounds, + self.num_features + ) + else: + node1_info = self.node_data[left] + left_dist = ball_tree_min_dist_dual( + node1_info.radius, + node2_info.radius, + left, node2, + self.centroid_distances + ) + node1_info = self.node_data[right] + right_dist = ball_tree_min_dist_dual( + node1_info.radius, + node2_info.radius, + right, node2, + self.centroid_distances + ) + + if left_dist < right_dist: + self.dual_tree_traversal(left, node2) + self.dual_tree_traversal(right, node2) + else: + self.dual_tree_traversal(right, node2) + self.dual_tree_traversal(left, node2) + + return 0 + + cpdef spanning_tree(self): + """Compute the minimum spanning tree of the data held by + the tree passed in at construction""" + + cdef intp_t num_components = self.num_points + with nogil: + while num_components > 1: + self.dual_tree_traversal(0, 0) + num_components = self.update_components() + + return np.array(self.edges, dtype=MST_edge_dtype) diff --git a/sklearn/cluster/_hdbscan/_linkage.pxd b/sklearn/cluster/_hdbscan/_linkage.pxd new file mode 100644 index 0000000000000..2575441f9bd36 --- /dev/null +++ b/sklearn/cluster/_hdbscan/_linkage.pxd @@ -0,0 +1,8 @@ +from ...utils._typedefs cimport float64_t, int64_t + +# Packed shouldn't make a difference since they're all 8-byte quantities, +# but it's included just to be safe. +ctypedef packed struct MST_edge_t: + int64_t current_node + int64_t next_node + float64_t distance diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx index 0a54d62ae4129..403889df2aa1e 100644 --- a/sklearn/cluster/_hdbscan/_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_linkage.pyx @@ -39,7 +39,7 @@ from ...metrics._dist_metrics cimport DistanceMetric64 from ...cluster._hierarchical_fast cimport UnionFind from ...cluster._hdbscan._tree cimport HIERARCHY_t from ...cluster._hdbscan._tree import HIERARCHY_dtype -from ...utils._typedefs cimport intp_t, float64_t, int64_t, uint8_t +from ...utils._typedefs cimport intp_t, uint8_t cnp.import_array() @@ -53,13 +53,6 @@ MST_edge_dtype = np.dtype([ ("distance", np.float64), ]) -# Packed shouldn't make a difference since they're all 8-byte quantities, -# but it's included just to be safe. -ctypedef packed struct MST_edge_t: - int64_t current_node - int64_t next_node - float64_t distance - cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability( cnp.ndarray[float64_t, ndim=2] mutual_reachability ): diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 52e99027b61c5..3e07c16212927 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -35,10 +35,12 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
+import warnings
 from numbers import Integral, Real
 from warnings import warn
 
 import numpy as np
+from joblib import effective_n_jobs
 from scipy.sparse import csgraph, issparse
 
 from ...base import BaseEstimator, ClusterMixin
@@ -47,6 +49,7 @@
 from ...neighbors import BallTree, KDTree, NearestNeighbors
 from ...utils._param_validation import Interval, StrOptions
 from ...utils.validation import _allclose_dense_sparse, _assert_all_finite
+from ._boruvka import BoruvkaAlgorithm
 from ._linkage import (
     MST_edge_dtype,
     make_single_linkage,
@@ -344,6 +347,37 @@ def _hdbscan_prims(
     # Mutual reachability distance is implicit in mst_from_data_matrix
     min_spanning_tree = mst_from_data_matrix(X, core_distances, dist_metric, alpha)
+
+    return _process_mst(min_spanning_tree)
+
+
+def _hdbscan_boruvka(
+    X,
+    algo,
+    min_samples=5,
+    alpha=1.0,
+    metric="euclidean",
+    leaf_size=40,
+    n_jobs=None,
+    approx_min_span_tree=False,
+    **metric_params,
+):
+    leaf_size = max(leaf_size, 3)
+    Tree = KDTree if algo == "kd_tree" else BallTree
+    tree = Tree(X, metric=metric, leaf_size=leaf_size, **metric_params)
+
+    n_jobs = effective_n_jobs(n_jobs)
+    out = BoruvkaAlgorithm(
+        tree=tree,
+        min_samples=min_samples,
+        metric=metric,
+        leaf_size=leaf_size // 3,
+        alpha=alpha,
+        approx_min_span_tree=approx_min_span_tree,
+        n_jobs=n_jobs,
+        **metric_params,
+    )
+    min_spanning_tree = out.spanning_tree()
 
     return _process_mst(min_spanning_tree)
 
@@ -473,7 +507,7 @@ class HDBSCAN(ClusterMixin, BaseEstimator):
         If the `X` passed during `fit` is sparse or `metric` is invalid for
         both :class:`~sklearn.neighbors.KDTree` and
         :class:`~sklearn.neighbors.BallTree`, then it resolves to use the
-        `"brute"` algorithm.
+        `"brute"` minimum-spanning tree algorithm.
 
         .. deprecated:: 1.4
            The `'kdtree'` option was deprecated in version 1.4,
@@ -483,6 +517,16 @@ class HDBSCAN(ClusterMixin, BaseEstimator):
            The `'balltree'` option was deprecated in version 1.4,
            and will be renamed to `'ball_tree'` in 1.6.
 
+    mst_algorithm : {"auto", "brute", "prims", "boruvka_exact", "boruvka_approx"}, default="auto"
+        Which algorithm to use for building the minimum spanning tree.
+        The `"auto"` option switches between `"brute"` and `"boruvka_exact"` based
+        on the data and use of precomputed distances. If you can tolerate some
+        inexactness and would prefer a speedup, consider using `"boruvka_approx"`.
+        The speedup is especially dramatic when dealing with many features
+        (`n_features` > ~45).
+
+        .. versionadded:: 1.4
+
     leaf_size : int, default=40
         Leaf size for trees responsible for fast nearest neighbour queries when
         a KDTree or a BallTree are used as core-distance algorithms. A large
@@ -613,9 +657,9 @@ class HDBSCAN(ClusterMixin, BaseEstimator):
     >>> from sklearn.cluster import HDBSCAN
     >>> from sklearn.datasets import load_digits
     >>> X, _ = load_digits(return_X_y=True)
-    >>> hdb = HDBSCAN(min_cluster_size=20)
+    >>> hdb = HDBSCAN(min_cluster_size=20, mst_algorithm='prims')
     >>> hdb.fit(X)
-    HDBSCAN(min_cluster_size=20)
+    HDBSCAN(min_cluster_size=20, mst_algorithm='prims')
     >>> hdb.labels_
     array([ 2,  6, -1, ..., -1, -1, -1])
     """
@@ -640,6 +684,12 @@
                 deprecated={"kdtree", "balltree"},
             ),
         ],
+        "mst_algorithm": [
+            StrOptions(
+                {"auto", "brute", "prims", "boruvka_exact", "boruvka_approx", "warn"},
+                deprecated={"warn"},
+            ),
+        ],
         "leaf_size": [Interval(Integral, left=1, right=None, closed="left")],
         "n_jobs": [Integral, None],
         "cluster_selection_method": [StrOptions({"eom", "leaf"})],
@@ -658,6 +708,8 @@ def __init__(
         self,
         ...
         metric_params=None,
         alpha=1.0,
         algorithm="auto",
+        # TODO(1.6): Change default to "auto"
+        mst_algorithm="warn",
         leaf_size=40,
         n_jobs=None,
         cluster_selection_method="eom",
@@ -673,6 +725,7 @@
         self.metric = metric
         self.metric_params = metric_params
         self.algorithm = algorithm
+        self.mst_algorithm = mst_algorithm
         self.leaf_size = leaf_size
         self.n_jobs = n_jobs
         self.cluster_selection_method = cluster_selection_method
@@ -765,6 +818,47 @@ def fit(self, X, y=None):
                 f" samples in X ({X.shape[0]})"
             )
 
+        # TODO(1.6): Remove and set `mst_algorithm` default to "auto"
+        if self.mst_algorithm == "warn":
+            if self.algorithm == "brute" or (
+                self.algorithm == "auto" and self.metric == "precomputed"
+            ):
+                mst_algorithm = "brute"
+            else:
+                mst_algorithm = "prims"
+            warnings.warn(
+                (
+                    "In version 1.6 the default MST algorithm dispatch behavior will"
+                    " change to include the new `boruvka_exact` and `boruvka_approx`"
+                    " algorithms, meaning some models may change. To suppress this"
+                    " warning and avoid unintended changes in behavior, please set"
+                    " `mst_algorithm` explicitly. You can opt in to the new behavior"
+                    " by setting `mst_algorithm='auto'`. To preserve the previous"
+                    " behavior, set `mst_algorithm` to `'brute'` or `'auto'` when"
+                    " `algorithm='brute'`, or when `algorithm='auto'` and"
+                    " `metric='precomputed'`; otherwise set `mst_algorithm='prims'`."
+                ),
+                FutureWarning,
+            )
+        else:
+            mst_algorithm = self.mst_algorithm
+
+        algorithms = {self.algorithm, mst_algorithm}
+        brute_compat_algorithms = {"auto", "brute"}
+        using_brute_compat_algos = algorithms.issubset(brute_compat_algorithms)
+
+        if "brute" in algorithms and not using_brute_compat_algos:
+            raise ValueError(
+                "When setting either `algorithm='brute'` or `mst_algorithm='brute'`,"
+                " both keyword arguments must be set to either 'brute' or 'auto'."
+            )
+        if self.metric == "precomputed" and not using_brute_compat_algos:
+            raise ValueError(
+                "When setting `metric='precomputed'`, both `mst_algorithm` and"
+                " `algorithm` must be set to either 'brute' or 'auto'."
+            )
+
         # TODO(1.6): Remove
         if self.algorithm == "kdtree":
             warn(
@@ -811,40 +905,55 @@
                 " Please select a different metric."
) - if self.algorithm != "auto": + if algorithms != {"auto"}: if ( self.metric != "precomputed" and issparse(X) - and self.algorithm != "brute" + and "brute" not in algorithms ): raise ValueError("Sparse data matrices only support algorithm `brute`.") - - if self.algorithm == "brute": + if "brute" in algorithms: mst_func = _hdbscan_brute kwargs["copy"] = self.copy - elif self.algorithm == "kd_tree": - mst_func = _hdbscan_prims - kwargs["algo"] = "kd_tree" - kwargs["leaf_size"] = self.leaf_size else: - mst_func = _hdbscan_prims - kwargs["algo"] = "ball_tree" kwargs["leaf_size"] = self.leaf_size + # We prefer KDTree unless otherwise specified + if self.algorithm != "auto": + tree_algorithm = self.algorithm + else: + tree_algorithm = ( + "kd_tree" + if self.metric in KDTree.valid_metrics + else "ball_tree" + ) + kwargs["algo"] = tree_algorithm + + if mst_algorithm != "auto": + if mst_algorithm == "prims": + mst_func = _hdbscan_prims + else: + mst_func = _hdbscan_boruvka + kwargs["approx_min_span_tree"] = ( + mst_algorithm == "boruvka_approx" + ) + else: + # Boruvka is always preferable + mst_func = _hdbscan_boruvka + kwargs["approx_min_span_tree"] = False + else: if issparse(X) or self.metric not in FAST_METRICS: # We can't do much with sparse matrices ... mst_func = _hdbscan_brute kwargs["copy"] = self.copy - elif self.metric in KDTree.valid_metrics: - # TODO: Benchmark KD vs Ball Tree efficiency - mst_func = _hdbscan_prims - kwargs["algo"] = "kd_tree" - kwargs["leaf_size"] = self.leaf_size else: - # Metric is a valid BallTree metric - mst_func = _hdbscan_prims - kwargs["algo"] = "ball_tree" + # Boruvka is always preferable + mst_func = _hdbscan_boruvka + kwargs["approx_min_span_tree"] = False kwargs["leaf_size"] = self.leaf_size + kwargs["algo"] = ( + "kd_tree" if self.metric in KDTree.valid_metrics else "ball_tree" + ) self._single_linkage_tree_ = mst_func(**kwargs) diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index 26f5b64cb2bfd..22818ae702f7e 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -28,21 +28,67 @@ X, y = shuffle(X, y, random_state=7) X = StandardScaler().fit_transform(X) -ALGORITHMS = [ +# These are necessary options for both space-tree/MST algorithm selection +BRUTE_COMPATIBLE = {"auto", "brute"} + +ALGORITHMS = { "kd_tree", "ball_tree", - "brute", - "auto", -] +} | BRUTE_COMPATIBLE + +EXACT_MST_ALGORITHMS = {"prims", "boruvka_exact"} +MST_ALGORITHMS = {"boruvka_approx"} | EXACT_MST_ALGORITHMS | BRUTE_COMPATIBLE + OUTLIER_SET = {-1} | {out["label"] for _, out in _OUTLIER_ENCODING.items()} +def _validate_algorithms(algorithm, mst_algorithm): + algos = {algorithm, mst_algorithm} + if "brute" in algos and not algos.issubset(BRUTE_COMPATIBLE): + pytest.xfail("Incompatible algorithm configuration") + + +@pytest.mark.parametrize("tree", ["kd_tree", "ball_tree"]) +@pytest.mark.parametrize("n_jobs", [1, 4]) +@pytest.mark.parametrize("mst_algorithm", ["boruvka_exact", "boruvka_approx"]) +def test_hdbscan_boruvka_matches(tree, n_jobs, mst_algorithm): + hdb_prims = HDBSCAN(algorithm=tree, mst_algorithm="prims", n_jobs=n_jobs).fit(X) + hdb_boruvka = HDBSCAN( + algorithm=tree, mst_algorithm=mst_algorithm, n_jobs=n_jobs + ).fit(X) + labels_prims = hdb_prims.labels_ + labels_boruvka = hdb_boruvka.labels_ + + similarity = fowlkes_mallows_score(labels_prims, labels_boruvka) + + # We should expect that the exact boruvka algorithm produces a correct mst, + # but the approximation will almost surely 
produce an incorrect tree, and
+    # hence differ from the exact labels.
+    assert similarity >= (0.91 if "approx" in mst_algorithm else 1)
+
+
+def test_hdbscan_mst_algorithm_errors():
+    msg = "When setting either"
+    for tree in ["kd_tree", "ball_tree"]:
+        hdb = HDBSCAN(algorithm=tree, mst_algorithm="brute")
+        with pytest.raises(ValueError, match=msg):
+            hdb.fit(X, y)
+
+    for mst_algorithm in MST_ALGORITHMS - BRUTE_COMPATIBLE:
+        hdb = HDBSCAN(algorithm="brute", mst_algorithm=mst_algorithm)
+        with pytest.raises(ValueError, match=msg):
+            hdb.fit(X, y)
+
+
 @pytest.mark.parametrize("outlier_type", _OUTLIER_ENCODING)
-def test_outlier_data(outlier_type):
+@pytest.mark.parametrize("mst_algorithm", sorted(MST_ALGORITHMS))
+@pytest.mark.parametrize("algorithm", sorted(ALGORITHMS))
+def test_outlier_data(outlier_type, mst_algorithm, algorithm):
     """
     Tests if np.inf and np.nan data are each treated as special outliers.
     """
+    _validate_algorithms(algorithm, mst_algorithm)
     outlier = {
         "infinite": np.inf,
         "missing": np.nan,
@@ -57,7 +103,7 @@
     X_outlier = X.copy()
     X_outlier[0] = [outlier, 1]
     X_outlier[5] = [outlier, outlier]
-    model = HDBSCAN().fit(X_outlier)
+    model = HDBSCAN(algorithm=algorithm, mst_algorithm=mst_algorithm).fit(X_outlier)
 
     (missing_labels_idx,) = (model.labels_ == label).nonzero()
     assert_array_equal(missing_labels_idx, [0, 5])
@@ -66,7 +112,9 @@
     assert_array_equal(missing_probs_idx, [0, 5])
 
     clean_indices = list(range(1, 5)) + list(range(6, 200))
-    clean_model = HDBSCAN().fit(X_outlier[clean_indices])
+    clean_model = HDBSCAN(algorithm=algorithm, mst_algorithm=mst_algorithm).fit(
+        X_outlier[clean_indices]
+    )
     assert_array_equal(clean_model.labels_, model.labels_[clean_indices])
 
 
@@ -77,7 +125,9 @@ def test_hdbscan_distance_matrix():
     """
     D = euclidean_distances(X)
     D_original = D.copy()
-    labels = HDBSCAN(metric="precomputed", copy=True).fit_predict(D)
+    labels = HDBSCAN(metric="precomputed", copy=True, mst_algorithm="auto").fit_predict(
+        D
+    )
 
     assert_allclose(D, D_original)
     n_clusters = len(set(labels) - OUTLIER_SET)
@@ -90,14 +140,14 @@
 
     msg = r"The precomputed distance matrix.*has shape"
     with pytest.raises(ValueError, match=msg):
-        HDBSCAN(metric="precomputed", copy=True).fit_predict(X)
+        HDBSCAN(metric="precomputed", copy=True, mst_algorithm="auto").fit_predict(X)
 
     msg = r"The precomputed distance matrix.*values"
     # Ensure the matrix is not symmetric
     D[0, 1] = 10
     D[1, 0] = 1
     with pytest.raises(ValueError, match=msg):
-        HDBSCAN(metric="precomputed").fit_predict(D)
+        HDBSCAN(metric="precomputed", mst_algorithm="auto").fit_predict(D)
 
 
 @pytest.mark.parametrize("sparse_constructor", [*CSR_CONTAINERS, *CSC_CONTAINERS])
@@ -114,7 +164,7 @@ def test_hdbscan_sparse_distance_matrix(sparse_constructor):
     D = sparse_constructor(D)
     D.eliminate_zeros()
 
-    labels = HDBSCAN(metric="precomputed").fit_predict(D)
+    labels = HDBSCAN(metric="precomputed", mst_algorithm="auto").fit_predict(D)
     n_clusters = len(set(labels) - OUTLIER_SET)
     assert n_clusters == n_clusters_true
 
@@ -124,7 +174,7 @@ def test_hdbscan_feature_array():
     Tests that HDBSCAN works with feature array, including an arbitrary
     goodness of fit check. Note that the check is a simple heuristic.
""" - labels = HDBSCAN().fit_predict(X) + labels = HDBSCAN(mst_algorithm="auto").fit_predict(X) n_clusters = len(set(labels) - OUTLIER_SET) assert n_clusters == n_clusters_true @@ -134,19 +184,19 @@ def test_hdbscan_feature_array(): assert score >= 0.98 -@pytest.mark.parametrize("algo", ALGORITHMS) +@pytest.mark.parametrize("algorithm", sorted(ALGORITHMS)) @pytest.mark.parametrize("metric", _VALID_METRICS) -def test_hdbscan_algorithms(algo, metric): +def test_hdbscan_algorithms(algorithm, metric): """ Tests that HDBSCAN works with the expected combinations of algorithms and metrics, or raises the expected errors. """ - labels = HDBSCAN(algorithm=algo).fit_predict(X) + labels = HDBSCAN(algorithm=algorithm, mst_algorithm="auto").fit_predict(X) n_clusters = len(set(labels) - OUTLIER_SET) assert n_clusters == n_clusters_true # Validation for brute is handled by `pairwise_distances` - if algo in ("brute", "auto"): + if algorithm in ("brute", "auto"): return ALGOS_TREES = { @@ -161,12 +211,13 @@ def test_hdbscan_algorithms(algo, metric): }.get(metric, None) hdb = HDBSCAN( - algorithm=algo, + algorithm=algorithm, metric=metric, metric_params=metric_params, + mst_algorithm="auto", ) - if metric not in ALGOS_TREES[algo].valid_metrics: + if metric not in ALGOS_TREES[algorithm].valid_metrics: with pytest.raises(ValueError): hdb.fit(X) elif metric == "wminkowski": @@ -183,7 +234,7 @@ def test_dbscan_clustering(): TODO: Improve and strengthen this test if at all possible. """ - clusterer = HDBSCAN().fit(X) + clusterer = HDBSCAN(mst_algorithm="auto").fit(X) labels = clusterer.dbscan_clustering(0.3) n_clusters = len(set(labels) - OUTLIER_SET) assert n_clusters == n_clusters_true @@ -201,7 +252,7 @@ def test_dbscan_clustering_outlier_data(cut_distance): X_outlier[0] = [np.inf, 1] X_outlier[2] = [1, np.nan] X_outlier[5] = [np.inf, np.nan] - model = HDBSCAN().fit(X_outlier) + model = HDBSCAN(mst_algorithm="auto").fit(X_outlier) labels = model.dbscan_clustering(cut_distance=cut_distance) missing_labels_idx = np.flatnonzero(labels == missing_label) @@ -211,7 +262,7 @@ def test_dbscan_clustering_outlier_data(cut_distance): assert_array_equal(infinite_labels_idx, [0]) clean_idx = list(set(range(200)) - set(missing_labels_idx + infinite_labels_idx)) - clean_model = HDBSCAN().fit(X_outlier[clean_idx]) + clean_model = HDBSCAN(mst_algorithm="auto").fit(X_outlier[clean_idx]) clean_labels = clean_model.dbscan_clustering(cut_distance=cut_distance) assert_array_equal(clean_labels, labels[clean_idx]) @@ -225,6 +276,7 @@ def test_hdbscan_high_dimensional(): labels = HDBSCAN( algorithm="auto", metric="seuclidean", + mst_algorithm="auto", metric_params={"V": np.ones(H.shape[1])}, ).fit_predict(H) n_clusters = len(set(labels) - OUTLIER_SET) @@ -236,7 +288,9 @@ def test_hdbscan_best_balltree_metric(): Tests that HDBSCAN using `BallTree` works. """ labels = HDBSCAN( - metric="seuclidean", metric_params={"V": np.ones(X.shape[1])} + metric="seuclidean", + mst_algorithm="auto", + metric_params={"V": np.ones(X.shape[1])}, ).fit_predict(X) n_clusters = len(set(labels) - OUTLIER_SET) assert n_clusters == n_clusters_true @@ -247,7 +301,7 @@ def test_hdbscan_no_clusters(): Tests that HDBSCAN correctly does not generate a valid cluster when the `min_cluster_size` is too large for the data. 
""" - labels = HDBSCAN(min_cluster_size=len(X) - 1).fit_predict(X) + labels = HDBSCAN(min_cluster_size=len(X) - 1, mst_algorithm="auto").fit_predict(X) n_clusters = len(set(labels) - OUTLIER_SET) assert n_clusters == 0 @@ -258,7 +312,9 @@ def test_hdbscan_min_cluster_size(): many points """ for min_cluster_size in range(2, len(X), 1): - labels = HDBSCAN(min_cluster_size=min_cluster_size).fit_predict(X) + labels = HDBSCAN( + min_cluster_size=min_cluster_size, mst_algorithm="auto" + ).fit_predict(X) true_labels = [label for label in labels if label != -1] if len(true_labels) != 0: assert np.min(np.bincount(true_labels)) >= min_cluster_size @@ -269,77 +325,103 @@ def test_hdbscan_callable_metric(): Tests that HDBSCAN works when passed a callable metric. """ metric = distance.euclidean - labels = HDBSCAN(metric=metric).fit_predict(X) + labels = HDBSCAN(metric=metric, mst_algorithm="auto").fit_predict(X) n_clusters = len(set(labels) - OUTLIER_SET) assert n_clusters == n_clusters_true -@pytest.mark.parametrize("tree", ["kd_tree", "ball_tree"]) -def test_hdbscan_precomputed_non_brute(tree): +@pytest.mark.parametrize("algorithm", sorted(ALGORITHMS)) +@pytest.mark.parametrize("mst_algorithm", sorted(MST_ALGORITHMS)) +def test_hdbscan_precomputed_non_brute(algorithm, mst_algorithm): """ Tests that HDBSCAN correctly raises an error when passing precomputed data while requesting a tree-based algorithm. """ - hdb = HDBSCAN(metric="precomputed", algorithm=tree) - msg = "precomputed is not a valid metric for" + algos = {algorithm, mst_algorithm} + if algos.issubset(BRUTE_COMPATIBLE): + return + hdb = HDBSCAN( + metric="precomputed", algorithm=algorithm, mst_algorithm=mst_algorithm + ) + + if "brute" in algos: + msg = "When setting either `algorithm='brute'` or `mst_algorithm='brute'`" + else: + msg = "When setting `metric='precomputed'`, both `mst_algorithm` and" with pytest.raises(ValueError, match=msg): hdb.fit(X) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) -def test_hdbscan_sparse(csr_container): +@pytest.mark.parametrize("mst_algorithm", sorted(EXACT_MST_ALGORITHMS)) +def test_hdbscan_sparse(csr_container, mst_algorithm): """ Tests that HDBSCAN works correctly when passing sparse feature data. Evaluates correctness by comparing against the same data passed as a dense array. """ - dense_labels = HDBSCAN().fit(X).labels_ + dense_labels = HDBSCAN(mst_algorithm=mst_algorithm).fit(X).labels_ n_clusters = len(set(dense_labels) - OUTLIER_SET) assert n_clusters == 3 _X_sparse = csr_container(X) X_sparse = _X_sparse.copy() - sparse_labels = HDBSCAN().fit(X_sparse).labels_ - assert_array_equal(dense_labels, sparse_labels) + sparse_labels = HDBSCAN(mst_algorithm="auto").fit(X_sparse).labels_ + fowlkes_mallows_score(dense_labels, sparse_labels) == 1 # Compare that the sparse and dense non-precomputed routines return the same labels # where the 0th observation contains the outlier. 
    for outlier_val, outlier_type in ((np.inf, "infinite"), (np.nan, "missing")):
         X_dense = X.copy()
         X_dense[0, 0] = outlier_val
-        dense_labels = HDBSCAN().fit(X_dense).labels_
+        dense_labels = HDBSCAN(mst_algorithm=mst_algorithm).fit(X_dense).labels_
         n_clusters = len(set(dense_labels) - OUTLIER_SET)
         assert n_clusters == 3
         assert dense_labels[0] == _OUTLIER_ENCODING[outlier_type]["label"]
 
         X_sparse = _X_sparse.copy()
         X_sparse[0, 0] = outlier_val
-        sparse_labels = HDBSCAN().fit(X_sparse).labels_
-        assert_array_equal(dense_labels, sparse_labels)
+        sparse_labels = HDBSCAN(mst_algorithm="auto").fit(X_sparse).labels_
+        assert fowlkes_mallows_score(dense_labels, sparse_labels) == 1
 
     msg = "Sparse data matrices only support algorithm `brute`."
     with pytest.raises(ValueError, match=msg):
-        HDBSCAN(metric="euclidean", algorithm="ball_tree").fit(X_sparse)
+        HDBSCAN(metric="euclidean", algorithm="ball_tree", mst_algorithm="auto").fit(
+            X_sparse
+        )
 
 
-@pytest.mark.parametrize("algorithm", ALGORITHMS)
-def test_hdbscan_centers(algorithm):
+@pytest.mark.parametrize("algorithm", sorted(ALGORITHMS))
+@pytest.mark.parametrize("mst_algorithm", sorted(MST_ALGORITHMS))
+def test_hdbscan_centers(algorithm, mst_algorithm):
     """
     Tests that HDBSCAN centers are calculated and stored properly, and are
     accurate to the data.
     """
+    _validate_algorithms(algorithm, mst_algorithm)
+
     centers = [(0.0, 0.0), (3.0, 3.0)]
     H, _ = make_blobs(n_samples=1000, random_state=0, centers=centers, cluster_std=0.5)
-    hdb = HDBSCAN(store_centers="both").fit(H)
-
-    for center, centroid, medoid in zip(centers, hdb.centroids_, hdb.medoids_):
+    hdb = HDBSCAN(
+        algorithm=algorithm,
+        mst_algorithm=mst_algorithm,
+        store_centers="both",
+        min_samples=10,
+    ).fit(H)
+
+    centroids = np.sort(hdb.centroids_, axis=0)
+    medoids = np.sort(hdb.medoids_, axis=0)
+    for center, centroid, medoid in zip(centers, centroids, medoids):
         assert_allclose(center, centroid, rtol=1, atol=0.05)
         assert_allclose(center, medoid, rtol=1, atol=0.05)
 
     # Ensure that nothing is done for noise
     hdb = HDBSCAN(
-        algorithm=algorithm, store_centers="both", min_cluster_size=X.shape[0]
+        algorithm=algorithm,
+        mst_algorithm=mst_algorithm,
+        store_centers="both",
+        min_cluster_size=X.shape[0],
     ).fit(X)
     assert hdb.centroids_.shape[0] == 0
     assert hdb.medoids_.shape[0] == 0
@@ -357,6 +439,7 @@ def test_hdbscan_allow_single_cluster_with_epsilon():
         cluster_selection_epsilon=0.0,
         cluster_selection_method="eom",
         allow_single_cluster=True,
+        mst_algorithm="auto",
     ).fit_predict(no_structure)
     unique_labels, counts = np.unique(labels, return_counts=True)
     assert len(unique_labels) == 2
@@ -372,6 +455,7 @@
         cluster_selection_method="eom",
         allow_single_cluster=True,
         algorithm="kd_tree",
+        mst_algorithm="auto",
     ).fit_predict(no_structure)
     unique_labels, counts = np.unique(labels, return_counts=True)
     assert len(unique_labels) == 2
@@ -391,7 +475,7 @@ def test_hdbscan_better_than_dbscan():
         cluster_std=[0.2, 0.35, 1.35, 1.35],
         random_state=0,
     )
-    hdb = HDBSCAN().fit(X)
+    hdb = HDBSCAN(mst_algorithm="auto").fit(X)
     n_clusters = len(set(hdb.labels_)) - int(-1 in hdb.labels_)
     assert n_clusters == 4
@@ -409,7 +493,7 @@ def test_hdbscan_usable_inputs(X, kwargs):
     Tests that HDBSCAN works correctly for array-likes and precomputed inputs
     with non-finite points.
""" - HDBSCAN(min_samples=1, **kwargs).fit(X) + HDBSCAN(min_samples=1, mst_algorithm="auto", **kwargs).fit(X) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) @@ -422,10 +506,11 @@ def test_hdbscan_sparse_distances_too_few_nonzero(csr_container): msg = "There exists points with fewer than" with pytest.raises(ValueError, match=msg): - HDBSCAN(metric="precomputed").fit(X) + HDBSCAN(metric="precomputed", mst_algorithm="auto").fit(X) -def test_hdbscan_tree_invalid_metric(): +@pytest.mark.parametrize("mst_algorithm", sorted(MST_ALGORITHMS - {"brute"})) +def test_hdbscan_tree_invalid_metric(mst_algorithm): """ Tests that HDBSCAN correctly raises an error for invalid metric choices. """ @@ -437,16 +522,24 @@ def test_hdbscan_tree_invalid_metric(): # Callables are not supported for either with pytest.raises(ValueError, match=msg): - HDBSCAN(algorithm="kd_tree", metric=metric_callable).fit(X) + HDBSCAN( + algorithm="kd_tree", metric=metric_callable, mst_algorithm=mst_algorithm + ).fit(X) with pytest.raises(ValueError, match=msg): - HDBSCAN(algorithm="ball_tree", metric=metric_callable).fit(X) + HDBSCAN( + algorithm="ball_tree", metric=metric_callable, mst_algorithm=mst_algorithm + ).fit(X) # The set of valid metrics for KDTree at the time of writing this test is a # strict subset of those supported in BallTree metrics_not_kd = list(set(BallTree.valid_metrics) - set(KDTree.valid_metrics)) if len(metrics_not_kd) > 0: with pytest.raises(ValueError, match=msg): - HDBSCAN(algorithm="kd_tree", metric=metrics_not_kd[0]).fit(X) + HDBSCAN( + algorithm="kd_tree", + metric=metrics_not_kd[0], + mst_algorithm=mst_algorithm, + ).fit(X) def test_hdbscan_too_many_min_samples(): @@ -454,7 +547,7 @@ def test_hdbscan_too_many_min_samples(): Tests that HDBSCAN correctly raises an error when setting `min_samples` larger than the number of samples. """ - hdb = HDBSCAN(min_samples=len(X) + 1) + hdb = HDBSCAN(min_samples=len(X) + 1, mst_algorithm="auto") msg = r"min_samples (.*) must be at most" with pytest.raises(ValueError, match=msg): hdb.fit(X) @@ -468,7 +561,7 @@ def test_hdbscan_precomputed_dense_nan(): X_nan = X.copy() X_nan[0, 0] = np.nan msg = "np.nan values found in precomputed-dense" - hdb = HDBSCAN(metric="precomputed") + hdb = HDBSCAN(metric="precomputed", mst_algorithm="auto") with pytest.raises(ValueError, match=msg): hdb.fit(X_nan) @@ -491,7 +584,7 @@ def test_labelling_distinct(global_random_seed, allow_single_cluster, epsilon): ], ) - est = HDBSCAN().fit(X) + est = HDBSCAN(mst_algorithm="auto").fit(X) condensed_tree = _condense_tree( est._single_linkage_tree_, min_cluster_size=est.min_cluster_size ) @@ -560,7 +653,7 @@ def test_hdbscan_warning_on_deprecated_algorithm_name(): " to'kd_tree'`in 1.6. To keep the past behaviour, set `algorithm='kd_tree'`." ) with pytest.warns(FutureWarning, match=msg): - HDBSCAN(algorithm="kdtree").fit(X) + HDBSCAN(algorithm="kdtree", mst_algorithm="auto").fit(X) # Test that warning message is shown when algorithm='balltree' msg = ( @@ -569,4 +662,11 @@ def test_hdbscan_warning_on_deprecated_algorithm_name(): " `algorithm='ball_tree'`." 
) with pytest.warns(FutureWarning, match=msg): - HDBSCAN(algorithm="balltree").fit(X) + HDBSCAN(algorithm="balltree", mst_algorithm="auto").fit(X) + + +# TODO(1.6): Remove +def test_hdbscan_warning_on_mst_default(): + msg = "In version 1.6 the default MST algorithm dispatch behavior will" + with pytest.warns(FutureWarning, match=msg): + HDBSCAN().fit_predict(X) diff --git a/sklearn/neighbors/_binary_tree.pxd b/sklearn/neighbors/_binary_tree.pxd new file mode 100644 index 0000000000000..fcba5f5c39919 --- /dev/null +++ b/sklearn/neighbors/_binary_tree.pxd @@ -0,0 +1,7 @@ +from ..utils._typedefs cimport float32_t, float64_t, intp_t + +cdef struct NodeData_t: + intp_t idx_start + intp_t idx_end + intp_t is_leaf + float64_t radius diff --git a/sklearn/neighbors/_binary_tree.pxi.tp b/sklearn/neighbors/_binary_tree.pxi.tp index dd77bcbdfb3d6..434744e3c87e9 100644 --- a/sklearn/neighbors/_binary_tree.pxi.tp +++ b/sklearn/neighbors/_binary_tree.pxi.tp @@ -188,6 +188,7 @@ from ..utils import check_array from ..utils._typedefs cimport float32_t, float64_t, intp_t from ..utils._heap cimport heap_push from ..utils._sorting cimport simultaneous_sort as _simultaneous_sort +from ._binary_tree cimport NodeData_t cnp.import_array() @@ -216,12 +217,6 @@ cdef struct NodeHeapData_t: cdef NodeHeapData_t nhd_tmp NodeHeapData = np.asarray((&nhd_tmp)).dtype -cdef struct NodeData_t: - intp_t idx_start - intp_t idx_end - intp_t is_leaf - float64_t radius - # build the corresponding numpy dtype for NodeData cdef NodeData_t nd_tmp NodeData = np.asarray((&nd_tmp)).dtype diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index e6d2ade736f4a..6adc1419b0ea0 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -253,6 +253,10 @@ def test_fit_docstring_attributes(name, Estimator): if Estimator.__name__ == "MDS": est.set_params(normalized_stress="auto") + # TODO(1.6): TO BE REMOVED for 1.6 (avoid FutureWarning) + if Estimator.__name__ == "HDBSCAN": + est.set_params(mst_algorithm="auto") + # Low max iter to speed up tests: we are only interested in checking the existence # of fitted attributes. This should be invariant to whether it has converged or not. if "max_iter" in est.get_params(): diff --git a/sklearn/utils/_typedefs.pxd b/sklearn/utils/_typedefs.pxd index 3ffe5b3b41098..d568d49c75f28 100644 --- a/sklearn/utils/_typedefs.pxd +++ b/sklearn/utils/_typedefs.pxd @@ -14,6 +14,7 @@ # TODO: Stop defining custom types locally or globally like DTYPE_t and friends and # use these consistently throughout the codebase. # NOTE: Extend this list as needed when converting more cython extensions. +ctypedef char int8_t ctypedef unsigned char uint8_t ctypedef unsigned int uint32_t ctypedef unsigned long long uint64_t