diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx index 0d40191f2c94e..fd9888ac4da82 100644 --- a/sklearn/cluster/_hdbscan/_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_linkage.pyx @@ -1,210 +1,241 @@ # Minimum spanning tree single linkage implementation for hdbscan # Authors: Leland McInnes # Steve Astels +# Meekail Zain # License: 3-clause BSD -import numpy as np cimport numpy as cnp -import cython - from libc.float cimport DBL_MAX +import numpy as np from ...metrics._dist_metrics cimport DistanceMetric from ...cluster._hierarchical_fast cimport UnionFind from ...utils._typedefs cimport ITYPE_t, DTYPE_t from ...utils._typedefs import ITYPE, DTYPE -cpdef cnp.ndarray[cnp.double_t, ndim=2] mst_from_distance_matrix( - cnp.ndarray[cnp.double_t, ndim=2] distance_matrix +# Numpy structured dtype representing a single ordered edge in Prim's algorithm +MST_edge_dtype = np.dtype([ + ("current_node", np.int64), + ("next_node", np.int64), + ("distance", np.float64), +]) + +# Packed shouldn't make a difference since they're all 8-byte quantities, +# but it's included just to be safe. +ctypedef packed struct MST_edge_t: + cnp.int64_t current_node + cnp.int64_t next_node + cnp.float64_t distance + +cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability( + cnp.ndarray[cnp.float64_t, ndim=2] mutual_reachability ): - + """Compute the Minimum Spanning Tree (MST) representation of the mutual- + reachability graph using Prim's algorithm. + + Parameters + ---------- + mutual_reachability : ndarray of shape (n_samples, n_samples) + Array of mutual-reachabilities between samples. + + Returns + ------- + mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype + The MST representation of the mutual-reachability graph. The MST is + represented as a collection of edges. 
+ """ cdef: - cnp.ndarray[cnp.intp_t, ndim=1] node_labels - cnp.ndarray[cnp.intp_t, ndim=1] current_labels - cnp.ndarray[cnp.double_t, ndim=1] current_distances - cnp.ndarray[cnp.double_t, ndim=1] left - cnp.ndarray[cnp.double_t, ndim=1] right - cnp.ndarray[cnp.double_t, ndim=2] result - - cnp.ndarray label_filter - - cnp.intp_t current_node - cnp.intp_t new_node_index - cnp.intp_t new_node - cnp.intp_t i - - result = np.zeros((distance_matrix.shape[0] - 1, 3)) - node_labels = np.arange(distance_matrix.shape[0], dtype=np.intp) + # Note: we utilize ndarray's over memory-views to make use of numpy + # binary indexing and sub-selection below. + cnp.ndarray[cnp.int64_t, ndim=1, mode='c'] current_labels + cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] min_reachability, left, right + cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst + + cnp.ndarray[cnp.uint8_t, mode='c'] label_filter + + cnp.int64_t n_samples = mutual_reachability.shape[0] + cnp.int64_t current_node, new_node_index, new_node, i + + mst = np.empty(n_samples - 1, dtype=MST_edge_dtype) + current_labels = np.arange(n_samples, dtype=np.int64) current_node = 0 - current_distances = np.infty * np.ones(distance_matrix.shape[0]) - current_labels = node_labels - for i in range(1, node_labels.shape[0]): + min_reachability = np.full(n_samples, fill_value=np.infty, dtype=np.float64) + for i in range(0, n_samples - 1): label_filter = current_labels != current_node current_labels = current_labels[label_filter] - left = current_distances[label_filter] - right = distance_matrix[current_node][current_labels] - current_distances = np.where(left < right, left, right) + left = min_reachability[label_filter] + right = mutual_reachability[current_node][current_labels] + min_reachability = np.minimum(left, right) - new_node_index = np.argmin(current_distances) + new_node_index = np.argmin(min_reachability) new_node = current_labels[new_node_index] - result[i - 1, 0] = current_node - result[i - 1, 1] = new_node - result[i - 1, 2] = 
current_distances[new_node_index] + mst[i].current_node = current_node + mst[i].next_node = new_node + mst[i].distance = min_reachability[new_node_index] current_node = new_node - return result + return mst -cpdef cnp.ndarray[cnp.double_t, ndim=2] mst_from_data_matrix( - cnp.ndarray[cnp.double_t, ndim=2, mode='c'] raw_data, - cnp.ndarray[cnp.double_t, ndim=1, mode='c'] core_distances, +cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix( + const cnp.float64_t[:, ::1] raw_data, + const cnp.float64_t[::1] core_distances, DistanceMetric dist_metric, - cnp.double_t alpha=1.0 + cnp.float64_t alpha=1.0 ): + """Compute the Minimum Spanning Tree (MST) representation of the mutual- + reachability graph generated from the provided `raw_data` and + `core_distances` using Prim's algorithm. + + Parameters + ---------- + raw_data : ndarray of shape (n_samples, n_features) + Input array of data samples. + + core_distances : ndarray of shape (n_samples,) + An array containing the core-distance calculated for each corresponding + sample. + + dist_metric : DistanceMetric + The distance metric to use when calculating pairwise distances for + determining mutual-reachability. + + Returns + ------- + mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype + The MST representation of the mutual-reachability graph. The MST is + represented as a collection of edges. 
+ """ cdef: - cnp.ndarray[cnp.double_t, ndim=1] current_distances_arr - cnp.ndarray[cnp.double_t, ndim=1] current_sources_arr - cnp.ndarray[cnp.int8_t, ndim=1] in_tree_arr - cnp.ndarray[cnp.double_t, ndim=2] result_arr - - cnp.double_t * current_distances - cnp.double_t * current_sources - cnp.double_t * current_core_distances - cnp.double_t * raw_data_ptr - cnp.int8_t * in_tree - cnp.double_t[:, ::1] raw_data_view - cnp.double_t[:, ::1] result - - cnp.ndarray label_filter - - cnp.intp_t current_node - cnp.intp_t source_node - cnp.intp_t right_node - cnp.intp_t left_node - cnp.intp_t new_node - cnp.intp_t i - cnp.intp_t j - cnp.intp_t dim - cnp.intp_t num_features - - double current_node_core_distance - double right_value - double left_value - double core_value - double new_distance - - dim = raw_data.shape[0] + cnp.int8_t[::1] in_tree + cnp.float64_t[::1] min_reachability + cnp.int64_t[::1] current_sources + cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst + + cnp.int64_t current_node, source_node, right_node, left_node, new_node, next_node_source + cnp.int64_t i, j, n_samples, num_features + + cnp.float64_t current_node_core_dist, new_reachability, mutual_reachability_distance + cnp.float64_t next_node_min_reach, pair_distance, next_node_core_dist + + n_samples = raw_data.shape[0] num_features = raw_data.shape[1] - raw_data_view = ( ( - raw_data.data)) - raw_data_ptr = ( &raw_data_view[0, 0]) + mst = np.empty(n_samples - 1, dtype=MST_edge_dtype) - result_arr = np.zeros((dim - 1, 3)) - in_tree_arr = np.zeros(dim, dtype=np.int8) - current_node = 0 - current_distances_arr = np.infty * np.ones(dim) - current_sources_arr = np.ones(dim) + in_tree = np.zeros(n_samples, dtype=np.int8) + min_reachability = np.full(n_samples, fill_value=np.infty, dtype=np.float64) + current_sources = np.ones(n_samples, dtype=np.int64) - result = ( ( result_arr.data)) - in_tree = ( in_tree_arr.data) - current_distances = ( current_distances_arr.data) - current_sources = ( 
current_sources_arr.data) - current_core_distances = ( core_distances.data) + current_node = 0 - for i in range(1, dim): + for i in range(0, n_samples - 1): in_tree[current_node] = 1 - current_node_core_distance = current_core_distances[current_node] + current_node_core_dist = core_distances[current_node] - new_distance = DBL_MAX + new_reachability = DBL_MAX source_node = 0 new_node = 0 - for j in range(dim): + for j in range(n_samples): if in_tree[j]: continue - right_value = current_distances[j] - right_source = current_sources[j] - - left_value = dist_metric.dist(&raw_data_ptr[num_features * - current_node], - &raw_data_ptr[num_features * j], - num_features) - left_source = current_node - - if alpha != 1.0: - left_value /= alpha - - core_value = core_distances[j] - if (current_node_core_distance > right_value or - core_value > right_value or - left_value > right_value): - if right_value < new_distance: - new_distance = right_value - source_node = right_source + next_node_min_reach = min_reachability[j] + next_node_source = current_sources[j] + + pair_distance = dist_metric.dist( + &raw_data[current_node, 0], + &raw_data[j, 0], + num_features + ) + + pair_distance /= alpha + + next_node_core_dist = core_distances[j] + mutual_reachability_distance = max( + current_node_core_dist, + next_node_core_dist, + pair_distance + ) + if mutual_reachability_distance > next_node_min_reach: + if next_node_min_reach < new_reachability: + new_reachability = next_node_min_reach + source_node = next_node_source new_node = j continue - if core_value > current_node_core_distance: - if core_value > left_value: - left_value = core_value - else: - if current_node_core_distance > left_value: - left_value = current_node_core_distance - - if left_value < right_value: - current_distances[j] = left_value - current_sources[j] = left_source - if left_value < new_distance: - new_distance = left_value - source_node = left_source + if mutual_reachability_distance < next_node_min_reach: + 
min_reachability[j] = mutual_reachability_distance + current_sources[j] = current_node + if mutual_reachability_distance < new_reachability: + new_reachability = mutual_reachability_distance + source_node = current_node new_node = j else: - if right_value < new_distance: - new_distance = right_value - source_node = right_source + if next_node_min_reach < new_reachability: + new_reachability = next_node_min_reach + source_node = next_node_source new_node = j - result[i - 1, 0] = source_node - result[i - 1, 1] = new_node - result[i - 1, 2] = new_distance + mst[i].current_node = source_node + mst[i].next_node = new_node + mst[i].distance = new_reachability current_node = new_node - return result_arr + return mst + +cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] make_single_linkage(const MST_edge_t[::1] mst): + """Construct a single-linkage tree from an MST. + + Parameters + ---------- + mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype + The MST representation of the mutual-reachability graph. The MST is + represented as a collection of edges. -@cython.wraparound(True) -cpdef cnp.ndarray[cnp.double_t, ndim=2] label(cnp.double_t[:,:] L): + Returns + ------- + single_linkage : ndarray of shape (n_samples - 1, 4) + The single-linkage tree (dendrogram) built from the MST. 
Each + column of the array represents the following: + - left node/cluster + - right node/cluster + - distance + - new cluster size + """ cdef: - cnp.ndarray[cnp.double_t, ndim=2] result_arr - cnp.double_t[:, ::1] result + cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] single_linkage - cnp.intp_t N, a, aa, b, bb, index - cnp.double_t delta + # Note mst.shape[0] is one fewer than the number of samples + cnp.int64_t n_samples = mst.shape[0] + 1 + cnp.int64_t current_node_cluster, next_node_cluster + cnp.int64_t current_node, next_node, index + cnp.float64_t distance + UnionFind U = UnionFind(n_samples) - result_arr = np.zeros((L.shape[0], L.shape[1] + 1)) - result = ( ( - result_arr.data)) - N = L.shape[0] + 1 - U = UnionFind(N) + single_linkage = np.zeros((n_samples - 1, 4), dtype=np.float64) - for index in range(L.shape[0]): + for i in range(n_samples - 1): - a = L[index, 0] - b = L[index, 1] - delta = L[index, 2] + current_node = mst[i].current_node + next_node = mst[i].next_node + distance = mst[i].distance - aa, bb = U.fast_find(a), U.fast_find(b) + current_node_cluster = U.fast_find(current_node) + next_node_cluster = U.fast_find(next_node) - result[index][0] = aa - result[index][1] = bb - result[index][2] = delta - result[index][3] = U.size[aa] + U.size[bb] + # TODO: Update this to an array of structs (AoS). + # Should be done simultaneously in _tree.pyx to ensure compatibility. 
+ single_linkage[i][0] = current_node_cluster + single_linkage[i][1] = next_node_cluster + single_linkage[i][2] = distance + single_linkage[i][3] = U.size[current_node_cluster] + U.size[next_node_cluster] - U.union(aa, bb) + U.union(current_node_cluster, next_node_cluster) - return result_arr + return single_linkage diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 79beead943898..4f1fcf1962d0b 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -21,7 +21,12 @@ from ...neighbors import BallTree, KDTree, NearestNeighbors from ...utils._param_validation import Interval, StrOptions from ...utils.validation import _assert_all_finite -from ._linkage import label, mst_from_distance_matrix, mst_from_data_matrix +from ._linkage import ( + make_single_linkage, + mst_from_mutual_reachability, + mst_from_data_matrix, + MST_edge_dtype, +) from ._reachability import mutual_reachability from ._tree import compute_stability, condense_tree, get_clusters, labelling_at_cut @@ -48,7 +53,7 @@ def _brute_mst(mutual_reachability, min_samples, sparse=False): if not sparse: - return mst_from_distance_matrix(mutual_reachability) + return mst_from_mutual_reachability(mutual_reachability) # Check connected component on mutual reachability # If more than one component, it means that even if the distance matrix X @@ -70,7 +75,11 @@ def _brute_mst(mutual_reachability, min_samples, sparse=False): # Compute the minimum spanning tree for the sparse graph sparse_min_spanning_tree = csgraph.minimum_spanning_tree(mutual_reachability) rows, cols = sparse_min_spanning_tree.nonzero() - return np.vstack((rows, cols, sparse_min_spanning_tree.data)).T + mst = np.core.records.fromarrays( + [rows, cols, sparse_min_spanning_tree.data], + dtype=MST_edge_dtype, + ) + return mst def _tree_to_labels( @@ -100,10 +109,10 @@ def _tree_to_labels( def _process_mst(min_spanning_tree): # Sort edges of the min_spanning_tree by weight - 
row_order = np.argsort(min_spanning_tree.T[2]) - min_spanning_tree = min_spanning_tree[row_order, :] + row_order = np.argsort(min_spanning_tree["distance"]) + min_spanning_tree = min_spanning_tree[row_order] # Convert edge list into standard hierarchical clustering format - return label(min_spanning_tree) + return make_single_linkage(min_spanning_tree) def _hdbscan_brute( @@ -141,7 +150,7 @@ def _hdbscan_brute( mutual_reachability_, min_samples=min_samples, sparse=sparse ) # Warn if the MST couldn't be constructed around the missing distances - if np.isinf(min_spanning_tree.T[2]).any(): + if np.isinf(min_spanning_tree["distance"]).any(): warn( "The minimum spanning tree contains edge weights with value " "infinity. Potentially, you are missing too many distances " @@ -149,7 +158,6 @@ def _hdbscan_brute( "size.", UserWarning, ) - return _process_mst(min_spanning_tree) @@ -183,7 +191,6 @@ def _hdbscan_prims( # Mutual reachability distance is implicit in mst_from_data_matrix min_spanning_tree = mst_from_data_matrix(X, core_distances, dist_metric, alpha) - return _process_mst(min_spanning_tree)