From cc31a2a39faf1e63a0f34d633a0c6d74dcd1daff Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Sat, 5 Nov 2022 15:52:50 -0400 Subject: [PATCH 01/11] Initial cleanup --- sklearn/cluster/_hdbscan/_linkage.pyx | 125 ++++++++++++-------------- 1 file changed, 56 insertions(+), 69 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx index 0d40191f2c94e..068d6c069482e 100644 --- a/sklearn/cluster/_hdbscan/_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_linkage.pyx @@ -3,8 +3,9 @@ # Steve Astels # License: 3-clause BSD -import numpy as np cimport numpy as cnp + +import numpy as np import cython from libc.float cimport DBL_MAX @@ -14,106 +15,92 @@ from ...cluster._hierarchical_fast cimport UnionFind from ...utils._typedefs cimport ITYPE_t, DTYPE_t from ...utils._typedefs import ITYPE, DTYPE -cpdef cnp.ndarray[cnp.double_t, ndim=2] mst_from_distance_matrix( - cnp.ndarray[cnp.double_t, ndim=2] distance_matrix +cpdef cnp.ndarray[cnp.float64_t, ndim=2] mst_from_distance_matrix( + cnp.ndarray[cnp.float64_t, ndim=2] distance_matrix ): cdef: cnp.ndarray[cnp.intp_t, ndim=1] node_labels cnp.ndarray[cnp.intp_t, ndim=1] current_labels - cnp.ndarray[cnp.double_t, ndim=1] current_distances - cnp.ndarray[cnp.double_t, ndim=1] left - cnp.ndarray[cnp.double_t, ndim=1] right - cnp.ndarray[cnp.double_t, ndim=2] result + cnp.ndarray[cnp.float64_t, ndim=1] current_distances, left, right + cnp.ndarray[cnp.float64_t, ndim=2] result cnp.ndarray label_filter - cnp.intp_t current_node - cnp.intp_t new_node_index - cnp.intp_t new_node - cnp.intp_t i + cnp.intp_t n_samples = distance_matrix.shape[0] + cnp.intp_t current_node, new_node_index, new_node, i - result = np.zeros((distance_matrix.shape[0] - 1, 3)) - node_labels = np.arange(distance_matrix.shape[0], dtype=np.intp) + result = np.zeros((n_samples - 1, 3)) + node_labels = np.arange(n_samples, dtype=np.intp) current_node = 0 - current_distances = np.infty * np.ones(distance_matrix.shape[0]) + current_distances = np.infty * np.ones(n_samples) current_labels = node_labels - for i in range(1, node_labels.shape[0]): + for i in range(1, n_samples): label_filter = current_labels != current_node current_labels = current_labels[label_filter] left = current_distances[label_filter] right = distance_matrix[current_node][current_labels] - current_distances = np.where(left < right, left, right) + current_distances = np.minimum(left, right) new_node_index = np.argmin(current_distances) new_node = current_labels[new_node_index] - result[i - 1, 0] = current_node - result[i - 1, 1] = new_node + result[i - 1, 0] = current_node + result[i - 1, 1] = new_node result[i - 1, 2] = current_distances[new_node_index] current_node = new_node return result -cpdef cnp.ndarray[cnp.double_t, ndim=2] mst_from_data_matrix( - cnp.ndarray[cnp.double_t, ndim=2, mode='c'] raw_data, - cnp.ndarray[cnp.double_t, ndim=1, mode='c'] core_distances, +cpdef cnp.ndarray[cnp.float64_t, ndim=2] mst_from_data_matrix( + cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] raw_data, + cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] core_distances, DistanceMetric dist_metric, - cnp.double_t alpha=1.0 + cnp.float64_t alpha=1.0 ): cdef: - cnp.ndarray[cnp.double_t, ndim=1] current_distances_arr - cnp.ndarray[cnp.double_t, ndim=1] current_sources_arr + cnp.ndarray[cnp.float64_t, ndim=1] current_distances_arr + cnp.ndarray[cnp.float64_t, ndim=1] current_sources_arr cnp.ndarray[cnp.int8_t, ndim=1] in_tree_arr - cnp.ndarray[cnp.double_t, ndim=2] result_arr + cnp.ndarray[cnp.float64_t, ndim=2] result_arr - cnp.double_t * current_distances - cnp.double_t * current_sources - cnp.double_t * current_core_distances - cnp.double_t * raw_data_ptr + cnp.float64_t * current_distances + cnp.float64_t * current_sources + cnp.float64_t * current_core_distances + cnp.float64_t * raw_data_ptr cnp.int8_t * in_tree - cnp.double_t[:, ::1] raw_data_view - cnp.double_t[:, ::1] result + cnp.float64_t[:, ::1] raw_data_view + cnp.float64_t[:, ::1] result cnp.ndarray label_filter - cnp.intp_t current_node - cnp.intp_t source_node - cnp.intp_t right_node - cnp.intp_t left_node - cnp.intp_t new_node - cnp.intp_t i - cnp.intp_t j - cnp.intp_t dim - cnp.intp_t num_features - - double current_node_core_distance - double right_value - double left_value - double core_value - double new_distance - - dim = raw_data.shape[0] + cnp.intp_t current_node, source_node, right_node, left_node, new_node + cnp.intp_t i, j, n_samples, num_features + + cnp.float64_t current_node_core_distance, new_distance + cnp.float64_t right_value, left_value, core_value + + n_samples = raw_data.shape[0] num_features = raw_data.shape[1] - raw_data_view = ( ( - raw_data.data)) - raw_data_ptr = ( &raw_data_view[0, 0]) + raw_data_view = ( ( + raw_data.data)) + raw_data_ptr = ( &raw_data_view[0, 0]) - result_arr = np.zeros((dim - 1, 3)) - in_tree_arr = np.zeros(dim, dtype=np.int8) + result_arr = np.zeros((n_samples - 1, 3)) + in_tree_arr = np.zeros(n_samples, dtype=np.int8) current_node = 0 - current_distances_arr = np.infty * np.ones(dim) - current_sources_arr = np.ones(dim) + current_distances_arr = np.infty * np.ones(n_samples) + current_sources_arr = np.ones(n_samples) - result = ( ( result_arr.data)) + result = ( ( result_arr.data)) in_tree = ( in_tree_arr.data) - current_distances = ( current_distances_arr.data) - current_sources = ( current_sources_arr.data) - current_core_distances = ( core_distances.data) + current_distances = ( current_distances_arr.data) + current_sources = ( current_sources_arr.data) + current_core_distances = ( core_distances.data) - for i in range(1, dim): + for i in range(1, n_samples): in_tree[current_node] = 1 @@ -123,7 +110,7 @@ cpdef cnp.ndarray[cnp.double_t, ndim=2] mst_from_data_matrix( source_node = 0 new_node = 0 - for j in range(dim): + for j in range(n_samples): if in_tree[j]: continue @@ -169,26 +156,26 @@ cpdef cnp.ndarray[cnp.double_t, ndim=2] mst_from_data_matrix( source_node = right_source new_node = j - result[i - 1, 0] = source_node - result[i - 1, 1] = new_node + result[i - 1, 0] = source_node + result[i - 1, 1] = new_node result[i - 1, 2] = new_distance current_node = new_node return result_arr @cython.wraparound(True) -cpdef cnp.ndarray[cnp.double_t, ndim=2] label(cnp.double_t[:,:] L): +cpdef cnp.ndarray[cnp.float64_t, ndim=2] label(cnp.float64_t[:,:] L): cdef: - cnp.ndarray[cnp.double_t, ndim=2] result_arr - cnp.double_t[:, ::1] result + cnp.ndarray[cnp.float64_t, ndim=2] result_arr + cnp.float64_t[:, ::1] result cnp.intp_t N, a, aa, b, bb, index - cnp.double_t delta + cnp.float64_t delta result_arr = np.zeros((L.shape[0], L.shape[1] + 1)) - result = ( ( - result_arr.data)) + result = ( ( + result_arr.data)) N = L.shape[0] + 1 U = UnionFind(N) From 4dcfe8e97216ecaf61986c43f78a2c712924bb8e Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Mon, 7 Nov 2022 19:47:55 -0500 Subject: [PATCH 02/11] WIP partial implementation of custom struct for MST --- sklearn/cluster/_hdbscan/_linkage.pxd | 14 ++++++++++++++ sklearn/cluster/_hdbscan/_linkage.pyx | 13 ++++++------- sklearn/cluster/_hdbscan/hdbscan.py | 13 +++++++++---- 3 files changed, 29 insertions(+), 11 deletions(-) create mode 100644 sklearn/cluster/_hdbscan/_linkage.pxd diff --git a/sklearn/cluster/_hdbscan/_linkage.pxd b/sklearn/cluster/_hdbscan/_linkage.pxd new file mode 100644 index 0000000000000..a67afdfdaab69 --- /dev/null +++ b/sklearn/cluster/_hdbscan/_linkage.pxd @@ -0,0 +1,14 @@ +cimport numpy as cnp +import numpy as np + +# Numpy structured dtype representing a single ordered edge in Prim's algorithm +MST_edge_dtype = np.dtype([ + ("current_node", np.intp), + ("next_node", np.intp), + ("distance", np.float64), +]) + +ctypedef struct MST_edge_t: + cnp.intp_t current_node + cnp.intp_t next_node + cnp.float64_t distance diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx index 068d6c069482e..bd45175d06272 100644 --- a/sklearn/cluster/_hdbscan/_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_linkage.pyx @@ -15,22 +15,21 @@ from ...cluster._hierarchical_fast cimport UnionFind from ...utils._typedefs cimport ITYPE_t, DTYPE_t from ...utils._typedefs import ITYPE, DTYPE -cpdef cnp.ndarray[cnp.float64_t, ndim=2] mst_from_distance_matrix( +cpdef cnp.ndarray[MST_edge_t, ndim=2] mst_from_distance_matrix( cnp.ndarray[cnp.float64_t, ndim=2] distance_matrix ): - cdef: cnp.ndarray[cnp.intp_t, ndim=1] node_labels cnp.ndarray[cnp.intp_t, ndim=1] current_labels cnp.ndarray[cnp.float64_t, ndim=1] current_distances, left, right - cnp.ndarray[cnp.float64_t, ndim=2] result + cnp.ndarray[MST_edge_t, ndim=1] result cnp.ndarray label_filter cnp.intp_t n_samples = distance_matrix.shape[0] cnp.intp_t current_node, new_node_index, new_node, i - result = np.zeros((n_samples - 1, 3)) + result = np.empty(n_samples - 1, dtype=MST_edge_dtype) node_labels = np.arange(n_samples, dtype=np.intp) current_node = 0 current_distances = np.infty * np.ones(n_samples) @@ -44,9 +43,9 @@ cpdef cnp.ndarray[cnp.float64_t, ndim=2] mst_from_distance_matrix( new_node_index = np.argmin(current_distances) new_node = current_labels[new_node_index] - result[i - 1, 0] = current_node - result[i - 1, 1] = new_node - result[i - 1, 2] = current_distances[new_node_index] + result[i - 1].current_node = current_node + result[i - 1].next_node = new_node + result[i - 1].distance = current_distances[new_node_index] current_node = new_node return result diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 79beead943898..d90a3c4279ee9 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -21,7 +21,12 @@ from ...neighbors import BallTree, KDTree, NearestNeighbors from ...utils._param_validation import Interval, StrOptions from ...utils.validation import _assert_all_finite -from ._linkage import label, mst_from_distance_matrix, mst_from_data_matrix +from ._linkage import ( + label, + mst_from_distance_matrix, + mst_from_data_matrix, + MST_edge_dtype, +) from ._reachability import mutual_reachability from ._tree import compute_stability, condense_tree, get_clusters, labelling_at_cut @@ -100,8 +105,8 @@ def _tree_to_labels( def _process_mst(min_spanning_tree): # Sort edges of the min_spanning_tree by weight - row_order = np.argsort(min_spanning_tree.T[2]) - min_spanning_tree = min_spanning_tree[row_order, :] + row_order = np.argsort(min_spanning_tree["distance"]) + min_spanning_tree = min_spanning_tree[row_order] # Convert edge list into standard hierarchical clustering format return label(min_spanning_tree) @@ -141,7 +146,7 @@ def _hdbscan_brute( mutual_reachability_, min_samples=min_samples, sparse=sparse ) # Warn if the MST couldn't be constructed around the missing distances - if np.isinf(min_spanning_tree.T[2]).any(): + if np.isinf(min_spanning_tree["distance"]).any(): warn( "The minimum spanning tree contains edge weights with value " "infinity. Potentially, you are missing too many distances " From 7a07548882526e0be3e61780b674e2c027896cc5 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 8 Nov 2022 16:35:03 -0500 Subject: [PATCH 03/11] Refactor including new struct for simplification --- sklearn/cluster/_hdbscan/_linkage.pxd | 14 -- sklearn/cluster/_hdbscan/_linkage.pyx | 205 +++++++++++++------------- sklearn/cluster/_hdbscan/hdbscan.py | 9 +- 3 files changed, 106 insertions(+), 122 deletions(-) delete mode 100644 sklearn/cluster/_hdbscan/_linkage.pxd diff --git a/sklearn/cluster/_hdbscan/_linkage.pxd b/sklearn/cluster/_hdbscan/_linkage.pxd deleted file mode 100644 index a67afdfdaab69..0000000000000 --- a/sklearn/cluster/_hdbscan/_linkage.pxd +++ /dev/null @@ -1,14 +0,0 @@ -cimport numpy as cnp -import numpy as np - -# Numpy structured dtype representing a single ordered edge in Prim's algorithm -MST_edge_dtype = np.dtype([ - ("current_node", np.intp), - ("next_node", np.intp), - ("distance", np.float64), -]) - -ctypedef struct MST_edge_t: - cnp.intp_t current_node - cnp.intp_t next_node - cnp.float64_t distance diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx index bd45175d06272..1fce979662b03 100644 --- a/sklearn/cluster/_hdbscan/_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_linkage.pyx @@ -15,43 +15,56 @@ from ...cluster._hierarchical_fast cimport UnionFind from ...utils._typedefs cimport ITYPE_t, DTYPE_t from ...utils._typedefs import ITYPE, DTYPE -cpdef cnp.ndarray[MST_edge_t, ndim=2] mst_from_distance_matrix( - cnp.ndarray[cnp.float64_t, ndim=2] distance_matrix +# Numpy structured dtype representing a single ordered edge in Prim's algorithm +MST_edge_dtype = np.dtype([ + ("current_node", np.intp), + ("next_node", np.intp), + ("distance", np.float64), +]) + +ctypedef struct MST_edge_t: + cnp.intp_t current_node + cnp.intp_t next_node + cnp.float64_t distance + +# TODO add contiguous constraint where possible +cpdef cnp.ndarray[MST_edge_t, ndim=1] mst_from_mutual_reachability( + cnp.ndarray[cnp.float64_t, ndim=2] mutual_reachability ): cdef: cnp.ndarray[cnp.intp_t, ndim=1] node_labels cnp.ndarray[cnp.intp_t, ndim=1] current_labels - cnp.ndarray[cnp.float64_t, ndim=1] current_distances, left, right - cnp.ndarray[MST_edge_t, ndim=1] result + cnp.ndarray[cnp.float64_t, ndim=1] min_reachability, left, right + cnp.ndarray[MST_edge_t, ndim=1] mst - cnp.ndarray label_filter + cnp.ndarray[cnp.uint8_t] label_filter - cnp.intp_t n_samples = distance_matrix.shape[0] + cnp.intp_t n_samples = mutual_reachability.shape[0] cnp.intp_t current_node, new_node_index, new_node, i - result = np.empty(n_samples - 1, dtype=MST_edge_dtype) + mst = np.empty(n_samples - 1, dtype=MST_edge_dtype) node_labels = np.arange(n_samples, dtype=np.intp) current_node = 0 - current_distances = np.infty * np.ones(n_samples) + min_reachability = np.infty * np.ones(n_samples) current_labels = node_labels for i in range(1, n_samples): label_filter = current_labels != current_node current_labels = current_labels[label_filter] - left = current_distances[label_filter] - right = distance_matrix[current_node][current_labels] - current_distances = np.minimum(left, right) + left = min_reachability[label_filter] + right = mutual_reachability[current_node][current_labels] + min_reachability = np.minimum(left, right) - new_node_index = np.argmin(current_distances) + new_node_index = np.argmin(min_reachability) new_node = current_labels[new_node_index] - result[i - 1].current_node = current_node - result[i - 1].next_node = new_node - result[i - 1].distance = current_distances[new_node_index] + mst[i - 1].current_node = current_node + mst[i - 1].next_node = new_node + mst[i - 1].distance = min_reachability[new_node_index] current_node = new_node - return result + return mst -cpdef cnp.ndarray[cnp.float64_t, ndim=2] mst_from_data_matrix( +cpdef cnp.ndarray[MST_edge_t, ndim=1] mst_from_data_matrix( cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] raw_data, cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] core_distances, DistanceMetric dist_metric, @@ -59,53 +72,39 @@ cpdef cnp.ndarray[cnp.float64_t, ndim=2] mst_from_data_matrix( ): cdef: - cnp.ndarray[cnp.float64_t, ndim=1] current_distances_arr - cnp.ndarray[cnp.float64_t, ndim=1] current_sources_arr - cnp.ndarray[cnp.int8_t, ndim=1] in_tree_arr - cnp.ndarray[cnp.float64_t, ndim=2] result_arr + cnp.int8_t[::1] in_tree + cnp.float64_t[::1] min_reachability, current_sources + cnp.float64_t[::1] current_core_distances = core_distances + cnp.float64_t[:, ::1] raw_data_view = raw_data + cnp.ndarray[MST_edge_t, ndim=1] mst + cnp.ndarray[cnp.float64_t, ndim=2] mst_arr - cnp.float64_t * current_distances - cnp.float64_t * current_sources - cnp.float64_t * current_core_distances - cnp.float64_t * raw_data_ptr - cnp.int8_t * in_tree - cnp.float64_t[:, ::1] raw_data_view - cnp.float64_t[:, ::1] result - - cnp.ndarray label_filter + cnp.ndarray[cnp.uint8_t] label_filter cnp.intp_t current_node, source_node, right_node, left_node, new_node cnp.intp_t i, j, n_samples, num_features - cnp.float64_t current_node_core_distance, new_distance - cnp.float64_t right_value, left_value, core_value + cnp.float64_t current_node_core_dist, new_reachability, mutual_reachability_distance + cnp.float64_t next_node_min_reach, pair_distance, next_node_core_dist n_samples = raw_data.shape[0] num_features = raw_data.shape[1] - raw_data_view = ( ( - raw_data.data)) - raw_data_ptr = ( &raw_data_view[0, 0]) + mst = np.empty(n_samples - 1, dtype=MST_edge_dtype) - result_arr = np.zeros((n_samples - 1, 3)) - in_tree_arr = np.zeros(n_samples, dtype=np.int8) - current_node = 0 - current_distances_arr = np.infty * np.ones(n_samples) - current_sources_arr = np.ones(n_samples) + in_tree = np.zeros(n_samples, dtype=np.int8) + min_reachability = np.infty * np.ones(n_samples) + current_sources = np.ones(n_samples) - result = ( ( result_arr.data)) - in_tree = ( in_tree_arr.data) - current_distances = ( current_distances_arr.data) - current_sources = ( current_sources_arr.data) - current_core_distances = ( core_distances.data) + current_node = 0 for i in range(1, n_samples): in_tree[current_node] = 1 - current_node_core_distance = current_core_distances[current_node] + current_node_core_dist = current_core_distances[current_node] - new_distance = DBL_MAX + new_reachability = DBL_MAX source_node = 0 new_node = 0 @@ -113,84 +112,82 @@ cpdef cnp.ndarray[cnp.float64_t, ndim=2] mst_from_data_matrix( if in_tree[j]: continue - right_value = current_distances[j] - right_source = current_sources[j] + next_node_min_reach = min_reachability[j] + next_node_source = current_sources[j] - left_value = dist_metric.dist(&raw_data_ptr[num_features * - current_node], - &raw_data_ptr[num_features * j], - num_features) - left_source = current_node + pair_distance = dist_metric.dist( + &raw_data_view[current_node, 0], + &raw_data_view[j, 0], + num_features + ) if alpha != 1.0: - left_value /= alpha - - core_value = core_distances[j] - if (current_node_core_distance > right_value or - core_value > right_value or - left_value > right_value): - if right_value < new_distance: - new_distance = right_value - source_node = right_source + pair_distance /= alpha + + next_node_core_dist = core_distances[j] + mutual_reachability_distance = max( + current_node_core_dist, + next_node_core_dist, + pair_distance + ) + if mutual_reachability_distance > next_node_min_reach: + if next_node_min_reach < new_reachability: + new_reachability = next_node_min_reach + source_node = next_node_source new_node = j continue - if core_value > current_node_core_distance: - if core_value > left_value: - left_value = core_value - else: - if current_node_core_distance > left_value: - left_value = current_node_core_distance - - if left_value < right_value: - current_distances[j] = left_value - current_sources[j] = left_source - if left_value < new_distance: - new_distance = left_value - source_node = left_source + if mutual_reachability_distance < next_node_min_reach: + min_reachability[j] = mutual_reachability_distance + current_sources[j] = current_node + if mutual_reachability_distance < new_reachability: + new_reachability = mutual_reachability_distance + source_node = current_node new_node = j else: - if right_value < new_distance: - new_distance = right_value - source_node = right_source + if next_node_min_reach < new_reachability: + new_reachability = next_node_min_reach + source_node = next_node_source new_node = j - result[i - 1, 0] = source_node - result[i - 1, 1] = new_node - result[i - 1, 2] = new_distance + mst[i - 1].current_node = source_node + mst[i - 1].next_node = new_node + mst[i - 1].distance = new_reachability current_node = new_node - return result_arr + return mst @cython.wraparound(True) -cpdef cnp.ndarray[cnp.float64_t, ndim=2] label(cnp.float64_t[:,:] L): +cpdef cnp.ndarray[cnp.float64_t, ndim=2] label(MST_edge_t[:] mst): cdef: - cnp.ndarray[cnp.float64_t, ndim=2] result_arr - cnp.float64_t[:, ::1] result + cnp.ndarray[cnp.float64_t, ndim=2] single_linkage - cnp.intp_t N, a, aa, b, bb, index - cnp.float64_t delta + # Note mst.shape[0] is one fewer than the number of samples + cnp.intp_t n_samples = mst.shape[0] + 1 + cnp.intp_t current_node_ancestor, next_node_ancestor + cnp.intp_t current_node, next_node, index + cnp.float64_t distance - result_arr = np.zeros((L.shape[0], L.shape[1] + 1)) - result = ( ( - result_arr.data)) - N = L.shape[0] + 1 - U = UnionFind(N) + single_linkage = np.zeros((n_samples - 1, 4)) + U = UnionFind(n_samples) - for index in range(L.shape[0]): + for i in range(n_samples - 1): - a = L[index, 0] - b = L[index, 1] - delta = L[index, 2] + current_node = mst[i].current_node + next_node = mst[i].next_node + distance = mst[i].distance - aa, bb = U.fast_find(a), U.fast_find(b) + current_node_ancestor, next_node_ancestor = ( + U.fast_find(current_node), + U.fast_find(next_node) + ) - result[index][0] = aa - result[index][1] = bb - result[index][2] = delta - result[index][3] = U.size[aa] + U.size[bb] + single_linkage[i][0] = current_node_ancestor + single_linkage[i][1] = next_node_ancestor + single_linkage[i][2] = distance + single_linkage[i][3] = U.size[current_node_ancestor] + U.size[next_node_ancestor] - U.union(aa, bb) + U.union(current_node_ancestor, next_node_ancestor) - return result_arr + return single_linkage diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index d90a3c4279ee9..95c114dafb070 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -23,7 +23,7 @@ from ...utils.validation import _assert_all_finite from ._linkage import ( label, - mst_from_distance_matrix, + mst_from_mutual_reachability, mst_from_data_matrix, MST_edge_dtype, ) @@ -53,7 +53,7 @@ def _brute_mst(mutual_reachability, min_samples, sparse=False): if not sparse: - return mst_from_distance_matrix(mutual_reachability) + return mst_from_mutual_reachability(mutual_reachability) # Check connected component on mutual reachability # If more than one component, it means that even if the distance matrix X @@ -75,7 +75,9 @@ def _brute_mst(mutual_reachability, min_samples, sparse=False): # Compute the minimum spanning tree for the sparse graph sparse_min_spanning_tree = csgraph.minimum_spanning_tree(mutual_reachability) rows, cols = sparse_min_spanning_tree.nonzero() - return np.vstack((rows, cols, sparse_min_spanning_tree.data)).T + mst = np.vstack((rows, cols, sparse_min_spanning_tree.data)) + mst = np.core.records.fromarrays(mst, dtype=MST_edge_dtype, shape=(mst.shape[1],)) + return mst def _tree_to_labels( @@ -154,7 +156,6 @@ def _hdbscan_brute( "size.", UserWarning, ) - return _process_mst(min_spanning_tree) From 9f4fbdfa9eae3f5feb1b1f5c2961065a91f74637 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 8 Nov 2022 16:40:04 -0500 Subject: [PATCH 04/11] Added contiguous specification where applicable --- sklearn/cluster/_hdbscan/_linkage.pyx | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx index 1fce979662b03..eb5d3191c61ef 100644 --- a/sklearn/cluster/_hdbscan/_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_linkage.pyx @@ -22,22 +22,23 @@ MST_edge_dtype = np.dtype([ ("distance", np.float64), ]) -ctypedef struct MST_edge_t: +# Packed shouldn't make a difference since they're all 8-byte quantities, +# but it's included just to be safe. +ctypedef packed struct MST_edge_t: cnp.intp_t current_node cnp.intp_t next_node cnp.float64_t distance -# TODO add contiguous constraint where possible -cpdef cnp.ndarray[MST_edge_t, ndim=1] mst_from_mutual_reachability( +cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability( cnp.ndarray[cnp.float64_t, ndim=2] mutual_reachability ): cdef: - cnp.ndarray[cnp.intp_t, ndim=1] node_labels - cnp.ndarray[cnp.intp_t, ndim=1] current_labels - cnp.ndarray[cnp.float64_t, ndim=1] min_reachability, left, right - cnp.ndarray[MST_edge_t, ndim=1] mst + cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] node_labels + cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] current_labels + cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] min_reachability, left, right + cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst - cnp.ndarray[cnp.uint8_t] label_filter + cnp.ndarray[cnp.uint8_t, mode='c'] label_filter cnp.intp_t n_samples = mutual_reachability.shape[0] cnp.intp_t current_node, new_node_index, new_node, i @@ -76,10 +77,10 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1] mst_from_data_matrix( cnp.float64_t[::1] min_reachability, current_sources cnp.float64_t[::1] current_core_distances = core_distances cnp.float64_t[:, ::1] raw_data_view = raw_data - cnp.ndarray[MST_edge_t, ndim=1] mst - cnp.ndarray[cnp.float64_t, ndim=2] mst_arr + cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst + cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] mst_arr - cnp.ndarray[cnp.uint8_t] label_filter + cnp.ndarray[cnp.uint8_t, mode='c'] label_filter cnp.intp_t current_node, source_node, right_node, left_node, new_node cnp.intp_t i, j, n_samples, num_features @@ -158,10 +159,10 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1] mst_from_data_matrix( return mst @cython.wraparound(True) -cpdef cnp.ndarray[cnp.float64_t, ndim=2] label(MST_edge_t[:] mst): +cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] label(MST_edge_t[::1] mst): cdef: - cnp.ndarray[cnp.float64_t, ndim=2] single_linkage + cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] single_linkage # Note mst.shape[0] is one fewer than the number of samples cnp.intp_t n_samples = mst.shape[0] + 1 From 0182bc9b605e436ea1d4d1925cf575b7ecf52f78 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 8 Nov 2022 16:40:55 -0500 Subject: [PATCH 05/11] Updated authorship --- sklearn/cluster/_hdbscan/_linkage.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx index eb5d3191c61ef..78751816f1482 100644 --- a/sklearn/cluster/_hdbscan/_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_linkage.pyx @@ -1,6 +1,7 @@ # Minimum spanning tree single linkage implementation for hdbscan # Authors: Leland McInnes # Steve Astels +# Meekail Zain # License: 3-clause BSD cimport numpy as cnp From 39e7d7e59dfdcf52488bd0d05dc7c43e67becef2 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 9 Nov 2022 15:32:59 -0500 Subject: [PATCH 06/11] Feedback from review --- sklearn/cluster/_hdbscan/_linkage.pyx | 92 ++++++++++++--------------- 1 file changed, 42 insertions(+), 50 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx index 78751816f1482..a92d368c0d621 100644 --- a/sklearn/cluster/_hdbscan/_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_linkage.pyx @@ -5,12 +5,9 @@ # License: 3-clause BSD cimport numpy as cnp - -import numpy as np -import cython - from libc.float cimport DBL_MAX +import numpy as np from ...metrics._dist_metrics cimport DistanceMetric from ...cluster._hierarchical_fast cimport UnionFind from ...utils._typedefs cimport ITYPE_t, DTYPE_t @@ -18,38 +15,36 @@ from ...utils._typedefs import ITYPE, DTYPE # Numpy structured dtype representing a single ordered edge in Prim's algorithm MST_edge_dtype = np.dtype([ - ("current_node", np.intp), - ("next_node", np.intp), + ("current_node", np.int64), + ("next_node", np.int64), ("distance", np.float64), ]) # Packed shouldn't make a difference since they're all 8-byte quantities, # but it's included just to be safe. ctypedef packed struct MST_edge_t: - cnp.intp_t current_node - cnp.intp_t next_node + cnp.int64_t current_node + cnp.int64_t next_node cnp.float64_t distance cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability( cnp.ndarray[cnp.float64_t, ndim=2] mutual_reachability ): cdef: - cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] node_labels - cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] current_labels + cnp.ndarray[cnp.int64_t, ndim=1, mode='c'] current_labels cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] min_reachability, left, right cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst cnp.ndarray[cnp.uint8_t, mode='c'] label_filter - cnp.intp_t n_samples = mutual_reachability.shape[0] - cnp.intp_t current_node, new_node_index, new_node, i + cnp.int64_t n_samples = mutual_reachability.shape[0] + cnp.int64_t current_node, new_node_index, new_node, i mst = np.empty(n_samples - 1, dtype=MST_edge_dtype) - node_labels = np.arange(n_samples, dtype=np.intp) + current_labels = np.arange(n_samples, dtype=np.int64) current_node = 0 - min_reachability = np.infty * np.ones(n_samples) - current_labels = node_labels - for i in range(1, n_samples): + min_reachability = np.infty * np.ones(n_samples, dtype=np.float64) + for i in range(0, n_samples - 1): label_filter = current_labels != current_node current_labels = current_labels[label_filter] left = min_reachability[label_filter] @@ -58,33 +53,29 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability( new_node_index = np.argmin(min_reachability) new_node = current_labels[new_node_index] - mst[i - 1].current_node = current_node - mst[i - 1].next_node = new_node - mst[i - 1].distance = min_reachability[new_node_index] + mst[i].current_node = current_node + mst[i].next_node = new_node + mst[i].distance = min_reachability[new_node_index] current_node = new_node return mst -cpdef cnp.ndarray[MST_edge_t, ndim=1] mst_from_data_matrix( - cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] raw_data, - cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] core_distances, +cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix( + const cnp.float64_t[:, ::1] raw_data, + const cnp.float64_t[::1] core_distances, DistanceMetric dist_metric, cnp.float64_t alpha=1.0 ): cdef: cnp.int8_t[::1] in_tree - cnp.float64_t[::1] min_reachability, current_sources - cnp.float64_t[::1] current_core_distances = core_distances - cnp.float64_t[:, ::1] raw_data_view = raw_data + cnp.float64_t[::1] min_reachability + cnp.int64_t[::1] current_sources cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst - cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] mst_arr - - cnp.ndarray[cnp.uint8_t, mode='c'] label_filter - cnp.intp_t current_node, source_node, right_node, left_node, new_node - cnp.intp_t i, j, n_samples, num_features + cnp.int64_t current_node, source_node, right_node, left_node, new_node, next_node_source + cnp.int64_t i, j, n_samples, num_features cnp.float64_t current_node_core_dist, new_reachability, mutual_reachability_distance cnp.float64_t next_node_min_reach, pair_distance, next_node_core_dist @@ -95,16 +86,16 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1] mst_from_data_matrix( mst = np.empty(n_samples - 1, dtype=MST_edge_dtype) in_tree = np.zeros(n_samples, dtype=np.int8) - min_reachability = np.infty * np.ones(n_samples) - current_sources = np.ones(n_samples) + min_reachability = np.infty * np.ones(n_samples, dtype=np.float64) + current_sources = np.ones(n_samples, dtype=np.int64) current_node = 0 - for i in range(1, n_samples): + for i in range(0, n_samples - 1): in_tree[current_node] = 1 - current_node_core_dist = current_core_distances[current_node] + current_node_core_dist = core_distances[current_node] new_reachability = DBL_MAX source_node = 0 @@ -118,8 +109,8 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1] mst_from_data_matrix( next_node_source = current_sources[j] pair_distance = dist_metric.dist( - &raw_data_view[current_node, 0], - &raw_data_view[j, 0], + &raw_data[current_node, 0], + &raw_data[j, 0], num_features ) @@ -152,27 +143,26 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1] mst_from_data_matrix( source_node = next_node_source new_node = j - mst[i - 1].current_node = source_node - mst[i - 1].next_node = new_node - mst[i - 1].distance = new_reachability + mst[i].current_node = source_node + mst[i].next_node = new_node + mst[i].distance = new_reachability current_node = new_node return mst -@cython.wraparound(True) -cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] label(MST_edge_t[::1] mst): +cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] label(const MST_edge_t[::1] mst): cdef: cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] single_linkage # Note mst.shape[0] is one fewer than the number of samples - cnp.intp_t n_samples = mst.shape[0] + 1 - cnp.intp_t current_node_ancestor, next_node_ancestor - cnp.intp_t current_node, next_node, index + cnp.int64_t n_samples = mst.shape[0] + 1 + cnp.int64_t current_node_cluster, next_node_cluster + cnp.int64_t current_node, next_node, index cnp.float64_t distance + UnionFind U = UnionFind(n_samples) single_linkage = np.zeros((n_samples - 1, 4)) - U = UnionFind(n_samples) for i in range(n_samples - 1): @@ -180,16 +170,18 @@ cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] label(MST_edge_t[::1] mst): next_node = mst[i].next_node distance = mst[i].distance - current_node_ancestor, next_node_ancestor = ( + current_node_cluster, next_node_cluster = ( U.fast_find(current_node), U.fast_find(next_node) ) - single_linkage[i][0] = current_node_ancestor - single_linkage[i][1] = next_node_ancestor + # TODO: Update this to an array of structs (AoS). + # Should be done simultaneously in _tree.pyx to ensure compatability. + single_linkage[i][0] = current_node_cluster + single_linkage[i][1] = next_node_cluster single_linkage[i][2] = distance - single_linkage[i][3] = U.size[current_node_ancestor] + U.size[next_node_ancestor] + single_linkage[i][3] = U.size[current_node_cluster] + U.size[next_node_cluster] - U.union(current_node_ancestor, next_node_ancestor) + U.union(current_node_cluster, next_node_cluster) return single_linkage From 9c38badb91e6a31f7d2b382eb4ad1d8cef5b1fcc Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 9 Nov 2022 18:04:33 -0500 Subject: [PATCH 07/11] Refactor and remove alpha --- sklearn/cluster/_hdbscan/_linkage.pyx | 57 ++++++++++++++++++++++++--- sklearn/cluster/_hdbscan/hdbscan.py | 18 ++------- 2 files changed, 55 insertions(+), 20 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx index a92d368c0d621..e7f3f8feb21ef 100644 --- a/sklearn/cluster/_hdbscan/_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_linkage.pyx @@ -30,6 +30,27 @@ ctypedef packed struct MST_edge_t: cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability( cnp.ndarray[cnp.float64_t, ndim=2] mutual_reachability ): + """Compute the Minimum Spanning Tree (MST) representation of the mutual- + reachability graph using Prim's algorithm. + + Parameters + ---------- + mutual_reachability : ndarray of shape (n_samples, n_samples) + Array of mutual-reachabilities between samples. + + Returns + ------- + mst: ndarray of shape (n_samples - 1,) + The MST representation of the mutual-reahability graph. The MST is + represented as a collecteion of edges. Each edge is an instance of a + custom dtype `MST_edge_dtype` with the following specification: + + MST_edge_dtype = np.dtype([ + ("current_node", np.int64), + ("next_node", np.int64), + ("distance", np.float64), + ]) + """ cdef: cnp.ndarray[cnp.int64_t, ndim=1, mode='c'] current_labels cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] min_reachability, left, right @@ -65,8 +86,37 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix( const cnp.float64_t[:, ::1] raw_data, const cnp.float64_t[::1] core_distances, DistanceMetric dist_metric, - cnp.float64_t alpha=1.0 ): + """Compute the Minimum Spanning Tree (MST) representation of the mutual- + reachability graph generated from the provided `raw_data` and + `core_distances` using Prim's algorithm. + + Parameters + ---------- + raw_data : ndarray of shape (n_samples, n_features) + Input array of data samples. + + core_distances : ndarray of shape (n_samples,) + An array containing the core-distance calculated for each corresponding + sample. + + dist_metric : DistanceMetric + The distance metric to use when calculating pairwise distances for + determining mutual-reachability. + + Returns + ------- + mst: ndarray of shape (n_samples - 1,) + The MST representation of the mutual-reahability graph. The MST is + represented as a collecteion of edges. Each edge is an instance of a + custom dtype `MST_edge_dtype` with the following specification: + + MST_edge_dtype = np.dtype([ + ("current_node", np.int64), + ("next_node", np.int64), + ("distance", np.float64), + ]) + """ cdef: cnp.int8_t[::1] in_tree @@ -114,9 +164,6 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix( num_features ) - if alpha != 1.0: - pair_distance /= alpha - next_node_core_dist = core_distances[j] mutual_reachability_distance = max( current_node_core_dist, @@ -150,7 +197,7 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix( return mst -cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] label(const MST_edge_t[::1] mst): +cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] make_single_linkage(const MST_edge_t[::1] mst): cdef: cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] single_linkage diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 95c114dafb070..611dd2113cdf1 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -22,7 +22,7 @@ from ...utils._param_validation import Interval, StrOptions from ...utils.validation import _assert_all_finite from ._linkage import ( - label, + make_single_linkage, mst_from_mutual_reachability, mst_from_data_matrix, MST_edge_dtype, @@ -110,13 +110,12 @@ def _process_mst(min_spanning_tree): row_order = np.argsort(min_spanning_tree["distance"]) min_spanning_tree = min_spanning_tree[row_order] # Convert edge list into standard hierarchical clustering format - return label(min_spanning_tree) + return make_single_linkage(min_spanning_tree) def _hdbscan_brute( X, min_samples=5, - alpha=None, metric="euclidean", n_jobs=None, copy=False, @@ -132,7 +131,6 @@ def _hdbscan_brute( distance_matrix = pairwise_distances( X, metric=metric, n_jobs=n_jobs, **metric_params ) - distance_matrix /= alpha # max_dist is only relevant for sparse and is ignored for dense max_dist = metric_params.get("max_dist", 0.0) @@ -163,7 +161,6 @@ def _hdbscan_prims( X, algo, min_samples=5, - alpha=1.0, metric="euclidean", leaf_size=40, n_jobs=None, @@ -188,8 +185,7 @@ def _hdbscan_prims( dist_metric = DistanceMetric.get_metric(metric, **metric_params) # Mutual reachability distance is implicit in mst_from_data_matrix - min_spanning_tree = mst_from_data_matrix(X, core_distances, dist_metric, alpha) - + min_spanning_tree = mst_from_data_matrix(X, core_distances, dist_metric) return _process_mst(min_spanning_tree) @@ -294,10 +290,6 @@ class HDBSCAN(ClusterMixin, BaseEstimator): metric_params : dict, default=None Arguments passed to the distance metric. - alpha : float, default=1.0 - A distance scaling parameter as used in robust single linkage. - See [3]_ for more information. - algorithm : {"auto", "brute", "kdtree", "balltree"}, default="auto" Exactly which algorithm to use for computing core distances; By default this is set to `"auto"` which attempts to use a @@ -458,7 +450,6 @@ class HDBSCAN(ClusterMixin, BaseEstimator): ], "metric": [StrOptions(set(FAST_METRICS + ["precomputed"])), callable], "metric_params": [dict, None], - "alpha": [Interval(Real, left=0, right=None, closed="neither")], "algorithm": [ StrOptions( { @@ -485,7 +476,6 @@ def __init__( max_cluster_size=None, metric="euclidean", metric_params=None, - alpha=1.0, algorithm="auto", leaf_size=40, n_jobs=4, @@ -496,7 +486,6 @@ def __init__( ): self.min_cluster_size = min_cluster_size self.min_samples = min_samples - self.alpha = alpha self.max_cluster_size = max_cluster_size self.cluster_selection_epsilon = cluster_selection_epsilon self.metric = metric @@ -597,7 +586,6 @@ def fit(self, X, y=None): kwargs = dict( X=X, min_samples=self._min_samples, - alpha=self.alpha, metric=self.metric, n_jobs=self.n_jobs, **self._metric_params, From e8ad9339d140a446024a579cae5ac564d6797cfd Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 9 Nov 2022 18:13:46 -0500 Subject: [PATCH 08/11] Added documentation --- sklearn/cluster/_hdbscan/_linkage.pyx | 29 +++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx index e7f3f8feb21ef..c35dc6e1c865c 100644 --- a/sklearn/cluster/_hdbscan/_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_linkage.pyx @@ -40,7 +40,7 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability( Returns ------- - mst: ndarray of shape (n_samples - 1,) + mst : ndarray of shape (n_samples - 1,) The MST representation of the mutual-reahability graph. The MST is represented as a collecteion of edges. Each edge is an instance of a custom dtype `MST_edge_dtype` with the following specification: @@ -106,7 +106,7 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix( Returns ------- - mst: ndarray of shape (n_samples - 1,) + mst : ndarray of shape (n_samples - 1,) The MST representation of the mutual-reahability graph. The MST is represented as a collecteion of edges. Each edge is an instance of a custom dtype `MST_edge_dtype` with the following specification: @@ -198,7 +198,32 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix( return mst cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] make_single_linkage(const MST_edge_t[::1] mst): + """Construct a single-linkage tree from an MST. + Parameters + ---------- + mst : ndarray of shape (n_samples - 1,) + The MST representation of the mutual-reahability graph. The MST is + represented as a collecteion of edges. Each edge is an instance of a + custom dtype `MST_edge_dtype` with the following specification: + + MST_edge_dtype = np.dtype([ + ("current_node", np.int64), + ("next_node", np.int64), + ("distance", np.float64), + ]) + + Returns + ------- + single_linkage : ndarray of shape (n_samples - 1, 4) + The single-linkage tree tree (dendrogram) built from the MST. Each + of the array represents the following: + + - left node/cluster + - right node/cluster + - distance + - new cluster size + """ cdef: cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] single_linkage From 50847ec97db09daf6fdb3163266fe2fe98846fc7 Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Tue, 29 Nov 2022 17:49:50 -0500 Subject: [PATCH 09/11] Apply suggestions from code review Co-authored-by: Thomas J. Fan --- sklearn/cluster/_hdbscan/_linkage.pyx | 45 +++++++-------------------- 1 file changed, 11 insertions(+), 34 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx index c35dc6e1c865c..9d47d40847e10 100644 --- a/sklearn/cluster/_hdbscan/_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_linkage.pyx @@ -40,16 +40,9 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability( Returns ------- - mst : ndarray of shape (n_samples - 1,) + mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype The MST representation of the mutual-reahability graph. The MST is - represented as a collecteion of edges. Each edge is an instance of a - custom dtype `MST_edge_dtype` with the following specification: - - MST_edge_dtype = np.dtype([ - ("current_node", np.int64), - ("next_node", np.int64), - ("distance", np.float64), - ]) + represented as a collecteion of edges. """ cdef: cnp.ndarray[cnp.int64_t, ndim=1, mode='c'] current_labels @@ -64,7 +57,7 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability( mst = np.empty(n_samples - 1, dtype=MST_edge_dtype) current_labels = np.arange(n_samples, dtype=np.int64) current_node = 0 - min_reachability = np.infty * np.ones(n_samples, dtype=np.float64) + min_reachability = np.full(n_samples, fill_value=np.infty, dtype=np.float64) for i in range(0, n_samples - 1): label_filter = current_labels != current_node current_labels = current_labels[label_filter] @@ -106,16 +99,9 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix( Returns ------- - mst : ndarray of shape (n_samples - 1,) + mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype The MST representation of the mutual-reahability graph. The MST is - represented as a collecteion of edges. Each edge is an instance of a - custom dtype `MST_edge_dtype` with the following specification: - - MST_edge_dtype = np.dtype([ - ("current_node", np.int64), - ("next_node", np.int64), - ("distance", np.float64), - ]) + represented as a collecteion of edges. """ cdef: @@ -136,7 +122,7 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix( mst = np.empty(n_samples - 1, dtype=MST_edge_dtype) in_tree = np.zeros(n_samples, dtype=np.int8) - min_reachability = np.infty * np.ones(n_samples, dtype=np.float64) + min_reachability = np.full(n_samples, fill_value=np.infty, dtype=np.float64) current_sources = np.ones(n_samples, dtype=np.int64) current_node = 0 @@ -202,16 +188,9 @@ cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] make_single_linkage(const MST Parameters ---------- - mst : ndarray of shape (n_samples - 1,) + mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype The MST representation of the mutual-reahability graph. The MST is - represented as a collecteion of edges. Each edge is an instance of a - custom dtype `MST_edge_dtype` with the following specification: - - MST_edge_dtype = np.dtype([ - ("current_node", np.int64), - ("next_node", np.int64), - ("distance", np.float64), - ]) + represented as a collecteion of edges. Returns ------- @@ -234,7 +213,7 @@ cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] make_single_linkage(const MST cnp.float64_t distance UnionFind U = UnionFind(n_samples) - single_linkage = np.zeros((n_samples - 1, 4)) + single_linkage = np.zeros((n_samples - 1, 4), dtype=np.float64) for i in range(n_samples - 1): @@ -242,10 +221,8 @@ cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] make_single_linkage(const MST next_node = mst[i].next_node distance = mst[i].distance - current_node_cluster, next_node_cluster = ( - U.fast_find(current_node), - U.fast_find(next_node) - ) + current_node_cluster = U.fast_find(current_node) + next_node_cluster = U.fast_find(next_node) # TODO: Update this to an array of structs (AoS). # Should be done simultaneously in _tree.pyx to ensure compatability. From e533f0c6399f7b99db0282bcf17eed8467c9d972 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 29 Nov 2022 18:18:18 -0500 Subject: [PATCH 10/11] Review feedback and revert alpha changes --- sklearn/cluster/_hdbscan/_linkage.pyx | 6 ++++++ sklearn/cluster/_hdbscan/hdbscan.py | 19 ++++++++++++++++--- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx index 9d47d40847e10..1957a27eab69f 100644 --- a/sklearn/cluster/_hdbscan/_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_linkage.pyx @@ -45,6 +45,8 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability( represented as a collecteion of edges. """ cdef: + # Note: we utilize ndarray's over memory-views to make use of numpy + # binary indexing and sub-selection below. cnp.ndarray[cnp.int64_t, ndim=1, mode='c'] current_labels cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] min_reachability, left, right cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst @@ -79,6 +81,7 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix( const cnp.float64_t[:, ::1] raw_data, const cnp.float64_t[::1] core_distances, DistanceMetric dist_metric, + cnp.float64_t alpha=1.0 ): """Compute the Minimum Spanning Tree (MST) representation of the mutual- reachability graph generated from the provided `raw_data` and @@ -150,6 +153,9 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix( num_features ) + if alpha != 1.0: + pair_distance /= alpha + next_node_core_dist = core_distances[j] mutual_reachability_distance = max( current_node_core_dist, diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 611dd2113cdf1..4f1fcf1962d0b 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -75,8 +75,10 @@ def _brute_mst(mutual_reachability, min_samples, sparse=False): # Compute the minimum spanning tree for the sparse graph sparse_min_spanning_tree = csgraph.minimum_spanning_tree(mutual_reachability) rows, cols = sparse_min_spanning_tree.nonzero() - mst = np.vstack((rows, cols, sparse_min_spanning_tree.data)) - mst = np.core.records.fromarrays(mst, dtype=MST_edge_dtype, shape=(mst.shape[1],)) + mst = np.core.records.fromarrays( + [rows, cols, sparse_min_spanning_tree.data], + dtype=MST_edge_dtype, + ) return mst @@ -116,6 +118,7 @@ def _process_mst(min_spanning_tree): def _hdbscan_brute( X, min_samples=5, + alpha=None, metric="euclidean", n_jobs=None, copy=False, @@ -131,6 +134,7 @@ def _hdbscan_brute( distance_matrix = pairwise_distances( X, metric=metric, n_jobs=n_jobs, **metric_params ) + distance_matrix /= alpha # max_dist is only relevant for sparse and is ignored for dense max_dist = metric_params.get("max_dist", 0.0) @@ -161,6 +165,7 @@ def _hdbscan_prims( X, algo, min_samples=5, + alpha=1.0, metric="euclidean", leaf_size=40, n_jobs=None, @@ -185,7 +190,7 @@ def _hdbscan_prims( dist_metric = DistanceMetric.get_metric(metric, **metric_params) # Mutual reachability distance is implicit in mst_from_data_matrix - min_spanning_tree = mst_from_data_matrix(X, core_distances, dist_metric) + min_spanning_tree = mst_from_data_matrix(X, core_distances, dist_metric, alpha) return _process_mst(min_spanning_tree) @@ -290,6 +295,10 @@ class HDBSCAN(ClusterMixin, BaseEstimator): metric_params : dict, default=None Arguments passed to the distance metric. + alpha : float, default=1.0 + A distance scaling parameter as used in robust single linkage. + See [3]_ for more information. + algorithm : {"auto", "brute", "kdtree", "balltree"}, default="auto" Exactly which algorithm to use for computing core distances; By default this is set to `"auto"` which attempts to use a @@ -450,6 +459,7 @@ class HDBSCAN(ClusterMixin, BaseEstimator): ], "metric": [StrOptions(set(FAST_METRICS + ["precomputed"])), callable], "metric_params": [dict, None], + "alpha": [Interval(Real, left=0, right=None, closed="neither")], "algorithm": [ StrOptions( { @@ -476,6 +486,7 @@ def __init__( max_cluster_size=None, metric="euclidean", metric_params=None, + alpha=1.0, algorithm="auto", leaf_size=40, n_jobs=4, @@ -486,6 +497,7 @@ def __init__( ): self.min_cluster_size = min_cluster_size self.min_samples = min_samples + self.alpha = alpha self.max_cluster_size = max_cluster_size self.cluster_selection_epsilon = cluster_selection_epsilon self.metric = metric @@ -586,6 +598,7 @@ def fit(self, X, y=None): kwargs = dict( X=X, min_samples=self._min_samples, + alpha=self.alpha, metric=self.metric, n_jobs=self.n_jobs, **self._metric_params, From f154164684046d3769c253faa6075ad8a3562f84 Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Tue, 6 Dec 2022 17:39:36 -0500 Subject: [PATCH 11/11] Update sklearn/cluster/_hdbscan/_linkage.pyx Co-authored-by: Julien Jerphanion --- sklearn/cluster/_hdbscan/_linkage.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx index 1957a27eab69f..fd9888ac4da82 100644 --- a/sklearn/cluster/_hdbscan/_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_linkage.pyx @@ -153,8 +153,7 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix( num_features ) - if alpha != 1.0: - pair_distance /= alpha + pair_distance /= alpha next_node_core_dist = core_distances[j] mutual_reachability_distance = max(