-
-
Notifications
You must be signed in to change notification settings - Fork 26k
CLN Cleaned cluster/_hdbscan/_linkage.pyx
#24857
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
cc31a2a
4dcfe8e
7a07548
9f4fbdf
0182bc9
39e7d7e
9c38bad
e8ad933
50847ec
e533f0c
f154164
6db58bd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -1,210 +1,241 @@ | ||||||||||||
# Minimum spanning tree single linkage implementation for hdbscan | ||||||||||||
# Authors: Leland McInnes <[email protected]> | ||||||||||||
# Steve Astels <[email protected]> | ||||||||||||
# Meekail Zain <[email protected]> | ||||||||||||
# License: 3-clause BSD | ||||||||||||
|
||||||||||||
import numpy as np | ||||||||||||
cimport numpy as cnp | ||||||||||||
import cython | ||||||||||||
|
||||||||||||
from libc.float cimport DBL_MAX | ||||||||||||
|
||||||||||||
import numpy as np | ||||||||||||
from ...metrics._dist_metrics cimport DistanceMetric | ||||||||||||
from ...cluster._hierarchical_fast cimport UnionFind | ||||||||||||
from ...utils._typedefs cimport ITYPE_t, DTYPE_t | ||||||||||||
from ...utils._typedefs import ITYPE, DTYPE | ||||||||||||
|
||||||||||||
cpdef cnp.ndarray[cnp.double_t, ndim=2] mst_from_distance_matrix( | ||||||||||||
cnp.ndarray[cnp.double_t, ndim=2] distance_matrix | ||||||||||||
# Numpy structured dtype representing a single ordered edge in Prim's algorithm | ||||||||||||
MST_edge_dtype = np.dtype([ | ||||||||||||
("current_node", np.int64), | ||||||||||||
("next_node", np.int64), | ||||||||||||
("distance", np.float64), | ||||||||||||
]) | ||||||||||||
|
||||||||||||
# Packed shouldn't make a difference since they're all 8-byte quantities, | ||||||||||||
# but it's included just to be safe. | ||||||||||||
ctypedef packed struct MST_edge_t: | ||||||||||||
cnp.int64_t current_node | ||||||||||||
cnp.int64_t next_node | ||||||||||||
cnp.float64_t distance | ||||||||||||
|
||||||||||||
cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability( | ||||||||||||
cnp.ndarray[cnp.float64_t, ndim=2] mutual_reachability | ||||||||||||
): | ||||||||||||
Comment on lines
+30
to
32
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should only use Is the usage of Generally, most There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The current algorithm uses binary masks for indexing into subsets of scikit-learn/sklearn/cluster/_hdbscan/_linkage.pyx Lines 48 to 52 in 39e7d7e
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I could attempt to change the algorithm to avoid using this, but it is a pretty eloquent solution. Ultimately I don't know how costly the python interactions here are, and whether they outweigh the usefulness of the algorithm as it is now. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There are some overhead of just typing and referencing variable with We can leave this for now. If this shows to be a hotspot, we can have a PR to rewrite this part. What do you think? In this case, is it possible to add a comment to indicate that numpy arrays are used for binary masks and convenient indexing? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sounds good, I will add in such a comment. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Was this comment added somewhere? |
||||||||||||
|
||||||||||||
"""Compute the Minimum Spanning Tree (MST) representation of the mutual- | ||||||||||||
reachability graph using Prim's algorithm. | ||||||||||||
|
||||||||||||
Parameters | ||||||||||||
---------- | ||||||||||||
mutual_reachability : ndarray of shape (n_samples, n_samples) | ||||||||||||
Array of mutual-reachabilities between samples. | ||||||||||||
|
||||||||||||
Returns | ||||||||||||
------- | ||||||||||||
mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype | ||||||||||||
The MST representation of the mutual-reachability graph. The MST is | ||||||||||||
represented as a collection of edges. | ||||||||||||
""" | ||||||||||||
cdef: | ||||||||||||
cnp.ndarray[cnp.intp_t, ndim=1] node_labels | ||||||||||||
cnp.ndarray[cnp.intp_t, ndim=1] current_labels | ||||||||||||
cnp.ndarray[cnp.double_t, ndim=1] current_distances | ||||||||||||
cnp.ndarray[cnp.double_t, ndim=1] left | ||||||||||||
cnp.ndarray[cnp.double_t, ndim=1] right | ||||||||||||
cnp.ndarray[cnp.double_t, ndim=2] result | ||||||||||||
|
||||||||||||
cnp.ndarray label_filter | ||||||||||||
|
||||||||||||
cnp.intp_t current_node | ||||||||||||
cnp.intp_t new_node_index | ||||||||||||
cnp.intp_t new_node | ||||||||||||
cnp.intp_t i | ||||||||||||
|
||||||||||||
result = np.zeros((distance_matrix.shape[0] - 1, 3)) | ||||||||||||
node_labels = np.arange(distance_matrix.shape[0], dtype=np.intp) | ||||||||||||
# Note: we utilize ndarrays over memoryviews to make use of numpy | ||||||||||||
# boolean-mask indexing and sub-selection below. | ||||||||||||
Comment on lines
+48
to
+49
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thank you for this comment, this answers one of @Vincent-Maladiere's questions. 👍 |
||||||||||||
cnp.ndarray[cnp.int64_t, ndim=1, mode='c'] current_labels | ||||||||||||
cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] min_reachability, left, right | ||||||||||||
cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst | ||||||||||||
|
||||||||||||
cnp.ndarray[cnp.uint8_t, mode='c'] label_filter | ||||||||||||
|
||||||||||||
cnp.int64_t n_samples = mutual_reachability.shape[0] | ||||||||||||
cnp.int64_t current_node, new_node_index, new_node, i | ||||||||||||
|
||||||||||||
mst = np.empty(n_samples - 1, dtype=MST_edge_dtype) | ||||||||||||
current_labels = np.arange(n_samples, dtype=np.int64) | ||||||||||||
current_node = 0 | ||||||||||||
current_distances = np.infty * np.ones(distance_matrix.shape[0]) | ||||||||||||
current_labels = node_labels | ||||||||||||
for i in range(1, node_labels.shape[0]): | ||||||||||||
min_reachability = np.full(n_samples, fill_value=np.infty, dtype=np.float64) | ||||||||||||
for i in range(0, n_samples - 1): | ||||||||||||
label_filter = current_labels != current_node | ||||||||||||
current_labels = current_labels[label_filter] | ||||||||||||
left = current_distances[label_filter] | ||||||||||||
right = distance_matrix[current_node][current_labels] | ||||||||||||
current_distances = np.where(left < right, left, right) | ||||||||||||
left = min_reachability[label_filter] | ||||||||||||
right = mutual_reachability[current_node][current_labels] | ||||||||||||
min_reachability = np.minimum(left, right) | ||||||||||||
|
||||||||||||
new_node_index = np.argmin(current_distances) | ||||||||||||
new_node_index = np.argmin(min_reachability) | ||||||||||||
new_node = current_labels[new_node_index] | ||||||||||||
result[i - 1, 0] = <double> current_node | ||||||||||||
result[i - 1, 1] = <double> new_node | ||||||||||||
result[i - 1, 2] = current_distances[new_node_index] | ||||||||||||
mst[i].current_node = current_node | ||||||||||||
mst[i].next_node = new_node | ||||||||||||
mst[i].distance = min_reachability[new_node_index] | ||||||||||||
current_node = new_node | ||||||||||||
|
||||||||||||
return result | ||||||||||||
return mst | ||||||||||||
|
||||||||||||
|
||||||||||||
cpdef cnp.ndarray[cnp.double_t, ndim=2] mst_from_data_matrix( | ||||||||||||
cnp.ndarray[cnp.double_t, ndim=2, mode='c'] raw_data, | ||||||||||||
cnp.ndarray[cnp.double_t, ndim=1, mode='c'] core_distances, | ||||||||||||
cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix( | ||||||||||||
const cnp.float64_t[:, ::1] raw_data, | ||||||||||||
const cnp.float64_t[::1] core_distances, | ||||||||||||
DistanceMetric dist_metric, | ||||||||||||
cnp.double_t alpha=1.0 | ||||||||||||
cnp.float64_t alpha=1.0 | ||||||||||||
): | ||||||||||||
"""Compute the Minimum Spanning Tree (MST) representation of the mutual- | ||||||||||||
reachability graph generated from the provided `raw_data` and | ||||||||||||
`core_distances` using Prim's algorithm. | ||||||||||||
|
||||||||||||
Parameters | ||||||||||||
---------- | ||||||||||||
raw_data : ndarray of shape (n_samples, n_features) | ||||||||||||
Input array of data samples. | ||||||||||||
|
||||||||||||
core_distances : ndarray of shape (n_samples,) | ||||||||||||
An array containing the core-distance calculated for each corresponding | ||||||||||||
sample. | ||||||||||||
|
||||||||||||
dist_metric : DistanceMetric | ||||||||||||
The distance metric to use when calculating pairwise distances for | ||||||||||||
determining mutual-reachability. | ||||||||||||
|
||||||||||||
Returns | ||||||||||||
------- | ||||||||||||
mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype | ||||||||||||
The MST representation of the mutual-reachability graph. The MST is | ||||||||||||
represented as a collection of edges. | ||||||||||||
""" | ||||||||||||
|
||||||||||||
cdef: | ||||||||||||
cnp.ndarray[cnp.double_t, ndim=1] current_distances_arr | ||||||||||||
cnp.ndarray[cnp.double_t, ndim=1] current_sources_arr | ||||||||||||
cnp.ndarray[cnp.int8_t, ndim=1] in_tree_arr | ||||||||||||
cnp.ndarray[cnp.double_t, ndim=2] result_arr | ||||||||||||
|
||||||||||||
cnp.double_t * current_distances | ||||||||||||
cnp.double_t * current_sources | ||||||||||||
cnp.double_t * current_core_distances | ||||||||||||
cnp.double_t * raw_data_ptr | ||||||||||||
cnp.int8_t * in_tree | ||||||||||||
cnp.double_t[:, ::1] raw_data_view | ||||||||||||
cnp.double_t[:, ::1] result | ||||||||||||
|
||||||||||||
cnp.ndarray label_filter | ||||||||||||
|
||||||||||||
cnp.intp_t current_node | ||||||||||||
cnp.intp_t source_node | ||||||||||||
cnp.intp_t right_node | ||||||||||||
cnp.intp_t left_node | ||||||||||||
cnp.intp_t new_node | ||||||||||||
cnp.intp_t i | ||||||||||||
cnp.intp_t j | ||||||||||||
cnp.intp_t dim | ||||||||||||
cnp.intp_t num_features | ||||||||||||
|
||||||||||||
double current_node_core_distance | ||||||||||||
double right_value | ||||||||||||
double left_value | ||||||||||||
double core_value | ||||||||||||
double new_distance | ||||||||||||
|
||||||||||||
dim = raw_data.shape[0] | ||||||||||||
cnp.int8_t[::1] in_tree | ||||||||||||
cnp.float64_t[::1] min_reachability | ||||||||||||
cnp.int64_t[::1] current_sources | ||||||||||||
cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst | ||||||||||||
|
||||||||||||
cnp.int64_t current_node, source_node, right_node, left_node, new_node, next_node_source | ||||||||||||
cnp.int64_t i, j, n_samples, num_features | ||||||||||||
|
||||||||||||
cnp.float64_t current_node_core_dist, new_reachability, mutual_reachability_distance | ||||||||||||
cnp.float64_t next_node_min_reach, pair_distance, next_node_core_dist | ||||||||||||
|
||||||||||||
n_samples = raw_data.shape[0] | ||||||||||||
num_features = raw_data.shape[1] | ||||||||||||
|
||||||||||||
raw_data_view = (<cnp.double_t[:raw_data.shape[0], :raw_data.shape[1]:1]> ( | ||||||||||||
<cnp.double_t *> raw_data.data)) | ||||||||||||
raw_data_ptr = (<cnp.double_t *> &raw_data_view[0, 0]) | ||||||||||||
mst = np.empty(n_samples - 1, dtype=MST_edge_dtype) | ||||||||||||
|
||||||||||||
result_arr = np.zeros((dim - 1, 3)) | ||||||||||||
in_tree_arr = np.zeros(dim, dtype=np.int8) | ||||||||||||
current_node = 0 | ||||||||||||
current_distances_arr = np.infty * np.ones(dim) | ||||||||||||
current_sources_arr = np.ones(dim) | ||||||||||||
in_tree = np.zeros(n_samples, dtype=np.int8) | ||||||||||||
min_reachability = np.full(n_samples, fill_value=np.infty, dtype=np.float64) | ||||||||||||
current_sources = np.ones(n_samples, dtype=np.int64) | ||||||||||||
|
||||||||||||
result = (<cnp.double_t[:dim - 1, :3:1]> (<cnp.double_t *> result_arr.data)) | ||||||||||||
in_tree = (<cnp.int8_t *> in_tree_arr.data) | ||||||||||||
current_distances = (<cnp.double_t *> current_distances_arr.data) | ||||||||||||
current_sources = (<cnp.double_t *> current_sources_arr.data) | ||||||||||||
current_core_distances = (<cnp.double_t *> core_distances.data) | ||||||||||||
current_node = 0 | ||||||||||||
|
||||||||||||
for i in range(1, dim): | ||||||||||||
for i in range(0, n_samples - 1): | ||||||||||||
|
||||||||||||
in_tree[current_node] = 1 | ||||||||||||
|
||||||||||||
current_node_core_distance = current_core_distances[current_node] | ||||||||||||
current_node_core_dist = core_distances[current_node] | ||||||||||||
|
||||||||||||
new_distance = DBL_MAX | ||||||||||||
new_reachability = DBL_MAX | ||||||||||||
source_node = 0 | ||||||||||||
new_node = 0 | ||||||||||||
|
||||||||||||
for j in range(dim): | ||||||||||||
for j in range(n_samples): | ||||||||||||
if in_tree[j]: | ||||||||||||
continue | ||||||||||||
|
||||||||||||
right_value = current_distances[j] | ||||||||||||
right_source = current_sources[j] | ||||||||||||
|
||||||||||||
left_value = dist_metric.dist(&raw_data_ptr[num_features * | ||||||||||||
current_node], | ||||||||||||
&raw_data_ptr[num_features * j], | ||||||||||||
num_features) | ||||||||||||
left_source = current_node | ||||||||||||
|
||||||||||||
if alpha != 1.0: | ||||||||||||
left_value /= alpha | ||||||||||||
|
||||||||||||
core_value = core_distances[j] | ||||||||||||
if (current_node_core_distance > right_value or | ||||||||||||
core_value > right_value or | ||||||||||||
left_value > right_value): | ||||||||||||
if right_value < new_distance: | ||||||||||||
new_distance = right_value | ||||||||||||
source_node = right_source | ||||||||||||
next_node_min_reach = min_reachability[j] | ||||||||||||
next_node_source = current_sources[j] | ||||||||||||
|
||||||||||||
pair_distance = dist_metric.dist( | ||||||||||||
&raw_data[current_node, 0], | ||||||||||||
&raw_data[j, 0], | ||||||||||||
num_features | ||||||||||||
) | ||||||||||||
|
||||||||||||
pair_distance /= alpha | ||||||||||||
|
||||||||||||
next_node_core_dist = core_distances[j] | ||||||||||||
mutual_reachability_distance = max( | ||||||||||||
current_node_core_dist, | ||||||||||||
next_node_core_dist, | ||||||||||||
pair_distance | ||||||||||||
) | ||||||||||||
Micky774 marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||
if mutual_reachability_distance > next_node_min_reach: | ||||||||||||
if next_node_min_reach < new_reachability: | ||||||||||||
new_reachability = next_node_min_reach | ||||||||||||
source_node = next_node_source | ||||||||||||
new_node = j | ||||||||||||
continue | ||||||||||||
|
||||||||||||
if core_value > current_node_core_distance: | ||||||||||||
if core_value > left_value: | ||||||||||||
left_value = core_value | ||||||||||||
else: | ||||||||||||
if current_node_core_distance > left_value: | ||||||||||||
left_value = current_node_core_distance | ||||||||||||
|
||||||||||||
if left_value < right_value: | ||||||||||||
current_distances[j] = left_value | ||||||||||||
current_sources[j] = left_source | ||||||||||||
if left_value < new_distance: | ||||||||||||
new_distance = left_value | ||||||||||||
source_node = left_source | ||||||||||||
if mutual_reachability_distance < next_node_min_reach: | ||||||||||||
min_reachability[j] = mutual_reachability_distance | ||||||||||||
current_sources[j] = current_node | ||||||||||||
if mutual_reachability_distance < new_reachability: | ||||||||||||
new_reachability = mutual_reachability_distance | ||||||||||||
source_node = current_node | ||||||||||||
new_node = j | ||||||||||||
else: | ||||||||||||
if right_value < new_distance: | ||||||||||||
new_distance = right_value | ||||||||||||
source_node = right_source | ||||||||||||
if next_node_min_reach < new_reachability: | ||||||||||||
new_reachability = next_node_min_reach | ||||||||||||
source_node = next_node_source | ||||||||||||
new_node = j | ||||||||||||
|
||||||||||||
result[i - 1, 0] = <double> source_node | ||||||||||||
result[i - 1, 1] = <double> new_node | ||||||||||||
result[i - 1, 2] = new_distance | ||||||||||||
mst[i].current_node = source_node | ||||||||||||
mst[i].next_node = new_node | ||||||||||||
mst[i].distance = new_reachability | ||||||||||||
current_node = new_node | ||||||||||||
|
||||||||||||
return result_arr | ||||||||||||
return mst | ||||||||||||
|
||||||||||||
cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] make_single_linkage(const MST_edge_t[::1] mst): | ||||||||||||
"""Construct a single-linkage tree from an MST. | ||||||||||||
|
||||||||||||
Parameters | ||||||||||||
---------- | ||||||||||||
mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype | ||||||||||||
The MST representation of the mutual-reachability graph. The MST is | ||||||||||||
represented as a collection of edges. | ||||||||||||
|
||||||||||||
@cython.wraparound(True) | ||||||||||||
cpdef cnp.ndarray[cnp.double_t, ndim=2] label(cnp.double_t[:,:] L): | ||||||||||||
Returns | ||||||||||||
------- | ||||||||||||
single_linkage : ndarray of shape (n_samples - 1, 4) | ||||||||||||
The single-linkage tree (dendrogram) built from the MST. Each row | ||||||||||||
of the array represents the following: | ||||||||||||
|
||||||||||||
- left node/cluster | ||||||||||||
- right node/cluster | ||||||||||||
- distance | ||||||||||||
- new cluster size | ||||||||||||
""" | ||||||||||||
cdef: | ||||||||||||
cnp.ndarray[cnp.double_t, ndim=2] result_arr | ||||||||||||
cnp.double_t[:, ::1] result | ||||||||||||
cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] single_linkage | ||||||||||||
|
||||||||||||
cnp.intp_t N, a, aa, b, bb, index | ||||||||||||
cnp.double_t delta | ||||||||||||
# Note mst.shape[0] is one fewer than the number of samples | ||||||||||||
cnp.int64_t n_samples = mst.shape[0] + 1 | ||||||||||||
cnp.int64_t current_node_cluster, next_node_cluster | ||||||||||||
cnp.int64_t current_node, next_node, index | ||||||||||||
cnp.float64_t distance | ||||||||||||
UnionFind U = UnionFind(n_samples) | ||||||||||||
|
||||||||||||
result_arr = np.zeros((L.shape[0], L.shape[1] + 1)) | ||||||||||||
result = (<cnp.double_t[:L.shape[0], :4:1]> ( | ||||||||||||
<cnp.double_t *> result_arr.data)) | ||||||||||||
N = L.shape[0] + 1 | ||||||||||||
U = UnionFind(N) | ||||||||||||
single_linkage = np.zeros((n_samples - 1, 4), dtype=np.float64) | ||||||||||||
|
||||||||||||
for index in range(L.shape[0]): | ||||||||||||
for i in range(n_samples - 1): | ||||||||||||
|
||||||||||||
a = <cnp.intp_t> L[index, 0] | ||||||||||||
b = <cnp.intp_t> L[index, 1] | ||||||||||||
delta = L[index, 2] | ||||||||||||
current_node = mst[i].current_node | ||||||||||||
next_node = mst[i].next_node | ||||||||||||
distance = mst[i].distance | ||||||||||||
|
||||||||||||
aa, bb = U.fast_find(a), U.fast_find(b) | ||||||||||||
current_node_cluster = U.fast_find(current_node) | ||||||||||||
next_node_cluster = U.fast_find(next_node) | ||||||||||||
|
||||||||||||
result[index][0] = aa | ||||||||||||
result[index][1] = bb | ||||||||||||
result[index][2] = delta | ||||||||||||
result[index][3] = U.size[aa] + U.size[bb] | ||||||||||||
# TODO: Update this to an array of structs (AoS). | ||||||||||||
# Should be done simultaneously in _tree.pyx to ensure compatibility. | ||||||||||||
single_linkage[i][0] = <cnp.float64_t> current_node_cluster | ||||||||||||
single_linkage[i][1] = <cnp.float64_t> next_node_cluster | ||||||||||||
single_linkage[i][2] = distance | ||||||||||||
single_linkage[i][3] = U.size[current_node_cluster] + U.size[next_node_cluster] | ||||||||||||
|
||||||||||||
U.union(aa, bb) | ||||||||||||
U.union(current_node_cluster, next_node_cluster) | ||||||||||||
|
||||||||||||
return result_arr | ||||||||||||
return single_linkage |
Uh oh!
There was an error while loading. Please reload this page.