Thanks to visit codestin.com
Credit goes to github.com

Skip to content

CLN HDBSCAN _tree.pyx::do_labelling refactor #26101

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
May 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 56 additions & 28 deletions sklearn/cluster/_hdbscan/_tree.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ cdef list bfs_from_hierarchy(
return result


cdef cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] _condense_tree(
cpdef cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] _condense_tree(
const HIERARCHY_t[::1] hierarchy,
cnp.intp_t min_cluster_size=10
):
Expand Down Expand Up @@ -402,56 +402,84 @@ cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labelling_at_cut(
return result


cdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] do_labelling(
cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] hierarchy,
cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] _do_labelling(
cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
set clusters,
dict cluster_label_map,
cnp.intp_t allow_single_cluster,
cnp.float64_t cluster_selection_epsilon
):
"""Given a condensed tree, clusters and a labeling map for the clusters,
return an array containing the labels of each point based on cluster
membership. Note that this is where points may be marked as noisy
outliers. The determination of some points as noise in large, single-
cluster datasets is controlled by the `allow_single_cluster` and
`cluster_selection_epsilon` parameters.

Parameters
----------
condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype
Effectively an edgelist encoding a parent/child pair, along with a
value and the corresponding cluster_size in each row providing a tree
structure.

clusters : set
The set of nodes corresponding to identified clusters. These node
values should be the same as those present in `condensed_tree`.

cluster_label_map : dict
A mapping from the node values present in `clusters` to the labels
which will be returned.

Returns
-------
labels : ndarray of shape (n_samples,)
The cluster labels for each point in the data set;
a label of -1 denotes a noise assignment.
"""

cdef:
cnp.intp_t root_cluster
cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] result
cnp.intp_t[:] parent_array, child_array
cnp.float64_t[:] lambda_array
cnp.ndarray[cnp.intp_t, ndim=1] parent_array, child_array
cnp.ndarray[cnp.float64_t, ndim=1] lambda_array
TreeUnionFind union_find
cnp.intp_t n, parent, child, cluster
cnp.float64_t threshold

child_array = hierarchy['child']
parent_array = hierarchy['parent']
lambda_array = hierarchy['value']
child_array = condensed_tree['child']
parent_array = condensed_tree['parent']
lambda_array = condensed_tree['value']

root_cluster = np.min(parent_array)
result = np.empty(root_cluster, dtype=np.intp)
union_find = TreeUnionFind(np.max(parent_array) + 1)

for n in range(hierarchy.shape[0]):
for n in range(condensed_tree.shape[0]):
child = child_array[n]
parent = parent_array[n]
if child not in clusters:
union_find.union(parent, child)

for n in range(root_cluster):
cluster = union_find.find(n)
if cluster < root_cluster:
result[n] = NOISE
elif cluster == root_cluster:
if len(clusters) == 1 and allow_single_cluster:
if cluster_selection_epsilon != 0.0:
if hierarchy['value'][hierarchy['child'] == n] >= 1 / cluster_selection_epsilon :
result[n] = cluster_label_map[cluster]
else:
result[n] = NOISE
elif hierarchy['value'][hierarchy['child'] == n] >= \
hierarchy['value'][hierarchy['parent'] == cluster].max():
result[n] = cluster_label_map[cluster]
else:
result[n] = NOISE
label = NOISE
if cluster != root_cluster:
label = cluster_label_map[cluster]
elif len(clusters) == 1 and allow_single_cluster:
# There can only be one edge with this particular child hence this
# expression extracts a unique, scalar lambda value.
parent_lambda = lambda_array[child_array == n]
if cluster_selection_epsilon != 0.0:
threshold = 1 / cluster_selection_epsilon
else:
result[n] = NOISE
else:
result[n] = cluster_label_map[cluster]
# The threshold should be calculated per-sample based on the
# largest lambda of any sibling node.
threshold = lambda_array[parent_array == cluster].max()
if parent_lambda >= threshold:
label = cluster_label_map[cluster]

result[n] = label

return result

Expand Down Expand Up @@ -536,7 +564,7 @@ cdef cnp.intp_t traverse_upwards(
else:
return leaf #return node closest to root

parent_eps = 1 / <cnp.float64_t> cluster_tree[cluster_tree['child'] == parent]['value']
parent_eps = 1 / cluster_tree[cluster_tree['child'] == parent]['value']
if parent_eps > cluster_selection_epsilon:
return parent
else:
Expand Down Expand Up @@ -729,7 +757,7 @@ cdef tuple _get_clusters(
cluster_map = {c: n for n, c in enumerate(sorted(list(clusters)))}
reverse_cluster_map = {n: c for c, n in cluster_map.items()}

labels = do_labelling(
labels = _do_labelling(
condensed_tree,
clusters,
cluster_map,
Expand Down
77 changes: 77 additions & 0 deletions sklearn/cluster/tests/test_hdbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
from sklearn.utils import shuffle
from sklearn.utils._testing import assert_allclose, assert_array_equal
from sklearn.cluster._hdbscan.hdbscan import _OUTLIER_ENCODING
from sklearn.cluster._hdbscan._tree import (
_do_labelling,
_condense_tree,
CONDENSED_dtype,
)

n_clusters_true = 3
X, y = make_blobs(n_samples=200, random_state=10)
Expand Down Expand Up @@ -380,3 +385,75 @@ def test_hdbscan_precomputed_dense_nan():
hdb = HDBSCAN(metric="precomputed")
with pytest.raises(ValueError, match=msg):
hdb.fit(X_nan)


@pytest.mark.parametrize("allow_single_cluster", [True, False])
@pytest.mark.parametrize("epsilon", [0, 0.1])
def test_labelling_distinct(global_random_seed, allow_single_cluster, epsilon):
    """Check that `_do_labelling` assigns a consistent label per blob.

    With three well-separated blobs, every sample of a ground-truth blob
    should receive the same label from `_do_labelling`, regardless of
    `allow_single_cluster` and `cluster_selection_epsilon`.
    """
    n_samples = 48
    # Three distinct, non-overlapping centers so the clustering is
    # unambiguous.
    X, y = make_blobs(
        n_samples,
        random_state=global_random_seed,
        centers=[[0, 0], [10, 0], [0, 10]],
    )

    est = HDBSCAN().fit(X)
    condensed_tree = _condense_tree(
        est._single_linkage_tree_, min_cluster_size=est.min_cluster_size
    )
    # Nodes n_samples + 2 .. n_samples + 4 are the three cluster nodes of
    # the condensed tree for this dataset.
    cluster_label_map = {n_samples + 2: 0, n_samples + 3: 1, n_samples + 4: 2}
    labels = _do_labelling(
        condensed_tree=condensed_tree,
        clusters=set(cluster_label_map),
        cluster_label_map=cluster_label_map,
        allow_single_cluster=allow_single_cluster,
        cluster_selection_epsilon=epsilon,
    )

    # Align the ground-truth blob ids with the returned labels: the label
    # of the first sample of each blob defines the expected label for the
    # entire blob.
    y_to_labels = {
        blob: labels[np.flatnonzero(y == blob)[0]] for blob in set(y)
    }
    aligned_target = np.vectorize(y_to_labels.get)(y)
    assert_array_equal(labels, aligned_target)


def test_labelling_thresholding():
    """Check the noise threshold applied by `_do_labelling` when
    `allow_single_cluster=True`.

    With a nonzero `cluster_selection_epsilon`, samples whose lambda value
    falls below ``1 / cluster_selection_epsilon`` are marked as noise; with
    an epsilon of zero, the threshold is instead the largest lambda among
    the cluster's children (presumably per the `_tree.pyx` implementation
    under review — confirm against `_do_labelling`).
    """
    n_samples = 5
    MAX_LAMBDA = 1.5
    # Hand-crafted condensed tree: a single cluster node (5) with five leaf
    # children whose lambda values straddle both thresholds used below.
    condensed_tree = np.array(
        [
            (5, 2, MAX_LAMBDA, 1),
            (5, 1, 0.1, 1),
            (5, 0, MAX_LAMBDA, 1),
            (5, 3, 0.2, 1),
            (5, 4, 0.3, 1),
        ],
        dtype=CONDENSED_dtype,
    )
    labels = _do_labelling(
        condensed_tree=condensed_tree,
        clusters={n_samples},
        cluster_label_map={n_samples: 0, n_samples + 1: 1},
        allow_single_cluster=True,
        cluster_selection_epsilon=1,
    )
    # epsilon=1 gives a threshold of 1 / 1 == 1: lambdas below 1 are noise
    # (encoded as -1).
    num_noise = condensed_tree["value"] < 1
    assert sum(num_noise) == sum(labels == -1)

    labels = _do_labelling(
        condensed_tree=condensed_tree,
        clusters={n_samples},
        cluster_label_map={n_samples: 0, n_samples + 1: 1},
        allow_single_cluster=True,
        cluster_selection_epsilon=0,
    )
    # The threshold should be calculated per-sample based on the largest
    # lambda of any sibling node. In this case, all points are siblings
    # and the largest value is exactly MAX_LAMBDA.
    num_noise = condensed_tree["value"] < MAX_LAMBDA
    assert sum(num_noise) == sum(labels == -1)