From 1c6142955c55906669602f10ffe0c26212a0e8a2 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Thu, 24 Feb 2022 23:58:33 -0500 Subject: [PATCH 001/160] Initial addition of hdbscan --- sklearn/cluster/__init__.py | 23 + sklearn/cluster/_hdbscan/__init__.py | 0 sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx | 1451 ++++++++++++++++ sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx | 252 +++ .../_hdbscan/_hdbscan_reachability.pyx | 213 +++ sklearn/cluster/_hdbscan/_hdbscan_tree.pyx | 806 +++++++++ .../cluster/_hdbscan/_prediction_utils.pyx | 383 +++++ sklearn/cluster/_hdbscan/dist_metrics.pxd | 94 + sklearn/cluster/_hdbscan/dist_metrics.pyx | 1147 +++++++++++++ sklearn/cluster/_hdbscan/flat.py | 983 +++++++++++ sklearn/cluster/_hdbscan/hdbscan_.py | 1525 +++++++++++++++++ sklearn/cluster/_hdbscan/plots.py | 1033 +++++++++++ sklearn/cluster/_hdbscan/prediction.py | 696 ++++++++ .../_hdbscan/robust_single_linkage_.py | 463 +++++ sklearn/cluster/_hdbscan/tests/__init__.py | 0 sklearn/cluster/_hdbscan/tests/test_flat.py | 403 +++++ .../cluster/_hdbscan/tests/test_hdbscan.py | 655 +++++++ .../_hdbscan/tests/test_prediction_utils.py | 12 + sklearn/cluster/_hdbscan/tests/test_rsl.py | 209 +++ sklearn/cluster/_hdbscan/validity.py | 400 +++++ sklearn/cluster/setup.py | 39 + 21 files changed, 10787 insertions(+) create mode 100644 sklearn/cluster/_hdbscan/__init__.py create mode 100644 sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx create mode 100644 sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx create mode 100644 sklearn/cluster/_hdbscan/_hdbscan_reachability.pyx create mode 100644 sklearn/cluster/_hdbscan/_hdbscan_tree.pyx create mode 100644 sklearn/cluster/_hdbscan/_prediction_utils.pyx create mode 100644 sklearn/cluster/_hdbscan/dist_metrics.pxd create mode 100644 sklearn/cluster/_hdbscan/dist_metrics.pyx create mode 100644 sklearn/cluster/_hdbscan/flat.py create mode 100644 sklearn/cluster/_hdbscan/hdbscan_.py create mode 100644 sklearn/cluster/_hdbscan/plots.py create mode 100644 sklearn/cluster/_hdbscan/prediction.py create mode 100644 sklearn/cluster/_hdbscan/robust_single_linkage_.py create mode 100644 sklearn/cluster/_hdbscan/tests/__init__.py create mode 100644 sklearn/cluster/_hdbscan/tests/test_flat.py create mode 100644 sklearn/cluster/_hdbscan/tests/test_hdbscan.py create mode 100644 sklearn/cluster/_hdbscan/tests/test_prediction_utils.py create mode 100644 sklearn/cluster/_hdbscan/tests/test_rsl.py create mode 100644 sklearn/cluster/_hdbscan/validity.py diff --git a/sklearn/cluster/__init__.py b/sklearn/cluster/__init__.py index 58dc522cfb667..6033f589d5155 100644 --- a/sklearn/cluster/__init__.py +++ b/sklearn/cluster/__init__.py @@ -22,6 +22,15 @@ ) from ._bicluster import SpectralBiclustering, SpectralCoclustering from ._birch import Birch +from ._hdbscan.hdbscan_ import HDBSCAN, hdbscan +from ._hdbscan.robust_single_linkage_ import RobustSingleLinkage, robust_single_linkage +from ._hdbscan.validity import validity_index +from ._hdbscan.prediction import ( + approximate_predict, + membership_vector, + all_points_membership_vectors, + approximate_predict_scores, +) __all__ = [ "AffinityPropagation", @@ -49,4 +58,18 @@ "ward_tree", "SpectralBiclustering", "SpectralCoclustering", + "HDBSCAN", + "hdbscan", + "RobustSingleLinkage", + "robust_single_linkage", + "validity_index", + "approximate_predict", + "membership_vector", + "all_points_membership_vectors", + "approximate_predict_scores", + "HDBSCAN_flat", + "approximate_predict_flat", + "membership_vector_flat", + 
"all_points_membership_vectors_flat", + "safe_always_positive_division", ] diff --git a/sklearn/cluster/_hdbscan/__init__.py b/sklearn/cluster/_hdbscan/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx b/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx new file mode 100644 index 0000000000000..80b5ab0b50243 --- /dev/null +++ b/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx @@ -0,0 +1,1451 @@ +# cython: boundscheck=False +# cython: nonecheck=False +# cython: wraparound=False +# cython: initializedcheck=False +# Minimum spanning tree single linkage implementation for hdbscan +# Authors: Leland McInnes +# License: 3-clause BSD + +# Code to implement a Dual Tree Boruvka Minimimum Spanning Tree computation +# The algorithm is largely tree independent, but fine details of handling +# different tree types has resulted in separate implementations. In +# due course this should be cleaned up to remove unnecessarily duplicated +# code, but it stands for now. +# +# The core idea of the algorithm is to do repeated sweeps through the dataset, +# adding edges to the tree with each sweep until a full tree is formed. +# To do this, start with each node (or point) existing in it's own component. +# On each sweep find all the edges of minimum weight (in this instance +# of minimal mutual reachability distance) that join separate components. +# Add all these edges to the list of edges in the spanning tree, and then +# combine together all the components joined by edges. Begin the next sweep ... +# +# Eventually we end up with only one component, and all edges in we added +# form the minimum spanning tree. The key insight is that each sweep is +# essentially akin to a nearest neighbor search (with the caveat about being +# in separate components), and so can be performed very efficiently using +# a space tree such as a kdtree or ball tree. By using a dual tree formalism +# with a query tree and reference tree we can prune when all points im the +# query node are in the same component, as are all the points of the reference +# node. This allows for rapid pruning in the dual tree traversal in later +# stages. Importantly, we can construct the full tree in O(log N) sweeps +# and since each sweep has complexity equal to that of an all points +# nearest neighbor query within the tree structure we are using we end +# up with sub-quadratic complexity at worst, and in the case of cover +# trees (still to be implemented) we can achieve O(N log N) complexity! +# +# This code is based on the papers: +# +# Fast Euclidean Minimum Spanning Tree: Algorithm, analysis, and applications +# William B. March, Parikshit Ram, Alexander Gray +# Conference: Proceedings of the 16th ACM SIGKDD International Conference on +# Knowledge Discovery and Data Mining +# 2010 +# +# Tree-Independent Dual-Tree Algorithms +# Ryan R. Curtin, William B. March, Parikshit Ram, David V. Anderson, +# Alexander G. Gray, Charles L. Isbell Jr +# 2013, arXiv 1304.4327 +# +# As per the sklearn BallTree and KDTree implementations we make use of +# the rdist, which is a faster to compute notion of distance (for example +# in the euclidean case it is the distance squared). +# +# To combine together components in between sweeps we make use of +# a union find data structure. This is a separate implementation +# from that used in the labelling of the single linkage tree as +# we can perform more specific optimizations here for what +# is a simpler version of the structure. 
+ +import numpy as np +cimport numpy as np + +from libc.float cimport DBL_MAX +from libc.math cimport fabs, pow + +from sklearn.neighbors import KDTree, BallTree + +from .dist_metrics cimport DistanceMetric + +from joblib import Parallel, delayed + +cdef np.double_t INF = np.inf + + +# Define the NodeData struct used in sklearn trees for faster +# access to the node data internals in Cython. +cdef struct NodeData_t: + np.intp_t idx_start + np.intp_t idx_end + np.intp_t is_leaf + np.double_t radius + + +# Define a function giving the minimum distance between two +# nodes of a ball tree +cdef inline np.double_t balltree_min_dist_dual( + np.double_t radius1, + np.double_t radius2, + np.intp_t node1, + np.intp_t node2, + np.double_t[:, ::1] centroid_dist) nogil except -1: + + cdef np.double_t dist_pt = centroid_dist[node1, node2] + return max(0, (dist_pt - radius1 - radius2)) + + +# Define a function giving the minimum distance between two +# nodes of a kd-tree +cdef inline np.double_t kdtree_min_dist_dual( + DistanceMetric metric, + np.intp_t node1, + np.intp_t node2, + np.double_t[:, :, ::1] node_bounds, + np.intp_t num_features) except -1: + + cdef np.double_t d, d1, d2, rdist = 0.0 + cdef np.double_t zero = 0.0 + cdef np.intp_t j + + if metric.p == INF: + for j in range(num_features): + d1 = (node_bounds[0, node1, j] - + node_bounds[1, node2, j]) + d2 = (node_bounds[0, node2, j] - + node_bounds[1, node1, j]) + d = (d1 + fabs(d1)) + (d2 + fabs(d2)) + + rdist = max(rdist, 0.5 * d) + else: + # here we'll use the fact that x + abs(x) = 2 * max(x, 0) + for j in range(num_features): + d1 = (node_bounds[0, node1, j] - + node_bounds[1, node2, j]) + d2 = (node_bounds[0, node2, j] - + node_bounds[1, node1, j]) + d = (d1 + fabs(d1)) + (d2 + fabs(d2)) + + rdist += pow(0.5 * d, metric.p) + + return metric._rdist_to_dist(rdist) + + +# As above, but this time we use the rdist as per the kdtree +# implementation. This allows us to release the GIL over +# larger sections of code +cdef inline np.double_t kdtree_min_rdist_dual( + DistanceMetric metric, + np.intp_t node1, + np.intp_t node2, + np.double_t[:, :, ::1] node_bounds, + np.intp_t num_features) nogil except -1: + + cdef np.double_t d, d1, d2, rdist = 0.0 + cdef np.double_t zero = 0.0 + cdef np.intp_t j + + if metric.p == INF: + for j in range(num_features): + d1 = (node_bounds[0, node1, j] - + node_bounds[1, node2, j]) + d2 = (node_bounds[0, node2, j] - + node_bounds[1, node1, j]) + d = (d1 + fabs(d1)) + (d2 + fabs(d2)) + + rdist = max(rdist, 0.5 * d) + else: + # here we'll use the fact that x + abs(x) = 2 * max(x, 0) + for j in range(num_features): + d1 = (node_bounds[0, node1, j] - + node_bounds[1, node2, j]) + d2 = (node_bounds[0, node2, j] - + node_bounds[1, node1, j]) + d = (d1 + fabs(d1)) + (d2 + fabs(d2)) + + rdist += pow(0.5 * d, metric.p) + + return rdist + + +cdef class BoruvkaUnionFind (object): + """Efficient union find implementation. + + Parameters + ---------- + + size : int + The total size of the set of objects to + track via the union find structure. + + Attributes + ---------- + + is_component : array of bool; shape (size, 1) + Array specifying whether each element of the + set is the root node, or identifier for + a component. 
+ """ + + cdef np.ndarray _parent_arr + cdef np.intp_t[::1] _parent + cdef np.ndarray _rank_arr + cdef np.uint8_t[::1] _rank + cdef np.ndarray is_component + + def __init__(self, size): + self._parent_arr = np.arange(size, dtype=np.intp) + self._parent = ( ( + self._parent_arr.data)) + self._rank_arr = np.zeros(size, dtype=np.uint8) + self._rank = ( ( + self._rank_arr.data)) + self.is_component = np.ones(size, dtype=bool) + + cdef int union_(self, np.intp_t x, np.intp_t y) except -1: + """Union together elements x and y""" + cdef np.intp_t x_root = self.find(x) + cdef np.intp_t y_root = self.find(y) + + if x_root == y_root: + return 0 + + if self._rank[x_root] < self._rank[y_root]: + self._parent[x_root] = y_root + self.is_component[x_root] = False + elif self._rank[x_root] > self._rank[y_root]: + self._parent[y_root] = x_root + self.is_component[y_root] = False + else: + self._rank[x_root] += 1 + self._parent[y_root] = x_root + self.is_component[y_root] = False + + return 0 + + cdef np.intp_t find(self, np.intp_t x) except -1: + """Find the root or identifier for the component that x is in""" + cdef np.intp_t x_parent + cdef np.intp_t x_grandparent + + x_parent = self._parent[x] + while True: + if x_parent == x: + return x + x_grandparent = self._parent[x_parent] + self._parent[x] = x_grandparent + x = x_parent + x_parent = x_grandparent + + cdef np.ndarray[np.intp_t, ndim=1] components(self): + """Return an array of all component roots/identifiers""" + return self.is_component.nonzero()[0] + + +def _core_dist_query(tree, data, min_samples): + return tree.query(data, k=min_samples, dualtree=True, breadth_first=True) + + +cdef class KDTreeBoruvkaAlgorithm (object): + """A Dual Tree Boruvka Algorithm implemented for the sklearn + KDTree space tree implementation. + + Parameters + ---------- + + tree : KDTree + The kd-tree to run Dual Tree Boruvka over. + + min_samples : int, optional (default= 5) + The min_samples parameter of HDBSCAN used to + determine core distances. + + metric : string, optional (default='euclidean') + The metric used to compute distances for the tree + + leaf_size : int, optional (default=20) + The Boruvka algorithm benefits from a smaller leaf size than + standard kd-tree nearest neighbor searches. The tree passed in + is used for a kNN search for core distance. A second tree is + constructed with a smaller leaf size for Boruvka; this is that + leaf size. + + alpha : float, optional (default=1.0) + The alpha distance scaling parameter as per Robust Single Linkage. + + approx_min_span_tree : bool, optional (default=False) + Take shortcuts and only approximate the min spanning tree. + This is considerably faster but does not return a true + minimal spanning tree. + + n_jobs : int, optional (default=4) + The number of parallel jobs used to compute core distances. + + **kwargs : + Keyword args passed to the metric. 
+ """ + + cdef object tree + cdef object core_dist_tree + cdef DistanceMetric dist + cdef np.ndarray _data + cdef np.double_t[:, ::1] _raw_data + cdef np.double_t[:, :, ::1] node_bounds + cdef np.double_t alpha + cdef np.int8_t approx_min_span_tree + cdef np.intp_t n_jobs + cdef np.intp_t min_samples + cdef np.intp_t num_points + cdef np.intp_t num_nodes + cdef np.intp_t num_features + + cdef public np.double_t[::1] core_distance + cdef public np.double_t[::1] bounds + cdef public np.intp_t[::1] component_of_point + cdef public np.intp_t[::1] component_of_node + cdef public np.intp_t[::1] candidate_neighbor + cdef public np.intp_t[::1] candidate_point + cdef public np.double_t[::1] candidate_distance + cdef public np.double_t[:, ::1] centroid_distances + cdef public np.intp_t[::1] idx_array + cdef public NodeData_t[::1] node_data + cdef BoruvkaUnionFind component_union_find + cdef np.ndarray edges + cdef np.intp_t num_edges + + cdef np.intp_t *component_of_point_ptr + cdef np.intp_t *component_of_node_ptr + cdef np.double_t *candidate_distance_ptr + cdef np.intp_t *candidate_neighbor_ptr + cdef np.intp_t *candidate_point_ptr + cdef np.double_t *core_distance_ptr + cdef np.double_t *bounds_ptr + + cdef np.ndarray components + cdef np.ndarray core_distance_arr + cdef np.ndarray bounds_arr + cdef np.ndarray _centroid_distances_arr + cdef np.ndarray component_of_point_arr + cdef np.ndarray component_of_node_arr + cdef np.ndarray candidate_point_arr + cdef np.ndarray candidate_neighbor_arr + cdef np.ndarray candidate_distance_arr + + def __init__(self, tree, min_samples=5, metric='euclidean', leaf_size=20, + alpha=1.0, approx_min_span_tree=False, n_jobs=4, **kwargs): + + self.core_dist_tree = tree + self.tree = KDTree(tree.data, metric=metric, leaf_size=leaf_size, + **kwargs) + self._data = np.array(self.tree.data) + self._raw_data = self.tree.data + self.node_bounds = self.tree.node_bounds + self.min_samples = min_samples + self.alpha = alpha + self.approx_min_span_tree = approx_min_span_tree + self.n_jobs = n_jobs + + self.num_points = self.tree.data.shape[0] + self.num_features = self.tree.data.shape[1] + self.num_nodes = self.tree.node_data.shape[0] + + self.dist = DistanceMetric.get_metric(metric, **kwargs) + + self.components = np.arange(self.num_points) + self.bounds_arr = np.empty(self.num_nodes, np.double) + self.component_of_point_arr = np.empty(self.num_points, dtype=np.intp) + self.component_of_node_arr = np.empty(self.num_nodes, dtype=np.intp) + self.candidate_neighbor_arr = np.empty(self.num_points, dtype=np.intp) + self.candidate_point_arr = np.empty(self.num_points, dtype=np.intp) + self.candidate_distance_arr = np.empty(self.num_points, + dtype=np.double) + self.component_union_find = BoruvkaUnionFind(self.num_points) + + self.edges = np.empty((self.num_points - 1, 3)) + self.num_edges = 0 + + self.idx_array = self.tree.idx_array + self.node_data = self.tree.node_data + + self.bounds = ( ( + self.bounds_arr.data)) + self.component_of_point = ( ( + self.component_of_point_arr.data)) + self.component_of_node = ( ( + self.component_of_node_arr.data)) + self.candidate_neighbor = ( ( + self.candidate_neighbor_arr.data)) + self.candidate_point = ( ( + self.candidate_point_arr.data)) + self.candidate_distance = ( ( + self.candidate_distance_arr.data)) + + # self._centroid_distances_arr = self.dist.pairwise( + # self.tree.node_bounds[0]) + # self.centroid_distances = ( + # ( + # + # self._centroid_distances_arr.data)) + + self._initialize_components() + self._compute_bounds() + + # Set 
up fast pointer access to arrays + self.component_of_point_ptr = &self.component_of_point[0] + self.component_of_node_ptr = &self.component_of_node[0] + self.candidate_distance_ptr = ( + &self.candidate_distance[0]) + self.candidate_neighbor_ptr = &self.candidate_neighbor[0] + self.candidate_point_ptr = &self.candidate_point[0] + self.core_distance_ptr = &self.core_distance[0] + self.bounds_ptr = &self.bounds[0] + + cdef _compute_bounds(self): + """Initialize core distances""" + + cdef np.intp_t n + cdef np.intp_t i + cdef np.intp_t m + + cdef np.ndarray[np.double_t, ndim=2] knn_dist + cdef np.ndarray[np.intp_t, ndim=2] knn_indices + + # A shortcut: if we have a lot of points then we can split the points + # into four piles and query them in parallel. On multicore systems + # (most systems) this amounts to a 2x-3x wall clock improvement. + if self.tree.data.shape[0] > 16384 and self.n_jobs > 1: + split_cnt = self.num_points // self.n_jobs + datasets = [] + for i in range(self.n_jobs): + if i == self.n_jobs - 1: + datasets.append(np.asarray(self.tree.data[i*split_cnt:])) + else: + datasets.append(np.asarray(self.tree.data[i*split_cnt:(i+1)*split_cnt])) + + knn_data = Parallel(n_jobs=self.n_jobs, max_nbytes=None)( + delayed(_core_dist_query) + (self.core_dist_tree, points, + self.min_samples + 1) + for points in datasets) + knn_dist = np.vstack([x[0] for x in knn_data]) + knn_indices = np.vstack([x[1] for x in knn_data]) + else: + knn_dist, knn_indices = self.core_dist_tree.query( + self.tree.data, + k=self.min_samples + 1, + dualtree=True, + breadth_first=True) + + self.core_distance_arr = knn_dist[:, self.min_samples].copy() + self.core_distance = ( ( + self.core_distance_arr.data)) + + # Since we do everything in terms of rdist to free up the GIL + # we need to convert all the core distances beforehand + # to make comparison feasible. + for n in range(self.num_points): + self.core_distance[n] = self.dist._dist_to_rdist( + self.core_distance[n]) + + # Since we already computed NN distances for the min_samples closest + # points we can use this to do the first round of boruvka -- we won't + # get every point due to core_distance/mutual reachability distance + # issues, but we'll get quite a few, and they are the hard ones to + # get, so fill in any we can and then run update components. 
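+        # For reference, the mutual reachability distance used throughout is
+        #     mreach_k(a, b) = max(core_k(a), core_k(b), d(a, b))
+        # where core_k(x) is the distance from x to its min_samples-th
+        # nearest neighbour. In the loop below m is one of the min_samples
+        # nearest neighbours of n, so d(n, m) <= core_k(n); if in addition
+        # core_k(m) <= core_k(n), the expression collapses to core_k(n),
+        # which is why core_distance[n] itself can be recorded as the
+        # candidate edge length (alpha scaling aside).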
+ for n in range(self.num_points): + for i in range(0, self.min_samples + 1): + m = knn_indices[n, i] + if n == m: + continue + if self.core_distance[m] <= self.core_distance[n]: + self.candidate_point[n] = n + self.candidate_neighbor[n] = m + self.candidate_distance[n] = self.core_distance[n] + break + + self.update_components() + + for n in range(self.num_nodes): + self.bounds_arr[n] = DBL_MAX + + cdef _initialize_components(self): + """Initialize components of the min spanning tree (eventually there + is only one component; initially each point is its own component)""" + + cdef np.intp_t n + + for n in range(self.num_points): + self.component_of_point[n] = n + self.candidate_neighbor[n] = -1 + self.candidate_point[n] = -1 + self.candidate_distance[n] = DBL_MAX + + for n in range(self.num_nodes): + self.component_of_node[n] = -(n+1) + + cdef int update_components(self) except -1: + """Having found the nearest neighbor not in the same component for + each current component (via tree traversal), run through adding + edges to the min spanning tree and recomputing components via + union find.""" + + cdef np.intp_t source + cdef np.intp_t sink + cdef np.intp_t c + cdef np.intp_t component + cdef np.intp_t n + cdef np.intp_t i + cdef np.intp_t p + cdef np.intp_t current_component + cdef np.intp_t current_source_component + cdef np.intp_t current_sink_component + cdef np.intp_t child1 + cdef np.intp_t child2 + + cdef NodeData_t node_info + + # For each component there should be a: + # - candidate point (a point in the component) + # - candiate neighbor (the point to join with) + # - candidate_distance (the distance from point to neighbor) + # + # We will go through and and an edge to the edge list + # for each of these, and the union the two points + # together in the union find structure + + for c in range(self.components.shape[0]): + component = self.components[c] + source = self.candidate_point[component] + sink = self.candidate_neighbor[component] + if source == -1 or sink == -1: + continue + # raise ValueError('Source or sink of edge is not defined!') + current_source_component = self.component_union_find.find(source) + current_sink_component = self.component_union_find.find(sink) + if current_source_component == current_sink_component: + # We've already joined these, so ignore this edge + self.candidate_point[component] = -1 + self.candidate_neighbor[component] = -1 + self.candidate_distance[component] = DBL_MAX + continue + self.edges[self.num_edges, 0] = source + self.edges[self.num_edges, 1] = sink + self.edges[self.num_edges, 2] = self.dist._rdist_to_dist( + self.candidate_distance[component]) + self.num_edges += 1 + + self.component_union_find.union_(source, sink) + + # Reset everything,and check if we're done + self.candidate_distance[component] = DBL_MAX + if self.num_edges == self.num_points - 1: + self.components = self.component_union_find.components() + return self.components.shape[0] + + # After having joined everything in the union find data + # structure we need to go through and determine the components + # of each point for easy lookup. + # + # Have done that we then go through and set the component + # of each node, as this provides fast pruning in later + # tree traversals. 
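+        # Note on the node loop below: nodes follow the sklearn binary tree
+        # layout (children of node n are 2 * n + 1 and 2 * n + 2, so they
+        # always have larger indices than their parent), hence iterating
+        # from the last node back to the root guarantees both children have
+        # been assigned a component before their parent is inspected.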
+ for n in range(self.tree.data.shape[0]): + self.component_of_point[n] = self.component_union_find.find(n) + + for n in range(self.tree.node_data.shape[0] - 1, -1, -1): + node_info = self.node_data[n] + # Case 1: + # If the node is a leaf we need to check that every point + # in the node is of the same component + if node_info.is_leaf: + current_component = self.component_of_point[ + self.idx_array[node_info.idx_start]] + for i in range(node_info.idx_start + 1, node_info.idx_end): + p = self.idx_array[i] + if self.component_of_point[p] != current_component: + break + else: + self.component_of_node[n] = current_component + # Case 2: + # If the node is not a leaf we only need to check + # that both child nodes are in the same component + else: + child1 = 2 * n + 1 + child2 = 2 * n + 2 + if (self.component_of_node[child1] == + self.component_of_node[child2]): + self.component_of_node[n] = self.component_of_node[child1] + + # Since we're working with mutual reachability distance we often have + # ties or near ties; because of that we can benefit by not resetting + # the bounds unless we get stuck (don't join any components). Thus + # we check for that, and only reset bounds in the case where we have + # the same number of components as we did going in. This doesn't + # produce a true min spanning tree, but only and approximation + # Thus only do this if the caller is willing to accept such + if self.approx_min_span_tree: + last_num_components = self.components.shape[0] + self.components = self.component_union_find.components() + + if self.components.shape[0] == last_num_components: + # Reset bounds + for n in range(self.num_nodes): + self.bounds_arr[n] = DBL_MAX + else: + self.components = self.component_union_find.components() + + for n in range(self.num_nodes): + self.bounds_arr[n] = DBL_MAX + + return self.components.shape[0] + + cdef int dual_tree_traversal(self, np.intp_t node1, + np.intp_t node2) nogil except -1: + """Perform a dual tree traversal, pruning wherever possible, to find + the nearest neighbor not in the same component for each component. + This is akin to a standard dual tree NN search, but we also prune + whenever all points in query and reference nodes are in the same + component.""" + + cdef np.intp_t[::1] point_indices1, point_indices2 + + cdef np.intp_t i + cdef np.intp_t j + + cdef np.intp_t p + cdef np.intp_t q + + cdef np.intp_t parent + cdef np.intp_t child1 + cdef np.intp_t child2 + + cdef double node_dist + + cdef NodeData_t node1_info = self.node_data[node1] + cdef NodeData_t node2_info = self.node_data[node2] + cdef NodeData_t parent_info + cdef NodeData_t left_info + cdef NodeData_t right_info + + cdef np.intp_t component1 + cdef np.intp_t component2 + + cdef np.double_t *raw_data = ( &self._raw_data[0, 0]) + cdef np.double_t d + + cdef np.double_t mr_dist + + cdef np.double_t new_bound + cdef np.double_t new_upper_bound + cdef np.double_t new_lower_bound + cdef np.double_t bound_max + cdef np.double_t bound_min + + cdef np.intp_t left + cdef np.intp_t right + cdef np.double_t left_dist + cdef np.double_t right_dist + + # Compute the distance between the query and reference nodes + node_dist = kdtree_min_rdist_dual(self.dist, + node1, node2, self.node_bounds, + self.num_features) + + # If the distance between the nodes is less than the current bound for + # the query and the nodes are not in the same component continue; + # otherwise we get to prune this branch and return early. 
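+        # Concretely, the (node1, node2) pair is pruned (return early) when
+        # either (a) node_dist >= bounds[node1], meaning no pair of points
+        # spanning the two nodes can improve any candidate edge for the
+        # components represented in node1, or (b) every point in both nodes
+        # already belongs to one and the same component (component_of_node
+        # is >= 0 only once all points of a node share a component).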
+ if node_dist < self.bounds_ptr[node1]: + if (self.component_of_node_ptr[node1] == + self.component_of_node_ptr[node2] and + self.component_of_node_ptr[node1] >= 0): + return 0 + else: + return 0 + + # Case 1: Both nodes are leaves + # for each pair of points in node1 x node2 we need + # to compute the distance and see if it better than + # the current nearest neighbor for the component of + # the point in the query node. + # + # We get to take some shortcuts: + # - if the core distance for a point is larger than + # the distance to the nearst neighbor of the + # component of the point ... then we can't get + # a better mutual reachability distance and we + # can skip computing anything for that point + # - if the points are in the same component we + # don't have to compute the distance. + # + # We also have some catches: + # - we need to compute mutual reachability distance + # not just the ordinary distance; this involves + # fiddling with core distances. + # - We need to scale distances according to alpha, + # but don't want to lose performance in the case + # that alpha is 1.0. + # + # Finally we can compute new bounds for the query node + # based on the distances found here, so do that and + # propagate the results up the tree. + if node1_info.is_leaf and node2_info.is_leaf: + + new_upper_bound = 0.0 + new_lower_bound = DBL_MAX + + point_indices1 = self.idx_array[node1_info.idx_start: + node1_info.idx_end] + point_indices2 = self.idx_array[node2_info.idx_start: + node2_info.idx_end] + + for i in range(point_indices1.shape[0]): + + p = point_indices1[i] + component1 = self.component_of_point_ptr[p] + + if (self.core_distance_ptr[p] > + self.candidate_distance_ptr[component1]): + continue + + for j in range(point_indices2.shape[0]): + + q = point_indices2[j] + component2 = self.component_of_point_ptr[q] + + if (self.core_distance_ptr[q] > + self.candidate_distance_ptr[component1]): + continue + + if component1 != component2: + + d = self.dist.rdist(&raw_data[self.num_features * p], + &raw_data[self.num_features * q], + self.num_features) + + # mr_dist = max(distances[i, j], + # self.core_distance_ptr[p], + # self.core_distance_ptr[q]) + if self.alpha != 1.0: + mr_dist = max(d / self.alpha, + self.core_distance_ptr[p], + self.core_distance_ptr[q]) + else: + mr_dist = max(d, self.core_distance_ptr[p], + self.core_distance_ptr[q]) + if mr_dist < self.candidate_distance_ptr[component1]: + self.candidate_distance_ptr[component1] = mr_dist + self.candidate_neighbor_ptr[component1] = q + self.candidate_point_ptr[component1] = p + + new_upper_bound = max(new_upper_bound, + self.candidate_distance_ptr[component1]) + new_lower_bound = min(new_lower_bound, + self.candidate_distance_ptr[component1]) + + # Compute new bounds for the query node, and + # then propagate the results of that computation + # up the tree. 
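+            # The bound below is in the spirit of the dual-tree Boruvka
+            # paper: candidate distances within the query node are bounded
+            # by min(U, L + 2 * r), with U and L the largest and smallest
+            # candidate distances found for its points and r the node
+            # radius (any two points of the node are roughly within 2 * r
+            # of each other). Note that this kd-tree variant works in
+            # rdist space, so the radius is first passed through
+            # _dist_to_rdist.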
+ new_bound = min(new_upper_bound, + new_lower_bound + 2 * self.dist._dist_to_rdist(node1_info.radius)) + # new_bound = new_upper_bound + if new_bound < self.bounds_ptr[node1]: + self.bounds_ptr[node1] = new_bound + + # Propagate bounds up the tree + while node1 > 0: + parent = (node1 - 1) // 2 + left = 2 * parent + 1 + right = 2 * parent + 2 + + parent_info = self.node_data[parent] + left_info = self.node_data[left] + right_info = self.node_data[right] + + new_bound = max(self.bounds_ptr[left], + self.bounds_ptr[right]) + + if new_bound < self.bounds_ptr[parent]: + self.bounds_ptr[parent] = new_bound + node1 = parent + else: + break + + # Case 2a: The query node is a leaf, or is smaller than + # the reference node. + # + # We descend in the reference tree. We first + # compute distances between nodes to determine + # whether we should prioritise the left or + # right branch in the reference tree. + elif node1_info.is_leaf or (not node2_info.is_leaf and + node2_info.radius > node1_info.radius): + + left = 2 * node2 + 1 + right = 2 * node2 + 2 + + node2_info = self.node_data[left] + + left_dist = kdtree_min_rdist_dual(self.dist, + node1, left, + self.node_bounds, + self.num_features) + + node2_info = self.node_data[right] + + right_dist = kdtree_min_rdist_dual(self.dist, + node1, right, + self.node_bounds, + self.num_features) + + if left_dist < right_dist: + self.dual_tree_traversal(node1, left) + self.dual_tree_traversal(node1, right) + else: + self.dual_tree_traversal(node1, right) + self.dual_tree_traversal(node1, left) + + # Case 2b: The reference node is a leaf, or is smaller than + # the query node. + # + # We descend in the query tree. We first + # compute distances between nodes to determine + # whether we should prioritise the left or + # right branch in the query tree. + else: + left = 2 * node1 + 1 + right = 2 * node1 + 2 + + node1_info = self.node_data[left] + + left_dist = kdtree_min_rdist_dual(self.dist, + left, node2, + self.node_bounds, + self.num_features) + + node1_info = self.node_data[right] + + right_dist = kdtree_min_rdist_dual(self.dist, + right, node2, + self.node_bounds, + self.num_features) + + if left_dist < right_dist: + self.dual_tree_traversal(left, node2) + self.dual_tree_traversal(right, node2) + else: + self.dual_tree_traversal(right, node2) + self.dual_tree_traversal(left, node2) + + return 0 + + def spanning_tree(self): + """Compute the minimum spanning tree of the data held by + the tree passed in at construction""" + + # cdef np.intp_t num_components + # cdef np.intp_t num_nodes + + num_components = self.tree.data.shape[0] + num_nodes = self.tree.node_data.shape[0] + iteration = 0 + while num_components > 1: + self.dual_tree_traversal(0, 0) + num_components = self.update_components() + + return self.edges + + +cdef class BallTreeBoruvkaAlgorithm (object): + """A Dual Tree Boruvka Algorithm implemented for the sklearn + BallTree space tree implementation. + + Parameters + ---------- + + tree : BallTree + The ball-tree to run Dual Tree Boruvka over. + + min_samples : int, optional (default=5) + The min_samples parameter of HDBSCAN used to + determine core distances. + + metric : string, optional (default='euclidean') + The metric used to compute distances for the tree + + leaf_size : int, optional (default=20) + The Boruvka algorithm benefits from a smaller leaf size than + standard kd-tree nearest neighbor searches. The tree passed in + is used for a kNN search for core distance. 
A second tree is + constructed with a smaller leaf size for Boruvka; this is that + leaf size. + + alpha : float, optional (default=1.0) + The alpha distance scaling parameter as per Robust Single Linkage. + + approx_min_span_tree : bool (default False) + Take shortcuts and only approximate the min spanning tree. + This is considerably faster but does not return a true + minimal spanning tree. + + n_jobs : int, optional (default=4) + The number of parallel jobs used to compute core distances. + + **kwargs : + Keyword args passed to the metric. + """ + + cdef object tree + cdef object core_dist_tree + cdef DistanceMetric dist + cdef np.ndarray _data + cdef np.double_t[:, ::1] _raw_data + cdef np.double_t alpha + cdef np.int8_t approx_min_span_tree + cdef np.intp_t n_jobs + cdef np.intp_t min_samples + cdef np.intp_t num_points + cdef np.intp_t num_nodes + cdef np.intp_t num_features + + cdef public np.double_t[::1] core_distance + cdef public np.double_t[::1] bounds + cdef public np.intp_t[::1] component_of_point + cdef public np.intp_t[::1] component_of_node + cdef public np.intp_t[::1] candidate_neighbor + cdef public np.intp_t[::1] candidate_point + cdef public np.double_t[::1] candidate_distance + cdef public np.double_t[:, ::1] centroid_distances + cdef public np.intp_t[::1] idx_array + cdef public NodeData_t[::1] node_data + cdef BoruvkaUnionFind component_union_find + cdef np.ndarray edges + cdef np.intp_t num_edges + + cdef np.intp_t *component_of_point_ptr + cdef np.intp_t *component_of_node_ptr + cdef np.double_t *candidate_distance_ptr + cdef np.intp_t *candidate_neighbor_ptr + cdef np.intp_t *candidate_point_ptr + cdef np.double_t *core_distance_ptr + cdef np.double_t *bounds_ptr + + cdef np.ndarray components + cdef np.ndarray core_distance_arr + cdef np.ndarray bounds_arr + cdef np.ndarray _centroid_distances_arr + cdef np.ndarray component_of_point_arr + cdef np.ndarray component_of_node_arr + cdef np.ndarray candidate_point_arr + cdef np.ndarray candidate_neighbor_arr + cdef np.ndarray candidate_distance_arr + + def __init__(self, tree, min_samples=5, metric='euclidean', + alpha=1.0, leaf_size=20, approx_min_span_tree=False, n_jobs=4, + **kwargs): + + self.core_dist_tree = tree + self.tree = BallTree(tree.data, metric=metric, leaf_size=leaf_size, + **kwargs) + self._data = np.array(self.tree.data) + self._raw_data = self.tree.data + self.min_samples = min_samples + self.alpha = alpha + self.approx_min_span_tree = approx_min_span_tree + self.n_jobs = n_jobs + + self.num_points = self.tree.data.shape[0] + self.num_features = self.tree.data.shape[1] + self.num_nodes = self.tree.node_data.shape[0] + + self.dist = DistanceMetric.get_metric(metric, **kwargs) + + self.components = np.arange(self.num_points) + self.bounds_arr = np.empty(self.num_nodes, np.double) + self.component_of_point_arr = np.empty(self.num_points, dtype=np.intp) + self.component_of_node_arr = np.empty(self.num_nodes, dtype=np.intp) + self.candidate_neighbor_arr = np.empty(self.num_points, dtype=np.intp) + self.candidate_point_arr = np.empty(self.num_points, dtype=np.intp) + self.candidate_distance_arr = np.empty(self.num_points, + dtype=np.double) + self.component_union_find = BoruvkaUnionFind(self.num_points) + + self.edges = np.empty((self.num_points - 1, 3)) + self.num_edges = 0 + + self.idx_array = self.tree.idx_array + self.node_data = self.tree.node_data + + self.bounds = ( ( + self.bounds_arr.data)) + self.component_of_point = ( ( + self.component_of_point_arr.data)) + self.component_of_node = ( ( + 
self.component_of_node_arr.data)) + self.candidate_neighbor = ( ( + self.candidate_neighbor_arr.data)) + self.candidate_point = ( ( + self.candidate_point_arr.data)) + self.candidate_distance = ( ( + self.candidate_distance_arr.data)) + + self._centroid_distances_arr = self.dist.pairwise( + self.tree.node_bounds[0]) + self.centroid_distances = ( + ( + self._centroid_distances_arr.data)) + + self._initialize_components() + self._compute_bounds() + + # Set up fast pointer access to arrays + self.component_of_point_ptr = &self.component_of_point[0] + self.component_of_node_ptr = &self.component_of_node[0] + self.candidate_distance_ptr = &self.candidate_distance[0] + self.candidate_neighbor_ptr = &self.candidate_neighbor[0] + self.candidate_point_ptr = &self.candidate_point[0] + self.core_distance_ptr = &self.core_distance[0] + self.bounds_ptr = &self.bounds[0] + + cdef _compute_bounds(self): + """Initialize core distances""" + + cdef np.intp_t n + cdef np.intp_t i + cdef np.intp_t m + + cdef np.ndarray[np.double_t, ndim=2] knn_dist + cdef np.ndarray[np.intp_t, ndim=2] knn_indices + + if self.tree.data.shape[0] > 16384 and self.n_jobs > 1: + split_cnt = self.num_points // self.n_jobs + datasets = [] + for i in range(self.n_jobs): + if i == self.n_jobs - 1: + datasets.append(np.asarray(self.tree.data[i*split_cnt:])) + else: + datasets.append(np.asarray(self.tree.data[i*split_cnt:(i+1)*split_cnt])) + + knn_data = Parallel(n_jobs=self.n_jobs, max_nbytes=None)( + delayed(_core_dist_query) + (self.core_dist_tree, points, + self.min_samples + 1) + for points in datasets) + knn_dist = np.vstack([x[0] for x in knn_data]) + knn_indices = np.vstack([x[1] for x in knn_data]) + else: + knn_dist, knn_indices = self.core_dist_tree.query( + self.tree.data, + k=self.min_samples + 1, + dualtree=True, + breadth_first=True) + + self.core_distance_arr = knn_dist[:, self.min_samples].copy() + self.core_distance = ( ( + self.core_distance_arr.data)) + + # Since we already computed NN distances for the min_samples closest + # points we can use this to do the first round of boruvka -- we won't + # get every point due to core_distance/mutual reachability distance + # issues, but we'll get quite a few, and they are the hard ones to get, + # so fill in any we can and then run update components. 
+ for n in range(self.num_points): + for i in range(0, self.min_samples + 1): + m = knn_indices[n, i] + if n == m: + continue + if self.core_distance[m] <= self.core_distance[n]: + self.candidate_point[n] = n + self.candidate_neighbor[n] = m + self.candidate_distance[n] = self.core_distance[n] + break + + self.update_components() + + for n in range(self.num_nodes): + self.bounds_arr[n] = DBL_MAX + + cdef _initialize_components(self): + """Initialize components of the min spanning tree (eventually there + is only one component; initially each point is its own component)""" + + cdef np.intp_t n + + for n in range(self.num_points): + self.component_of_point[n] = n + self.candidate_neighbor[n] = -1 + self.candidate_point[n] = -1 + self.candidate_distance[n] = DBL_MAX + + for n in range(self.num_nodes): + self.component_of_node[n] = -(n+1) + + cdef update_components(self): + """Having found the nearest neighbor not in the same component for + each current component (via tree traversal), run through adding + edges to the min spanning tree and recomputing components via + union find.""" + + cdef np.intp_t source + cdef np.intp_t sink + cdef np.intp_t c + cdef np.intp_t component + cdef np.intp_t n + cdef np.intp_t i + cdef np.intp_t p + cdef np.intp_t current_component + cdef np.intp_t current_source_component + cdef np.intp_t current_sink_component + cdef np.intp_t child1 + cdef np.intp_t child2 + + cdef NodeData_t node_info + + # For each component there should be a: + # - candidate point (a point in the component) + # - candiate neighbor (the point to join with) + # - candidate_distance (the distance from point to neighbor) + # + # We will go through and and an edge to the edge list + # for each of these, and the union the two points + # together in the union find structure + + for c in range(self.components.shape[0]): + component = self.components[c] + source = self.candidate_point[component] + sink = self.candidate_neighbor[component] + if source == -1 or sink == -1: + continue + # raise ValueError('Source or sink of edge is not defined!') + current_source_component = self.component_union_find.find(source) + current_sink_component = self.component_union_find.find(sink) + if current_source_component == current_sink_component: + self.candidate_point[component] = -1 + self.candidate_neighbor[component] = -1 + self.candidate_distance[component] = DBL_MAX + continue + self.edges[self.num_edges, 0] = source + self.edges[self.num_edges, 1] = sink + self.edges[self.num_edges, 2] = self.candidate_distance[component] + self.num_edges += 1 + + self.component_union_find.union_(source, sink) + + self.candidate_distance[component] = DBL_MAX + if self.num_edges == self.num_points - 1: + self.components = self.component_union_find.components() + return self.components.shape[0] + + # After having joined everything in the union find data + # structure we need to go through and determine the components + # of each point for easy lookup. + # + # Have done that we then go through and set the component + # of each node, as this provides fast pruning in later + # tree traversals. 
+ for n in range(self.tree.data.shape[0]): + self.component_of_point[n] = self.component_union_find.find(n) + + for n in range(self.tree.node_data.shape[0] - 1, -1, -1): + node_info = self.node_data[n] + # Case 1: + # If the node is a leaf we need to check that every point + # in the node is of the same component + if node_info.is_leaf: + current_component = self.component_of_point[self.idx_array[ + node_info.idx_start]] + for i in range(node_info.idx_start + 1, node_info.idx_end): + p = self.idx_array[i] + if self.component_of_point[p] != current_component: + break + else: + self.component_of_node[n] = current_component + # Case 2: + # If the node is not a leaf we only need to check + # that both child nodes are in the same component + else: + child1 = 2 * n + 1 + child2 = 2 * n + 2 + if self.component_of_node[child1] == self.component_of_node[child2]: + self.component_of_node[n] = self.component_of_node[child1] + + # Since we're working with mutual reachability distance we often have + # ties or near ties; because of that we can benefit by not resetting the + # bounds unless we get stuck (don't join any components). Thus + # we check for that, and only reset bounds in the case where we have + # the same number of components as we did going in. This doesn't + # produce a true min spanning tree, but only and approximation + # Thus only do this if the caller is willing to accept such + if self.approx_min_span_tree: + last_num_components = self.components.shape[0] + self.components = self.component_union_find.components() + + if self.components.shape[0] == last_num_components: + # Reset bounds + for n in range(self.num_nodes): + self.bounds_arr[n] = DBL_MAX + else: + self.components = self.component_union_find.components() + + for n in range(self.num_nodes): + self.bounds_arr[n] = DBL_MAX + + return self.components.shape[0] + + cdef int dual_tree_traversal(self, np.intp_t node1, + np.intp_t node2) except -1: + """Perform a dual tree traversal, pruning wherever possible, to find + the nearest neighbor not in the same component for each component. + This is akin to a standard dual tree NN search, but we also prune + whenever all points in query and reference nodes are in the same + component.""" + + cdef np.intp_t[::1] point_indices1, point_indices2 + + cdef np.intp_t i + cdef np.intp_t j + + cdef np.intp_t p + cdef np.intp_t q + + cdef np.intp_t parent + cdef np.intp_t child1 + cdef np.intp_t child2 + + cdef double node_dist + + cdef NodeData_t node1_info = self.node_data[node1] + cdef NodeData_t node2_info = self.node_data[node2] + cdef NodeData_t parent_info + cdef NodeData_t left_info + cdef NodeData_t right_info + + cdef np.intp_t component1 + cdef np.intp_t component2 + + cdef np.double_t *raw_data = ( &self._raw_data[0, 0]) + cdef np.double_t d + + cdef np.double_t mr_dist + + cdef np.double_t new_bound + cdef np.double_t new_upper_bound + cdef np.double_t new_lower_bound + cdef np.double_t bound_max + cdef np.double_t bound_min + + cdef np.intp_t left + cdef np.intp_t right + cdef np.double_t left_dist + cdef np.double_t right_dist + + node_dist = balltree_min_dist_dual(node1_info.radius, + node2_info.radius, + node1, node2, + self.centroid_distances) + + # If the distance between the nodes is less than the current bound for + # the query and the nodes are not in the same component continue; + # otherwise we get to prune this branch and return early. 
+ if node_dist < self.bounds_ptr[node1]: + if self.component_of_node_ptr[node1] == self.component_of_node_ptr[ + node2] and self.component_of_node_ptr[node1] >= 0: + return 0 + else: + return 0 + + # Case 1: Both nodes are leaves + # for each pair of points in node1 x node2 we need + # to compute the distance and see if it better than + # the current nearest neighbor for the component of + # the point in the query node. + # + # We get to take some shortcuts: + # - if the core distance for a point is larger than + # the distance to the nearst neighbor of the + # component of the point ... then we can't get + # a better mutual reachability distance and we + # can skip computing anything for that point + # - if the points are in the same component we + # don't have to compute the distance. + # + # We also have some catches: + # - we need to compute mutual reachability distance + # not just the ordinary distance; this involves + # fiddling with core distances. + # - We need to scale distances according to alpha, + # but don't want to lose performance in the case + # that alpha is 1.0. + # + # Finally we can compute new bounds for the query node + # based on the distances found here, so do that and + # propagate the results up the tree. + if node1_info.is_leaf and node2_info.is_leaf: + + new_bound = 0.0 + + point_indices1 = self.idx_array[node1_info.idx_start: + node1_info.idx_end] + point_indices2 = self.idx_array[node2_info.idx_start: + node2_info.idx_end] + + for i in range(point_indices1.shape[0]): + + p = point_indices1[i] + component1 = self.component_of_point_ptr[p] + + if self.core_distance_ptr[p] > self.candidate_distance_ptr[ + component1]: + continue + + for j in range(point_indices2.shape[0]): + + q = point_indices2[j] + component2 = self.component_of_point_ptr[q] + + if self.core_distance_ptr[q] > self.candidate_distance_ptr[ + component1]: + continue + + if component1 != component2: + + d = self.dist.dist(&raw_data[self.num_features * p], + &raw_data[self.num_features * q], + self.num_features) * self.alpha + + if self.alpha != 1.0: + mr_dist = max(d / self.alpha, + self.core_distance_ptr[p], + self.core_distance_ptr[q]) + else: + mr_dist = max(d, self.core_distance_ptr[p], + self.core_distance_ptr[q]) + + if mr_dist < self.candidate_distance_ptr[component1]: + self.candidate_distance_ptr[component1] = mr_dist + self.candidate_neighbor_ptr[component1] = q + self.candidate_point_ptr[component1] = p + + new_upper_bound = max(new_upper_bound, + self.candidate_distance_ptr[component1]) + new_lower_bound = min(new_lower_bound, + self.candidate_distance_ptr[component1]) + + # Compute new bounds for the query node, and + # then propagate the results of that computation + # up the tree. 
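+            # Same style of bound as in the kd-tree variant, but here plain
+            # distances are used, so the 2 * radius slack is applied
+            # directly; the parent update further below also considers
+            # bounds[child] + 2 * (parent_radius - child_radius) when that
+            # quantity is positive.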
+ new_bound = min(new_upper_bound, + new_lower_bound + 2 * node1_info.radius) + if new_bound < self.bounds_ptr[node1]: + self.bounds_ptr[node1] = new_bound + + # Propagate bounds up the tree + while node1 > 0: + parent = (node1 - 1) // 2 + left = 2 * parent + 1 + right = 2 * parent + 2 + + parent_info = self.node_data[parent] + left_info = self.node_data[left] + right_info = self.node_data[right] + + bound_max = max(self.bounds_ptr[left], + self.bounds_ptr[right]) + bound_min = min(self.bounds_ptr[left] + 2 * + (parent_info.radius - left_info.radius), + self.bounds_ptr[right] + 2 * + (parent_info.radius - right_info.radius)) + + if bound_min > 0: + new_bound = min(bound_max, bound_min) + else: + new_bound = bound_max + + if new_bound < self.bounds_ptr[parent]: + self.bounds_ptr[parent] = new_bound + node1 = parent + else: + break + + # Case 2a: The query node is a leaf, or is smaller than + # the reference node. + # + # We descend in the reference tree. We first + # compute distances between nodes to determine + # whether we should prioritise the left or + # right branch in the reference tree. + elif node1_info.is_leaf or (not node2_info.is_leaf and + node2_info.radius > node1_info.radius): + + left = 2 * node2 + 1 + right = 2 * node2 + 2 + + node2_info = self.node_data[left] + + left_dist = balltree_min_dist_dual(node1_info.radius, + node2_info.radius, + node1, left, + self.centroid_distances) + + node2_info = self.node_data[right] + + right_dist = balltree_min_dist_dual(node1_info.radius, + node2_info.radius, + node1, right, + self.centroid_distances) + + if left_dist < right_dist: + self.dual_tree_traversal(node1, left) + self.dual_tree_traversal(node1, right) + else: + self.dual_tree_traversal(node1, right) + self.dual_tree_traversal(node1, left) + + # Case 2b: The reference node is a leaf, or is smaller than + # the query node. + # + # We descend in the query tree. We first + # compute distances between nodes to determine + # whether we should prioritise the left or + # right branch in the query tree. 
+ else: + left = 2 * node1 + 1 + right = 2 * node1 + 2 + + node1_info = self.node_data[left] + + left_dist = balltree_min_dist_dual(node1_info.radius, + node2_info.radius, + left, node2, + self.centroid_distances) + + node1_info = self.node_data[right] + + right_dist = balltree_min_dist_dual(node1_info.radius, + node2_info.radius, + right, node2, + self.centroid_distances) + + if left_dist < right_dist: + self.dual_tree_traversal(left, node2) + self.dual_tree_traversal(right, node2) + else: + self.dual_tree_traversal(right, node2) + self.dual_tree_traversal(left, node2) + + return 0 + + cpdef spanning_tree(self): + """Compute the minimum spanning tree of the data held by + the tree passed in at construction""" + + cdef np.intp_t num_components + cdef np.intp_t num_nodes + + num_components = self.tree.data.shape[0] + num_nodes = self.tree.node_data.shape[0] + while num_components > 1: + self.dual_tree_traversal(0, 0) + num_components = self.update_components() + + return self.edges diff --git a/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx b/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx new file mode 100644 index 0000000000000..e35470c09f38a --- /dev/null +++ b/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx @@ -0,0 +1,252 @@ +# cython: boundscheck=False +# cython: nonecheck=False +# Minimum spanning tree single linkage implementation for hdbscan +# Authors: Leland McInnes, Steve Astels +# License: 3-clause BSD + +import numpy as np +cimport numpy as np + +from libc.float cimport DBL_MAX + +from .dist_metrics cimport DistanceMetric + + +cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core( + np.ndarray[np.double_t, + ndim=2] distance_matrix): + + cdef np.ndarray[np.intp_t, ndim=1] node_labels + cdef np.ndarray[np.intp_t, ndim=1] current_labels + cdef np.ndarray[np.double_t, ndim=1] current_distances + cdef np.ndarray[np.double_t, ndim=1] left + cdef np.ndarray[np.double_t, ndim=1] right + cdef np.ndarray[np.double_t, ndim=2] result + + cdef np.ndarray label_filter + + cdef np.intp_t current_node + cdef np.intp_t new_node_index + cdef np.intp_t new_node + cdef np.intp_t i + + result = np.zeros((distance_matrix.shape[0] - 1, 3)) + node_labels = np.arange(distance_matrix.shape[0], dtype=np.intp) + current_node = 0 + current_distances = np.infty * np.ones(distance_matrix.shape[0]) + current_labels = node_labels + for i in range(1, node_labels.shape[0]): + label_filter = current_labels != current_node + current_labels = current_labels[label_filter] + left = current_distances[label_filter] + right = distance_matrix[current_node][current_labels] + current_distances = np.where(left < right, left, right) + + new_node_index = np.argmin(current_distances) + new_node = current_labels[new_node_index] + result[i - 1, 0] = current_node + result[i - 1, 1] = new_node + result[i - 1, 2] = current_distances[new_node_index] + current_node = new_node + + return result + + +cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core_vector( + np.ndarray[np.double_t, ndim=2, mode='c'] raw_data, + np.ndarray[np.double_t, ndim=1, mode='c'] core_distances, + DistanceMetric dist_metric, + np.double_t alpha=1.0): + + # Add a comment + cdef np.ndarray[np.double_t, ndim=1] current_distances_arr + cdef np.ndarray[np.double_t, ndim=1] current_sources_arr + cdef np.ndarray[np.int8_t, ndim=1] in_tree_arr + cdef np.ndarray[np.double_t, ndim=2] result_arr + + cdef np.double_t * current_distances + cdef np.double_t * current_sources + cdef np.double_t * current_core_distances + cdef np.double_t * raw_data_ptr + cdef np.int8_t * 
in_tree + cdef np.double_t[:, ::1] raw_data_view + cdef np.double_t[:, ::1] result + + cdef np.ndarray label_filter + + cdef np.intp_t current_node + cdef np.intp_t source_node + cdef np.intp_t right_node + cdef np.intp_t left_node + cdef np.intp_t new_node + cdef np.intp_t i + cdef np.intp_t j + cdef np.intp_t dim + cdef np.intp_t num_features + + cdef double current_node_core_distance + cdef double right_value + cdef double left_value + cdef double core_value + cdef double new_distance + + dim = raw_data.shape[0] + num_features = raw_data.shape[1] + + raw_data_view = ( ( + raw_data.data)) + raw_data_ptr = ( &raw_data_view[0, 0]) + + result_arr = np.zeros((dim - 1, 3)) + in_tree_arr = np.zeros(dim, dtype=np.int8) + current_node = 0 + current_distances_arr = np.infty * np.ones(dim) + current_sources_arr = np.ones(dim) + + result = ( ( result_arr.data)) + in_tree = ( in_tree_arr.data) + current_distances = ( current_distances_arr.data) + current_sources = ( current_sources_arr.data) + current_core_distances = ( core_distances.data) + + for i in range(1, dim): + + in_tree[current_node] = 1 + + current_node_core_distance = current_core_distances[current_node] + + new_distance = DBL_MAX + source_node = 0 + new_node = 0 + + for j in range(dim): + if in_tree[j]: + continue + + right_value = current_distances[j] + right_source = current_sources[j] + + left_value = dist_metric.dist(&raw_data_ptr[num_features * + current_node], + &raw_data_ptr[num_features * j], + num_features) + left_source = current_node + + if alpha != 1.0: + left_value /= alpha + + core_value = core_distances[j] + if (current_node_core_distance > right_value or + core_value > right_value or + left_value > right_value): + if right_value < new_distance: + new_distance = right_value + source_node = right_source + new_node = j + continue + + if core_value > current_node_core_distance: + if core_value > left_value: + left_value = core_value + else: + if current_node_core_distance > left_value: + left_value = current_node_core_distance + + if left_value < right_value: + current_distances[j] = left_value + current_sources[j] = left_source + if left_value < new_distance: + new_distance = left_value + source_node = left_source + new_node = j + else: + if right_value < new_distance: + new_distance = right_value + source_node = right_source + new_node = j + + result[i - 1, 0] = source_node + result[i - 1, 1] = new_node + result[i - 1, 2] = new_distance + current_node = new_node + + return result_arr + + +cdef class UnionFind (object): + + cdef np.ndarray parent_arr + cdef np.ndarray size_arr + cdef np.intp_t next_label + cdef np.intp_t *parent + cdef np.intp_t *size + + def __init__(self, N): + self.parent_arr = -1 * np.ones(2 * N - 1, dtype=np.intp, order='C') + self.next_label = N + self.size_arr = np.hstack((np.ones(N, dtype=np.intp), + np.zeros(N-1, dtype=np.intp))) + self.parent = ( self.parent_arr.data) + self.size = ( self.size_arr.data) + + cdef void union(self, np.intp_t m, np.intp_t n): + self.size[self.next_label] = self.size[m] + self.size[n] + self.parent[m] = self.next_label + self.parent[n] = self.next_label + self.size[self.next_label] = self.size[m] + self.size[n] + self.next_label += 1 + + return + + cdef np.intp_t fast_find(self, np.intp_t n): + cdef np.intp_t p + p = n + while self.parent_arr[n] != -1: + n = self.parent_arr[n] + # label up to the root + while self.parent_arr[p] != n: + p, self.parent_arr[p] = self.parent_arr[p], n + return n + + +cpdef np.ndarray[np.double_t, ndim=2] label(np.ndarray[np.double_t, 
ndim=2] L): + + cdef np.ndarray[np.double_t, ndim=2] result_arr + cdef np.double_t[:, ::1] result + + cdef np.intp_t N, a, aa, b, bb, index + cdef np.double_t delta + + result_arr = np.zeros((L.shape[0], L.shape[1] + 1)) + result = ( ( + result_arr.data)) + N = L.shape[0] + 1 + U = UnionFind(N) + + for index in range(L.shape[0]): + + a = L[index, 0] + b = L[index, 1] + delta = L[index, 2] + + aa, bb = U.fast_find(a), U.fast_find(b) + + result[index][0] = aa + result[index][1] = bb + result[index][2] = delta + result[index][3] = U.size[aa] + U.size[bb] + + U.union(aa, bb) + + return result_arr + + +cpdef np.ndarray[np.double_t, ndim=2] single_linkage(distance_matrix): + + cdef np.ndarray[np.double_t, ndim=2] hierarchy + cdef np.ndarray[np.double_t, ndim=2] for_labelling + + hierarchy = mst_linkage_core(distance_matrix) + for_labelling = hierarchy[np.argsort(hierarchy.T[2]), :] + + return label(for_labelling) diff --git a/sklearn/cluster/_hdbscan/_hdbscan_reachability.pyx b/sklearn/cluster/_hdbscan/_hdbscan_reachability.pyx new file mode 100644 index 0000000000000..2863dc8af4dca --- /dev/null +++ b/sklearn/cluster/_hdbscan/_hdbscan_reachability.pyx @@ -0,0 +1,213 @@ +# cython: boundscheck=False +# cython: nonecheck=False +# cython: initializedcheck=False +# mutual reachability distance compiutations +# Authors: Leland McInnes +# License: 3-clause BSD + +import numpy as np +cimport numpy as np + +from scipy.spatial.distance import pdist, squareform +from scipy.sparse import lil_matrix as sparse_matrix +from sklearn.neighbors import KDTree, BallTree +import gc + + +def mutual_reachability(distance_matrix, min_points=5, alpha=1.0): + """Compute the weighted adjacency matrix of the mutual reachability + graph of a distance matrix. + + Parameters + ---------- + distance_matrix : ndarray, shape (n_samples, n_samples) + Array of distances between samples. + + min_points : int, optional (default=5) + The number of points in a neighbourhood for a point to be considered + a core point. + + Returns + ------- + mututal_reachability: ndarray, shape (n_samples, n_samples) + Weighted adjacency matrix of the mutual reachability graph. + + References + ---------- + .. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April). + Density-based clustering based on hierarchical density estimates. + In Pacific-Asia Conference on Knowledge Discovery and Data Mining + (pp. 160-172). Springer Berlin Heidelberg. 
+ """ + size = distance_matrix.shape[0] + min_points = min(size - 1, min_points) + try: + core_distances = np.partition(distance_matrix, + min_points, + axis=0)[min_points] + except AttributeError: + core_distances = np.sort(distance_matrix, + axis=0)[min_points] + + if alpha != 1.0: + distance_matrix = distance_matrix / alpha + + stage1 = np.where(core_distances > distance_matrix, + core_distances, distance_matrix) + result = np.where(core_distances > stage1.T, + core_distances.T, stage1.T).T + return result + + +cpdef sparse_mutual_reachability(object lil_matrix, np.intp_t min_points=5, + float alpha=1.0, float max_dist=0.): + + cdef np.intp_t i + cdef np.intp_t j + cdef np.intp_t n + cdef np.double_t mr_dist + cdef list sorted_row_data + cdef np.ndarray[dtype=np.double_t, ndim=1] core_distance + cdef np.ndarray[dtype=np.int32_t, ndim=1] nz_row_data + cdef np.ndarray[dtype=np.int32_t, ndim=1] nz_col_data + + result = sparse_matrix(lil_matrix.shape) + core_distance = np.empty(lil_matrix.shape[0], dtype=np.double) + + for i in range(lil_matrix.shape[0]): + sorted_row_data = sorted(lil_matrix.data[i]) + if min_points - 1 < len(sorted_row_data): + core_distance[i] = sorted_row_data[min_points - 1] + else: + core_distance[i] = np.infty + + if alpha != 1.0: + lil_matrix = lil_matrix / alpha + + nz_row_data, nz_col_data = lil_matrix.nonzero() + + for n in range(nz_row_data.shape[0]): + i = nz_row_data[n] + j = nz_col_data[n] + + mr_dist = max(core_distance[i], core_distance[j], lil_matrix[i, j]) + if np.isfinite(mr_dist): + result[i, j] = mr_dist + elif max_dist > 0: + result[i, j] = max_dist + + return result.tocsr() + + +def kdtree_mutual_reachability(X, distance_matrix, metric, p=2, min_points=5, + alpha=1.0, **kwargs): + dim = distance_matrix.shape[0] + min_points = min(dim - 1, min_points) + + if metric == 'minkowski': + tree = KDTree(X, metric=metric, p=p) + else: + tree = KDTree(X, metric=metric, **kwargs) + + core_distances = tree.query(X, k=min_points)[0][:, -1] + + if alpha != 1.0: + distance_matrix = distance_matrix / alpha + + stage1 = np.where(core_distances > distance_matrix, + core_distances, distance_matrix) + result = np.where(core_distances > stage1.T, + core_distances.T, stage1.T).T + return result + + +def balltree_mutual_reachability(X, distance_matrix, metric, p=2, min_points=5, + alpha=1.0, **kwargs): + dim = distance_matrix.shape[0] + min_points = min(dim - 1, min_points) + + tree = BallTree(X, metric=metric, **kwargs) + + core_distances = tree.query(X, k=min_points)[0][:, -1] + + if alpha != 1.0: + distance_matrix = distance_matrix / alpha + + stage1 = np.where(core_distances > distance_matrix, + core_distances, distance_matrix) + result = np.where(core_distances > stage1.T, + core_distances.T, stage1.T).T + return result + + +cdef np.ndarray[np.double_t, ndim=1] mutual_reachability_from_pdist( + np.ndarray[np.double_t, ndim=1] core_distances, + np.ndarray[np.double_t, ndim=1] dists, np.intp_t dim): + + cdef np.intp_t i + cdef np.intp_t j + cdef np.intp_t result_pos + + result_pos = 0 + for i in range(dim): + for j in range(i + 1, dim): + if core_distances[i] > core_distances[j]: + if core_distances[i] > dists[result_pos]: + dists[result_pos] = core_distances[i] + + else: + if core_distances[j] > dists[result_pos]: + dists[result_pos] = core_distances[j] + + result_pos += 1 + + return dists + + +def kdtree_pdist_mutual_reachability(X, metric, p=2, min_points=5, alpha=1.0, + **kwargs): + + dim = X.shape[0] + min_points = min(dim - 1, min_points) + + if metric == 
'minkowski': + tree = KDTree(X, metric=metric, p=p) + else: + tree = KDTree(X, metric=metric, **kwargs) + + core_distances = tree.query(X, k=min_points)[0][:, -1] + + del tree + gc.collect() + + dists = pdist(X, metric=metric, p=p, **kwargs) + + if alpha != 1.0: + dists /= alpha + + dists = mutual_reachability_from_pdist(core_distances, dists, dim) + + return dists + + +def balltree_pdist_mutual_reachability(X, metric, p=2, min_points=5, alpha=1.0, + **kwargs): + + dim = X.shape[0] + min_points = min(dim - 1, min_points) + + tree = BallTree(X, metric=metric, **kwargs) + + core_distances = tree.query(X, k=min_points)[0][:, -1] + + del tree + gc.collect() + + dists = pdist(X, metric=metric, p=p, **kwargs) + + if alpha != 1.0: + dists /= alpha + + dists = mutual_reachability_from_pdist(core_distances, dists, dim) + + return dists diff --git a/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx b/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx new file mode 100644 index 0000000000000..ca788a8f995e4 --- /dev/null +++ b/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx @@ -0,0 +1,806 @@ +# cython: boundscheck=False +# cython: nonecheck=False +# cython: initializedcheck=False +# Tree handling (condensing, finding stable clusters) for hdbscan +# Authors: Leland McInnes +# License: 3-clause BSD + +import numpy as np +cimport numpy as np + +cdef np.double_t INFTY = np.inf + + +cdef list bfs_from_hierarchy(np.ndarray[np.double_t, ndim=2] hierarchy, + np.intp_t bfs_root): + """ + Perform a breadth first search on a tree in scipy hclust format. + """ + + cdef list to_process + cdef np.intp_t max_node + cdef np.intp_t num_points + cdef np.intp_t dim + + dim = hierarchy.shape[0] + max_node = 2 * dim + num_points = max_node - dim + 1 + + to_process = [bfs_root] + result = [] + + while to_process: + result.extend(to_process) + to_process = [x - num_points for x in + to_process if x >= num_points] + if to_process: + to_process = hierarchy[to_process, + :2].flatten().astype(np.intp).tolist() + + return result + + +cpdef np.ndarray condense_tree(np.ndarray[np.double_t, ndim=2] hierarchy, + np.intp_t min_cluster_size=10): + """Condense a tree according to a minimum cluster size. This is akin + to the runt pruning procedure of Stuetzle. The result is a much simpler + tree that is easier to visualize. We include extra information on the + lambda value at which individual points depart clusters for later + analysis and computation. + + Parameters + ---------- + hierarchy : ndarray (n_samples, 4) + A single linkage hierarchy in scipy.cluster.hierarchy format. + + min_cluster_size : int, optional (default 10) + The minimum size of clusters to consider. Smaller "runt" + clusters are pruned from the tree. + + Returns + ------- + condensed_tree : numpy recarray + Effectively an edgelist with a parent, child, lambda_val + and child_size in each row providing a tree structure. 
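+
+ Notes
+ -----
+ The returned recarray has fields ``parent``, ``child``, ``lambda_val`` and
+ ``child_size``. A minimal access sketch (``slt`` is assumed to be a single
+ linkage hierarchy in scipy.cluster.hierarchy format)::
+
+     condensed = condense_tree(slt, min_cluster_size=10)
+     cluster_rows = condensed[condensed['child_size'] > 1]
+     point_rows = condensed[condensed['child_size'] == 1]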
+ """ + + cdef np.intp_t root + cdef np.intp_t num_points + cdef np.intp_t next_label + cdef list node_list + cdef list result_list + + cdef np.ndarray[np.intp_t, ndim=1] relabel + cdef np.ndarray[np.int_t, ndim=1] ignore + cdef np.ndarray[np.double_t, ndim=1] children + + cdef np.intp_t node + cdef np.intp_t sub_node + cdef np.intp_t left + cdef np.intp_t right + cdef double lambda_value + cdef np.intp_t left_count + cdef np.intp_t right_count + + root = 2 * hierarchy.shape[0] + num_points = root // 2 + 1 + next_label = num_points + 1 + + node_list = bfs_from_hierarchy(hierarchy, root) + + relabel = np.empty(root + 1, dtype=np.intp) + relabel[root] = num_points + result_list = [] + ignore = np.zeros(len(node_list), dtype=int) + + for node in node_list: + if ignore[node] or node < num_points: + continue + + children = hierarchy[node - num_points] + left = children[0] + right = children[1] + if children[2] > 0.0: + lambda_value = 1.0 / children[2] + else: + lambda_value = INFTY + + if left >= num_points: + left_count = hierarchy[left - num_points][3] + else: + left_count = 1 + + if right >= num_points: + right_count = hierarchy[right - num_points][3] + else: + right_count = 1 + + if left_count >= min_cluster_size and right_count >= min_cluster_size: + relabel[left] = next_label + next_label += 1 + result_list.append((relabel[node], relabel[left], lambda_value, + left_count)) + + relabel[right] = next_label + next_label += 1 + result_list.append((relabel[node], relabel[right], lambda_value, + right_count)) + + elif left_count < min_cluster_size and right_count < min_cluster_size: + for sub_node in bfs_from_hierarchy(hierarchy, left): + if sub_node < num_points: + result_list.append((relabel[node], sub_node, + lambda_value, 1)) + ignore[sub_node] = True + + for sub_node in bfs_from_hierarchy(hierarchy, right): + if sub_node < num_points: + result_list.append((relabel[node], sub_node, + lambda_value, 1)) + ignore[sub_node] = True + + elif left_count < min_cluster_size: + relabel[right] = relabel[node] + for sub_node in bfs_from_hierarchy(hierarchy, left): + if sub_node < num_points: + result_list.append((relabel[node], sub_node, + lambda_value, 1)) + ignore[sub_node] = True + + else: + relabel[left] = relabel[node] + for sub_node in bfs_from_hierarchy(hierarchy, right): + if sub_node < num_points: + result_list.append((relabel[node], sub_node, + lambda_value, 1)) + ignore[sub_node] = True + + return np.array(result_list, dtype=[('parent', np.intp), + ('child', np.intp), + ('lambda_val', float), + ('child_size', np.intp)]) + + +cpdef dict compute_stability(np.ndarray condensed_tree): + + cdef np.ndarray[np.double_t, ndim=1] result_arr + cdef np.ndarray sorted_child_data + cdef np.ndarray[np.intp_t, ndim=1] sorted_children + cdef np.ndarray[np.double_t, ndim=1] sorted_lambdas + + cdef np.ndarray[np.intp_t, ndim=1] parents + cdef np.ndarray[np.intp_t, ndim=1] sizes + cdef np.ndarray[np.double_t, ndim=1] lambdas + + cdef np.intp_t child + cdef np.intp_t parent + cdef np.intp_t child_size + cdef np.intp_t result_index + cdef np.intp_t current_child + cdef np.float64_t lambda_ + cdef np.float64_t min_lambda + + cdef np.ndarray[np.double_t, ndim=1] births_arr + cdef np.double_t *births + + cdef np.intp_t largest_child = condensed_tree['child'].max() + cdef np.intp_t smallest_cluster = condensed_tree['parent'].min() + cdef np.intp_t num_clusters = (condensed_tree['parent'].max() - + smallest_cluster + 1) + + if largest_child < smallest_cluster: + largest_child = smallest_cluster + + sorted_child_data 
= np.sort(condensed_tree[['child', 'lambda_val']], + axis=0) + births_arr = np.nan * np.ones(largest_child + 1, dtype=np.double) + births = ( births_arr.data) + sorted_children = sorted_child_data['child'].copy() + sorted_lambdas = sorted_child_data['lambda_val'].copy() + + parents = condensed_tree['parent'] + sizes = condensed_tree['child_size'] + lambdas = condensed_tree['lambda_val'] + + current_child = -1 + min_lambda = 0 + + for row in range(sorted_child_data.shape[0]): + child = sorted_children[row] + lambda_ = sorted_lambdas[row] + + if child == current_child: + min_lambda = min(min_lambda, lambda_) + elif current_child != -1: + births[current_child] = min_lambda + current_child = child + min_lambda = lambda_ + else: + # Initialize + current_child = child + min_lambda = lambda_ + + if current_child != -1: + births[current_child] = min_lambda + births[smallest_cluster] = 0.0 + + result_arr = np.zeros(num_clusters, dtype=np.double) + + for i in range(condensed_tree.shape[0]): + parent = parents[i] + lambda_ = lambdas[i] + child_size = sizes[i] + result_index = parent - smallest_cluster + + result_arr[result_index] += (lambda_ - births[parent]) * child_size + + result_pre_dict = np.vstack((np.arange(smallest_cluster, + condensed_tree['parent'].max() + 1), + result_arr)).T + + return dict(result_pre_dict) + + +cdef list bfs_from_cluster_tree(np.ndarray tree, np.intp_t bfs_root): + + cdef list result + cdef np.ndarray[np.intp_t, ndim=1] to_process + + result = [] + to_process = np.array([bfs_root], dtype=np.intp) + + while to_process.shape[0] > 0: + result.extend(to_process.tolist()) + to_process = tree['child'][np.in1d(tree['parent'], to_process)] + + return result + + +cdef max_lambdas(np.ndarray tree): + + cdef np.ndarray sorted_parent_data + cdef np.ndarray[np.intp_t, ndim=1] sorted_parents + cdef np.ndarray[np.double_t, ndim=1] sorted_lambdas + + cdef np.intp_t parent + cdef np.intp_t current_parent + cdef np.float64_t lambda_ + cdef np.float64_t max_lambda + + cdef np.ndarray[np.double_t, ndim=1] deaths_arr + cdef np.double_t *deaths + + cdef np.intp_t largest_parent = tree['parent'].max() + + sorted_parent_data = np.sort(tree[['parent', 'lambda_val']], axis=0) + deaths_arr = np.zeros(largest_parent + 1, dtype=np.double) + deaths = ( deaths_arr.data) + sorted_parents = sorted_parent_data['parent'] + sorted_lambdas = sorted_parent_data['lambda_val'] + + current_parent = -1 + max_lambda = 0 + + for row in range(sorted_parent_data.shape[0]): + parent = sorted_parents[row] + lambda_ = sorted_lambdas[row] + + if parent == current_parent: + max_lambda = max(max_lambda, lambda_) + elif current_parent != -1: + deaths[current_parent] = max_lambda + current_parent = parent + max_lambda = lambda_ + else: + # Initialize + current_parent = parent + max_lambda = lambda_ + + deaths[current_parent] = max_lambda # value for last parent + + return deaths_arr + + +cdef class TreeUnionFind (object): + + cdef np.ndarray _data_arr + cdef np.intp_t[:, ::1] _data + cdef np.ndarray is_component + + def __init__(self, size): + self._data_arr = np.zeros((size, 2), dtype=np.intp) + self._data_arr.T[0] = np.arange(size) + self._data = ( ( + self._data_arr.data)) + self.is_component = np.ones(size, dtype=bool) + + cdef union_(self, np.intp_t x, np.intp_t y): + cdef np.intp_t x_root = self.find(x) + cdef np.intp_t y_root = self.find(y) + + if self._data[x_root, 1] < self._data[y_root, 1]: + self._data[x_root, 0] = y_root + elif self._data[x_root, 1] > self._data[y_root, 1]: + self._data[y_root, 0] = x_root + 
else: + self._data[y_root, 0] = x_root + self._data[x_root, 1] += 1 + + return + + cdef find(self, np.intp_t x): + if self._data[x, 0] != x: + self._data[x, 0] = self.find(self._data[x, 0]) + self.is_component[x] = False + return self._data[x, 0] + + cdef np.ndarray[np.intp_t, ndim=1] components(self): + return self.is_component.nonzero()[0] + + +cpdef np.ndarray[np.intp_t, ndim=1] labelling_at_cut( + np.ndarray linkage, + np.double_t cut, + np.intp_t min_cluster_size): + """Given a single linkage tree and a cut value, return the + vector of cluster labels at that cut value. This is useful + for Robust Single Linkage, and extracting DBSCAN results + from a single HDBSCAN run. + + Parameters + ---------- + linkage : ndarray (n_samples, 4) + The single linkage tree in scipy.cluster.hierarchy format. + + cut : double + The cut value at which to find clusters. + + min_cluster_size : int + The minimum cluster size; clusters below this size at + the cut will be considered noise. + + Returns + ------- + labels : ndarray (n_samples,) + The cluster labels for each point in the data set; + a label of -1 denotes a noise assignment. + """ + + cdef np.intp_t root + cdef np.intp_t num_points + cdef np.ndarray[np.intp_t, ndim=1] result_arr + cdef np.ndarray[np.intp_t, ndim=1] unique_labels + cdef np.ndarray[np.intp_t, ndim=1] cluster_size + cdef np.intp_t *result + cdef TreeUnionFind union_find + cdef np.intp_t n + cdef np.intp_t cluster + cdef np.intp_t cluster_id + + root = 2 * linkage.shape[0] + num_points = root // 2 + 1 + + result_arr = np.empty(num_points, dtype=np.intp) + result = ( result_arr.data) + + union_find = TreeUnionFind( root + 1) + + cluster = num_points + for row in linkage: + if row[2] < cut: + union_find.union_( row[0], cluster) + union_find.union_( row[1], cluster) + cluster += 1 + + cluster_size = np.zeros(cluster, dtype=np.intp) + for n in range(num_points): + cluster = union_find.find(n) + cluster_size[cluster] += 1 + result[n] = cluster + + cluster_label_map = {-1: -1} + cluster_label = 0 + unique_labels = np.unique(result_arr) + + for cluster in unique_labels: + if cluster_size[cluster] < min_cluster_size: + cluster_label_map[cluster] = -1 + else: + cluster_label_map[cluster] = cluster_label + cluster_label += 1 + + for n in range(num_points): + result[n] = cluster_label_map[result[n]] + + return result_arr + + +cdef np.ndarray[np.intp_t, ndim=1] do_labelling( + np.ndarray tree, + set clusters, + dict cluster_label_map, + np.intp_t allow_single_cluster, + np.double_t cluster_selection_epsilon, + np.intp_t match_reference_implementation): + + cdef np.intp_t root_cluster + cdef np.ndarray[np.intp_t, ndim=1] result_arr + cdef np.ndarray[np.intp_t, ndim=1] parent_array + cdef np.ndarray[np.intp_t, ndim=1] child_array + cdef np.ndarray[np.double_t, ndim=1] lambda_array + cdef np.intp_t *result + cdef TreeUnionFind union_find + cdef np.intp_t parent + cdef np.intp_t child + cdef np.intp_t n + cdef np.intp_t cluster + + child_array = tree['child'] + parent_array = tree['parent'] + lambda_array = tree['lambda_val'] + + root_cluster = parent_array.min() + result_arr = np.empty(root_cluster, dtype=np.intp) + result = ( result_arr.data) + + union_find = TreeUnionFind(parent_array.max() + 1) + + for n in range(tree.shape[0]): + child = child_array[n] + parent = parent_array[n] + if child not in clusters: + union_find.union_(parent, child) + + for n in range(root_cluster): + cluster = union_find.find(n) + if cluster < root_cluster: + result[n] = -1 + elif cluster == root_cluster: + if 
len(clusters) == 1 and allow_single_cluster: + if cluster_selection_epsilon != 0.0: + if tree['lambda_val'][tree['child'] == n] >= 1 / cluster_selection_epsilon : + result[n] = cluster_label_map[cluster] + else: + result[n] = -1 + elif tree['lambda_val'][tree['child'] == n] >= \ + tree['lambda_val'][tree['parent'] == cluster].max(): + result[n] = cluster_label_map[cluster] + else: + result[n] = -1 + else: + result[n] = -1 + else: + if match_reference_implementation: + point_lambda = lambda_array[child_array == n][0] + cluster_lambda = lambda_array[child_array == cluster][0] + if point_lambda > cluster_lambda: + result[n] = cluster_label_map[cluster] + else: + result[n] = -1 + else: + result[n] = cluster_label_map[cluster] + + return result_arr + + +cdef get_probabilities(np.ndarray tree, dict cluster_map, np.ndarray labels): + + cdef np.ndarray[np.double_t, ndim=1] result + cdef np.ndarray[np.double_t, ndim=1] deaths + cdef np.ndarray[np.double_t, ndim=1] lambda_array + cdef np.ndarray[np.intp_t, ndim=1] child_array + cdef np.ndarray[np.intp_t, ndim=1] parent_array + cdef np.intp_t root_cluster + cdef np.intp_t n + cdef np.intp_t point + cdef np.intp_t cluster_num + cdef np.intp_t cluster + cdef np.double_t max_lambda + cdef np.double_t lambda_ + + child_array = tree['child'] + parent_array = tree['parent'] + lambda_array = tree['lambda_val'] + + result = np.zeros(labels.shape[0]) + deaths = max_lambdas(tree) + root_cluster = parent_array.min() + + for n in range(tree.shape[0]): + point = child_array[n] + if point >= root_cluster: + continue + + cluster_num = labels[point] + + if cluster_num == -1: + continue + + cluster = cluster_map[cluster_num] + max_lambda = deaths[cluster] + if max_lambda == 0.0 or not np.isfinite(lambda_array[n]): + result[point] = 1.0 + else: + lambda_ = min(lambda_array[n], max_lambda) + result[point] = lambda_ / max_lambda + + return result + + +cpdef np.ndarray[np.double_t, ndim=1] outlier_scores(np.ndarray tree): + """Generate GLOSH outlier scores from a condensed tree. + + Parameters + ---------- + tree : numpy recarray + The condensed tree to generate GLOSH outlier scores from + + Returns + ------- + outlier_scores : ndarray (n_samples,) + Outlier scores for each sample point. The larger the score + the more outlying the point. 
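+
+ Notes
+ -----
+ For a point that departs its cluster at ``lambda_p``, where that cluster
+ (after propagating maxima up the tree) finally dies at ``lambda_max``, the
+ score computed below is ``(lambda_max - lambda_p) / lambda_max``; it lies
+ in [0, 1], with larger values for points that leave much earlier than
+ their cluster dies.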
+ """ + + cdef np.ndarray[np.double_t, ndim=1] result + cdef np.ndarray[np.double_t, ndim=1] deaths + cdef np.ndarray[np.double_t, ndim=1] lambda_array + cdef np.ndarray[np.intp_t, ndim=1] child_array + cdef np.ndarray[np.intp_t, ndim=1] parent_array + cdef np.intp_t root_cluster + cdef np.intp_t point + cdef np.intp_t parent + cdef np.intp_t cluster + cdef np.double_t lambda_max + + child_array = tree['child'] + parent_array = tree['parent'] + lambda_array = tree['lambda_val'] + + deaths = max_lambdas(tree) + root_cluster = parent_array.min() + result = np.zeros(root_cluster, dtype=np.double) + + topological_sort_order = np.argsort(parent_array) + # topologically_sorted_tree = tree[topological_sort_order] + + for n in topological_sort_order: + cluster = child_array[n] + if cluster < root_cluster: + break + + parent = parent_array[n] + if deaths[cluster] > deaths[parent]: + deaths[parent] = deaths[cluster] + + for n in range(tree.shape[0]): + point = child_array[n] + if point >= root_cluster: + continue + + cluster = parent_array[n] + lambda_max = deaths[cluster] + + + if lambda_max == 0.0 or not np.isfinite(lambda_array[n]): + result[point] = 0.0 + else: + result[point] = (lambda_max - lambda_array[n]) / lambda_max + + return result + + +cpdef np.ndarray get_stability_scores(np.ndarray labels, set clusters, + dict stability, np.double_t max_lambda): + + cdef np.intp_t cluster_size + cdef np.intp_t n + + result = np.empty(len(clusters), dtype=np.double) + for n, c in enumerate(sorted(list(clusters))): + cluster_size = np.sum(labels == n) + if np.isinf(max_lambda) or max_lambda == 0.0 or cluster_size == 0: + result[n] = 1.0 + else: + result[n] = stability[c] / (cluster_size * max_lambda) + + return result + +cpdef list recurse_leaf_dfs(np.ndarray cluster_tree, np.intp_t current_node): + children = cluster_tree[cluster_tree['parent'] == current_node]['child'] + if len(children) == 0: + return [current_node,] + else: + return sum([recurse_leaf_dfs(cluster_tree, child) for child in children], []) + + +cpdef list get_cluster_tree_leaves(np.ndarray cluster_tree): + if cluster_tree.shape[0] == 0: + return [] + root = cluster_tree['parent'].min() + return recurse_leaf_dfs(cluster_tree, root) + +cpdef np.intp_t traverse_upwards(np.ndarray cluster_tree, np.double_t cluster_selection_epsilon, np.intp_t leaf, np.intp_t allow_single_cluster): + + root = cluster_tree['parent'].min() + parent = cluster_tree[cluster_tree['child'] == leaf]['parent'] + if parent == root: + if allow_single_cluster: + return parent + else: + return leaf #return node closest to root + + parent_eps = 1/cluster_tree[cluster_tree['child'] == parent]['lambda_val'] + if parent_eps > cluster_selection_epsilon: + return parent + else: + return traverse_upwards(cluster_tree, cluster_selection_epsilon, parent, allow_single_cluster) + +cpdef set epsilon_search(set leaves, np.ndarray cluster_tree, np.double_t cluster_selection_epsilon, np.intp_t allow_single_cluster): + + selected_clusters = list() + processed = list() + + for leaf in leaves: + eps = 1/cluster_tree['lambda_val'][cluster_tree['child'] == leaf][0] + if eps < cluster_selection_epsilon: + if leaf not in processed: + epsilon_child = traverse_upwards(cluster_tree, cluster_selection_epsilon, leaf, allow_single_cluster) + selected_clusters.append(epsilon_child) + + for sub_node in bfs_from_cluster_tree(cluster_tree, epsilon_child): + if sub_node != epsilon_child: + processed.append(sub_node) + else: + selected_clusters.append(leaf) + + return set(selected_clusters) + +cpdef 
tuple get_clusters(np.ndarray tree, dict stability, + cluster_selection_method='eom', + allow_single_cluster=False, + match_reference_implementation=False, + cluster_selection_epsilon=0.0, + max_cluster_size=0): + """Given a tree and stability dict, produce the cluster labels + (and probabilities) for a flat clustering based on the chosen + cluster selection method. + + Parameters + ---------- + tree : numpy recarray + The condensed tree to extract flat clusters from + + stability : dict + A dictionary mapping cluster_ids to stability values + + cluster_selection_method : string, optional (default 'eom') + The method of selecting clusters. The default is the + Excess of Mass algorithm specified by 'eom'. The alternate + option is 'leaf'. + + allow_single_cluster : boolean, optional (default False) + Whether to allow a single cluster to be selected by the + Excess of Mass algorithm. + + match_reference_implementation : boolean, optional (default False) + Whether to match the reference implementation in how to handle + certain edge cases. + + cluster_selection_epsilon: float, optional (default 0.0) + A distance threshold for cluster splits. + + max_cluster_size: int, optional (default 0) + The maximum size for clusters located by the EOM clusterer. Can + be overridden by the cluster_selection_epsilon parameter in + rare cases. + + Returns + ------- + labels : ndarray (n_samples,) + An integer array of cluster labels, with -1 denoting noise. + + probabilities : ndarray (n_samples,) + The cluster membership strength of each sample. + + stabilities : ndarray (n_clusters,) + The cluster coherence strengths of each cluster. + """ + cdef list node_list + cdef np.ndarray cluster_tree + cdef np.ndarray child_selection + cdef dict is_cluster + cdef dict cluster_sizes + cdef float subtree_stability + cdef np.intp_t node + cdef np.intp_t sub_node + cdef np.intp_t cluster + cdef np.intp_t num_points + cdef np.ndarray labels + cdef np.double_t max_lambda + + # Assume clusters are ordered by numeric id equivalent to + # a topological sort of the tree; This is valid given the + # current implementation above, so don't change that ... or + # if you do, change this accordingly! 
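+ # Concretely: cluster ids increase as the condensed tree is built top-down,
+ # so iterating node_list in reverse-sorted order below visits every child
+ # cluster before its parent; by the time a parent is processed in the EOM
+ # sweep, its children's (possibly already updated) stabilities are final.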
+ if allow_single_cluster: + node_list = sorted(stability.keys(), reverse=True) + else: + node_list = sorted(stability.keys(), reverse=True)[:-1] + # (exclude root) + + cluster_tree = tree[tree['child_size'] > 1] + is_cluster = {cluster: True for cluster in node_list} + num_points = np.max(tree[tree['child_size'] == 1]['child']) + 1 + max_lambda = np.max(tree['lambda_val']) + + if max_cluster_size <= 0: + max_cluster_size = num_points + 1 # Set to a value that will never be triggered + cluster_sizes = {child: child_size for child, child_size + in zip(cluster_tree['child'], cluster_tree['child_size'])} + if allow_single_cluster: + # Compute cluster size for the root node + cluster_sizes[node_list[-1]] = np.sum( + cluster_tree[cluster_tree['parent'] == node_list[-1]]['child_size']) + + if cluster_selection_method == 'eom': + for node in node_list: + child_selection = (cluster_tree['parent'] == node) + subtree_stability = np.sum([ + stability[child] for + child in cluster_tree['child'][child_selection]]) + if subtree_stability > stability[node] or cluster_sizes[node] > max_cluster_size: + is_cluster[node] = False + stability[node] = subtree_stability + else: + for sub_node in bfs_from_cluster_tree(cluster_tree, node): + if sub_node != node: + is_cluster[sub_node] = False + + if cluster_selection_epsilon != 0.0 and cluster_tree.shape[0] > 0: + eom_clusters = [c for c in is_cluster if is_cluster[c]] + selected_clusters = [] + # first check if eom_clusters only has root node, which skips epsilon check. + if (len(eom_clusters) == 1 and eom_clusters[0] == cluster_tree['parent'].min()): + if allow_single_cluster: + selected_clusters = eom_clusters + else: + selected_clusters = epsilon_search(set(eom_clusters), cluster_tree, cluster_selection_epsilon, allow_single_cluster) + for c in is_cluster: + if c in selected_clusters: + is_cluster[c] = True + else: + is_cluster[c] = False + + elif cluster_selection_method == 'leaf': + leaves = set(get_cluster_tree_leaves(cluster_tree)) + if len(leaves) == 0: + for c in is_cluster: + is_cluster[c] = False + is_cluster[tree['parent'].min()] = True + + if cluster_selection_epsilon != 0.0: + selected_clusters = epsilon_search(leaves, cluster_tree, cluster_selection_epsilon, allow_single_cluster) + else: + selected_clusters = leaves + + for c in is_cluster: + if c in selected_clusters: + is_cluster[c] = True + else: + is_cluster[c] = False + else: + raise ValueError('Invalid Cluster Selection Method: %s\n' + 'Should be one of: "eom", "leaf"\n') + + clusters = set([c for c in is_cluster if is_cluster[c]]) + cluster_map = {c: n for n, c in enumerate(sorted(list(clusters)))} + reverse_cluster_map = {n: c for c, n in cluster_map.items()} + + labels = do_labelling(tree, clusters, cluster_map, + allow_single_cluster, cluster_selection_epsilon, + match_reference_implementation) + probs = get_probabilities(tree, reverse_cluster_map, labels) + stabilities = get_stability_scores(labels, clusters, stability, max_lambda) + + return (labels, probs, stabilities) diff --git a/sklearn/cluster/_hdbscan/_prediction_utils.pyx b/sklearn/cluster/_hdbscan/_prediction_utils.pyx new file mode 100644 index 0000000000000..a6a7c13489666 --- /dev/null +++ b/sklearn/cluster/_hdbscan/_prediction_utils.pyx @@ -0,0 +1,383 @@ +#cython: boundscheck=False, nonecheck=False, initializedcheck=False +# Utility routines in cython for prediction in hdbscan +# Authors: Leland McInnes +# License: 3-clause BSD + +import numpy as np +cimport numpy as np + +from .dist_metrics cimport DistanceMetric + +from 
libc.float cimport DBL_MAX +from libc.math cimport exp + +cpdef get_tree_row_with_child(np.ndarray tree, np.intp_t child): + + cdef np.intp_t i + cdef np.ndarray[np.intp_t, ndim = 1] child_array = tree['child'] + + for i in range(tree.shape[0]): + if child_array[i] == child: + return tree[i] + + return tree[0] + +cdef np.float64_t min_dist_to_exemplar( + np.ndarray[np.float64_t, ndim=1] point, + np.ndarray[np.float64_t, ndim=2] cluster_exemplars, + DistanceMetric dist_metric): + + cdef np.intp_t i + cdef np.float64_t result = DBL_MAX + cdef np.float64_t distance + cdef np.float64_t *point_ptr = ( point.data) + cdef np.float64_t[:, ::1] exemplars_view = \ + ( + ( cluster_exemplars.data)) + cdef np.float64_t *exemplars_ptr = \ + ( &exemplars_view[0, 0]) + cdef np.intp_t num_features = point.shape[0] + + for i in range(cluster_exemplars.shape[0]): + distance = dist_metric.dist(point_ptr, + &exemplars_ptr[num_features * i], + num_features) + if distance < result: + result = distance + + return result + +cdef np.ndarray[np.float64_t, ndim=1] dist_vector( + np.ndarray[np.float64_t, ndim=1] point, + list exemplars_list, + DistanceMetric dist_metric): + + cdef np.intp_t i + cdef np.ndarray[np.float64_t, ndim=2] exemplars + cdef np.ndarray[np.float64_t, ndim=1] result = np.empty(len(exemplars_list)) + + + for i in range(len(exemplars_list)): + exemplars = exemplars_list[i] + result[i] = min_dist_to_exemplar(point, exemplars, dist_metric) + + return result + +cpdef np.ndarray[np.float64_t, ndim=1] dist_membership_vector( + np.ndarray[np.float64_t, ndim=1] point, + list exemplars_list, + DistanceMetric dist_metric, + softmax=False): + + cdef np.intp_t i + cdef np.ndarray[np.float64_t, ndim=1] result = np.empty(len(exemplars_list)) + cdef np.ndarray[np.float64_t, ndim=1] vector + cdef np.float64_t sum = 0.0 + + vector = dist_vector(point, exemplars_list, dist_metric) + + if softmax: + for i in range(vector.shape[0]): + result[i] = 1.0 / vector[i] + result = np.exp(result - np.nanmax(result)) + sum = np.sum(result) + + else: + for i in range(vector.shape[0]): + if vector[i] != 0: + result[i] = 1.0 / vector[i] + else: + result[i] = DBL_MAX / vector.shape[0] + sum += result[i] + + for i in range(result.shape[0]): + result[i] = result[i] / sum + + return result + +cpdef np.ndarray[np.float64_t, ndim=2] all_points_dist_membership_vector( + np.ndarray[np.float64_t, ndim=2] all_points, + list exemplars_list, + DistanceMetric dist_metric, + softmax=False): + + cdef np.ndarray[np.float64_t, ndim=2] result + cdef np.intp_t i + + result = np.empty((all_points.shape[0], len(exemplars_list)), + dtype=np.float64) + + for i in range(all_points.shape[0]): + result[i] = dist_membership_vector(all_points[i], + exemplars_list, + dist_metric, + softmax) + + return result + +cdef np.ndarray[np.float64_t, ndim=1] merge_height( + np.intp_t point_cluster, + np.float64_t point_lambda, + np.ndarray[np.intp_t, ndim=1] clusters, + np.ndarray cluster_tree): + + cdef np.intp_t i + cdef np.intp_t j + + cdef np.intp_t left_cluster + cdef np.intp_t right_cluster + cdef int took_right_parent + cdef int took_left_parent + cdef np.intp_t cluster + + cdef np.ndarray[np.float64_t, ndim=1] result = np.empty(clusters.shape[0], + dtype=np.float64) + cdef np.ndarray[np.intp_t, ndim=1] parents + cdef np.ndarray[np.intp_t, ndim=1] children + cdef np.ndarray[np.float64_t, ndim=1] lambdas + + # convert the cluster tree for fast direct access + parents = cluster_tree['parent'].astype(np.intp) + children = cluster_tree['child'].astype(np.intp) + 
lambdas = cluster_tree['lambda_val'].astype(np.float64) + + + for i in range(clusters.shape[0]): + + took_right_parent = False + took_left_parent = False + + right_cluster = clusters[i] + left_cluster = point_cluster + + while left_cluster != right_cluster: + if left_cluster > right_cluster: + took_left_parent = True + last_cluster = left_cluster + + # Set left_cluster to be its parent + for j in range(children.shape[0]): + if children[j] == left_cluster: + left_cluster = parents[j] + break + else: + took_right_parent = True + last_cluster = right_cluster + + # Set right_cluster to be its parent + for j in range(children.shape[0]): + if children[j] == right_cluster: + right_cluster = parents[j] + break + + if took_left_parent and took_right_parent: + # Take the lambda value of last_cluster merging in + for j in range(children.shape[0]): + if children[j] == last_cluster: + result[i] = lambdas[j] + break + else: + result[i] = point_lambda + + return result + + +cpdef np.float64_t safe_always_positive_division( + np.float64_t numerator, + np.float64_t denominator): + """ This is a helper function to divide numbers safely without getting a ZeroDivision error, the + function handles zero division by assuming the denominator is always positive + + Parameters + ---------- + numerator: floating + any floating point type + denominator: floating + any floating point type + + Returns + ------- + floating + """ + if denominator <= 0: + # prevent zero division or negative result + denominator = 1e-8 + return numerator / denominator + + +cpdef np.ndarray[np.float64_t, ndim=1] per_cluster_scores( + np.intp_t neighbor, + np.float32_t lambda_, + np.ndarray[np.intp_t, ndim=1] clusters, + np.ndarray tree, + dict max_lambda_dict, + np.ndarray cluster_tree): + + cdef np.intp_t point_cluster + cdef np.float64_t point_lambda + cdef np.float64_t max_lambda + + cdef np.intp_t i + + cdef np.ndarray[np.float64_t, ndim=1] result + + point_row = get_tree_row_with_child(tree, neighbor) + point_cluster = point_row['parent'] + point_lambda = lambda_ + max_lambda = max_lambda_dict[point_cluster] + + # Save an allocation by assigning and reusing result ... + # height = merge_height(point_cluster, point_lambda, + # clusters, cluster_tree) + result = merge_height(point_cluster, point_lambda, + clusters, cluster_tree) + + # Cythonize: result = np.exp(-(max_lambda / height)) + for i in range(result.shape[0]): + # result[i] = exp(-(max_lambda / result[i])) + result[i] = safe_always_positive_division(max_lambda, (max_lambda - result[i])) + + return result + +cpdef np.ndarray[np.float64_t, ndim=1] outlier_membership_vector(neighbor, + lambda_, clusters, tree, max_lambda_dict, cluster_tree, + softmax=True): + + cdef np.ndarray[np.float64_t, ndim=1] result + + if softmax: + result = per_cluster_scores(neighbor, lambda_, clusters, tree, + max_lambda_dict, cluster_tree) + # Scale for numerical stability, mathematically equivalent with old + # version due to the scaling with the sum in below. 
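+ # Concretely, for any constant m,
+ #     exp(x_i - m) / sum_j(exp(x_j - m)) == exp(x_i) / sum_j(exp(x_j)),
+ # so subtracting np.nanmax(result) leaves the normalised vector returned
+ # below unchanged while keeping exp() from overflowing for large scores.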
+ result = np.exp(result - np.nanmax(result)) + #result[~np.isfinite(result)] = np.finfo(np.double).max + else: + result = per_cluster_scores(neighbor, lambda_, clusters, tree, + max_lambda_dict, cluster_tree) + + result /= result.sum() + return result + +cpdef np.float64_t prob_in_some_cluster(neighbor, lambda_, clusters, tree, + max_lambda_dict, cluster_tree): + + cdef np.ndarray[np.float64_t, ndim=1] cluster_merge_heights + + cdef np.intp_t point_cluster + cdef np.float64_t point_lambda + cdef np.float64_t max_lambda + + point_row = get_tree_row_with_child(tree, neighbor) + point_cluster = point_row['parent'] + point_lambda = lambda_ + + cluster_merge_heights = \ + merge_height(point_cluster, point_lambda, clusters, cluster_tree) + point_height = cluster_merge_heights.max() + nearest_cluster = clusters[cluster_merge_heights.argmax()] + + max_lambda = max(lambda_, max_lambda_dict[nearest_cluster]) + 1e-8 # avoid z + + return (point_height / max_lambda) + +cpdef np.ndarray[np.float64_t, ndim=2] all_points_per_cluster_scores( + np.ndarray[np.intp_t, ndim=1] clusters, + np.ndarray tree, + dict max_lambda_dict, + np.ndarray cluster_tree): + + cdef np.intp_t num_points = tree['parent'].min() + cdef np.ndarray[np.float64_t, ndim=2] result_arr + cdef np.float64_t[:, ::1] result + cdef np.intp_t point + cdef np.intp_t point_cluster + cdef np.float64_t point_lambda + cdef np.float64_t max_lambda + + cdef np.intp_t i, j + + result_arr = np.empty((num_points, clusters.shape[0]), dtype=np.float64) + result = ( + ( result_arr.data)) + + point_tree = tree[tree['child_size'] == 1] + + for i in range(point_tree.shape[0]): + point_row = point_tree[i] + point = point_row['child'] + point_cluster = point_row['parent'] + point_lambda = point_row['lambda_val'] + max_lambda = max_lambda_dict[point_cluster] + 1e-8 # avoid zero lambda + + # Can we not do a faster merge height operation here? + result_arr[point] = merge_height(point_cluster, point_lambda, + clusters, cluster_tree) + + # Cythonize: result = np.exp(-(max_lambda / height)) + for j in range(result_arr.shape[1]): + result[point][j] = exp(-(max_lambda / result[point][j])) + + return result_arr + +cpdef np.ndarray[np.float64_t, ndim=2] all_points_outlier_membership_vector( + np.ndarray[np.intp_t, ndim=1] clusters, + np.ndarray tree, + dict max_lambda_dict, + np.ndarray cluster_tree, + np.intp_t softmax=True): + + cdef np.ndarray[np.float64_t, ndim=2] per_cluster_scores + + per_cluster_scores = all_points_per_cluster_scores( + clusters, + tree, + max_lambda_dict, + cluster_tree) + if softmax: + # Scale for numerical stability, mathematically equivalent with old + # version due to the scaling with the sum in below. 
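+ # Note the max here is taken over the whole matrix rather than per row;
+ # a constant shift still cancels in the per-row normalisation by row_sums
+ # below, so the result is unchanged while exp() stays bounded.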
+ result = np.exp(per_cluster_scores - np.nanmax(per_cluster_scores)) + #result[~np.isfinite(result)] = np.finfo(np.double).max + else: + result = per_cluster_scores + + row_sums = result.sum(axis=1) + result = result / row_sums[:, np.newaxis] + + return result + +cpdef all_points_prob_in_some_cluster( + np.ndarray[np.intp_t, ndim=1] clusters, + np.ndarray tree, + dict max_lambda_dict, + np.ndarray cluster_tree): + + cdef np.ndarray[np.float64_t, ndim=1] heights + cdef np.intp_t num_points = tree['parent'].min() + cdef np.ndarray[np.float64_t, ndim=1] result + cdef np.intp_t point + cdef np.intp_t point_cluster + cdef np.float64_t point_lambda + cdef np.float64_t max_lambda + + cdef np.intp_t i + + result = np.empty(num_points, dtype=np.float64) + + point_tree = tree[tree['child_size'] == 1] + + for i in range(point_tree.shape[0]): + point_row = point_tree[i] + point = point_row['child'] + point_cluster = point_row['parent'] + point_lambda = point_row['lambda_val'] + + # Can we not do a faster merge height operation here? + heights = merge_height(point_cluster, point_lambda, + clusters, cluster_tree) + max_lambda = max(max_lambda_dict[clusters[heights.argmax()]], + point_lambda) + result[point] = (heights.max() / max_lambda) + + return result diff --git a/sklearn/cluster/_hdbscan/dist_metrics.pxd b/sklearn/cluster/_hdbscan/dist_metrics.pxd new file mode 100644 index 0000000000000..df3c8af85b105 --- /dev/null +++ b/sklearn/cluster/_hdbscan/dist_metrics.pxd @@ -0,0 +1,94 @@ +#!python +#cython: boundscheck=False +#cython: wraparound=False +#cython: cdivision=True + +import cython +cimport cython + +import numpy as np +cimport numpy as np + +from libc.math cimport fabs, sqrt, exp, cos, pow + +ctypedef np.double_t DTYPE_t +ctypedef np.intp_t ITYPE_t + +cdef enum: + DTYPECODE = np.NPY_FLOAT64 + ITYPECODE = np.NPY_INTP + +# Fused type for certain operations +ctypedef fused DITYPE_t: + ITYPE_t + DTYPE_t + +ITYPE = np.intp + +DTYPE = np.double + +###################################################################### +# Inline distance functions +# +# We use these for the default (euclidean) case so that they can be +# inlined. This leads to faster computation for the most common case +cdef inline DTYPE_t euclidean_dist(DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef DTYPE_t tmp, d=0 + cdef np.intp_t j + for j in range(size): + tmp = x1[j] - x2[j] + d += tmp * tmp + return sqrt(d) + + +cdef inline DTYPE_t euclidean_rdist(DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef DTYPE_t tmp, d=0 + cdef np.intp_t j + for j in range(size): + tmp = x1[j] - x2[j] + d += tmp * tmp + return d + + +cdef inline DTYPE_t euclidean_dist_to_rdist(DTYPE_t dist) nogil except -1: + return dist * dist + + +cdef inline DTYPE_t euclidean_rdist_to_dist(DTYPE_t dist) except -1: + return sqrt(dist) + + +###################################################################### +# DistanceMetric base class +cdef class DistanceMetric: + # The following attributes are required for a few of the subclasses. + # we must define them here so that cython's limited polymorphism will work. + # Because we don't expect to instantiate a lot of these objects, the + # extra memory overhead of this setup should not be an issue. 
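+ # Roughly: ``p`` holds the Minkowski order, ``vec``/``mat`` (plus their raw
+ # pointers) hold per-metric parameters such as V, w or the inverse
+ # covariance VI, ``size`` is the expected vector length, and ``func`` /
+ # ``kwargs`` back the user-supplied PyFuncDistance callable.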
+ cdef DTYPE_t p + #cdef DTYPE_t[::1] vec + #cdef DTYPE_t[:, ::1] mat + cdef np.ndarray vec + cdef np.ndarray mat + cdef DTYPE_t* vec_ptr + cdef DTYPE_t* mat_ptr + cdef ITYPE_t size + cdef object func + cdef object kwargs + + cdef DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1 + + cdef DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1 + + cdef int pdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1 + + cdef int cdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] Y, + DTYPE_t[:, ::1] D) except -1 + + cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1 + + cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1 diff --git a/sklearn/cluster/_hdbscan/dist_metrics.pyx b/sklearn/cluster/_hdbscan/dist_metrics.pyx new file mode 100644 index 0000000000000..7416a9ffa62ce --- /dev/null +++ b/sklearn/cluster/_hdbscan/dist_metrics.pyx @@ -0,0 +1,1147 @@ +# !python +# cython: boundscheck=False +# cython: wraparound=False +# cython: cdivision=True + +# By Jake Vanderplas (2013) +# written for the scikit-learn project +# modified for HDBSCAN Dual Tree Boruvka algorithm +# License: BSD + +import numpy as np +cimport numpy as np +np.import_array() # required in order to use C-API + +from libc.math cimport fabs, sqrt, exp, cos, pow, log, acos, M_PI + +DTYPE = np.double +ITYPE = np.intp + + +###################################################################### +# Numpy 1.3-1.4 compatibility utilities +cdef DTYPE_t[:, ::1] get_memview_DTYPE_2D( + np.ndarray[DTYPE_t, ndim=2, mode='c'] X): + return ( X.data) + + +cdef DTYPE_t* get_vec_ptr(np.ndarray[DTYPE_t, ndim=1, mode='c'] vec): + return &vec[0] + + +cdef DTYPE_t* get_mat_ptr(np.ndarray[DTYPE_t, ndim=2, mode='c'] mat): + return &mat[0, 0] +###################################################################### + + +# First, define a function to get an ndarray from a memory bufffer +cdef extern from "numpy/arrayobject.h": + object PyArray_SimpleNewFromData(int nd, np.npy_intp* dims, + int typenum, void* data) + + +cdef inline np.ndarray _buffer_to_ndarray(DTYPE_t* x, np.npy_intp n): + # Wrap a memory buffer with an ndarray. Warning: this is not robust. + # In particular, if x is deallocated before the returned array goes + # out of scope, this could cause memory errors. Since there is not + # a possibility of this for our use-case, this should be safe. 
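+ # (The array created by PyArray_SimpleNewFromData below does not own the
+ # buffer, so the caller must keep ``x`` alive at least as long as the
+ # returned ndarray.)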
+ + # Note: this Segfaults unless np.import_array() is called above + return PyArray_SimpleNewFromData(1, &n, DTYPECODE, x) + + +# some handy constants +from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin +cdef DTYPE_t INF = np.inf + + +###################################################################### +# newObj function +# this is a helper function for pickling +def newObj(obj): + return obj.__new__(obj) + + +###################################################################### +# metric mappings +# These map from metric id strings to class names +METRIC_MAPPING = {'euclidean': EuclideanDistance, + 'l2': EuclideanDistance, + 'minkowski': MinkowskiDistance, + 'p': MinkowskiDistance, + 'manhattan': ManhattanDistance, + 'cityblock': ManhattanDistance, + 'l1': ManhattanDistance, + 'chebyshev': ChebyshevDistance, + 'infinity': ChebyshevDistance, + 'seuclidean': SEuclideanDistance, + 'mahalanobis': MahalanobisDistance, + 'wminkowski': WMinkowskiDistance, + 'hamming': HammingDistance, + 'canberra': CanberraDistance, + 'braycurtis': BrayCurtisDistance, + 'matching': MatchingDistance, + 'jaccard': JaccardDistance, + 'dice': DiceDistance, + 'kulsinski': KulsinskiDistance, + 'rogerstanimoto': RogersTanimotoDistance, + 'russellrao': RussellRaoDistance, + 'sokalmichener': SokalMichenerDistance, + 'sokalsneath': SokalSneathDistance, + 'haversine': HaversineDistance, + 'cosine': ArccosDistance, + 'arccos': ArccosDistance, + 'pyfunc': PyFuncDistance} + + +def get_valid_metric_ids(L): + """Given an iterable of metric class names or class identifiers, + return a list of metric IDs which map to those classes. + + Examples + -------- + >>> L = get_valid_metric_ids([EuclideanDistance, 'ManhattanDistance']) + >>> sorted(L) + ['cityblock', 'euclidean', 'l1', 'l2', 'manhattan'] + """ + return [key for (key, val) in METRIC_MAPPING.items() + if (val.__name__ in L) or (val in L)] + + +###################################################################### +# Distance Metric Classes +cdef class DistanceMetric: + """DistanceMetric class + + This class provides a uniform interface to fast distance metric + functions. The various metrics can be accessed via the `get_metric` + class method and the metric string identifier (see below). + + Examples + -------- + + For example, to use the Euclidean distance: + + >>> dist = DistanceMetric.get_metric('euclidean') + >>> X = [[0, 1, 2], + [3, 4, 5]]) + >>> dist.pairwise(X) + array([[ 0. , 5.19615242], + [ 5.19615242, 0. 
]])
+
+ Available Metrics
+ The following lists the string metric identifiers and the associated
+ distance metric classes:
+
+ **Metrics intended for real-valued vector spaces:**
+
+ ==============  ====================  ========  ===============================
+ identifier      class name            args      distance function
+ --------------  --------------------  --------  -------------------------------
+ "euclidean"     EuclideanDistance     -         ``sqrt(sum((x - y)^2))``
+ "manhattan"     ManhattanDistance     -         ``sum(|x - y|)``
+ "chebyshev"     ChebyshevDistance     -         ``max(|x - y|)``
+ "minkowski"     MinkowskiDistance     p         ``sum(|x - y|^p)^(1/p)``
+ "wminkowski"    WMinkowskiDistance    p, w      ``sum(|w * (x - y)|^p)^(1/p)``
+ "seuclidean"    SEuclideanDistance    V         ``sqrt(sum((x - y)^2 / V))``
+ "mahalanobis"   MahalanobisDistance   V or VI   ``sqrt((x - y)' V^-1 (x - y))``
+ ==============  ====================  ========  ===============================
+
+ **Metrics intended for two-dimensional vector spaces:** Note that the haversine
+ distance metric requires data in the form of [latitude, longitude] and both
+ inputs and outputs are in units of radians.
+
+ ============  ==================  ========================================
+ identifier    class name          distance function
+ ------------  ------------------  ----------------------------------------
+ "haversine"   HaversineDistance   2 arcsin(sqrt(sin^2(0.5*dx)
+                                   + cos(x1)cos(x2)sin^2(0.5*dy)))
+ ============  ==================  ========================================
+
+ **Metrics intended for integer-valued vector spaces:** Though intended
+ for integer-valued vectors, these are also valid metrics in the case of
+ real-valued vectors.
+
+ =============  ====================  ========================================
+ identifier     class name            distance function
+ -------------  --------------------  ----------------------------------------
+ "hamming"      HammingDistance       ``N_unequal(x, y) / N_tot``
+ "canberra"     CanberraDistance      ``sum(|x - y| / (|x| + |y|))``
+ "braycurtis"   BrayCurtisDistance    ``sum(|x - y|) / (sum(|x|) + sum(|y|))``
+ =============  ====================  ========================================
+
+ **Metrics intended for boolean-valued vector spaces:** Any nonzero entry
+ is evaluated to "True". 
In the listings below, the following + abbreviations are used: + + - N : number of dimensions + - NTT : number of dims in which both values are True + - NTF : number of dims in which the first value is True, second is False + - NFT : number of dims in which the first value is False, second is True + - NFF : number of dims in which both values are False + - NNEQ : number of non-equal dimensions, NNEQ = NTF + NFT + - NNZ : number of nonzero dimensions, NNZ = NTF + NFT + NTT + + ================= ======================= =============================== + identifier class name distance function + ----------------- ----------------------- ------------------------------- + "jaccard" JaccardDistance NNEQ / NNZ + "maching" MatchingDistance NNEQ / N + "dice" DiceDistance NNEQ / (NTT + NNZ) + "kulsinski" KulsinskiDistance (NNEQ + N - NTT) / (NNEQ + N) + "rogerstanimoto" RogersTanimotoDistance 2 * NNEQ / (N + NNEQ) + "russellrao" RussellRaoDistance NNZ / N + "sokalmichener" SokalMichenerDistance 2 * NNEQ / (N + NNEQ) + "sokalsneath" SokalSneathDistance NNEQ / (NNEQ + 0.5 * NTT) + ================= ======================= =============================== + + **User-defined distance:** + + =========== =============== ======= + identifier class name args + ----------- --------------- ------- + "pyfunc" PyFuncDistance func + =========== =============== ======= + + Here ``func`` is a function which takes two one-dimensional numpy + arrays, and returns a distance. Note that in order to be used within + the BallTree, the distance must be a true metric: + i.e. it must satisfy the following properties + + 1) Non-negativity: d(x, y) >= 0 + 2) Identity: d(x, y) = 0 if and only if x == y + 3) Symmetry: d(x, y) = d(y, x) + 4) Triangle Inequality: d(x, y) + d(y, z) >= d(x, z) + + Because of the Python object overhead involved in calling the python + function, this will be fairly slow, but it will have the same + scaling as other distances. + """ + def __cinit__(self): + self.p = 2 + self.vec = np.zeros(1, dtype=DTYPE, order='c') + self.mat = np.zeros((1, 1), dtype=DTYPE, order='c') + self.vec_ptr = get_vec_ptr(self.vec) + self.mat_ptr = get_mat_ptr(self.mat) + self.size = 1 + + def __reduce__(self): + """ + reduce method used for pickling + """ + return (newObj, (self.__class__,), self.__getstate__()) + + def __getstate__(self): + """ + get state for pickling + """ + if self.__class__.__name__ == "PyFuncDistance": + return (float(self.p), self.vec, self.mat, self.func, self.kwargs) + return (float(self.p), self.vec, self.mat) + + def __setstate__(self, state): + """ + set state for pickling + """ + self.p = state[0] + self.vec = state[1] + self.mat = state[2] + if self.__class__.__name__ == "PyFuncDistance": + self.func = state[3] + self.kwargs = state[4] + self.vec_ptr = get_vec_ptr(self.vec) + self.mat_ptr = get_mat_ptr(self.mat) + self.size = 1 + + @classmethod + def get_metric(cls, metric, **kwargs): + """Get the given distance metric from the string identifier. + + See the docstring of DistanceMetric for a list of available metrics. 
+ + Parameters + ---------- + metric : string or class name + The distance metric to use + **kwargs + additional arguments will be passed to the requested metric + """ + if isinstance(metric, DistanceMetric): + return metric + + if callable(metric): + return PyFuncDistance(metric, **kwargs) + + # Map the metric string ID to the metric class + if isinstance(metric, type) and issubclass(metric, DistanceMetric): + pass + else: + try: + metric = METRIC_MAPPING[metric] + except: + raise ValueError("Unrecognized metric '%s'" % metric) + + # In Minkowski special cases, return more efficient methods + if metric is MinkowskiDistance: + p = kwargs.pop('p', 2) + if p == 1: + return ManhattanDistance(**kwargs) + elif p == 2: + return EuclideanDistance(**kwargs) + elif np.isinf(p): + return ChebyshevDistance(**kwargs) + else: + return MinkowskiDistance(p, **kwargs) + else: + return metric(**kwargs) + + def __init__(self): + if self.__class__ is DistanceMetric: + raise NotImplementedError("DistanceMetric is an abstract class") + + cdef DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + """Compute the distance between vectors x1 and x2 + + This should be overridden in a base class. + """ + return -999 + + cdef DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + """Compute the reduced distance between vectors x1 and x2. + + This can optionally be overridden in a base class. + + The reduced distance is any measure that yields the same rank as the + distance, but is more efficient to compute. For example, for the + Euclidean metric, the reduced distance is the squared-euclidean + distance. + """ + return self.dist(x1, x2, size) + + cdef int pdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1: + """compute the pairwise distances between points in X""" + cdef ITYPE_t i1, i2 + for i1 in range(X.shape[0]): + for i2 in range(i1, X.shape[0]): + D[i1, i2] = self.dist(&X[i1, 0], &X[i2, 0], X.shape[1]) + D[i2, i1] = D[i1, i2] + return 0 + + cdef int cdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] Y, + DTYPE_t[:, ::1] D) except -1: + """compute the cross-pairwise distances between arrays X and Y""" + cdef ITYPE_t i1, i2 + if X.shape[1] != Y.shape[1]: + raise ValueError('X and Y must have the same second dimension') + for i1 in range(X.shape[0]): + for i2 in range(Y.shape[0]): + D[i1, i2] = self.dist(&X[i1, 0], &Y[i2, 0], X.shape[1]) + return 0 + + cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: + """Convert the reduced distance to the distance""" + return rdist + + cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: + """Convert the distance to the reduced distance""" + return dist + + def rdist_to_dist(self, rdist): + """Convert the Reduced distance to the true distance. + + The reduced distance, defined for some metrics, is a computationally + more efficent measure which preserves the rank of the true distance. + For example, in the Euclidean distance metric, the reduced distance + is the squared-euclidean distance. + """ + return rdist + + def dist_to_rdist(self, dist): + """Convert the true distance to the reduced distance. + + The reduced distance, defined for some metrics, is a computationally + more efficent measure which preserves the rank of the true distance. + For example, in the Euclidean distance metric, the reduced distance + is the squared-euclidean distance. 
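+
+        For instance, with the Euclidean subclass defined further below,
+        ``EuclideanDistance().dist_to_rdist(3.0)`` yields ``9.0``, whereas
+        this base-class implementation simply returns its argument unchanged.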
+ """ + return dist + + def pairwise(self, X, Y=None): + """Compute the pairwise distances between X and Y + + This is a convenience routine for the sake of testing. For many + metrics, the utilities in scipy.spatial.distance.cdist and + scipy.spatial.distance.pdist will be faster. + + Parameters + ---------- + X : array_like + Array of shape (Nx, D), representing Nx points in D dimensions. + Y : array_like (optional) + Array of shape (Ny, D), representing Ny points in D dimensions. + If not specified, then Y=X. + Returns + ------- + dist : ndarray + The shape (Nx, Ny) array of pairwise distances between points in + X and Y. + """ + cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Xarr + cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Yarr + cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Darr + + Xarr = np.asarray(X, dtype=DTYPE, order='C') + if Y is None: + Darr = np.zeros((Xarr.shape[0], Xarr.shape[0]), + dtype=DTYPE, order='C') + self.pdist(get_memview_DTYPE_2D(Xarr), + get_memview_DTYPE_2D(Darr)) + else: + Yarr = np.asarray(Y, dtype=DTYPE, order='C') + Darr = np.zeros((Xarr.shape[0], Yarr.shape[0]), + dtype=DTYPE, order='C') + self.cdist(get_memview_DTYPE_2D(Xarr), + get_memview_DTYPE_2D(Yarr), + get_memview_DTYPE_2D(Darr)) + return Darr + + +# ------------------------------------------------------------ +# Euclidean Distance +# d = sqrt(sum(x_i^2 - y_i^2)) +cdef class EuclideanDistance(DistanceMetric): + """Euclidean Distance metric + + .. math:: + D(x, y) = \sqrt{ \sum_i (x_i - y_i) ^ 2 } + """ + def __init__(self): + self.p = 2 + + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + return euclidean_dist(x1, x2, size) + + cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + return euclidean_rdist(x1, x2, size) + + cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: + return sqrt(rdist) + + cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: + return dist * dist + + def rdist_to_dist(self, rdist): + return np.sqrt(rdist) + + def dist_to_rdist(self, dist): + return dist ** 2 + + +# ------------------------------------------------------------ +# SEuclidean Distance +# d = sqrt(sum((x_i - y_i2)^2 / v_i)) +cdef class SEuclideanDistance(DistanceMetric): + """Standardized Euclidean Distance metric + + .. math:: + D(x, y) = \sqrt{ \sum_i \frac{ (x_i - y_i) ^ 2}{V_i} } + """ + def __init__(self, V): + self.vec = np.asarray(V, dtype=DTYPE) + self.vec_ptr = get_vec_ptr(self.vec) + self.size = self.vec.shape[0] + self.p = 2 + + cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + if size != self.size: + with gil: + raise ValueError('SEuclidean dist: size of V does not match') + cdef DTYPE_t tmp, d=0 + cdef np.intp_t j + for j in range(size): + tmp = x1[j] - x2[j] + d += tmp * tmp / self.vec_ptr[j] + return d + + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + return sqrt(self.rdist(x1, x2, size)) + + cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: + return sqrt(rdist) + + cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: + return dist * dist + + def rdist_to_dist(self, rdist): + return np.sqrt(rdist) + + def dist_to_rdist(self, dist): + return dist ** 2 + + +# ------------------------------------------------------------ +# Manhattan Distance +# d = sum(abs(x_i - y_i)) +cdef class ManhattanDistance(DistanceMetric): + """Manhattan/City-block Distance metric + + .. 
math:: + D(x, y) = \sum_i |x_i - y_i| + """ + def __init__(self): + self.p = 1 + + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef DTYPE_t d = 0 + cdef np.intp_t j + for j in range(size): + d += fabs(x1[j] - x2[j]) + return d + + +# ------------------------------------------------------------ +# Chebyshev Distance +# d = max_i(abs(x_i), abs(y_i)) +cdef class ChebyshevDistance(DistanceMetric): + """Chebyshev/Infinity Distance + + .. math:: + D(x, y) = max_i (|x_i - y_i|) + """ + def __init__(self): + self.p = INF + + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef DTYPE_t d = 0 + cdef np.intp_t j + for j in range(size): + d = fmax(d, fabs(x1[j] - x2[j])) + return d + + +# ------------------------------------------------------------ +# Minkowski Distance +# d = sum(x_i^p - y_i^p) ^ (1/p) +cdef class MinkowskiDistance(DistanceMetric): + """Minkowski Distance + + .. math:: + D(x, y) = [\sum_i (x_i - y_i)^p] ^ (1/p) + + Minkowski Distance requires p >= 1 and finite. For p = infinity, + use ChebyshevDistance. + Note that for p=1, ManhattanDistance is more efficient, and for + p=2, EuclideanDistance is more efficient. + """ + def __init__(self, p): + if p < 1: + raise ValueError("p must be greater than 1") + elif np.isinf(p): + raise ValueError("MinkowskiDistance requires finite p. " + "For p=inf, use ChebyshevDistance.") + self.p = p + + cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef DTYPE_t d=0 + cdef np.intp_t j + for j in range(size): + d += pow(fabs(x1[j] - x2[j]), self.p) + return d + + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + return pow(self.rdist(x1, x2, size), 1. / self.p) + + cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: + return pow(rdist, 1. / self.p) + + cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: + return pow(dist, self.p) + + def rdist_to_dist(self, rdist): + return rdist ** (1. / self.p) + + def dist_to_rdist(self, dist): + return dist ** self.p + + +# ------------------------------------------------------------ +# W-Minkowski Distance +# d = sum(w_i * (x_i^p - y_i^p)) ^ (1/p) +cdef class WMinkowskiDistance(DistanceMetric): + """Weighted Minkowski Distance + + .. math:: + D(x, y) = [\sum_i w_i (x_i - y_i)^p] ^ (1/p) + + Weighted Minkowski Distance requires p >= 1 and finite. + + Parameters + ---------- + p : int + The order of the norm of the difference :math:`{||u-v||}_p`. + w : (N,) array_like + The weight vector. + + """ + def __init__(self, p, w): + if p < 1: + raise ValueError("p must be greater than 1") + elif np.isinf(p): + raise ValueError("WMinkowskiDistance requires finite p. " + "For p=inf, use ChebyshevDistance.") + self.p = p + self.vec = np.asarray(w, dtype=DTYPE) + self.vec_ptr = get_vec_ptr(self.vec) + self.size = self.vec.shape[0] + + cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + if size != self.size: + with gil: + raise ValueError('WMinkowskiDistance dist: ' + 'size of w does not match') + cdef DTYPE_t d=0 + cdef np.intp_t j + for j in range(size): + d += pow(self.vec_ptr[j] * fabs(x1[j] - x2[j]), self.p) + return d + + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + return pow(self.rdist(x1, x2, size), 1. / self.p) + + cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: + return pow(rdist, 1. 
/ self.p) + + cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: + return pow(dist, self.p) + + def rdist_to_dist(self, rdist): + return rdist ** (1. / self.p) + + def dist_to_rdist(self, dist): + return dist ** self.p + + +# ------------------------------------------------------------ +# Mahalanobis Distance +# d = sqrt( (x - y)^T V^-1 (x - y) ) +cdef class MahalanobisDistance(DistanceMetric): + """Mahalanobis Distance + + .. math:: + D(x, y) = \sqrt{ (x - y)^T V^{-1} (x - y) } + + Parameters + ---------- + V : array_like + Symmetric positive-definite covariance matrix. + The inverse of this matrix will be explicitly computed. + VI : array_like + optionally specify the inverse directly. If VI is passed, + then V is not referenced. + """ + def __init__(self, V=None, VI=None): + if VI is None: + VI = np.linalg.inv(V) + if VI.ndim != 2 or VI.shape[0] != VI.shape[1]: + raise ValueError("V/VI must be square") + + self.mat = np.asarray(VI, dtype=float, order='C') + self.mat_ptr = get_mat_ptr(self.mat) + + self.size = self.mat.shape[0] + + # we need vec as a work buffer + self.vec = np.zeros(self.size, dtype=DTYPE) + self.vec_ptr = get_vec_ptr(self.vec) + + cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + if size != self.size: + with gil: + raise ValueError('Mahalanobis dist: size of V does not match') + + cdef DTYPE_t tmp, d = 0 + cdef np.intp_t i, j + + # compute (x1 - x2).T * VI * (x1 - x2) + for i in range(size): + self.vec_ptr[i] = x1[i] - x2[i] + + for i in range(size): + tmp = 0 + for j in range(size): + tmp += self.mat_ptr[i * size + j] * self.vec_ptr[j] + d += tmp * self.vec_ptr[i] + return d + + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + return sqrt(self.rdist(x1, x2, size)) + + cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: + return sqrt(rdist) + + cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: + return dist * dist + + def rdist_to_dist(self, rdist): + return np.sqrt(rdist) + + def dist_to_rdist(self, dist): + return dist ** 2 + + +# ------------------------------------------------------------ +# Hamming Distance +# d = N_unequal(x, y) / N_tot +cdef class HammingDistance(DistanceMetric): + """Hamming Distance + + Hamming distance is meant for discrete-valued vectors, though it is + a valid metric for real-valued vectors. + + .. math:: + D(x, y) = \frac{1}{N} \sum_i \delta_{x_i, y_i} + """ + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef int n_unequal = 0 + cdef np.intp_t j + for j in range(size): + if x1[j] != x2[j]: + n_unequal += 1 + return float(n_unequal) / size + + +# ------------------------------------------------------------ +# Canberra Distance +# D(x, y) = sum[ abs(x_i - y_i) / (abs(x_i) + abs(y_i)) ] +cdef class CanberraDistance(DistanceMetric): + """Canberra Distance + + Canberra distance is meant for discrete-valued vectors, though it is + a valid metric for real-valued vectors. + + .. 
math:: + D(x, y) = \sum_i \frac{|x_i - y_i|}{|x_i| + |y_i|} + """ + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef DTYPE_t denom, d = 0 + cdef np.intp_t j + for j in range(size): + denom = fabs(x1[j]) + fabs(x2[j]) + if denom > 0: + d += fabs(x1[j] - x2[j]) / denom + return d + + +# ------------------------------------------------------------ +# Bray-Curtis Distance +# D(x, y) = sum[abs(x_i - y_i)] / sum[abs(x_i) + abs(y_i)] +cdef class BrayCurtisDistance(DistanceMetric): + """Bray-Curtis Distance + + Bray-Curtis distance is meant for discrete-valued vectors, though it is + a valid metric for real-valued vectors. + + .. math:: + D(x, y) = \frac{\sum_i |x_i - y_i|}{\sum_i(|x_i| + |y_i|)} + """ + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef DTYPE_t num = 0, denom = 0 + cdef np.intp_t j + for j in range(size): + num += fabs(x1[j] - x2[j]) + denom += fabs(x1[j]) + fabs(x2[j]) + if denom > 0: + return num / denom + else: + return 0.0 + + +# ------------------------------------------------------------ +# Jaccard Distance (boolean) +# D(x, y) = N_unequal(x, y) / N_nonzero(x, y) +cdef class JaccardDistance(DistanceMetric): + """Jaccard Distance + + Jaccard Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. + + .. math:: + D(x, y) = \frac{N_{TF} + N_{FT}}{N_{TT} + N_{TF} + N_{FT}} + """ + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef int tf1, tf2, n_eq = 0, nnz = 0 + cdef np.intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + nnz += (tf1 or tf2) + n_eq += (tf1 and tf2) + if nnz == 0: + return 0.0 + return (nnz - n_eq) * 1.0 / nnz + + +# ------------------------------------------------------------ +# Matching Distance (boolean) +# D(x, y) = n_neq / n +cdef class MatchingDistance(DistanceMetric): + """Matching Distance + + Matching Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. + + .. math:: + D(x, y) = \frac{N_{TF} + N_{FT}}{N} + """ + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef int tf1, tf2, n_neq = 0 + cdef np.intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + n_neq += (tf1 != tf2) + return n_neq * 1. / size + + +# ------------------------------------------------------------ +# Dice Distance (boolean) +# D(x, y) = n_neq / (2 * ntt + n_neq) +cdef class DiceDistance(DistanceMetric): + """Dice Distance + + Dice Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. + + .. math:: + D(x, y) = \frac{N_{TF} + N_{FT}}{2 * N_{TT} + N_{TF} + N_{FT}} + """ + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef int tf1, tf2, n_neq = 0, ntt = 0 + cdef np.intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + ntt += (tf1 and tf2) + n_neq += (tf1 != tf2) + return n_neq / (2.0 * ntt + n_neq) + + +# ------------------------------------------------------------ +# Kulsinski Distance (boolean) +# D(x, y) = (ntf + nft - ntt + n) / (n_neq + n) +cdef class KulsinskiDistance(DistanceMetric): + """Kulsinski Distance + + Kulsinski Distance is a dissimilarity measure for boolean-valued + vectors. 
All nonzero entries will be treated as True, zero entries will + be treated as False. + + .. math:: + D(x, y) = 1 - \frac{N_{TT}}{N + N_{TF} + N_{FT}} + """ + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef int tf1, tf2, ntt = 0, n_neq = 0 + cdef np.intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + n_neq += (tf1 != tf2) + ntt += (tf1 and tf2) + return (n_neq - ntt + size) * 1.0 / (n_neq + size) + + +# ------------------------------------------------------------ +# Rogers-Tanimoto Distance (boolean) +# D(x, y) = 2 * n_neq / (n + n_neq) +cdef class RogersTanimotoDistance(DistanceMetric): + """Rogers-Tanimoto Distance + + Rogers-Tanimoto Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. + + .. math:: + D(x, y) = \frac{2 (N_{TF} + N_{FT})}{N + N_{TF} + N_{FT}} + """ + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef int tf1, tf2, n_neq = 0 + cdef np.intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + n_neq += (tf1 != tf2) + return (2.0 * n_neq) / (size + n_neq) + + +# ------------------------------------------------------------ +# Russell-Rao Distance (boolean) +# D(x, y) = (n - ntt) / n +cdef class RussellRaoDistance(DistanceMetric): + """Russell-Rao Distance + + Russell-Rao Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. + + .. math:: + D(x, y) = \frac{N - N_{TT}}{N} + """ + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef int tf1, tf2, ntt = 0 + cdef np.intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + ntt += (tf1 and tf2) + return (size - ntt) * 1. / size + + +# ------------------------------------------------------------ +# Sokal-Michener Distance (boolean) +# D(x, y) = 2 * n_neq / (n + n_neq) +cdef class SokalMichenerDistance(DistanceMetric): + """Sokal-Michener Distance + + Sokal-Michener Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. + + .. math:: + D(x, y) = \frac{2 (N_{TF} + N_{FT})}{N + N_{TF} + N_{FT}} + """ + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef int tf1, tf2, n_neq = 0 + cdef np.intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + n_neq += (tf1 != tf2) + return (2.0 * n_neq) / (size + n_neq) + + +# ------------------------------------------------------------ +# Sokal-Sneath Distance (boolean) +# D(x, y) = n_neq / (0.5 * n_tt + n_neq) +cdef class SokalSneathDistance(DistanceMetric): + """Sokal-Sneath Distance + + Sokal-Sneath Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. + + .. 
math:: + D(x, y) = \frac{N_{TF} + N_{FT}}{N_{TT} / 2 + N_{TF} + N_{FT}} + """ + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef int tf1, tf2, ntt = 0, n_neq = 0 + cdef np.intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + n_neq += (tf1 != tf2) + ntt += (tf1 and tf2) + return n_neq / (0.5 * ntt + n_neq) + + +# ------------------------------------------------------------ +# Haversine Distance (2 dimensional) +# D(x, y) = 2 arcsin{sqrt[sin^2 ((x1 - y1) / 2) +# + cos(x1) cos(y1) sin^2 ((x2 - y2) / 2)]} +cdef class HaversineDistance(DistanceMetric): + """Haversine (Spherical) Distance + + The Haversine distance is the angular distance between two points on + the surface of a sphere. The first distance of each point is assumed + to be the latitude, the second is the longitude, given in radians. + The dimension of the points must be 2: + + .. math:: + D(x, y) = 2\arcsin[\sqrt{\sin^2((x1 - y1) / 2) + + cos(x1)cos(y1)sin^2((x2 - y2) / 2)}] + """ + cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + if size != 2: + with gil: + raise ValueError("Haversine distance only valid " + "in 2 dimensions") + cdef DTYPE_t sin_0 = sin(0.5 * (x1[0] - x2[0])) + cdef DTYPE_t sin_1 = sin(0.5 * (x1[1] - x2[1])) + return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1) + + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + if size != 2: + with gil: + raise ValueError("Haversine distance only valid in" + " 2 dimensions") + cdef DTYPE_t sin_0 = sin(0.5 * (x1[0] - x2[0])) + cdef DTYPE_t sin_1 = sin(0.5 * (x1[1] - x2[1])) + return 2 * asin(sqrt(sin_0 * sin_0 + + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1)) + + cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: + return 2 * asin(sqrt(rdist)) + + cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: + cdef DTYPE_t tmp = sin(0.5 * dist) + return tmp * tmp + + def rdist_to_dist(self, rdist): + return 2 * np.arcsin(np.sqrt(rdist)) + + def dist_to_rdist(self, dist): + tmp = np.sin(0.5 * dist) + return tmp * tmp + + +# ------------------------------------------------------------ +# Yule Distance (boolean) +# D(x, y) = 2 * ntf * nft / (ntt * nff + ntf * nft) +# [This is not a true metric, so we will leave it out.] +# +# cdef class YuleDistance(DistanceMetric): +# cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, ITYPE_t size): +# cdef int tf1, tf2, ntf = 0, nft = 0, ntt = 0, nff = 0 +# cdef np.intp_t j +# for j in range(size): +# tf1 = x1[j] != 0 +# tf2 = x2[j] != 0 +# ntt += tf1 and tf2 +# ntf += tf1 and (tf2 == 0) +# nft += (tf1 == 0) and tf2 +# nff = size - ntt - ntf - nft +# return (2.0 * ntf * nft) / (ntt * nff + ntf * nft) + + +# ------------------------------------------------------------ +# Cosine Distance +# D(x, y) = dot(x, y) / (|x| * |y|) +# [This is not a true metric, so we will leave it out. 
Use the `arccos` +# distance instead] + +# cdef class CosineDistance(DistanceMetric): +# cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, +# ITYPE_t size) nogil except -1: +# cdef DTYPE_t d = 0, norm1 = 0, norm2 = 0 +# cdef np.intp_t j +# for j in range(size): +# d += x1[j] * x2[j] +# norm1 += x1[j] * x1[j] +# norm2 += x2[j] * x2[j] +# return 1.0 - d / sqrt(norm1 * norm2) + +# ------------------------------------------------------------ +# Arccos Distance +# D(x, y) = arccos(dot(x, y) / (|x| * |y|)) / PI + +cdef class ArccosDistance(DistanceMetric): + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef DTYPE_t d = 0, norm1 = 0, norm2 = 0 + cdef np.intp_t j + for j in range(size): + d += x1[j] * x2[j] + norm1 += x1[j] * x1[j] + norm2 += x2[j] * x2[j] + return acos(d / sqrt(norm1 * norm2)) / M_PI + + +# ------------------------------------------------------------ +# Correlation Distance +# D(x, y) = dot((x - mx), (y - my)) / (|x - mx| * |y - my|) +# [This is not a true metric, so we will leave it out.] +# +# cdef class CorrelationDistance(DistanceMetric): +# cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, ITYPE_t size): +# cdef DTYPE_t mu1 = 0, mu2 = 0, x1nrm = 0, x2nrm = 0, x1Tx2 = 0 +# cdef DTYPE_t tmp1, tmp2 +# +# cdef np.intp_t i +# for i in range(size): +# mu1 += x1[i] +# mu2 += x2[i] +# mu1 /= size +# mu2 /= size +# +# for i in range(size): +# tmp1 = x1[i] - mu1 +# tmp2 = x2[i] - mu2 +# x1nrm += tmp1 * tmp1 +# x2nrm += tmp2 * tmp2 +# x1Tx2 += tmp1 * tmp2 +# +# return (1. - x1Tx2) / sqrt(x1nrm * x2nrm) + + +# ------------------------------------------------------------ +# User-defined distance +# +cdef class PyFuncDistance(DistanceMetric): + """PyFunc Distance + A user-defined distance + Parameters + ---------- + func : function + func should take two numpy arrays as input, and return a distance. + """ + def __init__(self, func, **kwargs): + self.func = func + self.kwargs = kwargs + + # in cython < 0.26, GIL was required to be acquired during definition of + # the function and inside the body of the function. This behaviour is not + # allowed in cython >= 0.26 since it is a redundant GIL acquisition. The + # only way to be back compatible is to inherit `dist` from the base class + # without GIL and called an inline `_dist` which acquire GIL. + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + return self._dist(x1, x2, size) + + cdef inline DTYPE_t _dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) except -1 with gil: + cdef np.ndarray x1arr + cdef np.ndarray x2arr + x1arr = _buffer_to_ndarray(x1, size) + x2arr = _buffer_to_ndarray(x2, size) + d = self.func(x1arr, x2arr, **self.kwargs) + try: + # Cython generates code here that results in a TypeError + # if d is the wrong type. + return d + except TypeError: + raise TypeError("Custom distance function must accept two " + "vectors and return a float.") + + +cdef inline double fmax(double a, double b) nogil: + return max(a, b) diff --git a/sklearn/cluster/_hdbscan/flat.py b/sklearn/cluster/_hdbscan/flat.py new file mode 100644 index 0000000000000..9455ffb963364 --- /dev/null +++ b/sklearn/cluster/_hdbscan/flat.py @@ -0,0 +1,983 @@ +"""flat.py + +Provides alternative functions to hdbscan.HDBSCAN and others to +1. Allow prediction on a flat clustering by specifying 'n_clusters'. + This is done by choosing the best cluster_selection_epsilon that produces + the required number of clusters without adding unnecessary outliers. +2. 
Makes approximate_predict, membership_vector, and + all_points_membership_vectors consistent with cluster_selection_epsilon + +Provides the following functions: +================================== +HDBSCAN_flat: trained HDBSCAN instance with 'n_clusters' clusters + The attributes (labels, probabilities, prediction_data) are tuned to + produce 'n_clusters' clusters. + +approximate_predict_flat: labels and probabilities for novel points + Allows selecting n_clusters for novel points, or using the + original clustering (potentially specified using cluster_selection_epsilon) + +membership_vector_flat: Soft-clustering probabilities for novel points + Similar to approximate_predict_flat, but for soft-clustering. + **Use with caution** + +all_points_membership_vectors_flat: Soft-clustering probabilities + Similar to membership_vector_flat, but for points in training set + **Use with caution** +""" + +import copy +from warnings import warn + +import numpy as np +from ._hdbscan_tree import compute_stability, get_cluster_tree_leaves +from .hdbscan_ import HDBSCAN, _tree_to_labels +from .plots import _bfs_from_cluster_tree +from .prediction import ( + PredictionData, + _find_cluster_and_probability, + _find_neighbor_and_lambda, +) +from ._prediction_utils import ( + get_tree_row_with_child, + dist_membership_vector, + outlier_membership_vector, + prob_in_some_cluster, + all_points_dist_membership_vector, + all_points_outlier_membership_vector, + all_points_prob_in_some_cluster, +) + + +def HDBSCAN_flat( + X, + n_clusters=None, + cluster_selection_epsilon=0.0, + clusterer=None, + inplace=False, + **kwargs, +): + """ + Train a HDBSCAN clusterer by specifying n_clusters. + Or, modify a trained clusterer to return specific n_clusters. + + Parameters + ---------- + X: array-like + Data to be passed to HDBSCAN for training. + + n_clusters: int, default=None + Number of clusters to produce. + If None, revert to default HDBSCAN + + cluster_selection_epsilon: float, default=0. + core-distance below which to stop splitting clusters. + This can indirectly impose n_clusters. + This argument is ignored if n_clusters is supplied. + + clusterer: HDBSCAN, default=None + If supplied, modify this clusterer to produce n_clusters clusters. + + inplace: bool, default=False + If 'clusterer' parameter is supplied, and inplace is True, + modify the previous clusterer inplace. + If False, return a modified copy of the previous clusterer. + + **kwargs: keyword arguments + All init arguments for HDBSCAN + + Returns + ------- + new_clusterer: HDBSCAN + New HDBSCAN instance; returned irrespective of inplace=True or False + + Usage + ----- + # Extract flat clustering from HDBSCAN's hierarchy for 7 clusters + clusterer = HDBSCAN_flat(X_train, n_clusters=7, + min_cluster_size=12, min_samples=8) + labels = clusterer.labels_ + proba = clusterer.probabilities_ + + # Use a previously initialized/trained HDBSCAN + old_clusterer = HDBSCAN(min_cluster_size=12, min_samples=8) + clusterer = HDBSCAN_flat(X_train, n_clusters=7, + clusterer=old_clusterer, inplace=True) + labels = clusterer.labels_ + proba = clusterer.probabilities_ + + See Also + --------- + :py:func:`hdbscan.HDBSCAN` + :py:func:`re_init` + """ + # Handle the trivial case first. 
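+    # (Trivial in the sense that neither n_clusters nor a non-zero epsilon
+    # was requested, so a plain HDBSCAN fit with prediction data suffices.)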
+ if (n_clusters is None) and (cluster_selection_epsilon == 0.0): + if (not isinstance(clusterer, HDBSCAN)) or (not inplace): + # Always generate prediction_data to avoid later woes + kwargs["prediction_data"] = True + new_clusterer = HDBSCAN(**kwargs) + else: + new_clusterer = clusterer + new_clusterer.prediction_data = True + + new_clusterer.fit(X) + return new_clusterer + + if (n_clusters is not None) and (cluster_selection_epsilon != 0.0): + warn( + f"'cluster_selection_epsilon' (={cluster_selection_epsilon})" + " is ignored when 'n_clusters' is supplied." + ) + cluster_selection_epsilon = 0.0 + # This will later be chosen according to n_clusters + + if not isinstance(clusterer, HDBSCAN): + # Initialize and train clusterer if one was not previously supplied. + # Always generate prediction data + kwargs["prediction_data"] = True + new_clusterer = HDBSCAN(**kwargs) + # We do not pass cluster_selection_epsilon here. + # While this adds unnecessary computation, it makes the code + # easier to read and debug. + new_clusterer.fit(X) + else: + if inplace: + new_clusterer = clusterer + else: + new_clusterer = copy.deepcopy(clusterer) + + new_clusterer.prediction_data = True + + # Train on 'X'. Do this even if the supplied clusterer was trained, + # because we want to make sure it fits 'X'. + new_clusterer.prediction_data = True + new_clusterer.fit(X) + + if new_clusterer.cluster_selection_method == "eom": + max_eom_clusters = len(new_clusterer.condensed_tree_._select_clusters()) + + # Pick an epsilon value right after a split produces n_clusters, + # and the don't split further for smaller epsilon (larger lambda) + if n_clusters is not None: + if (new_clusterer.cluster_selection_method == "eom") and ( + n_clusters > max_eom_clusters + ): + warn( + f"Cannot predict more than {max_eom_clusters} with cluster " + "selection method 'eom'. Changing to method 'leaf'..." + ) + new_clusterer.cluster_selection_method = "leaf" + epsilon = select_epsilon(new_clusterer.condensed_tree_, n_clusters) + else: + # Or use the specified cluster_selection_epsilon + epsilon = cluster_selection_epsilon + + new_clusterer.cluster_selection_epsilon = float(epsilon) + + # Extract tree related stuff, in order to re-assign labels + single_linkage_tree = new_clusterer.single_linkage_tree_ + single_linkage_tree = single_linkage_tree.to_numpy() + min_cluster_size = new_clusterer.min_cluster_size + cluster_selection_method = new_clusterer.cluster_selection_method + allow_single_cluster = new_clusterer.allow_single_cluster + match_reference_implementation = False + + # Get labels according to the required cluster_selection_epsilon + output = _tree_to_labels( + None, + single_linkage_tree, + min_cluster_size, + cluster_selection_method, + allow_single_cluster, + match_reference_implementation, + cluster_selection_epsilon=epsilon, + ) + + # Reflect the related changes in HDBSCAN. + ( + new_clusterer.labels_, + new_clusterer.probabilities_, + new_clusterer.cluster_persistence_, + new_clusterer._condensed_tree, + new_clusterer._single_linkage_tree, + ) = output + + # PredictionData attached to HDBSCAN should also change. + # A function re_init is defined in this module to handle this. 
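+    # (Note: re_init, defined below, mutates the PredictionData in place
+    # and returns None.)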
+ re_init( + new_clusterer.prediction_data_, + new_clusterer.condensed_tree_, + cluster_selection_epsilon=epsilon, + ) + return new_clusterer + + +def approximate_predict_flat( + clusterer, + points_to_predict, + n_clusters=None, + cluster_selection_epsilon=None, + prediction_data=None, + return_prediction_data=False, +): + """ + Predict the cluster label of new points at a particular flat clustering, + specified by n_clusters. This is a modified version of + hdbscan.approximate_predict to allow selection of n_clusters. + + Parameters + ---------- + clusterer : HDBSCAN + A clustering object that has been fit to the data and + either had ``prediction_data=True`` set, or called the + ``generate_prediction_data`` method after the fact. + + points_to_predict : array, or array-like (n_samples, n_features) + The new data points to predict cluster labels for. They should + have the same dimensionality as the original dataset over which + clusterer was fit. + + n_clusters: int, default=None + The number of clusters to have in the flat clustering + (over the training data, not points_to_predict) + Ignored when prediction_data is supplied. + + cluster_selection_epsilon: float, default=None + core-distance below which to stop splitting clusters. + This can indirectly impose n_clusters. + This argument is ignored if n_clusters is supplied. + + prediction_data: PredictionData, default=None + If supplied, use this to predict clusters for points_to_predict. + This allows predicting on multiple datasets without corrupting + prediction data associated with clusterer. + + If neither n_clusters, nor prediction_data are supplied, + then the prediction_data associated with clusterer is used. + + return_prediction_data: bool, default=False + If True, return prediction_data along with labels and proba. + + Returns + ------- + labels : array (n_samples,) + The predicted labels of the ``points_to_predict`` + + probabilities : array (n_samples,) + The soft cluster scores for each of the ``points_to_predict`` + + prediction_data: PredictionData, optional + prediction_data used to predict. + Returned if return_prediciton_data is set to True. + + + Usage + ----- + # From a fitted HDBSCAN model, predict for n_clusters=5 + labels, proba = approximate_predict_flat( + clusterer, X_predict, n_clusters=5) + + # Store prediciton data for later use. + labels, proba, pred_data = approximate_predict_flat( + clusterer, X_predict, n_clusters=5, + return_prediction_data=True) + # and use this prediction data to predict on new points + labels1, proba1 = approximate_predict_flat( + clusterer, X_pred1, + prediction_data=pred_data) + + See Also + --------- + :py:func:`hdbscan.prediction.approximate_predict` + """ + # Get number of fitted clusters for later use. + n_clusters_fit = np.sum(np.unique(clusterer.labels_) >= 0) + if n_clusters is not None: + n_clusters = int(n_clusters) # Ensure n_clusters is int + + # We'll need the condensed tree later... 
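+    # (It is needed both to rebuild prediction data for the requested flat
+    # clustering and to assign each new point a label and probability.)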
+ condensed_tree = clusterer.condensed_tree_ + + # If none of the three arguments: prediction_data, n_clusters, + # and cluster_selection_epsilon are supplied, + # then use clusterer's prediciton data directly + if ( + (prediction_data is None) + and ((n_clusters is None) or (n_clusters == n_clusters_fit)) + and (cluster_selection_epsilon is None) + ): + prediction_data = clusterer.prediction_data_ + + # If either of n_clusters or cluster_selection_epsilon were supplied, + # then build prediction data from these by modifying clusterer's + if not isinstance(prediction_data, PredictionData): + if clusterer.prediction_data_ is None: + raise ValueError( + "Clusterer does not have prediction data!" + " Try fitting with prediction_data=True set," + " or run generate_prediction_data on the clusterer" + ) + # Get prediction data from clusterer + prediction_data = clusterer.prediction_data_ + # Modify prediction_data to reflect new n_clusters + # First, make a copy of prediction data to avoid modifying source + prediction_data = copy.deepcopy(prediction_data) + # Cluster selection method is hold by condensed_tree. + # Change from 'eom' to 'leaf' if n_clusters is too large. + if (condensed_tree.cluster_selection_method == "eom") and ( + (n_clusters is not None) and (n_clusters > n_clusters_fit) + ): + warn( + f"Cannot predict more than {n_clusters_fit} with cluster " + "selection method 'eom'. Changing to method 'leaf'..." + ) + condensed_tree.cluster_selection_method = "leaf" + # This change does not affect the tree associated with 'clusterer' + # Re-initialize prediction_data for the specified n_clusters or epsilon + re_init( + prediction_data, + condensed_tree, + n_clusters=n_clusters, + cluster_selection_epsilon=cluster_selection_epsilon, + ) + + # ============================================================ + # Now we're ready to use prediction_data + # The rest of the code is copied from HDBSCAN's approximate_predict, + # but modified to use prediction_data instead of clusterer's attribute + points_to_predict = np.asarray(points_to_predict) + + if points_to_predict.shape[1] != prediction_data.raw_data.shape[1]: + raise ValueError("New points dimension does not match fit data!") + + if prediction_data.cluster_tree.shape[0] == 0: + warn( + "Prediction data does not have any defined clusters, new data" + " will be automatically predicted as noise." 
+ ) + labels = -1 * np.ones(points_to_predict.shape[0], dtype=np.int32) + probabilities = np.zeros(points_to_predict.shape[0], dtype=np.float32) + if return_prediction_data: + return labels, probabilities, prediction_data + else: + return labels, probabilities + + labels = np.empty(points_to_predict.shape[0], dtype=np.int32) + probabilities = np.empty(points_to_predict.shape[0], dtype=np.float64) + + min_samples = clusterer.min_samples or clusterer.min_cluster_size + neighbor_distances, neighbor_indices = prediction_data.tree.query( + points_to_predict, k=2 * min_samples + ) + + for i in range(points_to_predict.shape[0]): + label, prob = _find_cluster_and_probability( + condensed_tree, + prediction_data.cluster_tree, + neighbor_indices[i], + neighbor_distances[i], + prediction_data.core_distances, + prediction_data.cluster_map, + prediction_data.max_lambdas, + min_samples, + ) + labels[i] = label + probabilities[i] = prob + + if return_prediction_data: + return labels, probabilities, prediction_data + else: + return labels, probabilities + + +def membership_vector_flat( + clusterer, + points_to_predict, + prediction_data=None, + n_clusters=None, + cluster_selection_epsilon=0.0, +): + """ + (Adaptation of hdbscan's membership_vector for n_clusters, epsilon) + Predict soft cluster membership probabilities; + a vector for each point in ``points_to_predict`` that gives + a probability that the given point is a member of a cluster + for each of the selected clusters of the ``clusterer``. + + Parameters + ---------- + clusterer: HDBSCAN + A clustering object that has been fit to the data and + either had ``prediction_data=True`` set, or called the + ``generate_prediction_data`` method after the fact. + + points_to_predict: array, or array-like (n_samples, n_features) + The new data points to predict cluster labels for. They should + have the same dimensionality as the original dataset over which + clusterer was fit. + + prediction_data: PredictionData, default=None + Prediction data associated with HDBSCAN for some flat clustering + + n_clusters: int, default=None + Number of clusters over which to compute membership probabilities. + These clusters are obtained as a flat clustering at some + cluster_selection_epsilon. + + cluster_selection_epsilon: float, default=0. + core-distance below which to stop splitting clusters. + This can indirectly impose n_clusters. + This argument is ignored if n_clusters is supplied. + + Note: If neither n_clusters nor cluster_selection_epsilon are supplied, + the clusterer's original clustering is used. + + Returns + ------- + membership_vectors : array (n_samples, n_clusters) + The probability that point ``i`` is a member of cluster ``j`` is + in ``membership_vectors[i, j]``. + + See Also + -------- + :py:func:`hdbscan.predict.membership_vector` + :py:func:`hdbscan.predict.all_points_membership_vectors` + """ + points_to_predict = points_to_predict.astype(np.float64) + # Extract condensed tree for later use + condensed_tree = clusterer.condensed_tree_ + + # Choose flat clustering based on cluster_selection_epsilon or n_clusters. 
+ # If neither is specified, use clusterer's cluster_selection_epsilon + if ( + (n_clusters is None) + and (cluster_selection_epsilon == 0.0) + and (prediction_data is None) + ): + epsilon = clusterer.cluster_selection_epsilon + # Use the same prediction_data as clusterer's + prediction_data = clusterer.prediction_data_ + elif prediction_data is None: + if n_clusters is not None: + # Compute cluster_selection_epsilon so that a flat clustering + # produces a specified number of n_clusters + # With method 'eom', we may fail to get 'n_clusters' clusters. So, + try: + epsilon = select_epsilon(condensed_tree, n_clusters) + except AssertionError: + warn( + f"Failed to predict {n_clusters} clusters with " + "cluster selection method 'eom'. Switching to 'leaf'..." + ) + condensed_tree.cluster_selection_method = "leaf" + epsilon = select_epsilon(condensed_tree, n_clusters) + else: + epsilon = cluster_selection_epsilon + # Create another instance of prediction_data that is consistent + # with the selected value of epsilon. + prediction_data = copy.deepcopy(clusterer.prediction_data_) + re_init(prediction_data, condensed_tree, cluster_selection_epsilon=epsilon) + + # Flat clustering from prediction data + clusters = clusters_from_prediction_data(prediction_data) + + # Initialize probabilities + result = np.empty((points_to_predict.shape[0], clusters.shape[0]), dtype=np.float64) + + # k-NN for prediciton points to training set + min_samples = clusterer.min_samples or clusterer.min_cluster_size + neighbor_distances, neighbor_indices = prediction_data.tree.query( + points_to_predict, k=2 * min_samples + ) + + # Loop over prediction points to compute probabilities + for i in range(points_to_predict.shape[0]): + # We need to find where in the tree the new point would go + # for the purposes of outlier membership approximation + nearest_neighbor, lambda_ = _find_neighbor_and_lambda( + neighbor_indices[i], + neighbor_distances[i], + prediction_data.core_distances, + min_samples, + ) + + # Find row in tree where nearest neighbor drops out, + # so we can get a lambda value for the nearest neighbor + neighbor_tree_row = get_tree_row_with_child( + condensed_tree._raw_tree, nearest_neighbor + ) + + # Assign lambda as min(lambda-to-neighbor, neighbor's-lambda-to-tree) + # Equivalently, this assigns core distance for prediction point as + # max(dist-to-neighbor, neighbor's-dist-to-tree) + if neighbor_tree_row["lambda_val"] <= lambda_: + lambda_ = neighbor_tree_row["lambda_val"] + + # Probabilities based on distance to closest exemplar in each cluster: + # Use new prediction_data that points to exemplars that are specific + # to the choice of n_clusters + distance_vec = dist_membership_vector( + points_to_predict[i], prediction_data.exemplars, prediction_data.dist_metric + ) + # Probabilities based on how long the nearest exemplar persists in + # each cluster (with respect to most persistent exemplar) + # Use new clusters that are defined by the choice of n_clusters. 
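+        # (This outlier-based vector is blended with distance_vec below as
+        # distance_vec**0.5 * outlier_vec**2.0 and then renormalised.)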
+ outlier_vec = outlier_membership_vector( + nearest_neighbor, + lambda_, + clusters, + condensed_tree._raw_tree, + prediction_data.leaf_max_lambdas, + prediction_data.cluster_tree, + ) + + # Merge the two probabilities to produce a single set of probabilities + result[i] = distance_vec**0.5 * outlier_vec**2.0 + result[i] /= result[i].sum() + + # Include probability that the nearest neighbor belongs to a cluster + result[i] *= prob_in_some_cluster( + nearest_neighbor, + lambda_, + clusters, + condensed_tree._raw_tree, + prediction_data.leaf_max_lambdas, + prediction_data.cluster_tree, + ) + + # Rename variable so it's easy to understand what's being returned + membership_vectors = result + return membership_vectors + + +def all_points_membership_vectors_flat( + clusterer, prediction_data=None, n_clusters=None, cluster_selection_epsilon=None +): + """ + (Adaptation of hdbscan's all_points_membership_vector + for n_clusters, epsilon) + Predict soft cluster membership vectors for all points in the + original dataset the clusterer was trained on. This function is more + efficient by making use of the fact that all points are already in the + condensed tree, and processing in bulk. + + Parameters + ---------- + clusterer : HDBSCAN + A clustering object that has been fit to the data and + either had ``prediction_data=True`` set, or called the + ``generate_prediction_data`` method after the fact. + This method does not work if the clusterer was trained + with ``metric='precomputed'``. + + prediction_data: PredictionData, default=None + Prediction data associated with HDBSCAN for some flat clustering + + n_clusters: int, optional, default=None + Number of clusters over which to compute membership probabilities. + These clusters are obtained as a flat clustering at some + cluster_selection_epsilon. + + cluster_selection_epsilon: float, optional, default=None + core-distance below which to stop splitting clusters. + This can indirectly impose n_clusters. + This argument is ignored if n_clusters is supplied. + + Note: If neither n_clusters nor cluster_selection_epsilon are supplied, + the clusterer's original clustering is used. + + Returns + ------- + membership_vectors : array (n_samples, n_clusters) + The probability that point ``i`` of the original dataset is a member of + cluster ``j`` is in ``membership_vectors[i, j]``. + See Also + -------- + :py:func:`hdbscan.prediction.all_points_membership_vectors` + :py:func:`hdbscan.prediction.membership_vector` + """ + # Extract condensed tree for later use + condensed_tree = clusterer.condensed_tree_ + + # Choose flat clustering based on cluster_selection_epsilon or n_clusters. + # If neither is specified, use clusterer's cluster_selection_epsilon + if (n_clusters is None) and (cluster_selection_epsilon is None): + epsilon = clusterer.cluster_selection_epsilon + # Use the same prediction_data as clusterer's + prediction_data = clusterer.prediction_data_ + elif prediction_data is None: + if n_clusters is not None: + # Compute cluster_selection_epsilon so that a flat clustering + # produces a specified number of n_clusters + # With method 'eom', we may fail to get 'n_clusters' clusters. So, + try: + epsilon = select_epsilon(condensed_tree, n_clusters) + except AssertionError: + warn( + f"Failed to predict {n_clusters} clusters with " + "cluster selection method 'eom'. Switching to 'leaf'..." 
+ ) + condensed_tree.cluster_selection_method = "leaf" + epsilon = select_epsilon(condensed_tree, n_clusters) + else: + epsilon = cluster_selection_epsilon + # Create another instance of prediction_data that is consistent + # with the selected value of epsilon. + prediction_data = copy.deepcopy(clusterer.prediction_data_) + re_init(prediction_data, condensed_tree, cluster_selection_epsilon=epsilon) + + # Flat clustering at the chosen epsilon from prediction_data + clusters = clusters_from_prediction_data(prediction_data) + + all_points = prediction_data.raw_data + + # When no clusters found, return array of 0's + if clusters.size == 0: + return np.zeros(all_points.shape[0]) + + # Probabilities based on distance to closest exemplar in each cluster: + # Use new prediction_data that points to exemplars that are specific + # to the choice of n_clusters + distance_vecs = all_points_dist_membership_vector( + all_points, prediction_data.exemplars, prediction_data.dist_metric + ) + + # Probabilities based on how long the point persists in + # each cluster (with respect to most persistent exemplar) + # Use new clusters that are defined by the choice of n_clusters. + outlier_vecs = all_points_outlier_membership_vector( + clusters, + condensed_tree._raw_tree, + prediction_data.leaf_max_lambdas, + prediction_data.cluster_tree, + ) + + # Include probability that the point belongs to a cluster + in_cluster_probs = all_points_prob_in_some_cluster( + clusters, + condensed_tree._raw_tree, + prediction_data.leaf_max_lambdas, + prediction_data.cluster_tree, + ) + + # Aggregate the three probabilities to produce membership vectors + result = distance_vecs * outlier_vecs + row_sums = result.sum(axis=1) + result = result / row_sums[:, np.newaxis] + result *= in_cluster_probs[:, np.newaxis] + + # Re-name variable to clarify what's being returned. + membership_vectors = result + return membership_vectors + + +def select_epsilon(condensed_tree, n_clusters): + """ + Pick optimal epsilon from condensed tree based on n_clusters, + calls functions specific to 'eom' or 'leaf' selection methods + """ + cluster_selection_method = condensed_tree.cluster_selection_method + if cluster_selection_method == "eom": + return select_epsilon_eom(condensed_tree, n_clusters) + if cluster_selection_method == "leaf": + return select_epsilon_leaf(condensed_tree, n_clusters) + raise ValueError( + 'Invalid Cluster Selection Method: %s\nShould be one of: "eom", "leaf"\n' + ) + + +def select_epsilon_eom(condensed_tree, n_clusters): + """ + Select epsilon so that persistence-based clustering, + after truncating the tree at the above epsilon, + has exactly 'n_clusters' clusters + """ + # With method 'eom', max clusters are produced for epsilon=0, + # as computed by + eom_base_clusters = condensed_tree._select_clusters() + max_clusters = len(eom_base_clusters) + # Increasing epsilon can only reduce the number of ouput clusters. + + assert n_clusters <= max_clusters, ( + f"Cannot produce more than {max_clusters} with method 'eom'. " + + "Use method 'leaf' instead to extract flat clustering." + ) + + tree = condensed_tree._raw_tree + # To select epsilon, consider all values where clusters are split + cluster_lambdas = tree["lambda_val"][tree["child_size"] > 1] + candidate_epsilons = 1.0 / np.unique(cluster_lambdas) - 1.0e-12 + # Subtract the extra e-12 to avoid numerical errors in comparison + # Then, we avoid splitting for all epsilon below this. 
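+    # (Since epsilon = 1 / lambda, e.g. lambdas of 0.5, 1.0 and 2.0 yield
+    # candidate epsilons of ~2.0, ~1.0 and ~0.5; scanning them from largest
+    # to smallest moves from the coarsest clustering towards the finest.)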
+ candidate_epsilons = np.sort(candidate_epsilons)[::-1] + + for epsilon in candidate_epsilons: + sel_clusters = _new_select_clusters(condensed_tree, epsilon) + if len(sel_clusters) == n_clusters: + break + else: + raise RuntimeError("Could not find epsilon") + + return epsilon + + +def select_epsilon_leaf(condensed_tree, n_clusters): + """ + Select epsilon so that the leaves of condensed tree, + after truncating at the above epsilon, + has exactly 'n_clusters' clusters + """ + # Use an epsilon value that produces the right number of clusters. + # The condensed tree of HDBSCAN has this information. + # Extract the lambda levels (=1/distance) from the condensed tree + lambdas = condensed_tree._raw_tree["lambda_val"] + # We don't want values that produce a large cluster and + # just one or two individual points. + child_sizes = condensed_tree._raw_tree["child_size"] + child_sizes = child_sizes.astype(int) + # Keep only those lambda values corresponding to cluster separation; + # i.e., with child_sizes > 1 + lambdas = lambdas[child_sizes > 1] + # Get the unique values, because when two clusters fall out of one, + # the entry with lambda is repeated. + lambdas = np.unique(lambdas.astype(float)) + if n_clusters > len(lambdas) + 1: + warn( + f"HDBSCAN can only compute {len(lambdas)+1} clusters. " + f"Setting n_clusters to {len(lambdas)+1}..." + ) + n_clusters = len(lambdas) + 1 + + # lambda values are sorted by np.unique. + # Now, get epsilon (distance threshold) as 1/lambda + epsilon = 1.0 / lambdas[n_clusters - 2] + # At this epsilon, n_clusters have been split. + # Stop splits at epsilons smaller than this. + # To allow for numerical errors, + return epsilon - 1.0e-12 + + +def re_init(predData, condensed_tree, n_clusters=None, cluster_selection_epsilon=0.0): + """ + Modify PredictionData of HDBSCAN to account for epsilon. + epsilon is the cluster_selection_epsilon that controls granularity + of clusters; Large epsilon => More clusters + + Parameters + ---------- + predData: PredictionData + Contains data to use for predicting novel points. + Defined in the HDBSCAN module + + condensed_tree: CondensedTree + Tree structure that contains hierarchical clustering. + Defined in the HDBSCAN module + + n_clusters: int, optional, default=None + If specified, use this to obtain cluster_selection_epsilon + from CondensedTree; Overrides cluster_selection_epsilon parameter + + cluster_selection_epsilon: float, default=0. + In cluster tree, nodes are not split further beyond (>=) this value. + epsilon is the inverse of core distance. + + Returns + ------- + None + """ + # predData must be a pre-trained PredictionData instance from hdbscan + # If n_clusters is specified, compute cluster_selection_epsilon; + if n_clusters is not None: + cluster_selection_epsilon = select_epsilon(condensed_tree, n_clusters) + + # This is the key modification: + # Select clusters according to selection method and epsilon. + selected_clusters = _new_select_clusters(condensed_tree, cluster_selection_epsilon) + # _new_select_clusters is a modification of get_clusters + # from hdbscan._hdbscan_tree + + # raw tree, used later to get exemplars and lambda values + raw_condensed_tree = condensed_tree._raw_tree + + # Re-do the cluster map: Map cluster numbers in tree (N, N+1, ..) 
+ # to the cluster labels produced as output + predData.cluster_map = { + int(c): n for n, c in enumerate(sorted(list(selected_clusters))) + } + predData.reverse_cluster_map = {n: c for c, n in predData.cluster_map.items()} + + # Re-compute lambdas and exemplars for selected clusters; + predData.max_lambdas = {} + predData.exemplars = [] + + for cluster in selected_clusters: + # max_lambda <=> smallest distance <=> most persistent point(s) + predData.max_lambdas[cluster] = raw_condensed_tree["lambda_val"][ + raw_condensed_tree["parent"] == cluster + ].max() + + # Map all sub-clusters of selected cluster to the selected cluster's + # label in output. + # Map lambdas too... + for sub_cluster in predData._clusters_below(cluster): + predData.cluster_map[sub_cluster] = predData.cluster_map[cluster] + predData.max_lambdas[sub_cluster] = predData.max_lambdas[cluster] + + # Create set of exemplar points for later use. + # Novel points are assigned based on cluster of closest exemplar. + cluster_exemplars = np.array([], dtype=np.int64) + # For each selected cluster, get all of its leaves, + # and leaves of leaves, and so on... + for leaf in predData._recurse_leaf_dfs(cluster): + # Largest lambda => Most persistent points + leaf_max_lambda = raw_condensed_tree["lambda_val"][ + raw_condensed_tree["parent"] == leaf + ].max() + # Get the most persistent points + points = raw_condensed_tree["child"][ + (raw_condensed_tree["parent"] == leaf) + & (raw_condensed_tree["lambda_val"] == leaf_max_lambda) + ] + # Add most persistent points as exemplars + cluster_exemplars = np.hstack([cluster_exemplars, points]) + + # Add exemplars for each leaf of each selected cluster. + predData.exemplars.append(predData.raw_data[cluster_exemplars]) + return + + +def _new_select_clusters( + condensed_tree, + cluster_selection_epsilon, + allow_single_cluster=False, + match_reference_implementation=False, +): + """ + Adaptation of get_clusters from hdbscan._hdbscan_tree. + Avoids the label and proba computation at the end, + and returns only the selected clusters instead. 
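+    The selection is returned as a set of condensed-tree node ids (ints).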
+ """ + tree = condensed_tree._raw_tree + cluster_selection_method = condensed_tree.cluster_selection_method + stability = compute_stability(tree) + + if allow_single_cluster: + node_list = sorted(stability.keys(), reverse=True) + else: + node_list = sorted(stability.keys(), reverse=True)[:-1] + # (exclude root) + + cluster_tree = tree[tree["child_size"] > 1] + is_cluster = {cluster: True for cluster in node_list} + + if cluster_selection_method == "eom": + for node in node_list: + child_selection = cluster_tree["parent"] == node + subtree_stability = np.sum( + [stability[child] for child in cluster_tree["child"][child_selection]] + ) + if subtree_stability > stability[node]: + is_cluster[node] = False + stability[node] = subtree_stability + else: + for sub_node in _bfs_from_cluster_tree(cluster_tree, node): + if sub_node != node: + is_cluster[sub_node] = False + + if cluster_selection_epsilon != 0.0: + eom_clusters = set([c for c in is_cluster if is_cluster[c]]) + selected_clusters = epsilon_search( + eom_clusters, + cluster_tree, + cluster_selection_epsilon, + allow_single_cluster, + ) + for c in is_cluster: + if c in selected_clusters: + is_cluster[c] = True + else: + is_cluster[c] = False + + elif cluster_selection_method == "leaf": + leaves = set(get_cluster_tree_leaves(cluster_tree)) + if len(leaves) == 0: + for c in is_cluster: + is_cluster[c] = False + is_cluster[tree["parent"].min()] = True + + if cluster_selection_epsilon != 0.0: + selected_clusters = epsilon_search( + leaves, cluster_tree, cluster_selection_epsilon, allow_single_cluster + ) + else: + selected_clusters = leaves + + for c in is_cluster: + if c in selected_clusters: + is_cluster[c] = True + else: + is_cluster[c] = False + else: + raise ValueError( + 'Invalid Cluster Selection Method: %s\nShould be one of: "eom", "leaf"\n' + ) + + clusters = set([int(c) for c in is_cluster if is_cluster[c]]) + return clusters + + +def epsilon_search( + leaves, cluster_tree, cluster_selection_epsilon, allow_single_cluster +): + selected_clusters = [] + processed = [] + + for leaf in leaves: + eps = 1 / cluster_tree["lambda_val"][cluster_tree["child"] == leaf][0] + if eps < cluster_selection_epsilon: + if leaf not in processed: + epsilon_child = traverse_upwards( + cluster_tree, cluster_selection_epsilon, leaf, allow_single_cluster + ) + if hasattr(epsilon_child, "__len__"): + epsilon_child = epsilon_child[0] + + selected_clusters.append(epsilon_child) + + for sub_node in _bfs_from_cluster_tree(cluster_tree, epsilon_child): + if sub_node != epsilon_child: + processed.append(sub_node) + else: + selected_clusters.append(leaf) + + return set(selected_clusters) + + +def traverse_upwards( + cluster_tree, cluster_selection_epsilon, leaf, allow_single_cluster +): + root = cluster_tree["parent"].min() + parent = cluster_tree[cluster_tree["child"] == leaf]["parent"] + if parent == root: + if allow_single_cluster: + return parent + else: + return leaf # return node closest to root + + parent_eps = 1 / cluster_tree[cluster_tree["child"] == parent]["lambda_val"] + if parent_eps > cluster_selection_epsilon: + return parent + else: + return traverse_upwards( + cluster_tree, cluster_selection_epsilon, parent, allow_single_cluster + ) + + +def clusters_from_prediction_data(prediction_data): + """ + Extract selected clusters from PredictionData instance. 
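+    Returns a sorted array (dtype ``np.intp``) of the raw cluster ids held
+    in ``prediction_data.reverse_cluster_map``.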
+ """ + return np.array(sorted(list(prediction_data.reverse_cluster_map.values()))).astype( + np.intp + ) diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py new file mode 100644 index 0000000000000..89b2590ea27a5 --- /dev/null +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -0,0 +1,1525 @@ +# -*- coding: utf-8 -*- +""" +HDBSCAN: Hierarchical Density-Based Spatial Clustering + of Applications with Noise +""" + +import numpy as np +from numpy import isclose + +from sklearn.base import BaseEstimator, ClusterMixin +from sklearn.metrics import pairwise_distances +from scipy.sparse import issparse +from sklearn.neighbors import KDTree, BallTree +from joblib import Memory +from warnings import warn +from sklearn.utils import check_array +from joblib.parallel import cpu_count + +from scipy.sparse import csgraph + +from ._hdbscan_linkage import ( + mst_linkage_core, + mst_linkage_core_vector, + label, +) +from ._hdbscan_tree import ( + condense_tree, + compute_stability, + get_clusters, + outlier_scores, +) +from ._hdbscan_reachability import mutual_reachability, sparse_mutual_reachability + +from ._hdbscan_boruvka import KDTreeBoruvkaAlgorithm, BallTreeBoruvkaAlgorithm +from .dist_metrics import DistanceMetric + +from .plots import CondensedTree, SingleLinkageTree, MinimumSpanningTree +from .prediction import PredictionData + +FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics + ["cosine", "arccos"] + +# Author: Leland McInnes +# Steve Astels +# John Healy +# +# License: BSD 3 clause + + +def _tree_to_labels( + X, + single_linkage_tree, + min_cluster_size=10, + cluster_selection_method="eom", + allow_single_cluster=False, + match_reference_implementation=False, + cluster_selection_epsilon=0.0, + max_cluster_size=0, +): + """Converts a pretrained tree and cluster size into a + set of labels and probabilities. + """ + condensed_tree = condense_tree(single_linkage_tree, min_cluster_size) + stability_dict = compute_stability(condensed_tree) + labels, probabilities, stabilities = get_clusters( + condensed_tree, + stability_dict, + cluster_selection_method, + allow_single_cluster, + match_reference_implementation, + cluster_selection_epsilon, + max_cluster_size, + ) + + return (labels, probabilities, stabilities, condensed_tree, single_linkage_tree) + + +def _hdbscan_generic( + X, + min_samples=5, + alpha=1.0, + metric="minkowski", + p=2, + leaf_size=None, + gen_min_span_tree=False, + **kwargs, +): + if metric == "minkowski": + distance_matrix = pairwise_distances(X, metric=metric, p=p) + elif metric == "arccos": + distance_matrix = pairwise_distances(X, metric="cosine", **kwargs) + elif metric == "precomputed": + # Treating this case explicitly, instead of letting + # sklearn.metrics.pairwise_distances handle it, + # enables the usage of numpy.inf in the distance + # matrix to indicate missing distance information. 
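+        # (Entries left as np.inf may produce infinite edge weights in the
+        # minimum spanning tree; a UserWarning is raised further below when
+        # that happens.)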
+ # TODO: Check if copying is necessary + distance_matrix = X.copy() + else: + distance_matrix = pairwise_distances(X, metric=metric, **kwargs) + + if issparse(distance_matrix): + # raise TypeError('Sparse distance matrices not yet supported') + return _hdbscan_sparse_distance_matrix( + distance_matrix, + min_samples, + alpha, + metric, + p, + leaf_size, + gen_min_span_tree, + **kwargs, + ) + + mutual_reachability_ = mutual_reachability(distance_matrix, min_samples, alpha) + + min_spanning_tree = mst_linkage_core(mutual_reachability_) + + # Warn if the MST couldn't be constructed around the missing distances + if np.isinf(min_spanning_tree.T[2]).any(): + warn( + "The minimum spanning tree contains edge weights with value " + "infinity. Potentially, you are missing too many distances " + "in the initial distance matrix for the given neighborhood " + "size.", + UserWarning, + ) + + # mst_linkage_core does not generate a full minimal spanning tree + # If a tree is required then we must build the edges from the information + # returned by mst_linkage_core (i.e. just the order of points to be merged) + if gen_min_span_tree: + result_min_span_tree = min_spanning_tree.copy() + for index, row in enumerate(result_min_span_tree[1:], 1): + candidates = np.where(isclose(mutual_reachability_[int(row[1])], row[2]))[0] + candidates = np.intersect1d( + candidates, min_spanning_tree[:index, :2].astype(int) + ) + candidates = candidates[candidates != row[1]] + assert len(candidates) > 0 + row[0] = candidates[0] + else: + result_min_span_tree = None + + # Sort edges of the min_spanning_tree by weight + min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :] + + # Convert edge list into standard hierarchical clustering format + single_linkage_tree = label(min_spanning_tree) + + return single_linkage_tree, result_min_span_tree + + +def _hdbscan_sparse_distance_matrix( + X, + min_samples=5, + alpha=1.0, + metric="minkowski", + p=2, + leaf_size=40, + gen_min_span_tree=False, + **kwargs, +): + assert issparse(X) + # Check for connected component on X + if csgraph.connected_components(X, directed=False, return_labels=False) > 1: + raise ValueError( + "Sparse distance matrix has multiple connected " + "components!\nThat is, there exist groups of points " + "that are completely disjoint -- there are no distance " + "relations connecting them\n" + "Run hdbscan on each component." + ) + + lil_matrix = X.tolil() + + # Compute sparse mutual reachability graph + # if max_dist > 0, max distance to use when the reachability is infinite + max_dist = kwargs.get("max_dist", 0.0) + mutual_reachability_ = sparse_mutual_reachability( + lil_matrix, min_points=min_samples, max_dist=max_dist, alpha=alpha + ) + # Check connected component on mutual reachability + # If more than one component, it means that even if the distance matrix X + # has one component, there exists with less than `min_samples` neighbors + if ( + csgraph.connected_components( + mutual_reachability_, directed=False, return_labels=False + ) + > 1 + ): + raise ValueError( + "There exists points with less than %s neighbors. " + "Ensure your distance matrix has non zeros values for " + "at least `min_sample`=%s neighbors for each points (i.e. K-nn graph), " + "or specify a `max_dist` to use when distances are missing." 
+ % (min_samples, min_samples) + ) + + # Compute the minimum spanning tree for the sparse graph + sparse_min_spanning_tree = csgraph.minimum_spanning_tree(mutual_reachability_) + + # Convert the graph to scipy cluster array format + nonzeros = sparse_min_spanning_tree.nonzero() + nonzero_vals = sparse_min_spanning_tree[nonzeros] + min_spanning_tree = np.vstack(nonzeros + (nonzero_vals,)).T + + # Sort edges of the min_spanning_tree by weight + min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :][0] + + # Convert edge list into standard hierarchical clustering format + single_linkage_tree = label(min_spanning_tree) + + if gen_min_span_tree: + return single_linkage_tree, min_spanning_tree + else: + return single_linkage_tree, None + + +def _hdbscan_prims_kdtree( + X, + min_samples=5, + alpha=1.0, + metric="minkowski", + p=2, + leaf_size=40, + gen_min_span_tree=False, + **kwargs, +): + if X.dtype != np.float64: + X = X.astype(np.float64) + + # The Cython routines used require contiguous arrays + if not X.flags["C_CONTIGUOUS"]: + X = np.array(X, dtype=np.double, order="C") + + tree = KDTree(X, metric=metric, leaf_size=leaf_size, **kwargs) + + # TO DO: Deal with p for minkowski appropriately + dist_metric = DistanceMetric.get_metric(metric, **kwargs) + + # Get distance to kth nearest neighbour + core_distances = tree.query( + X, k=min_samples + 1, dualtree=True, breadth_first=True + )[0][:, -1].copy(order="C") + + # Mutual reachability distance is implicit in mst_linkage_core_vector + min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha) + + # Sort edges of the min_spanning_tree by weight + min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :] + + # Convert edge list into standard hierarchical clustering format + single_linkage_tree = label(min_spanning_tree) + + if gen_min_span_tree: + return single_linkage_tree, min_spanning_tree + else: + return single_linkage_tree, None + + +def _hdbscan_prims_balltree( + X, + min_samples=5, + alpha=1.0, + metric="minkowski", + p=2, + leaf_size=40, + gen_min_span_tree=False, + **kwargs, +): + if X.dtype != np.float64: + X = X.astype(np.float64) + + # The Cython routines used require contiguous arrays + if not X.flags["C_CONTIGUOUS"]: + X = np.array(X, dtype=np.double, order="C") + + tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs) + + dist_metric = DistanceMetric.get_metric(metric, **kwargs) + + # Get distance to kth nearest neighbour + core_distances = tree.query( + X, k=min_samples + 1, dualtree=True, breadth_first=True + )[0][:, -1].copy(order="C") + + # Mutual reachability distance is implicit in mst_linkage_core_vector + min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha) + # Sort edges of the min_spanning_tree by weight + min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :] + # Convert edge list into standard hierarchical clustering format + single_linkage_tree = label(min_spanning_tree) + + if gen_min_span_tree: + return single_linkage_tree, min_spanning_tree + else: + return single_linkage_tree, None + + +def _hdbscan_boruvka_kdtree( + X, + min_samples=5, + alpha=1.0, + metric="minkowski", + p=2, + leaf_size=40, + approx_min_span_tree=True, + gen_min_span_tree=False, + core_dist_n_jobs=4, + **kwargs, +): + if leaf_size < 3: + leaf_size = 3 + + if core_dist_n_jobs < 1: + core_dist_n_jobs = max(cpu_count() + 1 + core_dist_n_jobs, 1) + + if X.dtype != np.float64: + X = X.astype(np.float64) + + tree = 
KDTree(X, metric=metric, leaf_size=leaf_size, **kwargs) + alg = KDTreeBoruvkaAlgorithm( + tree, + min_samples, + metric=metric, + leaf_size=leaf_size // 3, + approx_min_span_tree=approx_min_span_tree, + n_jobs=core_dist_n_jobs, + **kwargs, + ) + min_spanning_tree = alg.spanning_tree() + # Sort edges of the min_spanning_tree by weight + row_order = np.argsort(min_spanning_tree.T[2]) + min_spanning_tree = min_spanning_tree[row_order, :] + # Convert edge list into standard hierarchical clustering format + single_linkage_tree = label(min_spanning_tree) + + if gen_min_span_tree: + return single_linkage_tree, min_spanning_tree + else: + return single_linkage_tree, None + + +def _hdbscan_boruvka_balltree( + X, + min_samples=5, + alpha=1.0, + metric="minkowski", + p=2, + leaf_size=40, + approx_min_span_tree=True, + gen_min_span_tree=False, + core_dist_n_jobs=4, + **kwargs, +): + if leaf_size < 3: + leaf_size = 3 + + if core_dist_n_jobs < 1: + core_dist_n_jobs = max(cpu_count() + 1 + core_dist_n_jobs, 1) + + if X.dtype != np.float64: + X = X.astype(np.float64) + + tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs) + alg = BallTreeBoruvkaAlgorithm( + tree, + min_samples, + metric=metric, + leaf_size=leaf_size // 3, + approx_min_span_tree=approx_min_span_tree, + n_jobs=core_dist_n_jobs, + **kwargs, + ) + min_spanning_tree = alg.spanning_tree() + # Sort edges of the min_spanning_tree by weight + min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :] + # Convert edge list into standard hierarchical clustering format + single_linkage_tree = label(min_spanning_tree) + + if gen_min_span_tree: + return single_linkage_tree, min_spanning_tree + else: + return single_linkage_tree, None + + +def check_precomputed_distance_matrix(X): + """ + Perform check_array(X) after removing infinite values (numpy.inf) + from the given distance matrix. + """ + tmp = X.copy() + tmp[np.isinf(tmp)] = 1 + check_array(tmp) + + +def remap_condensed_tree(tree, internal_to_raw, outliers): + """ + Takes an internal condensed_tree structure and adds back in a set of points + that were initially detected as non-finite and returns that new tree. + These points will all be split off from the maximal node at lambda zero and + considered noise points. + + Parameters + ---------- + tree: condensed_tree + internal_to_raw: dict + a mapping from internal integer index to the raw integer index + finite_index: ndarray + Boolean array of which entries in the raw data were finite + """ + finite_count = len(internal_to_raw) + + outlier_count = len(outliers) + for i, (parent, child, lambda_val, child_size) in enumerate(tree): + if child < finite_count: + child = internal_to_raw[child] + else: + child = child + outlier_count + tree[i] = (parent + outlier_count, child, lambda_val, child_size) + + outlier_list = [] + root = tree[0][0] # Should I check to be sure this is the minimal lambda? + for outlier in outliers: + outlier_list.append((root, outlier, 0, 1)) + + outlier_tree = np.array( + outlier_list, + dtype=[ + ("parent", np.intp), + ("child", np.intp), + ("lambda_val", float), + ("child_size", np.intp), + ], + ) + tree = np.append(outlier_tree, tree) + return tree + + +def remap_single_linkage_tree(tree, internal_to_raw, outliers): + """ + Takes an internal single_linkage_tree structure and adds back in a set of points + that were initially detected as non-finite and returns that new tree. + These points will all be merged into the final node at np.inf distance and + considered noise points. 
+ + Parameters + ---------- + tree: single_linkage_tree + internal_to_raw: dict + a mapping from internal integer index to the raw integer index + finite_index: ndarray + Boolean array of which entries in the raw data were finite + """ + finite_count = len(internal_to_raw) + + outlier_count = len(outliers) + for i, (left, right, distance, size) in enumerate(tree): + if left < finite_count: + tree[i, 0] = internal_to_raw[left] + else: + tree[i, 0] = left + outlier_count + if right < finite_count: + tree[i, 1] = internal_to_raw[right] + else: + tree[i, 1] = right + outlier_count + + outlier_tree = np.zeros((len(outliers), 4)) + last_cluster_id = tree[tree.shape[0] - 1][0:2].max() + last_cluster_size = tree[tree.shape[0] - 1][3] + for i, outlier in enumerate(outliers): + outlier_tree[i] = (outlier, last_cluster_id + 1, np.inf, last_cluster_size + 1) + last_cluster_id += 1 + last_cluster_size += 1 + tree = np.vstack([tree, outlier_tree]) + return tree + + +def is_finite(matrix): + """Returns true only if all the values of a ndarray or sparse matrix are finite""" + if issparse(matrix): + return np.alltrue(np.isfinite(matrix.tocoo().data)) + else: + return np.alltrue(np.isfinite(matrix)) + + +def get_finite_row_indices(matrix): + """ + Returns the indices of the purely finite rows of a + sparse matrix or dense ndarray + """ + if issparse(matrix): + row_indices = np.array( + [i for i, row in enumerate(matrix.tolil().data) if np.all(np.isfinite(row))] + ) + else: + row_indices = np.where(np.isfinite(matrix).sum(axis=1) == matrix.shape[1])[0] + return row_indices + + +def hdbscan( + X, + min_cluster_size=5, + min_samples=None, + alpha=1.0, + cluster_selection_epsilon=0.0, + max_cluster_size=0, + metric="minkowski", + p=2, + leaf_size=40, + algorithm="best", + memory=Memory(cachedir=None, verbose=0), + approx_min_span_tree=True, + gen_min_span_tree=False, + core_dist_n_jobs=4, + cluster_selection_method="eom", + allow_single_cluster=False, + match_reference_implementation=False, + **kwargs, +): + """Perform HDBSCAN clustering from a vector array or distance matrix. + + Parameters + ---------- + X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ + array of shape (n_samples, n_samples) + A feature array, or array of distances between samples if + ``metric='precomputed'``. + + min_cluster_size : int, optional (default=5) + The minimum number of samples in a group for that group to be + considered a cluster; groupings smaller than this size will be left + as noise. + + min_samples : int, optional (default=None) + The number of samples in a neighborhood for a point + to be considered as a core point. This includes the point itself. + defaults to the min_cluster_size. + + cluster_selection_epsilon: float, optional (default=0.0) + A distance threshold. Clusters below this value will be merged. + See [3]_ for more information. Note that this should not be used + if we want to predict the cluster labels for new points in future + (e.g. using approximate_predict), as the approximate_predict function + is not aware of this argument. + + alpha : float, optional (default=1.0) + A distance scaling parameter as used in robust single linkage. + See [2]_ for more information. + + max_cluster_size : int, optional (default=0) + A limit to the size of clusters returned by the eom algorithm. + Has no effect when using leaf clustering (where clusters are + usually small regardless) and can also be overridden in rare + cases by a high value for cluster_selection_epsilon. 
Note that + this should not be used if we want to predict the cluster labels + for new points in future (e.g. using approximate_predict), as + the approximate_predict function is not aware of this argument. + + metric : string or callable, optional (default='minkowski') + The metric to use when calculating distance between instances in a + feature array. If metric is a string or callable, it must be one of + the options allowed by metrics.pairwise.pairwise_distances for its + metric parameter. + If metric is "precomputed", X is assumed to be a distance matrix and + must be square. + + p : int, optional (default=2) + p value to use if using the minkowski metric. + + leaf_size : int, optional (default=40) + Leaf size for trees responsible for fast nearest + neighbour queries. + + algorithm : string, optional (default='best') + Exactly which algorithm to use; hdbscan has variants specialised + for different characteristics of the data. By default this is set + to ``best`` which chooses the "best" algorithm given the nature of + the data. You can force other options if you believe you know + better. Options are: + * ``best`` + * ``generic`` + * ``prims_kdtree`` + * ``prims_balltree`` + * ``boruvka_kdtree`` + * ``boruvka_balltree`` + + memory : instance of joblib.Memory or string, optional + Used to cache the output of the computation of the tree. + By default, no caching is done. If a string is given, it is the + path to the caching directory. + + approx_min_span_tree : bool, optional (default=True) + Whether to accept an only approximate minimum spanning tree. + For some algorithms this can provide a significant speedup, but + the resulting clustering may be of marginally lower quality. + If you are willing to sacrifice speed for correctness you may want + to explore this; in general this should be left at the default True. + + gen_min_span_tree : bool, optional (default=False) + Whether to generate the minimum spanning tree for later analysis. + + core_dist_n_jobs : int, optional (default=4) + Number of parallel jobs to run in core distance computations (if + supported by the specific algorithm). For ``core_dist_n_jobs`` + below -1, (n_cpus + 1 + core_dist_n_jobs) are used. + + cluster_selection_method : string, optional (default='eom') + The method used to select clusters from the condensed tree. The + standard approach for HDBSCAN* is to use an Excess of Mass algorithm + to find the most persistent clusters. Alternatively you can instead + select the clusters at the leaves of the tree -- this provides the + most fine grained and homogeneous clusters. Options are: + * ``eom`` + * ``leaf`` + + allow_single_cluster : bool, optional (default=False) + By default HDBSCAN* will not produce a single cluster, setting this + to t=True will override this and allow single cluster results in + the case that you feel this is a valid result for your dataset. + (default False) + + match_reference_implementation : bool, optional (default=False) + There exist some interpretational differences between this + HDBSCAN* implementation and the original authors reference + implementation in Java. This can result in very minor differences + in clustering results. Setting this flag to True will, at a some + performance cost, ensure that the clustering results match the + reference implementation. + + **kwargs : optional + Arguments passed to the distance metric + + Returns + ------- + labels : ndarray, shape (n_samples, ) + Cluster labels for each point. Noisy samples are given the label -1. 
+ + probabilities : ndarray, shape (n_samples, ) + Cluster membership strengths for each point. Noisy samples are assigned + 0. + + cluster_persistence : array, shape (n_clusters, ) + A score of how persistent each cluster is. A score of 1.0 represents + a perfectly stable cluster that persists over all distance scales, + while a score of 0.0 represents a perfectly ephemeral cluster. These + scores can be guage the relative coherence of the clusters output + by the algorithm. + + condensed_tree : record array + The condensed cluster hierarchy used to generate clusters. + + single_linkage_tree : ndarray, shape (n_samples - 1, 4) + The single linkage tree produced during clustering in scipy + hierarchical clustering format + (see http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html). + + min_spanning_tree : ndarray, shape (n_samples - 1, 3) + The minimum spanning as an edgelist. If gen_min_span_tree was False + this will be None. + + References + ---------- + + .. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April). + Density-based clustering based on hierarchical density estimates. + In Pacific-Asia Conference on Knowledge Discovery and Data Mining + (pp. 160-172). Springer Berlin Heidelberg. + + .. [2] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the + cluster tree. In Advances in Neural Information Processing Systems + (pp. 343-351). + + .. [3] Malzer, C., & Baum, M. (2019). A Hybrid Approach To Hierarchical + Density-based Cluster Selection. arxiv preprint 1911.02282. + """ + if min_samples is None: + min_samples = min_cluster_size + + if type(min_samples) is not int or type(min_cluster_size) is not int: + raise ValueError("Min samples and min cluster size must be integers!") + + if min_samples <= 0 or min_cluster_size <= 0: + raise ValueError("Min samples and Min cluster size must be positive integers") + + if min_cluster_size == 1: + raise ValueError("Min cluster size must be greater than one") + + if type(cluster_selection_epsilon) is int: + cluster_selection_epsilon = float(cluster_selection_epsilon) + + if type(cluster_selection_epsilon) is not float or cluster_selection_epsilon < 0.0: + raise ValueError("Epsilon must be a float value greater than or equal to 0!") + + if not isinstance(alpha, float) or alpha <= 0.0: + raise ValueError("Alpha must be a positive float value greater than 0!") + + if leaf_size < 1: + raise ValueError("Leaf size must be greater than 0!") + + if metric == "minkowski": + if p is None: + raise TypeError("Minkowski metric given but no p value supplied!") + if p < 0: + raise ValueError("Minkowski metric with negative p value is not defined!") + + if match_reference_implementation: + min_samples = min_samples - 1 + min_cluster_size = min_cluster_size + 1 + approx_min_span_tree = False + + if cluster_selection_method not in ("eom", "leaf"): + raise ValueError( + 'Invalid Cluster Selection Method: %s\nShould be one of: "eom", "leaf"\n' + ) + + # Checks input and converts to an nd-array where possible + if metric != "precomputed" or issparse(X): + X = check_array(X, accept_sparse="csr", force_all_finite=False) + else: + # Only non-sparse, precomputed distance matrices are handled here + # and thereby allowed to contain numpy.inf for missing distances + check_precomputed_distance_matrix(X) + + # Python 2 and 3 compliant string_type checking + if isinstance(memory, str): + memory = Memory(cachedir=memory, verbose=0) + + size = X.shape[0] + min_samples = min(size - 1, min_samples) + if min_samples == 0: + min_samples = 1 + + if 
algorithm != "best": + if metric != "precomputed" and issparse(X) and algorithm != "generic": + raise ValueError("Sparse data matrices only support algorithm 'generic'.") + + if algorithm == "generic": + (single_linkage_tree, result_min_span_tree) = memory.cache( + _hdbscan_generic + )(X, min_samples, alpha, metric, p, leaf_size, gen_min_span_tree, **kwargs) + elif algorithm == "prims_kdtree": + if metric not in KDTree.valid_metrics: + raise ValueError("Cannot use Prim's with KDTree for this metric!") + (single_linkage_tree, result_min_span_tree) = memory.cache( + _hdbscan_prims_kdtree + )(X, min_samples, alpha, metric, p, leaf_size, gen_min_span_tree, **kwargs) + elif algorithm == "prims_balltree": + if metric not in BallTree.valid_metrics: + raise ValueError("Cannot use Prim's with BallTree for this metric!") + (single_linkage_tree, result_min_span_tree) = memory.cache( + _hdbscan_prims_balltree + )(X, min_samples, alpha, metric, p, leaf_size, gen_min_span_tree, **kwargs) + elif algorithm == "boruvka_kdtree": + if metric not in BallTree.valid_metrics: + raise ValueError("Cannot use Boruvka with KDTree for this metric!") + (single_linkage_tree, result_min_span_tree) = memory.cache( + _hdbscan_boruvka_kdtree + )( + X, + min_samples, + alpha, + metric, + p, + leaf_size, + approx_min_span_tree, + gen_min_span_tree, + core_dist_n_jobs, + **kwargs, + ) + elif algorithm == "boruvka_balltree": + if metric not in BallTree.valid_metrics: + raise ValueError("Cannot use Boruvka with BallTree for this metric!") + if (X.shape[0] // leaf_size) > 16000: + warn( + "A large dataset size and small leaf_size may induce excessive " + "memory usage. If you are running out of memory consider " + "increasing the ``leaf_size`` parameter." + ) + (single_linkage_tree, result_min_span_tree) = memory.cache( + _hdbscan_boruvka_balltree + )( + X, + min_samples, + alpha, + metric, + p, + leaf_size, + approx_min_span_tree, + gen_min_span_tree, + core_dist_n_jobs, + **kwargs, + ) + else: + raise TypeError("Unknown algorithm type %s specified" % algorithm) + else: + + if issparse(X) or metric not in FAST_METRICS: + # We can't do much with sparse matrices ... 
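For reference, a usage sketch of the functional entry point exercised by this dispatch. It assumes this patch is applied so that `sklearn.cluster.hdbscan` is importable; the dataset and parameter values are placeholders:

    # Illustrative only: forcing a specific back-end instead of the "best"
    # heuristic. The return signature follows the docstring earlier in this
    # function (min_spanning_tree is None unless gen_min_span_tree=True).
    import numpy as np
    from sklearn.cluster import hdbscan
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=300, centers=3, random_state=42)
    (labels, probabilities, persistence,
     condensed_tree, single_linkage_tree, min_spanning_tree) = hdbscan(
        X, min_cluster_size=10, algorithm="boruvka_kdtree"
    )
    print(np.unique(labels), persistence, min_spanning_tree)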
+ (single_linkage_tree, result_min_span_tree) = memory.cache( + _hdbscan_generic + )(X, min_samples, alpha, metric, p, leaf_size, gen_min_span_tree, **kwargs) + elif metric in KDTree.valid_metrics: + # TO DO: Need heuristic to decide when to go to boruvka; + # still debugging for now + if X.shape[1] > 60: + (single_linkage_tree, result_min_span_tree) = memory.cache( + _hdbscan_prims_kdtree + )( + X, + min_samples, + alpha, + metric, + p, + leaf_size, + gen_min_span_tree, + **kwargs, + ) + else: + (single_linkage_tree, result_min_span_tree) = memory.cache( + _hdbscan_boruvka_kdtree + )( + X, + min_samples, + alpha, + metric, + p, + leaf_size, + approx_min_span_tree, + gen_min_span_tree, + core_dist_n_jobs, + **kwargs, + ) + else: # Metric is a valid BallTree metric + # TO DO: Need heuristic to decide when to go to boruvka; + # still debugging for now + if X.shape[1] > 60: + (single_linkage_tree, result_min_span_tree) = memory.cache( + _hdbscan_prims_balltree + )( + X, + min_samples, + alpha, + metric, + p, + leaf_size, + gen_min_span_tree, + **kwargs, + ) + else: + (single_linkage_tree, result_min_span_tree) = memory.cache( + _hdbscan_boruvka_balltree + )( + X, + min_samples, + alpha, + metric, + p, + leaf_size, + approx_min_span_tree, + gen_min_span_tree, + core_dist_n_jobs, + **kwargs, + ) + + return _tree_to_labels( + X, + single_linkage_tree, + min_cluster_size, + cluster_selection_method, + allow_single_cluster, + match_reference_implementation, + cluster_selection_epsilon, + max_cluster_size, + ) + (result_min_span_tree,) + + +# Inherits from sklearn +class HDBSCAN(BaseEstimator, ClusterMixin): + """Perform HDBSCAN clustering from vector array or distance matrix. + + HDBSCAN - Hierarchical Density-Based Spatial Clustering of Applications + with Noise. Performs DBSCAN over varying epsilon values and integrates + the result to find a clustering that gives the best stability over epsilon. + This allows HDBSCAN to find clusters of varying densities (unlike DBSCAN), + and be more robust to parameter selection. + + Parameters + ---------- + min_cluster_size : int, optional (default=5) + The minimum size of clusters; single linkage splits that contain + fewer points than this will be considered points "falling out" of a + cluster rather than a cluster splitting into two new clusters. + + min_samples : int, optional (default=None) + The number of samples in a neighbourhood for a point to be + considered a core point. + + metric : string, or callable, optional (default='euclidean') + The metric to use when calculating distance between instances in a + feature array. If metric is a string or callable, it must be one of + the options allowed by metrics.pairwise.pairwise_distances for its + metric parameter. + If metric is "precomputed", X is assumed to be a distance matrix and + must be square. + + p : int, optional (default=None) + p value to use if using the minkowski metric. + + alpha : float, optional (default=1.0) + A distance scaling parameter as used in robust single linkage. + See [3]_ for more information. + + cluster_selection_epsilon: float, optional (default=0.0) + A distance threshold. Clusters below this value will be merged. + See [5]_ for more information. + + algorithm : string, optional (default='best') + Exactly which algorithm to use; hdbscan has variants specialised + for different characteristics of the data. By default this is set + to ``best`` which chooses the "best" algorithm given the nature of + the data. You can force other options if you believe you know + better. 
Options are:
+            * ``best``
+            * ``generic``
+            * ``prims_kdtree``
+            * ``prims_balltree``
+            * ``boruvka_kdtree``
+            * ``boruvka_balltree``
+
+    leaf_size : int, optional (default=40)
+        If using a space tree algorithm (kdtree or balltree), the number
+        of points in a leaf node of the tree. This does not alter the
+        resulting clustering, but may have an effect on the runtime
+        of the algorithm.
+
+    memory : Instance of joblib.Memory or string (optional)
+        Used to cache the output of the computation of the tree.
+        By default, no caching is done. If a string is given, it is the
+        path to the caching directory.
+
+    approx_min_span_tree : bool, optional (default=True)
+        Whether to accept an only approximate minimum spanning tree.
+        For some algorithms this can provide a significant speedup, but
+        the resulting clustering may be of marginally lower quality.
+        If you are willing to sacrifice speed for correctness you may want
+        to explore this; in general this should be left at the default True.
+
+    gen_min_span_tree : bool, optional (default=False)
+        Whether to generate the minimum spanning tree with regard
+        to mutual reachability distance for later analysis.
+
+    core_dist_n_jobs : int, optional (default=4)
+        Number of parallel jobs to run in core distance computations (if
+        supported by the specific algorithm). For ``core_dist_n_jobs``
+        below -1, (n_cpus + 1 + core_dist_n_jobs) are used.
+
+    cluster_selection_method : string, optional (default='eom')
+        The method used to select clusters from the condensed tree. The
+        standard approach for HDBSCAN* is to use an Excess of Mass algorithm
+        to find the most persistent clusters. Alternatively you can instead
+        select the clusters at the leaves of the tree -- this provides the
+        most fine grained and homogeneous clusters. Options are:
+            * ``eom``
+            * ``leaf``
+
+    allow_single_cluster : bool, optional (default=False)
+        By default HDBSCAN* will not produce a single cluster, setting this
+        to True will override this and allow single cluster results in
+        the case that you feel this is a valid result for your dataset.
+
+    prediction_data : boolean, optional
+        Whether to generate extra cached data for predicting labels or
+        membership vectors for new unseen points later. If you wish to
+        persist the clustering object for later re-use you probably want
+        to set this to True.
+        (default False)
+
+    match_reference_implementation : bool, optional (default=False)
+        There exist some interpretational differences between this
+        HDBSCAN* implementation and the original authors' reference
+        implementation in Java. This can result in very minor differences
+        in clustering results. Setting this flag to True will, at some
+        performance cost, ensure that the clustering results match the
+        reference implementation.
+
+    **kwargs : optional
+        Arguments passed to the distance metric
+
+    Attributes
+    ----------
+    labels_ : ndarray, shape (n_samples, )
+        Cluster labels for each point in the dataset given to fit().
+        Noisy samples are given the label -1.
+
+    probabilities_ : ndarray, shape (n_samples, )
+        The strength with which each sample is a member of its assigned
+        cluster. Noise points have probability zero; points in clusters
+        have values assigned proportional to the degree that they
+        persist as part of the cluster.
+
+    cluster_persistence_ : ndarray, shape (n_clusters, )
+        A score of how persistent each cluster is. A score of 1.0 represents
+        a perfectly stable cluster that persists over all distance scales,
+        while a score of 0.0 represents a perfectly ephemeral cluster.
These
+        scores can be used to gauge the relative coherence of the clusters output
+        by the algorithm.
+
+    condensed_tree_ : CondensedTree object
+        The condensed tree produced by HDBSCAN. The object has methods
+        for converting to pandas, networkx, and plotting.
+
+    single_linkage_tree_ : SingleLinkageTree object
+        The single linkage tree produced by HDBSCAN. The object has methods
+        for converting to pandas, networkx, and plotting.
+
+    minimum_spanning_tree_ : MinimumSpanningTree object
+        The minimum spanning tree of the mutual reachability graph generated
+        by HDBSCAN. Note that this is not generated by default and will only
+        be available if `gen_min_span_tree` was set to True on object creation.
+        Even then in some optimized cases a tree may not be generated.
+
+    outlier_scores_ : ndarray, shape (n_samples, )
+        Outlier scores for clustered points; the larger the score the more
+        outlier-like the point. Useful as an outlier detection technique.
+        Based on the GLOSH algorithm by Campello, Moulavi, Zimek and Sander.
+
+    prediction_data_ : PredictionData object
+        Cached data used for predicting the cluster labels of new or
+        unseen points. Necessary only if you are using functions from
+        ``hdbscan.prediction`` (see
+        :func:`~hdbscan.prediction.approximate_predict`,
+        :func:`~hdbscan.prediction.membership_vector`,
+        and :func:`~hdbscan.prediction.all_points_membership_vectors`).
+
+    exemplars_ : list
+        A list of exemplar points for clusters. Since HDBSCAN supports
+        arbitrary shapes for clusters we cannot provide a single cluster
+        exemplar per cluster. Instead a list is returned with each element
+        of the list being a numpy array of exemplar points for a cluster --
+        these points are the "most representative" points of the cluster.
+
+    relative_validity_ : float
+        A fast approximation of the Density Based Cluster Validity (DBCV)
+        score [4]. The only difference, and the speed, comes from the fact
+        that this relative_validity_ is computed using the mutual-
+        reachability minimum spanning tree, i.e. minimum_spanning_tree_,
+        instead of the all-points minimum spanning tree used in the
+        reference. This score might not be an objective measure of the
+        goodness of clustering. It may only be used to compare results
+        across different choices of hyper-parameters, and is therefore only
+        a relative score.
+
+    References
+    ----------
+
+    .. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April).
+        Density-based clustering based on hierarchical density estimates.
+        In Pacific-Asia Conference on Knowledge Discovery and Data Mining
+        (pp. 160-172). Springer Berlin Heidelberg.
+
+    .. [2] Campello, R. J., Moulavi, D., Zimek, A., & Sander, J. (2015).
+        Hierarchical density estimates for data clustering, visualization,
+        and outlier detection. ACM Transactions on Knowledge Discovery
+        from Data (TKDD), 10(1), 5.
+
+    .. [3] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the
+        cluster tree. In Advances in Neural Information Processing Systems
+        (pp. 343-351).
+
+    .. [4] Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and
+        Sander, J., 2014. Density-Based Clustering Validation. In SDM
+        (pp. 839-847).
+
+    .. [5] Malzer, C., & Baum, M. (2019). A Hybrid Approach To Hierarchical
+        Density-based Cluster Selection. arXiv preprint 1911.02282.
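A short usage sketch may help tie the attributes above together. It assumes this patch is applied so that `sklearn.cluster.HDBSCAN` is importable; the dataset and parameters are placeholders:

    # Illustrative only: fit the estimator and read back the documented
    # attributes. gen_min_span_tree=True is needed for relative_validity_.
    from sklearn.cluster import HDBSCAN
    from sklearn.datasets import make_moons

    X, _ = make_moons(n_samples=400, noise=0.08, random_state=0)
    clusterer = HDBSCAN(min_cluster_size=20, gen_min_span_tree=True).fit(X)

    print(clusterer.labels_[:10])          # -1 marks noise points
    print(clusterer.probabilities_[:10])   # per-point membership strength
    print(clusterer.cluster_persistence_)  # one score per selected cluster
    print(clusterer.outlier_scores_[:10])  # GLOSH outlier scores
    print(clusterer.relative_validity_)    # DBCV-style relative score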
+ + """ + + def __init__( + self, + min_cluster_size=5, + min_samples=None, + cluster_selection_epsilon=0.0, + max_cluster_size=0, + metric="euclidean", + alpha=1.0, + p=None, + algorithm="best", + leaf_size=40, + memory=Memory(cachedir=None, verbose=0), + approx_min_span_tree=True, + gen_min_span_tree=False, + core_dist_n_jobs=4, + cluster_selection_method="eom", + allow_single_cluster=False, + prediction_data=False, + match_reference_implementation=False, + **kwargs, + ): + self.min_cluster_size = min_cluster_size + self.min_samples = min_samples + self.alpha = alpha + self.max_cluster_size = max_cluster_size + self.cluster_selection_epsilon = cluster_selection_epsilon + self.metric = metric + self.p = p + self.algorithm = algorithm + self.leaf_size = leaf_size + self.memory = memory + self.approx_min_span_tree = approx_min_span_tree + self.gen_min_span_tree = gen_min_span_tree + self.core_dist_n_jobs = core_dist_n_jobs + self.cluster_selection_method = cluster_selection_method + self.allow_single_cluster = allow_single_cluster + self.match_reference_implementation = match_reference_implementation + self.prediction_data = prediction_data + + self._metric_kwargs = kwargs + + self._condensed_tree = None + self._single_linkage_tree = None + self._min_spanning_tree = None + self._raw_data = None + self._outlier_scores = None + self._prediction_data = None + self._relative_validity = None + + def fit(self, X, y=None): + """Perform HDBSCAN clustering from features or distance matrix. + + Parameters + ---------- + X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ + array of shape (n_samples, n_samples) + A feature array, or array of distances between samples if + ``metric='precomputed'``. + + Returns + ------- + self : object + Returns self + """ + if self.metric != "precomputed": + # Non-precomputed matrices may contain non-finite values. + # Rows with these values + X = check_array(X, accept_sparse="csr", force_all_finite=False) + self._raw_data = X + + self._all_finite = is_finite(X) + if ~self._all_finite: + # Pass only the purely finite indices into hdbscan + # We will later assign all non-finite points to the + # background-1 cluster + finite_index = get_finite_row_indices(X) + clean_data = X[finite_index] + internal_to_raw = { + x: y for x, y in zip(range(len(finite_index)), finite_index) + } + outliers = list(set(range(X.shape[0])) - set(finite_index)) + else: + clean_data = X + elif issparse(X): + # Handle sparse precomputed distance matrices separately + X = check_array(X, accept_sparse="csr") + clean_data = X + else: + # Only non-sparse, precomputed distance matrices are allowed + # to have numpy.inf values indicating missing distances + check_precomputed_distance_matrix(X) + clean_data = X + + kwargs = self.get_params() + # prediction data only applies to the persistent model, so remove + # it from the keyword args we pass on the the function + kwargs.pop("prediction_data", None) + kwargs.update(self._metric_kwargs) + + ( + self.labels_, + self.probabilities_, + self.cluster_persistence_, + self._condensed_tree, + self._single_linkage_tree, + self._min_spanning_tree, + ) = hdbscan(clean_data, **kwargs) + + if self.metric != "precomputed" and not self._all_finite: + # remap indices to align with original data in the case of + # non-finite entries. 
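For reference, a small sketch of the behaviour implemented by this branch: rows containing non-finite values are excluded before clustering and re-inserted as noise by the remapping that follows. It assumes this patch is applied; the data is a placeholder:

    # Illustrative only: non-finite rows come back with label -1 and
    # probability 0 after fit(), per the remapping performed below.
    import numpy as np
    from sklearn.cluster import HDBSCAN
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=100, centers=2, random_state=1)
    X[::25] = np.nan                       # corrupt every 25th row

    clusterer = HDBSCAN(min_cluster_size=10).fit(X)
    print(clusterer.labels_[::25])         # all -1 (noise)
    print(clusterer.probabilities_[::25])  # all 0.0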
+ self._condensed_tree = remap_condensed_tree( + self._condensed_tree, internal_to_raw, outliers + ) + self._single_linkage_tree = remap_single_linkage_tree( + self._single_linkage_tree, internal_to_raw, outliers + ) + new_labels = np.full(X.shape[0], -1) + new_labels[finite_index] = self.labels_ + self.labels_ = new_labels + + new_probabilities = np.zeros(X.shape[0]) + new_probabilities[finite_index] = self.probabilities_ + self.probabilities_ = new_probabilities + + if self.prediction_data: + self.generate_prediction_data() + + return self + + def fit_predict(self, X, y=None): + """Performs clustering on X and returns cluster labels. + + Parameters + ---------- + X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ + array of shape (n_samples, n_samples) + A feature array, or array of distances between samples if + ``metric='precomputed'``. + + Returns + ------- + y : ndarray, shape (n_samples, ) + cluster labels + """ + self.fit(X) + return self.labels_ + + def generate_prediction_data(self): + """ + Create data that caches intermediate results used for predicting + the label of new/unseen points. This data is only useful if + you are intending to use functions from ``hdbscan.prediction``. + """ + + if self.metric in FAST_METRICS: + min_samples = self.min_samples or self.min_cluster_size + if self.metric in KDTree.valid_metrics: + tree_type = "kdtree" + elif self.metric in BallTree.valid_metrics: + tree_type = "balltree" + else: + warn("Metric {} not supported for prediction data!".format(self.metric)) + return + + self._prediction_data = PredictionData( + self._raw_data, + self.condensed_tree_, + min_samples, + tree_type=tree_type, + metric=self.metric, + **self._metric_kwargs, + ) + else: + warn( + "Cannot generate prediction data for non-vector" + "space inputs -- access to the source data rather" + "than mere distances is required!" + ) + + def weighted_cluster_centroid(self, cluster_id): + """Provide an approximate representative point for a given cluster. + Note that this technique assumes a euclidean metric for speed of + computation. For more general metrics use the ``weighted_cluster_medoid`` + method which is slower, but can work with the metric the model trained + with. + + Parameters + ---------- + cluster_id: int + The id of the cluster to compute a centroid for. + + Returns + ------- + centroid: array of shape (n_features,) + A representative centroid for cluster ``cluster_id``. + """ + if not hasattr(self, "labels_"): + raise AttributeError("Model has not been fit to data") + + if cluster_id == -1: + raise ValueError( + "Cannot calculate weighted centroid for -1 cluster " + "since it is a noise cluster" + ) + + mask = self.labels_ == cluster_id + cluster_data = self._raw_data[mask] + cluster_membership_strengths = self.probabilities_[mask] + + return np.average(cluster_data, weights=cluster_membership_strengths, axis=0) + + def weighted_cluster_medoid(self, cluster_id): + """Provide an approximate representative point for a given cluster. + Note that this technique can be very slow and memory intensive for + large clusters. For faster results use the ``weighted_cluster_centroid`` + method which is faster, but assumes a euclidean metric. + + Parameters + ---------- + cluster_id: int + The id of the cluster to compute a medoid for. + + Returns + ------- + centroid: array of shape (n_features,) + A representative medoid for cluster ``cluster_id``. 
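A brief sketch of the two representative-point helpers documented above, assuming this patch is applied (data and parameters are placeholders):

    # Illustrative only: weighted centroid (euclidean) vs. weighted medoid
    # (uses the model's own metric) for each non-noise cluster.
    from sklearn.cluster import HDBSCAN
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=300, centers=3, random_state=7)
    clusterer = HDBSCAN(min_cluster_size=15).fit(X)

    for cluster_id in sorted(set(clusterer.labels_) - {-1}):
        print(cluster_id,
              clusterer.weighted_cluster_centroid(cluster_id),
              clusterer.weighted_cluster_medoid(cluster_id))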
+ """ + if not hasattr(self, "labels_"): + raise AttributeError("Model has not been fit to data") + + if cluster_id == -1: + raise ValueError( + "Cannot calculate weighted centroid for -1 cluster " + "since it is a noise cluster" + ) + + mask = self.labels_ == cluster_id + cluster_data = self._raw_data[mask] + cluster_membership_strengths = self.probabilities_[mask] + + dist_mat = pairwise_distances( + cluster_data, metric=self.metric, **self._metric_kwargs + ) + + dist_mat = dist_mat * cluster_membership_strengths + medoid_index = np.argmin(dist_mat.sum(axis=1)) + return cluster_data[medoid_index] + + def dbscan_clustering(self, cut_distance, min_cluster_size=5): + """ + Return clustering that would be equivalent to running DBSCAN* for a + particular cut_distance (or epsilon) DBSCAN* can be thought of as + DBSCAN without the border points. As such these results may differ + slightly from sklearns implementation of dbscan in the non-core points. + + This can also be thought of as a flat clustering derived from constant + height cut through the single linkage tree. + + This represents the result of selecting a cut value for robust single linkage + clustering. The `min_cluster_size` allows the flat clustering to declare noise + points (and cluster smaller than `min_cluster_size`). + + Parameters + ---------- + + cut_distance : float + The mutual reachability distance cut value to use to generate a + flat clustering. + + min_cluster_size : int, optional + Clusters smaller than this value with be called 'noise' and remain + unclustered in the resulting flat clustering. + + Returns + ------- + + labels : array [n_samples] + An array of cluster labels, one per datapoint. Unclustered points + are assigned the label -1. + """ + return self.single_linkage_tree_.get_clusters( + cut_distance=cut_distance, + min_cluster_size=min_cluster_size, + ) + + @property + def prediction_data_(self): + if self._prediction_data is None: + raise AttributeError("No prediction data was generated") + else: + return self._prediction_data + + @property + def outlier_scores_(self): + if self._outlier_scores is not None: + return self._outlier_scores + else: + if self._condensed_tree is not None: + self._outlier_scores = outlier_scores(self._condensed_tree) + return self._outlier_scores + else: + raise AttributeError( + "No condensed tree was generated; try running fit first." + ) + + @property + def condensed_tree_(self): + if self._condensed_tree is not None: + return CondensedTree( + self._condensed_tree, + self.cluster_selection_method, + self.allow_single_cluster, + ) + else: + raise AttributeError( + "No condensed tree was generated; try running fit first." + ) + + @property + def single_linkage_tree_(self): + if self._single_linkage_tree is not None: + return SingleLinkageTree(self._single_linkage_tree) + else: + raise AttributeError( + "No single linkage tree was generated; try running fit first." + ) + + @property + def minimum_spanning_tree_(self): + if self._min_spanning_tree is not None: + if self._raw_data is not None: + return MinimumSpanningTree(self._min_spanning_tree, self._raw_data) + else: + warn( + "No raw data is available; this may be due to using" + " a precomputed metric matrix. No minimum spanning" + " tree will be provided without raw data." + ) + return None + else: + raise AttributeError( + "No minimum spanning tree was generated." + "This may be due to optimized algorithm variations that skip" + " explicit generation of the spanning tree." 
+ ) + + @property + def exemplars_(self): + if self._prediction_data is not None: + return self._prediction_data.exemplars + elif self.metric in FAST_METRICS: + self.generate_prediction_data() + return self._prediction_data.exemplars + else: + raise AttributeError( + "Currently exemplars require the use of vector input data" + "with a suitable metric. This will likely change in the " + "future, but for now no exemplars can be provided" + ) + + @property + def relative_validity_(self): + if self._relative_validity is not None: + return self._relative_validity + + if not self.gen_min_span_tree: + raise AttributeError( + "Minimum spanning tree not present. " + + "Either HDBSCAN object was created with " + + "gen_min_span_tree=False or the tree was " + + "not generated in spite of it owing to " + + "internal optimization criteria." + ) + return + + labels = self.labels_ + sizes = np.bincount(labels + 1) + noise_size = sizes[0] + cluster_size = sizes[1:] + total = noise_size + np.sum(cluster_size) + num_clusters = len(cluster_size) + DSC = np.zeros(num_clusters) + min_outlier_sep = np.inf # only required if num_clusters = 1 + correction_const = 2 # only required if num_clusters = 1 + + # Unltimately, for each Ci, we only require the + # minimum of DSPC(Ci, Cj) over all Cj != Ci. + # So let's call this value DSPC_wrt(Ci), i.e. + # density separation 'with respect to' Ci. + DSPC_wrt = np.ones(num_clusters) * np.inf + max_distance = 0 + + mst_df = self.minimum_spanning_tree_.to_pandas() + + for edge in mst_df.iterrows(): + label1 = labels[int(edge[1]["from"])] + label2 = labels[int(edge[1]["to"])] + length = edge[1]["distance"] + + max_distance = max(max_distance, length) + + if label1 == -1 and label2 == -1: + continue + elif label1 == -1 or label2 == -1: + # If exactly one of the points is noise + min_outlier_sep = min(min_outlier_sep, length) + continue + + if label1 == label2: + # Set the density sparseness of the cluster + # to the sparsest value seen so far. + DSC[label1] = max(length, DSC[label1]) + else: + # Check whether density separations with + # respect to each of these clusters can + # be reduced. + DSPC_wrt[label1] = min(length, DSPC_wrt[label1]) + DSPC_wrt[label2] = min(length, DSPC_wrt[label2]) + + # In case min_outlier_sep is still np.inf, we assign a new value to it. + # This only makes sense if num_clusters = 1 since it has turned out + # that the MR-MST has no edges between a noise point and a core point. + min_outlier_sep = max_distance if min_outlier_sep == np.inf else min_outlier_sep + + # DSPC_wrt[Ci] might be infinite if the connected component for Ci is + # an "island" in the MR-MST. Whereas for other clusters Cj and Ck, the + # MR-MST might contain an edge with one point in Cj and ther other one + # in Ck. Here, we replace the infinite density separation of Ci by + # another large enough value. + # + # TODO: Think of a better yet efficient way to handle this. 
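For reference, the per-cluster validity term assembled below, evaluated on made-up numbers (a sketch of the formula only; the values are not from any real run):

    # Illustrative only: DSC is the largest intra-cluster MST edge, DSPC_wrt
    # the smallest MST edge leaving the cluster; V_index is their normalised
    # difference, and the final score is the size-weighted average.
    import numpy as np

    DSC = np.array([0.4, 0.6])       # hypothetical density sparseness
    DSPC_wrt = np.array([1.2, 0.9])  # hypothetical density separation
    cluster_size = np.array([120, 80])
    noise_size = 10
    total = noise_size + cluster_size.sum()

    V_index = (DSPC_wrt - DSC) / np.maximum(DSPC_wrt, DSC)
    score = np.sum(cluster_size * V_index) / total
    print(V_index, score)  # V_index entries lie in [-1, 1]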
+ correction = correction_const * ( + max_distance if num_clusters > 1 else min_outlier_sep + ) + DSPC_wrt[np.where(DSPC_wrt == np.inf)] = correction + + V_index = [ + (DSPC_wrt[i] - DSC[i]) / max(DSPC_wrt[i], DSC[i]) + for i in range(num_clusters) + ] + score = np.sum( + [(cluster_size[i] * V_index[i]) / total for i in range(num_clusters)] + ) + self._relative_validity = score + return self._relative_validity diff --git a/sklearn/cluster/_hdbscan/plots.py b/sklearn/cluster/_hdbscan/plots.py new file mode 100644 index 0000000000000..e00a415af8aa5 --- /dev/null +++ b/sklearn/cluster/_hdbscan/plots.py @@ -0,0 +1,1033 @@ +# -*- coding: utf-8 -*- +# Author: Leland McInnes +# +# License: BSD 3 clause + +import numpy as np + +from scipy.cluster.hierarchy import dendrogram +from sklearn.manifold import TSNE +from sklearn.decomposition import PCA +from warnings import warn +from ._hdbscan_tree import compute_stability, labelling_at_cut, recurse_leaf_dfs + +CB_LEFT = 0 +CB_RIGHT = 1 +CB_BOTTOM = 2 +CB_TOP = 3 + + +def _bfs_from_cluster_tree(tree, bfs_root): + """ + Perform a breadth first search on a tree in condensed tree format + """ + + result = [] + to_process = [bfs_root] + + while to_process: + result.extend(to_process) + to_process = tree["child"][np.in1d(tree["parent"], to_process)].tolist() + + return result + + +def _recurse_leaf_dfs(cluster_tree, current_node): + children = cluster_tree[cluster_tree["parent"] == current_node]["child"] + if len(children) == 0: + return [ + current_node, + ] + else: + return sum([recurse_leaf_dfs(cluster_tree, child) for child in children], []) + + +def _get_leaves(condensed_tree): + cluster_tree = condensed_tree[condensed_tree["child_size"] > 1] + if cluster_tree.shape[0] == 0: + # Return the only cluster, the root + return [condensed_tree["parent"].min()] + + root = cluster_tree["parent"].min() + return _recurse_leaf_dfs(cluster_tree, root) + + +class CondensedTree(object): + """The condensed tree structure, which provides a simplified or smoothed version + of the :class:`~hdbscan.plots.SingleLinkageTree`. + + Parameters + ---------- + condensed_tree_array : numpy recarray from :class:`~hdbscan.HDBSCAN` + The raw numpy rec array version of the condensed tree as produced + internally by hdbscan. + + cluster_selection_method : string, optional (default 'eom') + The method of selecting clusters. One of 'eom' or 'leaf' + + allow_single_cluster : Boolean, optional (default False) + Whether to allow the root cluster as the only selected cluster + + """ + + def __init__( + self, + condensed_tree_array, + cluster_selection_method="eom", + allow_single_cluster=False, + ): + self._raw_tree = condensed_tree_array + self.cluster_selection_method = cluster_selection_method + self.allow_single_cluster = allow_single_cluster + + def get_plot_data( + self, leaf_separation=1, log_size=False, max_rectangle_per_icicle=20 + ): + """Generates data for use in plotting the 'icicle plot' or dendrogram + plot of the condensed tree generated by HDBSCAN. + + Parameters + ---------- + leaf_separation : float, optional + How far apart to space the final leaves of the + dendrogram. (default 1) + + log_size : boolean, optional + Use log scale for the 'size' of clusters (i.e. number of + points in the cluster at a given lambda value). + (default False) + + max_rectangles_per_icicle : int, optional + To simplify the plot this method will only emit + ``max_rectangles_per_icicle`` bars per branch of the dendrogram. 
+ This ensures that we don't suffer from massive overplotting in + cases with a lot of data points. + + Returns + ------- + plot_data : dict + Data associated to bars in a bar plot: + `bar_centers` x coordinate centers for bars + `bar_tops` heights of bars in lambda scale + `bar_bottoms` y coordinate of bottoms of bars + `bar_widths` widths of the bars (in x coord scale) + `bar_bounds` a 4-tuple of [left, right, bottom, top] + giving the bounds on a full set of + cluster bars + Data associates with cluster splits: + `line_xs` x coordinates for horizontal dendrogram lines + `line_ys` y coordinates for horizontal dendrogram lines + """ + leaves = _get_leaves(self._raw_tree) + last_leaf = self._raw_tree["parent"].max() + root = self._raw_tree["parent"].min() + + # We want to get the x and y coordinates for the start of each cluster + # Initialize the leaves, since we know where they go, the iterate + # through everything from the leaves back, setting coords as we go + if isinstance(leaves, np.int64): + cluster_x_coords = {leaves: leaf_separation} + else: + cluster_x_coords = dict( + zip(leaves, [leaf_separation * x for x in range(len(leaves))]) + ) + cluster_y_coords = {root: 0.0} + + for cluster in range(last_leaf, root - 1, -1): + split = self._raw_tree[["child", "lambda_val"]] + split = split[ + (self._raw_tree["parent"] == cluster) + & (self._raw_tree["child_size"] > 1) + ] + if len(split["child"]) > 1: + left_child, right_child = split["child"] + cluster_x_coords[cluster] = np.mean( + [cluster_x_coords[left_child], cluster_x_coords[right_child]] + ) + cluster_y_coords[left_child] = split["lambda_val"][0] + cluster_y_coords[right_child] = split["lambda_val"][1] + + # We use bars to plot the 'icicles', so we need to generate centers, tops, + # bottoms and widths for each rectangle. We can go through each cluster + # and do this for each in turn. 
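A usage sketch for the method above, retrieving the icicle-plot geometry without plotting. It assumes this patch is applied; the data is a placeholder:

    # Illustrative only: the returned dict can drive a custom renderer.
    from sklearn.cluster import HDBSCAN
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=300, centers=3, random_state=3)
    clusterer = HDBSCAN(min_cluster_size=15).fit(X)

    plot_data = clusterer.condensed_tree_.get_plot_data(log_size=True)
    print(len(plot_data["bar_centers"]), len(plot_data["line_xs"]))
    print(plot_data["cluster_bounds"].keys())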
+ bar_centers = [] + bar_tops = [] + bar_bottoms = [] + bar_widths = [] + + cluster_bounds = {} + + scaling = np.sum(self._raw_tree[self._raw_tree["parent"] == root]["child_size"]) + + if log_size: + scaling = np.log(scaling) + + for c in range(last_leaf, root - 1, -1): + + cluster_bounds[c] = [0, 0, 0, 0] + + c_children = self._raw_tree[self._raw_tree["parent"] == c] + current_size = np.sum(c_children["child_size"]) + current_lambda = cluster_y_coords[c] + cluster_max_size = current_size + cluster_max_lambda = c_children["lambda_val"].max() + cluster_min_size = np.sum( + c_children[c_children["lambda_val"] == cluster_max_lambda]["child_size"] + ) + + if log_size: + current_size = np.log(current_size) + cluster_max_size = np.log(cluster_max_size) + cluster_min_size = np.log(cluster_min_size) + + total_size_change = float(cluster_max_size - cluster_min_size) + step_size_change = total_size_change / max_rectangle_per_icicle + + cluster_bounds[c][CB_LEFT] = cluster_x_coords[c] * scaling - ( + current_size / 2.0 + ) + cluster_bounds[c][CB_RIGHT] = cluster_x_coords[c] * scaling + ( + current_size / 2.0 + ) + cluster_bounds[c][CB_BOTTOM] = cluster_y_coords[c] + cluster_bounds[c][CB_TOP] = np.max(c_children["lambda_val"]) + + last_step_size = current_size + last_step_lambda = current_lambda + + for i in np.argsort(c_children["lambda_val"]): + row = c_children[i] + if row["lambda_val"] != current_lambda and ( + last_step_size - current_size > step_size_change + or row["lambda_val"] == cluster_max_lambda + ): + bar_centers.append(cluster_x_coords[c] * scaling) + bar_tops.append(row["lambda_val"] - last_step_lambda) + bar_bottoms.append(last_step_lambda) + bar_widths.append(last_step_size) + last_step_size = current_size + last_step_lambda = current_lambda + if log_size: + exp_size = np.exp(current_size) - row["child_size"] + # Ensure we don't try to take log of zero + if exp_size > 0.01: + current_size = np.log(np.exp(current_size) - row["child_size"]) + else: + current_size = 0.0 + else: + current_size -= row["child_size"] + current_lambda = row["lambda_val"] + + # Finally we need the horizontal lines that occur at cluster splits. 
+ line_xs = [] + line_ys = [] + + for row in self._raw_tree[self._raw_tree["child_size"] > 1]: + parent = row["parent"] + child = row["child"] + child_size = row["child_size"] + if log_size: + child_size = np.log(child_size) + sign = np.sign(cluster_x_coords[child] - cluster_x_coords[parent]) + line_xs.append( + [ + cluster_x_coords[parent] * scaling, + cluster_x_coords[child] * scaling + sign * (child_size / 2.0), + ] + ) + line_ys.append([cluster_y_coords[child], cluster_y_coords[child]]) + + return { + "bar_centers": bar_centers, + "bar_tops": bar_tops, + "bar_bottoms": bar_bottoms, + "bar_widths": bar_widths, + "line_xs": line_xs, + "line_ys": line_ys, + "cluster_bounds": cluster_bounds, + } + + def _select_clusters(self): + if self.cluster_selection_method == "eom": + stability = compute_stability(self._raw_tree) + if self.allow_single_cluster: + node_list = sorted(stability.keys(), reverse=True) + else: + node_list = sorted(stability.keys(), reverse=True)[:-1] + cluster_tree = self._raw_tree[self._raw_tree["child_size"] > 1] + is_cluster = {cluster: True for cluster in node_list} + + for node in node_list: + child_selection = cluster_tree["parent"] == node + subtree_stability = np.sum( + [ + stability[child] + for child in cluster_tree["child"][child_selection] + ] + ) + + if subtree_stability > stability[node]: + is_cluster[node] = False + stability[node] = subtree_stability + else: + for sub_node in _bfs_from_cluster_tree(cluster_tree, node): + if sub_node != node: + is_cluster[sub_node] = False + + return sorted([cluster for cluster in is_cluster if is_cluster[cluster]]) + + elif self.cluster_selection_method == "leaf": + return _get_leaves(self._raw_tree) + else: + raise ValueError( + "Invalid Cluster Selection Method: %s\n" + 'Should be one of: "eom", "leaf"\n' + ) + + def plot( + self, + leaf_separation=1, + cmap="viridis", + select_clusters=False, + label_clusters=False, + selection_palette=None, + axis=None, + colorbar=True, + log_size=False, + max_rectangles_per_icicle=20, + ): + """Use matplotlib to plot an 'icicle plot' dendrogram of the condensed tree. + + Effectively this is a dendrogram where the width of each cluster bar is + equal to the number of points (or log of the number of points) in the cluster + at the given lambda value. Thus bars narrow as points progressively drop + out of clusters. The make the effect more apparent the bars are also colored + according the the number of points (or log of the number of points). + + Parameters + ---------- + leaf_separation : float, optional (default 1) + How far apart to space the final leaves of the + dendrogram. + + cmap : string or matplotlib colormap, optional (default viridis) + The matplotlib colormap to use to color the cluster bars. + + + select_clusters : boolean, optional (default False) + Whether to draw ovals highlighting which cluster + bar represent the clusters that were selected by + HDBSCAN as the final clusters. + + label_clusters : boolean, optional (default False) + If select_clusters is True then this determines + whether to draw text labels on the clusters. + + selection_palette : list of colors, optional (default None) + If not None, and at least as long as + the number of clusters, draw ovals + in colors iterating through this palette. + This can aid in cluster identification + when plotting. + + axis : matplotlib axis or None, optional (default None) + The matplotlib axis to render to. If None then a new axis + will be generated. The rendered axis will be returned. 
+ + + colorbar : boolean, optional (default True) + Whether to draw a matplotlib colorbar displaying the range + of cluster sizes as per the colormap. + + log_size : boolean, optional (default False) + Use log scale for the 'size' of clusters (i.e. number of + points in the cluster at a given lambda value). + + + max_rectangles_per_icicle : int, optional (default 20) + To simplify the plot this method will only emit + ``max_rectangles_per_icicle`` bars per branch of the dendrogram. + This ensures that we don't suffer from massive overplotting in + cases with a lot of data points. + + Returns + ------- + axis : matplotlib axis + The axis on which the 'icicle plot' has been rendered. + """ + try: + import matplotlib.pyplot as plt + except ImportError: + raise ImportError( + "You must install the matplotlib library to plot the condensed tree." + "Use get_plot_data to calculate the relevant data without plotting." + ) + + plot_data = self.get_plot_data( + leaf_separation=leaf_separation, + log_size=log_size, + max_rectangle_per_icicle=max_rectangles_per_icicle, + ) + + if cmap != "none": + sm = plt.cm.ScalarMappable( + cmap=cmap, norm=plt.Normalize(0, max(plot_data["bar_widths"])) + ) + sm.set_array(plot_data["bar_widths"]) + bar_colors = [sm.to_rgba(x) for x in plot_data["bar_widths"]] + else: + bar_colors = "black" + + if axis is None: + axis = plt.gca() + + axis.bar( + plot_data["bar_centers"], + plot_data["bar_tops"], + bottom=plot_data["bar_bottoms"], + width=plot_data["bar_widths"], + color=bar_colors, + align="center", + linewidth=0, + ) + + drawlines = [] + for xs, ys in zip(plot_data["line_xs"], plot_data["line_ys"]): + drawlines.append(xs) + drawlines.append(ys) + axis.plot(*drawlines, color="black", linewidth=1) + # for xs, ys in zip(plot_data['line_xs'], plot_data['line_ys']): + # axis.plot(xs, ys, color='black', linewidth=1) + + if select_clusters: + try: + from matplotlib.patches import Ellipse + except ImportError: + raise ImportError( + "You must have matplotlib.patches available to plot selected" + " clusters." + ) + + chosen_clusters = self._select_clusters() + + # Extract the chosen cluster bounds. If enough duplicate data + # points exist in the data the lambda value might be infinite. + # This breaks labeling and highlighting the chosen clusters. + cluster_bounds = np.array( + [plot_data["cluster_bounds"][c] for c in chosen_clusters] + ) + if not np.isfinite(cluster_bounds).all(): + warn( + "Infinite lambda values encountered in chosen clusters." + " This might be due to duplicates in the data." + ) + + # Extract the plot range of the y-axis and set default center and + # height values for ellipses. Extremly dense clusters might result + # in near infinite lambda values. Setting max_height based on the + # percentile should alleviate the impact on plotting. 
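For reference, a usage sketch of the plot() options documented above (requires matplotlib; assumes this patch is applied; data and parameters are placeholders):

    # Illustrative only: icicle plot with selected clusters highlighted.
    import matplotlib.pyplot as plt
    from sklearn.cluster import HDBSCAN
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=400, centers=4, random_state=5)
    clusterer = HDBSCAN(min_cluster_size=20).fit(X)

    ax = clusterer.condensed_tree_.plot(
        select_clusters=True,   # draw ellipses around the chosen clusters
        label_clusters=True,    # annotate the ellipses with indices
        log_size=True,
    )
    plt.show()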
+            plot_range = np.hstack([plot_data["bar_tops"], plot_data["bar_bottoms"]])
+            plot_range = plot_range[np.isfinite(plot_range)]
+            mean_y_center = np.mean([np.max(plot_range), np.min(plot_range)])
+            max_height = np.diff(np.percentile(plot_range, q=[10, 90]))
+
+            for i, c in enumerate(chosen_clusters):
+                c_bounds = plot_data["cluster_bounds"][c]
+                width = c_bounds[CB_RIGHT] - c_bounds[CB_LEFT]
+                height = c_bounds[CB_TOP] - c_bounds[CB_BOTTOM]
+                center = (
+                    np.mean([c_bounds[CB_LEFT], c_bounds[CB_RIGHT]]),
+                    np.mean([c_bounds[CB_TOP], c_bounds[CB_BOTTOM]]),
+                )
+
+                # Set center and height to default values if necessary
+                if not np.isfinite(center[1]):
+                    center = (center[0], mean_y_center)
+                if not np.isfinite(height):
+                    height = max_height
+
+                # Ensure the ellipse is visible
+                min_height = 0.1 * max_height
+                if height < min_height:
+                    height = min_height
+
+                if selection_palette is not None and len(selection_palette) >= len(
+                    chosen_clusters
+                ):
+                    oval_color = selection_palette[i]
+                else:
+                    oval_color = "r"
+
+                box = Ellipse(
+                    center,
+                    2.0 * width,
+                    1.2 * height,
+                    facecolor="none",
+                    edgecolor=oval_color,
+                    linewidth=2,
+                )
+
+                if label_clusters:
+                    axis.annotate(
+                        str(i),
+                        xy=center,
+                        xytext=(center[0] - 4.0 * width, center[1] + 0.65 * height),
+                        horizontalalignment="left",
+                        verticalalignment="bottom",
+                    )
+
+                axis.add_artist(box)
+
+        if colorbar:
+            cb = plt.colorbar(sm, ax=axis)
+            if log_size:
+                cb.ax.set_ylabel("log(Number of points)")
+            else:
+                cb.ax.set_ylabel("Number of points")
+
+        axis.set_xticks([])
+        for side in ("right", "top", "bottom"):
+            axis.spines[side].set_visible(False)
+        axis.invert_yaxis()
+        axis.set_ylabel(r"$\lambda$ value")
+
+        return axis
+
+    def to_numpy(self):
+        """Return a numpy structured array representation of the condensed tree."""
+        return self._raw_tree.copy()
+
+    def to_pandas(self):
+        """Return a pandas dataframe representation of the condensed tree.
+
+        Each row of the dataframe corresponds to an edge in the tree.
+        The columns of the dataframe are `parent`, `child`, `lambda_val`
+        and `child_size`.
+
+        The `parent` and `child` are the ids of the
+        parent and child nodes in the tree. Node ids less than the number
+        of points in the original dataset represent individual points, while
+        ids greater than the number of points are clusters.
+
+        The `lambda_val` value is the value (1/distance) at which the `child`
+        node leaves the cluster.
+
+        The `child_size` is the number of points in the `child` node.
+        """
+        try:
+            from pandas import DataFrame, Series
+        except ImportError:
+            raise ImportError(
+                "You must have pandas installed to export pandas DataFrames"
+            )
+
+        result = DataFrame(self._raw_tree)
+
+        return result
+
+    def to_networkx(self):
+        """Return a NetworkX DiGraph object representing the condensed tree.
+
+        Edge weights in the graph are the lambda values at which child nodes
+        'leave' the parent cluster.
+
+        Nodes have a `size` attribute attached giving the number of points
+        that are in the cluster (or 1 if it is a singleton point) at the
+        point of cluster creation (fewer points may be in the cluster at
+        larger lambda values).
+ """ + try: + from networkx import DiGraph, set_node_attributes + except ImportError: + raise ImportError( + "You must have networkx installed to export networkx graphs" + ) + + result = DiGraph() + for row in self._raw_tree: + result.add_edge(row["parent"], row["child"], weight=row["lambda_val"]) + + set_node_attributes( + result, dict(self._raw_tree[["child", "child_size"]]), "size" + ) + + return result + + +def _get_dendrogram_ordering(parent, linkage, root): + + if parent < root: + return [] + + return ( + _get_dendrogram_ordering(int(linkage[parent - root][0]), linkage, root) + + _get_dendrogram_ordering(int(linkage[parent - root][1]), linkage, root) + + [parent] + ) + + +def _calculate_linewidths(ordering, linkage, root): + + linewidths = [] + + for x in ordering: + if linkage[x - root][0] >= root: + left_width = linkage[int(linkage[x - root][0]) - root][3] + else: + left_width = 1 + + if linkage[x - root][1] >= root: + right_width = linkage[int(linkage[x - root][1]) - root][3] + else: + right_width = 1 + + linewidths.append((left_width, right_width)) + + return linewidths + + +class SingleLinkageTree(object): + """A single linkage format dendrogram tree, with plotting functionality + and networkX support. + + Parameters + ---------- + linkage : ndarray (n_samples, 4) + The numpy array that holds the tree structure. As output by + scipy.cluster.hierarchy, hdbscan, of fastcluster. + + """ + + def __init__(self, linkage): + self._linkage = linkage + + def plot( + self, + axis=None, + truncate_mode=None, + p=0, + vary_line_width=True, + cmap="viridis", + colorbar=True, + ): + """Plot a dendrogram of the single linkage tree. + + Parameters + ---------- + truncate_mode : str, optional + The dendrogram can be hard to read when the original + observation matrix from which the linkage is derived + is large. Truncation is used to condense the dendrogram. + There are several modes: + + ``None/'none'`` + No truncation is performed (Default). + + ``'lastp'`` + The last p non-singleton formed in the linkage are the only + non-leaf nodes in the linkage; they correspond to rows + Z[n-p-2:end] in Z. All other non-singleton clusters are + contracted into leaf nodes. + + ``'level'/'mtica'`` + No more than p levels of the dendrogram tree are displayed. + This corresponds to Mathematica(TM) behavior. + + p : int, optional + The ``p`` parameter for ``truncate_mode``. + + vary_line_width : boolean, optional + Draw downward branches of the dendrogram with line thickness that + varies depending on the size of the cluster. + + cmap : string or matplotlib colormap, optional + The matplotlib colormap to use to color the cluster bars. + A value of 'none' will result in black bars. + (default 'viridis') + + colorbar : boolean, optional + Whether to draw a matplotlib colorbar displaying the range + of cluster sizes as per the colormap. (default True) + + Returns + ------- + axis : matplotlib axis + The axis on which the dendrogram plot has been rendered. + + """ + dendrogram_data = dendrogram( + self._linkage, p=p, truncate_mode=truncate_mode, no_plot=True + ) + X = dendrogram_data["icoord"] + Y = dendrogram_data["dcoord"] + + try: + import matplotlib.pyplot as plt + except ImportError: + raise ImportError( + "You must install the matplotlib library to plot the single linkage" + " tree." 
+ ) + + if axis is None: + axis = plt.gca() + + if vary_line_width: + dendrogram_ordering = _get_dendrogram_ordering( + 2 * len(self._linkage), self._linkage, len(self._linkage) + 1 + ) + linewidths = _calculate_linewidths( + dendrogram_ordering, self._linkage, len(self._linkage) + 1 + ) + else: + linewidths = [(1.0, 1.0)] * len(Y) + + if cmap != "none": + color_array = np.log2(np.array(linewidths).flatten()) + sm = plt.cm.ScalarMappable( + cmap=cmap, norm=plt.Normalize(0, color_array.max()) + ) + sm.set_array(color_array) + + for x, y, lw in zip(X, Y, linewidths): + left_x = x[:2] + right_x = x[2:] + left_y = y[:2] + right_y = y[2:] + horizontal_x = x[1:3] + horizontal_y = y[1:3] + + if cmap != "none": + axis.plot( + left_x, + left_y, + color=sm.to_rgba(np.log2(lw[0])), + linewidth=np.log2(1 + lw[0]), + solid_joinstyle="miter", + solid_capstyle="butt", + ) + axis.plot( + right_x, + right_y, + color=sm.to_rgba(np.log2(lw[1])), + linewidth=np.log2(1 + lw[1]), + solid_joinstyle="miter", + solid_capstyle="butt", + ) + else: + axis.plot( + left_x, + left_y, + color="k", + linewidth=np.log2(1 + lw[0]), + solid_joinstyle="miter", + solid_capstyle="butt", + ) + axis.plot( + right_x, + right_y, + color="k", + linewidth=np.log2(1 + lw[1]), + solid_joinstyle="miter", + solid_capstyle="butt", + ) + + axis.plot( + horizontal_x, + horizontal_y, + color="k", + linewidth=1.0, + solid_joinstyle="miter", + solid_capstyle="butt", + ) + + if colorbar: + cb = plt.colorbar(sm, ax=axis) + cb.ax.set_ylabel("log(Number of points)") + + axis.set_xticks([]) + for side in ("right", "top", "bottom"): + axis.spines[side].set_visible(False) + axis.set_ylabel("distance") + + return axis + + def to_numpy(self): + """Return a numpy array representation of the single linkage tree. + + This representation conforms to the scipy.cluster.hierarchy notion + of a single linkage tree, and can be used with all the associated + scipy tools. Please see the scipy documentation for more details + on the format. + """ + return self._linkage.copy() + + def to_pandas(self): + """Return a pandas dataframe representation of the single linkage tree. + + Each row of the dataframe corresponds to an edge in the tree. + The columns of the dataframe are `parent`, `left_child`, + `right_child`, `distance` and `size`. + + The `parent`, `left_child` and `right_child` are the ids of the + parent and child nodes in the tree. Node ids less than the number + of points in the original dataset represent individual points, while + ids greater than the number of points are clusters. + + The `distance` value is the at which the child nodes merge to form + the parent node. + + The `size` is the number of points in the `parent` node. + """ + try: + from pandas import DataFrame, Series + except ImportError: + raise ImportError( + "You must have pandas installed to export pandas DataFrames" + ) + + max_node = 2 * self._linkage.shape[0] + num_points = max_node - (self._linkage.shape[0] - 1) + + parent_array = np.arange(num_points, max_node + 1) + + result = DataFrame( + { + "parent": parent_array, + "left_child": self._linkage.T[0], + "right_child": self._linkage.T[1], + "distance": self._linkage.T[2], + "size": self._linkage.T[3], + } + )[["parent", "left_child", "right_child", "distance", "size"]] + + return result + + def to_networkx(self): + """Return a NetworkX DiGraph object representing the single linkage tree. + + Edge weights in the graph are the distance values at which child nodes + merge to form the parent cluster. 
+
+        Nodes have a `size` attribute attached giving the number of points
+        that are in the cluster.
+        """
+        try:
+            from networkx import DiGraph, set_node_attributes
+        except ImportError:
+            raise ImportError(
+                "You must have networkx installed to export networkx graphs"
+            )
+
+        max_node = 2 * self._linkage.shape[0]
+        num_points = max_node - (self._linkage.shape[0] - 1)
+
+        result = DiGraph()
+        for parent, row in enumerate(self._linkage, num_points):
+            result.add_edge(parent, row[0], weight=row[2])
+            result.add_edge(parent, row[1], weight=row[2])
+
+        size_dict = {
+            parent: row[3] for parent, row in enumerate(self._linkage, num_points)
+        }
+        set_node_attributes(result, size_dict, "size")
+
+        return result
+
+    def get_clusters(self, cut_distance, min_cluster_size=5):
+        """Return a flat clustering from the single linkage hierarchy.
+
+        This represents the result of selecting a cut value for robust single linkage
+        clustering. The `min_cluster_size` allows the flat clustering to declare noise
+        points (and clusters smaller than `min_cluster_size`).
+
+        Parameters
+        ----------
+
+        cut_distance : float
+            The mutual reachability distance cut value to use to generate a
+            flat clustering.
+
+        min_cluster_size : int, optional
+            Clusters smaller than this value will be called 'noise' and remain
+            unclustered in the resulting flat clustering.
+
+        Returns
+        -------
+
+        labels : array [n_samples]
+            An array of cluster labels, one per datapoint. Unclustered points
+            are assigned the label -1.
+        """
+        return labelling_at_cut(self._linkage, cut_distance, min_cluster_size)
+
+
+class MinimumSpanningTree(object):
+    def __init__(self, mst, data):
+        self._mst = mst
+        self._data = data
+
+    def plot(
+        self,
+        axis=None,
+        node_size=40,
+        node_color="k",
+        node_alpha=0.8,
+        edge_alpha=0.5,
+        edge_cmap="viridis_r",
+        edge_linewidth=2,
+        vary_line_width=True,
+        colorbar=True,
+    ):
+        """Plot the minimum spanning tree (as projected into 2D by t-SNE if required).
+
+        Parameters
+        ----------
+
+        axis : matplotlib axis, optional
+            The axis to render the plot to
+
+        node_size : int, optional
+            The size of nodes in the plot (default 40).
+
+        node_color : matplotlib color spec, optional
+            The color to render nodes (default black).
+
+        node_alpha : float, optional
+            The alpha value (between 0 and 1) to render nodes with
+            (default 0.8).
+
+        edge_cmap : matplotlib colormap, optional
+            The colormap to color edges by (varying color by edge
+            weight/distance). Can be a cmap object or a string
+            recognised by matplotlib. (default `viridis_r`)
+
+        edge_alpha : float, optional
+            The alpha value (between 0 and 1) to render edges with
+            (default 0.5).
+
+        edge_linewidth : float, optional
+            The linewidth to use for rendering edges (default 2).
+
+        vary_line_width : bool, optional
+            Edge width is proportional to (log of) the inverse of the
+            mutual reachability distance. (default True)
+
+        colorbar : bool, optional
+            Whether to draw a colorbar. (default True)
+
+        Returns
+        -------
+
+        axis : matplotlib axis
+            The axis used to render the plot.
+        """
+        try:
+            import matplotlib.pyplot as plt
+            from matplotlib.collections import LineCollection
+        except ImportError:
+            raise ImportError(
+                "You must install the matplotlib library to plot the minimum spanning"
+                " tree."
+ ) + + if self._data.shape[0] > 32767: + warn("Too many data points for safe rendering of an minimal spanning tree!") + return None + + if axis is None: + axis = plt.gca() + + if self._data.shape[1] > 2: + # Get a 2D projection; if we have a lot of dimensions use PCA first + if self._data.shape[1] > 32: + # Use PCA to get down to 32 dimension + data_for_projection = PCA(n_components=32).fit_transform(self._data) + else: + data_for_projection = self._data.copy() + + projection = TSNE().fit_transform(data_for_projection) + else: + projection = self._data.copy() + + if vary_line_width: + line_width = edge_linewidth * ( + np.log(self._mst.T[2].max() / self._mst.T[2]) + 1.0 + ) + else: + line_width = edge_linewidth + + line_coords = projection[self._mst[:, :2].astype(int)] + line_collection = LineCollection( + line_coords, linewidth=line_width, cmap=edge_cmap, alpha=edge_alpha + ) + line_collection.set_array(self._mst[:, 2].T) + + axis.add_artist(line_collection) + axis.scatter( + projection.T[0], + projection.T[1], + c=node_color, + alpha=node_alpha, + s=node_size, + ) + axis.set_xticks([]) + axis.set_yticks([]) + + if colorbar: + cb = plt.colorbar(line_collection, ax=axis) + cb.ax.set_ylabel("Mutual reachability distance") + + return axis + + def to_numpy(self): + """Return a numpy array of weighted edges in the minimum spanning tree""" + return self._mst.copy() + + def to_pandas(self): + """Return a Pandas dataframe of the minimum spanning tree. + + Each row is an edge in the tree; the columns are `from`, + `to`, and `distance` giving the two vertices of the edge + which are indices into the dataset, and the distance + between those datapoints. + """ + try: + from pandas import DataFrame + except ImportError: + raise ImportError( + "You must have pandas installed to export pandas DataFrames" + ) + + result = DataFrame( + { + "from": self._mst.T[0].astype(int), + "to": self._mst.T[1].astype(int), + "distance": self._mst.T[2], + } + ) + return result + + def to_networkx(self): + """Return a NetworkX Graph object representing the minimum spanning tree. + + Edge weights in the graph are the distance between the nodes they connect. + + Nodes have a `data` attribute attached giving the data vector of the + associated point. + """ + try: + from networkx import Graph, set_node_attributes + except ImportError: + raise ImportError( + "You must have networkx installed to export networkx graphs" + ) + + result = Graph() + for row in self._mst: + result.add_edge(row[0], row[1], weight=row[2]) + + data_dict = {index: tuple(row) for index, row in enumerate(self._data)} + set_node_attributes(result, data_dict, "data") + + return result diff --git a/sklearn/cluster/_hdbscan/prediction.py b/sklearn/cluster/_hdbscan/prediction.py new file mode 100644 index 0000000000000..888ce25b05b2f --- /dev/null +++ b/sklearn/cluster/_hdbscan/prediction.py @@ -0,0 +1,696 @@ +# Support various prediction methods for predicting cluster membership +# of new or unseen points. There are several ways to interpret how +# to do this correctly, so we provide several methods for +# the different use cases that may arise. 
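+#
+# A rough usage sketch of this module (illustrative only; ``X`` and
+# ``new_points`` stand in for arbitrary training and query arrays):
+#
+#     clusterer = HDBSCAN(min_cluster_size=10, prediction_data=True).fit(X)
+#     labels, probabilities = approximate_predict(clusterer, new_points)
+#     memberships = membership_vector(clusterer, new_points)
+#     all_memberships = all_points_membership_vectors(clusterer)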
+ +import numpy as np + +from sklearn.neighbors import KDTree, BallTree +from .dist_metrics import DistanceMetric +from ._hdbscan_tree import recurse_leaf_dfs +from ._prediction_utils import ( + get_tree_row_with_child, + dist_membership_vector, + outlier_membership_vector, + prob_in_some_cluster, + all_points_dist_membership_vector, + all_points_outlier_membership_vector, + all_points_prob_in_some_cluster, +) +from warnings import warn + + +class PredictionData(object): + """ + Extra data that allows for faster prediction if cached. + + Parameters + ---------- + + data : array (n_samples, n_features) + The original data set that was clustered + + condensed_tree : CondensedTree + The condensed tree object created by a clustering + + min_samples : int + The min_samples value used in clustering + + tree_type : string, optional + Which type of space tree to use for core distance computation. + One of: + * ``kdtree`` + * ``balltree`` + + metric : string, optional + The metric used to determine distance for the clustering. + This is the metric that will be used for the space tree to determine + core distances etc. + + **kwargs : + Any further arguments to the metric. + + Attributes + ---------- + + raw_data : array (n_samples, n_features) + The original data set that was clustered + + tree : KDTree or BallTree + A space partitioning tree that can be queried for nearest neighbors. + + core_distances : array (n_samples,) + The core distances for every point in the original data set. + + cluster_map : dict + A dictionary mapping cluster numbers in the condensed tree to labels + in the final selected clustering. + + cluster_tree : structured array + A version of the condensed tree that only contains clusters, not + individual points. + + max_lambdas : dict + A dictionary mapping cluster numbers in the condensed tree to the + maximum lambda value seen in that cluster. 
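+
+    exemplars : list of arrays
+        A list, with one entry per selected cluster, of the exemplar points
+        of that cluster, i.e. the points that persist in the cluster up to
+        its largest lambda values.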
+ """ + + _tree_type_map = {"kdtree": KDTree, "balltree": BallTree} + + def _clusters_below(self, cluster): + result = [] + to_process = [cluster] + + while to_process: + result.extend(to_process) + to_process = self.cluster_tree["child"][ + np.in1d(self.cluster_tree["parent"], to_process) + ] + to_process = to_process.tolist() + + return result + + def _recurse_leaf_dfs(self, current_node): + children = self.cluster_tree[self.cluster_tree["parent"] == current_node][ + "child" + ] + if len(children) == 0: + return [ + current_node, + ] + else: + return sum( + [recurse_leaf_dfs(self.cluster_tree, child) for child in children], [] + ) + + def __init__( + self, + data, + condensed_tree, + min_samples, + tree_type="kdtree", + metric="euclidean", + **kwargs, + ): + self.raw_data = data.astype(np.float64) + self.tree = self._tree_type_map[tree_type]( + self.raw_data, metric=metric, **kwargs + ) + self.core_distances = self.tree.query(data, k=min_samples)[0][:, -1] + self.dist_metric = DistanceMetric.get_metric(metric, **kwargs) + + selected_clusters = sorted(condensed_tree._select_clusters()) + # raw_condensed_tree = condensed_tree.to_numpy() + raw_condensed_tree = condensed_tree._raw_tree + + self.cluster_map = {c: n for n, c in enumerate(sorted(list(selected_clusters)))} + self.reverse_cluster_map = {n: c for c, n in self.cluster_map.items()} + + self.cluster_tree = raw_condensed_tree[raw_condensed_tree["child_size"] > 1] + self.max_lambdas = {} + self.leaf_max_lambdas = {} + self.exemplars = [] + + all_clusters = set( + np.hstack([self.cluster_tree["parent"], self.cluster_tree["child"]]) + ) + + for cluster in all_clusters: + self.leaf_max_lambdas[cluster] = raw_condensed_tree["lambda_val"][ + raw_condensed_tree["parent"] == cluster + ].max() + + for cluster in selected_clusters: + self.max_lambdas[cluster] = raw_condensed_tree["lambda_val"][ + raw_condensed_tree["parent"] == cluster + ].max() + + for sub_cluster in self._clusters_below(cluster): + self.cluster_map[sub_cluster] = self.cluster_map[cluster] + self.max_lambdas[sub_cluster] = self.max_lambdas[cluster] + + cluster_exemplars = np.array([], dtype=np.int64) + for leaf in self._recurse_leaf_dfs(cluster): + leaf_max_lambda = raw_condensed_tree["lambda_val"][ + raw_condensed_tree["parent"] == leaf + ].max() + points = raw_condensed_tree["child"][ + (raw_condensed_tree["parent"] == leaf) + & (raw_condensed_tree["lambda_val"] == leaf_max_lambda) + ] + cluster_exemplars = np.hstack([cluster_exemplars, points]) + + self.exemplars.append(self.raw_data[cluster_exemplars]) + + +def _find_neighbor_and_lambda( + neighbor_indices, neighbor_distances, core_distances, min_samples +): + """ + Find the nearest mutual reachability neighbor of a point, and compute + the associated lambda value for the point, given the mutual reachability + distance to a nearest neighbor. + + Parameters + ---------- + neighbor_indices : array (2 * min_samples, ) + An array of raw distance based nearest neighbor indices. + + neighbor_distances : array (2 * min_samples, ) + An array of raw distances to the nearest neighbors. + + core_distances : array (n_samples, ) + An array of core distances for all points + + min_samples : int + The min_samples value used to generate core distances. + + Returns + ------- + neighbor : int + The index into the full raw data set of the nearest mutual reachability + distance neighbor of the point. + + lambda_ : float + The lambda value at which this point joins/merges with `neighbor`. 
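+
+    Notes
+    -----
+    The candidate distances considered are mutual reachability distances,
+    ``max(core_k(candidate), core_k(point), d(point, candidate))``, and the
+    returned lambda value is the reciprocal of the smallest such distance
+    (or the largest representable double if that distance is zero).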
+ """ + neighbor_core_distances = core_distances[neighbor_indices] + point_core_distances = neighbor_distances[min_samples] * np.ones( + neighbor_indices.shape[0] + ) + mr_distances = np.vstack( + (neighbor_core_distances, point_core_distances, neighbor_distances) + ).max(axis=0) + + nn_index = mr_distances.argmin() + + nearest_neighbor = neighbor_indices[nn_index] + if mr_distances[nn_index] > 0.0: + lambda_ = 1.0 / mr_distances[nn_index] + else: + lambda_ = np.finfo(np.double).max + + return nearest_neighbor, lambda_ + + +def _extend_condensed_tree( + tree, neighbor_indices, neighbor_distances, core_distances, min_samples +): + """ + Create a new condensed tree with an additional point added, allowing for + computations as if this point had been part of the original tree. Note + that this makes as little change to the tree as possible, with no + re-optimizing/re-condensing so that the selected clusters remain + effectively unchanged. + + Parameters + ---------- + tree : structured array + The raw format condensed tree to update. + + neighbor_indices : array (2 * min_samples, ) + An array of raw distance based nearest neighbor indices. + + neighbor_distances : array (2 * min_samples, ) + An array of raw distances to the nearest neighbors. + + core_distances : array (n_samples, ) + An array of core distances for all points + + min_samples : int + The min_samples value used to generate core distances. + + Returns + ------- + new_tree : structured array + The original tree with an extra row providing the parent cluster + and lambda information for a new point given index -1. + """ + tree_root = tree["parent"].min() + + nearest_neighbor, lambda_ = _find_neighbor_and_lambda( + neighbor_indices, neighbor_distances, core_distances, min_samples + ) + + neighbor_tree_row = get_tree_row_with_child(tree, nearest_neighbor) + potential_cluster = neighbor_tree_row["parent"] + + if neighbor_tree_row["lambda_val"] <= lambda_: + # New point departs with the old + new_tree_row = (potential_cluster, -1, 1, neighbor_tree_row["lambda_val"]) + else: + # Find appropriate cluster based on lambda of new point + while ( + potential_cluster > tree_root + and tree[tree["child"] == potential_cluster]["lambda_val"] >= lambda_ + ): + potential_cluster = tree["parent"][tree["child"] == potential_cluster][0] + + new_tree_row = (potential_cluster, -1, 1, lambda_) + + return np.append(tree, new_tree_row) + + +def _find_cluster_and_probability( + tree, + cluster_tree, + neighbor_indices, + neighbor_distances, + core_distances, + cluster_map, + max_lambdas, + min_samples, +): + """ + Return the cluster label (of the original clustering) and membership + probability of a new data point. + + Parameters + ---------- + tree : CondensedTree + The condensed tree associated with the clustering. + + cluster_tree : structured_array + The raw form of the condensed tree with only cluster information (no + data on individual points). This is significantly more compact. + + neighbor_indices : array (2 * min_samples, ) + An array of raw distance based nearest neighbor indices. + + neighbor_distances : array (2 * min_samples, ) + An array of raw distances to the nearest neighbors. + + core_distances : array (n_samples, ) + An array of core distances for all points + + cluster_map : dict + A dictionary mapping cluster numbers in the condensed tree to labels + in the final selected clustering. + + max_lambdas : dict + A dictionary mapping cluster numbers in the condensed tree to the + maximum lambda value seen in that cluster. 
+ + min_samples : int + The min_samples value used to generate core distances. + """ + raw_tree = tree._raw_tree + tree_root = cluster_tree["parent"].min() + + nearest_neighbor, lambda_ = _find_neighbor_and_lambda( + neighbor_indices, neighbor_distances, core_distances, min_samples + ) + + neighbor_tree_row = get_tree_row_with_child(raw_tree, nearest_neighbor) + potential_cluster = neighbor_tree_row["parent"] + + if neighbor_tree_row["lambda_val"] > lambda_: + # Find appropriate cluster based on lambda of new point + while ( + potential_cluster > tree_root + and cluster_tree["lambda_val"][cluster_tree["child"] == potential_cluster] + >= lambda_ + ): + potential_cluster = cluster_tree["parent"][ + cluster_tree["child"] == potential_cluster + ][0] + + if potential_cluster in cluster_map: + cluster_label = cluster_map[potential_cluster] + else: + cluster_label = -1 + + if cluster_label >= 0: + max_lambda = max_lambdas[potential_cluster] + + if max_lambda > 0.0: + lambda_ = min(max_lambda, lambda_) + prob = lambda_ / max_lambda + else: + prob = 1.0 + else: + prob = 0.0 + + return cluster_label, prob + + +def approximate_predict(clusterer, points_to_predict): + """Predict the cluster label of new points. The returned labels + will be those of the original clustering found by ``clusterer``, + and therefore are not (necessarily) the cluster labels that would + be found by clustering the original data combined with + ``points_to_predict``, hence the 'approximate' label. + + If you simply wish to assign new points to an existing clustering + in the 'best' way possible, this is the function to use. If you + want to predict how ``points_to_predict`` would cluster with + the original data under HDBSCAN the most efficient existing approach + is to simply recluster with the new point(s) added to the original dataset. + + Parameters + ---------- + clusterer : HDBSCAN + A clustering object that has been fit to the data and + either had ``prediction_data=True`` set, or called the + ``generate_prediction_data`` method after the fact. + + points_to_predict : array, or array-like (n_samples, n_features) + The new data points to predict cluster labels for. They should + have the same dimensionality as the original dataset over which + clusterer was fit. + + Returns + ------- + labels : array (n_samples,) + The predicted labels of the ``points_to_predict`` + + probabilities : array (n_samples,) + The soft cluster scores for each of the ``points_to_predict`` + + See Also + -------- + :py:func:`hdbscan.predict.membership_vector` + :py:func:`hdbscan.predict.all_points_membership_vectors` + + """ + if clusterer.prediction_data_ is None: + raise ValueError( + "Clusterer does not have prediction data!" + " Try fitting with prediction_data=True set," + " or run generate_prediction_data on the clusterer" + ) + + points_to_predict = np.asarray(points_to_predict) + + if points_to_predict.shape[1] != clusterer.prediction_data_.raw_data.shape[1]: + raise ValueError("New points dimension does not match fit data!") + + if clusterer.prediction_data_.cluster_tree.shape[0] == 0: + warn( + "Clusterer does not have any defined clusters, new data" + " will be automatically predicted as noise." 
+ ) + labels = -1 * np.ones(points_to_predict.shape[0], dtype=np.int32) + probabilities = np.zeros(points_to_predict.shape[0], dtype=np.float32) + return labels, probabilities + + labels = np.empty(points_to_predict.shape[0], dtype=np.int32) + probabilities = np.empty(points_to_predict.shape[0], dtype=np.float64) + + min_samples = clusterer.min_samples or clusterer.min_cluster_size + neighbor_distances, neighbor_indices = clusterer.prediction_data_.tree.query( + points_to_predict, k=2 * min_samples + ) + + for i in range(points_to_predict.shape[0]): + label, prob = _find_cluster_and_probability( + clusterer.condensed_tree_, + clusterer.prediction_data_.cluster_tree, + neighbor_indices[i], + neighbor_distances[i], + clusterer.prediction_data_.core_distances, + clusterer.prediction_data_.cluster_map, + clusterer.prediction_data_.max_lambdas, + min_samples, + ) + labels[i] = label + probabilities[i] = prob + + return labels, probabilities + + +def approximate_predict_scores(clusterer, points_to_predict): + """Predict the outlier score of new points. The returned scores + will be based on the original clustering found by ``clusterer``, + and therefore are not (necessarily) the outlier scores that would + be found by clustering the original data combined with + ``points_to_predict``, hence the 'approximate' label. + + If you simply wish to calculate the outlier scores for new points + in the 'best' way possible, this is the function to use. If you + want to predict the outlier score of ``points_to_predict`` with + the original data under HDBSCAN the most efficient existing approach + is to simply recluster with the new point(s) added to the original dataset. + + Parameters + ---------- + clusterer : HDBSCAN + A clustering object that has been fit to the data and + either had ``prediction_data=True`` set, or called the + ``generate_prediction_data`` method after the fact. + + points_to_predict : array, or array-like (n_samples, n_features) + The new data points to predict cluster labels for. They should + have the same dimensionality as the original dataset over which + clusterer was fit. + + Returns + ------- + scores : array (n_samples,) + The predicted scores of the ``points_to_predict`` + + See Also + -------- + :py:func:`hdbscan.predict.membership_vector` + :py:func:`hdbscan.predict.all_points_membership_vectors` + + """ + try: + clusterer.prediction_data_ + except AttributeError: + raise ValueError( + "Clusterer does not have prediction data!" + " Try fitting with prediction_data=True set," + " or run generate_prediction_data on the clusterer" + ) + + points_to_predict = np.asarray(points_to_predict) + + if points_to_predict.shape[1] != clusterer.prediction_data_.raw_data.shape[1]: + raise ValueError("New points dimension does not match fit data!") + + if clusterer.prediction_data_.cluster_tree.shape[0] == 0: + warn( + "Clusterer does not have any defined clusters, new data" + " will be automatically predicted as outliers." 
+ ) + scores = np.ones(points_to_predict.shape[0], dtype=np.int32) + return scores + + scores = np.empty(points_to_predict.shape[0], dtype=np.float64) + + min_samples = clusterer.min_samples or clusterer.min_cluster_size + neighbor_distances, neighbor_indices = clusterer.prediction_data_.tree.query( + points_to_predict, k=2 * min_samples + ) + + tree = clusterer.condensed_tree_._raw_tree + + parent_array = tree["parent"] + + tree_root = parent_array.min() + max_lambdas = {} + for parent in np.unique(tree["parent"]): + max_lambdas[parent] = tree[tree["parent"] == parent]["lambda_val"].max() + + for n in np.argsort(parent_array): + cluster = tree["child"][n] + if cluster < tree_root: + break + + parent = parent_array[n] + if max_lambdas[cluster] > max_lambdas[parent]: + max_lambdas[parent] = max_lambdas[cluster] + + for i in range(points_to_predict.shape[0]): + neigh, lambda_ = _find_neighbor_and_lambda( + neighbor_indices[i], + neighbor_distances[i], + clusterer.prediction_data_.core_distances, + min_samples, + ) + + neighbor_tree_row = get_tree_row_with_child(tree, neigh) + potential_cluster = neighbor_tree_row["parent"] + + if neighbor_distances[i].min() == 0: + # the point is in the dataset, fix lambda for rounding errors + lambda_ = neighbor_tree_row["lambda_val"] + + max_lambda = max_lambdas[potential_cluster] + + if max_lambda > 0.0: + scores[i] = (max_lambda - lambda_) / max_lambda + else: + scores[i] = 0.0 + + return scores + + +def membership_vector(clusterer, points_to_predict): + """Predict soft cluster membership. The result produces a vector + for each point in ``points_to_predict`` that gives a probability that + the given point is a member of a cluster for each of the selected clusters + of the ``clusterer``. + + Parameters + ---------- + clusterer : HDBSCAN + A clustering object that has been fit to the data and + either had ``prediction_data=True`` set, or called the + ``generate_prediction_data`` method after the fact. + + points_to_predict : array, or array-like (n_samples, n_features) + The new data points to predict cluster labels for. They should + have the same dimensionality as the original dataset over which + clusterer was fit. + + Returns + ------- + membership_vectors : array (n_samples, n_clusters) + The probability that point ``i`` is a member of cluster ``j`` is + in ``membership_vectors[i, j]``. 
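+
+    Notes
+    -----
+    The membership vector for each point blends a distance based score,
+    computed against the exemplar points of each cluster, with an outlier
+    based score derived from the condensed tree; the combined vector is
+    renormalised and then scaled by the probability that the point belongs
+    to any cluster at all.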
+ + See Also + -------- + :py:func:`hdbscan.predict.predict` + :py:func:`hdbscan.predict.all_points_membership_vectors`""" + + points_to_predict = points_to_predict.astype(np.float64) + clusters = np.array( + sorted(list(clusterer.condensed_tree_._select_clusters())) + ).astype(np.intp) + + result = np.empty((points_to_predict.shape[0], clusters.shape[0]), dtype=np.float64) + + min_samples = clusterer.min_samples or clusterer.min_cluster_size + neighbor_distances, neighbor_indices = clusterer.prediction_data_.tree.query( + points_to_predict, k=2 * min_samples + ) + + for i in range(points_to_predict.shape[0]): + + # We need to find where in the tree the new point would go + # for the purposes of outlier membership approximation + nearest_neighbor, lambda_ = _find_neighbor_and_lambda( + neighbor_indices[i], + neighbor_distances[i], + clusterer.prediction_data_.core_distances, + min_samples, + ) + + neighbor_tree_row = get_tree_row_with_child( + clusterer.condensed_tree_._raw_tree, nearest_neighbor + ) + + if neighbor_tree_row["lambda_val"] <= lambda_: + lambda_ = neighbor_tree_row["lambda_val"] + + distance_vec = dist_membership_vector( + points_to_predict[i], + clusterer.prediction_data_.exemplars, + clusterer.prediction_data_.dist_metric, + ) + outlier_vec = outlier_membership_vector( + nearest_neighbor, + lambda_, + clusters, + clusterer.condensed_tree_._raw_tree, + clusterer.prediction_data_.leaf_max_lambdas, + clusterer.prediction_data_.cluster_tree, + ) + + result[i] = distance_vec**0.5 * outlier_vec**2.0 + result[i] /= result[i].sum() + + result[i] *= prob_in_some_cluster( + nearest_neighbor, + lambda_, + clusters, + clusterer.condensed_tree_._raw_tree, + clusterer.prediction_data_.leaf_max_lambdas, + clusterer.prediction_data_.cluster_tree, + ) + + return result + + +def all_points_membership_vectors(clusterer): + """Predict soft cluster membership vectors for all points in the + original dataset the clusterer was trained on. This function is more + efficient by making use of the fact that all points are already in the + condensed tree, and processing in bulk. + + Parameters + ---------- + clusterer : HDBSCAN + A clustering object that has been fit to the data and + either had ``prediction_data=True`` set, or called the + ``generate_prediction_data`` method after the fact. + This method does not work if the clusterer was trained + with ``metric='precomputed'``. + + Returns + ------- + membership_vectors : array (n_samples, n_clusters) + The probability that point ``i`` of the original dataset is a member of + cluster ``j`` is in ``membership_vectors[i, j]``. 
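+
+    Notes
+    -----
+    If the clustering selected no clusters at all, an array of zeros with
+    one entry per training point is returned. A rough sketch of a typical
+    follow-up step (illustrative only)::
+
+        soft_labels = all_points_membership_vectors(clusterer)
+        hard_labels = soft_labels.argmax(axis=1)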
+ + See Also + -------- + :py:func:`hdbscan.predict.predict` + :py:func:`hdbscan.predict.all_points_membership_vectors` + """ + clusters = np.array( + sorted(list(clusterer.condensed_tree_._select_clusters())) + ).astype(np.intp) + all_points = clusterer.prediction_data_.raw_data + + # When no clusters found, return array of 0's + if clusters.size == 0: + return np.zeros(all_points.shape[0]) + + distance_vecs = all_points_dist_membership_vector( + all_points, + clusterer.prediction_data_.exemplars, + clusterer.prediction_data_.dist_metric, + ) + outlier_vecs = all_points_outlier_membership_vector( + clusters, + clusterer.condensed_tree_._raw_tree, + clusterer.prediction_data_.leaf_max_lambdas, + clusterer.prediction_data_.cluster_tree, + ) + in_cluster_probs = all_points_prob_in_some_cluster( + clusters, + clusterer.condensed_tree_._raw_tree, + clusterer.prediction_data_.leaf_max_lambdas, + clusterer.prediction_data_.cluster_tree, + ) + + result = distance_vecs * outlier_vecs + row_sums = result.sum(axis=1) + result = result / row_sums[:, np.newaxis] + result *= in_cluster_probs[:, np.newaxis] + + return result diff --git a/sklearn/cluster/_hdbscan/robust_single_linkage_.py b/sklearn/cluster/_hdbscan/robust_single_linkage_.py new file mode 100644 index 0000000000000..760f9a2335edd --- /dev/null +++ b/sklearn/cluster/_hdbscan/robust_single_linkage_.py @@ -0,0 +1,463 @@ +# -*- coding: utf-8 -*- +""" +Robust Single Linkage: Density based single linkage clustering. +""" +import numpy as np + +from sklearn.base import BaseEstimator, ClusterMixin +from sklearn.metrics import pairwise_distances +from scipy.sparse import issparse + +from joblib import Memory, cpu_count +from sklearn.utils import check_array + +from ._hdbscan_linkage import mst_linkage_core, mst_linkage_core_vector, label +from ._hdbscan_boruvka import KDTreeBoruvkaAlgorithm, BallTreeBoruvkaAlgorithm +from .dist_metrics import DistanceMetric +from ._hdbscan_reachability import mutual_reachability +from .plots import SingleLinkageTree +from sklearn.neighbors import KDTree, BallTree + +# Author: Leland McInnes +# +# License: BSD 3 clause + +FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics + + +def _rsl_generic(X, k=5, alpha=1.4142135623730951, metric="euclidean", **kwargs): + distance_matrix = pairwise_distances(X, metric=metric, **kwargs) + + mutual_reachability_ = mutual_reachability(distance_matrix, k) + + min_spanning_tree = mst_linkage_core(mutual_reachability_) + min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :] + + single_linkage_tree = label(min_spanning_tree) + single_linkage_tree = SingleLinkageTree(single_linkage_tree) + + return single_linkage_tree + + +def _rsl_prims_kdtree(X, k=5, alpha=1.4142135623730951, metric="euclidean", **kwargs): + + # The Cython routines used require contiguous arrays + if not X.flags["C_CONTIGUOUS"]: + X = np.array(X, dtype=np.double, order="C") + + dim = X.shape[0] + k = min(dim - 1, k) + + tree = KDTree(X, metric=metric, **kwargs) + + dist_metric = DistanceMetric.get_metric(metric, **kwargs) + + core_distances = tree.query(X, k=k)[0][:, -1].copy(order="C") + min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha) + + single_linkage_tree = label(min_spanning_tree) + single_linkage_tree = SingleLinkageTree(single_linkage_tree) + + return single_linkage_tree + + +def _rsl_prims_balltree(X, k=5, alpha=1.4142135623730951, metric="euclidean", **kwargs): + + # The Cython routines used require contiguous arrays + if not 
X.flags["C_CONTIGUOUS"]: + X = np.array(X, dtype=np.double, order="C") + + dim = X.shape[0] + k = min(dim - 1, k) + + tree = BallTree(X, metric=metric, **kwargs) + + dist_metric = DistanceMetric.get_metric(metric, **kwargs) + + core_distances = tree.query(X, k=k)[0][:, -1].copy(order="C") + min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha) + + single_linkage_tree = label(min_spanning_tree) + single_linkage_tree = SingleLinkageTree(single_linkage_tree) + + return single_linkage_tree + + +def _rsl_boruvka_kdtree( + X, k=5, alpha=1.0, metric="euclidean", leaf_size=40, core_dist_n_jobs=4, **kwargs +): + + if core_dist_n_jobs < 1: + core_dist_n_jobs = max(cpu_count() + 1 + core_dist_n_jobs, 1) + + dim = X.shape[0] + min_samples = min(dim - 1, k) + + tree = KDTree(X, metric=metric, leaf_size=leaf_size, **kwargs) + alg = KDTreeBoruvkaAlgorithm( + tree, min_samples, metric=metric, alpha=alpha, leaf_size=leaf_size, **kwargs + ) + min_spanning_tree = alg.spanning_tree() + + single_linkage_tree = label(min_spanning_tree) + single_linkage_tree = SingleLinkageTree(single_linkage_tree) + + return single_linkage_tree + + +def _rsl_boruvka_balltree( + X, k=5, alpha=1.0, metric="euclidean", leaf_size=40, core_dist_n_jobs=4, **kwargs +): + + if core_dist_n_jobs < 1: + core_dist_n_jobs = max(cpu_count() + 1 + core_dist_n_jobs, 1) + + dim = X.shape[0] + min_samples = min(dim - 1, k) + + tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs) + alg = BallTreeBoruvkaAlgorithm( + tree, min_samples, metric=metric, alpha=alpha, leaf_size=leaf_size, **kwargs + ) + min_spanning_tree = alg.spanning_tree() + + single_linkage_tree = label(min_spanning_tree) + single_linkage_tree = SingleLinkageTree(single_linkage_tree) + + return single_linkage_tree + + +def robust_single_linkage( + X, + cut, + k=5, + alpha=1.4142135623730951, + gamma=5, + metric="euclidean", + algorithm="best", + memory=Memory(cachedir=None, verbose=0), + leaf_size=40, + core_dist_n_jobs=4, + **kwargs, +): + """Perform robust single linkage clustering from a vector array + or distance matrix. + + Parameters + ---------- + X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ + array of shape (n_samples, n_samples) + A feature array, or array of distances between samples if + ``metric='precomputed'``. + + cut : float + The reachability distance value to cut the cluster heirarchy at + to derive a flat cluster labelling. + + k : int, optional (default=5) + Reachability distances will be computed with regard to the `k` + nearest neighbors. + + alpha : float, optional (default=np.sqrt(2)) + Distance scaling for reachability distance computation. Reachability + distance is computed as + + .. math:: + `\max(core_k(a), core_k(b), 1/\alpha d(a,b))`. + + gamma : int, optional (default=5) + Ignore any clusters in the flat clustering with size less than gamma, + and declare points in such clusters as noise points. + + metric : string, or callable, optional (default='euclidean') + The metric to use when calculating distance between instances in a + feature array. If metric is a string or callable, it must be one of + the options allowed by metrics.pairwise.pairwise_distances for its + metric parameter. + If metric is "precomputed", X is assumed to be a distance matrix and + must be square. + + algorithm : string, optional (default='best') + Exactly which algorithm to use; hdbscan has variants specialised + for different characteristics of the data. 
By default this is set + to ``best`` which chooses the "best" algorithm given the nature of + the data. You can force other options if you believe you know + better. Options are: + * ``generic`` + * ``best`` + * ``prims_kdtree`` + * ``prims_balltree`` + * ``boruvka_kdtree`` + * ``boruvka_balltree`` + + memory : Instance of joblib.Memory or string (optional) + Used to cache the output of the computation of the tree. + By default, no caching is done. If a string is given, it is the + path to the caching directory. + + leaf_size : int, optional (default=40) + Leaf size for trees responsible for fast nearest + neighbour queries. + + core_dist_n_jobs : int, optional + Number of parallel jobs to run in core distance computations (if + supported by the specific algorithm). For ``core_dist_n_jobs`` + below -1, (n_cpus + 1 + core_dist_n_jobs) are used. + (default 4) + + Returns + ------- + labels : ndarray, shape (n_samples, ) + Cluster labels for each point. Noisy samples are given the label -1. + + single_linkage_tree : ndarray, shape (n_samples - 1, 4) + The single linkage tree produced during clustering in scipy + hierarchical clustering format + (see http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html). + + References + ---------- + .. [1] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the + cluster tree. In Advances in Neural Information Processing Systems + (pp. 343-351). + + """ + + if not isinstance(k, int) or k < 1: + raise ValueError("k must be an integer greater than zero!") + + if not isinstance(alpha, float) or alpha < 1.0: + raise ValueError("alpha must be a float greater than or equal to 1.0!") + + if not isinstance(gamma, int) or gamma < 1: + raise ValueError("gamma must be an integer greater than zero!") + + if not isinstance(leaf_size, int) or leaf_size < 1: + raise ValueError("Leaf size must be at least one!") + + if metric == "minkowski": + if "p" not in kwargs or kwargs["p"] is None: + raise TypeError("Minkowski metric given but no p value supplied!") + if kwargs["p"] < 0: + raise ValueError("Minkowski metric with negative p value is not defined!") + + X = check_array(X, accept_sparse="csr") + if isinstance(memory, str): + memory = Memory(cachedir=memory, verbose=0) + + if algorithm != "best": + if algorithm == "generic": + single_linkage_tree = memory.cache(_rsl_generic)( + X, k, alpha, metric, **kwargs + ) + elif algorithm == "prims_kdtree": + single_linkage_tree = memory.cache(_rsl_prims_kdtree)( + X, k, alpha, metric, **kwargs + ) + elif algorithm == "prims_balltree": + single_linkage_tree = memory.cache(_rsl_prims_balltree)( + X, k, alpha, metric, **kwargs + ) + elif algorithm == "boruvka_kdtree": + single_linkage_tree = memory.cache(_rsl_boruvka_kdtree)( + X, k, alpha, metric, leaf_size, core_dist_n_jobs, **kwargs + ) + elif algorithm == "boruvka_balltree": + single_linkage_tree = memory.cache(_rsl_boruvka_balltree)( + X, k, alpha, metric, leaf_size, core_dist_n_jobs, **kwargs + ) + else: + raise TypeError("Unknown algorithm type %s specified" % algorithm) + else: + if issparse(X) or metric not in FAST_METRICS: + # We can't do much with sparse matrices ... 
+            single_linkage_tree = memory.cache(_rsl_generic)(
+                X, k, alpha, metric, **kwargs
+            )
+        elif metric in KDTree.valid_metrics:
+            # Need heuristic to decide when to go to boruvka;
+            # still debugging for now
+            if X.shape[1] > 128:
+                single_linkage_tree = memory.cache(_rsl_prims_kdtree)(
+                    X, k, alpha, metric, **kwargs
+                )
+            else:
+                single_linkage_tree = memory.cache(_rsl_boruvka_kdtree)(
+                    X, k, alpha, metric, leaf_size, core_dist_n_jobs, **kwargs
+                )
+        else:  # Metric is a valid BallTree metric
+            # Need heuristic to decide when to go to boruvka;
+            # still debugging for now
+            if X.shape[1] > 128:
+                single_linkage_tree = memory.cache(_rsl_prims_balltree)(
+                    X, k, alpha, metric, **kwargs
+                )
+            else:
+                single_linkage_tree = memory.cache(_rsl_boruvka_balltree)(
+                    X, k, alpha, metric, leaf_size, core_dist_n_jobs, **kwargs
+                )
+
+    labels = single_linkage_tree.get_clusters(cut, gamma)
+
+    return labels, single_linkage_tree.to_numpy()
+
+
+class RobustSingleLinkage(BaseEstimator, ClusterMixin):
+    r"""Perform robust single linkage clustering from a vector array
+    or distance matrix.
+
+    Robust single linkage is a modified version of single linkage that
+    attempts to be more robust to noise. Specifically the goal is to
+    more accurately approximate the level set tree of the unknown
+    probability density function from which the sample data has
+    been drawn.
+
+    Parameters
+    ----------
+    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
+            array of shape (n_samples, n_samples)
+        A feature array, or array of distances between samples if
+        ``metric='precomputed'``.
+
+    cut : float
+        The reachability distance value to cut the cluster hierarchy at
+        to derive a flat cluster labelling.
+
+    k : int, optional (default=5)
+        Reachability distances will be computed with regard to the `k`
+        nearest neighbors.
+
+    alpha : float, optional (default=np.sqrt(2))
+        Distance scaling for reachability distance computation. Reachability
+        distance is computed as
+        :math:`\max\{core_k(a), core_k(b), 1/\alpha \, d(a,b)\}`.
+
+    gamma : int, optional (default=5)
+        Ignore any clusters in the flat clustering with size less than gamma,
+        and declare points in such clusters as noise points.
+
+    metric : string, or callable, optional (default='euclidean')
+        The metric to use when calculating distance between instances in a
+        feature array. If metric is a string or callable, it must be one of
+        the options allowed by metrics.pairwise.pairwise_distances for its
+        metric parameter.
+        If metric is "precomputed", X is assumed to be a distance matrix and
+        must be square.
+
+    metric_params : dict, optional (default={})
+        Keyword parameter arguments for calling the metric (for example
+        the p value if using the minkowski metric).
+
+    algorithm : string, optional (default='best')
+        Exactly which algorithm to use; hdbscan has variants specialised
+        for different characteristics of the data. By default this is set
+        to ``best`` which chooses the "best" algorithm given the nature of
+        the data. You can force other options if you believe you know
+        better. Options are:
+            * ``generic``
+            * ``best``
+            * ``prims_kdtree``
+            * ``prims_balltree``
+            * ``boruvka_kdtree``
+            * ``boruvka_balltree``
+
+    core_dist_n_jobs : int, optional
+        Number of parallel jobs to run in core distance computations (if
+        supported by the specific algorithm). For ``core_dist_n_jobs``
+        below -1, (n_cpus + 1 + core_dist_n_jobs) are used.
+        (default 4)
+
+    Attributes
+    ----------
+    labels_ : ndarray, shape (n_samples, )
+        Cluster labels for each point. Noisy samples are given the label -1.
+ + cluster_hierarchy_ : SingleLinkageTree object + The single linkage tree produced during clustering. + This object provides several methods for: + * Plotting + * Generating a flat clustering + * Exporting to NetworkX + * Exporting to Pandas + + References + ---------- + .. [1] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the + cluster tree. In Advances in Neural Information Processing Systems + (pp. 343-351). + + """ + + def __init__( + self, + cut=0.4, + k=5, + alpha=1.4142135623730951, + gamma=5, + metric="euclidean", + algorithm="best", + core_dist_n_jobs=4, + metric_params={}, + ): + + self.cut = cut + self.k = k + self.alpha = alpha + self.gamma = gamma + self.metric = metric + self.algorithm = algorithm + self.core_dist_n_jobs = core_dist_n_jobs + self.metric_params = metric_params + + def fit(self, X, y=None): + """Perform robust single linkage clustering from features or + distance matrix. + + Parameters + ---------- + X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ + array of shape (n_samples, n_samples) + A feature array, or array of distances between samples if + ``metric='precomputed'``. + + Returns + ------- + self : object + Returns self + """ + X = check_array(X, accept_sparse="csr") + + kwargs = self.get_params() + del kwargs["metric_params"] + kwargs.update(self.metric_params) + + self.labels_, self._cluster_hierarchy = robust_single_linkage(X, **kwargs) + + return self + + def fit_predict(self, X, y=None): + """Performs clustering on X and returns cluster labels. + + Parameters + ---------- + X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ + array of shape (n_samples, n_samples) + A feature array, or array of distances between samples if + ``metric='precomputed'``. + + Returns + ------- + y : ndarray, shape (n_samples, ) + cluster labels + """ + + self.fit(X) + return self.labels_ + + @property + def cluster_hierarchy_(self): + if hasattr(self, "_cluster_hierarchy"): + return SingleLinkageTree(self._cluster_hierarchy) + else: + raise AttributeError( + "No single linkage tree was generated; try running fit first." 
+ ) diff --git a/sklearn/cluster/_hdbscan/tests/__init__.py b/sklearn/cluster/_hdbscan/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sklearn/cluster/_hdbscan/tests/test_flat.py b/sklearn/cluster/_hdbscan/tests/test_flat.py new file mode 100644 index 0000000000000..25073ce0d23ec --- /dev/null +++ b/sklearn/cluster/_hdbscan/tests/test_flat.py @@ -0,0 +1,403 @@ +""" +Simple tests for flat clustering over HDBSCAN hierarchy +""" +import warnings +import numpy as np + +from sklearn.cluster import HDBSCAN, approximate_predict +from sklearn.cluster._hdbscan.flat import ( + HDBSCAN_flat, + approximate_predict_flat, + membership_vector_flat, + all_points_membership_vectors_flat, +) + +from sklearn.datasets import make_blobs, make_moons +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import train_test_split +from sklearn.utils._testing import assert_array_equal, assert_array_less + +# Ignore future warnings thrown by sklearn +warnings.filterwarnings("ignore", category=FutureWarning) + +# Create a nice dataset with 6 circular clusters and 2 moons +centers = [(0, 2), (-0.2, 0), (0.2, 0), (1.5, 0), (2.0, 1.0), (2.5, 0.0)] +std = [0.5, 0.08, 0.06, 0.35, 0.35, 0.35] +X0, y0 = make_blobs( + n_samples=[70, 30, 80, 100, 40, 150], + centers=centers, + cluster_std=std, + random_state=1, +) +X1, y1 = make_moons(n_samples=300, noise=0.07, random_state=42) +X1 += 3.0 +y1 += len(centers) +X = np.vstack((X0, X1)) +y = np.concatenate((y0, y1)) + +X, X_test, y, y_test = train_test_split(X, y, test_size=0.2, random_state=42) +scaler = StandardScaler() +X = scaler.fit_transform(X) +X_test = scaler.transform(X_test) + + +def n_clusters_from_labels(labels_): + return np.amax(labels_) + 1 + + +def test_flat_base_default(): + """ + Verify that the default clustering of HDBSCAN is preserved. + """ + # Given, the base HDBSCAN with method 'eom' + clusterer = HDBSCAN(cluster_selection_method="eom").fit(X) + n_clusters = n_clusters_from_labels(clusterer.labels_) + + # When we ask for flat clustering with same n_clusters, + clusterer_flat = HDBSCAN_flat( + X, n_clusters=n_clusters, cluster_selection_method="eom" + ) + + # Then, the labels and probabilities should match + assert_array_equal(clusterer_flat.labels_, clusterer.labels_) + assert_array_equal(clusterer_flat.probabilities_, clusterer.probabilities_) + + # Given, the base HDBSCAN with method 'leaf' + clusterer = HDBSCAN(cluster_selection_method="leaf").fit(X) + n_clusters = n_clusters_from_labels(clusterer.labels_) + + # When we ask for flat clustering with same n_clusters, + clusterer_flat = HDBSCAN_flat( + X, n_clusters=n_clusters, cluster_selection_method="leaf" + ) + + # Then, the labels and probabilities should match + assert_array_equal(clusterer_flat.labels_, clusterer.labels_) + assert_array_equal(clusterer_flat.probabilities_, clusterer.probabilities_) + return + + +def test_flat_base_epsilon(): + """ + Verify that a clustering of HDBSCAN specified by + cluster_selection_epsilon is preserved. + """ + # Method 'eom'... 
+ # Given, a flat clustering for required n_clusters, + n_clusters = 4 + clusterer_flat = HDBSCAN_flat( + X, n_clusters=n_clusters, cluster_selection_method="eom" + ) + + # When we run the base HDBSCAN using it's epsilon, + epsilon = clusterer_flat.cluster_selection_epsilon + clusterer = HDBSCAN( + cluster_selection_method="eom", cluster_selection_epsilon=epsilon + ).fit(X) + + # Then, the labels and probabilities should match + assert_array_equal(clusterer_flat.labels_, clusterer.labels_) + assert_array_equal(clusterer_flat.probabilities_, clusterer.probabilities_) + + # Method 'leaf'... + # Given, a flat clustering for required n_clusters, + n_clusters = 6 + clusterer_flat = HDBSCAN_flat( + X, n_clusters=n_clusters, cluster_selection_method="leaf" + ) + + # When we run the base HDBSCAN using it's epsilon, + epsilon = clusterer_flat.cluster_selection_epsilon + clusterer = HDBSCAN( + cluster_selection_method="leaf", cluster_selection_epsilon=epsilon + ).fit(X) + + # Then, the labels and probabilities should match + assert_array_equal(clusterer_flat.labels_, clusterer.labels_) + assert_array_equal(clusterer_flat.probabilities_, clusterer.probabilities_) + return + + +def test_switch_to_leaf(): + """ + Verify that when we request more clusters than 'eom' can handle, + method switches to 'leaf' and the results match 'leaf'. + """ + # Given the max number of clusters that can be produced by 'eom', + # (these are produced for epsilon=0) (??? Needs verification) + clusterer = HDBSCAN( + cluster_selection_method="eom", cluster_selection_epsilon=0 + ).fit(X) + max_clusters = n_clusters_from_labels(clusterer.labels_) + + with warnings.catch_warnings(record=True) as w: + # When we try flat clustering with 'eom' method for more n_clusters, + clusterer_flat = HDBSCAN_flat( + X, cluster_selection_method="eom", n_clusters=max_clusters + 2 + ) + # Then, a warning is raised saying 'eom' can't get this clustering, + assert len(w) > 0 + assert issubclass(w[-1].category, UserWarning) + assert "Cannot predict" in str(w[-1].message) + + # the resulting clusterer switches to using method 'leaf', + assert ( + clusterer_flat.cluster_selection_method == "leaf" + ), "cluster selection method has not switched to 'leaf'" + # and the resulting probabilities and labels must match + epsilon = clusterer_flat.cluster_selection_epsilon + clusterer_leaf = HDBSCAN( + cluster_selection_method="leaf", cluster_selection_epsilon=epsilon + ).fit(X) + assert_array_equal(clusterer_flat.labels_, clusterer_leaf.labels_) + assert_array_equal(clusterer_flat.probabilities_, clusterer_leaf.probabilities_) + return + + +def test_approx_predict_default(): + """ + Verify that approximate_predict_flat produces same results as default + """ + # Given the base HDBSCAN trained on some data, + clusterer = HDBSCAN( + cluster_selection_method="eom", + cluster_selection_epsilon=0, + prediction_data=True, + ).fit(X) + + # When using approximate_predict_flat without specifying n_clusters, + labels_flat, proba_flat = approximate_predict_flat( + clusterer, X_test, n_clusters=None + ) + + # Then, the clustering should match that due to approximate_predict, + labels_base, proba_base = approximate_predict(clusterer, X_test) + assert_array_equal(labels_flat, labels_base) + assert_array_equal(proba_flat, proba_base) + return + + +def test_approx_predict_same_clusters(): + """ + Verify that approximate_predict_flat produces as many clusters as clusterer + """ + # Given a flat clustering trained for some n_clusters, + n_clusters = 5 + clusterer = 
HDBSCAN_flat(X, cluster_selection_method="eom", n_clusters=n_clusters) + + # When using approximate_predict_flat without specifying n_clusters, + labels_flat, proba_flat = approximate_predict_flat( + clusterer, X_test, n_clusters=None + ) + + # Then, the number of clusters produced must match the original n_clusters + n_clusters_out = n_clusters_from_labels(labels_flat) + assert n_clusters_out == n_clusters + # and all probabilities are <= 1. + assert_array_less(proba_flat, np.ones(len(proba_flat)) + 1.0e-14) + return + + +def test_approx_predict_diff_clusters(): + """ + Verify that approximate_predict_flat produces as many clusters as asked + """ + # Given a flat clustering trained for some n_clusters, + n_clusters_fit = 5 + clusterer = HDBSCAN_flat( + X, + cluster_selection_method="eom", + n_clusters=n_clusters_fit, + prediction_data=True, + ) + + # When using approximate_predict_flat with specified n_clusters, + n_clusters_predict = 3 + labels_flat, proba_flat = approximate_predict_flat( + clusterer, X_test, n_clusters=n_clusters_predict + ) + + # Then, the requested number of clusters must be produced + n_clusters_out = n_clusters_from_labels(labels_flat) + assert n_clusters_out == n_clusters_predict + # and all probabilities are <= 1. + assert_array_less(proba_flat, np.ones(len(proba_flat)) + 1.0e-14) + + # When using approximate_predict_flat with more clusters + # than 'eom' can handle, + n_clusters_predict = 12 + with warnings.catch_warnings(record=True) as w: + labels_flat, proba_flat = approximate_predict_flat( + clusterer, X_test, n_clusters=n_clusters_predict + ) + # Then, a warning is raised saying 'eom' can't get this clustering, + assert len(w) > 0 + assert issubclass(w[-1].category, UserWarning) + assert "Cannot predict" in str(w[-1].message) + # But the requested number of clusters must still be produced using 'leaf' + n_clusters_out = n_clusters_from_labels(labels_flat) + assert n_clusters_out == n_clusters_predict + # and all probabilities are <= 1. + assert_array_less(proba_flat, np.ones(len(proba_flat)) + 1.0e-14) + return + + +def test_mem_vec_same_clusters(): + """ + Verify membership vector produces same n_clusters as clusterer + """ + # Given a flat clustering trained for n_clusters picked by HDBSCAN, + n_clusters_fit = None + clusterer = HDBSCAN_flat(X, n_clusters=n_clusters_fit) + + # When membership_vector_flat is called with new data, + memberships = membership_vector_flat(clusterer, X_test) + + # Then the number of clusters in memberships matches those of clusterer, + assert memberships.shape[1] == n_clusters_from_labels(clusterer.labels_) + # and the number of points should equal those in the test set + assert len(memberships) == len(X_test) + # and all probabilities are <= 1. + assert_array_less(memberships, np.ones(memberships.shape) + 1.0e-14) + + # ======================================== + # Given a flat clustering for a specified n_clusters, + n_clusters_fit = n_clusters_from_labels(clusterer.labels_) - 2 + clusterer = HDBSCAN_flat(X, n_clusters=n_clusters_fit) + + # When membership_vector_flat is called with new data, + memberships = membership_vector_flat(clusterer, X_test) + + # Then the number of clusters in memberships matches those of clusterer, + assert memberships.shape[1] == n_clusters_fit + # and the number of points should equal those in the test set + assert len(memberships) == len(X_test) + # and all probabilities are <= 1. 
+ assert_array_less(memberships, np.ones(memberships.shape) + 1.0e-14) + return + + +def test_mem_vec_diff_clusters(): + """ + Verify membership vector produces as many clusters as requested + """ + # Ignore user warnings in this function + warnings.filterwarnings("ignore", category=UserWarning) + + # Given a flat clustering trained for n_clusters picked by HDBSCAN, + n_clusters_fit = None + clusterer = HDBSCAN_flat(X, n_clusters=n_clusters_fit) + n_clusters_fitted = n_clusters_from_labels(clusterer.labels_) + + # When membership_vector_flat is called with new data for some n_clusters, + n_clusters_predict = n_clusters_fitted + 3 + memberships = membership_vector_flat( + clusterer, X_test, n_clusters=n_clusters_predict + ) + + # Then the number of clusters in memberships should be as requested, + assert memberships.shape[1] == n_clusters_predict + # and the number of points should equal those in the test set + assert len(memberships) == len(X_test) + # and all probabilities are <= 1. + assert_array_less(memberships, np.ones(memberships.shape) + 1.0e-14) + + # ======================================== + # Given a flat clustering for a specified n_clusters, + n_clusters_fit = n_clusters_from_labels(clusterer.labels_) + 2 + clusterer = HDBSCAN_flat(X, n_clusters=n_clusters_fit) + + # When membership_vector_flat is called with new data for some n_clusters, + n_clusters_predict = n_clusters_fit + 3 + memberships = membership_vector_flat( + clusterer, X_test, n_clusters=n_clusters_predict + ) + + # Then the number of clusters in memberships should be as requested, + assert memberships.shape[1] == n_clusters_predict + # and the number of points should equal those in the test set + assert len(memberships) == len(X_test) + # and all probabilities are <= 1. + assert_array_less(memberships, np.ones(memberships.shape) + 1.0e-14) + return + + +def test_all_points_mem_vec_same_clusters(): + """ + Verify membership vector for training set produces same n_clusters + as clusterer + """ + # Given a flat clustering trained for n_clusters picked by HDBSCAN, + n_clusters_fit = None + clusterer = HDBSCAN_flat(X, n_clusters=n_clusters_fit) + + # When all_points_membership_vectors_flat is called, + memberships = all_points_membership_vectors_flat(clusterer) + + # Then the number of clusters in memberships matches those of clusterer, + assert memberships.shape[1] == n_clusters_from_labels(clusterer.labels_) + # and the number of points should equal those in the training set + assert len(memberships) == len(X) + # and all probabilities are <= 1. + assert_array_less(memberships, np.ones(memberships.shape) + 1.0e-14) + + # ======================================== + # Given a flat clustering for a specified n_clusters, + n_clusters_fit = n_clusters_from_labels(clusterer.labels_) - 2 + clusterer = HDBSCAN_flat(X, n_clusters=n_clusters_fit) + + # When all_points_membership_vectors_flat is called, + memberships = all_points_membership_vectors_flat(clusterer) + + # Then the number of clusters in memberships matches those of clusterer, + assert memberships.shape[1] == n_clusters_from_labels(clusterer.labels_) + # and the number of points should equal those in the training set + assert len(memberships) == len(X) + # and all probabilities are <= 1. 
+ assert_array_less(memberships, np.ones(memberships.shape) + 1.0e-14) + return + + +def test_all_points_mem_vec_diff_clusters(): + """ + Verify membership vector produces as many clusters as requested + """ + # Ignore user warnings in this function + warnings.filterwarnings("ignore", category=UserWarning) + + # Given a flat clustering trained for n_clusters picked by HDBSCAN, + n_clusters_fit = None + clusterer = HDBSCAN_flat(X, n_clusters=n_clusters_fit) + n_clusters_fitted = n_clusters_from_labels(clusterer.labels_) + + # When all_points_membership_vectors_flat is called for some n_clusters, + n_clusters_predict = n_clusters_fitted + 3 + memberships = all_points_membership_vectors_flat( + clusterer, n_clusters=n_clusters_predict + ) + + # Then the number of clusters in memberships should be as requested, + assert memberships.shape[1] == n_clusters_predict + # and the number of points should equal those in the training set + assert len(memberships) == len(X) + # and all probabilities are <= 1. + assert_array_less(memberships, np.ones(memberships.shape) + 1.0e-14) + + # ======================================== + # Given a flat clustering for a specified n_clusters, + n_clusters_fit = n_clusters_from_labels(clusterer.labels_) + 2 + clusterer = HDBSCAN_flat(X, n_clusters=n_clusters_fit) + + # When membership_vector_flat is called for some n_clusters, + n_clusters_predict = n_clusters_fitted + 3 + memberships = all_points_membership_vectors_flat( + clusterer, n_clusters=n_clusters_predict + ) + + # Then the number of clusters in memberships should be as requested, + assert memberships.shape[1] == n_clusters_predict + # and the number of points should equal those in the training set + assert len(memberships) == len(X) + # and all probabilities are <= 1. + assert_array_less(memberships, np.ones(memberships.shape) + 1.0e-14) + return diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py new file mode 100644 index 0000000000000..ce54c86ca8192 --- /dev/null +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -0,0 +1,655 @@ +""" +Tests for HDBSCAN clustering algorithm +Shamelessly based on (i.e. 
ripped off from) the DBSCAN test code
+"""
+import numpy as np
+from scipy.spatial import distance
+from scipy import sparse
+from scipy import stats
+from sklearn.utils.estimator_checks import check_estimator
+from sklearn.utils._testing import (
+    assert_array_equal,
+    assert_array_almost_equal,
+    assert_raises,
+)
+from sklearn.cluster import (
+    HDBSCAN,
+    hdbscan,
+    validity_index,
+    approximate_predict,
+    approximate_predict_scores,
+    all_points_membership_vectors,
+)
+
+# from sklearn.cluster.tests.common import generate_clustered_data
+from sklearn.datasets import make_blobs
+from sklearn.utils import shuffle
+from sklearn.preprocessing import StandardScaler
+from scipy.stats import mode
+
+from tempfile import mkdtemp
+from functools import wraps
+import pytest
+
+from sklearn import datasets
+
+import warnings
+
+n_clusters = 3
+# X = generate_clustered_data(n_clusters=n_clusters, n_samples_per_cluster=50)
+X, y = make_blobs(n_samples=200, random_state=10)
+X, y = shuffle(X, y, random_state=7)
+X = StandardScaler().fit_transform(X)
+
+X_missing_data = X.copy()
+X_missing_data[0] = [np.nan, 1]
+X_missing_data[5] = [np.nan, np.nan]
+
+
+def test_missing_data():
+    """Tests that nan data are treated as infinitely distant from all other points and assigned to the -1 cluster"""
+    model = HDBSCAN().fit(X_missing_data)
+    assert model.labels_[0] == -1
+    assert model.labels_[5] == -1
+    assert model.probabilities_[0] == 0
+    assert model.probabilities_[5] == 0
+    clean_indices = list(range(1, 5)) + list(range(6, 200))
+    clean_model = HDBSCAN().fit(X_missing_data[clean_indices])
+    assert np.allclose(clean_model.labels_, model.labels_[clean_indices])
+
+
+def if_matplotlib(func):
+    """Test decorator that skips test if matplotlib not installed.
+ + Parameters + ---------- + func + """ + + @wraps(func) + def run_test(*args, **kwargs): + try: + import matplotlib + + matplotlib.use("Agg") + # this fails if no $DISPLAY specified + import matplotlib.pyplot as plt + + plt.figure() + except ImportError: + pytest.skip("Matplotlib not available.") + else: + return func(*args, **kwargs) + + return run_test + + +def if_pandas(func): + """Test decorator that skips test if pandas not installed.""" + + @wraps(func) + def run_test(*args, **kwargs): + try: + import pandas + except ImportError: + pytest.skip("Pandas not available.") + else: + return func(*args, **kwargs) + + return run_test + + +def if_networkx(func): + """Test decorator that skips test if networkx not installed.""" + + @wraps(func) + def run_test(*args, **kwargs): + try: + import networkx + except ImportError: + pytest.skip("NetworkX not available.") + else: + return func(*args, **kwargs) + + return run_test + + +def generate_noisy_data(): + blobs, _ = datasets.make_blobs( + n_samples=200, centers=[(-0.75, 2.25), (1.0, 2.0)], cluster_std=0.25 + ) + moons, _ = datasets.make_moons(n_samples=200, noise=0.05) + noise = np.random.uniform(-1.0, 3.0, (50, 2)) + return np.vstack([blobs, moons, noise]) + + +def homogeneity(labels1, labels2): + num_missed = 0.0 + for label in set(labels1): + matches = labels2[labels1 == label] + match_mode = mode(matches)[0][0] + num_missed += np.sum(matches != match_mode) + + for label in set(labels2): + matches = labels1[labels2 == label] + match_mode = mode(matches)[0][0] + num_missed += np.sum(matches != match_mode) + + return num_missed / 2.0 + + +def test_hdbscan_distance_matrix(): + D = distance.squareform(distance.pdist(X)) + D /= np.max(D) + + labels, p, persist, ctree, ltree, mtree = hdbscan(D, metric="precomputed") + # number of clusters, ignoring noise if present + n_clusters_1 = len(set(labels)) - int(-1 in labels) # ignore noise + assert n_clusters_1 == n_clusters + + labels = HDBSCAN(metric="precomputed").fit(D).labels_ + n_clusters_2 = len(set(labels)) - int(-1 in labels) + assert n_clusters_2 == n_clusters + + validity = validity_index(D, labels, metric="precomputed", d=2) + assert validity >= 0.6 + + +def test_hdbscan_sparse_distance_matrix(): + D = distance.squareform(distance.pdist(X)) + D /= np.max(D) + + threshold = stats.scoreatpercentile(D.flatten(), 50) + + D[D >= threshold] = 0.0 + D = sparse.csr_matrix(D) + D.eliminate_zeros() + + labels, p, persist, ctree, ltree, mtree = hdbscan(D, metric="precomputed") + # number of clusters, ignoring noise if present + n_clusters_1 = len(set(labels)) - int(-1 in labels) # ignore noise + assert n_clusters_1 == n_clusters + + labels = HDBSCAN(metric="precomputed", gen_min_span_tree=True).fit(D).labels_ + n_clusters_2 = len(set(labels)) - int(-1 in labels) + assert n_clusters_2 == n_clusters + + +def test_hdbscan_feature_vector(): + labels, p, persist, ctree, ltree, mtree = hdbscan(X) + n_clusters_1 = len(set(labels)) - int(-1 in labels) + assert n_clusters_1 == n_clusters + + labels = HDBSCAN().fit(X).labels_ + n_clusters_2 = len(set(labels)) - int(-1 in labels) + assert n_clusters_2 == n_clusters + + validity = validity_index(X, labels) + assert validity >= 0.4 + + +def test_hdbscan_prims_kdtree(): + labels, p, persist, ctree, ltree, mtree = hdbscan(X, algorithm="prims_kdtree") + n_clusters_1 = len(set(labels)) - int(-1 in labels) + assert n_clusters_1 == n_clusters + + labels = HDBSCAN(algorithm="prims_kdtree", gen_min_span_tree=True).fit(X).labels_ + n_clusters_2 = len(set(labels)) - int(-1 
in labels) + assert n_clusters_2 == n_clusters + + assert_raises(ValueError, hdbscan, X, algorithm="prims_kdtree", metric="russelrao") + + +def test_hdbscan_prims_balltree(): + labels, p, persist, ctree, ltree, mtree = hdbscan(X, algorithm="prims_balltree") + n_clusters_1 = len(set(labels)) - int(-1 in labels) + assert n_clusters_1 == n_clusters + + labels = HDBSCAN(algorithm="prims_balltree", gen_min_span_tree=True).fit(X).labels_ + n_clusters_2 = len(set(labels)) - int(-1 in labels) + assert n_clusters_2 == n_clusters + + assert_raises(ValueError, hdbscan, X, algorithm="prims_balltree", metric="cosine") + + +def test_hdbscan_boruvka_kdtree(): + labels, p, persist, ctree, ltree, mtree = hdbscan(X, algorithm="boruvka_kdtree") + n_clusters_1 = len(set(labels)) - int(-1 in labels) + assert n_clusters_1 == n_clusters + + labels = HDBSCAN(algorithm="boruvka_kdtree", gen_min_span_tree=True).fit(X).labels_ + n_clusters_2 = len(set(labels)) - int(-1 in labels) + assert n_clusters_2 == n_clusters + + assert_raises( + ValueError, hdbscan, X, algorithm="boruvka_kdtree", metric="russelrao" + ) + + +def test_hdbscan_boruvka_balltree(): + labels, p, persist, ctree, ltree, mtree = hdbscan(X, algorithm="boruvka_balltree") + n_clusters_1 = len(set(labels)) - int(-1 in labels) + assert n_clusters_1 == n_clusters + + labels = ( + HDBSCAN(algorithm="boruvka_balltree", gen_min_span_tree=True).fit(X).labels_ + ) + n_clusters_2 = len(set(labels)) - int(-1 in labels) + assert n_clusters_2 == n_clusters + + assert_raises(ValueError, hdbscan, X, algorithm="boruvka_balltree", metric="cosine") + + +def test_hdbscan_generic(): + labels, p, persist, ctree, ltree, mtree = hdbscan(X, algorithm="generic") + n_clusters_1 = len(set(labels)) - int(-1 in labels) + assert n_clusters_1 == n_clusters + + labels = HDBSCAN(algorithm="generic", gen_min_span_tree=True).fit(X).labels_ + n_clusters_2 = len(set(labels)) - int(-1 in labels) + assert n_clusters_2 == n_clusters + + +def test_hdbscan_dbscan_clustering(): + clusterer = HDBSCAN().fit(X) + labels = clusterer.dbscan_clustering(0.3) + n_clusters_1 = len(set(labels)) - int(-1 in labels) + assert n_clusters == n_clusters_1 + + +def test_hdbscan_high_dimensional(): + H, y = make_blobs(n_samples=50, random_state=0, n_features=64) + # H, y = shuffle(X, y, random_state=7) + H = StandardScaler().fit_transform(H) + labels, p, persist, ctree, ltree, mtree = hdbscan(H) + n_clusters_1 = len(set(labels)) - int(-1 in labels) + assert n_clusters_1 == n_clusters + + labels = ( + HDBSCAN(algorithm="best", metric="seuclidean", V=np.ones(H.shape[1])) + .fit(H) + .labels_ + ) + n_clusters_2 = len(set(labels)) - int(-1 in labels) + assert n_clusters_2 == n_clusters + + +def test_hdbscan_best_balltree_metric(): + labels, p, persist, ctree, ltree, mtree = hdbscan( + X, metric="seuclidean", V=np.ones(X.shape[1]) + ) + n_clusters_1 = len(set(labels)) - int(-1 in labels) + assert n_clusters_1 == n_clusters + + labels = HDBSCAN(metric="seuclidean", V=np.ones(X.shape[1])).fit(X).labels_ + n_clusters_2 = len(set(labels)) - int(-1 in labels) + assert n_clusters_2 == n_clusters + + +def test_hdbscan_no_clusters(): + labels, p, persist, ctree, ltree, mtree = hdbscan(X, min_cluster_size=len(X) + 1) + n_clusters_1 = len(set(labels)) - int(-1 in labels) + assert n_clusters_1 == 0 + + labels = HDBSCAN(min_cluster_size=len(X) + 1).fit(X).labels_ + n_clusters_2 = len(set(labels)) - int(-1 in labels) + assert n_clusters_2 == 0 + + +def test_hdbscan_min_cluster_size(): + for min_cluster_size in range(2, len(X) + 1, 
1): + labels, p, persist, ctree, ltree, mtree = hdbscan( + X, min_cluster_size=min_cluster_size + ) + true_labels = [label for label in labels if label != -1] + if len(true_labels) != 0: + assert np.min(np.bincount(true_labels)) >= min_cluster_size + + labels = HDBSCAN(min_cluster_size=min_cluster_size).fit(X).labels_ + true_labels = [label for label in labels if label != -1] + if len(true_labels) != 0: + assert np.min(np.bincount(true_labels)) >= min_cluster_size + + +def test_hdbscan_callable_metric(): + # metric is the function reference, not the string key. + metric = distance.euclidean + + labels, p, persist, ctree, ltree, mtree = hdbscan(X, metric=metric) + n_clusters_1 = len(set(labels)) - int(-1 in labels) + assert n_clusters_1 == n_clusters + + labels = HDBSCAN(metric=metric).fit(X).labels_ + n_clusters_2 = len(set(labels)) - int(-1 in labels) + assert n_clusters_2 == n_clusters + + +def test_hdbscan_input_lists(): + X = [[1.0, 2.0], [3.0, 4.0]] + HDBSCAN().fit(X) # must not raise exception + + +def test_hdbscan_boruvka_kdtree_matches(): + + data = generate_noisy_data() + + labels_prims, p, persist, ctree, ltree, mtree = hdbscan(data, algorithm="generic") + labels_boruvka, p, persist, ctree, ltree, mtree = hdbscan( + data, algorithm="boruvka_kdtree" + ) + + num_mismatches = homogeneity(labels_prims, labels_boruvka) + + assert (num_mismatches / float(data.shape[0])) < 0.15 + + labels_prims = HDBSCAN(algorithm="generic").fit_predict(data) + labels_boruvka = HDBSCAN(algorithm="boruvka_kdtree").fit_predict(data) + + num_mismatches = homogeneity(labels_prims, labels_boruvka) + + assert (num_mismatches / float(data.shape[0])) < 0.15 + + +def test_hdbscan_boruvka_balltree_matches(): + + data = generate_noisy_data() + + labels_prims, p, persist, ctree, ltree, mtree = hdbscan(data, algorithm="generic") + labels_boruvka, p, persist, ctree, ltree, mtree = hdbscan( + data, algorithm="boruvka_balltree" + ) + + num_mismatches = homogeneity(labels_prims, labels_boruvka) + + assert (num_mismatches / float(data.shape[0])) < 0.15 + + labels_prims = HDBSCAN(algorithm="generic").fit_predict(data) + labels_boruvka = HDBSCAN(algorithm="boruvka_balltree").fit_predict(data) + + num_mismatches = homogeneity(labels_prims, labels_boruvka) + + assert (num_mismatches / float(data.shape[0])) < 0.15 + + +def test_condensed_tree_plot(): + clusterer = HDBSCAN(gen_min_span_tree=True).fit(X) + if_matplotlib(clusterer.condensed_tree_.plot)( + select_clusters=True, + label_clusters=True, + selection_palette=("r", "g", "b"), + cmap="Reds", + ) + if_matplotlib(clusterer.condensed_tree_.plot)( + log_size=True, colorbar=False, cmap="none" + ) + + +def test_single_linkage_tree_plot(): + clusterer = HDBSCAN(gen_min_span_tree=True).fit(X) + if_matplotlib(clusterer.single_linkage_tree_.plot)(cmap="Reds") + if_matplotlib(clusterer.single_linkage_tree_.plot)( + vary_line_width=False, truncate_mode="lastp", p=10, cmap="none", colorbar=False + ) + + +def test_min_span_tree_plot(): + clusterer = HDBSCAN(gen_min_span_tree=True).fit(X) + if_matplotlib(clusterer.minimum_spanning_tree_.plot)(edge_cmap="Reds") + + H, y = make_blobs(n_samples=50, random_state=0, n_features=10) + H = StandardScaler().fit_transform(H) + + clusterer = HDBSCAN(gen_min_span_tree=True).fit(H) + if_matplotlib(clusterer.minimum_spanning_tree_.plot)( + edge_cmap="Reds", vary_line_width=False, colorbar=False + ) + + H, y = make_blobs(n_samples=50, random_state=0, n_features=40) + H = StandardScaler().fit_transform(H) + + clusterer = 
HDBSCAN(gen_min_span_tree=True).fit(H) + if_matplotlib(clusterer.minimum_spanning_tree_.plot)( + edge_cmap="Reds", vary_line_width=False, colorbar=False + ) + + +def test_tree_numpy_output_formats(): + + clusterer = HDBSCAN(gen_min_span_tree=True).fit(X) + + clusterer.single_linkage_tree_.to_numpy() + clusterer.condensed_tree_.to_numpy() + clusterer.minimum_spanning_tree_.to_numpy() + + +def test_tree_pandas_output_formats(): + + clusterer = HDBSCAN(gen_min_span_tree=True).fit(X) + if_pandas(clusterer.condensed_tree_.to_pandas)() + if_pandas(clusterer.single_linkage_tree_.to_pandas)() + if_pandas(clusterer.minimum_spanning_tree_.to_pandas)() + + +def test_tree_networkx_output_formats(): + + clusterer = HDBSCAN(gen_min_span_tree=True).fit(X) + if_networkx(clusterer.condensed_tree_.to_networkx)() + if_networkx(clusterer.single_linkage_tree_.to_networkx)() + if_networkx(clusterer.minimum_spanning_tree_.to_networkx)() + + +def test_hdbscan_outliers(): + clusterer = HDBSCAN(gen_min_span_tree=True).fit(X) + scores = clusterer.outlier_scores_ + assert scores is not None + + +# def test_hdbscan_unavailable_attributes(): +# clusterer = HDBSCAN(gen_min_span_tree=False) +# with warnings.catch_warnings(record=True) as w: +# tree = clusterer.condensed_tree_ +# assert len(w) > 0 +# assert tree is None +# with warnings.catch_warnings(record=True) as w: +# tree = clusterer.single_linkage_tree_ +# assert len(w) > 0 +# assert tree is None +# with warnings.catch_warnings(record=True) as w: +# scores = clusterer.outlier_scores_ +# assert len(w) > 0 +# assert scores is None +# with warnings.catch_warnings(record=True) as w: +# tree = clusterer.minimum_spanning_tree_ +# assert len(w) > 0 +# assert tree is None + + +# def test_hdbscan_min_span_tree_availability(): +# clusterer = HDBSCAN().fit(X) +# tree = clusterer.minimum_spanning_tree_ +# assert tree is None +# D = distance.squareform(distance.pdist(X)) +# D /= np.max(D) +# HDBSCAN(metric='precomputed').fit(D) +# tree = clusterer.minimum_spanning_tree_ +# assert tree is None + + +def test_hdbscan_approximate_predict(): + clusterer = HDBSCAN(prediction_data=True).fit(X) + cluster, prob = approximate_predict(clusterer, np.array([[-1.5, -1.0]])) + assert cluster == 2 + cluster, prob = approximate_predict(clusterer, np.array([[1.5, -1.0]])) + assert cluster == 1 + cluster, prob = approximate_predict(clusterer, np.array([[0.0, 0.0]])) + assert cluster == -1 + + +def test_hdbscan_approximate_predict_score(): + clusterer = HDBSCAN(min_cluster_size=200).fit(X) + # no prediction data error + assert_raises(ValueError, approximate_predict_scores, clusterer, X) + clusterer.generate_prediction_data() + # wrong dimensions error + assert_raises( + ValueError, approximate_predict_scores, clusterer, np.array([[1, 2, 3]]) + ) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + approximate_predict_scores(clusterer, np.array([[1.5, -1.0]])) + # no clusters warning + assert "Clusterer does not have any defined clusters" in str(w[-1].message) + clusterer = HDBSCAN(prediction_data=True).fit(X) + scores = approximate_predict_scores(clusterer, X) + assert_array_almost_equal(scores, clusterer.outlier_scores_) + assert scores.min() >= 0 + assert scores.max() <= 1 + + +# def test_hdbscan_membership_vector(): +# clusterer = HDBSCAN(prediction_data=True).fit(X) +# vector = membership_vector(clusterer, np.array([[-1.5, -1.0]])) +# assert_array_almost_equal( +# vector, +# np.array([[ 0.05705305, 0.05974177, 0.12228153]])) +# vector = 
membership_vector(clusterer, np.array([[1.5, -1.0]])) +# assert_array_almost_equal( +# vector, +# np.array([[ 0.09462176, 0.32061556, 0.10112905]])) +# vector = membership_vector(clusterer, np.array([[0.0, 0.0]])) +# assert_array_almost_equal( +# vector, +# np.array([[ 0.03545607, 0.03363318, 0.04643177]])) +# +# def test_hdbscan_all_points_membership_vectors(): +# clusterer = HDBSCAN(prediction_data=True).fit(X) +# vects = all_points_membership_vectors(clusterer) +# assert_array_almost_equal(vects[0], np.array([7.86400992e-002, +# 2.52734246e-001, +# 8.38299608e-002])) +# assert_array_almost_equal(vects[-1], np.array([8.09055344e-001, +# 8.35882503e-002, +# 1.07356406e-001])) + + +def test_hdbscan_all_points_membership_vectors(): + clusterer = HDBSCAN(prediction_data=True, min_cluster_size=200).fit(X) + vects = all_points_membership_vectors(clusterer) + assert_array_equal(vects, np.zeros(clusterer.prediction_data_.raw_data.shape[0])) + + +def test_hdbscan_badargs(): + assert_raises(ValueError, hdbscan, X="fail") + assert_raises(ValueError, hdbscan, X=None) + assert_raises(ValueError, hdbscan, X, min_cluster_size="fail") + assert_raises(ValueError, hdbscan, X, min_samples="fail") + assert_raises(ValueError, hdbscan, X, min_samples=-1) + assert_raises(ValueError, hdbscan, X, metric="imperial") + assert_raises(ValueError, hdbscan, X, metric=None) + assert_raises(ValueError, hdbscan, X, metric="minkowski", p=-1) + assert_raises( + ValueError, hdbscan, X, metric="minkowski", p=-1, algorithm="prims_kdtree" + ) + assert_raises( + ValueError, hdbscan, X, metric="minkowski", p=-1, algorithm="prims_balltree" + ) + assert_raises( + ValueError, hdbscan, X, metric="minkowski", p=-1, algorithm="boruvka_balltree" + ) + assert_raises( + ValueError, hdbscan, X, metric="precomputed", algorithm="boruvka_kdtree" + ) + assert_raises( + ValueError, hdbscan, X, metric="precomputed", algorithm="prims_kdtree" + ) + assert_raises( + ValueError, hdbscan, X, metric="precomputed", algorithm="prims_balltree" + ) + assert_raises( + ValueError, hdbscan, X, metric="precomputed", algorithm="boruvka_balltree" + ) + assert_raises(ValueError, hdbscan, X, alpha=-1) + assert_raises(ValueError, hdbscan, X, alpha="fail") + assert_raises(Exception, hdbscan, X, algorithm="something_else") + assert_raises(TypeError, hdbscan, X, metric="minkowski", p=None) + assert_raises(ValueError, hdbscan, X, leaf_size=0) + + +def test_hdbscan_sparse(): + + sparse_X = sparse.csr_matrix(X) + + labels = HDBSCAN().fit(sparse_X).labels_ + n_clusters = len(set(labels)) - int(-1 in labels) + assert n_clusters == 3 + + +def test_hdbscan_caching(): + + cachedir = mkdtemp() + labels1 = HDBSCAN(memory=cachedir, min_samples=5).fit(X).labels_ + labels2 = HDBSCAN(memory=cachedir, min_samples=5, min_cluster_size=6).fit(X).labels_ + n_clusters1 = len(set(labels1)) - int(-1 in labels1) + n_clusters2 = len(set(labels2)) - int(-1 in labels2) + assert n_clusters1 == n_clusters2 + + +def test_hdbscan_centroids_medoids(): + centers = [(0.0, 0.0), (3.0, 3.0)] + H, y = make_blobs(n_samples=1000, random_state=0, centers=centers, cluster_std=0.5) + clusterer = HDBSCAN().fit(H) + + for idx, center in enumerate(centers): + centroid = clusterer.weighted_cluster_centroid(idx) + assert_array_almost_equal(centroid, center, decimal=1) + + medoid = clusterer.weighted_cluster_medoid(idx) + assert_array_almost_equal(medoid, center, decimal=1) + + +def test_hdbscan_no_centroid_medoid_for_noise(): + clusterer = HDBSCAN().fit(X) + assert_raises(ValueError, 
clusterer.weighted_cluster_centroid, -1)
+    assert_raises(ValueError, clusterer.weighted_cluster_medoid, -1)
+
+
+def test_hdbscan_allow_single_cluster_with_epsilon():
+    np.random.seed(0)
+    no_structure = np.random.rand(150, 2)
+    # without epsilon we should see many noise points as children of root.
+    labels = HDBSCAN(
+        min_cluster_size=5,
+        cluster_selection_epsilon=0.0,
+        cluster_selection_method="eom",
+        allow_single_cluster=True,
+    ).fit_predict(no_structure)
+    unique_labels, counts = np.unique(labels, return_counts=True)
+    assert len(unique_labels) == 2
+    assert counts[unique_labels == -1] == 46
+
+    # for this random seed an epsilon of 0.2 will produce exactly 2 noise
+    # points at that cut in single linkage.
+    labels = HDBSCAN(
+        min_cluster_size=5,
+        cluster_selection_epsilon=0.2,
+        cluster_selection_method="eom",
+        allow_single_cluster=True,
+    ).fit_predict(no_structure)
+    unique_labels, counts = np.unique(labels, return_counts=True)
+    assert len(unique_labels) == 2
+    assert counts[unique_labels == -1] == 2
+
+
+# Disable for now -- need to refactor to meet newer standards
+@pytest.mark.skip(reason="need to refactor to meet newer standards")
+def test_hdbscan_is_sklearn_estimator():
+    check_estimator(HDBSCAN)
+
+
+# Probably not applicable now #
+# def test_dbscan_sparse():
+# def test_dbscan_balltree():
+# def test_pickle():
+# def test_dbscan_core_samples_toy():
+# def test_boundaries():
diff --git a/sklearn/cluster/_hdbscan/tests/test_prediction_utils.py b/sklearn/cluster/_hdbscan/tests/test_prediction_utils.py
new file mode 100644
index 0000000000000..a6eba19d99d11
--- /dev/null
+++ b/sklearn/cluster/_hdbscan/tests/test_prediction_utils.py
@@ -0,0 +1,12 @@
+import pytest
+
+from sklearn.cluster._hdbscan._prediction_utils import safe_always_positive_division
+
+
+@pytest.mark.parametrize("denominator", [-1, 0, 1])
+def test_safe_always_positive_division(denominator):
+    numerator = 1
+    # Given negative, zero and positive denominators and a positive numerator
+    value = safe_always_positive_division(numerator, denominator)
+    # Make sure safe division is always positive and doesn't raise ZeroDivisionError
+    assert value >= 0
diff --git a/sklearn/cluster/_hdbscan/tests/test_rsl.py b/sklearn/cluster/_hdbscan/tests/test_rsl.py
new file mode 100644
index 0000000000000..a87ef8490afe9
--- /dev/null
+++ b/sklearn/cluster/_hdbscan/tests/test_rsl.py
@@ -0,0 +1,209 @@
+"""
+Tests for Robust Single Linkage clustering algorithm
+"""
+# import pickle
+import numpy as np
+from scipy.spatial import distance
+from sklearn.utils.estimator_checks import check_estimator
+from sklearn.utils._testing import assert_raises
+from sklearn.cluster import RobustSingleLinkage, robust_single_linkage
+
+from sklearn.datasets import make_blobs
+from sklearn.utils import shuffle
+from sklearn.preprocessing import StandardScaler
+
+import pytest
+
+n_clusters = 3
+X, y = make_blobs(n_samples=50, random_state=1)
+X, y = shuffle(X, y, random_state=7)
+X = StandardScaler().fit_transform(X)
+# X = generate_clustered_data(n_clusters=n_clusters, n_samples_per_cluster=50)
+
+
+def test_rsl_distance_matrix():
+    D = distance.squareform(distance.pdist(X))
+    D /= np.max(D)
+
+    labels, tree = robust_single_linkage(D, 0.4, metric="precomputed")
+    # number of clusters, ignoring noise if present
+    n_clusters_1 = len(set(labels)) - int(-1 in labels)  # ignore noise
+    assert n_clusters_1 == 2
+
+    labels = RobustSingleLinkage(metric="precomputed").fit(D).labels_
+    n_clusters_2 = len(set(labels)) - int(-1 in labels)
+    assert n_clusters_2 == 2
+
+
+def test_rsl_feature_vector(): + labels, tree = robust_single_linkage(X, 0.4) + n_clusters_1 = len(set(labels)) - int(-1 in labels) + assert n_clusters_1 == n_clusters + + labels = RobustSingleLinkage().fit(X).labels_ + n_clusters_2 = len(set(labels)) - int(-1 in labels) + assert n_clusters_2 == n_clusters + + +def test_rsl_callable_metric(): + # metric is the function reference, not the string key. + metric = distance.euclidean + + labels, tree = robust_single_linkage(X, 0.4, metric=metric) + n_clusters_1 = len(set(labels)) - int(-1 in labels) + assert n_clusters_1 == n_clusters + + labels = RobustSingleLinkage(metric=metric).fit(X).labels_ + n_clusters_2 = len(set(labels)) - int(-1 in labels) + assert n_clusters_2 == n_clusters + + +def test_rsl_input_lists(): + X = [[1.0, 2.0], [3.0, 4.0]] + RobustSingleLinkage().fit(X) # must not raise exception + + +def test_rsl_boruvka_balltree(): + labels, tree = robust_single_linkage(X, 0.45, algorithm="boruvka_balltree") + n_clusters_1 = len(set(labels)) - int(-1 in labels) + assert n_clusters_1 == n_clusters + + labels = RobustSingleLinkage(cut=0.45, algorithm="boruvka_balltree").fit(X).labels_ + n_clusters_2 = len(set(labels)) - int(-1 in labels) + assert n_clusters_2 == n_clusters + + +def test_rsl_prims_balltree(): + labels, tree = robust_single_linkage(X, 0.4, algorithm="prims_balltree") + n_clusters_1 = len(set(labels)) - int(-1 in labels) + assert n_clusters_1 == n_clusters + + labels = RobustSingleLinkage(algorithm="prims_balltree").fit(X).labels_ + n_clusters_2 = len(set(labels)) - int(-1 in labels) + assert n_clusters_2 == n_clusters + + +def test_rsl_prims_kdtree(): + labels, tree = robust_single_linkage(X, 0.4, algorithm="prims_kdtree") + n_clusters_1 = len(set(labels)) - int(-1 in labels) + assert n_clusters_1 == n_clusters + + labels = RobustSingleLinkage(algorithm="prims_kdtree").fit(X).labels_ + n_clusters_2 = len(set(labels)) - int(-1 in labels) + assert n_clusters_2 == n_clusters + + +# def test_rsl_unavailable_hierarchy(): +# clusterer = RobustSingleLinkage() +# with warnings.catch_warnings(record=True) as w: +# tree = clusterer.cluster_hierarchy_ +# assert len(w) > 0 +# assert tree is None + + +def test_rsl_hierarchy(): + clusterer = RobustSingleLinkage().fit(X) + assert clusterer.cluster_hierarchy_ is not None + + +def test_rsl_high_dimensional(): + H, y = make_blobs(n_samples=50, random_state=0, n_features=64) + # H, y = shuffle(X, y, random_state=7) + H = StandardScaler().fit_transform(H) + labels, tree = robust_single_linkage(H, 5.5) + n_clusters_1 = len(set(labels)) - int(-1 in labels) + assert n_clusters_1 == n_clusters + + labels = ( + RobustSingleLinkage( + cut=5.5, + algorithm="best", + metric="seuclidean", + metric_params={"V": np.ones(H.shape[1])}, + ) + .fit(H) + .labels_ + ) + n_clusters_2 = len(set(labels)) - int(-1 in labels) + assert n_clusters_2 == n_clusters + + +def test_rsl_badargs(): + assert_raises(ValueError, robust_single_linkage, "fail", 0.4) + assert_raises(ValueError, robust_single_linkage, None, 0.4) + assert_raises(ValueError, robust_single_linkage, X, 0.4, k="fail") + assert_raises(ValueError, robust_single_linkage, X, 0.4, k=-1) + assert_raises(ValueError, robust_single_linkage, X, 0.4, metric="imperial") + assert_raises(ValueError, robust_single_linkage, X, 0.4, metric=None) + assert_raises(ValueError, robust_single_linkage, X, 0.4, metric="minkowski", p=-1) + assert_raises( + ValueError, + robust_single_linkage, + X, + 0.4, + metric="minkowski", + p=-1, + algorithm="prims_kdtree", + ) + 
assert_raises( + ValueError, + robust_single_linkage, + X, + 0.4, + metric="minkowski", + p=-1, + algorithm="prims_balltree", + ) + assert_raises( + ValueError, + robust_single_linkage, + X, + 0.4, + metric="minkowski", + p=-1, + algorithm="boruvka_balltree", + ) + assert_raises( + ValueError, + robust_single_linkage, + X, + 0.4, + metric="precomputed", + algorithm="boruvka_kdtree", + ) + assert_raises( + ValueError, + robust_single_linkage, + X, + 0.4, + metric="precomputed", + algorithm="prims_kdtree", + ) + assert_raises( + ValueError, + robust_single_linkage, + X, + 0.4, + metric="precomputed", + algorithm="prims_balltree", + ) + assert_raises( + ValueError, + robust_single_linkage, + X, + 0.4, + metric="precomputed", + algorithm="boruvka_balltree", + ) + assert_raises(ValueError, robust_single_linkage, X, 0.4, alpha=-1) + assert_raises(ValueError, robust_single_linkage, X, 0.4, alpha="fail") + assert_raises(Exception, robust_single_linkage, X, 0.4, algorithm="something_else") + assert_raises(TypeError, robust_single_linkage, X, 0.4, metric="minkowski", p=None) + assert_raises(ValueError, robust_single_linkage, X, 0.4, leaf_size=0) + assert_raises(ValueError, robust_single_linkage, X, 0.4, gamma=0) + + +# Disable for now -- need to refactor to meet newer standards +@pytest.mark.skip(reason="need to refactor to meet newer standards") +def test_rsl_is_sklearn_estimator(): + check_estimator(RobustSingleLinkage) diff --git a/sklearn/cluster/_hdbscan/validity.py b/sklearn/cluster/_hdbscan/validity.py new file mode 100644 index 0000000000000..c29103cff3734 --- /dev/null +++ b/sklearn/cluster/_hdbscan/validity.py @@ -0,0 +1,400 @@ +import numpy as np +from sklearn.metrics import pairwise_distances +from scipy.spatial.distance import cdist +from ._hdbscan_linkage import mst_linkage_core +from .hdbscan_ import isclose + + +def all_points_core_distance(distance_matrix, d=2.0): + """ + Compute the all-points-core-distance for all the points of a cluster. + + Parameters + ---------- + distance_matrix : array (cluster_size, cluster_size) + The pairwise distance matrix between points in the cluster. + + d : integer + The dimension of the data set, which is used in the computation + of the all-point-core-distance as per the paper. + + Returns + ------- + core_distances : array (cluster_size,) + The all-points-core-distance of each point in the cluster + + References + ---------- + Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J., + 2014. Density-Based Clustering Validation. In SDM (pp. 839-847). + """ + distance_matrix[distance_matrix != 0] = ( + 1.0 / distance_matrix[distance_matrix != 0] + ) ** d + result = distance_matrix.sum(axis=1) + result /= distance_matrix.shape[0] - 1 + result **= -1.0 / d + + return result + + +def all_points_mutual_reachability( + X, labels, cluster_id, metric="euclidean", d=None, **kwd_args +): + """ + Compute the all-points-mutual-reachability distances for all the points of + a cluster. + + If metric is 'precomputed' then assume X is a distance matrix for the full + dataset. Note that in this case you must pass in 'd' the dimension of the + dataset. + + Parameters + ---------- + X : array (n_samples, n_features) or (n_samples, n_samples) + The input data of the clustering. This can be the data, or, if + metric is set to `precomputed` the pairwise distance matrix used + for the clustering. + + labels : array (n_samples) + The label array output by the clustering, providing an integral + cluster label to each data point, with -1 for noise points. 
+ + cluster_id : integer + The cluster label for which to compute the all-points + mutual-reachability (which should be done on a cluster + by cluster basis). + + metric : string + The metric used to compute distances for the clustering (and + to be re-used in computing distances for mr distance). If + set to `precomputed` then X is assumed to be the precomputed + distance matrix between samples. + + d : integer (or None) + The number of features (dimension) of the dataset. This need only + be set in the case of metric being set to `precomputed`, where + the ambient dimension of the data is unknown to the function. + + **kwd_args : + Extra arguments to pass to the distance computation for other + metrics, such as minkowski, Mahanalobis etc. + + Returns + ------- + + mutual_reachaibility : array (n_samples, n_samples) + The pairwise mutual reachability distances between all points in `X` + with `label` equal to `cluster_id`. + + core_distances : array (n_samples,) + The all-points-core_distance of all points in `X` with `label` equal + to `cluster_id`. + + References + ---------- + Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J., + 2014. Density-Based Clustering Validation. In SDM (pp. 839-847). + """ + if metric == "precomputed": + if d is None: + raise ValueError("If metric is precomputed a d value must be provided!") + distance_matrix = X[labels == cluster_id, :][:, labels == cluster_id] + else: + subset_X = X[labels == cluster_id, :] + distance_matrix = pairwise_distances(subset_X, metric=metric, **kwd_args) + d = X.shape[1] + + core_distances = all_points_core_distance(distance_matrix.copy(), d=d) + core_dist_matrix = np.tile(core_distances, (core_distances.shape[0], 1)) + + result = np.dstack([distance_matrix, core_dist_matrix, core_dist_matrix.T]).max( + axis=-1 + ) + + return result, core_distances + + +def internal_minimum_spanning_tree(mr_distances): + """ + Compute the 'internal' minimum spanning tree given a matrix of mutual + reachability distances. Given a minimum spanning tree the 'internal' + graph is the subgraph induced by vertices of degree greater than one. + + Parameters + ---------- + mr_distances : array (cluster_size, cluster_size) + The pairwise mutual reachability distances, inferred to be the edge + weights of a complete graph. Since MSTs are computed per cluster + this is the all-points-mutual-reacability for points within a single + cluster. + + Returns + ------- + internal_nodes : array + An array listing the indices of the internal nodes of the MST + + internal_edges : array (?, 3) + An array of internal edges in weighted edge list format; that is + an edge is an array of length three listing the two vertices + forming the edge and weight of the edge. + + References + ---------- + Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J., + 2014. Density-Based Clustering Validation. In SDM (pp. 839-847). 
+ """ + single_linkage_data = mst_linkage_core(mr_distances) + min_span_tree = single_linkage_data.copy() + for index, row in enumerate(min_span_tree[1:], 1): + candidates = np.where(isclose(mr_distances[int(row[1])], row[2]))[0] + candidates = np.intersect1d( + candidates, single_linkage_data[:index, :2].astype(int) + ) + candidates = candidates[candidates != row[1]] + assert len(candidates) > 0 + row[0] = candidates[0] + + vertices = np.arange(mr_distances.shape[0])[ + np.bincount(min_span_tree.T[:2].flatten().astype(np.intp)) > 1 + ] + # A little "fancy" we select from the flattened array reshape back + # (Fortran format to get indexing right) and take the product to do an and + # then convert back to boolean type. + edge_selection = np.prod( + np.in1d(min_span_tree.T[:2], vertices).reshape( + (min_span_tree.shape[0], 2), order="F" + ), + axis=1, + ).astype(bool) + + # Density sparseness is not well defined if there are no + # internal edges (as per the referenced paper). However + # MATLAB code from the original authors simply selects the + # largest of *all* the edges in the case that there are + # no internal edges, so we do the same here + if np.any(edge_selection): + # If there are any internal edges, then subselect them out + edges = min_span_tree[edge_selection] + else: + # If there are no internal edges then we want to take the + # max over all the edges that exist in the MST, so we simply + # do nothing and return all the edges in the MST. + edges = min_span_tree.copy() + + return vertices, edges + + +def density_separation( + X, + labels, + cluster_id1, + cluster_id2, + internal_nodes1, + internal_nodes2, + core_distances1, + core_distances2, + metric="euclidean", + **kwd_args, +): + """ + Compute the density separation between two clusters. This is the minimum + all-points mutual reachability distance between pairs of points, one from + internal nodes of MSTs of each cluster. + + Parameters + ---------- + X : array (n_samples, n_features) or (n_samples, n_samples) + The input data of the clustering. This can be the data, or, if + metric is set to `precomputed` the pairwise distance matrix used + for the clustering. + + labels : array (n_samples) + The label array output by the clustering, providing an integral + cluster label to each data point, with -1 for noise points. + + cluster_id1 : integer + The first cluster label to compute separation between. + + cluster_id2 : integer + The second cluster label to compute separation between. + + internal_nodes1 : array + The vertices of the MST for `cluster_id1` that were internal vertices. + + internal_nodes2 : array + The vertices of the MST for `cluster_id2` that were internal vertices. + + core_distances1 : array (size of cluster_id1,) + The all-points-core_distances of all points in the cluster + specified by cluster_id1. + + core_distances2 : array (size of cluster_id2,) + The all-points-core_distances of all points in the cluster + specified by cluster_id2. + + metric : string + The metric used to compute distances for the clustering (and + to be re-used in computing distances for mr distance). If + set to `precomputed` then X is assumed to be the precomputed + distance matrix between samples. + + **kwd_args : + Extra arguments to pass to the distance computation for other + metrics, such as minkowski, Mahanalobis etc. + + Returns + ------- + The 'density separation' between the clusters specified by + `cluster_id1` and `cluster_id2`. + + References + ---------- + Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. 
and Sander, J., + 2014. Density-Based Clustering Validation. In SDM (pp. 839-847). + """ + if metric == "precomputed": + sub_select = X[labels == cluster_id1, :][:, labels == cluster_id2] + distance_matrix = sub_select[internal_nodes1, :][:, internal_nodes2] + else: + cluster1 = X[labels == cluster_id1][internal_nodes1] + cluster2 = X[labels == cluster_id2][internal_nodes2] + distance_matrix = cdist(cluster1, cluster2, metric, **kwd_args) + + core_dist_matrix1 = np.tile( + core_distances1[internal_nodes1], (distance_matrix.shape[1], 1) + ).T + core_dist_matrix2 = np.tile( + core_distances2[internal_nodes2], (distance_matrix.shape[0], 1) + ) + + mr_dist_matrix = np.dstack( + [distance_matrix, core_dist_matrix1, core_dist_matrix2] + ).max(axis=-1) + + return mr_dist_matrix.min() + + +def validity_index( + X, labels, metric="euclidean", d=None, per_cluster_scores=False, **kwd_args +): + """ + Compute the density based cluster validity index for the + clustering specified by `labels` and for each cluster in `labels`. + + Parameters + ---------- + X : array (n_samples, n_features) or (n_samples, n_samples) + The input data of the clustering. This can be the data, or, if + metric is set to `precomputed` the pairwise distance matrix used + for the clustering. + + labels : array (n_samples) + The label array output by the clustering, providing an integral + cluster label to each data point, with -1 for noise points. + + metric : optional, string (default 'euclidean') + The metric used to compute distances for the clustering (and + to be re-used in computing distances for mr distance). If + set to `precomputed` then X is assumed to be the precomputed + distance matrix between samples. + + d : optional, integer (or None) (default None) + The number of features (dimension) of the dataset. This need only + be set in the case of metric being set to `precomputed`, where + the ambient dimension of the data is unknown to the function. + + per_cluster_scores : optional, boolean (default False) + Whether to return the validity index for individual clusters. + Defaults to False with the function returning a single float + value for the whole clustering. + + **kwd_args : + Extra arguments to pass to the distance computation for other + metrics, such as minkowski, Mahanalobis etc. + + Returns + ------- + validity_index : float + The density based cluster validity index for the clustering. This + is a numeric value between -1 and 1, with higher values indicating + a 'better' clustering. + + per_cluster_validity_index : array (n_clusters,) + The cluster validity index of each individual cluster as an array. + The overall validity index is the weighted average of these values. + Only returned if per_cluster_scores is set to True. + + References + ---------- + Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J., + 2014. Density-Based Clustering Validation. In SDM (pp. 839-847). 
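+
+    Notes
+    -----
+    As computed below, the validity score of an individual cluster is
+    ``(min_density_separation - density_sparseness) /
+    max(min_density_separation, density_sparseness)``, where the density
+    separation is minimised over all other clusters.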
+ """ + core_distances = {} + density_sparseness = {} + mst_nodes = {} + mst_edges = {} + + max_cluster_id = labels.max() + 1 + density_sep = np.inf * np.ones((max_cluster_id, max_cluster_id), dtype=np.float64) + cluster_validity_indices = np.empty(max_cluster_id, dtype=np.float64) + + for cluster_id in range(max_cluster_id): + + if np.sum(labels == cluster_id) == 0: + continue + + mr_distances, core_distances[cluster_id] = all_points_mutual_reachability( + X, labels, cluster_id, metric, d, **kwd_args + ) + + mst_nodes[cluster_id], mst_edges[cluster_id] = internal_minimum_spanning_tree( + mr_distances + ) + density_sparseness[cluster_id] = mst_edges[cluster_id].T[2].max() + + for i in range(max_cluster_id): + + if np.sum(labels == i) == 0: + continue + + internal_nodes_i = mst_nodes[i] + for j in range(i + 1, max_cluster_id): + + if np.sum(labels == j) == 0: + continue + + internal_nodes_j = mst_nodes[j] + density_sep[i, j] = density_separation( + X, + labels, + i, + j, + internal_nodes_i, + internal_nodes_j, + core_distances[i], + core_distances[j], + metric=metric, + **kwd_args, + ) + density_sep[j, i] = density_sep[i, j] + + n_samples = float(X.shape[0]) + result = 0 + + for i in range(max_cluster_id): + + if np.sum(labels == i) == 0: + continue + + min_density_sep = density_sep[i].min() + cluster_validity_indices[i] = (min_density_sep - density_sparseness[i]) / max( + min_density_sep, density_sparseness[i] + ) + cluster_size = np.sum(labels == i) + result += (cluster_size / n_samples) * cluster_validity_indices[i] + + if per_cluster_scores: + return result, cluster_validity_indices + else: + return result diff --git a/sklearn/cluster/setup.py b/sklearn/cluster/setup.py index c26872fd750a0..d8658c0458532 100644 --- a/sklearn/cluster/setup.py +++ b/sklearn/cluster/setup.py @@ -59,6 +59,45 @@ def configuration(parent_package="", top_path=None): config.add_subpackage("tests") + # HDBSCAN subpackage + config.add_subpackage("_hdbscan.tests") + config.add_extension( + "_hdbscan._hdbscan_boruvka", + sources=["_hdbscan/_hdbscan_boruvka.pyx"], + include_dirs=[numpy.get_include(), "_hdbscan"], + libraries=libraries, + ) + config.add_extension( + "_hdbscan._hdbscan_linkage", + sources=["_hdbscan/_hdbscan_linkage.pyx"], + include_dirs=[numpy.get_include()], + libraries=libraries, + ) + config.add_extension( + "_hdbscan._hdbscan_reachability", + sources=["_hdbscan/_hdbscan_reachability.pyx"], + include_dirs=[numpy.get_include()], + libraries=libraries, + ) + config.add_extension( + "_hdbscan._hdbscan_tree", + sources=["_hdbscan/_hdbscan_tree.pyx"], + include_dirs=[numpy.get_include()], + libraries=libraries, + ) + config.add_extension( + "_hdbscan._prediction_utils", + sources=["_hdbscan/_prediction_utils.pyx"], + include_dirs=[numpy.get_include()], + libraries=libraries, + ) + config.add_extension( + "_hdbscan.dist_metrics", + sources=["_hdbscan/dist_metrics.pyx"], + include_dirs=[numpy.get_include()], + libraries=libraries, + ) + return config From c5240b75173422c5c5644dabfbf2b5b1aee4bb48 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 25 Feb 2022 19:14:50 -0500 Subject: [PATCH 002/160] Added wraparound wrappers where needed --- sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx | 5 ++++- sklearn/cluster/_hdbscan/_hdbscan_tree.pyx | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx b/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx index e35470c09f38a..ddb1db48e8622 100644 --- a/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx +++ 
b/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx @@ -6,8 +6,10 @@ import numpy as np cimport numpy as np +import cython from libc.float cimport DBL_MAX +from libc.stdio cimport printf from .dist_metrics cimport DistanceMetric @@ -198,6 +200,7 @@ cdef class UnionFind (object): return + @cython.wraparound(True) cdef np.intp_t fast_find(self, np.intp_t n): cdef np.intp_t p p = n @@ -208,7 +211,7 @@ cdef class UnionFind (object): p, self.parent_arr[p] = self.parent_arr[p], n return n - +@cython.wraparound(True) cpdef np.ndarray[np.double_t, ndim=2] label(np.ndarray[np.double_t, ndim=2] L): cdef np.ndarray[np.double_t, ndim=2] result_arr diff --git a/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx b/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx index ca788a8f995e4..bf2e7014d6026 100644 --- a/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx +++ b/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx @@ -7,6 +7,7 @@ import numpy as np cimport numpy as np +import cython cdef np.double_t INFTY = np.inf @@ -656,6 +657,7 @@ cpdef set epsilon_search(set leaves, np.ndarray cluster_tree, np.double_t cluste return set(selected_clusters) +@cython.wraparound(True) cpdef tuple get_clusters(np.ndarray tree, dict stability, cluster_selection_method='eom', allow_single_cluster=False, From 74bd0b349c76cedab7d0d6f7b25b1e3686535a61 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sat, 26 Feb 2022 22:51:24 -0500 Subject: [PATCH 003/160] Updated documentation --- sklearn/cluster/_hdbscan/hdbscan_.py | 219 +++++++++++++++++---------- sklearn/cluster/_hdbscan/validity.py | 27 ++-- 2 files changed, 154 insertions(+), 92 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index 89b2590ea27a5..86a2e66a5ec06 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -525,30 +525,30 @@ def hdbscan( X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ array of shape (n_samples, n_samples) A feature array, or array of distances between samples if - ``metric='precomputed'``. + `metric='precomputed'`. - min_cluster_size : int, optional (default=5) + min_cluster_size : int, default=5 The minimum number of samples in a group for that group to be considered a cluster; groupings smaller than this size will be left as noise. - min_samples : int, optional (default=None) + min_samples : int, default=None The number of samples in a neighborhood for a point to be considered as a core point. This includes the point itself. defaults to the min_cluster_size. - cluster_selection_epsilon: float, optional (default=0.0) + cluster_selection_epsilon: float, default=0.0 A distance threshold. Clusters below this value will be merged. See [3]_ for more information. Note that this should not be used if we want to predict the cluster labels for new points in future (e.g. using approximate_predict), as the approximate_predict function is not aware of this argument. - alpha : float, optional (default=1.0) + alpha : float, default=1.0 A distance scaling parameter as used in robust single linkage. See [2]_ for more information. - max_cluster_size : int, optional (default=0) + max_cluster_size : int, default=0 A limit to the size of clusters returned by the eom algorithm. Has no effect when using leaf clustering (where clusters are usually small regardless) and can also be overridden in rare @@ -557,7 +557,7 @@ def hdbscan( for new points in future (e.g. using approximate_predict), as the approximate_predict function is not aware of this argument. 
- metric : string or callable, optional (default='minkowski') + metric : string or callable, default='minkowski' The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of the options allowed by metrics.pairwise.pairwise_distances for its @@ -565,62 +565,62 @@ def hdbscan( If metric is "precomputed", X is assumed to be a distance matrix and must be square. - p : int, optional (default=2) + p : int, default=2 p value to use if using the minkowski metric. - leaf_size : int, optional (default=40) + leaf_size : int, default=40 Leaf size for trees responsible for fast nearest neighbour queries. - algorithm : string, optional (default='best') + algorithm : string, default='best' Exactly which algorithm to use; hdbscan has variants specialised for different characteristics of the data. By default this is set - to ``best`` which chooses the "best" algorithm given the nature of + to `best` which chooses the "best" algorithm given the nature of the data. You can force other options if you believe you know better. Options are: - * ``best`` - * ``generic`` - * ``prims_kdtree`` - * ``prims_balltree`` - * ``boruvka_kdtree`` - * ``boruvka_balltree`` + * `best` + * `generic` + * `prims_kdtree` + * `prims_balltree` + * `boruvka_kdtree` + * `boruvka_balltree` memory : instance of joblib.Memory or string, optional Used to cache the output of the computation of the tree. By default, no caching is done. If a string is given, it is the path to the caching directory. - approx_min_span_tree : bool, optional (default=True) + approx_min_span_tree : bool, default=True Whether to accept an only approximate minimum spanning tree. For some algorithms this can provide a significant speedup, but the resulting clustering may be of marginally lower quality. If you are willing to sacrifice speed for correctness you may want to explore this; in general this should be left at the default True. - gen_min_span_tree : bool, optional (default=False) + gen_min_span_tree : bool, default=False Whether to generate the minimum spanning tree for later analysis. - core_dist_n_jobs : int, optional (default=4) + core_dist_n_jobs : int, default=4 Number of parallel jobs to run in core distance computations (if - supported by the specific algorithm). For ``core_dist_n_jobs`` + supported by the specific algorithm). For `core_dist_n_jobs` below -1, (n_cpus + 1 + core_dist_n_jobs) are used. - cluster_selection_method : string, optional (default='eom') + cluster_selection_method : string, default='eom' The method used to select clusters from the condensed tree. The standard approach for HDBSCAN* is to use an Excess of Mass algorithm to find the most persistent clusters. Alternatively you can instead select the clusters at the leaves of the tree -- this provides the most fine grained and homogeneous clusters. Options are: - * ``eom`` - * ``leaf`` + * `eom` + * `leaf` - allow_single_cluster : bool, optional (default=False) + allow_single_cluster : bool, default=False By default HDBSCAN* will not produce a single cluster, setting this to t=True will override this and allow single cluster results in the case that you feel this is a valid result for your dataset. (default False) - match_reference_implementation : bool, optional (default=False) + match_reference_implementation : bool, default=False There exist some interpretational differences between this HDBSCAN* implementation and the original authors reference implementation in Java. 
This can result in very minor differences @@ -775,7 +775,7 @@ def hdbscan( warn( "A large dataset size and small leaf_size may induce excessive " "memory usage. If you are running out of memory consider " - "increasing the ``leaf_size`` parameter." + "increasing the `leaf_size` parameter." ) (single_linkage_tree, result_min_span_tree) = memory.cache( _hdbscan_boruvka_balltree @@ -887,16 +887,29 @@ class HDBSCAN(BaseEstimator, ClusterMixin): Parameters ---------- - min_cluster_size : int, optional (default=5) + min_cluster_size : int, default=5 The minimum size of clusters; single linkage splits that contain fewer points than this will be considered points "falling out" of a cluster rather than a cluster splitting into two new clusters. - min_samples : int, optional (default=None) + min_samples : int, default=None The number of samples in a neighbourhood for a point to be considered a core point. - metric : string, or callable, optional (default='euclidean') + cluster_selection_epsilon : float, default=0.0 + A distance threshold. Clusters below this value will be merged. + See [5]_ for more information. + + max_cluster_size : int, default=0 + A limit to the size of clusters returned by the eom algorithm. + Has no effect when using leaf clustering (where clusters are + usually small regardless) and can also be overridden in rare + cases by a high value for cluster_selection_epsilon. Note that + this should not be used if we want to predict the cluster labels + for new points in future (e.g. using approximate_predict), as + the approximate_predict function is not aware of this argument. + + metric : str, or callable, default='euclidean' The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of the options allowed by metrics.pairwise.pairwise_distances for its @@ -904,79 +917,74 @@ class HDBSCAN(BaseEstimator, ClusterMixin): If metric is "precomputed", X is assumed to be a distance matrix and must be square. - p : int, optional (default=None) - p value to use if using the minkowski metric. - - alpha : float, optional (default=1.0) + alpha : float, default=1.0 A distance scaling parameter as used in robust single linkage. See [3]_ for more information. - cluster_selection_epsilon: float, optional (default=0.0) - A distance threshold. Clusters below this value will be merged. - See [5]_ for more information. + p : int, default=None + Value of `p` if using the minkowski metric. - algorithm : string, optional (default='best') + algorithm : str, default='best' Exactly which algorithm to use; hdbscan has variants specialised for different characteristics of the data. By default this is set - to ``best`` which chooses the "best" algorithm given the nature of + to `best` which chooses the "best" algorithm given the nature of the data. You can force other options if you believe you know better. Options are: - * ``best`` - * ``generic`` - * ``prims_kdtree`` - * ``prims_balltree`` - * ``boruvka_kdtree`` - * ``boruvka_balltree`` - - leaf_size: int, optional (default=40) + * `best` + * `generic` + * `prims_kdtree` + * `prims_balltree` + * `boruvka_kdtree` + * `boruvka_balltree` + + leaf_size : int, default=40 If using a space tree algorithm (kdtree, or balltree) the number of points ina leaf node of the tree. This does not alter the resulting clustering, but may have an effect on the runtime of the algorithm. 
- memory : Instance of joblib.Memory or string (optional) + memory : Instance of joblib.Memory or str, default=Memory(verbose=1) Used to cache the output of the computation of the tree. By default, no caching is done. If a string is given, it is the path to the caching directory. - approx_min_span_tree : bool, optional (default=True) + approx_min_span_tree : bool, default=True Whether to accept an only approximate minimum spanning tree. For some algorithms this can provide a significant speedup, but the resulting clustering may be of marginally lower quality. If you are willing to sacrifice speed for correctness you may want to explore this; in general this should be left at the default True. - gen_min_span_tree: bool, optional (default=False) + gen_min_span_tree : bool, default=False Whether to generate the minimum spanning tree with regard to mutual reachability distance for later analysis. - core_dist_n_jobs : int, optional (default=4) + core_dist_n_jobs : int, default=4 Number of parallel jobs to run in core distance computations (if - supported by the specific algorithm). For ``core_dist_n_jobs`` + supported by the specific algorithm). For `core_dist_n_jobs` below -1, (n_cpus + 1 + core_dist_n_jobs) are used. - cluster_selection_method : string, optional (default='eom') + cluster_selection_method : str, default='eom' The method used to select clusters from the condensed tree. The standard approach for HDBSCAN* is to use an Excess of Mass algorithm to find the most persistent clusters. Alternatively you can instead select the clusters at the leaves of the tree -- this provides the most fine grained and homogeneous clusters. Options are: - * ``eom`` - * ``leaf`` + * `eom` + * `leaf` - allow_single_cluster : bool, optional (default=False) + allow_single_cluster : bool, default=False By default HDBSCAN* will not produce a single cluster, setting this to True will override this and allow single cluster results in the case that you feel this is a valid result for your dataset. - prediction_data : boolean, optional + prediction_data : bool, default=False Whether to generate extra cached data for predicting labels or membership vectors few new unseen points later. If you wish to persist the clustering object for later re-use you probably want to set this to True. - (default False) - match_reference_implementation : bool, optional (default=False) + match_reference_implementation : bool, default=False There exist some interpretational differences between this HDBSCAN* implementation and the original authors reference implementation in Java. This can result in very minor differences @@ -985,7 +993,7 @@ class HDBSCAN(BaseEstimator, ClusterMixin): reference implementation. **kwargs : optional - Arguments passed to the distance metric + Arguments passed to the distance metric. Attributes ---------- @@ -1028,7 +1036,7 @@ class HDBSCAN(BaseEstimator, ClusterMixin): prediction_data_ : PredictionData object Cached data used for predicting the cluster labels of new or unseen points. Necessary only if you are using functions from - ``hdbscan.prediction`` (see + `hdbscan.prediction` (see :func:`~hdbscan.prediction.approximate_predict`, :func:`~hdbscan.prediction.membership_vector`, and :func:`~hdbscan.prediction.all_points_membership_vectors`). @@ -1051,6 +1059,13 @@ class HDBSCAN(BaseEstimator, ClusterMixin): across different choices of hyper-parameters, therefore is only a relative score. + See Also + -------- + DBSCAN : Density-Based Spatial Clustering of Applications + with Noise. 
+ OPTICS : Ordering Points To Identify the Clustering Structure. + BIRCH : Memory-efficient, online-learning algorithm. + References ---------- @@ -1075,6 +1090,16 @@ class HDBSCAN(BaseEstimator, ClusterMixin): .. [5] Malzer, C., & Baum, M. (2019). A Hybrid Approach To Hierarchical Density-based Cluster Selection. arxiv preprint 1911.02282. + Examples + -------- + >>> from sklearn.cluster import HDBSCAN + >>> from sklearn.datasets import load_digits + >>> X, _ = load_digits(return_X_y=True) + >>> hdb = HDBSCAN(min_cluster_size=20) + >>> hdb.fit(X) + HDBSCAN(min_cluster_size=20) + >>> hdb.labels_ + array([ 2, 6, -1, ..., -1, -1, -1], dtype=int64) """ def __init__( @@ -1134,12 +1159,15 @@ def fit(self, X, y=None): X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ array of shape (n_samples, n_samples) A feature array, or array of distances between samples if - ``metric='precomputed'``. + `metric='precomputed'`. + + y : Ignored + Ignored. Returns ------- self : object - Returns self + Returns self. """ if self.metric != "precomputed": # Non-precomputed matrices may contain non-finite values. @@ -1208,28 +1236,33 @@ def fit(self, X, y=None): return self def fit_predict(self, X, y=None): - """Performs clustering on X and returns cluster labels. + """Perform clustering on X and return cluster labels. Parameters ---------- X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ array of shape (n_samples, n_samples) A feature array, or array of distances between samples if - ``metric='precomputed'``. + `metric='precomputed'`. + + y : Ignored + Ignored. Returns ------- y : ndarray, shape (n_samples, ) - cluster labels + Cluster labels. """ self.fit(X) return self.labels_ def generate_prediction_data(self): """ + Create data that caches intermediate results for label prediction. + Create data that caches intermediate results used for predicting the label of new/unseen points. This data is only useful if - you are intending to use functions from ``hdbscan.prediction``. + you are intending to use functions from `hdbscan.prediction`. """ if self.metric in FAST_METRICS: @@ -1258,21 +1291,22 @@ def generate_prediction_data(self): ) def weighted_cluster_centroid(self, cluster_id): - """Provide an approximate representative point for a given cluster. + """ + Provide an approximate representative point for a given cluster. + Note that this technique assumes a euclidean metric for speed of - computation. For more general metrics use the ``weighted_cluster_medoid`` - method which is slower, but can work with the metric the model trained - with. + computation. For more general metrics use the `weighted_cluster_medoid` + method which is slower, but can work with more general metrics. Parameters ---------- - cluster_id: int + cluster_id : int The id of the cluster to compute a centroid for. Returns ------- - centroid: array of shape (n_features,) - A representative centroid for cluster ``cluster_id``. + centroid : array of shape (n_features,) + A representative centroid for cluster `cluster_id`. """ if not hasattr(self, "labels_"): raise AttributeError("Model has not been fit to data") @@ -1290,20 +1324,22 @@ def weighted_cluster_centroid(self, cluster_id): return np.average(cluster_data, weights=cluster_membership_strengths, axis=0) def weighted_cluster_medoid(self, cluster_id): - """Provide an approximate representative point for a given cluster. + """ + Provide an approximate representative point for a given cluster. 
+ Note that this technique can be very slow and memory intensive for - large clusters. For faster results use the ``weighted_cluster_centroid`` + large clusters. For faster results use the `weighted_cluster_centroid` method which is faster, but assumes a euclidean metric. Parameters ---------- - cluster_id: int + cluster_id : int The id of the cluster to compute a medoid for. Returns ------- - centroid: array of shape (n_features,) - A representative medoid for cluster ``cluster_id``. + centroid : array of shape (n_features,) + A representative medoid for cluster `cluster_id`. """ if not hasattr(self, "labels_"): raise AttributeError("Model has not been fit to data") @@ -1328,10 +1364,13 @@ def weighted_cluster_medoid(self, cluster_id): def dbscan_clustering(self, cut_distance, min_cluster_size=5): """ + Return clustering given by DBSCAN without border points. + Return clustering that would be equivalent to running DBSCAN* for a particular cut_distance (or epsilon) DBSCAN* can be thought of as DBSCAN without the border points. As such these results may differ - slightly from sklearns implementation of dbscan in the non-core points. + slightly from `cluster.DBSCAN` due to the difference in implementation + over the non-core points. This can also be thought of as a flat clustering derived from constant height cut through the single linkage tree. @@ -1365,6 +1404,9 @@ def dbscan_clustering(self, cut_distance, min_cluster_size=5): @property def prediction_data_(self): + """ + Cached data for predicting cluster labels of new or unseen points. + """ if self._prediction_data is None: raise AttributeError("No prediction data was generated") else: @@ -1372,6 +1414,9 @@ def prediction_data_(self): @property def outlier_scores_(self): + """ + Points with larger scores are more outlier-like points. + """ if self._outlier_scores is not None: return self._outlier_scores else: @@ -1385,6 +1430,7 @@ def outlier_scores_(self): @property def condensed_tree_(self): + """A simplified or smoothed version of `sinkle_linkage_tree_`.""" if self._condensed_tree is not None: return CondensedTree( self._condensed_tree, @@ -1398,6 +1444,7 @@ def condensed_tree_(self): @property def single_linkage_tree_(self): + """A single linkage format dendrogram tree.""" if self._single_linkage_tree is not None: return SingleLinkageTree(self._single_linkage_tree) else: @@ -1407,6 +1454,9 @@ def single_linkage_tree_(self): @property def minimum_spanning_tree_(self): + """ + The minimum spanning tree of the mutual reachability graph. + """ if self._min_spanning_tree is not None: if self._raw_data is not None: return MinimumSpanningTree(self._min_spanning_tree, self._raw_data) @@ -1426,6 +1476,12 @@ def minimum_spanning_tree_(self): @property def exemplars_(self): + """ + A list of exemplar points for clusters. + + These are the "most representative" points of the arbitrarily shaped + clusters. + """ if self._prediction_data is not None: return self._prediction_data.exemplars elif self.metric in FAST_METRICS: @@ -1440,6 +1496,9 @@ def exemplars_(self): @property def relative_validity_(self): + """ + A fast approximation of the Density Based Cluster Validity (DBCV) score. 
+        """
         if self._relative_validity is not None:
             return self._relative_validity
diff --git a/sklearn/cluster/_hdbscan/validity.py b/sklearn/cluster/_hdbscan/validity.py
index c29103cff3734..f0d86cdb36bd4 100644
--- a/sklearn/cluster/_hdbscan/validity.py
+++ b/sklearn/cluster/_hdbscan/validity.py
@@ -276,40 +276,43 @@ def density_separation(
 
 
 def validity_index(
-    X, labels, metric="euclidean", d=None, per_cluster_scores=False, **kwd_args
+    X, labels, metric="euclidean", d=None, per_cluster_scores=False, kwargs=None
 ):
     """
+    Compute the density based cluster validity index.
+
     Compute the density based cluster validity index for the clustering
     specified by `labels` and for each cluster in `labels`.
 
     Parameters
     ----------
-    X : array (n_samples, n_features) or (n_samples, n_samples)
-        The input data of the clustering. This can be the data, or, if
-        metric is set to `precomputed` the pairwise distance matrix used
-        for the clustering.
+    X : array-like of shape (n_samples, n_features) or (n_samples, n_samples)
+        The input data of the clustering.
 
-    labels : array (n_samples)
+        If `metric='precomputed'` this is treated as the pairwise distance matrix
+        used for the clustering.
+
+    labels : array-like (n_samples)
         The label array output by the clustering, providing an integral
         cluster label to each data point, with -1 for noise points.
 
-    metric : optional, string (default 'euclidean')
+    metric : str, default='euclidean'
         The metric used to compute distances for the clustering (and
         to be re-used in computing distances for mr distance). If
         set to `precomputed` then X is assumed to be the precomputed
         distance matrix between samples.
 
-    d : optional, integer (or None) (default None)
+    d : int, default=None
        The number of features (dimension) of the dataset. This need only
        be set in the case of metric being set to `precomputed`, where
        the ambient dimension of the data is unknown to the function.
 
-    per_cluster_scores : optional, boolean (default False)
+    per_cluster_scores : bool, default=False
        Whether to return the validity index for individual clusters.
        Defaults to False with the function returning a single float
        value for the whole clustering.
 
-    **kwd_args :
+    kwargs : dict, default=None
        Extra arguments to pass to the distance computation for other
        metrics, such as minkowski, Mahanalobis etc.
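
The hunk above replaces the ``**kwd_args`` catch-all of ``validity_index`` with an explicit ``kwargs`` dictionary (renamed again to ``metric_params`` later in this series). A minimal usage sketch under that intermediate signature, assuming a toy dataset; the ``seuclidean``/``V`` combination mirrors the existing test suite and is purely illustrative:

    >>> import numpy as np
    >>> from sklearn.cluster import HDBSCAN, validity_index
    >>> from sklearn.datasets import make_blobs
    >>> X, _ = make_blobs(n_samples=200, centers=3, random_state=0)
    >>> labels = HDBSCAN(min_cluster_size=10).fit(X).labels_
    >>> # extra metric arguments now travel as an explicit dict
    >>> score = validity_index(
    ...     X, labels, metric="seuclidean", kwargs={"V": np.ones(X.shape[1])}
    ... )
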
@@ -345,7 +348,7 @@ def validity_index( continue mr_distances, core_distances[cluster_id] = all_points_mutual_reachability( - X, labels, cluster_id, metric, d, **kwd_args + X, labels, cluster_id, metric, d, **kwargs ) mst_nodes[cluster_id], mst_edges[cluster_id] = internal_minimum_spanning_tree( @@ -375,7 +378,7 @@ def validity_index( core_distances[i], core_distances[j], metric=metric, - **kwd_args, + **kwargs, ) density_sep[j, i] = density_sep[i, j] From faa06b574517ade6488d486500b3a147bf13c1ea Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 4 Mar 2022 19:02:45 -0500 Subject: [PATCH 004/160] Added a new batch of doc updates for passing docstring tests --- sklearn/cluster/_hdbscan/hdbscan_.py | 46 +++++----- sklearn/cluster/_hdbscan/prediction.py | 85 +++++++++++-------- .../_hdbscan/robust_single_linkage_.py | 37 ++++---- 3 files changed, 95 insertions(+), 73 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index 86a2e66a5ec06..36de9aeff72df 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -509,7 +509,7 @@ def hdbscan( p=2, leaf_size=40, algorithm="best", - memory=Memory(cachedir=None, verbose=0), + memory=None, approx_min_span_tree=True, gen_min_span_tree=False, core_dist_n_jobs=4, @@ -537,17 +537,17 @@ def hdbscan( to be considered as a core point. This includes the point itself. defaults to the min_cluster_size. - cluster_selection_epsilon: float, default=0.0 + alpha : float, default=1.0 + A distance scaling parameter as used in robust single linkage. + See [2]_ for more information. + + cluster_selection_epsilon : float, default=0.0 A distance threshold. Clusters below this value will be merged. See [3]_ for more information. Note that this should not be used if we want to predict the cluster labels for new points in future (e.g. using approximate_predict), as the approximate_predict function is not aware of this argument. - alpha : float, default=1.0 - A distance scaling parameter as used in robust single linkage. - See [2]_ for more information. - max_cluster_size : int, default=0 A limit to the size of clusters returned by the eom algorithm. Has no effect when using leaf clustering (where clusters are @@ -557,22 +557,25 @@ def hdbscan( for new points in future (e.g. using approximate_predict), as the approximate_predict function is not aware of this argument. - metric : string or callable, default='minkowski' + metric : str or callable, default='minkowski' The metric to use when calculating distance between instances in a - feature array. If metric is a string or callable, it must be one of - the options allowed by metrics.pairwise.pairwise_distances for its + feature array. + + If metric is a string or callable, it must be one of + the options allowed by `metrics.pairwise.pairwise_distances` for its metric parameter. + If metric is "precomputed", X is assumed to be a distance matrix and must be square. p : int, default=2 - p value to use if using the minkowski metric. + Value of `p` if using the minkowski metric. leaf_size : int, default=40 Leaf size for trees responsible for fast nearest neighbour queries. - algorithm : string, default='best' + algorithm : str, default='best' Exactly which algorithm to use; hdbscan has variants specialised for different characteristics of the data. 
By default this is set to `best` which chooses the "best" algorithm given the nature of @@ -585,7 +588,7 @@ def hdbscan( * `boruvka_kdtree` * `boruvka_balltree` - memory : instance of joblib.Memory or string, optional + memory : str, default=None Used to cache the output of the computation of the tree. By default, no caching is done. If a string is given, it is the path to the caching directory. @@ -605,7 +608,7 @@ def hdbscan( supported by the specific algorithm). For `core_dist_n_jobs` below -1, (n_cpus + 1 + core_dist_n_jobs) are used. - cluster_selection_method : string, default='eom' + cluster_selection_method : str, default='eom' The method used to select clusters from the condensed tree. The standard approach for HDBSCAN* is to use an Excess of Mass algorithm to find the most persistent clusters. Alternatively you can instead @@ -618,7 +621,6 @@ def hdbscan( By default HDBSCAN* will not produce a single cluster, setting this to t=True will override this and allow single cluster results in the case that you feel this is a valid result for your dataset. - (default False) match_reference_implementation : bool, default=False There exist some interpretational differences between this @@ -629,7 +631,7 @@ def hdbscan( reference implementation. **kwargs : optional - Arguments passed to the distance metric + Arguments passed to the distance metric. Returns ------- @@ -723,8 +725,7 @@ def hdbscan( check_precomputed_distance_matrix(X) # Python 2 and 3 compliant string_type checking - if isinstance(memory, str): - memory = Memory(cachedir=memory, verbose=0) + memory = Memory(cachedir=memory, verbose=0) size = X.shape[0] min_samples = min(size - 1, min_samples) @@ -897,7 +898,7 @@ class HDBSCAN(BaseEstimator, ClusterMixin): considered a core point. cluster_selection_epsilon : float, default=0.0 - A distance threshold. Clusters below this value will be merged. + A distance threshold. Clusters below this value will be merged. See [5]_ for more information. max_cluster_size : int, default=0 @@ -909,11 +910,14 @@ class HDBSCAN(BaseEstimator, ClusterMixin): for new points in future (e.g. using approximate_predict), as the approximate_predict function is not aware of this argument. - metric : str, or callable, default='euclidean' + metric : str or callable, default='euclidean' The metric to use when calculating distance between instances in a - feature array. If metric is a string or callable, it must be one of - the options allowed by metrics.pairwise.pairwise_distances for its + feature array. + + If metric is a string or callable, it must be one of + the options allowed by `metrics.pairwise.pairwise_distances` for its metric parameter. + If metric is "precomputed", X is assumed to be a distance matrix and must be square. diff --git a/sklearn/cluster/_hdbscan/prediction.py b/sklearn/cluster/_hdbscan/prediction.py index 888ce25b05b2f..737de77e85948 100644 --- a/sklearn/cluster/_hdbscan/prediction.py +++ b/sklearn/cluster/_hdbscan/prediction.py @@ -28,21 +28,21 @@ class PredictionData(object): ---------- data : array (n_samples, n_features) - The original data set that was clustered + The original data set that was clustered. condensed_tree : CondensedTree - The condensed tree object created by a clustering + The condensed tree object created by a clustering. min_samples : int - The min_samples value used in clustering + The min_samples value used in clustering. - tree_type : string, optional + tree_type : str, default="kdtree" Which type of space tree to use for core distance computation. 
One of: * ``kdtree`` * ``balltree`` - metric : string, optional + metric : str, default="euclidean" The metric used to determine distance for the clustering. This is the metric that will be used for the space tree to determine core distances etc. @@ -355,10 +355,12 @@ def _find_cluster_and_probability( def approximate_predict(clusterer, points_to_predict): - """Predict the cluster label of new points. The returned labels - will be those of the original clustering found by ``clusterer``, - and therefore are not (necessarily) the cluster labels that would - be found by clustering the original data combined with + """ + Predict the cluster label of new points. + + The returned labels will be those of the original clustering found by + ``clusterer``, and therefore are not (necessarily) the cluster labels that + would be found by clustering the original data combined with ``points_to_predict``, hence the 'approximate' label. If you simply wish to assign new points to an existing clustering @@ -382,16 +384,18 @@ def approximate_predict(clusterer, points_to_predict): Returns ------- labels : array (n_samples,) - The predicted labels of the ``points_to_predict`` + The predicted labels of the ``points_to_predict``. probabilities : array (n_samples,) - The soft cluster scores for each of the ``points_to_predict`` + The soft cluster scores for each of the ``points_to_predict``. See Also -------- - :py:func:`hdbscan.predict.membership_vector` - :py:func:`hdbscan.predict.all_points_membership_vectors` - + sklearn.cluster.hdbscan.prediction.membership_vector : Predict soft cluster + membership. + sklearn.cluster.hdbscan.prediction.all_points_membership_vectors : Predict + soft cluster membership vectors for all points in the original dataset + the clusterer was trained on. """ if clusterer.prediction_data_ is None: raise ValueError( @@ -440,10 +444,12 @@ def approximate_predict(clusterer, points_to_predict): def approximate_predict_scores(clusterer, points_to_predict): - """Predict the outlier score of new points. The returned scores - will be based on the original clustering found by ``clusterer``, - and therefore are not (necessarily) the outlier scores that would - be found by clustering the original data combined with + """ + Predict the outlier score of new points. + + The returned scores will be based on the original clustering found by + ``clusterer``, and therefore are not (necessarily) the outlier scores that + would be found by clustering the original data combined with ``points_to_predict``, hence the 'approximate' label. If you simply wish to calculate the outlier scores for new points @@ -467,13 +473,15 @@ def approximate_predict_scores(clusterer, points_to_predict): Returns ------- scores : array (n_samples,) - The predicted scores of the ``points_to_predict`` + The predicted scores of the ``points_to_predict``. See Also -------- - :py:func:`hdbscan.predict.membership_vector` - :py:func:`hdbscan.predict.all_points_membership_vectors` - + sklearn.cluster.hdbscan.prediction.membership_vector : Predict soft cluster + membership. + sklearn.cluster.hdbscan.prediction.all_points_membership_vectors : Predict + soft cluster membership vectors for all points in the original dataset + the clusterer was trained on. """ try: clusterer.prediction_data_ @@ -548,10 +556,12 @@ def approximate_predict_scores(clusterer, points_to_predict): def membership_vector(clusterer, points_to_predict): - """Predict soft cluster membership. 
The result produces a vector
-    for each point in ``points_to_predict`` that gives a probability that
-    the given point is a member of a cluster for each of the selected clusters
-    of the ``clusterer``.
+    """
+    Predict soft cluster membership.
+
+    Predicts soft cluster membership, producing a vector for each point in
+    ``points_to_predict`` that gives a probability that the given point is a
+    member of a cluster for each of the selected clusters of the ``clusterer``.
 
     Parameters
     ----------
@@ -573,8 +583,12 @@ def membership_vector(clusterer, points_to_predict):
 
     See Also
     --------
-    :py:func:`hdbscan.predict.predict`
-    :py:func:`hdbscan.predict.all_points_membership_vectors`"""
+    sklearn.cluster.hdbscan.prediction.approximate_predict : Predict the
+        cluster label of new points.
+    sklearn.cluster.hdbscan.prediction.all_points_membership_vectors : Predict
+        soft cluster membership vectors for all points in the original dataset
+        the clusterer was trained on.
+    """
 
     points_to_predict = points_to_predict.astype(np.float64)
     clusters = np.array(
@@ -636,10 +650,11 @@ def membership_vector(clusterer, points_to_predict):
 
 
 def all_points_membership_vectors(clusterer):
-    """Predict soft cluster membership vectors for all points in the
-    original dataset the clusterer was trained on. This function is more
-    efficient by making use of the fact that all points are already in the
-    condensed tree, and processing in bulk.
+    """
+    Predict soft cluster membership for all points in the original dataset.
+
+    This function is more efficient by making use of the fact that all points
+    are already in the condensed tree, and processing in bulk.
 
     Parameters
     ----------
@@ -658,8 +673,10 @@ def all_points_membership_vectors(clusterer):
 
     See Also
     --------
-    :py:func:`hdbscan.predict.predict`
-    :py:func:`hdbscan.predict.all_points_membership_vectors`
+    sklearn.cluster.hdbscan.prediction.approximate_predict : Predict the
+        cluster label of new points.
+    sklearn.cluster.hdbscan.prediction.membership_vector : Predict soft cluster
+        membership.
+    """
     clusters = np.array(
         sorted(list(clusterer.condensed_tree_._select_clusters()))
diff --git a/sklearn/cluster/_hdbscan/robust_single_linkage_.py b/sklearn/cluster/_hdbscan/robust_single_linkage_.py
index 760f9a2335edd..5ecc1f173549a 100644
--- a/sklearn/cluster/_hdbscan/robust_single_linkage_.py
+++ b/sklearn/cluster/_hdbscan/robust_single_linkage_.py
@@ -135,13 +135,13 @@ def robust_single_linkage(
     gamma=5,
     metric="euclidean",
     algorithm="best",
-    memory=Memory(cachedir=None, verbose=0),
+    memory=None,
     leaf_size=40,
     core_dist_n_jobs=4,
     **kwargs,
 ):
-    """Perform robust single linkage clustering from a vector array
-    or distance matrix.
+    """
+    Perform robust single linkage clustering.
 
     Parameters
     ----------
@@ -154,30 +154,31 @@ def robust_single_linkage(
         The reachability distance value to cut the cluster heirarchy at
         to derive a flat cluster labelling.
 
-    k : int, optional (default=5)
+    k : int, default=5
         Reachability distances will be computed with regard to the `k`
         nearest neighbors.
 
-    alpha : float, optional (default=np.sqrt(2))
+    alpha : float, default=np.sqrt(2)
         Distance scaling for reachability distance computation. Reachability
         distance is computed as
         .. math::
-            `\max(core_k(a), core_k(b), 1/\alpha d(a,b))`.
-    gamma : int, optional (default=5)
+            \\max (core_k(a), core_k(b), 1/\\alpha d(a,b)).
+
+    gamma : int, default=5
         Ignore any clusters in the flat clustering with size less than gamma,
         and declare points in such clusters as noise points.
- metric : string, or callable, optional (default='euclidean') + metric : str or callable, default='euclidean' The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of - the options allowed by metrics.pairwise.pairwise_distances for its + the options allowed by `metrics.pairwise.pairwise_distances` for its metric parameter. - If metric is "precomputed", X is assumed to be a distance matrix and + If `metric="precomputed"`, X is assumed to be a distance matrix and must be square. - algorithm : string, optional (default='best') + algorithm : str, default='best' Exactly which algorithm to use; hdbscan has variants specialised for different characteristics of the data. By default this is set to ``best`` which chooses the "best" algorithm given the nature of @@ -190,20 +191,22 @@ def robust_single_linkage( * ``boruvka_kdtree`` * ``boruvka_balltree`` - memory : Instance of joblib.Memory or string (optional) + memory : str, default=None Used to cache the output of the computation of the tree. By default, no caching is done. If a string is given, it is the path to the caching directory. - leaf_size : int, optional (default=40) + leaf_size : int, default=40 Leaf size for trees responsible for fast nearest neighbour queries. - core_dist_n_jobs : int, optional + core_dist_n_jobs : int, default=4 Number of parallel jobs to run in core distance computations (if supported by the specific algorithm). For ``core_dist_n_jobs`` below -1, (n_cpus + 1 + core_dist_n_jobs) are used. - (default 4) + + **kwargs : optional + Arguments passed to the distance metric. Returns ------- @@ -220,7 +223,6 @@ def robust_single_linkage( .. [1] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the cluster tree. In Advances in Neural Information Processing Systems (pp. 343-351). 
- """ if not isinstance(k, int) or k < 1: @@ -242,8 +244,7 @@ def robust_single_linkage( raise ValueError("Minkowski metric with negative p value is not defined!") X = check_array(X, accept_sparse="csr") - if isinstance(memory, str): - memory = Memory(cachedir=memory, verbose=0) + memory = Memory(cachedir=memory, verbose=0) if algorithm != "best": if algorithm == "generic": From 266c958eb5258341cd49e502aa60b13e8e698b34 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 6 Mar 2022 14:22:31 -0500 Subject: [PATCH 005/160] Parameter and attribute revisions - Added support for `n_features_in_` - Improved validation and added support for `feature_names_in_` - Renamed `kwargs` to `metric_params` and added safety check for an empty dict - Removed attributes set in init and deferred to properties - Raised error if tree query is performed with too few samples - Cleaned up some list/dict comprehension logic --- sklearn/cluster/_hdbscan/hdbscan_.py | 75 ++++++++++++++-------------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index 36de9aeff72df..2539e5e88fdf1 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -319,6 +319,14 @@ def _hdbscan_boruvka_kdtree( X = X.astype(np.float64) tree = KDTree(X, metric=metric, leaf_size=leaf_size, **kwargs) + + n_samples = X.shape[0] + if min_samples + 1 > n_samples: + raise ValueError( + "Expected min_samples + 1 <= n_samples, " + f" but {min_samples+1=}, {n_samples=}" + ) + alg = KDTreeBoruvkaAlgorithm( tree, min_samples, @@ -996,7 +1004,7 @@ class HDBSCAN(BaseEstimator, ClusterMixin): performance cost, ensure that the clustering results match the reference implementation. - **kwargs : optional + **metric_params : optional Arguments passed to the distance metric. Attributes @@ -1063,6 +1071,13 @@ class HDBSCAN(BaseEstimator, ClusterMixin): across different choices of hyper-parameters, therefore is only a relative score. + n_features_in_ : int + Number of features seen during :term:`fit`. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + See Also -------- DBSCAN : Density-Based Spatial Clustering of Applications @@ -1125,7 +1140,7 @@ def __init__( allow_single_cluster=False, prediction_data=False, match_reference_implementation=False, - **kwargs, + **metric_params, ): self.min_cluster_size = min_cluster_size self.min_samples = min_samples @@ -1144,16 +1159,7 @@ def __init__( self.allow_single_cluster = allow_single_cluster self.match_reference_implementation = match_reference_implementation self.prediction_data = prediction_data - - self._metric_kwargs = kwargs - - self._condensed_tree = None - self._single_linkage_tree = None - self._min_spanning_tree = None - self._raw_data = None - self._outlier_scores = None - self._prediction_data = None - self._relative_validity = None + self.metric_params = metric_params or {} def fit(self, X, y=None): """Perform HDBSCAN clustering from features or distance matrix. @@ -1176,37 +1182,32 @@ def fit(self, X, y=None): if self.metric != "precomputed": # Non-precomputed matrices may contain non-finite values. 
# Rows with these values - X = check_array(X, accept_sparse="csr", force_all_finite=False) + X = self._validate_data(X, force_all_finite=False) self._raw_data = X self._all_finite = is_finite(X) - if ~self._all_finite: + if not self._all_finite: # Pass only the purely finite indices into hdbscan # We will later assign all non-finite points to the # background-1 cluster finite_index = get_finite_row_indices(X) - clean_data = X[finite_index] - internal_to_raw = { - x: y for x, y in zip(range(len(finite_index)), finite_index) - } + X = X[finite_index] + internal_to_raw = {x: y for x, y in enumerate(finite_index)} outliers = list(set(range(X.shape[0])) - set(finite_index)) - else: - clean_data = X elif issparse(X): # Handle sparse precomputed distance matrices separately - X = check_array(X, accept_sparse="csr") - clean_data = X + X = self._validate_data(X, accept_sparse="csr") else: # Only non-sparse, precomputed distance matrices are allowed # to have numpy.inf values indicating missing distances - check_precomputed_distance_matrix(X) - clean_data = X + X = self._validate_data(X, force_all_finite="allow-nan") + self.n_features_in_ = X.shape[1] kwargs = self.get_params() # prediction data only applies to the persistent model, so remove # it from the keyword args we pass on the the function kwargs.pop("prediction_data", None) - kwargs.update(self._metric_kwargs) + kwargs.update(self.metric_params) ( self.labels_, @@ -1215,7 +1216,7 @@ def fit(self, X, y=None): self._condensed_tree, self._single_linkage_tree, self._min_spanning_tree, - ) = hdbscan(clean_data, **kwargs) + ) = hdbscan(X, **kwargs) if self.metric != "precomputed" and not self._all_finite: # remap indices to align with original data in the case of @@ -1226,11 +1227,11 @@ def fit(self, X, y=None): self._single_linkage_tree = remap_single_linkage_tree( self._single_linkage_tree, internal_to_raw, outliers ) - new_labels = np.full(X.shape[0], -1) + new_labels = np.full(self._raw_data.shape[0], -1) new_labels[finite_index] = self.labels_ self.labels_ = new_labels - new_probabilities = np.zeros(X.shape[0]) + new_probabilities = np.zeros(self._raw_data.shape[0]) new_probabilities[finite_index] = self.probabilities_ self.probabilities_ = new_probabilities @@ -1285,7 +1286,7 @@ def generate_prediction_data(self): min_samples, tree_type=tree_type, metric=self.metric, - **self._metric_kwargs, + **self.metric_params, ) else: warn( @@ -1359,7 +1360,7 @@ def weighted_cluster_medoid(self, cluster_id): cluster_membership_strengths = self.probabilities_[mask] dist_mat = pairwise_distances( - cluster_data, metric=self.metric, **self._metric_kwargs + cluster_data, metric=self.metric, **self.metric_params ) dist_mat = dist_mat * cluster_membership_strengths @@ -1411,7 +1412,7 @@ def prediction_data_(self): """ Cached data for predicting cluster labels of new or unseen points. """ - if self._prediction_data is None: + if getattr(self, "_prediction_data", None) is not None: raise AttributeError("No prediction data was generated") else: return self._prediction_data @@ -1421,7 +1422,7 @@ def outlier_scores_(self): """ Points with larger scores are more outlier-like points. 
""" - if self._outlier_scores is not None: + if getattr(self, "_outlier_scores", None) is not None: return self._outlier_scores else: if self._condensed_tree is not None: @@ -1435,7 +1436,7 @@ def outlier_scores_(self): @property def condensed_tree_(self): """A simplified or smoothed version of `sinkle_linkage_tree_`.""" - if self._condensed_tree is not None: + if getattr(self, "_condensed_tree", None) is not None: return CondensedTree( self._condensed_tree, self.cluster_selection_method, @@ -1449,7 +1450,7 @@ def condensed_tree_(self): @property def single_linkage_tree_(self): """A single linkage format dendrogram tree.""" - if self._single_linkage_tree is not None: + if getattr(self, "_single_linkage_tree", None) is not None: return SingleLinkageTree(self._single_linkage_tree) else: raise AttributeError( @@ -1461,7 +1462,7 @@ def minimum_spanning_tree_(self): """ The minimum spanning tree of the mutual reachability graph. """ - if self._min_spanning_tree is not None: + if getattr(self, "_min_spanning_tree", None) is not None: if self._raw_data is not None: return MinimumSpanningTree(self._min_spanning_tree, self._raw_data) else: @@ -1486,7 +1487,7 @@ def exemplars_(self): These are the "most representative" points of the arbitrarily shaped clusters. """ - if self._prediction_data is not None: + if getattr(self, "_prediction_data", None) is not None: return self._prediction_data.exemplars elif self.metric in FAST_METRICS: self.generate_prediction_data() @@ -1503,7 +1504,7 @@ def relative_validity_(self): """ A fast approximation of the Density Based Cluster Validity (DBCV) score. """ - if self._relative_validity is not None: + if getattr(self, "_relative_validity", None) is not None: return self._relative_validity if not self.gen_min_span_tree: From 2a7cc226ca1facdd0fd64e81186d161b7a9943f9 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 6 Mar 2022 14:58:03 -0500 Subject: [PATCH 006/160] Improved `metric_params` handling --- sklearn/cluster/_hdbscan/hdbscan_.py | 81 +++++++++++++++++++++------- 1 file changed, 61 insertions(+), 20 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index 2539e5e88fdf1..9f1c2923a1821 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -524,7 +524,7 @@ def hdbscan( cluster_selection_method="eom", allow_single_cluster=False, match_reference_implementation=False, - **kwargs, + metric_params=None, ): """Perform HDBSCAN clustering from a vector array or distance matrix. @@ -638,7 +638,7 @@ def hdbscan( performance cost, ensure that the clustering results match the reference implementation. - **kwargs : optional + metric_params : dict, default=None Arguments passed to the distance metric. 
Returns @@ -740,6 +740,7 @@ def hdbscan( if min_samples == 0: min_samples = 1 + metric_params = metric_params or {} if algorithm != "best": if metric != "precomputed" and issparse(X) and algorithm != "generic": raise ValueError("Sparse data matrices only support algorithm 'generic'.") @@ -747,19 +748,46 @@ def hdbscan( if algorithm == "generic": (single_linkage_tree, result_min_span_tree) = memory.cache( _hdbscan_generic - )(X, min_samples, alpha, metric, p, leaf_size, gen_min_span_tree, **kwargs) + )( + X, + min_samples, + alpha, + metric, + p, + leaf_size, + gen_min_span_tree, + **metric_params, + ) elif algorithm == "prims_kdtree": if metric not in KDTree.valid_metrics: raise ValueError("Cannot use Prim's with KDTree for this metric!") (single_linkage_tree, result_min_span_tree) = memory.cache( _hdbscan_prims_kdtree - )(X, min_samples, alpha, metric, p, leaf_size, gen_min_span_tree, **kwargs) + )( + X, + min_samples, + alpha, + metric, + p, + leaf_size, + gen_min_span_tree, + **metric_params, + ) elif algorithm == "prims_balltree": if metric not in BallTree.valid_metrics: raise ValueError("Cannot use Prim's with BallTree for this metric!") (single_linkage_tree, result_min_span_tree) = memory.cache( _hdbscan_prims_balltree - )(X, min_samples, alpha, metric, p, leaf_size, gen_min_span_tree, **kwargs) + )( + X, + min_samples, + alpha, + metric, + p, + leaf_size, + gen_min_span_tree, + **metric_params, + ) elif algorithm == "boruvka_kdtree": if metric not in BallTree.valid_metrics: raise ValueError("Cannot use Boruvka with KDTree for this metric!") @@ -775,7 +803,7 @@ def hdbscan( approx_min_span_tree, gen_min_span_tree, core_dist_n_jobs, - **kwargs, + **metric_params, ) elif algorithm == "boruvka_balltree": if metric not in BallTree.valid_metrics: @@ -798,7 +826,7 @@ def hdbscan( approx_min_span_tree, gen_min_span_tree, core_dist_n_jobs, - **kwargs, + **metric_params, ) else: raise TypeError("Unknown algorithm type %s specified" % algorithm) @@ -808,7 +836,16 @@ def hdbscan( # We can't do much with sparse matrices ... (single_linkage_tree, result_min_span_tree) = memory.cache( _hdbscan_generic - )(X, min_samples, alpha, metric, p, leaf_size, gen_min_span_tree, **kwargs) + )( + X, + min_samples, + alpha, + metric, + p, + leaf_size, + gen_min_span_tree, + **metric_params, + ) elif metric in KDTree.valid_metrics: # TO DO: Need heuristic to decide when to go to boruvka; # still debugging for now @@ -823,7 +860,7 @@ def hdbscan( p, leaf_size, gen_min_span_tree, - **kwargs, + **metric_params, ) else: (single_linkage_tree, result_min_span_tree) = memory.cache( @@ -838,7 +875,7 @@ def hdbscan( approx_min_span_tree, gen_min_span_tree, core_dist_n_jobs, - **kwargs, + **metric_params, ) else: # Metric is a valid BallTree metric # TO DO: Need heuristic to decide when to go to boruvka; @@ -854,7 +891,7 @@ def hdbscan( p, leaf_size, gen_min_span_tree, - **kwargs, + **metric_params, ) else: (single_linkage_tree, result_min_span_tree) = memory.cache( @@ -869,7 +906,7 @@ def hdbscan( approx_min_span_tree, gen_min_span_tree, core_dist_n_jobs, - **kwargs, + **metric_params, ) return _tree_to_labels( @@ -1004,7 +1041,7 @@ class HDBSCAN(BaseEstimator, ClusterMixin): performance cost, ensure that the clustering results match the reference implementation. - **metric_params : optional + metric_params : dict, default=None Arguments passed to the distance metric. 
Attributes @@ -1121,6 +1158,9 @@ class HDBSCAN(BaseEstimator, ClusterMixin): array([ 2, 6, -1, ..., -1, -1, -1], dtype=int64) """ + def _more_tags(self): + return {"allow_nan": True} + def __init__( self, min_cluster_size=5, @@ -1140,7 +1180,7 @@ def __init__( allow_single_cluster=False, prediction_data=False, match_reference_implementation=False, - **metric_params, + metric_params=None, ): self.min_cluster_size = min_cluster_size self.min_samples = min_samples @@ -1159,7 +1199,7 @@ def __init__( self.allow_single_cluster = allow_single_cluster self.match_reference_implementation = match_reference_implementation self.prediction_data = prediction_data - self.metric_params = metric_params or {} + self.metric_params = metric_params def fit(self, X, y=None): """Perform HDBSCAN clustering from features or distance matrix. @@ -1179,6 +1219,7 @@ def fit(self, X, y=None): self : object Returns self. """ + metric_params = self.metric_params or {} if self.metric != "precomputed": # Non-precomputed matrices may contain non-finite values. # Rows with these values @@ -1207,7 +1248,7 @@ def fit(self, X, y=None): # prediction data only applies to the persistent model, so remove # it from the keyword args we pass on the the function kwargs.pop("prediction_data", None) - kwargs.update(self.metric_params) + kwargs.update(metric_params) ( self.labels_, @@ -1280,13 +1321,14 @@ def generate_prediction_data(self): warn("Metric {} not supported for prediction data!".format(self.metric)) return + metric_params = self.metric_params or {} self._prediction_data = PredictionData( self._raw_data, self.condensed_tree_, min_samples, tree_type=tree_type, metric=self.metric, - **self.metric_params, + **metric_params, ) else: warn( @@ -1358,10 +1400,9 @@ def weighted_cluster_medoid(self, cluster_id): mask = self.labels_ == cluster_id cluster_data = self._raw_data[mask] cluster_membership_strengths = self.probabilities_[mask] + metric_params = self.metric_params or {} - dist_mat = pairwise_distances( - cluster_data, metric=self.metric, **self.metric_params - ) + dist_mat = pairwise_distances(cluster_data, metric=self.metric, **metric_params) dist_mat = dist_mat * cluster_membership_strengths medoid_index = np.argmin(dist_mat.sum(axis=1)) From 97f036fd1c5bb44fde6d894d2b4b4384bb50d08c Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 6 Mar 2022 15:11:38 -0500 Subject: [PATCH 007/160] Propogated `metric_params` change to tests and other functions --- sklearn/cluster/_hdbscan/hdbscan_.py | 6 ++--- .../cluster/_hdbscan/tests/test_hdbscan.py | 19 ++++++++++++---- sklearn/cluster/_hdbscan/validity.py | 22 +++++++++---------- 3 files changed, 29 insertions(+), 18 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index 9f1c2923a1821..a5145a480a861 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -1223,7 +1223,7 @@ def fit(self, X, y=None): if self.metric != "precomputed": # Non-precomputed matrices may contain non-finite values. 
# Rows with these values - X = self._validate_data(X, force_all_finite=False) + X = self._validate_data(X, force_all_finite=False, accept_sparse="csr") self._raw_data = X self._all_finite = is_finite(X) @@ -1248,7 +1248,7 @@ def fit(self, X, y=None): # prediction data only applies to the persistent model, so remove # it from the keyword args we pass on the the function kwargs.pop("prediction_data", None) - kwargs.update(metric_params) + kwargs["metric_params"] = metric_params ( self.labels_, @@ -1453,7 +1453,7 @@ def prediction_data_(self): """ Cached data for predicting cluster labels of new or unseen points. """ - if getattr(self, "_prediction_data", None) is not None: + if getattr(self, "_prediction_data", None) is None: raise AttributeError("No prediction data was generated") else: return self._prediction_data diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index ce54c86ca8192..acc2b2b583eda 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -47,7 +47,10 @@ def test_missing_data(): - """Tests if nan data are treated as infinite distance from all other points and assigned to -1 cluster""" + """ + Tests if nan data are treated as infinite distance from all other points + and assigned to -1 cluster. + """ model = HDBSCAN().fit(X_missing_data) assert model.labels_[0] == -1 assert model.labels_[5] == -1 @@ -267,7 +270,11 @@ def test_hdbscan_high_dimensional(): assert n_clusters_1 == n_clusters labels = ( - HDBSCAN(algorithm="best", metric="seuclidean", V=np.ones(H.shape[1])) + HDBSCAN( + algorithm="best", + metric="seuclidean", + metric_params={"V": np.ones(H.shape[1])}, + ) .fit(H) .labels_ ) @@ -277,12 +284,16 @@ def test_hdbscan_high_dimensional(): def test_hdbscan_best_balltree_metric(): labels, p, persist, ctree, ltree, mtree = hdbscan( - X, metric="seuclidean", V=np.ones(X.shape[1]) + X, metric="seuclidean", metric_params={"V": np.ones(X.shape[1])} ) n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters - labels = HDBSCAN(metric="seuclidean", V=np.ones(X.shape[1])).fit(X).labels_ + labels = ( + HDBSCAN(metric="seuclidean", metric_params={"V": np.ones(X.shape[1])}) + .fit(X) + .labels_ + ) n_clusters_2 = len(set(labels)) - int(-1 in labels) assert n_clusters_2 == n_clusters diff --git a/sklearn/cluster/_hdbscan/validity.py b/sklearn/cluster/_hdbscan/validity.py index f0d86cdb36bd4..e8a1092b2d545 100644 --- a/sklearn/cluster/_hdbscan/validity.py +++ b/sklearn/cluster/_hdbscan/validity.py @@ -39,7 +39,7 @@ def all_points_core_distance(distance_matrix, d=2.0): def all_points_mutual_reachability( - X, labels, cluster_id, metric="euclidean", d=None, **kwd_args + X, labels, cluster_id, metric="euclidean", d=None, metric_params=None ): """ Compute the all-points-mutual-reachability distances for all the points of @@ -76,9 +76,8 @@ def all_points_mutual_reachability( be set in the case of metric being set to `precomputed`, where the ambient dimension of the data is unknown to the function. - **kwd_args : - Extra arguments to pass to the distance computation for other - metrics, such as minkowski, Mahanalobis etc. + metric_params : dict, default=None + Arguments passed to the distance metric. 
Returns ------- @@ -102,7 +101,8 @@ def all_points_mutual_reachability( distance_matrix = X[labels == cluster_id, :][:, labels == cluster_id] else: subset_X = X[labels == cluster_id, :] - distance_matrix = pairwise_distances(subset_X, metric=metric, **kwd_args) + metric_params = metric_params or {} + distance_matrix = pairwise_distances(subset_X, metric=metric, **metric_params) d = X.shape[1] core_distances = all_points_core_distance(distance_matrix.copy(), d=d) @@ -276,7 +276,7 @@ def density_separation( def validity_index( - X, labels, metric="euclidean", d=None, per_cluster_scores=False, kwargs=None + X, labels, metric="euclidean", d=None, per_cluster_scores=False, metric_params=None ): """ Compute the density based cluster validity index. @@ -312,9 +312,8 @@ def validity_index( Defaults to False with the function returning a single float value for the whole clustering. - kwargs : dict, default=None - Extra arguments to pass to the distance computation for other - metrics, such as minkowski, Mahanalobis etc. + metric_params : dict, default=None + Arguments passed to the distance metric. Returns ------- @@ -337,6 +336,7 @@ def validity_index( density_sparseness = {} mst_nodes = {} mst_edges = {} + metric_params = metric_params or {} max_cluster_id = labels.max() + 1 density_sep = np.inf * np.ones((max_cluster_id, max_cluster_id), dtype=np.float64) @@ -348,7 +348,7 @@ def validity_index( continue mr_distances, core_distances[cluster_id] = all_points_mutual_reachability( - X, labels, cluster_id, metric, d, **kwargs + X, labels, cluster_id, metric, d, **metric_params ) mst_nodes[cluster_id], mst_edges[cluster_id] = internal_minimum_spanning_tree( @@ -378,7 +378,7 @@ def validity_index( core_distances[i], core_distances[j], metric=metric, - **kwargs, + **metric_params, ) density_sep[j, i] = density_sep[i, j] From 8aa297a9307946a3c2f27c5b99fe2faa828f10dd Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 6 Mar 2022 16:21:57 -0500 Subject: [PATCH 008/160] Removed plotting, `to_pandas`, `to_networkx` infrastructure --- sklearn/cluster/_hdbscan/plots.py | 653 ------------------ .../cluster/_hdbscan/tests/test_hdbscan.py | 115 --- 2 files changed, 768 deletions(-) diff --git a/sklearn/cluster/_hdbscan/plots.py b/sklearn/cluster/_hdbscan/plots.py index e00a415af8aa5..ea2da11b1809b 100644 --- a/sklearn/cluster/_hdbscan/plots.py +++ b/sklearn/cluster/_hdbscan/plots.py @@ -5,10 +5,6 @@ import numpy as np -from scipy.cluster.hierarchy import dendrogram -from sklearn.manifold import TSNE -from sklearn.decomposition import PCA -from warnings import warn from ._hdbscan_tree import compute_stability, labelling_at_cut, recurse_leaf_dfs CB_LEFT = 0 @@ -284,275 +280,10 @@ def _select_clusters(self): 'Should be one of: "eom", "leaf"\n' ) - def plot( - self, - leaf_separation=1, - cmap="viridis", - select_clusters=False, - label_clusters=False, - selection_palette=None, - axis=None, - colorbar=True, - log_size=False, - max_rectangles_per_icicle=20, - ): - """Use matplotlib to plot an 'icicle plot' dendrogram of the condensed tree. - - Effectively this is a dendrogram where the width of each cluster bar is - equal to the number of points (or log of the number of points) in the cluster - at the given lambda value. Thus bars narrow as points progressively drop - out of clusters. The make the effect more apparent the bars are also colored - according the the number of points (or log of the number of points). 
- - Parameters - ---------- - leaf_separation : float, optional (default 1) - How far apart to space the final leaves of the - dendrogram. - - cmap : string or matplotlib colormap, optional (default viridis) - The matplotlib colormap to use to color the cluster bars. - - - select_clusters : boolean, optional (default False) - Whether to draw ovals highlighting which cluster - bar represent the clusters that were selected by - HDBSCAN as the final clusters. - - label_clusters : boolean, optional (default False) - If select_clusters is True then this determines - whether to draw text labels on the clusters. - - selection_palette : list of colors, optional (default None) - If not None, and at least as long as - the number of clusters, draw ovals - in colors iterating through this palette. - This can aid in cluster identification - when plotting. - - axis : matplotlib axis or None, optional (default None) - The matplotlib axis to render to. If None then a new axis - will be generated. The rendered axis will be returned. - - - colorbar : boolean, optional (default True) - Whether to draw a matplotlib colorbar displaying the range - of cluster sizes as per the colormap. - - log_size : boolean, optional (default False) - Use log scale for the 'size' of clusters (i.e. number of - points in the cluster at a given lambda value). - - - max_rectangles_per_icicle : int, optional (default 20) - To simplify the plot this method will only emit - ``max_rectangles_per_icicle`` bars per branch of the dendrogram. - This ensures that we don't suffer from massive overplotting in - cases with a lot of data points. - - Returns - ------- - axis : matplotlib axis - The axis on which the 'icicle plot' has been rendered. - """ - try: - import matplotlib.pyplot as plt - except ImportError: - raise ImportError( - "You must install the matplotlib library to plot the condensed tree." - "Use get_plot_data to calculate the relevant data without plotting." - ) - - plot_data = self.get_plot_data( - leaf_separation=leaf_separation, - log_size=log_size, - max_rectangle_per_icicle=max_rectangles_per_icicle, - ) - - if cmap != "none": - sm = plt.cm.ScalarMappable( - cmap=cmap, norm=plt.Normalize(0, max(plot_data["bar_widths"])) - ) - sm.set_array(plot_data["bar_widths"]) - bar_colors = [sm.to_rgba(x) for x in plot_data["bar_widths"]] - else: - bar_colors = "black" - - if axis is None: - axis = plt.gca() - - axis.bar( - plot_data["bar_centers"], - plot_data["bar_tops"], - bottom=plot_data["bar_bottoms"], - width=plot_data["bar_widths"], - color=bar_colors, - align="center", - linewidth=0, - ) - - drawlines = [] - for xs, ys in zip(plot_data["line_xs"], plot_data["line_ys"]): - drawlines.append(xs) - drawlines.append(ys) - axis.plot(*drawlines, color="black", linewidth=1) - # for xs, ys in zip(plot_data['line_xs'], plot_data['line_ys']): - # axis.plot(xs, ys, color='black', linewidth=1) - - if select_clusters: - try: - from matplotlib.patches import Ellipse - except ImportError: - raise ImportError( - "You must have matplotlib.patches available to plot selected" - " clusters." - ) - - chosen_clusters = self._select_clusters() - - # Extract the chosen cluster bounds. If enough duplicate data - # points exist in the data the lambda value might be infinite. - # This breaks labeling and highlighting the chosen clusters. - cluster_bounds = np.array( - [plot_data["cluster_bounds"][c] for c in chosen_clusters] - ) - if not np.isfinite(cluster_bounds).all(): - warn( - "Infinite lambda values encountered in chosen clusters." 
- " This might be due to duplicates in the data." - ) - - # Extract the plot range of the y-axis and set default center and - # height values for ellipses. Extremly dense clusters might result - # in near infinite lambda values. Setting max_height based on the - # percentile should alleviate the impact on plotting. - plot_range = np.hstack([plot_data["bar_tops"], plot_data["bar_bottoms"]]) - plot_range = plot_range[np.isfinite(plot_range)] - mean_y_center = np.mean([np.max(plot_range), np.min(plot_range)]) - max_height = np.diff(np.percentile(plot_range, q=[10, 90])) - - for i, c in enumerate(chosen_clusters): - c_bounds = plot_data["cluster_bounds"][c] - width = c_bounds[CB_RIGHT] - c_bounds[CB_LEFT] - height = c_bounds[CB_TOP] - c_bounds[CB_BOTTOM] - center = ( - np.mean([c_bounds[CB_LEFT], c_bounds[CB_RIGHT]]), - np.mean([c_bounds[CB_TOP], c_bounds[CB_BOTTOM]]), - ) - - # Set center and height to default values if necessary - if not np.isfinite(center[1]): - center = (center[0], mean_y_center) - if not np.isfinite(height): - height = max_height - - # Ensure the ellipse is visible - min_height = 0.1 * max_height - if height < min_height: - height = min_height - - if selection_palette is not None and len(selection_palette) >= len( - chosen_clusters - ): - oval_color = selection_palette[i] - else: - oval_color = "r" - - box = Ellipse( - center, - 2.0 * width, - 1.2 * height, - facecolor="none", - edgecolor=oval_color, - linewidth=2, - ) - - if label_clusters: - axis.annotate( - str(i), - xy=center, - xytext=(center[0] - 4.0 * width, center[1] + 0.65 * height), - horizontalalignment="left", - verticalalignment="bottom", - ) - - axis.add_artist(box) - - if colorbar: - cb = plt.colorbar(sm, ax=axis) - if log_size: - cb.ax.set_ylabel("log(Number of points)") - else: - cb.ax.set_ylabel("Number of points") - - axis.set_xticks([]) - for side in ("right", "top", "bottom"): - axis.spines[side].set_visible(False) - axis.invert_yaxis() - axis.set_ylabel("$\lambda$ value") - - return axis - def to_numpy(self): """Return a numpy structured array representation of the condensed tree.""" return self._raw_tree.copy() - def to_pandas(self): - """Return a pandas dataframe representation of the condensed tree. - - Each row of the dataframe corresponds to an edge in the tree. - The columns of the dataframe are `parent`, `child`, `lambda_val` - and `child_size`. - - The `parent` and `child` are the ids of the - parent and child nodes in the tree. Node ids less than the number - of points in the original dataset represent individual points, while - ids greater than the number of points are clusters. - - The `lambda_val` value is the value (1/distance) at which the `child` - node leaves the cluster. - - The `child_size` is the number of points in the `child` node. - """ - try: - from pandas import DataFrame, Series - except ImportError: - raise ImportError( - "You must have pandas installed to export pandas DataFrames" - ) - - result = DataFrame(self._raw_tree) - - return result - - def to_networkx(self): - """Return a NetworkX DiGraph object representing the condensed tree. - - Edge weights in the graph are the lamba values at which child nodes - 'leave' the parent cluster. - - Nodes have a `size` attribute attached giving the number of points - that are in the cluster (or 1 if it is a singleton point) at the - point of cluster creation (fewer points may be in the cluster at - larger lambda values). 
- """ - try: - from networkx import DiGraph, set_node_attributes - except ImportError: - raise ImportError( - "You must have networkx installed to export networkx graphs" - ) - - result = DiGraph() - for row in self._raw_tree: - result.add_edge(row["parent"], row["child"], weight=row["lambda_val"]) - - set_node_attributes( - result, dict(self._raw_tree[["child", "child_size"]]), "size" - ) - - return result - def _get_dendrogram_ordering(parent, linkage, root): @@ -601,157 +332,6 @@ class SingleLinkageTree(object): def __init__(self, linkage): self._linkage = linkage - def plot( - self, - axis=None, - truncate_mode=None, - p=0, - vary_line_width=True, - cmap="viridis", - colorbar=True, - ): - """Plot a dendrogram of the single linkage tree. - - Parameters - ---------- - truncate_mode : str, optional - The dendrogram can be hard to read when the original - observation matrix from which the linkage is derived - is large. Truncation is used to condense the dendrogram. - There are several modes: - - ``None/'none'`` - No truncation is performed (Default). - - ``'lastp'`` - The last p non-singleton formed in the linkage are the only - non-leaf nodes in the linkage; they correspond to rows - Z[n-p-2:end] in Z. All other non-singleton clusters are - contracted into leaf nodes. - - ``'level'/'mtica'`` - No more than p levels of the dendrogram tree are displayed. - This corresponds to Mathematica(TM) behavior. - - p : int, optional - The ``p`` parameter for ``truncate_mode``. - - vary_line_width : boolean, optional - Draw downward branches of the dendrogram with line thickness that - varies depending on the size of the cluster. - - cmap : string or matplotlib colormap, optional - The matplotlib colormap to use to color the cluster bars. - A value of 'none' will result in black bars. - (default 'viridis') - - colorbar : boolean, optional - Whether to draw a matplotlib colorbar displaying the range - of cluster sizes as per the colormap. (default True) - - Returns - ------- - axis : matplotlib axis - The axis on which the dendrogram plot has been rendered. - - """ - dendrogram_data = dendrogram( - self._linkage, p=p, truncate_mode=truncate_mode, no_plot=True - ) - X = dendrogram_data["icoord"] - Y = dendrogram_data["dcoord"] - - try: - import matplotlib.pyplot as plt - except ImportError: - raise ImportError( - "You must install the matplotlib library to plot the single linkage" - " tree." 
- ) - - if axis is None: - axis = plt.gca() - - if vary_line_width: - dendrogram_ordering = _get_dendrogram_ordering( - 2 * len(self._linkage), self._linkage, len(self._linkage) + 1 - ) - linewidths = _calculate_linewidths( - dendrogram_ordering, self._linkage, len(self._linkage) + 1 - ) - else: - linewidths = [(1.0, 1.0)] * len(Y) - - if cmap != "none": - color_array = np.log2(np.array(linewidths).flatten()) - sm = plt.cm.ScalarMappable( - cmap=cmap, norm=plt.Normalize(0, color_array.max()) - ) - sm.set_array(color_array) - - for x, y, lw in zip(X, Y, linewidths): - left_x = x[:2] - right_x = x[2:] - left_y = y[:2] - right_y = y[2:] - horizontal_x = x[1:3] - horizontal_y = y[1:3] - - if cmap != "none": - axis.plot( - left_x, - left_y, - color=sm.to_rgba(np.log2(lw[0])), - linewidth=np.log2(1 + lw[0]), - solid_joinstyle="miter", - solid_capstyle="butt", - ) - axis.plot( - right_x, - right_y, - color=sm.to_rgba(np.log2(lw[1])), - linewidth=np.log2(1 + lw[1]), - solid_joinstyle="miter", - solid_capstyle="butt", - ) - else: - axis.plot( - left_x, - left_y, - color="k", - linewidth=np.log2(1 + lw[0]), - solid_joinstyle="miter", - solid_capstyle="butt", - ) - axis.plot( - right_x, - right_y, - color="k", - linewidth=np.log2(1 + lw[1]), - solid_joinstyle="miter", - solid_capstyle="butt", - ) - - axis.plot( - horizontal_x, - horizontal_y, - color="k", - linewidth=1.0, - solid_joinstyle="miter", - solid_capstyle="butt", - ) - - if colorbar: - cb = plt.colorbar(sm, ax=axis) - cb.ax.set_ylabel("log(Number of points)") - - axis.set_xticks([]) - for side in ("right", "top", "bottom"): - axis.spines[side].set_visible(False) - axis.set_ylabel("distance") - - return axis - def to_numpy(self): """Return a numpy array representation of the single linkage tree. @@ -762,78 +342,6 @@ def to_numpy(self): """ return self._linkage.copy() - def to_pandas(self): - """Return a pandas dataframe representation of the single linkage tree. - - Each row of the dataframe corresponds to an edge in the tree. - The columns of the dataframe are `parent`, `left_child`, - `right_child`, `distance` and `size`. - - The `parent`, `left_child` and `right_child` are the ids of the - parent and child nodes in the tree. Node ids less than the number - of points in the original dataset represent individual points, while - ids greater than the number of points are clusters. - - The `distance` value is the at which the child nodes merge to form - the parent node. - - The `size` is the number of points in the `parent` node. - """ - try: - from pandas import DataFrame, Series - except ImportError: - raise ImportError( - "You must have pandas installed to export pandas DataFrames" - ) - - max_node = 2 * self._linkage.shape[0] - num_points = max_node - (self._linkage.shape[0] - 1) - - parent_array = np.arange(num_points, max_node + 1) - - result = DataFrame( - { - "parent": parent_array, - "left_child": self._linkage.T[0], - "right_child": self._linkage.T[1], - "distance": self._linkage.T[2], - "size": self._linkage.T[3], - } - )[["parent", "left_child", "right_child", "distance", "size"]] - - return result - - def to_networkx(self): - """Return a NetworkX DiGraph object representing the single linkage tree. - - Edge weights in the graph are the distance values at which child nodes - merge to form the parent cluster. - - Nodes have a `size` attribute attached giving the number of points - that are in the cluster. 
- """ - try: - from networkx import DiGraph, set_node_attributes - except ImportError: - raise ImportError( - "You must have networkx installed to export networkx graphs" - ) - - max_node = 2 * self._linkage.shape[0] - num_points = max_node - (self._linkage.shape[0] - 1) - - result = DiGraph() - for parent, row in enumerate(self._linkage, num_points): - result.add_edge(parent, row[0], weight=row[2]) - result.add_edge(parent, row[1], weight=row[2]) - - size_dict = { - parent: row[3] for parent, row in enumerate(self._linkage, num_points) - } - set_node_attributes(result, size_dict, "size") - - return result - def get_clusters(self, cut_distance, min_cluster_size=5): """Return a flat clustering from the single linkage hierarchy. @@ -867,167 +375,6 @@ def __init__(self, mst, data): self._mst = mst self._data = data - def plot( - self, - axis=None, - node_size=40, - node_color="k", - node_alpha=0.8, - edge_alpha=0.5, - edge_cmap="viridis_r", - edge_linewidth=2, - vary_line_width=True, - colorbar=True, - ): - """Plot the minimum spanning tree (as projected into 2D by t-SNE if required). - - Parameters - ---------- - - axis : matplotlib axis, optional - The axis to render the plot to - - node_size : int, optional - The size of nodes in the plot (default 40). - - node_color : matplotlib color spec, optional - The color to render nodes (default black). - - node_alpha : float, optional - The alpha value (between 0 and 1) to render nodes with - (default 0.8). - - edge_cmap : matplotlib colormap, optional - The colormap to color edges by (varying color by edge - weight/distance). Can be a cmap object or a string - recognised by matplotlib. (default `viridis_r`) - - edge_alpha : float, optional - The alpha value (between 0 and 1) to render edges with - (default 0.5). - - edge_linewidth : float, optional - The linewidth to use for rendering edges (default 2). - - vary_line_width : bool, optional - Edge width is proportional to (log of) the inverse of the - mutual reachability distance. (default True) - - colorbar : bool, optional - Whether to draw a colorbar. (default True) - - Returns - ------- - - axis : matplotlib axis - The axis used the render the plot. - """ - try: - import matplotlib.pyplot as plt - from matplotlib.collections import LineCollection - except ImportError: - raise ImportError( - "You must install the matplotlib library to plot the minimum spanning" - " tree." 
- ) - - if self._data.shape[0] > 32767: - warn("Too many data points for safe rendering of an minimal spanning tree!") - return None - - if axis is None: - axis = plt.gca() - - if self._data.shape[1] > 2: - # Get a 2D projection; if we have a lot of dimensions use PCA first - if self._data.shape[1] > 32: - # Use PCA to get down to 32 dimension - data_for_projection = PCA(n_components=32).fit_transform(self._data) - else: - data_for_projection = self._data.copy() - - projection = TSNE().fit_transform(data_for_projection) - else: - projection = self._data.copy() - - if vary_line_width: - line_width = edge_linewidth * ( - np.log(self._mst.T[2].max() / self._mst.T[2]) + 1.0 - ) - else: - line_width = edge_linewidth - - line_coords = projection[self._mst[:, :2].astype(int)] - line_collection = LineCollection( - line_coords, linewidth=line_width, cmap=edge_cmap, alpha=edge_alpha - ) - line_collection.set_array(self._mst[:, 2].T) - - axis.add_artist(line_collection) - axis.scatter( - projection.T[0], - projection.T[1], - c=node_color, - alpha=node_alpha, - s=node_size, - ) - axis.set_xticks([]) - axis.set_yticks([]) - - if colorbar: - cb = plt.colorbar(line_collection, ax=axis) - cb.ax.set_ylabel("Mutual reachability distance") - - return axis - def to_numpy(self): """Return a numpy array of weighted edges in the minimum spanning tree""" return self._mst.copy() - - def to_pandas(self): - """Return a Pandas dataframe of the minimum spanning tree. - - Each row is an edge in the tree; the columns are `from`, - `to`, and `distance` giving the two vertices of the edge - which are indices into the dataset, and the distance - between those datapoints. - """ - try: - from pandas import DataFrame - except ImportError: - raise ImportError( - "You must have pandas installed to export pandas DataFrames" - ) - - result = DataFrame( - { - "from": self._mst.T[0].astype(int), - "to": self._mst.T[1].astype(int), - "distance": self._mst.T[2], - } - ) - return result - - def to_networkx(self): - """Return a NetworkX Graph object representing the minimum spanning tree. - - Edge weights in the graph are the distance between the nodes they connect. - - Nodes have a `data` attribute attached giving the data vector of the - associated point. - """ - try: - from networkx import Graph, set_node_attributes - except ImportError: - raise ImportError( - "You must have networkx installed to export networkx graphs" - ) - - result = Graph() - for row in self._mst: - result.add_edge(row[0], row[1], weight=row[2]) - - data_dict = {index: tuple(row) for index, row in enumerate(self._data)} - set_node_attributes(result, data_dict, "data") - - return result diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index acc2b2b583eda..a3859d7e3337a 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -28,7 +28,6 @@ from scipy.stats import mode from tempfile import mkdtemp -from functools import wraps import pytest from sklearn import datasets @@ -62,62 +61,6 @@ def test_missing_data(): assert np.allclose(clean_model.labels_, model.labels_[clean_indices]) -def if_matplotlib(func): - """Test decorator that skips test if matplotlib not installed. 
- - Parameters - ---------- - func - """ - - @wraps(func) - def run_test(*args, **kwargs): - try: - import matplotlib - - matplotlib.use("Agg") - # this fails if no $DISPLAY specified - import matplotlib.pyplot as plt - - plt.figure() - except ImportError: - pytest.skip("Matplotlib not available.") - else: - return func(*args, **kwargs) - - return run_test - - -def if_pandas(func): - """Test decorator that skips test if pandas not installed.""" - - @wraps(func) - def run_test(*args, **kwargs): - try: - import pandas - except ImportError: - pytest.skip("Pandas not available.") - else: - return func(*args, **kwargs) - - return run_test - - -def if_networkx(func): - """Test decorator that skips test if networkx not installed.""" - - @wraps(func) - def run_test(*args, **kwargs): - try: - import networkx - except ImportError: - pytest.skip("NetworkX not available.") - else: - return func(*args, **kwargs) - - return run_test - - def generate_noisy_data(): blobs, _ = datasets.make_blobs( n_samples=200, centers=[(-0.75, 2.25), (1.0, 2.0)], cluster_std=0.25 @@ -383,48 +326,6 @@ def test_hdbscan_boruvka_balltree_matches(): assert (num_mismatches / float(data.shape[0])) < 0.15 -def test_condensed_tree_plot(): - clusterer = HDBSCAN(gen_min_span_tree=True).fit(X) - if_matplotlib(clusterer.condensed_tree_.plot)( - select_clusters=True, - label_clusters=True, - selection_palette=("r", "g", "b"), - cmap="Reds", - ) - if_matplotlib(clusterer.condensed_tree_.plot)( - log_size=True, colorbar=False, cmap="none" - ) - - -def test_single_linkage_tree_plot(): - clusterer = HDBSCAN(gen_min_span_tree=True).fit(X) - if_matplotlib(clusterer.single_linkage_tree_.plot)(cmap="Reds") - if_matplotlib(clusterer.single_linkage_tree_.plot)( - vary_line_width=False, truncate_mode="lastp", p=10, cmap="none", colorbar=False - ) - - -def test_min_span_tree_plot(): - clusterer = HDBSCAN(gen_min_span_tree=True).fit(X) - if_matplotlib(clusterer.minimum_spanning_tree_.plot)(edge_cmap="Reds") - - H, y = make_blobs(n_samples=50, random_state=0, n_features=10) - H = StandardScaler().fit_transform(H) - - clusterer = HDBSCAN(gen_min_span_tree=True).fit(H) - if_matplotlib(clusterer.minimum_spanning_tree_.plot)( - edge_cmap="Reds", vary_line_width=False, colorbar=False - ) - - H, y = make_blobs(n_samples=50, random_state=0, n_features=40) - H = StandardScaler().fit_transform(H) - - clusterer = HDBSCAN(gen_min_span_tree=True).fit(H) - if_matplotlib(clusterer.minimum_spanning_tree_.plot)( - edge_cmap="Reds", vary_line_width=False, colorbar=False - ) - - def test_tree_numpy_output_formats(): clusterer = HDBSCAN(gen_min_span_tree=True).fit(X) @@ -434,22 +335,6 @@ def test_tree_numpy_output_formats(): clusterer.minimum_spanning_tree_.to_numpy() -def test_tree_pandas_output_formats(): - - clusterer = HDBSCAN(gen_min_span_tree=True).fit(X) - if_pandas(clusterer.condensed_tree_.to_pandas)() - if_pandas(clusterer.single_linkage_tree_.to_pandas)() - if_pandas(clusterer.minimum_spanning_tree_.to_pandas)() - - -def test_tree_networkx_output_formats(): - - clusterer = HDBSCAN(gen_min_span_tree=True).fit(X) - if_networkx(clusterer.condensed_tree_.to_networkx)() - if_networkx(clusterer.single_linkage_tree_.to_networkx)() - if_networkx(clusterer.minimum_spanning_tree_.to_networkx)() - - def test_hdbscan_outliers(): clusterer = HDBSCAN(gen_min_span_tree=True).fit(X) scores = clusterer.outlier_scores_ From fe362b59a94ea878c45e9b8730da465b961337bb Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 6 Mar 2022 16:21:57 -0500 Subject: [PATCH 009/160] Removed 
plotting, `to_pandas`, `to_networkx` infrastructure --- sklearn/cluster/_hdbscan/plots.py | 844 +----------------- .../cluster/_hdbscan/tests/test_hdbscan.py | 115 --- 2 files changed, 2 insertions(+), 957 deletions(-) diff --git a/sklearn/cluster/_hdbscan/plots.py b/sklearn/cluster/_hdbscan/plots.py index e00a415af8aa5..3afdba90ffc2d 100644 --- a/sklearn/cluster/_hdbscan/plots.py +++ b/sklearn/cluster/_hdbscan/plots.py @@ -5,10 +5,6 @@ import numpy as np -from scipy.cluster.hierarchy import dendrogram -from sklearn.manifold import TSNE -from sklearn.decomposition import PCA -from warnings import warn from ._hdbscan_tree import compute_stability, labelling_at_cut, recurse_leaf_dfs CB_LEFT = 0 @@ -80,173 +76,6 @@ def __init__( self.cluster_selection_method = cluster_selection_method self.allow_single_cluster = allow_single_cluster - def get_plot_data( - self, leaf_separation=1, log_size=False, max_rectangle_per_icicle=20 - ): - """Generates data for use in plotting the 'icicle plot' or dendrogram - plot of the condensed tree generated by HDBSCAN. - - Parameters - ---------- - leaf_separation : float, optional - How far apart to space the final leaves of the - dendrogram. (default 1) - - log_size : boolean, optional - Use log scale for the 'size' of clusters (i.e. number of - points in the cluster at a given lambda value). - (default False) - - max_rectangles_per_icicle : int, optional - To simplify the plot this method will only emit - ``max_rectangles_per_icicle`` bars per branch of the dendrogram. - This ensures that we don't suffer from massive overplotting in - cases with a lot of data points. - - Returns - ------- - plot_data : dict - Data associated to bars in a bar plot: - `bar_centers` x coordinate centers for bars - `bar_tops` heights of bars in lambda scale - `bar_bottoms` y coordinate of bottoms of bars - `bar_widths` widths of the bars (in x coord scale) - `bar_bounds` a 4-tuple of [left, right, bottom, top] - giving the bounds on a full set of - cluster bars - Data associates with cluster splits: - `line_xs` x coordinates for horizontal dendrogram lines - `line_ys` y coordinates for horizontal dendrogram lines - """ - leaves = _get_leaves(self._raw_tree) - last_leaf = self._raw_tree["parent"].max() - root = self._raw_tree["parent"].min() - - # We want to get the x and y coordinates for the start of each cluster - # Initialize the leaves, since we know where they go, the iterate - # through everything from the leaves back, setting coords as we go - if isinstance(leaves, np.int64): - cluster_x_coords = {leaves: leaf_separation} - else: - cluster_x_coords = dict( - zip(leaves, [leaf_separation * x for x in range(len(leaves))]) - ) - cluster_y_coords = {root: 0.0} - - for cluster in range(last_leaf, root - 1, -1): - split = self._raw_tree[["child", "lambda_val"]] - split = split[ - (self._raw_tree["parent"] == cluster) - & (self._raw_tree["child_size"] > 1) - ] - if len(split["child"]) > 1: - left_child, right_child = split["child"] - cluster_x_coords[cluster] = np.mean( - [cluster_x_coords[left_child], cluster_x_coords[right_child]] - ) - cluster_y_coords[left_child] = split["lambda_val"][0] - cluster_y_coords[right_child] = split["lambda_val"][1] - - # We use bars to plot the 'icicles', so we need to generate centers, tops, - # bottoms and widths for each rectangle. We can go through each cluster - # and do this for each in turn. 
- bar_centers = [] - bar_tops = [] - bar_bottoms = [] - bar_widths = [] - - cluster_bounds = {} - - scaling = np.sum(self._raw_tree[self._raw_tree["parent"] == root]["child_size"]) - - if log_size: - scaling = np.log(scaling) - - for c in range(last_leaf, root - 1, -1): - - cluster_bounds[c] = [0, 0, 0, 0] - - c_children = self._raw_tree[self._raw_tree["parent"] == c] - current_size = np.sum(c_children["child_size"]) - current_lambda = cluster_y_coords[c] - cluster_max_size = current_size - cluster_max_lambda = c_children["lambda_val"].max() - cluster_min_size = np.sum( - c_children[c_children["lambda_val"] == cluster_max_lambda]["child_size"] - ) - - if log_size: - current_size = np.log(current_size) - cluster_max_size = np.log(cluster_max_size) - cluster_min_size = np.log(cluster_min_size) - - total_size_change = float(cluster_max_size - cluster_min_size) - step_size_change = total_size_change / max_rectangle_per_icicle - - cluster_bounds[c][CB_LEFT] = cluster_x_coords[c] * scaling - ( - current_size / 2.0 - ) - cluster_bounds[c][CB_RIGHT] = cluster_x_coords[c] * scaling + ( - current_size / 2.0 - ) - cluster_bounds[c][CB_BOTTOM] = cluster_y_coords[c] - cluster_bounds[c][CB_TOP] = np.max(c_children["lambda_val"]) - - last_step_size = current_size - last_step_lambda = current_lambda - - for i in np.argsort(c_children["lambda_val"]): - row = c_children[i] - if row["lambda_val"] != current_lambda and ( - last_step_size - current_size > step_size_change - or row["lambda_val"] == cluster_max_lambda - ): - bar_centers.append(cluster_x_coords[c] * scaling) - bar_tops.append(row["lambda_val"] - last_step_lambda) - bar_bottoms.append(last_step_lambda) - bar_widths.append(last_step_size) - last_step_size = current_size - last_step_lambda = current_lambda - if log_size: - exp_size = np.exp(current_size) - row["child_size"] - # Ensure we don't try to take log of zero - if exp_size > 0.01: - current_size = np.log(np.exp(current_size) - row["child_size"]) - else: - current_size = 0.0 - else: - current_size -= row["child_size"] - current_lambda = row["lambda_val"] - - # Finally we need the horizontal lines that occur at cluster splits. - line_xs = [] - line_ys = [] - - for row in self._raw_tree[self._raw_tree["child_size"] > 1]: - parent = row["parent"] - child = row["child"] - child_size = row["child_size"] - if log_size: - child_size = np.log(child_size) - sign = np.sign(cluster_x_coords[child] - cluster_x_coords[parent]) - line_xs.append( - [ - cluster_x_coords[parent] * scaling, - cluster_x_coords[child] * scaling + sign * (child_size / 2.0), - ] - ) - line_ys.append([cluster_y_coords[child], cluster_y_coords[child]]) - - return { - "bar_centers": bar_centers, - "bar_tops": bar_tops, - "bar_bottoms": bar_bottoms, - "bar_widths": bar_widths, - "line_xs": line_xs, - "line_ys": line_ys, - "cluster_bounds": cluster_bounds, - } - def _select_clusters(self): if self.cluster_selection_method == "eom": stability = compute_stability(self._raw_tree) @@ -284,275 +113,10 @@ def _select_clusters(self): 'Should be one of: "eom", "leaf"\n' ) - def plot( - self, - leaf_separation=1, - cmap="viridis", - select_clusters=False, - label_clusters=False, - selection_palette=None, - axis=None, - colorbar=True, - log_size=False, - max_rectangles_per_icicle=20, - ): - """Use matplotlib to plot an 'icicle plot' dendrogram of the condensed tree. 
- - Effectively this is a dendrogram where the width of each cluster bar is - equal to the number of points (or log of the number of points) in the cluster - at the given lambda value. Thus bars narrow as points progressively drop - out of clusters. The make the effect more apparent the bars are also colored - according the the number of points (or log of the number of points). - - Parameters - ---------- - leaf_separation : float, optional (default 1) - How far apart to space the final leaves of the - dendrogram. - - cmap : string or matplotlib colormap, optional (default viridis) - The matplotlib colormap to use to color the cluster bars. - - - select_clusters : boolean, optional (default False) - Whether to draw ovals highlighting which cluster - bar represent the clusters that were selected by - HDBSCAN as the final clusters. - - label_clusters : boolean, optional (default False) - If select_clusters is True then this determines - whether to draw text labels on the clusters. - - selection_palette : list of colors, optional (default None) - If not None, and at least as long as - the number of clusters, draw ovals - in colors iterating through this palette. - This can aid in cluster identification - when plotting. - - axis : matplotlib axis or None, optional (default None) - The matplotlib axis to render to. If None then a new axis - will be generated. The rendered axis will be returned. - - - colorbar : boolean, optional (default True) - Whether to draw a matplotlib colorbar displaying the range - of cluster sizes as per the colormap. - - log_size : boolean, optional (default False) - Use log scale for the 'size' of clusters (i.e. number of - points in the cluster at a given lambda value). - - - max_rectangles_per_icicle : int, optional (default 20) - To simplify the plot this method will only emit - ``max_rectangles_per_icicle`` bars per branch of the dendrogram. - This ensures that we don't suffer from massive overplotting in - cases with a lot of data points. - - Returns - ------- - axis : matplotlib axis - The axis on which the 'icicle plot' has been rendered. - """ - try: - import matplotlib.pyplot as plt - except ImportError: - raise ImportError( - "You must install the matplotlib library to plot the condensed tree." - "Use get_plot_data to calculate the relevant data without plotting." - ) - - plot_data = self.get_plot_data( - leaf_separation=leaf_separation, - log_size=log_size, - max_rectangle_per_icicle=max_rectangles_per_icicle, - ) - - if cmap != "none": - sm = plt.cm.ScalarMappable( - cmap=cmap, norm=plt.Normalize(0, max(plot_data["bar_widths"])) - ) - sm.set_array(plot_data["bar_widths"]) - bar_colors = [sm.to_rgba(x) for x in plot_data["bar_widths"]] - else: - bar_colors = "black" - - if axis is None: - axis = plt.gca() - - axis.bar( - plot_data["bar_centers"], - plot_data["bar_tops"], - bottom=plot_data["bar_bottoms"], - width=plot_data["bar_widths"], - color=bar_colors, - align="center", - linewidth=0, - ) - - drawlines = [] - for xs, ys in zip(plot_data["line_xs"], plot_data["line_ys"]): - drawlines.append(xs) - drawlines.append(ys) - axis.plot(*drawlines, color="black", linewidth=1) - # for xs, ys in zip(plot_data['line_xs'], plot_data['line_ys']): - # axis.plot(xs, ys, color='black', linewidth=1) - - if select_clusters: - try: - from matplotlib.patches import Ellipse - except ImportError: - raise ImportError( - "You must have matplotlib.patches available to plot selected" - " clusters." 
- ) - - chosen_clusters = self._select_clusters() - - # Extract the chosen cluster bounds. If enough duplicate data - # points exist in the data the lambda value might be infinite. - # This breaks labeling and highlighting the chosen clusters. - cluster_bounds = np.array( - [plot_data["cluster_bounds"][c] for c in chosen_clusters] - ) - if not np.isfinite(cluster_bounds).all(): - warn( - "Infinite lambda values encountered in chosen clusters." - " This might be due to duplicates in the data." - ) - - # Extract the plot range of the y-axis and set default center and - # height values for ellipses. Extremly dense clusters might result - # in near infinite lambda values. Setting max_height based on the - # percentile should alleviate the impact on plotting. - plot_range = np.hstack([plot_data["bar_tops"], plot_data["bar_bottoms"]]) - plot_range = plot_range[np.isfinite(plot_range)] - mean_y_center = np.mean([np.max(plot_range), np.min(plot_range)]) - max_height = np.diff(np.percentile(plot_range, q=[10, 90])) - - for i, c in enumerate(chosen_clusters): - c_bounds = plot_data["cluster_bounds"][c] - width = c_bounds[CB_RIGHT] - c_bounds[CB_LEFT] - height = c_bounds[CB_TOP] - c_bounds[CB_BOTTOM] - center = ( - np.mean([c_bounds[CB_LEFT], c_bounds[CB_RIGHT]]), - np.mean([c_bounds[CB_TOP], c_bounds[CB_BOTTOM]]), - ) - - # Set center and height to default values if necessary - if not np.isfinite(center[1]): - center = (center[0], mean_y_center) - if not np.isfinite(height): - height = max_height - - # Ensure the ellipse is visible - min_height = 0.1 * max_height - if height < min_height: - height = min_height - - if selection_palette is not None and len(selection_palette) >= len( - chosen_clusters - ): - oval_color = selection_palette[i] - else: - oval_color = "r" - - box = Ellipse( - center, - 2.0 * width, - 1.2 * height, - facecolor="none", - edgecolor=oval_color, - linewidth=2, - ) - - if label_clusters: - axis.annotate( - str(i), - xy=center, - xytext=(center[0] - 4.0 * width, center[1] + 0.65 * height), - horizontalalignment="left", - verticalalignment="bottom", - ) - - axis.add_artist(box) - - if colorbar: - cb = plt.colorbar(sm, ax=axis) - if log_size: - cb.ax.set_ylabel("log(Number of points)") - else: - cb.ax.set_ylabel("Number of points") - - axis.set_xticks([]) - for side in ("right", "top", "bottom"): - axis.spines[side].set_visible(False) - axis.invert_yaxis() - axis.set_ylabel("$\lambda$ value") - - return axis - def to_numpy(self): """Return a numpy structured array representation of the condensed tree.""" return self._raw_tree.copy() - def to_pandas(self): - """Return a pandas dataframe representation of the condensed tree. - - Each row of the dataframe corresponds to an edge in the tree. - The columns of the dataframe are `parent`, `child`, `lambda_val` - and `child_size`. - - The `parent` and `child` are the ids of the - parent and child nodes in the tree. Node ids less than the number - of points in the original dataset represent individual points, while - ids greater than the number of points are clusters. - - The `lambda_val` value is the value (1/distance) at which the `child` - node leaves the cluster. - - The `child_size` is the number of points in the `child` node. 
- """ - try: - from pandas import DataFrame, Series - except ImportError: - raise ImportError( - "You must have pandas installed to export pandas DataFrames" - ) - - result = DataFrame(self._raw_tree) - - return result - - def to_networkx(self): - """Return a NetworkX DiGraph object representing the condensed tree. - - Edge weights in the graph are the lamba values at which child nodes - 'leave' the parent cluster. - - Nodes have a `size` attribute attached giving the number of points - that are in the cluster (or 1 if it is a singleton point) at the - point of cluster creation (fewer points may be in the cluster at - larger lambda values). - """ - try: - from networkx import DiGraph, set_node_attributes - except ImportError: - raise ImportError( - "You must have networkx installed to export networkx graphs" - ) - - result = DiGraph() - for row in self._raw_tree: - result.add_edge(row["parent"], row["child"], weight=row["lambda_val"]) - - set_node_attributes( - result, dict(self._raw_tree[["child", "child_size"]]), "size" - ) - - return result - def _get_dendrogram_ordering(parent, linkage, root): @@ -566,26 +130,6 @@ def _get_dendrogram_ordering(parent, linkage, root): ) -def _calculate_linewidths(ordering, linkage, root): - - linewidths = [] - - for x in ordering: - if linkage[x - root][0] >= root: - left_width = linkage[int(linkage[x - root][0]) - root][3] - else: - left_width = 1 - - if linkage[x - root][1] >= root: - right_width = linkage[int(linkage[x - root][1]) - root][3] - else: - right_width = 1 - - linewidths.append((left_width, right_width)) - - return linewidths - - class SingleLinkageTree(object): """A single linkage format dendrogram tree, with plotting functionality and networkX support. @@ -601,157 +145,6 @@ class SingleLinkageTree(object): def __init__(self, linkage): self._linkage = linkage - def plot( - self, - axis=None, - truncate_mode=None, - p=0, - vary_line_width=True, - cmap="viridis", - colorbar=True, - ): - """Plot a dendrogram of the single linkage tree. - - Parameters - ---------- - truncate_mode : str, optional - The dendrogram can be hard to read when the original - observation matrix from which the linkage is derived - is large. Truncation is used to condense the dendrogram. - There are several modes: - - ``None/'none'`` - No truncation is performed (Default). - - ``'lastp'`` - The last p non-singleton formed in the linkage are the only - non-leaf nodes in the linkage; they correspond to rows - Z[n-p-2:end] in Z. All other non-singleton clusters are - contracted into leaf nodes. - - ``'level'/'mtica'`` - No more than p levels of the dendrogram tree are displayed. - This corresponds to Mathematica(TM) behavior. - - p : int, optional - The ``p`` parameter for ``truncate_mode``. - - vary_line_width : boolean, optional - Draw downward branches of the dendrogram with line thickness that - varies depending on the size of the cluster. - - cmap : string or matplotlib colormap, optional - The matplotlib colormap to use to color the cluster bars. - A value of 'none' will result in black bars. - (default 'viridis') - - colorbar : boolean, optional - Whether to draw a matplotlib colorbar displaying the range - of cluster sizes as per the colormap. (default True) - - Returns - ------- - axis : matplotlib axis - The axis on which the dendrogram plot has been rendered. 
- - """ - dendrogram_data = dendrogram( - self._linkage, p=p, truncate_mode=truncate_mode, no_plot=True - ) - X = dendrogram_data["icoord"] - Y = dendrogram_data["dcoord"] - - try: - import matplotlib.pyplot as plt - except ImportError: - raise ImportError( - "You must install the matplotlib library to plot the single linkage" - " tree." - ) - - if axis is None: - axis = plt.gca() - - if vary_line_width: - dendrogram_ordering = _get_dendrogram_ordering( - 2 * len(self._linkage), self._linkage, len(self._linkage) + 1 - ) - linewidths = _calculate_linewidths( - dendrogram_ordering, self._linkage, len(self._linkage) + 1 - ) - else: - linewidths = [(1.0, 1.0)] * len(Y) - - if cmap != "none": - color_array = np.log2(np.array(linewidths).flatten()) - sm = plt.cm.ScalarMappable( - cmap=cmap, norm=plt.Normalize(0, color_array.max()) - ) - sm.set_array(color_array) - - for x, y, lw in zip(X, Y, linewidths): - left_x = x[:2] - right_x = x[2:] - left_y = y[:2] - right_y = y[2:] - horizontal_x = x[1:3] - horizontal_y = y[1:3] - - if cmap != "none": - axis.plot( - left_x, - left_y, - color=sm.to_rgba(np.log2(lw[0])), - linewidth=np.log2(1 + lw[0]), - solid_joinstyle="miter", - solid_capstyle="butt", - ) - axis.plot( - right_x, - right_y, - color=sm.to_rgba(np.log2(lw[1])), - linewidth=np.log2(1 + lw[1]), - solid_joinstyle="miter", - solid_capstyle="butt", - ) - else: - axis.plot( - left_x, - left_y, - color="k", - linewidth=np.log2(1 + lw[0]), - solid_joinstyle="miter", - solid_capstyle="butt", - ) - axis.plot( - right_x, - right_y, - color="k", - linewidth=np.log2(1 + lw[1]), - solid_joinstyle="miter", - solid_capstyle="butt", - ) - - axis.plot( - horizontal_x, - horizontal_y, - color="k", - linewidth=1.0, - solid_joinstyle="miter", - solid_capstyle="butt", - ) - - if colorbar: - cb = plt.colorbar(sm, ax=axis) - cb.ax.set_ylabel("log(Number of points)") - - axis.set_xticks([]) - for side in ("right", "top", "bottom"): - axis.spines[side].set_visible(False) - axis.set_ylabel("distance") - - return axis - def to_numpy(self): """Return a numpy array representation of the single linkage tree. @@ -762,78 +155,6 @@ def to_numpy(self): """ return self._linkage.copy() - def to_pandas(self): - """Return a pandas dataframe representation of the single linkage tree. - - Each row of the dataframe corresponds to an edge in the tree. - The columns of the dataframe are `parent`, `left_child`, - `right_child`, `distance` and `size`. - - The `parent`, `left_child` and `right_child` are the ids of the - parent and child nodes in the tree. Node ids less than the number - of points in the original dataset represent individual points, while - ids greater than the number of points are clusters. - - The `distance` value is the at which the child nodes merge to form - the parent node. - - The `size` is the number of points in the `parent` node. 
- """ - try: - from pandas import DataFrame, Series - except ImportError: - raise ImportError( - "You must have pandas installed to export pandas DataFrames" - ) - - max_node = 2 * self._linkage.shape[0] - num_points = max_node - (self._linkage.shape[0] - 1) - - parent_array = np.arange(num_points, max_node + 1) - - result = DataFrame( - { - "parent": parent_array, - "left_child": self._linkage.T[0], - "right_child": self._linkage.T[1], - "distance": self._linkage.T[2], - "size": self._linkage.T[3], - } - )[["parent", "left_child", "right_child", "distance", "size"]] - - return result - - def to_networkx(self): - """Return a NetworkX DiGraph object representing the single linkage tree. - - Edge weights in the graph are the distance values at which child nodes - merge to form the parent cluster. - - Nodes have a `size` attribute attached giving the number of points - that are in the cluster. - """ - try: - from networkx import DiGraph, set_node_attributes - except ImportError: - raise ImportError( - "You must have networkx installed to export networkx graphs" - ) - - max_node = 2 * self._linkage.shape[0] - num_points = max_node - (self._linkage.shape[0] - 1) - - result = DiGraph() - for parent, row in enumerate(self._linkage, num_points): - result.add_edge(parent, row[0], weight=row[2]) - result.add_edge(parent, row[1], weight=row[2]) - - size_dict = { - parent: row[3] for parent, row in enumerate(self._linkage, num_points) - } - set_node_attributes(result, size_dict, "size") - - return result - def get_clusters(self, cut_distance, min_cluster_size=5): """Return a flat clustering from the single linkage hierarchy. @@ -848,14 +169,14 @@ def get_clusters(self, cut_distance, min_cluster_size=5): The mutual reachability distance cut value to use to generate a flat clustering. - min_cluster_size : int, optional + min_cluster_size : int, default=5 Clusters smaller than this value with be called 'noise' and remain unclustered in the resulting flat clustering. Returns ------- - labels : array [n_samples] + labels : array (n_samples,) An array of cluster labels, one per datapoint. Unclustered points are assigned the label -1. """ @@ -867,167 +188,6 @@ def __init__(self, mst, data): self._mst = mst self._data = data - def plot( - self, - axis=None, - node_size=40, - node_color="k", - node_alpha=0.8, - edge_alpha=0.5, - edge_cmap="viridis_r", - edge_linewidth=2, - vary_line_width=True, - colorbar=True, - ): - """Plot the minimum spanning tree (as projected into 2D by t-SNE if required). - - Parameters - ---------- - - axis : matplotlib axis, optional - The axis to render the plot to - - node_size : int, optional - The size of nodes in the plot (default 40). - - node_color : matplotlib color spec, optional - The color to render nodes (default black). - - node_alpha : float, optional - The alpha value (between 0 and 1) to render nodes with - (default 0.8). - - edge_cmap : matplotlib colormap, optional - The colormap to color edges by (varying color by edge - weight/distance). Can be a cmap object or a string - recognised by matplotlib. (default `viridis_r`) - - edge_alpha : float, optional - The alpha value (between 0 and 1) to render edges with - (default 0.5). - - edge_linewidth : float, optional - The linewidth to use for rendering edges (default 2). - - vary_line_width : bool, optional - Edge width is proportional to (log of) the inverse of the - mutual reachability distance. (default True) - - colorbar : bool, optional - Whether to draw a colorbar. 
(default True) - - Returns - ------- - - axis : matplotlib axis - The axis used the render the plot. - """ - try: - import matplotlib.pyplot as plt - from matplotlib.collections import LineCollection - except ImportError: - raise ImportError( - "You must install the matplotlib library to plot the minimum spanning" - " tree." - ) - - if self._data.shape[0] > 32767: - warn("Too many data points for safe rendering of an minimal spanning tree!") - return None - - if axis is None: - axis = plt.gca() - - if self._data.shape[1] > 2: - # Get a 2D projection; if we have a lot of dimensions use PCA first - if self._data.shape[1] > 32: - # Use PCA to get down to 32 dimension - data_for_projection = PCA(n_components=32).fit_transform(self._data) - else: - data_for_projection = self._data.copy() - - projection = TSNE().fit_transform(data_for_projection) - else: - projection = self._data.copy() - - if vary_line_width: - line_width = edge_linewidth * ( - np.log(self._mst.T[2].max() / self._mst.T[2]) + 1.0 - ) - else: - line_width = edge_linewidth - - line_coords = projection[self._mst[:, :2].astype(int)] - line_collection = LineCollection( - line_coords, linewidth=line_width, cmap=edge_cmap, alpha=edge_alpha - ) - line_collection.set_array(self._mst[:, 2].T) - - axis.add_artist(line_collection) - axis.scatter( - projection.T[0], - projection.T[1], - c=node_color, - alpha=node_alpha, - s=node_size, - ) - axis.set_xticks([]) - axis.set_yticks([]) - - if colorbar: - cb = plt.colorbar(line_collection, ax=axis) - cb.ax.set_ylabel("Mutual reachability distance") - - return axis - def to_numpy(self): """Return a numpy array of weighted edges in the minimum spanning tree""" return self._mst.copy() - - def to_pandas(self): - """Return a Pandas dataframe of the minimum spanning tree. - - Each row is an edge in the tree; the columns are `from`, - `to`, and `distance` giving the two vertices of the edge - which are indices into the dataset, and the distance - between those datapoints. - """ - try: - from pandas import DataFrame - except ImportError: - raise ImportError( - "You must have pandas installed to export pandas DataFrames" - ) - - result = DataFrame( - { - "from": self._mst.T[0].astype(int), - "to": self._mst.T[1].astype(int), - "distance": self._mst.T[2], - } - ) - return result - - def to_networkx(self): - """Return a NetworkX Graph object representing the minimum spanning tree. - - Edge weights in the graph are the distance between the nodes they connect. - - Nodes have a `data` attribute attached giving the data vector of the - associated point. 
- """ - try: - from networkx import Graph, set_node_attributes - except ImportError: - raise ImportError( - "You must have networkx installed to export networkx graphs" - ) - - result = Graph() - for row in self._mst: - result.add_edge(row[0], row[1], weight=row[2]) - - data_dict = {index: tuple(row) for index, row in enumerate(self._data)} - set_node_attributes(result, data_dict, "data") - - return result diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index acc2b2b583eda..a3859d7e3337a 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -28,7 +28,6 @@ from scipy.stats import mode from tempfile import mkdtemp -from functools import wraps import pytest from sklearn import datasets @@ -62,62 +61,6 @@ def test_missing_data(): assert np.allclose(clean_model.labels_, model.labels_[clean_indices]) -def if_matplotlib(func): - """Test decorator that skips test if matplotlib not installed. - - Parameters - ---------- - func - """ - - @wraps(func) - def run_test(*args, **kwargs): - try: - import matplotlib - - matplotlib.use("Agg") - # this fails if no $DISPLAY specified - import matplotlib.pyplot as plt - - plt.figure() - except ImportError: - pytest.skip("Matplotlib not available.") - else: - return func(*args, **kwargs) - - return run_test - - -def if_pandas(func): - """Test decorator that skips test if pandas not installed.""" - - @wraps(func) - def run_test(*args, **kwargs): - try: - import pandas - except ImportError: - pytest.skip("Pandas not available.") - else: - return func(*args, **kwargs) - - return run_test - - -def if_networkx(func): - """Test decorator that skips test if networkx not installed.""" - - @wraps(func) - def run_test(*args, **kwargs): - try: - import networkx - except ImportError: - pytest.skip("NetworkX not available.") - else: - return func(*args, **kwargs) - - return run_test - - def generate_noisy_data(): blobs, _ = datasets.make_blobs( n_samples=200, centers=[(-0.75, 2.25), (1.0, 2.0)], cluster_std=0.25 @@ -383,48 +326,6 @@ def test_hdbscan_boruvka_balltree_matches(): assert (num_mismatches / float(data.shape[0])) < 0.15 -def test_condensed_tree_plot(): - clusterer = HDBSCAN(gen_min_span_tree=True).fit(X) - if_matplotlib(clusterer.condensed_tree_.plot)( - select_clusters=True, - label_clusters=True, - selection_palette=("r", "g", "b"), - cmap="Reds", - ) - if_matplotlib(clusterer.condensed_tree_.plot)( - log_size=True, colorbar=False, cmap="none" - ) - - -def test_single_linkage_tree_plot(): - clusterer = HDBSCAN(gen_min_span_tree=True).fit(X) - if_matplotlib(clusterer.single_linkage_tree_.plot)(cmap="Reds") - if_matplotlib(clusterer.single_linkage_tree_.plot)( - vary_line_width=False, truncate_mode="lastp", p=10, cmap="none", colorbar=False - ) - - -def test_min_span_tree_plot(): - clusterer = HDBSCAN(gen_min_span_tree=True).fit(X) - if_matplotlib(clusterer.minimum_spanning_tree_.plot)(edge_cmap="Reds") - - H, y = make_blobs(n_samples=50, random_state=0, n_features=10) - H = StandardScaler().fit_transform(H) - - clusterer = HDBSCAN(gen_min_span_tree=True).fit(H) - if_matplotlib(clusterer.minimum_spanning_tree_.plot)( - edge_cmap="Reds", vary_line_width=False, colorbar=False - ) - - H, y = make_blobs(n_samples=50, random_state=0, n_features=40) - H = StandardScaler().fit_transform(H) - - clusterer = HDBSCAN(gen_min_span_tree=True).fit(H) - if_matplotlib(clusterer.minimum_spanning_tree_.plot)( - edge_cmap="Reds", vary_line_width=False, 
colorbar=False - ) - - def test_tree_numpy_output_formats(): clusterer = HDBSCAN(gen_min_span_tree=True).fit(X) @@ -434,22 +335,6 @@ def test_tree_numpy_output_formats(): clusterer.minimum_spanning_tree_.to_numpy() -def test_tree_pandas_output_formats(): - - clusterer = HDBSCAN(gen_min_span_tree=True).fit(X) - if_pandas(clusterer.condensed_tree_.to_pandas)() - if_pandas(clusterer.single_linkage_tree_.to_pandas)() - if_pandas(clusterer.minimum_spanning_tree_.to_pandas)() - - -def test_tree_networkx_output_formats(): - - clusterer = HDBSCAN(gen_min_span_tree=True).fit(X) - if_networkx(clusterer.condensed_tree_.to_networkx)() - if_networkx(clusterer.single_linkage_tree_.to_networkx)() - if_networkx(clusterer.minimum_spanning_tree_.to_networkx)() - - def test_hdbscan_outliers(): clusterer = HDBSCAN(gen_min_span_tree=True).fit(X) scores = clusterer.outlier_scores_ From fda93505f46ff3b372674435923f9c2fb8a04520 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 6 Mar 2022 16:31:36 -0500 Subject: [PATCH 010/160] Renamed `plots.py`-->`_trees.py` --- sklearn/cluster/_hdbscan/{plots.py => _trees.py} | 0 sklearn/cluster/_hdbscan/flat.py | 2 +- sklearn/cluster/_hdbscan/hdbscan_.py | 2 +- sklearn/cluster/_hdbscan/robust_single_linkage_.py | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename sklearn/cluster/_hdbscan/{plots.py => _trees.py} (100%) diff --git a/sklearn/cluster/_hdbscan/plots.py b/sklearn/cluster/_hdbscan/_trees.py similarity index 100% rename from sklearn/cluster/_hdbscan/plots.py rename to sklearn/cluster/_hdbscan/_trees.py diff --git a/sklearn/cluster/_hdbscan/flat.py b/sklearn/cluster/_hdbscan/flat.py index 9455ffb963364..eaff77d1645d6 100644 --- a/sklearn/cluster/_hdbscan/flat.py +++ b/sklearn/cluster/_hdbscan/flat.py @@ -32,7 +32,7 @@ import numpy as np from ._hdbscan_tree import compute_stability, get_cluster_tree_leaves from .hdbscan_ import HDBSCAN, _tree_to_labels -from .plots import _bfs_from_cluster_tree +from ._trees import _bfs_from_cluster_tree from .prediction import ( PredictionData, _find_cluster_and_probability, diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index a5145a480a861..c5da73de4e66f 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -34,7 +34,7 @@ from ._hdbscan_boruvka import KDTreeBoruvkaAlgorithm, BallTreeBoruvkaAlgorithm from .dist_metrics import DistanceMetric -from .plots import CondensedTree, SingleLinkageTree, MinimumSpanningTree +from ._trees import CondensedTree, SingleLinkageTree, MinimumSpanningTree from .prediction import PredictionData FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics + ["cosine", "arccos"] diff --git a/sklearn/cluster/_hdbscan/robust_single_linkage_.py b/sklearn/cluster/_hdbscan/robust_single_linkage_.py index 5ecc1f173549a..1d668ff1c00d7 100644 --- a/sklearn/cluster/_hdbscan/robust_single_linkage_.py +++ b/sklearn/cluster/_hdbscan/robust_single_linkage_.py @@ -15,7 +15,7 @@ from ._hdbscan_boruvka import KDTreeBoruvkaAlgorithm, BallTreeBoruvkaAlgorithm from .dist_metrics import DistanceMetric from ._hdbscan_reachability import mutual_reachability -from .plots import SingleLinkageTree +from ._trees import SingleLinkageTree from sklearn.neighbors import KDTree, BallTree # Author: Leland McInnes From 7478586b2a8a81e1f05660e6a688674bb6c82cd9 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 6 Mar 2022 16:38:40 -0500 Subject: [PATCH 011/160] Fixed package namespace in `cluster/__init__.py` --- sklearn/cluster/__init__.py | 7 
++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/__init__.py b/sklearn/cluster/__init__.py index 6033f589d5155..8d89bff955d8c 100644 --- a/sklearn/cluster/__init__.py +++ b/sklearn/cluster/__init__.py @@ -31,6 +31,12 @@ all_points_membership_vectors, approximate_predict_scores, ) +from ._hdbscan.flat import ( + HDBSCAN_flat, + approximate_predict_flat, + membership_vector_flat, + all_points_membership_vectors_flat, +) __all__ = [ "AffinityPropagation", @@ -71,5 +77,4 @@ "approximate_predict_flat", "membership_vector_flat", "all_points_membership_vectors_flat", - "safe_always_positive_division", ] From cd1edc45c3a663c1642344202c0cec504d36927f Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 6 Mar 2022 16:48:00 -0500 Subject: [PATCH 012/160] Drop-in replaced private `dist_metrics` with `metrics.dist_metrics` --- sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx | 2 +- sklearn/cluster/_hdbscan/dist_metrics.pxd | 94 -- sklearn/cluster/_hdbscan/dist_metrics.pyx | 1147 ----------------- sklearn/cluster/_hdbscan/hdbscan_.py | 2 +- .../_hdbscan/robust_single_linkage_.py | 2 +- 5 files changed, 3 insertions(+), 1244 deletions(-) delete mode 100644 sklearn/cluster/_hdbscan/dist_metrics.pxd delete mode 100644 sklearn/cluster/_hdbscan/dist_metrics.pyx diff --git a/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx b/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx index ddb1db48e8622..82c7bcebef6b3 100644 --- a/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx @@ -11,7 +11,7 @@ import cython from libc.float cimport DBL_MAX from libc.stdio cimport printf -from .dist_metrics cimport DistanceMetric +from sklearn.metrics._dist_metrics cimport DistanceMetric cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core( diff --git a/sklearn/cluster/_hdbscan/dist_metrics.pxd b/sklearn/cluster/_hdbscan/dist_metrics.pxd deleted file mode 100644 index df3c8af85b105..0000000000000 --- a/sklearn/cluster/_hdbscan/dist_metrics.pxd +++ /dev/null @@ -1,94 +0,0 @@ -#!python -#cython: boundscheck=False -#cython: wraparound=False -#cython: cdivision=True - -import cython -cimport cython - -import numpy as np -cimport numpy as np - -from libc.math cimport fabs, sqrt, exp, cos, pow - -ctypedef np.double_t DTYPE_t -ctypedef np.intp_t ITYPE_t - -cdef enum: - DTYPECODE = np.NPY_FLOAT64 - ITYPECODE = np.NPY_INTP - -# Fused type for certain operations -ctypedef fused DITYPE_t: - ITYPE_t - DTYPE_t - -ITYPE = np.intp - -DTYPE = np.double - -###################################################################### -# Inline distance functions -# -# We use these for the default (euclidean) case so that they can be -# inlined. 
This leads to faster computation for the most common case -cdef inline DTYPE_t euclidean_dist(DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef DTYPE_t tmp, d=0 - cdef np.intp_t j - for j in range(size): - tmp = x1[j] - x2[j] - d += tmp * tmp - return sqrt(d) - - -cdef inline DTYPE_t euclidean_rdist(DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef DTYPE_t tmp, d=0 - cdef np.intp_t j - for j in range(size): - tmp = x1[j] - x2[j] - d += tmp * tmp - return d - - -cdef inline DTYPE_t euclidean_dist_to_rdist(DTYPE_t dist) nogil except -1: - return dist * dist - - -cdef inline DTYPE_t euclidean_rdist_to_dist(DTYPE_t dist) except -1: - return sqrt(dist) - - -###################################################################### -# DistanceMetric base class -cdef class DistanceMetric: - # The following attributes are required for a few of the subclasses. - # we must define them here so that cython's limited polymorphism will work. - # Because we don't expect to instantiate a lot of these objects, the - # extra memory overhead of this setup should not be an issue. - cdef DTYPE_t p - #cdef DTYPE_t[::1] vec - #cdef DTYPE_t[:, ::1] mat - cdef np.ndarray vec - cdef np.ndarray mat - cdef DTYPE_t* vec_ptr - cdef DTYPE_t* mat_ptr - cdef ITYPE_t size - cdef object func - cdef object kwargs - - cdef DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1 - - cdef DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1 - - cdef int pdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1 - - cdef int cdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] Y, - DTYPE_t[:, ::1] D) except -1 - - cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1 - - cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1 diff --git a/sklearn/cluster/_hdbscan/dist_metrics.pyx b/sklearn/cluster/_hdbscan/dist_metrics.pyx deleted file mode 100644 index 7416a9ffa62ce..0000000000000 --- a/sklearn/cluster/_hdbscan/dist_metrics.pyx +++ /dev/null @@ -1,1147 +0,0 @@ -# !python -# cython: boundscheck=False -# cython: wraparound=False -# cython: cdivision=True - -# By Jake Vanderplas (2013) -# written for the scikit-learn project -# modified for HDBSCAN Dual Tree Boruvka algorithm -# License: BSD - -import numpy as np -cimport numpy as np -np.import_array() # required in order to use C-API - -from libc.math cimport fabs, sqrt, exp, cos, pow, log, acos, M_PI - -DTYPE = np.double -ITYPE = np.intp - - -###################################################################### -# Numpy 1.3-1.4 compatibility utilities -cdef DTYPE_t[:, ::1] get_memview_DTYPE_2D( - np.ndarray[DTYPE_t, ndim=2, mode='c'] X): - return ( X.data) - - -cdef DTYPE_t* get_vec_ptr(np.ndarray[DTYPE_t, ndim=1, mode='c'] vec): - return &vec[0] - - -cdef DTYPE_t* get_mat_ptr(np.ndarray[DTYPE_t, ndim=2, mode='c'] mat): - return &mat[0, 0] -###################################################################### - - -# First, define a function to get an ndarray from a memory bufffer -cdef extern from "numpy/arrayobject.h": - object PyArray_SimpleNewFromData(int nd, np.npy_intp* dims, - int typenum, void* data) - - -cdef inline np.ndarray _buffer_to_ndarray(DTYPE_t* x, np.npy_intp n): - # Wrap a memory buffer with an ndarray. Warning: this is not robust. - # In particular, if x is deallocated before the returned array goes - # out of scope, this could cause memory errors. Since there is not - # a possibility of this for our use-case, this should be safe. 
- - # Note: this Segfaults unless np.import_array() is called above - return PyArray_SimpleNewFromData(1, &n, DTYPECODE, x) - - -# some handy constants -from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin -cdef DTYPE_t INF = np.inf - - -###################################################################### -# newObj function -# this is a helper function for pickling -def newObj(obj): - return obj.__new__(obj) - - -###################################################################### -# metric mappings -# These map from metric id strings to class names -METRIC_MAPPING = {'euclidean': EuclideanDistance, - 'l2': EuclideanDistance, - 'minkowski': MinkowskiDistance, - 'p': MinkowskiDistance, - 'manhattan': ManhattanDistance, - 'cityblock': ManhattanDistance, - 'l1': ManhattanDistance, - 'chebyshev': ChebyshevDistance, - 'infinity': ChebyshevDistance, - 'seuclidean': SEuclideanDistance, - 'mahalanobis': MahalanobisDistance, - 'wminkowski': WMinkowskiDistance, - 'hamming': HammingDistance, - 'canberra': CanberraDistance, - 'braycurtis': BrayCurtisDistance, - 'matching': MatchingDistance, - 'jaccard': JaccardDistance, - 'dice': DiceDistance, - 'kulsinski': KulsinskiDistance, - 'rogerstanimoto': RogersTanimotoDistance, - 'russellrao': RussellRaoDistance, - 'sokalmichener': SokalMichenerDistance, - 'sokalsneath': SokalSneathDistance, - 'haversine': HaversineDistance, - 'cosine': ArccosDistance, - 'arccos': ArccosDistance, - 'pyfunc': PyFuncDistance} - - -def get_valid_metric_ids(L): - """Given an iterable of metric class names or class identifiers, - return a list of metric IDs which map to those classes. - - Examples - -------- - >>> L = get_valid_metric_ids([EuclideanDistance, 'ManhattanDistance']) - >>> sorted(L) - ['cityblock', 'euclidean', 'l1', 'l2', 'manhattan'] - """ - return [key for (key, val) in METRIC_MAPPING.items() - if (val.__name__ in L) or (val in L)] - - -###################################################################### -# Distance Metric Classes -cdef class DistanceMetric: - """DistanceMetric class - - This class provides a uniform interface to fast distance metric - functions. The various metrics can be accessed via the `get_metric` - class method and the metric string identifier (see below). - - Examples - -------- - - For example, to use the Euclidean distance: - - >>> dist = DistanceMetric.get_metric('euclidean') - >>> X = [[0, 1, 2], - [3, 4, 5]]) - >>> dist.pairwise(X) - array([[ 0. , 5.19615242], - [ 5.19615242, 0. 
]]) - - Available Metrics - The following lists the string metric identifiers and the associated - distance metric classes: - - **Metrics intended for real-valued vector spaces:** - - ============== ==================== ======== =============================== - identifier class name args distance function - -------------- -------------------- -------- ------------------------------- - "euclidean" EuclideanDistance - ``sqrt(sum((x - y)^2))`` - "manhattan" ManhattanDistance - ``sum(|x - y|)`` - "chebyshev" ChebyshevDistance - ``sum(max(|x - y|))`` - "minkowski" MinkowskiDistance p ``sum(|x - y|^p)^(1/p)`` - "wminkowski" WMinkowskiDistance p, w ``sum(w * |x - y|^p)^(1/p)`` - "seuclidean" SEuclideanDistance V ``sqrt(sum((x - y)^2 / V))`` - "mahalanobis" MahalanobisDistance V or VI ``sqrt((x - y)' V^-1 (x - y))`` - ============== ==================== ======== =============================== - - **Metrics intended for two-dimensional vector spaces:** Note that the haversine - distance metric requires data in the form of [latitude, longitude] and both - inputs and outputs are in units of radians. - - ============ ================== ======================================== - identifier class name distance function - ------------ ------------------ ---------------------------------------- - "haversine" HaversineDistance 2 arcsin(sqrt(sin^2(0.5*dx) - + cos(x1)cos(x2)sin^2(0.5*dy))) - ============ ================== ======================================== - - - **Metrics intended for integer-valued vector spaces:** Though intended - for integer-valued vectors, these are also valid metrics in the case of - real-valued vectors. - - ============= ==================== ======================================== - identifier class name distance function - ------------- -------------------- ---------------------------------------- - "hamming" HammingDistance ``N_unequal(x, y) / N_tot`` - "canberra" CanberraDistance ``sum(|x - y| / (|x| + |y|))`` - "braycurtis" BrayCurtisDistance ``sum(|x - y|) / (sum(|x|) + sum(|y|))`` - ============= ==================== ======================================== - - **Metrics intended for boolean-valued vector spaces:** Any nonzero entry - is evaluated to "True". 
In the listings below, the following - abbreviations are used: - - - N : number of dimensions - - NTT : number of dims in which both values are True - - NTF : number of dims in which the first value is True, second is False - - NFT : number of dims in which the first value is False, second is True - - NFF : number of dims in which both values are False - - NNEQ : number of non-equal dimensions, NNEQ = NTF + NFT - - NNZ : number of nonzero dimensions, NNZ = NTF + NFT + NTT - - ================= ======================= =============================== - identifier class name distance function - ----------------- ----------------------- ------------------------------- - "jaccard" JaccardDistance NNEQ / NNZ - "maching" MatchingDistance NNEQ / N - "dice" DiceDistance NNEQ / (NTT + NNZ) - "kulsinski" KulsinskiDistance (NNEQ + N - NTT) / (NNEQ + N) - "rogerstanimoto" RogersTanimotoDistance 2 * NNEQ / (N + NNEQ) - "russellrao" RussellRaoDistance NNZ / N - "sokalmichener" SokalMichenerDistance 2 * NNEQ / (N + NNEQ) - "sokalsneath" SokalSneathDistance NNEQ / (NNEQ + 0.5 * NTT) - ================= ======================= =============================== - - **User-defined distance:** - - =========== =============== ======= - identifier class name args - ----------- --------------- ------- - "pyfunc" PyFuncDistance func - =========== =============== ======= - - Here ``func`` is a function which takes two one-dimensional numpy - arrays, and returns a distance. Note that in order to be used within - the BallTree, the distance must be a true metric: - i.e. it must satisfy the following properties - - 1) Non-negativity: d(x, y) >= 0 - 2) Identity: d(x, y) = 0 if and only if x == y - 3) Symmetry: d(x, y) = d(y, x) - 4) Triangle Inequality: d(x, y) + d(y, z) >= d(x, z) - - Because of the Python object overhead involved in calling the python - function, this will be fairly slow, but it will have the same - scaling as other distances. - """ - def __cinit__(self): - self.p = 2 - self.vec = np.zeros(1, dtype=DTYPE, order='c') - self.mat = np.zeros((1, 1), dtype=DTYPE, order='c') - self.vec_ptr = get_vec_ptr(self.vec) - self.mat_ptr = get_mat_ptr(self.mat) - self.size = 1 - - def __reduce__(self): - """ - reduce method used for pickling - """ - return (newObj, (self.__class__,), self.__getstate__()) - - def __getstate__(self): - """ - get state for pickling - """ - if self.__class__.__name__ == "PyFuncDistance": - return (float(self.p), self.vec, self.mat, self.func, self.kwargs) - return (float(self.p), self.vec, self.mat) - - def __setstate__(self, state): - """ - set state for pickling - """ - self.p = state[0] - self.vec = state[1] - self.mat = state[2] - if self.__class__.__name__ == "PyFuncDistance": - self.func = state[3] - self.kwargs = state[4] - self.vec_ptr = get_vec_ptr(self.vec) - self.mat_ptr = get_mat_ptr(self.mat) - self.size = 1 - - @classmethod - def get_metric(cls, metric, **kwargs): - """Get the given distance metric from the string identifier. - - See the docstring of DistanceMetric for a list of available metrics. 
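The class docstring above already shows the intended entry points: get_metric resolves a string identifier (or a callable) to a metric object, and pairwise is a convenience for dense distance matrices. A hedged usage sketch, using the public sklearn.metrics.DistanceMetric as a stand-in for the vendored class defined in this file (an assumption for illustration only):

import numpy as np
from sklearn.metrics import DistanceMetric  # stand-in for the vendored class

X = np.array([[0.0, 1.0, 2.0],
              [3.0, 4.0, 5.0]])

dist = DistanceMetric.get_metric("euclidean")
D = dist.pairwise(X)          # 2x2 symmetric matrix, ~5.196 off the diagonal

# Boolean-valued metrics from the table work the same way:
jac = DistanceMetric.get_metric("jaccard")
D_bool = jac.pairwise(np.array([[1.0, 0.0, 1.0],
                                [1.0, 1.0, 0.0]]))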
- - Parameters - ---------- - metric : string or class name - The distance metric to use - **kwargs - additional arguments will be passed to the requested metric - """ - if isinstance(metric, DistanceMetric): - return metric - - if callable(metric): - return PyFuncDistance(metric, **kwargs) - - # Map the metric string ID to the metric class - if isinstance(metric, type) and issubclass(metric, DistanceMetric): - pass - else: - try: - metric = METRIC_MAPPING[metric] - except: - raise ValueError("Unrecognized metric '%s'" % metric) - - # In Minkowski special cases, return more efficient methods - if metric is MinkowskiDistance: - p = kwargs.pop('p', 2) - if p == 1: - return ManhattanDistance(**kwargs) - elif p == 2: - return EuclideanDistance(**kwargs) - elif np.isinf(p): - return ChebyshevDistance(**kwargs) - else: - return MinkowskiDistance(p, **kwargs) - else: - return metric(**kwargs) - - def __init__(self): - if self.__class__ is DistanceMetric: - raise NotImplementedError("DistanceMetric is an abstract class") - - cdef DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - """Compute the distance between vectors x1 and x2 - - This should be overridden in a base class. - """ - return -999 - - cdef DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - """Compute the reduced distance between vectors x1 and x2. - - This can optionally be overridden in a base class. - - The reduced distance is any measure that yields the same rank as the - distance, but is more efficient to compute. For example, for the - Euclidean metric, the reduced distance is the squared-euclidean - distance. - """ - return self.dist(x1, x2, size) - - cdef int pdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1: - """compute the pairwise distances between points in X""" - cdef ITYPE_t i1, i2 - for i1 in range(X.shape[0]): - for i2 in range(i1, X.shape[0]): - D[i1, i2] = self.dist(&X[i1, 0], &X[i2, 0], X.shape[1]) - D[i2, i1] = D[i1, i2] - return 0 - - cdef int cdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] Y, - DTYPE_t[:, ::1] D) except -1: - """compute the cross-pairwise distances between arrays X and Y""" - cdef ITYPE_t i1, i2 - if X.shape[1] != Y.shape[1]: - raise ValueError('X and Y must have the same second dimension') - for i1 in range(X.shape[0]): - for i2 in range(Y.shape[0]): - D[i1, i2] = self.dist(&X[i1, 0], &Y[i2, 0], X.shape[1]) - return 0 - - cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: - """Convert the reduced distance to the distance""" - return rdist - - cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: - """Convert the distance to the reduced distance""" - return dist - - def rdist_to_dist(self, rdist): - """Convert the Reduced distance to the true distance. - - The reduced distance, defined for some metrics, is a computationally - more efficent measure which preserves the rank of the true distance. - For example, in the Euclidean distance metric, the reduced distance - is the squared-euclidean distance. - """ - return rdist - - def dist_to_rdist(self, dist): - """Convert the true distance to the reduced distance. - - The reduced distance, defined for some metrics, is a computationally - more efficent measure which preserves the rank of the true distance. - For example, in the Euclidean distance metric, the reduced distance - is the squared-euclidean distance. 
- """ - return dist - - def pairwise(self, X, Y=None): - """Compute the pairwise distances between X and Y - - This is a convenience routine for the sake of testing. For many - metrics, the utilities in scipy.spatial.distance.cdist and - scipy.spatial.distance.pdist will be faster. - - Parameters - ---------- - X : array_like - Array of shape (Nx, D), representing Nx points in D dimensions. - Y : array_like (optional) - Array of shape (Ny, D), representing Ny points in D dimensions. - If not specified, then Y=X. - Returns - ------- - dist : ndarray - The shape (Nx, Ny) array of pairwise distances between points in - X and Y. - """ - cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Xarr - cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Yarr - cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Darr - - Xarr = np.asarray(X, dtype=DTYPE, order='C') - if Y is None: - Darr = np.zeros((Xarr.shape[0], Xarr.shape[0]), - dtype=DTYPE, order='C') - self.pdist(get_memview_DTYPE_2D(Xarr), - get_memview_DTYPE_2D(Darr)) - else: - Yarr = np.asarray(Y, dtype=DTYPE, order='C') - Darr = np.zeros((Xarr.shape[0], Yarr.shape[0]), - dtype=DTYPE, order='C') - self.cdist(get_memview_DTYPE_2D(Xarr), - get_memview_DTYPE_2D(Yarr), - get_memview_DTYPE_2D(Darr)) - return Darr - - -# ------------------------------------------------------------ -# Euclidean Distance -# d = sqrt(sum(x_i^2 - y_i^2)) -cdef class EuclideanDistance(DistanceMetric): - """Euclidean Distance metric - - .. math:: - D(x, y) = \sqrt{ \sum_i (x_i - y_i) ^ 2 } - """ - def __init__(self): - self.p = 2 - - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - return euclidean_dist(x1, x2, size) - - cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - return euclidean_rdist(x1, x2, size) - - cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: - return sqrt(rdist) - - cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: - return dist * dist - - def rdist_to_dist(self, rdist): - return np.sqrt(rdist) - - def dist_to_rdist(self, dist): - return dist ** 2 - - -# ------------------------------------------------------------ -# SEuclidean Distance -# d = sqrt(sum((x_i - y_i2)^2 / v_i)) -cdef class SEuclideanDistance(DistanceMetric): - """Standardized Euclidean Distance metric - - .. math:: - D(x, y) = \sqrt{ \sum_i \frac{ (x_i - y_i) ^ 2}{V_i} } - """ - def __init__(self, V): - self.vec = np.asarray(V, dtype=DTYPE) - self.vec_ptr = get_vec_ptr(self.vec) - self.size = self.vec.shape[0] - self.p = 2 - - cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - if size != self.size: - with gil: - raise ValueError('SEuclidean dist: size of V does not match') - cdef DTYPE_t tmp, d=0 - cdef np.intp_t j - for j in range(size): - tmp = x1[j] - x2[j] - d += tmp * tmp / self.vec_ptr[j] - return d - - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - return sqrt(self.rdist(x1, x2, size)) - - cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: - return sqrt(rdist) - - cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: - return dist * dist - - def rdist_to_dist(self, rdist): - return np.sqrt(rdist) - - def dist_to_rdist(self, dist): - return dist ** 2 - - -# ------------------------------------------------------------ -# Manhattan Distance -# d = sum(abs(x_i - y_i)) -cdef class ManhattanDistance(DistanceMetric): - """Manhattan/City-block Distance metric - - .. 
math:: - D(x, y) = \sum_i |x_i - y_i| - """ - def __init__(self): - self.p = 1 - - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef DTYPE_t d = 0 - cdef np.intp_t j - for j in range(size): - d += fabs(x1[j] - x2[j]) - return d - - -# ------------------------------------------------------------ -# Chebyshev Distance -# d = max_i(abs(x_i), abs(y_i)) -cdef class ChebyshevDistance(DistanceMetric): - """Chebyshev/Infinity Distance - - .. math:: - D(x, y) = max_i (|x_i - y_i|) - """ - def __init__(self): - self.p = INF - - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef DTYPE_t d = 0 - cdef np.intp_t j - for j in range(size): - d = fmax(d, fabs(x1[j] - x2[j])) - return d - - -# ------------------------------------------------------------ -# Minkowski Distance -# d = sum(x_i^p - y_i^p) ^ (1/p) -cdef class MinkowskiDistance(DistanceMetric): - """Minkowski Distance - - .. math:: - D(x, y) = [\sum_i (x_i - y_i)^p] ^ (1/p) - - Minkowski Distance requires p >= 1 and finite. For p = infinity, - use ChebyshevDistance. - Note that for p=1, ManhattanDistance is more efficient, and for - p=2, EuclideanDistance is more efficient. - """ - def __init__(self, p): - if p < 1: - raise ValueError("p must be greater than 1") - elif np.isinf(p): - raise ValueError("MinkowskiDistance requires finite p. " - "For p=inf, use ChebyshevDistance.") - self.p = p - - cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef DTYPE_t d=0 - cdef np.intp_t j - for j in range(size): - d += pow(fabs(x1[j] - x2[j]), self.p) - return d - - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - return pow(self.rdist(x1, x2, size), 1. / self.p) - - cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: - return pow(rdist, 1. / self.p) - - cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: - return pow(dist, self.p) - - def rdist_to_dist(self, rdist): - return rdist ** (1. / self.p) - - def dist_to_rdist(self, dist): - return dist ** self.p - - -# ------------------------------------------------------------ -# W-Minkowski Distance -# d = sum(w_i * (x_i^p - y_i^p)) ^ (1/p) -cdef class WMinkowskiDistance(DistanceMetric): - """Weighted Minkowski Distance - - .. math:: - D(x, y) = [\sum_i w_i (x_i - y_i)^p] ^ (1/p) - - Weighted Minkowski Distance requires p >= 1 and finite. - - Parameters - ---------- - p : int - The order of the norm of the difference :math:`{||u-v||}_p`. - w : (N,) array_like - The weight vector. - - """ - def __init__(self, p, w): - if p < 1: - raise ValueError("p must be greater than 1") - elif np.isinf(p): - raise ValueError("WMinkowskiDistance requires finite p. " - "For p=inf, use ChebyshevDistance.") - self.p = p - self.vec = np.asarray(w, dtype=DTYPE) - self.vec_ptr = get_vec_ptr(self.vec) - self.size = self.vec.shape[0] - - cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - if size != self.size: - with gil: - raise ValueError('WMinkowskiDistance dist: ' - 'size of w does not match') - cdef DTYPE_t d=0 - cdef np.intp_t j - for j in range(size): - d += pow(self.vec_ptr[j] * fabs(x1[j] - x2[j]), self.p) - return d - - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - return pow(self.rdist(x1, x2, size), 1. / self.p) - - cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: - return pow(rdist, 1. 
/ self.p) - - cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: - return pow(dist, self.p) - - def rdist_to_dist(self, rdist): - return rdist ** (1. / self.p) - - def dist_to_rdist(self, dist): - return dist ** self.p - - -# ------------------------------------------------------------ -# Mahalanobis Distance -# d = sqrt( (x - y)^T V^-1 (x - y) ) -cdef class MahalanobisDistance(DistanceMetric): - """Mahalanobis Distance - - .. math:: - D(x, y) = \sqrt{ (x - y)^T V^{-1} (x - y) } - - Parameters - ---------- - V : array_like - Symmetric positive-definite covariance matrix. - The inverse of this matrix will be explicitly computed. - VI : array_like - optionally specify the inverse directly. If VI is passed, - then V is not referenced. - """ - def __init__(self, V=None, VI=None): - if VI is None: - VI = np.linalg.inv(V) - if VI.ndim != 2 or VI.shape[0] != VI.shape[1]: - raise ValueError("V/VI must be square") - - self.mat = np.asarray(VI, dtype=float, order='C') - self.mat_ptr = get_mat_ptr(self.mat) - - self.size = self.mat.shape[0] - - # we need vec as a work buffer - self.vec = np.zeros(self.size, dtype=DTYPE) - self.vec_ptr = get_vec_ptr(self.vec) - - cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - if size != self.size: - with gil: - raise ValueError('Mahalanobis dist: size of V does not match') - - cdef DTYPE_t tmp, d = 0 - cdef np.intp_t i, j - - # compute (x1 - x2).T * VI * (x1 - x2) - for i in range(size): - self.vec_ptr[i] = x1[i] - x2[i] - - for i in range(size): - tmp = 0 - for j in range(size): - tmp += self.mat_ptr[i * size + j] * self.vec_ptr[j] - d += tmp * self.vec_ptr[i] - return d - - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - return sqrt(self.rdist(x1, x2, size)) - - cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: - return sqrt(rdist) - - cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: - return dist * dist - - def rdist_to_dist(self, rdist): - return np.sqrt(rdist) - - def dist_to_rdist(self, dist): - return dist ** 2 - - -# ------------------------------------------------------------ -# Hamming Distance -# d = N_unequal(x, y) / N_tot -cdef class HammingDistance(DistanceMetric): - """Hamming Distance - - Hamming distance is meant for discrete-valued vectors, though it is - a valid metric for real-valued vectors. - - .. math:: - D(x, y) = \frac{1}{N} \sum_i \delta_{x_i, y_i} - """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef int n_unequal = 0 - cdef np.intp_t j - for j in range(size): - if x1[j] != x2[j]: - n_unequal += 1 - return float(n_unequal) / size - - -# ------------------------------------------------------------ -# Canberra Distance -# D(x, y) = sum[ abs(x_i - y_i) / (abs(x_i) + abs(y_i)) ] -cdef class CanberraDistance(DistanceMetric): - """Canberra Distance - - Canberra distance is meant for discrete-valued vectors, though it is - a valid metric for real-valued vectors. - - .. 
math:: - D(x, y) = \sum_i \frac{|x_i - y_i|}{|x_i| + |y_i|} - """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef DTYPE_t denom, d = 0 - cdef np.intp_t j - for j in range(size): - denom = fabs(x1[j]) + fabs(x2[j]) - if denom > 0: - d += fabs(x1[j] - x2[j]) / denom - return d - - -# ------------------------------------------------------------ -# Bray-Curtis Distance -# D(x, y) = sum[abs(x_i - y_i)] / sum[abs(x_i) + abs(y_i)] -cdef class BrayCurtisDistance(DistanceMetric): - """Bray-Curtis Distance - - Bray-Curtis distance is meant for discrete-valued vectors, though it is - a valid metric for real-valued vectors. - - .. math:: - D(x, y) = \frac{\sum_i |x_i - y_i|}{\sum_i(|x_i| + |y_i|)} - """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef DTYPE_t num = 0, denom = 0 - cdef np.intp_t j - for j in range(size): - num += fabs(x1[j] - x2[j]) - denom += fabs(x1[j]) + fabs(x2[j]) - if denom > 0: - return num / denom - else: - return 0.0 - - -# ------------------------------------------------------------ -# Jaccard Distance (boolean) -# D(x, y) = N_unequal(x, y) / N_nonzero(x, y) -cdef class JaccardDistance(DistanceMetric): - """Jaccard Distance - - Jaccard Distance is a dissimilarity measure for boolean-valued - vectors. All nonzero entries will be treated as True, zero entries will - be treated as False. - - .. math:: - D(x, y) = \frac{N_{TF} + N_{FT}}{N_{TT} + N_{TF} + N_{FT}} - """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef int tf1, tf2, n_eq = 0, nnz = 0 - cdef np.intp_t j - for j in range(size): - tf1 = x1[j] != 0 - tf2 = x2[j] != 0 - nnz += (tf1 or tf2) - n_eq += (tf1 and tf2) - if nnz == 0: - return 0.0 - return (nnz - n_eq) * 1.0 / nnz - - -# ------------------------------------------------------------ -# Matching Distance (boolean) -# D(x, y) = n_neq / n -cdef class MatchingDistance(DistanceMetric): - """Matching Distance - - Matching Distance is a dissimilarity measure for boolean-valued - vectors. All nonzero entries will be treated as True, zero entries will - be treated as False. - - .. math:: - D(x, y) = \frac{N_{TF} + N_{FT}}{N} - """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef int tf1, tf2, n_neq = 0 - cdef np.intp_t j - for j in range(size): - tf1 = x1[j] != 0 - tf2 = x2[j] != 0 - n_neq += (tf1 != tf2) - return n_neq * 1. / size - - -# ------------------------------------------------------------ -# Dice Distance (boolean) -# D(x, y) = n_neq / (2 * ntt + n_neq) -cdef class DiceDistance(DistanceMetric): - """Dice Distance - - Dice Distance is a dissimilarity measure for boolean-valued - vectors. All nonzero entries will be treated as True, zero entries will - be treated as False. - - .. math:: - D(x, y) = \frac{N_{TF} + N_{FT}}{2 * N_{TT} + N_{TF} + N_{FT}} - """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef int tf1, tf2, n_neq = 0, ntt = 0 - cdef np.intp_t j - for j in range(size): - tf1 = x1[j] != 0 - tf2 = x2[j] != 0 - ntt += (tf1 and tf2) - n_neq += (tf1 != tf2) - return n_neq / (2.0 * ntt + n_neq) - - -# ------------------------------------------------------------ -# Kulsinski Distance (boolean) -# D(x, y) = (ntf + nft - ntt + n) / (n_neq + n) -cdef class KulsinskiDistance(DistanceMetric): - """Kulsinski Distance - - Kulsinski Distance is a dissimilarity measure for boolean-valued - vectors. 
All nonzero entries will be treated as True, zero entries will - be treated as False. - - .. math:: - D(x, y) = 1 - \frac{N_{TT}}{N + N_{TF} + N_{FT}} - """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef int tf1, tf2, ntt = 0, n_neq = 0 - cdef np.intp_t j - for j in range(size): - tf1 = x1[j] != 0 - tf2 = x2[j] != 0 - n_neq += (tf1 != tf2) - ntt += (tf1 and tf2) - return (n_neq - ntt + size) * 1.0 / (n_neq + size) - - -# ------------------------------------------------------------ -# Rogers-Tanimoto Distance (boolean) -# D(x, y) = 2 * n_neq / (n + n_neq) -cdef class RogersTanimotoDistance(DistanceMetric): - """Rogers-Tanimoto Distance - - Rogers-Tanimoto Distance is a dissimilarity measure for boolean-valued - vectors. All nonzero entries will be treated as True, zero entries will - be treated as False. - - .. math:: - D(x, y) = \frac{2 (N_{TF} + N_{FT})}{N + N_{TF} + N_{FT}} - """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef int tf1, tf2, n_neq = 0 - cdef np.intp_t j - for j in range(size): - tf1 = x1[j] != 0 - tf2 = x2[j] != 0 - n_neq += (tf1 != tf2) - return (2.0 * n_neq) / (size + n_neq) - - -# ------------------------------------------------------------ -# Russell-Rao Distance (boolean) -# D(x, y) = (n - ntt) / n -cdef class RussellRaoDistance(DistanceMetric): - """Russell-Rao Distance - - Russell-Rao Distance is a dissimilarity measure for boolean-valued - vectors. All nonzero entries will be treated as True, zero entries will - be treated as False. - - .. math:: - D(x, y) = \frac{N - N_{TT}}{N} - """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef int tf1, tf2, ntt = 0 - cdef np.intp_t j - for j in range(size): - tf1 = x1[j] != 0 - tf2 = x2[j] != 0 - ntt += (tf1 and tf2) - return (size - ntt) * 1. / size - - -# ------------------------------------------------------------ -# Sokal-Michener Distance (boolean) -# D(x, y) = 2 * n_neq / (n + n_neq) -cdef class SokalMichenerDistance(DistanceMetric): - """Sokal-Michener Distance - - Sokal-Michener Distance is a dissimilarity measure for boolean-valued - vectors. All nonzero entries will be treated as True, zero entries will - be treated as False. - - .. math:: - D(x, y) = \frac{2 (N_{TF} + N_{FT})}{N + N_{TF} + N_{FT}} - """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef int tf1, tf2, n_neq = 0 - cdef np.intp_t j - for j in range(size): - tf1 = x1[j] != 0 - tf2 = x2[j] != 0 - n_neq += (tf1 != tf2) - return (2.0 * n_neq) / (size + n_neq) - - -# ------------------------------------------------------------ -# Sokal-Sneath Distance (boolean) -# D(x, y) = n_neq / (0.5 * n_tt + n_neq) -cdef class SokalSneathDistance(DistanceMetric): - """Sokal-Sneath Distance - - Sokal-Sneath Distance is a dissimilarity measure for boolean-valued - vectors. All nonzero entries will be treated as True, zero entries will - be treated as False. - - .. 
math:: - D(x, y) = \frac{N_{TF} + N_{FT}}{N_{TT} / 2 + N_{TF} + N_{FT}} - """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef int tf1, tf2, ntt = 0, n_neq = 0 - cdef np.intp_t j - for j in range(size): - tf1 = x1[j] != 0 - tf2 = x2[j] != 0 - n_neq += (tf1 != tf2) - ntt += (tf1 and tf2) - return n_neq / (0.5 * ntt + n_neq) - - -# ------------------------------------------------------------ -# Haversine Distance (2 dimensional) -# D(x, y) = 2 arcsin{sqrt[sin^2 ((x1 - y1) / 2) -# + cos(x1) cos(y1) sin^2 ((x2 - y2) / 2)]} -cdef class HaversineDistance(DistanceMetric): - """Haversine (Spherical) Distance - - The Haversine distance is the angular distance between two points on - the surface of a sphere. The first distance of each point is assumed - to be the latitude, the second is the longitude, given in radians. - The dimension of the points must be 2: - - .. math:: - D(x, y) = 2\arcsin[\sqrt{\sin^2((x1 - y1) / 2) - + cos(x1)cos(y1)sin^2((x2 - y2) / 2)}] - """ - cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - if size != 2: - with gil: - raise ValueError("Haversine distance only valid " - "in 2 dimensions") - cdef DTYPE_t sin_0 = sin(0.5 * (x1[0] - x2[0])) - cdef DTYPE_t sin_1 = sin(0.5 * (x1[1] - x2[1])) - return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1) - - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - if size != 2: - with gil: - raise ValueError("Haversine distance only valid in" - " 2 dimensions") - cdef DTYPE_t sin_0 = sin(0.5 * (x1[0] - x2[0])) - cdef DTYPE_t sin_1 = sin(0.5 * (x1[1] - x2[1])) - return 2 * asin(sqrt(sin_0 * sin_0 + - cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1)) - - cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: - return 2 * asin(sqrt(rdist)) - - cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: - cdef DTYPE_t tmp = sin(0.5 * dist) - return tmp * tmp - - def rdist_to_dist(self, rdist): - return 2 * np.arcsin(np.sqrt(rdist)) - - def dist_to_rdist(self, dist): - tmp = np.sin(0.5 * dist) - return tmp * tmp - - -# ------------------------------------------------------------ -# Yule Distance (boolean) -# D(x, y) = 2 * ntf * nft / (ntt * nff + ntf * nft) -# [This is not a true metric, so we will leave it out.] -# -# cdef class YuleDistance(DistanceMetric): -# cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, ITYPE_t size): -# cdef int tf1, tf2, ntf = 0, nft = 0, ntt = 0, nff = 0 -# cdef np.intp_t j -# for j in range(size): -# tf1 = x1[j] != 0 -# tf2 = x2[j] != 0 -# ntt += tf1 and tf2 -# ntf += tf1 and (tf2 == 0) -# nft += (tf1 == 0) and tf2 -# nff = size - ntt - ntf - nft -# return (2.0 * ntf * nft) / (ntt * nff + ntf * nft) - - -# ------------------------------------------------------------ -# Cosine Distance -# D(x, y) = dot(x, y) / (|x| * |y|) -# [This is not a true metric, so we will leave it out. 
Use the `arccos` -# distance instead] - -# cdef class CosineDistance(DistanceMetric): -# cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, -# ITYPE_t size) nogil except -1: -# cdef DTYPE_t d = 0, norm1 = 0, norm2 = 0 -# cdef np.intp_t j -# for j in range(size): -# d += x1[j] * x2[j] -# norm1 += x1[j] * x1[j] -# norm2 += x2[j] * x2[j] -# return 1.0 - d / sqrt(norm1 * norm2) - -# ------------------------------------------------------------ -# Arccos Distance -# D(x, y) = arccos(dot(x, y) / (|x| * |y|)) / PI - -cdef class ArccosDistance(DistanceMetric): - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef DTYPE_t d = 0, norm1 = 0, norm2 = 0 - cdef np.intp_t j - for j in range(size): - d += x1[j] * x2[j] - norm1 += x1[j] * x1[j] - norm2 += x2[j] * x2[j] - return acos(d / sqrt(norm1 * norm2)) / M_PI - - -# ------------------------------------------------------------ -# Correlation Distance -# D(x, y) = dot((x - mx), (y - my)) / (|x - mx| * |y - my|) -# [This is not a true metric, so we will leave it out.] -# -# cdef class CorrelationDistance(DistanceMetric): -# cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, ITYPE_t size): -# cdef DTYPE_t mu1 = 0, mu2 = 0, x1nrm = 0, x2nrm = 0, x1Tx2 = 0 -# cdef DTYPE_t tmp1, tmp2 -# -# cdef np.intp_t i -# for i in range(size): -# mu1 += x1[i] -# mu2 += x2[i] -# mu1 /= size -# mu2 /= size -# -# for i in range(size): -# tmp1 = x1[i] - mu1 -# tmp2 = x2[i] - mu2 -# x1nrm += tmp1 * tmp1 -# x2nrm += tmp2 * tmp2 -# x1Tx2 += tmp1 * tmp2 -# -# return (1. - x1Tx2) / sqrt(x1nrm * x2nrm) - - -# ------------------------------------------------------------ -# User-defined distance -# -cdef class PyFuncDistance(DistanceMetric): - """PyFunc Distance - A user-defined distance - Parameters - ---------- - func : function - func should take two numpy arrays as input, and return a distance. - """ - def __init__(self, func, **kwargs): - self.func = func - self.kwargs = kwargs - - # in cython < 0.26, GIL was required to be acquired during definition of - # the function and inside the body of the function. This behaviour is not - # allowed in cython >= 0.26 since it is a redundant GIL acquisition. The - # only way to be back compatible is to inherit `dist` from the base class - # without GIL and called an inline `_dist` which acquire GIL. - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - return self._dist(x1, x2, size) - - cdef inline DTYPE_t _dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) except -1 with gil: - cdef np.ndarray x1arr - cdef np.ndarray x2arr - x1arr = _buffer_to_ndarray(x1, size) - x2arr = _buffer_to_ndarray(x2, size) - d = self.func(x1arr, x2arr, **self.kwargs) - try: - # Cython generates code here that results in a TypeError - # if d is the wrong type. 
- return d - except TypeError: - raise TypeError("Custom distance function must accept two " - "vectors and return a float.") - - -cdef inline double fmax(double a, double b) nogil: - return max(a, b) diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index c5da73de4e66f..336f63053872d 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -32,7 +32,7 @@ from ._hdbscan_reachability import mutual_reachability, sparse_mutual_reachability from ._hdbscan_boruvka import KDTreeBoruvkaAlgorithm, BallTreeBoruvkaAlgorithm -from .dist_metrics import DistanceMetric +from sklearn.metrics._dist_metrics import DistanceMetric from ._trees import CondensedTree, SingleLinkageTree, MinimumSpanningTree from .prediction import PredictionData diff --git a/sklearn/cluster/_hdbscan/robust_single_linkage_.py b/sklearn/cluster/_hdbscan/robust_single_linkage_.py index 1d668ff1c00d7..944bb254566a7 100644 --- a/sklearn/cluster/_hdbscan/robust_single_linkage_.py +++ b/sklearn/cluster/_hdbscan/robust_single_linkage_.py @@ -13,7 +13,7 @@ from ._hdbscan_linkage import mst_linkage_core, mst_linkage_core_vector, label from ._hdbscan_boruvka import KDTreeBoruvkaAlgorithm, BallTreeBoruvkaAlgorithm -from .dist_metrics import DistanceMetric +from sklearn.metrics._dist_metrics import DistanceMetric from ._hdbscan_reachability import mutual_reachability from ._trees import SingleLinkageTree from sklearn.neighbors import KDTree, BallTree From 08025042bf64792d8f7b00db64712da5896bb01e Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 6 Mar 2022 18:20:02 -0500 Subject: [PATCH 013/160] Revert "Drop-in replaced private `dist_metrics` with `metrics.dist_metrics`" This reverts commit cd1edc45c3a663c1642344202c0cec504d36927f. 
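Before the reverted file contents reappear below, here is a small worked example of the counts (N, NTT, NNEQ, NNZ) in which the boolean-metric table in dist_metrics.pyx is expressed; it mirrors the table's formulas rather than calling the Cython classes and is illustrative only. (The table's "maching" identifier is registered as "matching" in METRIC_MAPPING.)

import numpy as np

x = np.array([1, 0, 1, 1, 0], dtype=bool)
y = np.array([1, 1, 0, 1, 0], dtype=bool)

N = x.size                   # number of dimensions
NTT = int(np.sum(x & y))     # dims where both values are True
NNEQ = int(np.sum(x != y))   # NTF + NFT
NNZ = int(np.sum(x | y))     # NTF + NFT + NTT

jaccard = NNEQ / NNZ           # 0.5, matches JaccardDistance
matching = NNEQ / N            # 0.4, matches MatchingDistance
dice = NNEQ / (NTT + NNZ)      # ~0.333, equivalent to NNEQ / (2*NTT + NNEQ)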
--- sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx | 2 +- sklearn/cluster/_hdbscan/dist_metrics.pxd | 94 ++ sklearn/cluster/_hdbscan/dist_metrics.pyx | 1147 +++++++++++++++++ sklearn/cluster/_hdbscan/hdbscan_.py | 2 +- .../_hdbscan/robust_single_linkage_.py | 2 +- 5 files changed, 1244 insertions(+), 3 deletions(-) create mode 100644 sklearn/cluster/_hdbscan/dist_metrics.pxd create mode 100644 sklearn/cluster/_hdbscan/dist_metrics.pyx diff --git a/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx b/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx index 82c7bcebef6b3..ddb1db48e8622 100644 --- a/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx @@ -11,7 +11,7 @@ import cython from libc.float cimport DBL_MAX from libc.stdio cimport printf -from sklearn.metrics._dist_metrics cimport DistanceMetric +from .dist_metrics cimport DistanceMetric cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core( diff --git a/sklearn/cluster/_hdbscan/dist_metrics.pxd b/sklearn/cluster/_hdbscan/dist_metrics.pxd new file mode 100644 index 0000000000000..df3c8af85b105 --- /dev/null +++ b/sklearn/cluster/_hdbscan/dist_metrics.pxd @@ -0,0 +1,94 @@ +#!python +#cython: boundscheck=False +#cython: wraparound=False +#cython: cdivision=True + +import cython +cimport cython + +import numpy as np +cimport numpy as np + +from libc.math cimport fabs, sqrt, exp, cos, pow + +ctypedef np.double_t DTYPE_t +ctypedef np.intp_t ITYPE_t + +cdef enum: + DTYPECODE = np.NPY_FLOAT64 + ITYPECODE = np.NPY_INTP + +# Fused type for certain operations +ctypedef fused DITYPE_t: + ITYPE_t + DTYPE_t + +ITYPE = np.intp + +DTYPE = np.double + +###################################################################### +# Inline distance functions +# +# We use these for the default (euclidean) case so that they can be +# inlined. This leads to faster computation for the most common case +cdef inline DTYPE_t euclidean_dist(DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef DTYPE_t tmp, d=0 + cdef np.intp_t j + for j in range(size): + tmp = x1[j] - x2[j] + d += tmp * tmp + return sqrt(d) + + +cdef inline DTYPE_t euclidean_rdist(DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef DTYPE_t tmp, d=0 + cdef np.intp_t j + for j in range(size): + tmp = x1[j] - x2[j] + d += tmp * tmp + return d + + +cdef inline DTYPE_t euclidean_dist_to_rdist(DTYPE_t dist) nogil except -1: + return dist * dist + + +cdef inline DTYPE_t euclidean_rdist_to_dist(DTYPE_t dist) except -1: + return sqrt(dist) + + +###################################################################### +# DistanceMetric base class +cdef class DistanceMetric: + # The following attributes are required for a few of the subclasses. + # we must define them here so that cython's limited polymorphism will work. + # Because we don't expect to instantiate a lot of these objects, the + # extra memory overhead of this setup should not be an issue. 
+ cdef DTYPE_t p + #cdef DTYPE_t[::1] vec + #cdef DTYPE_t[:, ::1] mat + cdef np.ndarray vec + cdef np.ndarray mat + cdef DTYPE_t* vec_ptr + cdef DTYPE_t* mat_ptr + cdef ITYPE_t size + cdef object func + cdef object kwargs + + cdef DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1 + + cdef DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1 + + cdef int pdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1 + + cdef int cdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] Y, + DTYPE_t[:, ::1] D) except -1 + + cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1 + + cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1 diff --git a/sklearn/cluster/_hdbscan/dist_metrics.pyx b/sklearn/cluster/_hdbscan/dist_metrics.pyx new file mode 100644 index 0000000000000..7416a9ffa62ce --- /dev/null +++ b/sklearn/cluster/_hdbscan/dist_metrics.pyx @@ -0,0 +1,1147 @@ +# !python +# cython: boundscheck=False +# cython: wraparound=False +# cython: cdivision=True + +# By Jake Vanderplas (2013) +# written for the scikit-learn project +# modified for HDBSCAN Dual Tree Boruvka algorithm +# License: BSD + +import numpy as np +cimport numpy as np +np.import_array() # required in order to use C-API + +from libc.math cimport fabs, sqrt, exp, cos, pow, log, acos, M_PI + +DTYPE = np.double +ITYPE = np.intp + + +###################################################################### +# Numpy 1.3-1.4 compatibility utilities +cdef DTYPE_t[:, ::1] get_memview_DTYPE_2D( + np.ndarray[DTYPE_t, ndim=2, mode='c'] X): + return ( X.data) + + +cdef DTYPE_t* get_vec_ptr(np.ndarray[DTYPE_t, ndim=1, mode='c'] vec): + return &vec[0] + + +cdef DTYPE_t* get_mat_ptr(np.ndarray[DTYPE_t, ndim=2, mode='c'] mat): + return &mat[0, 0] +###################################################################### + + +# First, define a function to get an ndarray from a memory bufffer +cdef extern from "numpy/arrayobject.h": + object PyArray_SimpleNewFromData(int nd, np.npy_intp* dims, + int typenum, void* data) + + +cdef inline np.ndarray _buffer_to_ndarray(DTYPE_t* x, np.npy_intp n): + # Wrap a memory buffer with an ndarray. Warning: this is not robust. + # In particular, if x is deallocated before the returned array goes + # out of scope, this could cause memory errors. Since there is not + # a possibility of this for our use-case, this should be safe. 
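The _buffer_to_ndarray helper above wraps a raw C pointer in an ndarray without copying, which is why its comment insists the buffer must outlive the returned array. A rough pure-Python analogy with np.frombuffer (an assumption for illustration; the Cython code calls PyArray_SimpleNewFromData directly):

import numpy as np

buf = bytearray(8 * 3)                        # memory owned by some other object
view = np.frombuffer(buf, dtype=np.float64)   # zero-copy view, like _buffer_to_ndarray

before = bytes(buf)
view[0] = 1.0
assert bytes(buf) != before   # writing through the view mutated the buffer: no copy
# If the owner released `buf` while `view` were still alive, access through
# `view` would be invalid -- the hazard the comment above warns about.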
+ + # Note: this Segfaults unless np.import_array() is called above + return PyArray_SimpleNewFromData(1, &n, DTYPECODE, x) + + +# some handy constants +from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin +cdef DTYPE_t INF = np.inf + + +###################################################################### +# newObj function +# this is a helper function for pickling +def newObj(obj): + return obj.__new__(obj) + + +###################################################################### +# metric mappings +# These map from metric id strings to class names +METRIC_MAPPING = {'euclidean': EuclideanDistance, + 'l2': EuclideanDistance, + 'minkowski': MinkowskiDistance, + 'p': MinkowskiDistance, + 'manhattan': ManhattanDistance, + 'cityblock': ManhattanDistance, + 'l1': ManhattanDistance, + 'chebyshev': ChebyshevDistance, + 'infinity': ChebyshevDistance, + 'seuclidean': SEuclideanDistance, + 'mahalanobis': MahalanobisDistance, + 'wminkowski': WMinkowskiDistance, + 'hamming': HammingDistance, + 'canberra': CanberraDistance, + 'braycurtis': BrayCurtisDistance, + 'matching': MatchingDistance, + 'jaccard': JaccardDistance, + 'dice': DiceDistance, + 'kulsinski': KulsinskiDistance, + 'rogerstanimoto': RogersTanimotoDistance, + 'russellrao': RussellRaoDistance, + 'sokalmichener': SokalMichenerDistance, + 'sokalsneath': SokalSneathDistance, + 'haversine': HaversineDistance, + 'cosine': ArccosDistance, + 'arccos': ArccosDistance, + 'pyfunc': PyFuncDistance} + + +def get_valid_metric_ids(L): + """Given an iterable of metric class names or class identifiers, + return a list of metric IDs which map to those classes. + + Examples + -------- + >>> L = get_valid_metric_ids([EuclideanDistance, 'ManhattanDistance']) + >>> sorted(L) + ['cityblock', 'euclidean', 'l1', 'l2', 'manhattan'] + """ + return [key for (key, val) in METRIC_MAPPING.items() + if (val.__name__ in L) or (val in L)] + + +###################################################################### +# Distance Metric Classes +cdef class DistanceMetric: + """DistanceMetric class + + This class provides a uniform interface to fast distance metric + functions. The various metrics can be accessed via the `get_metric` + class method and the metric string identifier (see below). + + Examples + -------- + + For example, to use the Euclidean distance: + + >>> dist = DistanceMetric.get_metric('euclidean') + >>> X = [[0, 1, 2], + [3, 4, 5]]) + >>> dist.pairwise(X) + array([[ 0. , 5.19615242], + [ 5.19615242, 0. 
]]) + + Available Metrics + The following lists the string metric identifiers and the associated + distance metric classes: + + **Metrics intended for real-valued vector spaces:** + + ============== ==================== ======== =============================== + identifier class name args distance function + -------------- -------------------- -------- ------------------------------- + "euclidean" EuclideanDistance - ``sqrt(sum((x - y)^2))`` + "manhattan" ManhattanDistance - ``sum(|x - y|)`` + "chebyshev" ChebyshevDistance - ``sum(max(|x - y|))`` + "minkowski" MinkowskiDistance p ``sum(|x - y|^p)^(1/p)`` + "wminkowski" WMinkowskiDistance p, w ``sum(w * |x - y|^p)^(1/p)`` + "seuclidean" SEuclideanDistance V ``sqrt(sum((x - y)^2 / V))`` + "mahalanobis" MahalanobisDistance V or VI ``sqrt((x - y)' V^-1 (x - y))`` + ============== ==================== ======== =============================== + + **Metrics intended for two-dimensional vector spaces:** Note that the haversine + distance metric requires data in the form of [latitude, longitude] and both + inputs and outputs are in units of radians. + + ============ ================== ======================================== + identifier class name distance function + ------------ ------------------ ---------------------------------------- + "haversine" HaversineDistance 2 arcsin(sqrt(sin^2(0.5*dx) + + cos(x1)cos(x2)sin^2(0.5*dy))) + ============ ================== ======================================== + + + **Metrics intended for integer-valued vector spaces:** Though intended + for integer-valued vectors, these are also valid metrics in the case of + real-valued vectors. + + ============= ==================== ======================================== + identifier class name distance function + ------------- -------------------- ---------------------------------------- + "hamming" HammingDistance ``N_unequal(x, y) / N_tot`` + "canberra" CanberraDistance ``sum(|x - y| / (|x| + |y|))`` + "braycurtis" BrayCurtisDistance ``sum(|x - y|) / (sum(|x|) + sum(|y|))`` + ============= ==================== ======================================== + + **Metrics intended for boolean-valued vector spaces:** Any nonzero entry + is evaluated to "True". 
In the listings below, the following + abbreviations are used: + + - N : number of dimensions + - NTT : number of dims in which both values are True + - NTF : number of dims in which the first value is True, second is False + - NFT : number of dims in which the first value is False, second is True + - NFF : number of dims in which both values are False + - NNEQ : number of non-equal dimensions, NNEQ = NTF + NFT + - NNZ : number of nonzero dimensions, NNZ = NTF + NFT + NTT + + ================= ======================= =============================== + identifier class name distance function + ----------------- ----------------------- ------------------------------- + "jaccard" JaccardDistance NNEQ / NNZ + "maching" MatchingDistance NNEQ / N + "dice" DiceDistance NNEQ / (NTT + NNZ) + "kulsinski" KulsinskiDistance (NNEQ + N - NTT) / (NNEQ + N) + "rogerstanimoto" RogersTanimotoDistance 2 * NNEQ / (N + NNEQ) + "russellrao" RussellRaoDistance NNZ / N + "sokalmichener" SokalMichenerDistance 2 * NNEQ / (N + NNEQ) + "sokalsneath" SokalSneathDistance NNEQ / (NNEQ + 0.5 * NTT) + ================= ======================= =============================== + + **User-defined distance:** + + =========== =============== ======= + identifier class name args + ----------- --------------- ------- + "pyfunc" PyFuncDistance func + =========== =============== ======= + + Here ``func`` is a function which takes two one-dimensional numpy + arrays, and returns a distance. Note that in order to be used within + the BallTree, the distance must be a true metric: + i.e. it must satisfy the following properties + + 1) Non-negativity: d(x, y) >= 0 + 2) Identity: d(x, y) = 0 if and only if x == y + 3) Symmetry: d(x, y) = d(y, x) + 4) Triangle Inequality: d(x, y) + d(y, z) >= d(x, z) + + Because of the Python object overhead involved in calling the python + function, this will be fairly slow, but it will have the same + scaling as other distances. + """ + def __cinit__(self): + self.p = 2 + self.vec = np.zeros(1, dtype=DTYPE, order='c') + self.mat = np.zeros((1, 1), dtype=DTYPE, order='c') + self.vec_ptr = get_vec_ptr(self.vec) + self.mat_ptr = get_mat_ptr(self.mat) + self.size = 1 + + def __reduce__(self): + """ + reduce method used for pickling + """ + return (newObj, (self.__class__,), self.__getstate__()) + + def __getstate__(self): + """ + get state for pickling + """ + if self.__class__.__name__ == "PyFuncDistance": + return (float(self.p), self.vec, self.mat, self.func, self.kwargs) + return (float(self.p), self.vec, self.mat) + + def __setstate__(self, state): + """ + set state for pickling + """ + self.p = state[0] + self.vec = state[1] + self.mat = state[2] + if self.__class__.__name__ == "PyFuncDistance": + self.func = state[3] + self.kwargs = state[4] + self.vec_ptr = get_vec_ptr(self.vec) + self.mat_ptr = get_mat_ptr(self.mat) + self.size = 1 + + @classmethod + def get_metric(cls, metric, **kwargs): + """Get the given distance metric from the string identifier. + + See the docstring of DistanceMetric for a list of available metrics. 
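The newObj helper and the __reduce__/__getstate__/__setstate__ trio above exist so that a configured metric survives pickling, e.g. when an estimator holding one is serialized. A hedged round-trip sketch, again using the public sklearn.metrics.DistanceMetric as a stand-in for the vendored class (an assumption for illustration only):

import pickle
from sklearn.metrics import DistanceMetric  # stand-in for the vendored class

m = DistanceMetric.get_metric("minkowski", p=3)  # not special-cased to l1/l2/chebyshev
m2 = pickle.loads(pickle.dumps(m))               # round-trips via __reduce__/__setstate__

# The restored object carries the same state (here, p), so conversions agree:
assert m.dist_to_rdist(2.0) == m2.dist_to_rdist(2.0) == 8.0   # dist ** p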
+ + Parameters + ---------- + metric : string or class name + The distance metric to use + **kwargs + additional arguments will be passed to the requested metric + """ + if isinstance(metric, DistanceMetric): + return metric + + if callable(metric): + return PyFuncDistance(metric, **kwargs) + + # Map the metric string ID to the metric class + if isinstance(metric, type) and issubclass(metric, DistanceMetric): + pass + else: + try: + metric = METRIC_MAPPING[metric] + except: + raise ValueError("Unrecognized metric '%s'" % metric) + + # In Minkowski special cases, return more efficient methods + if metric is MinkowskiDistance: + p = kwargs.pop('p', 2) + if p == 1: + return ManhattanDistance(**kwargs) + elif p == 2: + return EuclideanDistance(**kwargs) + elif np.isinf(p): + return ChebyshevDistance(**kwargs) + else: + return MinkowskiDistance(p, **kwargs) + else: + return metric(**kwargs) + + def __init__(self): + if self.__class__ is DistanceMetric: + raise NotImplementedError("DistanceMetric is an abstract class") + + cdef DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + """Compute the distance between vectors x1 and x2 + + This should be overridden in a base class. + """ + return -999 + + cdef DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + """Compute the reduced distance between vectors x1 and x2. + + This can optionally be overridden in a base class. + + The reduced distance is any measure that yields the same rank as the + distance, but is more efficient to compute. For example, for the + Euclidean metric, the reduced distance is the squared-euclidean + distance. + """ + return self.dist(x1, x2, size) + + cdef int pdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1: + """compute the pairwise distances between points in X""" + cdef ITYPE_t i1, i2 + for i1 in range(X.shape[0]): + for i2 in range(i1, X.shape[0]): + D[i1, i2] = self.dist(&X[i1, 0], &X[i2, 0], X.shape[1]) + D[i2, i1] = D[i1, i2] + return 0 + + cdef int cdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] Y, + DTYPE_t[:, ::1] D) except -1: + """compute the cross-pairwise distances between arrays X and Y""" + cdef ITYPE_t i1, i2 + if X.shape[1] != Y.shape[1]: + raise ValueError('X and Y must have the same second dimension') + for i1 in range(X.shape[0]): + for i2 in range(Y.shape[0]): + D[i1, i2] = self.dist(&X[i1, 0], &Y[i2, 0], X.shape[1]) + return 0 + + cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: + """Convert the reduced distance to the distance""" + return rdist + + cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: + """Convert the distance to the reduced distance""" + return dist + + def rdist_to_dist(self, rdist): + """Convert the Reduced distance to the true distance. + + The reduced distance, defined for some metrics, is a computationally + more efficent measure which preserves the rank of the true distance. + For example, in the Euclidean distance metric, the reduced distance + is the squared-euclidean distance. + """ + return rdist + + def dist_to_rdist(self, dist): + """Convert the true distance to the reduced distance. + + The reduced distance, defined for some metrics, is a computationally + more efficent measure which preserves the rank of the true distance. + For example, in the Euclidean distance metric, the reduced distance + is the squared-euclidean distance. 
+ """ + return dist + + def pairwise(self, X, Y=None): + """Compute the pairwise distances between X and Y + + This is a convenience routine for the sake of testing. For many + metrics, the utilities in scipy.spatial.distance.cdist and + scipy.spatial.distance.pdist will be faster. + + Parameters + ---------- + X : array_like + Array of shape (Nx, D), representing Nx points in D dimensions. + Y : array_like (optional) + Array of shape (Ny, D), representing Ny points in D dimensions. + If not specified, then Y=X. + Returns + ------- + dist : ndarray + The shape (Nx, Ny) array of pairwise distances between points in + X and Y. + """ + cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Xarr + cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Yarr + cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Darr + + Xarr = np.asarray(X, dtype=DTYPE, order='C') + if Y is None: + Darr = np.zeros((Xarr.shape[0], Xarr.shape[0]), + dtype=DTYPE, order='C') + self.pdist(get_memview_DTYPE_2D(Xarr), + get_memview_DTYPE_2D(Darr)) + else: + Yarr = np.asarray(Y, dtype=DTYPE, order='C') + Darr = np.zeros((Xarr.shape[0], Yarr.shape[0]), + dtype=DTYPE, order='C') + self.cdist(get_memview_DTYPE_2D(Xarr), + get_memview_DTYPE_2D(Yarr), + get_memview_DTYPE_2D(Darr)) + return Darr + + +# ------------------------------------------------------------ +# Euclidean Distance +# d = sqrt(sum(x_i^2 - y_i^2)) +cdef class EuclideanDistance(DistanceMetric): + """Euclidean Distance metric + + .. math:: + D(x, y) = \sqrt{ \sum_i (x_i - y_i) ^ 2 } + """ + def __init__(self): + self.p = 2 + + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + return euclidean_dist(x1, x2, size) + + cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + return euclidean_rdist(x1, x2, size) + + cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: + return sqrt(rdist) + + cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: + return dist * dist + + def rdist_to_dist(self, rdist): + return np.sqrt(rdist) + + def dist_to_rdist(self, dist): + return dist ** 2 + + +# ------------------------------------------------------------ +# SEuclidean Distance +# d = sqrt(sum((x_i - y_i2)^2 / v_i)) +cdef class SEuclideanDistance(DistanceMetric): + """Standardized Euclidean Distance metric + + .. math:: + D(x, y) = \sqrt{ \sum_i \frac{ (x_i - y_i) ^ 2}{V_i} } + """ + def __init__(self, V): + self.vec = np.asarray(V, dtype=DTYPE) + self.vec_ptr = get_vec_ptr(self.vec) + self.size = self.vec.shape[0] + self.p = 2 + + cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + if size != self.size: + with gil: + raise ValueError('SEuclidean dist: size of V does not match') + cdef DTYPE_t tmp, d=0 + cdef np.intp_t j + for j in range(size): + tmp = x1[j] - x2[j] + d += tmp * tmp / self.vec_ptr[j] + return d + + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + return sqrt(self.rdist(x1, x2, size)) + + cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: + return sqrt(rdist) + + cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: + return dist * dist + + def rdist_to_dist(self, rdist): + return np.sqrt(rdist) + + def dist_to_rdist(self, dist): + return dist ** 2 + + +# ------------------------------------------------------------ +# Manhattan Distance +# d = sum(abs(x_i - y_i)) +cdef class ManhattanDistance(DistanceMetric): + """Manhattan/City-block Distance metric + + .. 
math:: + D(x, y) = \sum_i |x_i - y_i| + """ + def __init__(self): + self.p = 1 + + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef DTYPE_t d = 0 + cdef np.intp_t j + for j in range(size): + d += fabs(x1[j] - x2[j]) + return d + + +# ------------------------------------------------------------ +# Chebyshev Distance +# d = max_i(abs(x_i), abs(y_i)) +cdef class ChebyshevDistance(DistanceMetric): + """Chebyshev/Infinity Distance + + .. math:: + D(x, y) = max_i (|x_i - y_i|) + """ + def __init__(self): + self.p = INF + + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef DTYPE_t d = 0 + cdef np.intp_t j + for j in range(size): + d = fmax(d, fabs(x1[j] - x2[j])) + return d + + +# ------------------------------------------------------------ +# Minkowski Distance +# d = sum(x_i^p - y_i^p) ^ (1/p) +cdef class MinkowskiDistance(DistanceMetric): + """Minkowski Distance + + .. math:: + D(x, y) = [\sum_i (x_i - y_i)^p] ^ (1/p) + + Minkowski Distance requires p >= 1 and finite. For p = infinity, + use ChebyshevDistance. + Note that for p=1, ManhattanDistance is more efficient, and for + p=2, EuclideanDistance is more efficient. + """ + def __init__(self, p): + if p < 1: + raise ValueError("p must be greater than 1") + elif np.isinf(p): + raise ValueError("MinkowskiDistance requires finite p. " + "For p=inf, use ChebyshevDistance.") + self.p = p + + cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef DTYPE_t d=0 + cdef np.intp_t j + for j in range(size): + d += pow(fabs(x1[j] - x2[j]), self.p) + return d + + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + return pow(self.rdist(x1, x2, size), 1. / self.p) + + cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: + return pow(rdist, 1. / self.p) + + cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: + return pow(dist, self.p) + + def rdist_to_dist(self, rdist): + return rdist ** (1. / self.p) + + def dist_to_rdist(self, dist): + return dist ** self.p + + +# ------------------------------------------------------------ +# W-Minkowski Distance +# d = sum(w_i * (x_i^p - y_i^p)) ^ (1/p) +cdef class WMinkowskiDistance(DistanceMetric): + """Weighted Minkowski Distance + + .. math:: + D(x, y) = [\sum_i w_i (x_i - y_i)^p] ^ (1/p) + + Weighted Minkowski Distance requires p >= 1 and finite. + + Parameters + ---------- + p : int + The order of the norm of the difference :math:`{||u-v||}_p`. + w : (N,) array_like + The weight vector. + + """ + def __init__(self, p, w): + if p < 1: + raise ValueError("p must be greater than 1") + elif np.isinf(p): + raise ValueError("WMinkowskiDistance requires finite p. " + "For p=inf, use ChebyshevDistance.") + self.p = p + self.vec = np.asarray(w, dtype=DTYPE) + self.vec_ptr = get_vec_ptr(self.vec) + self.size = self.vec.shape[0] + + cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + if size != self.size: + with gil: + raise ValueError('WMinkowskiDistance dist: ' + 'size of w does not match') + cdef DTYPE_t d=0 + cdef np.intp_t j + for j in range(size): + d += pow(self.vec_ptr[j] * fabs(x1[j] - x2[j]), self.p) + return d + + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + return pow(self.rdist(x1, x2, size), 1. / self.p) + + cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: + return pow(rdist, 1. 
/ self.p) + + cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: + return pow(dist, self.p) + + def rdist_to_dist(self, rdist): + return rdist ** (1. / self.p) + + def dist_to_rdist(self, dist): + return dist ** self.p + + +# ------------------------------------------------------------ +# Mahalanobis Distance +# d = sqrt( (x - y)^T V^-1 (x - y) ) +cdef class MahalanobisDistance(DistanceMetric): + """Mahalanobis Distance + + .. math:: + D(x, y) = \sqrt{ (x - y)^T V^{-1} (x - y) } + + Parameters + ---------- + V : array_like + Symmetric positive-definite covariance matrix. + The inverse of this matrix will be explicitly computed. + VI : array_like + optionally specify the inverse directly. If VI is passed, + then V is not referenced. + """ + def __init__(self, V=None, VI=None): + if VI is None: + VI = np.linalg.inv(V) + if VI.ndim != 2 or VI.shape[0] != VI.shape[1]: + raise ValueError("V/VI must be square") + + self.mat = np.asarray(VI, dtype=float, order='C') + self.mat_ptr = get_mat_ptr(self.mat) + + self.size = self.mat.shape[0] + + # we need vec as a work buffer + self.vec = np.zeros(self.size, dtype=DTYPE) + self.vec_ptr = get_vec_ptr(self.vec) + + cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + if size != self.size: + with gil: + raise ValueError('Mahalanobis dist: size of V does not match') + + cdef DTYPE_t tmp, d = 0 + cdef np.intp_t i, j + + # compute (x1 - x2).T * VI * (x1 - x2) + for i in range(size): + self.vec_ptr[i] = x1[i] - x2[i] + + for i in range(size): + tmp = 0 + for j in range(size): + tmp += self.mat_ptr[i * size + j] * self.vec_ptr[j] + d += tmp * self.vec_ptr[i] + return d + + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + return sqrt(self.rdist(x1, x2, size)) + + cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: + return sqrt(rdist) + + cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: + return dist * dist + + def rdist_to_dist(self, rdist): + return np.sqrt(rdist) + + def dist_to_rdist(self, dist): + return dist ** 2 + + +# ------------------------------------------------------------ +# Hamming Distance +# d = N_unequal(x, y) / N_tot +cdef class HammingDistance(DistanceMetric): + """Hamming Distance + + Hamming distance is meant for discrete-valued vectors, though it is + a valid metric for real-valued vectors. + + .. math:: + D(x, y) = \frac{1}{N} \sum_i \delta_{x_i, y_i} + """ + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef int n_unequal = 0 + cdef np.intp_t j + for j in range(size): + if x1[j] != x2[j]: + n_unequal += 1 + return float(n_unequal) / size + + +# ------------------------------------------------------------ +# Canberra Distance +# D(x, y) = sum[ abs(x_i - y_i) / (abs(x_i) + abs(y_i)) ] +cdef class CanberraDistance(DistanceMetric): + """Canberra Distance + + Canberra distance is meant for discrete-valued vectors, though it is + a valid metric for real-valued vectors. + + .. 
math:: + D(x, y) = \sum_i \frac{|x_i - y_i|}{|x_i| + |y_i|} + """ + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef DTYPE_t denom, d = 0 + cdef np.intp_t j + for j in range(size): + denom = fabs(x1[j]) + fabs(x2[j]) + if denom > 0: + d += fabs(x1[j] - x2[j]) / denom + return d + + +# ------------------------------------------------------------ +# Bray-Curtis Distance +# D(x, y) = sum[abs(x_i - y_i)] / sum[abs(x_i) + abs(y_i)] +cdef class BrayCurtisDistance(DistanceMetric): + """Bray-Curtis Distance + + Bray-Curtis distance is meant for discrete-valued vectors, though it is + a valid metric for real-valued vectors. + + .. math:: + D(x, y) = \frac{\sum_i |x_i - y_i|}{\sum_i(|x_i| + |y_i|)} + """ + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef DTYPE_t num = 0, denom = 0 + cdef np.intp_t j + for j in range(size): + num += fabs(x1[j] - x2[j]) + denom += fabs(x1[j]) + fabs(x2[j]) + if denom > 0: + return num / denom + else: + return 0.0 + + +# ------------------------------------------------------------ +# Jaccard Distance (boolean) +# D(x, y) = N_unequal(x, y) / N_nonzero(x, y) +cdef class JaccardDistance(DistanceMetric): + """Jaccard Distance + + Jaccard Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. + + .. math:: + D(x, y) = \frac{N_{TF} + N_{FT}}{N_{TT} + N_{TF} + N_{FT}} + """ + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef int tf1, tf2, n_eq = 0, nnz = 0 + cdef np.intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + nnz += (tf1 or tf2) + n_eq += (tf1 and tf2) + if nnz == 0: + return 0.0 + return (nnz - n_eq) * 1.0 / nnz + + +# ------------------------------------------------------------ +# Matching Distance (boolean) +# D(x, y) = n_neq / n +cdef class MatchingDistance(DistanceMetric): + """Matching Distance + + Matching Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. + + .. math:: + D(x, y) = \frac{N_{TF} + N_{FT}}{N} + """ + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef int tf1, tf2, n_neq = 0 + cdef np.intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + n_neq += (tf1 != tf2) + return n_neq * 1. / size + + +# ------------------------------------------------------------ +# Dice Distance (boolean) +# D(x, y) = n_neq / (2 * ntt + n_neq) +cdef class DiceDistance(DistanceMetric): + """Dice Distance + + Dice Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. + + .. math:: + D(x, y) = \frac{N_{TF} + N_{FT}}{2 * N_{TT} + N_{TF} + N_{FT}} + """ + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef int tf1, tf2, n_neq = 0, ntt = 0 + cdef np.intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + ntt += (tf1 and tf2) + n_neq += (tf1 != tf2) + return n_neq / (2.0 * ntt + n_neq) + + +# ------------------------------------------------------------ +# Kulsinski Distance (boolean) +# D(x, y) = (ntf + nft - ntt + n) / (n_neq + n) +cdef class KulsinskiDistance(DistanceMetric): + """Kulsinski Distance + + Kulsinski Distance is a dissimilarity measure for boolean-valued + vectors. 
All nonzero entries will be treated as True, zero entries will + be treated as False. + + .. math:: + D(x, y) = 1 - \frac{N_{TT}}{N + N_{TF} + N_{FT}} + """ + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef int tf1, tf2, ntt = 0, n_neq = 0 + cdef np.intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + n_neq += (tf1 != tf2) + ntt += (tf1 and tf2) + return (n_neq - ntt + size) * 1.0 / (n_neq + size) + + +# ------------------------------------------------------------ +# Rogers-Tanimoto Distance (boolean) +# D(x, y) = 2 * n_neq / (n + n_neq) +cdef class RogersTanimotoDistance(DistanceMetric): + """Rogers-Tanimoto Distance + + Rogers-Tanimoto Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. + + .. math:: + D(x, y) = \frac{2 (N_{TF} + N_{FT})}{N + N_{TF} + N_{FT}} + """ + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef int tf1, tf2, n_neq = 0 + cdef np.intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + n_neq += (tf1 != tf2) + return (2.0 * n_neq) / (size + n_neq) + + +# ------------------------------------------------------------ +# Russell-Rao Distance (boolean) +# D(x, y) = (n - ntt) / n +cdef class RussellRaoDistance(DistanceMetric): + """Russell-Rao Distance + + Russell-Rao Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. + + .. math:: + D(x, y) = \frac{N - N_{TT}}{N} + """ + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef int tf1, tf2, ntt = 0 + cdef np.intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + ntt += (tf1 and tf2) + return (size - ntt) * 1. / size + + +# ------------------------------------------------------------ +# Sokal-Michener Distance (boolean) +# D(x, y) = 2 * n_neq / (n + n_neq) +cdef class SokalMichenerDistance(DistanceMetric): + """Sokal-Michener Distance + + Sokal-Michener Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. + + .. math:: + D(x, y) = \frac{2 (N_{TF} + N_{FT})}{N + N_{TF} + N_{FT}} + """ + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef int tf1, tf2, n_neq = 0 + cdef np.intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + n_neq += (tf1 != tf2) + return (2.0 * n_neq) / (size + n_neq) + + +# ------------------------------------------------------------ +# Sokal-Sneath Distance (boolean) +# D(x, y) = n_neq / (0.5 * n_tt + n_neq) +cdef class SokalSneathDistance(DistanceMetric): + """Sokal-Sneath Distance + + Sokal-Sneath Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. + + .. 
math:: + D(x, y) = \frac{N_{TF} + N_{FT}}{N_{TT} / 2 + N_{TF} + N_{FT}} + """ + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef int tf1, tf2, ntt = 0, n_neq = 0 + cdef np.intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + n_neq += (tf1 != tf2) + ntt += (tf1 and tf2) + return n_neq / (0.5 * ntt + n_neq) + + +# ------------------------------------------------------------ +# Haversine Distance (2 dimensional) +# D(x, y) = 2 arcsin{sqrt[sin^2 ((x1 - y1) / 2) +# + cos(x1) cos(y1) sin^2 ((x2 - y2) / 2)]} +cdef class HaversineDistance(DistanceMetric): + """Haversine (Spherical) Distance + + The Haversine distance is the angular distance between two points on + the surface of a sphere. The first distance of each point is assumed + to be the latitude, the second is the longitude, given in radians. + The dimension of the points must be 2: + + .. math:: + D(x, y) = 2\arcsin[\sqrt{\sin^2((x1 - y1) / 2) + + cos(x1)cos(y1)sin^2((x2 - y2) / 2)}] + """ + cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + if size != 2: + with gil: + raise ValueError("Haversine distance only valid " + "in 2 dimensions") + cdef DTYPE_t sin_0 = sin(0.5 * (x1[0] - x2[0])) + cdef DTYPE_t sin_1 = sin(0.5 * (x1[1] - x2[1])) + return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1) + + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + if size != 2: + with gil: + raise ValueError("Haversine distance only valid in" + " 2 dimensions") + cdef DTYPE_t sin_0 = sin(0.5 * (x1[0] - x2[0])) + cdef DTYPE_t sin_1 = sin(0.5 * (x1[1] - x2[1])) + return 2 * asin(sqrt(sin_0 * sin_0 + + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1)) + + cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: + return 2 * asin(sqrt(rdist)) + + cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: + cdef DTYPE_t tmp = sin(0.5 * dist) + return tmp * tmp + + def rdist_to_dist(self, rdist): + return 2 * np.arcsin(np.sqrt(rdist)) + + def dist_to_rdist(self, dist): + tmp = np.sin(0.5 * dist) + return tmp * tmp + + +# ------------------------------------------------------------ +# Yule Distance (boolean) +# D(x, y) = 2 * ntf * nft / (ntt * nff + ntf * nft) +# [This is not a true metric, so we will leave it out.] +# +# cdef class YuleDistance(DistanceMetric): +# cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, ITYPE_t size): +# cdef int tf1, tf2, ntf = 0, nft = 0, ntt = 0, nff = 0 +# cdef np.intp_t j +# for j in range(size): +# tf1 = x1[j] != 0 +# tf2 = x2[j] != 0 +# ntt += tf1 and tf2 +# ntf += tf1 and (tf2 == 0) +# nft += (tf1 == 0) and tf2 +# nff = size - ntt - ntf - nft +# return (2.0 * ntf * nft) / (ntt * nff + ntf * nft) + + +# ------------------------------------------------------------ +# Cosine Distance +# D(x, y) = dot(x, y) / (|x| * |y|) +# [This is not a true metric, so we will leave it out. 
Use the `arccos` +# distance instead] + +# cdef class CosineDistance(DistanceMetric): +# cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, +# ITYPE_t size) nogil except -1: +# cdef DTYPE_t d = 0, norm1 = 0, norm2 = 0 +# cdef np.intp_t j +# for j in range(size): +# d += x1[j] * x2[j] +# norm1 += x1[j] * x1[j] +# norm2 += x2[j] * x2[j] +# return 1.0 - d / sqrt(norm1 * norm2) + +# ------------------------------------------------------------ +# Arccos Distance +# D(x, y) = arccos(dot(x, y) / (|x| * |y|)) / PI + +cdef class ArccosDistance(DistanceMetric): + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + cdef DTYPE_t d = 0, norm1 = 0, norm2 = 0 + cdef np.intp_t j + for j in range(size): + d += x1[j] * x2[j] + norm1 += x1[j] * x1[j] + norm2 += x2[j] * x2[j] + return acos(d / sqrt(norm1 * norm2)) / M_PI + + +# ------------------------------------------------------------ +# Correlation Distance +# D(x, y) = dot((x - mx), (y - my)) / (|x - mx| * |y - my|) +# [This is not a true metric, so we will leave it out.] +# +# cdef class CorrelationDistance(DistanceMetric): +# cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, ITYPE_t size): +# cdef DTYPE_t mu1 = 0, mu2 = 0, x1nrm = 0, x2nrm = 0, x1Tx2 = 0 +# cdef DTYPE_t tmp1, tmp2 +# +# cdef np.intp_t i +# for i in range(size): +# mu1 += x1[i] +# mu2 += x2[i] +# mu1 /= size +# mu2 /= size +# +# for i in range(size): +# tmp1 = x1[i] - mu1 +# tmp2 = x2[i] - mu2 +# x1nrm += tmp1 * tmp1 +# x2nrm += tmp2 * tmp2 +# x1Tx2 += tmp1 * tmp2 +# +# return (1. - x1Tx2) / sqrt(x1nrm * x2nrm) + + +# ------------------------------------------------------------ +# User-defined distance +# +cdef class PyFuncDistance(DistanceMetric): + """PyFunc Distance + A user-defined distance + Parameters + ---------- + func : function + func should take two numpy arrays as input, and return a distance. + """ + def __init__(self, func, **kwargs): + self.func = func + self.kwargs = kwargs + + # in cython < 0.26, GIL was required to be acquired during definition of + # the function and inside the body of the function. This behaviour is not + # allowed in cython >= 0.26 since it is a redundant GIL acquisition. The + # only way to be back compatible is to inherit `dist` from the base class + # without GIL and called an inline `_dist` which acquire GIL. + cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + return self._dist(x1, x2, size) + + cdef inline DTYPE_t _dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) except -1 with gil: + cdef np.ndarray x1arr + cdef np.ndarray x2arr + x1arr = _buffer_to_ndarray(x1, size) + x2arr = _buffer_to_ndarray(x2, size) + d = self.func(x1arr, x2arr, **self.kwargs) + try: + # Cython generates code here that results in a TypeError + # if d is the wrong type. 
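+            # Illustrative note (not part of the original API docs): `self.func`
+            # is assumed to accept two 1-D numpy arrays and return a scalar
+            # convertible to DTYPE_t, e.g.
+            #
+            #     def my_metric(a, b):
+            #         return float(np.abs(a - b).sum())
+            #
+            # where `my_metric` is a hypothetical user-supplied callable;
+            # returning anything else triggers the TypeError handled below.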
+ return d + except TypeError: + raise TypeError("Custom distance function must accept two " + "vectors and return a float.") + + +cdef inline double fmax(double a, double b) nogil: + return max(a, b) diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index 336f63053872d..c5da73de4e66f 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -32,7 +32,7 @@ from ._hdbscan_reachability import mutual_reachability, sparse_mutual_reachability from ._hdbscan_boruvka import KDTreeBoruvkaAlgorithm, BallTreeBoruvkaAlgorithm -from sklearn.metrics._dist_metrics import DistanceMetric +from .dist_metrics import DistanceMetric from ._trees import CondensedTree, SingleLinkageTree, MinimumSpanningTree from .prediction import PredictionData diff --git a/sklearn/cluster/_hdbscan/robust_single_linkage_.py b/sklearn/cluster/_hdbscan/robust_single_linkage_.py index 944bb254566a7..1d668ff1c00d7 100644 --- a/sklearn/cluster/_hdbscan/robust_single_linkage_.py +++ b/sklearn/cluster/_hdbscan/robust_single_linkage_.py @@ -13,7 +13,7 @@ from ._hdbscan_linkage import mst_linkage_core, mst_linkage_core_vector, label from ._hdbscan_boruvka import KDTreeBoruvkaAlgorithm, BallTreeBoruvkaAlgorithm -from sklearn.metrics._dist_metrics import DistanceMetric +from .dist_metrics import DistanceMetric from ._hdbscan_reachability import mutual_reachability from ._trees import SingleLinkageTree from sklearn.neighbors import KDTree, BallTree From 543c35cbc16f9b3aecf92a07d6640e45156bdcb8 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 6 Mar 2022 19:34:46 -0500 Subject: [PATCH 014/160] Improved hdbscan metric handling and testing - Removed internal minkowski metric parameter validation in favor of `sklearn.metrics` built-in handling - Removed default argument and presence of `p` in hdbscan functions - Now users must pass `p` in through `metric_params`, consistent w/ other metrics such as `wminkowski` and `mahalanobis` - Removed vestigial estimator check -- now supported via common tests - Fixed bug where `boruvka_kdtree` algorithm's accepted metrics were based off of `BallTree` not `KDTree` - Cleaned up lines with unused returns by indexing output of `hdbscan` - Greatly expanded scope of algorithm/metric compatability tests - Streamlined some other tests - Delted commented out tests --- sklearn/cluster/_hdbscan/hdbscan_.py | 31 +-- .../cluster/_hdbscan/tests/test_hdbscan.py | 239 +++++------------- 2 files changed, 62 insertions(+), 208 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index c5da73de4e66f..4bfe44f4e95ba 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -221,7 +221,6 @@ def _hdbscan_prims_kdtree( min_samples=5, alpha=1.0, metric="minkowski", - p=2, leaf_size=40, gen_min_span_tree=False, **kwargs, @@ -263,7 +262,6 @@ def _hdbscan_prims_balltree( min_samples=5, alpha=1.0, metric="minkowski", - p=2, leaf_size=40, gen_min_span_tree=False, **kwargs, @@ -302,7 +300,6 @@ def _hdbscan_boruvka_kdtree( min_samples=5, alpha=1.0, metric="minkowski", - p=2, leaf_size=40, approx_min_span_tree=True, gen_min_span_tree=False, @@ -354,7 +351,6 @@ def _hdbscan_boruvka_balltree( min_samples=5, alpha=1.0, metric="minkowski", - p=2, leaf_size=40, approx_min_span_tree=True, gen_min_span_tree=False, @@ -514,7 +510,6 @@ def hdbscan( cluster_selection_epsilon=0.0, max_cluster_size=0, metric="minkowski", - p=2, leaf_size=40, algorithm="best", memory=None, @@ -576,9 
+571,6 @@ def hdbscan( If metric is "precomputed", X is assumed to be a distance matrix and must be square. - p : int, default=2 - Value of `p` if using the minkowski metric. - leaf_size : int, default=40 Leaf size for trees responsible for fast nearest neighbour queries. @@ -708,12 +700,6 @@ def hdbscan( if leaf_size < 1: raise ValueError("Leaf size must be greater than 0!") - if metric == "minkowski": - if p is None: - raise TypeError("Minkowski metric given but no p value supplied!") - if p < 0: - raise ValueError("Minkowski metric with negative p value is not defined!") - if match_reference_implementation: min_samples = min_samples - 1 min_cluster_size = min_cluster_size + 1 @@ -753,7 +739,6 @@ def hdbscan( min_samples, alpha, metric, - p, leaf_size, gen_min_span_tree, **metric_params, @@ -768,7 +753,6 @@ def hdbscan( min_samples, alpha, metric, - p, leaf_size, gen_min_span_tree, **metric_params, @@ -783,13 +767,12 @@ def hdbscan( min_samples, alpha, metric, - p, leaf_size, gen_min_span_tree, **metric_params, ) elif algorithm == "boruvka_kdtree": - if metric not in BallTree.valid_metrics: + if metric not in KDTree.valid_metrics: raise ValueError("Cannot use Boruvka with KDTree for this metric!") (single_linkage_tree, result_min_span_tree) = memory.cache( _hdbscan_boruvka_kdtree @@ -798,7 +781,6 @@ def hdbscan( min_samples, alpha, metric, - p, leaf_size, approx_min_span_tree, gen_min_span_tree, @@ -821,7 +803,6 @@ def hdbscan( min_samples, alpha, metric, - p, leaf_size, approx_min_span_tree, gen_min_span_tree, @@ -841,7 +822,6 @@ def hdbscan( min_samples, alpha, metric, - p, leaf_size, gen_min_span_tree, **metric_params, @@ -857,7 +837,6 @@ def hdbscan( min_samples, alpha, metric, - p, leaf_size, gen_min_span_tree, **metric_params, @@ -870,7 +849,6 @@ def hdbscan( min_samples, alpha, metric, - p, leaf_size, approx_min_span_tree, gen_min_span_tree, @@ -888,7 +866,6 @@ def hdbscan( min_samples, alpha, metric, - p, leaf_size, gen_min_span_tree, **metric_params, @@ -901,7 +878,6 @@ def hdbscan( min_samples, alpha, metric, - p, leaf_size, approx_min_span_tree, gen_min_span_tree, @@ -970,9 +946,6 @@ class HDBSCAN(BaseEstimator, ClusterMixin): A distance scaling parameter as used in robust single linkage. See [3]_ for more information. - p : int, default=None - Value of `p` if using the minkowski metric. - algorithm : str, default='best' Exactly which algorithm to use; hdbscan has variants specialised for different characteristics of the data. 
By default this is set @@ -1169,7 +1142,6 @@ def __init__( max_cluster_size=0, metric="euclidean", alpha=1.0, - p=None, algorithm="best", leaf_size=40, memory=Memory(cachedir=None, verbose=0), @@ -1188,7 +1160,6 @@ def __init__( self.max_cluster_size = max_cluster_size self.cluster_selection_epsilon = cluster_selection_epsilon self.metric = metric - self.p = p self.algorithm = algorithm self.leaf_size = leaf_size self.memory = memory diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index a3859d7e3337a..4cfcba2b17ba3 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -6,7 +6,6 @@ from scipy.spatial import distance from scipy import sparse from scipy import stats -from sklearn.utils.estimator_checks import check_estimator from sklearn.utils._testing import ( assert_array_equal, assert_array_almost_equal, @@ -26,7 +25,8 @@ from sklearn.utils import shuffle from sklearn.preprocessing import StandardScaler from scipy.stats import mode - +from sklearn.metrics.pairwise import _VALID_METRICS +from sklearn.neighbors import KDTree, BallTree from tempfile import mkdtemp import pytest @@ -89,7 +89,7 @@ def test_hdbscan_distance_matrix(): D = distance.squareform(distance.pdist(X)) D /= np.max(D) - labels, p, persist, ctree, ltree, mtree = hdbscan(D, metric="precomputed") + labels = hdbscan(D, metric="precomputed")[0] # number of clusters, ignoring noise if present n_clusters_1 = len(set(labels)) - int(-1 in labels) # ignore noise assert n_clusters_1 == n_clusters @@ -112,7 +112,7 @@ def test_hdbscan_sparse_distance_matrix(): D = sparse.csr_matrix(D) D.eliminate_zeros() - labels, p, persist, ctree, ltree, mtree = hdbscan(D, metric="precomputed") + labels = hdbscan(D, metric="precomputed")[0] # number of clusters, ignoring noise if present n_clusters_1 = len(set(labels)) - int(-1 in labels) # ignore noise assert n_clusters_1 == n_clusters @@ -123,7 +123,7 @@ def test_hdbscan_sparse_distance_matrix(): def test_hdbscan_feature_vector(): - labels, p, persist, ctree, ltree, mtree = hdbscan(X) + labels = hdbscan(X)[0] n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters @@ -135,66 +135,55 @@ def test_hdbscan_feature_vector(): assert validity >= 0.4 -def test_hdbscan_prims_kdtree(): - labels, p, persist, ctree, ltree, mtree = hdbscan(X, algorithm="prims_kdtree") - n_clusters_1 = len(set(labels)) - int(-1 in labels) - assert n_clusters_1 == n_clusters - - labels = HDBSCAN(algorithm="prims_kdtree", gen_min_span_tree=True).fit(X).labels_ - n_clusters_2 = len(set(labels)) - int(-1 in labels) - assert n_clusters_2 == n_clusters - - assert_raises(ValueError, hdbscan, X, algorithm="prims_kdtree", metric="russelrao") - - -def test_hdbscan_prims_balltree(): - labels, p, persist, ctree, ltree, mtree = hdbscan(X, algorithm="prims_balltree") - n_clusters_1 = len(set(labels)) - int(-1 in labels) - assert n_clusters_1 == n_clusters - - labels = HDBSCAN(algorithm="prims_balltree", gen_min_span_tree=True).fit(X).labels_ - n_clusters_2 = len(set(labels)) - int(-1 in labels) - assert n_clusters_2 == n_clusters - - assert_raises(ValueError, hdbscan, X, algorithm="prims_balltree", metric="cosine") - - -def test_hdbscan_boruvka_kdtree(): - labels, p, persist, ctree, ltree, mtree = hdbscan(X, algorithm="boruvka_kdtree") - n_clusters_1 = len(set(labels)) - int(-1 in labels) - assert n_clusters_1 == n_clusters - - labels = HDBSCAN(algorithm="boruvka_kdtree", 
gen_min_span_tree=True).fit(X).labels_ - n_clusters_2 = len(set(labels)) - int(-1 in labels) - assert n_clusters_2 == n_clusters - - assert_raises( - ValueError, hdbscan, X, algorithm="boruvka_kdtree", metric="russelrao" - ) - - -def test_hdbscan_boruvka_balltree(): - labels, p, persist, ctree, ltree, mtree = hdbscan(X, algorithm="boruvka_balltree") +@pytest.mark.parametrize( + "algo", + [ + "prims_kdtree", + "prims_balltree", + "boruvka_kdtree", + "boruvka_balltree", + "generic", + "best", + ], +) +@pytest.mark.parametrize("metric", _VALID_METRICS) +def test_hdbscan_algorithms(algo, metric): + labels = hdbscan(X, algorithm=algo)[0] n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters - labels = ( - HDBSCAN(algorithm="boruvka_balltree", gen_min_span_tree=True).fit(X).labels_ - ) + labels = HDBSCAN(algorithm=algo, gen_min_span_tree=True).fit(X).labels_ n_clusters_2 = len(set(labels)) - int(-1 in labels) assert n_clusters_2 == n_clusters - assert_raises(ValueError, hdbscan, X, algorithm="boruvka_balltree", metric="cosine") - - -def test_hdbscan_generic(): - labels, p, persist, ctree, ltree, mtree = hdbscan(X, algorithm="generic") - n_clusters_1 = len(set(labels)) - int(-1 in labels) - assert n_clusters_1 == n_clusters - - labels = HDBSCAN(algorithm="generic", gen_min_span_tree=True).fit(X).labels_ - n_clusters_2 = len(set(labels)) - int(-1 in labels) - assert n_clusters_2 == n_clusters + ALGOS_TREES = { + "prims_kdtree": KDTree, + "prims_balltree": BallTree, + "boruvka_kdtree": KDTree, + "boruvka_balltree": BallTree, + } + METRIC_PARAMS = { + "mahalanobis": {"V": np.eye(X.shape[1])}, + "seuclidean": {"V": np.ones(X.shape[1])}, + "minkowski": {"p": 2}, + "wminkowski": {"p": 2, "w": np.ones(X.shape[1])}, + } + if algo not in ("best", "generic"): + if metric not in ALGOS_TREES[algo].valid_metrics: + with pytest.raises(ValueError): + hdbscan( + X, + algorithm=algo, + metric=metric, + metric_params=METRIC_PARAMS.get(metric, None), + ) + else: + hdbscan( + X, + algorithm=algo, + metric=metric, + metric_params=METRIC_PARAMS.get(metric, None), + ) def test_hdbscan_dbscan_clustering(): @@ -242,7 +231,7 @@ def test_hdbscan_best_balltree_metric(): def test_hdbscan_no_clusters(): - labels, p, persist, ctree, ltree, mtree = hdbscan(X, min_cluster_size=len(X) + 1) + labels = hdbscan(X, min_cluster_size=len(X) + 1)[0] n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == 0 @@ -253,9 +242,7 @@ def test_hdbscan_no_clusters(): def test_hdbscan_min_cluster_size(): for min_cluster_size in range(2, len(X) + 1, 1): - labels, p, persist, ctree, ltree, mtree = hdbscan( - X, min_cluster_size=min_cluster_size - ) + labels = hdbscan(X, min_cluster_size=min_cluster_size)[0] true_labels = [label for label in labels if label != -1] if len(true_labels) != 0: assert np.min(np.bincount(true_labels)) >= min_cluster_size @@ -267,10 +254,9 @@ def test_hdbscan_min_cluster_size(): def test_hdbscan_callable_metric(): - # metric is the function reference, not the string key. 
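+    # A callable metric is expected to take two 1-D arrays and return a
+    # float; scipy.spatial.distance.euclidean (used below) is one example.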
metric = distance.euclidean - labels, p, persist, ctree, ltree, mtree = hdbscan(X, metric=metric) + labels = hdbscan(X, metric=metric)[0] n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters @@ -281,45 +267,23 @@ def test_hdbscan_callable_metric(): def test_hdbscan_input_lists(): X = [[1.0, 2.0], [3.0, 4.0]] - HDBSCAN().fit(X) # must not raise exception - - -def test_hdbscan_boruvka_kdtree_matches(): - - data = generate_noisy_data() - - labels_prims, p, persist, ctree, ltree, mtree = hdbscan(data, algorithm="generic") - labels_boruvka, p, persist, ctree, ltree, mtree = hdbscan( - data, algorithm="boruvka_kdtree" - ) - - num_mismatches = homogeneity(labels_prims, labels_boruvka) - - assert (num_mismatches / float(data.shape[0])) < 0.15 - - labels_prims = HDBSCAN(algorithm="generic").fit_predict(data) - labels_boruvka = HDBSCAN(algorithm="boruvka_kdtree").fit_predict(data) - - num_mismatches = homogeneity(labels_prims, labels_boruvka) - - assert (num_mismatches / float(data.shape[0])) < 0.15 + HDBSCAN().fit(X) -def test_hdbscan_boruvka_balltree_matches(): +@pytest.mark.parametrize("tree", ["kdtree", "balltree"]) +def test_hdbscan_boruvka_matches(tree): data = generate_noisy_data() - labels_prims, p, persist, ctree, ltree, mtree = hdbscan(data, algorithm="generic") - labels_boruvka, p, persist, ctree, ltree, mtree = hdbscan( - data, algorithm="boruvka_balltree" - ) + labels_prims = hdbscan(data, algorithm="generic")[0] + labels_boruvka = hdbscan(data, algorithm=f"boruvka_{tree}")[0] num_mismatches = homogeneity(labels_prims, labels_boruvka) assert (num_mismatches / float(data.shape[0])) < 0.15 labels_prims = HDBSCAN(algorithm="generic").fit_predict(data) - labels_boruvka = HDBSCAN(algorithm="boruvka_balltree").fit_predict(data) + labels_boruvka = HDBSCAN(algorithm=f"boruvka_{tree}").fit_predict(data) num_mismatches = homogeneity(labels_prims, labels_boruvka) @@ -341,44 +305,13 @@ def test_hdbscan_outliers(): assert scores is not None -# def test_hdbscan_unavailable_attributes(): -# clusterer = HDBSCAN(gen_min_span_tree=False) -# with warnings.catch_warnings(record=True) as w: -# tree = clusterer.condensed_tree_ -# assert len(w) > 0 -# assert tree is None -# with warnings.catch_warnings(record=True) as w: -# tree = clusterer.single_linkage_tree_ -# assert len(w) > 0 -# assert tree is None -# with warnings.catch_warnings(record=True) as w: -# scores = clusterer.outlier_scores_ -# assert len(w) > 0 -# assert scores is None -# with warnings.catch_warnings(record=True) as w: -# tree = clusterer.minimum_spanning_tree_ -# assert len(w) > 0 -# assert tree is None - - -# def test_hdbscan_min_span_tree_availability(): -# clusterer = HDBSCAN().fit(X) -# tree = clusterer.minimum_spanning_tree_ -# assert tree is None -# D = distance.squareform(distance.pdist(X)) -# D /= np.max(D) -# HDBSCAN(metric='precomputed').fit(D) -# tree = clusterer.minimum_spanning_tree_ -# assert tree is None - - def test_hdbscan_approximate_predict(): clusterer = HDBSCAN(prediction_data=True).fit(X) - cluster, prob = approximate_predict(clusterer, np.array([[-1.5, -1.0]])) + cluster, _ = approximate_predict(clusterer, np.array([[-1.5, -1.0]])) assert cluster == 2 - cluster, prob = approximate_predict(clusterer, np.array([[1.5, -1.0]])) + cluster, _ = approximate_predict(clusterer, np.array([[1.5, -1.0]])) assert cluster == 1 - cluster, prob = approximate_predict(clusterer, np.array([[0.0, 0.0]])) + cluster, _ = approximate_predict(clusterer, np.array([[0.0, 0.0]])) assert cluster == -1 @@ -403,32 
+336,6 @@ def test_hdbscan_approximate_predict_score(): assert scores.max() <= 1 -# def test_hdbscan_membership_vector(): -# clusterer = HDBSCAN(prediction_data=True).fit(X) -# vector = membership_vector(clusterer, np.array([[-1.5, -1.0]])) -# assert_array_almost_equal( -# vector, -# np.array([[ 0.05705305, 0.05974177, 0.12228153]])) -# vector = membership_vector(clusterer, np.array([[1.5, -1.0]])) -# assert_array_almost_equal( -# vector, -# np.array([[ 0.09462176, 0.32061556, 0.10112905]])) -# vector = membership_vector(clusterer, np.array([[0.0, 0.0]])) -# assert_array_almost_equal( -# vector, -# np.array([[ 0.03545607, 0.03363318, 0.04643177]])) -# -# def test_hdbscan_all_points_membership_vectors(): -# clusterer = HDBSCAN(prediction_data=True).fit(X) -# vects = all_points_membership_vectors(clusterer) -# assert_array_almost_equal(vects[0], np.array([7.86400992e-002, -# 2.52734246e-001, -# 8.38299608e-002])) -# assert_array_almost_equal(vects[-1], np.array([8.09055344e-001, -# 8.35882503e-002, -# 1.07356406e-001])) - - def test_hdbscan_all_points_membership_vectors(): clusterer = HDBSCAN(prediction_data=True, min_cluster_size=200).fit(X) vects = all_points_membership_vectors(clusterer) @@ -443,16 +350,6 @@ def test_hdbscan_badargs(): assert_raises(ValueError, hdbscan, X, min_samples=-1) assert_raises(ValueError, hdbscan, X, metric="imperial") assert_raises(ValueError, hdbscan, X, metric=None) - assert_raises(ValueError, hdbscan, X, metric="minkowski", p=-1) - assert_raises( - ValueError, hdbscan, X, metric="minkowski", p=-1, algorithm="prims_kdtree" - ) - assert_raises( - ValueError, hdbscan, X, metric="minkowski", p=-1, algorithm="prims_balltree" - ) - assert_raises( - ValueError, hdbscan, X, metric="minkowski", p=-1, algorithm="boruvka_balltree" - ) assert_raises( ValueError, hdbscan, X, metric="precomputed", algorithm="boruvka_kdtree" ) @@ -493,7 +390,7 @@ def test_hdbscan_caching(): def test_hdbscan_centroids_medoids(): centers = [(0.0, 0.0), (3.0, 3.0)] - H, y = make_blobs(n_samples=1000, random_state=0, centers=centers, cluster_std=0.5) + H, _ = make_blobs(n_samples=1000, random_state=0, centers=centers, cluster_std=0.5) clusterer = HDBSCAN().fit(H) for idx, center in enumerate(centers): @@ -535,17 +432,3 @@ def test_hdbscan_allow_single_cluster_with_epsilon(): unique_labels, counts = np.unique(labels, return_counts=True) assert len(unique_labels) == 2 assert counts[unique_labels == -1] == 2 - - -# Disable for now -- need to refactor to meet newer standards -@pytest.mark.skip(reason="need to refactor to meet newer standards") -def test_hdbscan_is_sklearn_estimator(): - check_estimator(HDBSCAN) - - -# Probably not applicable now # -# def test_dbscan_sparse(): -# def test_dbscan_balltree(): -# def test_pickle(): -# def test_dbscan_core_samples_toy(): -# def test_boundaries(): From ce945917bcd0b69c8e05873df67a5745fe61c45d Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 6 Mar 2022 20:23:43 -0500 Subject: [PATCH 015/160] Docstring compliance for `flat.py` --- sklearn/cluster/_hdbscan/flat.py | 289 ++++++++++++++++++------------- 1 file changed, 165 insertions(+), 124 deletions(-) diff --git a/sklearn/cluster/_hdbscan/flat.py b/sklearn/cluster/_hdbscan/flat.py index eaff77d1645d6..12c428b36fbbb 100644 --- a/sklearn/cluster/_hdbscan/flat.py +++ b/sklearn/cluster/_hdbscan/flat.py @@ -58,58 +58,71 @@ def HDBSCAN_flat( **kwargs, ): """ - Train a HDBSCAN clusterer by specifying n_clusters. - Or, modify a trained clusterer to return specific n_clusters. 
+ Train a HDBSCAN clusterer by specifying `n_clusters`. + + Or, modify a trained clusterer to return specific `n_clusters`. Parameters ---------- - X: array-like - Data to be passed to HDBSCAN for training. + X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ + array of shape (n_samples, n_samples) + A feature array, or array of distances between samples if + `metric='precomputed'`. - n_clusters: int, default=None - Number of clusters to produce. - If None, revert to default HDBSCAN + n_clusters : int, default=None + Number of clusters to produce. If `None`, revert to default `HDBSCAN`. - cluster_selection_epsilon: float, default=0. - core-distance below which to stop splitting clusters. - This can indirectly impose n_clusters. - This argument is ignored if n_clusters is supplied. + cluster_selection_epsilon : float, default=0 + Core-distance below which to stop splitting clusters. This can + indirectly impose `n_clusters`. This argument is ignored if + `n_clusters` is supplied. - clusterer: HDBSCAN, default=None - If supplied, modify this clusterer to produce n_clusters clusters. + clusterer : HDBSCAN, default=None + If supplied, modify this clusterer to produce `n_clusters` clusters. - inplace: bool, default=False - If 'clusterer' parameter is supplied, and inplace is True, - modify the previous clusterer inplace. - If False, return a modified copy of the previous clusterer. + inplace : bool, default=False + If 'clusterer' parameter is supplied, and `inplace=True`, modify + `clusterer` inplace. If `inplace=False`, return a modified copy of + `clusterer`. - **kwargs: keyword arguments - All init arguments for HDBSCAN + **kwargs : keyword arguments + All keyword arguments to pass to `HDBSCAN`. Returns ------- - new_clusterer: HDBSCAN - New HDBSCAN instance; returned irrespective of inplace=True or False + new_clusterer : HDBSCAN + New `HDBSCAN` instance; returned irrespective of `inplace`. - Usage - ----- - # Extract flat clustering from HDBSCAN's hierarchy for 7 clusters - clusterer = HDBSCAN_flat(X_train, n_clusters=7, - min_cluster_size=12, min_samples=8) - labels = clusterer.labels_ - proba = clusterer.probabilities_ - - # Use a previously initialized/trained HDBSCAN - old_clusterer = HDBSCAN(min_cluster_size=12, min_samples=8) - clusterer = HDBSCAN_flat(X_train, n_clusters=7, - clusterer=old_clusterer, inplace=True) - labels = clusterer.labels_ - proba = clusterer.probabilities_ + Examples + -------- + >>> from sklearn.cluster import HDBSCAN, HDBSCAN_flat + >>> from sklearn.datasets import make_blobs + >>> from sklearn.utils import shuffle + >>> from sklearn.preprocessing import StandardScaler + >>> + >>> X, y = make_blobs(n_samples=200, random_state=10) + >>> X, y = shuffle(X, y, random_state=7) + >>> X = StandardScaler().fit_transform(X) + >>> + >>> # Extract flat clustering from HDBSCAN's hierarchy for 7 clusters + >>> clusterer = HDBSCAN_flat(X, n_clusters=7, + ... min_cluster_size=12, min_samples=8) + >>> labels = clusterer.labels_ + >>> proba = clusterer.probabilities_ + >>> + >>> # Use a previously initialized/trained HDBSCAN + >>> old_clusterer = HDBSCAN(min_cluster_size=12, min_samples=8) + >>> clusterer = HDBSCAN_flat(X, n_clusters=7, + ... clusterer=old_clusterer, inplace=True) + >>> labels = clusterer.labels_ + >>> proba = clusterer.probabilities_ See Also --------- - :py:func:`hdbscan.HDBSCAN` - :py:func:`re_init` + sklearn.cluster.hdbscan.HDBSCAN: Perform HDBSCAN clustering from vector + array or distance matrix. 
+ sklearn.cluster.hdbscan.flat.re_init: Modify PredictionData of HDBSCAN to + account for epsilon. """ # Handle the trivial case first. if (n_clusters is None) and (cluster_selection_epsilon == 0.0): @@ -222,74 +235,88 @@ def approximate_predict_flat( return_prediction_data=False, ): """ - Predict the cluster label of new points at a particular flat clustering, - specified by n_clusters. This is a modified version of - hdbscan.approximate_predict to allow selection of n_clusters. + Predict the cluster label of new points at a particular flat clustering. + + The clustering produced is specified by `n_clusters`. This is a modified + version of `hdbscan.approximate_predict` to allow selection of + `n_clusters`. Parameters ---------- clusterer : HDBSCAN - A clustering object that has been fit to the data and - either had ``prediction_data=True`` set, or called the - ``generate_prediction_data`` method after the fact. + A clustering object that has been fit to the data and either had + `prediction_data=True` set, or called the `generate_prediction_data` + method after the fact. points_to_predict : array, or array-like (n_samples, n_features) The new data points to predict cluster labels for. They should have the same dimensionality as the original dataset over which - clusterer was fit. + `clusterer` was fit. - n_clusters: int, default=None + n_clusters : int, default=None The number of clusters to have in the flat clustering (over the training data, not points_to_predict) Ignored when prediction_data is supplied. - cluster_selection_epsilon: float, default=None - core-distance below which to stop splitting clusters. - This can indirectly impose n_clusters. - This argument is ignored if n_clusters is supplied. + cluster_selection_epsilon : float, default=None + Core-distance below which to stop splitting clusters. This can + indirectly impose `n_clusters`. This argument is ignored if + `n_clusters` is supplied. - prediction_data: PredictionData, default=None + prediction_data : PredictionData, default=None If supplied, use this to predict clusters for points_to_predict. This allows predicting on multiple datasets without corrupting - prediction data associated with clusterer. + prediction data associated with `clusterer`. - If neither n_clusters, nor prediction_data are supplied, - then the prediction_data associated with clusterer is used. + If neither `n_clusters`, nor `prediction_data` are supplied, + then the `prediction_data` associated with `clusterer` is used. - return_prediction_data: bool, default=False - If True, return prediction_data along with labels and proba. + return_prediction_data : bool, default=False + If True, return `prediction_data` along with labels and proba. Returns ------- labels : array (n_samples,) - The predicted labels of the ``points_to_predict`` + The predicted labels of the ``points_to_predict``. probabilities : array (n_samples,) - The soft cluster scores for each of the ``points_to_predict`` + The soft cluster scores for each of the ``points_to_predict``. - prediction_data: PredictionData, optional - prediction_data used to predict. - Returned if return_prediciton_data is set to True. - - - Usage - ----- - # From a fitted HDBSCAN model, predict for n_clusters=5 - labels, proba = approximate_predict_flat( - clusterer, X_predict, n_clusters=5) - - # Store prediciton data for later use. 
- labels, proba, pred_data = approximate_predict_flat( - clusterer, X_predict, n_clusters=5, - return_prediction_data=True) - # and use this prediction data to predict on new points - labels1, proba1 = approximate_predict_flat( - clusterer, X_pred1, - prediction_data=pred_data) + prediction_data : PredictionData, optional + The `prediction_data` used to predict. Returned if + `return_prediciton_data=True`. + Examples + -------- + >>> from sklearn.cluster import HDBSCAN, approximate_predict_flat + >>> from sklearn.datasets import make_blobs + >>> from sklearn.utils import shuffle + >>> from sklearn.preprocessing import StandardScaler + >>> + >>> X, y = make_blobs(n_samples=200, random_state=10) + >>> X, y = shuffle(X, y, random_state=7) + >>> X = StandardScaler().fit_transform(X) + >>> + >>> hdb = HDBSCAN(prediction_data=True) + >>> hdb.fit(X) + HDBSCAN(prediction_data=True) + >>> # From a fitted HDBSCAN model, predict for n_clusters=5 + >>> labels, proba = approximate_predict_flat( + ... hdb, X, n_clusters=5) + >>> + >>> # Store prediciton data for later use. + >>> labels, proba, pred_data = approximate_predict_flat( + ... hdb, X, n_clusters=5, + ... return_prediction_data=True) + >>> + >>> # Use this prediction data to predict on new points + >>> labels1, proba1 = approximate_predict_flat( + ... hdb, X, + ... prediction_data=pred_data) See Also --------- - :py:func:`hdbscan.prediction.approximate_predict` + sklearn.cluster.hdbscan.prediction.approximate_predict : Predict the + cluster label of new points. """ # Get number of fitted clusters for later use. n_clusters_fit = np.sum(np.unique(clusterer.labels_) >= 0) @@ -399,39 +426,36 @@ def membership_vector_flat( cluster_selection_epsilon=0.0, ): """ - (Adaptation of hdbscan's membership_vector for n_clusters, epsilon) - Predict soft cluster membership probabilities; - a vector for each point in ``points_to_predict`` that gives - a probability that the given point is a member of a cluster - for each of the selected clusters of the ``clusterer``. + Predict soft cluster membership probabilities. + + Produces a vector for each point in ``points_to_predict`` that gives a + probability that the given point is a member of a cluster for each of the + selected clusters of the ``clusterer``. Parameters ---------- - clusterer: HDBSCAN - A clustering object that has been fit to the data and - either had ``prediction_data=True`` set, or called the - ``generate_prediction_data`` method after the fact. + clusterer : HDBSCAN + A clustering object that has been fit to the data and either had + `prediction_data=True` set, or called the `generate_prediction_data` + method after the fact. - points_to_predict: array, or array-like (n_samples, n_features) + points_to_predict : array, or array-like (n_samples, n_features) The new data points to predict cluster labels for. They should have the same dimensionality as the original dataset over which clusterer was fit. - prediction_data: PredictionData, default=None - Prediction data associated with HDBSCAN for some flat clustering + prediction_data : PredictionData, default=None + Prediction data associated with HDBSCAN for some flat clustering. - n_clusters: int, default=None + n_clusters : int, default=None Number of clusters over which to compute membership probabilities. These clusters are obtained as a flat clustering at some - cluster_selection_epsilon. - - cluster_selection_epsilon: float, default=0. - core-distance below which to stop splitting clusters. - This can indirectly impose n_clusters. 
- This argument is ignored if n_clusters is supplied. + `cluster_selection_epsilon`. - Note: If neither n_clusters nor cluster_selection_epsilon are supplied, - the clusterer's original clustering is used. + cluster_selection_epsilon : float, default=0 + Core-distance below which to stop splitting clusters. This can + indirectly impose `n_clusters`. This argument is ignored if + `n_clusters` is supplied. Returns ------- @@ -441,8 +465,18 @@ def membership_vector_flat( See Also -------- - :py:func:`hdbscan.predict.membership_vector` - :py:func:`hdbscan.predict.all_points_membership_vectors` + sklearn.cluster.hdbscan.prediction.membership_vectors : Predict soft cluster + membership. + sklearn.cluster.hdbscan.prediction.all_points_membership_vectors : Predict + soft cluster membership vectors for all points in the original dataset + the clusterer was trained on. + + Notes + ----- + This function is an adaptation of hdbscan's membership_vector for + `n_clusters`, `epsilon`. If neither `n_clusters` nor + `cluster_selection_epsilon` are supplied, the `clusterer`'s original + clustering is used. """ points_to_predict = points_to_predict.astype(np.float64) # Extract condensed tree for later use @@ -555,47 +589,54 @@ def all_points_membership_vectors_flat( clusterer, prediction_data=None, n_clusters=None, cluster_selection_epsilon=None ): """ - (Adaptation of hdbscan's all_points_membership_vector - for n_clusters, epsilon) - Predict soft cluster membership vectors for all points in the - original dataset the clusterer was trained on. This function is more + Predict soft cluster membership vectors for all points in the dataset. + + This function predicts soft cluster membership vectors for all the points + in the dataset that the clusterer was trained on. This function is more efficient by making use of the fact that all points are already in the condensed tree, and processing in bulk. Parameters ---------- clusterer : HDBSCAN - A clustering object that has been fit to the data and - either had ``prediction_data=True`` set, or called the - ``generate_prediction_data`` method after the fact. - This method does not work if the clusterer was trained - with ``metric='precomputed'``. + A clustering object that has been fit to the data and either had + `prediction_data=True` set, or called the `generate_prediction_data` + method after the fact. This method does not work if the clusterer was + trained with `metric='precomputed'`. - prediction_data: PredictionData, default=None - Prediction data associated with HDBSCAN for some flat clustering + prediction_data : PredictionData, default=None + Prediction data associated with HDBSCAN for some flat clustering. - n_clusters: int, optional, default=None + n_clusters : int, default=None Number of clusters over which to compute membership probabilities. These clusters are obtained as a flat clustering at some - cluster_selection_epsilon. - - cluster_selection_epsilon: float, optional, default=None - core-distance below which to stop splitting clusters. - This can indirectly impose n_clusters. - This argument is ignored if n_clusters is supplied. + `cluster_selection_epsilon`. - Note: If neither n_clusters nor cluster_selection_epsilon are supplied, - the clusterer's original clustering is used. + cluster_selection_epsilon : float, default=0 + Core-distance below which to stop splitting clusters. This can + indirectly impose `n_clusters`. This argument is ignored if + `n_clusters` is supplied. 
Returns ------- membership_vectors : array (n_samples, n_clusters) - The probability that point ``i`` of the original dataset is a member of - cluster ``j`` is in ``membership_vectors[i, j]``. + The probability that point `i` of the original dataset is a member of + cluster `j` is in `membership_vectors[i, j]`. + See Also -------- - :py:func:`hdbscan.prediction.all_points_membership_vectors` - :py:func:`hdbscan.prediction.membership_vector` + sklearn.cluster.hdbscan.prediction.all_points_membership_vectors : Predict + soft cluster membership vectors for all points in the original dataset + the clusterer was trained on. + sklearn.cluster.hdbscan.prediction.membership_vectors : Predict soft cluster + membership. + + Notes + ----- + This function is an adaptation of hdbscan's `all_points_membership_vector` + for `n_clusters`, `epsilon`. If neither `n_clusters` nor + `cluster_selection_epsilon` are supplied, the `clusterer`'s original + clustering is used. """ # Extract condensed tree for later use condensed_tree = clusterer.condensed_tree_ From e93bfe164fcd437e9bb4c19a9e24cf92c4fe956a Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 6 Mar 2022 20:28:14 -0500 Subject: [PATCH 016/160] Renamed `flat.py` --> `_flat.py` --- sklearn/cluster/__init__.py | 2 +- sklearn/cluster/_hdbscan/{flat.py => _flat.py} | 0 sklearn/cluster/_hdbscan/tests/test_flat.py | 5 +++-- 3 files changed, 4 insertions(+), 3 deletions(-) rename sklearn/cluster/_hdbscan/{flat.py => _flat.py} (100%) diff --git a/sklearn/cluster/__init__.py b/sklearn/cluster/__init__.py index 8d89bff955d8c..278eedf6d5303 100644 --- a/sklearn/cluster/__init__.py +++ b/sklearn/cluster/__init__.py @@ -31,7 +31,7 @@ all_points_membership_vectors, approximate_predict_scores, ) -from ._hdbscan.flat import ( +from ._hdbscan._flat import ( HDBSCAN_flat, approximate_predict_flat, membership_vector_flat, diff --git a/sklearn/cluster/_hdbscan/flat.py b/sklearn/cluster/_hdbscan/_flat.py similarity index 100% rename from sklearn/cluster/_hdbscan/flat.py rename to sklearn/cluster/_hdbscan/_flat.py diff --git a/sklearn/cluster/_hdbscan/tests/test_flat.py b/sklearn/cluster/_hdbscan/tests/test_flat.py index 25073ce0d23ec..db8344e1b612d 100644 --- a/sklearn/cluster/_hdbscan/tests/test_flat.py +++ b/sklearn/cluster/_hdbscan/tests/test_flat.py @@ -4,8 +4,9 @@ import warnings import numpy as np -from sklearn.cluster import HDBSCAN, approximate_predict -from sklearn.cluster._hdbscan.flat import ( +from sklearn.cluster import ( + HDBSCAN, + approximate_predict, HDBSCAN_flat, approximate_predict_flat, membership_vector_flat, From 028e98f8e7c84cabc048652b3d1528e0b1187c35 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 6 Mar 2022 20:36:50 -0500 Subject: [PATCH 017/160] Renamed `flat.py`-->`_flat.py` --- sklearn/cluster/__init__.py | 2 +- sklearn/cluster/_hdbscan/_flat.py | 2 +- sklearn/cluster/_hdbscan/{prediction.py => _prediction.py} | 0 sklearn/cluster/_hdbscan/hdbscan_.py | 2 +- sklearn/cluster/_hdbscan/tests/test_prediction_utils.py | 2 +- 5 files changed, 4 insertions(+), 4 deletions(-) rename sklearn/cluster/_hdbscan/{prediction.py => _prediction.py} (100%) diff --git a/sklearn/cluster/__init__.py b/sklearn/cluster/__init__.py index 278eedf6d5303..5b90bfe6ba223 100644 --- a/sklearn/cluster/__init__.py +++ b/sklearn/cluster/__init__.py @@ -25,7 +25,7 @@ from ._hdbscan.hdbscan_ import HDBSCAN, hdbscan from ._hdbscan.robust_single_linkage_ import RobustSingleLinkage, robust_single_linkage from ._hdbscan.validity import validity_index -from ._hdbscan.prediction 
import ( +from ._hdbscan._prediction import ( approximate_predict, membership_vector, all_points_membership_vectors, diff --git a/sklearn/cluster/_hdbscan/_flat.py b/sklearn/cluster/_hdbscan/_flat.py index 12c428b36fbbb..328d411de9757 100644 --- a/sklearn/cluster/_hdbscan/_flat.py +++ b/sklearn/cluster/_hdbscan/_flat.py @@ -33,7 +33,7 @@ from ._hdbscan_tree import compute_stability, get_cluster_tree_leaves from .hdbscan_ import HDBSCAN, _tree_to_labels from ._trees import _bfs_from_cluster_tree -from .prediction import ( +from ._prediction import ( PredictionData, _find_cluster_and_probability, _find_neighbor_and_lambda, diff --git a/sklearn/cluster/_hdbscan/prediction.py b/sklearn/cluster/_hdbscan/_prediction.py similarity index 100% rename from sklearn/cluster/_hdbscan/prediction.py rename to sklearn/cluster/_hdbscan/_prediction.py diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index 4bfe44f4e95ba..35aa0a3282cb0 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -35,7 +35,7 @@ from .dist_metrics import DistanceMetric from ._trees import CondensedTree, SingleLinkageTree, MinimumSpanningTree -from .prediction import PredictionData +from ._prediction import PredictionData FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics + ["cosine", "arccos"] diff --git a/sklearn/cluster/_hdbscan/tests/test_prediction_utils.py b/sklearn/cluster/_hdbscan/tests/test_prediction_utils.py index a6eba19d99d11..c6241c63d3713 100644 --- a/sklearn/cluster/_hdbscan/tests/test_prediction_utils.py +++ b/sklearn/cluster/_hdbscan/tests/test_prediction_utils.py @@ -7,6 +7,6 @@ def test_safe_always_positive_division(denominator): numerator = 1 # Given negative, zero and positive denominator and positive numerator - value = safe_always_positive_division(numerator, 0) + value = safe_always_positive_division(numerator, denominator) # Make sure safe division is always positive and doesn't raise ZeroDivision error assert value >= 0 From a1ac99a46cb235b4da07f5e23a77845df2f97e94 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 6 Mar 2022 20:42:19 -0500 Subject: [PATCH 018/160] Renamed `validity.py`-->`_validity.py` --- sklearn/cluster/__init__.py | 2 +- sklearn/cluster/_hdbscan/{validity.py => _validity.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename sklearn/cluster/_hdbscan/{validity.py => _validity.py} (100%) diff --git a/sklearn/cluster/__init__.py b/sklearn/cluster/__init__.py index 5b90bfe6ba223..3978efba7e66f 100644 --- a/sklearn/cluster/__init__.py +++ b/sklearn/cluster/__init__.py @@ -24,7 +24,7 @@ from ._birch import Birch from ._hdbscan.hdbscan_ import HDBSCAN, hdbscan from ._hdbscan.robust_single_linkage_ import RobustSingleLinkage, robust_single_linkage -from ._hdbscan.validity import validity_index +from ._hdbscan._validity import validity_index from ._hdbscan._prediction import ( approximate_predict, membership_vector, diff --git a/sklearn/cluster/_hdbscan/validity.py b/sklearn/cluster/_hdbscan/_validity.py similarity index 100% rename from sklearn/cluster/_hdbscan/validity.py rename to sklearn/cluster/_hdbscan/_validity.py From 788d4bc1ce290c97381a907c9c91b1db3e1fde99 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 6 Mar 2022 20:48:00 -0500 Subject: [PATCH 019/160] Renamed `robust_single_linkage_.py` --- sklearn/cluster/__init__.py | 2 +- .../{robust_single_linkage_.py => _robust_single_linkage_.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename 
sklearn/cluster/_hdbscan/{robust_single_linkage_.py => _robust_single_linkage_.py} (100%) diff --git a/sklearn/cluster/__init__.py b/sklearn/cluster/__init__.py index 3978efba7e66f..840505c720094 100644 --- a/sklearn/cluster/__init__.py +++ b/sklearn/cluster/__init__.py @@ -23,7 +23,7 @@ from ._bicluster import SpectralBiclustering, SpectralCoclustering from ._birch import Birch from ._hdbscan.hdbscan_ import HDBSCAN, hdbscan -from ._hdbscan.robust_single_linkage_ import RobustSingleLinkage, robust_single_linkage +from ._hdbscan._robust_single_linkage_ import RobustSingleLinkage, robust_single_linkage from ._hdbscan._validity import validity_index from ._hdbscan._prediction import ( approximate_predict, diff --git a/sklearn/cluster/_hdbscan/robust_single_linkage_.py b/sklearn/cluster/_hdbscan/_robust_single_linkage_.py similarity index 100% rename from sklearn/cluster/_hdbscan/robust_single_linkage_.py rename to sklearn/cluster/_hdbscan/_robust_single_linkage_.py From cf4f239c0bd7429fb5014b57a88c2c6f817eb07f Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 9 Mar 2022 11:21:51 -0500 Subject: [PATCH 020/160] Removed `_flat_.py` and associated tests --- sklearn/cluster/__init__.py | 10 - sklearn/cluster/_hdbscan/_flat.py | 1024 ------------------- sklearn/cluster/_hdbscan/tests/test_flat.py | 404 -------- 3 files changed, 1438 deletions(-) delete mode 100644 sklearn/cluster/_hdbscan/_flat.py delete mode 100644 sklearn/cluster/_hdbscan/tests/test_flat.py diff --git a/sklearn/cluster/__init__.py b/sklearn/cluster/__init__.py index 840505c720094..1e728d8b8d6ab 100644 --- a/sklearn/cluster/__init__.py +++ b/sklearn/cluster/__init__.py @@ -31,12 +31,6 @@ all_points_membership_vectors, approximate_predict_scores, ) -from ._hdbscan._flat import ( - HDBSCAN_flat, - approximate_predict_flat, - membership_vector_flat, - all_points_membership_vectors_flat, -) __all__ = [ "AffinityPropagation", @@ -73,8 +67,4 @@ "membership_vector", "all_points_membership_vectors", "approximate_predict_scores", - "HDBSCAN_flat", - "approximate_predict_flat", - "membership_vector_flat", - "all_points_membership_vectors_flat", ] diff --git a/sklearn/cluster/_hdbscan/_flat.py b/sklearn/cluster/_hdbscan/_flat.py deleted file mode 100644 index 328d411de9757..0000000000000 --- a/sklearn/cluster/_hdbscan/_flat.py +++ /dev/null @@ -1,1024 +0,0 @@ -"""flat.py - -Provides alternative functions to hdbscan.HDBSCAN and others to -1. Allow prediction on a flat clustering by specifying 'n_clusters'. - This is done by choosing the best cluster_selection_epsilon that produces - the required number of clusters without adding unnecessary outliers. -2. Makes approximate_predict, membership_vector, and - all_points_membership_vectors consistent with cluster_selection_epsilon - -Provides the following functions: -================================== -HDBSCAN_flat: trained HDBSCAN instance with 'n_clusters' clusters - The attributes (labels, probabilities, prediction_data) are tuned to - produce 'n_clusters' clusters. - -approximate_predict_flat: labels and probabilities for novel points - Allows selecting n_clusters for novel points, or using the - original clustering (potentially specified using cluster_selection_epsilon) - -membership_vector_flat: Soft-clustering probabilities for novel points - Similar to approximate_predict_flat, but for soft-clustering. 
- **Use with caution** - -all_points_membership_vectors_flat: Soft-clustering probabilities - Similar to membership_vector_flat, but for points in training set - **Use with caution** -""" - -import copy -from warnings import warn - -import numpy as np -from ._hdbscan_tree import compute_stability, get_cluster_tree_leaves -from .hdbscan_ import HDBSCAN, _tree_to_labels -from ._trees import _bfs_from_cluster_tree -from ._prediction import ( - PredictionData, - _find_cluster_and_probability, - _find_neighbor_and_lambda, -) -from ._prediction_utils import ( - get_tree_row_with_child, - dist_membership_vector, - outlier_membership_vector, - prob_in_some_cluster, - all_points_dist_membership_vector, - all_points_outlier_membership_vector, - all_points_prob_in_some_cluster, -) - - -def HDBSCAN_flat( - X, - n_clusters=None, - cluster_selection_epsilon=0.0, - clusterer=None, - inplace=False, - **kwargs, -): - """ - Train a HDBSCAN clusterer by specifying `n_clusters`. - - Or, modify a trained clusterer to return specific `n_clusters`. - - Parameters - ---------- - X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ - array of shape (n_samples, n_samples) - A feature array, or array of distances between samples if - `metric='precomputed'`. - - n_clusters : int, default=None - Number of clusters to produce. If `None`, revert to default `HDBSCAN`. - - cluster_selection_epsilon : float, default=0 - Core-distance below which to stop splitting clusters. This can - indirectly impose `n_clusters`. This argument is ignored if - `n_clusters` is supplied. - - clusterer : HDBSCAN, default=None - If supplied, modify this clusterer to produce `n_clusters` clusters. - - inplace : bool, default=False - If 'clusterer' parameter is supplied, and `inplace=True`, modify - `clusterer` inplace. If `inplace=False`, return a modified copy of - `clusterer`. - - **kwargs : keyword arguments - All keyword arguments to pass to `HDBSCAN`. - - Returns - ------- - new_clusterer : HDBSCAN - New `HDBSCAN` instance; returned irrespective of `inplace`. - - Examples - -------- - >>> from sklearn.cluster import HDBSCAN, HDBSCAN_flat - >>> from sklearn.datasets import make_blobs - >>> from sklearn.utils import shuffle - >>> from sklearn.preprocessing import StandardScaler - >>> - >>> X, y = make_blobs(n_samples=200, random_state=10) - >>> X, y = shuffle(X, y, random_state=7) - >>> X = StandardScaler().fit_transform(X) - >>> - >>> # Extract flat clustering from HDBSCAN's hierarchy for 7 clusters - >>> clusterer = HDBSCAN_flat(X, n_clusters=7, - ... min_cluster_size=12, min_samples=8) - >>> labels = clusterer.labels_ - >>> proba = clusterer.probabilities_ - >>> - >>> # Use a previously initialized/trained HDBSCAN - >>> old_clusterer = HDBSCAN(min_cluster_size=12, min_samples=8) - >>> clusterer = HDBSCAN_flat(X, n_clusters=7, - ... clusterer=old_clusterer, inplace=True) - >>> labels = clusterer.labels_ - >>> proba = clusterer.probabilities_ - - See Also - --------- - sklearn.cluster.hdbscan.HDBSCAN: Perform HDBSCAN clustering from vector - array or distance matrix. - sklearn.cluster.hdbscan.flat.re_init: Modify PredictionData of HDBSCAN to - account for epsilon. - """ - # Handle the trivial case first. 
- if (n_clusters is None) and (cluster_selection_epsilon == 0.0): - if (not isinstance(clusterer, HDBSCAN)) or (not inplace): - # Always generate prediction_data to avoid later woes - kwargs["prediction_data"] = True - new_clusterer = HDBSCAN(**kwargs) - else: - new_clusterer = clusterer - new_clusterer.prediction_data = True - - new_clusterer.fit(X) - return new_clusterer - - if (n_clusters is not None) and (cluster_selection_epsilon != 0.0): - warn( - f"'cluster_selection_epsilon' (={cluster_selection_epsilon})" - " is ignored when 'n_clusters' is supplied." - ) - cluster_selection_epsilon = 0.0 - # This will later be chosen according to n_clusters - - if not isinstance(clusterer, HDBSCAN): - # Initialize and train clusterer if one was not previously supplied. - # Always generate prediction data - kwargs["prediction_data"] = True - new_clusterer = HDBSCAN(**kwargs) - # We do not pass cluster_selection_epsilon here. - # While this adds unnecessary computation, it makes the code - # easier to read and debug. - new_clusterer.fit(X) - else: - if inplace: - new_clusterer = clusterer - else: - new_clusterer = copy.deepcopy(clusterer) - - new_clusterer.prediction_data = True - - # Train on 'X'. Do this even if the supplied clusterer was trained, - # because we want to make sure it fits 'X'. - new_clusterer.prediction_data = True - new_clusterer.fit(X) - - if new_clusterer.cluster_selection_method == "eom": - max_eom_clusters = len(new_clusterer.condensed_tree_._select_clusters()) - - # Pick an epsilon value right after a split produces n_clusters, - # and the don't split further for smaller epsilon (larger lambda) - if n_clusters is not None: - if (new_clusterer.cluster_selection_method == "eom") and ( - n_clusters > max_eom_clusters - ): - warn( - f"Cannot predict more than {max_eom_clusters} with cluster " - "selection method 'eom'. Changing to method 'leaf'..." - ) - new_clusterer.cluster_selection_method = "leaf" - epsilon = select_epsilon(new_clusterer.condensed_tree_, n_clusters) - else: - # Or use the specified cluster_selection_epsilon - epsilon = cluster_selection_epsilon - - new_clusterer.cluster_selection_epsilon = float(epsilon) - - # Extract tree related stuff, in order to re-assign labels - single_linkage_tree = new_clusterer.single_linkage_tree_ - single_linkage_tree = single_linkage_tree.to_numpy() - min_cluster_size = new_clusterer.min_cluster_size - cluster_selection_method = new_clusterer.cluster_selection_method - allow_single_cluster = new_clusterer.allow_single_cluster - match_reference_implementation = False - - # Get labels according to the required cluster_selection_epsilon - output = _tree_to_labels( - None, - single_linkage_tree, - min_cluster_size, - cluster_selection_method, - allow_single_cluster, - match_reference_implementation, - cluster_selection_epsilon=epsilon, - ) - - # Reflect the related changes in HDBSCAN. - ( - new_clusterer.labels_, - new_clusterer.probabilities_, - new_clusterer.cluster_persistence_, - new_clusterer._condensed_tree, - new_clusterer._single_linkage_tree, - ) = output - - # PredictionData attached to HDBSCAN should also change. - # A function re_init is defined in this module to handle this. 
- re_init( - new_clusterer.prediction_data_, - new_clusterer.condensed_tree_, - cluster_selection_epsilon=epsilon, - ) - return new_clusterer - - -def approximate_predict_flat( - clusterer, - points_to_predict, - n_clusters=None, - cluster_selection_epsilon=None, - prediction_data=None, - return_prediction_data=False, -): - """ - Predict the cluster label of new points at a particular flat clustering. - - The clustering produced is specified by `n_clusters`. This is a modified - version of `hdbscan.approximate_predict` to allow selection of - `n_clusters`. - - Parameters - ---------- - clusterer : HDBSCAN - A clustering object that has been fit to the data and either had - `prediction_data=True` set, or called the `generate_prediction_data` - method after the fact. - - points_to_predict : array, or array-like (n_samples, n_features) - The new data points to predict cluster labels for. They should - have the same dimensionality as the original dataset over which - `clusterer` was fit. - - n_clusters : int, default=None - The number of clusters to have in the flat clustering - (over the training data, not points_to_predict) - Ignored when prediction_data is supplied. - - cluster_selection_epsilon : float, default=None - Core-distance below which to stop splitting clusters. This can - indirectly impose `n_clusters`. This argument is ignored if - `n_clusters` is supplied. - - prediction_data : PredictionData, default=None - If supplied, use this to predict clusters for points_to_predict. - This allows predicting on multiple datasets without corrupting - prediction data associated with `clusterer`. - - If neither `n_clusters`, nor `prediction_data` are supplied, - then the `prediction_data` associated with `clusterer` is used. - - return_prediction_data : bool, default=False - If True, return `prediction_data` along with labels and proba. - - Returns - ------- - labels : array (n_samples,) - The predicted labels of the ``points_to_predict``. - - probabilities : array (n_samples,) - The soft cluster scores for each of the ``points_to_predict``. - - prediction_data : PredictionData, optional - The `prediction_data` used to predict. Returned if - `return_prediciton_data=True`. - - Examples - -------- - >>> from sklearn.cluster import HDBSCAN, approximate_predict_flat - >>> from sklearn.datasets import make_blobs - >>> from sklearn.utils import shuffle - >>> from sklearn.preprocessing import StandardScaler - >>> - >>> X, y = make_blobs(n_samples=200, random_state=10) - >>> X, y = shuffle(X, y, random_state=7) - >>> X = StandardScaler().fit_transform(X) - >>> - >>> hdb = HDBSCAN(prediction_data=True) - >>> hdb.fit(X) - HDBSCAN(prediction_data=True) - >>> # From a fitted HDBSCAN model, predict for n_clusters=5 - >>> labels, proba = approximate_predict_flat( - ... hdb, X, n_clusters=5) - >>> - >>> # Store prediciton data for later use. - >>> labels, proba, pred_data = approximate_predict_flat( - ... hdb, X, n_clusters=5, - ... return_prediction_data=True) - >>> - >>> # Use this prediction data to predict on new points - >>> labels1, proba1 = approximate_predict_flat( - ... hdb, X, - ... prediction_data=pred_data) - See Also - --------- - sklearn.cluster.hdbscan.prediction.approximate_predict : Predict the - cluster label of new points. - """ - # Get number of fitted clusters for later use. - n_clusters_fit = np.sum(np.unique(clusterer.labels_) >= 0) - if n_clusters is not None: - n_clusters = int(n_clusters) # Ensure n_clusters is int - - # We'll need the condensed tree later... 
- condensed_tree = clusterer.condensed_tree_ - - # If none of the three arguments: prediction_data, n_clusters, - # and cluster_selection_epsilon are supplied, - # then use clusterer's prediciton data directly - if ( - (prediction_data is None) - and ((n_clusters is None) or (n_clusters == n_clusters_fit)) - and (cluster_selection_epsilon is None) - ): - prediction_data = clusterer.prediction_data_ - - # If either of n_clusters or cluster_selection_epsilon were supplied, - # then build prediction data from these by modifying clusterer's - if not isinstance(prediction_data, PredictionData): - if clusterer.prediction_data_ is None: - raise ValueError( - "Clusterer does not have prediction data!" - " Try fitting with prediction_data=True set," - " or run generate_prediction_data on the clusterer" - ) - # Get prediction data from clusterer - prediction_data = clusterer.prediction_data_ - # Modify prediction_data to reflect new n_clusters - # First, make a copy of prediction data to avoid modifying source - prediction_data = copy.deepcopy(prediction_data) - # Cluster selection method is hold by condensed_tree. - # Change from 'eom' to 'leaf' if n_clusters is too large. - if (condensed_tree.cluster_selection_method == "eom") and ( - (n_clusters is not None) and (n_clusters > n_clusters_fit) - ): - warn( - f"Cannot predict more than {n_clusters_fit} with cluster " - "selection method 'eom'. Changing to method 'leaf'..." - ) - condensed_tree.cluster_selection_method = "leaf" - # This change does not affect the tree associated with 'clusterer' - # Re-initialize prediction_data for the specified n_clusters or epsilon - re_init( - prediction_data, - condensed_tree, - n_clusters=n_clusters, - cluster_selection_epsilon=cluster_selection_epsilon, - ) - - # ============================================================ - # Now we're ready to use prediction_data - # The rest of the code is copied from HDBSCAN's approximate_predict, - # but modified to use prediction_data instead of clusterer's attribute - points_to_predict = np.asarray(points_to_predict) - - if points_to_predict.shape[1] != prediction_data.raw_data.shape[1]: - raise ValueError("New points dimension does not match fit data!") - - if prediction_data.cluster_tree.shape[0] == 0: - warn( - "Prediction data does not have any defined clusters, new data" - " will be automatically predicted as noise." 
- ) - labels = -1 * np.ones(points_to_predict.shape[0], dtype=np.int32) - probabilities = np.zeros(points_to_predict.shape[0], dtype=np.float32) - if return_prediction_data: - return labels, probabilities, prediction_data - else: - return labels, probabilities - - labels = np.empty(points_to_predict.shape[0], dtype=np.int32) - probabilities = np.empty(points_to_predict.shape[0], dtype=np.float64) - - min_samples = clusterer.min_samples or clusterer.min_cluster_size - neighbor_distances, neighbor_indices = prediction_data.tree.query( - points_to_predict, k=2 * min_samples - ) - - for i in range(points_to_predict.shape[0]): - label, prob = _find_cluster_and_probability( - condensed_tree, - prediction_data.cluster_tree, - neighbor_indices[i], - neighbor_distances[i], - prediction_data.core_distances, - prediction_data.cluster_map, - prediction_data.max_lambdas, - min_samples, - ) - labels[i] = label - probabilities[i] = prob - - if return_prediction_data: - return labels, probabilities, prediction_data - else: - return labels, probabilities - - -def membership_vector_flat( - clusterer, - points_to_predict, - prediction_data=None, - n_clusters=None, - cluster_selection_epsilon=0.0, -): - """ - Predict soft cluster membership probabilities. - - Produces a vector for each point in ``points_to_predict`` that gives a - probability that the given point is a member of a cluster for each of the - selected clusters of the ``clusterer``. - - Parameters - ---------- - clusterer : HDBSCAN - A clustering object that has been fit to the data and either had - `prediction_data=True` set, or called the `generate_prediction_data` - method after the fact. - - points_to_predict : array, or array-like (n_samples, n_features) - The new data points to predict cluster labels for. They should - have the same dimensionality as the original dataset over which - clusterer was fit. - - prediction_data : PredictionData, default=None - Prediction data associated with HDBSCAN for some flat clustering. - - n_clusters : int, default=None - Number of clusters over which to compute membership probabilities. - These clusters are obtained as a flat clustering at some - `cluster_selection_epsilon`. - - cluster_selection_epsilon : float, default=0 - Core-distance below which to stop splitting clusters. This can - indirectly impose `n_clusters`. This argument is ignored if - `n_clusters` is supplied. - - Returns - ------- - membership_vectors : array (n_samples, n_clusters) - The probability that point ``i`` is a member of cluster ``j`` is - in ``membership_vectors[i, j]``. - - See Also - -------- - sklearn.cluster.hdbscan.prediction.membership_vectors : Predict soft cluster - membership. - sklearn.cluster.hdbscan.prediction.all_points_membership_vectors : Predict - soft cluster membership vectors for all points in the original dataset - the clusterer was trained on. - - Notes - ----- - This function is an adaptation of hdbscan's membership_vector for - `n_clusters`, `epsilon`. If neither `n_clusters` nor - `cluster_selection_epsilon` are supplied, the `clusterer`'s original - clustering is used. - """ - points_to_predict = points_to_predict.astype(np.float64) - # Extract condensed tree for later use - condensed_tree = clusterer.condensed_tree_ - - # Choose flat clustering based on cluster_selection_epsilon or n_clusters. 
- # If neither is specified, use clusterer's cluster_selection_epsilon - if ( - (n_clusters is None) - and (cluster_selection_epsilon == 0.0) - and (prediction_data is None) - ): - epsilon = clusterer.cluster_selection_epsilon - # Use the same prediction_data as clusterer's - prediction_data = clusterer.prediction_data_ - elif prediction_data is None: - if n_clusters is not None: - # Compute cluster_selection_epsilon so that a flat clustering - # produces a specified number of n_clusters - # With method 'eom', we may fail to get 'n_clusters' clusters. So, - try: - epsilon = select_epsilon(condensed_tree, n_clusters) - except AssertionError: - warn( - f"Failed to predict {n_clusters} clusters with " - "cluster selection method 'eom'. Switching to 'leaf'..." - ) - condensed_tree.cluster_selection_method = "leaf" - epsilon = select_epsilon(condensed_tree, n_clusters) - else: - epsilon = cluster_selection_epsilon - # Create another instance of prediction_data that is consistent - # with the selected value of epsilon. - prediction_data = copy.deepcopy(clusterer.prediction_data_) - re_init(prediction_data, condensed_tree, cluster_selection_epsilon=epsilon) - - # Flat clustering from prediction data - clusters = clusters_from_prediction_data(prediction_data) - - # Initialize probabilities - result = np.empty((points_to_predict.shape[0], clusters.shape[0]), dtype=np.float64) - - # k-NN for prediciton points to training set - min_samples = clusterer.min_samples or clusterer.min_cluster_size - neighbor_distances, neighbor_indices = prediction_data.tree.query( - points_to_predict, k=2 * min_samples - ) - - # Loop over prediction points to compute probabilities - for i in range(points_to_predict.shape[0]): - # We need to find where in the tree the new point would go - # for the purposes of outlier membership approximation - nearest_neighbor, lambda_ = _find_neighbor_and_lambda( - neighbor_indices[i], - neighbor_distances[i], - prediction_data.core_distances, - min_samples, - ) - - # Find row in tree where nearest neighbor drops out, - # so we can get a lambda value for the nearest neighbor - neighbor_tree_row = get_tree_row_with_child( - condensed_tree._raw_tree, nearest_neighbor - ) - - # Assign lambda as min(lambda-to-neighbor, neighbor's-lambda-to-tree) - # Equivalently, this assigns core distance for prediction point as - # max(dist-to-neighbor, neighbor's-dist-to-tree) - if neighbor_tree_row["lambda_val"] <= lambda_: - lambda_ = neighbor_tree_row["lambda_val"] - - # Probabilities based on distance to closest exemplar in each cluster: - # Use new prediction_data that points to exemplars that are specific - # to the choice of n_clusters - distance_vec = dist_membership_vector( - points_to_predict[i], prediction_data.exemplars, prediction_data.dist_metric - ) - # Probabilities based on how long the nearest exemplar persists in - # each cluster (with respect to most persistent exemplar) - # Use new clusters that are defined by the choice of n_clusters. 
- outlier_vec = outlier_membership_vector( - nearest_neighbor, - lambda_, - clusters, - condensed_tree._raw_tree, - prediction_data.leaf_max_lambdas, - prediction_data.cluster_tree, - ) - - # Merge the two probabilities to produce a single set of probabilities - result[i] = distance_vec**0.5 * outlier_vec**2.0 - result[i] /= result[i].sum() - - # Include probability that the nearest neighbor belongs to a cluster - result[i] *= prob_in_some_cluster( - nearest_neighbor, - lambda_, - clusters, - condensed_tree._raw_tree, - prediction_data.leaf_max_lambdas, - prediction_data.cluster_tree, - ) - - # Rename variable so it's easy to understand what's being returned - membership_vectors = result - return membership_vectors - - -def all_points_membership_vectors_flat( - clusterer, prediction_data=None, n_clusters=None, cluster_selection_epsilon=None -): - """ - Predict soft cluster membership vectors for all points in the dataset. - - This function predicts soft cluster membership vectors for all the points - in the dataset that the clusterer was trained on. This function is more - efficient by making use of the fact that all points are already in the - condensed tree, and processing in bulk. - - Parameters - ---------- - clusterer : HDBSCAN - A clustering object that has been fit to the data and either had - `prediction_data=True` set, or called the `generate_prediction_data` - method after the fact. This method does not work if the clusterer was - trained with `metric='precomputed'`. - - prediction_data : PredictionData, default=None - Prediction data associated with HDBSCAN for some flat clustering. - - n_clusters : int, default=None - Number of clusters over which to compute membership probabilities. - These clusters are obtained as a flat clustering at some - `cluster_selection_epsilon`. - - cluster_selection_epsilon : float, default=0 - Core-distance below which to stop splitting clusters. This can - indirectly impose `n_clusters`. This argument is ignored if - `n_clusters` is supplied. - - Returns - ------- - membership_vectors : array (n_samples, n_clusters) - The probability that point `i` of the original dataset is a member of - cluster `j` is in `membership_vectors[i, j]`. - - See Also - -------- - sklearn.cluster.hdbscan.prediction.all_points_membership_vectors : Predict - soft cluster membership vectors for all points in the original dataset - the clusterer was trained on. - sklearn.cluster.hdbscan.prediction.membership_vectors : Predict soft cluster - membership. - - Notes - ----- - This function is an adaptation of hdbscan's `all_points_membership_vector` - for `n_clusters`, `epsilon`. If neither `n_clusters` nor - `cluster_selection_epsilon` are supplied, the `clusterer`'s original - clustering is used. - """ - # Extract condensed tree for later use - condensed_tree = clusterer.condensed_tree_ - - # Choose flat clustering based on cluster_selection_epsilon or n_clusters. - # If neither is specified, use clusterer's cluster_selection_epsilon - if (n_clusters is None) and (cluster_selection_epsilon is None): - epsilon = clusterer.cluster_selection_epsilon - # Use the same prediction_data as clusterer's - prediction_data = clusterer.prediction_data_ - elif prediction_data is None: - if n_clusters is not None: - # Compute cluster_selection_epsilon so that a flat clustering - # produces a specified number of n_clusters - # With method 'eom', we may fail to get 'n_clusters' clusters. 
So, - try: - epsilon = select_epsilon(condensed_tree, n_clusters) - except AssertionError: - warn( - f"Failed to predict {n_clusters} clusters with " - "cluster selection method 'eom'. Switching to 'leaf'..." - ) - condensed_tree.cluster_selection_method = "leaf" - epsilon = select_epsilon(condensed_tree, n_clusters) - else: - epsilon = cluster_selection_epsilon - # Create another instance of prediction_data that is consistent - # with the selected value of epsilon. - prediction_data = copy.deepcopy(clusterer.prediction_data_) - re_init(prediction_data, condensed_tree, cluster_selection_epsilon=epsilon) - - # Flat clustering at the chosen epsilon from prediction_data - clusters = clusters_from_prediction_data(prediction_data) - - all_points = prediction_data.raw_data - - # When no clusters found, return array of 0's - if clusters.size == 0: - return np.zeros(all_points.shape[0]) - - # Probabilities based on distance to closest exemplar in each cluster: - # Use new prediction_data that points to exemplars that are specific - # to the choice of n_clusters - distance_vecs = all_points_dist_membership_vector( - all_points, prediction_data.exemplars, prediction_data.dist_metric - ) - - # Probabilities based on how long the point persists in - # each cluster (with respect to most persistent exemplar) - # Use new clusters that are defined by the choice of n_clusters. - outlier_vecs = all_points_outlier_membership_vector( - clusters, - condensed_tree._raw_tree, - prediction_data.leaf_max_lambdas, - prediction_data.cluster_tree, - ) - - # Include probability that the point belongs to a cluster - in_cluster_probs = all_points_prob_in_some_cluster( - clusters, - condensed_tree._raw_tree, - prediction_data.leaf_max_lambdas, - prediction_data.cluster_tree, - ) - - # Aggregate the three probabilities to produce membership vectors - result = distance_vecs * outlier_vecs - row_sums = result.sum(axis=1) - result = result / row_sums[:, np.newaxis] - result *= in_cluster_probs[:, np.newaxis] - - # Re-name variable to clarify what's being returned. - membership_vectors = result - return membership_vectors - - -def select_epsilon(condensed_tree, n_clusters): - """ - Pick optimal epsilon from condensed tree based on n_clusters, - calls functions specific to 'eom' or 'leaf' selection methods - """ - cluster_selection_method = condensed_tree.cluster_selection_method - if cluster_selection_method == "eom": - return select_epsilon_eom(condensed_tree, n_clusters) - if cluster_selection_method == "leaf": - return select_epsilon_leaf(condensed_tree, n_clusters) - raise ValueError( - 'Invalid Cluster Selection Method: %s\nShould be one of: "eom", "leaf"\n' - ) - - -def select_epsilon_eom(condensed_tree, n_clusters): - """ - Select epsilon so that persistence-based clustering, - after truncating the tree at the above epsilon, - has exactly 'n_clusters' clusters - """ - # With method 'eom', max clusters are produced for epsilon=0, - # as computed by - eom_base_clusters = condensed_tree._select_clusters() - max_clusters = len(eom_base_clusters) - # Increasing epsilon can only reduce the number of ouput clusters. - - assert n_clusters <= max_clusters, ( - f"Cannot produce more than {max_clusters} with method 'eom'. " - + "Use method 'leaf' instead to extract flat clustering." 
- ) - - tree = condensed_tree._raw_tree - # To select epsilon, consider all values where clusters are split - cluster_lambdas = tree["lambda_val"][tree["child_size"] > 1] - candidate_epsilons = 1.0 / np.unique(cluster_lambdas) - 1.0e-12 - # Subtract the extra e-12 to avoid numerical errors in comparison - # Then, we avoid splitting for all epsilon below this. - candidate_epsilons = np.sort(candidate_epsilons)[::-1] - - for epsilon in candidate_epsilons: - sel_clusters = _new_select_clusters(condensed_tree, epsilon) - if len(sel_clusters) == n_clusters: - break - else: - raise RuntimeError("Could not find epsilon") - - return epsilon - - -def select_epsilon_leaf(condensed_tree, n_clusters): - """ - Select epsilon so that the leaves of condensed tree, - after truncating at the above epsilon, - has exactly 'n_clusters' clusters - """ - # Use an epsilon value that produces the right number of clusters. - # The condensed tree of HDBSCAN has this information. - # Extract the lambda levels (=1/distance) from the condensed tree - lambdas = condensed_tree._raw_tree["lambda_val"] - # We don't want values that produce a large cluster and - # just one or two individual points. - child_sizes = condensed_tree._raw_tree["child_size"] - child_sizes = child_sizes.astype(int) - # Keep only those lambda values corresponding to cluster separation; - # i.e., with child_sizes > 1 - lambdas = lambdas[child_sizes > 1] - # Get the unique values, because when two clusters fall out of one, - # the entry with lambda is repeated. - lambdas = np.unique(lambdas.astype(float)) - if n_clusters > len(lambdas) + 1: - warn( - f"HDBSCAN can only compute {len(lambdas)+1} clusters. " - f"Setting n_clusters to {len(lambdas)+1}..." - ) - n_clusters = len(lambdas) + 1 - - # lambda values are sorted by np.unique. - # Now, get epsilon (distance threshold) as 1/lambda - epsilon = 1.0 / lambdas[n_clusters - 2] - # At this epsilon, n_clusters have been split. - # Stop splits at epsilons smaller than this. - # To allow for numerical errors, - return epsilon - 1.0e-12 - - -def re_init(predData, condensed_tree, n_clusters=None, cluster_selection_epsilon=0.0): - """ - Modify PredictionData of HDBSCAN to account for epsilon. - epsilon is the cluster_selection_epsilon that controls granularity - of clusters; Large epsilon => More clusters - - Parameters - ---------- - predData: PredictionData - Contains data to use for predicting novel points. - Defined in the HDBSCAN module - - condensed_tree: CondensedTree - Tree structure that contains hierarchical clustering. - Defined in the HDBSCAN module - - n_clusters: int, optional, default=None - If specified, use this to obtain cluster_selection_epsilon - from CondensedTree; Overrides cluster_selection_epsilon parameter - - cluster_selection_epsilon: float, default=0. - In cluster tree, nodes are not split further beyond (>=) this value. - epsilon is the inverse of core distance. - - Returns - ------- - None - """ - # predData must be a pre-trained PredictionData instance from hdbscan - # If n_clusters is specified, compute cluster_selection_epsilon; - if n_clusters is not None: - cluster_selection_epsilon = select_epsilon(condensed_tree, n_clusters) - - # This is the key modification: - # Select clusters according to selection method and epsilon. 
- selected_clusters = _new_select_clusters(condensed_tree, cluster_selection_epsilon) - # _new_select_clusters is a modification of get_clusters - # from hdbscan._hdbscan_tree - - # raw tree, used later to get exemplars and lambda values - raw_condensed_tree = condensed_tree._raw_tree - - # Re-do the cluster map: Map cluster numbers in tree (N, N+1, ..) - # to the cluster labels produced as output - predData.cluster_map = { - int(c): n for n, c in enumerate(sorted(list(selected_clusters))) - } - predData.reverse_cluster_map = {n: c for c, n in predData.cluster_map.items()} - - # Re-compute lambdas and exemplars for selected clusters; - predData.max_lambdas = {} - predData.exemplars = [] - - for cluster in selected_clusters: - # max_lambda <=> smallest distance <=> most persistent point(s) - predData.max_lambdas[cluster] = raw_condensed_tree["lambda_val"][ - raw_condensed_tree["parent"] == cluster - ].max() - - # Map all sub-clusters of selected cluster to the selected cluster's - # label in output. - # Map lambdas too... - for sub_cluster in predData._clusters_below(cluster): - predData.cluster_map[sub_cluster] = predData.cluster_map[cluster] - predData.max_lambdas[sub_cluster] = predData.max_lambdas[cluster] - - # Create set of exemplar points for later use. - # Novel points are assigned based on cluster of closest exemplar. - cluster_exemplars = np.array([], dtype=np.int64) - # For each selected cluster, get all of its leaves, - # and leaves of leaves, and so on... - for leaf in predData._recurse_leaf_dfs(cluster): - # Largest lambda => Most persistent points - leaf_max_lambda = raw_condensed_tree["lambda_val"][ - raw_condensed_tree["parent"] == leaf - ].max() - # Get the most persistent points - points = raw_condensed_tree["child"][ - (raw_condensed_tree["parent"] == leaf) - & (raw_condensed_tree["lambda_val"] == leaf_max_lambda) - ] - # Add most persistent points as exemplars - cluster_exemplars = np.hstack([cluster_exemplars, points]) - - # Add exemplars for each leaf of each selected cluster. - predData.exemplars.append(predData.raw_data[cluster_exemplars]) - return - - -def _new_select_clusters( - condensed_tree, - cluster_selection_epsilon, - allow_single_cluster=False, - match_reference_implementation=False, -): - """ - Adaptation of get_clusters from hdbscan._hdbscan_tree. - Avoids the label and proba computation at the end, - and returns only the selected clusters instead. 
- """ - tree = condensed_tree._raw_tree - cluster_selection_method = condensed_tree.cluster_selection_method - stability = compute_stability(tree) - - if allow_single_cluster: - node_list = sorted(stability.keys(), reverse=True) - else: - node_list = sorted(stability.keys(), reverse=True)[:-1] - # (exclude root) - - cluster_tree = tree[tree["child_size"] > 1] - is_cluster = {cluster: True for cluster in node_list} - - if cluster_selection_method == "eom": - for node in node_list: - child_selection = cluster_tree["parent"] == node - subtree_stability = np.sum( - [stability[child] for child in cluster_tree["child"][child_selection]] - ) - if subtree_stability > stability[node]: - is_cluster[node] = False - stability[node] = subtree_stability - else: - for sub_node in _bfs_from_cluster_tree(cluster_tree, node): - if sub_node != node: - is_cluster[sub_node] = False - - if cluster_selection_epsilon != 0.0: - eom_clusters = set([c for c in is_cluster if is_cluster[c]]) - selected_clusters = epsilon_search( - eom_clusters, - cluster_tree, - cluster_selection_epsilon, - allow_single_cluster, - ) - for c in is_cluster: - if c in selected_clusters: - is_cluster[c] = True - else: - is_cluster[c] = False - - elif cluster_selection_method == "leaf": - leaves = set(get_cluster_tree_leaves(cluster_tree)) - if len(leaves) == 0: - for c in is_cluster: - is_cluster[c] = False - is_cluster[tree["parent"].min()] = True - - if cluster_selection_epsilon != 0.0: - selected_clusters = epsilon_search( - leaves, cluster_tree, cluster_selection_epsilon, allow_single_cluster - ) - else: - selected_clusters = leaves - - for c in is_cluster: - if c in selected_clusters: - is_cluster[c] = True - else: - is_cluster[c] = False - else: - raise ValueError( - 'Invalid Cluster Selection Method: %s\nShould be one of: "eom", "leaf"\n' - ) - - clusters = set([int(c) for c in is_cluster if is_cluster[c]]) - return clusters - - -def epsilon_search( - leaves, cluster_tree, cluster_selection_epsilon, allow_single_cluster -): - selected_clusters = [] - processed = [] - - for leaf in leaves: - eps = 1 / cluster_tree["lambda_val"][cluster_tree["child"] == leaf][0] - if eps < cluster_selection_epsilon: - if leaf not in processed: - epsilon_child = traverse_upwards( - cluster_tree, cluster_selection_epsilon, leaf, allow_single_cluster - ) - if hasattr(epsilon_child, "__len__"): - epsilon_child = epsilon_child[0] - - selected_clusters.append(epsilon_child) - - for sub_node in _bfs_from_cluster_tree(cluster_tree, epsilon_child): - if sub_node != epsilon_child: - processed.append(sub_node) - else: - selected_clusters.append(leaf) - - return set(selected_clusters) - - -def traverse_upwards( - cluster_tree, cluster_selection_epsilon, leaf, allow_single_cluster -): - root = cluster_tree["parent"].min() - parent = cluster_tree[cluster_tree["child"] == leaf]["parent"] - if parent == root: - if allow_single_cluster: - return parent - else: - return leaf # return node closest to root - - parent_eps = 1 / cluster_tree[cluster_tree["child"] == parent]["lambda_val"] - if parent_eps > cluster_selection_epsilon: - return parent - else: - return traverse_upwards( - cluster_tree, cluster_selection_epsilon, parent, allow_single_cluster - ) - - -def clusters_from_prediction_data(prediction_data): - """ - Extract selected clusters from PredictionData instance. 
- """ - return np.array(sorted(list(prediction_data.reverse_cluster_map.values()))).astype( - np.intp - ) diff --git a/sklearn/cluster/_hdbscan/tests/test_flat.py b/sklearn/cluster/_hdbscan/tests/test_flat.py deleted file mode 100644 index db8344e1b612d..0000000000000 --- a/sklearn/cluster/_hdbscan/tests/test_flat.py +++ /dev/null @@ -1,404 +0,0 @@ -""" -Simple tests for flat clustering over HDBSCAN hierarchy -""" -import warnings -import numpy as np - -from sklearn.cluster import ( - HDBSCAN, - approximate_predict, - HDBSCAN_flat, - approximate_predict_flat, - membership_vector_flat, - all_points_membership_vectors_flat, -) - -from sklearn.datasets import make_blobs, make_moons -from sklearn.preprocessing import StandardScaler -from sklearn.model_selection import train_test_split -from sklearn.utils._testing import assert_array_equal, assert_array_less - -# Ignore future warnings thrown by sklearn -warnings.filterwarnings("ignore", category=FutureWarning) - -# Create a nice dataset with 6 circular clusters and 2 moons -centers = [(0, 2), (-0.2, 0), (0.2, 0), (1.5, 0), (2.0, 1.0), (2.5, 0.0)] -std = [0.5, 0.08, 0.06, 0.35, 0.35, 0.35] -X0, y0 = make_blobs( - n_samples=[70, 30, 80, 100, 40, 150], - centers=centers, - cluster_std=std, - random_state=1, -) -X1, y1 = make_moons(n_samples=300, noise=0.07, random_state=42) -X1 += 3.0 -y1 += len(centers) -X = np.vstack((X0, X1)) -y = np.concatenate((y0, y1)) - -X, X_test, y, y_test = train_test_split(X, y, test_size=0.2, random_state=42) -scaler = StandardScaler() -X = scaler.fit_transform(X) -X_test = scaler.transform(X_test) - - -def n_clusters_from_labels(labels_): - return np.amax(labels_) + 1 - - -def test_flat_base_default(): - """ - Verify that the default clustering of HDBSCAN is preserved. - """ - # Given, the base HDBSCAN with method 'eom' - clusterer = HDBSCAN(cluster_selection_method="eom").fit(X) - n_clusters = n_clusters_from_labels(clusterer.labels_) - - # When we ask for flat clustering with same n_clusters, - clusterer_flat = HDBSCAN_flat( - X, n_clusters=n_clusters, cluster_selection_method="eom" - ) - - # Then, the labels and probabilities should match - assert_array_equal(clusterer_flat.labels_, clusterer.labels_) - assert_array_equal(clusterer_flat.probabilities_, clusterer.probabilities_) - - # Given, the base HDBSCAN with method 'leaf' - clusterer = HDBSCAN(cluster_selection_method="leaf").fit(X) - n_clusters = n_clusters_from_labels(clusterer.labels_) - - # When we ask for flat clustering with same n_clusters, - clusterer_flat = HDBSCAN_flat( - X, n_clusters=n_clusters, cluster_selection_method="leaf" - ) - - # Then, the labels and probabilities should match - assert_array_equal(clusterer_flat.labels_, clusterer.labels_) - assert_array_equal(clusterer_flat.probabilities_, clusterer.probabilities_) - return - - -def test_flat_base_epsilon(): - """ - Verify that a clustering of HDBSCAN specified by - cluster_selection_epsilon is preserved. - """ - # Method 'eom'... 
- # Given, a flat clustering for required n_clusters, - n_clusters = 4 - clusterer_flat = HDBSCAN_flat( - X, n_clusters=n_clusters, cluster_selection_method="eom" - ) - - # When we run the base HDBSCAN using it's epsilon, - epsilon = clusterer_flat.cluster_selection_epsilon - clusterer = HDBSCAN( - cluster_selection_method="eom", cluster_selection_epsilon=epsilon - ).fit(X) - - # Then, the labels and probabilities should match - assert_array_equal(clusterer_flat.labels_, clusterer.labels_) - assert_array_equal(clusterer_flat.probabilities_, clusterer.probabilities_) - - # Method 'leaf'... - # Given, a flat clustering for required n_clusters, - n_clusters = 6 - clusterer_flat = HDBSCAN_flat( - X, n_clusters=n_clusters, cluster_selection_method="leaf" - ) - - # When we run the base HDBSCAN using it's epsilon, - epsilon = clusterer_flat.cluster_selection_epsilon - clusterer = HDBSCAN( - cluster_selection_method="leaf", cluster_selection_epsilon=epsilon - ).fit(X) - - # Then, the labels and probabilities should match - assert_array_equal(clusterer_flat.labels_, clusterer.labels_) - assert_array_equal(clusterer_flat.probabilities_, clusterer.probabilities_) - return - - -def test_switch_to_leaf(): - """ - Verify that when we request more clusters than 'eom' can handle, - method switches to 'leaf' and the results match 'leaf'. - """ - # Given the max number of clusters that can be produced by 'eom', - # (these are produced for epsilon=0) (??? Needs verification) - clusterer = HDBSCAN( - cluster_selection_method="eom", cluster_selection_epsilon=0 - ).fit(X) - max_clusters = n_clusters_from_labels(clusterer.labels_) - - with warnings.catch_warnings(record=True) as w: - # When we try flat clustering with 'eom' method for more n_clusters, - clusterer_flat = HDBSCAN_flat( - X, cluster_selection_method="eom", n_clusters=max_clusters + 2 - ) - # Then, a warning is raised saying 'eom' can't get this clustering, - assert len(w) > 0 - assert issubclass(w[-1].category, UserWarning) - assert "Cannot predict" in str(w[-1].message) - - # the resulting clusterer switches to using method 'leaf', - assert ( - clusterer_flat.cluster_selection_method == "leaf" - ), "cluster selection method has not switched to 'leaf'" - # and the resulting probabilities and labels must match - epsilon = clusterer_flat.cluster_selection_epsilon - clusterer_leaf = HDBSCAN( - cluster_selection_method="leaf", cluster_selection_epsilon=epsilon - ).fit(X) - assert_array_equal(clusterer_flat.labels_, clusterer_leaf.labels_) - assert_array_equal(clusterer_flat.probabilities_, clusterer_leaf.probabilities_) - return - - -def test_approx_predict_default(): - """ - Verify that approximate_predict_flat produces same results as default - """ - # Given the base HDBSCAN trained on some data, - clusterer = HDBSCAN( - cluster_selection_method="eom", - cluster_selection_epsilon=0, - prediction_data=True, - ).fit(X) - - # When using approximate_predict_flat without specifying n_clusters, - labels_flat, proba_flat = approximate_predict_flat( - clusterer, X_test, n_clusters=None - ) - - # Then, the clustering should match that due to approximate_predict, - labels_base, proba_base = approximate_predict(clusterer, X_test) - assert_array_equal(labels_flat, labels_base) - assert_array_equal(proba_flat, proba_base) - return - - -def test_approx_predict_same_clusters(): - """ - Verify that approximate_predict_flat produces as many clusters as clusterer - """ - # Given a flat clustering trained for some n_clusters, - n_clusters = 5 - clusterer = 
HDBSCAN_flat(X, cluster_selection_method="eom", n_clusters=n_clusters) - - # When using approximate_predict_flat without specifying n_clusters, - labels_flat, proba_flat = approximate_predict_flat( - clusterer, X_test, n_clusters=None - ) - - # Then, the number of clusters produced must match the original n_clusters - n_clusters_out = n_clusters_from_labels(labels_flat) - assert n_clusters_out == n_clusters - # and all probabilities are <= 1. - assert_array_less(proba_flat, np.ones(len(proba_flat)) + 1.0e-14) - return - - -def test_approx_predict_diff_clusters(): - """ - Verify that approximate_predict_flat produces as many clusters as asked - """ - # Given a flat clustering trained for some n_clusters, - n_clusters_fit = 5 - clusterer = HDBSCAN_flat( - X, - cluster_selection_method="eom", - n_clusters=n_clusters_fit, - prediction_data=True, - ) - - # When using approximate_predict_flat with specified n_clusters, - n_clusters_predict = 3 - labels_flat, proba_flat = approximate_predict_flat( - clusterer, X_test, n_clusters=n_clusters_predict - ) - - # Then, the requested number of clusters must be produced - n_clusters_out = n_clusters_from_labels(labels_flat) - assert n_clusters_out == n_clusters_predict - # and all probabilities are <= 1. - assert_array_less(proba_flat, np.ones(len(proba_flat)) + 1.0e-14) - - # When using approximate_predict_flat with more clusters - # than 'eom' can handle, - n_clusters_predict = 12 - with warnings.catch_warnings(record=True) as w: - labels_flat, proba_flat = approximate_predict_flat( - clusterer, X_test, n_clusters=n_clusters_predict - ) - # Then, a warning is raised saying 'eom' can't get this clustering, - assert len(w) > 0 - assert issubclass(w[-1].category, UserWarning) - assert "Cannot predict" in str(w[-1].message) - # But the requested number of clusters must still be produced using 'leaf' - n_clusters_out = n_clusters_from_labels(labels_flat) - assert n_clusters_out == n_clusters_predict - # and all probabilities are <= 1. - assert_array_less(proba_flat, np.ones(len(proba_flat)) + 1.0e-14) - return - - -def test_mem_vec_same_clusters(): - """ - Verify membership vector produces same n_clusters as clusterer - """ - # Given a flat clustering trained for n_clusters picked by HDBSCAN, - n_clusters_fit = None - clusterer = HDBSCAN_flat(X, n_clusters=n_clusters_fit) - - # When membership_vector_flat is called with new data, - memberships = membership_vector_flat(clusterer, X_test) - - # Then the number of clusters in memberships matches those of clusterer, - assert memberships.shape[1] == n_clusters_from_labels(clusterer.labels_) - # and the number of points should equal those in the test set - assert len(memberships) == len(X_test) - # and all probabilities are <= 1. - assert_array_less(memberships, np.ones(memberships.shape) + 1.0e-14) - - # ======================================== - # Given a flat clustering for a specified n_clusters, - n_clusters_fit = n_clusters_from_labels(clusterer.labels_) - 2 - clusterer = HDBSCAN_flat(X, n_clusters=n_clusters_fit) - - # When membership_vector_flat is called with new data, - memberships = membership_vector_flat(clusterer, X_test) - - # Then the number of clusters in memberships matches those of clusterer, - assert memberships.shape[1] == n_clusters_fit - # and the number of points should equal those in the test set - assert len(memberships) == len(X_test) - # and all probabilities are <= 1. 
- assert_array_less(memberships, np.ones(memberships.shape) + 1.0e-14) - return - - -def test_mem_vec_diff_clusters(): - """ - Verify membership vector produces as many clusters as requested - """ - # Ignore user warnings in this function - warnings.filterwarnings("ignore", category=UserWarning) - - # Given a flat clustering trained for n_clusters picked by HDBSCAN, - n_clusters_fit = None - clusterer = HDBSCAN_flat(X, n_clusters=n_clusters_fit) - n_clusters_fitted = n_clusters_from_labels(clusterer.labels_) - - # When membership_vector_flat is called with new data for some n_clusters, - n_clusters_predict = n_clusters_fitted + 3 - memberships = membership_vector_flat( - clusterer, X_test, n_clusters=n_clusters_predict - ) - - # Then the number of clusters in memberships should be as requested, - assert memberships.shape[1] == n_clusters_predict - # and the number of points should equal those in the test set - assert len(memberships) == len(X_test) - # and all probabilities are <= 1. - assert_array_less(memberships, np.ones(memberships.shape) + 1.0e-14) - - # ======================================== - # Given a flat clustering for a specified n_clusters, - n_clusters_fit = n_clusters_from_labels(clusterer.labels_) + 2 - clusterer = HDBSCAN_flat(X, n_clusters=n_clusters_fit) - - # When membership_vector_flat is called with new data for some n_clusters, - n_clusters_predict = n_clusters_fit + 3 - memberships = membership_vector_flat( - clusterer, X_test, n_clusters=n_clusters_predict - ) - - # Then the number of clusters in memberships should be as requested, - assert memberships.shape[1] == n_clusters_predict - # and the number of points should equal those in the test set - assert len(memberships) == len(X_test) - # and all probabilities are <= 1. - assert_array_less(memberships, np.ones(memberships.shape) + 1.0e-14) - return - - -def test_all_points_mem_vec_same_clusters(): - """ - Verify membership vector for training set produces same n_clusters - as clusterer - """ - # Given a flat clustering trained for n_clusters picked by HDBSCAN, - n_clusters_fit = None - clusterer = HDBSCAN_flat(X, n_clusters=n_clusters_fit) - - # When all_points_membership_vectors_flat is called, - memberships = all_points_membership_vectors_flat(clusterer) - - # Then the number of clusters in memberships matches those of clusterer, - assert memberships.shape[1] == n_clusters_from_labels(clusterer.labels_) - # and the number of points should equal those in the training set - assert len(memberships) == len(X) - # and all probabilities are <= 1. - assert_array_less(memberships, np.ones(memberships.shape) + 1.0e-14) - - # ======================================== - # Given a flat clustering for a specified n_clusters, - n_clusters_fit = n_clusters_from_labels(clusterer.labels_) - 2 - clusterer = HDBSCAN_flat(X, n_clusters=n_clusters_fit) - - # When all_points_membership_vectors_flat is called, - memberships = all_points_membership_vectors_flat(clusterer) - - # Then the number of clusters in memberships matches those of clusterer, - assert memberships.shape[1] == n_clusters_from_labels(clusterer.labels_) - # and the number of points should equal those in the training set - assert len(memberships) == len(X) - # and all probabilities are <= 1. 
- assert_array_less(memberships, np.ones(memberships.shape) + 1.0e-14) - return - - -def test_all_points_mem_vec_diff_clusters(): - """ - Verify membership vector produces as many clusters as requested - """ - # Ignore user warnings in this function - warnings.filterwarnings("ignore", category=UserWarning) - - # Given a flat clustering trained for n_clusters picked by HDBSCAN, - n_clusters_fit = None - clusterer = HDBSCAN_flat(X, n_clusters=n_clusters_fit) - n_clusters_fitted = n_clusters_from_labels(clusterer.labels_) - - # When all_points_membership_vectors_flat is called for some n_clusters, - n_clusters_predict = n_clusters_fitted + 3 - memberships = all_points_membership_vectors_flat( - clusterer, n_clusters=n_clusters_predict - ) - - # Then the number of clusters in memberships should be as requested, - assert memberships.shape[1] == n_clusters_predict - # and the number of points should equal those in the training set - assert len(memberships) == len(X) - # and all probabilities are <= 1. - assert_array_less(memberships, np.ones(memberships.shape) + 1.0e-14) - - # ======================================== - # Given a flat clustering for a specified n_clusters, - n_clusters_fit = n_clusters_from_labels(clusterer.labels_) + 2 - clusterer = HDBSCAN_flat(X, n_clusters=n_clusters_fit) - - # When membership_vector_flat is called for some n_clusters, - n_clusters_predict = n_clusters_fitted + 3 - memberships = all_points_membership_vectors_flat( - clusterer, n_clusters=n_clusters_predict - ) - - # Then the number of clusters in memberships should be as requested, - assert memberships.shape[1] == n_clusters_predict - # and the number of points should equal those in the training set - assert len(memberships) == len(X) - # and all probabilities are <= 1. 
- assert_array_less(memberships, np.ones(memberships.shape) + 1.0e-14) - return From 1ceac4306ee157395d9132a7fb712cfb09bef65d Mon Sep 17 00:00:00 2001 From: Micky774 Date: Thu, 10 Mar 2022 15:43:44 -0500 Subject: [PATCH 021/160] Made memview readonly constant --- sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx b/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx index 80b5ab0b50243..cab42934e5e73 100644 --- a/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx +++ b/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx @@ -288,7 +288,7 @@ cdef class KDTreeBoruvkaAlgorithm (object): cdef object core_dist_tree cdef DistanceMetric dist cdef np.ndarray _data - cdef np.double_t[:, ::1] _raw_data + cdef readonly const np.double_t[:, ::1] _raw_data cdef np.double_t[:, :, ::1] node_bounds cdef np.double_t alpha cdef np.int8_t approx_min_span_tree From 6f20a08ad24cba33d571627c41e965fedeac41e7 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Thu, 10 Mar 2022 17:01:33 -0500 Subject: [PATCH 022/160] Removed experimental/extra API -- may reenable in future PRs --- sklearn/cluster/__init__.py | 10 - sklearn/cluster/_hdbscan/_prediction.py | 713 ------------------ .../_hdbscan/_robust_single_linkage_.py | 50 +- sklearn/cluster/_hdbscan/_trees.py | 193 ----- sklearn/cluster/_hdbscan/hdbscan_.py | 145 +--- .../cluster/_hdbscan/tests/test_hdbscan.py | 52 -- sklearn/cluster/_hdbscan/tests/test_rsl.py | 27 +- 7 files changed, 28 insertions(+), 1162 deletions(-) delete mode 100644 sklearn/cluster/_hdbscan/_prediction.py delete mode 100644 sklearn/cluster/_hdbscan/_trees.py diff --git a/sklearn/cluster/__init__.py b/sklearn/cluster/__init__.py index 1e728d8b8d6ab..e7204028363d8 100644 --- a/sklearn/cluster/__init__.py +++ b/sklearn/cluster/__init__.py @@ -25,12 +25,6 @@ from ._hdbscan.hdbscan_ import HDBSCAN, hdbscan from ._hdbscan._robust_single_linkage_ import RobustSingleLinkage, robust_single_linkage from ._hdbscan._validity import validity_index -from ._hdbscan._prediction import ( - approximate_predict, - membership_vector, - all_points_membership_vectors, - approximate_predict_scores, -) __all__ = [ "AffinityPropagation", @@ -63,8 +57,4 @@ "RobustSingleLinkage", "robust_single_linkage", "validity_index", - "approximate_predict", - "membership_vector", - "all_points_membership_vectors", - "approximate_predict_scores", ] diff --git a/sklearn/cluster/_hdbscan/_prediction.py b/sklearn/cluster/_hdbscan/_prediction.py deleted file mode 100644 index 737de77e85948..0000000000000 --- a/sklearn/cluster/_hdbscan/_prediction.py +++ /dev/null @@ -1,713 +0,0 @@ -# Support various prediction methods for predicting cluster membership -# of new or unseen points. There are several ways to interpret how -# to do this correctly, so we provide several methods for -# the different use cases that may arise. - -import numpy as np - -from sklearn.neighbors import KDTree, BallTree -from .dist_metrics import DistanceMetric -from ._hdbscan_tree import recurse_leaf_dfs -from ._prediction_utils import ( - get_tree_row_with_child, - dist_membership_vector, - outlier_membership_vector, - prob_in_some_cluster, - all_points_dist_membership_vector, - all_points_outlier_membership_vector, - all_points_prob_in_some_cluster, -) -from warnings import warn - - -class PredictionData(object): - """ - Extra data that allows for faster prediction if cached. 
- - Parameters - ---------- - - data : array (n_samples, n_features) - The original data set that was clustered. - - condensed_tree : CondensedTree - The condensed tree object created by a clustering. - - min_samples : int - The min_samples value used in clustering. - - tree_type : str, default="kdtree" - Which type of space tree to use for core distance computation. - One of: - * ``kdtree`` - * ``balltree`` - - metric : str, default="euclidean" - The metric used to determine distance for the clustering. - This is the metric that will be used for the space tree to determine - core distances etc. - - **kwargs : - Any further arguments to the metric. - - Attributes - ---------- - - raw_data : array (n_samples, n_features) - The original data set that was clustered - - tree : KDTree or BallTree - A space partitioning tree that can be queried for nearest neighbors. - - core_distances : array (n_samples,) - The core distances for every point in the original data set. - - cluster_map : dict - A dictionary mapping cluster numbers in the condensed tree to labels - in the final selected clustering. - - cluster_tree : structured array - A version of the condensed tree that only contains clusters, not - individual points. - - max_lambdas : dict - A dictionary mapping cluster numbers in the condensed tree to the - maximum lambda value seen in that cluster. - """ - - _tree_type_map = {"kdtree": KDTree, "balltree": BallTree} - - def _clusters_below(self, cluster): - result = [] - to_process = [cluster] - - while to_process: - result.extend(to_process) - to_process = self.cluster_tree["child"][ - np.in1d(self.cluster_tree["parent"], to_process) - ] - to_process = to_process.tolist() - - return result - - def _recurse_leaf_dfs(self, current_node): - children = self.cluster_tree[self.cluster_tree["parent"] == current_node][ - "child" - ] - if len(children) == 0: - return [ - current_node, - ] - else: - return sum( - [recurse_leaf_dfs(self.cluster_tree, child) for child in children], [] - ) - - def __init__( - self, - data, - condensed_tree, - min_samples, - tree_type="kdtree", - metric="euclidean", - **kwargs, - ): - self.raw_data = data.astype(np.float64) - self.tree = self._tree_type_map[tree_type]( - self.raw_data, metric=metric, **kwargs - ) - self.core_distances = self.tree.query(data, k=min_samples)[0][:, -1] - self.dist_metric = DistanceMetric.get_metric(metric, **kwargs) - - selected_clusters = sorted(condensed_tree._select_clusters()) - # raw_condensed_tree = condensed_tree.to_numpy() - raw_condensed_tree = condensed_tree._raw_tree - - self.cluster_map = {c: n for n, c in enumerate(sorted(list(selected_clusters)))} - self.reverse_cluster_map = {n: c for c, n in self.cluster_map.items()} - - self.cluster_tree = raw_condensed_tree[raw_condensed_tree["child_size"] > 1] - self.max_lambdas = {} - self.leaf_max_lambdas = {} - self.exemplars = [] - - all_clusters = set( - np.hstack([self.cluster_tree["parent"], self.cluster_tree["child"]]) - ) - - for cluster in all_clusters: - self.leaf_max_lambdas[cluster] = raw_condensed_tree["lambda_val"][ - raw_condensed_tree["parent"] == cluster - ].max() - - for cluster in selected_clusters: - self.max_lambdas[cluster] = raw_condensed_tree["lambda_val"][ - raw_condensed_tree["parent"] == cluster - ].max() - - for sub_cluster in self._clusters_below(cluster): - self.cluster_map[sub_cluster] = self.cluster_map[cluster] - self.max_lambdas[sub_cluster] = self.max_lambdas[cluster] - - cluster_exemplars = np.array([], dtype=np.int64) - for leaf in 
self._recurse_leaf_dfs(cluster): - leaf_max_lambda = raw_condensed_tree["lambda_val"][ - raw_condensed_tree["parent"] == leaf - ].max() - points = raw_condensed_tree["child"][ - (raw_condensed_tree["parent"] == leaf) - & (raw_condensed_tree["lambda_val"] == leaf_max_lambda) - ] - cluster_exemplars = np.hstack([cluster_exemplars, points]) - - self.exemplars.append(self.raw_data[cluster_exemplars]) - - -def _find_neighbor_and_lambda( - neighbor_indices, neighbor_distances, core_distances, min_samples -): - """ - Find the nearest mutual reachability neighbor of a point, and compute - the associated lambda value for the point, given the mutual reachability - distance to a nearest neighbor. - - Parameters - ---------- - neighbor_indices : array (2 * min_samples, ) - An array of raw distance based nearest neighbor indices. - - neighbor_distances : array (2 * min_samples, ) - An array of raw distances to the nearest neighbors. - - core_distances : array (n_samples, ) - An array of core distances for all points - - min_samples : int - The min_samples value used to generate core distances. - - Returns - ------- - neighbor : int - The index into the full raw data set of the nearest mutual reachability - distance neighbor of the point. - - lambda_ : float - The lambda value at which this point joins/merges with `neighbor`. - """ - neighbor_core_distances = core_distances[neighbor_indices] - point_core_distances = neighbor_distances[min_samples] * np.ones( - neighbor_indices.shape[0] - ) - mr_distances = np.vstack( - (neighbor_core_distances, point_core_distances, neighbor_distances) - ).max(axis=0) - - nn_index = mr_distances.argmin() - - nearest_neighbor = neighbor_indices[nn_index] - if mr_distances[nn_index] > 0.0: - lambda_ = 1.0 / mr_distances[nn_index] - else: - lambda_ = np.finfo(np.double).max - - return nearest_neighbor, lambda_ - - -def _extend_condensed_tree( - tree, neighbor_indices, neighbor_distances, core_distances, min_samples -): - """ - Create a new condensed tree with an additional point added, allowing for - computations as if this point had been part of the original tree. Note - that this makes as little change to the tree as possible, with no - re-optimizing/re-condensing so that the selected clusters remain - effectively unchanged. - - Parameters - ---------- - tree : structured array - The raw format condensed tree to update. - - neighbor_indices : array (2 * min_samples, ) - An array of raw distance based nearest neighbor indices. - - neighbor_distances : array (2 * min_samples, ) - An array of raw distances to the nearest neighbors. - - core_distances : array (n_samples, ) - An array of core distances for all points - - min_samples : int - The min_samples value used to generate core distances. - - Returns - ------- - new_tree : structured array - The original tree with an extra row providing the parent cluster - and lambda information for a new point given index -1. 
- """ - tree_root = tree["parent"].min() - - nearest_neighbor, lambda_ = _find_neighbor_and_lambda( - neighbor_indices, neighbor_distances, core_distances, min_samples - ) - - neighbor_tree_row = get_tree_row_with_child(tree, nearest_neighbor) - potential_cluster = neighbor_tree_row["parent"] - - if neighbor_tree_row["lambda_val"] <= lambda_: - # New point departs with the old - new_tree_row = (potential_cluster, -1, 1, neighbor_tree_row["lambda_val"]) - else: - # Find appropriate cluster based on lambda of new point - while ( - potential_cluster > tree_root - and tree[tree["child"] == potential_cluster]["lambda_val"] >= lambda_ - ): - potential_cluster = tree["parent"][tree["child"] == potential_cluster][0] - - new_tree_row = (potential_cluster, -1, 1, lambda_) - - return np.append(tree, new_tree_row) - - -def _find_cluster_and_probability( - tree, - cluster_tree, - neighbor_indices, - neighbor_distances, - core_distances, - cluster_map, - max_lambdas, - min_samples, -): - """ - Return the cluster label (of the original clustering) and membership - probability of a new data point. - - Parameters - ---------- - tree : CondensedTree - The condensed tree associated with the clustering. - - cluster_tree : structured_array - The raw form of the condensed tree with only cluster information (no - data on individual points). This is significantly more compact. - - neighbor_indices : array (2 * min_samples, ) - An array of raw distance based nearest neighbor indices. - - neighbor_distances : array (2 * min_samples, ) - An array of raw distances to the nearest neighbors. - - core_distances : array (n_samples, ) - An array of core distances for all points - - cluster_map : dict - A dictionary mapping cluster numbers in the condensed tree to labels - in the final selected clustering. - - max_lambdas : dict - A dictionary mapping cluster numbers in the condensed tree to the - maximum lambda value seen in that cluster. - - min_samples : int - The min_samples value used to generate core distances. - """ - raw_tree = tree._raw_tree - tree_root = cluster_tree["parent"].min() - - nearest_neighbor, lambda_ = _find_neighbor_and_lambda( - neighbor_indices, neighbor_distances, core_distances, min_samples - ) - - neighbor_tree_row = get_tree_row_with_child(raw_tree, nearest_neighbor) - potential_cluster = neighbor_tree_row["parent"] - - if neighbor_tree_row["lambda_val"] > lambda_: - # Find appropriate cluster based on lambda of new point - while ( - potential_cluster > tree_root - and cluster_tree["lambda_val"][cluster_tree["child"] == potential_cluster] - >= lambda_ - ): - potential_cluster = cluster_tree["parent"][ - cluster_tree["child"] == potential_cluster - ][0] - - if potential_cluster in cluster_map: - cluster_label = cluster_map[potential_cluster] - else: - cluster_label = -1 - - if cluster_label >= 0: - max_lambda = max_lambdas[potential_cluster] - - if max_lambda > 0.0: - lambda_ = min(max_lambda, lambda_) - prob = lambda_ / max_lambda - else: - prob = 1.0 - else: - prob = 0.0 - - return cluster_label, prob - - -def approximate_predict(clusterer, points_to_predict): - """ - Predict the cluster label of new points. - - The returned labels will be those of the original clustering found by - ``clusterer``, and therefore are not (necessarily) the cluster labels that - would be found by clustering the original data combined with - ``points_to_predict``, hence the 'approximate' label. 
- - If you simply wish to assign new points to an existing clustering - in the 'best' way possible, this is the function to use. If you - want to predict how ``points_to_predict`` would cluster with - the original data under HDBSCAN the most efficient existing approach - is to simply recluster with the new point(s) added to the original dataset. - - Parameters - ---------- - clusterer : HDBSCAN - A clustering object that has been fit to the data and - either had ``prediction_data=True`` set, or called the - ``generate_prediction_data`` method after the fact. - - points_to_predict : array, or array-like (n_samples, n_features) - The new data points to predict cluster labels for. They should - have the same dimensionality as the original dataset over which - clusterer was fit. - - Returns - ------- - labels : array (n_samples,) - The predicted labels of the ``points_to_predict``. - - probabilities : array (n_samples,) - The soft cluster scores for each of the ``points_to_predict``. - - See Also - -------- - sklearn.cluster.hdbscan.prediction.membership_vector : Predict soft cluster - membership. - sklearn.cluster.hdbscan.prediction.all_points_membership_vectors : Predict - soft cluster membership vectors for all points in the original dataset - the clusterer was trained on. - """ - if clusterer.prediction_data_ is None: - raise ValueError( - "Clusterer does not have prediction data!" - " Try fitting with prediction_data=True set," - " or run generate_prediction_data on the clusterer" - ) - - points_to_predict = np.asarray(points_to_predict) - - if points_to_predict.shape[1] != clusterer.prediction_data_.raw_data.shape[1]: - raise ValueError("New points dimension does not match fit data!") - - if clusterer.prediction_data_.cluster_tree.shape[0] == 0: - warn( - "Clusterer does not have any defined clusters, new data" - " will be automatically predicted as noise." - ) - labels = -1 * np.ones(points_to_predict.shape[0], dtype=np.int32) - probabilities = np.zeros(points_to_predict.shape[0], dtype=np.float32) - return labels, probabilities - - labels = np.empty(points_to_predict.shape[0], dtype=np.int32) - probabilities = np.empty(points_to_predict.shape[0], dtype=np.float64) - - min_samples = clusterer.min_samples or clusterer.min_cluster_size - neighbor_distances, neighbor_indices = clusterer.prediction_data_.tree.query( - points_to_predict, k=2 * min_samples - ) - - for i in range(points_to_predict.shape[0]): - label, prob = _find_cluster_and_probability( - clusterer.condensed_tree_, - clusterer.prediction_data_.cluster_tree, - neighbor_indices[i], - neighbor_distances[i], - clusterer.prediction_data_.core_distances, - clusterer.prediction_data_.cluster_map, - clusterer.prediction_data_.max_lambdas, - min_samples, - ) - labels[i] = label - probabilities[i] = prob - - return labels, probabilities - - -def approximate_predict_scores(clusterer, points_to_predict): - """ - Predict the outlier score of new points. - - The returned scores will be based on the original clustering found by - ``clusterer``, and therefore are not (necessarily) the outlier scores that - would be found by clustering the original data combined with - ``points_to_predict``, hence the 'approximate' label. - - If you simply wish to calculate the outlier scores for new points - in the 'best' way possible, this is the function to use. 
If you - want to predict the outlier score of ``points_to_predict`` with - the original data under HDBSCAN the most efficient existing approach - is to simply recluster with the new point(s) added to the original dataset. - - Parameters - ---------- - clusterer : HDBSCAN - A clustering object that has been fit to the data and - either had ``prediction_data=True`` set, or called the - ``generate_prediction_data`` method after the fact. - - points_to_predict : array, or array-like (n_samples, n_features) - The new data points to predict cluster labels for. They should - have the same dimensionality as the original dataset over which - clusterer was fit. - - Returns - ------- - scores : array (n_samples,) - The predicted scores of the ``points_to_predict``. - - See Also - -------- - sklearn.cluster.hdbscan.prediction.membership_vector : Predict soft cluster - membership. - sklearn.cluster.hdbscan.prediction.all_points_membership_vectors : Predict - soft cluster membership vectors for all points in the original dataset - the clusterer was trained on. - """ - try: - clusterer.prediction_data_ - except AttributeError: - raise ValueError( - "Clusterer does not have prediction data!" - " Try fitting with prediction_data=True set," - " or run generate_prediction_data on the clusterer" - ) - - points_to_predict = np.asarray(points_to_predict) - - if points_to_predict.shape[1] != clusterer.prediction_data_.raw_data.shape[1]: - raise ValueError("New points dimension does not match fit data!") - - if clusterer.prediction_data_.cluster_tree.shape[0] == 0: - warn( - "Clusterer does not have any defined clusters, new data" - " will be automatically predicted as outliers." - ) - scores = np.ones(points_to_predict.shape[0], dtype=np.int32) - return scores - - scores = np.empty(points_to_predict.shape[0], dtype=np.float64) - - min_samples = clusterer.min_samples or clusterer.min_cluster_size - neighbor_distances, neighbor_indices = clusterer.prediction_data_.tree.query( - points_to_predict, k=2 * min_samples - ) - - tree = clusterer.condensed_tree_._raw_tree - - parent_array = tree["parent"] - - tree_root = parent_array.min() - max_lambdas = {} - for parent in np.unique(tree["parent"]): - max_lambdas[parent] = tree[tree["parent"] == parent]["lambda_val"].max() - - for n in np.argsort(parent_array): - cluster = tree["child"][n] - if cluster < tree_root: - break - - parent = parent_array[n] - if max_lambdas[cluster] > max_lambdas[parent]: - max_lambdas[parent] = max_lambdas[cluster] - - for i in range(points_to_predict.shape[0]): - neigh, lambda_ = _find_neighbor_and_lambda( - neighbor_indices[i], - neighbor_distances[i], - clusterer.prediction_data_.core_distances, - min_samples, - ) - - neighbor_tree_row = get_tree_row_with_child(tree, neigh) - potential_cluster = neighbor_tree_row["parent"] - - if neighbor_distances[i].min() == 0: - # the point is in the dataset, fix lambda for rounding errors - lambda_ = neighbor_tree_row["lambda_val"] - - max_lambda = max_lambdas[potential_cluster] - - if max_lambda > 0.0: - scores[i] = (max_lambda - lambda_) / max_lambda - else: - scores[i] = 0.0 - - return scores - - -def membership_vector(clusterer, points_to_predict): - """ - Predict soft cluster membership. - - Predicts sofr cluster membership, producing a vector for each point in - ``points_to_predict`` that gives a probability that the given point is a - member of a cluster for each of the selected clusters of the ``clusterer``. 
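For illustration, a rough standalone sketch of calling this function (assuming, as in the tests elsewhere in this patch, an estimator fit with `prediction_data=True`):

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.cluster import HDBSCAN, membership_vector

X_train, _ = make_blobs(n_samples=200, random_state=10)
clusterer = HDBSCAN(prediction_data=True).fit(X_train)

# membership[i, j] is the soft membership of new point i in selected cluster j;
# each row is normalised and then scaled by the probability of the point
# belonging to any cluster at all.
membership = membership_vector(clusterer, np.array([[-1.5, -1.0]]))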
- - Parameters - ---------- - clusterer : HDBSCAN - A clustering object that has been fit to the data and - either had ``prediction_data=True`` set, or called the - ``generate_prediction_data`` method after the fact. - - points_to_predict : array, or array-like (n_samples, n_features) - The new data points to predict cluster labels for. They should - have the same dimensionality as the original dataset over which - clusterer was fit. - - Returns - ------- - membership_vectors : array (n_samples, n_clusters) - The probability that point ``i`` is a member of cluster ``j`` is - in ``membership_vectors[i, j]``. - - See Also - -------- - sklearn.cluster.hdbscan.prediction.approximate_predict : Predict the - cluster label of new points. - sklearn.cluster.hdbscan.prediction.all_points_membership_vectors : Predict - soft cluster membership vectors for all points in the original dataset - the clusterer was trained on. - """ - - points_to_predict = points_to_predict.astype(np.float64) - clusters = np.array( - sorted(list(clusterer.condensed_tree_._select_clusters())) - ).astype(np.intp) - - result = np.empty((points_to_predict.shape[0], clusters.shape[0]), dtype=np.float64) - - min_samples = clusterer.min_samples or clusterer.min_cluster_size - neighbor_distances, neighbor_indices = clusterer.prediction_data_.tree.query( - points_to_predict, k=2 * min_samples - ) - - for i in range(points_to_predict.shape[0]): - - # We need to find where in the tree the new point would go - # for the purposes of outlier membership approximation - nearest_neighbor, lambda_ = _find_neighbor_and_lambda( - neighbor_indices[i], - neighbor_distances[i], - clusterer.prediction_data_.core_distances, - min_samples, - ) - - neighbor_tree_row = get_tree_row_with_child( - clusterer.condensed_tree_._raw_tree, nearest_neighbor - ) - - if neighbor_tree_row["lambda_val"] <= lambda_: - lambda_ = neighbor_tree_row["lambda_val"] - - distance_vec = dist_membership_vector( - points_to_predict[i], - clusterer.prediction_data_.exemplars, - clusterer.prediction_data_.dist_metric, - ) - outlier_vec = outlier_membership_vector( - nearest_neighbor, - lambda_, - clusters, - clusterer.condensed_tree_._raw_tree, - clusterer.prediction_data_.leaf_max_lambdas, - clusterer.prediction_data_.cluster_tree, - ) - - result[i] = distance_vec**0.5 * outlier_vec**2.0 - result[i] /= result[i].sum() - - result[i] *= prob_in_some_cluster( - nearest_neighbor, - lambda_, - clusters, - clusterer.condensed_tree_._raw_tree, - clusterer.prediction_data_.leaf_max_lambdas, - clusterer.prediction_data_.cluster_tree, - ) - - return result - - -def all_points_membership_vectors(clusterer): - """ - Predict soft cluster membership for all points in the original dataset. - - This function is more efficient by making use of the fact that all points - are already in the condensed tree, and processing in bulk. - - Parameters - ---------- - clusterer : HDBSCAN - A clustering object that has been fit to the data and - either had ``prediction_data=True`` set, or called the - ``generate_prediction_data`` method after the fact. - This method does not work if the clusterer was trained - with ``metric='precomputed'``. - - Returns - ------- - membership_vectors : array (n_samples, n_clusters) - The probability that point ``i`` of the original dataset is a member of - cluster ``j`` is in ``membership_vectors[i, j]``. - - See Also - -------- - sklearn.cluster.hdbscan.prediction.approximate_predict : Predict the - cluster label of new points. 
- sklearn.cluster.hdbscan.prediction.membership_vectors : Predict soft cluster - membership. - """ - clusters = np.array( - sorted(list(clusterer.condensed_tree_._select_clusters())) - ).astype(np.intp) - all_points = clusterer.prediction_data_.raw_data - - # When no clusters found, return array of 0's - if clusters.size == 0: - return np.zeros(all_points.shape[0]) - - distance_vecs = all_points_dist_membership_vector( - all_points, - clusterer.prediction_data_.exemplars, - clusterer.prediction_data_.dist_metric, - ) - outlier_vecs = all_points_outlier_membership_vector( - clusters, - clusterer.condensed_tree_._raw_tree, - clusterer.prediction_data_.leaf_max_lambdas, - clusterer.prediction_data_.cluster_tree, - ) - in_cluster_probs = all_points_prob_in_some_cluster( - clusters, - clusterer.condensed_tree_._raw_tree, - clusterer.prediction_data_.leaf_max_lambdas, - clusterer.prediction_data_.cluster_tree, - ) - - result = distance_vecs * outlier_vecs - row_sums = result.sum(axis=1) - result = result / row_sums[:, np.newaxis] - result *= in_cluster_probs[:, np.newaxis] - - return result diff --git a/sklearn/cluster/_hdbscan/_robust_single_linkage_.py b/sklearn/cluster/_hdbscan/_robust_single_linkage_.py index 1d668ff1c00d7..bd41858dd85e4 100644 --- a/sklearn/cluster/_hdbscan/_robust_single_linkage_.py +++ b/sklearn/cluster/_hdbscan/_robust_single_linkage_.py @@ -15,7 +15,7 @@ from ._hdbscan_boruvka import KDTreeBoruvkaAlgorithm, BallTreeBoruvkaAlgorithm from .dist_metrics import DistanceMetric from ._hdbscan_reachability import mutual_reachability -from ._trees import SingleLinkageTree +from ._hdbscan_tree import labelling_at_cut from sklearn.neighbors import KDTree, BallTree # Author: Leland McInnes @@ -33,10 +33,7 @@ def _rsl_generic(X, k=5, alpha=1.4142135623730951, metric="euclidean", **kwargs) min_spanning_tree = mst_linkage_core(mutual_reachability_) min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :] - single_linkage_tree = label(min_spanning_tree) - single_linkage_tree = SingleLinkageTree(single_linkage_tree) - - return single_linkage_tree + return label(min_spanning_tree) def _rsl_prims_kdtree(X, k=5, alpha=1.4142135623730951, metric="euclidean", **kwargs): @@ -55,10 +52,7 @@ def _rsl_prims_kdtree(X, k=5, alpha=1.4142135623730951, metric="euclidean", **kw core_distances = tree.query(X, k=k)[0][:, -1].copy(order="C") min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha) - single_linkage_tree = label(min_spanning_tree) - single_linkage_tree = SingleLinkageTree(single_linkage_tree) - - return single_linkage_tree + return label(min_spanning_tree) def _rsl_prims_balltree(X, k=5, alpha=1.4142135623730951, metric="euclidean", **kwargs): @@ -77,10 +71,7 @@ def _rsl_prims_balltree(X, k=5, alpha=1.4142135623730951, metric="euclidean", ** core_distances = tree.query(X, k=k)[0][:, -1].copy(order="C") min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha) - single_linkage_tree = label(min_spanning_tree) - single_linkage_tree = SingleLinkageTree(single_linkage_tree) - - return single_linkage_tree + return label(min_spanning_tree) def _rsl_boruvka_kdtree( @@ -99,10 +90,7 @@ def _rsl_boruvka_kdtree( ) min_spanning_tree = alg.spanning_tree() - single_linkage_tree = label(min_spanning_tree) - single_linkage_tree = SingleLinkageTree(single_linkage_tree) - - return single_linkage_tree + return label(min_spanning_tree) def _rsl_boruvka_balltree( @@ -121,10 +109,7 @@ def _rsl_boruvka_balltree( ) min_spanning_tree = 
alg.spanning_tree() - single_linkage_tree = label(min_spanning_tree) - single_linkage_tree = SingleLinkageTree(single_linkage_tree) - - return single_linkage_tree + return label(min_spanning_tree) def robust_single_linkage( @@ -298,9 +283,9 @@ def robust_single_linkage( X, k, alpha, metric, leaf_size, core_dist_n_jobs, **kwargs ) - labels = single_linkage_tree.get_clusters(cut, gamma) + labels = labelling_at_cut(single_linkage_tree, cut, gamma) - return labels, single_linkage_tree.to_numpy() + return labels class RobustSingleLinkage(BaseEstimator, ClusterMixin): @@ -372,14 +357,6 @@ class RobustSingleLinkage(BaseEstimator, ClusterMixin): labels_ : ndarray, shape (n_samples, ) Cluster labels for each point. Noisy samples are given the label -1. - cluster_hierarchy_ : SingleLinkageTree object - The single linkage tree produced during clustering. - This object provides several methods for: - * Plotting - * Generating a flat clustering - * Exporting to NetworkX - * Exporting to Pandas - References ---------- .. [1] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the @@ -431,7 +408,7 @@ def fit(self, X, y=None): del kwargs["metric_params"] kwargs.update(self.metric_params) - self.labels_, self._cluster_hierarchy = robust_single_linkage(X, **kwargs) + self.labels_ = robust_single_linkage(X, **kwargs) return self @@ -453,12 +430,3 @@ def fit_predict(self, X, y=None): self.fit(X) return self.labels_ - - @property - def cluster_hierarchy_(self): - if hasattr(self, "_cluster_hierarchy"): - return SingleLinkageTree(self._cluster_hierarchy) - else: - raise AttributeError( - "No single linkage tree was generated; try running fit first." - ) diff --git a/sklearn/cluster/_hdbscan/_trees.py b/sklearn/cluster/_hdbscan/_trees.py deleted file mode 100644 index 3afdba90ffc2d..0000000000000 --- a/sklearn/cluster/_hdbscan/_trees.py +++ /dev/null @@ -1,193 +0,0 @@ -# -*- coding: utf-8 -*- -# Author: Leland McInnes -# -# License: BSD 3 clause - -import numpy as np - -from ._hdbscan_tree import compute_stability, labelling_at_cut, recurse_leaf_dfs - -CB_LEFT = 0 -CB_RIGHT = 1 -CB_BOTTOM = 2 -CB_TOP = 3 - - -def _bfs_from_cluster_tree(tree, bfs_root): - """ - Perform a breadth first search on a tree in condensed tree format - """ - - result = [] - to_process = [bfs_root] - - while to_process: - result.extend(to_process) - to_process = tree["child"][np.in1d(tree["parent"], to_process)].tolist() - - return result - - -def _recurse_leaf_dfs(cluster_tree, current_node): - children = cluster_tree[cluster_tree["parent"] == current_node]["child"] - if len(children) == 0: - return [ - current_node, - ] - else: - return sum([recurse_leaf_dfs(cluster_tree, child) for child in children], []) - - -def _get_leaves(condensed_tree): - cluster_tree = condensed_tree[condensed_tree["child_size"] > 1] - if cluster_tree.shape[0] == 0: - # Return the only cluster, the root - return [condensed_tree["parent"].min()] - - root = cluster_tree["parent"].min() - return _recurse_leaf_dfs(cluster_tree, root) - - -class CondensedTree(object): - """The condensed tree structure, which provides a simplified or smoothed version - of the :class:`~hdbscan.plots.SingleLinkageTree`. - - Parameters - ---------- - condensed_tree_array : numpy recarray from :class:`~hdbscan.HDBSCAN` - The raw numpy rec array version of the condensed tree as produced - internally by hdbscan. - - cluster_selection_method : string, optional (default 'eom') - The method of selecting clusters. 
One of 'eom' or 'leaf'
-
-    allow_single_cluster : Boolean, optional (default False)
-        Whether to allow the root cluster as the only selected cluster
-
-    """
-
-    def __init__(
-        self,
-        condensed_tree_array,
-        cluster_selection_method="eom",
-        allow_single_cluster=False,
-    ):
-        self._raw_tree = condensed_tree_array
-        self.cluster_selection_method = cluster_selection_method
-        self.allow_single_cluster = allow_single_cluster
-
-    def _select_clusters(self):
-        if self.cluster_selection_method == "eom":
-            stability = compute_stability(self._raw_tree)
-            if self.allow_single_cluster:
-                node_list = sorted(stability.keys(), reverse=True)
-            else:
-                node_list = sorted(stability.keys(), reverse=True)[:-1]
-            cluster_tree = self._raw_tree[self._raw_tree["child_size"] > 1]
-            is_cluster = {cluster: True for cluster in node_list}
-
-            for node in node_list:
-                child_selection = cluster_tree["parent"] == node
-                subtree_stability = np.sum(
-                    [
-                        stability[child]
-                        for child in cluster_tree["child"][child_selection]
-                    ]
-                )
-
-                if subtree_stability > stability[node]:
-                    is_cluster[node] = False
-                    stability[node] = subtree_stability
-                else:
-                    for sub_node in _bfs_from_cluster_tree(cluster_tree, node):
-                        if sub_node != node:
-                            is_cluster[sub_node] = False
-
-            return sorted([cluster for cluster in is_cluster if is_cluster[cluster]])
-
-        elif self.cluster_selection_method == "leaf":
-            return _get_leaves(self._raw_tree)
-        else:
-            raise ValueError(
-                "Invalid Cluster Selection Method: %s\n"
-                'Should be one of: "eom", "leaf"\n'
-            )
-
-    def to_numpy(self):
-        """Return a numpy structured array representation of the condensed tree."""
-        return self._raw_tree.copy()
-
-
-def _get_dendrogram_ordering(parent, linkage, root):
-
-    if parent < root:
-        return []
-
-    return (
-        _get_dendrogram_ordering(int(linkage[parent - root][0]), linkage, root)
-        + _get_dendrogram_ordering(int(linkage[parent - root][1]), linkage, root)
-        + [parent]
-    )
-
-
-class SingleLinkageTree(object):
-    """A single linkage format dendrogram tree, with plotting functionality
-    and networkX support.
-
-    Parameters
-    ----------
-    linkage : ndarray (n_samples, 4)
-        The numpy array that holds the tree structure. As output by
-        scipy.cluster.hierarchy, hdbscan, or fastcluster.
-
-    """
-
-    def __init__(self, linkage):
-        self._linkage = linkage
-
-    def to_numpy(self):
-        """Return a numpy array representation of the single linkage tree.
-
-        This representation conforms to the scipy.cluster.hierarchy notion
-        of a single linkage tree, and can be used with all the associated
-        scipy tools. Please see the scipy documentation for more details
-        on the format.
-        """
-        return self._linkage.copy()
-
-    def get_clusters(self, cut_distance, min_cluster_size=5):
-        """Return a flat clustering from the single linkage hierarchy.
-
-        This represents the result of selecting a cut value for robust single linkage
-        clustering. The `min_cluster_size` allows the flat clustering to declare noise
-        points (and clusters smaller than `min_cluster_size`).
-
-        Parameters
-        ----------
-
-        cut_distance : float
-            The mutual reachability distance cut value to use to generate a
-            flat clustering.
-
-        min_cluster_size : int, default=5
-            Clusters smaller than this value will be called 'noise' and remain
-            unclustered in the resulting flat clustering.
-
-        Returns
-        -------
-
-        labels : array (n_samples,)
-            An array of cluster labels, one per datapoint. Unclustered points
-            are assigned the label -1.
- """ - return labelling_at_cut(self._linkage, cut_distance, min_cluster_size) - - -class MinimumSpanningTree(object): - def __init__(self, mst, data): - self._mst = mst - self._data = data - - def to_numpy(self): - """Return a numpy array of weighted edges in the minimum spanning tree""" - return self._mst.copy() diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index 35aa0a3282cb0..fb12cc6eb6ca3 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -28,15 +28,13 @@ compute_stability, get_clusters, outlier_scores, + labelling_at_cut, ) from ._hdbscan_reachability import mutual_reachability, sparse_mutual_reachability from ._hdbscan_boruvka import KDTreeBoruvkaAlgorithm, BallTreeBoruvkaAlgorithm from .dist_metrics import DistanceMetric -from ._trees import CondensedTree, SingleLinkageTree, MinimumSpanningTree -from ._prediction import PredictionData - FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics + ["cosine", "arccos"] # Author: Leland McInnes @@ -1036,40 +1034,11 @@ class HDBSCAN(BaseEstimator, ClusterMixin): scores can be guage the relative coherence of the clusters output by the algorithm. - condensed_tree_ : CondensedTree object - The condensed tree produced by HDBSCAN. The object has methods - for converting to pandas, networkx, and plotting. - - single_linkage_tree_ : SingleLinkageTree object - The single linkage tree produced by HDBSCAN. The object has methods - for converting to pandas, networkx, and plotting. - - minimum_spanning_tree_ : MinimumSpanningTree object - The minimum spanning tree of the mutual reachability graph generated - by HDBSCAN. Note that this is not generated by default and will only - be available if `gen_min_span_tree` was set to True on object creation. - Even then in some optimized cases a tre may not be generated. - outlier_scores_ : ndarray, shape (n_samples, ) Outlier scores for clustered points; the larger the score the more outlier-like the point. Useful as an outlier detection technique. Based on the GLOSH algorithm by Campello, Moulavi, Zimek and Sander. - prediction_data_ : PredictionData object - Cached data used for predicting the cluster labels of new or - unseen points. Necessary only if you are using functions from - `hdbscan.prediction` (see - :func:`~hdbscan.prediction.approximate_predict`, - :func:`~hdbscan.prediction.membership_vector`, - and :func:`~hdbscan.prediction.all_points_membership_vectors`). - - exemplars_ : list - A list of exemplar points for clusters. Since HDBSCAN supports - arbitrary shapes for clusters we cannot provide a single cluster - exemplar per cluster. Instead a list is returned with each element - of the list being a numpy array of exemplar points for a cluster -- - these points are the "most representative" points of the cluster. - relative_validity_ : float A fast approximation of the Density Based Cluster Validity (DBCV) score [4]. The only differece, and the speed, comes from the fact @@ -1225,19 +1194,19 @@ def fit(self, X, y=None): self.labels_, self.probabilities_, self.cluster_persistence_, - self._condensed_tree, - self._single_linkage_tree, + self._condensed_tree_, + self._single_linkage_tree_, self._min_spanning_tree, ) = hdbscan(X, **kwargs) if self.metric != "precomputed" and not self._all_finite: # remap indices to align with original data in the case of # non-finite entries. 
- self._condensed_tree = remap_condensed_tree( - self._condensed_tree, internal_to_raw, outliers + self._condensed_tree_ = remap_condensed_tree( + self._condensed_tree_, internal_to_raw, outliers ) - self._single_linkage_tree = remap_single_linkage_tree( - self._single_linkage_tree, internal_to_raw, outliers + self._single_linkage_tree_ = remap_single_linkage_tree( + self._single_linkage_tree_, internal_to_raw, outliers ) new_labels = np.full(self._raw_data.shape[0], -1) new_labels[finite_index] = self.labels_ @@ -1273,41 +1242,6 @@ def fit_predict(self, X, y=None): self.fit(X) return self.labels_ - def generate_prediction_data(self): - """ - Create data that caches intermediate results for label prediction. - - Create data that caches intermediate results used for predicting - the label of new/unseen points. This data is only useful if - you are intending to use functions from `hdbscan.prediction`. - """ - - if self.metric in FAST_METRICS: - min_samples = self.min_samples or self.min_cluster_size - if self.metric in KDTree.valid_metrics: - tree_type = "kdtree" - elif self.metric in BallTree.valid_metrics: - tree_type = "balltree" - else: - warn("Metric {} not supported for prediction data!".format(self.metric)) - return - - metric_params = self.metric_params or {} - self._prediction_data = PredictionData( - self._raw_data, - self.condensed_tree_, - min_samples, - tree_type=tree_type, - metric=self.metric, - **metric_params, - ) - else: - warn( - "Cannot generate prediction data for non-vector" - "space inputs -- access to the source data rather" - "than mere distances is required!" - ) - def weighted_cluster_centroid(self, cluster_id): """ Provide an approximate representative point for a given cluster. @@ -1414,21 +1348,10 @@ def dbscan_clustering(self, cut_distance, min_cluster_size=5): An array of cluster labels, one per datapoint. Unclustered points are assigned the label -1. """ - return self.single_linkage_tree_.get_clusters( - cut_distance=cut_distance, - min_cluster_size=min_cluster_size, + return labelling_at_cut( + self._single_linkage_tree_, cut_distance, min_cluster_size ) - @property - def prediction_data_(self): - """ - Cached data for predicting cluster labels of new or unseen points. - """ - if getattr(self, "_prediction_data", None) is None: - raise AttributeError("No prediction data was generated") - else: - return self._prediction_data - @property def outlier_scores_(self): """ @@ -1437,38 +1360,14 @@ def outlier_scores_(self): if getattr(self, "_outlier_scores", None) is not None: return self._outlier_scores else: - if self._condensed_tree is not None: - self._outlier_scores = outlier_scores(self._condensed_tree) + if getattr(self, "_condensed_tree_", None) is not None: + self._outlier_scores = outlier_scores(self._condensed_tree_) return self._outlier_scores else: raise AttributeError( "No condensed tree was generated; try running fit first." ) - @property - def condensed_tree_(self): - """A simplified or smoothed version of `sinkle_linkage_tree_`.""" - if getattr(self, "_condensed_tree", None) is not None: - return CondensedTree( - self._condensed_tree, - self.cluster_selection_method, - self.allow_single_cluster, - ) - else: - raise AttributeError( - "No condensed tree was generated; try running fit first." 
- ) - - @property - def single_linkage_tree_(self): - """A single linkage format dendrogram tree.""" - if getattr(self, "_single_linkage_tree", None) is not None: - return SingleLinkageTree(self._single_linkage_tree) - else: - raise AttributeError( - "No single linkage tree was generated; try running fit first." - ) - @property def minimum_spanning_tree_(self): """ @@ -1476,7 +1375,7 @@ def minimum_spanning_tree_(self): """ if getattr(self, "_min_spanning_tree", None) is not None: if self._raw_data is not None: - return MinimumSpanningTree(self._min_spanning_tree, self._raw_data) + return self._min_spanning_tree, self._raw_data else: warn( "No raw data is available; this may be due to using" @@ -1491,26 +1390,6 @@ def minimum_spanning_tree_(self): " explicit generation of the spanning tree." ) - @property - def exemplars_(self): - """ - A list of exemplar points for clusters. - - These are the "most representative" points of the arbitrarily shaped - clusters. - """ - if getattr(self, "_prediction_data", None) is not None: - return self._prediction_data.exemplars - elif self.metric in FAST_METRICS: - self.generate_prediction_data() - return self._prediction_data.exemplars - else: - raise AttributeError( - "Currently exemplars require the use of vector input data" - "with a suitable metric. This will likely change in the " - "future, but for now no exemplars can be provided" - ) - @property def relative_validity_(self): """ diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index 4cfcba2b17ba3..c92b96c58d5e3 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -7,7 +7,6 @@ from scipy import sparse from scipy import stats from sklearn.utils._testing import ( - assert_array_equal, assert_array_almost_equal, assert_raises, ) @@ -15,9 +14,6 @@ HDBSCAN, hdbscan, validity_index, - approximate_predict, - approximate_predict_scores, - all_points_membership_vectors, ) # from sklearn.cluster.tests.common import generate_clustered_data @@ -32,8 +28,6 @@ from sklearn import datasets -import warnings - n_clusters = 3 # X = generate_clustered_data(n_clusters=n_clusters, n_samples_per_cluster=50) X, y = make_blobs(n_samples=200, random_state=10) @@ -290,58 +284,12 @@ def test_hdbscan_boruvka_matches(tree): assert (num_mismatches / float(data.shape[0])) < 0.15 -def test_tree_numpy_output_formats(): - - clusterer = HDBSCAN(gen_min_span_tree=True).fit(X) - - clusterer.single_linkage_tree_.to_numpy() - clusterer.condensed_tree_.to_numpy() - clusterer.minimum_spanning_tree_.to_numpy() - - def test_hdbscan_outliers(): clusterer = HDBSCAN(gen_min_span_tree=True).fit(X) scores = clusterer.outlier_scores_ assert scores is not None -def test_hdbscan_approximate_predict(): - clusterer = HDBSCAN(prediction_data=True).fit(X) - cluster, _ = approximate_predict(clusterer, np.array([[-1.5, -1.0]])) - assert cluster == 2 - cluster, _ = approximate_predict(clusterer, np.array([[1.5, -1.0]])) - assert cluster == 1 - cluster, _ = approximate_predict(clusterer, np.array([[0.0, 0.0]])) - assert cluster == -1 - - -def test_hdbscan_approximate_predict_score(): - clusterer = HDBSCAN(min_cluster_size=200).fit(X) - # no prediction data error - assert_raises(ValueError, approximate_predict_scores, clusterer, X) - clusterer.generate_prediction_data() - # wrong dimensions error - assert_raises( - ValueError, approximate_predict_scores, clusterer, np.array([[1, 2, 3]]) - ) - with warnings.catch_warnings(record=True) 
as w: - warnings.simplefilter("always") - approximate_predict_scores(clusterer, np.array([[1.5, -1.0]])) - # no clusters warning - assert "Clusterer does not have any defined clusters" in str(w[-1].message) - clusterer = HDBSCAN(prediction_data=True).fit(X) - scores = approximate_predict_scores(clusterer, X) - assert_array_almost_equal(scores, clusterer.outlier_scores_) - assert scores.min() >= 0 - assert scores.max() <= 1 - - -def test_hdbscan_all_points_membership_vectors(): - clusterer = HDBSCAN(prediction_data=True, min_cluster_size=200).fit(X) - vects = all_points_membership_vectors(clusterer) - assert_array_equal(vects, np.zeros(clusterer.prediction_data_.raw_data.shape[0])) - - def test_hdbscan_badargs(): assert_raises(ValueError, hdbscan, X="fail") assert_raises(ValueError, hdbscan, X=None) diff --git a/sklearn/cluster/_hdbscan/tests/test_rsl.py b/sklearn/cluster/_hdbscan/tests/test_rsl.py index a87ef8490afe9..d2a85e897ebee 100644 --- a/sklearn/cluster/_hdbscan/tests/test_rsl.py +++ b/sklearn/cluster/_hdbscan/tests/test_rsl.py @@ -25,7 +25,7 @@ def test_rsl_distance_matrix(): D = distance.squareform(distance.pdist(X)) D /= np.max(D) - labels, tree = robust_single_linkage(D, 0.4, metric="precomputed") + labels = robust_single_linkage(D, 0.4, metric="precomputed") # number of clusters, ignoring noise if present n_clusters_1 = len(set(labels)) - int(-1 in labels) # ignore noise assert n_clusters_1 == 2 @@ -36,7 +36,7 @@ def test_rsl_distance_matrix(): def test_rsl_feature_vector(): - labels, tree = robust_single_linkage(X, 0.4) + labels = robust_single_linkage(X, 0.4) n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters @@ -49,7 +49,7 @@ def test_rsl_callable_metric(): # metric is the function reference, not the string key. 
metric = distance.euclidean - labels, tree = robust_single_linkage(X, 0.4, metric=metric) + labels = robust_single_linkage(X, 0.4, metric=metric) n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters @@ -64,7 +64,7 @@ def test_rsl_input_lists(): def test_rsl_boruvka_balltree(): - labels, tree = robust_single_linkage(X, 0.45, algorithm="boruvka_balltree") + labels = robust_single_linkage(X, 0.45, algorithm="boruvka_balltree") n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters @@ -74,7 +74,7 @@ def test_rsl_boruvka_balltree(): def test_rsl_prims_balltree(): - labels, tree = robust_single_linkage(X, 0.4, algorithm="prims_balltree") + labels = robust_single_linkage(X, 0.4, algorithm="prims_balltree") n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters @@ -84,7 +84,7 @@ def test_rsl_prims_balltree(): def test_rsl_prims_kdtree(): - labels, tree = robust_single_linkage(X, 0.4, algorithm="prims_kdtree") + labels = robust_single_linkage(X, 0.4, algorithm="prims_kdtree") n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters @@ -93,24 +93,11 @@ def test_rsl_prims_kdtree(): assert n_clusters_2 == n_clusters -# def test_rsl_unavailable_hierarchy(): -# clusterer = RobustSingleLinkage() -# with warnings.catch_warnings(record=True) as w: -# tree = clusterer.cluster_hierarchy_ -# assert len(w) > 0 -# assert tree is None - - -def test_rsl_hierarchy(): - clusterer = RobustSingleLinkage().fit(X) - assert clusterer.cluster_hierarchy_ is not None - - def test_rsl_high_dimensional(): H, y = make_blobs(n_samples=50, random_state=0, n_features=64) # H, y = shuffle(X, y, random_state=7) H = StandardScaler().fit_transform(H) - labels, tree = robust_single_linkage(H, 5.5) + labels = robust_single_linkage(H, 5.5) n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters From 9e9be819d5aaa933a94c45c8e408016da06d82f7 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sat, 12 Mar 2022 20:32:44 -0500 Subject: [PATCH 023/160] WIP docstring improvements for RSL --- .../_hdbscan/_robust_single_linkage_.py | 111 ++++++++++-------- 1 file changed, 60 insertions(+), 51 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_robust_single_linkage_.py b/sklearn/cluster/_hdbscan/_robust_single_linkage_.py index bd41858dd85e4..1c29dd285d19c 100644 --- a/sklearn/cluster/_hdbscan/_robust_single_linkage_.py +++ b/sklearn/cluster/_hdbscan/_robust_single_linkage_.py @@ -123,7 +123,7 @@ def robust_single_linkage( memory=None, leaf_size=40, core_dist_n_jobs=4, - **kwargs, + metric_params=None, ): """ Perform robust single linkage clustering. @@ -190,7 +190,7 @@ def robust_single_linkage( supported by the specific algorithm). For ``core_dist_n_jobs`` below -1, (n_cpus + 1 + core_dist_n_jobs) are used. - **kwargs : optional + metric_params : dict, default=None Arguments passed to the distance metric. 
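A brief usage sketch of the function form with a metric that takes extra arguments, passed through `metric_params` rather than `**kwargs` (values are illustrative and mirror the high-dimensional test elsewhere in this series):

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.cluster import robust_single_linkage

X, _ = make_blobs(n_samples=50, random_state=1)
# Metric-specific arguments now travel in a single dict.
labels = robust_single_linkage(
    X, cut=0.4, k=5, metric="seuclidean", metric_params={"V": np.ones(X.shape[1])}
)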
Returns @@ -222,35 +222,29 @@ def robust_single_linkage( if not isinstance(leaf_size, int) or leaf_size < 1: raise ValueError("Leaf size must be at least one!") - if metric == "minkowski": - if "p" not in kwargs or kwargs["p"] is None: - raise TypeError("Minkowski metric given but no p value supplied!") - if kwargs["p"] < 0: - raise ValueError("Minkowski metric with negative p value is not defined!") - X = check_array(X, accept_sparse="csr") memory = Memory(cachedir=memory, verbose=0) if algorithm != "best": if algorithm == "generic": single_linkage_tree = memory.cache(_rsl_generic)( - X, k, alpha, metric, **kwargs + X, k, alpha, metric, **metric_params ) elif algorithm == "prims_kdtree": single_linkage_tree = memory.cache(_rsl_prims_kdtree)( - X, k, alpha, metric, **kwargs + X, k, alpha, metric, **metric_params ) elif algorithm == "prims_balltree": single_linkage_tree = memory.cache(_rsl_prims_balltree)( - X, k, alpha, metric, **kwargs + X, k, alpha, metric, **metric_params ) elif algorithm == "boruvka_kdtree": single_linkage_tree = memory.cache(_rsl_boruvka_kdtree)( - X, k, alpha, metric, leaf_size, core_dist_n_jobs, **kwargs + X, k, alpha, metric, leaf_size, core_dist_n_jobs, **metric_params ) elif algorithm == "boruvka_balltree": single_linkage_tree = memory.cache(_rsl_boruvka_balltree)( - X, k, alpha, metric, leaf_size, core_dist_n_jobs, **kwargs + X, k, alpha, metric, leaf_size, core_dist_n_jobs, **metric_params ) else: raise TypeError("Unknown algorithm type %s specified" % algorithm) @@ -258,29 +252,29 @@ def robust_single_linkage( if issparse(X) or metric not in FAST_METRICS: # We can't do much with sparse matrices ... single_linkage_tree = memory.cache(_rsl_generic)( - X, k, alpha, metric, **kwargs + X, k, alpha, metric, **metric_params ) elif metric in KDTree.valid_metrics: # Need heuristic to decide when to go to boruvka; # still debugging for now if X.shape[1] > 128: single_linkage_tree = memory.cache(_rsl_prims_kdtree)( - X, k, alpha, metric, **kwargs + X, k, alpha, metric, **metric_params ) else: single_linkage_tree = memory.cache(_rsl_boruvka_kdtree)( - X, k, alpha, metric, leaf_size, core_dist_n_jobs, **kwargs + X, k, alpha, metric, leaf_size, core_dist_n_jobs, **metric_params ) else: # Metric is a valid BallTree metric # Need heuristic to decide when to go to boruvka; # still debugging for now if X.shape[1] > 128: single_linkage_tree = memory.cache(_rsl_prims_kdtree)( - X, k, alpha, metric, **kwargs + X, k, alpha, metric, **metric_params ) else: single_linkage_tree = memory.cache(_rsl_boruvka_balltree)( - X, k, alpha, metric, leaf_size, core_dist_n_jobs, **kwargs + X, k, alpha, metric, leaf_size, core_dist_n_jobs, **metric_params ) labels = labelling_at_cut(single_linkage_tree, cut, gamma) @@ -289,8 +283,8 @@ def robust_single_linkage( class RobustSingleLinkage(BaseEstimator, ClusterMixin): - r"""Perform robust single linkage clustering from a vector array - or distance matrix. + r""" + Perform robust single linkage clustering. Robust single linkage is a modified version of single linkage that attempts to be more robust to noise. Specifically the goal is to @@ -300,41 +294,38 @@ class RobustSingleLinkage(BaseEstimator, ClusterMixin): Parameters ---------- - X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ - array of shape (n_samples, n_samples) - A feature array, or array of distances between samples if - ``metric='precomputed'``. 
- - cut : float + cut : float, default=0.4 The reachability distance value to cut the cluster heirarchy at to derive a flat cluster labelling. - k : int, optional (default=5) + k : int, default=5 Reachability distances will be computed with regard to the `k` nearest neighbors. - alpha : float, optional (default=np.sqrt(2)) + alpha : float, default=`np.sqrt(2)` Distance scaling for reachability distance computation. Reachability distance is computed as - $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$. - gamma : int, optional (default=5) + .. math:: + + max \\{ core_k(a), core_k(b), 1/\\alpha d(a,b) \\}. + + gamma : int, default=5 Ignore any clusters in the flat clustering with size less than gamma, and declare points in such clusters as noise points. - metric : string, or callable, optional (default='euclidean') + metric : str, or callable, default='euclidean' The metric to use when calculating distance between instances in a - feature array. If metric is a string or callable, it must be one of - the options allowed by metrics.pairwise.pairwise_distances for its + feature array. + + If metric is a string or callable, it must be one of + the options allowed by `metrics.pairwise.pairwise_distances` for its metric parameter. - If metric is "precomputed", X is assumed to be a distance matrix and - must be square. - metric_params : dict, option (default={}) - Keyword parameter arguments for calling the metric (for example - the p values if using the minkowski metric). + If `metric="precomputed"`, `X` is assumed to be a distance matrix and + must be square. - algorithm : string, optional (default='best') + algorithm : str, default='best' Exactly which algorithm to use; hdbscan has variants specialised for different characteristics of the data. By default this is set to ``best`` which chooses the "best" algorithm given the nature of @@ -345,15 +336,16 @@ class RobustSingleLinkage(BaseEstimator, ClusterMixin): * ``large_kdtree`` * ``large_kdtree_fastcluster`` - - core_dist_n_jobs : int, optional + core_dist_n_jobs : int, default=4 Number of parallel jobs to run in core distance computations (if supported by the specific algorithm). For ``core_dist_n_jobs`` below -1, (n_cpus + 1 + core_dist_n_jobs) are used. - (default 4) + + metric_params : dict, default=None + Arguments passed to the distance metric. Attributes - ------- + ---------- labels_ : ndarray, shape (n_samples, ) Cluster labels for each point. Noisy samples are given the label -1. @@ -363,6 +355,13 @@ class RobustSingleLinkage(BaseEstimator, ClusterMixin): cluster tree. In Advances in Neural Information Processing Systems (pp. 343-351). + See Also + -------- + + + Examples + -------- + >>> import numpy as np """ def __init__( @@ -374,7 +373,7 @@ def __init__( metric="euclidean", algorithm="best", core_dist_n_jobs=4, - metric_params={}, + metric_params=None, ): self.cut = cut @@ -387,8 +386,10 @@ def __init__( self.metric_params = metric_params def fit(self, X, y=None): - """Perform robust single linkage clustering from features or - distance matrix. + """ + Perform robust single linkage clustering on `X`. + + Assumes `X` is either a series of feature vectors or a distance matrix. Parameters ---------- @@ -397,23 +398,28 @@ def fit(self, X, y=None): A feature array, or array of distances between samples if ``metric='precomputed'``. + y : Ignored + Ignored. + Returns ------- self : object - Returns self + Returns self. 
""" X = check_array(X, accept_sparse="csr") kwargs = self.get_params() - del kwargs["metric_params"] - kwargs.update(self.metric_params) + kwargs["metric_params"] = self.metric_params or {} self.labels_ = robust_single_linkage(X, **kwargs) return self def fit_predict(self, X, y=None): - """Performs clustering on X and returns cluster labels. + """ + Perform clustering on X and return cluster labels. + + Assumes `X` is either a series of feature vectors or a distance matrix. Parameters ---------- @@ -422,10 +428,13 @@ def fit_predict(self, X, y=None): A feature array, or array of distances between samples if ``metric='precomputed'``. + y : Ignored + Ignored. + Returns ------- y : ndarray, shape (n_samples, ) - cluster labels + Cluster labels. """ self.fit(X) From 0cd08f339cdb9f64ba6aff559be65b77ba42c207 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sat, 12 Mar 2022 20:43:02 -0500 Subject: [PATCH 024/160] Trimmed and removed unnecessary RSL estimator --- sklearn/cluster/__init__.py | 3 +- .../_hdbscan/_robust_single_linkage_.py | 161 +----------------- sklearn/cluster/_hdbscan/tests/test_rsl.py | 80 +-------- 3 files changed, 3 insertions(+), 241 deletions(-) diff --git a/sklearn/cluster/__init__.py b/sklearn/cluster/__init__.py index e7204028363d8..b7d25cab68720 100644 --- a/sklearn/cluster/__init__.py +++ b/sklearn/cluster/__init__.py @@ -23,7 +23,7 @@ from ._bicluster import SpectralBiclustering, SpectralCoclustering from ._birch import Birch from ._hdbscan.hdbscan_ import HDBSCAN, hdbscan -from ._hdbscan._robust_single_linkage_ import RobustSingleLinkage, robust_single_linkage +from ._hdbscan._robust_single_linkage_ import robust_single_linkage from ._hdbscan._validity import validity_index __all__ = [ @@ -54,7 +54,6 @@ "SpectralCoclustering", "HDBSCAN", "hdbscan", - "RobustSingleLinkage", "robust_single_linkage", "validity_index", ] diff --git a/sklearn/cluster/_hdbscan/_robust_single_linkage_.py b/sklearn/cluster/_hdbscan/_robust_single_linkage_.py index 1c29dd285d19c..d1d815cfc4049 100644 --- a/sklearn/cluster/_hdbscan/_robust_single_linkage_.py +++ b/sklearn/cluster/_hdbscan/_robust_single_linkage_.py @@ -4,7 +4,6 @@ """ import numpy as np -from sklearn.base import BaseEstimator, ClusterMixin from sklearn.metrics import pairwise_distances from scipy.sparse import issparse @@ -224,6 +223,7 @@ def robust_single_linkage( X = check_array(X, accept_sparse="csr") memory = Memory(cachedir=memory, verbose=0) + metric_params = metric_params or {} if algorithm != "best": if algorithm == "generic": @@ -280,162 +280,3 @@ def robust_single_linkage( labels = labelling_at_cut(single_linkage_tree, cut, gamma) return labels - - -class RobustSingleLinkage(BaseEstimator, ClusterMixin): - r""" - Perform robust single linkage clustering. - - Robust single linkage is a modified version of single linkage that - attempts to be more robust to noise. Specifically the goal is to - more accurately approximate the level set tree of the unknown - probability density function from which the sample data has - been drawn. - - Parameters - ---------- - cut : float, default=0.4 - The reachability distance value to cut the cluster heirarchy at - to derive a flat cluster labelling. - - k : int, default=5 - Reachability distances will be computed with regard to the `k` - nearest neighbors. - - alpha : float, default=`np.sqrt(2)` - Distance scaling for reachability distance computation. Reachability - distance is computed as - - .. math:: - - max \\{ core_k(a), core_k(b), 1/\\alpha d(a,b) \\}. 
- - gamma : int, default=5 - Ignore any clusters in the flat clustering with size less than gamma, - and declare points in such clusters as noise points. - - metric : str, or callable, default='euclidean' - The metric to use when calculating distance between instances in a - feature array. - - If metric is a string or callable, it must be one of - the options allowed by `metrics.pairwise.pairwise_distances` for its - metric parameter. - - If `metric="precomputed"`, `X` is assumed to be a distance matrix and - must be square. - - algorithm : str, default='best' - Exactly which algorithm to use; hdbscan has variants specialised - for different characteristics of the data. By default this is set - to ``best`` which chooses the "best" algorithm given the nature of - the data. You can force other options if you believe you know - better. Options are: - * ``small`` - * ``small_kdtree`` - * ``large_kdtree`` - * ``large_kdtree_fastcluster`` - - core_dist_n_jobs : int, default=4 - Number of parallel jobs to run in core distance computations (if - supported by the specific algorithm). For ``core_dist_n_jobs`` - below -1, (n_cpus + 1 + core_dist_n_jobs) are used. - - metric_params : dict, default=None - Arguments passed to the distance metric. - - Attributes - ---------- - labels_ : ndarray, shape (n_samples, ) - Cluster labels for each point. Noisy samples are given the label -1. - - References - ---------- - .. [1] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the - cluster tree. In Advances in Neural Information Processing Systems - (pp. 343-351). - - See Also - -------- - - - Examples - -------- - >>> import numpy as np - """ - - def __init__( - self, - cut=0.4, - k=5, - alpha=1.4142135623730951, - gamma=5, - metric="euclidean", - algorithm="best", - core_dist_n_jobs=4, - metric_params=None, - ): - - self.cut = cut - self.k = k - self.alpha = alpha - self.gamma = gamma - self.metric = metric - self.algorithm = algorithm - self.core_dist_n_jobs = core_dist_n_jobs - self.metric_params = metric_params - - def fit(self, X, y=None): - """ - Perform robust single linkage clustering on `X`. - - Assumes `X` is either a series of feature vectors or a distance matrix. - - Parameters - ---------- - X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ - array of shape (n_samples, n_samples) - A feature array, or array of distances between samples if - ``metric='precomputed'``. - - y : Ignored - Ignored. - - Returns - ------- - self : object - Returns self. - """ - X = check_array(X, accept_sparse="csr") - - kwargs = self.get_params() - kwargs["metric_params"] = self.metric_params or {} - - self.labels_ = robust_single_linkage(X, **kwargs) - - return self - - def fit_predict(self, X, y=None): - """ - Perform clustering on X and return cluster labels. - - Assumes `X` is either a series of feature vectors or a distance matrix. - - Parameters - ---------- - X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ - array of shape (n_samples, n_samples) - A feature array, or array of distances between samples if - ``metric='precomputed'``. - - y : Ignored - Ignored. - - Returns - ------- - y : ndarray, shape (n_samples, ) - Cluster labels. 
- """ - - self.fit(X) - return self.labels_ diff --git a/sklearn/cluster/_hdbscan/tests/test_rsl.py b/sklearn/cluster/_hdbscan/tests/test_rsl.py index d2a85e897ebee..173ab10c6cef9 100644 --- a/sklearn/cluster/_hdbscan/tests/test_rsl.py +++ b/sklearn/cluster/_hdbscan/tests/test_rsl.py @@ -4,15 +4,13 @@ # import pickle import numpy as np from scipy.spatial import distance -from sklearn.utils.estimator_checks import check_estimator from sklearn.utils._testing import assert_raises -from sklearn.cluster import RobustSingleLinkage, robust_single_linkage +from sklearn.cluster import robust_single_linkage from sklearn.datasets import make_blobs from sklearn.utils import shuffle from sklearn.preprocessing import StandardScaler -import pytest n_clusters = 3 X, y = make_blobs(n_samples=50, random_state=1) @@ -30,20 +28,12 @@ def test_rsl_distance_matrix(): n_clusters_1 = len(set(labels)) - int(-1 in labels) # ignore noise assert n_clusters_1 == 2 - labels = RobustSingleLinkage(metric="precomputed").fit(D).labels_ - n_clusters_2 = len(set(labels)) - int(-1 in labels) - assert n_clusters_2 == 2 - def test_rsl_feature_vector(): labels = robust_single_linkage(X, 0.4) n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters - labels = RobustSingleLinkage().fit(X).labels_ - n_clusters_2 = len(set(labels)) - int(-1 in labels) - assert n_clusters_2 == n_clusters - def test_rsl_callable_metric(): # metric is the function reference, not the string key. @@ -53,45 +43,24 @@ def test_rsl_callable_metric(): n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters - labels = RobustSingleLinkage(metric=metric).fit(X).labels_ - n_clusters_2 = len(set(labels)) - int(-1 in labels) - assert n_clusters_2 == n_clusters - - -def test_rsl_input_lists(): - X = [[1.0, 2.0], [3.0, 4.0]] - RobustSingleLinkage().fit(X) # must not raise exception - def test_rsl_boruvka_balltree(): labels = robust_single_linkage(X, 0.45, algorithm="boruvka_balltree") n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters - labels = RobustSingleLinkage(cut=0.45, algorithm="boruvka_balltree").fit(X).labels_ - n_clusters_2 = len(set(labels)) - int(-1 in labels) - assert n_clusters_2 == n_clusters - def test_rsl_prims_balltree(): labels = robust_single_linkage(X, 0.4, algorithm="prims_balltree") n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters - labels = RobustSingleLinkage(algorithm="prims_balltree").fit(X).labels_ - n_clusters_2 = len(set(labels)) - int(-1 in labels) - assert n_clusters_2 == n_clusters - def test_rsl_prims_kdtree(): labels = robust_single_linkage(X, 0.4, algorithm="prims_kdtree") n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters - labels = RobustSingleLinkage(algorithm="prims_kdtree").fit(X).labels_ - n_clusters_2 = len(set(labels)) - int(-1 in labels) - assert n_clusters_2 == n_clusters - def test_rsl_high_dimensional(): H, y = make_blobs(n_samples=50, random_state=0, n_features=64) @@ -101,19 +70,6 @@ def test_rsl_high_dimensional(): n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters - labels = ( - RobustSingleLinkage( - cut=5.5, - algorithm="best", - metric="seuclidean", - metric_params={"V": np.ones(H.shape[1])}, - ) - .fit(H) - .labels_ - ) - n_clusters_2 = len(set(labels)) - int(-1 in labels) - assert n_clusters_2 == n_clusters - def test_rsl_badargs(): assert_raises(ValueError, robust_single_linkage, "fail", 0.4) @@ -122,34 +78,6 @@ 
def test_rsl_badargs(): assert_raises(ValueError, robust_single_linkage, X, 0.4, k=-1) assert_raises(ValueError, robust_single_linkage, X, 0.4, metric="imperial") assert_raises(ValueError, robust_single_linkage, X, 0.4, metric=None) - assert_raises(ValueError, robust_single_linkage, X, 0.4, metric="minkowski", p=-1) - assert_raises( - ValueError, - robust_single_linkage, - X, - 0.4, - metric="minkowski", - p=-1, - algorithm="prims_kdtree", - ) - assert_raises( - ValueError, - robust_single_linkage, - X, - 0.4, - metric="minkowski", - p=-1, - algorithm="prims_balltree", - ) - assert_raises( - ValueError, - robust_single_linkage, - X, - 0.4, - metric="minkowski", - p=-1, - algorithm="boruvka_balltree", - ) assert_raises( ValueError, robust_single_linkage, @@ -188,9 +116,3 @@ def test_rsl_badargs(): assert_raises(TypeError, robust_single_linkage, X, 0.4, metric="minkowski", p=None) assert_raises(ValueError, robust_single_linkage, X, 0.4, leaf_size=0) assert_raises(ValueError, robust_single_linkage, X, 0.4, gamma=0) - - -# Disable for now -- need to refactor to meet newer standards -@pytest.mark.skip(reason="need to refactor to meet newer standards") -def test_rsl_is_sklearn_estimator(): - check_estimator(RobustSingleLinkage) From 7b73dd80b835fb49a4cda2ef233860cf532b5707 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sat, 12 Mar 2022 21:09:47 -0500 Subject: [PATCH 025/160] Updated sqrt2 default in robust_single_linkage --- sklearn/cluster/_hdbscan/_robust_single_linkage_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_hdbscan/_robust_single_linkage_.py b/sklearn/cluster/_hdbscan/_robust_single_linkage_.py index d1d815cfc4049..d69be62397fab 100644 --- a/sklearn/cluster/_hdbscan/_robust_single_linkage_.py +++ b/sklearn/cluster/_hdbscan/_robust_single_linkage_.py @@ -115,7 +115,7 @@ def robust_single_linkage( X, cut, k=5, - alpha=1.4142135623730951, + alpha=np.sqrt(2), gamma=5, metric="euclidean", algorithm="best", From 62cf09e4a0c1d3c5202a283af3d7cc1f23de7805 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sat, 12 Mar 2022 21:35:16 -0500 Subject: [PATCH 026/160] Updated `alpha` arg for rsl functions --- sklearn/cluster/_hdbscan/_robust_single_linkage_.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_robust_single_linkage_.py b/sklearn/cluster/_hdbscan/_robust_single_linkage_.py index d69be62397fab..77309f55c95d8 100644 --- a/sklearn/cluster/_hdbscan/_robust_single_linkage_.py +++ b/sklearn/cluster/_hdbscan/_robust_single_linkage_.py @@ -24,7 +24,7 @@ FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics -def _rsl_generic(X, k=5, alpha=1.4142135623730951, metric="euclidean", **kwargs): +def _rsl_generic(X, k=5, metric="euclidean", **kwargs): distance_matrix = pairwise_distances(X, metric=metric, **kwargs) mutual_reachability_ = mutual_reachability(distance_matrix, k) @@ -35,7 +35,7 @@ def _rsl_generic(X, k=5, alpha=1.4142135623730951, metric="euclidean", **kwargs) return label(min_spanning_tree) -def _rsl_prims_kdtree(X, k=5, alpha=1.4142135623730951, metric="euclidean", **kwargs): +def _rsl_prims_kdtree(X, k=5, alpha=np.sqrt(2), metric="euclidean", **kwargs): # The Cython routines used require contiguous arrays if not X.flags["C_CONTIGUOUS"]: @@ -54,7 +54,7 @@ def _rsl_prims_kdtree(X, k=5, alpha=1.4142135623730951, metric="euclidean", **kw return label(min_spanning_tree) -def _rsl_prims_balltree(X, k=5, alpha=1.4142135623730951, metric="euclidean", **kwargs): +def _rsl_prims_balltree(X, k=5, 
alpha=np.sqrt(2), metric="euclidean", **kwargs): # The Cython routines used require contiguous arrays if not X.flags["C_CONTIGUOUS"]: @@ -228,7 +228,7 @@ def robust_single_linkage( if algorithm != "best": if algorithm == "generic": single_linkage_tree = memory.cache(_rsl_generic)( - X, k, alpha, metric, **metric_params + X, k, metric, **metric_params ) elif algorithm == "prims_kdtree": single_linkage_tree = memory.cache(_rsl_prims_kdtree)( @@ -252,7 +252,7 @@ def robust_single_linkage( if issparse(X) or metric not in FAST_METRICS: # We can't do much with sparse matrices ... single_linkage_tree = memory.cache(_rsl_generic)( - X, k, alpha, metric, **metric_params + X, k, metric, **metric_params ) elif metric in KDTree.valid_metrics: # Need heuristic to decide when to go to boruvka; From f48e1487898cb6156871d52c477285673fd854df Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 13 Mar 2022 21:14:32 -0400 Subject: [PATCH 027/160] Added WIP section for HDBSCAN in User Guide --- doc/modules/clustering.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 1775ec5386ab5..3f1c059e3a7f6 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -880,6 +880,16 @@ by black points below. Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017). In ACM Transactions on Database Systems (TODS), 42(3), 19. +.. _hdbscan: + +HDBSCAN [WIP] +============= + +The :class:`HDBSCAN` algorithm views clusters as areas of high density +separated by areas of low density, similarly to :class:`DBSCAN`. However + + + .. _optics: OPTICS From 87071a4a8060cd0210cb6a95e420f66d9e59841a Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 13 Mar 2022 21:26:23 -0400 Subject: [PATCH 028/160] Replaced custom `dist_metrics` w/ `metric._dist_metrics` --- sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx | 2 +- sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx | 2 +- .../cluster/_hdbscan/_prediction_utils.pyx | 2 +- .../_hdbscan/_robust_single_linkage_.py | 2 +- sklearn/cluster/_hdbscan/dist_metrics.pxd | 94 -- sklearn/cluster/_hdbscan/dist_metrics.pyx | 1147 ----------------- sklearn/cluster/_hdbscan/hdbscan_.py | 2 +- sklearn/cluster/setup.py | 7 - 8 files changed, 5 insertions(+), 1253 deletions(-) delete mode 100644 sklearn/cluster/_hdbscan/dist_metrics.pxd delete mode 100644 sklearn/cluster/_hdbscan/dist_metrics.pyx diff --git a/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx b/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx index cab42934e5e73..c82f5f4eccad7 100644 --- a/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx +++ b/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx @@ -65,7 +65,7 @@ from libc.math cimport fabs, pow from sklearn.neighbors import KDTree, BallTree -from .dist_metrics cimport DistanceMetric +from sklearn.metrics._dist_metrics cimport DistanceMetric from joblib import Parallel, delayed diff --git a/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx b/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx index ddb1db48e8622..82c7bcebef6b3 100644 --- a/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx @@ -11,7 +11,7 @@ import cython from libc.float cimport DBL_MAX from libc.stdio cimport printf -from .dist_metrics cimport DistanceMetric +from sklearn.metrics._dist_metrics cimport DistanceMetric cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core( diff --git a/sklearn/cluster/_hdbscan/_prediction_utils.pyx b/sklearn/cluster/_hdbscan/_prediction_utils.pyx index a6a7c13489666..c1bfb48633595 
100644 --- a/sklearn/cluster/_hdbscan/_prediction_utils.pyx +++ b/sklearn/cluster/_hdbscan/_prediction_utils.pyx @@ -6,7 +6,7 @@ import numpy as np cimport numpy as np -from .dist_metrics cimport DistanceMetric +from sklearn.metrics._dist_metrics cimport DistanceMetric from libc.float cimport DBL_MAX from libc.math cimport exp diff --git a/sklearn/cluster/_hdbscan/_robust_single_linkage_.py b/sklearn/cluster/_hdbscan/_robust_single_linkage_.py index 77309f55c95d8..8539f867057ce 100644 --- a/sklearn/cluster/_hdbscan/_robust_single_linkage_.py +++ b/sklearn/cluster/_hdbscan/_robust_single_linkage_.py @@ -12,7 +12,7 @@ from ._hdbscan_linkage import mst_linkage_core, mst_linkage_core_vector, label from ._hdbscan_boruvka import KDTreeBoruvkaAlgorithm, BallTreeBoruvkaAlgorithm -from .dist_metrics import DistanceMetric +from sklearn.metrics._dist_metrics import DistanceMetric from ._hdbscan_reachability import mutual_reachability from ._hdbscan_tree import labelling_at_cut from sklearn.neighbors import KDTree, BallTree diff --git a/sklearn/cluster/_hdbscan/dist_metrics.pxd b/sklearn/cluster/_hdbscan/dist_metrics.pxd deleted file mode 100644 index df3c8af85b105..0000000000000 --- a/sklearn/cluster/_hdbscan/dist_metrics.pxd +++ /dev/null @@ -1,94 +0,0 @@ -#!python -#cython: boundscheck=False -#cython: wraparound=False -#cython: cdivision=True - -import cython -cimport cython - -import numpy as np -cimport numpy as np - -from libc.math cimport fabs, sqrt, exp, cos, pow - -ctypedef np.double_t DTYPE_t -ctypedef np.intp_t ITYPE_t - -cdef enum: - DTYPECODE = np.NPY_FLOAT64 - ITYPECODE = np.NPY_INTP - -# Fused type for certain operations -ctypedef fused DITYPE_t: - ITYPE_t - DTYPE_t - -ITYPE = np.intp - -DTYPE = np.double - -###################################################################### -# Inline distance functions -# -# We use these for the default (euclidean) case so that they can be -# inlined. This leads to faster computation for the most common case -cdef inline DTYPE_t euclidean_dist(DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef DTYPE_t tmp, d=0 - cdef np.intp_t j - for j in range(size): - tmp = x1[j] - x2[j] - d += tmp * tmp - return sqrt(d) - - -cdef inline DTYPE_t euclidean_rdist(DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef DTYPE_t tmp, d=0 - cdef np.intp_t j - for j in range(size): - tmp = x1[j] - x2[j] - d += tmp * tmp - return d - - -cdef inline DTYPE_t euclidean_dist_to_rdist(DTYPE_t dist) nogil except -1: - return dist * dist - - -cdef inline DTYPE_t euclidean_rdist_to_dist(DTYPE_t dist) except -1: - return sqrt(dist) - - -###################################################################### -# DistanceMetric base class -cdef class DistanceMetric: - # The following attributes are required for a few of the subclasses. - # we must define them here so that cython's limited polymorphism will work. - # Because we don't expect to instantiate a lot of these objects, the - # extra memory overhead of this setup should not be an issue. 
- cdef DTYPE_t p - #cdef DTYPE_t[::1] vec - #cdef DTYPE_t[:, ::1] mat - cdef np.ndarray vec - cdef np.ndarray mat - cdef DTYPE_t* vec_ptr - cdef DTYPE_t* mat_ptr - cdef ITYPE_t size - cdef object func - cdef object kwargs - - cdef DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1 - - cdef DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1 - - cdef int pdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1 - - cdef int cdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] Y, - DTYPE_t[:, ::1] D) except -1 - - cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1 - - cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1 diff --git a/sklearn/cluster/_hdbscan/dist_metrics.pyx b/sklearn/cluster/_hdbscan/dist_metrics.pyx deleted file mode 100644 index 7416a9ffa62ce..0000000000000 --- a/sklearn/cluster/_hdbscan/dist_metrics.pyx +++ /dev/null @@ -1,1147 +0,0 @@ -# !python -# cython: boundscheck=False -# cython: wraparound=False -# cython: cdivision=True - -# By Jake Vanderplas (2013) -# written for the scikit-learn project -# modified for HDBSCAN Dual Tree Boruvka algorithm -# License: BSD - -import numpy as np -cimport numpy as np -np.import_array() # required in order to use C-API - -from libc.math cimport fabs, sqrt, exp, cos, pow, log, acos, M_PI - -DTYPE = np.double -ITYPE = np.intp - - -###################################################################### -# Numpy 1.3-1.4 compatibility utilities -cdef DTYPE_t[:, ::1] get_memview_DTYPE_2D( - np.ndarray[DTYPE_t, ndim=2, mode='c'] X): - return ( X.data) - - -cdef DTYPE_t* get_vec_ptr(np.ndarray[DTYPE_t, ndim=1, mode='c'] vec): - return &vec[0] - - -cdef DTYPE_t* get_mat_ptr(np.ndarray[DTYPE_t, ndim=2, mode='c'] mat): - return &mat[0, 0] -###################################################################### - - -# First, define a function to get an ndarray from a memory bufffer -cdef extern from "numpy/arrayobject.h": - object PyArray_SimpleNewFromData(int nd, np.npy_intp* dims, - int typenum, void* data) - - -cdef inline np.ndarray _buffer_to_ndarray(DTYPE_t* x, np.npy_intp n): - # Wrap a memory buffer with an ndarray. Warning: this is not robust. - # In particular, if x is deallocated before the returned array goes - # out of scope, this could cause memory errors. Since there is not - # a possibility of this for our use-case, this should be safe. 
- - # Note: this Segfaults unless np.import_array() is called above - return PyArray_SimpleNewFromData(1, &n, DTYPECODE, x) - - -# some handy constants -from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin -cdef DTYPE_t INF = np.inf - - -###################################################################### -# newObj function -# this is a helper function for pickling -def newObj(obj): - return obj.__new__(obj) - - -###################################################################### -# metric mappings -# These map from metric id strings to class names -METRIC_MAPPING = {'euclidean': EuclideanDistance, - 'l2': EuclideanDistance, - 'minkowski': MinkowskiDistance, - 'p': MinkowskiDistance, - 'manhattan': ManhattanDistance, - 'cityblock': ManhattanDistance, - 'l1': ManhattanDistance, - 'chebyshev': ChebyshevDistance, - 'infinity': ChebyshevDistance, - 'seuclidean': SEuclideanDistance, - 'mahalanobis': MahalanobisDistance, - 'wminkowski': WMinkowskiDistance, - 'hamming': HammingDistance, - 'canberra': CanberraDistance, - 'braycurtis': BrayCurtisDistance, - 'matching': MatchingDistance, - 'jaccard': JaccardDistance, - 'dice': DiceDistance, - 'kulsinski': KulsinskiDistance, - 'rogerstanimoto': RogersTanimotoDistance, - 'russellrao': RussellRaoDistance, - 'sokalmichener': SokalMichenerDistance, - 'sokalsneath': SokalSneathDistance, - 'haversine': HaversineDistance, - 'cosine': ArccosDistance, - 'arccos': ArccosDistance, - 'pyfunc': PyFuncDistance} - - -def get_valid_metric_ids(L): - """Given an iterable of metric class names or class identifiers, - return a list of metric IDs which map to those classes. - - Examples - -------- - >>> L = get_valid_metric_ids([EuclideanDistance, 'ManhattanDistance']) - >>> sorted(L) - ['cityblock', 'euclidean', 'l1', 'l2', 'manhattan'] - """ - return [key for (key, val) in METRIC_MAPPING.items() - if (val.__name__ in L) or (val in L)] - - -###################################################################### -# Distance Metric Classes -cdef class DistanceMetric: - """DistanceMetric class - - This class provides a uniform interface to fast distance metric - functions. The various metrics can be accessed via the `get_metric` - class method and the metric string identifier (see below). - - Examples - -------- - - For example, to use the Euclidean distance: - - >>> dist = DistanceMetric.get_metric('euclidean') - >>> X = [[0, 1, 2], - [3, 4, 5]]) - >>> dist.pairwise(X) - array([[ 0. , 5.19615242], - [ 5.19615242, 0. 
]]) - - Available Metrics - The following lists the string metric identifiers and the associated - distance metric classes: - - **Metrics intended for real-valued vector spaces:** - - ============== ==================== ======== =============================== - identifier class name args distance function - -------------- -------------------- -------- ------------------------------- - "euclidean" EuclideanDistance - ``sqrt(sum((x - y)^2))`` - "manhattan" ManhattanDistance - ``sum(|x - y|)`` - "chebyshev" ChebyshevDistance - ``sum(max(|x - y|))`` - "minkowski" MinkowskiDistance p ``sum(|x - y|^p)^(1/p)`` - "wminkowski" WMinkowskiDistance p, w ``sum(w * |x - y|^p)^(1/p)`` - "seuclidean" SEuclideanDistance V ``sqrt(sum((x - y)^2 / V))`` - "mahalanobis" MahalanobisDistance V or VI ``sqrt((x - y)' V^-1 (x - y))`` - ============== ==================== ======== =============================== - - **Metrics intended for two-dimensional vector spaces:** Note that the haversine - distance metric requires data in the form of [latitude, longitude] and both - inputs and outputs are in units of radians. - - ============ ================== ======================================== - identifier class name distance function - ------------ ------------------ ---------------------------------------- - "haversine" HaversineDistance 2 arcsin(sqrt(sin^2(0.5*dx) - + cos(x1)cos(x2)sin^2(0.5*dy))) - ============ ================== ======================================== - - - **Metrics intended for integer-valued vector spaces:** Though intended - for integer-valued vectors, these are also valid metrics in the case of - real-valued vectors. - - ============= ==================== ======================================== - identifier class name distance function - ------------- -------------------- ---------------------------------------- - "hamming" HammingDistance ``N_unequal(x, y) / N_tot`` - "canberra" CanberraDistance ``sum(|x - y| / (|x| + |y|))`` - "braycurtis" BrayCurtisDistance ``sum(|x - y|) / (sum(|x|) + sum(|y|))`` - ============= ==================== ======================================== - - **Metrics intended for boolean-valued vector spaces:** Any nonzero entry - is evaluated to "True". 
In the listings below, the following - abbreviations are used: - - - N : number of dimensions - - NTT : number of dims in which both values are True - - NTF : number of dims in which the first value is True, second is False - - NFT : number of dims in which the first value is False, second is True - - NFF : number of dims in which both values are False - - NNEQ : number of non-equal dimensions, NNEQ = NTF + NFT - - NNZ : number of nonzero dimensions, NNZ = NTF + NFT + NTT - - ================= ======================= =============================== - identifier class name distance function - ----------------- ----------------------- ------------------------------- - "jaccard" JaccardDistance NNEQ / NNZ - "maching" MatchingDistance NNEQ / N - "dice" DiceDistance NNEQ / (NTT + NNZ) - "kulsinski" KulsinskiDistance (NNEQ + N - NTT) / (NNEQ + N) - "rogerstanimoto" RogersTanimotoDistance 2 * NNEQ / (N + NNEQ) - "russellrao" RussellRaoDistance NNZ / N - "sokalmichener" SokalMichenerDistance 2 * NNEQ / (N + NNEQ) - "sokalsneath" SokalSneathDistance NNEQ / (NNEQ + 0.5 * NTT) - ================= ======================= =============================== - - **User-defined distance:** - - =========== =============== ======= - identifier class name args - ----------- --------------- ------- - "pyfunc" PyFuncDistance func - =========== =============== ======= - - Here ``func`` is a function which takes two one-dimensional numpy - arrays, and returns a distance. Note that in order to be used within - the BallTree, the distance must be a true metric: - i.e. it must satisfy the following properties - - 1) Non-negativity: d(x, y) >= 0 - 2) Identity: d(x, y) = 0 if and only if x == y - 3) Symmetry: d(x, y) = d(y, x) - 4) Triangle Inequality: d(x, y) + d(y, z) >= d(x, z) - - Because of the Python object overhead involved in calling the python - function, this will be fairly slow, but it will have the same - scaling as other distances. - """ - def __cinit__(self): - self.p = 2 - self.vec = np.zeros(1, dtype=DTYPE, order='c') - self.mat = np.zeros((1, 1), dtype=DTYPE, order='c') - self.vec_ptr = get_vec_ptr(self.vec) - self.mat_ptr = get_mat_ptr(self.mat) - self.size = 1 - - def __reduce__(self): - """ - reduce method used for pickling - """ - return (newObj, (self.__class__,), self.__getstate__()) - - def __getstate__(self): - """ - get state for pickling - """ - if self.__class__.__name__ == "PyFuncDistance": - return (float(self.p), self.vec, self.mat, self.func, self.kwargs) - return (float(self.p), self.vec, self.mat) - - def __setstate__(self, state): - """ - set state for pickling - """ - self.p = state[0] - self.vec = state[1] - self.mat = state[2] - if self.__class__.__name__ == "PyFuncDistance": - self.func = state[3] - self.kwargs = state[4] - self.vec_ptr = get_vec_ptr(self.vec) - self.mat_ptr = get_mat_ptr(self.mat) - self.size = 1 - - @classmethod - def get_metric(cls, metric, **kwargs): - """Get the given distance metric from the string identifier. - - See the docstring of DistanceMetric for a list of available metrics. 
- - Parameters - ---------- - metric : string or class name - The distance metric to use - **kwargs - additional arguments will be passed to the requested metric - """ - if isinstance(metric, DistanceMetric): - return metric - - if callable(metric): - return PyFuncDistance(metric, **kwargs) - - # Map the metric string ID to the metric class - if isinstance(metric, type) and issubclass(metric, DistanceMetric): - pass - else: - try: - metric = METRIC_MAPPING[metric] - except: - raise ValueError("Unrecognized metric '%s'" % metric) - - # In Minkowski special cases, return more efficient methods - if metric is MinkowskiDistance: - p = kwargs.pop('p', 2) - if p == 1: - return ManhattanDistance(**kwargs) - elif p == 2: - return EuclideanDistance(**kwargs) - elif np.isinf(p): - return ChebyshevDistance(**kwargs) - else: - return MinkowskiDistance(p, **kwargs) - else: - return metric(**kwargs) - - def __init__(self): - if self.__class__ is DistanceMetric: - raise NotImplementedError("DistanceMetric is an abstract class") - - cdef DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - """Compute the distance between vectors x1 and x2 - - This should be overridden in a base class. - """ - return -999 - - cdef DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - """Compute the reduced distance between vectors x1 and x2. - - This can optionally be overridden in a base class. - - The reduced distance is any measure that yields the same rank as the - distance, but is more efficient to compute. For example, for the - Euclidean metric, the reduced distance is the squared-euclidean - distance. - """ - return self.dist(x1, x2, size) - - cdef int pdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1: - """compute the pairwise distances between points in X""" - cdef ITYPE_t i1, i2 - for i1 in range(X.shape[0]): - for i2 in range(i1, X.shape[0]): - D[i1, i2] = self.dist(&X[i1, 0], &X[i2, 0], X.shape[1]) - D[i2, i1] = D[i1, i2] - return 0 - - cdef int cdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] Y, - DTYPE_t[:, ::1] D) except -1: - """compute the cross-pairwise distances between arrays X and Y""" - cdef ITYPE_t i1, i2 - if X.shape[1] != Y.shape[1]: - raise ValueError('X and Y must have the same second dimension') - for i1 in range(X.shape[0]): - for i2 in range(Y.shape[0]): - D[i1, i2] = self.dist(&X[i1, 0], &Y[i2, 0], X.shape[1]) - return 0 - - cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: - """Convert the reduced distance to the distance""" - return rdist - - cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: - """Convert the distance to the reduced distance""" - return dist - - def rdist_to_dist(self, rdist): - """Convert the Reduced distance to the true distance. - - The reduced distance, defined for some metrics, is a computationally - more efficent measure which preserves the rank of the true distance. - For example, in the Euclidean distance metric, the reduced distance - is the squared-euclidean distance. - """ - return rdist - - def dist_to_rdist(self, dist): - """Convert the true distance to the reduced distance. - - The reduced distance, defined for some metrics, is a computationally - more efficent measure which preserves the rank of the true distance. - For example, in the Euclidean distance metric, the reduced distance - is the squared-euclidean distance. 
- """ - return dist - - def pairwise(self, X, Y=None): - """Compute the pairwise distances between X and Y - - This is a convenience routine for the sake of testing. For many - metrics, the utilities in scipy.spatial.distance.cdist and - scipy.spatial.distance.pdist will be faster. - - Parameters - ---------- - X : array_like - Array of shape (Nx, D), representing Nx points in D dimensions. - Y : array_like (optional) - Array of shape (Ny, D), representing Ny points in D dimensions. - If not specified, then Y=X. - Returns - ------- - dist : ndarray - The shape (Nx, Ny) array of pairwise distances between points in - X and Y. - """ - cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Xarr - cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Yarr - cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Darr - - Xarr = np.asarray(X, dtype=DTYPE, order='C') - if Y is None: - Darr = np.zeros((Xarr.shape[0], Xarr.shape[0]), - dtype=DTYPE, order='C') - self.pdist(get_memview_DTYPE_2D(Xarr), - get_memview_DTYPE_2D(Darr)) - else: - Yarr = np.asarray(Y, dtype=DTYPE, order='C') - Darr = np.zeros((Xarr.shape[0], Yarr.shape[0]), - dtype=DTYPE, order='C') - self.cdist(get_memview_DTYPE_2D(Xarr), - get_memview_DTYPE_2D(Yarr), - get_memview_DTYPE_2D(Darr)) - return Darr - - -# ------------------------------------------------------------ -# Euclidean Distance -# d = sqrt(sum(x_i^2 - y_i^2)) -cdef class EuclideanDistance(DistanceMetric): - """Euclidean Distance metric - - .. math:: - D(x, y) = \sqrt{ \sum_i (x_i - y_i) ^ 2 } - """ - def __init__(self): - self.p = 2 - - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - return euclidean_dist(x1, x2, size) - - cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - return euclidean_rdist(x1, x2, size) - - cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: - return sqrt(rdist) - - cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: - return dist * dist - - def rdist_to_dist(self, rdist): - return np.sqrt(rdist) - - def dist_to_rdist(self, dist): - return dist ** 2 - - -# ------------------------------------------------------------ -# SEuclidean Distance -# d = sqrt(sum((x_i - y_i2)^2 / v_i)) -cdef class SEuclideanDistance(DistanceMetric): - """Standardized Euclidean Distance metric - - .. math:: - D(x, y) = \sqrt{ \sum_i \frac{ (x_i - y_i) ^ 2}{V_i} } - """ - def __init__(self, V): - self.vec = np.asarray(V, dtype=DTYPE) - self.vec_ptr = get_vec_ptr(self.vec) - self.size = self.vec.shape[0] - self.p = 2 - - cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - if size != self.size: - with gil: - raise ValueError('SEuclidean dist: size of V does not match') - cdef DTYPE_t tmp, d=0 - cdef np.intp_t j - for j in range(size): - tmp = x1[j] - x2[j] - d += tmp * tmp / self.vec_ptr[j] - return d - - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - return sqrt(self.rdist(x1, x2, size)) - - cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: - return sqrt(rdist) - - cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: - return dist * dist - - def rdist_to_dist(self, rdist): - return np.sqrt(rdist) - - def dist_to_rdist(self, dist): - return dist ** 2 - - -# ------------------------------------------------------------ -# Manhattan Distance -# d = sum(abs(x_i - y_i)) -cdef class ManhattanDistance(DistanceMetric): - """Manhattan/City-block Distance metric - - .. 
math:: - D(x, y) = \sum_i |x_i - y_i| - """ - def __init__(self): - self.p = 1 - - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef DTYPE_t d = 0 - cdef np.intp_t j - for j in range(size): - d += fabs(x1[j] - x2[j]) - return d - - -# ------------------------------------------------------------ -# Chebyshev Distance -# d = max_i(abs(x_i), abs(y_i)) -cdef class ChebyshevDistance(DistanceMetric): - """Chebyshev/Infinity Distance - - .. math:: - D(x, y) = max_i (|x_i - y_i|) - """ - def __init__(self): - self.p = INF - - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef DTYPE_t d = 0 - cdef np.intp_t j - for j in range(size): - d = fmax(d, fabs(x1[j] - x2[j])) - return d - - -# ------------------------------------------------------------ -# Minkowski Distance -# d = sum(x_i^p - y_i^p) ^ (1/p) -cdef class MinkowskiDistance(DistanceMetric): - """Minkowski Distance - - .. math:: - D(x, y) = [\sum_i (x_i - y_i)^p] ^ (1/p) - - Minkowski Distance requires p >= 1 and finite. For p = infinity, - use ChebyshevDistance. - Note that for p=1, ManhattanDistance is more efficient, and for - p=2, EuclideanDistance is more efficient. - """ - def __init__(self, p): - if p < 1: - raise ValueError("p must be greater than 1") - elif np.isinf(p): - raise ValueError("MinkowskiDistance requires finite p. " - "For p=inf, use ChebyshevDistance.") - self.p = p - - cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef DTYPE_t d=0 - cdef np.intp_t j - for j in range(size): - d += pow(fabs(x1[j] - x2[j]), self.p) - return d - - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - return pow(self.rdist(x1, x2, size), 1. / self.p) - - cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: - return pow(rdist, 1. / self.p) - - cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: - return pow(dist, self.p) - - def rdist_to_dist(self, rdist): - return rdist ** (1. / self.p) - - def dist_to_rdist(self, dist): - return dist ** self.p - - -# ------------------------------------------------------------ -# W-Minkowski Distance -# d = sum(w_i * (x_i^p - y_i^p)) ^ (1/p) -cdef class WMinkowskiDistance(DistanceMetric): - """Weighted Minkowski Distance - - .. math:: - D(x, y) = [\sum_i w_i (x_i - y_i)^p] ^ (1/p) - - Weighted Minkowski Distance requires p >= 1 and finite. - - Parameters - ---------- - p : int - The order of the norm of the difference :math:`{||u-v||}_p`. - w : (N,) array_like - The weight vector. - - """ - def __init__(self, p, w): - if p < 1: - raise ValueError("p must be greater than 1") - elif np.isinf(p): - raise ValueError("WMinkowskiDistance requires finite p. " - "For p=inf, use ChebyshevDistance.") - self.p = p - self.vec = np.asarray(w, dtype=DTYPE) - self.vec_ptr = get_vec_ptr(self.vec) - self.size = self.vec.shape[0] - - cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - if size != self.size: - with gil: - raise ValueError('WMinkowskiDistance dist: ' - 'size of w does not match') - cdef DTYPE_t d=0 - cdef np.intp_t j - for j in range(size): - d += pow(self.vec_ptr[j] * fabs(x1[j] - x2[j]), self.p) - return d - - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - return pow(self.rdist(x1, x2, size), 1. / self.p) - - cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: - return pow(rdist, 1. 
/ self.p) - - cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: - return pow(dist, self.p) - - def rdist_to_dist(self, rdist): - return rdist ** (1. / self.p) - - def dist_to_rdist(self, dist): - return dist ** self.p - - -# ------------------------------------------------------------ -# Mahalanobis Distance -# d = sqrt( (x - y)^T V^-1 (x - y) ) -cdef class MahalanobisDistance(DistanceMetric): - """Mahalanobis Distance - - .. math:: - D(x, y) = \sqrt{ (x - y)^T V^{-1} (x - y) } - - Parameters - ---------- - V : array_like - Symmetric positive-definite covariance matrix. - The inverse of this matrix will be explicitly computed. - VI : array_like - optionally specify the inverse directly. If VI is passed, - then V is not referenced. - """ - def __init__(self, V=None, VI=None): - if VI is None: - VI = np.linalg.inv(V) - if VI.ndim != 2 or VI.shape[0] != VI.shape[1]: - raise ValueError("V/VI must be square") - - self.mat = np.asarray(VI, dtype=float, order='C') - self.mat_ptr = get_mat_ptr(self.mat) - - self.size = self.mat.shape[0] - - # we need vec as a work buffer - self.vec = np.zeros(self.size, dtype=DTYPE) - self.vec_ptr = get_vec_ptr(self.vec) - - cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - if size != self.size: - with gil: - raise ValueError('Mahalanobis dist: size of V does not match') - - cdef DTYPE_t tmp, d = 0 - cdef np.intp_t i, j - - # compute (x1 - x2).T * VI * (x1 - x2) - for i in range(size): - self.vec_ptr[i] = x1[i] - x2[i] - - for i in range(size): - tmp = 0 - for j in range(size): - tmp += self.mat_ptr[i * size + j] * self.vec_ptr[j] - d += tmp * self.vec_ptr[i] - return d - - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - return sqrt(self.rdist(x1, x2, size)) - - cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: - return sqrt(rdist) - - cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: - return dist * dist - - def rdist_to_dist(self, rdist): - return np.sqrt(rdist) - - def dist_to_rdist(self, dist): - return dist ** 2 - - -# ------------------------------------------------------------ -# Hamming Distance -# d = N_unequal(x, y) / N_tot -cdef class HammingDistance(DistanceMetric): - """Hamming Distance - - Hamming distance is meant for discrete-valued vectors, though it is - a valid metric for real-valued vectors. - - .. math:: - D(x, y) = \frac{1}{N} \sum_i \delta_{x_i, y_i} - """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef int n_unequal = 0 - cdef np.intp_t j - for j in range(size): - if x1[j] != x2[j]: - n_unequal += 1 - return float(n_unequal) / size - - -# ------------------------------------------------------------ -# Canberra Distance -# D(x, y) = sum[ abs(x_i - y_i) / (abs(x_i) + abs(y_i)) ] -cdef class CanberraDistance(DistanceMetric): - """Canberra Distance - - Canberra distance is meant for discrete-valued vectors, though it is - a valid metric for real-valued vectors. - - .. 
math:: - D(x, y) = \sum_i \frac{|x_i - y_i|}{|x_i| + |y_i|} - """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef DTYPE_t denom, d = 0 - cdef np.intp_t j - for j in range(size): - denom = fabs(x1[j]) + fabs(x2[j]) - if denom > 0: - d += fabs(x1[j] - x2[j]) / denom - return d - - -# ------------------------------------------------------------ -# Bray-Curtis Distance -# D(x, y) = sum[abs(x_i - y_i)] / sum[abs(x_i) + abs(y_i)] -cdef class BrayCurtisDistance(DistanceMetric): - """Bray-Curtis Distance - - Bray-Curtis distance is meant for discrete-valued vectors, though it is - a valid metric for real-valued vectors. - - .. math:: - D(x, y) = \frac{\sum_i |x_i - y_i|}{\sum_i(|x_i| + |y_i|)} - """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef DTYPE_t num = 0, denom = 0 - cdef np.intp_t j - for j in range(size): - num += fabs(x1[j] - x2[j]) - denom += fabs(x1[j]) + fabs(x2[j]) - if denom > 0: - return num / denom - else: - return 0.0 - - -# ------------------------------------------------------------ -# Jaccard Distance (boolean) -# D(x, y) = N_unequal(x, y) / N_nonzero(x, y) -cdef class JaccardDistance(DistanceMetric): - """Jaccard Distance - - Jaccard Distance is a dissimilarity measure for boolean-valued - vectors. All nonzero entries will be treated as True, zero entries will - be treated as False. - - .. math:: - D(x, y) = \frac{N_{TF} + N_{FT}}{N_{TT} + N_{TF} + N_{FT}} - """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef int tf1, tf2, n_eq = 0, nnz = 0 - cdef np.intp_t j - for j in range(size): - tf1 = x1[j] != 0 - tf2 = x2[j] != 0 - nnz += (tf1 or tf2) - n_eq += (tf1 and tf2) - if nnz == 0: - return 0.0 - return (nnz - n_eq) * 1.0 / nnz - - -# ------------------------------------------------------------ -# Matching Distance (boolean) -# D(x, y) = n_neq / n -cdef class MatchingDistance(DistanceMetric): - """Matching Distance - - Matching Distance is a dissimilarity measure for boolean-valued - vectors. All nonzero entries will be treated as True, zero entries will - be treated as False. - - .. math:: - D(x, y) = \frac{N_{TF} + N_{FT}}{N} - """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef int tf1, tf2, n_neq = 0 - cdef np.intp_t j - for j in range(size): - tf1 = x1[j] != 0 - tf2 = x2[j] != 0 - n_neq += (tf1 != tf2) - return n_neq * 1. / size - - -# ------------------------------------------------------------ -# Dice Distance (boolean) -# D(x, y) = n_neq / (2 * ntt + n_neq) -cdef class DiceDistance(DistanceMetric): - """Dice Distance - - Dice Distance is a dissimilarity measure for boolean-valued - vectors. All nonzero entries will be treated as True, zero entries will - be treated as False. - - .. math:: - D(x, y) = \frac{N_{TF} + N_{FT}}{2 * N_{TT} + N_{TF} + N_{FT}} - """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef int tf1, tf2, n_neq = 0, ntt = 0 - cdef np.intp_t j - for j in range(size): - tf1 = x1[j] != 0 - tf2 = x2[j] != 0 - ntt += (tf1 and tf2) - n_neq += (tf1 != tf2) - return n_neq / (2.0 * ntt + n_neq) - - -# ------------------------------------------------------------ -# Kulsinski Distance (boolean) -# D(x, y) = (ntf + nft - ntt + n) / (n_neq + n) -cdef class KulsinskiDistance(DistanceMetric): - """Kulsinski Distance - - Kulsinski Distance is a dissimilarity measure for boolean-valued - vectors. 
All nonzero entries will be treated as True, zero entries will - be treated as False. - - .. math:: - D(x, y) = 1 - \frac{N_{TT}}{N + N_{TF} + N_{FT}} - """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef int tf1, tf2, ntt = 0, n_neq = 0 - cdef np.intp_t j - for j in range(size): - tf1 = x1[j] != 0 - tf2 = x2[j] != 0 - n_neq += (tf1 != tf2) - ntt += (tf1 and tf2) - return (n_neq - ntt + size) * 1.0 / (n_neq + size) - - -# ------------------------------------------------------------ -# Rogers-Tanimoto Distance (boolean) -# D(x, y) = 2 * n_neq / (n + n_neq) -cdef class RogersTanimotoDistance(DistanceMetric): - """Rogers-Tanimoto Distance - - Rogers-Tanimoto Distance is a dissimilarity measure for boolean-valued - vectors. All nonzero entries will be treated as True, zero entries will - be treated as False. - - .. math:: - D(x, y) = \frac{2 (N_{TF} + N_{FT})}{N + N_{TF} + N_{FT}} - """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef int tf1, tf2, n_neq = 0 - cdef np.intp_t j - for j in range(size): - tf1 = x1[j] != 0 - tf2 = x2[j] != 0 - n_neq += (tf1 != tf2) - return (2.0 * n_neq) / (size + n_neq) - - -# ------------------------------------------------------------ -# Russell-Rao Distance (boolean) -# D(x, y) = (n - ntt) / n -cdef class RussellRaoDistance(DistanceMetric): - """Russell-Rao Distance - - Russell-Rao Distance is a dissimilarity measure for boolean-valued - vectors. All nonzero entries will be treated as True, zero entries will - be treated as False. - - .. math:: - D(x, y) = \frac{N - N_{TT}}{N} - """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef int tf1, tf2, ntt = 0 - cdef np.intp_t j - for j in range(size): - tf1 = x1[j] != 0 - tf2 = x2[j] != 0 - ntt += (tf1 and tf2) - return (size - ntt) * 1. / size - - -# ------------------------------------------------------------ -# Sokal-Michener Distance (boolean) -# D(x, y) = 2 * n_neq / (n + n_neq) -cdef class SokalMichenerDistance(DistanceMetric): - """Sokal-Michener Distance - - Sokal-Michener Distance is a dissimilarity measure for boolean-valued - vectors. All nonzero entries will be treated as True, zero entries will - be treated as False. - - .. math:: - D(x, y) = \frac{2 (N_{TF} + N_{FT})}{N + N_{TF} + N_{FT}} - """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef int tf1, tf2, n_neq = 0 - cdef np.intp_t j - for j in range(size): - tf1 = x1[j] != 0 - tf2 = x2[j] != 0 - n_neq += (tf1 != tf2) - return (2.0 * n_neq) / (size + n_neq) - - -# ------------------------------------------------------------ -# Sokal-Sneath Distance (boolean) -# D(x, y) = n_neq / (0.5 * n_tt + n_neq) -cdef class SokalSneathDistance(DistanceMetric): - """Sokal-Sneath Distance - - Sokal-Sneath Distance is a dissimilarity measure for boolean-valued - vectors. All nonzero entries will be treated as True, zero entries will - be treated as False. - - .. 
math:: - D(x, y) = \frac{N_{TF} + N_{FT}}{N_{TT} / 2 + N_{TF} + N_{FT}} - """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef int tf1, tf2, ntt = 0, n_neq = 0 - cdef np.intp_t j - for j in range(size): - tf1 = x1[j] != 0 - tf2 = x2[j] != 0 - n_neq += (tf1 != tf2) - ntt += (tf1 and tf2) - return n_neq / (0.5 * ntt + n_neq) - - -# ------------------------------------------------------------ -# Haversine Distance (2 dimensional) -# D(x, y) = 2 arcsin{sqrt[sin^2 ((x1 - y1) / 2) -# + cos(x1) cos(y1) sin^2 ((x2 - y2) / 2)]} -cdef class HaversineDistance(DistanceMetric): - """Haversine (Spherical) Distance - - The Haversine distance is the angular distance between two points on - the surface of a sphere. The first distance of each point is assumed - to be the latitude, the second is the longitude, given in radians. - The dimension of the points must be 2: - - .. math:: - D(x, y) = 2\arcsin[\sqrt{\sin^2((x1 - y1) / 2) - + cos(x1)cos(y1)sin^2((x2 - y2) / 2)}] - """ - cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - if size != 2: - with gil: - raise ValueError("Haversine distance only valid " - "in 2 dimensions") - cdef DTYPE_t sin_0 = sin(0.5 * (x1[0] - x2[0])) - cdef DTYPE_t sin_1 = sin(0.5 * (x1[1] - x2[1])) - return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1) - - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - if size != 2: - with gil: - raise ValueError("Haversine distance only valid in" - " 2 dimensions") - cdef DTYPE_t sin_0 = sin(0.5 * (x1[0] - x2[0])) - cdef DTYPE_t sin_1 = sin(0.5 * (x1[1] - x2[1])) - return 2 * asin(sqrt(sin_0 * sin_0 + - cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1)) - - cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: - return 2 * asin(sqrt(rdist)) - - cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: - cdef DTYPE_t tmp = sin(0.5 * dist) - return tmp * tmp - - def rdist_to_dist(self, rdist): - return 2 * np.arcsin(np.sqrt(rdist)) - - def dist_to_rdist(self, dist): - tmp = np.sin(0.5 * dist) - return tmp * tmp - - -# ------------------------------------------------------------ -# Yule Distance (boolean) -# D(x, y) = 2 * ntf * nft / (ntt * nff + ntf * nft) -# [This is not a true metric, so we will leave it out.] -# -# cdef class YuleDistance(DistanceMetric): -# cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, ITYPE_t size): -# cdef int tf1, tf2, ntf = 0, nft = 0, ntt = 0, nff = 0 -# cdef np.intp_t j -# for j in range(size): -# tf1 = x1[j] != 0 -# tf2 = x2[j] != 0 -# ntt += tf1 and tf2 -# ntf += tf1 and (tf2 == 0) -# nft += (tf1 == 0) and tf2 -# nff = size - ntt - ntf - nft -# return (2.0 * ntf * nft) / (ntt * nff + ntf * nft) - - -# ------------------------------------------------------------ -# Cosine Distance -# D(x, y) = dot(x, y) / (|x| * |y|) -# [This is not a true metric, so we will leave it out. 
Use the `arccos` -# distance instead] - -# cdef class CosineDistance(DistanceMetric): -# cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, -# ITYPE_t size) nogil except -1: -# cdef DTYPE_t d = 0, norm1 = 0, norm2 = 0 -# cdef np.intp_t j -# for j in range(size): -# d += x1[j] * x2[j] -# norm1 += x1[j] * x1[j] -# norm2 += x2[j] * x2[j] -# return 1.0 - d / sqrt(norm1 * norm2) - -# ------------------------------------------------------------ -# Arccos Distance -# D(x, y) = arccos(dot(x, y) / (|x| * |y|)) / PI - -cdef class ArccosDistance(DistanceMetric): - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - cdef DTYPE_t d = 0, norm1 = 0, norm2 = 0 - cdef np.intp_t j - for j in range(size): - d += x1[j] * x2[j] - norm1 += x1[j] * x1[j] - norm2 += x2[j] * x2[j] - return acos(d / sqrt(norm1 * norm2)) / M_PI - - -# ------------------------------------------------------------ -# Correlation Distance -# D(x, y) = dot((x - mx), (y - my)) / (|x - mx| * |y - my|) -# [This is not a true metric, so we will leave it out.] -# -# cdef class CorrelationDistance(DistanceMetric): -# cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, ITYPE_t size): -# cdef DTYPE_t mu1 = 0, mu2 = 0, x1nrm = 0, x2nrm = 0, x1Tx2 = 0 -# cdef DTYPE_t tmp1, tmp2 -# -# cdef np.intp_t i -# for i in range(size): -# mu1 += x1[i] -# mu2 += x2[i] -# mu1 /= size -# mu2 /= size -# -# for i in range(size): -# tmp1 = x1[i] - mu1 -# tmp2 = x2[i] - mu2 -# x1nrm += tmp1 * tmp1 -# x2nrm += tmp2 * tmp2 -# x1Tx2 += tmp1 * tmp2 -# -# return (1. - x1Tx2) / sqrt(x1nrm * x2nrm) - - -# ------------------------------------------------------------ -# User-defined distance -# -cdef class PyFuncDistance(DistanceMetric): - """PyFunc Distance - A user-defined distance - Parameters - ---------- - func : function - func should take two numpy arrays as input, and return a distance. - """ - def __init__(self, func, **kwargs): - self.func = func - self.kwargs = kwargs - - # in cython < 0.26, GIL was required to be acquired during definition of - # the function and inside the body of the function. This behaviour is not - # allowed in cython >= 0.26 since it is a redundant GIL acquisition. The - # only way to be back compatible is to inherit `dist` from the base class - # without GIL and called an inline `_dist` which acquire GIL. - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) nogil except -1: - return self._dist(x1, x2, size) - - cdef inline DTYPE_t _dist(self, DTYPE_t* x1, DTYPE_t* x2, - ITYPE_t size) except -1 with gil: - cdef np.ndarray x1arr - cdef np.ndarray x2arr - x1arr = _buffer_to_ndarray(x1, size) - x2arr = _buffer_to_ndarray(x2, size) - d = self.func(x1arr, x2arr, **self.kwargs) - try: - # Cython generates code here that results in a TypeError - # if d is the wrong type. 
- return d - except TypeError: - raise TypeError("Custom distance function must accept two " - "vectors and return a float.") - - -cdef inline double fmax(double a, double b) nogil: - return max(a, b) diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index fb12cc6eb6ca3..e6a7268f65b8c 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -33,7 +33,7 @@ from ._hdbscan_reachability import mutual_reachability, sparse_mutual_reachability from ._hdbscan_boruvka import KDTreeBoruvkaAlgorithm, BallTreeBoruvkaAlgorithm -from .dist_metrics import DistanceMetric +from sklearn.metrics._dist_metrics import DistanceMetric FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics + ["cosine", "arccos"] diff --git a/sklearn/cluster/setup.py b/sklearn/cluster/setup.py index d8658c0458532..a073948a1bc70 100644 --- a/sklearn/cluster/setup.py +++ b/sklearn/cluster/setup.py @@ -91,13 +91,6 @@ def configuration(parent_package="", top_path=None): include_dirs=[numpy.get_include()], libraries=libraries, ) - config.add_extension( - "_hdbscan.dist_metrics", - sources=["_hdbscan/dist_metrics.pyx"], - include_dirs=[numpy.get_include()], - libraries=libraries, - ) - return config From 30b652ac1b0dbbef07cdec130df59852003d9621 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 13 Mar 2022 21:36:51 -0400 Subject: [PATCH 029/160] Removed unnecessary arg --- sklearn/cluster/_hdbscan/hdbscan_.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index e6a7268f65b8c..808c443f379cc 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -45,7 +45,6 @@ def _tree_to_labels( - X, single_linkage_tree, min_cluster_size=10, cluster_selection_method="eom", @@ -884,7 +883,6 @@ def hdbscan( ) return _tree_to_labels( - X, single_linkage_tree, min_cluster_size, cluster_selection_method, From 42b26e1346b12543a182fd3a8b06f71dafc263f9 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 16 Mar 2022 11:45:19 -0400 Subject: [PATCH 030/160] Removed vestigial `robust_single_linkage` functionality --- sklearn/cluster/__init__.py | 4 - .../_hdbscan/_robust_single_linkage_.py | 282 ------------------ .../cluster/_hdbscan/tests/test_hdbscan.py | 7 +- sklearn/cluster/_hdbscan/tests/test_rsl.py | 118 -------- 4 files changed, 2 insertions(+), 409 deletions(-) delete mode 100644 sklearn/cluster/_hdbscan/_robust_single_linkage_.py delete mode 100644 sklearn/cluster/_hdbscan/tests/test_rsl.py diff --git a/sklearn/cluster/__init__.py b/sklearn/cluster/__init__.py index b7d25cab68720..0805f6e312602 100644 --- a/sklearn/cluster/__init__.py +++ b/sklearn/cluster/__init__.py @@ -23,8 +23,6 @@ from ._bicluster import SpectralBiclustering, SpectralCoclustering from ._birch import Birch from ._hdbscan.hdbscan_ import HDBSCAN, hdbscan -from ._hdbscan._robust_single_linkage_ import robust_single_linkage -from ._hdbscan._validity import validity_index __all__ = [ "AffinityPropagation", @@ -54,6 +52,4 @@ "SpectralCoclustering", "HDBSCAN", "hdbscan", - "robust_single_linkage", - "validity_index", ] diff --git a/sklearn/cluster/_hdbscan/_robust_single_linkage_.py b/sklearn/cluster/_hdbscan/_robust_single_linkage_.py deleted file mode 100644 index 8539f867057ce..0000000000000 --- a/sklearn/cluster/_hdbscan/_robust_single_linkage_.py +++ /dev/null @@ -1,282 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Robust Single Linkage: Density based single linkage clustering. 
-""" -import numpy as np - -from sklearn.metrics import pairwise_distances -from scipy.sparse import issparse - -from joblib import Memory, cpu_count -from sklearn.utils import check_array - -from ._hdbscan_linkage import mst_linkage_core, mst_linkage_core_vector, label -from ._hdbscan_boruvka import KDTreeBoruvkaAlgorithm, BallTreeBoruvkaAlgorithm -from sklearn.metrics._dist_metrics import DistanceMetric -from ._hdbscan_reachability import mutual_reachability -from ._hdbscan_tree import labelling_at_cut -from sklearn.neighbors import KDTree, BallTree - -# Author: Leland McInnes -# -# License: BSD 3 clause - -FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics - - -def _rsl_generic(X, k=5, metric="euclidean", **kwargs): - distance_matrix = pairwise_distances(X, metric=metric, **kwargs) - - mutual_reachability_ = mutual_reachability(distance_matrix, k) - - min_spanning_tree = mst_linkage_core(mutual_reachability_) - min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :] - - return label(min_spanning_tree) - - -def _rsl_prims_kdtree(X, k=5, alpha=np.sqrt(2), metric="euclidean", **kwargs): - - # The Cython routines used require contiguous arrays - if not X.flags["C_CONTIGUOUS"]: - X = np.array(X, dtype=np.double, order="C") - - dim = X.shape[0] - k = min(dim - 1, k) - - tree = KDTree(X, metric=metric, **kwargs) - - dist_metric = DistanceMetric.get_metric(metric, **kwargs) - - core_distances = tree.query(X, k=k)[0][:, -1].copy(order="C") - min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha) - - return label(min_spanning_tree) - - -def _rsl_prims_balltree(X, k=5, alpha=np.sqrt(2), metric="euclidean", **kwargs): - - # The Cython routines used require contiguous arrays - if not X.flags["C_CONTIGUOUS"]: - X = np.array(X, dtype=np.double, order="C") - - dim = X.shape[0] - k = min(dim - 1, k) - - tree = BallTree(X, metric=metric, **kwargs) - - dist_metric = DistanceMetric.get_metric(metric, **kwargs) - - core_distances = tree.query(X, k=k)[0][:, -1].copy(order="C") - min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha) - - return label(min_spanning_tree) - - -def _rsl_boruvka_kdtree( - X, k=5, alpha=1.0, metric="euclidean", leaf_size=40, core_dist_n_jobs=4, **kwargs -): - - if core_dist_n_jobs < 1: - core_dist_n_jobs = max(cpu_count() + 1 + core_dist_n_jobs, 1) - - dim = X.shape[0] - min_samples = min(dim - 1, k) - - tree = KDTree(X, metric=metric, leaf_size=leaf_size, **kwargs) - alg = KDTreeBoruvkaAlgorithm( - tree, min_samples, metric=metric, alpha=alpha, leaf_size=leaf_size, **kwargs - ) - min_spanning_tree = alg.spanning_tree() - - return label(min_spanning_tree) - - -def _rsl_boruvka_balltree( - X, k=5, alpha=1.0, metric="euclidean", leaf_size=40, core_dist_n_jobs=4, **kwargs -): - - if core_dist_n_jobs < 1: - core_dist_n_jobs = max(cpu_count() + 1 + core_dist_n_jobs, 1) - - dim = X.shape[0] - min_samples = min(dim - 1, k) - - tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs) - alg = BallTreeBoruvkaAlgorithm( - tree, min_samples, metric=metric, alpha=alpha, leaf_size=leaf_size, **kwargs - ) - min_spanning_tree = alg.spanning_tree() - - return label(min_spanning_tree) - - -def robust_single_linkage( - X, - cut, - k=5, - alpha=np.sqrt(2), - gamma=5, - metric="euclidean", - algorithm="best", - memory=None, - leaf_size=40, - core_dist_n_jobs=4, - metric_params=None, -): - """ - Perform robust single linkage clustering. 
- - Parameters - ---------- - X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ - array of shape (n_samples, n_samples) - A feature array, or array of distances between samples if - ``metric='precomputed'``. - - cut : float - The reachability distance value to cut the cluster heirarchy at - to derive a flat cluster labelling. - - k : int, default=5 - Reachability distances will be computed with regard to the `k` - nearest neighbors. - - alpha : float, default=np.sqrt(2) - Distance scaling for reachability distance computation. Reachability - distance is computed as - - .. math:: - - \\max (core_k(a), core_k(b), 1/\\alpha d(a,b)). - - gamma : int, default=5 - Ignore any clusters in the flat clustering with size less than gamma, - and declare points in such clusters as noise points. - - metric : str or callable, default='euclidean' - The metric to use when calculating distance between instances in a - feature array. If metric is a string or callable, it must be one of - the options allowed by `metrics.pairwise.pairwise_distances` for its - metric parameter. - If `metric="precomputed"`, X is assumed to be a distance matrix and - must be square. - - algorithm : str, default='best' - Exactly which algorithm to use; hdbscan has variants specialised - for different characteristics of the data. By default this is set - to ``best`` which chooses the "best" algorithm given the nature of - the data. You can force other options if you believe you know - better. Options are: - * ``generic`` - * ``best`` - * ``prims_kdtree`` - * ``prims_balltree`` - * ``boruvka_kdtree`` - * ``boruvka_balltree`` - - memory : str, default=None - Used to cache the output of the computation of the tree. - By default, no caching is done. If a string is given, it is the - path to the caching directory. - - leaf_size : int, default=40 - Leaf size for trees responsible for fast nearest - neighbour queries. - - core_dist_n_jobs : int, default=4 - Number of parallel jobs to run in core distance computations (if - supported by the specific algorithm). For ``core_dist_n_jobs`` - below -1, (n_cpus + 1 + core_dist_n_jobs) are used. - - metric_params : dict, default=None - Arguments passed to the distance metric. - - Returns - ------- - labels : ndarray, shape (n_samples, ) - Cluster labels for each point. Noisy samples are given the label -1. - - single_linkage_tree : ndarray, shape (n_samples - 1, 4) - The single linkage tree produced during clustering in scipy - hierarchical clustering format - (see http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html). - - References - ---------- - .. [1] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the - cluster tree. In Advances in Neural Information Processing Systems - (pp. 343-351). 
- """ - - if not isinstance(k, int) or k < 1: - raise ValueError("k must be an integer greater than zero!") - - if not isinstance(alpha, float) or alpha < 1.0: - raise ValueError("alpha must be a float greater than or equal to 1.0!") - - if not isinstance(gamma, int) or gamma < 1: - raise ValueError("gamma must be an integer greater than zero!") - - if not isinstance(leaf_size, int) or leaf_size < 1: - raise ValueError("Leaf size must be at least one!") - - X = check_array(X, accept_sparse="csr") - memory = Memory(cachedir=memory, verbose=0) - metric_params = metric_params or {} - - if algorithm != "best": - if algorithm == "generic": - single_linkage_tree = memory.cache(_rsl_generic)( - X, k, metric, **metric_params - ) - elif algorithm == "prims_kdtree": - single_linkage_tree = memory.cache(_rsl_prims_kdtree)( - X, k, alpha, metric, **metric_params - ) - elif algorithm == "prims_balltree": - single_linkage_tree = memory.cache(_rsl_prims_balltree)( - X, k, alpha, metric, **metric_params - ) - elif algorithm == "boruvka_kdtree": - single_linkage_tree = memory.cache(_rsl_boruvka_kdtree)( - X, k, alpha, metric, leaf_size, core_dist_n_jobs, **metric_params - ) - elif algorithm == "boruvka_balltree": - single_linkage_tree = memory.cache(_rsl_boruvka_balltree)( - X, k, alpha, metric, leaf_size, core_dist_n_jobs, **metric_params - ) - else: - raise TypeError("Unknown algorithm type %s specified" % algorithm) - else: - if issparse(X) or metric not in FAST_METRICS: - # We can't do much with sparse matrices ... - single_linkage_tree = memory.cache(_rsl_generic)( - X, k, metric, **metric_params - ) - elif metric in KDTree.valid_metrics: - # Need heuristic to decide when to go to boruvka; - # still debugging for now - if X.shape[1] > 128: - single_linkage_tree = memory.cache(_rsl_prims_kdtree)( - X, k, alpha, metric, **metric_params - ) - else: - single_linkage_tree = memory.cache(_rsl_boruvka_kdtree)( - X, k, alpha, metric, leaf_size, core_dist_n_jobs, **metric_params - ) - else: # Metric is a valid BallTree metric - # Need heuristic to decide when to go to boruvka; - # still debugging for now - if X.shape[1] > 128: - single_linkage_tree = memory.cache(_rsl_prims_kdtree)( - X, k, alpha, metric, **metric_params - ) - else: - single_linkage_tree = memory.cache(_rsl_boruvka_balltree)( - X, k, alpha, metric, leaf_size, core_dist_n_jobs, **metric_params - ) - - labels = labelling_at_cut(single_linkage_tree, cut, gamma) - - return labels diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index c92b96c58d5e3..0d2853037e068 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -10,11 +10,8 @@ assert_array_almost_equal, assert_raises, ) -from sklearn.cluster import ( - HDBSCAN, - hdbscan, - validity_index, -) +from sklearn.cluster import HDBSCAN, hdbscan +from sklearn.cluster._hdbscan._validity import validity_index # from sklearn.cluster.tests.common import generate_clustered_data from sklearn.datasets import make_blobs diff --git a/sklearn/cluster/_hdbscan/tests/test_rsl.py b/sklearn/cluster/_hdbscan/tests/test_rsl.py deleted file mode 100644 index 173ab10c6cef9..0000000000000 --- a/sklearn/cluster/_hdbscan/tests/test_rsl.py +++ /dev/null @@ -1,118 +0,0 @@ -""" -Tests for Robust Single Linkage clustering algorithm -""" -# import pickle -import numpy as np -from scipy.spatial import distance -from sklearn.utils._testing import assert_raises -from sklearn.cluster import 
robust_single_linkage - -from sklearn.datasets import make_blobs -from sklearn.utils import shuffle -from sklearn.preprocessing import StandardScaler - - -n_clusters = 3 -X, y = make_blobs(n_samples=50, random_state=1) -X, y = shuffle(X, y, random_state=7) -X = StandardScaler().fit_transform(X) -# X = generate_clustered_data(n_clusters=n_clusters, n_samples_per_cluster=50) - - -def test_rsl_distance_matrix(): - D = distance.squareform(distance.pdist(X)) - D /= np.max(D) - - labels = robust_single_linkage(D, 0.4, metric="precomputed") - # number of clusters, ignoring noise if present - n_clusters_1 = len(set(labels)) - int(-1 in labels) # ignore noise - assert n_clusters_1 == 2 - - -def test_rsl_feature_vector(): - labels = robust_single_linkage(X, 0.4) - n_clusters_1 = len(set(labels)) - int(-1 in labels) - assert n_clusters_1 == n_clusters - - -def test_rsl_callable_metric(): - # metric is the function reference, not the string key. - metric = distance.euclidean - - labels = robust_single_linkage(X, 0.4, metric=metric) - n_clusters_1 = len(set(labels)) - int(-1 in labels) - assert n_clusters_1 == n_clusters - - -def test_rsl_boruvka_balltree(): - labels = robust_single_linkage(X, 0.45, algorithm="boruvka_balltree") - n_clusters_1 = len(set(labels)) - int(-1 in labels) - assert n_clusters_1 == n_clusters - - -def test_rsl_prims_balltree(): - labels = robust_single_linkage(X, 0.4, algorithm="prims_balltree") - n_clusters_1 = len(set(labels)) - int(-1 in labels) - assert n_clusters_1 == n_clusters - - -def test_rsl_prims_kdtree(): - labels = robust_single_linkage(X, 0.4, algorithm="prims_kdtree") - n_clusters_1 = len(set(labels)) - int(-1 in labels) - assert n_clusters_1 == n_clusters - - -def test_rsl_high_dimensional(): - H, y = make_blobs(n_samples=50, random_state=0, n_features=64) - # H, y = shuffle(X, y, random_state=7) - H = StandardScaler().fit_transform(H) - labels = robust_single_linkage(H, 5.5) - n_clusters_1 = len(set(labels)) - int(-1 in labels) - assert n_clusters_1 == n_clusters - - -def test_rsl_badargs(): - assert_raises(ValueError, robust_single_linkage, "fail", 0.4) - assert_raises(ValueError, robust_single_linkage, None, 0.4) - assert_raises(ValueError, robust_single_linkage, X, 0.4, k="fail") - assert_raises(ValueError, robust_single_linkage, X, 0.4, k=-1) - assert_raises(ValueError, robust_single_linkage, X, 0.4, metric="imperial") - assert_raises(ValueError, robust_single_linkage, X, 0.4, metric=None) - assert_raises( - ValueError, - robust_single_linkage, - X, - 0.4, - metric="precomputed", - algorithm="boruvka_kdtree", - ) - assert_raises( - ValueError, - robust_single_linkage, - X, - 0.4, - metric="precomputed", - algorithm="prims_kdtree", - ) - assert_raises( - ValueError, - robust_single_linkage, - X, - 0.4, - metric="precomputed", - algorithm="prims_balltree", - ) - assert_raises( - ValueError, - robust_single_linkage, - X, - 0.4, - metric="precomputed", - algorithm="boruvka_balltree", - ) - assert_raises(ValueError, robust_single_linkage, X, 0.4, alpha=-1) - assert_raises(ValueError, robust_single_linkage, X, 0.4, alpha="fail") - assert_raises(Exception, robust_single_linkage, X, 0.4, algorithm="something_else") - assert_raises(TypeError, robust_single_linkage, X, 0.4, metric="minkowski", p=None) - assert_raises(ValueError, robust_single_linkage, X, 0.4, leaf_size=0) - assert_raises(ValueError, robust_single_linkage, X, 0.4, gamma=0) From e46a41813574d3e4e00d55b35d852d48d6b2408c Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 16 Mar 2022 11:51:23 -0400 
Subject: [PATCH 031/160] Removed cython flags --- sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx | 4 ---- sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx | 2 -- sklearn/cluster/_hdbscan/_hdbscan_reachability.pyx | 3 --- sklearn/cluster/_hdbscan/_hdbscan_tree.pyx | 3 --- sklearn/cluster/_hdbscan/_prediction_utils.pyx | 1 - 5 files changed, 13 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx b/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx index c82f5f4eccad7..1aa29db3509d4 100644 --- a/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx +++ b/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx @@ -1,7 +1,3 @@ -# cython: boundscheck=False -# cython: nonecheck=False -# cython: wraparound=False -# cython: initializedcheck=False # Minimum spanning tree single linkage implementation for hdbscan # Authors: Leland McInnes # License: 3-clause BSD diff --git a/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx b/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx index 82c7bcebef6b3..5fec8727f4b69 100644 --- a/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx @@ -1,5 +1,3 @@ -# cython: boundscheck=False -# cython: nonecheck=False # Minimum spanning tree single linkage implementation for hdbscan # Authors: Leland McInnes, Steve Astels # License: 3-clause BSD diff --git a/sklearn/cluster/_hdbscan/_hdbscan_reachability.pyx b/sklearn/cluster/_hdbscan/_hdbscan_reachability.pyx index 2863dc8af4dca..e988a4155e9f6 100644 --- a/sklearn/cluster/_hdbscan/_hdbscan_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_hdbscan_reachability.pyx @@ -1,6 +1,3 @@ -# cython: boundscheck=False -# cython: nonecheck=False -# cython: initializedcheck=False # mutual reachability distance compiutations # Authors: Leland McInnes # License: 3-clause BSD diff --git a/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx b/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx index bf2e7014d6026..ef51db0635473 100644 --- a/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx +++ b/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx @@ -1,6 +1,3 @@ -# cython: boundscheck=False -# cython: nonecheck=False -# cython: initializedcheck=False # Tree handling (condensing, finding stable clusters) for hdbscan # Authors: Leland McInnes # License: 3-clause BSD diff --git a/sklearn/cluster/_hdbscan/_prediction_utils.pyx b/sklearn/cluster/_hdbscan/_prediction_utils.pyx index c1bfb48633595..bced8713ca128 100644 --- a/sklearn/cluster/_hdbscan/_prediction_utils.pyx +++ b/sklearn/cluster/_hdbscan/_prediction_utils.pyx @@ -1,4 +1,3 @@ -#cython: boundscheck=False, nonecheck=False, initializedcheck=False # Utility routines in cython for prediction in hdbscan # Authors: Leland McInnes # License: 3-clause BSD From 5ae1d038fa616899bf993a8379c7967acb7f56fe Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Sat, 19 Mar 2022 18:24:16 -0400 Subject: [PATCH 032/160] Initial addition of HDBSCAN User Guide [doc quick] --- doc/modules/clustering.rst | 70 +++++++++++++++++++++++++++++++++++--- 1 file changed, 65 insertions(+), 5 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 3f1c059e3a7f6..f165899bfed8a 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -882,13 +882,73 @@ by black points below. .. _hdbscan: -HDBSCAN [WIP] -============= +HDBSCAN +======= + +The :class:`HDBSCAN` algorithm can be seen as an extension of :class:`DBSCAN` +and :class:`OPTICS`. Specifically, DBSCAN asserts that the clustering criterion +(i.e. density requirement) is *globally homogeneous*. 
That is to say that if
+there are clusters of differing density then DBSCAN may struggle to successfully
+capture them. HDBSCAN alleviates this assumption and explores all possible
+density scales by building an alternative representation of the clustering
+problem.
+
+HDBSCAN first defines the *core distance* of a sample :math:`x_p` as the
+distance to its `min_samples`-nearest neighbor, counting itself. For example,
+if `min_samples=5` and :math:`x_*` is the 5th-nearest neighbor of :math:`x_p`
+then the core distance is:
+
+.. math:: d_c(x_p)=d(x_p, x_*).
 
-The :class:`HDBSCAN` algorithm views clusters as areas of high density
-separated by areas of low density, similarly to :class:`DBSCAN`. However
+Next it defines the *mutual reachability distance* of two points :math:`x_p, x_q`
+as:
+
+.. math:: d_m(x_p, x_q) = \max\{d_c(x_p), d_c(x_q), d(x_p, x_q)\}
+
+These two notions allow us to construct the *mutual reachability graph*
+:math:`G_{ms}` defined for a fixed choice of `min_samples` by associating each
+sample :math:`x_p` with a vertex of the graph, and weighting the edge between
+points :math:`x_p, x_q` by the mutual reachability distance :math:`d_m(x_p, x_q)`
+between them. We may build subsets of this graph, labeled
+:math:`G_{ms,\epsilon}`, defined as the original graph after removing any edges
+with value greater than `eps`. Any points whose core distance is greater than `eps`
+are at this stage marked as noise. The remaining points are then clustered by
+finding the connected components of this trimmed graph.
+
+
+.. note::
+
+    The clustering generated by taking the connected components of a trimmed
+    graph :math:`G_{ms,\epsilon}` equivalent to running DBSCAN* with `min_samples`
+    and `eps`. DBSCAN* is a slightly modified version of DBSCAN mentioned in
+
+
+An outline of the HDBSCAN algorithm using the mutual reachability graph is as
+follows:
+
+  1. Extract the minimum spanning tree (MST) of :math:`G_{ms}`
+  2. Extend the MST by adding a "self edge" for each vertex, with weight equal
+     to the core distance of the underlying sample.
+  3. Initialize a single cluster and label for the MST.
+  4. Remove the edge with the greatest weight from the MST (ties are
+     removed simultaneously).
+  5. Assign cluster labels to the connected components which contain the
+     end points of the now-removed edge. If the component does not have at least
+     one edge it is instead assigned a "null" label marking it as noise.
+  6. Repeat steps 4-5 until there are no more connected components.
+
+HDBSCAN is therefore able to obtain all possible partitions obtainable by
+DBSCAN* for a fixed choice of `min_samples` in a hierarchical fashion.
+Indeed, this allows HDBSCAN to perform clustering across multiple densities
+and as such it no longer needs `eps` to be given as a hyperparameter. Instead
+it relies solely on the choice of `min_samples`, which tends to be a more robust
+hyperparameter.
+
+HDBSCAN can be smoothed with an additional hyperparameter `minimum_cluster_size`
+which specifies that during the hierarchical clustering, components with fewer
+than `minimum_cluster_size` many samples are considered noise. In practice, one
+can set `minimum_cluster_size = min_samples` to couple the parameters and
+simplify the hyperparameter space.
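As a purely illustrative aside (this sketch is editorial and not part of the patch;
the variable names are placeholders), the two definitions above can be computed
directly with scikit-learn's public `NearestNeighbors` and `pairwise_distances`
utilities::

    import numpy as np
    from sklearn.datasets import make_blobs
    from sklearn.metrics import pairwise_distances
    from sklearn.neighbors import NearestNeighbors

    X, _ = make_blobs(n_samples=100, centers=3, random_state=0)
    min_samples = 5

    # Core distance: distance to the min_samples-th nearest neighbor,
    # counting the point itself (each point is its own nearest neighbor here).
    neighbors = NearestNeighbors(n_neighbors=min_samples).fit(X)
    knn_distances, _ = neighbors.kneighbors(X)
    core_distances = knn_distances[:, -1]

    # Mutual reachability distance: elementwise max of the two core distances
    # and the ordinary pairwise distance.
    distances = pairwise_distances(X)
    mutual_reachability = np.maximum(
        distances,
        np.maximum(core_distances[:, None], core_distances[None, :]),
    )

The resulting matrix is the edge-weight matrix of the mutual reachability graph
:math:`G_{ms}` described above.

 .. _optics:
 
 OPTICS
 ======
 
@@ -962,7 +1022,7 @@ represented as children of a larger parent cluster.
 Different distance metrics can be supplied via the ``metric`` keyword.
 
 For large datasets, similar (but not identical) results can be obtained via
- `HDBSCAN `_.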
The HDBSCAN implementation is + :class:`HDBSCAN`. The HDBSCAN implementation is multithreaded, and has better algorithmic runtime complexity than OPTICS, at the cost of worse memory scaling. For extremely large datasets that exhaust system memory using HDBSCAN, OPTICS will maintain :math:`n` (as opposed From 30f38eadb99041319a26c5744c313bdba09cdcae Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Sat, 19 Mar 2022 19:39:32 -0400 Subject: [PATCH 033/160] Add reference for HDBSCAN User Guide entry --- doc/modules/clustering.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index f165899bfed8a..7a3e66b07c907 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -950,6 +950,14 @@ than `minimum_cluster_size` many samples are considered noise. In practice, one can set `minimum_cluster_size = min_samples` to couple the parameters and simplify the hyperparameter space. +.. topic:: References: + + * Campello, R.J.G.B., Moulavi, D., Sander, J. (2013). Density-Based Clustering + Based on Hierarchical Density Estimates. In: Pei, J., Tseng, V.S., Cao, L., + Motoda, H., Xu, G. (eds) Advances in Knowledge Discovery and Data Mining. + PAKDD 2013. Lecture Notes in Computer Science(), vol 7819. Springer, Berlin, + Heidelberg. https://doi.org/10.1007/978-3-642-37456-2_14 + .. _optics: OPTICS From 38f701973d37335d264af018cba967f3d37bc11d Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Sat, 19 Mar 2022 19:42:58 -0400 Subject: [PATCH 034/160] Added authorship/license info --- sklearn/cluster/_hdbscan/_validity.py | 6 ++++++ sklearn/cluster/_hdbscan/hdbscan_.py | 12 +++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_validity.py b/sklearn/cluster/_hdbscan/_validity.py index e8a1092b2d545..c609f4db84ee8 100644 --- a/sklearn/cluster/_hdbscan/_validity.py +++ b/sklearn/cluster/_hdbscan/_validity.py @@ -1,3 +1,9 @@ +# Author: Leland McInnes +# Steve Astels +# John Healy +# +# License: BSD 3 clause + import numpy as np from sklearn.metrics import pairwise_distances from scipy.spatial.distance import cdist diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index 808c443f379cc..86aff2c107b14 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -1,8 +1,12 @@ -# -*- coding: utf-8 -*- """ HDBSCAN: Hierarchical Density-Based Spatial Clustering of Applications with Noise """ +# Author: Leland McInnes +# Steve Astels +# John Healy +# +# License: BSD 3 clause import numpy as np from numpy import isclose @@ -37,12 +41,6 @@ FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics + ["cosine", "arccos"] -# Author: Leland McInnes -# Steve Astels -# John Healy -# -# License: BSD 3 clause - def _tree_to_labels( single_linkage_tree, From 236c2197f9e122c72509ecba49639529afe1abf1 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 20 Mar 2022 22:35:02 -0400 Subject: [PATCH 035/160] Fixed lists in `hdbscan` and improved user guide documentation --- doc/modules/classes.rst | 2 ++ doc/modules/clustering.rst | 32 +++++++++++++++++----------- sklearn/cluster/_hdbscan/hdbscan_.py | 26 +++++++++++----------- 3 files changed, 35 insertions(+), 25 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 4b6d39368287c..716778ab65efc 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -102,6 +102,7 @@ Classes cluster.AgglomerativeClustering cluster.Birch cluster.DBSCAN + 
cluster.HDBSCAN
   cluster.FeatureAgglomeration
   cluster.KMeans
   cluster.MiniBatchKMeans
@@ -122,6 +123,7 @@ Functions
   cluster.cluster_optics_xi
   cluster.compute_optics_graph
   cluster.dbscan
+   cluster.hdbscan
   cluster.estimate_bandwidth
   cluster.k_means
   cluster.kmeans_plusplus
diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst
index 7a3e66b07c907..48933669f75cd 100644
--- a/doc/modules/clustering.rst
+++ b/doc/modules/clustering.rst
@@ -893,6 +893,9 @@ capture them. HDBSCAN alleviates this assumption and explores all possible
 density scales by building an alternative representation of the clustering
 problem.
 
+Mutual Reachability Graph
+-------------------------
+
 HDBSCAN first defines the *core distance* of a sample :math:`x_p` as the
 distance to its `min_samples`-nearest neighbor, counting itself. For example,
 if `min_samples=5` and :math:`x_*` is the 5th-nearest neighbor of :math:`x_p`
@@ -915,18 +918,16 @@ with value greater than `eps`. Any points whose core distance is greater than `eps`
 are at this stage marked as noise. The remaining points are then clustered by
 finding the connected components of this trimmed graph.
 
-
-.. note::
-
-    The clustering generated by taking the connected components of a trimmed
-    graph :math:`G_{ms,\epsilon}` equivalent to running DBSCAN* with `min_samples`
-    and `eps`. DBSCAN* is a slightly modified version of DBSCAN mentioned in
-
-
-An outline of the HDBSCAN algorithm using the mutual reachability graph is as
-follows:
-
-  1. Extract the minimum spanning tree (MST) of :math:`G_{ms}`
+Hierarchical Clustering
+-----------------------
+HDBSCAN can be seen as an algorithm which performs DBSCAN* clustering across all
+values of `eps`. As mentioned prior, this is equivalent to finding the connected
+components of the mutual reachability graphs for all values of `eps`. To do this
+efficiently, HDBSCAN first extracts a minimum spanning tree (MST) from the fully
+connected mutual reachability graph, then greedily cuts the edges with highest
+weight. An outline of the HDBSCAN algorithm is as follows:
+
+  1. Extract the MST of :math:`G_{ms}`
   2. Extend the MST by adding a "self edge" for each vertex, with weight equal
      to the core distance of the underlying sample.
   3. Initialize a single cluster and label for the MST.
@@ -937,6 +938,13 @@ follows:
      one edge it is instead assigned a "null" label marking it as noise.
   6. Repeat steps 4-5 until there are no more connected components.
 
+.. note::
+
+    The clustering generated by taking the connected components of a trimmed
+    graph :math:`G_{ms,\epsilon}` equivalent to running DBSCAN* with `min_samples`
+    and `eps`. DBSCAN* is a slightly modified version of DBSCAN mentioned in
+    <`https://doi.org/10.1007/978-3-642-37456-2_14`_>
+
 HDBSCAN is therefore able to obtain all possible partitions obtainable by
 DBSCAN* for a fixed choice of `min_samples` in a hierarchical fashion.
 Indeed, this allows HDBSCAN to perform clustering across multiple densities
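To make the minimum spanning tree description above concrete, the following sketch
(again editorial, not part of the patch) performs a single cut of the mutual
reachability MST with SciPy, reproducing a DBSCAN*-style partition at one fixed
scale; `mutual_reachability` is the matrix from the earlier sketch and `eps` is an
arbitrary value chosen only for illustration::

    from scipy.sparse.csgraph import connected_components, minimum_spanning_tree

    mst = minimum_spanning_tree(mutual_reachability)  # sparse matrix of MST edges
    eps = 1.0  # one fixed density scale, for illustration only

    # Cut every MST edge heavier than eps and label the surviving components.
    pruned = mst.copy()
    pruned.data[pruned.data > eps] = 0
    pruned.eliminate_zeros()
    n_components, labels = connected_components(pruned, directed=False)

    # Points whose core distance exceeds eps become singleton components here;
    # HDBSCAN instead sweeps over every eps and keeps the most stable clusters.

Cutting the MST at a threshold yields the same connected components as thresholding
the full graph, so each such cut corresponds to one level of the hierarchy described
above.

diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py
index 86aff2c107b14..cc9e8a46c430b 100644
--- a/sklearn/cluster/_hdbscan/hdbscan_.py
+++ b/sklearn/cluster/_hdbscan/hdbscan_.py
@@ -559,12 +559,12 @@ def hdbscan(
         The metric to use when calculating distance between instances in a
         feature array.
 
-        If metric is a string or callable, it must be one of
-        the options allowed by `metrics.pairwise.pairwise_distances` for its
-        metric parameter.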
+ * If metric is a string or callable, it must be one of + the options allowed by `metrics.pairwise.pairwise_distances` for its + metric parameter. - If metric is "precomputed", X is assumed to be a distance matrix and - must be square. + * If metric is "precomputed", X is assumed to be a distance matrix and + must be square. leaf_size : int, default=40 Leaf size for trees responsible for fast nearest @@ -576,12 +576,12 @@ def hdbscan( to `best` which chooses the "best" algorithm given the nature of the data. You can force other options if you believe you know better. Options are: - * `best` - * `generic` - * `prims_kdtree` - * `prims_balltree` - * `boruvka_kdtree` - * `boruvka_balltree` + * `best` + * `generic` + * `prims_kdtree` + * `prims_balltree` + * `boruvka_kdtree` + * `boruvka_balltree` memory : str, default=None Used to cache the output of the computation of the tree. @@ -609,8 +609,8 @@ def hdbscan( to find the most persistent clusters. Alternatively you can instead select the clusters at the leaves of the tree -- this provides the most fine grained and homogeneous clusters. Options are: - * `eom` - * `leaf` + * `eom` + * `leaf` allow_single_cluster : bool, default=False By default HDBSCAN* will not produce a single cluster, setting this From 7ba96eaea3b132f50abb66838373ae51329eb105 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Mon, 21 Mar 2022 13:19:25 -0400 Subject: [PATCH 036/160] Added name mapping for hdbscan function autosummary --- doc/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/conf.py b/doc/conf.py index 304acbdfd8afd..e7b36c665f228 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -549,6 +549,7 @@ def setup(app): "sklearn.cluster.dbscan": "dbscan-function", "sklearn.covariance.oas": "oas-function", "sklearn.decomposition.fastica": "fastica-function", + "sklearn.cluster.hdbscan": "hdbscan-function", } From b5dcdca2de68272ad44f7fefefa07edda9118334 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 25 Mar 2022 18:25:47 -0400 Subject: [PATCH 037/160] Added hdbscan to `plot_cluster_comparison` --- examples/cluster/plot_cluster_comparison.py | 3 ++- sklearn/cluster/_hdbscan/hdbscan_.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/cluster/plot_cluster_comparison.py b/examples/cluster/plot_cluster_comparison.py index 8b52759c79018..952912c51dc08 100644 --- a/examples/cluster/plot_cluster_comparison.py +++ b/examples/cluster/plot_cluster_comparison.py @@ -161,6 +161,7 @@ affinity="nearest_neighbors", ) dbscan = cluster.DBSCAN(eps=params["eps"]) + hdbscan = cluster.HDBSCAN() optics = cluster.OPTICS( min_samples=params["min_samples"], xi=params["xi"], @@ -188,7 +189,7 @@ ("Ward", ward), ("Agglomerative\nClustering", average_linkage), ("DBSCAN", dbscan), - ("OPTICS", optics), + ("HDBSCAN", hdbscan)("OPTICS", optics), ("BIRCH", birch), ("Gaussian\nMixture", gmm), ) diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index cc9e8a46c430b..e8801c988483f 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -959,7 +959,7 @@ class HDBSCAN(BaseEstimator, ClusterMixin): resulting clustering, but may have an effect on the runtime of the algorithm. - memory : Instance of joblib.Memory or str, default=Memory(verbose=1) + memory : str, default=None Used to cache the output of the computation of the tree. By default, no caching is done. If a string is given, it is the path to the caching directory. 
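A brief usage sketch (editorial, not part of the patch; the cache path and toy data
are placeholders): with the string-valued `memory` parameter shown above, a cache
directory can be passed directly, and repeated fits with identical data and settings
may reuse the cached tree computation::

    from sklearn.cluster import HDBSCAN
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=500, random_state=0)

    hdb = HDBSCAN(min_cluster_size=15, memory="/tmp/hdbscan_cache")  # any writable path
    hdb.fit(X)  # the first fit computes the single-linkage tree and caches it
    hdb.fit(X)  # an identical refit may load the tree from the cache instead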
@@ -1109,7 +1109,7 @@ def __init__( alpha=1.0, algorithm="best", leaf_size=40, - memory=Memory(cachedir=None, verbose=0), + memory=None, approx_min_span_tree=True, gen_min_span_tree=False, core_dist_n_jobs=4, From d7734d45d0b58d28558f59af726711c5a12e49b6 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 25 Mar 2022 18:29:23 -0400 Subject: [PATCH 038/160] Fixed sphinx lists --- sklearn/cluster/_hdbscan/hdbscan_.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index e8801c988483f..6c3b96d8e449b 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -929,12 +929,12 @@ class HDBSCAN(BaseEstimator, ClusterMixin): The metric to use when calculating distance between instances in a feature array. - If metric is a string or callable, it must be one of - the options allowed by `metrics.pairwise.pairwise_distances` for its - metric parameter. + - If metric is a string or callable, it must be one of + the options allowed by `metrics.pairwise.pairwise_distances` for its + metric parameter. - If metric is "precomputed", X is assumed to be a distance matrix and - must be square. + - If metric is "precomputed", X is assumed to be a distance matrix and + must be square. alpha : float, default=1.0 A distance scaling parameter as used in robust single linkage. @@ -946,12 +946,12 @@ class HDBSCAN(BaseEstimator, ClusterMixin): to `best` which chooses the "best" algorithm given the nature of the data. You can force other options if you believe you know better. Options are: - * `best` - * `generic` - * `prims_kdtree` - * `prims_balltree` - * `boruvka_kdtree` - * `boruvka_balltree` + - `best` + - `generic` + - `prims_kdtree` + - `prims_balltree` + - `boruvka_kdtree` + - `boruvka_balltree` leaf_size : int, default=40 If using a space tree algorithm (kdtree, or balltree) the number @@ -986,8 +986,8 @@ class HDBSCAN(BaseEstimator, ClusterMixin): to find the most persistent clusters. Alternatively you can instead select the clusters at the leaves of the tree -- this provides the most fine grained and homogeneous clusters. 
Options are: - * `eom` - * `leaf` + - `eom` + - `leaf` allow_single_cluster : bool, default=False By default HDBSCAN* will not produce a single cluster, setting this From 9f83f6e112efc83f4af5937da130c750e54aed6b Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 25 Mar 2022 18:31:55 -0400 Subject: [PATCH 039/160] Added initial hdbscan plot file --- examples/cluster/plot_hdbscan.py | 87 ++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 examples/cluster/plot_hdbscan.py diff --git a/examples/cluster/plot_hdbscan.py b/examples/cluster/plot_hdbscan.py new file mode 100644 index 0000000000000..6b43f6917f6c6 --- /dev/null +++ b/examples/cluster/plot_hdbscan.py @@ -0,0 +1,87 @@ +# -*- coding: utf-8 -*- +""" +=================================== +Demo of HDBSCAN clustering algorithm +=================================== + +""" + +import numpy as np + +from sklearn.cluster import HDBSCAN +from sklearn import metrics +from sklearn.datasets import make_blobs +from sklearn.preprocessing import StandardScaler + + +# %% +# Generate sample data +# -------------------- +centers = [[1, 1], [-1, -1], [1, -1]] +X, labels_true = make_blobs( + n_samples=750, centers=centers, cluster_std=0.4, random_state=0 +) + +X = StandardScaler().fit_transform(X) + +# %% +# Compute DBSCAN +# -------------- +hdb = HDBSCAN().fit(X) +core_samples_mask = np.zeros_like(hdb.labels_, dtype=bool) +core_samples_mask[hdb.core_sample_indices_] = True +labels = hdb.labels_ + +# Number of clusters in labels, ignoring noise if present. +n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) +n_noise_ = list(labels).count(-1) + +print("Estimated number of clusters: %d" % n_clusters_) +print("Estimated number of noise points: %d" % n_noise_) +print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) +print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels)) +print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)) +print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels)) +print( + "Adjusted Mutual Information: %0.3f" + % metrics.adjusted_mutual_info_score(labels_true, labels) +) +print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels)) + +# %% +# Plot result +# ----------- +import matplotlib.pyplot as plt + +# Black removed and is used for noise instead. +unique_labels = set(labels) +colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))] +for k, col in zip(unique_labels, colors): + if k == -1: + # Black used for noise. 
+ col = [0, 0, 0, 1] + + class_member_mask = labels == k + + xy = X[class_member_mask & core_samples_mask] + plt.plot( + xy[:, 0], + xy[:, 1], + "o", + markerfacecolor=tuple(col), + markeredgecolor="k", + markersize=14, + ) + + xy = X[class_member_mask & ~core_samples_mask] + plt.plot( + xy[:, 0], + xy[:, 1], + "o", + markerfacecolor=tuple(col), + markeredgecolor="k", + markersize=6, + ) + +plt.title("Estimated number of clusters: %d" % n_clusters_) +plt.show() From f98b6bfcd7dfa743b603fc1a67d9ba3efa1debc4 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 25 Mar 2022 18:32:53 -0400 Subject: [PATCH 040/160] Modified clustering rst for image inclusion --- doc/modules/clustering.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 48933669f75cd..4125ebaaf4f48 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -958,6 +958,10 @@ than `minimum_cluster_size` many samples are considered noise. In practice, one can set `minimum_cluster_size = min_samples` to couple the parameters and simplify the hyperparameter space. +.. |hdbscan_results| image:: ../auto_examples/cluster/images/sphx_glr_plot_hdbscan_001.png + :target: ../auto_examples/cluster/plot_hdbscan.html + :scale: 50 + .. topic:: References: * Campello, R.J.G.B., Moulavi, D., Sander, J. (2013). Density-Based Clustering From 5365c3ae1460252f70f6def091b193442ffbedb9 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 25 Mar 2022 19:26:08 -0400 Subject: [PATCH 041/160] Corrected plotting for HDBSCAN --- examples/cluster/plot_cluster_comparison.py | 12 ++++++++++-- examples/cluster/plot_hdbscan.py | 21 +++------------------ sklearn/cluster/_hdbscan/hdbscan_.py | 16 ++++++++-------- 3 files changed, 21 insertions(+), 28 deletions(-) diff --git a/examples/cluster/plot_cluster_comparison.py b/examples/cluster/plot_cluster_comparison.py index 952912c51dc08..afcf166772c50 100644 --- a/examples/cluster/plot_cluster_comparison.py +++ b/examples/cluster/plot_cluster_comparison.py @@ -79,6 +79,9 @@ "min_samples": 7, "xi": 0.05, "min_cluster_size": 0.1, + "allow_single_cluster": True, + "hdbscan_min_cluster_size": 15, + "hdbscan_min_samples": 3, } datasets = [ @@ -161,7 +164,11 @@ affinity="nearest_neighbors", ) dbscan = cluster.DBSCAN(eps=params["eps"]) - hdbscan = cluster.HDBSCAN() + hdbscan = cluster.HDBSCAN( + min_samples=params["hdbscan_min_samples"], + min_cluster_size=params["hdbscan_min_cluster_size"], + allow_single_cluster=params["allow_single_cluster"], + ) optics = cluster.OPTICS( min_samples=params["min_samples"], xi=params["xi"], @@ -189,7 +196,8 @@ ("Ward", ward), ("Agglomerative\nClustering", average_linkage), ("DBSCAN", dbscan), - ("HDBSCAN", hdbscan)("OPTICS", optics), + ("HDBSCAN", hdbscan), + ("OPTICS", optics), ("BIRCH", birch), ("Gaussian\nMixture", gmm), ) diff --git a/examples/cluster/plot_hdbscan.py b/examples/cluster/plot_hdbscan.py index 6b43f6917f6c6..a2d153fda1036 100644 --- a/examples/cluster/plot_hdbscan.py +++ b/examples/cluster/plot_hdbscan.py @@ -25,11 +25,9 @@ X = StandardScaler().fit_transform(X) # %% -# Compute DBSCAN +# Compute HDBSCAN # -------------- hdb = HDBSCAN().fit(X) -core_samples_mask = np.zeros_like(hdb.labels_, dtype=bool) -core_samples_mask[hdb.core_sample_indices_] = True labels = hdb.labels_ # Number of clusters in labels, ignoring noise if present. @@ -61,27 +59,14 @@ # Black used for noise. 
col = [0, 0, 0, 1] - class_member_mask = labels == k - - xy = X[class_member_mask & core_samples_mask] - plt.plot( - xy[:, 0], - xy[:, 1], - "o", - markerfacecolor=tuple(col), - markeredgecolor="k", - markersize=14, - ) - - xy = X[class_member_mask & ~core_samples_mask] + xy = X[labels == k] plt.plot( xy[:, 0], xy[:, 1], "o", markerfacecolor=tuple(col), markeredgecolor="k", - markersize=6, + markersize=8, ) - plt.title("Estimated number of clusters: %d" % n_clusters_) plt.show() diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index 6c3b96d8e449b..baf8a0e3be36b 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -946,12 +946,12 @@ class HDBSCAN(BaseEstimator, ClusterMixin): to `best` which chooses the "best" algorithm given the nature of the data. You can force other options if you believe you know better. Options are: - - `best` - - `generic` - - `prims_kdtree` - - `prims_balltree` - - `boruvka_kdtree` - - `boruvka_balltree` + - `best` + - `generic` + - `prims_kdtree` + - `prims_balltree` + - `boruvka_kdtree` + - `boruvka_balltree` leaf_size : int, default=40 If using a space tree algorithm (kdtree, or balltree) the number @@ -986,8 +986,8 @@ class HDBSCAN(BaseEstimator, ClusterMixin): to find the most persistent clusters. Alternatively you can instead select the clusters at the leaves of the tree -- this provides the most fine grained and homogeneous clusters. Options are: - - `eom` - - `leaf` + - `eom` + - `leaf` allow_single_cluster : bool, default=False By default HDBSCAN* will not produce a single cluster, setting this From b25d2ada8c4d717d40dcd08ba48ea4f1260e4f89 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 25 Mar 2022 20:13:39 -0400 Subject: [PATCH 042/160] Fixed image display in user guide entry and fixed hdbscan doc --- doc/modules/clustering.rst | 12 +++++++----- sklearn/cluster/_hdbscan/hdbscan_.py | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 4125ebaaf4f48..14ec685f02e6f 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -943,7 +943,7 @@ weight. An outline of the HDBSCAN algorithm is as follows: The clustering generated by taking the connected components of a trimmed graph :math:`G_{ms,\epsilon}` equivalent to running DBSCAN* with `min_samples` and `eps`. DBSCAN* is a slightly modified version of DBSCAN mentioned in - <`https://doi.org/10.1007/978-3-642-37456-2_14`_> + https://doi.org/10.1007/978-3-642-37456-2_14 HDBSCAN is therefore able to obtain all possible partitions obtainable by DBSCAN* for a fixed choice of `min_samples` in a hierarchical fashion. @@ -952,16 +952,18 @@ and as such it no longer needs `eps` to be given as a hyperparameter. Instead it relies solely on the choice of `min_samples`, which tends to be a more robust hyperparameter. +.. |hdbscan_results| image:: ../auto_examples/cluster/images/sphx_glr_plot_hdbscan_001.png + :target: ../auto_examples/cluster/plot_hdbscan.html + :scale: 50 + +.. centered:: |hdbscan_results| + HDBSCAN can be smoothed with an additional hyperparameter `minimum_cluster_size` which specifies that during the hierarchical clustering, components with fewer than `minimum_cluster_size` many samples are considered noise. In practice, one can set `minimum_cluster_size = min_samples` to couple the parameters and simplify the hyperparameter space. -.. 
|hdbscan_results| image:: ../auto_examples/cluster/images/sphx_glr_plot_hdbscan_001.png - :target: ../auto_examples/cluster/plot_hdbscan.html - :scale: 50 - .. topic:: References: * Campello, R.J.G.B., Moulavi, D., Sander, J. (2013). Density-Based Clustering diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index baf8a0e3be36b..d203fc84261e1 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -1093,7 +1093,7 @@ class HDBSCAN(BaseEstimator, ClusterMixin): >>> hdb.fit(X) HDBSCAN(min_cluster_size=20) >>> hdb.labels_ - array([ 2, 6, -1, ..., -1, -1, -1], dtype=int64) + array([ 2, 6, -1, ..., -1, -1, -1]) """ def _more_tags(self): From 103642d9f47ffe441b65e6a73101d477efbefdda Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 25 Mar 2022 20:39:08 -0400 Subject: [PATCH 043/160] Added entry to algorithm comparison table --- doc/modules/clustering.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 14ec685f02e6f..77beba0599b8e 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -93,6 +93,13 @@ Overview of clustering methods transductive - Distances between nearest points + * - :ref:`HDBSCAN ` + - minimum cluster membership, minimum point neighbors + - large ``n_samples``, medium ``n_clusters`` + - Non-flat geometry, uneven cluster sizes, outlier removal, + transductive, hierarchical, variable cluster density + - Distances between nearest points + * - :ref:`OPTICS ` - minimum cluster membership - Very large ``n_samples``, large ``n_clusters`` From ba3302d71eade52c9a76fe9d79483efb22d41243 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 25 Mar 2022 20:49:32 -0400 Subject: [PATCH 044/160] Added link to original hdbscan repository --- doc/modules/clustering.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 77beba0599b8e..4b607116cb4d9 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -900,6 +900,11 @@ capture them. HDBSCAN alleviates this assumption and explores all possible density scales by building an alternative representation of the clustering problem. +.. 
note:: + + This implementation is adapted from the work done in + https://github.com/scikit-learn-contrib/hdbscan + Mutual Reachability Graph ------------------------- From 0f46e6cb6e665421442d005bf964634dc55f4fb2 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 25 Mar 2022 21:14:08 -0400 Subject: [PATCH 045/160] Updated tests and improved caching code - Updated test to account for future warning in mwinkowski - Replaced `cachedir` with `location` due to deprecation --- sklearn/cluster/_hdbscan/hdbscan_.py | 2 +- sklearn/cluster/_hdbscan/tests/test_hdbscan.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index d203fc84261e1..6d28467384b6c 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -714,7 +714,7 @@ def hdbscan( check_precomputed_distance_matrix(X) # Python 2 and 3 compliant string_type checking - memory = Memory(cachedir=memory, verbose=0) + memory = Memory(location=memory, verbose=0) size = X.shape[0] min_samples = min(size - 1, min_samples) diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index 0d2853037e068..fde399fa6481e 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -168,6 +168,14 @@ def test_hdbscan_algorithms(algo, metric): metric=metric, metric_params=METRIC_PARAMS.get(metric, None), ) + elif metric == "wminkowski": + with pytest.warns(FutureWarning): + hdbscan( + X, + algorithm=algo, + metric=metric, + metric_params=METRIC_PARAMS.get(metric, None), + ) else: hdbscan( X, From e7165ae43cc3b961886a1250ce170401c7d492c0 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 27 Mar 2022 18:17:37 -0400 Subject: [PATCH 046/160] Removed extra properties/attributes - Removed `outlier_scores` - Removed `min_spanning_tree` - Removed `gen_min_spanning_tree` --- examples/cluster/plot_hdbscan.py | 6 +- sklearn/cluster/_hdbscan/_hdbscan_tree.pyx | 63 ---- sklearn/cluster/_hdbscan/_validity.py | 2 +- sklearn/cluster/_hdbscan/hdbscan_.py | 287 ++---------------- .../cluster/_hdbscan/tests/test_hdbscan.py | 18 +- 5 files changed, 36 insertions(+), 340 deletions(-) diff --git a/examples/cluster/plot_hdbscan.py b/examples/cluster/plot_hdbscan.py index a2d153fda1036..fb26302cd3808 100644 --- a/examples/cluster/plot_hdbscan.py +++ b/examples/cluster/plot_hdbscan.py @@ -1,8 +1,8 @@ # -*- coding: utf-8 -*- """ -=================================== +==================================== Demo of HDBSCAN clustering algorithm -=================================== +==================================== """ @@ -26,7 +26,7 @@ # %% # Compute HDBSCAN -# -------------- +# --------------- hdb = HDBSCAN().fit(X) labels = hdb.labels_ diff --git a/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx b/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx index ef51db0635473..d40cb15070057 100644 --- a/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx +++ b/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx @@ -525,69 +525,6 @@ cdef get_probabilities(np.ndarray tree, dict cluster_map, np.ndarray labels): return result -cpdef np.ndarray[np.double_t, ndim=1] outlier_scores(np.ndarray tree): - """Generate GLOSH outlier scores from a condensed tree. - - Parameters - ---------- - tree : numpy recarray - The condensed tree to generate GLOSH outlier scores from - - Returns - ------- - outlier_scores : ndarray (n_samples,) - Outlier scores for each sample point. 
The larger the score - the more outlying the point. - """ - - cdef np.ndarray[np.double_t, ndim=1] result - cdef np.ndarray[np.double_t, ndim=1] deaths - cdef np.ndarray[np.double_t, ndim=1] lambda_array - cdef np.ndarray[np.intp_t, ndim=1] child_array - cdef np.ndarray[np.intp_t, ndim=1] parent_array - cdef np.intp_t root_cluster - cdef np.intp_t point - cdef np.intp_t parent - cdef np.intp_t cluster - cdef np.double_t lambda_max - - child_array = tree['child'] - parent_array = tree['parent'] - lambda_array = tree['lambda_val'] - - deaths = max_lambdas(tree) - root_cluster = parent_array.min() - result = np.zeros(root_cluster, dtype=np.double) - - topological_sort_order = np.argsort(parent_array) - # topologically_sorted_tree = tree[topological_sort_order] - - for n in topological_sort_order: - cluster = child_array[n] - if cluster < root_cluster: - break - - parent = parent_array[n] - if deaths[cluster] > deaths[parent]: - deaths[parent] = deaths[cluster] - - for n in range(tree.shape[0]): - point = child_array[n] - if point >= root_cluster: - continue - - cluster = parent_array[n] - lambda_max = deaths[cluster] - - - if lambda_max == 0.0 or not np.isfinite(lambda_array[n]): - result[point] = 0.0 - else: - result[point] = (lambda_max - lambda_array[n]) / lambda_max - - return result - - cpdef np.ndarray get_stability_scores(np.ndarray labels, set clusters, dict stability, np.double_t max_lambda): diff --git a/sklearn/cluster/_hdbscan/_validity.py b/sklearn/cluster/_hdbscan/_validity.py index c609f4db84ee8..2c1874542097a 100644 --- a/sklearn/cluster/_hdbscan/_validity.py +++ b/sklearn/cluster/_hdbscan/_validity.py @@ -8,7 +8,7 @@ from sklearn.metrics import pairwise_distances from scipy.spatial.distance import cdist from ._hdbscan_linkage import mst_linkage_core -from .hdbscan_ import isclose +from numpy import isclose def all_points_core_distance(distance_matrix, d=2.0): diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index 6d28467384b6c..7595251960cd4 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -9,7 +9,6 @@ # License: BSD 3 clause import numpy as np -from numpy import isclose from sklearn.base import BaseEstimator, ClusterMixin from sklearn.metrics import pairwise_distances @@ -31,7 +30,6 @@ condense_tree, compute_stability, get_clusters, - outlier_scores, labelling_at_cut, ) from ._hdbscan_reachability import mutual_reachability, sparse_mutual_reachability @@ -66,7 +64,7 @@ def _tree_to_labels( max_cluster_size, ) - return (labels, probabilities, stabilities, condensed_tree, single_linkage_tree) + return (labels, probabilities, stabilities, single_linkage_tree) def _hdbscan_generic( @@ -76,7 +74,6 @@ def _hdbscan_generic( metric="minkowski", p=2, leaf_size=None, - gen_min_span_tree=False, **kwargs, ): if metric == "minkowski": @@ -102,7 +99,6 @@ def _hdbscan_generic( metric, p, leaf_size, - gen_min_span_tree, **kwargs, ) @@ -120,29 +116,13 @@ def _hdbscan_generic( UserWarning, ) - # mst_linkage_core does not generate a full minimal spanning tree - # If a tree is required then we must build the edges from the information - # returned by mst_linkage_core (i.e. 
just the order of points to be merged) - if gen_min_span_tree: - result_min_span_tree = min_spanning_tree.copy() - for index, row in enumerate(result_min_span_tree[1:], 1): - candidates = np.where(isclose(mutual_reachability_[int(row[1])], row[2]))[0] - candidates = np.intersect1d( - candidates, min_spanning_tree[:index, :2].astype(int) - ) - candidates = candidates[candidates != row[1]] - assert len(candidates) > 0 - row[0] = candidates[0] - else: - result_min_span_tree = None - # Sort edges of the min_spanning_tree by weight min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :] # Convert edge list into standard hierarchical clustering format single_linkage_tree = label(min_spanning_tree) - return single_linkage_tree, result_min_span_tree + return single_linkage_tree def _hdbscan_sparse_distance_matrix( @@ -152,7 +132,6 @@ def _hdbscan_sparse_distance_matrix( metric="minkowski", p=2, leaf_size=40, - gen_min_span_tree=False, **kwargs, ): assert issparse(X) @@ -205,10 +184,7 @@ def _hdbscan_sparse_distance_matrix( # Convert edge list into standard hierarchical clustering format single_linkage_tree = label(min_spanning_tree) - if gen_min_span_tree: - return single_linkage_tree, min_spanning_tree - else: - return single_linkage_tree, None + return single_linkage_tree def _hdbscan_prims_kdtree( @@ -217,7 +193,6 @@ def _hdbscan_prims_kdtree( alpha=1.0, metric="minkowski", leaf_size=40, - gen_min_span_tree=False, **kwargs, ): if X.dtype != np.float64: @@ -246,10 +221,7 @@ def _hdbscan_prims_kdtree( # Convert edge list into standard hierarchical clustering format single_linkage_tree = label(min_spanning_tree) - if gen_min_span_tree: - return single_linkage_tree, min_spanning_tree - else: - return single_linkage_tree, None + return single_linkage_tree def _hdbscan_prims_balltree( @@ -258,7 +230,6 @@ def _hdbscan_prims_balltree( alpha=1.0, metric="minkowski", leaf_size=40, - gen_min_span_tree=False, **kwargs, ): if X.dtype != np.float64: @@ -284,10 +255,7 @@ def _hdbscan_prims_balltree( # Convert edge list into standard hierarchical clustering format single_linkage_tree = label(min_spanning_tree) - if gen_min_span_tree: - return single_linkage_tree, min_spanning_tree - else: - return single_linkage_tree, None + return single_linkage_tree def _hdbscan_boruvka_kdtree( @@ -297,7 +265,6 @@ def _hdbscan_boruvka_kdtree( metric="minkowski", leaf_size=40, approx_min_span_tree=True, - gen_min_span_tree=False, core_dist_n_jobs=4, **kwargs, ): @@ -335,10 +302,7 @@ def _hdbscan_boruvka_kdtree( # Convert edge list into standard hierarchical clustering format single_linkage_tree = label(min_spanning_tree) - if gen_min_span_tree: - return single_linkage_tree, min_spanning_tree - else: - return single_linkage_tree, None + return single_linkage_tree def _hdbscan_boruvka_balltree( @@ -348,7 +312,6 @@ def _hdbscan_boruvka_balltree( metric="minkowski", leaf_size=40, approx_min_span_tree=True, - gen_min_span_tree=False, core_dist_n_jobs=4, **kwargs, ): @@ -377,10 +340,7 @@ def _hdbscan_boruvka_balltree( # Convert edge list into standard hierarchical clustering format single_linkage_tree = label(min_spanning_tree) - if gen_min_span_tree: - return single_linkage_tree, min_spanning_tree - else: - return single_linkage_tree, None + return single_linkage_tree def check_precomputed_distance_matrix(X): @@ -509,7 +469,6 @@ def hdbscan( algorithm="best", memory=None, approx_min_span_tree=True, - gen_min_span_tree=False, core_dist_n_jobs=4, cluster_selection_method="eom", allow_single_cluster=False, @@ 
-576,12 +535,12 @@ def hdbscan( to `best` which chooses the "best" algorithm given the nature of the data. You can force other options if you believe you know better. Options are: - * `best` - * `generic` - * `prims_kdtree` - * `prims_balltree` - * `boruvka_kdtree` - * `boruvka_balltree` + - `best` + - `generic` + - `prims_kdtree` + - `prims_balltree` + - `boruvka_kdtree` + - `boruvka_balltree` memory : str, default=None Used to cache the output of the computation of the tree. @@ -595,9 +554,6 @@ def hdbscan( If you are willing to sacrifice speed for correctness you may want to explore this; in general this should be left at the default True. - gen_min_span_tree : bool, default=False - Whether to generate the minimum spanning tree for later analysis. - core_dist_n_jobs : int, default=4 Number of parallel jobs to run in core distance computations (if supported by the specific algorithm). For `core_dist_n_jobs` @@ -609,8 +565,8 @@ def hdbscan( to find the most persistent clusters. Alternatively you can instead select the clusters at the leaves of the tree -- this provides the most fine grained and homogeneous clusters. Options are: - * `eom` - * `leaf` + - `eom` + - `leaf` allow_single_cluster : bool, default=False By default HDBSCAN* will not produce a single cluster, setting this @@ -644,18 +600,11 @@ def hdbscan( scores can be guage the relative coherence of the clusters output by the algorithm. - condensed_tree : record array - The condensed cluster hierarchy used to generate clusters. - single_linkage_tree : ndarray, shape (n_samples - 1, 4) The single linkage tree produced during clustering in scipy hierarchical clustering format (see http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html). - min_spanning_tree : ndarray, shape (n_samples - 1, 3) - The minimum spanning as an edgelist. If gen_min_span_tree was False - this will be None. - References ---------- @@ -727,58 +676,46 @@ def hdbscan( raise ValueError("Sparse data matrices only support algorithm 'generic'.") if algorithm == "generic": - (single_linkage_tree, result_min_span_tree) = memory.cache( - _hdbscan_generic - )( + single_linkage_tree = memory.cache(_hdbscan_generic)( X, min_samples, alpha, metric, leaf_size, - gen_min_span_tree, **metric_params, ) elif algorithm == "prims_kdtree": if metric not in KDTree.valid_metrics: raise ValueError("Cannot use Prim's with KDTree for this metric!") - (single_linkage_tree, result_min_span_tree) = memory.cache( - _hdbscan_prims_kdtree - )( + single_linkage_tree = memory.cache(_hdbscan_prims_kdtree)( X, min_samples, alpha, metric, leaf_size, - gen_min_span_tree, **metric_params, ) elif algorithm == "prims_balltree": if metric not in BallTree.valid_metrics: raise ValueError("Cannot use Prim's with BallTree for this metric!") - (single_linkage_tree, result_min_span_tree) = memory.cache( - _hdbscan_prims_balltree - )( + single_linkage_tree = memory.cache(_hdbscan_prims_balltree)( X, min_samples, alpha, metric, leaf_size, - gen_min_span_tree, **metric_params, ) elif algorithm == "boruvka_kdtree": if metric not in KDTree.valid_metrics: raise ValueError("Cannot use Boruvka with KDTree for this metric!") - (single_linkage_tree, result_min_span_tree) = memory.cache( - _hdbscan_boruvka_kdtree - )( + single_linkage_tree = memory.cache(_hdbscan_boruvka_kdtree)( X, min_samples, alpha, metric, leaf_size, approx_min_span_tree, - gen_min_span_tree, core_dist_n_jobs, **metric_params, ) @@ -791,16 +728,13 @@ def hdbscan( "memory usage. 
If you are running out of memory consider " "increasing the `leaf_size` parameter." ) - (single_linkage_tree, result_min_span_tree) = memory.cache( - _hdbscan_boruvka_balltree - )( + single_linkage_tree = memory.cache(_hdbscan_boruvka_balltree)( X, min_samples, alpha, metric, leaf_size, approx_min_span_tree, - gen_min_span_tree, core_dist_n_jobs, **metric_params, ) @@ -810,43 +744,34 @@ def hdbscan( if issparse(X) or metric not in FAST_METRICS: # We can't do much with sparse matrices ... - (single_linkage_tree, result_min_span_tree) = memory.cache( - _hdbscan_generic - )( + single_linkage_tree = memory.cache(_hdbscan_generic)( X, min_samples, alpha, metric, leaf_size, - gen_min_span_tree, **metric_params, ) elif metric in KDTree.valid_metrics: # TO DO: Need heuristic to decide when to go to boruvka; # still debugging for now if X.shape[1] > 60: - (single_linkage_tree, result_min_span_tree) = memory.cache( - _hdbscan_prims_kdtree - )( + single_linkage_tree = memory.cache(_hdbscan_prims_kdtree)( X, min_samples, alpha, metric, leaf_size, - gen_min_span_tree, **metric_params, ) else: - (single_linkage_tree, result_min_span_tree) = memory.cache( - _hdbscan_boruvka_kdtree - )( + single_linkage_tree = memory.cache(_hdbscan_boruvka_kdtree)( X, min_samples, alpha, metric, leaf_size, approx_min_span_tree, - gen_min_span_tree, core_dist_n_jobs, **metric_params, ) @@ -854,28 +779,22 @@ def hdbscan( # TO DO: Need heuristic to decide when to go to boruvka; # still debugging for now if X.shape[1] > 60: - (single_linkage_tree, result_min_span_tree) = memory.cache( - _hdbscan_prims_balltree - )( + single_linkage_tree = memory.cache(_hdbscan_prims_balltree)( X, min_samples, alpha, metric, leaf_size, - gen_min_span_tree, **metric_params, ) else: - (single_linkage_tree, result_min_span_tree) = memory.cache( - _hdbscan_boruvka_balltree - )( + single_linkage_tree = memory.cache(_hdbscan_boruvka_balltree)( X, min_samples, alpha, metric, leaf_size, approx_min_span_tree, - gen_min_span_tree, core_dist_n_jobs, **metric_params, ) @@ -888,7 +807,7 @@ def hdbscan( match_reference_implementation, cluster_selection_epsilon, max_cluster_size, - ) + (result_min_span_tree,) + ) # Inherits from sklearn @@ -971,10 +890,6 @@ class HDBSCAN(BaseEstimator, ClusterMixin): If you are willing to sacrifice speed for correctness you may want to explore this; in general this should be left at the default True. - gen_min_span_tree : bool, default=False - Whether to generate the minimum spanning tree with regard - to mutual reachability distance for later analysis. - core_dist_n_jobs : int, default=4 Number of parallel jobs to run in core distance computations (if supported by the specific algorithm). For `core_dist_n_jobs` @@ -1030,22 +945,6 @@ class HDBSCAN(BaseEstimator, ClusterMixin): scores can be guage the relative coherence of the clusters output by the algorithm. - outlier_scores_ : ndarray, shape (n_samples, ) - Outlier scores for clustered points; the larger the score the more - outlier-like the point. Useful as an outlier detection technique. - Based on the GLOSH algorithm by Campello, Moulavi, Zimek and Sander. - - relative_validity_ : float - A fast approximation of the Density Based Cluster Validity (DBCV) - score [4]. The only differece, and the speed, comes from the fact - that this relative_validity_ is computed using the mutual- - reachability minimum spanning tree, i.e. minimum_spanning_tree_, - instead of the all-points minimum spanning tree used in the - reference. 
This score might not be an objective measure of the - goodness of clusterering. It may only be used to compare results - across different choices of hyper-parameters, therefore is only a - relative score. - n_features_in_ : int Number of features seen during :term:`fit`. @@ -1111,7 +1010,6 @@ def __init__( leaf_size=40, memory=None, approx_min_span_tree=True, - gen_min_span_tree=False, core_dist_n_jobs=4, cluster_selection_method="eom", allow_single_cluster=False, @@ -1129,7 +1027,6 @@ def __init__( self.leaf_size = leaf_size self.memory = memory self.approx_min_span_tree = approx_min_span_tree - self.gen_min_span_tree = gen_min_span_tree self.core_dist_n_jobs = core_dist_n_jobs self.cluster_selection_method = cluster_selection_method self.allow_single_cluster = allow_single_cluster @@ -1190,17 +1087,12 @@ def fit(self, X, y=None): self.labels_, self.probabilities_, self.cluster_persistence_, - self._condensed_tree_, self._single_linkage_tree_, - self._min_spanning_tree, ) = hdbscan(X, **kwargs) if self.metric != "precomputed" and not self._all_finite: # remap indices to align with original data in the case of # non-finite entries. - self._condensed_tree_ = remap_condensed_tree( - self._condensed_tree_, internal_to_raw, outliers - ) self._single_linkage_tree_ = remap_single_linkage_tree( self._single_linkage_tree_, internal_to_raw, outliers ) @@ -1347,130 +1239,3 @@ def dbscan_clustering(self, cut_distance, min_cluster_size=5): return labelling_at_cut( self._single_linkage_tree_, cut_distance, min_cluster_size ) - - @property - def outlier_scores_(self): - """ - Points with larger scores are more outlier-like points. - """ - if getattr(self, "_outlier_scores", None) is not None: - return self._outlier_scores - else: - if getattr(self, "_condensed_tree_", None) is not None: - self._outlier_scores = outlier_scores(self._condensed_tree_) - return self._outlier_scores - else: - raise AttributeError( - "No condensed tree was generated; try running fit first." - ) - - @property - def minimum_spanning_tree_(self): - """ - The minimum spanning tree of the mutual reachability graph. - """ - if getattr(self, "_min_spanning_tree", None) is not None: - if self._raw_data is not None: - return self._min_spanning_tree, self._raw_data - else: - warn( - "No raw data is available; this may be due to using" - " a precomputed metric matrix. No minimum spanning" - " tree will be provided without raw data." - ) - return None - else: - raise AttributeError( - "No minimum spanning tree was generated." - "This may be due to optimized algorithm variations that skip" - " explicit generation of the spanning tree." - ) - - @property - def relative_validity_(self): - """ - A fast approximation of the Density Based Cluster Validity (DBCV) score. - """ - if getattr(self, "_relative_validity", None) is not None: - return self._relative_validity - - if not self.gen_min_span_tree: - raise AttributeError( - "Minimum spanning tree not present. " - + "Either HDBSCAN object was created with " - + "gen_min_span_tree=False or the tree was " - + "not generated in spite of it owing to " - + "internal optimization criteria." 
- ) - return - - labels = self.labels_ - sizes = np.bincount(labels + 1) - noise_size = sizes[0] - cluster_size = sizes[1:] - total = noise_size + np.sum(cluster_size) - num_clusters = len(cluster_size) - DSC = np.zeros(num_clusters) - min_outlier_sep = np.inf # only required if num_clusters = 1 - correction_const = 2 # only required if num_clusters = 1 - - # Unltimately, for each Ci, we only require the - # minimum of DSPC(Ci, Cj) over all Cj != Ci. - # So let's call this value DSPC_wrt(Ci), i.e. - # density separation 'with respect to' Ci. - DSPC_wrt = np.ones(num_clusters) * np.inf - max_distance = 0 - - mst_df = self.minimum_spanning_tree_.to_pandas() - - for edge in mst_df.iterrows(): - label1 = labels[int(edge[1]["from"])] - label2 = labels[int(edge[1]["to"])] - length = edge[1]["distance"] - - max_distance = max(max_distance, length) - - if label1 == -1 and label2 == -1: - continue - elif label1 == -1 or label2 == -1: - # If exactly one of the points is noise - min_outlier_sep = min(min_outlier_sep, length) - continue - - if label1 == label2: - # Set the density sparseness of the cluster - # to the sparsest value seen so far. - DSC[label1] = max(length, DSC[label1]) - else: - # Check whether density separations with - # respect to each of these clusters can - # be reduced. - DSPC_wrt[label1] = min(length, DSPC_wrt[label1]) - DSPC_wrt[label2] = min(length, DSPC_wrt[label2]) - - # In case min_outlier_sep is still np.inf, we assign a new value to it. - # This only makes sense if num_clusters = 1 since it has turned out - # that the MR-MST has no edges between a noise point and a core point. - min_outlier_sep = max_distance if min_outlier_sep == np.inf else min_outlier_sep - - # DSPC_wrt[Ci] might be infinite if the connected component for Ci is - # an "island" in the MR-MST. Whereas for other clusters Cj and Ck, the - # MR-MST might contain an edge with one point in Cj and ther other one - # in Ck. Here, we replace the infinite density separation of Ci by - # another large enough value. - # - # TODO: Think of a better yet efficient way to handle this. 
- correction = correction_const * ( - max_distance if num_clusters > 1 else min_outlier_sep - ) - DSPC_wrt[np.where(DSPC_wrt == np.inf)] = correction - - V_index = [ - (DSPC_wrt[i] - DSC[i]) / max(DSPC_wrt[i], DSC[i]) - for i in range(num_clusters) - ] - score = np.sum( - [(cluster_size[i] * V_index[i]) / total for i in range(num_clusters)] - ) - self._relative_validity = score - return self._relative_validity diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index fde399fa6481e..328421ca4b6da 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -108,7 +108,7 @@ def test_hdbscan_sparse_distance_matrix(): n_clusters_1 = len(set(labels)) - int(-1 in labels) # ignore noise assert n_clusters_1 == n_clusters - labels = HDBSCAN(metric="precomputed", gen_min_span_tree=True).fit(D).labels_ + labels = HDBSCAN(metric="precomputed").fit(D).labels_ n_clusters_2 = len(set(labels)) - int(-1 in labels) assert n_clusters_2 == n_clusters @@ -143,7 +143,7 @@ def test_hdbscan_algorithms(algo, metric): n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters - labels = HDBSCAN(algorithm=algo, gen_min_span_tree=True).fit(X).labels_ + labels = HDBSCAN(algorithm=algo).fit(X).labels_ n_clusters_2 = len(set(labels)) - int(-1 in labels) assert n_clusters_2 == n_clusters @@ -196,7 +196,7 @@ def test_hdbscan_high_dimensional(): H, y = make_blobs(n_samples=50, random_state=0, n_features=64) # H, y = shuffle(X, y, random_state=7) H = StandardScaler().fit_transform(H) - labels, p, persist, ctree, ltree, mtree = hdbscan(H) + labels = hdbscan(H)[0] n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters @@ -214,9 +214,9 @@ def test_hdbscan_high_dimensional(): def test_hdbscan_best_balltree_metric(): - labels, p, persist, ctree, ltree, mtree = hdbscan( - X, metric="seuclidean", metric_params={"V": np.ones(X.shape[1])} - ) + labels = hdbscan(X, metric="seuclidean", metric_params={"V": np.ones(X.shape[1])})[ + 0 + ] n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters @@ -289,12 +289,6 @@ def test_hdbscan_boruvka_matches(tree): assert (num_mismatches / float(data.shape[0])) < 0.15 -def test_hdbscan_outliers(): - clusterer = HDBSCAN(gen_min_span_tree=True).fit(X) - scores = clusterer.outlier_scores_ - assert scores is not None - - def test_hdbscan_badargs(): assert_raises(ValueError, hdbscan, X="fail") assert_raises(ValueError, hdbscan, X=None) From 7834bf1b95db2e9d72eb57d1cea324f178109170 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 27 Mar 2022 18:20:55 -0400 Subject: [PATCH 047/160] Cleaned up function signatures --- sklearn/cluster/_hdbscan/hdbscan_.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index 7595251960cd4..079d4e3b052a0 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -96,9 +96,6 @@ def _hdbscan_generic( distance_matrix, min_samples, alpha, - metric, - p, - leaf_size, **kwargs, ) @@ -129,9 +126,6 @@ def _hdbscan_sparse_distance_matrix( X, min_samples=5, alpha=1.0, - metric="minkowski", - p=2, - leaf_size=40, **kwargs, ): assert issparse(X) @@ -261,7 +255,6 @@ def _hdbscan_prims_balltree( def _hdbscan_boruvka_kdtree( X, min_samples=5, - alpha=1.0, metric="minkowski", leaf_size=40, approx_min_span_tree=True, @@ -308,7 +301,6 @@ def _hdbscan_boruvka_kdtree( def 
_hdbscan_boruvka_balltree( X, min_samples=5, - alpha=1.0, metric="minkowski", leaf_size=40, approx_min_span_tree=True, @@ -712,7 +704,6 @@ def hdbscan( single_linkage_tree = memory.cache(_hdbscan_boruvka_kdtree)( X, min_samples, - alpha, metric, leaf_size, approx_min_span_tree, @@ -731,7 +722,6 @@ def hdbscan( single_linkage_tree = memory.cache(_hdbscan_boruvka_balltree)( X, min_samples, - alpha, metric, leaf_size, approx_min_span_tree, @@ -768,7 +758,6 @@ def hdbscan( single_linkage_tree = memory.cache(_hdbscan_boruvka_kdtree)( X, min_samples, - alpha, metric, leaf_size, approx_min_span_tree, @@ -791,7 +780,6 @@ def hdbscan( single_linkage_tree = memory.cache(_hdbscan_boruvka_balltree)( X, min_samples, - alpha, metric, leaf_size, approx_min_span_tree, From 4a4e3eb011ba96f6a758c8f00ada0054136e8593 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 27 Mar 2022 18:54:33 -0400 Subject: [PATCH 048/160] Trimmed docstring, renamed param, removed extra parameters/attrs - Renamed `core_dist_n_jobs` to `n_jobs` - Removed mention of prediciton algorithms in docstring - Removed cluster persistance and stability scores - Removed `prediction_data` since prediction API no longer included --- sklearn/cluster/_hdbscan/_hdbscan_tree.pyx | 19 +---- sklearn/cluster/_hdbscan/hdbscan_.py | 98 +++++++--------------- 2 files changed, 32 insertions(+), 85 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx b/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx index d40cb15070057..31726cc900bea 100644 --- a/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx +++ b/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx @@ -525,22 +525,6 @@ cdef get_probabilities(np.ndarray tree, dict cluster_map, np.ndarray labels): return result -cpdef np.ndarray get_stability_scores(np.ndarray labels, set clusters, - dict stability, np.double_t max_lambda): - - cdef np.intp_t cluster_size - cdef np.intp_t n - - result = np.empty(len(clusters), dtype=np.double) - for n, c in enumerate(sorted(list(clusters))): - cluster_size = np.sum(labels == n) - if np.isinf(max_lambda) or max_lambda == 0.0 or cluster_size == 0: - result[n] = 1.0 - else: - result[n] = stability[c] / (cluster_size * max_lambda) - - return result - cpdef list recurse_leaf_dfs(np.ndarray cluster_tree, np.intp_t current_node): children = cluster_tree[cluster_tree['parent'] == current_node]['child'] if len(children) == 0: @@ -737,6 +721,5 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability, allow_single_cluster, cluster_selection_epsilon, match_reference_implementation) probs = get_probabilities(tree, reverse_cluster_map, labels) - stabilities = get_stability_scores(labels, clusters, stability, max_lambda) - return (labels, probs, stabilities) + return (labels, probs) diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index 079d4e3b052a0..ec86c35cacfc1 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -54,7 +54,7 @@ def _tree_to_labels( """ condensed_tree = condense_tree(single_linkage_tree, min_cluster_size) stability_dict = compute_stability(condensed_tree) - labels, probabilities, stabilities = get_clusters( + labels, probabilities = get_clusters( condensed_tree, stability_dict, cluster_selection_method, @@ -64,7 +64,7 @@ def _tree_to_labels( max_cluster_size, ) - return (labels, probabilities, stabilities, single_linkage_tree) + return (labels, probabilities, single_linkage_tree) def _hdbscan_generic( @@ -258,14 +258,14 @@ def _hdbscan_boruvka_kdtree( metric="minkowski", leaf_size=40, 
approx_min_span_tree=True, - core_dist_n_jobs=4, + n_jobs=4, **kwargs, ): if leaf_size < 3: leaf_size = 3 - if core_dist_n_jobs < 1: - core_dist_n_jobs = max(cpu_count() + 1 + core_dist_n_jobs, 1) + if n_jobs < 1: + n_jobs = max(cpu_count() + 1 + n_jobs, 1) if X.dtype != np.float64: X = X.astype(np.float64) @@ -285,7 +285,7 @@ def _hdbscan_boruvka_kdtree( metric=metric, leaf_size=leaf_size // 3, approx_min_span_tree=approx_min_span_tree, - n_jobs=core_dist_n_jobs, + n_jobs=n_jobs, **kwargs, ) min_spanning_tree = alg.spanning_tree() @@ -304,14 +304,14 @@ def _hdbscan_boruvka_balltree( metric="minkowski", leaf_size=40, approx_min_span_tree=True, - core_dist_n_jobs=4, + n_jobs=4, **kwargs, ): if leaf_size < 3: leaf_size = 3 - if core_dist_n_jobs < 1: - core_dist_n_jobs = max(cpu_count() + 1 + core_dist_n_jobs, 1) + if n_jobs < 1: + n_jobs = max(cpu_count() + 1 + n_jobs, 1) if X.dtype != np.float64: X = X.astype(np.float64) @@ -323,7 +323,7 @@ def _hdbscan_boruvka_balltree( metric=metric, leaf_size=leaf_size // 3, approx_min_span_tree=approx_min_span_tree, - n_jobs=core_dist_n_jobs, + n_jobs=n_jobs, **kwargs, ) min_spanning_tree = alg.spanning_tree() @@ -461,7 +461,7 @@ def hdbscan( algorithm="best", memory=None, approx_min_span_tree=True, - core_dist_n_jobs=4, + n_jobs=4, cluster_selection_method="eom", allow_single_cluster=False, match_reference_implementation=False, @@ -492,19 +492,13 @@ def hdbscan( cluster_selection_epsilon : float, default=0.0 A distance threshold. Clusters below this value will be merged. - See [3]_ for more information. Note that this should not be used - if we want to predict the cluster labels for new points in future - (e.g. using approximate_predict), as the approximate_predict function - is not aware of this argument. + See [3]_ for more information. max_cluster_size : int, default=0 A limit to the size of clusters returned by the eom algorithm. Has no effect when using leaf clustering (where clusters are usually small regardless) and can also be overridden in rare - cases by a high value for cluster_selection_epsilon. Note that - this should not be used if we want to predict the cluster labels - for new points in future (e.g. using approximate_predict), as - the approximate_predict function is not aware of this argument. + cases by a high value for cluster_selection_epsilon. metric : str or callable, default='minkowski' The metric to use when calculating distance between instances in a @@ -546,10 +540,10 @@ def hdbscan( If you are willing to sacrifice speed for correctness you may want to explore this; in general this should be left at the default True. - core_dist_n_jobs : int, default=4 + n_jobs : int, default=4 Number of parallel jobs to run in core distance computations (if - supported by the specific algorithm). For `core_dist_n_jobs` - below -1, (n_cpus + 1 + core_dist_n_jobs) are used. + supported by the specific algorithm). For `n_jobs` + below -1, (n_cpus + 1 + n_jobs) are used. cluster_selection_method : str, default='eom' The method used to select clusters from the condensed tree. The @@ -585,13 +579,6 @@ def hdbscan( Cluster membership strengths for each point. Noisy samples are assigned 0. - cluster_persistence : array, shape (n_clusters, ) - A score of how persistent each cluster is. A score of 1.0 represents - a perfectly stable cluster that persists over all distance scales, - while a score of 0.0 represents a perfectly ephemeral cluster. These - scores can be guage the relative coherence of the clusters output - by the algorithm. 
- single_linkage_tree : ndarray, shape (n_samples - 1, 4) The single linkage tree produced during clustering in scipy hierarchical clustering format @@ -707,7 +694,7 @@ def hdbscan( metric, leaf_size, approx_min_span_tree, - core_dist_n_jobs, + n_jobs, **metric_params, ) elif algorithm == "boruvka_balltree": @@ -725,7 +712,7 @@ def hdbscan( metric, leaf_size, approx_min_span_tree, - core_dist_n_jobs, + n_jobs, **metric_params, ) else: @@ -761,7 +748,7 @@ def hdbscan( metric, leaf_size, approx_min_span_tree, - core_dist_n_jobs, + n_jobs, **metric_params, ) else: # Metric is a valid BallTree metric @@ -783,7 +770,7 @@ def hdbscan( metric, leaf_size, approx_min_span_tree, - core_dist_n_jobs, + n_jobs, **metric_params, ) @@ -824,13 +811,10 @@ class HDBSCAN(BaseEstimator, ClusterMixin): See [5]_ for more information. max_cluster_size : int, default=0 - A limit to the size of clusters returned by the eom algorithm. - Has no effect when using leaf clustering (where clusters are - usually small regardless) and can also be overridden in rare - cases by a high value for cluster_selection_epsilon. Note that - this should not be used if we want to predict the cluster labels - for new points in future (e.g. using approximate_predict), as - the approximate_predict function is not aware of this argument. + A limit to the size of clusters returned by the `eom` cluster selection + algorithm. Has no effect if `cluster_selection_method=leaf`. Can be + overridden in rare cases by a high value for + `cluster_selection_epsilon`. metric : str or callable, default='euclidean' The metric to use when calculating distance between instances in a @@ -861,7 +845,7 @@ class HDBSCAN(BaseEstimator, ClusterMixin): - `boruvka_balltree` leaf_size : int, default=40 - If using a space tree algorithm (kdtree, or balltree) the number + If using a space tree algorithm (`KDTree`, or `BallTree`) the number of points ina leaf node of the tree. This does not alter the resulting clustering, but may have an effect on the runtime of the algorithm. @@ -876,12 +860,12 @@ class HDBSCAN(BaseEstimator, ClusterMixin): For some algorithms this can provide a significant speedup, but the resulting clustering may be of marginally lower quality. If you are willing to sacrifice speed for correctness you may want - to explore this; in general this should be left at the default True. + to explore this; in general this should be left at the default `True`. - core_dist_n_jobs : int, default=4 + n_jobs : int, default=4 Number of parallel jobs to run in core distance computations (if - supported by the specific algorithm). For `core_dist_n_jobs` - below -1, (n_cpus + 1 + core_dist_n_jobs) are used. + supported by the specific algorithm). For `n_jobs` + below -1, (n_cpus + 1 + n_jobs) are used. cluster_selection_method : str, default='eom' The method used to select clusters from the condensed tree. The @@ -897,12 +881,6 @@ class HDBSCAN(BaseEstimator, ClusterMixin): to True will override this and allow single cluster results in the case that you feel this is a valid result for your dataset. - prediction_data : bool, default=False - Whether to generate extra cached data for predicting labels or - membership vectors few new unseen points later. If you wish to - persist the clustering object for later re-use you probably want - to set this to True. 
- match_reference_implementation : bool, default=False There exist some interpretational differences between this HDBSCAN* implementation and the original authors reference @@ -926,13 +904,6 @@ class HDBSCAN(BaseEstimator, ClusterMixin): have values assigned proportional to the degree that they persist as part of the cluster. - cluster_persistence_ : ndarray, shape (n_clusters, ) - A score of how persistent each cluster is. A score of 1.0 represents - a perfectly stable cluster that persists over all distance scales, - while a score of 0.0 represents a perfectly ephemeral cluster. These - scores can be guage the relative coherence of the clusters output - by the algorithm. - n_features_in_ : int Number of features seen during :term:`fit`. @@ -998,10 +969,9 @@ def __init__( leaf_size=40, memory=None, approx_min_span_tree=True, - core_dist_n_jobs=4, + n_jobs=4, cluster_selection_method="eom", allow_single_cluster=False, - prediction_data=False, match_reference_implementation=False, metric_params=None, ): @@ -1015,11 +985,10 @@ def __init__( self.leaf_size = leaf_size self.memory = memory self.approx_min_span_tree = approx_min_span_tree - self.core_dist_n_jobs = core_dist_n_jobs + self.n_jobs = n_jobs self.cluster_selection_method = cluster_selection_method self.allow_single_cluster = allow_single_cluster self.match_reference_implementation = match_reference_implementation - self.prediction_data = prediction_data self.metric_params = metric_params def fit(self, X, y=None): @@ -1068,13 +1037,11 @@ def fit(self, X, y=None): kwargs = self.get_params() # prediction data only applies to the persistent model, so remove # it from the keyword args we pass on the the function - kwargs.pop("prediction_data", None) kwargs["metric_params"] = metric_params ( self.labels_, self.probabilities_, - self.cluster_persistence_, self._single_linkage_tree_, ) = hdbscan(X, **kwargs) @@ -1092,9 +1059,6 @@ def fit(self, X, y=None): new_probabilities[finite_index] = self.probabilities_ self.probabilities_ = new_probabilities - if self.prediction_data: - self.generate_prediction_data() - return self def fit_predict(self, X, y=None): From df65fb957fd3de9adafa2d9f6c489be2c0961bcd Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 27 Mar 2022 19:20:58 -0400 Subject: [PATCH 049/160] Moved single-use functions in-line --- sklearn/cluster/_hdbscan/hdbscan_.py | 34 +++++++++++----------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index ec86c35cacfc1..5a8d4c6b06a48 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -335,16 +335,6 @@ def _hdbscan_boruvka_balltree( return single_linkage_tree -def check_precomputed_distance_matrix(X): - """ - Perform check_array(X) after removing infinite values (numpy.inf) - from the given distance matrix. 
- """ - tmp = X.copy() - tmp[np.isinf(tmp)] = 1 - check_array(tmp) - - def remap_condensed_tree(tree, internal_to_raw, outliers): """ Takes an internal condensed_tree structure and adds back in a set of points @@ -427,14 +417,6 @@ def remap_single_linkage_tree(tree, internal_to_raw, outliers): return tree -def is_finite(matrix): - """Returns true only if all the values of a ndarray or sparse matrix are finite""" - if issparse(matrix): - return np.alltrue(np.isfinite(matrix.tocoo().data)) - else: - return np.alltrue(np.isfinite(matrix)) - - def get_finite_row_indices(matrix): """ Returns the indices of the purely finite rows of a @@ -638,8 +620,13 @@ def hdbscan( X = check_array(X, accept_sparse="csr", force_all_finite=False) else: # Only non-sparse, precomputed distance matrices are handled here - # and thereby allowed to contain numpy.inf for missing distances - check_precomputed_distance_matrix(X) + # and thereby allowed to contain numpy.inf for missing distances + + # Perform check_array(X) after removing infinite values (numpy.inf) + # from the given distance matrix. + tmp = X.copy() + tmp[np.isinf(tmp)] = 1 + check_array(tmp) # Python 2 and 3 compliant string_type checking memory = Memory(location=memory, verbose=0) @@ -1016,7 +1003,12 @@ def fit(self, X, y=None): X = self._validate_data(X, force_all_finite=False, accept_sparse="csr") self._raw_data = X - self._all_finite = is_finite(X) + self._all_finite = ( + np.alltrue(np.isfinite(X.tocoo().data)) + if issparse(X) + else np.alltrue(np.isfinite(X)) + ) + if not self._all_finite: # Pass only the purely finite indices into hdbscan # We will later assign all non-finite points to the From 4bd72e5411e9fd75750a2e9d57d45add3c3bfc6f Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 27 Mar 2022 19:33:42 -0400 Subject: [PATCH 050/160] Trim cython file by removing functionality for old `prediction` --- .../cluster/_hdbscan/_prediction_utils.pyx | 199 ------------------ 1 file changed, 199 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_prediction_utils.pyx b/sklearn/cluster/_hdbscan/_prediction_utils.pyx index bced8713ca128..59b549ffe4a6e 100644 --- a/sklearn/cluster/_hdbscan/_prediction_utils.pyx +++ b/sklearn/cluster/_hdbscan/_prediction_utils.pyx @@ -21,98 +21,6 @@ cpdef get_tree_row_with_child(np.ndarray tree, np.intp_t child): return tree[0] -cdef np.float64_t min_dist_to_exemplar( - np.ndarray[np.float64_t, ndim=1] point, - np.ndarray[np.float64_t, ndim=2] cluster_exemplars, - DistanceMetric dist_metric): - - cdef np.intp_t i - cdef np.float64_t result = DBL_MAX - cdef np.float64_t distance - cdef np.float64_t *point_ptr = ( point.data) - cdef np.float64_t[:, ::1] exemplars_view = \ - ( - ( cluster_exemplars.data)) - cdef np.float64_t *exemplars_ptr = \ - ( &exemplars_view[0, 0]) - cdef np.intp_t num_features = point.shape[0] - - for i in range(cluster_exemplars.shape[0]): - distance = dist_metric.dist(point_ptr, - &exemplars_ptr[num_features * i], - num_features) - if distance < result: - result = distance - - return result - -cdef np.ndarray[np.float64_t, ndim=1] dist_vector( - np.ndarray[np.float64_t, ndim=1] point, - list exemplars_list, - DistanceMetric dist_metric): - - cdef np.intp_t i - cdef np.ndarray[np.float64_t, ndim=2] exemplars - cdef np.ndarray[np.float64_t, ndim=1] result = np.empty(len(exemplars_list)) - - - for i in range(len(exemplars_list)): - exemplars = exemplars_list[i] - result[i] = min_dist_to_exemplar(point, exemplars, dist_metric) - - return result - -cpdef np.ndarray[np.float64_t, ndim=1] 
dist_membership_vector( - np.ndarray[np.float64_t, ndim=1] point, - list exemplars_list, - DistanceMetric dist_metric, - softmax=False): - - cdef np.intp_t i - cdef np.ndarray[np.float64_t, ndim=1] result = np.empty(len(exemplars_list)) - cdef np.ndarray[np.float64_t, ndim=1] vector - cdef np.float64_t sum = 0.0 - - vector = dist_vector(point, exemplars_list, dist_metric) - - if softmax: - for i in range(vector.shape[0]): - result[i] = 1.0 / vector[i] - result = np.exp(result - np.nanmax(result)) - sum = np.sum(result) - - else: - for i in range(vector.shape[0]): - if vector[i] != 0: - result[i] = 1.0 / vector[i] - else: - result[i] = DBL_MAX / vector.shape[0] - sum += result[i] - - for i in range(result.shape[0]): - result[i] = result[i] / sum - - return result - -cpdef np.ndarray[np.float64_t, ndim=2] all_points_dist_membership_vector( - np.ndarray[np.float64_t, ndim=2] all_points, - list exemplars_list, - DistanceMetric dist_metric, - softmax=False): - - cdef np.ndarray[np.float64_t, ndim=2] result - cdef np.intp_t i - - result = np.empty((all_points.shape[0], len(exemplars_list)), - dtype=np.float64) - - for i in range(all_points.shape[0]): - result[i] = dist_membership_vector(all_points[i], - exemplars_list, - dist_metric, - softmax) - - return result cdef np.ndarray[np.float64_t, ndim=1] merge_height( np.intp_t point_cluster, @@ -238,113 +146,6 @@ cpdef np.ndarray[np.float64_t, ndim=1] per_cluster_scores( return result -cpdef np.ndarray[np.float64_t, ndim=1] outlier_membership_vector(neighbor, - lambda_, clusters, tree, max_lambda_dict, cluster_tree, - softmax=True): - - cdef np.ndarray[np.float64_t, ndim=1] result - - if softmax: - result = per_cluster_scores(neighbor, lambda_, clusters, tree, - max_lambda_dict, cluster_tree) - # Scale for numerical stability, mathematically equivalent with old - # version due to the scaling with the sum in below. 
- result = np.exp(result - np.nanmax(result)) - #result[~np.isfinite(result)] = np.finfo(np.double).max - else: - result = per_cluster_scores(neighbor, lambda_, clusters, tree, - max_lambda_dict, cluster_tree) - - result /= result.sum() - return result - -cpdef np.float64_t prob_in_some_cluster(neighbor, lambda_, clusters, tree, - max_lambda_dict, cluster_tree): - - cdef np.ndarray[np.float64_t, ndim=1] cluster_merge_heights - - cdef np.intp_t point_cluster - cdef np.float64_t point_lambda - cdef np.float64_t max_lambda - - point_row = get_tree_row_with_child(tree, neighbor) - point_cluster = point_row['parent'] - point_lambda = lambda_ - - cluster_merge_heights = \ - merge_height(point_cluster, point_lambda, clusters, cluster_tree) - point_height = cluster_merge_heights.max() - nearest_cluster = clusters[cluster_merge_heights.argmax()] - - max_lambda = max(lambda_, max_lambda_dict[nearest_cluster]) + 1e-8 # avoid z - - return (point_height / max_lambda) - -cpdef np.ndarray[np.float64_t, ndim=2] all_points_per_cluster_scores( - np.ndarray[np.intp_t, ndim=1] clusters, - np.ndarray tree, - dict max_lambda_dict, - np.ndarray cluster_tree): - - cdef np.intp_t num_points = tree['parent'].min() - cdef np.ndarray[np.float64_t, ndim=2] result_arr - cdef np.float64_t[:, ::1] result - cdef np.intp_t point - cdef np.intp_t point_cluster - cdef np.float64_t point_lambda - cdef np.float64_t max_lambda - - cdef np.intp_t i, j - - result_arr = np.empty((num_points, clusters.shape[0]), dtype=np.float64) - result = ( - ( result_arr.data)) - - point_tree = tree[tree['child_size'] == 1] - - for i in range(point_tree.shape[0]): - point_row = point_tree[i] - point = point_row['child'] - point_cluster = point_row['parent'] - point_lambda = point_row['lambda_val'] - max_lambda = max_lambda_dict[point_cluster] + 1e-8 # avoid zero lambda - - # Can we not do a faster merge height operation here? - result_arr[point] = merge_height(point_cluster, point_lambda, - clusters, cluster_tree) - - # Cythonize: result = np.exp(-(max_lambda / height)) - for j in range(result_arr.shape[1]): - result[point][j] = exp(-(max_lambda / result[point][j])) - - return result_arr - -cpdef np.ndarray[np.float64_t, ndim=2] all_points_outlier_membership_vector( - np.ndarray[np.intp_t, ndim=1] clusters, - np.ndarray tree, - dict max_lambda_dict, - np.ndarray cluster_tree, - np.intp_t softmax=True): - - cdef np.ndarray[np.float64_t, ndim=2] per_cluster_scores - - per_cluster_scores = all_points_per_cluster_scores( - clusters, - tree, - max_lambda_dict, - cluster_tree) - if softmax: - # Scale for numerical stability, mathematically equivalent with old - # version due to the scaling with the sum in below. - result = np.exp(per_cluster_scores - np.nanmax(per_cluster_scores)) - #result[~np.isfinite(result)] = np.finfo(np.double).max - else: - result = per_cluster_scores - - row_sums = result.sum(axis=1) - result = result / row_sums[:, np.newaxis] - - return result cpdef all_points_prob_in_some_cluster( np.ndarray[np.intp_t, ndim=1] clusters, From a25224fa30cf09f217530c23633935d337c1f34a Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Fri, 1 Apr 2022 15:12:34 -0400 Subject: [PATCH 051/160] Apply suggestions from code review Co-authored-by: Thomas J. 
Fan --- doc/modules/clustering.rst | 2 +- sklearn/cluster/_hdbscan/tests/test_hdbscan.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 4b607116cb4d9..9a0f389d886a5 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -970,7 +970,7 @@ hyperparameter. .. centered:: |hdbscan_results| -HDBSCAN can be smoothed with an additional hyperparameter `minimum_cluster_size` +HDBSCAN can be smoothed with an additional hyperparameter `min_cluster_size` which specifies that during the hierarchical clustering, components with fewer than `minimum_cluster_size` many samples are considered noise. In practice, one can set `minimum_cluster_size = min_samples` to couple the parameters and diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index 328421ca4b6da..df0f880b0a31c 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -13,7 +13,6 @@ from sklearn.cluster import HDBSCAN, hdbscan from sklearn.cluster._hdbscan._validity import validity_index -# from sklearn.cluster.tests.common import generate_clustered_data from sklearn.datasets import make_blobs from sklearn.utils import shuffle from sklearn.preprocessing import StandardScaler From 95d95a1933676f93634926e13907965b9ff89da6 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 1 Apr 2022 15:29:57 -0400 Subject: [PATCH 052/160] Removed unnecessary `_prediction_utils` files --- .../cluster/_hdbscan/_prediction_utils.pyx | 183 ------------------ sklearn/cluster/_hdbscan/hdbscan_.py | 6 +- .../_hdbscan/tests/test_prediction_utils.py | 12 -- sklearn/cluster/setup.py | 7 +- 4 files changed, 4 insertions(+), 204 deletions(-) delete mode 100644 sklearn/cluster/_hdbscan/_prediction_utils.pyx delete mode 100644 sklearn/cluster/_hdbscan/tests/test_prediction_utils.py diff --git a/sklearn/cluster/_hdbscan/_prediction_utils.pyx b/sklearn/cluster/_hdbscan/_prediction_utils.pyx deleted file mode 100644 index 59b549ffe4a6e..0000000000000 --- a/sklearn/cluster/_hdbscan/_prediction_utils.pyx +++ /dev/null @@ -1,183 +0,0 @@ -# Utility routines in cython for prediction in hdbscan -# Authors: Leland McInnes -# License: 3-clause BSD - -import numpy as np -cimport numpy as np - -from sklearn.metrics._dist_metrics cimport DistanceMetric - -from libc.float cimport DBL_MAX -from libc.math cimport exp - -cpdef get_tree_row_with_child(np.ndarray tree, np.intp_t child): - - cdef np.intp_t i - cdef np.ndarray[np.intp_t, ndim = 1] child_array = tree['child'] - - for i in range(tree.shape[0]): - if child_array[i] == child: - return tree[i] - - return tree[0] - - -cdef np.ndarray[np.float64_t, ndim=1] merge_height( - np.intp_t point_cluster, - np.float64_t point_lambda, - np.ndarray[np.intp_t, ndim=1] clusters, - np.ndarray cluster_tree): - - cdef np.intp_t i - cdef np.intp_t j - - cdef np.intp_t left_cluster - cdef np.intp_t right_cluster - cdef int took_right_parent - cdef int took_left_parent - cdef np.intp_t cluster - - cdef np.ndarray[np.float64_t, ndim=1] result = np.empty(clusters.shape[0], - dtype=np.float64) - cdef np.ndarray[np.intp_t, ndim=1] parents - cdef np.ndarray[np.intp_t, ndim=1] children - cdef np.ndarray[np.float64_t, ndim=1] lambdas - - # convert the cluster tree for fast direct access - parents = cluster_tree['parent'].astype(np.intp) - children = cluster_tree['child'].astype(np.intp) - lambdas = cluster_tree['lambda_val'].astype(np.float64) - - - 
for i in range(clusters.shape[0]): - - took_right_parent = False - took_left_parent = False - - right_cluster = clusters[i] - left_cluster = point_cluster - - while left_cluster != right_cluster: - if left_cluster > right_cluster: - took_left_parent = True - last_cluster = left_cluster - - # Set left_cluster to be its parent - for j in range(children.shape[0]): - if children[j] == left_cluster: - left_cluster = parents[j] - break - else: - took_right_parent = True - last_cluster = right_cluster - - # Set right_cluster to be its parent - for j in range(children.shape[0]): - if children[j] == right_cluster: - right_cluster = parents[j] - break - - if took_left_parent and took_right_parent: - # Take the lambda value of last_cluster merging in - for j in range(children.shape[0]): - if children[j] == last_cluster: - result[i] = lambdas[j] - break - else: - result[i] = point_lambda - - return result - - -cpdef np.float64_t safe_always_positive_division( - np.float64_t numerator, - np.float64_t denominator): - """ This is a helper function to divide numbers safely without getting a ZeroDivision error, the - function handles zero division by assuming the denominator is always positive - - Parameters - ---------- - numerator: floating - any floating point type - denominator: floating - any floating point type - - Returns - ------- - floating - """ - if denominator <= 0: - # prevent zero division or negative result - denominator = 1e-8 - return numerator / denominator - - -cpdef np.ndarray[np.float64_t, ndim=1] per_cluster_scores( - np.intp_t neighbor, - np.float32_t lambda_, - np.ndarray[np.intp_t, ndim=1] clusters, - np.ndarray tree, - dict max_lambda_dict, - np.ndarray cluster_tree): - - cdef np.intp_t point_cluster - cdef np.float64_t point_lambda - cdef np.float64_t max_lambda - - cdef np.intp_t i - - cdef np.ndarray[np.float64_t, ndim=1] result - - point_row = get_tree_row_with_child(tree, neighbor) - point_cluster = point_row['parent'] - point_lambda = lambda_ - max_lambda = max_lambda_dict[point_cluster] - - # Save an allocation by assigning and reusing result ... - # height = merge_height(point_cluster, point_lambda, - # clusters, cluster_tree) - result = merge_height(point_cluster, point_lambda, - clusters, cluster_tree) - - # Cythonize: result = np.exp(-(max_lambda / height)) - for i in range(result.shape[0]): - # result[i] = exp(-(max_lambda / result[i])) - result[i] = safe_always_positive_division(max_lambda, (max_lambda - result[i])) - - return result - - -cpdef all_points_prob_in_some_cluster( - np.ndarray[np.intp_t, ndim=1] clusters, - np.ndarray tree, - dict max_lambda_dict, - np.ndarray cluster_tree): - - cdef np.ndarray[np.float64_t, ndim=1] heights - cdef np.intp_t num_points = tree['parent'].min() - cdef np.ndarray[np.float64_t, ndim=1] result - cdef np.intp_t point - cdef np.intp_t point_cluster - cdef np.float64_t point_lambda - cdef np.float64_t max_lambda - - cdef np.intp_t i - - result = np.empty(num_points, dtype=np.float64) - - point_tree = tree[tree['child_size'] == 1] - - for i in range(point_tree.shape[0]): - point_row = point_tree[i] - point = point_row['child'] - point_cluster = point_row['parent'] - point_lambda = point_row['lambda_val'] - - # Can we not do a faster merge height operation here? 
- heights = merge_height(point_cluster, point_lambda, - clusters, cluster_tree) - max_lambda = max(max_lambda_dict[clusters[heights.argmax()]], - point_lambda) - result[point] = (heights.max() / max_lambda) - - return result diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index 5a8d4c6b06a48..ebc4fc87369ca 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -537,9 +537,9 @@ def hdbscan( - `leaf` allow_single_cluster : bool, default=False - By default HDBSCAN* will not produce a single cluster, setting this - to t=True will override this and allow single cluster results in - the case that you feel this is a valid result for your dataset. + By default HDBSCAN* will not produce a single cluster. Setting this to + `True` will allow single cluster results in the case that you feel this + is a valid result for your dataset. match_reference_implementation : bool, default=False There exist some interpretational differences between this diff --git a/sklearn/cluster/_hdbscan/tests/test_prediction_utils.py b/sklearn/cluster/_hdbscan/tests/test_prediction_utils.py deleted file mode 100644 index c6241c63d3713..0000000000000 --- a/sklearn/cluster/_hdbscan/tests/test_prediction_utils.py +++ /dev/null @@ -1,12 +0,0 @@ -import pytest - -from sklearn.cluster._hdbscan._prediction_utils import safe_always_positive_division - - -@pytest.mark.parametrize("denominator", [-1, 0, 1]) -def test_safe_always_positive_division(denominator): - numerator = 1 - # Given negative, zero and positive denominator and positive numerator - value = safe_always_positive_division(numerator, denominator) - # Make sure safe division is always positive and doesn't raise ZeroDivision error - assert value >= 0 diff --git a/sklearn/cluster/setup.py b/sklearn/cluster/setup.py index a073948a1bc70..eb5f622f65ad8 100644 --- a/sklearn/cluster/setup.py +++ b/sklearn/cluster/setup.py @@ -58,6 +58,7 @@ def configuration(parent_package="", top_path=None): ) config.add_subpackage("tests") + config.add_subpackage("_hdbscan") # HDBSCAN subpackage config.add_subpackage("_hdbscan.tests") @@ -85,12 +86,6 @@ def configuration(parent_package="", top_path=None): include_dirs=[numpy.get_include()], libraries=libraries, ) - config.add_extension( - "_hdbscan._prediction_utils", - sources=["_hdbscan/_prediction_utils.pyx"], - include_dirs=[numpy.get_include()], - libraries=libraries, - ) return config From cd83805e9dcbd125ba3a79df59744a2e629d7778 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 1 Apr 2022 15:37:37 -0400 Subject: [PATCH 053/160] Renamed most `kwargs`-->`metric_params` for consistency --- sklearn/cluster/_hdbscan/hdbscan_.py | 46 +++++++++++++--------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index ebc4fc87369ca..a158c5ace041d 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -73,13 +73,12 @@ def _hdbscan_generic( alpha=1.0, metric="minkowski", p=2, - leaf_size=None, - **kwargs, + **metric_params, ): if metric == "minkowski": distance_matrix = pairwise_distances(X, metric=metric, p=p) elif metric == "arccos": - distance_matrix = pairwise_distances(X, metric="cosine", **kwargs) + distance_matrix = pairwise_distances(X, metric="cosine", **metric_params) elif metric == "precomputed": # Treating this case explicitly, instead of letting # sklearn.metrics.pairwise_distances handle it, @@ -88,7 +87,7 @@ def 
_hdbscan_generic( # TODO: Check if copying is necessary distance_matrix = X.copy() else: - distance_matrix = pairwise_distances(X, metric=metric, **kwargs) + distance_matrix = pairwise_distances(X, metric=metric, **metric_params) if issparse(distance_matrix): # raise TypeError('Sparse distance matrices not yet supported') @@ -96,7 +95,7 @@ def _hdbscan_generic( distance_matrix, min_samples, alpha, - **kwargs, + **metric_params, ) mutual_reachability_ = mutual_reachability(distance_matrix, min_samples, alpha) @@ -126,7 +125,7 @@ def _hdbscan_sparse_distance_matrix( X, min_samples=5, alpha=1.0, - **kwargs, + **metric_params, ): assert issparse(X) # Check for connected component on X @@ -143,7 +142,7 @@ def _hdbscan_sparse_distance_matrix( # Compute sparse mutual reachability graph # if max_dist > 0, max distance to use when the reachability is infinite - max_dist = kwargs.get("max_dist", 0.0) + max_dist = metric_params.get("max_dist", 0.0) mutual_reachability_ = sparse_mutual_reachability( lil_matrix, min_points=min_samples, max_dist=max_dist, alpha=alpha ) @@ -187,7 +186,7 @@ def _hdbscan_prims_kdtree( alpha=1.0, metric="minkowski", leaf_size=40, - **kwargs, + **metric_params, ): if X.dtype != np.float64: X = X.astype(np.float64) @@ -196,10 +195,10 @@ def _hdbscan_prims_kdtree( if not X.flags["C_CONTIGUOUS"]: X = np.array(X, dtype=np.double, order="C") - tree = KDTree(X, metric=metric, leaf_size=leaf_size, **kwargs) + tree = KDTree(X, metric=metric, leaf_size=leaf_size, **metric_params) # TO DO: Deal with p for minkowski appropriately - dist_metric = DistanceMetric.get_metric(metric, **kwargs) + dist_metric = DistanceMetric.get_metric(metric, **metric_params) # Get distance to kth nearest neighbour core_distances = tree.query( @@ -224,7 +223,7 @@ def _hdbscan_prims_balltree( alpha=1.0, metric="minkowski", leaf_size=40, - **kwargs, + **metric_params, ): if X.dtype != np.float64: X = X.astype(np.float64) @@ -233,9 +232,9 @@ def _hdbscan_prims_balltree( if not X.flags["C_CONTIGUOUS"]: X = np.array(X, dtype=np.double, order="C") - tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs) + tree = BallTree(X, metric=metric, leaf_size=leaf_size, **metric_params) - dist_metric = DistanceMetric.get_metric(metric, **kwargs) + dist_metric = DistanceMetric.get_metric(metric, **metric_params) # Get distance to kth nearest neighbour core_distances = tree.query( @@ -259,7 +258,7 @@ def _hdbscan_boruvka_kdtree( leaf_size=40, approx_min_span_tree=True, n_jobs=4, - **kwargs, + **metric_params, ): if leaf_size < 3: leaf_size = 3 @@ -270,7 +269,7 @@ def _hdbscan_boruvka_kdtree( if X.dtype != np.float64: X = X.astype(np.float64) - tree = KDTree(X, metric=metric, leaf_size=leaf_size, **kwargs) + tree = KDTree(X, metric=metric, leaf_size=leaf_size, **metric_params) n_samples = X.shape[0] if min_samples + 1 > n_samples: @@ -286,7 +285,7 @@ def _hdbscan_boruvka_kdtree( leaf_size=leaf_size // 3, approx_min_span_tree=approx_min_span_tree, n_jobs=n_jobs, - **kwargs, + **metric_params, ) min_spanning_tree = alg.spanning_tree() # Sort edges of the min_spanning_tree by weight @@ -305,7 +304,7 @@ def _hdbscan_boruvka_balltree( leaf_size=40, approx_min_span_tree=True, n_jobs=4, - **kwargs, + **metric_params, ): if leaf_size < 3: leaf_size = 3 @@ -316,7 +315,7 @@ def _hdbscan_boruvka_balltree( if X.dtype != np.float64: X = X.astype(np.float64) - tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs) + tree = BallTree(X, metric=metric, leaf_size=leaf_size, **metric_params) alg = BallTreeBoruvkaAlgorithm( 
tree, min_samples, @@ -324,7 +323,7 @@ def _hdbscan_boruvka_balltree( leaf_size=leaf_size // 3, approx_min_span_tree=approx_min_span_tree, n_jobs=n_jobs, - **kwargs, + **metric_params, ) min_spanning_tree = alg.spanning_tree() # Sort edges of the min_spanning_tree by weight @@ -647,7 +646,6 @@ def hdbscan( min_samples, alpha, metric, - leaf_size, **metric_params, ) elif algorithm == "prims_kdtree": @@ -658,7 +656,6 @@ def hdbscan( min_samples, alpha, metric, - leaf_size, **metric_params, ) elif algorithm == "prims_balltree": @@ -713,7 +710,6 @@ def hdbscan( min_samples, alpha, metric, - leaf_size, **metric_params, ) elif metric in KDTree.valid_metrics: @@ -773,7 +769,7 @@ def hdbscan( # Inherits from sklearn -class HDBSCAN(BaseEstimator, ClusterMixin): +class HDBSCAN(ClusterMixin, BaseEstimator): """Perform HDBSCAN clustering from vector array or distance matrix. HDBSCAN - Hierarchical Density-Based Spatial Clustering of Applications @@ -1004,9 +1000,9 @@ def fit(self, X, y=None): self._raw_data = X self._all_finite = ( - np.alltrue(np.isfinite(X.tocoo().data)) + np.all(np.isfinite(X.tocoo().data)) if issparse(X) - else np.alltrue(np.isfinite(X)) + else np.all(np.isfinite(X)) ) if not self._all_finite: From dee1c46df4479adce6851fb3cb253a946f6ab708 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 1 Apr 2022 15:40:13 -0400 Subject: [PATCH 054/160] Added clarifying comment in `_validity.py` --- sklearn/cluster/_hdbscan/_validity.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/cluster/_hdbscan/_validity.py b/sklearn/cluster/_hdbscan/_validity.py index 2c1874542097a..7131d3dc6b905 100644 --- a/sklearn/cluster/_hdbscan/_validity.py +++ b/sklearn/cluster/_hdbscan/_validity.py @@ -3,6 +3,7 @@ # John Healy # # License: BSD 3 clause +# Currently only used in test_hdbscan.py for testing the correctness of HDBSCAN import numpy as np from sklearn.metrics import pairwise_distances From 2a8182419df14dd5e683698c1451c7f56cfc819d Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 1 Apr 2022 15:48:20 -0400 Subject: [PATCH 055/160] Added random state objects, and used `tmp_path` fixture --- sklearn/cluster/_hdbscan/tests/test_hdbscan.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index df0f880b0a31c..e4a1da3b05696 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -19,7 +19,6 @@ from scipy.stats import mode from sklearn.metrics.pairwise import _VALID_METRICS from sklearn.neighbors import KDTree, BallTree -from tempfile import mkdtemp import pytest from sklearn import datasets @@ -52,11 +51,12 @@ def test_missing_data(): def generate_noisy_data(): + rng = np.random.RandomState(0) blobs, _ = datasets.make_blobs( n_samples=200, centers=[(-0.75, 2.25), (1.0, 2.0)], cluster_std=0.25 ) moons, _ = datasets.make_moons(n_samples=200, noise=0.05) - noise = np.random.uniform(-1.0, 3.0, (50, 2)) + noise = rng.uniform(-1.0, 3.0, (50, 2)) return np.vstack([blobs, moons, noise]) @@ -324,11 +324,10 @@ def test_hdbscan_sparse(): assert n_clusters == 3 -def test_hdbscan_caching(): +def test_hdbscan_caching(tmp_path): - cachedir = mkdtemp() - labels1 = HDBSCAN(memory=cachedir, min_samples=5).fit(X).labels_ - labels2 = HDBSCAN(memory=cachedir, min_samples=5, min_cluster_size=6).fit(X).labels_ + labels1 = HDBSCAN(memory=tmp_path, min_samples=5).fit(X).labels_ + labels2 = HDBSCAN(memory=tmp_path, min_samples=5, 
min_cluster_size=6).fit(X).labels_ n_clusters1 = len(set(labels1)) - int(-1 in labels1) n_clusters2 = len(set(labels2)) - int(-1 in labels2) assert n_clusters1 == n_clusters2 @@ -354,8 +353,8 @@ def test_hdbscan_no_centroid_medoid_for_noise(): def test_hdbscan_allow_single_cluster_with_epsilon(): - np.random.seed(0) - no_structure = np.random.rand(150, 2) + rng = np.random.RandomState(0) + no_structure = rng.rand(150, 2) # without epsilon we should see many noise points as children of root. labels = HDBSCAN( min_cluster_size=5, From add3617805021d56cdea25059b7f4d9b8a9ce612 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 1 Apr 2022 16:10:50 -0400 Subject: [PATCH 056/160] Improved `badargs` test --- .../cluster/_hdbscan/tests/test_hdbscan.py | 61 +++++++++---------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index e4a1da3b05696..32cca4e2ad51c 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -6,10 +6,7 @@ from scipy.spatial import distance from scipy import sparse from scipy import stats -from sklearn.utils._testing import ( - assert_array_almost_equal, - assert_raises, -) +from sklearn.utils._testing import assert_array_almost_equal from sklearn.cluster import HDBSCAN, hdbscan from sklearn.cluster._hdbscan._validity import validity_index @@ -288,31 +285,31 @@ def test_hdbscan_boruvka_matches(tree): assert (num_mismatches / float(data.shape[0])) < 0.15 -def test_hdbscan_badargs(): - assert_raises(ValueError, hdbscan, X="fail") - assert_raises(ValueError, hdbscan, X=None) - assert_raises(ValueError, hdbscan, X, min_cluster_size="fail") - assert_raises(ValueError, hdbscan, X, min_samples="fail") - assert_raises(ValueError, hdbscan, X, min_samples=-1) - assert_raises(ValueError, hdbscan, X, metric="imperial") - assert_raises(ValueError, hdbscan, X, metric=None) - assert_raises( - ValueError, hdbscan, X, metric="precomputed", algorithm="boruvka_kdtree" - ) - assert_raises( - ValueError, hdbscan, X, metric="precomputed", algorithm="prims_kdtree" - ) - assert_raises( - ValueError, hdbscan, X, metric="precomputed", algorithm="prims_balltree" - ) - assert_raises( - ValueError, hdbscan, X, metric="precomputed", algorithm="boruvka_balltree" - ) - assert_raises(ValueError, hdbscan, X, alpha=-1) - assert_raises(ValueError, hdbscan, X, alpha="fail") - assert_raises(Exception, hdbscan, X, algorithm="something_else") - assert_raises(TypeError, hdbscan, X, metric="minkowski", p=None) - assert_raises(ValueError, hdbscan, X, leaf_size=0) +@pytest.mark.parametrize( + "kwargs, error", + [ + [{"X": "fail"}, ValueError], + [{"X": None}, ValueError], + [{"min_cluster_size": "fail"}, ValueError], + [{"min_samples": "fail"}, ValueError], + [{"min_samples": -1}, ValueError], + [{"metric": "imperial"}, ValueError], + [{"metric": None}, ValueError], + [{"metric": "precomputed", "algorithm": "boruvka_kdtree"}, ValueError], + [{"metric": "precomputed", "algorithm": "prims_kdtree"}, ValueError], + [{"metric": "precomputed", "algorithm": "boruvka_balltree"}, ValueError], + [{"metric": "precomputed", "algorithm": "prims_balltree"}, ValueError], + [{"alpha": -1}, ValueError], + [{"alpha": "fail"}, ValueError], + [{"leaf_size": 0}, ValueError], + [{"algorithm": "something_else"}, TypeError], + [{"metric": "minkowski", "metric_params": {"p": None}}, TypeError], + ], +) +def test_hdbscan_badargs(kwargs, error): + _X = kwargs.pop("X", X) + with 
pytest.raises(error): + hdbscan(_X, **kwargs) def test_hdbscan_sparse(): @@ -348,8 +345,10 @@ def test_hdbscan_centroids_medoids(): def test_hdbscan_no_centroid_medoid_for_noise(): clusterer = HDBSCAN().fit(X) - assert_raises(ValueError, clusterer.weighted_cluster_centroid, -1) - assert_raises(ValueError, clusterer.weighted_cluster_medoid, -1) + with pytest.raises(ValueError): + clusterer.weighted_cluster_centroid(-1) + with pytest.raises(ValueError): + clusterer.weighted_cluster_medoid(-1) def test_hdbscan_allow_single_cluster_with_epsilon(): From 0bf14914bee1c434f29b7128da6ac61d880f3216 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 1 Apr 2022 16:21:55 -0400 Subject: [PATCH 057/160] Minor wording change --- sklearn/cluster/_hdbscan/tests/test_hdbscan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index 32cca4e2ad51c..3480af801962f 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -1,6 +1,6 @@ """ Tests for HDBSCAN clustering algorithm -Shamelessly based on (i.e. ripped off from) the DBSCAN test code +Based on the DBSCAN test code """ import numpy as np from scipy.spatial import distance From 1f319601ba331cdd35b0c6a87f82a9d0ee8b3c13 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 1 Apr 2022 17:41:52 -0400 Subject: [PATCH 058/160] Made docstrings more uniform and set default metric to `euclidean` --- sklearn/cluster/_hdbscan/hdbscan_.py | 43 +++++++++++++--------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index a158c5ace041d..43e360ade0c63 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -71,13 +71,10 @@ def _hdbscan_generic( X, min_samples=5, alpha=1.0, - metric="minkowski", - p=2, + metric="euclidean", **metric_params, ): - if metric == "minkowski": - distance_matrix = pairwise_distances(X, metric=metric, p=p) - elif metric == "arccos": + if metric == "arccos": distance_matrix = pairwise_distances(X, metric="cosine", **metric_params) elif metric == "precomputed": # Treating this case explicitly, instead of letting @@ -184,7 +181,7 @@ def _hdbscan_prims_kdtree( X, min_samples=5, alpha=1.0, - metric="minkowski", + metric="euclidean", leaf_size=40, **metric_params, ): @@ -197,7 +194,6 @@ def _hdbscan_prims_kdtree( tree = KDTree(X, metric=metric, leaf_size=leaf_size, **metric_params) - # TO DO: Deal with p for minkowski appropriately dist_metric = DistanceMetric.get_metric(metric, **metric_params) # Get distance to kth nearest neighbour @@ -221,7 +217,7 @@ def _hdbscan_prims_balltree( X, min_samples=5, alpha=1.0, - metric="minkowski", + metric="euclidean", leaf_size=40, **metric_params, ): @@ -254,7 +250,7 @@ def _hdbscan_prims_balltree( def _hdbscan_boruvka_kdtree( X, min_samples=5, - metric="minkowski", + metric="euclidean", leaf_size=40, approx_min_span_tree=True, n_jobs=4, @@ -300,7 +296,7 @@ def _hdbscan_boruvka_kdtree( def _hdbscan_boruvka_balltree( X, min_samples=5, - metric="minkowski", + metric="euclidean", leaf_size=40, approx_min_span_tree=True, n_jobs=4, @@ -437,7 +433,7 @@ def hdbscan( alpha=1.0, cluster_selection_epsilon=0.0, max_cluster_size=0, - metric="minkowski", + metric="euclidean", leaf_size=40, algorithm="best", memory=None, @@ -465,7 +461,7 @@ def hdbscan( min_samples : int, default=None The number of samples in a neighborhood for a 
point to be considered as a core point. This includes the point itself. - defaults to the min_cluster_size. + defaults to the `min_cluster_size`. alpha : float, default=1.0 A distance scaling parameter as used in robust single linkage. @@ -476,20 +472,20 @@ def hdbscan( See [3]_ for more information. max_cluster_size : int, default=0 - A limit to the size of clusters returned by the eom algorithm. - Has no effect when using leaf clustering (where clusters are - usually small regardless) and can also be overridden in rare - cases by a high value for cluster_selection_epsilon. + A limit to the size of clusters returned by the `eom` cluster selection + algorithm. Has no effect if `cluster_selection_method=leaf`. Can be + overridden in rare cases by a high value for + `cluster_selection_epsilon`. metric : str or callable, default='minkowski' The metric to use when calculating distance between instances in a feature array. - * If metric is a string or callable, it must be one of + - If metric is a string or callable, it must be one of the options allowed by `metrics.pairwise.pairwise_distances` for its metric parameter. - * If metric is "precomputed", X is assumed to be a distance matrix and + - If metric is "precomputed", X is assumed to be a distance matrix and must be square. leaf_size : int, default=40 @@ -781,13 +777,14 @@ class HDBSCAN(ClusterMixin, BaseEstimator): Parameters ---------- min_cluster_size : int, default=5 - The minimum size of clusters; single linkage splits that contain - fewer points than this will be considered points "falling out" of a - cluster rather than a cluster splitting into two new clusters. + The minimum number of samples in a group for that group to be + considered a cluster; groupings smaller than this size will be left + as noise. min_samples : int, default=None - The number of samples in a neighbourhood for a point to be - considered a core point. + The number of samples in a neighborhood for a point + to be considered as a core point. This includes the point itself. + defaults to the `min_cluster_size`. cluster_selection_epsilon : float, default=0.0 A distance threshold. Clusters below this value will be merged. From e7291a8384d6481b9cda06a8bd31eb9424ae7ce0 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 1 Apr 2022 17:53:51 -0400 Subject: [PATCH 059/160] Improved plotting w/ perturbation examples --- examples/cluster/plot_hdbscan.py | 80 ++++++++++++++++++-------------- 1 file changed, 46 insertions(+), 34 deletions(-) diff --git a/examples/cluster/plot_hdbscan.py b/examples/cluster/plot_hdbscan.py index fb26302cd3808..13e0072a41740 100644 --- a/examples/cluster/plot_hdbscan.py +++ b/examples/cluster/plot_hdbscan.py @@ -27,46 +27,58 @@ # %% # Compute HDBSCAN # --------------- -hdb = HDBSCAN().fit(X) -labels = hdb.labels_ +KWARGS = ({}, {"min_samples": 2}, {"min_cluster_size": 25}) +models = [] -# Number of clusters in labels, ignoring noise if present. 
-n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) -n_noise_ = list(labels).count(-1) +for kwargs in KWARGS: + hdb = HDBSCAN(**kwargs).fit(X) + models.append((hdb.labels_, hdb.probabilities_, kwargs)) + labels = hdb.labels_ -print("Estimated number of clusters: %d" % n_clusters_) -print("Estimated number of noise points: %d" % n_noise_) -print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) -print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels)) -print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)) -print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels)) -print( - "Adjusted Mutual Information: %0.3f" - % metrics.adjusted_mutual_info_score(labels_true, labels) -) -print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels)) + # Number of clusters in labels, ignoring noise + n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) + n_noise_ = list(labels).count(-1) + print(f"\nFor {kwargs=}") + print("Estimated number of clusters: %d" % n_clusters_) + print("Estimated number of noise points: %d" % n_noise_) + print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) + print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels)) + print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)) + print( + "Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels) + ) + print( + "Adjusted Mutual Information: %0.3f" + % metrics.adjusted_mutual_info_score(labels_true, labels) + ) + print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels)) # %% # Plot result # ----------- import matplotlib.pyplot as plt -# Black removed and is used for noise instead. -unique_labels = set(labels) -colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))] -for k, col in zip(unique_labels, colors): - if k == -1: - # Black used for noise. - col = [0, 0, 0, 1] +for labels, probabilities, kwargs in models: + # Black removed and is used for noise instead. + unique_labels = set(labels) + colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))] + # The probability of a point belonging to its labeled cluster determines + # the size of its marker + proba_map = {idx: probabilities[idx] for idx in range(len(labels))} + for k, col in zip(unique_labels, colors): + if k == -1: + # Black used for noise. 
+ col = [0, 0, 0, 1] - xy = X[labels == k] - plt.plot( - xy[:, 0], - xy[:, 1], - "o", - markerfacecolor=tuple(col), - markeredgecolor="k", - markersize=8, - ) -plt.title("Estimated number of clusters: %d" % n_clusters_) -plt.show() + class_index = np.where(labels == k)[0] + for ci in class_index: + plt.plot( + X[ci, 0], + X[ci, 1], + "x" if k == -1 else "o", + markerfacecolor=tuple(col), + markeredgecolor="k", + markersize=4 if k == -1 else 1 + 5 * proba_map[ci], + ) + plt.title(f"Estimated number of clusters: {n_clusters_} | {kwargs=}") + plt.show() From b7aca9e7d8bb77c1619000524413332e049e7d65 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sat, 2 Apr 2022 13:58:16 -0400 Subject: [PATCH 060/160] Updated clustering plots for gallery page rendering --- examples/cluster/plot_hdbscan.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/cluster/plot_hdbscan.py b/examples/cluster/plot_hdbscan.py index 13e0072a41740..9a417355bb930 100644 --- a/examples/cluster/plot_hdbscan.py +++ b/examples/cluster/plot_hdbscan.py @@ -59,6 +59,7 @@ import matplotlib.pyplot as plt for labels, probabilities, kwargs in models: + _, ax = plt.subplots() # Black removed and is used for noise instead. unique_labels = set(labels) colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))] @@ -72,7 +73,7 @@ class_index = np.where(labels == k)[0] for ci in class_index: - plt.plot( + ax.plot( X[ci, 0], X[ci, 1], "x" if k == -1 else "o", @@ -80,5 +81,5 @@ markeredgecolor="k", markersize=4 if k == -1 else 1 + 5 * proba_map[ci], ) - plt.title(f"Estimated number of clusters: {n_clusters_} | {kwargs=}") + ax.set_title(f"Estimated number of clusters: {n_clusters_} | {kwargs=}") plt.show() From 3d719d9e48e638750ea393fcbf0d6bcdb1545420 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 17 Apr 2022 14:48:31 -0400 Subject: [PATCH 061/160] Improved plotting example --- examples/cluster/plot_hdbscan.py | 236 ++++++++++++++++++++++++------- 1 file changed, 188 insertions(+), 48 deletions(-) diff --git a/examples/cluster/plot_hdbscan.py b/examples/cluster/plot_hdbscan.py index 9a417355bb930..ed815c6ae12cd 100644 --- a/examples/cluster/plot_hdbscan.py +++ b/examples/cluster/plot_hdbscan.py @@ -4,61 +4,24 @@ Demo of HDBSCAN clustering algorithm ==================================== +In this demo we will take a look at :class:`sklearn.cluster.HDBSCAN` from the +perspective of generalizing the :class:`sklearn.cluster.DBSCAN` algorithm. +We'll compare both algorithms on specific datasets. Finally we'll evaluate +HDBSCAN's sensitivity to certain hyperparameters. We first define a couple +utility functions for convenience. 
""" import numpy as np -from sklearn.cluster import HDBSCAN +from sklearn.cluster import HDBSCAN, DBSCAN from sklearn import metrics from sklearn.datasets import make_blobs -from sklearn.preprocessing import StandardScaler - - -# %% -# Generate sample data -# -------------------- -centers = [[1, 1], [-1, -1], [1, -1]] -X, labels_true = make_blobs( - n_samples=750, centers=centers, cluster_std=0.4, random_state=0 -) - -X = StandardScaler().fit_transform(X) - -# %% -# Compute HDBSCAN -# --------------- -KWARGS = ({}, {"min_samples": 2}, {"min_cluster_size": 25}) -models = [] - -for kwargs in KWARGS: - hdb = HDBSCAN(**kwargs).fit(X) - models.append((hdb.labels_, hdb.probabilities_, kwargs)) - labels = hdb.labels_ - - # Number of clusters in labels, ignoring noise - n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) - n_noise_ = list(labels).count(-1) - - print(f"\nFor {kwargs=}") - print("Estimated number of clusters: %d" % n_clusters_) - print("Estimated number of noise points: %d" % n_noise_) - print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) - print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels)) - print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)) - print( - "Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels) - ) - print( - "Adjusted Mutual Information: %0.3f" - % metrics.adjusted_mutual_info_score(labels_true, labels) - ) - print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels)) -# %% -# Plot result -# ----------- import matplotlib.pyplot as plt -for labels, probabilities, kwargs in models: + +def plot(X, labels=None, probabilities=None, kwargs=None, ground_truth=False): + labels = labels if labels is not None else np.ones(X.shape[0]) + probabilities = probabilities if probabilities is not None else np.ones(X.shape[0]) _, ax = plt.subplots() # Black removed and is used for noise instead. unique_labels = set(labels) @@ -81,5 +44,182 @@ markeredgecolor="k", markersize=4 if k == -1 else 1 + 5 * proba_map[ci], ) - ax.set_title(f"Estimated number of clusters: {n_clusters_} | {kwargs=}") + n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) + preamble = "True" if ground_truth else "Estimated" + title = f"{preamble} number of clusters: {n_clusters_}" + if kwargs is not None: + title += f" | {kwargs=}" + ax.set_title(title) plt.show() + + +def print_scores(labels, labels_true, kwargs): + # Number of clusters in labels, ignoring noise + n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) + n_noise_ = list(labels).count(-1) + print(f"\nFor {kwargs=}") + print("Estimated number of clusters: %d" % n_clusters_) + print("Estimated number of noise points: %d" % n_noise_) + print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) + print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels)) + print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)) + print( + "Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels) + ) + print( + "Adjusted Mutual Information: %0.3f" + % metrics.adjusted_mutual_info_score(labels_true, labels) + ) + print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels)) + + +# %% +# Generate sample data +# -------------------- +# One of the greatest advantages of HDBSCAN over DBSCAN is its out-of-the-box +# robustness. It's especially remarkable on heterogenous mixtures of data. 
+# Like DBSCAN, it can model arbitrary shapes and distributions, however unlike +# DBSCAN it does not require specification of an arbitray (and indeed tricky) +# `eps` hyperparameter. For example, below we generate a dataset composed of +# a mixture of three diagonal Gaussians. +centers = [[1, 1], [-1, -1], [1.5, -1.5]] +X, labels_true = make_blobs( + n_samples=750, centers=centers, cluster_std=[0.4, 0.1, 0.75], random_state=0 +) +plot(X, labels=labels_true, ground_truth=True) +# %% +# Scale Invariance +# ----------------- +# It's worth remembering that, while DBSCAN provides a default value for `eps` +# parameter, it is entirely meaningless and must be tuned for your specific +# dataset. As a simple demonstration, consider what happens when we find an +# epsilon value that works for one dataset, and try to apply it to a +# similar but rescaled versions of the dataset. Below are plots of the original +# dataset, and version rescaled by 0.5 and 3 respectively. +dbs = DBSCAN(eps=0.3).fit(X) +plot(X, dbs.labels_, np.ones_like(dbs.labels_), {"eps": 0.3}) +dbs.fit(0.5 * X) +plot(0.5 * X, dbs.labels_, np.ones_like(dbs.labels_), {"eps": 0.3}) +dbs.fit(3 * X) +plot(3 * X, dbs.labels_, np.ones_like(dbs.labels_), {"eps": 0.3}) + +# %% +# Indeed, in order to maintain the same results we would have to scale `eps` by +# the same factor. +dbs = DBSCAN(eps=0.9).fit(3 * X) +plot(3 * X, dbs.labels_, np.ones_like(dbs.labels_), {"eps": 0.9}) + +# %% +# While standardizing data (e.g. using +# :class:`sklearn.preprocessing.StandardScaler`) helps mitigate this problem, +# great care must be taken to select the appropriate value for `eps`. HDBSCAN +# is much more robust in this sense. HDBSCAN can be seen as clustering over +# all possible values of `eps` and extracting the best clusters from all +# possible clusters (see :ref:`HDBSCAN`). One immediate advantage is that +# HDBSCAN is scale-invariant. +hdb = HDBSCAN().fit(X) +plot(X, hdb.labels_, hdb.probabilities_) +hdb.fit(0.5 * X) +plot(0.5 * X, hdb.labels_, hdb.probabilities_) +hdb.fit(3 * X) +plot(3 * X, hdb.labels_, hdb.probabilities_) + +# %% +# Multi-Scale Clustering +# ---------------------- +# HDBSCAN is much more than scale invariant though -- it is capable of +# multi-scale clustering, which accounts for clusters with varying density. +# Traditional DBSCAN assumes that any potential clusters are homogenous in +# density. HDBSCAN is free from such constraints. To demonstrate this we +# consider the following dataset +centers = [[-0.85, -0.85], [-0.85, 0.85], [3, 3], [3, -3]] +X, labels_true = make_blobs( + n_samples=750, centers=centers, cluster_std=[0.2, 0.35, 1.35, 1.35], random_state=0 +) +plot(X, labels=labels_true, ground_truth=True) + +# %% +# This dataset is more difficult for DBSCAN due to the varying densities and +# spatial separation. If `eps` is too large then we risk falsely clustering the +# two dense clusters as one since their mutual reachability will extend across +# clusters. If `eps` is too small, then we risk fragmenting the sparser +# clusters into many false clusters. Not to mention this requires manually +# tuning choices of `eps` until we find a tradeoff that we are comfortable +# with. Let's see how DBSCAN tackles this. 
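The comments above lean on the notion of "mutual reachability" without spelling it out. Below is a minimal NumPy sketch of that quantity, assuming a small dense array such as the `X` built earlier in this example; it is only the defining formula, not the optimized Cython path the estimator actually uses.

    import numpy as np
    from sklearn.metrics import pairwise_distances

    def mutual_reachability_sketch(X, min_samples=5):
        # All pairwise distances; each row also contains the point's zero
        # self-distance.
        dist = pairwise_distances(X)
        # Core distance: the `min_samples`-th smallest entry of each row.
        core = np.partition(dist, min_samples, axis=1)[:, min_samples]
        # d_mreach(a, b) = max(core(a), core(b), d(a, b))
        return np.maximum(np.maximum(core[:, None], core[None, :]), dist)

Larger `min_samples` inflates the core distances, which is one way to see why it acts as a density-smoothing knob in the experiments that follow.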
+kwargs = {"eps": 0.7}
+dbs = DBSCAN(**kwargs).fit(X)
+plot(X, dbs.labels_, kwargs=kwargs)
+kwargs = {"eps": 0.3}
+dbs = DBSCAN(**kwargs).fit(X)
+plot(X, dbs.labels_, kwargs=kwargs)
+
+# %%
+# To properly cluster the two dense clusters, we would need a smaller value of
+# epsilon, however at `eps=0.3` we are already fragmenting the sparse clusters,
+# which would only become more severe as we decrease epsilon. Indeed it seems
+# that DBSCAN is incapable of simultaneously separating the two dense clusters
+# while preventing the sparse clusters from fragmenting. Let's compare with
+# HDBSCAN.
+hdb = HDBSCAN().fit(X)
+plot(X, hdb.labels_, hdb.probabilities_)
+
+# %%
+# HDBSCAN is able to pick up and preserve the multi-scale structure of the
+# dataset, all the while requiring no parameter tuning. Of course in practice
+# on any sufficiently interesting dataset, there will be some tuning required,
+# but this demonstrates the fact that HDBSCAN can yield an entire class of
+# solutions that are inaccessible to DBSCAN without nearly as much manual
+# intervention and tuning.
+
+# %%
+# Hyperparameter Robustness
+# -------------------------
+# Ultimately tuning will be an important step in any real world application, so
+# let's take a look at some of the most important hyperparameters for HDBSCAN.
+# While HDBSCAN is free from the `eps` parameter of DBSCAN, it does still have
+# some hyperparameters like `min_cluster_size` and `min_samples` which tune its
+# sense of density. We will however see that HDBSCAN is relatively robust to
+# these parameters, and these parameters hold clear semantic meaning which help
+# in tuning them.
+#
+# `min_cluster_size`
+# ^^^^^^^^^^^^^^^^^^
+# This hyperparameter is the minimum number of samples in a group for that
+# group to be considered a cluster; groupings smaller than this size will be
+# left as noise. The default value is 5. This parameter is generally tuned to
+# larger values as needed. Smaller values will likely lead to results with
+# fewer points labeled as noise, however values too small will lead to false
+# sub-clusters being picked up and preferred. Larger values tend to be more
+# robust w.r.t. noisy datasets, e.g. high-variance clusters with significant
+# overlap.
+
+
+KWARGS = ({"min_cluster_size": 5}, {"min_cluster_size": 3}, {"min_cluster_size": 25})
+for kwargs in KWARGS:
+    hdb = HDBSCAN(**kwargs).fit(X)
+    labels = hdb.labels_
+
+    plot(X, labels, hdb.probabilities_, kwargs)
+    print_scores(labels, labels_true, kwargs)
+
+# %%
+# `min_samples`
+# ^^^^^^^^^^^^^
+# This hyperparameter is the number of samples in a neighborhood for a point to
+# be considered as a core point. This includes the point itself, and defaults
+# to `min_cluster_size`. Similarly to `min_cluster_size`, larger values
+# increase the model's robustness to noise, but risk ignoring or discarding
+# potentially valid but small clusters. Best tuned after finding a good value
+# for `min_cluster_size`.
+ +KWARGS = ( + {"min_cluster_size": 20, "min_samples": 5}, + {"min_cluster_size": 20, "min_samples": 3}, + {"min_cluster_size": 20, "min_samples": 25}, +) +for kwargs in KWARGS: + hdb = HDBSCAN(**kwargs).fit(X) + labels = hdb.labels_ + + plot(X, labels, hdb.probabilities_, kwargs) + print_scores(labels, labels_true, kwargs) From daf1b2fcda57a5396ad35f30e47aa1d4b51bec1c Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 17 Apr 2022 15:02:59 -0400 Subject: [PATCH 062/160] Updated User-Guide entry for new plotting example --- doc/modules/clustering.rst | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 01237e409f45e..b45eff3b45fdc 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -964,10 +964,14 @@ and as such it no longer needs `eps` to be given as a hyperparameter. Instead it relies solely on the choice of `min_samples`, which tends to be a more robust hyperparameter. -.. |hdbscan_results| image:: ../auto_examples/cluster/images/sphx_glr_plot_hdbscan_001.png +.. |hdbscan_ground_truth| image:: ../auto_examples/cluster/images/sphx_glr_plot_hdbscan_001.png :target: ../auto_examples/cluster/plot_hdbscan.html - :scale: 50 + :scale: 75 +.. |hdbscan_results| image:: ../auto_examples/cluster/images/sphx_glr_plot_hdbscan_006.png + :target: ../auto_examples/cluster/plot_hdbscan.html + :scale: 75 +.. centered:: |hdbscan_ground_truth| .. centered:: |hdbscan_results| HDBSCAN can be smoothed with an additional hyperparameter `min_cluster_size` From 4ddaddf6db27c05ee799f9e06fbcb52f0eb8325f Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 17 Apr 2022 15:53:57 -0400 Subject: [PATCH 063/160] Typo fix --- examples/cluster/plot_hdbscan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_hdbscan.py b/examples/cluster/plot_hdbscan.py index ed815c6ae12cd..69be52f9b10ac 100644 --- a/examples/cluster/plot_hdbscan.py +++ b/examples/cluster/plot_hdbscan.py @@ -95,7 +95,7 @@ def print_scores(labels, labels_true, kwargs): # dataset. As a simple demonstration, consider what happens when we find an # epsilon value that works for one dataset, and try to apply it to a # similar but rescaled versions of the dataset. Below are plots of the original -# dataset, and version rescaled by 0.5 and 3 respectively. +# dataset, and versions rescaled by 0.5 and 3 respectively. dbs = DBSCAN(eps=0.3).fit(X) plot(X, dbs.labels_, np.ones_like(dbs.labels_), {"eps": 0.3}) dbs.fit(0.5 * X) From fa1d30f3a219231a7a45d72734f5dab870714dd3 Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Sun, 8 May 2022 18:30:56 -0400 Subject: [PATCH 064/160] Applied plotting demo review feedback Co-authored-by: Thomas J. Fan --- examples/cluster/plot_hdbscan.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/examples/cluster/plot_hdbscan.py b/examples/cluster/plot_hdbscan.py index 69be52f9b10ac..1e4d74ea98112 100644 --- a/examples/cluster/plot_hdbscan.py +++ b/examples/cluster/plot_hdbscan.py @@ -10,7 +10,7 @@ HDBSCAN's sensitivity to certain hyperparameters. We first define a couple utility functions for convenience. 
""" - +# %% import numpy as np from sklearn.cluster import HDBSCAN, DBSCAN @@ -19,10 +19,11 @@ import matplotlib.pyplot as plt -def plot(X, labels=None, probabilities=None, kwargs=None, ground_truth=False): +def plot(X, labels=None, probabilities=None, kwargs=None, ground_truth=False, ax=None): + if ax is None: + _, ax = plt.subplots() labels = labels if labels is not None else np.ones(X.shape[0]) probabilities = probabilities if probabilities is not None else np.ones(X.shape[0]) - _, ax = plt.subplots() # Black removed and is used for noise instead. unique_labels = set(labels) colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))] @@ -48,9 +49,9 @@ def plot(X, labels=None, probabilities=None, kwargs=None, ground_truth=False): preamble = "True" if ground_truth else "Estimated" title = f"{preamble} number of clusters: {n_clusters_}" if kwargs is not None: - title += f" | {kwargs=}" + kwargs_str = ", ".join(f"{k}={v}" for k, v in kwargs.items()) + title += f" | {kwargs_str}" ax.set_title(title) - plt.show() def print_scores(labels, labels_true, kwargs): @@ -96,18 +97,20 @@ def print_scores(labels, labels_true, kwargs): # epsilon value that works for one dataset, and try to apply it to a # similar but rescaled versions of the dataset. Below are plots of the original # dataset, and versions rescaled by 0.5 and 3 respectively. -dbs = DBSCAN(eps=0.3).fit(X) -plot(X, dbs.labels_, np.ones_like(dbs.labels_), {"eps": 0.3}) +fig, axes = plt.subplots(3, 1, figsize=(12, 16)) +parameters = {"eps": 0.3} +dbs = DBSCAN(**parameters).fit(X) +plot(X, dbs.labels_, kwargs=parameters, ax=axes[0]) dbs.fit(0.5 * X) -plot(0.5 * X, dbs.labels_, np.ones_like(dbs.labels_), {"eps": 0.3}) +plot(0.5 * X, dbs.labels_, kwargs=parameters, ax=axes[1]) dbs.fit(3 * X) -plot(3 * X, dbs.labels_, np.ones_like(dbs.labels_), {"eps": 0.3}) +plot(3 * X, dbs.labels_, kwargs=parameters, ax=axes[2]) # %% # Indeed, in order to maintain the same results we would have to scale `eps` by # the same factor. dbs = DBSCAN(eps=0.9).fit(3 * X) -plot(3 * X, dbs.labels_, np.ones_like(dbs.labels_), {"eps": 0.9}) +plot(3 * X, dbs.labels_, kwargs={"eps": 0.9}) # %% # While standardizing data (e.g. 
using From 8f7f60bc1a584cba9c0f8b48cc1b8f210b885aef Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 8 May 2022 19:08:35 -0400 Subject: [PATCH 065/160] Streamlined and improved plotting demo per review feedback --- examples/cluster/plot_hdbscan.py | 90 ++++++++++++++------------------ 1 file changed, 38 insertions(+), 52 deletions(-) diff --git a/examples/cluster/plot_hdbscan.py b/examples/cluster/plot_hdbscan.py index 1e4d74ea98112..16a6b8c5c3ac2 100644 --- a/examples/cluster/plot_hdbscan.py +++ b/examples/cluster/plot_hdbscan.py @@ -14,12 +14,13 @@ import numpy as np from sklearn.cluster import HDBSCAN, DBSCAN -from sklearn import metrics from sklearn.datasets import make_blobs import matplotlib.pyplot as plt -def plot(X, labels=None, probabilities=None, kwargs=None, ground_truth=False, ax=None): +def plot( + X, labels=None, probabilities=None, parameters=None, ground_truth=False, ax=None +): if ax is None: _, ax = plt.subplots() labels = labels if labels is not None else np.ones(X.shape[0]) @@ -48,32 +49,12 @@ def plot(X, labels=None, probabilities=None, kwargs=None, ground_truth=False, ax n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) preamble = "True" if ground_truth else "Estimated" title = f"{preamble} number of clusters: {n_clusters_}" - if kwargs is not None: - kwargs_str = ", ".join(f"{k}={v}" for k, v in kwargs.items()) - title += f" | {kwargs_str}" + if parameters is not None: + parameters_str = ", ".join(f"{k}={v}" for k, v in parameters.items()) + title += f" | {parameters_str}" ax.set_title(title) -def print_scores(labels, labels_true, kwargs): - # Number of clusters in labels, ignoring noise - n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) - n_noise_ = list(labels).count(-1) - print(f"\nFor {kwargs=}") - print("Estimated number of clusters: %d" % n_clusters_) - print("Estimated number of noise points: %d" % n_noise_) - print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) - print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels)) - print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)) - print( - "Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels) - ) - print( - "Adjusted Mutual Information: %0.3f" - % metrics.adjusted_mutual_info_score(labels_true, labels) - ) - print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels)) - - # %% # Generate sample data # -------------------- @@ -83,11 +64,12 @@ def print_scores(labels, labels_true, kwargs): # DBSCAN it does not require specification of an arbitray (and indeed tricky) # `eps` hyperparameter. For example, below we generate a dataset composed of # a mixture of three diagonal Gaussians. 
+fig, axis = plt.subplots(1, 1, figsize=(12, 5)) centers = [[1, 1], [-1, -1], [1.5, -1.5]] X, labels_true = make_blobs( n_samples=750, centers=centers, cluster_std=[0.4, 0.1, 0.75], random_state=0 ) -plot(X, labels=labels_true, ground_truth=True) +plot(X, labels=labels_true, ground_truth=True, ax=axis) # %% # Scale Invariance # ----------------- @@ -100,17 +82,18 @@ def print_scores(labels, labels_true, kwargs): fig, axes = plt.subplots(3, 1, figsize=(12, 16)) parameters = {"eps": 0.3} dbs = DBSCAN(**parameters).fit(X) -plot(X, dbs.labels_, kwargs=parameters, ax=axes[0]) +plot(X, dbs.labels_, parameters=parameters, ax=axes[0]) dbs.fit(0.5 * X) -plot(0.5 * X, dbs.labels_, kwargs=parameters, ax=axes[1]) +plot(0.5 * X, dbs.labels_, parameters=parameters, ax=axes[1]) dbs.fit(3 * X) -plot(3 * X, dbs.labels_, kwargs=parameters, ax=axes[2]) +plot(3 * X, dbs.labels_, parameters=parameters, ax=axes[2]) # %% # Indeed, in order to maintain the same results we would have to scale `eps` by # the same factor. +fig, axis = plt.subplots(1, 1, figsize=(12, 5)) dbs = DBSCAN(eps=0.9).fit(3 * X) -plot(3 * X, dbs.labels_, kwargs={"eps": 0.9}) +plot(3 * X, dbs.labels_, parameters={"eps": 0.9}, ax=axis) # %% # While standardizing data (e.g. using @@ -120,12 +103,13 @@ def print_scores(labels, labels_true, kwargs): # all possible values of `eps` and extracting the best clusters from all # possible clusters (see :ref:`HDBSCAN`). One immediate advantage is that # HDBSCAN is scale-invariant. +fig, axes = plt.subplots(3, 1, figsize=(12, 16)) hdb = HDBSCAN().fit(X) -plot(X, hdb.labels_, hdb.probabilities_) +plot(X, hdb.labels_, hdb.probabilities_, ax=axes[0]) hdb.fit(0.5 * X) -plot(0.5 * X, hdb.labels_, hdb.probabilities_) +plot(0.5 * X, hdb.labels_, hdb.probabilities_, ax=axes[1]) hdb.fit(3 * X) -plot(3 * X, hdb.labels_, hdb.probabilities_) +plot(3 * X, hdb.labels_, hdb.probabilities_, ax=axes[2]) # %% # Multi-Scale Clustering @@ -135,11 +119,12 @@ def print_scores(labels, labels_true, kwargs): # Traditional DBSCAN assumes that any potential clusters are homogenous in # density. HDBSCAN is free from such constraints. To demonstrate this we # consider the following dataset +fig, axis = plt.subplots(1, 1, figsize=(12, 5)) centers = [[-0.85, -0.85], [-0.85, 0.85], [3, 3], [3, -3]] X, labels_true = make_blobs( n_samples=750, centers=centers, cluster_std=[0.2, 0.35, 1.35, 1.35], random_state=0 ) -plot(X, labels=labels_true, ground_truth=True) +plot(X, labels=labels_true, ground_truth=True, ax=axis) # %% # This dataset is more difficult for DBSCAN due to the varying densities and @@ -149,12 +134,13 @@ def print_scores(labels, labels_true, kwargs): # clusters into many false clusters. Not to mention this requires manually # tuning choices of `eps` until we find a tradeoff that we are comfortable # with. Let's see how DBSCAN tackles this. 
-kwargs = {"eps": 0.7}
-dbs = DBSCAN(**kwargs).fit(X)
-plot(X, dbs.labels_, kwargs=kwargs)
-kwargs = {"eps": 0.3}
-dbs = DBSCAN(**kwargs).fit(X)
-plot(X, dbs.labels_, kwargs=kwargs)
+fig, axes = plt.subplots(2, 1, figsize=(12, 10))
+params = {"eps": 0.7}
+dbs = DBSCAN(**params).fit(X)
+plot(X, dbs.labels_, parameters=params, ax=axes[0])
+params = {"eps": 0.3}
+dbs = DBSCAN(**params).fit(X)
+plot(X, dbs.labels_, parameters=params, ax=axes[1])
 
 # %%
 # To properly cluster the two dense clusters, we would need a smaller value of
 # epsilon, however at `eps=0.3` we are already fragmenting the sparse clusters,
 # which would only become more severe as we decrease epsilon. Indeed it seems
 # that DBSCAN is incapable of simultaneously separating the two dense clusters
 # while preventing the sparse clusters from fragmenting. Let's compare with
 # HDBSCAN.
+fig, axis = plt.subplots(1, 1, figsize=(12, 5))
 hdb = HDBSCAN().fit(X)
-plot(X, hdb.labels_, hdb.probabilities_)
+plot(X, hdb.labels_, hdb.probabilities_, ax=axis)
 
 # %%
 # HDBSCAN is able to pick up and preserve the multi-scale structure of the
 # dataset, all the while requiring no parameter tuning. Of course in practice
 # on any sufficiently interesting dataset, there will be some tuning required,
 # but this demonstrates the fact that HDBSCAN can yield an entire class of
 # solutions that are inaccessible to DBSCAN without nearly as much manual
 # intervention and tuning.
 
 # %%
 # Hyperparameter Robustness
 # -------------------------
 # Ultimately tuning will be an important step in any real world application, so
 # let's take a look at some of the most important hyperparameters for HDBSCAN.
 # While HDBSCAN is free from the `eps` parameter of DBSCAN, it does still have
 # some hyperparameters like `min_cluster_size` and `min_samples` which tune its
 # sense of density. We will however see that HDBSCAN is relatively robust to
 # these parameters, and these parameters hold clear semantic meaning which help
 # in tuning them.
 #
 # `min_cluster_size`
 # ^^^^^^^^^^^^^^^^^^
 # This hyperparameter is the minimum number of samples in a group for that
 # group to be considered a cluster; groupings smaller than this size will be
 # left as noise. The default value is 5. This parameter is generally tuned to
 # larger values as needed. Smaller values will likely lead to results with
 # fewer points labeled as noise, however values too small will lead to false
 # sub-clusters being picked up and preferred. Larger values tend to be more
 # robust w.r.t. noisy datasets, e.g. high-variance clusters with significant
 # overlap.
 
-
-KWARGS = ({"min_cluster_size": 5}, {"min_cluster_size": 3}, {"min_cluster_size": 25})
-for kwargs in KWARGS:
-    hdb = HDBSCAN(**kwargs).fit(X)
-    labels = hdb.labels_
-
-    plot(X, labels, hdb.probabilities_, kwargs)
-    print_scores(labels, labels_true, kwargs)
+PARAM = ({"min_cluster_size": 5}, {"min_cluster_size": 3}, {"min_cluster_size": 25})
+fig, axes = plt.subplots(3, 1, figsize=(12, 16))
+for i, param in enumerate(PARAM):
+    hdb = HDBSCAN(**param).fit(X)
+    labels = hdb.labels_
+
+    plot(X, labels, hdb.probabilities_, param, ax=axes[i])
 
 # %%
 # `min_samples`
 # ^^^^^^^^^^^^^
 # This hyperparameter is the number of samples in a neighborhood for a point to
 # be considered as a core point. This includes the point itself, and defaults
 # to `min_cluster_size`. Similarly to `min_cluster_size`, larger values
 # increase the model's robustness to noise, but risk ignoring or discarding
 # potentially valid but small clusters. Best tuned after finding a good value
 # for `min_cluster_size`.
-KWARGS = ( +PARAM = ( {"min_cluster_size": 20, "min_samples": 5}, {"min_cluster_size": 20, "min_samples": 3}, {"min_cluster_size": 20, "min_samples": 25}, ) -for kwargs in KWARGS: - hdb = HDBSCAN(**kwargs).fit(X) +fig, axes = plt.subplots(3, 1, figsize=(12, 16)) +for i, param in enumerate(PARAM): + hdb = HDBSCAN(**param).fit(X) labels = hdb.labels_ - plot(X, labels, hdb.probabilities_, kwargs) - print_scores(labels, labels_true, kwargs) + plot(X, labels, hdb.probabilities_, param, ax=axes[i]) From 6c5f936aee3d9c438ffe65d9be06a6aaef7dc6b0 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 8 May 2022 19:09:38 -0400 Subject: [PATCH 066/160] Removed default arg for labels --- examples/cluster/plot_hdbscan.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/cluster/plot_hdbscan.py b/examples/cluster/plot_hdbscan.py index 16a6b8c5c3ac2..c25dd0cef67d5 100644 --- a/examples/cluster/plot_hdbscan.py +++ b/examples/cluster/plot_hdbscan.py @@ -18,9 +18,7 @@ import matplotlib.pyplot as plt -def plot( - X, labels=None, probabilities=None, parameters=None, ground_truth=False, ax=None -): +def plot(X, labels, probabilities=None, parameters=None, ground_truth=False, ax=None): if ax is None: _, ax = plt.subplots() labels = labels if labels is not None else np.ones(X.shape[0]) From e0daeb7349643b61a577e1b5ee0a4ba39cbf401b Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 8 May 2022 19:15:28 -0400 Subject: [PATCH 067/160] Removed `match_reference_implementation` arg --- sklearn/cluster/_hdbscan/_hdbscan_tree.pyx | 21 +------ sklearn/cluster/_hdbscan/hdbscan_.py | 70 ---------------------- 2 files changed, 3 insertions(+), 88 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx b/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx index 31726cc900bea..2c1bc3991c785 100644 --- a/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx +++ b/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx @@ -418,8 +418,7 @@ cdef np.ndarray[np.intp_t, ndim=1] do_labelling( set clusters, dict cluster_label_map, np.intp_t allow_single_cluster, - np.double_t cluster_selection_epsilon, - np.intp_t match_reference_implementation): + np.double_t cluster_selection_epsilon): cdef np.intp_t root_cluster cdef np.ndarray[np.intp_t, ndim=1] result_arr @@ -468,15 +467,7 @@ cdef np.ndarray[np.intp_t, ndim=1] do_labelling( else: result[n] = -1 else: - if match_reference_implementation: - point_lambda = lambda_array[child_array == n][0] - cluster_lambda = lambda_array[child_array == cluster][0] - if point_lambda > cluster_lambda: - result[n] = cluster_label_map[cluster] - else: - result[n] = -1 - else: - result[n] = cluster_label_map[cluster] + result[n] = cluster_label_map[cluster] return result_arr @@ -579,7 +570,6 @@ cpdef set epsilon_search(set leaves, np.ndarray cluster_tree, np.double_t cluste cpdef tuple get_clusters(np.ndarray tree, dict stability, cluster_selection_method='eom', allow_single_cluster=False, - match_reference_implementation=False, cluster_selection_epsilon=0.0, max_cluster_size=0): """Given a tree and stability dict, produce the cluster labels @@ -603,10 +593,6 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability, Whether to allow a single cluster to be selected by the Excess of Mass algorithm. - match_reference_implementation : boolean, optional (default False) - Whether to match the reference implementation in how to handle - certain edge cases. - cluster_selection_epsilon: float, optional (default 0.0) A distance threshold for cluster splits. 
@@ -718,8 +704,7 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability, reverse_cluster_map = {n: c for c, n in cluster_map.items()} labels = do_labelling(tree, clusters, cluster_map, - allow_single_cluster, cluster_selection_epsilon, - match_reference_implementation) + allow_single_cluster, cluster_selection_epsilon) probs = get_probabilities(tree, reverse_cluster_map, labels) return (labels, probs) diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index 43e360ade0c63..e2c643eaae56a 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -45,7 +45,6 @@ def _tree_to_labels( min_cluster_size=10, cluster_selection_method="eom", allow_single_cluster=False, - match_reference_implementation=False, cluster_selection_epsilon=0.0, max_cluster_size=0, ): @@ -59,7 +58,6 @@ def _tree_to_labels( stability_dict, cluster_selection_method, allow_single_cluster, - match_reference_implementation, cluster_selection_epsilon, max_cluster_size, ) @@ -330,49 +328,6 @@ def _hdbscan_boruvka_balltree( return single_linkage_tree -def remap_condensed_tree(tree, internal_to_raw, outliers): - """ - Takes an internal condensed_tree structure and adds back in a set of points - that were initially detected as non-finite and returns that new tree. - These points will all be split off from the maximal node at lambda zero and - considered noise points. - - Parameters - ---------- - tree: condensed_tree - internal_to_raw: dict - a mapping from internal integer index to the raw integer index - finite_index: ndarray - Boolean array of which entries in the raw data were finite - """ - finite_count = len(internal_to_raw) - - outlier_count = len(outliers) - for i, (parent, child, lambda_val, child_size) in enumerate(tree): - if child < finite_count: - child = internal_to_raw[child] - else: - child = child + outlier_count - tree[i] = (parent + outlier_count, child, lambda_val, child_size) - - outlier_list = [] - root = tree[0][0] # Should I check to be sure this is the minimal lambda? - for outlier in outliers: - outlier_list.append((root, outlier, 0, 1)) - - outlier_tree = np.array( - outlier_list, - dtype=[ - ("parent", np.intp), - ("child", np.intp), - ("lambda_val", float), - ("child_size", np.intp), - ], - ) - tree = np.append(outlier_tree, tree) - return tree - - def remap_single_linkage_tree(tree, internal_to_raw, outliers): """ Takes an internal single_linkage_tree structure and adds back in a set of points @@ -441,7 +396,6 @@ def hdbscan( n_jobs=4, cluster_selection_method="eom", allow_single_cluster=False, - match_reference_implementation=False, metric_params=None, ): """Perform HDBSCAN clustering from a vector array or distance matrix. @@ -536,14 +490,6 @@ def hdbscan( `True` will allow single cluster results in the case that you feel this is a valid result for your dataset. - match_reference_implementation : bool, default=False - There exist some interpretational differences between this - HDBSCAN* implementation and the original authors reference - implementation in Java. This can result in very minor differences - in clustering results. Setting this flag to True will, at a some - performance cost, ensure that the clustering results match the - reference implementation. - metric_params : dict, default=None Arguments passed to the distance metric. 
@@ -600,11 +546,6 @@ def hdbscan(
     if leaf_size < 1:
         raise ValueError("Leaf size must be greater than 0!")
 
-    if match_reference_implementation:
-        min_samples = min_samples - 1
-        min_cluster_size = min_cluster_size + 1
-        approx_min_span_tree = False
-
     if cluster_selection_method not in ("eom", "leaf"):
         raise ValueError(
             'Invalid Cluster Selection Method: %s\nShould be one of: "eom", "leaf"\n'
@@ -758,7 +699,6 @@ def hdbscan(
         min_cluster_size,
         cluster_selection_method,
         allow_single_cluster,
-        match_reference_implementation,
         cluster_selection_epsilon,
         max_cluster_size,
     )
@@ -861,14 +801,6 @@ class HDBSCAN(ClusterMixin, BaseEstimator):
         to True will override this and allow single cluster results in
         the case that you feel this is a valid result for your dataset.
 
-    match_reference_implementation : bool, default=False
-        There exist some interpretational differences between this
-        HDBSCAN* implementation and the original authors reference
-        implementation in Java. This can result in very minor differences
-        in clustering results. Setting this flag to True will, at a some
-        performance cost, ensure that the clustering results match the
-        reference implementation.
-
     metric_params : dict, default=None
         Arguments passed to the distance metric.
 
@@ -952,7 +884,6 @@ def __init__(
         n_jobs=4,
         cluster_selection_method="eom",
         allow_single_cluster=False,
-        match_reference_implementation=False,
         metric_params=None,
     ):
         self.min_cluster_size = min_cluster_size
@@ -968,7 +899,6 @@ def __init__(
         self.n_jobs = n_jobs
         self.cluster_selection_method = cluster_selection_method
         self.allow_single_cluster = allow_single_cluster
-        self.match_reference_implementation = match_reference_implementation
         self.metric_params = metric_params
 
     def fit(self, X, y=None):

From a095bb934d522ebdacb2cc20e0d938dc39b2b59d Mon Sep 17 00:00:00 2001
From: Micky774
Date: Sun, 8 May 2022 19:28:57 -0400
Subject: [PATCH 068/160] Improved doc for `algorithm` and changed option `"best"`-->`"auto"`

---
 sklearn/cluster/_hdbscan/hdbscan_.py          | 60 ++++++++++++-------
 .../cluster/_hdbscan/tests/test_hdbscan.py    |  6 +-
 2 files changed, 40 insertions(+), 26 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py
index e2c643eaae56a..1621a0e8a0d2f 100644
--- a/sklearn/cluster/_hdbscan/hdbscan_.py
+++ b/sklearn/cluster/_hdbscan/hdbscan_.py
@@ -390,7 +390,7 @@ def hdbscan(
     max_cluster_size=0,
     metric="euclidean",
     leaf_size=40,
-    algorithm="best",
+    algorithm="auto",
     memory=None,
     approx_min_span_tree=True,
     n_jobs=4,
@@ -446,18 +446,25 @@ def hdbscan(
         Leaf size for trees responsible for fast nearest
         neighbour queries.
 
-    algorithm : str, default='best'
         Exactly which algorithm to use; hdbscan has variants specialised
         for different characteristics of the data. By default this is set
-        to `best` which chooses the "best" algorithm given the nature of
-        the data. You can force other options if you believe you know
-        better. Options are:
-            - `best`
-            - `generic`
-            - `prims_kdtree`
-            - `prims_balltree`
-            - `boruvka_kdtree`
-            - `boruvka_balltree`
+    algorithm : str, default='auto'
+        Exactly which algorithm to use; hdbscan has variants specialised
+        for different characteristics of the data. By default this is set
+        to `'auto'` which attempts to use a `KDTree` method if possible,
+        otherwise it uses a `BallTree` method. If the `X` passed during `fit`
+        has `n_features>60` then a `prims` approach is used, otherwise a
+        `boruvka` approach is used.
+
+        If the `X` passed during `fit` is sparse or `metric` is not a valid
+        metric for either `KDTree` or `BallTree` and is something other than
+        "cosine" and "arccos", then it resolves to use the `generic` algorithm.
+
+        Available algorithms:
+            - `'best'`
+            - `'generic'`
+            - `'prims_kdtree'`
+            - `'prims_balltree'`
+            - `'boruvka_kdtree'`
+            - `'boruvka_balltree'`
 
     memory : str, default=None
         Used to cache the output of the computation of the tree.
@@ -573,7 +580,7 @@ def hdbscan(
         min_samples = 1
 
     metric_params = metric_params or {}
-    if algorithm != "best":
+    if algorithm != "auto":
         if metric != "precomputed" and issparse(X) and algorithm != "generic":
             raise ValueError("Sparse data matrices only support algorithm 'generic'.")
 
@@ -751,18 +758,25 @@ class HDBSCAN(ClusterMixin, BaseEstimator):
         A distance scaling parameter as used in robust single linkage.
         See [3]_ for more information.
 
-    algorithm : str, default='best'
+    algorithm : str, default='auto'
         Exactly which algorithm to use; hdbscan has variants specialised
         for different characteristics of the data. By default this is set
-        to `best` which chooses the "best" algorithm given the nature of
-        the data. You can force other options if you believe you know
-        better. Options are:
-            - `best`
-            - `generic`
-            - `prims_kdtree`
-            - `prims_balltree`
-            - `boruvka_kdtree`
-            - `boruvka_balltree`
+        to `'auto'` which attempts to use a `KDTree` method if possible,
+        otherwise it uses a `BallTree` method. If the `X` passed during `fit`
+        has `n_features>60` then a `prims` approach is used, otherwise a
+        `boruvka` approach is used.
+
+        If the `X` passed during `fit` is sparse or `metric` is not a valid
+        metric for either `KDTree` or `BallTree` and is something other than
+        "cosine" and "arccos", then it resolves to use the `generic` algorithm.
+
+        Available algorithms:
+            - `'best'`
+            - `'generic'`
+            - `'prims_kdtree'`
+            - `'prims_balltree'`
+            - `'boruvka_kdtree'`
+            - `'boruvka_balltree'`
 
     leaf_size : int, default=40
         If using a space tree algorithm (`KDTree`, or `BallTree`) the number
@@ -877,7 +891,7 @@ def __init__(
         max_cluster_size=0,
         metric="euclidean",
         alpha=1.0,
-        algorithm="best",
+        algorithm="auto",
         leaf_size=40,
         memory=None,
         approx_min_span_tree=True,
diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py
index 3480af801962f..0edb47d718e7e 100644
--- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py
+++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py
@@ -130,7 +130,7 @@ def test_hdbscan_feature_vector():
         "boruvka_kdtree",
         "boruvka_balltree",
         "generic",
-        "best",
+        "auto",
     ],
 )
 @pytest.mark.parametrize("metric", _VALID_METRICS)
@@ -155,7 +155,7 @@ def test_hdbscan_algorithms(algo, metric):
         "minkowski": {"p": 2},
         "wminkowski": {"p": 2, "w": np.ones(X.shape[1])},
     }
-    if algo not in ("best", "generic"):
+    if algo not in ("auto", "generic"):
         if metric not in ALGOS_TREES[algo].valid_metrics:
             with pytest.raises(ValueError):
                 hdbscan(
@@ -198,7 +198,7 @@ def test_hdbscan_high_dimensional():
     labels = (
         HDBSCAN(
-            algorithm="best",
+            algorithm="auto",
             metric="seuclidean",
             metric_params={"V": np.ones(H.shape[1])},
         )

From ca7e87f2c43ae43b791fd2c78f33fc68a82a1c01 Mon Sep 17 00:00:00 2001
From: Micky774
Date: Sun, 8 May 2022 20:25:10 -0400
Subject: [PATCH 069/160] Updated DOI reference and user guide images

---
 doc/modules/clustering.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst
index a1ef1da83d5b7..5725f6d8974fa 100644
--- a/doc/modules/clustering.rst
+++ b/doc/modules/clustering.rst
@@ -1021,7 +1021,7 @@ weight.
An outline of the HDBSCAN algorithm is as follows: The clustering generated by taking the connected components of a trimmed graph :math:`G_{ms,\epsilon}` equivalent to running DBSCAN* with `min_samples` and `eps`. DBSCAN* is a slightly modified version of DBSCAN mentioned in - https://doi.org/10.1007/978-3-642-37456-2_14 + [CM2013]_. HDBSCAN is therefore able to obtain all possible partitions obtainable by DBSCAN* for a fixed choice of `min_samples` in a hierarchical fashion. @@ -1030,10 +1030,10 @@ and as such it no longer needs `eps` to be given as a hyperparameter. Instead it relies solely on the choice of `min_samples`, which tends to be a more robust hyperparameter. -.. |hdbscan_ground_truth| image:: ../auto_examples/cluster/images/sphx_glr_plot_hdbscan_001.png +.. |hdbscan_ground_truth| image:: ../auto_examples/cluster/images/sphx_glr_plot_hdbscan_005.png :target: ../auto_examples/cluster/plot_hdbscan.html :scale: 75 -.. |hdbscan_results| image:: ../auto_examples/cluster/images/sphx_glr_plot_hdbscan_006.png +.. |hdbscan_results| image:: ../auto_examples/cluster/images/sphx_glr_plot_hdbscan_007.png :target: ../auto_examples/cluster/plot_hdbscan.html :scale: 75 @@ -1048,7 +1048,7 @@ simplify the hyperparameter space. .. topic:: References: - * Campello, R.J.G.B., Moulavi, D., Sander, J. (2013). Density-Based Clustering + .. [CM2013] Campello, R.J.G.B., Moulavi, D., Sander, J. (2013). Density-Based Clustering Based on Hierarchical Density Estimates. In: Pei, J., Tseng, V.S., Cao, L., Motoda, H., Xu, G. (eds) Advances in Knowledge Discovery and Data Mining. PAKDD 2013. Lecture Notes in Computer Science(), vol 7819. Springer, Berlin, From 57ec6801471c82b144a2d2347e7b8de8acba7ecd Mon Sep 17 00:00:00 2001 From: Micky774 Date: Mon, 30 May 2022 14:27:03 -0400 Subject: [PATCH 070/160] Refactored parameter validation to use new API --- sklearn/cluster/_hdbscan/hdbscan_.py | 104 ++++++++++++------ .../cluster/_hdbscan/tests/test_hdbscan.py | 2 +- 2 files changed, 74 insertions(+), 32 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index 1621a0e8a0d2f..af1eadb9c53b7 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -8,7 +8,9 @@ # # License: BSD 3 clause +from numbers import Real, Integral import numpy as np +from pathlib import Path from sklearn.base import BaseEstimator, ClusterMixin from sklearn.metrics import pairwise_distances @@ -18,6 +20,7 @@ from warnings import warn from sklearn.utils import check_array from joblib.parallel import cpu_count +from sklearn.utils._param_validation import Interval, StrOptions, validate_params from scipy.sparse import csgraph @@ -258,7 +261,7 @@ def _hdbscan_boruvka_kdtree( leaf_size = 3 if n_jobs < 1: - n_jobs = max(cpu_count() + 1 + n_jobs, 1) + n_jobs = max(cpu_count() + n_jobs, 1) if X.dtype != np.float64: X = X.astype(np.float64) @@ -381,6 +384,39 @@ def get_finite_row_indices(matrix): return row_indices +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "min_cluster_size": [Interval(Integral, left=2, right=None, closed="left")], + "min_samples": [Interval(Integral, left=1, right=None, closed="left"), None], + "cluster_selection_epsilon": [ + Interval(Real, left=0, right=None, closed="left") + ], + "max_cluster_size": [Interval(Integral, left=0, right=None, closed="left")], + "metric": [StrOptions(set(FAST_METRICS + ["precomputed"])), callable], + "alpha": [Interval(Real, left=0, right=None, closed="neither")], + "algorithm": [ + StrOptions( + 
{ + "auto", + "best", + "generic", + "prims_kdtree", + "prims_balltree", + "boruvka_kdtree", + "boruvka_balltree", + } + ) + ], + "leaf_size": [Interval(Integral, left=1, right=None, closed="left")], + "memory": [str, None, Path], + "approx_min_span_tree": [bool], + "n_jobs": [int], + "cluster_selection_method": [StrOptions({"eom", "leaf"})], + "allow_single_cluster": [bool], + "metric_params": [dict, None], + } +) def hdbscan( X, min_cluster_size=5, @@ -480,8 +516,8 @@ def hdbscan( n_jobs : int, default=4 Number of parallel jobs to run in core distance computations (if - supported by the specific algorithm). For `n_jobs` - below -1, (n_cpus + 1 + n_jobs) are used. + supported by the specific algorithm). For `n_jobs<=0`, + (n_cpus + n_jobs) are used. cluster_selection_method : str, default='eom' The method used to select clusters from the condensed tree. The @@ -532,32 +568,6 @@ def hdbscan( if min_samples is None: min_samples = min_cluster_size - if type(min_samples) is not int or type(min_cluster_size) is not int: - raise ValueError("Min samples and min cluster size must be integers!") - - if min_samples <= 0 or min_cluster_size <= 0: - raise ValueError("Min samples and Min cluster size must be positive integers") - - if min_cluster_size == 1: - raise ValueError("Min cluster size must be greater than one") - - if type(cluster_selection_epsilon) is int: - cluster_selection_epsilon = float(cluster_selection_epsilon) - - if type(cluster_selection_epsilon) is not float or cluster_selection_epsilon < 0.0: - raise ValueError("Epsilon must be a float value greater than or equal to 0!") - - if not isinstance(alpha, float) or alpha <= 0.0: - raise ValueError("Alpha must be a positive float value greater than 0!") - - if leaf_size < 1: - raise ValueError("Leaf size must be greater than 0!") - - if cluster_selection_method not in ("eom", "leaf"): - raise ValueError( - 'Invalid Cluster Selection Method: %s\nShould be one of: "eom", "leaf"\n' - ) - # Checks input and converts to an nd-array where possible if metric != "precomputed" or issparse(X): X = check_array(X, accept_sparse="csr", force_all_finite=False) @@ -880,8 +890,36 @@ class HDBSCAN(ClusterMixin, BaseEstimator): array([ 2, 6, -1, ..., -1, -1, -1]) """ - def _more_tags(self): - return {"allow_nan": True} + _parameter_constraints = { + "min_cluster_size": [Interval(Integral, left=2, right=None, closed="left")], + "min_samples": [Interval(Integral, left=1, right=None, closed="left"), None], + "cluster_selection_epsilon": [ + Interval(Real, left=0, right=None, closed="left") + ], + "max_cluster_size": [Interval(Integral, left=0, right=None, closed="left")], + "metric": [StrOptions(set(FAST_METRICS + ["precomputed"])), callable], + "alpha": [Interval(Real, left=0, right=None, closed="neither")], + "algorithm": [ + StrOptions( + { + "auto", + "best", + "generic", + "prims_kdtree", + "prims_balltree", + "boruvka_kdtree", + "boruvka_balltree", + } + ) + ], + "leaf_size": [Interval(Integral, left=1, right=None, closed="left")], + "memory": [str, None, Path], + "approx_min_span_tree": [bool], + "n_jobs": [int], + "cluster_selection_method": [StrOptions({"eom", "leaf"})], + "allow_single_cluster": [bool], + "metric_params": [dict, None], + } def __init__( self, @@ -933,6 +971,7 @@ def fit(self, X, y=None): self : object Returns self. """ + self._validate_params() metric_params = self.metric_params or {} if self.metric != "precomputed": # Non-precomputed matrices may contain non-finite values. 
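A short usage sketch of what the constraints declared above buy us, assuming this branch is installed and using a small random array invented here for illustration. Because `fit` now calls `self._validate_params()`, out-of-range values are rejected by the shared `_param_validation` machinery (reported here as a `ValueError`) before any clustering work happens.

    import numpy as np
    from sklearn.cluster import HDBSCAN

    X = np.random.RandomState(0).uniform(size=(50, 2))

    # Accepted: an integer min_cluster_size >= 2 and a recognised selection method.
    HDBSCAN(min_cluster_size=10, cluster_selection_method="leaf").fit(X)

    # Rejected when fit() calls self._validate_params(): 1 violates the
    # Interval(Integral, left=2, right=None, closed="left") constraint above.
    try:
        HDBSCAN(min_cluster_size=1).fit(X)
    except ValueError as err:
        print(err)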
@@ -1120,3 +1159,6 @@ def dbscan_clustering(self, cut_distance, min_cluster_size=5): return labelling_at_cut( self._single_linkage_tree_, cut_distance, min_cluster_size ) + + def _more_tags(self): + return {"allow_nan": True} diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index 0edb47d718e7e..b88939e5da741 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -302,7 +302,7 @@ def test_hdbscan_boruvka_matches(tree): [{"alpha": -1}, ValueError], [{"alpha": "fail"}, ValueError], [{"leaf_size": 0}, ValueError], - [{"algorithm": "something_else"}, TypeError], + [{"algorithm": "something_else"}, ValueError], [{"metric": "minkowski", "metric_params": {"p": None}}, TypeError], ], ) From 132c1463ef1320e2a8c300c0e9f338638481f1fc Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 14 Jun 2022 20:44:46 -0400 Subject: [PATCH 071/160] Adopted optics-like core_dist backend using `NearestNeighbors` --- sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx | 1 - sklearn/cluster/_hdbscan/hdbscan_.py | 110 +++++++++++++----- 2 files changed, 78 insertions(+), 33 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx b/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx index 5fec8727f4b69..9e8b88393cb99 100644 --- a/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx @@ -58,7 +58,6 @@ cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core_vector( DistanceMetric dist_metric, np.double_t alpha=1.0): - # Add a comment cdef np.ndarray[np.double_t, ndim=1] current_distances_arr cdef np.ndarray[np.double_t, ndim=1] current_sources_arr cdef np.ndarray[np.int8_t, ndim=1] in_tree_arr diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index af1eadb9c53b7..45ae185376c58 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -18,10 +18,10 @@ from sklearn.neighbors import KDTree, BallTree from joblib import Memory from warnings import warn -from sklearn.utils import check_array +from sklearn.utils import check_array, gen_batches, get_chunk_n_rows from joblib.parallel import cpu_count from sklearn.utils._param_validation import Interval, StrOptions, validate_params - +from sklearn.neighbors import NearestNeighbors from scipy.sparse import csgraph from ._hdbscan_linkage import ( @@ -82,8 +82,7 @@ def _hdbscan_generic( # sklearn.metrics.pairwise_distances handle it, # enables the usage of numpy.inf in the distance # matrix to indicate missing distance information. - # TODO: Check if copying is necessary - distance_matrix = X.copy() + distance_matrix = X else: distance_matrix = pairwise_distances(X, metric=metric, **metric_params) @@ -178,29 +177,68 @@ def _hdbscan_sparse_distance_matrix( return single_linkage_tree +def _compute_core_distances_prims(X, neighbors, min_samples): + """Compute the k-th nearest neighbor of each sample. + + Equivalent to neighbors.kneighbors(X, self.min_samples)[0][:, -1] + but with more memory efficiency. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data. + neighbors : NearestNeighbors instance + The fitted nearest neighbors estimator. + min_samples : int + The number of points used to calculate core distance. + + Returns + ------- + core_distances : ndarray of shape (n_samples,) + Distance at which each sample becomes a core point. + Points which will never be core have a distance of inf. 
+ """ + n_samples = X.shape[0] + core_distances = np.empty(n_samples) + core_distances.fill(np.nan) + + chunk_n_rows = get_chunk_n_rows(row_bytes=16 * min_samples, max_n_rows=n_samples) + slices = gen_batches(n_samples, chunk_n_rows) + for sl in slices: + core_distances[sl] = neighbors.kneighbors(X[sl], min_samples)[0][:, -1] + return core_distances + + def _hdbscan_prims_kdtree( X, min_samples=5, alpha=1.0, metric="euclidean", leaf_size=40, + n_jobs=4, **metric_params, ): - if X.dtype != np.float64: - X = X.astype(np.float64) - # The Cython routines used require contiguous arrays if not X.flags["C_CONTIGUOUS"]: - X = np.array(X, dtype=np.double, order="C") - - tree = KDTree(X, metric=metric, leaf_size=leaf_size, **metric_params) + X = np.array(X, dtype=np.float64, order="C") - dist_metric = DistanceMetric.get_metric(metric, **metric_params) + if X.dtype != np.float64: + X = X.astype(np.float64) # Get distance to kth nearest neighbour - core_distances = tree.query( - X, k=min_samples + 1, dualtree=True, breadth_first=True - )[0][:, -1].copy(order="C") + nbrs = NearestNeighbors( + n_neighbors=min_samples, + algorithm="kd_tree", + leaf_size=leaf_size, + metric=metric, + metric_params=metric_params, + p=2 if metric_params is None else metric_params.get("p", 2), + n_jobs=n_jobs, + ).fit(X) + core_distances = _compute_core_distances_prims( + X, neighbors=nbrs, min_samples=min_samples + ) + dist_metric = DistanceMetric.get_metric(metric, **metric_params) # Mutual reachability distance is implicit in mst_linkage_core_vector min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha) @@ -220,23 +258,30 @@ def _hdbscan_prims_balltree( alpha=1.0, metric="euclidean", leaf_size=40, + n_jobs=4, **metric_params, ): - if X.dtype != np.float64: - X = X.astype(np.float64) - # The Cython routines used require contiguous arrays if not X.flags["C_CONTIGUOUS"]: - X = np.array(X, dtype=np.double, order="C") - - tree = BallTree(X, metric=metric, leaf_size=leaf_size, **metric_params) + X = np.array(X, dtype=np.float64, order="C") - dist_metric = DistanceMetric.get_metric(metric, **metric_params) + if X.dtype != np.float64: + X = X.astype(np.float64) # Get distance to kth nearest neighbour - core_distances = tree.query( - X, k=min_samples + 1, dualtree=True, breadth_first=True - )[0][:, -1].copy(order="C") + nbrs = NearestNeighbors( + n_neighbors=min_samples, + algorithm="ball_tree", + leaf_size=leaf_size, + metric=metric, + metric_params=metric_params, + p=2 if metric_params is None else metric_params.get("p", 2), + n_jobs=n_jobs, + ).fit(X) + core_distances = _compute_core_distances_prims( + X, neighbors=nbrs, min_samples=min_samples + ) + dist_metric = DistanceMetric.get_metric(metric, **metric_params) # Mutual reachability distance is implicit in mst_linkage_core_vector min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha) @@ -571,7 +616,7 @@ def hdbscan( # Checks input and converts to an nd-array where possible if metric != "precomputed" or issparse(X): X = check_array(X, accept_sparse="csr", force_all_finite=False) - else: + elif isinstance(X, np.ndarray): # Only non-sparse, precomputed distance matrices are handled here # and thereby allowed to contain numpy.inf for missing distances @@ -581,7 +626,6 @@ def hdbscan( tmp[np.isinf(tmp)] = 1 check_array(tmp) - # Python 2 and 3 compliant string_type checking memory = Memory(location=memory, verbose=0) size = X.shape[0] @@ -610,6 +654,7 @@ def hdbscan( min_samples, alpha, metric, + n_jobs=n_jobs, 
**metric_params, ) elif algorithm == "prims_balltree": @@ -621,6 +666,7 @@ def hdbscan( alpha, metric, leaf_size, + n_jobs=n_jobs, **metric_params, ) elif algorithm == "boruvka_kdtree": @@ -632,7 +678,7 @@ def hdbscan( metric, leaf_size, approx_min_span_tree, - n_jobs, + n_jobs=n_jobs, **metric_params, ) elif algorithm == "boruvka_balltree": @@ -650,13 +696,12 @@ def hdbscan( metric, leaf_size, approx_min_span_tree, - n_jobs, + n_jobs=n_jobs, **metric_params, ) else: raise TypeError("Unknown algorithm type %s specified" % algorithm) else: - if issparse(X) or metric not in FAST_METRICS: # We can't do much with sparse matrices ... single_linkage_tree = memory.cache(_hdbscan_generic)( @@ -676,6 +721,7 @@ def hdbscan( alpha, metric, leaf_size, + n_jobs=n_jobs, **metric_params, ) else: @@ -685,12 +731,11 @@ def hdbscan( metric, leaf_size, approx_min_span_tree, - n_jobs, + n_jobs=n_jobs, **metric_params, ) else: # Metric is a valid BallTree metric # TO DO: Need heuristic to decide when to go to boruvka; - # still debugging for now if X.shape[1] > 60: single_linkage_tree = memory.cache(_hdbscan_prims_balltree)( X, @@ -698,6 +743,7 @@ def hdbscan( alpha, metric, leaf_size, + n_jobs=n_jobs, **metric_params, ) else: @@ -707,7 +753,7 @@ def hdbscan( metric, leaf_size, approx_min_span_tree, - n_jobs, + n_jobs=n_jobs, **metric_params, ) From cfaf5972353cc9cbfd4326f0af4be4f7e3fc55a9 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 15 Jun 2022 12:36:30 -0400 Subject: [PATCH 072/160] Refactor of main hdbscan function --- sklearn/cluster/_hdbscan/hdbscan_.py | 359 ++++++++------------------- 1 file changed, 98 insertions(+), 261 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index 45ae185376c58..76dac18fcd8e0 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -68,6 +68,14 @@ def _tree_to_labels( return (labels, probabilities, single_linkage_tree) +def _process_mst(min_spanning_tree): + # Sort edges of the min_spanning_tree by weight + row_order = np.argsort(min_spanning_tree.T[2]) + min_spanning_tree = min_spanning_tree[row_order, :] + # Convert edge list into standard hierarchical clustering format + return label(min_spanning_tree) + + def _hdbscan_generic( X, min_samples=5, @@ -109,13 +117,7 @@ def _hdbscan_generic( UserWarning, ) - # Sort edges of the min_spanning_tree by weight - min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :] - - # Convert edge list into standard hierarchical clustering format - single_linkage_tree = label(min_spanning_tree) - - return single_linkage_tree + return _process_mst(min_spanning_tree) def _hdbscan_sparse_distance_matrix( @@ -177,40 +179,9 @@ def _hdbscan_sparse_distance_matrix( return single_linkage_tree -def _compute_core_distances_prims(X, neighbors, min_samples): - """Compute the k-th nearest neighbor of each sample. - - Equivalent to neighbors.kneighbors(X, self.min_samples)[0][:, -1] - but with more memory efficiency. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The data. - neighbors : NearestNeighbors instance - The fitted nearest neighbors estimator. - min_samples : int - The number of points used to calculate core distance. - - Returns - ------- - core_distances : ndarray of shape (n_samples,) - Distance at which each sample becomes a core point. - Points which will never be core have a distance of inf. 
- """ - n_samples = X.shape[0] - core_distances = np.empty(n_samples) - core_distances.fill(np.nan) - - chunk_n_rows = get_chunk_n_rows(row_bytes=16 * min_samples, max_n_rows=n_samples) - slices = gen_batches(n_samples, chunk_n_rows) - for sl in slices: - core_distances[sl] = neighbors.kneighbors(X[sl], min_samples)[0][:, -1] - return core_distances - - -def _hdbscan_prims_kdtree( +def _hdbscan_prims( X, + algo, min_samples=5, alpha=1.0, metric="euclidean", @@ -220,81 +191,39 @@ def _hdbscan_prims_kdtree( ): # The Cython routines used require contiguous arrays if not X.flags["C_CONTIGUOUS"]: - X = np.array(X, dtype=np.float64, order="C") - - if X.dtype != np.float64: - X = X.astype(np.float64) + X = np.array(X, order="C") # Get distance to kth nearest neighbour nbrs = NearestNeighbors( n_neighbors=min_samples, - algorithm="kd_tree", + algorithm=algo, leaf_size=leaf_size, metric=metric, metric_params=metric_params, - p=2 if metric_params is None else metric_params.get("p", 2), n_jobs=n_jobs, + p=None, ).fit(X) - core_distances = _compute_core_distances_prims( - X, neighbors=nbrs, min_samples=min_samples - ) - dist_metric = DistanceMetric.get_metric(metric, **metric_params) - - # Mutual reachability distance is implicit in mst_linkage_core_vector - min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha) - - # Sort edges of the min_spanning_tree by weight - min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :] - - # Convert edge list into standard hierarchical clustering format - single_linkage_tree = label(min_spanning_tree) - - return single_linkage_tree - -def _hdbscan_prims_balltree( - X, - min_samples=5, - alpha=1.0, - metric="euclidean", - leaf_size=40, - n_jobs=4, - **metric_params, -): - # The Cython routines used require contiguous arrays - if not X.flags["C_CONTIGUOUS"]: - X = np.array(X, dtype=np.float64, order="C") + n_samples = X.shape[0] + core_distances = np.empty(n_samples) + core_distances.fill(np.nan) - if X.dtype != np.float64: - X = X.astype(np.float64) + chunk_n_rows = get_chunk_n_rows(row_bytes=16 * min_samples, max_n_rows=n_samples) + slices = gen_batches(n_samples, chunk_n_rows) + for sl in slices: + core_distances[sl] = nbrs.kneighbors(X[sl], min_samples)[0][:, -1] - # Get distance to kth nearest neighbour - nbrs = NearestNeighbors( - n_neighbors=min_samples, - algorithm="ball_tree", - leaf_size=leaf_size, - metric=metric, - metric_params=metric_params, - p=2 if metric_params is None else metric_params.get("p", 2), - n_jobs=n_jobs, - ).fit(X) - core_distances = _compute_core_distances_prims( - X, neighbors=nbrs, min_samples=min_samples - ) dist_metric = DistanceMetric.get_metric(metric, **metric_params) # Mutual reachability distance is implicit in mst_linkage_core_vector min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha) - # Sort edges of the min_spanning_tree by weight - min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :] - # Convert edge list into standard hierarchical clustering format - single_linkage_tree = label(min_spanning_tree) - return single_linkage_tree + return _process_mst(min_spanning_tree) -def _hdbscan_boruvka_kdtree( +def _hdbscan_boruvka( X, + algo, min_samples=5, metric="euclidean", leaf_size=40, @@ -302,16 +231,14 @@ def _hdbscan_boruvka_kdtree( n_jobs=4, **metric_params, ): - if leaf_size < 3: - leaf_size = 3 + leaf_size = max(leaf_size, 3) - if n_jobs < 1: - n_jobs = max(cpu_count() + n_jobs, 1) + n_jobs = 1 if n_jobs == 0 else n_jobs + if 
n_jobs < 0: + n_jobs = max(cpu_count() + n_jobs + 1, 1) - if X.dtype != np.float64: - X = X.astype(np.float64) - - tree = KDTree(X, metric=metric, leaf_size=leaf_size, **metric_params) + Tree = KDTree if algo == "kd_tree" else BallTree + tree = Tree(X, metric=metric, leaf_size=leaf_size, **metric_params) n_samples = X.shape[0] if min_samples + 1 > n_samples: @@ -320,45 +247,8 @@ def _hdbscan_boruvka_kdtree( f" but {min_samples+1=}, {n_samples=}" ) - alg = KDTreeBoruvkaAlgorithm( - tree, - min_samples, - metric=metric, - leaf_size=leaf_size // 3, - approx_min_span_tree=approx_min_span_tree, - n_jobs=n_jobs, - **metric_params, - ) - min_spanning_tree = alg.spanning_tree() - # Sort edges of the min_spanning_tree by weight - row_order = np.argsort(min_spanning_tree.T[2]) - min_spanning_tree = min_spanning_tree[row_order, :] - # Convert edge list into standard hierarchical clustering format - single_linkage_tree = label(min_spanning_tree) - - return single_linkage_tree - - -def _hdbscan_boruvka_balltree( - X, - min_samples=5, - metric="euclidean", - leaf_size=40, - approx_min_span_tree=True, - n_jobs=4, - **metric_params, -): - if leaf_size < 3: - leaf_size = 3 - - if n_jobs < 1: - n_jobs = max(cpu_count() + 1 + n_jobs, 1) - - if X.dtype != np.float64: - X = X.astype(np.float64) - - tree = BallTree(X, metric=metric, leaf_size=leaf_size, **metric_params) - alg = BallTreeBoruvkaAlgorithm( + alg = KDTreeBoruvkaAlgorithm if algo == "kd_tree" else BallTreeBoruvkaAlgorithm + out = alg( tree, min_samples, metric=metric, @@ -367,13 +257,9 @@ def _hdbscan_boruvka_balltree( n_jobs=n_jobs, **metric_params, ) - min_spanning_tree = alg.spanning_tree() - # Sort edges of the min_spanning_tree by weight - min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :] - # Convert edge list into standard hierarchical clustering format - single_linkage_tree = label(min_spanning_tree) + min_spanning_tree = out.spanning_tree() - return single_linkage_tree + return _process_mst(min_spanning_tree) def remap_single_linkage_tree(tree, internal_to_raw, outliers): @@ -524,8 +410,10 @@ def hdbscan( must be square. leaf_size : int, default=40 - Leaf size for trees responsible for fast nearest - neighbour queries. + Leaf size for trees responsible for fast nearest neighbour queries. A + large dataset size and small leaf_size may induce excessive memory + usage. If you are running out of memory consider increasing the + `leaf_size` parameter. algorithm : str, default='auto' Exactly which algorithm to use; hdbscan has variants specialised @@ -561,8 +449,8 @@ def hdbscan( n_jobs : int, default=4 Number of parallel jobs to run in core distance computations (if - supported by the specific algorithm). For `n_jobs<=0`, - (n_cpus + n_jobs) are used. + supported by the specific algorithm). For `n_jobs<0`, + `(n_cpus + n_jobs + 1)` are used. cluster_selection_method : str, default='eom' The method used to select clusters from the condensed tree. The @@ -634,128 +522,77 @@ def hdbscan( min_samples = 1 metric_params = metric_params or {} + func = None + kwargs = dict( + X=X, + algo="kd_tree", + min_samples=min_samples, + alpha=alpha, + metric=metric, + leaf_size=leaf_size, + n_jobs=n_jobs, + **metric_params, + ) + if "kdtree" in algorithm and metric not in KDTree.valid_metrics: + raise ValueError( + f"{metric} is not a valid metric for a KDTree-based algorithm. Please" + " select a different metric." 
+ ) + elif "balltree" in algorithm and metric not in BallTree.valid_metrics: + raise ValueError( + f"{metric} is not a valid metric for a BallTree-based algorithm. Please" + " select a different metric." + ) + if algorithm != "auto": if metric != "precomputed" and issparse(X) and algorithm != "generic": - raise ValueError("Sparse data matrices only support algorithm 'generic'.") + raise ValueError("Sparse data matrices only support algorithm `generic`.") if algorithm == "generic": - single_linkage_tree = memory.cache(_hdbscan_generic)( - X, - min_samples, - alpha, - metric, - **metric_params, - ) + func = _hdbscan_generic + for key in ("algo", "leaf_size", "n_jobs"): + kwargs.pop(key, None) elif algorithm == "prims_kdtree": - if metric not in KDTree.valid_metrics: - raise ValueError("Cannot use Prim's with KDTree for this metric!") - single_linkage_tree = memory.cache(_hdbscan_prims_kdtree)( - X, - min_samples, - alpha, - metric, - n_jobs=n_jobs, - **metric_params, - ) + func = _hdbscan_prims elif algorithm == "prims_balltree": - if metric not in BallTree.valid_metrics: - raise ValueError("Cannot use Prim's with BallTree for this metric!") - single_linkage_tree = memory.cache(_hdbscan_prims_balltree)( - X, - min_samples, - alpha, - metric, - leaf_size, - n_jobs=n_jobs, - **metric_params, - ) + func = _hdbscan_prims + kwargs["algo"] = "ball_tree" elif algorithm == "boruvka_kdtree": - if metric not in KDTree.valid_metrics: - raise ValueError("Cannot use Boruvka with KDTree for this metric!") - single_linkage_tree = memory.cache(_hdbscan_boruvka_kdtree)( - X, - min_samples, - metric, - leaf_size, - approx_min_span_tree, - n_jobs=n_jobs, - **metric_params, - ) + func = _hdbscan_boruvka + kwargs.pop("alpha", None) elif algorithm == "boruvka_balltree": - if metric not in BallTree.valid_metrics: - raise ValueError("Cannot use Boruvka with BallTree for this metric!") - if (X.shape[0] // leaf_size) > 16000: - warn( - "A large dataset size and small leaf_size may induce excessive " - "memory usage. If you are running out of memory consider " - "increasing the `leaf_size` parameter." - ) - single_linkage_tree = memory.cache(_hdbscan_boruvka_balltree)( - X, - min_samples, - metric, - leaf_size, - approx_min_span_tree, - n_jobs=n_jobs, - **metric_params, - ) + func = _hdbscan_boruvka + kwargs.pop("alpha", None) + kwargs["algo"] = "ball_tree" else: - raise TypeError("Unknown algorithm type %s specified" % algorithm) + raise TypeError( + f"Unknown algorithm type {algorithm} specified. Please select a" + " supported algorithm." + ) else: if issparse(X) or metric not in FAST_METRICS: # We can't do much with sparse matrices ... 
- single_linkage_tree = memory.cache(_hdbscan_generic)( - X, - min_samples, - alpha, - metric, - **metric_params, - ) + func = _hdbscan_generic + for key in ("algo", "leaf_size", "n_jobs"): + kwargs.pop(key, None) elif metric in KDTree.valid_metrics: - # TO DO: Need heuristic to decide when to go to boruvka; - # still debugging for now + # TO DO: Need heuristic to decide when to go to boruvka if X.shape[1] > 60: - single_linkage_tree = memory.cache(_hdbscan_prims_kdtree)( - X, - min_samples, - alpha, - metric, - leaf_size, - n_jobs=n_jobs, - **metric_params, - ) + func = _hdbscan_prims else: - single_linkage_tree = memory.cache(_hdbscan_boruvka_kdtree)( - X, - min_samples, - metric, - leaf_size, - approx_min_span_tree, - n_jobs=n_jobs, - **metric_params, - ) + func = _hdbscan_boruvka + kwargs.pop("alpha", None) else: # Metric is a valid BallTree metric # TO DO: Need heuristic to decide when to go to boruvka; if X.shape[1] > 60: - single_linkage_tree = memory.cache(_hdbscan_prims_balltree)( - X, - min_samples, - alpha, - metric, - leaf_size, - n_jobs=n_jobs, - **metric_params, - ) + func = _hdbscan_prims + kwargs["algo"] = "ball_tree" else: - single_linkage_tree = memory.cache(_hdbscan_boruvka_balltree)( - X, - min_samples, - metric, - leaf_size, - approx_min_span_tree, - n_jobs=n_jobs, - **metric_params, - ) + func = _hdbscan_boruvka + kwargs.pop("alpha", None) + kwargs["algo"] = "ball_tree" + + single_linkage_tree = memory.cache(func)(**kwargs) return _tree_to_labels( single_linkage_tree, @@ -835,10 +672,10 @@ class HDBSCAN(ClusterMixin, BaseEstimator): - `'boruvka_balltree'` leaf_size : int, default=40 - If using a space tree algorithm (`KDTree`, or `BallTree`) the number - of points ina leaf node of the tree. This does not alter the - resulting clustering, but may have an effect on the runtime - of the algorithm. + Leaf size for trees responsible for fast nearest neighbour queries. A + large dataset size and small leaf_size may induce excessive memory + usage. If you are running out of memory consider increasing the + `leaf_size` parameter. Ignored for `algorithm=generic`. memory : str, default=None Used to cache the output of the computation of the tree. @@ -854,8 +691,8 @@ class HDBSCAN(ClusterMixin, BaseEstimator): n_jobs : int, default=4 Number of parallel jobs to run in core distance computations (if - supported by the specific algorithm). For `n_jobs` - below -1, (n_cpus + 1 + n_jobs) are used. + supported by the specific algorithm). For `n_jobs<0`, + `(n_cpus + n_jobs + 1)` are used. cluster_selection_method : str, default='eom' The method used to select clusters from the condensed tree. 
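For orientation, here is a minimal plain-Python/NumPy sketch of the two ingredients the consolidated `_hdbscan_prims` helper above relies on: core distances computed in chunks via `kneighbors`, and the mutual-reachability distance max(core[i], core[j], d(i, j)) over which the minimum spanning tree is then built. The names `core_distances_chunked` and `mutual_reachability_dense` are illustrative only and do not appear in this patch; the real code keeps the mutual-reachability matrix implicit inside `mst_linkage_core_vector`.

import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import gen_batches

def core_distances_chunked(X, min_samples=5, chunk_n_rows=256):
    # Distance of each sample to its min_samples-th nearest neighbour,
    # filled batch by batch to bound peak memory usage.
    nbrs = NearestNeighbors(n_neighbors=min_samples).fit(X)
    out = np.full(X.shape[0], np.nan)
    for sl in gen_batches(X.shape[0], chunk_n_rows):
        # kneighbors returns (distances, indices); the last column holds
        # the distance to the min_samples-th neighbour.
        out[sl] = nbrs.kneighbors(X[sl], min_samples)[0][:, -1]
    return out

def mutual_reachability_dense(X, core):
    # Dense version for illustration only: max(core_i, core_j, d(i, j)).
    d = pairwise_distances(X)
    return np.maximum(d, np.maximum(core[:, None], core[None, :]))

X = np.random.RandomState(0).rand(50, 2)
core = core_distances_chunked(X, min_samples=5)
print(mutual_reachability_dense(X, core).shape)  # (50, 50)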
The From 400fcf1341288d44aca874234b63dc1964232a1e Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 15 Jun 2022 12:38:24 -0400 Subject: [PATCH 073/160] Removed `approx_min_span_tree` -- defaulted to `True` --- sklearn/cluster/_hdbscan/hdbscan_.py | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index 76dac18fcd8e0..985c26a3c81b7 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -227,7 +227,6 @@ def _hdbscan_boruvka( min_samples=5, metric="euclidean", leaf_size=40, - approx_min_span_tree=True, n_jobs=4, **metric_params, ): @@ -253,7 +252,7 @@ def _hdbscan_boruvka( min_samples, metric=metric, leaf_size=leaf_size // 3, - approx_min_span_tree=approx_min_span_tree, + approx_min_span_tree=True, n_jobs=n_jobs, **metric_params, ) @@ -341,7 +340,6 @@ def get_finite_row_indices(matrix): ], "leaf_size": [Interval(Integral, left=1, right=None, closed="left")], "memory": [str, None, Path], - "approx_min_span_tree": [bool], "n_jobs": [int], "cluster_selection_method": [StrOptions({"eom", "leaf"})], "allow_single_cluster": [bool], @@ -359,7 +357,6 @@ def hdbscan( leaf_size=40, algorithm="auto", memory=None, - approx_min_span_tree=True, n_jobs=4, cluster_selection_method="eom", allow_single_cluster=False, @@ -440,13 +437,6 @@ def hdbscan( By default, no caching is done. If a string is given, it is the path to the caching directory. - approx_min_span_tree : bool, default=True - Whether to accept an only approximate minimum spanning tree. - For some algorithms this can provide a significant speedup, but - the resulting clustering may be of marginally lower quality. - If you are willing to sacrifice speed for correctness you may want - to explore this; in general this should be left at the default True. - n_jobs : int, default=4 Number of parallel jobs to run in core distance computations (if supported by the specific algorithm). For `n_jobs<0`, @@ -682,13 +672,6 @@ class HDBSCAN(ClusterMixin, BaseEstimator): By default, no caching is done. If a string is given, it is the path to the caching directory. - approx_min_span_tree : bool, default=True - Whether to accept an only approximate minimum spanning tree. - For some algorithms this can provide a significant speedup, but - the resulting clustering may be of marginally lower quality. - If you are willing to sacrifice speed for correctness you may want - to explore this; in general this should be left at the default `True`. - n_jobs : int, default=4 Number of parallel jobs to run in core distance computations (if supported by the specific algorithm). 
For `n_jobs<0`, @@ -797,7 +780,6 @@ class HDBSCAN(ClusterMixin, BaseEstimator): ], "leaf_size": [Interval(Integral, left=1, right=None, closed="left")], "memory": [str, None, Path], - "approx_min_span_tree": [bool], "n_jobs": [int], "cluster_selection_method": [StrOptions({"eom", "leaf"})], "allow_single_cluster": [bool], @@ -815,7 +797,6 @@ def __init__( algorithm="auto", leaf_size=40, memory=None, - approx_min_span_tree=True, n_jobs=4, cluster_selection_method="eom", allow_single_cluster=False, @@ -830,7 +811,6 @@ def __init__( self.algorithm = algorithm self.leaf_size = leaf_size self.memory = memory - self.approx_min_span_tree = approx_min_span_tree self.n_jobs = n_jobs self.cluster_selection_method = cluster_selection_method self.allow_single_cluster = allow_single_cluster From 44bb1761cbdc50b601a166fd6f468023659366a0 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 15 Jun 2022 12:39:30 -0400 Subject: [PATCH 074/160] Removed unnecessary metric option --- sklearn/cluster/_hdbscan/hdbscan_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index 985c26a3c81b7..d5591f7f22097 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -40,7 +40,7 @@ from ._hdbscan_boruvka import KDTreeBoruvkaAlgorithm, BallTreeBoruvkaAlgorithm from sklearn.metrics._dist_metrics import DistanceMetric -FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics + ["cosine", "arccos"] +FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics + ["cosine"] def _tree_to_labels( From ef4481ee73189a4990a3755926bfa553d9fdc18f Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Mon, 27 Jun 2022 16:45:18 -0400 Subject: [PATCH 075/160] Removed validity index, replaced w/ fowlkes-mallows score --- sklearn/cluster/_hdbscan/_validity.py | 410 ------------------ .../cluster/_hdbscan/tests/test_hdbscan.py | 33 +- 2 files changed, 27 insertions(+), 416 deletions(-) delete mode 100644 sklearn/cluster/_hdbscan/_validity.py diff --git a/sklearn/cluster/_hdbscan/_validity.py b/sklearn/cluster/_hdbscan/_validity.py deleted file mode 100644 index 7131d3dc6b905..0000000000000 --- a/sklearn/cluster/_hdbscan/_validity.py +++ /dev/null @@ -1,410 +0,0 @@ -# Author: Leland McInnes -# Steve Astels -# John Healy -# -# License: BSD 3 clause -# Currently only used in test_hdbscan.py for testing the correctness of HDBSCAN - -import numpy as np -from sklearn.metrics import pairwise_distances -from scipy.spatial.distance import cdist -from ._hdbscan_linkage import mst_linkage_core -from numpy import isclose - - -def all_points_core_distance(distance_matrix, d=2.0): - """ - Compute the all-points-core-distance for all the points of a cluster. - - Parameters - ---------- - distance_matrix : array (cluster_size, cluster_size) - The pairwise distance matrix between points in the cluster. - - d : integer - The dimension of the data set, which is used in the computation - of the all-point-core-distance as per the paper. - - Returns - ------- - core_distances : array (cluster_size,) - The all-points-core-distance of each point in the cluster - - References - ---------- - Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J., - 2014. Density-Based Clustering Validation. In SDM (pp. 839-847). 
- """ - distance_matrix[distance_matrix != 0] = ( - 1.0 / distance_matrix[distance_matrix != 0] - ) ** d - result = distance_matrix.sum(axis=1) - result /= distance_matrix.shape[0] - 1 - result **= -1.0 / d - - return result - - -def all_points_mutual_reachability( - X, labels, cluster_id, metric="euclidean", d=None, metric_params=None -): - """ - Compute the all-points-mutual-reachability distances for all the points of - a cluster. - - If metric is 'precomputed' then assume X is a distance matrix for the full - dataset. Note that in this case you must pass in 'd' the dimension of the - dataset. - - Parameters - ---------- - X : array (n_samples, n_features) or (n_samples, n_samples) - The input data of the clustering. This can be the data, or, if - metric is set to `precomputed` the pairwise distance matrix used - for the clustering. - - labels : array (n_samples) - The label array output by the clustering, providing an integral - cluster label to each data point, with -1 for noise points. - - cluster_id : integer - The cluster label for which to compute the all-points - mutual-reachability (which should be done on a cluster - by cluster basis). - - metric : string - The metric used to compute distances for the clustering (and - to be re-used in computing distances for mr distance). If - set to `precomputed` then X is assumed to be the precomputed - distance matrix between samples. - - d : integer (or None) - The number of features (dimension) of the dataset. This need only - be set in the case of metric being set to `precomputed`, where - the ambient dimension of the data is unknown to the function. - - metric_params : dict, default=None - Arguments passed to the distance metric. - - Returns - ------- - - mutual_reachaibility : array (n_samples, n_samples) - The pairwise mutual reachability distances between all points in `X` - with `label` equal to `cluster_id`. - - core_distances : array (n_samples,) - The all-points-core_distance of all points in `X` with `label` equal - to `cluster_id`. - - References - ---------- - Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J., - 2014. Density-Based Clustering Validation. In SDM (pp. 839-847). - """ - if metric == "precomputed": - if d is None: - raise ValueError("If metric is precomputed a d value must be provided!") - distance_matrix = X[labels == cluster_id, :][:, labels == cluster_id] - else: - subset_X = X[labels == cluster_id, :] - metric_params = metric_params or {} - distance_matrix = pairwise_distances(subset_X, metric=metric, **metric_params) - d = X.shape[1] - - core_distances = all_points_core_distance(distance_matrix.copy(), d=d) - core_dist_matrix = np.tile(core_distances, (core_distances.shape[0], 1)) - - result = np.dstack([distance_matrix, core_dist_matrix, core_dist_matrix.T]).max( - axis=-1 - ) - - return result, core_distances - - -def internal_minimum_spanning_tree(mr_distances): - """ - Compute the 'internal' minimum spanning tree given a matrix of mutual - reachability distances. Given a minimum spanning tree the 'internal' - graph is the subgraph induced by vertices of degree greater than one. - - Parameters - ---------- - mr_distances : array (cluster_size, cluster_size) - The pairwise mutual reachability distances, inferred to be the edge - weights of a complete graph. Since MSTs are computed per cluster - this is the all-points-mutual-reacability for points within a single - cluster. 
- - Returns - ------- - internal_nodes : array - An array listing the indices of the internal nodes of the MST - - internal_edges : array (?, 3) - An array of internal edges in weighted edge list format; that is - an edge is an array of length three listing the two vertices - forming the edge and weight of the edge. - - References - ---------- - Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J., - 2014. Density-Based Clustering Validation. In SDM (pp. 839-847). - """ - single_linkage_data = mst_linkage_core(mr_distances) - min_span_tree = single_linkage_data.copy() - for index, row in enumerate(min_span_tree[1:], 1): - candidates = np.where(isclose(mr_distances[int(row[1])], row[2]))[0] - candidates = np.intersect1d( - candidates, single_linkage_data[:index, :2].astype(int) - ) - candidates = candidates[candidates != row[1]] - assert len(candidates) > 0 - row[0] = candidates[0] - - vertices = np.arange(mr_distances.shape[0])[ - np.bincount(min_span_tree.T[:2].flatten().astype(np.intp)) > 1 - ] - # A little "fancy" we select from the flattened array reshape back - # (Fortran format to get indexing right) and take the product to do an and - # then convert back to boolean type. - edge_selection = np.prod( - np.in1d(min_span_tree.T[:2], vertices).reshape( - (min_span_tree.shape[0], 2), order="F" - ), - axis=1, - ).astype(bool) - - # Density sparseness is not well defined if there are no - # internal edges (as per the referenced paper). However - # MATLAB code from the original authors simply selects the - # largest of *all* the edges in the case that there are - # no internal edges, so we do the same here - if np.any(edge_selection): - # If there are any internal edges, then subselect them out - edges = min_span_tree[edge_selection] - else: - # If there are no internal edges then we want to take the - # max over all the edges that exist in the MST, so we simply - # do nothing and return all the edges in the MST. - edges = min_span_tree.copy() - - return vertices, edges - - -def density_separation( - X, - labels, - cluster_id1, - cluster_id2, - internal_nodes1, - internal_nodes2, - core_distances1, - core_distances2, - metric="euclidean", - **kwd_args, -): - """ - Compute the density separation between two clusters. This is the minimum - all-points mutual reachability distance between pairs of points, one from - internal nodes of MSTs of each cluster. - - Parameters - ---------- - X : array (n_samples, n_features) or (n_samples, n_samples) - The input data of the clustering. This can be the data, or, if - metric is set to `precomputed` the pairwise distance matrix used - for the clustering. - - labels : array (n_samples) - The label array output by the clustering, providing an integral - cluster label to each data point, with -1 for noise points. - - cluster_id1 : integer - The first cluster label to compute separation between. - - cluster_id2 : integer - The second cluster label to compute separation between. - - internal_nodes1 : array - The vertices of the MST for `cluster_id1` that were internal vertices. - - internal_nodes2 : array - The vertices of the MST for `cluster_id2` that were internal vertices. - - core_distances1 : array (size of cluster_id1,) - The all-points-core_distances of all points in the cluster - specified by cluster_id1. - - core_distances2 : array (size of cluster_id2,) - The all-points-core_distances of all points in the cluster - specified by cluster_id2. 
- - metric : string - The metric used to compute distances for the clustering (and - to be re-used in computing distances for mr distance). If - set to `precomputed` then X is assumed to be the precomputed - distance matrix between samples. - - **kwd_args : - Extra arguments to pass to the distance computation for other - metrics, such as minkowski, Mahanalobis etc. - - Returns - ------- - The 'density separation' between the clusters specified by - `cluster_id1` and `cluster_id2`. - - References - ---------- - Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J., - 2014. Density-Based Clustering Validation. In SDM (pp. 839-847). - """ - if metric == "precomputed": - sub_select = X[labels == cluster_id1, :][:, labels == cluster_id2] - distance_matrix = sub_select[internal_nodes1, :][:, internal_nodes2] - else: - cluster1 = X[labels == cluster_id1][internal_nodes1] - cluster2 = X[labels == cluster_id2][internal_nodes2] - distance_matrix = cdist(cluster1, cluster2, metric, **kwd_args) - - core_dist_matrix1 = np.tile( - core_distances1[internal_nodes1], (distance_matrix.shape[1], 1) - ).T - core_dist_matrix2 = np.tile( - core_distances2[internal_nodes2], (distance_matrix.shape[0], 1) - ) - - mr_dist_matrix = np.dstack( - [distance_matrix, core_dist_matrix1, core_dist_matrix2] - ).max(axis=-1) - - return mr_dist_matrix.min() - - -def validity_index( - X, labels, metric="euclidean", d=None, per_cluster_scores=False, metric_params=None -): - """ - Compute the density based cluster validity index. - - Compute the density based cluster validity index for the - clustering specified by `labels` and for each cluster in `labels`. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) or (n_samples, n_samples) - The input data of the clustering. - - If `metric=precomputed` this is treated as the pairwise distance matrix - used for the clustering. - - labels : array-like (n_samples) - The label array output by the clustering, providing an integral - cluster label to each data point, with -1 for noise points. - - metric : str, default='euclidean' - The metric used to compute distances for the clustering (and - to be re-used in computing distances for mr distance). If - set to `precomputed` then X is assumed to be the precomputed - distance matrix between samples. - - d : int, default=None - The number of features (dimension) of the dataset. This need only - be set in the case of metric being set to `precomputed`, where - the ambient dimension of the data is unknown to the function. - - per_cluster_scores : bool, default=False - Whether to return the validity index for individual clusters. - Defaults to False with the function returning a single float - value for the whole clustering. - - metric_params : dict, default=None - Arguments passed to the distance metric. - - Returns - ------- - validity_index : float - The density based cluster validity index for the clustering. This - is a numeric value between -1 and 1, with higher values indicating - a 'better' clustering. - - per_cluster_validity_index : array (n_clusters,) - The cluster validity index of each individual cluster as an array. - The overall validity index is the weighted average of these values. - Only returned if per_cluster_scores is set to True. - - References - ---------- - Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J., - 2014. Density-Based Clustering Validation. In SDM (pp. 839-847). 
- """ - core_distances = {} - density_sparseness = {} - mst_nodes = {} - mst_edges = {} - metric_params = metric_params or {} - - max_cluster_id = labels.max() + 1 - density_sep = np.inf * np.ones((max_cluster_id, max_cluster_id), dtype=np.float64) - cluster_validity_indices = np.empty(max_cluster_id, dtype=np.float64) - - for cluster_id in range(max_cluster_id): - - if np.sum(labels == cluster_id) == 0: - continue - - mr_distances, core_distances[cluster_id] = all_points_mutual_reachability( - X, labels, cluster_id, metric, d, **metric_params - ) - - mst_nodes[cluster_id], mst_edges[cluster_id] = internal_minimum_spanning_tree( - mr_distances - ) - density_sparseness[cluster_id] = mst_edges[cluster_id].T[2].max() - - for i in range(max_cluster_id): - - if np.sum(labels == i) == 0: - continue - - internal_nodes_i = mst_nodes[i] - for j in range(i + 1, max_cluster_id): - - if np.sum(labels == j) == 0: - continue - - internal_nodes_j = mst_nodes[j] - density_sep[i, j] = density_separation( - X, - labels, - i, - j, - internal_nodes_i, - internal_nodes_j, - core_distances[i], - core_distances[j], - metric=metric, - **metric_params, - ) - density_sep[j, i] = density_sep[i, j] - - n_samples = float(X.shape[0]) - result = 0 - - for i in range(max_cluster_id): - - if np.sum(labels == i) == 0: - continue - - min_density_sep = density_sep[i].min() - cluster_validity_indices[i] = (min_density_sep - density_sparseness[i]) / max( - min_density_sep, density_sparseness[i] - ) - cluster_size = np.sum(labels == i) - result += (cluster_size / n_samples) * cluster_validity_indices[i] - - if per_cluster_scores: - return result, cluster_validity_indices - else: - return result diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index b88939e5da741..4c8ecee1b8c0d 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -8,8 +8,7 @@ from scipy import stats from sklearn.utils._testing import assert_array_almost_equal from sklearn.cluster import HDBSCAN, hdbscan -from sklearn.cluster._hdbscan._validity import validity_index - +from sklearn.metrics import fowlkes_mallows_score from sklearn.datasets import make_blobs from sklearn.utils import shuffle from sklearn.preprocessing import StandardScaler @@ -85,8 +84,10 @@ def test_hdbscan_distance_matrix(): n_clusters_2 = len(set(labels)) - int(-1 in labels) assert n_clusters_2 == n_clusters - validity = validity_index(D, labels, metric="precomputed", d=2) - assert validity >= 0.6 + # Check that clustering is arbitrarily good + # This is a heuristic to guard against regression + score = fowlkes_mallows_score(y, labels) + assert score >= 0.98 def test_hdbscan_sparse_distance_matrix(): @@ -118,8 +119,10 @@ def test_hdbscan_feature_vector(): n_clusters_2 = len(set(labels)) - int(-1 in labels) assert n_clusters_2 == n_clusters - validity = validity_index(X, labels) - assert validity >= 0.4 + # Check that clustering is arbitrarily good + # This is a heuristic to guard against regression + score = fowlkes_mallows_score(y, labels) + assert score >= 0.98 @pytest.mark.parametrize( @@ -376,3 +379,21 @@ def test_hdbscan_allow_single_cluster_with_epsilon(): unique_labels, counts = np.unique(labels, return_counts=True) assert len(unique_labels) == 2 assert counts[unique_labels == -1] == 2 + + +def test_hdbscan_not_dbscan(): + """ + Validate that HDBSCAN can properly cluster this difficult synthetic + dataset. 
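The test changes above replace the removed density-based validity index with `fowlkes_mallows_score` as the regression guard on clustering quality. A small self-contained sketch of that style of check follows; the dataset, threshold, and `min_cluster_size` value are illustrative only, and it assumes an `HDBSCAN` estimator importable from `sklearn.cluster` as this branch provides.

from sklearn.cluster import HDBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import fowlkes_mallows_score

# Three well-separated blobs; HDBSCAN should recover them almost exactly.
X, y = make_blobs(n_samples=200, centers=3, cluster_std=0.5, random_state=0)
labels = HDBSCAN(min_cluster_size=10).fit(X).labels_
# Noise points (label -1) simply count as one more "cluster" for this heuristic.
score = fowlkes_mallows_score(y, labels)
assert score >= 0.9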
Note that DBSCAN fails on this (see HDBSCAN plotting + example) + """ + centers = [[-0.85, -0.85], [-0.85, 0.85], [3, 3], [3, -3]] + X, _ = make_blobs( + n_samples=750, + centers=centers, + cluster_std=[0.2, 0.35, 1.35, 1.35], + random_state=0, + ) + hdb = HDBSCAN().fit(X) + n_clusters = len(set(hdb.labels_)) - int(-1 in hdb.labels_) + assert n_clusters == 4 From d7c449a09ab0e3daf9d669a235955be8bd9b1abc Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Mon, 27 Jun 2022 17:37:22 -0400 Subject: [PATCH 076/160] Minor cosmetic changes to tests --- sklearn/cluster/_hdbscan/tests/test_hdbscan.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index 4c8ecee1b8c0d..865a82098849f 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -213,17 +213,12 @@ def test_hdbscan_high_dimensional(): def test_hdbscan_best_balltree_metric(): - labels = hdbscan(X, metric="seuclidean", metric_params={"V": np.ones(X.shape[1])})[ - 0 - ] + kwargs = dict(metric="seuclidean", metric_params={"V": np.ones(X.shape[1])}) + labels, _ = hdbscan(X, **kwargs) n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters - labels = ( - HDBSCAN(metric="seuclidean", metric_params={"V": np.ones(X.shape[1])}) - .fit(X) - .labels_ - ) + labels = HDBSCAN(**kwargs).fit(X).labels_ n_clusters_2 = len(set(labels)) - int(-1 in labels) assert n_clusters_2 == n_clusters @@ -381,7 +376,7 @@ def test_hdbscan_allow_single_cluster_with_epsilon(): assert counts[unique_labels == -1] == 2 -def test_hdbscan_not_dbscan(): +def test_hdbscan_better_than_dbscan(): """ Validate that HDBSCAN can properly cluster this difficult synthetic dataset. Note that DBSCAN fails on this (see HDBSCAN plotting From 3710209d01e6a026030873953957146efc4877af Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Mon, 27 Jun 2022 19:14:34 -0400 Subject: [PATCH 077/160] Refactored boruvka cython --- sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx | 754 +++--------------- sklearn/cluster/_hdbscan/hdbscan_.py | 94 +-- .../cluster/_hdbscan/tests/test_hdbscan.py | 2 +- 3 files changed, 132 insertions(+), 718 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx b/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx index 1aa29db3509d4..8fc3de214415c 100644 --- a/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx +++ b/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx @@ -240,8 +240,7 @@ cdef class BoruvkaUnionFind (object): def _core_dist_query(tree, data, min_samples): return tree.query(data, k=min_samples, dualtree=True, breadth_first=True) - -cdef class KDTreeBoruvkaAlgorithm (object): +cdef class BoruvkaAlgorithm (object): """A Dual Tree Boruvka Algorithm implemented for the sklearn KDTree space tree implementation. 
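For readers following the merge of `KDTreeBoruvkaAlgorithm` and `BallTreeBoruvkaAlgorithm` into a single class, the pattern is roughly the one sketched below in plain Python (illustrative only, not the Cython implementation): the tree type is detected once at construction and an `is_KDTree`-style flag selects the few places where the two trees differ, for example KD-trees working in rdist (squared distance for euclidean) while ball trees use true distances.

import numpy as np
from sklearn.neighbors import BallTree, KDTree

class UnifiedDualTreeSketch:
    def __init__(self, tree):
        self.tree = tree
        self.is_kdtree = isinstance(tree, KDTree)

    def to_internal(self, distance):
        # KD-trees compare rdist values (squared distance for euclidean)
        # to avoid square roots; ball trees already hold true distances.
        return distance ** 2 if self.is_kdtree else distance

X = np.random.RandomState(0).rand(20, 3)
print(UnifiedDualTreeSketch(KDTree(X)).to_internal(3.0))   # 9.0
print(UnifiedDualTreeSketch(BallTree(X)).to_internal(3.0)) # 3.0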
@@ -293,6 +292,7 @@ cdef class KDTreeBoruvkaAlgorithm (object): cdef np.intp_t num_points cdef np.intp_t num_nodes cdef np.intp_t num_features + cdef bint is_KDTree cdef public np.double_t[::1] core_distance cdef public np.double_t[::1] bounds @@ -330,8 +330,8 @@ cdef class KDTreeBoruvkaAlgorithm (object): alpha=1.0, approx_min_span_tree=False, n_jobs=4, **kwargs): self.core_dist_tree = tree - self.tree = KDTree(tree.data, metric=metric, leaf_size=leaf_size, - **kwargs) + self.tree = tree + self.is_KDTree = isinstance(tree, KDTree) self._data = np.array(self.tree.data) self._raw_data = self.tree.data self.node_bounds = self.tree.node_bounds @@ -375,13 +375,14 @@ cdef class KDTreeBoruvkaAlgorithm (object): self.candidate_distance = ( ( self.candidate_distance_arr.data)) - # self._centroid_distances_arr = self.dist.pairwise( - # self.tree.node_bounds[0]) - # self.centroid_distances = ( - # ( - # - # self._centroid_distances_arr.data)) + if not self.is_KDTree: + # Compute centroids for BallTree + self._centroid_distances_arr = self.dist.pairwise(self.tree.node_bounds[0]) + self.centroid_distances = ( + ( + + self._centroid_distances_arr.data)) self._initialize_components() self._compute_bounds() @@ -389,8 +390,7 @@ cdef class KDTreeBoruvkaAlgorithm (object): # Set up fast pointer access to arrays self.component_of_point_ptr = &self.component_of_point[0] self.component_of_node_ptr = &self.component_of_node[0] - self.candidate_distance_ptr = ( - &self.candidate_distance[0]) + self.candidate_distance_ptr = &self.candidate_distance[0] self.candidate_neighbor_ptr = &self.candidate_neighbor[0] self.candidate_point_ptr = &self.candidate_point[0] self.core_distance_ptr = &self.core_distance[0] @@ -436,12 +436,14 @@ cdef class KDTreeBoruvkaAlgorithm (object): self.core_distance = ( ( self.core_distance_arr.data)) - # Since we do everything in terms of rdist to free up the GIL - # we need to convert all the core distances beforehand - # to make comparison feasible. - for n in range(self.num_points): - self.core_distance[n] = self.dist._dist_to_rdist( - self.core_distance[n]) + + if self.is_KDTree: + # Since we do everything in terms of rdist to free up the GIL + # we need to convert all the core distances beforehand + # to make comparison feasible. 
+ for n in range(self.num_points): + self.core_distance[n] = self.dist._dist_to_rdist( + self.core_distance[n]) # Since we already computed NN distances for the min_samples closest # points we can use this to do the first round of boruvka -- we won't @@ -526,8 +528,11 @@ cdef class KDTreeBoruvkaAlgorithm (object): continue self.edges[self.num_edges, 0] = source self.edges[self.num_edges, 1] = sink - self.edges[self.num_edges, 2] = self.dist._rdist_to_dist( - self.candidate_distance[component]) + if self.is_KDTree: + self.edges[self.num_edges, 2] = self.dist._rdist_to_dist( + self.candidate_distance[component]) + else: + self.edges[self.num_edges, 2] = self.candidate_distance[component] self.num_edges += 1 self.component_union_find.union_(source, sink) @@ -630,6 +635,7 @@ cdef class KDTreeBoruvkaAlgorithm (object): cdef np.double_t d cdef np.double_t mr_dist + cdef np.double_t _radius cdef np.double_t new_bound cdef np.double_t new_upper_bound @@ -643,9 +649,16 @@ cdef class KDTreeBoruvkaAlgorithm (object): cdef np.double_t right_dist # Compute the distance between the query and reference nodes - node_dist = kdtree_min_rdist_dual(self.dist, - node1, node2, self.node_bounds, - self.num_features) + if self.is_KDTree: + node_dist = kdtree_min_rdist_dual(self.dist, + node1, node2, self.node_bounds, + self.num_features) + else: #BallTree + node_dist = balltree_min_dist_dual(node1_info.radius, + node2_info.radius, + node1, node2, + self.centroid_distances) + # If the distance between the nodes is less than the current bound for # the query and the nodes are not in the same component continue; @@ -713,14 +726,14 @@ cdef class KDTreeBoruvkaAlgorithm (object): continue if component1 != component2: - - d = self.dist.rdist(&raw_data[self.num_features * p], + if self.is_KDTree: + d = self.dist.rdist(&raw_data[self.num_features * p], + &raw_data[self.num_features * q], + self.num_features) + else: + d = self.dist.dist(&raw_data[self.num_features * p], &raw_data[self.num_features * q], - self.num_features) - - # mr_dist = max(distances[i, j], - # self.core_distance_ptr[p], - # self.core_distance_ptr[q]) + self.num_features) * self.alpha if self.alpha != 1.0: mr_dist = max(d / self.alpha, self.core_distance_ptr[p], @@ -741,9 +754,9 @@ cdef class KDTreeBoruvkaAlgorithm (object): # Compute new bounds for the query node, and # then propagate the results of that computation # up the tree. 
+ _radius = self.dist._dist_to_rdist(node1_info.radius) if self.is_KDTree else node1_info.radius new_bound = min(new_upper_bound, - new_lower_bound + 2 * self.dist._dist_to_rdist(node1_info.radius)) - # new_bound = new_upper_bound + new_lower_bound + 2 * _radius) if new_bound < self.bounds_ptr[node1]: self.bounds_ptr[node1] = new_bound @@ -757,9 +770,21 @@ cdef class KDTreeBoruvkaAlgorithm (object): left_info = self.node_data[left] right_info = self.node_data[right] - new_bound = max(self.bounds_ptr[left], + bound_max = max(self.bounds_ptr[left], self.bounds_ptr[right]) + if self.is_KDTree: + new_bound = bound_max + else: + bound_min = min(self.bounds_ptr[left] + 2 * + (parent_info.radius - left_info.radius), + self.bounds_ptr[right] + 2 * + (parent_info.radius - right_info.radius)) + + if bound_min > 0: + new_bound = min(bound_max, bound_min) + else: + new_bound = bound_max if new_bound < self.bounds_ptr[parent]: self.bounds_ptr[parent] = new_bound node1 = parent @@ -779,19 +804,26 @@ cdef class KDTreeBoruvkaAlgorithm (object): left = 2 * node2 + 1 right = 2 * node2 + 2 - node2_info = self.node_data[left] - - left_dist = kdtree_min_rdist_dual(self.dist, - node1, left, - self.node_bounds, - self.num_features) - - node2_info = self.node_data[right] - - right_dist = kdtree_min_rdist_dual(self.dist, - node1, right, - self.node_bounds, - self.num_features) + if self.is_KDTree: + left_dist = kdtree_min_rdist_dual(self.dist, + node1, left, + self.node_bounds, + self.num_features) + right_dist = kdtree_min_rdist_dual(self.dist, + node1, right, + self.node_bounds, + self.num_features) + else: + node2_info = self.node_data[left] + left_dist = balltree_min_dist_dual(node1_info.radius, + node2_info.radius, + node1, left, + self.centroid_distances) + node2_info = self.node_data[right] + right_dist = balltree_min_dist_dual(node1_info.radius, + node2_info.radius, + node1, right, + self.centroid_distances) if left_dist < right_dist: self.dual_tree_traversal(node1, left) @@ -810,617 +842,27 @@ cdef class KDTreeBoruvkaAlgorithm (object): else: left = 2 * node1 + 1 right = 2 * node1 + 2 - - node1_info = self.node_data[left] - - left_dist = kdtree_min_rdist_dual(self.dist, - left, node2, - self.node_bounds, - self.num_features) - - node1_info = self.node_data[right] - - right_dist = kdtree_min_rdist_dual(self.dist, - right, node2, - self.node_bounds, - self.num_features) - - if left_dist < right_dist: - self.dual_tree_traversal(left, node2) - self.dual_tree_traversal(right, node2) + if self.is_KDTree: + left_dist = kdtree_min_rdist_dual(self.dist, + left, node2, + self.node_bounds, + self.num_features) + right_dist = kdtree_min_rdist_dual(self.dist, + right, node2, + self.node_bounds, + self.num_features) else: - self.dual_tree_traversal(right, node2) - self.dual_tree_traversal(left, node2) - - return 0 - - def spanning_tree(self): - """Compute the minimum spanning tree of the data held by - the tree passed in at construction""" - - # cdef np.intp_t num_components - # cdef np.intp_t num_nodes - - num_components = self.tree.data.shape[0] - num_nodes = self.tree.node_data.shape[0] - iteration = 0 - while num_components > 1: - self.dual_tree_traversal(0, 0) - num_components = self.update_components() - - return self.edges - - -cdef class BallTreeBoruvkaAlgorithm (object): - """A Dual Tree Boruvka Algorithm implemented for the sklearn - BallTree space tree implementation. - - Parameters - ---------- - - tree : BallTree - The ball-tree to run Dual Tree Boruvka over. 
- - min_samples : int, optional (default=5) - The min_samples parameter of HDBSCAN used to - determine core distances. - - metric : string, optional (default='euclidean') - The metric used to compute distances for the tree - - leaf_size : int, optional (default=20) - The Boruvka algorithm benefits from a smaller leaf size than - standard kd-tree nearest neighbor searches. The tree passed in - is used for a kNN search for core distance. A second tree is - constructed with a smaller leaf size for Boruvka; this is that - leaf size. - - alpha : float, optional (default=1.0) - The alpha distance scaling parameter as per Robust Single Linkage. - - approx_min_span_tree : bool (default False) - Take shortcuts and only approximate the min spanning tree. - This is considerably faster but does not return a true - minimal spanning tree. - - n_jobs : int, optional (default=4) - The number of parallel jobs used to compute core distances. - - **kwargs : - Keyword args passed to the metric. - """ - - cdef object tree - cdef object core_dist_tree - cdef DistanceMetric dist - cdef np.ndarray _data - cdef np.double_t[:, ::1] _raw_data - cdef np.double_t alpha - cdef np.int8_t approx_min_span_tree - cdef np.intp_t n_jobs - cdef np.intp_t min_samples - cdef np.intp_t num_points - cdef np.intp_t num_nodes - cdef np.intp_t num_features - - cdef public np.double_t[::1] core_distance - cdef public np.double_t[::1] bounds - cdef public np.intp_t[::1] component_of_point - cdef public np.intp_t[::1] component_of_node - cdef public np.intp_t[::1] candidate_neighbor - cdef public np.intp_t[::1] candidate_point - cdef public np.double_t[::1] candidate_distance - cdef public np.double_t[:, ::1] centroid_distances - cdef public np.intp_t[::1] idx_array - cdef public NodeData_t[::1] node_data - cdef BoruvkaUnionFind component_union_find - cdef np.ndarray edges - cdef np.intp_t num_edges - - cdef np.intp_t *component_of_point_ptr - cdef np.intp_t *component_of_node_ptr - cdef np.double_t *candidate_distance_ptr - cdef np.intp_t *candidate_neighbor_ptr - cdef np.intp_t *candidate_point_ptr - cdef np.double_t *core_distance_ptr - cdef np.double_t *bounds_ptr - - cdef np.ndarray components - cdef np.ndarray core_distance_arr - cdef np.ndarray bounds_arr - cdef np.ndarray _centroid_distances_arr - cdef np.ndarray component_of_point_arr - cdef np.ndarray component_of_node_arr - cdef np.ndarray candidate_point_arr - cdef np.ndarray candidate_neighbor_arr - cdef np.ndarray candidate_distance_arr - - def __init__(self, tree, min_samples=5, metric='euclidean', - alpha=1.0, leaf_size=20, approx_min_span_tree=False, n_jobs=4, - **kwargs): - - self.core_dist_tree = tree - self.tree = BallTree(tree.data, metric=metric, leaf_size=leaf_size, - **kwargs) - self._data = np.array(self.tree.data) - self._raw_data = self.tree.data - self.min_samples = min_samples - self.alpha = alpha - self.approx_min_span_tree = approx_min_span_tree - self.n_jobs = n_jobs - - self.num_points = self.tree.data.shape[0] - self.num_features = self.tree.data.shape[1] - self.num_nodes = self.tree.node_data.shape[0] - - self.dist = DistanceMetric.get_metric(metric, **kwargs) - - self.components = np.arange(self.num_points) - self.bounds_arr = np.empty(self.num_nodes, np.double) - self.component_of_point_arr = np.empty(self.num_points, dtype=np.intp) - self.component_of_node_arr = np.empty(self.num_nodes, dtype=np.intp) - self.candidate_neighbor_arr = np.empty(self.num_points, dtype=np.intp) - self.candidate_point_arr = np.empty(self.num_points, dtype=np.intp) - 
self.candidate_distance_arr = np.empty(self.num_points, - dtype=np.double) - self.component_union_find = BoruvkaUnionFind(self.num_points) - - self.edges = np.empty((self.num_points - 1, 3)) - self.num_edges = 0 - - self.idx_array = self.tree.idx_array - self.node_data = self.tree.node_data - - self.bounds = ( ( - self.bounds_arr.data)) - self.component_of_point = ( ( - self.component_of_point_arr.data)) - self.component_of_node = ( ( - self.component_of_node_arr.data)) - self.candidate_neighbor = ( ( - self.candidate_neighbor_arr.data)) - self.candidate_point = ( ( - self.candidate_point_arr.data)) - self.candidate_distance = ( ( - self.candidate_distance_arr.data)) - - self._centroid_distances_arr = self.dist.pairwise( - self.tree.node_bounds[0]) - self.centroid_distances = ( - ( - self._centroid_distances_arr.data)) - - self._initialize_components() - self._compute_bounds() - - # Set up fast pointer access to arrays - self.component_of_point_ptr = &self.component_of_point[0] - self.component_of_node_ptr = &self.component_of_node[0] - self.candidate_distance_ptr = &self.candidate_distance[0] - self.candidate_neighbor_ptr = &self.candidate_neighbor[0] - self.candidate_point_ptr = &self.candidate_point[0] - self.core_distance_ptr = &self.core_distance[0] - self.bounds_ptr = &self.bounds[0] - - cdef _compute_bounds(self): - """Initialize core distances""" - - cdef np.intp_t n - cdef np.intp_t i - cdef np.intp_t m - - cdef np.ndarray[np.double_t, ndim=2] knn_dist - cdef np.ndarray[np.intp_t, ndim=2] knn_indices - - if self.tree.data.shape[0] > 16384 and self.n_jobs > 1: - split_cnt = self.num_points // self.n_jobs - datasets = [] - for i in range(self.n_jobs): - if i == self.n_jobs - 1: - datasets.append(np.asarray(self.tree.data[i*split_cnt:])) - else: - datasets.append(np.asarray(self.tree.data[i*split_cnt:(i+1)*split_cnt])) - - knn_data = Parallel(n_jobs=self.n_jobs, max_nbytes=None)( - delayed(_core_dist_query) - (self.core_dist_tree, points, - self.min_samples + 1) - for points in datasets) - knn_dist = np.vstack([x[0] for x in knn_data]) - knn_indices = np.vstack([x[1] for x in knn_data]) - else: - knn_dist, knn_indices = self.core_dist_tree.query( - self.tree.data, - k=self.min_samples + 1, - dualtree=True, - breadth_first=True) - - self.core_distance_arr = knn_dist[:, self.min_samples].copy() - self.core_distance = ( ( - self.core_distance_arr.data)) - - # Since we already computed NN distances for the min_samples closest - # points we can use this to do the first round of boruvka -- we won't - # get every point due to core_distance/mutual reachability distance - # issues, but we'll get quite a few, and they are the hard ones to get, - # so fill in any we can and then run update components. 
- for n in range(self.num_points): - for i in range(0, self.min_samples + 1): - m = knn_indices[n, i] - if n == m: - continue - if self.core_distance[m] <= self.core_distance[n]: - self.candidate_point[n] = n - self.candidate_neighbor[n] = m - self.candidate_distance[n] = self.core_distance[n] - break - - self.update_components() - - for n in range(self.num_nodes): - self.bounds_arr[n] = DBL_MAX - - cdef _initialize_components(self): - """Initialize components of the min spanning tree (eventually there - is only one component; initially each point is its own component)""" - - cdef np.intp_t n - - for n in range(self.num_points): - self.component_of_point[n] = n - self.candidate_neighbor[n] = -1 - self.candidate_point[n] = -1 - self.candidate_distance[n] = DBL_MAX - - for n in range(self.num_nodes): - self.component_of_node[n] = -(n+1) + node1_info = self.node_data[left] + left_dist = balltree_min_dist_dual(node1_info.radius, + node2_info.radius, + left, node2, + self.centroid_distances) + node1_info = self.node_data[right] + right_dist = balltree_min_dist_dual(node1_info.radius, + node2_info.radius, + right, node2, + self.centroid_distances) - cdef update_components(self): - """Having found the nearest neighbor not in the same component for - each current component (via tree traversal), run through adding - edges to the min spanning tree and recomputing components via - union find.""" - - cdef np.intp_t source - cdef np.intp_t sink - cdef np.intp_t c - cdef np.intp_t component - cdef np.intp_t n - cdef np.intp_t i - cdef np.intp_t p - cdef np.intp_t current_component - cdef np.intp_t current_source_component - cdef np.intp_t current_sink_component - cdef np.intp_t child1 - cdef np.intp_t child2 - - cdef NodeData_t node_info - - # For each component there should be a: - # - candidate point (a point in the component) - # - candiate neighbor (the point to join with) - # - candidate_distance (the distance from point to neighbor) - # - # We will go through and and an edge to the edge list - # for each of these, and the union the two points - # together in the union find structure - - for c in range(self.components.shape[0]): - component = self.components[c] - source = self.candidate_point[component] - sink = self.candidate_neighbor[component] - if source == -1 or sink == -1: - continue - # raise ValueError('Source or sink of edge is not defined!') - current_source_component = self.component_union_find.find(source) - current_sink_component = self.component_union_find.find(sink) - if current_source_component == current_sink_component: - self.candidate_point[component] = -1 - self.candidate_neighbor[component] = -1 - self.candidate_distance[component] = DBL_MAX - continue - self.edges[self.num_edges, 0] = source - self.edges[self.num_edges, 1] = sink - self.edges[self.num_edges, 2] = self.candidate_distance[component] - self.num_edges += 1 - - self.component_union_find.union_(source, sink) - - self.candidate_distance[component] = DBL_MAX - if self.num_edges == self.num_points - 1: - self.components = self.component_union_find.components() - return self.components.shape[0] - - # After having joined everything in the union find data - # structure we need to go through and determine the components - # of each point for easy lookup. - # - # Have done that we then go through and set the component - # of each node, as this provides fast pruning in later - # tree traversals. 
- for n in range(self.tree.data.shape[0]): - self.component_of_point[n] = self.component_union_find.find(n) - - for n in range(self.tree.node_data.shape[0] - 1, -1, -1): - node_info = self.node_data[n] - # Case 1: - # If the node is a leaf we need to check that every point - # in the node is of the same component - if node_info.is_leaf: - current_component = self.component_of_point[self.idx_array[ - node_info.idx_start]] - for i in range(node_info.idx_start + 1, node_info.idx_end): - p = self.idx_array[i] - if self.component_of_point[p] != current_component: - break - else: - self.component_of_node[n] = current_component - # Case 2: - # If the node is not a leaf we only need to check - # that both child nodes are in the same component - else: - child1 = 2 * n + 1 - child2 = 2 * n + 2 - if self.component_of_node[child1] == self.component_of_node[child2]: - self.component_of_node[n] = self.component_of_node[child1] - - # Since we're working with mutual reachability distance we often have - # ties or near ties; because of that we can benefit by not resetting the - # bounds unless we get stuck (don't join any components). Thus - # we check for that, and only reset bounds in the case where we have - # the same number of components as we did going in. This doesn't - # produce a true min spanning tree, but only and approximation - # Thus only do this if the caller is willing to accept such - if self.approx_min_span_tree: - last_num_components = self.components.shape[0] - self.components = self.component_union_find.components() - - if self.components.shape[0] == last_num_components: - # Reset bounds - for n in range(self.num_nodes): - self.bounds_arr[n] = DBL_MAX - else: - self.components = self.component_union_find.components() - - for n in range(self.num_nodes): - self.bounds_arr[n] = DBL_MAX - - return self.components.shape[0] - - cdef int dual_tree_traversal(self, np.intp_t node1, - np.intp_t node2) except -1: - """Perform a dual tree traversal, pruning wherever possible, to find - the nearest neighbor not in the same component for each component. - This is akin to a standard dual tree NN search, but we also prune - whenever all points in query and reference nodes are in the same - component.""" - - cdef np.intp_t[::1] point_indices1, point_indices2 - - cdef np.intp_t i - cdef np.intp_t j - - cdef np.intp_t p - cdef np.intp_t q - - cdef np.intp_t parent - cdef np.intp_t child1 - cdef np.intp_t child2 - - cdef double node_dist - - cdef NodeData_t node1_info = self.node_data[node1] - cdef NodeData_t node2_info = self.node_data[node2] - cdef NodeData_t parent_info - cdef NodeData_t left_info - cdef NodeData_t right_info - - cdef np.intp_t component1 - cdef np.intp_t component2 - - cdef np.double_t *raw_data = ( &self._raw_data[0, 0]) - cdef np.double_t d - - cdef np.double_t mr_dist - - cdef np.double_t new_bound - cdef np.double_t new_upper_bound - cdef np.double_t new_lower_bound - cdef np.double_t bound_max - cdef np.double_t bound_min - - cdef np.intp_t left - cdef np.intp_t right - cdef np.double_t left_dist - cdef np.double_t right_dist - - node_dist = balltree_min_dist_dual(node1_info.radius, - node2_info.radius, - node1, node2, - self.centroid_distances) - - # If the distance between the nodes is less than the current bound for - # the query and the nodes are not in the same component continue; - # otherwise we get to prune this branch and return early. 
- if node_dist < self.bounds_ptr[node1]: - if self.component_of_node_ptr[node1] == self.component_of_node_ptr[ - node2] and self.component_of_node_ptr[node1] >= 0: - return 0 - else: - return 0 - - # Case 1: Both nodes are leaves - # for each pair of points in node1 x node2 we need - # to compute the distance and see if it better than - # the current nearest neighbor for the component of - # the point in the query node. - # - # We get to take some shortcuts: - # - if the core distance for a point is larger than - # the distance to the nearst neighbor of the - # component of the point ... then we can't get - # a better mutual reachability distance and we - # can skip computing anything for that point - # - if the points are in the same component we - # don't have to compute the distance. - # - # We also have some catches: - # - we need to compute mutual reachability distance - # not just the ordinary distance; this involves - # fiddling with core distances. - # - We need to scale distances according to alpha, - # but don't want to lose performance in the case - # that alpha is 1.0. - # - # Finally we can compute new bounds for the query node - # based on the distances found here, so do that and - # propagate the results up the tree. - if node1_info.is_leaf and node2_info.is_leaf: - - new_bound = 0.0 - - point_indices1 = self.idx_array[node1_info.idx_start: - node1_info.idx_end] - point_indices2 = self.idx_array[node2_info.idx_start: - node2_info.idx_end] - - for i in range(point_indices1.shape[0]): - - p = point_indices1[i] - component1 = self.component_of_point_ptr[p] - - if self.core_distance_ptr[p] > self.candidate_distance_ptr[ - component1]: - continue - - for j in range(point_indices2.shape[0]): - - q = point_indices2[j] - component2 = self.component_of_point_ptr[q] - - if self.core_distance_ptr[q] > self.candidate_distance_ptr[ - component1]: - continue - - if component1 != component2: - - d = self.dist.dist(&raw_data[self.num_features * p], - &raw_data[self.num_features * q], - self.num_features) * self.alpha - - if self.alpha != 1.0: - mr_dist = max(d / self.alpha, - self.core_distance_ptr[p], - self.core_distance_ptr[q]) - else: - mr_dist = max(d, self.core_distance_ptr[p], - self.core_distance_ptr[q]) - - if mr_dist < self.candidate_distance_ptr[component1]: - self.candidate_distance_ptr[component1] = mr_dist - self.candidate_neighbor_ptr[component1] = q - self.candidate_point_ptr[component1] = p - - new_upper_bound = max(new_upper_bound, - self.candidate_distance_ptr[component1]) - new_lower_bound = min(new_lower_bound, - self.candidate_distance_ptr[component1]) - - # Compute new bounds for the query node, and - # then propagate the results of that computation - # up the tree. 
- new_bound = min(new_upper_bound, - new_lower_bound + 2 * node1_info.radius) - if new_bound < self.bounds_ptr[node1]: - self.bounds_ptr[node1] = new_bound - - # Propagate bounds up the tree - while node1 > 0: - parent = (node1 - 1) // 2 - left = 2 * parent + 1 - right = 2 * parent + 2 - - parent_info = self.node_data[parent] - left_info = self.node_data[left] - right_info = self.node_data[right] - - bound_max = max(self.bounds_ptr[left], - self.bounds_ptr[right]) - bound_min = min(self.bounds_ptr[left] + 2 * - (parent_info.radius - left_info.radius), - self.bounds_ptr[right] + 2 * - (parent_info.radius - right_info.radius)) - - if bound_min > 0: - new_bound = min(bound_max, bound_min) - else: - new_bound = bound_max - - if new_bound < self.bounds_ptr[parent]: - self.bounds_ptr[parent] = new_bound - node1 = parent - else: - break - - # Case 2a: The query node is a leaf, or is smaller than - # the reference node. - # - # We descend in the reference tree. We first - # compute distances between nodes to determine - # whether we should prioritise the left or - # right branch in the reference tree. - elif node1_info.is_leaf or (not node2_info.is_leaf and - node2_info.radius > node1_info.radius): - - left = 2 * node2 + 1 - right = 2 * node2 + 2 - - node2_info = self.node_data[left] - - left_dist = balltree_min_dist_dual(node1_info.radius, - node2_info.radius, - node1, left, - self.centroid_distances) - - node2_info = self.node_data[right] - - right_dist = balltree_min_dist_dual(node1_info.radius, - node2_info.radius, - node1, right, - self.centroid_distances) - - if left_dist < right_dist: - self.dual_tree_traversal(node1, left) - self.dual_tree_traversal(node1, right) - else: - self.dual_tree_traversal(node1, right) - self.dual_tree_traversal(node1, left) - - # Case 2b: The reference node is a leaf, or is smaller than - # the query node. - # - # We descend in the query tree. We first - # compute distances between nodes to determine - # whether we should prioritise the left or - # right branch in the query tree. 
- else: - left = 2 * node1 + 1 - right = 2 * node1 + 2 - - node1_info = self.node_data[left] - - left_dist = balltree_min_dist_dual(node1_info.radius, - node2_info.radius, - left, node2, - self.centroid_distances) - - node1_info = self.node_data[right] - - right_dist = balltree_min_dist_dual(node1_info.radius, - node2_info.radius, - right, node2, - self.centroid_distances) if left_dist < right_dist: self.dual_tree_traversal(left, node2) diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index d5591f7f22097..4be399e4b18ff 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -37,10 +37,37 @@ ) from ._hdbscan_reachability import mutual_reachability, sparse_mutual_reachability -from ._hdbscan_boruvka import KDTreeBoruvkaAlgorithm, BallTreeBoruvkaAlgorithm +from ._hdbscan_boruvka import BoruvkaAlgorithm from sklearn.metrics._dist_metrics import DistanceMetric FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics + ["cosine"] +_PARAM_CONSTRAINTS = { + "min_cluster_size": [Interval(Integral, left=2, right=None, closed="left")], + "min_samples": [Interval(Integral, left=1, right=None, closed="left"), None], + "cluster_selection_epsilon": [Interval(Real, left=0, right=None, closed="left")], + "max_cluster_size": [Interval(Integral, left=0, right=None, closed="left")], + "metric": [StrOptions(set(FAST_METRICS + ["precomputed"])), callable], + "alpha": [Interval(Real, left=0, right=None, closed="neither")], + "algorithm": [ + StrOptions( + { + "auto", + "best", + "generic", + "prims_kdtree", + "prims_balltree", + "boruvka_kdtree", + "boruvka_balltree", + } + ) + ], + "leaf_size": [Interval(Integral, left=1, right=None, closed="left")], + "memory": [str, None, Path], + "n_jobs": [int], + "cluster_selection_method": [StrOptions({"eom", "leaf"})], + "allow_single_cluster": ["boolean"], + "metric_params": [dict, None], +} def _tree_to_labels( @@ -246,10 +273,9 @@ def _hdbscan_boruvka( f" but {min_samples+1=}, {n_samples=}" ) - alg = KDTreeBoruvkaAlgorithm if algo == "kd_tree" else BallTreeBoruvkaAlgorithm - out = alg( - tree, - min_samples, + out = BoruvkaAlgorithm( + tree=tree, + min_samples=min_samples, metric=metric, leaf_size=leaf_size // 3, approx_min_span_tree=True, @@ -316,34 +342,8 @@ def get_finite_row_indices(matrix): @validate_params( { + **_PARAM_CONSTRAINTS, "X": ["array-like", "sparse matrix"], - "min_cluster_size": [Interval(Integral, left=2, right=None, closed="left")], - "min_samples": [Interval(Integral, left=1, right=None, closed="left"), None], - "cluster_selection_epsilon": [ - Interval(Real, left=0, right=None, closed="left") - ], - "max_cluster_size": [Interval(Integral, left=0, right=None, closed="left")], - "metric": [StrOptions(set(FAST_METRICS + ["precomputed"])), callable], - "alpha": [Interval(Real, left=0, right=None, closed="neither")], - "algorithm": [ - StrOptions( - { - "auto", - "best", - "generic", - "prims_kdtree", - "prims_balltree", - "boruvka_kdtree", - "boruvka_balltree", - } - ) - ], - "leaf_size": [Interval(Integral, left=1, right=None, closed="left")], - "memory": [str, None, Path], - "n_jobs": [int], - "cluster_selection_method": [StrOptions({"eom", "leaf"})], - "allow_single_cluster": [bool], - "metric_params": [dict, None], } ) def hdbscan( @@ -756,35 +756,7 @@ class HDBSCAN(ClusterMixin, BaseEstimator): array([ 2, 6, -1, ..., -1, -1, -1]) """ - _parameter_constraints = { - "min_cluster_size": [Interval(Integral, left=2, right=None, closed="left")], - "min_samples": 
[Interval(Integral, left=1, right=None, closed="left"), None], - "cluster_selection_epsilon": [ - Interval(Real, left=0, right=None, closed="left") - ], - "max_cluster_size": [Interval(Integral, left=0, right=None, closed="left")], - "metric": [StrOptions(set(FAST_METRICS + ["precomputed"])), callable], - "alpha": [Interval(Real, left=0, right=None, closed="neither")], - "algorithm": [ - StrOptions( - { - "auto", - "best", - "generic", - "prims_kdtree", - "prims_balltree", - "boruvka_kdtree", - "boruvka_balltree", - } - ) - ], - "leaf_size": [Interval(Integral, left=1, right=None, closed="left")], - "memory": [str, None, Path], - "n_jobs": [int], - "cluster_selection_method": [StrOptions({"eom", "leaf"})], - "allow_single_cluster": [bool], - "metric_params": [dict, None], - } + _parameter_constraints = _PARAM_CONSTRAINTS def __init__( self, diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index 865a82098849f..abaa6b8fdd356 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -214,7 +214,7 @@ def test_hdbscan_high_dimensional(): def test_hdbscan_best_balltree_metric(): kwargs = dict(metric="seuclidean", metric_params={"V": np.ones(X.shape[1])}) - labels, _ = hdbscan(X, **kwargs) + labels, _, _ = hdbscan(X, **kwargs) n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters From bf571d9774afd369c6f622d7b27474acfb4a8131 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 28 Jun 2022 14:23:28 -0400 Subject: [PATCH 078/160] Trimmed unnecessary mutual-reachability functions --- .../_hdbscan/_hdbscan_reachability.pyx | 114 ------------------ 1 file changed, 114 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_hdbscan_reachability.pyx b/sklearn/cluster/_hdbscan/_hdbscan_reachability.pyx index e988a4155e9f6..4fa010f365f0a 100644 --- a/sklearn/cluster/_hdbscan/_hdbscan_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_hdbscan_reachability.pyx @@ -94,117 +94,3 @@ cpdef sparse_mutual_reachability(object lil_matrix, np.intp_t min_points=5, result[i, j] = max_dist return result.tocsr() - - -def kdtree_mutual_reachability(X, distance_matrix, metric, p=2, min_points=5, - alpha=1.0, **kwargs): - dim = distance_matrix.shape[0] - min_points = min(dim - 1, min_points) - - if metric == 'minkowski': - tree = KDTree(X, metric=metric, p=p) - else: - tree = KDTree(X, metric=metric, **kwargs) - - core_distances = tree.query(X, k=min_points)[0][:, -1] - - if alpha != 1.0: - distance_matrix = distance_matrix / alpha - - stage1 = np.where(core_distances > distance_matrix, - core_distances, distance_matrix) - result = np.where(core_distances > stage1.T, - core_distances.T, stage1.T).T - return result - - -def balltree_mutual_reachability(X, distance_matrix, metric, p=2, min_points=5, - alpha=1.0, **kwargs): - dim = distance_matrix.shape[0] - min_points = min(dim - 1, min_points) - - tree = BallTree(X, metric=metric, **kwargs) - - core_distances = tree.query(X, k=min_points)[0][:, -1] - - if alpha != 1.0: - distance_matrix = distance_matrix / alpha - - stage1 = np.where(core_distances > distance_matrix, - core_distances, distance_matrix) - result = np.where(core_distances > stage1.T, - core_distances.T, stage1.T).T - return result - - -cdef np.ndarray[np.double_t, ndim=1] mutual_reachability_from_pdist( - np.ndarray[np.double_t, ndim=1] core_distances, - np.ndarray[np.double_t, ndim=1] dists, np.intp_t dim): - - cdef np.intp_t i - cdef np.intp_t j - cdef np.intp_t 
result_pos - - result_pos = 0 - for i in range(dim): - for j in range(i + 1, dim): - if core_distances[i] > core_distances[j]: - if core_distances[i] > dists[result_pos]: - dists[result_pos] = core_distances[i] - - else: - if core_distances[j] > dists[result_pos]: - dists[result_pos] = core_distances[j] - - result_pos += 1 - - return dists - - -def kdtree_pdist_mutual_reachability(X, metric, p=2, min_points=5, alpha=1.0, - **kwargs): - - dim = X.shape[0] - min_points = min(dim - 1, min_points) - - if metric == 'minkowski': - tree = KDTree(X, metric=metric, p=p) - else: - tree = KDTree(X, metric=metric, **kwargs) - - core_distances = tree.query(X, k=min_points)[0][:, -1] - - del tree - gc.collect() - - dists = pdist(X, metric=metric, p=p, **kwargs) - - if alpha != 1.0: - dists /= alpha - - dists = mutual_reachability_from_pdist(core_distances, dists, dim) - - return dists - - -def balltree_pdist_mutual_reachability(X, metric, p=2, min_points=5, alpha=1.0, - **kwargs): - - dim = X.shape[0] - min_points = min(dim - 1, min_points) - - tree = BallTree(X, metric=metric, **kwargs) - - core_distances = tree.query(X, k=min_points)[0][:, -1] - - del tree - gc.collect() - - dists = pdist(X, metric=metric, p=p, **kwargs) - - if alpha != 1.0: - dists /= alpha - - dists = mutual_reachability_from_pdist(core_distances, dists, dim) - - return dists From 997b4cb56b774b3842276d52b64c959912c38263 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 28 Jun 2022 14:29:19 -0400 Subject: [PATCH 079/160] Comments and minor cosmetics --- sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx b/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx index 8fc3de214415c..4d167791b7a61 100644 --- a/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx +++ b/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx @@ -3,10 +3,8 @@ # License: 3-clause BSD # Code to implement a Dual Tree Boruvka Minimimum Spanning Tree computation -# The algorithm is largely tree independent, but fine details of handling -# different tree types has resulted in separate implementations. In -# due course this should be cleaned up to remove unnecessarily duplicated -# code, but it stands for now. +# The algorithm is largely tree independent, but some fine details still +# depend on the particular choice of tree. # # The core idea of the algorithm is to do repeated sweeps through the dataset, # adding edges to the tree with each sweep until a full tree is formed. @@ -27,8 +25,7 @@ # stages. Importantly, we can construct the full tree in O(log N) sweeps # and since each sweep has complexity equal to that of an all points # nearest neighbor query within the tree structure we are using we end -# up with sub-quadratic complexity at worst, and in the case of cover -# trees (still to be implemented) we can achieve O(N log N) complexity! +# up with sub-quadratic complexity at worst. # # This code is based on the papers: # @@ -44,8 +41,8 @@ # 2013, arXiv 1304.4327 # # As per the sklearn BallTree and KDTree implementations we make use of -# the rdist, which is a faster to compute notion of distance (for example -# in the euclidean case it is the distance squared). +# the rdist for KDTree, which is a faster-to-compute notion of distance +# (for example in the euclidean case it is the distance squared). # # To combine together components in between sweeps we make use of # a union find data structure. 
This is a separate implementation @@ -163,7 +160,7 @@ cdef inline np.double_t kdtree_min_rdist_dual( return rdist -cdef class BoruvkaUnionFind (object): +cdef class BoruvkaUnionFind(object): """Efficient union find implementation. Parameters @@ -240,7 +237,7 @@ cdef class BoruvkaUnionFind (object): def _core_dist_query(tree, data, min_samples): return tree.query(data, k=min_samples, dualtree=True, breadth_first=True) -cdef class BoruvkaAlgorithm (object): +cdef class BoruvkaAlgorithm(object): """A Dual Tree Boruvka Algorithm implemented for the sklearn KDTree space tree implementation. From 6a7095c8d61295dcb04b67b84534d0b8c3629065 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 28 Jun 2022 14:38:23 -0400 Subject: [PATCH 080/160] Simplified tests wrt new validation mechanism --- .../cluster/_hdbscan/tests/test_hdbscan.py | 30 ++++--------------- 1 file changed, 5 insertions(+), 25 deletions(-) diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index abaa6b8fdd356..6f91ecada6859 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -283,31 +283,11 @@ def test_hdbscan_boruvka_matches(tree): assert (num_mismatches / float(data.shape[0])) < 0.15 -@pytest.mark.parametrize( - "kwargs, error", - [ - [{"X": "fail"}, ValueError], - [{"X": None}, ValueError], - [{"min_cluster_size": "fail"}, ValueError], - [{"min_samples": "fail"}, ValueError], - [{"min_samples": -1}, ValueError], - [{"metric": "imperial"}, ValueError], - [{"metric": None}, ValueError], - [{"metric": "precomputed", "algorithm": "boruvka_kdtree"}, ValueError], - [{"metric": "precomputed", "algorithm": "prims_kdtree"}, ValueError], - [{"metric": "precomputed", "algorithm": "boruvka_balltree"}, ValueError], - [{"metric": "precomputed", "algorithm": "prims_balltree"}, ValueError], - [{"alpha": -1}, ValueError], - [{"alpha": "fail"}, ValueError], - [{"leaf_size": 0}, ValueError], - [{"algorithm": "something_else"}, ValueError], - [{"metric": "minkowski", "metric_params": {"p": None}}, TypeError], - ], -) -def test_hdbscan_badargs(kwargs, error): - _X = kwargs.pop("X", X) - with pytest.raises(error): - hdbscan(_X, **kwargs) +@pytest.mark.parametrize("strategy", ["prims", "boruvka"]) +@pytest.mark.parametrize("tree", ["kd", "ball"]) +def test_hdbscan_precomputed_non_generic(strategy, tree): + with pytest.raises(ValueError): + hdbscan(X, metric="precomputed", algorithm=f"{strategy}_{tree}tree") def test_hdbscan_sparse(): From 54d71eb859e5ac4ba5598434158c0f15f148555f Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Wed, 29 Jun 2022 10:30:45 -0400 Subject: [PATCH 081/160] Update doc/modules/clustering.rst Co-authored-by: Thomas J. Fan --- doc/modules/clustering.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 1ab2818e0680b..b4bf6114268e9 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1002,7 +1002,7 @@ HDBSCAN can be seen as an algorithm which performs DBSCAN* clustering across all values of `eps`. As mentioned prior, this is equivalent to finding the connected components of the mutual reachability graphs for all values of `eps`. 
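As a hedged aside, the mutual reachability weights referred to here can be computed
directly with NumPy for a toy distance matrix; the sketch below mirrors the dense
computation in `_hdbscan_reachability.pyx`, but the matrix and the neighbourhood
size `k` are invented purely for illustration:

    import numpy as np

    # Toy symmetric distance matrix over four points.
    D = np.array([[0.0, 1.0, 3.0, 4.0],
                  [1.0, 0.0, 2.0, 4.5],
                  [3.0, 2.0, 0.0, 1.5],
                  [4.0, 4.5, 1.5, 0.0]])
    k = 2  # plays the role of ``min_points`` / ``min_samples``

    # Core distance of each point: distance to its k-th nearest neighbour
    # (self-distances included, matching np.sort(distance_matrix, axis=0)[min_points]).
    core = np.sort(D, axis=0)[k]

    # Mutual reachability distance: max(core[i], core[j], D[i, j]) for every pair.
    mreach = np.maximum(np.maximum(core[:, np.newaxis], core[np.newaxis, :]), D)

For instance, points 0 and 1 here have core distances 3.0 and 2.0 and pairwise
distance 1.0, so their mutual reachability distance is max(3.0, 2.0, 1.0) = 3.0.
Keeping, for a fixed `eps`, only the edges with weight at most `eps` yields the
mutual reachability graph whose connected components are tracked across all values
of `eps`.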
To do this efficiently, HDBSCAN first extracts a minimum spanning tree (MST) from the fully --connected mutual reachability graph, then greedily cuts the edges with heighest +-connected mutual reachability graph, then greedily cuts the edges with highest weight. An outline of the HDBSCAN algorithm is as follows: 1. Extract the MST of :math:`G_{ms}` From 9162f624a4c2ad63f285197e5734d77ad012459e Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 29 Jun 2022 10:35:35 -0400 Subject: [PATCH 082/160] Improved user guide entry wording per review feedback --- doc/modules/clustering.rst | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 1ab2818e0680b..2ae7054569702 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -960,11 +960,10 @@ HDBSCAN The :class:`HDBSCAN` algorithm can be seen as an extension of :class:`DBSCAN` and :class:`OPTICS`. Specifically, DBSCAN asserts that the clustering criterion -(i.e. density requirement) is *globally homogeneous*. That is to say that if -there are clusters of differing density then DBSCAN may struggle to succesfully -capture them. HDBSCAN alleviates this assumption and explores all possible -density scales by building an alternative representation of the clustering -problem. +(i.e. density requirement) is *globally homogeneous*. In other words, DBSCAN +may struggle to successfully capture clusters with different densities. +HDBSCAN alleviates this assumption and explores all possible density scales by +building an alternative representation of the clustering problem. .. note:: From 4f5f5d6b93f3afec911a0bd4c563f969d724275e Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 29 Jun 2022 16:34:48 -0400 Subject: [PATCH 083/160] Improved testing coverage --- sklearn/cluster/_hdbscan/hdbscan_.py | 47 +++------------ .../cluster/_hdbscan/tests/test_hdbscan.py | 60 +++++++++++++++++-- 2 files changed, 64 insertions(+), 43 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index 4be399e4b18ff..4019f6419fb06 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -19,7 +19,6 @@ from joblib import Memory from warnings import warn from sklearn.utils import check_array, gen_batches, get_chunk_n_rows -from joblib.parallel import cpu_count from sklearn.utils._param_validation import Interval, StrOptions, validate_params from sklearn.neighbors import NearestNeighbors from scipy.sparse import csgraph @@ -40,7 +39,7 @@ from ._hdbscan_boruvka import BoruvkaAlgorithm from sklearn.metrics._dist_metrics import DistanceMetric -FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics + ["cosine"] +FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics _PARAM_CONSTRAINTS = { "min_cluster_size": [Interval(Integral, left=2, right=None, closed="left")], "min_samples": [Interval(Integral, left=1, right=None, closed="left"), None], @@ -110,9 +109,7 @@ def _hdbscan_generic( metric="euclidean", **metric_params, ): - if metric == "arccos": - distance_matrix = pairwise_distances(X, metric="cosine", **metric_params) - elif metric == "precomputed": + if metric == "precomputed": # Treating this case explicitly, instead of letting # sklearn.metrics.pairwise_distances handle it, # enables the usage of numpy.inf in the distance @@ -154,23 +151,12 @@ def _hdbscan_sparse_distance_matrix( **metric_params, ): assert issparse(X) - # Check for connected component on X - if 
csgraph.connected_components(X, directed=False, return_labels=False) > 1: - raise ValueError( - "Sparse distance matrix has multiple connected " - "components!\nThat is, there exist groups of points " - "that are completely disjoint -- there are no distance " - "relations connecting them\n" - "Run hdbscan on each component." - ) - - lil_matrix = X.tolil() # Compute sparse mutual reachability graph # if max_dist > 0, max distance to use when the reachability is infinite max_dist = metric_params.get("max_dist", 0.0) mutual_reachability_ = sparse_mutual_reachability( - lil_matrix, min_points=min_samples, max_dist=max_dist, alpha=alpha + X.tolil(), min_points=min_samples, max_dist=max_dist, alpha=alpha ) # Check connected component on mutual reachability # If more than one component, it means that even if the distance matrix X @@ -258,11 +244,6 @@ def _hdbscan_boruvka( **metric_params, ): leaf_size = max(leaf_size, 3) - - n_jobs = 1 if n_jobs == 0 else n_jobs - if n_jobs < 0: - n_jobs = max(cpu_count() + n_jobs + 1, 1) - Tree = KDTree if algo == "kd_tree" else BallTree tree = Tree(X, metric=metric, leaf_size=leaf_size, **metric_params) @@ -421,8 +402,8 @@ def hdbscan( `prims` approach is used. If the `X` passed during `fit` is sparse or `metric` is not a valid - metric for neither `KDTree` nor `BallTree` and is something other than - "cosine" and "arccos", then it resolves to use the `generic` algorithm. + metric for neither `KDTree` nor `BallTree` then it resolves to use + the `generic` algorithm. Available algorithms: - `'best'` @@ -506,11 +487,6 @@ def hdbscan( memory = Memory(location=memory, verbose=0) - size = X.shape[0] - min_samples = min(size - 1, min_samples) - if min_samples == 0: - min_samples = 1 - metric_params = metric_params or {} func = None kwargs = dict( @@ -554,11 +530,6 @@ def hdbscan( func = _hdbscan_boruvka kwargs.pop("alpha", None) kwargs["algo"] = "ball_tree" - else: - raise TypeError( - f"Unknown algorithm type {algorithm} specified. Please select a" - " supported algorithm." - ) else: if issparse(X) or metric not in FAST_METRICS: # We can't do much with sparse matrices ... @@ -650,8 +621,8 @@ class HDBSCAN(ClusterMixin, BaseEstimator): `prims` approach is used. If the `X` passed during `fit` is sparse or `metric` is not a valid - metric for neither `KDTree` nor `BallTree` and is something other than - "cosine" and "arccos", then it resolves to use the `generic` algorithm. + metric for neither `KDTree` nor `BallTree` then it resolves to use + the `generic` algorithm. 
Available algorithms: - `'best'` @@ -815,9 +786,7 @@ def fit(self, X, y=None): self._raw_data = X self._all_finite = ( - np.all(np.isfinite(X.tocoo().data)) - if issparse(X) - else np.all(np.isfinite(X)) + np.all(np.isfinite(X.data)) if issparse(X) else np.all(np.isfinite(X)) ) if not self._all_finite: diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index 6f91ecada6859..d672908e359f8 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -224,17 +224,21 @@ def test_hdbscan_best_balltree_metric(): def test_hdbscan_no_clusters(): - labels = hdbscan(X, min_cluster_size=len(X) + 1)[0] + labels = hdbscan(X, min_cluster_size=len(X) - 1)[0] n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == 0 - labels = HDBSCAN(min_cluster_size=len(X) + 1).fit(X).labels_ + labels = HDBSCAN(min_cluster_size=len(X) - 1).fit(X).labels_ n_clusters_2 = len(set(labels)) - int(-1 in labels) assert n_clusters_2 == 0 def test_hdbscan_min_cluster_size(): - for min_cluster_size in range(2, len(X) + 1, 1): + """ + Test that the smallest non-noise cluster has at least `min_cluster_size` + many points + """ + for min_cluster_size in range(2, len(X), 1): labels = hdbscan(X, min_cluster_size=min_cluster_size)[0] true_labels = [label for label in labels if label != -1] if len(true_labels) != 0: @@ -260,7 +264,7 @@ def test_hdbscan_callable_metric(): def test_hdbscan_input_lists(): X = [[1.0, 2.0], [3.0, 4.0]] - HDBSCAN().fit(X) + HDBSCAN(min_samples=1).fit(X) @pytest.mark.parametrize("tree", ["kdtree", "balltree"]) @@ -286,8 +290,11 @@ def test_hdbscan_boruvka_matches(tree): @pytest.mark.parametrize("strategy", ["prims", "boruvka"]) @pytest.mark.parametrize("tree", ["kd", "ball"]) def test_hdbscan_precomputed_non_generic(strategy, tree): + hdb = HDBSCAN(metric="precomputed", algorithm=f"{strategy}_{tree}tree") with pytest.raises(ValueError): hdbscan(X, metric="precomputed", algorithm=f"{strategy}_{tree}tree") + with pytest.raises(ValueError): + hdb.fit(X) def test_hdbscan_sparse(): @@ -298,6 +305,18 @@ def test_hdbscan_sparse(): n_clusters = len(set(labels)) - int(-1 in labels) assert n_clusters == 3 + sparse_X_nan = sparse_X.copy() + sparse_X_nan[0, 0] = np.nan + labels = HDBSCAN().fit(sparse_X_nan).labels_ + n_clusters = len(set(labels)) - int(-1 in labels) + assert n_clusters == 3 + + msg = "Sparse data matrices only support algorithm `generic`." 
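# --- Editor's usage sketch (not part of the test diff) ----------------------
# The assertions above pin down that sparse inputs must go through the
# `generic` code path. As a hedged illustration of the supported route, a
# sparse *feature* matrix can be handed straight to the estimator, mirroring
# the `test_hdbscan_sparse` test above; the demo data below is made up.
from scipy import sparse
from sklearn.cluster import HDBSCAN
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=200, centers=3, random_state=0)
labels_demo = HDBSCAN().fit(sparse.csr_matrix(X_demo)).labels_
# -----------------------------------------------------------------------------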
+ with pytest.raises(ValueError, match=msg): + HDBSCAN(metric="euclidean", algorithm="boruvka_balltree").fit(sparse_X) + with pytest.raises(ValueError, match=msg): + hdbscan(sparse_X, metric="euclidean", algorithm="boruvka_balltree") + def test_hdbscan_caching(tmp_path): @@ -372,3 +391,36 @@ def test_hdbscan_better_than_dbscan(): hdb = HDBSCAN().fit(X) n_clusters = len(set(hdb.labels_)) - int(-1 in hdb.labels_) assert n_clusters == 4 + + +def test_hdbscan_unfit_centers_errors(): + hdb = HDBSCAN() + msg = "Model has not been fit to data" + with pytest.raises(AttributeError, match=msg): + hdb.weighted_cluster_centroid(0) + with pytest.raises(AttributeError, match=msg): + hdb.weighted_cluster_medoid(0) + + +def test_hdbscan_precomputed_array_like(): + X = np.array([[1, np.inf], [np.inf, 1]]) + hdbscan(X, metric="precomputed") + + +@pytest.mark.parametrize("algo", ["boruvka_kdtree", "boruvka_balltree"]) +def test_hdbscan_min_samples_less_than_total(algo): + X = np.array([[1, 2], [2, 1]]) + + msg = "Expected min_samples" + with pytest.raises(ValueError, match=msg): + hdbscan(X, algorithm=algo, min_samples=3) + with pytest.raises(ValueError, match=msg): + HDBSCAN(algorithm=algo, min_samples=3).fit(X) + + +def test_hdbscan_sparse_distances_too_few_nonzero(): + X = sparse.csr_matrix(np.zeros((10, 10))) + + msg = "There exists points with less than" + with pytest.raises(ValueError, match=msg): + HDBSCAN(metric="precomputed").fit(X) From 9abc237f96065e9cedaf87a817976dc052a65944 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 30 Jun 2022 18:16:24 -0400 Subject: [PATCH 084/160] Added initial changelog entry --- doc/whats_new/v1.2.rst | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 6b36b64a98778..dac32995dc83d 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -63,6 +63,13 @@ Changelog :mod:`sklearn.cluster` ...................... +- |MajorFeature| Added :class:`cluster.HDBSCAN`, a modern hierarchical density-based + clustering algorithm. Similarly to :class:`cluster.OPTICS`, it can be seen as a + generalization of :class:`DBSCAN` by allowing for hierarchical instead of flat + clustering, however it varies in its approach from :class:`cluster.OPTICS`. This + algorithm is very robust to its hyperparameters and can be used on a wide + variety of data without much, if any, tuning. + - |Enhancement| The `predict` and `fit_predict` methods of :class:`cluster.OPTICS` now accept sparse data type for input data. :pr:`14736` by :user:`Hunt Zhan `, :pr:`20802` by :user:`Brandon Pokorny `, @@ -153,7 +160,7 @@ Changelog negative likelihood ratios derived from the confusion matrix of a binary classification problem. :pr:`22518` by :user:`Arturo Amor `. - + - |Fix| :func:`metrics.ndcg_score` will now trigger a warning when the `y_true` value contains a negative value. Users may still use negative values, but the result may not be between 0 and 1. Starting in v1.4, passing in negative From 2d7c4c93bb3f9b8bd0334a45fe46d53052d3c4a7 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 30 Jun 2022 18:23:47 -0400 Subject: [PATCH 085/160] Added pr details in changelog entry --- doc/whats_new/v1.2.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index dac32995dc83d..b6e3e5705f73c 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -69,6 +69,7 @@ Changelog clustering, however it varies in its approach from :class:`cluster.OPTICS`. 
This algorithm is very robust to its hyperparameters and can be used on a wide variety of data without much, if any, tuning. + :pr:`22616` by :user:`Meekail Zain ` - |Enhancement| The `predict` and `fit_predict` methods of :class:`cluster.OPTICS` now accept sparse data type for input data. :pr:`14736` by :user:`Hunt Zhan `, From 5e0bc41dab4d2104ddd4dd66bbb9d83b228520d3 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Mon, 4 Jul 2022 11:58:31 -0400 Subject: [PATCH 086/160] Trimmed extra function and modified comments --- sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx | 11 ----------- sklearn/cluster/_hdbscan/hdbscan_.py | 7 +++---- 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx b/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx index 9e8b88393cb99..61fa43e2da60d 100644 --- a/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx @@ -239,14 +239,3 @@ cpdef np.ndarray[np.double_t, ndim=2] label(np.ndarray[np.double_t, ndim=2] L): U.union(aa, bb) return result_arr - - -cpdef np.ndarray[np.double_t, ndim=2] single_linkage(distance_matrix): - - cdef np.ndarray[np.double_t, ndim=2] hierarchy - cdef np.ndarray[np.double_t, ndim=2] for_labelling - - hierarchy = mst_linkage_core(distance_matrix) - for_labelling = hierarchy[np.argsort(hierarchy.T[2]), :] - - return label(for_labelling) diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index 4019f6419fb06..45355c7d298aa 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -111,15 +111,14 @@ def _hdbscan_generic( ): if metric == "precomputed": # Treating this case explicitly, instead of letting - # sklearn.metrics.pairwise_distances handle it, - # enables the usage of numpy.inf in the distance - # matrix to indicate missing distance information. + # sklearn.metrics.pairwise_distances handle it, + # enables the usage of numpy.inf in the distance + # matrix to indicate missing distance information. distance_matrix = X else: distance_matrix = pairwise_distances(X, metric=metric, **metric_params) if issparse(distance_matrix): - # raise TypeError('Sparse distance matrices not yet supported') return _hdbscan_sparse_distance_matrix( distance_matrix, min_samples, From 0847be574195f12c2323bb1823f4853622792863 Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Tue, 26 Jul 2022 09:30:30 -0400 Subject: [PATCH 087/160] Apply suggestions from code review Co-authored-by: Julien Jerphanion --- doc/modules/clustering.rst | 15 ++++++++------- doc/whats_new/v1.2.rst | 9 +++++++-- sklearn/cluster/_hdbscan/hdbscan_.py | 1 - 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 39dbca4da82c6..4f19df57db311 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -959,16 +959,17 @@ HDBSCAN ======= The :class:`HDBSCAN` algorithm can be seen as an extension of :class:`DBSCAN` -and :class:`OPTICS`. Specifically, DBSCAN asserts that the clustering criterion -(i.e. density requirement) is *globally homogeneous*. In other words, DBSCAN -may struggle to successfully capture clusters with different densities. -HDBSCAN alleviates this assumption and explores all possible density scales by -building an alternative representation of the clustering problem. +and :class:`OPTICS`. Specifically, :class:`DBSCAN` asserts that the clustering +criterion (i.e. 
density requirement) is *globally homogeneous*. +In other words, :class:`DBSCAN` may struggle to successfully capture clusters +with different densities. +:class:`HDBSCAN` alleviates this assumption and explores all possible density +scales by building an alternative representation of the clustering problem. .. note:: - This implementation is adapted from the work done in - https://github.com/scikit-learn-contrib/hdbscan + This implementation is adapted from the original implementation of HDBSCAN, + `scikit-learn-contrib/hdbscan `_. Mutual Reachability Graph ------------------------- diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index b6e3e5705f73c..8ebd17ed24352 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -67,8 +67,13 @@ Changelog clustering algorithm. Similarly to :class:`cluster.OPTICS`, it can be seen as a generalization of :class:`DBSCAN` by allowing for hierarchical instead of flat clustering, however it varies in its approach from :class:`cluster.OPTICS`. This - algorithm is very robust to its hyperparameters and can be used on a wide - variety of data without much, if any, tuning. + algorithm is very robust with respect to its hyperparameters' values and can + be used on a wide variety of data without much, if any, tuning. + + This implementation is an adaptation from the original implementation of HDBSCAN in + `scikit-learn-contrib/hdbscan `_, + by :user:`Leland McInnes ` et al. + :pr:`22616` by :user:`Meekail Zain ` - |Enhancement| The `predict` and `fit_predict` methods of :class:`cluster.OPTICS` now diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index 45355c7d298aa..03f4d159c69eb 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -564,7 +564,6 @@ def hdbscan( ) -# Inherits from sklearn class HDBSCAN(ClusterMixin, BaseEstimator): """Perform HDBSCAN clustering from vector array or distance matrix. From 1c9a76ab3924dd8879952d42bbd5c549456e0d11 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 26 Jul 2022 19:23:59 -0400 Subject: [PATCH 088/160] Applied isort (with black on top) --- doc/modules/clustering.rst | 2 +- sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx | 5 ++-- .../_hdbscan/_hdbscan_reachability.pyx | 9 ++++-- sklearn/cluster/_hdbscan/_hdbscan_tree.pyx | 3 ++ sklearn/cluster/_hdbscan/hdbscan_.py | 30 ++++++++----------- .../cluster/_hdbscan/tests/test_hdbscan.py | 21 +++++++------ 6 files changed, 35 insertions(+), 35 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 4f19df57db311..6ef029a1c17a9 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1052,7 +1052,7 @@ simplify the hyperparameter space. Based on Hierarchical Density Estimates. In: Pei, J., Tseng, V.S., Cao, L., Motoda, H., Xu, G. (eds) Advances in Knowledge Discovery and Data Mining. PAKDD 2013. Lecture Notes in Computer Science(), vol 7819. Springer, Berlin, - Heidelberg. https://doi.org/10.1007/978-3-642-37456-2_14 + Heidelberg. :doi:`Accelerated Hierarchical Density Based Clustering <10.1109/ICDMW.2017.12>` .. _optics: diff --git a/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx b/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx index 4d167791b7a61..18a582a4594f1 100644 --- a/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx +++ b/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx @@ -51,17 +51,18 @@ # is a simpler version of the structure. 
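To make the density-homogeneity point above concrete, here is a hedged sketch of
HDBSCAN on blobs with deliberately different spreads; the centres, spreads and
parameter values are invented for illustration. A single global `eps` in DBSCAN may
struggle on such data, whereas HDBSCAN typically labels each blob separately:

    from sklearn.cluster import HDBSCAN
    from sklearn.datasets import make_blobs

    # Two tight clusters next to two much looser ones.
    X_demo, _ = make_blobs(
        n_samples=750,
        centers=[[-1.0, -1.0], [-1.0, 1.0], [3.0, 3.0], [3.0, -3.0]],
        cluster_std=[0.2, 0.35, 1.3, 1.3],
        random_state=0,
    )
    labels = HDBSCAN(min_cluster_size=10).fit(X_demo).labels_
    n_found = len(set(labels)) - (1 if -1 in labels else 0)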
import numpy as np -cimport numpy as np +cimport numpy as np from libc.float cimport DBL_MAX from libc.math cimport fabs, pow -from sklearn.neighbors import KDTree, BallTree +from sklearn.neighbors import BallTree, KDTree from sklearn.metrics._dist_metrics cimport DistanceMetric from joblib import Parallel, delayed + cdef np.double_t INF = np.inf diff --git a/sklearn/cluster/_hdbscan/_hdbscan_reachability.pyx b/sklearn/cluster/_hdbscan/_hdbscan_reachability.pyx index 4fa010f365f0a..0d6c8618c53be 100644 --- a/sklearn/cluster/_hdbscan/_hdbscan_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_hdbscan_reachability.pyx @@ -3,13 +3,16 @@ # License: 3-clause BSD import numpy as np + cimport numpy as np -from scipy.spatial.distance import pdist, squareform -from scipy.sparse import lil_matrix as sparse_matrix -from sklearn.neighbors import KDTree, BallTree import gc +from scipy.sparse import lil_matrix as sparse_matrix +from scipy.spatial.distance import pdist, squareform + +from sklearn.neighbors import BallTree, KDTree + def mutual_reachability(distance_matrix, min_points=5, alpha=1.0): """Compute the weighted adjacency matrix of the mutual reachability diff --git a/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx b/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx index 2c1bc3991c785..a7c3541cc9dcc 100644 --- a/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx +++ b/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx @@ -3,9 +3,12 @@ # License: 3-clause BSD import numpy as np + cimport numpy as np + import cython + cdef np.double_t INFTY = np.inf diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index 03f4d159c69eb..dfdae2a924fc8 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -8,36 +8,30 @@ # # License: BSD 3 clause -from numbers import Real, Integral -import numpy as np +from numbers import Integral, Real from pathlib import Path +from warnings import warn + +import numpy as np +from joblib import Memory +from scipy.sparse import csgraph, issparse from sklearn.base import BaseEstimator, ClusterMixin from sklearn.metrics import pairwise_distances -from scipy.sparse import issparse -from sklearn.neighbors import KDTree, BallTree -from joblib import Memory -from warnings import warn +from sklearn.metrics._dist_metrics import DistanceMetric +from sklearn.neighbors import BallTree, KDTree, NearestNeighbors from sklearn.utils import check_array, gen_batches, get_chunk_n_rows from sklearn.utils._param_validation import Interval, StrOptions, validate_params -from sklearn.neighbors import NearestNeighbors -from scipy.sparse import csgraph -from ._hdbscan_linkage import ( - mst_linkage_core, - mst_linkage_core_vector, - label, -) +from ._hdbscan_boruvka import BoruvkaAlgorithm +from ._hdbscan_linkage import label, mst_linkage_core, mst_linkage_core_vector +from ._hdbscan_reachability import mutual_reachability, sparse_mutual_reachability from ._hdbscan_tree import ( - condense_tree, compute_stability, + condense_tree, get_clusters, labelling_at_cut, ) -from ._hdbscan_reachability import mutual_reachability, sparse_mutual_reachability - -from ._hdbscan_boruvka import BoruvkaAlgorithm -from sklearn.metrics._dist_metrics import DistanceMetric FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics _PARAM_CONSTRAINTS = { diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index d672908e359f8..90fb43d860c93 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ 
b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -3,21 +3,20 @@ Based on the DBSCAN test code """ import numpy as np +import pytest +from scipy import sparse, stats from scipy.spatial import distance -from scipy import sparse -from scipy import stats -from sklearn.utils._testing import assert_array_almost_equal -from sklearn.cluster import HDBSCAN, hdbscan -from sklearn.metrics import fowlkes_mallows_score -from sklearn.datasets import make_blobs -from sklearn.utils import shuffle -from sklearn.preprocessing import StandardScaler from scipy.stats import mode -from sklearn.metrics.pairwise import _VALID_METRICS -from sklearn.neighbors import KDTree, BallTree -import pytest from sklearn import datasets +from sklearn.cluster import HDBSCAN, hdbscan +from sklearn.datasets import make_blobs +from sklearn.metrics import fowlkes_mallows_score +from sklearn.metrics.pairwise import _VALID_METRICS +from sklearn.neighbors import BallTree, KDTree +from sklearn.preprocessing import StandardScaler +from sklearn.utils import shuffle +from sklearn.utils._testing import assert_array_almost_equal n_clusters = 3 # X = generate_clustered_data(n_clusters=n_clusters, n_samples_per_cluster=50) From c01a6099639094d44a304d15f7392765c36b7e43 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Fri, 26 Aug 2022 12:19:33 -0400 Subject: [PATCH 089/160] Stylistic improvements --- sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx | 161 +++++++++--------- .../_hdbscan/_hdbscan_reachability.pyx | 10 +- sklearn/cluster/_hdbscan/hdbscan_.py | 46 +++-- .../cluster/_hdbscan/tests/test_hdbscan.py | 12 +- 4 files changed, 117 insertions(+), 112 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx b/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx index 61fa43e2da60d..7dd12ec0d2873 100644 --- a/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx @@ -3,7 +3,7 @@ # License: 3-clause BSD import numpy as np -cimport numpy as np +cimport numpy as cnp import cython from libc.float cimport DBL_MAX @@ -12,23 +12,24 @@ from libc.stdio cimport printf from sklearn.metrics._dist_metrics cimport DistanceMetric -cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core( - np.ndarray[np.double_t, - ndim=2] distance_matrix): +cpdef cnp.ndarray[cnp.double_t, ndim=2] mst_linkage_core( + cnp.ndarray[cnp.double_t, ndim=2] distance_matrix +): - cdef np.ndarray[np.intp_t, ndim=1] node_labels - cdef np.ndarray[np.intp_t, ndim=1] current_labels - cdef np.ndarray[np.double_t, ndim=1] current_distances - cdef np.ndarray[np.double_t, ndim=1] left - cdef np.ndarray[np.double_t, ndim=1] right - cdef np.ndarray[np.double_t, ndim=2] result + cdef: + cnp.ndarray[cnp.intp_t, ndim=1] node_labels + cnp.ndarray[cnp.intp_t, ndim=1] current_labels + cnp.ndarray[cnp.double_t, ndim=1] current_distances + cnp.ndarray[cnp.double_t, ndim=1] left + cnp.ndarray[cnp.double_t, ndim=1] right + cnp.ndarray[cnp.double_t, ndim=2] result - cdef np.ndarray label_filter + cnp.ndarray label_filter - cdef np.intp_t current_node - cdef np.intp_t new_node_index - cdef np.intp_t new_node - cdef np.intp_t i + cnp.intp_t current_node + cnp.intp_t new_node_index + cnp.intp_t new_node + cnp.intp_t i result = np.zeros((distance_matrix.shape[0] - 1, 3)) node_labels = np.arange(distance_matrix.shape[0], dtype=np.intp) @@ -52,49 +53,51 @@ cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core( return result -cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core_vector( - np.ndarray[np.double_t, ndim=2, mode='c'] raw_data, - 
np.ndarray[np.double_t, ndim=1, mode='c'] core_distances, - DistanceMetric dist_metric, - np.double_t alpha=1.0): - - cdef np.ndarray[np.double_t, ndim=1] current_distances_arr - cdef np.ndarray[np.double_t, ndim=1] current_sources_arr - cdef np.ndarray[np.int8_t, ndim=1] in_tree_arr - cdef np.ndarray[np.double_t, ndim=2] result_arr - - cdef np.double_t * current_distances - cdef np.double_t * current_sources - cdef np.double_t * current_core_distances - cdef np.double_t * raw_data_ptr - cdef np.int8_t * in_tree - cdef np.double_t[:, ::1] raw_data_view - cdef np.double_t[:, ::1] result - - cdef np.ndarray label_filter - - cdef np.intp_t current_node - cdef np.intp_t source_node - cdef np.intp_t right_node - cdef np.intp_t left_node - cdef np.intp_t new_node - cdef np.intp_t i - cdef np.intp_t j - cdef np.intp_t dim - cdef np.intp_t num_features - - cdef double current_node_core_distance - cdef double right_value - cdef double left_value - cdef double core_value - cdef double new_distance +cpdef cnp.ndarray[cnp.double_t, ndim=2] mst_linkage_core_vector( + cnp.ndarray[cnp.double_t, ndim=2, mode='c'] raw_data, + cnp.ndarray[cnp.double_t, ndim=1, mode='c'] core_distances, + DistanceMetric dist_metric, + cnp.double_t alpha=1.0 +): + + cdef: + cnp.ndarray[cnp.double_t, ndim=1] current_distances_arr + cnp.ndarray[cnp.double_t, ndim=1] current_sources_arr + cnp.ndarray[cnp.int8_t, ndim=1] in_tree_arr + cnp.ndarray[cnp.double_t, ndim=2] result_arr + + cnp.double_t * current_distances + cnp.double_t * current_sources + cnp.double_t * current_core_distances + cnp.double_t * raw_data_ptr + cnp.int8_t * in_tree + cnp.double_t[:, ::1] raw_data_view + cnp.double_t[:, ::1] result + + cnp.ndarray label_filter + + cnp.intp_t current_node + cnp.intp_t source_node + cnp.intp_t right_node + cnp.intp_t left_node + cnp.intp_t new_node + cnp.intp_t i + cnp.intp_t j + cnp.intp_t dim + cnp.intp_t num_features + + double current_node_core_distance + double right_value + double left_value + double core_value + double new_distance dim = raw_data.shape[0] num_features = raw_data.shape[1] - raw_data_view = ( ( - raw_data.data)) - raw_data_ptr = ( &raw_data_view[0, 0]) + raw_data_view = ( ( + raw_data.data)) + raw_data_ptr = ( &raw_data_view[0, 0]) result_arr = np.zeros((dim - 1, 3)) in_tree_arr = np.zeros(dim, dtype=np.int8) @@ -102,11 +105,11 @@ cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core_vector( current_distances_arr = np.infty * np.ones(dim) current_sources_arr = np.ones(dim) - result = ( ( result_arr.data)) - in_tree = ( in_tree_arr.data) - current_distances = ( current_distances_arr.data) - current_sources = ( current_sources_arr.data) - current_core_distances = ( core_distances.data) + result = ( ( result_arr.data)) + in_tree = ( in_tree_arr.data) + current_distances = ( current_distances_arr.data) + current_sources = ( current_sources_arr.data) + current_core_distances = ( core_distances.data) for i in range(1, dim): @@ -174,21 +177,22 @@ cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core_vector( cdef class UnionFind (object): - cdef np.ndarray parent_arr - cdef np.ndarray size_arr - cdef np.intp_t next_label - cdef np.intp_t *parent - cdef np.intp_t *size + cdef: + cnp.ndarray parent_arr + cnp.ndarray size_arr + cnp.intp_t next_label + cnp.intp_t *parent + cnp.intp_t *size def __init__(self, N): self.parent_arr = -1 * np.ones(2 * N - 1, dtype=np.intp, order='C') self.next_label = N self.size_arr = np.hstack((np.ones(N, dtype=np.intp), np.zeros(N-1, dtype=np.intp))) - self.parent = ( 
self.parent_arr.data) - self.size = ( self.size_arr.data) + self.parent = ( self.parent_arr.data) + self.size = ( self.size_arr.data) - cdef void union(self, np.intp_t m, np.intp_t n): + cdef void union(self, cnp.intp_t m, cnp.intp_t n): self.size[self.next_label] = self.size[m] + self.size[n] self.parent[m] = self.next_label self.parent[n] = self.next_label @@ -198,8 +202,8 @@ cdef class UnionFind (object): return @cython.wraparound(True) - cdef np.intp_t fast_find(self, np.intp_t n): - cdef np.intp_t p + cdef cnp.intp_t fast_find(self, cnp.intp_t n): + cdef cnp.intp_t p p = n while self.parent_arr[n] != -1: n = self.parent_arr[n] @@ -209,24 +213,25 @@ cdef class UnionFind (object): return n @cython.wraparound(True) -cpdef np.ndarray[np.double_t, ndim=2] label(np.ndarray[np.double_t, ndim=2] L): +cpdef cnp.ndarray[cnp.double_t, ndim=2] label(cnp.double_t[:,:] L): - cdef np.ndarray[np.double_t, ndim=2] result_arr - cdef np.double_t[:, ::1] result + cdef: + cnp.ndarray[cnp.double_t, ndim=2] result_arr + cnp.double_t[:, ::1] result - cdef np.intp_t N, a, aa, b, bb, index - cdef np.double_t delta + cnp.intp_t N, a, aa, b, bb, index + cnp.double_t delta result_arr = np.zeros((L.shape[0], L.shape[1] + 1)) - result = ( ( - result_arr.data)) + result = ( ( + result_arr.data)) N = L.shape[0] + 1 U = UnionFind(N) for index in range(L.shape[0]): - a = L[index, 0] - b = L[index, 1] + a = L[index, 0] + b = L[index, 1] delta = L[index, 2] aa, bb = U.fast_find(a), U.fast_find(b) diff --git a/sklearn/cluster/_hdbscan/_hdbscan_reachability.pyx b/sklearn/cluster/_hdbscan/_hdbscan_reachability.pyx index 0d6c8618c53be..c643ca80fad7d 100644 --- a/sklearn/cluster/_hdbscan/_hdbscan_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_hdbscan_reachability.pyx @@ -14,7 +14,7 @@ from scipy.spatial.distance import pdist, squareform from sklearn.neighbors import BallTree, KDTree -def mutual_reachability(distance_matrix, min_points=5, alpha=1.0): +def mutual_reachability(distance_matrix, min_points=5, alpha=None): """Compute the weighted adjacency matrix of the mutual reachability graph of a distance matrix. @@ -23,10 +23,14 @@ def mutual_reachability(distance_matrix, min_points=5, alpha=1.0): distance_matrix : ndarray, shape (n_samples, n_samples) Array of distances between samples. - min_points : int, optional (default=5) + min_points : int, default=5 The number of points in a neighbourhood for a point to be considered a core point. + alpha : float, default=None + A distance scaling parameter as used in robust single linkage. This + divides the distances when calculating mutual reachability. + Returns ------- mututal_reachability: ndarray, shape (n_samples, n_samples) @@ -49,7 +53,7 @@ def mutual_reachability(distance_matrix, min_points=5, alpha=1.0): core_distances = np.sort(distance_matrix, axis=0)[min_points] - if alpha != 1.0: + if alpha is not None: distance_matrix = distance_matrix / alpha stage1 = np.where(core_distances > distance_matrix, diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index dfdae2a924fc8..2af24fabb9aad 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -45,8 +45,7 @@ StrOptions( { "auto", - "best", - "generic", + "brute", "prims_kdtree", "prims_balltree", "boruvka_kdtree", @@ -96,7 +95,7 @@ def _process_mst(min_spanning_tree): return label(min_spanning_tree) -def _hdbscan_generic( +def _hdbscan_brute( X, min_samples=5, alpha=1.0, @@ -374,10 +373,10 @@ def hdbscan( feature array. 
- If metric is a string or callable, it must be one of - the options allowed by `metrics.pairwise.pairwise_distances` for its - metric parameter. + the options allowed by :func:`metrics.pairwise.pairwise_distances` + for its metric parameter. - - If metric is "precomputed", X is assumed to be a distance matrix and + - If metric is "precomputed", `X` is assumed to be a distance matrix and must be square. leaf_size : int, default=40 @@ -390,17 +389,15 @@ def hdbscan( Exactly which algorithm to use; hdbscan has variants specialised for different characteristics of the data. By default this is set to `'auto'` which attempts to use a `KDTree` method if possible, - otherwise it uses a `BallTree` method. If the `X` passed during `fit` - has `n_features>60` then a `boruvka` approach is used, otherwise a - `prims` approach is used. + otherwise it uses a `BallTree` method. If `X` has `n_features>60` + then a `boruvka` approach is used, otherwise a `prims` approach is + used. - If the `X` passed during `fit` is sparse or `metric` is not a valid - metric for neither `KDTree` nor `BallTree` then it resolves to use - the `generic` algorithm. + If `X` is sparse or `metric` is invalid for both `KDTree` and + `BallTree`, then it resolves to use the `brute` algorithm. Available algorithms: - - `'best'` - - `'generic'` + - `'brute'` - `'prims_kdtree'` - `'prims_balltree'` - `'boruvka_kdtree'` @@ -504,11 +501,11 @@ def hdbscan( ) if algorithm != "auto": - if metric != "precomputed" and issparse(X) and algorithm != "generic": - raise ValueError("Sparse data matrices only support algorithm `generic`.") + if metric != "precomputed" and issparse(X) and algorithm != "brute": + raise ValueError("Sparse data matrices only support algorithm `brute`.") - if algorithm == "generic": - func = _hdbscan_generic + if algorithm == "brute": + func = _hdbscan_brute for key in ("algo", "leaf_size", "n_jobs"): kwargs.pop(key, None) elif algorithm == "prims_kdtree": @@ -526,7 +523,7 @@ def hdbscan( else: if issparse(X) or metric not in FAST_METRICS: # We can't do much with sparse matrices ... - func = _hdbscan_generic + func = _hdbscan_brute for key in ("algo", "leaf_size", "n_jobs"): kwargs.pop(key, None) elif metric in KDTree.valid_metrics: @@ -612,13 +609,12 @@ class HDBSCAN(ClusterMixin, BaseEstimator): has `n_features>60` then a `boruvka` approach is used, otherwise a `prims` approach is used. - If the `X` passed during `fit` is sparse or `metric` is not a valid - metric for neither `KDTree` nor `BallTree` then it resolves to use - the `generic` algorithm. + If the `X` passed during `fit` is sparse or `metric` is invalid for + both `KDTree` and `BallTree`, then it resolves to use the `brute` + algorithm. Available algorithms: - - `'best'` - - `'generic'` + - `'brute'` - `'prims_kdtree'` - `'prims_balltree'` - `'boruvka_kdtree'` @@ -628,7 +624,7 @@ class HDBSCAN(ClusterMixin, BaseEstimator): Leaf size for trees responsible for fast nearest neighbour queries. A large dataset size and small leaf_size may induce excessive memory usage. If you are running out of memory consider increasing the - `leaf_size` parameter. Ignored for `algorithm=generic`. + `leaf_size` parameter. Ignored for `algorithm=brute`. memory : str, default=None Used to cache the output of the computation of the tree. 
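As a hedged usage sketch of the renamed options at this point in the series (the
parameter values below are arbitrary), the tree-based and brute-force code paths can
be selected explicitly instead of relying on `algorithm='auto'`:

    from sklearn.cluster import HDBSCAN
    from sklearn.datasets import make_blobs

    X_demo, _ = make_blobs(n_samples=300, centers=3, random_state=42)

    # Force the Prim's/KDTree path with a smaller tree leaf size ...
    hdb = HDBSCAN(algorithm="prims_kdtree", leaf_size=20, min_cluster_size=15).fit(X_demo)

    # ... or fall back to the brute-force (pairwise distance matrix) path.
    hdb_brute = HDBSCAN(algorithm="brute").fit(X_demo)
    labels = hdb.labels_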
diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index 90fb43d860c93..0d30db9c286ab 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -131,7 +131,7 @@ def test_hdbscan_feature_vector(): "prims_balltree", "boruvka_kdtree", "boruvka_balltree", - "generic", + "brute", "auto", ], ) @@ -157,7 +157,7 @@ def test_hdbscan_algorithms(algo, metric): "minkowski": {"p": 2}, "wminkowski": {"p": 2, "w": np.ones(X.shape[1])}, } - if algo not in ("auto", "generic"): + if algo not in ("auto", "brute"): if metric not in ALGOS_TREES[algo].valid_metrics: with pytest.raises(ValueError): hdbscan( @@ -271,14 +271,14 @@ def test_hdbscan_boruvka_matches(tree): data = generate_noisy_data() - labels_prims = hdbscan(data, algorithm="generic")[0] + labels_prims = hdbscan(data, algorithm="brute")[0] labels_boruvka = hdbscan(data, algorithm=f"boruvka_{tree}")[0] num_mismatches = homogeneity(labels_prims, labels_boruvka) assert (num_mismatches / float(data.shape[0])) < 0.15 - labels_prims = HDBSCAN(algorithm="generic").fit_predict(data) + labels_prims = HDBSCAN(algorithm="brute").fit_predict(data) labels_boruvka = HDBSCAN(algorithm=f"boruvka_{tree}").fit_predict(data) num_mismatches = homogeneity(labels_prims, labels_boruvka) @@ -288,7 +288,7 @@ def test_hdbscan_boruvka_matches(tree): @pytest.mark.parametrize("strategy", ["prims", "boruvka"]) @pytest.mark.parametrize("tree", ["kd", "ball"]) -def test_hdbscan_precomputed_non_generic(strategy, tree): +def test_hdbscan_precomputed_non_brute(strategy, tree): hdb = HDBSCAN(metric="precomputed", algorithm=f"{strategy}_{tree}tree") with pytest.raises(ValueError): hdbscan(X, metric="precomputed", algorithm=f"{strategy}_{tree}tree") @@ -310,7 +310,7 @@ def test_hdbscan_sparse(): n_clusters = len(set(labels)) - int(-1 in labels) assert n_clusters == 3 - msg = "Sparse data matrices only support algorithm `generic`." + msg = "Sparse data matrices only support algorithm `brute`." with pytest.raises(ValueError, match=msg): HDBSCAN(metric="euclidean", algorithm="boruvka_balltree").fit(sparse_X) with pytest.raises(ValueError, match=msg): From b7736ef6db1650ba4c3d8d830348aacfe5589015 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Fri, 26 Aug 2022 12:28:20 -0400 Subject: [PATCH 090/160] Removed boruvka algorithm --- sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx | 887 ------------------ sklearn/cluster/_hdbscan/hdbscan_.py | 93 +- .../cluster/_hdbscan/tests/test_hdbscan.py | 46 +- 3 files changed, 20 insertions(+), 1006 deletions(-) delete mode 100644 sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx diff --git a/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx b/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx deleted file mode 100644 index 18a582a4594f1..0000000000000 --- a/sklearn/cluster/_hdbscan/_hdbscan_boruvka.pyx +++ /dev/null @@ -1,887 +0,0 @@ -# Minimum spanning tree single linkage implementation for hdbscan -# Authors: Leland McInnes -# License: 3-clause BSD - -# Code to implement a Dual Tree Boruvka Minimimum Spanning Tree computation -# The algorithm is largely tree independent, but some fine details still -# depend on the particular choice of tree. -# -# The core idea of the algorithm is to do repeated sweeps through the dataset, -# adding edges to the tree with each sweep until a full tree is formed. -# To do this, start with each node (or point) existing in it's own component. 
-# On each sweep find all the edges of minimum weight (in this instance -# of minimal mutual reachability distance) that join separate components. -# Add all these edges to the list of edges in the spanning tree, and then -# combine together all the components joined by edges. Begin the next sweep ... -# -# Eventually we end up with only one component, and all edges in we added -# form the minimum spanning tree. The key insight is that each sweep is -# essentially akin to a nearest neighbor search (with the caveat about being -# in separate components), and so can be performed very efficiently using -# a space tree such as a kdtree or ball tree. By using a dual tree formalism -# with a query tree and reference tree we can prune when all points im the -# query node are in the same component, as are all the points of the reference -# node. This allows for rapid pruning in the dual tree traversal in later -# stages. Importantly, we can construct the full tree in O(log N) sweeps -# and since each sweep has complexity equal to that of an all points -# nearest neighbor query within the tree structure we are using we end -# up with sub-quadratic complexity at worst. -# -# This code is based on the papers: -# -# Fast Euclidean Minimum Spanning Tree: Algorithm, analysis, and applications -# William B. March, Parikshit Ram, Alexander Gray -# Conference: Proceedings of the 16th ACM SIGKDD International Conference on -# Knowledge Discovery and Data Mining -# 2010 -# -# Tree-Independent Dual-Tree Algorithms -# Ryan R. Curtin, William B. March, Parikshit Ram, David V. Anderson, -# Alexander G. Gray, Charles L. Isbell Jr -# 2013, arXiv 1304.4327 -# -# As per the sklearn BallTree and KDTree implementations we make use of -# the rdist for KDTree, which is a faster-to-compute notion of distance -# (for example in the euclidean case it is the distance squared). -# -# To combine together components in between sweeps we make use of -# a union find data structure. This is a separate implementation -# from that used in the labelling of the single linkage tree as -# we can perform more specific optimizations here for what -# is a simpler version of the structure. - -import numpy as np - -cimport numpy as np -from libc.float cimport DBL_MAX -from libc.math cimport fabs, pow - -from sklearn.neighbors import BallTree, KDTree - -from sklearn.metrics._dist_metrics cimport DistanceMetric - -from joblib import Parallel, delayed - - -cdef np.double_t INF = np.inf - - -# Define the NodeData struct used in sklearn trees for faster -# access to the node data internals in Cython. 
-cdef struct NodeData_t: - np.intp_t idx_start - np.intp_t idx_end - np.intp_t is_leaf - np.double_t radius - - -# Define a function giving the minimum distance between two -# nodes of a ball tree -cdef inline np.double_t balltree_min_dist_dual( - np.double_t radius1, - np.double_t radius2, - np.intp_t node1, - np.intp_t node2, - np.double_t[:, ::1] centroid_dist) nogil except -1: - - cdef np.double_t dist_pt = centroid_dist[node1, node2] - return max(0, (dist_pt - radius1 - radius2)) - - -# Define a function giving the minimum distance between two -# nodes of a kd-tree -cdef inline np.double_t kdtree_min_dist_dual( - DistanceMetric metric, - np.intp_t node1, - np.intp_t node2, - np.double_t[:, :, ::1] node_bounds, - np.intp_t num_features) except -1: - - cdef np.double_t d, d1, d2, rdist = 0.0 - cdef np.double_t zero = 0.0 - cdef np.intp_t j - - if metric.p == INF: - for j in range(num_features): - d1 = (node_bounds[0, node1, j] - - node_bounds[1, node2, j]) - d2 = (node_bounds[0, node2, j] - - node_bounds[1, node1, j]) - d = (d1 + fabs(d1)) + (d2 + fabs(d2)) - - rdist = max(rdist, 0.5 * d) - else: - # here we'll use the fact that x + abs(x) = 2 * max(x, 0) - for j in range(num_features): - d1 = (node_bounds[0, node1, j] - - node_bounds[1, node2, j]) - d2 = (node_bounds[0, node2, j] - - node_bounds[1, node1, j]) - d = (d1 + fabs(d1)) + (d2 + fabs(d2)) - - rdist += pow(0.5 * d, metric.p) - - return metric._rdist_to_dist(rdist) - - -# As above, but this time we use the rdist as per the kdtree -# implementation. This allows us to release the GIL over -# larger sections of code -cdef inline np.double_t kdtree_min_rdist_dual( - DistanceMetric metric, - np.intp_t node1, - np.intp_t node2, - np.double_t[:, :, ::1] node_bounds, - np.intp_t num_features) nogil except -1: - - cdef np.double_t d, d1, d2, rdist = 0.0 - cdef np.double_t zero = 0.0 - cdef np.intp_t j - - if metric.p == INF: - for j in range(num_features): - d1 = (node_bounds[0, node1, j] - - node_bounds[1, node2, j]) - d2 = (node_bounds[0, node2, j] - - node_bounds[1, node1, j]) - d = (d1 + fabs(d1)) + (d2 + fabs(d2)) - - rdist = max(rdist, 0.5 * d) - else: - # here we'll use the fact that x + abs(x) = 2 * max(x, 0) - for j in range(num_features): - d1 = (node_bounds[0, node1, j] - - node_bounds[1, node2, j]) - d2 = (node_bounds[0, node2, j] - - node_bounds[1, node1, j]) - d = (d1 + fabs(d1)) + (d2 + fabs(d2)) - - rdist += pow(0.5 * d, metric.p) - - return rdist - - -cdef class BoruvkaUnionFind(object): - """Efficient union find implementation. - - Parameters - ---------- - - size : int - The total size of the set of objects to - track via the union find structure. - - Attributes - ---------- - - is_component : array of bool; shape (size, 1) - Array specifying whether each element of the - set is the root node, or identifier for - a component. 
- """ - - cdef np.ndarray _parent_arr - cdef np.intp_t[::1] _parent - cdef np.ndarray _rank_arr - cdef np.uint8_t[::1] _rank - cdef np.ndarray is_component - - def __init__(self, size): - self._parent_arr = np.arange(size, dtype=np.intp) - self._parent = ( ( - self._parent_arr.data)) - self._rank_arr = np.zeros(size, dtype=np.uint8) - self._rank = ( ( - self._rank_arr.data)) - self.is_component = np.ones(size, dtype=bool) - - cdef int union_(self, np.intp_t x, np.intp_t y) except -1: - """Union together elements x and y""" - cdef np.intp_t x_root = self.find(x) - cdef np.intp_t y_root = self.find(y) - - if x_root == y_root: - return 0 - - if self._rank[x_root] < self._rank[y_root]: - self._parent[x_root] = y_root - self.is_component[x_root] = False - elif self._rank[x_root] > self._rank[y_root]: - self._parent[y_root] = x_root - self.is_component[y_root] = False - else: - self._rank[x_root] += 1 - self._parent[y_root] = x_root - self.is_component[y_root] = False - - return 0 - - cdef np.intp_t find(self, np.intp_t x) except -1: - """Find the root or identifier for the component that x is in""" - cdef np.intp_t x_parent - cdef np.intp_t x_grandparent - - x_parent = self._parent[x] - while True: - if x_parent == x: - return x - x_grandparent = self._parent[x_parent] - self._parent[x] = x_grandparent - x = x_parent - x_parent = x_grandparent - - cdef np.ndarray[np.intp_t, ndim=1] components(self): - """Return an array of all component roots/identifiers""" - return self.is_component.nonzero()[0] - - -def _core_dist_query(tree, data, min_samples): - return tree.query(data, k=min_samples, dualtree=True, breadth_first=True) - -cdef class BoruvkaAlgorithm(object): - """A Dual Tree Boruvka Algorithm implemented for the sklearn - KDTree space tree implementation. - - Parameters - ---------- - - tree : KDTree - The kd-tree to run Dual Tree Boruvka over. - - min_samples : int, optional (default= 5) - The min_samples parameter of HDBSCAN used to - determine core distances. - - metric : string, optional (default='euclidean') - The metric used to compute distances for the tree - - leaf_size : int, optional (default=20) - The Boruvka algorithm benefits from a smaller leaf size than - standard kd-tree nearest neighbor searches. The tree passed in - is used for a kNN search for core distance. A second tree is - constructed with a smaller leaf size for Boruvka; this is that - leaf size. - - alpha : float, optional (default=1.0) - The alpha distance scaling parameter as per Robust Single Linkage. - - approx_min_span_tree : bool, optional (default=False) - Take shortcuts and only approximate the min spanning tree. - This is considerably faster but does not return a true - minimal spanning tree. - - n_jobs : int, optional (default=4) - The number of parallel jobs used to compute core distances. - - **kwargs : - Keyword args passed to the metric. 
- """ - - cdef object tree - cdef object core_dist_tree - cdef DistanceMetric dist - cdef np.ndarray _data - cdef readonly const np.double_t[:, ::1] _raw_data - cdef np.double_t[:, :, ::1] node_bounds - cdef np.double_t alpha - cdef np.int8_t approx_min_span_tree - cdef np.intp_t n_jobs - cdef np.intp_t min_samples - cdef np.intp_t num_points - cdef np.intp_t num_nodes - cdef np.intp_t num_features - cdef bint is_KDTree - - cdef public np.double_t[::1] core_distance - cdef public np.double_t[::1] bounds - cdef public np.intp_t[::1] component_of_point - cdef public np.intp_t[::1] component_of_node - cdef public np.intp_t[::1] candidate_neighbor - cdef public np.intp_t[::1] candidate_point - cdef public np.double_t[::1] candidate_distance - cdef public np.double_t[:, ::1] centroid_distances - cdef public np.intp_t[::1] idx_array - cdef public NodeData_t[::1] node_data - cdef BoruvkaUnionFind component_union_find - cdef np.ndarray edges - cdef np.intp_t num_edges - - cdef np.intp_t *component_of_point_ptr - cdef np.intp_t *component_of_node_ptr - cdef np.double_t *candidate_distance_ptr - cdef np.intp_t *candidate_neighbor_ptr - cdef np.intp_t *candidate_point_ptr - cdef np.double_t *core_distance_ptr - cdef np.double_t *bounds_ptr - - cdef np.ndarray components - cdef np.ndarray core_distance_arr - cdef np.ndarray bounds_arr - cdef np.ndarray _centroid_distances_arr - cdef np.ndarray component_of_point_arr - cdef np.ndarray component_of_node_arr - cdef np.ndarray candidate_point_arr - cdef np.ndarray candidate_neighbor_arr - cdef np.ndarray candidate_distance_arr - - def __init__(self, tree, min_samples=5, metric='euclidean', leaf_size=20, - alpha=1.0, approx_min_span_tree=False, n_jobs=4, **kwargs): - - self.core_dist_tree = tree - self.tree = tree - self.is_KDTree = isinstance(tree, KDTree) - self._data = np.array(self.tree.data) - self._raw_data = self.tree.data - self.node_bounds = self.tree.node_bounds - self.min_samples = min_samples - self.alpha = alpha - self.approx_min_span_tree = approx_min_span_tree - self.n_jobs = n_jobs - - self.num_points = self.tree.data.shape[0] - self.num_features = self.tree.data.shape[1] - self.num_nodes = self.tree.node_data.shape[0] - - self.dist = DistanceMetric.get_metric(metric, **kwargs) - - self.components = np.arange(self.num_points) - self.bounds_arr = np.empty(self.num_nodes, np.double) - self.component_of_point_arr = np.empty(self.num_points, dtype=np.intp) - self.component_of_node_arr = np.empty(self.num_nodes, dtype=np.intp) - self.candidate_neighbor_arr = np.empty(self.num_points, dtype=np.intp) - self.candidate_point_arr = np.empty(self.num_points, dtype=np.intp) - self.candidate_distance_arr = np.empty(self.num_points, - dtype=np.double) - self.component_union_find = BoruvkaUnionFind(self.num_points) - - self.edges = np.empty((self.num_points - 1, 3)) - self.num_edges = 0 - - self.idx_array = self.tree.idx_array - self.node_data = self.tree.node_data - - self.bounds = ( ( - self.bounds_arr.data)) - self.component_of_point = ( ( - self.component_of_point_arr.data)) - self.component_of_node = ( ( - self.component_of_node_arr.data)) - self.candidate_neighbor = ( ( - self.candidate_neighbor_arr.data)) - self.candidate_point = ( ( - self.candidate_point_arr.data)) - self.candidate_distance = ( ( - self.candidate_distance_arr.data)) - - if not self.is_KDTree: - # Compute centroids for BallTree - self._centroid_distances_arr = self.dist.pairwise(self.tree.node_bounds[0]) - self.centroid_distances = ( - ( - - self._centroid_distances_arr.data)) - - 
self._initialize_components() - self._compute_bounds() - - # Set up fast pointer access to arrays - self.component_of_point_ptr = &self.component_of_point[0] - self.component_of_node_ptr = &self.component_of_node[0] - self.candidate_distance_ptr = &self.candidate_distance[0] - self.candidate_neighbor_ptr = &self.candidate_neighbor[0] - self.candidate_point_ptr = &self.candidate_point[0] - self.core_distance_ptr = &self.core_distance[0] - self.bounds_ptr = &self.bounds[0] - - cdef _compute_bounds(self): - """Initialize core distances""" - - cdef np.intp_t n - cdef np.intp_t i - cdef np.intp_t m - - cdef np.ndarray[np.double_t, ndim=2] knn_dist - cdef np.ndarray[np.intp_t, ndim=2] knn_indices - - # A shortcut: if we have a lot of points then we can split the points - # into four piles and query them in parallel. On multicore systems - # (most systems) this amounts to a 2x-3x wall clock improvement. - if self.tree.data.shape[0] > 16384 and self.n_jobs > 1: - split_cnt = self.num_points // self.n_jobs - datasets = [] - for i in range(self.n_jobs): - if i == self.n_jobs - 1: - datasets.append(np.asarray(self.tree.data[i*split_cnt:])) - else: - datasets.append(np.asarray(self.tree.data[i*split_cnt:(i+1)*split_cnt])) - - knn_data = Parallel(n_jobs=self.n_jobs, max_nbytes=None)( - delayed(_core_dist_query) - (self.core_dist_tree, points, - self.min_samples + 1) - for points in datasets) - knn_dist = np.vstack([x[0] for x in knn_data]) - knn_indices = np.vstack([x[1] for x in knn_data]) - else: - knn_dist, knn_indices = self.core_dist_tree.query( - self.tree.data, - k=self.min_samples + 1, - dualtree=True, - breadth_first=True) - - self.core_distance_arr = knn_dist[:, self.min_samples].copy() - self.core_distance = ( ( - self.core_distance_arr.data)) - - - if self.is_KDTree: - # Since we do everything in terms of rdist to free up the GIL - # we need to convert all the core distances beforehand - # to make comparison feasible. - for n in range(self.num_points): - self.core_distance[n] = self.dist._dist_to_rdist( - self.core_distance[n]) - - # Since we already computed NN distances for the min_samples closest - # points we can use this to do the first round of boruvka -- we won't - # get every point due to core_distance/mutual reachability distance - # issues, but we'll get quite a few, and they are the hard ones to - # get, so fill in any we can and then run update components. 
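# Editor's note: a standalone sketch, not part of the patch, illustrating the
# chunked parallel core-distance query strategy described in the comments
# above. The helper name `chunked_knn` and the thread preference are
# assumptions made for this illustration; the patch itself uses a
# module-level `_core_dist_query` helper with joblib's default backend.
import numpy as np
from joblib import Parallel, delayed
from sklearn.neighbors import KDTree

def chunked_knn(tree, X, k, n_jobs=4):
    # Split the query rows into n_jobs roughly equal chunks, query each
    # chunk against the same tree in parallel, then stack the results.
    split = X.shape[0] // n_jobs
    chunks = [
        X[i * split:] if i == n_jobs - 1 else X[i * split:(i + 1) * split]
        for i in range(n_jobs)
    ]
    results = Parallel(n_jobs=n_jobs, prefer="threads")(
        delayed(tree.query)(chunk, k=k) for chunk in chunks
    )
    dist = np.vstack([d for d, _ in results])
    ind = np.vstack([i for _, i in results])
    return dist, ind

rng = np.random.RandomState(0)
data = rng.random_sample((1000, 3))
dist, ind = chunked_knn(KDTree(data), data, k=6)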
- for n in range(self.num_points): - for i in range(0, self.min_samples + 1): - m = knn_indices[n, i] - if n == m: - continue - if self.core_distance[m] <= self.core_distance[n]: - self.candidate_point[n] = n - self.candidate_neighbor[n] = m - self.candidate_distance[n] = self.core_distance[n] - break - - self.update_components() - - for n in range(self.num_nodes): - self.bounds_arr[n] = DBL_MAX - - cdef _initialize_components(self): - """Initialize components of the min spanning tree (eventually there - is only one component; initially each point is its own component)""" - - cdef np.intp_t n - - for n in range(self.num_points): - self.component_of_point[n] = n - self.candidate_neighbor[n] = -1 - self.candidate_point[n] = -1 - self.candidate_distance[n] = DBL_MAX - - for n in range(self.num_nodes): - self.component_of_node[n] = -(n+1) - - cdef int update_components(self) except -1: - """Having found the nearest neighbor not in the same component for - each current component (via tree traversal), run through adding - edges to the min spanning tree and recomputing components via - union find.""" - - cdef np.intp_t source - cdef np.intp_t sink - cdef np.intp_t c - cdef np.intp_t component - cdef np.intp_t n - cdef np.intp_t i - cdef np.intp_t p - cdef np.intp_t current_component - cdef np.intp_t current_source_component - cdef np.intp_t current_sink_component - cdef np.intp_t child1 - cdef np.intp_t child2 - - cdef NodeData_t node_info - - # For each component there should be a: - # - candidate point (a point in the component) - # - candiate neighbor (the point to join with) - # - candidate_distance (the distance from point to neighbor) - # - # We will go through and and an edge to the edge list - # for each of these, and the union the two points - # together in the union find structure - - for c in range(self.components.shape[0]): - component = self.components[c] - source = self.candidate_point[component] - sink = self.candidate_neighbor[component] - if source == -1 or sink == -1: - continue - # raise ValueError('Source or sink of edge is not defined!') - current_source_component = self.component_union_find.find(source) - current_sink_component = self.component_union_find.find(sink) - if current_source_component == current_sink_component: - # We've already joined these, so ignore this edge - self.candidate_point[component] = -1 - self.candidate_neighbor[component] = -1 - self.candidate_distance[component] = DBL_MAX - continue - self.edges[self.num_edges, 0] = source - self.edges[self.num_edges, 1] = sink - if self.is_KDTree: - self.edges[self.num_edges, 2] = self.dist._rdist_to_dist( - self.candidate_distance[component]) - else: - self.edges[self.num_edges, 2] = self.candidate_distance[component] - self.num_edges += 1 - - self.component_union_find.union_(source, sink) - - # Reset everything,and check if we're done - self.candidate_distance[component] = DBL_MAX - if self.num_edges == self.num_points - 1: - self.components = self.component_union_find.components() - return self.components.shape[0] - - # After having joined everything in the union find data - # structure we need to go through and determine the components - # of each point for easy lookup. - # - # Have done that we then go through and set the component - # of each node, as this provides fast pruning in later - # tree traversals. 
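# Editor's note: an illustrative sketch (not the patch's code) of the implicit
# binary-heap layout that the node loops below rely on. sklearn's KDTree and
# BallTree store their nodes in a flat array where node n has children
# 2n + 1 and 2n + 2 and parent (n - 1) // 2, so walking the node array in
# reverse visits every child before its parent.
def children(n):
    return 2 * n + 1, 2 * n + 2

def parent(n):
    return (n - 1) // 2

assert children(0) == (1, 2)
assert parent(1) == 0 and parent(2) == 0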
- for n in range(self.tree.data.shape[0]): - self.component_of_point[n] = self.component_union_find.find(n) - - for n in range(self.tree.node_data.shape[0] - 1, -1, -1): - node_info = self.node_data[n] - # Case 1: - # If the node is a leaf we need to check that every point - # in the node is of the same component - if node_info.is_leaf: - current_component = self.component_of_point[ - self.idx_array[node_info.idx_start]] - for i in range(node_info.idx_start + 1, node_info.idx_end): - p = self.idx_array[i] - if self.component_of_point[p] != current_component: - break - else: - self.component_of_node[n] = current_component - # Case 2: - # If the node is not a leaf we only need to check - # that both child nodes are in the same component - else: - child1 = 2 * n + 1 - child2 = 2 * n + 2 - if (self.component_of_node[child1] == - self.component_of_node[child2]): - self.component_of_node[n] = self.component_of_node[child1] - - # Since we're working with mutual reachability distance we often have - # ties or near ties; because of that we can benefit by not resetting - # the bounds unless we get stuck (don't join any components). Thus - # we check for that, and only reset bounds in the case where we have - # the same number of components as we did going in. This doesn't - # produce a true min spanning tree, but only and approximation - # Thus only do this if the caller is willing to accept such - if self.approx_min_span_tree: - last_num_components = self.components.shape[0] - self.components = self.component_union_find.components() - - if self.components.shape[0] == last_num_components: - # Reset bounds - for n in range(self.num_nodes): - self.bounds_arr[n] = DBL_MAX - else: - self.components = self.component_union_find.components() - - for n in range(self.num_nodes): - self.bounds_arr[n] = DBL_MAX - - return self.components.shape[0] - - cdef int dual_tree_traversal(self, np.intp_t node1, - np.intp_t node2) nogil except -1: - """Perform a dual tree traversal, pruning wherever possible, to find - the nearest neighbor not in the same component for each component. 
- This is akin to a standard dual tree NN search, but we also prune - whenever all points in query and reference nodes are in the same - component.""" - - cdef np.intp_t[::1] point_indices1, point_indices2 - - cdef np.intp_t i - cdef np.intp_t j - - cdef np.intp_t p - cdef np.intp_t q - - cdef np.intp_t parent - cdef np.intp_t child1 - cdef np.intp_t child2 - - cdef double node_dist - - cdef NodeData_t node1_info = self.node_data[node1] - cdef NodeData_t node2_info = self.node_data[node2] - cdef NodeData_t parent_info - cdef NodeData_t left_info - cdef NodeData_t right_info - - cdef np.intp_t component1 - cdef np.intp_t component2 - - cdef np.double_t *raw_data = ( &self._raw_data[0, 0]) - cdef np.double_t d - - cdef np.double_t mr_dist - cdef np.double_t _radius - - cdef np.double_t new_bound - cdef np.double_t new_upper_bound - cdef np.double_t new_lower_bound - cdef np.double_t bound_max - cdef np.double_t bound_min - - cdef np.intp_t left - cdef np.intp_t right - cdef np.double_t left_dist - cdef np.double_t right_dist - - # Compute the distance between the query and reference nodes - if self.is_KDTree: - node_dist = kdtree_min_rdist_dual(self.dist, - node1, node2, self.node_bounds, - self.num_features) - else: #BallTree - node_dist = balltree_min_dist_dual(node1_info.radius, - node2_info.radius, - node1, node2, - self.centroid_distances) - - - # If the distance between the nodes is less than the current bound for - # the query and the nodes are not in the same component continue; - # otherwise we get to prune this branch and return early. - if node_dist < self.bounds_ptr[node1]: - if (self.component_of_node_ptr[node1] == - self.component_of_node_ptr[node2] and - self.component_of_node_ptr[node1] >= 0): - return 0 - else: - return 0 - - # Case 1: Both nodes are leaves - # for each pair of points in node1 x node2 we need - # to compute the distance and see if it better than - # the current nearest neighbor for the component of - # the point in the query node. - # - # We get to take some shortcuts: - # - if the core distance for a point is larger than - # the distance to the nearst neighbor of the - # component of the point ... then we can't get - # a better mutual reachability distance and we - # can skip computing anything for that point - # - if the points are in the same component we - # don't have to compute the distance. - # - # We also have some catches: - # - we need to compute mutual reachability distance - # not just the ordinary distance; this involves - # fiddling with core distances. - # - We need to scale distances according to alpha, - # but don't want to lose performance in the case - # that alpha is 1.0. - # - # Finally we can compute new bounds for the query node - # based on the distances found here, so do that and - # propagate the results up the tree. 
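# Editor's note: a small illustration, not part of the patch, of the first
# shortcut listed above. The mutual reachability distance is a maximum that
# includes core_distance(p), so it can never be smaller than core_distance(p);
# once a point's core distance exceeds its component's current best candidate
# distance, no pair involving that point can improve the candidate.
def worth_checking(core_p, best_candidate):
    # Mirrors the pruning test: p is skipped only when its core distance
    # strictly exceeds the component's current best candidate distance.
    return not (core_p > best_candidate)

assert worth_checking(0.3, 0.5) is True
assert worth_checking(0.7, 0.5) is False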
- if node1_info.is_leaf and node2_info.is_leaf: - - new_upper_bound = 0.0 - new_lower_bound = DBL_MAX - - point_indices1 = self.idx_array[node1_info.idx_start: - node1_info.idx_end] - point_indices2 = self.idx_array[node2_info.idx_start: - node2_info.idx_end] - - for i in range(point_indices1.shape[0]): - - p = point_indices1[i] - component1 = self.component_of_point_ptr[p] - - if (self.core_distance_ptr[p] > - self.candidate_distance_ptr[component1]): - continue - - for j in range(point_indices2.shape[0]): - - q = point_indices2[j] - component2 = self.component_of_point_ptr[q] - - if (self.core_distance_ptr[q] > - self.candidate_distance_ptr[component1]): - continue - - if component1 != component2: - if self.is_KDTree: - d = self.dist.rdist(&raw_data[self.num_features * p], - &raw_data[self.num_features * q], - self.num_features) - else: - d = self.dist.dist(&raw_data[self.num_features * p], - &raw_data[self.num_features * q], - self.num_features) * self.alpha - if self.alpha != 1.0: - mr_dist = max(d / self.alpha, - self.core_distance_ptr[p], - self.core_distance_ptr[q]) - else: - mr_dist = max(d, self.core_distance_ptr[p], - self.core_distance_ptr[q]) - if mr_dist < self.candidate_distance_ptr[component1]: - self.candidate_distance_ptr[component1] = mr_dist - self.candidate_neighbor_ptr[component1] = q - self.candidate_point_ptr[component1] = p - - new_upper_bound = max(new_upper_bound, - self.candidate_distance_ptr[component1]) - new_lower_bound = min(new_lower_bound, - self.candidate_distance_ptr[component1]) - - # Compute new bounds for the query node, and - # then propagate the results of that computation - # up the tree. - _radius = self.dist._dist_to_rdist(node1_info.radius) if self.is_KDTree else node1_info.radius - new_bound = min(new_upper_bound, - new_lower_bound + 2 * _radius) - if new_bound < self.bounds_ptr[node1]: - self.bounds_ptr[node1] = new_bound - - # Propagate bounds up the tree - while node1 > 0: - parent = (node1 - 1) // 2 - left = 2 * parent + 1 - right = 2 * parent + 2 - - parent_info = self.node_data[parent] - left_info = self.node_data[left] - right_info = self.node_data[right] - - bound_max = max(self.bounds_ptr[left], - self.bounds_ptr[right]) - - if self.is_KDTree: - new_bound = bound_max - else: - bound_min = min(self.bounds_ptr[left] + 2 * - (parent_info.radius - left_info.radius), - self.bounds_ptr[right] + 2 * - (parent_info.radius - right_info.radius)) - - if bound_min > 0: - new_bound = min(bound_max, bound_min) - else: - new_bound = bound_max - if new_bound < self.bounds_ptr[parent]: - self.bounds_ptr[parent] = new_bound - node1 = parent - else: - break - - # Case 2a: The query node is a leaf, or is smaller than - # the reference node. - # - # We descend in the reference tree. We first - # compute distances between nodes to determine - # whether we should prioritise the left or - # right branch in the reference tree. 
- elif node1_info.is_leaf or (not node2_info.is_leaf and - node2_info.radius > node1_info.radius): - - left = 2 * node2 + 1 - right = 2 * node2 + 2 - - if self.is_KDTree: - left_dist = kdtree_min_rdist_dual(self.dist, - node1, left, - self.node_bounds, - self.num_features) - right_dist = kdtree_min_rdist_dual(self.dist, - node1, right, - self.node_bounds, - self.num_features) - else: - node2_info = self.node_data[left] - left_dist = balltree_min_dist_dual(node1_info.radius, - node2_info.radius, - node1, left, - self.centroid_distances) - node2_info = self.node_data[right] - right_dist = balltree_min_dist_dual(node1_info.radius, - node2_info.radius, - node1, right, - self.centroid_distances) - - if left_dist < right_dist: - self.dual_tree_traversal(node1, left) - self.dual_tree_traversal(node1, right) - else: - self.dual_tree_traversal(node1, right) - self.dual_tree_traversal(node1, left) - - # Case 2b: The reference node is a leaf, or is smaller than - # the query node. - # - # We descend in the query tree. We first - # compute distances between nodes to determine - # whether we should prioritise the left or - # right branch in the query tree. - else: - left = 2 * node1 + 1 - right = 2 * node1 + 2 - if self.is_KDTree: - left_dist = kdtree_min_rdist_dual(self.dist, - left, node2, - self.node_bounds, - self.num_features) - right_dist = kdtree_min_rdist_dual(self.dist, - right, node2, - self.node_bounds, - self.num_features) - else: - node1_info = self.node_data[left] - left_dist = balltree_min_dist_dual(node1_info.radius, - node2_info.radius, - left, node2, - self.centroid_distances) - node1_info = self.node_data[right] - right_dist = balltree_min_dist_dual(node1_info.radius, - node2_info.radius, - right, node2, - self.centroid_distances) - - - if left_dist < right_dist: - self.dual_tree_traversal(left, node2) - self.dual_tree_traversal(right, node2) - else: - self.dual_tree_traversal(right, node2) - self.dual_tree_traversal(left, node2) - - return 0 - - cpdef spanning_tree(self): - """Compute the minimum spanning tree of the data held by - the tree passed in at construction""" - - cdef np.intp_t num_components - cdef np.intp_t num_nodes - - num_components = self.tree.data.shape[0] - num_nodes = self.tree.node_data.shape[0] - while num_components > 1: - self.dual_tree_traversal(0, 0) - num_components = self.update_components() - - return self.edges diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan_.py index 2af24fabb9aad..fbe4f9d4e4671 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan_.py @@ -23,7 +23,6 @@ from sklearn.utils import check_array, gen_batches, get_chunk_n_rows from sklearn.utils._param_validation import Interval, StrOptions, validate_params -from ._hdbscan_boruvka import BoruvkaAlgorithm from ._hdbscan_linkage import label, mst_linkage_core, mst_linkage_core_vector from ._hdbscan_reachability import mutual_reachability, sparse_mutual_reachability from ._hdbscan_tree import ( @@ -46,10 +45,8 @@ { "auto", "brute", - "prims_kdtree", - "prims_balltree", - "boruvka_kdtree", - "boruvka_balltree", + "kdtree", + "balltree", } ) ], @@ -226,40 +223,6 @@ def _hdbscan_prims( return _process_mst(min_spanning_tree) -def _hdbscan_boruvka( - X, - algo, - min_samples=5, - metric="euclidean", - leaf_size=40, - n_jobs=4, - **metric_params, -): - leaf_size = max(leaf_size, 3) - Tree = KDTree if algo == "kd_tree" else BallTree - tree = Tree(X, metric=metric, leaf_size=leaf_size, **metric_params) - - n_samples = X.shape[0] - 
if min_samples + 1 > n_samples: - raise ValueError( - "Expected min_samples + 1 <= n_samples, " - f" but {min_samples+1=}, {n_samples=}" - ) - - out = BoruvkaAlgorithm( - tree=tree, - min_samples=min_samples, - metric=metric, - leaf_size=leaf_size // 3, - approx_min_span_tree=True, - n_jobs=n_jobs, - **metric_params, - ) - min_spanning_tree = out.spanning_tree() - - return _process_mst(min_spanning_tree) - - def remap_single_linkage_tree(tree, internal_to_raw, outliers): """ Takes an internal single_linkage_tree structure and adds back in a set of points @@ -388,20 +351,16 @@ def hdbscan( algorithm : str, default='auto' Exactly which algorithm to use; hdbscan has variants specialised for different characteristics of the data. By default this is set - to `'auto'` which attempts to use a `KDTree` method if possible, - otherwise it uses a `BallTree` method. If `X` has `n_features>60` - then a `boruvka` approach is used, otherwise a `prims` approach is - used. + to `'auto'` which attempts to use a `KDTree` tree if possible, + otherwise it uses a `BallTree` tree. If `X` is sparse or `metric` is invalid for both `KDTree` and `BallTree`, then it resolves to use the `brute` algorithm. Available algorithms: - `'brute'` - - `'prims_kdtree'` - - `'prims_balltree'` - - `'boruvka_kdtree'` - - `'boruvka_balltree'` + - `'kdtree'` + - `'balltree'` memory : str, default=None Used to cache the output of the computation of the tree. @@ -508,18 +467,11 @@ def hdbscan( func = _hdbscan_brute for key in ("algo", "leaf_size", "n_jobs"): kwargs.pop(key, None) - elif algorithm == "prims_kdtree": + elif algorithm == "kdtree": func = _hdbscan_prims - elif algorithm == "prims_balltree": + elif algorithm == "balltree": func = _hdbscan_prims kwargs["algo"] = "ball_tree" - elif algorithm == "boruvka_kdtree": - func = _hdbscan_boruvka - kwargs.pop("alpha", None) - elif algorithm == "boruvka_balltree": - func = _hdbscan_boruvka - kwargs.pop("alpha", None) - kwargs["algo"] = "ball_tree" else: if issparse(X) or metric not in FAST_METRICS: # We can't do much with sparse matrices ... @@ -527,21 +479,10 @@ def hdbscan( for key in ("algo", "leaf_size", "n_jobs"): kwargs.pop(key, None) elif metric in KDTree.valid_metrics: - # TO DO: Need heuristic to decide when to go to boruvka - if X.shape[1] > 60: - func = _hdbscan_prims - else: - func = _hdbscan_boruvka - kwargs.pop("alpha", None) + func = _hdbscan_prims else: # Metric is a valid BallTree metric - # TO DO: Need heuristic to decide when to go to boruvka; - if X.shape[1] > 60: - func = _hdbscan_prims - kwargs["algo"] = "ball_tree" - else: - func = _hdbscan_boruvka - kwargs.pop("alpha", None) - kwargs["algo"] = "ball_tree" + func = _hdbscan_prims + kwargs["algo"] = "ball_tree" single_linkage_tree = memory.cache(func)(**kwargs) @@ -604,10 +545,8 @@ class HDBSCAN(ClusterMixin, BaseEstimator): algorithm : str, default='auto' Exactly which algorithm to use; hdbscan has variants specialised for different characteristics of the data. By default this is set - to `'auto'` which attempts to use a `KDTree` method if possible, - otherwise it uses a `BallTree` method. If the `X` passed during `fit` - has `n_features>60` then a `boruvka` approach is used, otherwise a - `prims` approach is used. + to `'auto'` which attempts to use a `KDTree` tree if possible, + otherwise it uses a `BallTree` tree. 
If the `X` passed during `fit` is sparse or `metric` is invalid for both `KDTree` and `BallTree`, then it resolves to use the `brute` @@ -615,10 +554,8 @@ class HDBSCAN(ClusterMixin, BaseEstimator): Available algorithms: - `'brute'` - - `'prims_kdtree'` - - `'prims_balltree'` - - `'boruvka_kdtree'` - - `'boruvka_balltree'` + - `'kdtree'` + - `'balltree'` leaf_size : int, default=40 Leaf size for trees responsible for fast nearest neighbour queries. A diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index 0d30db9c286ab..d49c70f1e5e5d 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -129,8 +129,6 @@ def test_hdbscan_feature_vector(): [ "prims_kdtree", "prims_balltree", - "boruvka_kdtree", - "boruvka_balltree", "brute", "auto", ], @@ -148,8 +146,6 @@ def test_hdbscan_algorithms(algo, metric): ALGOS_TREES = { "prims_kdtree": KDTree, "prims_balltree": BallTree, - "boruvka_kdtree": KDTree, - "boruvka_balltree": BallTree, } METRIC_PARAMS = { "mahalanobis": {"V": np.eye(X.shape[1])}, @@ -266,32 +262,11 @@ def test_hdbscan_input_lists(): HDBSCAN(min_samples=1).fit(X) -@pytest.mark.parametrize("tree", ["kdtree", "balltree"]) -def test_hdbscan_boruvka_matches(tree): - - data = generate_noisy_data() - - labels_prims = hdbscan(data, algorithm="brute")[0] - labels_boruvka = hdbscan(data, algorithm=f"boruvka_{tree}")[0] - - num_mismatches = homogeneity(labels_prims, labels_boruvka) - - assert (num_mismatches / float(data.shape[0])) < 0.15 - - labels_prims = HDBSCAN(algorithm="brute").fit_predict(data) - labels_boruvka = HDBSCAN(algorithm=f"boruvka_{tree}").fit_predict(data) - - num_mismatches = homogeneity(labels_prims, labels_boruvka) - - assert (num_mismatches / float(data.shape[0])) < 0.15 - - -@pytest.mark.parametrize("strategy", ["prims", "boruvka"]) @pytest.mark.parametrize("tree", ["kd", "ball"]) -def test_hdbscan_precomputed_non_brute(strategy, tree): - hdb = HDBSCAN(metric="precomputed", algorithm=f"{strategy}_{tree}tree") +def test_hdbscan_precomputed_non_brute(tree): + hdb = HDBSCAN(metric="precomputed", algorithm=f"prims_{tree}tree") with pytest.raises(ValueError): - hdbscan(X, metric="precomputed", algorithm=f"{strategy}_{tree}tree") + hdbscan(X, metric="precomputed", algorithm=f"prims_{tree}tree") with pytest.raises(ValueError): hdb.fit(X) @@ -312,9 +287,9 @@ def test_hdbscan_sparse(): msg = "Sparse data matrices only support algorithm `brute`." 
with pytest.raises(ValueError, match=msg): - HDBSCAN(metric="euclidean", algorithm="boruvka_balltree").fit(sparse_X) + HDBSCAN(metric="euclidean", algorithm="prims_balltree").fit(sparse_X) with pytest.raises(ValueError, match=msg): - hdbscan(sparse_X, metric="euclidean", algorithm="boruvka_balltree") + hdbscan(sparse_X, metric="euclidean", algorithm="prims_balltree") def test_hdbscan_caching(tmp_path): @@ -406,17 +381,6 @@ def test_hdbscan_precomputed_array_like(): hdbscan(X, metric="precomputed") -@pytest.mark.parametrize("algo", ["boruvka_kdtree", "boruvka_balltree"]) -def test_hdbscan_min_samples_less_than_total(algo): - X = np.array([[1, 2], [2, 1]]) - - msg = "Expected min_samples" - with pytest.raises(ValueError, match=msg): - hdbscan(X, algorithm=algo, min_samples=3) - with pytest.raises(ValueError, match=msg): - HDBSCAN(algorithm=algo, min_samples=3).fit(X) - - def test_hdbscan_sparse_distances_too_few_nonzero(): X = sparse.csr_matrix(np.zeros((10, 10))) From 84484eae90ec264925dcdbd59d7e8d9ba7beb398 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Fri, 26 Aug 2022 12:36:11 -0400 Subject: [PATCH 091/160] Refactored file names and setup file --- sklearn/cluster/__init__.py | 2 +- .../{_hdbscan_linkage.pyx => _linkage.pyx} | 0 ...can_reachability.pyx => _reachability.pyx} | 0 .../_hdbscan/{_hdbscan_tree.pyx => _tree.pyx} | 0 .../_hdbscan/{hdbscan_.py => hdbscan.py} | 6 +-- sklearn/cluster/_hdbscan/setup.py | 42 +++++++++++++++++++ .../cluster/_hdbscan/tests/test_hdbscan.py | 6 +-- sklearn/cluster/setup.py | 26 ------------ 8 files changed, 48 insertions(+), 34 deletions(-) rename sklearn/cluster/_hdbscan/{_hdbscan_linkage.pyx => _linkage.pyx} (100%) rename sklearn/cluster/_hdbscan/{_hdbscan_reachability.pyx => _reachability.pyx} (100%) rename sklearn/cluster/_hdbscan/{_hdbscan_tree.pyx => _tree.pyx} (100%) rename sklearn/cluster/_hdbscan/{hdbscan_.py => hdbscan.py} (99%) create mode 100644 sklearn/cluster/_hdbscan/setup.py diff --git a/sklearn/cluster/__init__.py b/sklearn/cluster/__init__.py index 69977a0489ba0..f3c9a8ee2c20f 100644 --- a/sklearn/cluster/__init__.py +++ b/sklearn/cluster/__init__.py @@ -23,7 +23,7 @@ ) from ._bicluster import SpectralBiclustering, SpectralCoclustering from ._birch import Birch -from ._hdbscan.hdbscan_ import HDBSCAN, hdbscan +from ._hdbscan.hdbscan import HDBSCAN, hdbscan __all__ = [ "AffinityPropagation", diff --git a/sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx similarity index 100% rename from sklearn/cluster/_hdbscan/_hdbscan_linkage.pyx rename to sklearn/cluster/_hdbscan/_linkage.pyx diff --git a/sklearn/cluster/_hdbscan/_hdbscan_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx similarity index 100% rename from sklearn/cluster/_hdbscan/_hdbscan_reachability.pyx rename to sklearn/cluster/_hdbscan/_reachability.pyx diff --git a/sklearn/cluster/_hdbscan/_hdbscan_tree.pyx b/sklearn/cluster/_hdbscan/_tree.pyx similarity index 100% rename from sklearn/cluster/_hdbscan/_hdbscan_tree.pyx rename to sklearn/cluster/_hdbscan/_tree.pyx diff --git a/sklearn/cluster/_hdbscan/hdbscan_.py b/sklearn/cluster/_hdbscan/hdbscan.py similarity index 99% rename from sklearn/cluster/_hdbscan/hdbscan_.py rename to sklearn/cluster/_hdbscan/hdbscan.py index fbe4f9d4e4671..ffefd0d3d5443 100644 --- a/sklearn/cluster/_hdbscan/hdbscan_.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -23,9 +23,9 @@ from sklearn.utils import check_array, gen_batches, get_chunk_n_rows from sklearn.utils._param_validation import Interval, 
StrOptions, validate_params -from ._hdbscan_linkage import label, mst_linkage_core, mst_linkage_core_vector -from ._hdbscan_reachability import mutual_reachability, sparse_mutual_reachability -from ._hdbscan_tree import ( +from ._linkage import label, mst_linkage_core, mst_linkage_core_vector +from ._reachability import mutual_reachability, sparse_mutual_reachability +from ._tree import ( compute_stability, condense_tree, get_clusters, diff --git a/sklearn/cluster/_hdbscan/setup.py b/sklearn/cluster/_hdbscan/setup.py new file mode 100644 index 0000000000000..9392c08be2256 --- /dev/null +++ b/sklearn/cluster/_hdbscan/setup.py @@ -0,0 +1,42 @@ +# License: BSD 3 clause +import os + +import numpy + + +def configuration(parent_package="", top_path=None): + from numpy.distutils.misc_util import Configuration + + libraries = [] + if os.name == "posix": + libraries.append("m") + + config = Configuration("_hdbscan", parent_package, top_path) + + # HDBSCAN subpackage + config.add_subpackage("tests") + config.add_extension( + "_linkage", + sources=["_linkage.pyx"], + include_dirs=[numpy.get_include()], + libraries=libraries, + ) + config.add_extension( + "_reachability", + sources=["_reachability.pyx"], + include_dirs=[numpy.get_include()], + libraries=libraries, + ) + config.add_extension( + "_tree", + sources=["_tree.pyx"], + include_dirs=[numpy.get_include()], + libraries=libraries, + ) + return config + + +if __name__ == "__main__": + from numpy.distutils.core import setup + + setup(**configuration(top_path="").todict()) diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index d49c70f1e5e5d..55c25d319b2c8 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -19,7 +19,6 @@ from sklearn.utils._testing import assert_array_almost_equal n_clusters = 3 -# X = generate_clustered_data(n_clusters=n_clusters, n_samples_per_cluster=50) X, y = make_blobs(n_samples=200, random_state=10) X, y = shuffle(X, y, random_state=7) X = StandardScaler().fit_transform(X) @@ -76,7 +75,7 @@ def test_hdbscan_distance_matrix(): labels = hdbscan(D, metric="precomputed")[0] # number of clusters, ignoring noise if present - n_clusters_1 = len(set(labels)) - int(-1 in labels) # ignore noise + n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters labels = HDBSCAN(metric="precomputed").fit(D).labels_ @@ -101,7 +100,7 @@ def test_hdbscan_sparse_distance_matrix(): labels = hdbscan(D, metric="precomputed")[0] # number of clusters, ignoring noise if present - n_clusters_1 = len(set(labels)) - int(-1 in labels) # ignore noise + n_clusters_1 = len(set(labels)) - int(-1 in labels) assert n_clusters_1 == n_clusters labels = HDBSCAN(metric="precomputed").fit(D).labels_ @@ -188,7 +187,6 @@ def test_hdbscan_dbscan_clustering(): def test_hdbscan_high_dimensional(): H, y = make_blobs(n_samples=50, random_state=0, n_features=64) - # H, y = shuffle(X, y, random_state=7) H = StandardScaler().fit_transform(H) labels = hdbscan(H)[0] n_clusters_1 = len(set(labels)) - int(-1 in labels) diff --git a/sklearn/cluster/setup.py b/sklearn/cluster/setup.py index eb5f622f65ad8..9ba195cf3230c 100644 --- a/sklearn/cluster/setup.py +++ b/sklearn/cluster/setup.py @@ -60,32 +60,6 @@ def configuration(parent_package="", top_path=None): config.add_subpackage("tests") config.add_subpackage("_hdbscan") - # HDBSCAN subpackage - config.add_subpackage("_hdbscan.tests") - config.add_extension( - 
"_hdbscan._hdbscan_boruvka", - sources=["_hdbscan/_hdbscan_boruvka.pyx"], - include_dirs=[numpy.get_include(), "_hdbscan"], - libraries=libraries, - ) - config.add_extension( - "_hdbscan._hdbscan_linkage", - sources=["_hdbscan/_hdbscan_linkage.pyx"], - include_dirs=[numpy.get_include()], - libraries=libraries, - ) - config.add_extension( - "_hdbscan._hdbscan_reachability", - sources=["_hdbscan/_hdbscan_reachability.pyx"], - include_dirs=[numpy.get_include()], - libraries=libraries, - ) - config.add_extension( - "_hdbscan._hdbscan_tree", - sources=["_hdbscan/_hdbscan_tree.pyx"], - include_dirs=[numpy.get_include()], - libraries=libraries, - ) return config From 9ebc64355c3a7bcbbabdc3cf6611968d88acf43c Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Fri, 26 Aug 2022 12:53:04 -0400 Subject: [PATCH 092/160] Reintroduced boruvka algorithm --- sklearn/cluster/_hdbscan/_boruvka.pyx | 887 ++++++++++++++++++++++++++ sklearn/cluster/_hdbscan/hdbscan.py | 93 ++- sklearn/cluster/_hdbscan/setup.py | 6 + 3 files changed, 971 insertions(+), 15 deletions(-) create mode 100644 sklearn/cluster/_hdbscan/_boruvka.pyx diff --git a/sklearn/cluster/_hdbscan/_boruvka.pyx b/sklearn/cluster/_hdbscan/_boruvka.pyx new file mode 100644 index 0000000000000..18a582a4594f1 --- /dev/null +++ b/sklearn/cluster/_hdbscan/_boruvka.pyx @@ -0,0 +1,887 @@ +# Minimum spanning tree single linkage implementation for hdbscan +# Authors: Leland McInnes +# License: 3-clause BSD + +# Code to implement a Dual Tree Boruvka Minimimum Spanning Tree computation +# The algorithm is largely tree independent, but some fine details still +# depend on the particular choice of tree. +# +# The core idea of the algorithm is to do repeated sweeps through the dataset, +# adding edges to the tree with each sweep until a full tree is formed. +# To do this, start with each node (or point) existing in it's own component. +# On each sweep find all the edges of minimum weight (in this instance +# of minimal mutual reachability distance) that join separate components. +# Add all these edges to the list of edges in the spanning tree, and then +# combine together all the components joined by edges. Begin the next sweep ... +# +# Eventually we end up with only one component, and all edges in we added +# form the minimum spanning tree. The key insight is that each sweep is +# essentially akin to a nearest neighbor search (with the caveat about being +# in separate components), and so can be performed very efficiently using +# a space tree such as a kdtree or ball tree. By using a dual tree formalism +# with a query tree and reference tree we can prune when all points im the +# query node are in the same component, as are all the points of the reference +# node. This allows for rapid pruning in the dual tree traversal in later +# stages. Importantly, we can construct the full tree in O(log N) sweeps +# and since each sweep has complexity equal to that of an all points +# nearest neighbor query within the tree structure we are using we end +# up with sub-quadratic complexity at worst. +# +# This code is based on the papers: +# +# Fast Euclidean Minimum Spanning Tree: Algorithm, analysis, and applications +# William B. March, Parikshit Ram, Alexander Gray +# Conference: Proceedings of the 16th ACM SIGKDD International Conference on +# Knowledge Discovery and Data Mining +# 2010 +# +# Tree-Independent Dual-Tree Algorithms +# Ryan R. Curtin, William B. March, Parikshit Ram, David V. Anderson, +# Alexander G. Gray, Charles L. 
Isbell Jr +# 2013, arXiv 1304.4327 +# +# As per the sklearn BallTree and KDTree implementations we make use of +# the rdist for KDTree, which is a faster-to-compute notion of distance +# (for example in the euclidean case it is the distance squared). +# +# To combine together components in between sweeps we make use of +# a union find data structure. This is a separate implementation +# from that used in the labelling of the single linkage tree as +# we can perform more specific optimizations here for what +# is a simpler version of the structure. + +import numpy as np + +cimport numpy as np +from libc.float cimport DBL_MAX +from libc.math cimport fabs, pow + +from sklearn.neighbors import BallTree, KDTree + +from sklearn.metrics._dist_metrics cimport DistanceMetric + +from joblib import Parallel, delayed + + +cdef np.double_t INF = np.inf + + +# Define the NodeData struct used in sklearn trees for faster +# access to the node data internals in Cython. +cdef struct NodeData_t: + np.intp_t idx_start + np.intp_t idx_end + np.intp_t is_leaf + np.double_t radius + + +# Define a function giving the minimum distance between two +# nodes of a ball tree +cdef inline np.double_t balltree_min_dist_dual( + np.double_t radius1, + np.double_t radius2, + np.intp_t node1, + np.intp_t node2, + np.double_t[:, ::1] centroid_dist) nogil except -1: + + cdef np.double_t dist_pt = centroid_dist[node1, node2] + return max(0, (dist_pt - radius1 - radius2)) + + +# Define a function giving the minimum distance between two +# nodes of a kd-tree +cdef inline np.double_t kdtree_min_dist_dual( + DistanceMetric metric, + np.intp_t node1, + np.intp_t node2, + np.double_t[:, :, ::1] node_bounds, + np.intp_t num_features) except -1: + + cdef np.double_t d, d1, d2, rdist = 0.0 + cdef np.double_t zero = 0.0 + cdef np.intp_t j + + if metric.p == INF: + for j in range(num_features): + d1 = (node_bounds[0, node1, j] - + node_bounds[1, node2, j]) + d2 = (node_bounds[0, node2, j] - + node_bounds[1, node1, j]) + d = (d1 + fabs(d1)) + (d2 + fabs(d2)) + + rdist = max(rdist, 0.5 * d) + else: + # here we'll use the fact that x + abs(x) = 2 * max(x, 0) + for j in range(num_features): + d1 = (node_bounds[0, node1, j] - + node_bounds[1, node2, j]) + d2 = (node_bounds[0, node2, j] - + node_bounds[1, node1, j]) + d = (d1 + fabs(d1)) + (d2 + fabs(d2)) + + rdist += pow(0.5 * d, metric.p) + + return metric._rdist_to_dist(rdist) + + +# As above, but this time we use the rdist as per the kdtree +# implementation. This allows us to release the GIL over +# larger sections of code +cdef inline np.double_t kdtree_min_rdist_dual( + DistanceMetric metric, + np.intp_t node1, + np.intp_t node2, + np.double_t[:, :, ::1] node_bounds, + np.intp_t num_features) nogil except -1: + + cdef np.double_t d, d1, d2, rdist = 0.0 + cdef np.double_t zero = 0.0 + cdef np.intp_t j + + if metric.p == INF: + for j in range(num_features): + d1 = (node_bounds[0, node1, j] - + node_bounds[1, node2, j]) + d2 = (node_bounds[0, node2, j] - + node_bounds[1, node1, j]) + d = (d1 + fabs(d1)) + (d2 + fabs(d2)) + + rdist = max(rdist, 0.5 * d) + else: + # here we'll use the fact that x + abs(x) = 2 * max(x, 0) + for j in range(num_features): + d1 = (node_bounds[0, node1, j] - + node_bounds[1, node2, j]) + d2 = (node_bounds[0, node2, j] - + node_bounds[1, node1, j]) + d = (d1 + fabs(d1)) + (d2 + fabs(d2)) + + rdist += pow(0.5 * d, metric.p) + + return rdist + + +cdef class BoruvkaUnionFind(object): + """Efficient union find implementation. 
+ + Parameters + ---------- + + size : int + The total size of the set of objects to + track via the union find structure. + + Attributes + ---------- + + is_component : array of bool; shape (size, 1) + Array specifying whether each element of the + set is the root node, or identifier for + a component. + """ + + cdef np.ndarray _parent_arr + cdef np.intp_t[::1] _parent + cdef np.ndarray _rank_arr + cdef np.uint8_t[::1] _rank + cdef np.ndarray is_component + + def __init__(self, size): + self._parent_arr = np.arange(size, dtype=np.intp) + self._parent = ( ( + self._parent_arr.data)) + self._rank_arr = np.zeros(size, dtype=np.uint8) + self._rank = ( ( + self._rank_arr.data)) + self.is_component = np.ones(size, dtype=bool) + + cdef int union_(self, np.intp_t x, np.intp_t y) except -1: + """Union together elements x and y""" + cdef np.intp_t x_root = self.find(x) + cdef np.intp_t y_root = self.find(y) + + if x_root == y_root: + return 0 + + if self._rank[x_root] < self._rank[y_root]: + self._parent[x_root] = y_root + self.is_component[x_root] = False + elif self._rank[x_root] > self._rank[y_root]: + self._parent[y_root] = x_root + self.is_component[y_root] = False + else: + self._rank[x_root] += 1 + self._parent[y_root] = x_root + self.is_component[y_root] = False + + return 0 + + cdef np.intp_t find(self, np.intp_t x) except -1: + """Find the root or identifier for the component that x is in""" + cdef np.intp_t x_parent + cdef np.intp_t x_grandparent + + x_parent = self._parent[x] + while True: + if x_parent == x: + return x + x_grandparent = self._parent[x_parent] + self._parent[x] = x_grandparent + x = x_parent + x_parent = x_grandparent + + cdef np.ndarray[np.intp_t, ndim=1] components(self): + """Return an array of all component roots/identifiers""" + return self.is_component.nonzero()[0] + + +def _core_dist_query(tree, data, min_samples): + return tree.query(data, k=min_samples, dualtree=True, breadth_first=True) + +cdef class BoruvkaAlgorithm(object): + """A Dual Tree Boruvka Algorithm implemented for the sklearn + KDTree space tree implementation. + + Parameters + ---------- + + tree : KDTree + The kd-tree to run Dual Tree Boruvka over. + + min_samples : int, optional (default= 5) + The min_samples parameter of HDBSCAN used to + determine core distances. + + metric : string, optional (default='euclidean') + The metric used to compute distances for the tree + + leaf_size : int, optional (default=20) + The Boruvka algorithm benefits from a smaller leaf size than + standard kd-tree nearest neighbor searches. The tree passed in + is used for a kNN search for core distance. A second tree is + constructed with a smaller leaf size for Boruvka; this is that + leaf size. + + alpha : float, optional (default=1.0) + The alpha distance scaling parameter as per Robust Single Linkage. + + approx_min_span_tree : bool, optional (default=False) + Take shortcuts and only approximate the min spanning tree. + This is considerably faster but does not return a true + minimal spanning tree. + + n_jobs : int, optional (default=4) + The number of parallel jobs used to compute core distances. + + **kwargs : + Keyword args passed to the metric. 
+ """ + + cdef object tree + cdef object core_dist_tree + cdef DistanceMetric dist + cdef np.ndarray _data + cdef readonly const np.double_t[:, ::1] _raw_data + cdef np.double_t[:, :, ::1] node_bounds + cdef np.double_t alpha + cdef np.int8_t approx_min_span_tree + cdef np.intp_t n_jobs + cdef np.intp_t min_samples + cdef np.intp_t num_points + cdef np.intp_t num_nodes + cdef np.intp_t num_features + cdef bint is_KDTree + + cdef public np.double_t[::1] core_distance + cdef public np.double_t[::1] bounds + cdef public np.intp_t[::1] component_of_point + cdef public np.intp_t[::1] component_of_node + cdef public np.intp_t[::1] candidate_neighbor + cdef public np.intp_t[::1] candidate_point + cdef public np.double_t[::1] candidate_distance + cdef public np.double_t[:, ::1] centroid_distances + cdef public np.intp_t[::1] idx_array + cdef public NodeData_t[::1] node_data + cdef BoruvkaUnionFind component_union_find + cdef np.ndarray edges + cdef np.intp_t num_edges + + cdef np.intp_t *component_of_point_ptr + cdef np.intp_t *component_of_node_ptr + cdef np.double_t *candidate_distance_ptr + cdef np.intp_t *candidate_neighbor_ptr + cdef np.intp_t *candidate_point_ptr + cdef np.double_t *core_distance_ptr + cdef np.double_t *bounds_ptr + + cdef np.ndarray components + cdef np.ndarray core_distance_arr + cdef np.ndarray bounds_arr + cdef np.ndarray _centroid_distances_arr + cdef np.ndarray component_of_point_arr + cdef np.ndarray component_of_node_arr + cdef np.ndarray candidate_point_arr + cdef np.ndarray candidate_neighbor_arr + cdef np.ndarray candidate_distance_arr + + def __init__(self, tree, min_samples=5, metric='euclidean', leaf_size=20, + alpha=1.0, approx_min_span_tree=False, n_jobs=4, **kwargs): + + self.core_dist_tree = tree + self.tree = tree + self.is_KDTree = isinstance(tree, KDTree) + self._data = np.array(self.tree.data) + self._raw_data = self.tree.data + self.node_bounds = self.tree.node_bounds + self.min_samples = min_samples + self.alpha = alpha + self.approx_min_span_tree = approx_min_span_tree + self.n_jobs = n_jobs + + self.num_points = self.tree.data.shape[0] + self.num_features = self.tree.data.shape[1] + self.num_nodes = self.tree.node_data.shape[0] + + self.dist = DistanceMetric.get_metric(metric, **kwargs) + + self.components = np.arange(self.num_points) + self.bounds_arr = np.empty(self.num_nodes, np.double) + self.component_of_point_arr = np.empty(self.num_points, dtype=np.intp) + self.component_of_node_arr = np.empty(self.num_nodes, dtype=np.intp) + self.candidate_neighbor_arr = np.empty(self.num_points, dtype=np.intp) + self.candidate_point_arr = np.empty(self.num_points, dtype=np.intp) + self.candidate_distance_arr = np.empty(self.num_points, + dtype=np.double) + self.component_union_find = BoruvkaUnionFind(self.num_points) + + self.edges = np.empty((self.num_points - 1, 3)) + self.num_edges = 0 + + self.idx_array = self.tree.idx_array + self.node_data = self.tree.node_data + + self.bounds = ( ( + self.bounds_arr.data)) + self.component_of_point = ( ( + self.component_of_point_arr.data)) + self.component_of_node = ( ( + self.component_of_node_arr.data)) + self.candidate_neighbor = ( ( + self.candidate_neighbor_arr.data)) + self.candidate_point = ( ( + self.candidate_point_arr.data)) + self.candidate_distance = ( ( + self.candidate_distance_arr.data)) + + if not self.is_KDTree: + # Compute centroids for BallTree + self._centroid_distances_arr = self.dist.pairwise(self.tree.node_bounds[0]) + self.centroid_distances = ( + ( + + self._centroid_distances_arr.data)) + + 
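# Editor's note: an illustrative sketch (not the patch's code) of how the
# centroid distances computed above are used. For a ball tree, the distance
# between any point in node 1 and any point in node 2 is at least the
# distance between the node centroids minus both node radii, clamped at
# zero; this is the bound that `balltree_min_dist_dual` returns.
def balltree_node_lower_bound(centroid_dist, radius1, radius2):
    return max(0.0, centroid_dist - radius1 - radius2)

# Two balls of radius 1.0 whose centroids are 5.0 apart cannot contain
# points closer than 3.0 to each other.
assert balltree_node_lower_bound(5.0, 1.0, 1.0) == 3.0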
self._initialize_components() + self._compute_bounds() + + # Set up fast pointer access to arrays + self.component_of_point_ptr = &self.component_of_point[0] + self.component_of_node_ptr = &self.component_of_node[0] + self.candidate_distance_ptr = &self.candidate_distance[0] + self.candidate_neighbor_ptr = &self.candidate_neighbor[0] + self.candidate_point_ptr = &self.candidate_point[0] + self.core_distance_ptr = &self.core_distance[0] + self.bounds_ptr = &self.bounds[0] + + cdef _compute_bounds(self): + """Initialize core distances""" + + cdef np.intp_t n + cdef np.intp_t i + cdef np.intp_t m + + cdef np.ndarray[np.double_t, ndim=2] knn_dist + cdef np.ndarray[np.intp_t, ndim=2] knn_indices + + # A shortcut: if we have a lot of points then we can split the points + # into four piles and query them in parallel. On multicore systems + # (most systems) this amounts to a 2x-3x wall clock improvement. + if self.tree.data.shape[0] > 16384 and self.n_jobs > 1: + split_cnt = self.num_points // self.n_jobs + datasets = [] + for i in range(self.n_jobs): + if i == self.n_jobs - 1: + datasets.append(np.asarray(self.tree.data[i*split_cnt:])) + else: + datasets.append(np.asarray(self.tree.data[i*split_cnt:(i+1)*split_cnt])) + + knn_data = Parallel(n_jobs=self.n_jobs, max_nbytes=None)( + delayed(_core_dist_query) + (self.core_dist_tree, points, + self.min_samples + 1) + for points in datasets) + knn_dist = np.vstack([x[0] for x in knn_data]) + knn_indices = np.vstack([x[1] for x in knn_data]) + else: + knn_dist, knn_indices = self.core_dist_tree.query( + self.tree.data, + k=self.min_samples + 1, + dualtree=True, + breadth_first=True) + + self.core_distance_arr = knn_dist[:, self.min_samples].copy() + self.core_distance = ( ( + self.core_distance_arr.data)) + + + if self.is_KDTree: + # Since we do everything in terms of rdist to free up the GIL + # we need to convert all the core distances beforehand + # to make comparison feasible. + for n in range(self.num_points): + self.core_distance[n] = self.dist._dist_to_rdist( + self.core_distance[n]) + + # Since we already computed NN distances for the min_samples closest + # points we can use this to do the first round of boruvka -- we won't + # get every point due to core_distance/mutual reachability distance + # issues, but we'll get quite a few, and they are the hard ones to + # get, so fill in any we can and then run update components. 
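# Editor's note: a self-contained sketch (not the patch's code) of the core
# distance used throughout this file: the distance from each point to its
# min_samples-th nearest neighbor. Querying with k = min_samples + 1
# accounts for the point itself being returned at distance zero.
import numpy as np
from sklearn.neighbors import KDTree

rng = np.random.RandomState(0)
data = rng.random_sample((200, 3))
min_samples = 5

knn_dist, knn_ind = KDTree(data).query(data, k=min_samples + 1)
core_distance = knn_dist[:, min_samples]  # distance to the min_samples-th neighbor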
+ for n in range(self.num_points): + for i in range(0, self.min_samples + 1): + m = knn_indices[n, i] + if n == m: + continue + if self.core_distance[m] <= self.core_distance[n]: + self.candidate_point[n] = n + self.candidate_neighbor[n] = m + self.candidate_distance[n] = self.core_distance[n] + break + + self.update_components() + + for n in range(self.num_nodes): + self.bounds_arr[n] = DBL_MAX + + cdef _initialize_components(self): + """Initialize components of the min spanning tree (eventually there + is only one component; initially each point is its own component)""" + + cdef np.intp_t n + + for n in range(self.num_points): + self.component_of_point[n] = n + self.candidate_neighbor[n] = -1 + self.candidate_point[n] = -1 + self.candidate_distance[n] = DBL_MAX + + for n in range(self.num_nodes): + self.component_of_node[n] = -(n+1) + + cdef int update_components(self) except -1: + """Having found the nearest neighbor not in the same component for + each current component (via tree traversal), run through adding + edges to the min spanning tree and recomputing components via + union find.""" + + cdef np.intp_t source + cdef np.intp_t sink + cdef np.intp_t c + cdef np.intp_t component + cdef np.intp_t n + cdef np.intp_t i + cdef np.intp_t p + cdef np.intp_t current_component + cdef np.intp_t current_source_component + cdef np.intp_t current_sink_component + cdef np.intp_t child1 + cdef np.intp_t child2 + + cdef NodeData_t node_info + + # For each component there should be a: + # - candidate point (a point in the component) + # - candiate neighbor (the point to join with) + # - candidate_distance (the distance from point to neighbor) + # + # We will go through and and an edge to the edge list + # for each of these, and the union the two points + # together in the union find structure + + for c in range(self.components.shape[0]): + component = self.components[c] + source = self.candidate_point[component] + sink = self.candidate_neighbor[component] + if source == -1 or sink == -1: + continue + # raise ValueError('Source or sink of edge is not defined!') + current_source_component = self.component_union_find.find(source) + current_sink_component = self.component_union_find.find(sink) + if current_source_component == current_sink_component: + # We've already joined these, so ignore this edge + self.candidate_point[component] = -1 + self.candidate_neighbor[component] = -1 + self.candidate_distance[component] = DBL_MAX + continue + self.edges[self.num_edges, 0] = source + self.edges[self.num_edges, 1] = sink + if self.is_KDTree: + self.edges[self.num_edges, 2] = self.dist._rdist_to_dist( + self.candidate_distance[component]) + else: + self.edges[self.num_edges, 2] = self.candidate_distance[component] + self.num_edges += 1 + + self.component_union_find.union_(source, sink) + + # Reset everything,and check if we're done + self.candidate_distance[component] = DBL_MAX + if self.num_edges == self.num_points - 1: + self.components = self.component_union_find.components() + return self.components.shape[0] + + # After having joined everything in the union find data + # structure we need to go through and determine the components + # of each point for easy lookup. + # + # Have done that we then go through and set the component + # of each node, as this provides fast pruning in later + # tree traversals. 
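# Editor's note: a minimal pure-Python sketch (not the patch's Cython class)
# of the union-find structure that `update_components` relies on above:
# union by rank plus path compression, which keeps each Boruvka sweep cheap
# as components are merged.
class UnionFind:
    def __init__(self, size):
        self.parent = list(range(size))
        self.rank = [0] * size

    def find(self, x):
        # Path halving: point every visited node at its grandparent.
        while self.parent[x] != x:
            self.parent[x] = self.parent[self.parent[x]]
            x = self.parent[x]
        return x

    def union(self, x, y):
        x_root, y_root = self.find(x), self.find(y)
        if x_root == y_root:
            return
        if self.rank[x_root] < self.rank[y_root]:
            x_root, y_root = y_root, x_root
        self.parent[y_root] = x_root
        if self.rank[x_root] == self.rank[y_root]:
            self.rank[x_root] += 1

uf = UnionFind(4)
uf.union(0, 1)
uf.union(2, 3)
assert uf.find(0) == uf.find(1) and uf.find(0) != uf.find(2)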
+ for n in range(self.tree.data.shape[0]): + self.component_of_point[n] = self.component_union_find.find(n) + + for n in range(self.tree.node_data.shape[0] - 1, -1, -1): + node_info = self.node_data[n] + # Case 1: + # If the node is a leaf we need to check that every point + # in the node is of the same component + if node_info.is_leaf: + current_component = self.component_of_point[ + self.idx_array[node_info.idx_start]] + for i in range(node_info.idx_start + 1, node_info.idx_end): + p = self.idx_array[i] + if self.component_of_point[p] != current_component: + break + else: + self.component_of_node[n] = current_component + # Case 2: + # If the node is not a leaf we only need to check + # that both child nodes are in the same component + else: + child1 = 2 * n + 1 + child2 = 2 * n + 2 + if (self.component_of_node[child1] == + self.component_of_node[child2]): + self.component_of_node[n] = self.component_of_node[child1] + + # Since we're working with mutual reachability distance we often have + # ties or near ties; because of that we can benefit by not resetting + # the bounds unless we get stuck (don't join any components). Thus + # we check for that, and only reset bounds in the case where we have + # the same number of components as we did going in. This doesn't + # produce a true min spanning tree, but only and approximation + # Thus only do this if the caller is willing to accept such + if self.approx_min_span_tree: + last_num_components = self.components.shape[0] + self.components = self.component_union_find.components() + + if self.components.shape[0] == last_num_components: + # Reset bounds + for n in range(self.num_nodes): + self.bounds_arr[n] = DBL_MAX + else: + self.components = self.component_union_find.components() + + for n in range(self.num_nodes): + self.bounds_arr[n] = DBL_MAX + + return self.components.shape[0] + + cdef int dual_tree_traversal(self, np.intp_t node1, + np.intp_t node2) nogil except -1: + """Perform a dual tree traversal, pruning wherever possible, to find + the nearest neighbor not in the same component for each component. 
+ This is akin to a standard dual tree NN search, but we also prune + whenever all points in query and reference nodes are in the same + component.""" + + cdef np.intp_t[::1] point_indices1, point_indices2 + + cdef np.intp_t i + cdef np.intp_t j + + cdef np.intp_t p + cdef np.intp_t q + + cdef np.intp_t parent + cdef np.intp_t child1 + cdef np.intp_t child2 + + cdef double node_dist + + cdef NodeData_t node1_info = self.node_data[node1] + cdef NodeData_t node2_info = self.node_data[node2] + cdef NodeData_t parent_info + cdef NodeData_t left_info + cdef NodeData_t right_info + + cdef np.intp_t component1 + cdef np.intp_t component2 + + cdef np.double_t *raw_data = ( &self._raw_data[0, 0]) + cdef np.double_t d + + cdef np.double_t mr_dist + cdef np.double_t _radius + + cdef np.double_t new_bound + cdef np.double_t new_upper_bound + cdef np.double_t new_lower_bound + cdef np.double_t bound_max + cdef np.double_t bound_min + + cdef np.intp_t left + cdef np.intp_t right + cdef np.double_t left_dist + cdef np.double_t right_dist + + # Compute the distance between the query and reference nodes + if self.is_KDTree: + node_dist = kdtree_min_rdist_dual(self.dist, + node1, node2, self.node_bounds, + self.num_features) + else: #BallTree + node_dist = balltree_min_dist_dual(node1_info.radius, + node2_info.radius, + node1, node2, + self.centroid_distances) + + + # If the distance between the nodes is less than the current bound for + # the query and the nodes are not in the same component continue; + # otherwise we get to prune this branch and return early. + if node_dist < self.bounds_ptr[node1]: + if (self.component_of_node_ptr[node1] == + self.component_of_node_ptr[node2] and + self.component_of_node_ptr[node1] >= 0): + return 0 + else: + return 0 + + # Case 1: Both nodes are leaves + # for each pair of points in node1 x node2 we need + # to compute the distance and see if it better than + # the current nearest neighbor for the component of + # the point in the query node. + # + # We get to take some shortcuts: + # - if the core distance for a point is larger than + # the distance to the nearst neighbor of the + # component of the point ... then we can't get + # a better mutual reachability distance and we + # can skip computing anything for that point + # - if the points are in the same component we + # don't have to compute the distance. + # + # We also have some catches: + # - we need to compute mutual reachability distance + # not just the ordinary distance; this involves + # fiddling with core distances. + # - We need to scale distances according to alpha, + # but don't want to lose performance in the case + # that alpha is 1.0. + # + # Finally we can compute new bounds for the query node + # based on the distances found here, so do that and + # propagate the results up the tree. 
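# Editor's note: a small sketch (not the patch's code) of the mutual
# reachability distance referred to in the comments above. For points a and
# b with core distances core_a and core_b, it is the largest of the two core
# distances and the (optionally alpha-scaled) metric distance, where alpha is
# the Robust Single Linkage scaling parameter.
def mutual_reachability(dist_ab, core_a, core_b, alpha=1.0):
    return max(core_a, core_b, dist_ab / alpha)

# With core distances 0.8 and 0.5 and a raw distance of 0.6, the mutual
# reachability distance is 0.8.
assert mutual_reachability(0.6, 0.8, 0.5) == 0.8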
+ if node1_info.is_leaf and node2_info.is_leaf: + + new_upper_bound = 0.0 + new_lower_bound = DBL_MAX + + point_indices1 = self.idx_array[node1_info.idx_start: + node1_info.idx_end] + point_indices2 = self.idx_array[node2_info.idx_start: + node2_info.idx_end] + + for i in range(point_indices1.shape[0]): + + p = point_indices1[i] + component1 = self.component_of_point_ptr[p] + + if (self.core_distance_ptr[p] > + self.candidate_distance_ptr[component1]): + continue + + for j in range(point_indices2.shape[0]): + + q = point_indices2[j] + component2 = self.component_of_point_ptr[q] + + if (self.core_distance_ptr[q] > + self.candidate_distance_ptr[component1]): + continue + + if component1 != component2: + if self.is_KDTree: + d = self.dist.rdist(&raw_data[self.num_features * p], + &raw_data[self.num_features * q], + self.num_features) + else: + d = self.dist.dist(&raw_data[self.num_features * p], + &raw_data[self.num_features * q], + self.num_features) * self.alpha + if self.alpha != 1.0: + mr_dist = max(d / self.alpha, + self.core_distance_ptr[p], + self.core_distance_ptr[q]) + else: + mr_dist = max(d, self.core_distance_ptr[p], + self.core_distance_ptr[q]) + if mr_dist < self.candidate_distance_ptr[component1]: + self.candidate_distance_ptr[component1] = mr_dist + self.candidate_neighbor_ptr[component1] = q + self.candidate_point_ptr[component1] = p + + new_upper_bound = max(new_upper_bound, + self.candidate_distance_ptr[component1]) + new_lower_bound = min(new_lower_bound, + self.candidate_distance_ptr[component1]) + + # Compute new bounds for the query node, and + # then propagate the results of that computation + # up the tree. + _radius = self.dist._dist_to_rdist(node1_info.radius) if self.is_KDTree else node1_info.radius + new_bound = min(new_upper_bound, + new_lower_bound + 2 * _radius) + if new_bound < self.bounds_ptr[node1]: + self.bounds_ptr[node1] = new_bound + + # Propagate bounds up the tree + while node1 > 0: + parent = (node1 - 1) // 2 + left = 2 * parent + 1 + right = 2 * parent + 2 + + parent_info = self.node_data[parent] + left_info = self.node_data[left] + right_info = self.node_data[right] + + bound_max = max(self.bounds_ptr[left], + self.bounds_ptr[right]) + + if self.is_KDTree: + new_bound = bound_max + else: + bound_min = min(self.bounds_ptr[left] + 2 * + (parent_info.radius - left_info.radius), + self.bounds_ptr[right] + 2 * + (parent_info.radius - right_info.radius)) + + if bound_min > 0: + new_bound = min(bound_max, bound_min) + else: + new_bound = bound_max + if new_bound < self.bounds_ptr[parent]: + self.bounds_ptr[parent] = new_bound + node1 = parent + else: + break + + # Case 2a: The query node is a leaf, or is smaller than + # the reference node. + # + # We descend in the reference tree. We first + # compute distances between nodes to determine + # whether we should prioritise the left or + # right branch in the reference tree. 
+ elif node1_info.is_leaf or (not node2_info.is_leaf and + node2_info.radius > node1_info.radius): + + left = 2 * node2 + 1 + right = 2 * node2 + 2 + + if self.is_KDTree: + left_dist = kdtree_min_rdist_dual(self.dist, + node1, left, + self.node_bounds, + self.num_features) + right_dist = kdtree_min_rdist_dual(self.dist, + node1, right, + self.node_bounds, + self.num_features) + else: + node2_info = self.node_data[left] + left_dist = balltree_min_dist_dual(node1_info.radius, + node2_info.radius, + node1, left, + self.centroid_distances) + node2_info = self.node_data[right] + right_dist = balltree_min_dist_dual(node1_info.radius, + node2_info.radius, + node1, right, + self.centroid_distances) + + if left_dist < right_dist: + self.dual_tree_traversal(node1, left) + self.dual_tree_traversal(node1, right) + else: + self.dual_tree_traversal(node1, right) + self.dual_tree_traversal(node1, left) + + # Case 2b: The reference node is a leaf, or is smaller than + # the query node. + # + # We descend in the query tree. We first + # compute distances between nodes to determine + # whether we should prioritise the left or + # right branch in the query tree. + else: + left = 2 * node1 + 1 + right = 2 * node1 + 2 + if self.is_KDTree: + left_dist = kdtree_min_rdist_dual(self.dist, + left, node2, + self.node_bounds, + self.num_features) + right_dist = kdtree_min_rdist_dual(self.dist, + right, node2, + self.node_bounds, + self.num_features) + else: + node1_info = self.node_data[left] + left_dist = balltree_min_dist_dual(node1_info.radius, + node2_info.radius, + left, node2, + self.centroid_distances) + node1_info = self.node_data[right] + right_dist = balltree_min_dist_dual(node1_info.radius, + node2_info.radius, + right, node2, + self.centroid_distances) + + + if left_dist < right_dist: + self.dual_tree_traversal(left, node2) + self.dual_tree_traversal(right, node2) + else: + self.dual_tree_traversal(right, node2) + self.dual_tree_traversal(left, node2) + + return 0 + + cpdef spanning_tree(self): + """Compute the minimum spanning tree of the data held by + the tree passed in at construction""" + + cdef np.intp_t num_components + cdef np.intp_t num_nodes + + num_components = self.tree.data.shape[0] + num_nodes = self.tree.node_data.shape[0] + while num_components > 1: + self.dual_tree_traversal(0, 0) + num_components = self.update_components() + + return self.edges diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index ffefd0d3d5443..1d2caf8eebe57 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -23,6 +23,7 @@ from sklearn.utils import check_array, gen_batches, get_chunk_n_rows from sklearn.utils._param_validation import Interval, StrOptions, validate_params +from ._boruvka import BoruvkaAlgorithm from ._linkage import label, mst_linkage_core, mst_linkage_core_vector from ._reachability import mutual_reachability, sparse_mutual_reachability from ._tree import ( @@ -45,8 +46,10 @@ { "auto", "brute", - "kdtree", - "balltree", + "prims_kdtree", + "prims_balltree", + "boruvka_kdtree", + "boruvka_balltree", } ) ], @@ -223,6 +226,40 @@ def _hdbscan_prims( return _process_mst(min_spanning_tree) +def _hdbscan_boruvka( + X, + algo, + min_samples=5, + metric="euclidean", + leaf_size=40, + n_jobs=4, + **metric_params, +): + leaf_size = max(leaf_size, 3) + Tree = KDTree if algo == "kd_tree" else BallTree + tree = Tree(X, metric=metric, leaf_size=leaf_size, **metric_params) + + n_samples = X.shape[0] + if min_samples + 1 > n_samples: + 
raise ValueError(
+            "Expected min_samples + 1 <= n_samples, "
+            f" but {min_samples+1=}, {n_samples=}"
+        )
+
+    out = BoruvkaAlgorithm(
+        tree=tree,
+        min_samples=min_samples,
+        metric=metric,
+        leaf_size=leaf_size // 3,
+        approx_min_span_tree=True,
+        n_jobs=n_jobs,
+        **metric_params,
+    )
+    min_spanning_tree = out.spanning_tree()
+
+    return _process_mst(min_spanning_tree)
+
+
 def remap_single_linkage_tree(tree, internal_to_raw, outliers):
     """
     Takes an internal single_linkage_tree structure and adds back in a set of points
@@ -351,16 +388,20 @@ def hdbscan(
     algorithm : str, default='auto'
         Exactly which algorithm to use; hdbscan has variants specialised
         for different characteristics of the data. By default this is set
-        to `'auto'` which attempts to use a `KDTree` tree if possible,
-        otherwise it uses a `BallTree` tree.
+        to `'auto'` which attempts to use a `KDTree` method if possible,
+        otherwise it uses a `BallTree` method. If `X` has `n_features>60`
+        then a `prims` approach is used, otherwise a `boruvka` approach is
+        used.

         If `X` is sparse or `metric` is invalid for both `KDTree` and
         `BallTree`, then it resolves to use the `brute` algorithm.

         Available algorithms:
         - `'brute'`
-        - `'kdtree'`
-        - `'balltree'`
+        - `'prims_kdtree'`
+        - `'prims_balltree'`
+        - `'boruvka_kdtree'`
+        - `'boruvka_balltree'`

     memory : str, default=None
         Used to cache the output of the computation of the tree.
@@ -467,11 +508,18 @@ def hdbscan(
         func = _hdbscan_brute
         for key in ("algo", "leaf_size", "n_jobs"):
             kwargs.pop(key, None)
-    elif algorithm == "kdtree":
+    elif algorithm == "prims_kdtree":
         func = _hdbscan_prims
-    elif algorithm == "balltree":
+    elif algorithm == "prims_balltree":
         func = _hdbscan_prims
         kwargs["algo"] = "ball_tree"
+    elif algorithm == "boruvka_kdtree":
+        func = _hdbscan_boruvka
+        kwargs.pop("alpha", None)
+    elif algorithm == "boruvka_balltree":
+        func = _hdbscan_boruvka
+        kwargs.pop("alpha", None)
+        kwargs["algo"] = "ball_tree"
     else:
         if issparse(X) or metric not in FAST_METRICS:
             # We can't do much with sparse matrices ...
@@ -479,10 +527,21 @@ def hdbscan(
             for key in ("algo", "leaf_size", "n_jobs"):
                 kwargs.pop(key, None)
         elif metric in KDTree.valid_metrics:
-            func = _hdbscan_prims
+            # TO DO: Need heuristic to decide when to go to boruvka
+            if X.shape[1] > 60:
+                func = _hdbscan_prims
+            else:
+                func = _hdbscan_boruvka
+                kwargs.pop("alpha", None)
         else:  # Metric is a valid BallTree metric
-            func = _hdbscan_prims
-            kwargs["algo"] = "ball_tree"
+            # TO DO: Need heuristic to decide when to go to boruvka;
+            if X.shape[1] > 60:
+                func = _hdbscan_prims
+                kwargs["algo"] = "ball_tree"
+            else:
+                func = _hdbscan_boruvka
+                kwargs.pop("alpha", None)
+                kwargs["algo"] = "ball_tree"

     single_linkage_tree = memory.cache(func)(**kwargs)

@@ -545,8 +604,10 @@ class HDBSCAN(ClusterMixin, BaseEstimator):
     algorithm : str, default='auto'
         Exactly which algorithm to use; hdbscan has variants specialised
         for different characteristics of the data. By default this is set
-        to `'auto'` which attempts to use a `KDTree` tree if possible,
-        otherwise it uses a `BallTree` tree.
+        to `'auto'` which attempts to use a `KDTree` method if possible,
+        otherwise it uses a `BallTree` method. If the `X` passed during `fit`
+        has `n_features>60` then a `prims` approach is used, otherwise a
+        `boruvka` approach is used.
If the `X` passed during `fit` is sparse or `metric` is invalid for both `KDTree` and `BallTree`, then it resolves to use the `brute` @@ -554,8 +615,10 @@ class HDBSCAN(ClusterMixin, BaseEstimator): Available algorithms: - `'brute'` - - `'kdtree'` - - `'balltree'` + - `'prims_kdtree'` + - `'prims_balltree'` + - `'boruvka_kdtree'` + - `'boruvka_balltree'` leaf_size : int, default=40 Leaf size for trees responsible for fast nearest neighbour queries. A diff --git a/sklearn/cluster/_hdbscan/setup.py b/sklearn/cluster/_hdbscan/setup.py index 9392c08be2256..c082ec8bdf214 100644 --- a/sklearn/cluster/_hdbscan/setup.py +++ b/sklearn/cluster/_hdbscan/setup.py @@ -33,6 +33,12 @@ def configuration(parent_package="", top_path=None): include_dirs=[numpy.get_include()], libraries=libraries, ) + config.add_extension( + "_boruvka", + sources=["_boruvka.pyx"], + include_dirs=[numpy.get_include()], + libraries=libraries, + ) return config From 24c5b980557503d94caff576bdbcf60d5dcf9460 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Fri, 26 Aug 2022 13:15:49 -0400 Subject: [PATCH 093/160] Updated test file for boruvka removal --- .../cluster/_hdbscan/tests/test_hdbscan.py | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index 55c25d319b2c8..2377222a47e7f 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -126,8 +126,8 @@ def test_hdbscan_feature_vector(): @pytest.mark.parametrize( "algo", [ - "prims_kdtree", - "prims_balltree", + "kdtree", + "balltree", "brute", "auto", ], @@ -143,8 +143,8 @@ def test_hdbscan_algorithms(algo, metric): assert n_clusters_2 == n_clusters ALGOS_TREES = { - "prims_kdtree": KDTree, - "prims_balltree": BallTree, + "kdtree": KDTree, + "balltree": BallTree, } METRIC_PARAMS = { "mahalanobis": {"V": np.eye(X.shape[1])}, @@ -285,9 +285,9 @@ def test_hdbscan_sparse(): msg = "Sparse data matrices only support algorithm `brute`." with pytest.raises(ValueError, match=msg): - HDBSCAN(metric="euclidean", algorithm="prims_balltree").fit(sparse_X) + HDBSCAN(metric="euclidean", algorithm="balltree").fit(sparse_X) with pytest.raises(ValueError, match=msg): - hdbscan(sparse_X, metric="euclidean", algorithm="prims_balltree") + hdbscan(sparse_X, metric="euclidean", algorithm="balltree") def test_hdbscan_caching(tmp_path): @@ -332,15 +332,18 @@ def test_hdbscan_allow_single_cluster_with_epsilon(): ).fit_predict(no_structure) unique_labels, counts = np.unique(labels, return_counts=True) assert len(unique_labels) == 2 - assert counts[unique_labels == -1] == 46 - # for this random seed an epsilon of 0.2 will produce exactly 2 noise + # Arbitrary heuristic. Would prefer something more precise. + assert counts[unique_labels == -1] > 30 + + # for this random seed an epsilon of 0.18 will produce exactly 2 noise # points at that cut in single linkage. 
labels = HDBSCAN( min_cluster_size=5, - cluster_selection_epsilon=0.2, + cluster_selection_epsilon=0.18, cluster_selection_method="eom", allow_single_cluster=True, + algorithm="kdtree", ).fit_predict(no_structure) unique_labels, counts = np.unique(labels, return_counts=True) assert len(unique_labels) == 2 From 585d7bb6299adaa43c25be52928c3c79700e3c0b Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Sun, 28 Aug 2022 15:13:04 -0400 Subject: [PATCH 094/160] Added dtype specification to input array validation --- sklearn/cluster/_hdbscan/hdbscan.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index ffefd0d3d5443..0ad6337c5b6a0 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -423,7 +423,9 @@ def hdbscan( # Checks input and converts to an nd-array where possible if metric != "precomputed" or issparse(X): - X = check_array(X, accept_sparse="csr", force_all_finite=False) + X = check_array( + X, accept_sparse="csr", force_all_finite=False, dtype=np.float64 + ) elif isinstance(X, np.ndarray): # Only non-sparse, precomputed distance matrices are handled here # and thereby allowed to contain numpy.inf for missing distances From 507f0da09b37b42b11815456568222343e9f5b44 Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Wed, 31 Aug 2022 12:58:51 -0400 Subject: [PATCH 095/160] Apply suggestions from code review Co-authored-by: Thomas J. Fan --- sklearn/cluster/_hdbscan/_linkage.pyx | 1 - sklearn/cluster/_hdbscan/hdbscan.py | 6 ++---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx index 7dd12ec0d2873..919e2713b0a25 100644 --- a/sklearn/cluster/_hdbscan/_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_linkage.pyx @@ -7,7 +7,6 @@ cimport numpy as cnp import cython from libc.float cimport DBL_MAX -from libc.stdio cimport printf from sklearn.metrics._dist_metrics cimport DistanceMetric diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 0ad6337c5b6a0..b283b7f1645e3 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -192,8 +192,7 @@ def _hdbscan_prims( **metric_params, ): # The Cython routines used require contiguous arrays - if not X.flags["C_CONTIGUOUS"]: - X = np.array(X, order="C") + X = np.asarray(X, order="C") # Get distance to kth nearest neighbour nbrs = NearestNeighbors( @@ -207,8 +206,7 @@ def _hdbscan_prims( ).fit(X) n_samples = X.shape[0] - core_distances = np.empty(n_samples) - core_distances.fill(np.nan) + core_distances = np.full(n_samples, fill_value=np.nan, dtype=np.float64) chunk_n_rows = get_chunk_n_rows(row_bytes=16 * min_samples, max_n_rows=n_samples) slices = gen_batches(n_samples, chunk_n_rows) From ed6d17dcb15b54feafa26f974e16c010ad3b587c Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 31 Aug 2022 13:28:30 -0400 Subject: [PATCH 096/160] Further review feedback --- sklearn/cluster/_hdbscan/hdbscan.py | 51 ++++++++++++++--------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index b283b7f1645e3..340d1b680f9b3 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -20,17 +20,12 @@ from sklearn.metrics import pairwise_distances from sklearn.metrics._dist_metrics import DistanceMetric from 
sklearn.neighbors import BallTree, KDTree, NearestNeighbors -from sklearn.utils import check_array, gen_batches, get_chunk_n_rows +from sklearn.utils import check_array from sklearn.utils._param_validation import Interval, StrOptions, validate_params from ._linkage import label, mst_linkage_core, mst_linkage_core_vector from ._reachability import mutual_reachability, sparse_mutual_reachability -from ._tree import ( - compute_stability, - condense_tree, - get_clusters, - labelling_at_cut, -) +from ._tree import compute_stability, condense_tree, get_clusters, labelling_at_cut FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics _PARAM_CONSTRAINTS = { @@ -52,7 +47,7 @@ ], "leaf_size": [Interval(Integral, left=1, right=None, closed="left")], "memory": [str, None, Path], - "n_jobs": [int], + "n_jobs": [Integral, None], "cluster_selection_method": [StrOptions({"eom", "leaf"})], "allow_single_cluster": ["boolean"], "metric_params": [dict, None], @@ -97,6 +92,7 @@ def _hdbscan_brute( min_samples=5, alpha=1.0, metric="euclidean", + n_jobs=None, **metric_params, ): if metric == "precomputed": @@ -106,7 +102,9 @@ def _hdbscan_brute( # matrix to indicate missing distance information. distance_matrix = X else: - distance_matrix = pairwise_distances(X, metric=metric, **metric_params) + distance_matrix = pairwise_distances( + X, metric=metric, n_jobs=n_jobs, **metric_params + ) if issparse(distance_matrix): return _hdbscan_sparse_distance_matrix( @@ -188,7 +186,7 @@ def _hdbscan_prims( alpha=1.0, metric="euclidean", leaf_size=40, - n_jobs=4, + n_jobs=None, **metric_params, ): # The Cython routines used require contiguous arrays @@ -206,13 +204,8 @@ def _hdbscan_prims( ).fit(X) n_samples = X.shape[0] - core_distances = np.full(n_samples, fill_value=np.nan, dtype=np.float64) - - chunk_n_rows = get_chunk_n_rows(row_bytes=16 * min_samples, max_n_rows=n_samples) - slices = gen_batches(n_samples, chunk_n_rows) - for sl in slices: - core_distances[sl] = nbrs.kneighbors(X[sl], min_samples)[0][:, -1] - + core_distances = np.empty(n_samples, dtype=np.float64) + core_distances[:] = nbrs.kneighbors(X, min_samples)[0][:, -1] dist_metric = DistanceMetric.get_metric(metric, **metric_params) # Mutual reachability distance is implicit in mst_linkage_core_vector @@ -291,13 +284,15 @@ def hdbscan( leaf_size=40, algorithm="auto", memory=None, - n_jobs=4, + n_jobs=None, cluster_selection_method="eom", allow_single_cluster=False, metric_params=None, ): """Perform HDBSCAN clustering from a vector array or distance matrix. + ..versionadded:: 1.2 + Parameters ---------- X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ @@ -365,10 +360,11 @@ def hdbscan( By default, no caching is done. If a string is given, it is the path to the caching directory. - n_jobs : int, default=4 - Number of parallel jobs to run in core distance computations (if - supported by the specific algorithm). For `n_jobs<0`, - `(n_cpus + n_jobs + 1)` are used. + n_jobs : int, default=None + Number of jobs to run in parallel to calculate distances. + `None` means 1 unless in a :obj:`joblib.parallel_backend` context. + `-1` means using all processors. See :term:`Glossary ` + for more details. cluster_selection_method : str, default='eom' The method used to select clusters from the condensed tree. The @@ -505,6 +501,8 @@ class HDBSCAN(ClusterMixin, BaseEstimator): This allows HDBSCAN to find clusters of varying densities (unlike DBSCAN), and be more robust to parameter selection. 
+ ..versionadded:: 1.2 + Parameters ---------- min_cluster_size : int, default=5 @@ -568,10 +566,11 @@ class HDBSCAN(ClusterMixin, BaseEstimator): By default, no caching is done. If a string is given, it is the path to the caching directory. - n_jobs : int, default=4 - Number of parallel jobs to run in core distance computations (if - supported by the specific algorithm). For `n_jobs<0`, - `(n_cpus + n_jobs + 1)` are used. + n_jobs : int, default=None + Number of jobs to run in parallel to calculate distances. + `None` means 1 unless in a :obj:`joblib.parallel_backend` context. + `-1` means using all processors. See :term:`Glossary ` + for more details. cluster_selection_method : str, default='eom' The method used to select clusters from the condensed tree. The From 38f71c75fc3e2f1ea2784b12e5bb8a79c12bfd22 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 31 Aug 2022 13:37:10 -0400 Subject: [PATCH 097/160] Updated tests and improved `n_jobs` handling --- sklearn/cluster/_hdbscan/hdbscan.py | 6 +- .../cluster/_hdbscan/tests/test_hdbscan.py | 61 +++++++++++++++---- 2 files changed, 51 insertions(+), 16 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 50c61a1f5f095..480c0e1ae03af 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -13,7 +13,7 @@ from warnings import warn import numpy as np -from joblib import Memory +from joblib import Memory, effective_n_jobs from scipy.sparse import csgraph, issparse from sklearn.base import BaseEstimator, ClusterMixin @@ -223,7 +223,7 @@ def _hdbscan_boruvka( min_samples=5, metric="euclidean", leaf_size=40, - n_jobs=4, + n_jobs=None, **metric_params, ): leaf_size = max(leaf_size, 3) @@ -236,7 +236,7 @@ def _hdbscan_boruvka( "Expected min_samples + 1 <= n_samples, " f" but {min_samples+1=}, {n_samples=}" ) - + n_jobs = effective_n_jobs(n_jobs) out = BoruvkaAlgorithm( tree=tree, min_samples=min_samples, diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index 2377222a47e7f..b040b81ba061b 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -126,8 +126,10 @@ def test_hdbscan_feature_vector(): @pytest.mark.parametrize( "algo", [ - "kdtree", - "balltree", + "prims_kdtree", + "prims_balltree", + "boruvka_kdtree", + "boruvka_balltree", "brute", "auto", ], @@ -143,8 +145,10 @@ def test_hdbscan_algorithms(algo, metric): assert n_clusters_2 == n_clusters ALGOS_TREES = { - "kdtree": KDTree, - "balltree": BallTree, + "prims_kdtree": KDTree, + "prims_balltree": BallTree, + "boruvka_kdtree": KDTree, + "boruvka_balltree": BallTree, } METRIC_PARAMS = { "mahalanobis": {"V": np.eye(X.shape[1])}, @@ -260,11 +264,32 @@ def test_hdbscan_input_lists(): HDBSCAN(min_samples=1).fit(X) +@pytest.mark.parametrize("tree", ["kdtree", "balltree"]) +def test_hdbscan_boruvka_matches(tree): + + data = generate_noisy_data() + + labels_prims = hdbscan(data, algorithm="brute")[0] + labels_boruvka = hdbscan(data, algorithm=f"boruvka_{tree}")[0] + + num_mismatches = homogeneity(labels_prims, labels_boruvka) + + assert (num_mismatches / float(data.shape[0])) < 0.15 + + labels_prims = HDBSCAN(algorithm="brute").fit_predict(data) + labels_boruvka = HDBSCAN(algorithm=f"boruvka_{tree}").fit_predict(data) + + num_mismatches = homogeneity(labels_prims, labels_boruvka) + + assert (num_mismatches / float(data.shape[0])) < 0.15 + + +@pytest.mark.parametrize("strategy", ["prims", 
"boruvka"]) @pytest.mark.parametrize("tree", ["kd", "ball"]) -def test_hdbscan_precomputed_non_brute(tree): - hdb = HDBSCAN(metric="precomputed", algorithm=f"prims_{tree}tree") +def test_hdbscan_precomputed_non_brute(strategy, tree): + hdb = HDBSCAN(metric="precomputed", algorithm=f"{strategy}_{tree}tree") with pytest.raises(ValueError): - hdbscan(X, metric="precomputed", algorithm=f"prims_{tree}tree") + hdbscan(X, metric="precomputed", algorithm=f"{strategy}_{tree}tree") with pytest.raises(ValueError): hdb.fit(X) @@ -285,9 +310,9 @@ def test_hdbscan_sparse(): msg = "Sparse data matrices only support algorithm `brute`." with pytest.raises(ValueError, match=msg): - HDBSCAN(metric="euclidean", algorithm="balltree").fit(sparse_X) + HDBSCAN(metric="euclidean", algorithm="boruvka_balltree").fit(sparse_X) with pytest.raises(ValueError, match=msg): - hdbscan(sparse_X, metric="euclidean", algorithm="balltree") + hdbscan(sparse_X, metric="euclidean", algorithm="boruvka_balltree") def test_hdbscan_caching(tmp_path): @@ -334,16 +359,15 @@ def test_hdbscan_allow_single_cluster_with_epsilon(): assert len(unique_labels) == 2 # Arbitrary heuristic. Would prefer something more precise. - assert counts[unique_labels == -1] > 30 + assert counts[unique_labels == -1] == 46 - # for this random seed an epsilon of 0.18 will produce exactly 2 noise + # for this random seed an epsilon of 0.2 will produce exactly 2 noise # points at that cut in single linkage. labels = HDBSCAN( min_cluster_size=5, - cluster_selection_epsilon=0.18, + cluster_selection_epsilon=0.2, cluster_selection_method="eom", allow_single_cluster=True, - algorithm="kdtree", ).fit_predict(no_structure) unique_labels, counts = np.unique(labels, return_counts=True) assert len(unique_labels) == 2 @@ -382,6 +406,17 @@ def test_hdbscan_precomputed_array_like(): hdbscan(X, metric="precomputed") +@pytest.mark.parametrize("algo", ["boruvka_kdtree", "boruvka_balltree"]) +def test_hdbscan_min_samples_less_than_total(algo): + X = np.array([[1, 2], [2, 1]]) + + msg = "Expected min_samples" + with pytest.raises(ValueError, match=msg): + hdbscan(X, algorithm=algo, min_samples=3) + with pytest.raises(ValueError, match=msg): + HDBSCAN(algorithm=algo, min_samples=3).fit(X) + + def test_hdbscan_sparse_distances_too_few_nonzero(): X = sparse.csr_matrix(np.zeros((10, 10))) From eefbacc3d8ce452f61345d7fb0a41a159cbeaaf2 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 6 Sep 2022 13:22:51 -0400 Subject: [PATCH 098/160] Refactored to remove `hdbscan` function -- use estimator instead --- sklearn/cluster/__init__.py | 3 +- sklearn/cluster/_hdbscan/hdbscan.py | 326 +++++------------- .../cluster/_hdbscan/tests/test_hdbscan.py | 130 +++---- 3 files changed, 125 insertions(+), 334 deletions(-) diff --git a/sklearn/cluster/__init__.py b/sklearn/cluster/__init__.py index f3c9a8ee2c20f..40b89ea0da8ba 100644 --- a/sklearn/cluster/__init__.py +++ b/sklearn/cluster/__init__.py @@ -23,7 +23,7 @@ ) from ._bicluster import SpectralBiclustering, SpectralCoclustering from ._birch import Birch -from ._hdbscan.hdbscan import HDBSCAN, hdbscan +from ._hdbscan.hdbscan import HDBSCAN __all__ = [ "AffinityPropagation", @@ -53,5 +53,4 @@ "SpectralBiclustering", "SpectralCoclustering", "HDBSCAN", - "hdbscan", ] diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 340d1b680f9b3..a96c83ee953ea 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -20,8 +20,7 @@ from sklearn.metrics import pairwise_distances from 
sklearn.metrics._dist_metrics import DistanceMetric from sklearn.neighbors import BallTree, KDTree, NearestNeighbors -from sklearn.utils import check_array -from sklearn.utils._param_validation import Interval, StrOptions, validate_params +from sklearn.utils._param_validation import Interval, StrOptions from ._linkage import label, mst_linkage_core, mst_linkage_core_vector from ._reachability import mutual_reachability, sparse_mutual_reachability @@ -267,231 +266,6 @@ def get_finite_row_indices(matrix): return row_indices -@validate_params( - { - **_PARAM_CONSTRAINTS, - "X": ["array-like", "sparse matrix"], - } -) -def hdbscan( - X, - min_cluster_size=5, - min_samples=None, - alpha=1.0, - cluster_selection_epsilon=0.0, - max_cluster_size=0, - metric="euclidean", - leaf_size=40, - algorithm="auto", - memory=None, - n_jobs=None, - cluster_selection_method="eom", - allow_single_cluster=False, - metric_params=None, -): - """Perform HDBSCAN clustering from a vector array or distance matrix. - - ..versionadded:: 1.2 - - Parameters - ---------- - X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ - array of shape (n_samples, n_samples) - A feature array, or array of distances between samples if - `metric='precomputed'`. - - min_cluster_size : int, default=5 - The minimum number of samples in a group for that group to be - considered a cluster; groupings smaller than this size will be left - as noise. - - min_samples : int, default=None - The number of samples in a neighborhood for a point - to be considered as a core point. This includes the point itself. - defaults to the `min_cluster_size`. - - alpha : float, default=1.0 - A distance scaling parameter as used in robust single linkage. - See [2]_ for more information. - - cluster_selection_epsilon : float, default=0.0 - A distance threshold. Clusters below this value will be merged. - See [3]_ for more information. - - max_cluster_size : int, default=0 - A limit to the size of clusters returned by the `eom` cluster selection - algorithm. Has no effect if `cluster_selection_method=leaf`. Can be - overridden in rare cases by a high value for - `cluster_selection_epsilon`. - - metric : str or callable, default='minkowski' - The metric to use when calculating distance between instances in a - feature array. - - - If metric is a string or callable, it must be one of - the options allowed by :func:`metrics.pairwise.pairwise_distances` - for its metric parameter. - - - If metric is "precomputed", `X` is assumed to be a distance matrix and - must be square. - - leaf_size : int, default=40 - Leaf size for trees responsible for fast nearest neighbour queries. A - large dataset size and small leaf_size may induce excessive memory - usage. If you are running out of memory consider increasing the - `leaf_size` parameter. - - algorithm : str, default='auto' - Exactly which algorithm to use; hdbscan has variants specialised - for different characteristics of the data. By default this is set - to `'auto'` which attempts to use a `KDTree` tree if possible, - otherwise it uses a `BallTree` tree. - - If `X` is sparse or `metric` is invalid for both `KDTree` and - `BallTree`, then it resolves to use the `brute` algorithm. - - Available algorithms: - - `'brute'` - - `'kdtree'` - - `'balltree'` - - memory : str, default=None - Used to cache the output of the computation of the tree. - By default, no caching is done. If a string is given, it is the - path to the caching directory. 
- - n_jobs : int, default=None - Number of jobs to run in parallel to calculate distances. - `None` means 1 unless in a :obj:`joblib.parallel_backend` context. - `-1` means using all processors. See :term:`Glossary ` - for more details. - - cluster_selection_method : str, default='eom' - The method used to select clusters from the condensed tree. The - standard approach for HDBSCAN* is to use an Excess of Mass algorithm - to find the most persistent clusters. Alternatively you can instead - select the clusters at the leaves of the tree -- this provides the - most fine grained and homogeneous clusters. Options are: - - `eom` - - `leaf` - - allow_single_cluster : bool, default=False - By default HDBSCAN* will not produce a single cluster. Setting this to - `True` will allow single cluster results in the case that you feel this - is a valid result for your dataset. - - metric_params : dict, default=None - Arguments passed to the distance metric. - - Returns - ------- - labels : ndarray, shape (n_samples, ) - Cluster labels for each point. Noisy samples are given the label -1. - - probabilities : ndarray, shape (n_samples, ) - Cluster membership strengths for each point. Noisy samples are assigned - 0. - - single_linkage_tree : ndarray, shape (n_samples - 1, 4) - The single linkage tree produced during clustering in scipy - hierarchical clustering format - (see http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html). - - References - ---------- - - .. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April). - Density-based clustering based on hierarchical density estimates. - In Pacific-Asia Conference on Knowledge Discovery and Data Mining - (pp. 160-172). Springer Berlin Heidelberg. - - .. [2] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the - cluster tree. In Advances in Neural Information Processing Systems - (pp. 343-351). - - .. [3] Malzer, C., & Baum, M. (2019). A Hybrid Approach To Hierarchical - Density-based Cluster Selection. arxiv preprint 1911.02282. - """ - if min_samples is None: - min_samples = min_cluster_size - - # Checks input and converts to an nd-array where possible - if metric != "precomputed" or issparse(X): - X = check_array( - X, accept_sparse="csr", force_all_finite=False, dtype=np.float64 - ) - elif isinstance(X, np.ndarray): - # Only non-sparse, precomputed distance matrices are handled here - # and thereby allowed to contain numpy.inf for missing distances - - # Perform check_array(X) after removing infinite values (numpy.inf) - # from the given distance matrix. - tmp = X.copy() - tmp[np.isinf(tmp)] = 1 - check_array(tmp) - - memory = Memory(location=memory, verbose=0) - - metric_params = metric_params or {} - func = None - kwargs = dict( - X=X, - algo="kd_tree", - min_samples=min_samples, - alpha=alpha, - metric=metric, - leaf_size=leaf_size, - n_jobs=n_jobs, - **metric_params, - ) - if "kdtree" in algorithm and metric not in KDTree.valid_metrics: - raise ValueError( - f"{metric} is not a valid metric for a KDTree-based algorithm. Please" - " select a different metric." - ) - elif "balltree" in algorithm and metric not in BallTree.valid_metrics: - raise ValueError( - f"{metric} is not a valid metric for a BallTree-based algorithm. Please" - " select a different metric." 
- ) - - if algorithm != "auto": - if metric != "precomputed" and issparse(X) and algorithm != "brute": - raise ValueError("Sparse data matrices only support algorithm `brute`.") - - if algorithm == "brute": - func = _hdbscan_brute - for key in ("algo", "leaf_size", "n_jobs"): - kwargs.pop(key, None) - elif algorithm == "kdtree": - func = _hdbscan_prims - elif algorithm == "balltree": - func = _hdbscan_prims - kwargs["algo"] = "ball_tree" - else: - if issparse(X) or metric not in FAST_METRICS: - # We can't do much with sparse matrices ... - func = _hdbscan_brute - for key in ("algo", "leaf_size", "n_jobs"): - kwargs.pop(key, None) - elif metric in KDTree.valid_metrics: - func = _hdbscan_prims - else: # Metric is a valid BallTree metric - func = _hdbscan_prims - kwargs["algo"] = "ball_tree" - - single_linkage_tree = memory.cache(func)(**kwargs) - - return _tree_to_labels( - single_linkage_tree, - min_cluster_size, - cluster_selection_method, - allow_single_cluster, - cluster_selection_epsilon, - max_cluster_size, - ) - - class HDBSCAN(ClusterMixin, BaseEstimator): """Perform HDBSCAN clustering from vector array or distance matrix. @@ -705,15 +479,16 @@ def fit(self, X, y=None): metric_params = self.metric_params or {} if self.metric != "precomputed": # Non-precomputed matrices may contain non-finite values. - # Rows with these values - X = self._validate_data(X, force_all_finite=False, accept_sparse="csr") + X = self._validate_data( + X, accept_sparse="csr", force_all_finite=False, dtype=np.float64 + ) self._raw_data = X - self._all_finite = ( + all_finite = ( np.all(np.isfinite(X.data)) if issparse(X) else np.all(np.isfinite(X)) ) - if not self._all_finite: + if not all_finite: # Pass only the purely finite indices into hdbscan # We will later assign all non-finite points to the # background-1 cluster @@ -723,25 +498,92 @@ def fit(self, X, y=None): outliers = list(set(range(X.shape[0])) - set(finite_index)) elif issparse(X): # Handle sparse precomputed distance matrices separately - X = self._validate_data(X, accept_sparse="csr") + X = self._validate_data( + X, accept_sparse="csr", force_all_finite=False, dtype=np.float64 + ) else: - # Only non-sparse, precomputed distance matrices are allowed - # to have numpy.inf values indicating missing distances - X = self._validate_data(X, force_all_finite="allow-nan") + # Only non-sparse, precomputed distance matrices are handled here + # and thereby allowed to contain numpy.inf for missing distances + + # Perform data validation after removing infinite values (numpy.inf) + # from the given distance matrix. + tmp = X.copy() + tmp[np.isinf(tmp)] = 1 + self._validate_data(tmp) self.n_features_in_ = X.shape[1] - kwargs = self.get_params() - # prediction data only applies to the persistent model, so remove - # it from the keyword args we pass on the the function - kwargs["metric_params"] = metric_params + self._min_samples = ( + self.min_cluster_size if self.min_samples is None else self.min_samples + ) + + memory = Memory(location=self.memory, verbose=0) + + func = None + kwargs = dict( + X=X, + algo="kd_tree", + min_samples=self._min_samples, + alpha=self.alpha, + metric=self.metric, + leaf_size=self.leaf_size, + n_jobs=self.n_jobs, + **metric_params, + ) + if "kdtree" in self.algorithm and self.metric not in KDTree.valid_metrics: + raise ValueError( + f"{self.metric} is not a valid metric for a KDTree-based algorithm." + " Please select a different metric." 
+ ) + elif "balltree" in self.algorithm and self.metric not in BallTree.valid_metrics: + raise ValueError( + f"{self.metric} is not a valid metric for a BallTree-based algorithm." + " Please select a different metric." + ) + + if self.algorithm != "auto": + if ( + self.metric != "precomputed" + and issparse(X) + and self.algorithm != "brute" + ): + raise ValueError("Sparse data matrices only support algorithm `brute`.") + + if self.algorithm == "brute": + func = _hdbscan_brute + for key in ("algo", "leaf_size", "n_jobs"): + kwargs.pop(key, None) + elif self.algorithm == "kdtree": + func = _hdbscan_prims + elif self.algorithm == "balltree": + func = _hdbscan_prims + kwargs["algo"] = "ball_tree" + else: + if issparse(X) or self.metric not in FAST_METRICS: + # We can't do much with sparse matrices ... + func = _hdbscan_brute + for key in ("algo", "leaf_size", "n_jobs"): + kwargs.pop(key, None) + elif self.metric in KDTree.valid_metrics: + func = _hdbscan_prims + else: # Metric is a valid BallTree metric + func = _hdbscan_prims + kwargs["algo"] = "ball_tree" + + single_linkage_tree = memory.cache(func)(**kwargs) ( self.labels_, self.probabilities_, self._single_linkage_tree_, - ) = hdbscan(X, **kwargs) - - if self.metric != "precomputed" and not self._all_finite: + ) = _tree_to_labels( + single_linkage_tree, + self.min_cluster_size, + self.cluster_selection_method, + self.allow_single_cluster, + self.cluster_selection_epsilon, + self.max_cluster_size, + ) + if self.metric != "precomputed" and not all_finite: # remap indices to align with original data in the case of # non-finite entries. self._single_linkage_tree_ = remap_single_linkage_tree( diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index 2377222a47e7f..68bf0776832ca 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -9,7 +9,7 @@ from scipy.stats import mode from sklearn import datasets -from sklearn.cluster import HDBSCAN, hdbscan +from sklearn.cluster import HDBSCAN from sklearn.datasets import make_blobs from sklearn.metrics import fowlkes_mallows_score from sklearn.metrics.pairwise import _VALID_METRICS @@ -18,7 +18,7 @@ from sklearn.utils import shuffle from sklearn.utils._testing import assert_array_almost_equal -n_clusters = 3 +n_clusters_true = 3 X, y = make_blobs(n_samples=200, random_state=10) X, y = shuffle(X, y, random_state=7) X = StandardScaler().fit_transform(X) @@ -73,14 +73,9 @@ def test_hdbscan_distance_matrix(): D = distance.squareform(distance.pdist(X)) D /= np.max(D) - labels = hdbscan(D, metric="precomputed")[0] - # number of clusters, ignoring noise if present - n_clusters_1 = len(set(labels)) - int(-1 in labels) - assert n_clusters_1 == n_clusters - labels = HDBSCAN(metric="precomputed").fit(D).labels_ - n_clusters_2 = len(set(labels)) - int(-1 in labels) - assert n_clusters_2 == n_clusters + n_clusters_ = len(set(labels)) - int(-1 in labels) + assert n_clusters_ == n_clusters_true # Check that clustering is arbitrarily good # This is a heuristic to guard against regression @@ -98,24 +93,15 @@ def test_hdbscan_sparse_distance_matrix(): D = sparse.csr_matrix(D) D.eliminate_zeros() - labels = hdbscan(D, metric="precomputed")[0] - # number of clusters, ignoring noise if present - n_clusters_1 = len(set(labels)) - int(-1 in labels) - assert n_clusters_1 == n_clusters - labels = HDBSCAN(metric="precomputed").fit(D).labels_ - n_clusters_2 = len(set(labels)) - int(-1 in labels) - assert 
n_clusters_2 == n_clusters + n_clusters = len(set(labels)) - int(-1 in labels) + assert n_clusters == n_clusters_true def test_hdbscan_feature_vector(): - labels = hdbscan(X)[0] - n_clusters_1 = len(set(labels)) - int(-1 in labels) - assert n_clusters_1 == n_clusters - - labels = HDBSCAN().fit(X).labels_ - n_clusters_2 = len(set(labels)) - int(-1 in labels) - assert n_clusters_2 == n_clusters + labels = HDBSCAN().fit_predict(X) + n_clusters = len(set(labels)) - int(-1 in labels) + assert n_clusters == n_clusters_true # Check that clustering is arbitrarily good # This is a heuristic to guard against regression @@ -134,94 +120,72 @@ def test_hdbscan_feature_vector(): ) @pytest.mark.parametrize("metric", _VALID_METRICS) def test_hdbscan_algorithms(algo, metric): - labels = hdbscan(X, algorithm=algo)[0] - n_clusters_1 = len(set(labels)) - int(-1 in labels) - assert n_clusters_1 == n_clusters - - labels = HDBSCAN(algorithm=algo).fit(X).labels_ + labels = HDBSCAN(algorithm=algo).fit_predict(X) n_clusters_2 = len(set(labels)) - int(-1 in labels) - assert n_clusters_2 == n_clusters + assert n_clusters_2 == n_clusters_true ALGOS_TREES = { "kdtree": KDTree, "balltree": BallTree, } - METRIC_PARAMS = { + metric_params = { "mahalanobis": {"V": np.eye(X.shape[1])}, "seuclidean": {"V": np.ones(X.shape[1])}, "minkowski": {"p": 2}, "wminkowski": {"p": 2, "w": np.ones(X.shape[1])}, - } + }.get(metric, None) if algo not in ("auto", "brute"): if metric not in ALGOS_TREES[algo].valid_metrics: with pytest.raises(ValueError): - hdbscan( - X, + HDBSCAN( algorithm=algo, metric=metric, - metric_params=METRIC_PARAMS.get(metric, None), - ) + metric_params=metric_params, + ).fit(X) elif metric == "wminkowski": with pytest.warns(FutureWarning): - hdbscan( - X, + HDBSCAN( algorithm=algo, metric=metric, - metric_params=METRIC_PARAMS.get(metric, None), - ) + metric_params=metric_params, + ).fit(X) else: - hdbscan( - X, + HDBSCAN( algorithm=algo, metric=metric, - metric_params=METRIC_PARAMS.get(metric, None), - ) + metric_params=metric_params, + ).fit(X) def test_hdbscan_dbscan_clustering(): clusterer = HDBSCAN().fit(X) labels = clusterer.dbscan_clustering(0.3) n_clusters_1 = len(set(labels)) - int(-1 in labels) - assert n_clusters == n_clusters_1 + assert n_clusters_true == n_clusters_1 def test_hdbscan_high_dimensional(): H, y = make_blobs(n_samples=50, random_state=0, n_features=64) H = StandardScaler().fit_transform(H) - labels = hdbscan(H)[0] - n_clusters_1 = len(set(labels)) - int(-1 in labels) - assert n_clusters_1 == n_clusters - - labels = ( - HDBSCAN( - algorithm="auto", - metric="seuclidean", - metric_params={"V": np.ones(H.shape[1])}, - ) - .fit(H) - .labels_ - ) + labels = HDBSCAN( + algorithm="auto", + metric="seuclidean", + metric_params={"V": np.ones(H.shape[1])}, + ).fit_predict(H) n_clusters_2 = len(set(labels)) - int(-1 in labels) - assert n_clusters_2 == n_clusters + assert n_clusters_2 == n_clusters_true def test_hdbscan_best_balltree_metric(): - kwargs = dict(metric="seuclidean", metric_params={"V": np.ones(X.shape[1])}) - labels, _, _ = hdbscan(X, **kwargs) - n_clusters_1 = len(set(labels)) - int(-1 in labels) - assert n_clusters_1 == n_clusters - - labels = HDBSCAN(**kwargs).fit(X).labels_ + labels = HDBSCAN( + metric="seuclidean", metric_params={"V": np.ones(X.shape[1])} + ).fit_predict(X) n_clusters_2 = len(set(labels)) - int(-1 in labels) - assert n_clusters_2 == n_clusters + assert n_clusters_2 == n_clusters_true def test_hdbscan_no_clusters(): - labels = hdbscan(X, min_cluster_size=len(X) - 
1)[0] - n_clusters_1 = len(set(labels)) - int(-1 in labels) - assert n_clusters_1 == 0 - - labels = HDBSCAN(min_cluster_size=len(X) - 1).fit(X).labels_ + labels = HDBSCAN(min_cluster_size=len(X) - 1).fit_predict(X) n_clusters_2 = len(set(labels)) - int(-1 in labels) assert n_clusters_2 == 0 @@ -232,12 +196,7 @@ def test_hdbscan_min_cluster_size(): many points """ for min_cluster_size in range(2, len(X), 1): - labels = hdbscan(X, min_cluster_size=min_cluster_size)[0] - true_labels = [label for label in labels if label != -1] - if len(true_labels) != 0: - assert np.min(np.bincount(true_labels)) >= min_cluster_size - - labels = HDBSCAN(min_cluster_size=min_cluster_size).fit(X).labels_ + labels = HDBSCAN(min_cluster_size=min_cluster_size).fit_predict(X) true_labels = [label for label in labels if label != -1] if len(true_labels) != 0: assert np.min(np.bincount(true_labels)) >= min_cluster_size @@ -245,14 +204,9 @@ def test_hdbscan_min_cluster_size(): def test_hdbscan_callable_metric(): metric = distance.euclidean - - labels = hdbscan(X, metric=metric)[0] - n_clusters_1 = len(set(labels)) - int(-1 in labels) - assert n_clusters_1 == n_clusters - - labels = HDBSCAN(metric=metric).fit(X).labels_ + labels = HDBSCAN(metric=metric).fit_predict(X) n_clusters_2 = len(set(labels)) - int(-1 in labels) - assert n_clusters_2 == n_clusters + assert n_clusters_2 == n_clusters_true def test_hdbscan_input_lists(): @@ -263,8 +217,6 @@ def test_hdbscan_input_lists(): @pytest.mark.parametrize("tree", ["kd", "ball"]) def test_hdbscan_precomputed_non_brute(tree): hdb = HDBSCAN(metric="precomputed", algorithm=f"prims_{tree}tree") - with pytest.raises(ValueError): - hdbscan(X, metric="precomputed", algorithm=f"prims_{tree}tree") with pytest.raises(ValueError): hdb.fit(X) @@ -286,14 +238,12 @@ def test_hdbscan_sparse(): msg = "Sparse data matrices only support algorithm `brute`." 
with pytest.raises(ValueError, match=msg): HDBSCAN(metric="euclidean", algorithm="balltree").fit(sparse_X) - with pytest.raises(ValueError, match=msg): - hdbscan(sparse_X, metric="euclidean", algorithm="balltree") def test_hdbscan_caching(tmp_path): - labels1 = HDBSCAN(memory=tmp_path, min_samples=5).fit(X).labels_ - labels2 = HDBSCAN(memory=tmp_path, min_samples=5, min_cluster_size=6).fit(X).labels_ + labels1 = HDBSCAN(memory=tmp_path, min_samples=5).fit_predict(X) + labels2 = HDBSCAN(memory=tmp_path, min_samples=5, min_cluster_size=6).fit_predict(X) n_clusters1 = len(set(labels1)) - int(-1 in labels1) n_clusters2 = len(set(labels2)) - int(-1 in labels2) assert n_clusters1 == n_clusters2 @@ -379,7 +329,7 @@ def test_hdbscan_unfit_centers_errors(): def test_hdbscan_precomputed_array_like(): X = np.array([[1, np.inf], [np.inf, 1]]) - hdbscan(X, metric="precomputed") + HDBSCAN(metric="precomputed").fit(X) def test_hdbscan_sparse_distances_too_few_nonzero(): From 3f895745e6901b0f416a5bf386b2727de5040d2c Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 6 Sep 2022 15:27:21 -0400 Subject: [PATCH 099/160] minor cleanup --- sklearn/cluster/_hdbscan/hdbscan.py | 17 +++---- .../cluster/_hdbscan/tests/test_hdbscan.py | 45 +++++++------------ 2 files changed, 26 insertions(+), 36 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index a96c83ee953ea..16abf0afa9948 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -136,8 +136,6 @@ def _hdbscan_sparse_distance_matrix( alpha=1.0, **metric_params, ): - assert issparse(X) - # Compute sparse mutual reachability graph # if max_dist > 0, max distance to use when the reachability is infinite max_dist = metric_params.get("max_dist", 0.0) @@ -154,11 +152,11 @@ def _hdbscan_sparse_distance_matrix( > 1 ): raise ValueError( - "There exists points with less than %s neighbors. " - "Ensure your distance matrix has non zeros values for " + "There exists points with fewer than %s neighbors. " + "Ensure your distance matrix has non-zero values for " "at least `min_sample`=%s neighbors for each points (i.e. K-nn graph), " - "or specify a `max_dist` to use when distances are missing." - % (min_samples, min_samples) + "or specify a `max_dist` in `metric_params` to use when distances " + "are missing." 
% (min_samples, min_samples) ) # Compute the minimum spanning tree for the sparse graph @@ -499,7 +497,9 @@ def fit(self, X, y=None): elif issparse(X): # Handle sparse precomputed distance matrices separately X = self._validate_data( - X, accept_sparse="csr", force_all_finite=False, dtype=np.float64 + X, + accept_sparse="csr", + dtype=np.float64, ) else: # Only non-sparse, precomputed distance matrices are handled here @@ -565,7 +565,8 @@ def fit(self, X, y=None): kwargs.pop(key, None) elif self.metric in KDTree.valid_metrics: func = _hdbscan_prims - else: # Metric is a valid BallTree metric + else: + # Metric is a valid BallTree metric func = _hdbscan_prims kwargs["algo"] = "ball_tree" diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index 68bf0776832ca..7f6a647094357 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -8,7 +8,6 @@ from scipy.spatial import distance from scipy.stats import mode -from sklearn import datasets from sklearn.cluster import HDBSCAN from sklearn.datasets import make_blobs from sklearn.metrics import fowlkes_mallows_score @@ -44,16 +43,6 @@ def test_missing_data(): assert np.allclose(clean_model.labels_, model.labels_[clean_indices]) -def generate_noisy_data(): - rng = np.random.RandomState(0) - blobs, _ = datasets.make_blobs( - n_samples=200, centers=[(-0.75, 2.25), (1.0, 2.0)], cluster_std=0.25 - ) - moons, _ = datasets.make_moons(n_samples=200, noise=0.05) - noise = rng.uniform(-1.0, 3.0, (50, 2)) - return np.vstack([blobs, moons, noise]) - - def homogeneity(labels1, labels2): num_missed = 0.0 for label in set(labels1): @@ -73,9 +62,9 @@ def test_hdbscan_distance_matrix(): D = distance.squareform(distance.pdist(X)) D /= np.max(D) - labels = HDBSCAN(metric="precomputed").fit(D).labels_ - n_clusters_ = len(set(labels)) - int(-1 in labels) - assert n_clusters_ == n_clusters_true + labels = HDBSCAN(metric="precomputed").fit_predict(D) + n_clusters = len(set(labels)) - int(-1 in labels) + assert n_clusters == n_clusters_true # Check that clustering is arbitrarily good # This is a heuristic to guard against regression @@ -93,7 +82,7 @@ def test_hdbscan_sparse_distance_matrix(): D = sparse.csr_matrix(D) D.eliminate_zeros() - labels = HDBSCAN(metric="precomputed").fit(D).labels_ + labels = HDBSCAN(metric="precomputed").fit_predict(D) n_clusters = len(set(labels)) - int(-1 in labels) assert n_clusters == n_clusters_true @@ -121,8 +110,8 @@ def test_hdbscan_feature_vector(): @pytest.mark.parametrize("metric", _VALID_METRICS) def test_hdbscan_algorithms(algo, metric): labels = HDBSCAN(algorithm=algo).fit_predict(X) - n_clusters_2 = len(set(labels)) - int(-1 in labels) - assert n_clusters_2 == n_clusters_true + n_clusters = len(set(labels)) - int(-1 in labels) + assert n_clusters == n_clusters_true ALGOS_TREES = { "kdtree": KDTree, @@ -160,8 +149,8 @@ def test_hdbscan_algorithms(algo, metric): def test_hdbscan_dbscan_clustering(): clusterer = HDBSCAN().fit(X) labels = clusterer.dbscan_clustering(0.3) - n_clusters_1 = len(set(labels)) - int(-1 in labels) - assert n_clusters_true == n_clusters_1 + n_clusters = len(set(labels)) - int(-1 in labels) + assert n_clusters == n_clusters_true def test_hdbscan_high_dimensional(): @@ -172,22 +161,22 @@ def test_hdbscan_high_dimensional(): metric="seuclidean", metric_params={"V": np.ones(H.shape[1])}, ).fit_predict(H) - n_clusters_2 = len(set(labels)) - int(-1 in labels) - assert n_clusters_2 == n_clusters_true + 
n_clusters = len(set(labels)) - int(-1 in labels) + assert n_clusters == n_clusters_true def test_hdbscan_best_balltree_metric(): labels = HDBSCAN( metric="seuclidean", metric_params={"V": np.ones(X.shape[1])} ).fit_predict(X) - n_clusters_2 = len(set(labels)) - int(-1 in labels) - assert n_clusters_2 == n_clusters_true + n_clusters = len(set(labels)) - int(-1 in labels) + assert n_clusters == n_clusters_true def test_hdbscan_no_clusters(): labels = HDBSCAN(min_cluster_size=len(X) - 1).fit_predict(X) - n_clusters_2 = len(set(labels)) - int(-1 in labels) - assert n_clusters_2 == 0 + n_clusters = len(set(labels)) - int(-1 in labels) + assert n_clusters == 0 def test_hdbscan_min_cluster_size(): @@ -205,8 +194,8 @@ def test_hdbscan_min_cluster_size(): def test_hdbscan_callable_metric(): metric = distance.euclidean labels = HDBSCAN(metric=metric).fit_predict(X) - n_clusters_2 = len(set(labels)) - int(-1 in labels) - assert n_clusters_2 == n_clusters_true + n_clusters = len(set(labels)) - int(-1 in labels) + assert n_clusters == n_clusters_true def test_hdbscan_input_lists(): @@ -335,6 +324,6 @@ def test_hdbscan_precomputed_array_like(): def test_hdbscan_sparse_distances_too_few_nonzero(): X = sparse.csr_matrix(np.zeros((10, 10))) - msg = "There exists points with less than" + msg = "There exists points with fewer than" with pytest.raises(ValueError, match=msg): HDBSCAN(metric="precomputed").fit(X) From 33f950ba9b0dbeb49e9604e969439ccd5d194e39 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 6 Sep 2022 15:50:33 -0400 Subject: [PATCH 100/160] Parameter simplification, and cluster_center refactor --- sklearn/cluster/_hdbscan/hdbscan.py | 118 ++++++++---------- .../cluster/_hdbscan/tests/test_hdbscan.py | 12 +- 2 files changed, 56 insertions(+), 74 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 16abf0afa9948..594d863f5fdf7 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -27,30 +27,6 @@ from ._tree import compute_stability, condense_tree, get_clusters, labelling_at_cut FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics -_PARAM_CONSTRAINTS = { - "min_cluster_size": [Interval(Integral, left=2, right=None, closed="left")], - "min_samples": [Interval(Integral, left=1, right=None, closed="left"), None], - "cluster_selection_epsilon": [Interval(Real, left=0, right=None, closed="left")], - "max_cluster_size": [Interval(Integral, left=0, right=None, closed="left")], - "metric": [StrOptions(set(FAST_METRICS + ["precomputed"])), callable], - "alpha": [Interval(Real, left=0, right=None, closed="neither")], - "algorithm": [ - StrOptions( - { - "auto", - "brute", - "kdtree", - "balltree", - } - ) - ], - "leaf_size": [Interval(Integral, left=1, right=None, closed="left")], - "memory": [str, None, Path], - "n_jobs": [Integral, None], - "cluster_selection_method": [StrOptions({"eom", "leaf"})], - "allow_single_cluster": ["boolean"], - "metric_params": [dict, None], -} def _tree_to_labels( @@ -423,7 +399,32 @@ class HDBSCAN(ClusterMixin, BaseEstimator): array([ 2, 6, -1, ..., -1, -1, -1]) """ - _parameter_constraints = _PARAM_CONSTRAINTS + _parameter_constraints = { + "min_cluster_size": [Interval(Integral, left=2, right=None, closed="left")], + "min_samples": [Interval(Integral, left=1, right=None, closed="left"), None], + "cluster_selection_epsilon": [ + Interval(Real, left=0, right=None, closed="left") + ], + "max_cluster_size": [Interval(Integral, left=0, right=None, closed="left")], + 
"metric": [StrOptions(set(FAST_METRICS + ["precomputed"])), callable], + "alpha": [Interval(Real, left=0, right=None, closed="neither")], + "algorithm": [ + StrOptions( + { + "auto", + "brute", + "kdtree", + "balltree", + } + ) + ], + "leaf_size": [Interval(Integral, left=1, right=None, closed="left")], + "memory": [str, None, Path], + "n_jobs": [Integral, None], + "cluster_selection_method": [StrOptions({"eom", "leaf"})], + "allow_single_cluster": ["boolean"], + "metric_params": [dict, None], + } def __init__( self, @@ -474,7 +475,7 @@ def fit(self, X, y=None): Returns self. """ self._validate_params() - metric_params = self.metric_params or {} + self._metric_params = self.metric_params or {} if self.metric != "precomputed": # Non-precomputed matrices may contain non-finite values. X = self._validate_data( @@ -509,7 +510,7 @@ def fit(self, X, y=None): # from the given distance matrix. tmp = X.copy() tmp[np.isinf(tmp)] = 1 - self._validate_data(tmp) + self._validate_data(tmp, dtype=np.float64) self.n_features_in_ = X.shape[1] self._min_samples = ( @@ -527,7 +528,7 @@ def fit(self, X, y=None): metric=self.metric, leaf_size=self.leaf_size, n_jobs=self.n_jobs, - **metric_params, + **self._metric_params, ) if "kdtree" in self.algorithm and self.metric not in KDTree.valid_metrics: raise ValueError( @@ -621,19 +622,27 @@ def fit_predict(self, X, y=None): self.fit(X) return self.labels_ - def weighted_cluster_centroid(self, cluster_id): + def weighted_cluster_center(self, cluster_id, mode="centroid"): """ Provide an approximate representative point for a given cluster. - Note that this technique assumes a euclidean metric for speed of - computation. For more general metrics use the `weighted_cluster_medoid` - method which is slower, but can work with more general metrics. - Parameters ---------- cluster_id : int The id of the cluster to compute a centroid for. + mode : str, default="centroid" + The mode to use when providing the cluster center. The options are: + - "centroid" which calculates the center by taking the weighted + average of their positions. Note that the algorithm assumes a + euclidean metric and does not guarantee that the output will be + an observed data point. + - "medoid" which calculates the center by taking the point in the + fitted data which minimizes the distance to all other points in + the cluster. This is slower than "centroid" since it requires + computing additional pairwise distances between points of the + same cluster but guarantees the output is an observed data point. + Returns ------- centroid : array of shape (n_features,) @@ -652,42 +661,15 @@ def weighted_cluster_centroid(self, cluster_id): cluster_data = self._raw_data[mask] cluster_membership_strengths = self.probabilities_[mask] - return np.average(cluster_data, weights=cluster_membership_strengths, axis=0) - - def weighted_cluster_medoid(self, cluster_id): - """ - Provide an approximate representative point for a given cluster. - - Note that this technique can be very slow and memory intensive for - large clusters. For faster results use the `weighted_cluster_centroid` - method which is faster, but assumes a euclidean metric. - - Parameters - ---------- - cluster_id : int - The id of the cluster to compute a medoid for. - - Returns - ------- - centroid : array of shape (n_features,) - A representative medoid for cluster `cluster_id`. 
- """ - if not hasattr(self, "labels_"): - raise AttributeError("Model has not been fit to data") - - if cluster_id == -1: - raise ValueError( - "Cannot calculate weighted centroid for -1 cluster " - "since it is a noise cluster" + if mode == "centroid": + return np.average( + cluster_data, weights=cluster_membership_strengths, axis=0 ) - mask = self.labels_ == cluster_id - cluster_data = self._raw_data[mask] - cluster_membership_strengths = self.probabilities_[mask] - metric_params = self.metric_params or {} - - dist_mat = pairwise_distances(cluster_data, metric=self.metric, **metric_params) - + # mode == "medoid" + dist_mat = pairwise_distances( + cluster_data, metric=self.metric, **self._metric_params + ) dist_mat = dist_mat * cluster_membership_strengths medoid_index = np.argmin(dist_mat.sum(axis=1)) return cluster_data[medoid_index] @@ -732,4 +714,4 @@ def dbscan_clustering(self, cut_distance, min_cluster_size=5): ) def _more_tags(self): - return {"allow_nan": True} + return {"allow_nan": self.metric != "precomputed"} diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index 7f6a647094357..6fb3f14e14a99 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -244,19 +244,19 @@ def test_hdbscan_centroids_medoids(): clusterer = HDBSCAN().fit(H) for idx, center in enumerate(centers): - centroid = clusterer.weighted_cluster_centroid(idx) + centroid = clusterer.weighted_cluster_center(idx, mode="centroid") assert_array_almost_equal(centroid, center, decimal=1) - medoid = clusterer.weighted_cluster_medoid(idx) + medoid = clusterer.weighted_cluster_center(idx, mode="medoid") assert_array_almost_equal(medoid, center, decimal=1) def test_hdbscan_no_centroid_medoid_for_noise(): clusterer = HDBSCAN().fit(X) with pytest.raises(ValueError): - clusterer.weighted_cluster_centroid(-1) + clusterer.weighted_cluster_center(-1, mode="centroid") with pytest.raises(ValueError): - clusterer.weighted_cluster_medoid(-1) + clusterer.weighted_cluster_center(-1, mode="medoid") def test_hdbscan_allow_single_cluster_with_epsilon(): @@ -311,9 +311,9 @@ def test_hdbscan_unfit_centers_errors(): hdb = HDBSCAN() msg = "Model has not been fit to data" with pytest.raises(AttributeError, match=msg): - hdb.weighted_cluster_centroid(0) + hdb.weighted_cluster_center(0, mode="centroid") with pytest.raises(AttributeError, match=msg): - hdb.weighted_cluster_medoid(0) + hdb.weighted_cluster_center(0, mode="medoid") def test_hdbscan_precomputed_array_like(): From d29cc0273fcc226f01f6958a76430c0b3a89cdad Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 6 Sep 2022 16:13:32 -0400 Subject: [PATCH 101/160] Minor typo corrections and reordering of user-guide entry --- doc/modules/clustering.rst | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 6ef029a1c17a9..305c1f8a8da7f 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -989,13 +989,19 @@ as These two notions allow us to construct the *mutual reachability graph* :math:`G_{ms}` defined for a fixed choice of `min_samples` by associating each sample :math:`x_p` with a vertex of the graph, and thus edges between points -:math:`x_p, x_q` is the mutual reachability distance :math:`d_m(x_p, x_q)` +:math:`x_p, x_q` are the mutual reachability distance :math:`d_m(x_p, x_q)` between them. 
We may build subsets of this graph, labeled :math:`G_{ms,\epsilon}` defined as the original graph after removing any edges with value greater than `eps`. Any points whose core distance is less than `eps` are at this staged marked as noise. The remaining points are then clustered by finding the connected components of this trimmed graph. +.. note:: + + Taking the connected components of a trimmed graph :math:`G_{ms,\epsilon}` is + equivalent to running DBSCAN* with `min_samples` and `eps`. DBSCAN* is a + slightly modified version of DBSCAN mentioned in [CM2013]_. + Hierarchical Clustering ----------------------- HDBSCAN can be seen as an algorithm which performs DBSCAN* clustering across all @@ -1009,21 +1015,14 @@ weight. An outline of the HDBSCAN algorithm is as follows: 2. Extend the MST by adding a "self edge" for each vertex, with weight equal to the core distance of the underlying sample. 3. Initialize a single cluster and label for the MST. - 4. Remove the edge with the greatest weight is from the MST (ties are + 4. Remove the edge with the greatest weight from the MST (ties are removed simultaneously). 5. Assign cluster labels to the connected components which contain the end points of the now-removed edge. If the component does not have at least one edge it is instead assigned a "null" label marking it as noise. 6. Repeat 4-6 until there are no more connected components. -.. note:: - - The clustering generated by taking the connected components of a trimmed - graph :math:`G_{ms,\epsilon}` equivalent to running DBSCAN* with `min_samples` - and `eps`. DBSCAN* is a slightly modified version of DBSCAN mentioned in - [CM2013]_. - -HDBSCAN is therefore able to obtain all possible partitions obtainable by +HDBSCAN is therefore able to obtain all possible partitions achievable by DBSCAN* for a fixed choice of `min_samples` in a hierarchical fashion. Indeed, this allows HDBSCAN to perform clustering across multiple densities and as such it no longer needs `eps` to be given as a hyperparameter. 
Instead From 3b86f1dbaee30003cc1c0d6a3783b0c8d8d61fbc Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 6 Sep 2022 16:21:33 -0400 Subject: [PATCH 102/160] streamlined test --- .../cluster/_hdbscan/tests/test_hdbscan.py | 40 +++++++++---------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index 6fb3f14e14a99..21c4bc4c214da 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -113,6 +113,10 @@ def test_hdbscan_algorithms(algo, metric): n_clusters = len(set(labels)) - int(-1 in labels) assert n_clusters == n_clusters_true + # Validation for brute is handled by `pairwise_distances` + if algo in ("brute", "auto"): + return + ALGOS_TREES = { "kdtree": KDTree, "balltree": BallTree, @@ -123,27 +127,21 @@ def test_hdbscan_algorithms(algo, metric): "minkowski": {"p": 2}, "wminkowski": {"p": 2, "w": np.ones(X.shape[1])}, }.get(metric, None) - if algo not in ("auto", "brute"): - if metric not in ALGOS_TREES[algo].valid_metrics: - with pytest.raises(ValueError): - HDBSCAN( - algorithm=algo, - metric=metric, - metric_params=metric_params, - ).fit(X) - elif metric == "wminkowski": - with pytest.warns(FutureWarning): - HDBSCAN( - algorithm=algo, - metric=metric, - metric_params=metric_params, - ).fit(X) - else: - HDBSCAN( - algorithm=algo, - metric=metric, - metric_params=metric_params, - ).fit(X) + + hdb = HDBSCAN( + algorithm=algo, + metric=metric, + metric_params=metric_params, + ) + + if metric not in ALGOS_TREES[algo].valid_metrics: + with pytest.raises(ValueError): + hdb.fit(X) + elif metric == "wminkowski": + with pytest.warns(FutureWarning): + hdb.fit(X) + else: + hdb.fit(X) def test_hdbscan_dbscan_clustering(): From cada149c7fb625c0521aa8abcca9af4850b90a78 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 6 Sep 2022 16:53:26 -0400 Subject: [PATCH 103/160] Documentation update per review feedback --- doc/conf.py | 1 - doc/modules/classes.rst | 1 - doc/modules/clustering.rst | 6 +++--- examples/cluster/plot_hdbscan.py | 25 +++++++++++++------------ 4 files changed, 16 insertions(+), 17 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index ec0aab9f766fc..430e1714ec6cf 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -566,7 +566,6 @@ def setup(app): "sklearn.cluster.dbscan": "dbscan-function", "sklearn.covariance.oas": "oas-function", "sklearn.decomposition.fastica": "fastica-function", - "sklearn.cluster.hdbscan": "hdbscan-function", } diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 58e7df247eaac..b28b716141a78 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -124,7 +124,6 @@ Functions cluster.cluster_optics_xi cluster.compute_optics_graph cluster.dbscan - cluster.hdbscan cluster.estimate_bandwidth cluster.k_means cluster.kmeans_plusplus diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 305c1f8a8da7f..1eb2e51273d43 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -990,9 +990,9 @@ These two notions allow us to construct the *mutual reachability graph* :math:`G_{ms}` defined for a fixed choice of `min_samples` by associating each sample :math:`x_p` with a vertex of the graph, and thus edges between points :math:`x_p, x_q` are the mutual reachability distance :math:`d_m(x_p, x_q)` -between them. 
We may build subsets of this graph, labeled -:math:`G_{ms,\epsilon}` defined as the original graph after removing any edges -with value greater than `eps`. Any points whose core distance is less than `eps` +between them. We may build subsets of this graph, denoted as +:math:`G_{ms,\epsilon}`, by removing any edges with value greater than `eps` +from the original graph. Any points whose core distance is less than `eps` are at this staged marked as noise. The remaining points are then clustered by finding the connected components of this trimmed graph. diff --git a/examples/cluster/plot_hdbscan.py b/examples/cluster/plot_hdbscan.py index c25dd0cef67d5..98f8f6a3fce51 100644 --- a/examples/cluster/plot_hdbscan.py +++ b/examples/cluster/plot_hdbscan.py @@ -3,9 +3,10 @@ ==================================== Demo of HDBSCAN clustering algorithm ==================================== +.. currentmodule:: sklearn -In this demo we will take a look at :class:`sklearn.cluster.HDBSCAN` from the -perspective of generalizing the :class:`sklearn.cluster.DBSCAN` algorithm. +In this demo we will take a look at :class:`cluster.HDBSCAN` from the +perspective of generalizing the :class:`cluster.DBSCAN` algorithm. We'll compare both algorithms on specific datasets. Finally we'll evaluate HDBSCAN's sensitivity to certain hyperparameters. We first define a couple utility functions for convenience. @@ -20,7 +21,7 @@ def plot(X, labels, probabilities=None, parameters=None, ground_truth=False, ax=None): if ax is None: - _, ax = plt.subplots() + _, ax = plt.subplots(figsize=(10, 4)) labels = labels if labels is not None else np.ones(X.shape[0]) probabilities = probabilities if probabilities is not None else np.ones(X.shape[0]) # Black removed and is used for noise instead. @@ -51,6 +52,7 @@ def plot(X, labels, probabilities=None, parameters=None, ground_truth=False, ax= parameters_str = ", ".join(f"{k}={v}" for k, v in parameters.items()) title += f" | {parameters_str}" ax.set_title(title) + plt.tight_layout() # %% @@ -62,12 +64,11 @@ def plot(X, labels, probabilities=None, parameters=None, ground_truth=False, ax= # DBSCAN it does not require specification of an arbitray (and indeed tricky) # `eps` hyperparameter. For example, below we generate a dataset composed of # a mixture of three diagonal Gaussians. -fig, axis = plt.subplots(1, 1, figsize=(12, 5)) centers = [[1, 1], [-1, -1], [1.5, -1.5]] X, labels_true = make_blobs( n_samples=750, centers=centers, cluster_std=[0.4, 0.1, 0.75], random_state=0 ) -plot(X, labels=labels_true, ground_truth=True, ax=axis) +plot(X, labels=labels_true, ground_truth=True) # %% # Scale Invariance # ----------------- @@ -77,7 +78,7 @@ def plot(X, labels, probabilities=None, parameters=None, ground_truth=False, ax= # epsilon value that works for one dataset, and try to apply it to a # similar but rescaled versions of the dataset. Below are plots of the original # dataset, and versions rescaled by 0.5 and 3 respectively. -fig, axes = plt.subplots(3, 1, figsize=(12, 16)) +fig, axes = plt.subplots(3, 1, figsize=(10, 12)) parameters = {"eps": 0.3} dbs = DBSCAN(**parameters).fit(X) plot(X, dbs.labels_, parameters=parameters, ax=axes[0]) @@ -99,9 +100,9 @@ def plot(X, labels, probabilities=None, parameters=None, ground_truth=False, ax= # great care must be taken to select the appropriate value for `eps`. HDBSCAN # is much more robust in this sense. 
HDBSCAN can be seen as clustering over # all possible values of `eps` and extracting the best clusters from all -# possible clusters (see :ref:`HDBSCAN`). One immediate advantage is that -# HDBSCAN is scale-invariant. -fig, axes = plt.subplots(3, 1, figsize=(12, 16)) +# possible clusters (see :ref:`User Guide `). One immediate +# advantage is that HDBSCAN is scale-invariant. +fig, axes = plt.subplots(3, 1, figsize=(10, 12)) hdb = HDBSCAN().fit(X) plot(X, hdb.labels_, hdb.probabilities_, ax=axes[0]) hdb.fit(0.5 * X) @@ -132,7 +133,7 @@ def plot(X, labels, probabilities=None, parameters=None, ground_truth=False, ax= # clusters into many false clusters. Not to mention this requires manually # tuning choices of `eps` until we find a tradeoff that we are comfortable # with. Let's see how DBSCAN tackles this. -fig, axes = plt.subplots(2, 1, figsize=(12, 10)) +fig, axes = plt.subplots(2, 1, figsize=(10, 8)) params = {"eps": 0.7} dbs = DBSCAN(**params).fit(X) plot(X, dbs.labels_, parameters=params, ax=axes[0]) @@ -182,7 +183,7 @@ def plot(X, labels, probabilities=None, parameters=None, ground_truth=False, ax= # overlap. PARAM = ({"min_cluster_size": 5}, {"min_cluster_size": 3}, {"min_cluster_size": 25}) -fig, axes = plt.subplots(3, 1, figsize=(12, 16)) +fig, axes = plt.subplots(3, 1, figsize=(10, 12)) for i, param in enumerate(PARAM): hdb = HDBSCAN(**param).fit(X) labels = hdb.labels_ @@ -204,7 +205,7 @@ def plot(X, labels, probabilities=None, parameters=None, ground_truth=False, ax= {"min_cluster_size": 20, "min_samples": 3}, {"min_cluster_size": 20, "min_samples": 25}, ) -fig, axes = plt.subplots(3, 1, figsize=(12, 16)) +fig, axes = plt.subplots(3, 1, figsize=(10, 12)) for i, param in enumerate(PARAM): hdb = HDBSCAN(**param).fit(X) labels = hdb.labels_ From 45aab3cd469f6f4d4f914a745fb3cfb7278fc8c0 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 6 Sep 2022 17:10:02 -0400 Subject: [PATCH 104/160] Removed unnecessary function and made minor tweak to test --- sklearn/cluster/_hdbscan/hdbscan.py | 1 + .../cluster/_hdbscan/tests/test_hdbscan.py | 20 ++----------------- 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 594d863f5fdf7..c70445fd32023 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -508,6 +508,7 @@ def fit(self, X, y=None): # Perform data validation after removing infinite values (numpy.inf) # from the given distance matrix. 
+ X = np.asarray(X) tmp = X.copy() tmp[np.isinf(tmp)] = 1 self._validate_data(tmp, dtype=np.float64) diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index 21c4bc4c214da..21e6f8bfa6983 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -6,7 +6,6 @@ import pytest from scipy import sparse, stats from scipy.spatial import distance -from scipy.stats import mode from sklearn.cluster import HDBSCAN from sklearn.datasets import make_blobs @@ -43,21 +42,6 @@ def test_missing_data(): assert np.allclose(clean_model.labels_, model.labels_[clean_indices]) -def homogeneity(labels1, labels2): - num_missed = 0.0 - for label in set(labels1): - matches = labels2[labels1 == label] - match_mode = mode(matches)[0][0] - num_missed += np.sum(matches != match_mode) - - for label in set(labels2): - matches = labels1[labels2 == label] - match_mode = mode(matches)[0][0] - num_missed += np.sum(matches != match_mode) - - return num_missed / 2.0 - - def test_hdbscan_distance_matrix(): D = distance.squareform(distance.pdist(X)) D /= np.max(D) @@ -314,8 +298,8 @@ def test_hdbscan_unfit_centers_errors(): hdb.weighted_cluster_center(0, mode="medoid") -def test_hdbscan_precomputed_array_like(): - X = np.array([[1, np.inf], [np.inf, 1]]) +@pytest.mark.parametrize("X", [np.array([[1, np.inf], [np.inf, 1]]), [[1, 2], [2, 1]]]) +def test_hdbscan_precomputed_array_like(X): HDBSCAN(metric="precomputed").fit(X) From 67cab1a52918846d9c73563c6bbca46515cbcd2d Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 6 Sep 2022 17:13:52 -0400 Subject: [PATCH 105/160] Simplified plotting demo single-axis plots --- examples/cluster/plot_hdbscan.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/cluster/plot_hdbscan.py b/examples/cluster/plot_hdbscan.py index 98f8f6a3fce51..ebecc410acda9 100644 --- a/examples/cluster/plot_hdbscan.py +++ b/examples/cluster/plot_hdbscan.py @@ -118,12 +118,11 @@ def plot(X, labels, probabilities=None, parameters=None, ground_truth=False, ax= # Traditional DBSCAN assumes that any potential clusters are homogenous in # density. HDBSCAN is free from such constraints. To demonstrate this we # consider the following dataset -fig, axis = plt.subplots(1, 1, figsize=(12, 5)) centers = [[-0.85, -0.85], [-0.85, 0.85], [3, 3], [3, -3]] X, labels_true = make_blobs( n_samples=750, centers=centers, cluster_std=[0.2, 0.35, 1.35, 1.35], random_state=0 ) -plot(X, labels=labels_true, ground_truth=True, ax=axis) +plot(X, labels=labels_true, ground_truth=True) # %% # This dataset is more difficult for DBSCAN due to the varying densities and @@ -148,9 +147,8 @@ def plot(X, labels, probabilities=None, parameters=None, ground_truth=False, ax= # that DBSCAN is incapable of simultaneously separating the two dense clusters # while preventing the sparse clusters from fragmenting. Let's compare with # HDBSCAN. 
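# Illustrative sketch (not part of the patch): one way to quantify the comparison
# narrated in the example above, scoring DBSCAN and HDBSCAN against the generating
# labels with the adjusted Rand index. It assumes the in-progress
# `sklearn.cluster.HDBSCAN` estimator from this series; the `eps` values are
# illustrative only.
from sklearn.cluster import DBSCAN, HDBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import adjusted_rand_score

centers = [[-0.85, -0.85], [-0.85, 0.85], [3, 3], [3, -3]]
X_demo, y_demo = make_blobs(
    n_samples=750, centers=centers, cluster_std=[0.2, 0.35, 1.35, 1.35], random_state=0
)
for eps in (0.3, 0.7):
    dbs_labels = DBSCAN(eps=eps).fit_predict(X_demo)
    print(f"DBSCAN(eps={eps}) ARI: {adjusted_rand_score(y_demo, dbs_labels):.2f}")
hdb_labels = HDBSCAN().fit_predict(X_demo)
print(f"HDBSCAN ARI: {adjusted_rand_score(y_demo, hdb_labels):.2f}")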
-fig, axis = plt.subplots(1, 1, figsize=(12, 5)) hdb = HDBSCAN().fit(X) -plot(X, hdb.labels_, hdb.probabilities_, ax=axis) +plot(X, hdb.labels_, hdb.probabilities_) # %% # HDBSCAN is able to pick up and preserve the multi-scale structure of the From 7edfd5582657261c4ed5d3b894198bade5b73f59 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 14 Sep 2022 12:26:21 -0400 Subject: [PATCH 106/160] Refactored weighted centers --- sklearn/cluster/_hdbscan/hdbscan.py | 138 ++++++++---------- .../cluster/_hdbscan/tests/test_hdbscan.py | 31 +--- 2 files changed, 67 insertions(+), 102 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index c70445fd32023..f8405a59b1bdb 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -9,11 +9,9 @@ # License: BSD 3 clause from numbers import Integral, Real -from pathlib import Path from warnings import warn import numpy as np -from joblib import Memory from scipy.sparse import csgraph, issparse from sklearn.base import BaseEstimator, ClusterMixin @@ -176,9 +174,8 @@ def _hdbscan_prims( p=None, ).fit(X) - n_samples = X.shape[0] - core_distances = np.empty(n_samples, dtype=np.float64) - core_distances[:] = nbrs.kneighbors(X, min_samples)[0][:, -1] + neighbors_distances, _ = nbrs.kneighbors(X, min_samples, return_distance=True) + core_distances = np.ascontiguousarray(neighbors_distances[:, -1]) dist_metric = DistanceMetric.get_metric(metric, **metric_params) # Mutual reachability distance is implicit in mst_linkage_core_vector @@ -309,11 +306,6 @@ class HDBSCAN(ClusterMixin, BaseEstimator): usage. If you are running out of memory consider increasing the `leaf_size` parameter. Ignored for `algorithm=brute`. - memory : str, default=None - Used to cache the output of the computation of the tree. - By default, no caching is done. If a string is given, it is the - path to the caching directory. - n_jobs : int, default=None Number of jobs to run in parallel to calculate distances. `None` means 1 unless in a :obj:`joblib.parallel_backend` context. @@ -334,6 +326,22 @@ class HDBSCAN(ClusterMixin, BaseEstimator): to True will override this and allow single cluster results in the case that you feel this is a valid result for your dataset. + store_centers : str, default=None + Which, if any, cluster centers to compute and store. The options are: + - `None` which does not compute nor store any centers. + - `"centroid"` which calculates the center by taking the weighted + average of their positions. Note that the algorithm uses the + euclidean metric and does not guarantee that the output will be + an observed data point. + - `"medoid"` which calculates the center by taking the point in the + fitted data which minimizes the distance to all other points in + the cluster. This is slower than "centroid" since it requires + computing additional pairwise distances between points of the + same cluster but guarantees the output is an observed data point. + The medoid is also well-defined for arbitrary metrics, and does not + depend on a euclidean metric. + - `"both"`which computes and stores both forms of centers. + metric_params : dict, default=None Arguments passed to the distance metric. 
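# Illustrative sketch (not part of the patch) of the `store_centers` option
# documented above, assuming the in-progress `sklearn.cluster.HDBSCAN` estimator
# from this series and the `centroids_` / `medoids_` attributes it describes.
from sklearn.cluster import HDBSCAN
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(
    n_samples=500, centers=[[0.0, 0.0], [3.0, 3.0]], cluster_std=0.5, random_state=0
)
hdb = HDBSCAN(min_cluster_size=10, store_centers="both").fit(X_demo)
# One row per non-noise cluster: centroids are probability-weighted euclidean
# averages, medoids are observed points minimizing intra-cluster distances.
print(hdb.centroids_)
print(hdb.medoids_)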
@@ -419,10 +427,10 @@ class HDBSCAN(ClusterMixin, BaseEstimator): ) ], "leaf_size": [Interval(Integral, left=1, right=None, closed="left")], - "memory": [str, None, Path], "n_jobs": [Integral, None], "cluster_selection_method": [StrOptions({"eom", "leaf"})], "allow_single_cluster": ["boolean"], + "store_centers": [None, StrOptions({"centroid", "medoid", "both"})], "metric_params": [dict, None], } @@ -436,10 +444,10 @@ def __init__( alpha=1.0, algorithm="auto", leaf_size=40, - memory=None, n_jobs=4, cluster_selection_method="eom", allow_single_cluster=False, + store_centers=None, metric_params=None, ): self.min_cluster_size = min_cluster_size @@ -450,10 +458,10 @@ def __init__( self.metric = metric self.algorithm = algorithm self.leaf_size = leaf_size - self.memory = memory self.n_jobs = n_jobs self.cluster_selection_method = cluster_selection_method self.allow_single_cluster = allow_single_cluster + self.store_centers = store_centers self.metric_params = metric_params def fit(self, X, y=None): @@ -508,18 +516,16 @@ def fit(self, X, y=None): # Perform data validation after removing infinite values (numpy.inf) # from the given distance matrix. - X = np.asarray(X) - tmp = X.copy() - tmp[np.isinf(tmp)] = 1 - self._validate_data(tmp, dtype=np.float64) - + X = self._validate_data(X, force_all_finite=False, dtype=np.float64) + if np.isnan(X).any(): + # TODO: Support np.nan in Cython implementation for sparse + # HDBSCAN + raise ValueError("np.nan values found in precomputed-sparse") self.n_features_in_ = X.shape[1] self._min_samples = ( self.min_cluster_size if self.min_samples is None else self.min_samples ) - memory = Memory(location=self.memory, verbose=0) - func = None kwargs = dict( X=X, @@ -552,7 +558,7 @@ def fit(self, X, y=None): if self.algorithm == "brute": func = _hdbscan_brute - for key in ("algo", "leaf_size", "n_jobs"): + for key in ("algo", "leaf_size"): kwargs.pop(key, None) elif self.algorithm == "kdtree": func = _hdbscan_prims @@ -563,16 +569,17 @@ def fit(self, X, y=None): if issparse(X) or self.metric not in FAST_METRICS: # We can't do much with sparse matrices ... func = _hdbscan_brute - for key in ("algo", "leaf_size", "n_jobs"): + for key in ("algo", "leaf_size"): kwargs.pop(key, None) elif self.metric in KDTree.valid_metrics: + # TODO: Benchmark KD vs Ball Tree efficacy func = _hdbscan_prims else: # Metric is a valid BallTree metric func = _hdbscan_prims kwargs["algo"] = "ball_tree" - single_linkage_tree = memory.cache(func)(**kwargs) + single_linkage_tree = func(**kwargs) ( self.labels_, @@ -600,6 +607,8 @@ def fit(self, X, y=None): new_probabilities[finite_index] = self.probabilities_ self.probabilities_ = new_probabilities + if self.store_centers: + self._weighted_cluster_center(X) return self def fit_predict(self, X, y=None): @@ -623,57 +632,34 @@ def fit_predict(self, X, y=None): self.fit(X) return self.labels_ - def weighted_cluster_center(self, cluster_id, mode="centroid"): - """ - Provide an approximate representative point for a given cluster. - - Parameters - ---------- - cluster_id : int - The id of the cluster to compute a centroid for. - - mode : str, default="centroid" - The mode to use when providing the cluster center. The options are: - - "centroid" which calculates the center by taking the weighted - average of their positions. Note that the algorithm assumes a - euclidean metric and does not guarantee that the output will be - an observed data point. 
- - "medoid" which calculates the center by taking the point in the - fitted data which minimizes the distance to all other points in - the cluster. This is slower than "centroid" since it requires - computing additional pairwise distances between points of the - same cluster but guarantees the output is an observed data point. - - Returns - ------- - centroid : array of shape (n_features,) - A representative centroid for cluster `cluster_id`. - """ - if not hasattr(self, "labels_"): - raise AttributeError("Model has not been fit to data") - - if cluster_id == -1: - raise ValueError( - "Cannot calculate weighted centroid for -1 cluster " - "since it is a noise cluster" - ) - - mask = self.labels_ == cluster_id - cluster_data = self._raw_data[mask] - cluster_membership_strengths = self.probabilities_[mask] - - if mode == "centroid": - return np.average( - cluster_data, weights=cluster_membership_strengths, axis=0 - ) - - # mode == "medoid" - dist_mat = pairwise_distances( - cluster_data, metric=self.metric, **self._metric_params - ) - dist_mat = dist_mat * cluster_membership_strengths - medoid_index = np.argmin(dist_mat.sum(axis=1)) - return cluster_data[medoid_index] + def _weighted_cluster_center(self, X): + n_clusters = len(set(self.labels_)) + mask = np.empty((X.shape[0],), dtype=np.bool_) + make_centroids = self.store_centers in ("centroid", "both") + make_medoids = self.store_centers in ("medoid", "both") + + if make_centroids: + self.centroids_ = np.empty((n_clusters, X.shape[1]), dtype=np.float64) + if make_medoids: + self.medoids_ = np.empty((n_clusters, X.shape[1]), dtype=np.float64) + + # Need to handle iteratively seen each cluster may have a different + # number of samples, hence we can't create a homogenous 3D array. + for idx in range(n_clusters): + mask = self.labels_ == idx + data = X[mask] + strength = self.probabilities_[mask] + if make_centroids: + self.centroids_[idx] = np.average(data, weights=strength, axis=0) + if make_medoids: + # TODO: Implement weighted argmin PWD backend + dist_mat = pairwise_distances( + data, metric=self.metric, **self._metric_params + ) + dist_mat = dist_mat * strength + medoid_index = np.argmin(dist_mat.sum(axis=1)) + self.medoids_[idx] = data[medoid_index] + return def dbscan_clustering(self, cut_distance, min_cluster_size=5): """ @@ -694,18 +680,16 @@ def dbscan_clustering(self, cut_distance, min_cluster_size=5): Parameters ---------- - cut_distance : float The mutual reachability distance cut value to use to generate a flat clustering. - min_cluster_size : int, optional + min_cluster_size : int, default=5 Clusters smaller than this value with be called 'noise' and remain unclustered in the resulting flat clustering. Returns ------- - labels : array [n_samples] An array of cluster labels, one per datapoint. Unclustered points are assigned the label -1. 
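# Illustrative sketch (not part of the patch) of the `dbscan_clustering` helper
# whose docstring is touched in the hunk above: it re-cuts the already-fitted
# single linkage tree at a fixed mutual-reachability distance, yielding a flat
# DBSCAN*-style labelling without refitting. Assumes the in-progress HDBSCAN
# estimator from this series; the cut distance below is illustrative only.
from sklearn.cluster import HDBSCAN
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=300, centers=3, random_state=0)
hdb = HDBSCAN(min_cluster_size=5).fit(X_demo)
flat_labels = hdb.dbscan_clustering(cut_distance=0.5, min_cluster_size=5)
# Components smaller than `min_cluster_size` at this cut are labelled -1 (noise).
print(sorted(set(flat_labels)))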
diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index 21e6f8bfa6983..801eea65fd271 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -211,34 +211,24 @@ def test_hdbscan_sparse(): HDBSCAN(metric="euclidean", algorithm="balltree").fit(sparse_X) -def test_hdbscan_caching(tmp_path): - - labels1 = HDBSCAN(memory=tmp_path, min_samples=5).fit_predict(X) - labels2 = HDBSCAN(memory=tmp_path, min_samples=5, min_cluster_size=6).fit_predict(X) - n_clusters1 = len(set(labels1)) - int(-1 in labels1) - n_clusters2 = len(set(labels2)) - int(-1 in labels2) - assert n_clusters1 == n_clusters2 - - -def test_hdbscan_centroids_medoids(): +def test_hdbscan_centers(): centers = [(0.0, 0.0), (3.0, 3.0)] H, _ = make_blobs(n_samples=1000, random_state=0, centers=centers, cluster_std=0.5) - clusterer = HDBSCAN().fit(H) + hdb = HDBSCAN(store_centers="both").fit(H) for idx, center in enumerate(centers): - centroid = clusterer.weighted_cluster_center(idx, mode="centroid") + centroid = hdb.centroids_[idx] assert_array_almost_equal(centroid, center, decimal=1) - medoid = clusterer.weighted_cluster_center(idx, mode="medoid") + medoid = hdb.centroids_[idx] assert_array_almost_equal(medoid, center, decimal=1) def test_hdbscan_no_centroid_medoid_for_noise(): - clusterer = HDBSCAN().fit(X) with pytest.raises(ValueError): - clusterer.weighted_cluster_center(-1, mode="centroid") + HDBSCAN(store_centers="centroid").fit(X) with pytest.raises(ValueError): - clusterer.weighted_cluster_center(-1, mode="medoid") + HDBSCAN(store_centers="medoid").fit(X) def test_hdbscan_allow_single_cluster_with_epsilon(): @@ -289,15 +279,6 @@ def test_hdbscan_better_than_dbscan(): assert n_clusters == 4 -def test_hdbscan_unfit_centers_errors(): - hdb = HDBSCAN() - msg = "Model has not been fit to data" - with pytest.raises(AttributeError, match=msg): - hdb.weighted_cluster_center(0, mode="centroid") - with pytest.raises(AttributeError, match=msg): - hdb.weighted_cluster_center(0, mode="medoid") - - @pytest.mark.parametrize("X", [np.array([[1, np.inf], [np.inf, 1]]), [[1, 2], [2, 1]]]) def test_hdbscan_precomputed_array_like(X): HDBSCAN(metric="precomputed").fit(X) From d173707aafed48683e9bc8b00b9715e09530b3e0 Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Wed, 14 Sep 2022 14:39:51 -0400 Subject: [PATCH 107/160] Apply suggestions from code review Co-authored-by: Guillaume Lemaitre --- sklearn/cluster/_hdbscan/hdbscan.py | 50 ++++++++++++++--------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index f8405a59b1bdb..afad099026e7b 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -238,7 +238,7 @@ def get_finite_row_indices(matrix): class HDBSCAN(ClusterMixin, BaseEstimator): - """Perform HDBSCAN clustering from vector array or distance matrix. + """Cluster data using hierarchical density-based clustering. HDBSCAN - Hierarchical Density-Based Spatial Clustering of Applications with Noise. Performs DBSCAN over varying epsilon values and integrates @@ -258,7 +258,7 @@ class HDBSCAN(ClusterMixin, BaseEstimator): min_samples : int, default=None The number of samples in a neighborhood for a point to be considered as a core point. This includes the point itself. - defaults to the `min_cluster_size`. + When `None`, defaults to `min_cluster_size`. 
cluster_selection_epsilon : float, default=0.0 A distance threshold. Clusters below this value will be merged. @@ -275,7 +275,7 @@ class HDBSCAN(ClusterMixin, BaseEstimator): feature array. - If metric is a string or callable, it must be one of - the options allowed by `metrics.pairwise.pairwise_distances` for its + the options allowed by :func:`~sklearn.metrics.pairwise.pairwise_distances` for its metric parameter. - If metric is "precomputed", X is assumed to be a distance matrix and @@ -285,14 +285,14 @@ class HDBSCAN(ClusterMixin, BaseEstimator): A distance scaling parameter as used in robust single linkage. See [3]_ for more information. - algorithm : str, default='auto' + algorithm : {"auto", "brute", "kdtree", "balltree"}, default="auto" Exactly which algorithm to use; hdbscan has variants specialised for different characteristics of the data. By default this is set - to `'auto'` which attempts to use a `KDTree` tree if possible, - otherwise it uses a `BallTree` tree. + to `'auto'` which attempts to use a :class:`~sklearn.neighbors.KDTree` tree if possible, + otherwise it uses a :class:`~sklearn.neighbors.BallTree` tree. If the `X` passed during `fit` is sparse or `metric` is invalid for - both `KDTree` and `BallTree`, then it resolves to use the `brute` + both :class:`~sklearn.neighbors.KDTree` and :class:`~sklearn.neighbors.BallTree`, then it resolves to use the `"brute"` algorithm. Available algorithms: @@ -301,8 +301,8 @@ class HDBSCAN(ClusterMixin, BaseEstimator): - `'balltree'` leaf_size : int, default=40 - Leaf size for trees responsible for fast nearest neighbour queries. A - large dataset size and small leaf_size may induce excessive memory + Leaf size for trees responsible for fast nearest neighbour queries when a KDTree or a BallTree are used as algorithms. A + large dataset size and small `leaf_size` may induce excessive memory usage. If you are running out of memory consider increasing the `leaf_size` parameter. Ignored for `algorithm=brute`. @@ -312,14 +312,12 @@ class HDBSCAN(ClusterMixin, BaseEstimator): `-1` means using all processors. See :term:`Glossary ` for more details. - cluster_selection_method : str, default='eom' + cluster_selection_method : {"eom", "leaf"}, default="eom" The method used to select clusters from the condensed tree. The standard approach for HDBSCAN* is to use an Excess of Mass algorithm to find the most persistent clusters. Alternatively you can instead select the clusters at the leaves of the tree -- this provides the - most fine grained and homogeneous clusters. Options are: - - `eom` - - `leaf` + most fine grained and homogeneous clusters. allow_single_cluster : bool, default=False By default HDBSCAN* will not produce a single cluster, setting this @@ -347,11 +345,11 @@ class HDBSCAN(ClusterMixin, BaseEstimator): Attributes ---------- - labels_ : ndarray, shape (n_samples, ) - Cluster labels for each point in the dataset given to fit(). + labels_ : ndarray of shape (n_samples,) + Cluster labels for each point in the dataset given to :term:`fit`. Noisy samples are given the label -1. - probabilities_ : ndarray, shape (n_samples, ) + probabilities_ : ndarray of shape (n_samples,) The strength with which each sample is a member of its assigned cluster. 
Noise points have probability zero; points in clusters have values assigned proportional to the degree that they @@ -414,7 +412,7 @@ class HDBSCAN(ClusterMixin, BaseEstimator): Interval(Real, left=0, right=None, closed="left") ], "max_cluster_size": [Interval(Integral, left=0, right=None, closed="left")], - "metric": [StrOptions(set(FAST_METRICS + ["precomputed"])), callable], + "metric": [StrOptions(set(FAST_METRICS + {"precomputed"})), callable], "alpha": [Interval(Real, left=0, right=None, closed="neither")], "algorithm": [ StrOptions( @@ -465,16 +463,16 @@ def __init__( self.metric_params = metric_params def fit(self, X, y=None): - """Perform HDBSCAN clustering from features or distance matrix. + """Find clusters based on hierarchical density-based clustering. Parameters ---------- - X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ - array of shape (n_samples, n_samples) + X : {array-like, sparse matrix} of shape (n_samples, n_features), or \ + ndarray of shape (n_samples, n_samples) A feature array, or array of distances between samples if `metric='precomputed'`. - y : Ignored + y : None Ignored. Returns @@ -612,21 +610,21 @@ def fit(self, X, y=None): return self def fit_predict(self, X, y=None): - """Perform clustering on X and return cluster labels. + """Cluster X and return the associated cluster labels. Parameters ---------- - X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ - array of shape (n_samples, n_samples) + X : {array-like, sparse matrix} of shape (n_samples, n_features), or \ + ndarray of shape (n_samples, n_samples) A feature array, or array of distances between samples if `metric='precomputed'`. - y : Ignored + y : None Ignored. Returns ------- - y : ndarray, shape (n_samples, ) + y : ndarray of shape (n_samples,) Cluster labels. """ self.fit(X) From 1056cb0a80759b0aba6c6e0c9598af7aaf951ba5 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 14 Sep 2022 20:20:42 -0400 Subject: [PATCH 108/160] Further review feedback implemented --- sklearn/cluster/_hdbscan/_tree.pyx | 6 +- sklearn/cluster/_hdbscan/hdbscan.py | 227 +++++++++--------- .../cluster/_hdbscan/tests/test_hdbscan.py | 31 ++- 3 files changed, 147 insertions(+), 117 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_tree.pyx b/sklearn/cluster/_hdbscan/_tree.pyx index a7c3541cc9dcc..3d6bed3e8df34 100644 --- a/sklearn/cluster/_hdbscan/_tree.pyx +++ b/sklearn/cluster/_hdbscan/_tree.pyx @@ -574,7 +574,7 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability, cluster_selection_method='eom', allow_single_cluster=False, cluster_selection_epsilon=0.0, - max_cluster_size=0): + max_cluster_size=None): """Given a tree and stability dict, produce the cluster labels (and probabilities) for a flat clustering based on the chosen cluster selection method. @@ -599,7 +599,7 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability, cluster_selection_epsilon: float, optional (default 0.0) A distance threshold for cluster splits. - max_cluster_size: int, optional (default 0) + max_cluster_size: int, default=None The maximum size for clusters located by the EOM clusterer. Can be overridden by the cluster_selection_epsilon parameter in rare cases. 
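# Illustrative sketch (not from the patch) of the `max_cluster_size` behaviour the
# docstring above describes, assuming the in-progress HDBSCAN estimator exposing
# the same parameter: `None` leaves Excess-of-Mass selection unbounded, while an
# integer caps the size of the clusters it may return. The cap value below is
# illustrative only.
from sklearn.cluster import HDBSCAN
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=400, centers=[[0, 0], [5, 5]], random_state=0)
labels_unbounded = HDBSCAN(max_cluster_size=None).fit_predict(X_demo)
labels_capped = HDBSCAN(max_cluster_size=50).fit_predict(X_demo)
print(len(set(labels_unbounded)) - int(-1 in labels_unbounded))
print(len(set(labels_capped)) - int(-1 in labels_capped))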
@@ -643,7 +643,7 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability, num_points = np.max(tree[tree['child_size'] == 1]['child']) + 1 max_lambda = np.max(tree['lambda_val']) - if max_cluster_size <= 0: + if max_cluster_size is None: max_cluster_size = num_points + 1 # Set to a value that will never be triggered cluster_sizes = {child: child_size for child, child_size in zip(cluster_tree['child'], cluster_tree['child_size'])} diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index afad099026e7b..41b5f86a2bc4a 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -14,12 +14,12 @@ import numpy as np from scipy.sparse import csgraph, issparse -from sklearn.base import BaseEstimator, ClusterMixin -from sklearn.metrics import pairwise_distances -from sklearn.metrics._dist_metrics import DistanceMetric -from sklearn.neighbors import BallTree, KDTree, NearestNeighbors -from sklearn.utils._param_validation import Interval, StrOptions - +from ...base import BaseEstimator, ClusterMixin +from ...metrics import pairwise_distances +from ...metrics._dist_metrics import DistanceMetric +from ...neighbors import BallTree, KDTree, NearestNeighbors +from ...utils._param_validation import Interval, StrOptions +from ...utils.validation import _assert_all_finite from ._linkage import label, mst_linkage_core, mst_linkage_core_vector from ._reachability import mutual_reachability, sparse_mutual_reachability from ._tree import compute_stability, condense_tree, get_clusters, labelling_at_cut @@ -33,7 +33,7 @@ def _tree_to_labels( cluster_selection_method="eom", allow_single_cluster=False, cluster_selection_epsilon=0.0, - max_cluster_size=0, + max_cluster_size=None, ): """Converts a pretrained tree and cluster size into a set of labels and probabilities. @@ -80,13 +80,48 @@ def _hdbscan_brute( ) if issparse(distance_matrix): - return _hdbscan_sparse_distance_matrix( - distance_matrix, - min_samples, - alpha, - **metric_params, + # Compute sparse mutual reachability graph + # if max_dist > 0, max distance to use when the reachability is infinite + max_dist = metric_params.get("max_dist", 0.0) + mutual_reachability_ = sparse_mutual_reachability( + X.tolil(), min_points=min_samples, max_dist=max_dist, alpha=alpha ) + # Check connected component on mutual reachability + # If more than one component, it means that even if the distance matrix X + # has one component, there exists with less than `min_samples` neighbors + if ( + csgraph.connected_components( + mutual_reachability_, directed=False, return_labels=False + ) + > 1 + ): + raise ValueError( + f"There exists points with fewer than {min_samples} neighbors. Ensure" + " your distance matrix has non-zero values for at least" + f" `min_sample`={min_samples} neighbors for each points (i.e. K-nn" + " graph), or specify a `max_dist` in `metric_params` to use when" + " distances are missing." 
+ ) + + # Compute the minimum spanning tree for the sparse graph + sparse_min_spanning_tree = csgraph.minimum_spanning_tree(mutual_reachability_) + + edges_sorted_indices = np.argsort(sparse_min_spanning_tree.data) + rows, cols = sparse_min_spanning_tree.nonzero() + min_spanning_tree = np.vstack( + ( + rows[edges_sorted_indices], + cols[edges_sorted_indices], + sparse_min_spanning_tree.data[edges_sorted_indices], + ), + ).T + # Convert edge list into standard hierarchical clustering format + single_linkage_tree = label(min_spanning_tree) + + return single_linkage_tree + + # distance_matrix is dense mutual_reachability_ = mutual_reachability(distance_matrix, min_samples, alpha) min_spanning_tree = mst_linkage_core(mutual_reachability_) @@ -104,52 +139,6 @@ def _hdbscan_brute( return _process_mst(min_spanning_tree) -def _hdbscan_sparse_distance_matrix( - X, - min_samples=5, - alpha=1.0, - **metric_params, -): - # Compute sparse mutual reachability graph - # if max_dist > 0, max distance to use when the reachability is infinite - max_dist = metric_params.get("max_dist", 0.0) - mutual_reachability_ = sparse_mutual_reachability( - X.tolil(), min_points=min_samples, max_dist=max_dist, alpha=alpha - ) - # Check connected component on mutual reachability - # If more than one component, it means that even if the distance matrix X - # has one component, there exists with less than `min_samples` neighbors - if ( - csgraph.connected_components( - mutual_reachability_, directed=False, return_labels=False - ) - > 1 - ): - raise ValueError( - "There exists points with fewer than %s neighbors. " - "Ensure your distance matrix has non-zero values for " - "at least `min_sample`=%s neighbors for each points (i.e. K-nn graph), " - "or specify a `max_dist` in `metric_params` to use when distances " - "are missing." % (min_samples, min_samples) - ) - - # Compute the minimum spanning tree for the sparse graph - sparse_min_spanning_tree = csgraph.minimum_spanning_tree(mutual_reachability_) - - # Convert the graph to scipy cluster array format - nonzeros = sparse_min_spanning_tree.nonzero() - nonzero_vals = sparse_min_spanning_tree[nonzeros] - min_spanning_tree = np.vstack(nonzeros + (nonzero_vals,)).T - - # Sort edges of the min_spanning_tree by weight - min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :][0] - - # Convert edge list into standard hierarchical clustering format - single_linkage_tree = label(min_spanning_tree) - - return single_linkage_tree - - def _hdbscan_prims( X, algo, @@ -264,47 +253,48 @@ class HDBSCAN(ClusterMixin, BaseEstimator): A distance threshold. Clusters below this value will be merged. See [5]_ for more information. - max_cluster_size : int, default=0 - A limit to the size of clusters returned by the `eom` cluster selection - algorithm. Has no effect if `cluster_selection_method=leaf`. Can be - overridden in rare cases by a high value for - `cluster_selection_epsilon`. + max_cluster_size : int, default=None + A limit to the size of clusters returned by the `"eom"` cluster + selection algorithm. There is no limit when `max_cluster_size=None`. + Has no effect if `cluster_selection_method="leaf"`. metric : str or callable, default='euclidean' The metric to use when calculating distance between instances in a feature array. - If metric is a string or callable, it must be one of - the options allowed by :func:`~sklearn.metrics.pairwise.pairwise_distances` for its - metric parameter. 
+ the options allowed by :func:`~sklearn.metrics.pairwise.pairwise_distances` + for its metric parameter. - If metric is "precomputed", X is assumed to be a distance matrix and must be square. + metric_params : dict, default=None + Arguments passed to the distance metric. + alpha : float, default=1.0 A distance scaling parameter as used in robust single linkage. See [3]_ for more information. algorithm : {"auto", "brute", "kdtree", "balltree"}, default="auto" - Exactly which algorithm to use; hdbscan has variants specialised - for different characteristics of the data. By default this is set - to `'auto'` which attempts to use a :class:`~sklearn.neighbors.KDTree` tree if possible, - otherwise it uses a :class:`~sklearn.neighbors.BallTree` tree. + Exactly which algorithm to use for computing core distances; By default + this is set to `"auto"` which attempts to use a + :class:`~sklearn.neighbors.KDTree` tree if possible, otherwise it uses + a :class:`~sklearn.neighbors.BallTree` tree. Both `"KDTree"` and + `"BallTree"` algorithms use the + :class:`~sklearn.neighbors.NearestNeighbors` estimator. If the `X` passed during `fit` is sparse or `metric` is invalid for - both :class:`~sklearn.neighbors.KDTree` and :class:`~sklearn.neighbors.BallTree`, then it resolves to use the `"brute"` - algorithm. - - Available algorithms: - - `'brute'` - - `'kdtree'` - - `'balltree'` + both :class:`~sklearn.neighbors.KDTree` and + :class:`~sklearn.neighbors.BallTree`, then it resolves to use the + `"brute"` algorithm. leaf_size : int, default=40 - Leaf size for trees responsible for fast nearest neighbour queries when a KDTree or a BallTree are used as algorithms. A - large dataset size and small `leaf_size` may induce excessive memory - usage. If you are running out of memory consider increasing the - `leaf_size` parameter. Ignored for `algorithm=brute`. + Leaf size for trees responsible for fast nearest neighbour queries when + a KDTree or a BallTree are used as core-distance algorithms. A large + dataset size and small `leaf_size` may induce excessive memory usage. + If you are running out of memory consider increasing the `leaf_size` + parameter. Ignored for `algorithm="brute"`. n_jobs : int, default=None Number of jobs to run in parallel to calculate distances. @@ -314,10 +304,10 @@ class HDBSCAN(ClusterMixin, BaseEstimator): cluster_selection_method : {"eom", "leaf"}, default="eom" The method used to select clusters from the condensed tree. The - standard approach for HDBSCAN* is to use an Excess of Mass algorithm - to find the most persistent clusters. Alternatively you can instead - select the clusters at the leaves of the tree -- this provides the - most fine grained and homogeneous clusters. + standard approach for HDBSCAN* is to use an Excess of Mass (`"eom"`) + algorithm to find the most persistent clusters. Alternatively you can + instead select the clusters at the leaves of the tree -- this provides + the most fine grained and homogeneous clusters. allow_single_cluster : bool, default=False By default HDBSCAN* will not produce a single cluster, setting this @@ -340,9 +330,6 @@ class HDBSCAN(ClusterMixin, BaseEstimator): depend on a euclidean metric. - `"both"`which computes and stores both forms of centers. - metric_params : dict, default=None - Arguments passed to the distance metric. - Attributes ---------- labels_ : ndarray of shape (n_samples,) @@ -362,6 +349,25 @@ class HDBSCAN(ClusterMixin, BaseEstimator): Names of features seen during :term:`fit`. 
Defined only when `X` has feature names that are all strings. + centroids_ : ndarray of shape (n_clusters, n_features) + A collection containing the centroid of each cluster calculated under + the standard euclidean metric. The centroids may fall "outside" their + respective clusters if the clusters themselves are non-convex. + + Note that `n_clusters` only counts non-trivial clusters. That is to + say, the `-1` label for the virtual noise cluster is excluded. + + medoids_ : ndarray of shape (n_clusters, n_features) + A collection containing the medoid of each cluster calculated under + the whichever metric was passed to the `metric` parameter. The + medoids are points in the original cluster which minimize the average + distance to all other points in that cluster under the chosen metric. + These can be thought of as the result of projecting the `metric`-based + centroid back onto the cluster. + + Note that `n_clusters` only counts non-trivial clusters. That is to + say, the `-1` label for the virtual noise cluster is excluded. + See Also -------- DBSCAN : Density-Based Spatial Clustering of Applications @@ -411,8 +417,12 @@ class HDBSCAN(ClusterMixin, BaseEstimator): "cluster_selection_epsilon": [ Interval(Real, left=0, right=None, closed="left") ], - "max_cluster_size": [Interval(Integral, left=0, right=None, closed="left")], - "metric": [StrOptions(set(FAST_METRICS + {"precomputed"})), callable], + "max_cluster_size": [ + None, + Interval(Integral, left=1, right=None, closed="left"), + ], + "metric": [StrOptions(set(FAST_METRICS + ["precomputed"])), callable], + "metric_params": [dict, None], "alpha": [Interval(Real, left=0, right=None, closed="neither")], "algorithm": [ StrOptions( @@ -429,7 +439,6 @@ class HDBSCAN(ClusterMixin, BaseEstimator): "cluster_selection_method": [StrOptions({"eom", "leaf"})], "allow_single_cluster": ["boolean"], "store_centers": [None, StrOptions({"centroid", "medoid", "both"})], - "metric_params": [dict, None], } def __init__( @@ -437,8 +446,9 @@ def __init__( min_cluster_size=5, min_samples=None, cluster_selection_epsilon=0.0, - max_cluster_size=0, + max_cluster_size=None, metric="euclidean", + metric_params=None, alpha=1.0, algorithm="auto", leaf_size=40, @@ -446,7 +456,6 @@ def __init__( cluster_selection_method="eom", allow_single_cluster=False, store_centers=None, - metric_params=None, ): self.min_cluster_size = min_cluster_size self.min_samples = min_samples @@ -454,13 +463,13 @@ def __init__( self.max_cluster_size = max_cluster_size self.cluster_selection_epsilon = cluster_selection_epsilon self.metric = metric + self.metric_params = metric_params self.algorithm = algorithm self.leaf_size = leaf_size self.n_jobs = n_jobs self.cluster_selection_method = cluster_selection_method self.allow_single_cluster = allow_single_cluster self.store_centers = store_centers - self.metric_params = metric_params def fit(self, X, y=None): """Find clusters based on hierarchical density-based clustering. 
@@ -488,10 +497,11 @@ def fit(self, X, y=None): X, accept_sparse="csr", force_all_finite=False, dtype=np.float64 ) self._raw_data = X - - all_finite = ( - np.all(np.isfinite(X.data)) if issparse(X) else np.all(np.isfinite(X)) - ) + all_finite = True + try: + _assert_all_finite(X.data if issparse(X) else X) + except ValueError: + all_finite = False if not all_finite: # Pass only the purely finite indices into hdbscan @@ -524,7 +534,7 @@ def fit(self, X, y=None): self.min_cluster_size if self.min_samples is None else self.min_samples ) - func = None + mst_func = None kwargs = dict( X=X, algo="kd_tree", @@ -535,12 +545,12 @@ def fit(self, X, y=None): n_jobs=self.n_jobs, **self._metric_params, ) - if "kdtree" in self.algorithm and self.metric not in KDTree.valid_metrics: + if self.algorithm == "kdtree" and self.metric not in KDTree.valid_metrics: raise ValueError( f"{self.metric} is not a valid metric for a KDTree-based algorithm." " Please select a different metric." ) - elif "balltree" in self.algorithm and self.metric not in BallTree.valid_metrics: + elif self.algorithm == "balltree" and self.metric not in BallTree.valid_metrics: raise ValueError( f"{self.metric} is not a valid metric for a BallTree-based algorithm." " Please select a different metric." @@ -555,29 +565,29 @@ def fit(self, X, y=None): raise ValueError("Sparse data matrices only support algorithm `brute`.") if self.algorithm == "brute": - func = _hdbscan_brute + mst_func = _hdbscan_brute for key in ("algo", "leaf_size"): kwargs.pop(key, None) elif self.algorithm == "kdtree": - func = _hdbscan_prims + mst_func = _hdbscan_prims elif self.algorithm == "balltree": - func = _hdbscan_prims + mst_func = _hdbscan_prims kwargs["algo"] = "ball_tree" else: if issparse(X) or self.metric not in FAST_METRICS: # We can't do much with sparse matrices ... 
- func = _hdbscan_brute + mst_func = _hdbscan_brute for key in ("algo", "leaf_size"): kwargs.pop(key, None) elif self.metric in KDTree.valid_metrics: # TODO: Benchmark KD vs Ball Tree efficacy - func = _hdbscan_prims + mst_func = _hdbscan_prims else: # Metric is a valid BallTree metric - func = _hdbscan_prims + mst_func = _hdbscan_prims kwargs["algo"] = "ball_tree" - single_linkage_tree = func(**kwargs) + single_linkage_tree = mst_func(**kwargs) ( self.labels_, @@ -631,7 +641,8 @@ def fit_predict(self, X, y=None): return self.labels_ def _weighted_cluster_center(self, X): - n_clusters = len(set(self.labels_)) + # Number of non-noise clusters + n_clusters = len(set(self.labels_)) - int(-1 in set(self.labels_)) mask = np.empty((X.shape[0],), dtype=np.bool_) make_centroids = self.store_centers in ("centroid", "both") make_medoids = self.store_centers in ("medoid", "both") diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index 801eea65fd271..e1c010288e002 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -223,12 +223,10 @@ def test_hdbscan_centers(): medoid = hdb.centroids_[idx] assert_array_almost_equal(medoid, center, decimal=1) - -def test_hdbscan_no_centroid_medoid_for_noise(): - with pytest.raises(ValueError): - HDBSCAN(store_centers="centroid").fit(X) - with pytest.raises(ValueError): - HDBSCAN(store_centers="medoid").fit(X) + # Ensure that nothing is done for noise + hdb = HDBSCAN(store_centers="both", min_cluster_size=X.shape[0]).fit(X) + assert hdb.centroids_.shape[0] == 0 + assert hdb.medoids_.shape[0] == 0 def test_hdbscan_allow_single_cluster_with_epsilon(): @@ -290,3 +288,24 @@ def test_hdbscan_sparse_distances_too_few_nonzero(): msg = "There exists points with fewer than" with pytest.raises(ValueError, match=msg): HDBSCAN(metric="precomputed").fit(X) + + +def test_hdbscan_tree_invalid_metric(): + metric_callable = lambda x: x + msg = ( + ".* is not a valid metric for a .*-based algorithm\\. Please select a different" + " metric\\." 
+ ) + + # Callables are not supported for either + with pytest.raises(ValueError, match=msg): + HDBSCAN(algorithm="kdtree", metric=metric_callable).fit(X) + with pytest.raises(ValueError, match=msg): + HDBSCAN(algorithm="balltree", metric=metric_callable).fit(X) + + # The set of valid metrics for KDTree at the time of writing this test is a + # strict subset of those supported in BallTree + metrics_not_kd = list(set(BallTree.valid_metrics) - set(KDTree.valid_metrics)) + if len(metrics_not_kd) > 0: + with pytest.raises(ValueError, match=msg): + HDBSCAN(algorithm="kdtree", metric=metrics_not_kd[0]).fit(X) From 39b3e5a285a0622afa2b7743c19a2fe56d3d9650 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 14 Sep 2022 20:31:50 -0400 Subject: [PATCH 109/160] Updated tests with review feedback --- .../cluster/_hdbscan/tests/test_hdbscan.py | 37 ++++++++++--------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index e1c010288e002..fd1fbd1759393 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -14,23 +14,23 @@ from sklearn.neighbors import BallTree, KDTree from sklearn.preprocessing import StandardScaler from sklearn.utils import shuffle -from sklearn.utils._testing import assert_array_almost_equal +from sklearn.utils._testing import assert_allclose, assert_array_equal n_clusters_true = 3 X, y = make_blobs(n_samples=200, random_state=10) X, y = shuffle(X, y, random_state=7) X = StandardScaler().fit_transform(X) -X_missing_data = X.copy() -X_missing_data[0] = [np.nan, 1] -X_missing_data[5] = [np.nan, np.nan] - -def test_missing_data(): +@pytest.mark.parametrize("missing_value", [np.inf, np.nan]) +def test_missing_data(missing_value): """ Tests if nan data are treated as infinite distance from all other points and assigned to -1 cluster. 
""" + X_missing_data = X.copy() + X_missing_data[0] = [missing_value, 1] + X_missing_data[5] = [missing_value, missing_value] model = HDBSCAN().fit(X_missing_data) assert model.labels_[0] == -1 assert model.labels_[5] == -1 @@ -39,7 +39,7 @@ def test_missing_data(): assert model.probabilities_[5] == 0 clean_indices = list(range(1, 5)) + list(range(6, 200)) clean_model = HDBSCAN().fit(X_missing_data[clean_indices]) - assert np.allclose(clean_model.labels_, model.labels_[clean_indices]) + assert_array_equal(clean_model.labels_, model.labels_[clean_indices]) def test_hdbscan_distance_matrix(): @@ -180,11 +180,6 @@ def test_hdbscan_callable_metric(): assert n_clusters == n_clusters_true -def test_hdbscan_input_lists(): - X = [[1.0, 2.0], [3.0, 4.0]] - HDBSCAN(min_samples=1).fit(X) - - @pytest.mark.parametrize("tree", ["kd", "ball"]) def test_hdbscan_precomputed_non_brute(tree): hdb = HDBSCAN(metric="precomputed", algorithm=f"prims_{tree}tree") @@ -193,7 +188,6 @@ def test_hdbscan_precomputed_non_brute(tree): def test_hdbscan_sparse(): - sparse_X = sparse.csr_matrix(X) labels = HDBSCAN().fit(sparse_X).labels_ @@ -218,10 +212,10 @@ def test_hdbscan_centers(): for idx, center in enumerate(centers): centroid = hdb.centroids_[idx] - assert_array_almost_equal(centroid, center, decimal=1) + assert_allclose(center, centroid, rtol=1, atol=0.05) medoid = hdb.centroids_[idx] - assert_array_almost_equal(medoid, center, decimal=1) + assert_allclose(center, medoid, rtol=1, atol=0.05) # Ensure that nothing is done for noise hdb = HDBSCAN(store_centers="both", min_cluster_size=X.shape[0]).fit(X) @@ -277,9 +271,16 @@ def test_hdbscan_better_than_dbscan(): assert n_clusters == 4 -@pytest.mark.parametrize("X", [np.array([[1, np.inf], [np.inf, 1]]), [[1, 2], [2, 1]]]) -def test_hdbscan_precomputed_array_like(X): - HDBSCAN(metric="precomputed").fit(X) +@pytest.mark.parametrize( + "kwargs, X", + [ + ({"metric": "precomputed"}, np.array([[1, np.inf], [np.inf, 1]])), + ({"metric": "precomputed"}, [[1, 2], [2, 1]]), + ({"min_samples": 1}, [[1, 2], [3, 4]]), + ], +) +def test_hdbscan_usable_inputs(X, kwargs): + HDBSCAN(**kwargs).fit(X) def test_hdbscan_sparse_distances_too_few_nonzero(): From 5c42b0d83bf58448f80b92c714e46498c9cb6bc4 Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Thu, 15 Sep 2022 15:31:22 -0400 Subject: [PATCH 110/160] Apply suggestions from code review Co-authored-by: Guillaume Lemaitre --- sklearn/cluster/_hdbscan/_reachability.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index c643ca80fad7d..72f740d9e3e8e 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -20,7 +20,7 @@ def mutual_reachability(distance_matrix, min_points=5, alpha=None): Parameters ---------- - distance_matrix : ndarray, shape (n_samples, n_samples) + distance_matrix : ndarray of shape (n_samples, n_samples) Array of distances between samples. min_points : int, default=5 @@ -46,6 +46,9 @@ def mutual_reachability(distance_matrix, min_points=5, alpha=None): size = distance_matrix.shape[0] min_points = min(size - 1, min_points) try: + # Compute the core distances for all samples `x_p` corresponding + # to the distance of the k-th farthest neighbours (including + # `x_p`). 
core_distances = np.partition(distance_matrix, min_points, axis=0)[min_points] From aa999f5c7918be80dab92e496e7e76b24091b5d4 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 15 Sep 2022 15:32:15 -0400 Subject: [PATCH 111/160] Renamed mst functions --- sklearn/cluster/_hdbscan/_linkage.pyx | 6 +++--- sklearn/cluster/_hdbscan/_reachability.pyx | 2 +- sklearn/cluster/_hdbscan/hdbscan.py | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx index 919e2713b0a25..057a9929069c6 100644 --- a/sklearn/cluster/_hdbscan/_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_linkage.pyx @@ -8,10 +8,10 @@ import cython from libc.float cimport DBL_MAX -from sklearn.metrics._dist_metrics cimport DistanceMetric +from ...metrics._dist_metrics cimport DistanceMetric -cpdef cnp.ndarray[cnp.double_t, ndim=2] mst_linkage_core( +cpdef cnp.ndarray[cnp.double_t, ndim=2] mst_from_distance_matrix( cnp.ndarray[cnp.double_t, ndim=2] distance_matrix ): @@ -52,7 +52,7 @@ cpdef cnp.ndarray[cnp.double_t, ndim=2] mst_linkage_core( return result -cpdef cnp.ndarray[cnp.double_t, ndim=2] mst_linkage_core_vector( +cpdef cnp.ndarray[cnp.double_t, ndim=2] mst_from_data_matrix( cnp.ndarray[cnp.double_t, ndim=2, mode='c'] raw_data, cnp.ndarray[cnp.double_t, ndim=1, mode='c'] core_distances, DistanceMetric dist_metric, diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index c643ca80fad7d..e08c6bc3c98ba 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -11,7 +11,7 @@ import gc from scipy.sparse import lil_matrix as sparse_matrix from scipy.spatial.distance import pdist, squareform -from sklearn.neighbors import BallTree, KDTree +from ...neighbors import BallTree, KDTree def mutual_reachability(distance_matrix, min_points=5, alpha=None): diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 41b5f86a2bc4a..85e6ee11f4b49 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -20,7 +20,7 @@ from ...neighbors import BallTree, KDTree, NearestNeighbors from ...utils._param_validation import Interval, StrOptions from ...utils.validation import _assert_all_finite -from ._linkage import label, mst_linkage_core, mst_linkage_core_vector +from ._linkage import label, mst_from_distance_matrix, mst_from_data_matrix from ._reachability import mutual_reachability, sparse_mutual_reachability from ._tree import compute_stability, condense_tree, get_clusters, labelling_at_cut @@ -124,7 +124,7 @@ def _hdbscan_brute( # distance_matrix is dense mutual_reachability_ = mutual_reachability(distance_matrix, min_samples, alpha) - min_spanning_tree = mst_linkage_core(mutual_reachability_) + min_spanning_tree = mst_from_distance_matrix(mutual_reachability_) # Warn if the MST couldn't be constructed around the missing distances if np.isinf(min_spanning_tree.T[2]).any(): @@ -167,8 +167,8 @@ def _hdbscan_prims( core_distances = np.ascontiguousarray(neighbors_distances[:, -1]) dist_metric = DistanceMetric.get_metric(metric, **metric_params) - # Mutual reachability distance is implicit in mst_linkage_core_vector - min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha) + # Mutual reachability distance is implicit in mst_from_data_matrix + min_spanning_tree = mst_from_data_matrix(X, core_distances, dist_metric, alpha) return _process_mst(min_spanning_tree) From 
4860a7faf2486ef696f798bbfa28c828fe03c9d3 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 15 Sep 2022 16:56:21 -0400 Subject: [PATCH 112/160] Refactored _reachability.pyx --- sklearn/cluster/_hdbscan/_reachability.pyx | 123 +++++++++--------- sklearn/cluster/_hdbscan/hdbscan.py | 17 ++- .../cluster/_hdbscan/tests/test_hdbscan.py | 11 +- 3 files changed, 83 insertions(+), 68 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index 2828368d2ccaa..2ff66445d83cd 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -3,33 +3,32 @@ # License: 3-clause BSD import numpy as np - -cimport numpy as np +cimport numpy as cnp import gc -from scipy.sparse import lil_matrix as sparse_matrix +from scipy.sparse import issparse from scipy.spatial.distance import pdist, squareform from ...neighbors import BallTree, KDTree - -def mutual_reachability(distance_matrix, min_points=5, alpha=None): +def mutual_reachability(distance_matrix, min_points=5, max_dist=0.0): """Compute the weighted adjacency matrix of the mutual reachability graph of a distance matrix. Parameters ---------- - distance_matrix : ndarray of shape (n_samples, n_samples) + distance_matrix : {ndarray or sparse matrix} of shape (n_samples, n_samples) Array of distances between samples. min_points : int, default=5 The number of points in a neighbourhood for a point to be considered a core point. - alpha : float, default=None - A distance scaling parameter as used in robust single linkage. This - divides the distances when calculating mutual reachability. + max_dist : float, default=0.0 + The distance which `np.inf` is replaced with. When the true mutual- + reachability distance is measured to be infinite, it is instead + truncated to `max_dist`. Returns ------- @@ -43,64 +42,66 @@ def mutual_reachability(distance_matrix, min_points=5, alpha=None): In Pacific-Asia Conference on Knowledge Discovery and Data Mining (pp. 160-172). Springer Berlin Heidelberg. """ - size = distance_matrix.shape[0] - min_points = min(size - 1, min_points) - try: - # Compute the core distances for all samples `x_p` corresponding - # to the distance of the k-th farthest neighbours (including - # `x_p`). 
- core_distances = np.partition(distance_matrix, - min_points, - axis=0)[min_points] - except AttributeError: - core_distances = np.sort(distance_matrix, - axis=0)[min_points] - - if alpha is not None: - distance_matrix = distance_matrix / alpha - - stage1 = np.where(core_distances > distance_matrix, - core_distances, distance_matrix) - result = np.where(core_distances > stage1.T, - core_distances.T, stage1.T).T - return result - - -cpdef sparse_mutual_reachability(object lil_matrix, np.intp_t min_points=5, - float alpha=1.0, float max_dist=0.): - - cdef np.intp_t i - cdef np.intp_t j - cdef np.intp_t n - cdef np.double_t mr_dist - cdef list sorted_row_data - cdef np.ndarray[dtype=np.double_t, ndim=1] core_distance - cdef np.ndarray[dtype=np.int32_t, ndim=1] nz_row_data - cdef np.ndarray[dtype=np.int32_t, ndim=1] nz_col_data - - result = sparse_matrix(lil_matrix.shape) - core_distance = np.empty(lil_matrix.shape[0], dtype=np.double) - - for i in range(lil_matrix.shape[0]): - sorted_row_data = sorted(lil_matrix.data[i]) - if min_points - 1 < len(sorted_row_data): - core_distance[i] = sorted_row_data[min_points - 1] + if issparse(distance_matrix): + return _sparse_mutual_reachability( + distance_matrix, + min_points=min_points, + max_dist=max_dist + ) + return _dense_mutual_reachability(distance_matrix, min_points=min_points) + +cdef _dense_mutual_reachability(cnp.ndarray distance_matrix, min_points=5): + cdef cnp.intp_t i, j, n_samples = distance_matrix.shape[0] + + # Account for index offset + min_points -= 1 + + # Compute the core distances for all samples `x_p` corresponding + # to the distance of the k-th farthest neighbours (including + # `x_p`). + core_distances = np.partition( + distance_matrix, + min_points, + axis=0, + )[min_points] + + for i in range(n_samples): + for j in range(n_samples): + mr_dist = max(core_distances[i], core_distances[j], distance_matrix[i, j]) + distance_matrix[i, j] = mr_dist + return distance_matrix + +# Assumes LIL format. +# TODO: Rewrite for CSR. +cdef _sparse_mutual_reachability( + object distance_matrix, + cnp.intp_t min_points=5, + cnp.float64_t max_dist=0. 
+): + cdef cnp.intp_t i, j, n + cdef cnp.float64_t mr_dist + cdef cnp.ndarray[dtype=cnp.float64_t, ndim=1] core_distances + cdef cnp.ndarray[dtype=cnp.int32_t, ndim=1] nz_row_data + cdef cnp.ndarray[dtype=cnp.int32_t, ndim=1] nz_col_data + core_distances = np.empty(distance_matrix.shape[0], dtype=np.float64) + + # Account for index offset + min_points -= 1 + for i in range(distance_matrix.shape[0]): + if min_points < len(distance_matrix.data[i]): + core_distances[i] = np.partition(distance_matrix.data[i], min_points)[min_points] else: - core_distance[i] = np.infty + core_distances[i] = np.infty - if alpha != 1.0: - lil_matrix = lil_matrix / alpha - - nz_row_data, nz_col_data = lil_matrix.nonzero() + nz_row_data, nz_col_data = distance_matrix.nonzero() for n in range(nz_row_data.shape[0]): i = nz_row_data[n] j = nz_col_data[n] - - mr_dist = max(core_distance[i], core_distance[j], lil_matrix[i, j]) + mr_dist = max(core_distances[i], core_distances[j], distance_matrix[i, j]) if np.isfinite(mr_dist): - result[i, j] = mr_dist + distance_matrix[i, j] = mr_dist elif max_dist > 0: - result[i, j] = max_dist + distance_matrix[i, j] = max_dist - return result.tocsr() + return distance_matrix.tocsr() diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 85e6ee11f4b49..1d17bf1edba8c 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -21,7 +21,7 @@ from ...utils._param_validation import Interval, StrOptions from ...utils.validation import _assert_all_finite from ._linkage import label, mst_from_distance_matrix, mst_from_data_matrix -from ._reachability import mutual_reachability, sparse_mutual_reachability +from ._reachability import mutual_reachability from ._tree import compute_stability, condense_tree, get_clusters, labelling_at_cut FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics @@ -63,7 +63,7 @@ def _process_mst(min_spanning_tree): def _hdbscan_brute( X, min_samples=5, - alpha=1.0, + alpha=None, metric="euclidean", n_jobs=None, **metric_params, @@ -78,13 +78,15 @@ def _hdbscan_brute( distance_matrix = pairwise_distances( X, metric=metric, n_jobs=n_jobs, **metric_params ) + if alpha is not None: + distance_matrix = distance_matrix / alpha if issparse(distance_matrix): # Compute sparse mutual reachability graph # if max_dist > 0, max distance to use when the reachability is infinite max_dist = metric_params.get("max_dist", 0.0) - mutual_reachability_ = sparse_mutual_reachability( - X.tolil(), min_points=min_samples, max_dist=max_dist, alpha=alpha + mutual_reachability_ = mutual_reachability( + distance_matrix.tolil(), min_points=min_samples, max_dist=max_dist ) # Check connected component on mutual reachability # If more than one component, it means that even if the distance matrix X @@ -122,7 +124,7 @@ def _hdbscan_brute( return single_linkage_tree # distance_matrix is dense - mutual_reachability_ = mutual_reachability(distance_matrix, min_samples, alpha) + mutual_reachability_ = mutual_reachability(distance_matrix, min_samples) min_spanning_tree = mst_from_distance_matrix(mutual_reachability_) @@ -534,6 +536,11 @@ def fit(self, X, y=None): self.min_cluster_size if self.min_samples is None else self.min_samples ) + if self._min_samples > X.shape[0]: + raise ValueError( + f"min_samples ({self._min_samples}) must be at most the number of" + f" samples in X ({X.shape[0]})" + ) mst_func = None kwargs = dict( X=X, diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py 
b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index fd1fbd1759393..885a0be86d115 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -276,11 +276,11 @@ def test_hdbscan_better_than_dbscan(): [ ({"metric": "precomputed"}, np.array([[1, np.inf], [np.inf, 1]])), ({"metric": "precomputed"}, [[1, 2], [2, 1]]), - ({"min_samples": 1}, [[1, 2], [3, 4]]), + ({}, [[1, 2], [3, 4]]), ], ) def test_hdbscan_usable_inputs(X, kwargs): - HDBSCAN(**kwargs).fit(X) + HDBSCAN(min_samples=1, **kwargs).fit(X) def test_hdbscan_sparse_distances_too_few_nonzero(): @@ -310,3 +310,10 @@ def test_hdbscan_tree_invalid_metric(): if len(metrics_not_kd) > 0: with pytest.raises(ValueError, match=msg): HDBSCAN(algorithm="kdtree", metric=metrics_not_kd[0]).fit(X) + + +def test_hdbscan_too_many_min_samples(): + hdb = HDBSCAN(min_samples=len(X) + 1) + msg = r"min_samples (.*) must be at most" + with pytest.raises(ValueError, match=msg): + hdb.fit(X) From 2eff9cc417fa439259cf566b7f7fc92f7dafde0f Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 15 Sep 2022 17:33:29 -0400 Subject: [PATCH 113/160] Adjusted documentation --- sklearn/cluster/_hdbscan/_reachability.pyx | 6 ++++-- sklearn/cluster/_hdbscan/hdbscan.py | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index 2ff66445d83cd..18f429d73369b 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -14,11 +14,13 @@ from ...neighbors import BallTree, KDTree def mutual_reachability(distance_matrix, min_points=5, max_dist=0.0): """Compute the weighted adjacency matrix of the mutual reachability - graph of a distance matrix. + graph of a distance matrix. Note that computation is performed in-place for + `distance_matrix`. If out-of-place computation is required, pass a copy to + this function. Parameters ---------- - distance_matrix : {ndarray or sparse matrix} of shape (n_samples, n_samples) + distance_matrix : ndarray or sparse matrix of shape (n_samples, n_samples) Array of distances between samples. min_points : int, default=5 diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 1d17bf1edba8c..3927b61c2d913 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -123,7 +123,9 @@ def _hdbscan_brute( return single_linkage_tree - # distance_matrix is dense + # `distance_matrix` is dense at this point. + # Note that `distance_matrix` is manipulated in-place, however we do not + # need it for anything else past this point, hence the operation is safe. 
mutual_reachability_ = mutual_reachability(distance_matrix, min_samples) min_spanning_tree = mst_from_distance_matrix(mutual_reachability_) From 7a9b365ed7ae0d23b21863b90ba5fca314986a0c Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 15 Sep 2022 17:52:28 -0400 Subject: [PATCH 114/160] Cython cleanup for _reachability.pyx --- sklearn/cluster/_hdbscan/_reachability.pyx | 59 +++++++++++++--------- 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index 18f429d73369b..c097980faec2c 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -3,7 +3,9 @@ # License: 3-clause BSD import numpy as np +from cython.parallel cimport prange cimport numpy as cnp +from libc.math cimport isfinite import gc @@ -44,19 +46,27 @@ def mutual_reachability(distance_matrix, min_points=5, max_dist=0.0): In Pacific-Asia Conference on Knowledge Discovery and Data Mining (pp. 160-172). Springer Berlin Heidelberg. """ + # Account for index offset + min_points -= 1 + if issparse(distance_matrix): - return _sparse_mutual_reachability( + _sparse_mutual_reachability( distance_matrix, min_points=min_points, max_dist=max_dist ) - return _dense_mutual_reachability(distance_matrix, min_points=min_points) + return distance_matrix.tocsr() -cdef _dense_mutual_reachability(cnp.ndarray distance_matrix, min_points=5): - cdef cnp.intp_t i, j, n_samples = distance_matrix.shape[0] + _dense_mutual_reachability(distance_matrix, min_points=min_points) + return distance_matrix - # Account for index offset - min_points -= 1 +cdef _dense_mutual_reachability( + cnp.ndarray[dtype=cnp.float64_t, ndim=2] distance_matrix, + cnp.intp_t min_points=5 +): + cdef cnp.intp_t i, j, n_samples = distance_matrix.shape[0] + cdef cnp.float64_t mr_dist + cdef cnp.float64_t[:] core_distances # Compute the core distances for all samples `x_p` corresponding # to the distance of the k-th farthest neighbours (including @@ -67,11 +77,15 @@ cdef _dense_mutual_reachability(cnp.ndarray distance_matrix, min_points=5): axis=0, )[min_points] - for i in range(n_samples): - for j in range(n_samples): - mr_dist = max(core_distances[i], core_distances[j], distance_matrix[i, j]) - distance_matrix[i, j] = mr_dist - return distance_matrix + with nogil: + for i in range(n_samples): + for j in prange(n_samples): + mr_dist = max( + core_distances[i], + core_distances[j], + distance_matrix[i, j] + ) + distance_matrix[i, j] = mr_dist # Assumes LIL format. # TODO: Rewrite for CSR. @@ -80,30 +94,27 @@ cdef _sparse_mutual_reachability( cnp.intp_t min_points=5, cnp.float64_t max_dist=0. 
): - cdef cnp.intp_t i, j, n + cdef cnp.intp_t i, j, n, n_samples = distance_matrix.shape[0] cdef cnp.float64_t mr_dist - cdef cnp.ndarray[dtype=cnp.float64_t, ndim=1] core_distances - cdef cnp.ndarray[dtype=cnp.int32_t, ndim=1] nz_row_data - cdef cnp.ndarray[dtype=cnp.int32_t, ndim=1] nz_col_data - core_distances = np.empty(distance_matrix.shape[0], dtype=np.float64) + cdef cnp.float64_t[:] core_distances + cdef cnp.int32_t[:] nz_row_data, nz_col_data + core_distances = np.empty(n_samples, dtype=np.float64) - # Account for index offset - min_points -= 1 - for i in range(distance_matrix.shape[0]): + for i in range(n_samples): if min_points < len(distance_matrix.data[i]): - core_distances[i] = np.partition(distance_matrix.data[i], min_points)[min_points] + core_distances[i] = np.partition( + distance_matrix.data[i], + min_points + )[min_points] else: core_distances[i] = np.infty nz_row_data, nz_col_data = distance_matrix.nonzero() - for n in range(nz_row_data.shape[0]): i = nz_row_data[n] j = nz_col_data[n] mr_dist = max(core_distances[i], core_distances[j], distance_matrix[i, j]) - if np.isfinite(mr_dist): + if isfinite(mr_dist): distance_matrix[i, j] = mr_dist elif max_dist > 0: distance_matrix[i, j] = max_dist - - return distance_matrix.tocsr() From da44c832d917b42eaff4fa51bb9814347b7d44f0 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 15 Sep 2022 17:57:56 -0400 Subject: [PATCH 115/160] Improved docs --- sklearn/cluster/_hdbscan/_reachability.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index c097980faec2c..3b66920015200 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -23,7 +23,8 @@ def mutual_reachability(distance_matrix, min_points=5, max_dist=0.0): Parameters ---------- distance_matrix : ndarray or sparse matrix of shape (n_samples, n_samples) - Array of distances between samples. + Array of distances between samples. If sparse, the array must be in + `LIL` format. min_points : int, default=5 The number of points in a neighbourhood for a point to be considered @@ -36,7 +37,7 @@ def mutual_reachability(distance_matrix, min_points=5, max_dist=0.0): Returns ------- - mututal_reachability: ndarray, shape (n_samples, n_samples) + mututal_reachability: ndarray of shape (n_samples, n_samples) Weighted adjacency matrix of the mutual reachability graph. References From 26dad213d03e2ece44ff14f6f1c998131039350b Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Thu, 15 Sep 2022 17:58:44 -0400 Subject: [PATCH 116/160] Update sklearn/cluster/_hdbscan/hdbscan.py Co-authored-by: Guillaume Lemaitre --- sklearn/cluster/_hdbscan/hdbscan.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 3927b61c2d913..b909ac39364d9 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -400,8 +400,9 @@ class HDBSCAN(ClusterMixin, BaseEstimator): Sander, J., 2014. Density-Based Clustering Validation. In SDM (pp. 839-847). - .. [5] Malzer, C., & Baum, M. (2019). A Hybrid Approach To Hierarchical - Density-based Cluster Selection. arxiv preprint 1911.02282. + .. [5] :arxiv:`Malzer, C., & Baum, M. (2019). + "A Hybrid Approach To Hierarchical Density-based Cluster Selection." + <1911.02282>`. 
Examples -------- From 15595beb0588ba54aa19447e88566e3a9210ce20 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 15 Sep 2022 18:14:44 -0400 Subject: [PATCH 117/160] Minor cleanup --- sklearn/cluster/_hdbscan/hdbscan.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 3927b61c2d913..80e38010cf9d9 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -195,7 +195,7 @@ def remap_single_linkage_tree(tree, internal_to_raw, outliers): finite_count = len(internal_to_raw) outlier_count = len(outliers) - for i, (left, right, distance, size) in enumerate(tree): + for i, (left, right, *_) in enumerate(tree): if left < finite_count: tree[i, 0] = internal_to_raw[left] else: @@ -216,7 +216,7 @@ def remap_single_linkage_tree(tree, internal_to_raw, outliers): return tree -def get_finite_row_indices(matrix): +def _get_finite_row_indices(matrix): """ Returns the indices of the purely finite rows of a sparse matrix or dense ndarray @@ -226,7 +226,7 @@ def get_finite_row_indices(matrix): [i for i, row in enumerate(matrix.tolil().data) if np.all(np.isfinite(row))] ) else: - row_indices = np.where(np.isfinite(matrix).sum(axis=1) == matrix.shape[1])[0] + row_indices = np.isfinite(matrix.sum(axis=1)).nonzero()[0] return row_indices @@ -498,7 +498,10 @@ def fit(self, X, y=None): if self.metric != "precomputed": # Non-precomputed matrices may contain non-finite values. X = self._validate_data( - X, accept_sparse="csr", force_all_finite=False, dtype=np.float64 + X, + accept_sparse=["csr", "lil"], + force_all_finite=False, + dtype=np.float64, ) self._raw_data = X all_finite = True @@ -510,8 +513,10 @@ def fit(self, X, y=None): if not all_finite: # Pass only the purely finite indices into hdbscan # We will later assign all non-finite points to the - # background-1 cluster - finite_index = get_finite_row_indices(X) + # noise cluster (label=-1) + # TODO: Correctly propogate np.nan as a missing value, instead + # of relegating to noise as we currently do. 
+ finite_index = _get_finite_row_indices(X) X = X[finite_index] internal_to_raw = {x: y for x, y in enumerate(finite_index)} outliers = list(set(range(X.shape[0])) - set(finite_index)) @@ -519,7 +524,7 @@ def fit(self, X, y=None): # Handle sparse precomputed distance matrices separately X = self._validate_data( X, - accept_sparse="csr", + accept_sparse=["csr", "lil"], dtype=np.float64, ) else: @@ -589,7 +594,7 @@ def fit(self, X, y=None): for key in ("algo", "leaf_size"): kwargs.pop(key, None) elif self.metric in KDTree.valid_metrics: - # TODO: Benchmark KD vs Ball Tree efficacy + # TODO: Benchmark KD vs Ball Tree efficiency mst_func = _hdbscan_prims else: # Metric is a valid BallTree metric From 0b0fa0edf212b83d9acffd547c6923b622003027 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 15 Sep 2022 18:57:29 -0400 Subject: [PATCH 118/160] Minor refactor for propogating missing data --- sklearn/cluster/_hdbscan/hdbscan.py | 34 ++++++++---- .../cluster/_hdbscan/tests/test_hdbscan.py | 52 +++++++++++++------ 2 files changed, 60 insertions(+), 26 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 8fce5e200cd55..d71c018a596e1 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -177,7 +177,7 @@ def _hdbscan_prims( return _process_mst(min_spanning_tree) -def remap_single_linkage_tree(tree, internal_to_raw, outliers): +def remap_single_linkage_tree(tree, internal_to_raw, non_finite): """ Takes an internal single_linkage_tree structure and adds back in a set of points that were initially detected as non-finite and returns that new tree. @@ -194,7 +194,7 @@ def remap_single_linkage_tree(tree, internal_to_raw, outliers): """ finite_count = len(internal_to_raw) - outlier_count = len(outliers) + outlier_count = len(non_finite) for i, (left, right, *_) in enumerate(tree): if left < finite_count: tree[i, 0] = internal_to_raw[left] @@ -205,10 +205,10 @@ def remap_single_linkage_tree(tree, internal_to_raw, outliers): else: tree[i, 1] = right + outlier_count - outlier_tree = np.zeros((len(outliers), 4)) + outlier_tree = np.zeros((len(non_finite), 4)) last_cluster_id = tree[tree.shape[0] - 1][0:2].max() last_cluster_size = tree[tree.shape[0] - 1][3] - for i, outlier in enumerate(outliers): + for i, outlier in enumerate(non_finite): outlier_tree[i] = (outlier, last_cluster_id + 1, np.inf, last_cluster_size + 1) last_cluster_id += 1 last_cluster_size += 1 @@ -226,7 +226,7 @@ def _get_finite_row_indices(matrix): [i for i, row in enumerate(matrix.tolil().data) if np.all(np.isfinite(row))] ) else: - row_indices = np.isfinite(matrix.sum(axis=1)).nonzero()[0] + (row_indices,) = np.isfinite(matrix.sum(axis=1)).nonzero() return row_indices @@ -515,12 +515,22 @@ def fit(self, X, y=None): # Pass only the purely finite indices into hdbscan # We will later assign all non-finite points to the # noise cluster (label=-1) - # TODO: Correctly propogate np.nan as a missing value, instead - # of relegating to noise as we currently do. + + # Reduce X to make the checks for missing/outlier samples more + # convenient. 
+ reduced_X = X.sum(axis=1) + + # Samples with missing data are denoted by the presence of + # `np.nan` + missing = list(np.isnan(reduced_X).nonzero()[0]) + + # Outlier samples are denoted by the presence of `np.inf` + outliers = list(np.isinf(reduced_X).nonzero()[0]) + + # Continue with only finite samples finite_index = _get_finite_row_indices(X) - X = X[finite_index] internal_to_raw = {x: y for x, y in enumerate(finite_index)} - outliers = list(set(range(X.shape[0])) - set(finite_index)) + X = X[finite_index] elif issparse(X): # Handle sparse precomputed distance matrices separately X = self._validate_data( @@ -620,14 +630,18 @@ def fit(self, X, y=None): # remap indices to align with original data in the case of # non-finite entries. self._single_linkage_tree_ = remap_single_linkage_tree( - self._single_linkage_tree_, internal_to_raw, outliers + self._single_linkage_tree_, + internal_to_raw, + non_finite=outliers + missing, ) new_labels = np.full(self._raw_data.shape[0], -1) new_labels[finite_index] = self.labels_ + new_labels[missing] = -2 self.labels_ = new_labels new_probabilities = np.zeros(self._raw_data.shape[0]) new_probabilities[finite_index] = self.probabilities_ + new_probabilities[missing] = np.nan self.probabilities_ = new_probabilities if self.store_centers: diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index 885a0be86d115..a5ca1a340f1a5 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -22,15 +22,35 @@ X = StandardScaler().fit_transform(X) -@pytest.mark.parametrize("missing_value", [np.inf, np.nan]) -def test_missing_data(missing_value): +def test_missing_data(): """ - Tests if nan data are treated as infinite distance from all other points + Tests if nan data are propogated as missing data rather than outliers. + """ + X_missing_data = X.copy() + X_missing_data[0] = [np.nan, 1] + X_missing_data[5] = [np.nan, np.nan] + # import pdb; pdb.set_trace() + model = HDBSCAN().fit(X_missing_data) + + (missing_labels_idx,) = (model.labels_ == -2).nonzero() + assert_array_equal(missing_labels_idx, [0, 5]) + + (missing_probs_idx,) = (np.isnan(model.probabilities_)).nonzero() + assert_array_equal(missing_probs_idx, [0, 5]) + + clean_indices = list(range(1, 5)) + list(range(6, 200)) + clean_model = HDBSCAN().fit(X_missing_data[clean_indices]) + assert_array_equal(clean_model.labels_, model.labels_[clean_indices]) + + +def test_outlier_data(): + """ + Tests if np.inf data are treated as infinite distance from all other points and assigned to -1 cluster. 
""" X_missing_data = X.copy() - X_missing_data[0] = [missing_value, 1] - X_missing_data[5] = [missing_value, missing_value] + X_missing_data[0] = [np.inf, 1] + X_missing_data[5] = [np.inf, np.inf] model = HDBSCAN().fit(X_missing_data) assert model.labels_[0] == -1 assert model.labels_[5] == -1 @@ -47,7 +67,7 @@ def test_hdbscan_distance_matrix(): D /= np.max(D) labels = HDBSCAN(metric="precomputed").fit_predict(D) - n_clusters = len(set(labels)) - int(-1 in labels) + n_clusters = len(set(labels) - {-1, -2}) assert n_clusters == n_clusters_true # Check that clustering is arbitrarily good @@ -67,13 +87,13 @@ def test_hdbscan_sparse_distance_matrix(): D.eliminate_zeros() labels = HDBSCAN(metric="precomputed").fit_predict(D) - n_clusters = len(set(labels)) - int(-1 in labels) + n_clusters = len(set(labels) - {-1, -2}) assert n_clusters == n_clusters_true def test_hdbscan_feature_vector(): labels = HDBSCAN().fit_predict(X) - n_clusters = len(set(labels)) - int(-1 in labels) + n_clusters = len(set(labels) - {-1, -2}) assert n_clusters == n_clusters_true # Check that clustering is arbitrarily good @@ -94,7 +114,7 @@ def test_hdbscan_feature_vector(): @pytest.mark.parametrize("metric", _VALID_METRICS) def test_hdbscan_algorithms(algo, metric): labels = HDBSCAN(algorithm=algo).fit_predict(X) - n_clusters = len(set(labels)) - int(-1 in labels) + n_clusters = len(set(labels) - {-1, -2}) assert n_clusters == n_clusters_true # Validation for brute is handled by `pairwise_distances` @@ -131,7 +151,7 @@ def test_hdbscan_algorithms(algo, metric): def test_hdbscan_dbscan_clustering(): clusterer = HDBSCAN().fit(X) labels = clusterer.dbscan_clustering(0.3) - n_clusters = len(set(labels)) - int(-1 in labels) + n_clusters = len(set(labels) - {-1, -2}) assert n_clusters == n_clusters_true @@ -143,7 +163,7 @@ def test_hdbscan_high_dimensional(): metric="seuclidean", metric_params={"V": np.ones(H.shape[1])}, ).fit_predict(H) - n_clusters = len(set(labels)) - int(-1 in labels) + n_clusters = len(set(labels) - {-1, -2}) assert n_clusters == n_clusters_true @@ -151,13 +171,13 @@ def test_hdbscan_best_balltree_metric(): labels = HDBSCAN( metric="seuclidean", metric_params={"V": np.ones(X.shape[1])} ).fit_predict(X) - n_clusters = len(set(labels)) - int(-1 in labels) + n_clusters = len(set(labels) - {-1, -2}) assert n_clusters == n_clusters_true def test_hdbscan_no_clusters(): labels = HDBSCAN(min_cluster_size=len(X) - 1).fit_predict(X) - n_clusters = len(set(labels)) - int(-1 in labels) + n_clusters = len(set(labels) - {-1, -2}) assert n_clusters == 0 @@ -176,7 +196,7 @@ def test_hdbscan_min_cluster_size(): def test_hdbscan_callable_metric(): metric = distance.euclidean labels = HDBSCAN(metric=metric).fit_predict(X) - n_clusters = len(set(labels)) - int(-1 in labels) + n_clusters = len(set(labels) - {-1, -2}) assert n_clusters == n_clusters_true @@ -191,13 +211,13 @@ def test_hdbscan_sparse(): sparse_X = sparse.csr_matrix(X) labels = HDBSCAN().fit(sparse_X).labels_ - n_clusters = len(set(labels)) - int(-1 in labels) + n_clusters = len(set(labels) - {-1, -2}) assert n_clusters == 3 sparse_X_nan = sparse_X.copy() sparse_X_nan[0, 0] = np.nan labels = HDBSCAN().fit(sparse_X_nan).labels_ - n_clusters = len(set(labels)) - int(-1 in labels) + n_clusters = len(set(labels) - {-1, -2}) assert n_clusters == 3 msg = "Sparse data matrices only support algorithm `brute`." 
From f96e8d6bbfc4cfa45767480e796729a4b5d589d1 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 15 Sep 2022 18:58:46 -0400 Subject: [PATCH 119/160] Updated docs --- sklearn/cluster/_hdbscan/hdbscan.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index d71c018a596e1..629ea7d73e269 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -338,7 +338,9 @@ class HDBSCAN(ClusterMixin, BaseEstimator): ---------- labels_ : ndarray of shape (n_samples,) Cluster labels for each point in the dataset given to :term:`fit`. - Noisy samples are given the label -1. + There are two reserved labels: + - Noisy samples are given the label -1. + - Samples with missing data are given the label -2. probabilities_ : ndarray of shape (n_samples,) The strength with which each sample is a member of its assigned From 8f5c22b845c72dc180ffb0558847a40f2d4ccc3b Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 15 Sep 2022 19:00:51 -0400 Subject: [PATCH 120/160] Updated authorships --- sklearn/cluster/_hdbscan/_linkage.pyx | 3 ++- sklearn/cluster/_hdbscan/_reachability.pyx | 3 ++- sklearn/cluster/_hdbscan/hdbscan.py | 7 ++++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx index 057a9929069c6..752166edda1de 100644 --- a/sklearn/cluster/_hdbscan/_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_linkage.pyx @@ -1,5 +1,6 @@ # Minimum spanning tree single linkage implementation for hdbscan -# Authors: Leland McInnes, Steve Astels +# Authors: Leland McInnes +# Steve Astels # License: 3-clause BSD import numpy as np diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index 3b66920015200..6ac591394f38c 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -1,5 +1,6 @@ # mutual reachability distance compiutations -# Authors: Leland McInnes +# Authors: Leland McInnes +# Meekail Zain # License: 3-clause BSD import numpy as np diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 629ea7d73e269..48aa2a7b688b8 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -2,9 +2,10 @@ HDBSCAN: Hierarchical Density-Based Spatial Clustering of Applications with Noise """ -# Author: Leland McInnes -# Steve Astels -# John Healy +# Authors: Leland McInnes +# Steve Astels +# John Healy +# Meekail Zain # # License: BSD 3 clause From 23185f0a527d615b8b30037db4f739a9255747ee Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 15 Sep 2022 19:03:16 -0400 Subject: [PATCH 121/160] Updated `n_cluster` calc in `_weighted_cluster_center` --- sklearn/cluster/_hdbscan/hdbscan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 48aa2a7b688b8..f731f4a9f9915 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -674,7 +674,7 @@ def fit_predict(self, X, y=None): def _weighted_cluster_center(self, X): # Number of non-noise clusters - n_clusters = len(set(self.labels_)) - int(-1 in set(self.labels_)) + n_clusters = len(set(self.labels_) - {-1, -2}) mask = np.empty((X.shape[0],), dtype=np.bool_) make_centroids = self.store_centers in ("centroid", "both") make_medoids = self.store_centers in ("medoid", "both") From 
8ed0869720bf5355de895c9de5be7076f73422ae Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 15 Sep 2022 20:06:46 -0400 Subject: [PATCH 122/160] Refactored brute algorithm and added `copy` parameter --- sklearn/cluster/_hdbscan/_reachability.pyx | 13 ++- sklearn/cluster/_hdbscan/hdbscan.py | 110 ++++++++++-------- .../cluster/_hdbscan/tests/test_hdbscan.py | 9 +- 3 files changed, 74 insertions(+), 58 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index 6ac591394f38c..64aa9573e103a 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -51,16 +51,17 @@ def mutual_reachability(distance_matrix, min_points=5, max_dist=0.0): # Account for index offset min_points -= 1 + # Note that in both routines `distance_matrix` is operated on in-place. At + # this point, if out-of-place operation is desired then this function + # should have been passed a copy. if issparse(distance_matrix): - _sparse_mutual_reachability( + return _sparse_mutual_reachability( distance_matrix, min_points=min_points, max_dist=max_dist - ) - return distance_matrix.tocsr() + ).tocsr() - _dense_mutual_reachability(distance_matrix, min_points=min_points) - return distance_matrix + return _dense_mutual_reachability(distance_matrix, min_points=min_points) cdef _dense_mutual_reachability( cnp.ndarray[dtype=cnp.float64_t, ndim=2] distance_matrix, @@ -88,6 +89,7 @@ cdef _dense_mutual_reachability( distance_matrix[i, j] ) distance_matrix[i, j] = mr_dist + return distance_matrix # Assumes LIL format. # TODO: Rewrite for CSR. @@ -120,3 +122,4 @@ cdef _sparse_mutual_reachability( distance_matrix[i, j] = mr_dist elif max_dist > 0: distance_matrix[i, j] = max_dist + return distance_matrix diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index f731f4a9f9915..90a447f075b57 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -28,6 +28,33 @@ FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics +def _brute_mst(mutual_reachability, min_samples, sparse=False): + if not sparse: + return mst_from_distance_matrix(mutual_reachability) + + # Check connected component on mutual reachability + # If more than one component, it means that even if the distance matrix X + # has one component, there exists with less than `min_samples` neighbors + if ( + csgraph.connected_components( + mutual_reachability, directed=False, return_labels=False + ) + > 1 + ): + raise ValueError( + f"There exists points with fewer than {min_samples} neighbors. Ensure" + " your distance matrix has non-zero values for at least" + f" `min_sample`={min_samples} neighbors for each points (i.e. K-nn" + " graph), or specify a `max_dist` in `metric_params` to use when" + " distances are missing." + ) + + # Compute the minimum spanning tree for the sparse graph + sparse_min_spanning_tree = csgraph.minimum_spanning_tree(mutual_reachability) + rows, cols = sparse_min_spanning_tree.nonzero() + return np.vstack((rows, cols, sparse_min_spanning_tree.data)).T + + def _tree_to_labels( single_linkage_tree, min_cluster_size=10, @@ -67,6 +94,7 @@ def _hdbscan_brute( alpha=None, metric="euclidean", n_jobs=None, + copy=False, **metric_params, ): if metric == "precomputed": @@ -74,63 +102,31 @@ def _hdbscan_brute( # sklearn.metrics.pairwise_distances handle it, # enables the usage of numpy.inf in the distance # matrix to indicate missing distance information. 
- distance_matrix = X + distance_matrix = X.copy() if copy else X + else: distance_matrix = pairwise_distances( X, metric=metric, n_jobs=n_jobs, **metric_params ) if alpha is not None: - distance_matrix = distance_matrix / alpha - - if issparse(distance_matrix): - # Compute sparse mutual reachability graph - # if max_dist > 0, max distance to use when the reachability is infinite - max_dist = metric_params.get("max_dist", 0.0) - mutual_reachability_ = mutual_reachability( - distance_matrix.tolil(), min_points=min_samples, max_dist=max_dist - ) - # Check connected component on mutual reachability - # If more than one component, it means that even if the distance matrix X - # has one component, there exists with less than `min_samples` neighbors - if ( - csgraph.connected_components( - mutual_reachability_, directed=False, return_labels=False - ) - > 1 - ): - raise ValueError( - f"There exists points with fewer than {min_samples} neighbors. Ensure" - " your distance matrix has non-zero values for at least" - f" `min_sample`={min_samples} neighbors for each points (i.e. K-nn" - " graph), or specify a `max_dist` in `metric_params` to use when" - " distances are missing." - ) - - # Compute the minimum spanning tree for the sparse graph - sparse_min_spanning_tree = csgraph.minimum_spanning_tree(mutual_reachability_) - - edges_sorted_indices = np.argsort(sparse_min_spanning_tree.data) - rows, cols = sparse_min_spanning_tree.nonzero() - min_spanning_tree = np.vstack( - ( - rows[edges_sorted_indices], - cols[edges_sorted_indices], - sparse_min_spanning_tree.data[edges_sorted_indices], - ), - ).T - - # Convert edge list into standard hierarchical clustering format - single_linkage_tree = label(min_spanning_tree) + if copy: + distance_matrix = distance_matrix / alpha + else: + distance_matrix /= alpha - return single_linkage_tree + # max_dist is only relevant for sparse and is ignored for dense + max_dist = metric_params.get("max_dist", 0.0) + sparse = issparse(distance_matrix) + distance_matrix = distance_matrix.tolil() if sparse else distance_matrix - # `distance_matrix` is dense at this point. # Note that `distance_matrix` is manipulated in-place, however we do not # need it for anything else past this point, hence the operation is safe. - mutual_reachability_ = mutual_reachability(distance_matrix, min_samples) - - min_spanning_tree = mst_from_distance_matrix(mutual_reachability_) - + mutual_reachability_ = mutual_reachability( + distance_matrix, min_points=min_samples, max_dist=max_dist + ) + min_spanning_tree = _brute_mst( + mutual_reachability_, min_samples=min_samples, sparse=sparse + ) # Warn if the MST couldn't be constructed around the missing distances if np.isinf(min_spanning_tree.T[2]).any(): warn( @@ -335,6 +331,17 @@ class HDBSCAN(ClusterMixin, BaseEstimator): depend on a euclidean metric. - `"both"`which computes and stores both forms of centers. + copy : bool, default=False + If `copy=True` then any time an in-place modifications would be made + that would overwrite data passed to :term:`fit`, a copy will first be + made, guaranteeing that the original data will be unchanged. Currently + this only makes a difference when passing in a dense precomputed + distance array (i.e. when `metric="precomputed"`). + + Note that, even if `copy=False`, a copy may still be made during + :term:`fit` if conversion of the passed data is necessary. See + :func:`~sklearn.utils.validation.check_array` for more details. 
+ Attributes ---------- labels_ : ndarray of shape (n_samples,) @@ -447,6 +454,7 @@ class HDBSCAN(ClusterMixin, BaseEstimator): "cluster_selection_method": [StrOptions({"eom", "leaf"})], "allow_single_cluster": ["boolean"], "store_centers": [None, StrOptions({"centroid", "medoid", "both"})], + "copy": ["boolean"], } def __init__( @@ -464,6 +472,7 @@ def __init__( cluster_selection_method="eom", allow_single_cluster=False, store_centers=None, + copy=False, ): self.min_cluster_size = min_cluster_size self.min_samples = min_samples @@ -478,6 +487,7 @@ def __init__( self.cluster_selection_method = cluster_selection_method self.allow_single_cluster = allow_single_cluster self.store_centers = store_centers + self.copy = copy def fit(self, X, y=None): """Find clusters based on hierarchical density-based clustering. @@ -594,6 +604,7 @@ def fit(self, X, y=None): if self.algorithm == "brute": mst_func = _hdbscan_brute + kwargs["copy"] = self.copy for key in ("algo", "leaf_size"): kwargs.pop(key, None) elif self.algorithm == "kdtree": @@ -605,6 +616,7 @@ def fit(self, X, y=None): if issparse(X) or self.metric not in FAST_METRICS: # We can't do much with sparse matrices ... mst_func = _hdbscan_brute + kwargs["copy"] = self.copy for key in ("algo", "leaf_size"): kwargs.pop(key, None) elif self.metric in KDTree.valid_metrics: @@ -731,7 +743,7 @@ def dbscan_clustering(self, cut_distance, min_cluster_size=5): Returns ------- - labels : array [n_samples] + labels : ndarray of shape (n_samples,) An array of cluster labels, one per datapoint. Unclustered points are assigned the label -1. """ diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index a5ca1a340f1a5..292b3e922fb04 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -10,7 +10,7 @@ from sklearn.cluster import HDBSCAN from sklearn.datasets import make_blobs from sklearn.metrics import fowlkes_mallows_score -from sklearn.metrics.pairwise import _VALID_METRICS +from sklearn.metrics.pairwise import _VALID_METRICS, euclidean_distances from sklearn.neighbors import BallTree, KDTree from sklearn.preprocessing import StandardScaler from sklearn.utils import shuffle @@ -63,10 +63,11 @@ def test_outlier_data(): def test_hdbscan_distance_matrix(): - D = distance.squareform(distance.pdist(X)) - D /= np.max(D) + D = euclidean_distances(X) + D_original = D.copy() + labels = HDBSCAN(metric="precomputed", copy=True).fit_predict(D) - labels = HDBSCAN(metric="precomputed").fit_predict(D) + assert_allclose(D, D_original) n_clusters = len(set(labels) - {-1, -2}) assert n_clusters == n_clusters_true From e6b9c2d59c222654ef5e8f808670fa21e548a7b1 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 15 Sep 2022 22:32:10 -0400 Subject: [PATCH 123/160] Updated tests a bit --- .../cluster/_hdbscan/tests/test_hdbscan.py | 29 +++++++++---------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py index 292b3e922fb04..59f4668eaf206 100644 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py @@ -21,6 +21,13 @@ X, y = shuffle(X, y, random_state=7) X = StandardScaler().fit_transform(X) +ALGORITHMS = [ + "kdtree", + "balltree", + "brute", + "auto", +] + def test_missing_data(): """ @@ -103,15 +110,7 @@ def test_hdbscan_feature_vector(): assert score >= 0.98 -@pytest.mark.parametrize( - "algo", - [ - 
"kdtree", - "balltree", - "brute", - "auto", - ], -) +@pytest.mark.parametrize("algo", ALGORITHMS) @pytest.mark.parametrize("metric", _VALID_METRICS) def test_hdbscan_algorithms(algo, metric): labels = HDBSCAN(algorithm=algo).fit_predict(X) @@ -226,20 +225,20 @@ def test_hdbscan_sparse(): HDBSCAN(metric="euclidean", algorithm="balltree").fit(sparse_X) -def test_hdbscan_centers(): +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_hdbscan_centers(algorithm): centers = [(0.0, 0.0), (3.0, 3.0)] H, _ = make_blobs(n_samples=1000, random_state=0, centers=centers, cluster_std=0.5) hdb = HDBSCAN(store_centers="both").fit(H) - for idx, center in enumerate(centers): - centroid = hdb.centroids_[idx] + for center, centroid, medoid in zip(centers, hdb.centroids_, hdb.medoids_): assert_allclose(center, centroid, rtol=1, atol=0.05) - - medoid = hdb.centroids_[idx] assert_allclose(center, medoid, rtol=1, atol=0.05) # Ensure that nothing is done for noise - hdb = HDBSCAN(store_centers="both", min_cluster_size=X.shape[0]).fit(X) + hdb = HDBSCAN( + algorithm=algorithm, store_centers="both", min_cluster_size=X.shape[0] + ).fit(X) assert hdb.centroids_.shape[0] == 0 assert hdb.medoids_.shape[0] == 0 From 886bab09cae1c519319f5fc2c9b434a58a336f8d Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 8 Aug 2023 22:01:31 -0400 Subject: [PATCH 124/160] Removed outdated test file --- .../cluster/_hdbscan/tests/test_hdbscan.py | 370 ------------------ 1 file changed, 370 deletions(-) delete mode 100644 sklearn/cluster/_hdbscan/tests/test_hdbscan.py diff --git a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py b/sklearn/cluster/_hdbscan/tests/test_hdbscan.py deleted file mode 100644 index 0dfdc33b60c64..0000000000000 --- a/sklearn/cluster/_hdbscan/tests/test_hdbscan.py +++ /dev/null @@ -1,370 +0,0 @@ -""" -Tests for HDBSCAN clustering algorithm -Based on the DBSCAN test code -""" -import numpy as np -import pytest -from scipy import sparse, stats -from scipy.spatial import distance - -from sklearn.cluster import HDBSCAN -from sklearn.datasets import make_blobs, make_moons -from sklearn.metrics import fowlkes_mallows_score, homogeneity_score -from sklearn.metrics.pairwise import _VALID_METRICS, euclidean_distances -from sklearn.neighbors import BallTree, KDTree -from sklearn.preprocessing import StandardScaler -from sklearn.utils import shuffle -from sklearn.utils._testing import assert_allclose, assert_array_equal - -n_clusters_true = 3 -X, y = make_blobs(n_samples=200, random_state=10) -X, y = shuffle(X, y, random_state=7) -X = StandardScaler().fit_transform(X) - -ALGORITHMS = [ - "prims_kdtree", - "prims_balltree", - "boruvka_kdtree", - "boruvka_balltree", - "brute", - "auto", -] - - -def generate_noisy_data(): - rng = np.random.RandomState(0) - blobs, _ = make_blobs( - n_samples=200, centers=[(-0.75, 2.25), (1.0, 2.0)], cluster_std=0.25 - ) - moons, _ = make_moons(n_samples=200, noise=0.05) - noise = rng.uniform(-1.0, 3.0, (50, 2)) - return np.vstack([blobs, moons, noise]) - - -def test_missing_data(): - """ - Tests if nan data are propogated as missing data rather than outliers. 
- """ - X_missing_data = X.copy() - X_missing_data[0] = [np.nan, 1] - X_missing_data[5] = [np.nan, np.nan] - # import pdb; pdb.set_trace() - model = HDBSCAN().fit(X_missing_data) - - (missing_labels_idx,) = (model.labels_ == -2).nonzero() - assert_array_equal(missing_labels_idx, [0, 5]) - - (missing_probs_idx,) = (np.isnan(model.probabilities_)).nonzero() - assert_array_equal(missing_probs_idx, [0, 5]) - - clean_indices = list(range(1, 5)) + list(range(6, 200)) - clean_model = HDBSCAN().fit(X_missing_data[clean_indices]) - assert_array_equal(clean_model.labels_, model.labels_[clean_indices]) - - -def test_outlier_data(): - """ - Tests if np.inf data are treated as infinite distance from all other points - and assigned to -1 cluster. - """ - X_missing_data = X.copy() - X_missing_data[0] = [np.inf, 1] - X_missing_data[5] = [np.inf, np.inf] - model = HDBSCAN().fit(X_missing_data) - assert model.labels_[0] == -1 - assert model.labels_[5] == -1 - assert model.probabilities_[0] == 0 - assert model.probabilities_[5] == 0 - assert model.probabilities_[5] == 0 - clean_indices = list(range(1, 5)) + list(range(6, 200)) - clean_model = HDBSCAN().fit(X_missing_data[clean_indices]) - assert_array_equal(clean_model.labels_, model.labels_[clean_indices]) - - -def test_hdbscan_distance_matrix(): - D = euclidean_distances(X) - D_original = D.copy() - labels = HDBSCAN(metric="precomputed", copy=True).fit_predict(D) - - assert_allclose(D, D_original) - n_clusters = len(set(labels) - {-1, -2}) - assert n_clusters == n_clusters_true - - # Check that clustering is arbitrarily good - # This is a heuristic to guard against regression - score = fowlkes_mallows_score(y, labels) - assert score >= 0.98 - - -def test_hdbscan_sparse_distance_matrix(): - D = distance.squareform(distance.pdist(X)) - D /= np.max(D) - - threshold = stats.scoreatpercentile(D.flatten(), 50) - - D[D >= threshold] = 0.0 - D = sparse.csr_matrix(D) - D.eliminate_zeros() - - labels = HDBSCAN(metric="precomputed").fit_predict(D) - n_clusters = len(set(labels) - {-1, -2}) - assert n_clusters == n_clusters_true - - -def test_hdbscan_feature_vector(): - labels = HDBSCAN().fit_predict(X) - n_clusters = len(set(labels) - {-1, -2}) - assert n_clusters == n_clusters_true - - # Check that clustering is arbitrarily good - # This is a heuristic to guard against regression - score = fowlkes_mallows_score(y, labels) - assert score >= 0.98 - - -@pytest.mark.parametrize("algo", ALGORITHMS) -@pytest.mark.parametrize("metric", _VALID_METRICS) -def test_hdbscan_algorithms(algo, metric): - labels = HDBSCAN(algorithm=algo).fit_predict(X) - n_clusters = len(set(labels) - {-1, -2}) - assert n_clusters == n_clusters_true - - # Validation for brute is handled by `pairwise_distances` - if algo in ("brute", "auto"): - return - - ALGOS_TREES = { - "prims_kdtree": KDTree, - "prims_balltree": BallTree, - "boruvka_kdtree": KDTree, - "boruvka_balltree": BallTree, - } - metric_params = { - "mahalanobis": {"V": np.eye(X.shape[1])}, - "seuclidean": {"V": np.ones(X.shape[1])}, - "minkowski": {"p": 2}, - "wminkowski": {"p": 2, "w": np.ones(X.shape[1])}, - }.get(metric, None) - - hdb = HDBSCAN( - algorithm=algo, - metric=metric, - metric_params=metric_params, - ) - - if metric not in ALGOS_TREES[algo].valid_metrics: - with pytest.raises(ValueError): - hdb.fit(X) - elif metric == "wminkowski": - with pytest.warns(FutureWarning): - hdb.fit(X) - else: - hdb.fit(X) - - -def test_hdbscan_dbscan_clustering(): - clusterer = HDBSCAN().fit(X) - labels = clusterer.dbscan_clustering(0.3) - 
n_clusters = len(set(labels) - {-1, -2}) - assert n_clusters == n_clusters_true - - -def test_hdbscan_high_dimensional(): - H, y = make_blobs(n_samples=50, random_state=0, n_features=64) - H = StandardScaler().fit_transform(H) - labels = HDBSCAN( - algorithm="auto", - metric="seuclidean", - metric_params={"V": np.ones(H.shape[1])}, - ).fit_predict(H) - n_clusters = len(set(labels) - {-1, -2}) - assert n_clusters == n_clusters_true - - -def test_hdbscan_best_balltree_metric(): - labels = HDBSCAN( - metric="seuclidean", metric_params={"V": np.ones(X.shape[1])} - ).fit_predict(X) - n_clusters = len(set(labels) - {-1, -2}) - assert n_clusters == n_clusters_true - - -def test_hdbscan_no_clusters(): - labels = HDBSCAN(min_cluster_size=len(X) - 1).fit_predict(X) - n_clusters = len(set(labels) - {-1, -2}) - assert n_clusters == 0 - - -def test_hdbscan_min_cluster_size(): - """ - Test that the smallest non-noise cluster has at least `min_cluster_size` - many points - """ - for min_cluster_size in range(2, len(X), 1): - labels = HDBSCAN(min_cluster_size=min_cluster_size).fit_predict(X) - true_labels = [label for label in labels if label != -1] - if len(true_labels) != 0: - assert np.min(np.bincount(true_labels)) >= min_cluster_size - - -def test_hdbscan_callable_metric(): - metric = distance.euclidean - labels = HDBSCAN(metric=metric).fit_predict(X) - n_clusters = len(set(labels) - {-1, -2}) - assert n_clusters == n_clusters_true - - -@pytest.mark.parametrize("tree", ["kdtree", "balltree"]) -def test_hdbscan_boruvka_matches(tree): - - data = generate_noisy_data() - - labels_prims = HDBSCAN(algorithm="brute").fit_predict(data) - labels_boruvka = HDBSCAN(algorithm=f"boruvka_{tree}").fit_predict(data) - - num_mismatches = homogeneity_score(labels_prims, labels_boruvka) - - assert (num_mismatches / float(data.shape[0])) < 0.15 - - -@pytest.mark.parametrize("strategy", ["prims", "boruvka"]) -@pytest.mark.parametrize("tree", ["kd", "ball"]) -def test_hdbscan_precomputed_non_brute(strategy, tree): - hdb = HDBSCAN(metric="precomputed", algorithm=f"{strategy}_{tree}tree") - with pytest.raises(ValueError): - hdb.fit(X) - - -def test_hdbscan_sparse(): - sparse_X = sparse.csr_matrix(X) - - labels = HDBSCAN().fit(sparse_X).labels_ - n_clusters = len(set(labels) - {-1, -2}) - assert n_clusters == 3 - - sparse_X_nan = sparse_X.copy() - sparse_X_nan[0, 0] = np.nan - labels = HDBSCAN().fit(sparse_X_nan).labels_ - n_clusters = len(set(labels) - {-1, -2}) - assert n_clusters == 3 - - msg = "Sparse data matrices only support algorithm `brute`." 
- with pytest.raises(ValueError, match=msg): - HDBSCAN(metric="euclidean", algorithm="boruvka_balltree").fit(sparse_X) - - -@pytest.mark.parametrize("algorithm", ALGORITHMS) -def test_hdbscan_centers(algorithm): - centers = [(0.0, 0.0), (3.0, 3.0)] - H, _ = make_blobs(n_samples=1000, random_state=0, centers=centers, cluster_std=0.5) - - # Note: boruvka performs very poorly when min_samples < 6 - hdb = HDBSCAN(store_centers="both", min_samples=6, algorithm=algorithm).fit(H) - for center, centroid, medoid in zip(centers, hdb.centroids_, hdb.medoids_): - assert_allclose(center, centroid, rtol=1, atol=0.05) - assert_allclose(center, medoid, rtol=1, atol=0.05) - - # Ensure that nothing is done for noise - hdb = HDBSCAN( - algorithm=algorithm, store_centers="both", min_cluster_size=X.shape[0] - ).fit(X) - assert hdb.centroids_.shape[0] == 0 - assert hdb.medoids_.shape[0] == 0 - - -def test_hdbscan_allow_single_cluster_with_epsilon(): - rng = np.random.RandomState(0) - no_structure = rng.rand(150, 2) - # without epsilon we should see many noise points as children of root. - labels = HDBSCAN( - min_cluster_size=5, - cluster_selection_epsilon=0.0, - cluster_selection_method="eom", - allow_single_cluster=True, - algorithm="brute", - ).fit_predict(no_structure) - unique_labels, counts = np.unique(labels, return_counts=True) - assert len(unique_labels) == 2 - - # Arbitrary heuristic. Would prefer something more precise. - assert counts[unique_labels == -1] == 31 - - # for this random seed an epsilon of 0.18 (very brittle) will produce - # exactly 2 noise points at that cut in single linkage. - # TODO: Replace with more robust test if possible - labels = HDBSCAN( - min_cluster_size=5, - cluster_selection_epsilon=0.18, - cluster_selection_method="eom", - allow_single_cluster=True, - ).fit_predict(no_structure) - unique_labels, counts = np.unique(labels, return_counts=True) - assert len(unique_labels) == 2 - assert counts[unique_labels == -1] == 2 - - -def test_hdbscan_better_than_dbscan(): - """ - Validate that HDBSCAN can properly cluster this difficult synthetic - dataset. Note that DBSCAN fails on this (see HDBSCAN plotting - example) - """ - centers = [[-0.85, -0.85], [-0.85, 0.85], [3, 3], [3, -3]] - X, _ = make_blobs( - n_samples=750, - centers=centers, - cluster_std=[0.2, 0.35, 1.35, 1.35], - random_state=0, - ) - hdb = HDBSCAN().fit(X) - n_clusters = len(set(hdb.labels_)) - int(-1 in hdb.labels_) - assert n_clusters == 4 - - -@pytest.mark.parametrize( - "kwargs, X", - [ - ({"metric": "precomputed"}, np.array([[1, np.inf], [np.inf, 1]])), - ({"metric": "precomputed"}, [[1, 2], [2, 1]]), - ({}, [[1, 2], [3, 4]]), - ], -) -def test_hdbscan_usable_inputs(X, kwargs): - HDBSCAN(min_samples=1, **kwargs).fit(X) - - -def test_hdbscan_sparse_distances_too_few_nonzero(): - X = sparse.csr_matrix(np.zeros((10, 10))) - - msg = "There exists points with fewer than" - with pytest.raises(ValueError, match=msg): - HDBSCAN(metric="precomputed").fit(X) - - -@pytest.mark.parametrize("mst", ["prims", "boruvka"]) -def test_hdbscan_tree_invalid_metric(mst): - metric_callable = lambda x: x - msg = ( - ".* is not a valid metric for a .*-based algorithm\\. Please select a different" - " metric\\." 
- ) - - # Callables are not supported for either - with pytest.raises(ValueError, match=msg): - HDBSCAN(algorithm=f"{mst}_kdtree", metric=metric_callable).fit(X) - with pytest.raises(ValueError, match=msg): - HDBSCAN(algorithm=f"{mst}_balltree", metric=metric_callable).fit(X) - - # The set of valid metrics for KDTree at the time of writing this test is a - # strict subset of those supported in BallTree - metrics_not_kd = list(set(BallTree.valid_metrics) - set(KDTree.valid_metrics)) - if len(metrics_not_kd) > 0: - with pytest.raises(ValueError, match=msg): - HDBSCAN(algorithm=f"{mst}_kdtree", metric=metrics_not_kd[0]).fit(X) - - -def test_hdbscan_too_many_min_samples(): - hdb = HDBSCAN(min_samples=len(X) + 1) - msg = r"min_samples (.*) must be at most" - with pytest.raises(ValueError, match=msg): - hdb.fit(X) From 8feccf1190ea943c1f0bce8ec6b17edcc93cc643 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 8 Aug 2023 22:02:03 -0400 Subject: [PATCH 125/160] Removed old setup.py --- sklearn/cluster/setup.py | 69 ---------------------------------------- 1 file changed, 69 deletions(-) delete mode 100644 sklearn/cluster/setup.py diff --git a/sklearn/cluster/setup.py b/sklearn/cluster/setup.py deleted file mode 100644 index 9ba195cf3230c..0000000000000 --- a/sklearn/cluster/setup.py +++ /dev/null @@ -1,69 +0,0 @@ -# Author: Alexandre Gramfort -# License: BSD 3 clause -import os - -import numpy - - -def configuration(parent_package="", top_path=None): - from numpy.distutils.misc_util import Configuration - - libraries = [] - if os.name == "posix": - libraries.append("m") - - config = Configuration("cluster", parent_package, top_path) - - config.add_extension( - "_dbscan_inner", - sources=["_dbscan_inner.pyx"], - include_dirs=[numpy.get_include()], - language="c++", - ) - - config.add_extension( - "_hierarchical_fast", - sources=["_hierarchical_fast.pyx"], - language="c++", - include_dirs=[numpy.get_include()], - libraries=libraries, - ) - - config.add_extension( - "_k_means_common", - sources=["_k_means_common.pyx"], - include_dirs=[numpy.get_include()], - libraries=libraries, - ) - - config.add_extension( - "_k_means_lloyd", - sources=["_k_means_lloyd.pyx"], - include_dirs=[numpy.get_include()], - libraries=libraries, - ) - - config.add_extension( - "_k_means_elkan", - sources=["_k_means_elkan.pyx"], - include_dirs=[numpy.get_include()], - libraries=libraries, - ) - - config.add_extension( - "_k_means_minibatch", - sources=["_k_means_minibatch.pyx"], - include_dirs=[numpy.get_include()], - libraries=libraries, - ) - - config.add_subpackage("tests") - config.add_subpackage("_hdbscan") - - return config - - -if __name__ == "__main__": - from numpy.distutils.core import setup - - setup(**configuration(top_path="").todict()) From 969b7c57adbfd174a7e9ba688caac6686fdf0c9e Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 8 Aug 2023 22:02:18 -0400 Subject: [PATCH 126/160] Removed submodule setup.py --- sklearn/cluster/_hdbscan/setup.py | 48 ------------------------------- 1 file changed, 48 deletions(-) delete mode 100644 sklearn/cluster/_hdbscan/setup.py diff --git a/sklearn/cluster/_hdbscan/setup.py b/sklearn/cluster/_hdbscan/setup.py deleted file mode 100644 index c082ec8bdf214..0000000000000 --- a/sklearn/cluster/_hdbscan/setup.py +++ /dev/null @@ -1,48 +0,0 @@ -# License: BSD 3 clause -import os - -import numpy - - -def configuration(parent_package="", top_path=None): - from numpy.distutils.misc_util import Configuration - - libraries = [] - if os.name == "posix": - libraries.append("m") - 
- config = Configuration("_hdbscan", parent_package, top_path) - - # HDBSCAN subpackage - config.add_subpackage("tests") - config.add_extension( - "_linkage", - sources=["_linkage.pyx"], - include_dirs=[numpy.get_include()], - libraries=libraries, - ) - config.add_extension( - "_reachability", - sources=["_reachability.pyx"], - include_dirs=[numpy.get_include()], - libraries=libraries, - ) - config.add_extension( - "_tree", - sources=["_tree.pyx"], - include_dirs=[numpy.get_include()], - libraries=libraries, - ) - config.add_extension( - "_boruvka", - sources=["_boruvka.pyx"], - include_dirs=[numpy.get_include()], - libraries=libraries, - ) - return config - - -if __name__ == "__main__": - from numpy.distutils.core import setup - - setup(**configuration(top_path="").todict()) From 88ff7e266339c9fa3cd0423b6603250e1965a9b5 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 8 Aug 2023 22:25:28 -0400 Subject: [PATCH 127/160] Iter on styling --- sklearn/cluster/_hdbscan/_boruvka.pyx | 308 +++++++++++++------------- sklearn/cluster/_hdbscan/hdbscan.py | 1 + 2 files changed, 150 insertions(+), 159 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_boruvka.pyx b/sklearn/cluster/_hdbscan/_boruvka.pyx index 3662e4c379871..4dfce8bfa6532 100644 --- a/sklearn/cluster/_hdbscan/_boruvka.pyx +++ b/sklearn/cluster/_hdbscan/_boruvka.pyx @@ -58,65 +58,64 @@ from libc.math cimport fabs, pow from sklearn.neighbors import BallTree, KDTree -from sklearn.metrics._dist_metrics cimport DistanceMetric +from ...metrics._dist_metrics cimport DistanceMetric +from ...utils._typedefs cimport intp_t, float64_t, int64_t, uint8_t from joblib import Parallel, delayed -cdef np.double_t INF = np.inf +cdef float64_t INF = np.inf # Define the NodeData struct used in sklearn trees for faster # access to the node data internals in Cython. 
cdef struct NodeData_t: - np.intp_t idx_start - np.intp_t idx_end - np.intp_t is_leaf - np.double_t radius + intp_t idx_start + intp_t idx_end + intp_t is_leaf + float64_t radius # Define a function giving the minimum distance between two # nodes of a ball tree -cdef inline np.double_t balltree_min_dist_dual( - np.double_t radius1, - np.double_t radius2, - np.intp_t node1, - np.intp_t node2, - np.double_t[:, ::1] centroid_dist) nogil except -1: - - cdef np.double_t dist_pt = centroid_dist[node1, node2] +cdef inline float64_t balltree_min_dist_dual( + float64_t radius1, + float64_t radius2, + intp_t node1, + intp_t node2, + float64_t[:, ::1] centroid_dist +) nogil except -1: + + cdef float64_t dist_pt = centroid_dist[node1, node2] return max(0, (dist_pt - radius1 - radius2)) # Define a function giving the minimum distance between two # nodes of a kd-tree -cdef inline np.double_t kdtree_min_dist_dual( +cdef inline float64_t kdtree_min_dist_dual( DistanceMetric metric, - np.intp_t node1, - np.intp_t node2, - np.double_t[:, :, ::1] node_bounds, - np.intp_t num_features) except -1: + intp_t node1, + intp_t node2, + float64_t[:, :, ::1] node_bounds, + intp_t num_features +) except -1: - cdef np.double_t d, d1, d2, rdist = 0.0 - cdef np.double_t zero = 0.0 - cdef np.intp_t j + cdef float64_t d, d1, d2, rdist = 0.0 + cdef float64_t zero = 0.0 + cdef intp_t j if metric.p == INF: for j in range(num_features): - d1 = (node_bounds[0, node1, j] - - node_bounds[1, node2, j]) - d2 = (node_bounds[0, node2, j] - - node_bounds[1, node1, j]) + d1 = node_bounds[0, node1, j] - node_bounds[1, node2, j] + d2 = node_bounds[0, node2, j] - node_bounds[1, node1, j] d = (d1 + fabs(d1)) + (d2 + fabs(d2)) rdist = max(rdist, 0.5 * d) else: # here we'll use the fact that x + abs(x) = 2 * max(x, 0) for j in range(num_features): - d1 = (node_bounds[0, node1, j] - - node_bounds[1, node2, j]) - d2 = (node_bounds[0, node2, j] - - node_bounds[1, node1, j]) + d1 = node_bounds[0, node1, j] - node_bounds[1, node2, j] + d2 = node_bounds[0, node2, j] - node_bounds[1, node1, j] d = (d1 + fabs(d1)) + (d2 + fabs(d2)) rdist += pow(0.5 * d, metric.p) @@ -127,33 +126,30 @@ cdef inline np.double_t kdtree_min_dist_dual( # As above, but this time we use the rdist as per the kdtree # implementation. 
This allows us to release the GIL over # larger sections of code -cdef inline np.double_t kdtree_min_rdist_dual( +cdef inline float64_t kdtree_min_rdist_dual( DistanceMetric metric, - np.intp_t node1, - np.intp_t node2, - np.double_t[:, :, ::1] node_bounds, - np.intp_t num_features) nogil except -1: + intp_t node1, + intp_t node2, + float64_t[:, :, ::1] node_bounds, + intp_t num_features +) nogil except -1: - cdef np.double_t d, d1, d2, rdist = 0.0 - cdef np.double_t zero = 0.0 - cdef np.intp_t j + cdef float64_t d, d1, d2, rdist = 0.0 + cdef float64_t zero = 0.0 + cdef intp_t j if metric.p == INF: for j in range(num_features): - d1 = (node_bounds[0, node1, j] - - node_bounds[1, node2, j]) - d2 = (node_bounds[0, node2, j] - - node_bounds[1, node1, j]) + d1 = node_bounds[0, node1, j] - node_bounds[1, node2, j] + d2 = node_bounds[0, node2, j] - node_bounds[1, node1, j] d = (d1 + fabs(d1)) + (d2 + fabs(d2)) rdist = max(rdist, 0.5 * d) else: # here we'll use the fact that x + abs(x) = 2 * max(x, 0) for j in range(num_features): - d1 = (node_bounds[0, node1, j] - - node_bounds[1, node2, j]) - d2 = (node_bounds[0, node2, j] - - node_bounds[1, node1, j]) + d1 = node_bounds[0, node1, j] - node_bounds[1, node2, j] + d2 = node_bounds[0, node2, j] - node_bounds[1, node1, j] d = (d1 + fabs(d1)) + (d2 + fabs(d2)) rdist += pow(0.5 * d, metric.p) @@ -180,25 +176,19 @@ cdef class BoruvkaUnionFind(object): a component. """ - cdef np.ndarray _parent_arr - cdef np.intp_t[::1] _parent - cdef np.ndarray _rank_arr - cdef np.uint8_t[::1] _rank + cdef intp_t[::1] _parent + cdef uint8_t[::1] _rank cdef np.ndarray is_component def __init__(self, size): - self._parent_arr = np.arange(size, dtype=np.intp) - self._parent = ( ( - self._parent_arr.data)) - self._rank_arr = np.zeros(size, dtype=np.uint8) - self._rank = ( ( - self._rank_arr.data)) + self._parent = np.arange(size, dtype=np.intp) + self._rank = np.zeros(size, dtype=np.uint8) self.is_component = np.ones(size, dtype=bool) - cdef int union_(self, np.intp_t x, np.intp_t y) except -1: + cdef int union_(self, intp_t x, intp_t y) except -1: """Union together elements x and y""" - cdef np.intp_t x_root = self.find(x) - cdef np.intp_t y_root = self.find(y) + cdef intp_t x_root = self.find(x) + cdef intp_t y_root = self.find(y) if x_root == y_root: return 0 @@ -216,10 +206,10 @@ cdef class BoruvkaUnionFind(object): return 0 - cdef np.intp_t find(self, np.intp_t x) except -1: + cdef intp_t find(self, intp_t x) except -1: """Find the root or identifier for the component that x is in""" - cdef np.intp_t x_parent - cdef np.intp_t x_grandparent + cdef intp_t x_parent + cdef intp_t x_grandparent x_parent = self._parent[x] while True: @@ -230,7 +220,7 @@ cdef class BoruvkaUnionFind(object): x = x_parent x_parent = x_grandparent - cdef np.ndarray[np.intp_t, ndim=1] components(self): + cdef ndarray[intp_t, ndim=1] components(self): """Return an array of all component roots/identifiers""" return self.is_component.nonzero()[0] @@ -281,38 +271,38 @@ cdef class BoruvkaAlgorithm(object): cdef object core_dist_tree cdef DistanceMetric dist cdef np.ndarray _data - cdef readonly const np.double_t[:, ::1] _raw_data - cdef np.double_t[:, :, ::1] node_bounds - cdef np.double_t alpha - cdef np.int8_t approx_min_span_tree - cdef np.intp_t n_jobs - cdef np.intp_t min_samples - cdef np.intp_t num_points - cdef np.intp_t num_nodes - cdef np.intp_t num_features + cdef readonly const float64_t[:, ::1] _raw_data + cdef float64_t[:, :, ::1] node_bounds + cdef float64_t alpha + cdef int8_t 
approx_min_span_tree + cdef intp_t n_jobs + cdef intp_t min_samples + cdef intp_t num_points + cdef intp_t num_nodes + cdef intp_t num_features cdef bint is_KDTree - cdef public np.double_t[::1] core_distance - cdef public np.double_t[::1] bounds - cdef public np.intp_t[::1] component_of_point - cdef public np.intp_t[::1] component_of_node - cdef public np.intp_t[::1] candidate_neighbor - cdef public np.intp_t[::1] candidate_point - cdef public np.double_t[::1] candidate_distance - cdef public np.double_t[:, ::1] centroid_distances - cdef public np.intp_t[::1] idx_array + cdef public float64_t[::1] core_distance + cdef public float64_t[::1] bounds + cdef public intp_t[::1] component_of_point + cdef public intp_t[::1] component_of_node + cdef public intp_t[::1] candidate_neighbor + cdef public intp_t[::1] candidate_point + cdef public float64_t[::1] candidate_distance + cdef public float64_t[:, ::1] centroid_distances + cdef public intp_t[::1] idx_array cdef public NodeData_t[::1] node_data cdef BoruvkaUnionFind component_union_find cdef np.ndarray edges - cdef np.intp_t num_edges + cdef intp_t num_edges - cdef np.intp_t *component_of_point_ptr - cdef np.intp_t *component_of_node_ptr - cdef np.double_t *candidate_distance_ptr - cdef np.intp_t *candidate_neighbor_ptr - cdef np.intp_t *candidate_point_ptr - cdef np.double_t *core_distance_ptr - cdef np.double_t *bounds_ptr + cdef intp_t *component_of_point_ptr + cdef intp_t *component_of_node_ptr + cdef float64_t *candidate_distance_ptr + cdef intp_t *candidate_neighbor_ptr + cdef intp_t *candidate_point_ptr + cdef float64_t *core_distance_ptr + cdef float64_t *bounds_ptr cdef np.ndarray components cdef np.ndarray core_distance_arr @@ -359,49 +349,49 @@ cdef class BoruvkaAlgorithm(object): self.idx_array = self.tree.idx_array self.node_data = self.tree.node_data - self.bounds = ( ( + self.bounds = ( ( self.bounds_arr.data)) - self.component_of_point = ( ( - self.component_of_point_arr.data)) - self.component_of_node = ( ( - self.component_of_node_arr.data)) - self.candidate_neighbor = ( ( - self.candidate_neighbor_arr.data)) - self.candidate_point = ( ( - self.candidate_point_arr.data)) - self.candidate_distance = ( ( - self.candidate_distance_arr.data)) + self.component_of_point = ( ( + self.component_of_point_arr.data)) + self.component_of_node = ( ( + self.component_of_node_arr.data)) + self.candidate_neighbor = ( ( + self.candidate_neighbor_arr.data)) + self.candidate_point = ( ( + self.candidate_point_arr.data)) + self.candidate_distance = ( ( + self.candidate_distance_arr.data)) if not self.is_KDTree: # Compute centroids for BallTree self._centroid_distances_arr = self.dist.pairwise(self.tree.node_bounds[0]) self.centroid_distances = ( - ( - + self._centroid_distances_arr.data)) self._initialize_components() self._compute_bounds() # Set up fast pointer access to arrays - self.component_of_point_ptr = &self.component_of_point[0] - self.component_of_node_ptr = &self.component_of_node[0] - self.candidate_distance_ptr = &self.candidate_distance[0] - self.candidate_neighbor_ptr = &self.candidate_neighbor[0] - self.candidate_point_ptr = &self.candidate_point[0] - self.core_distance_ptr = &self.core_distance[0] - self.bounds_ptr = &self.bounds[0] + self.component_of_point_ptr = &self.component_of_point[0] + self.component_of_node_ptr = &self.component_of_node[0] + self.candidate_distance_ptr = &self.candidate_distance[0] + self.candidate_neighbor_ptr = &self.candidate_neighbor[0] + self.candidate_point_ptr = &self.candidate_point[0] + 
self.core_distance_ptr = &self.core_distance[0] + self.bounds_ptr = &self.bounds[0] cdef _compute_bounds(self): """Initialize core distances""" - cdef np.intp_t n - cdef np.intp_t i - cdef np.intp_t m + cdef intp_t n + cdef intp_t i + cdef intp_t m - cdef np.ndarray[np.double_t, ndim=2] knn_dist - cdef np.ndarray[np.intp_t, ndim=2] knn_indices + cdef np.ndarray[float64_t, ndim=2] knn_dist + cdef np.ndarray[intp_t, ndim=2] knn_indices # A shortcut: if we have a lot of points then we can split the points # into four piles and query them in parallel. On multicore systems @@ -430,8 +420,8 @@ cdef class BoruvkaAlgorithm(object): breadth_first=True) self.core_distance_arr = knn_dist[:, self.min_samples - 1].copy() - self.core_distance = ( ( - self.core_distance_arr.data)) + self.core_distance = ( ( + self.core_distance_arr.data)) if self.is_KDTree: @@ -461,13 +451,13 @@ cdef class BoruvkaAlgorithm(object): self.update_components() for n in range(self.num_nodes): - self.bounds_arr[n] = DBL_MAX + self.bounds_arr[n] = DBL_MAX cdef _initialize_components(self): """Initialize components of the min spanning tree (eventually there is only one component; initially each point is its own component)""" - cdef np.intp_t n + cdef intp_t n for n in range(self.num_points): self.component_of_point[n] = n @@ -484,18 +474,18 @@ cdef class BoruvkaAlgorithm(object): edges to the min spanning tree and recomputing components via union find.""" - cdef np.intp_t source - cdef np.intp_t sink - cdef np.intp_t c - cdef np.intp_t component - cdef np.intp_t n - cdef np.intp_t i - cdef np.intp_t p - cdef np.intp_t current_component - cdef np.intp_t current_source_component - cdef np.intp_t current_sink_component - cdef np.intp_t child1 - cdef np.intp_t child2 + cdef intp_t source + cdef intp_t sink + cdef intp_t c + cdef intp_t component + cdef intp_t n + cdef intp_t i + cdef intp_t p + cdef intp_t current_component + cdef intp_t current_source_component + cdef intp_t current_sink_component + cdef intp_t child1 + cdef intp_t child2 cdef NodeData_t node_info @@ -588,34 +578,34 @@ cdef class BoruvkaAlgorithm(object): if self.components.shape[0] == last_num_components: # Reset bounds for n in range(self.num_nodes): - self.bounds_arr[n] = DBL_MAX + self.bounds_arr[n] = DBL_MAX else: self.components = self.component_union_find.components() for n in range(self.num_nodes): - self.bounds_arr[n] = DBL_MAX + self.bounds_arr[n] = DBL_MAX return self.components.shape[0] - cdef int dual_tree_traversal(self, np.intp_t node1, - np.intp_t node2) nogil except -1: + cdef int dual_tree_traversal(self, intp_t node1, + intp_t node2) nogil except -1: """Perform a dual tree traversal, pruning wherever possible, to find the nearest neighbor not in the same component for each component. 
This is akin to a standard dual tree NN search, but we also prune whenever all points in query and reference nodes are in the same component.""" - cdef np.intp_t[::1] point_indices1, point_indices2 + cdef intp_t[::1] point_indices1, point_indices2 - cdef np.intp_t i - cdef np.intp_t j + cdef intp_t i + cdef intp_t j - cdef np.intp_t p - cdef np.intp_t q + cdef intp_t p + cdef intp_t q - cdef np.intp_t parent - cdef np.intp_t child1 - cdef np.intp_t child2 + cdef intp_t parent + cdef intp_t child1 + cdef intp_t child2 cdef double node_dist @@ -625,25 +615,25 @@ cdef class BoruvkaAlgorithm(object): cdef NodeData_t left_info cdef NodeData_t right_info - cdef np.intp_t component1 - cdef np.intp_t component2 + cdef intp_t component1 + cdef intp_t component2 - cdef np.double_t *raw_data = ( &self._raw_data[0, 0]) - cdef np.double_t d + cdef float64_t *raw_data = ( &self._raw_data[0, 0]) + cdef float64_t d - cdef np.double_t mr_dist - cdef np.double_t _radius + cdef float64_t mr_dist + cdef float64_t _radius - cdef np.double_t new_bound - cdef np.double_t new_upper_bound - cdef np.double_t new_lower_bound - cdef np.double_t bound_max - cdef np.double_t bound_min + cdef float64_t new_bound + cdef float64_t new_upper_bound + cdef float64_t new_lower_bound + cdef float64_t bound_max + cdef float64_t bound_min - cdef np.intp_t left - cdef np.intp_t right - cdef np.double_t left_dist - cdef np.double_t right_dist + cdef intp_t left + cdef intp_t right + cdef float64_t left_dist + cdef float64_t right_dist # Compute the distance between the query and reference nodes if self.is_KDTree: @@ -874,8 +864,8 @@ cdef class BoruvkaAlgorithm(object): """Compute the minimum spanning tree of the data held by the tree passed in at construction""" - cdef np.intp_t num_components - cdef np.intp_t num_nodes + cdef intp_t num_components + cdef intp_t num_nodes num_components = self.tree.data.shape[0] num_nodes = self.tree.node_data.shape[0] diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 1c62996ebb229..e107489c6256d 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -541,6 +541,7 @@ class HDBSCAN(ClusterMixin, BaseEstimator): store_centers : str, default=None Which, if any, cluster centers to compute and store. The options are: + - `None` which does not compute nor store any centers. - `"centroid"` which calculates the center by taking the weighted average of their positions. 
Note that the algorithm uses the From 44b1463411052c1fd520603d11f62376bf0d7094 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 26 Sep 2023 09:17:20 -0400 Subject: [PATCH 128/160] Included boruvka in build --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 1ba7c10321fa1..0ee397da3f624 100755 --- a/setup.py +++ b/setup.py @@ -209,6 +209,7 @@ def check_package_status(package, min_version): {"sources": ["_k_means_minibatch.pyx"], "include_np": True}, ], "cluster._hdbscan": [ + {"sources": ["_boruvka.pyx"], "include_np": True}, {"sources": ["_linkage.pyx"], "include_np": True}, {"sources": ["_reachability.pyx"], "include_np": True}, {"sources": ["_tree.pyx"], "include_np": True}, From 177227bd1c7e2631e80efda2f4603a4e7bcec852 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 26 Sep 2023 09:17:32 -0400 Subject: [PATCH 129/160] Added int8_t type for boruvka --- sklearn/utils/_typedefs.pxd | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/utils/_typedefs.pxd b/sklearn/utils/_typedefs.pxd index 3ffe5b3b41098..d568d49c75f28 100644 --- a/sklearn/utils/_typedefs.pxd +++ b/sklearn/utils/_typedefs.pxd @@ -14,6 +14,7 @@ # TODO: Stop defining custom types locally or globally like DTYPE_t and friends and # use these consistently throughout the codebase. # NOTE: Extend this list as needed when converting more cython extensions. +ctypedef char int8_t ctypedef unsigned char uint8_t ctypedef unsigned int uint32_t ctypedef unsigned long long uint64_t From 2835e91f92ce453bd831858da42e849528c5f910 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 26 Sep 2023 09:18:12 -0400 Subject: [PATCH 130/160] Iter on boruvka, imported cnp --- sklearn/cluster/_hdbscan/_boruvka.pyx | 97 +++++++++++++++------------ 1 file changed, 55 insertions(+), 42 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_boruvka.pyx b/sklearn/cluster/_hdbscan/_boruvka.pyx index 4dfce8bfa6532..3453c6f2d2e4b 100644 --- a/sklearn/cluster/_hdbscan/_boruvka.pyx +++ b/sklearn/cluster/_hdbscan/_boruvka.pyx @@ -52,14 +52,14 @@ import numpy as np -cimport numpy as np +cimport numpy as cnp from libc.float cimport DBL_MAX from libc.math cimport fabs, pow from sklearn.neighbors import BallTree, KDTree from ...metrics._dist_metrics cimport DistanceMetric -from ...utils._typedefs cimport intp_t, float64_t, int64_t, uint8_t +from ...utils._typedefs cimport intp_t, float64_t, int64_t, uint8_t, int8_t from joblib import Parallel, delayed @@ -178,7 +178,7 @@ cdef class BoruvkaUnionFind(object): cdef intp_t[::1] _parent cdef uint8_t[::1] _rank - cdef np.ndarray is_component + cdef cnp.ndarray is_component def __init__(self, size): self._parent = np.arange(size, dtype=np.intp) @@ -220,7 +220,7 @@ cdef class BoruvkaUnionFind(object): x = x_parent x_parent = x_grandparent - cdef ndarray[intp_t, ndim=1] components(self): + cdef cnp.ndarray[intp_t, ndim=1] components(self): """Return an array of all component roots/identifiers""" return self.is_component.nonzero()[0] @@ -228,7 +228,7 @@ cdef class BoruvkaUnionFind(object): def _core_dist_query(tree, data, min_samples): return tree.query(data, k=min_samples, dualtree=True, breadth_first=True) -cdef class BoruvkaAlgorithm(object): +cdef class BoruvkaAlgorithm: """A Dual Tree Boruvka Algorithm implemented for the sklearn KDTree space tree implementation. 
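For illustration, the component bookkeeping done by `BoruvkaUnionFind` (reworked to typed memoryviews in the hunks above) is a standard disjoint-set structure: union by rank, with path halving inside `find`. Below is a plain-Python sketch of the same scheme, assuming nothing beyond what the hunks show; it is not part of any patch in this series:

import numpy as np

class UnionFind:
    # Minimal sketch of union-by-rank with path halving, mirroring BoruvkaUnionFind.
    def __init__(self, size):
        self.parent = np.arange(size, dtype=np.intp)
        self.rank = np.zeros(size, dtype=np.uint8)

    def find(self, x):
        # Path halving: repoint x at its grandparent while walking up to the root.
        while self.parent[x] != x:
            self.parent[x] = self.parent[self.parent[x]]
            x = self.parent[x]
        return x

    def union(self, x, y):
        x_root, y_root = self.find(x), self.find(y)
        if x_root == y_root:
            return
        # Attach the shallower tree under the deeper one (union by rank).
        if self.rank[x_root] < self.rank[y_root]:
            x_root, y_root = y_root, x_root
        self.parent[y_root] = x_root
        if self.rank[x_root] == self.rank[y_root]:
            self.rank[x_root] += 1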
@@ -270,8 +270,8 @@ cdef class BoruvkaAlgorithm(object): cdef object tree cdef object core_dist_tree cdef DistanceMetric dist - cdef np.ndarray _data - cdef readonly const float64_t[:, ::1] _raw_data + cdef cnp.ndarray _data + cdef readonly const float64_t[:, ::1] raw_data cdef float64_t[:, :, ::1] node_bounds cdef float64_t alpha cdef int8_t approx_min_span_tree @@ -293,7 +293,7 @@ cdef class BoruvkaAlgorithm(object): cdef public intp_t[::1] idx_array cdef public NodeData_t[::1] node_data cdef BoruvkaUnionFind component_union_find - cdef np.ndarray edges + cdef cnp.ndarray edges cdef intp_t num_edges cdef intp_t *component_of_point_ptr @@ -304,15 +304,15 @@ cdef class BoruvkaAlgorithm(object): cdef float64_t *core_distance_ptr cdef float64_t *bounds_ptr - cdef np.ndarray components - cdef np.ndarray core_distance_arr - cdef np.ndarray bounds_arr - cdef np.ndarray _centroid_distances_arr - cdef np.ndarray component_of_point_arr - cdef np.ndarray component_of_node_arr - cdef np.ndarray candidate_point_arr - cdef np.ndarray candidate_neighbor_arr - cdef np.ndarray candidate_distance_arr + cdef cnp.ndarray components + cdef cnp.ndarray core_distance_arr + cdef cnp.ndarray bounds_arr + cdef cnp.ndarray _centroid_distances_arr + cdef cnp.ndarray component_of_point_arr + cdef cnp.ndarray component_of_node_arr + cdef cnp.ndarray candidate_point_arr + cdef cnp.ndarray candidate_neighbor_arr + cdef cnp.ndarray candidate_distance_arr def __init__(self, tree, min_samples=5, metric='euclidean', leaf_size=20, alpha=1.0, approx_min_span_tree=False, n_jobs=4, **kwargs): @@ -321,7 +321,7 @@ cdef class BoruvkaAlgorithm(object): self.tree = tree self.is_KDTree = isinstance(tree, KDTree) self._data = np.array(self.tree.data) - self._raw_data = self.tree.data + self.raw_data = self.tree.data self.node_bounds = self.tree.node_bounds self.alpha = alpha self.approx_min_span_tree = approx_min_span_tree @@ -390,8 +390,8 @@ cdef class BoruvkaAlgorithm(object): cdef intp_t i cdef intp_t m - cdef np.ndarray[float64_t, ndim=2] knn_dist - cdef np.ndarray[intp_t, ndim=2] knn_indices + cdef cnp.ndarray[float64_t, ndim=2] knn_dist + cdef cnp.ndarray[intp_t, ndim=2] knn_indices # A shortcut: if we have a lot of points then we can split the points # into four piles and query them in parallel. 
On multicore systems @@ -618,7 +618,6 @@ cdef class BoruvkaAlgorithm(object): cdef intp_t component1 cdef intp_t component2 - cdef float64_t *raw_data = ( &self._raw_data[0, 0]) cdef float64_t d cdef float64_t mr_dist @@ -714,36 +713,50 @@ cdef class BoruvkaAlgorithm(object): if component1 != component2: if self.is_KDTree: - d = self.dist.rdist(&raw_data[self.num_features * p], - &raw_data[self.num_features * q], - self.num_features) + d = self.dist.rdist( + &self.raw_data[self.num_features * p][0], + &self.raw_data[self.num_features * q][0], + self.num_features + ) else: - d = self.dist.dist(&raw_data[self.num_features * p], - &raw_data[self.num_features * q], - self.num_features) * self.alpha + d = self.dist.dist( + &self.raw_data[self.num_features * p][0], + &self.raw_data[self.num_features * q][0], + self.num_features + ) * self.alpha if self.alpha != 1.0: - mr_dist = max(d / self.alpha, - self.core_distance_ptr[p], - self.core_distance_ptr[q]) + mr_dist = max( + d / self.alpha, + self.core_distance_ptr[p], + self.core_distance_ptr[q] + ) else: - mr_dist = max(d, self.core_distance_ptr[p], - self.core_distance_ptr[q]) + mr_dist = max( + d, self.core_distance_ptr[p], + self.core_distance_ptr[q] + ) if mr_dist < self.candidate_distance_ptr[component1]: self.candidate_distance_ptr[component1] = mr_dist self.candidate_neighbor_ptr[component1] = q self.candidate_point_ptr[component1] = p - new_upper_bound = max(new_upper_bound, - self.candidate_distance_ptr[component1]) - new_lower_bound = min(new_lower_bound, - self.candidate_distance_ptr[component1]) + new_upper_bound = max( + new_upper_bound, + self.candidate_distance_ptr[component1] + ) + new_lower_bound = min( + new_lower_bound, + self.candidate_distance_ptr[component1] + ) # Compute new bounds for the query node, and # then propagate the results of that computation # up the tree. _radius = self.dist._dist_to_rdist(node1_info.radius) if self.is_KDTree else node1_info.radius - new_bound = min(new_upper_bound, - new_lower_bound + 2 * _radius) + new_bound = min( + new_upper_bound, + new_lower_bound + 2 * _radius + ) if new_bound < self.bounds_ptr[node1]: self.bounds_ptr[node1] = new_bound @@ -763,10 +776,10 @@ cdef class BoruvkaAlgorithm(object): if self.is_KDTree: new_bound = bound_max else: - bound_min = min(self.bounds_ptr[left] + 2 * - (parent_info.radius - left_info.radius), - self.bounds_ptr[right] + 2 * - (parent_info.radius - right_info.radius)) + bound_min = min( + self.bounds_ptr[left] + 2 * (parent_info.radius - left_info.radius), + self.bounds_ptr[right] + 2 * (parent_info.radius - right_info.radius) + ) if bound_min > 0: new_bound = min(bound_max, bound_min) From bb430546d5da033ff4ec43567e093eb9c25ffb40 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 26 Sep 2023 10:01:49 -0400 Subject: [PATCH 131/160] Formatting and declaration grouping --- sklearn/cluster/_hdbscan/_boruvka.pyx | 179 +++++++++++++------------- 1 file changed, 88 insertions(+), 91 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_boruvka.pyx b/sklearn/cluster/_hdbscan/_boruvka.pyx index 3453c6f2d2e4b..7ff4519ed6d9c 100644 --- a/sklearn/cluster/_hdbscan/_boruvka.pyx +++ b/sklearn/cluster/_hdbscan/_boruvka.pyx @@ -228,6 +228,7 @@ cdef class BoruvkaUnionFind(object): def _core_dist_query(tree, data, min_samples): return tree.query(data, k=min_samples, dualtree=True, breadth_first=True) + cdef class BoruvkaAlgorithm: """A Dual Tree Boruvka Algorithm implemented for the sklearn KDTree space tree implementation. 
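The `_compute_bounds` hunks above keep the comment about splitting the data into four piles and querying them in parallel through the module-level `_core_dist_query` helper. A sketch of what that joblib dispatch can look like follows; the 16384-point cut-off and the even split are illustrative assumptions, not values taken from this diff:

import numpy as np
from joblib import Parallel, delayed

def _core_dist_query(tree, data, min_samples):
    # Same helper as above: a dual-tree k-NN query against the space tree.
    return tree.query(data, k=min_samples, dualtree=True, breadth_first=True)

def core_distances(tree, data, min_samples, n_jobs=4):
    if data.shape[0] > 16384 and n_jobs > 1:
        # Split the data into n_jobs roughly equal piles and query them in parallel.
        split_size = data.shape[0] // n_jobs
        chunks = [data[i * split_size:(i + 1) * split_size] for i in range(n_jobs - 1)]
        chunks.append(data[(n_jobs - 1) * split_size:])
        results = Parallel(n_jobs=n_jobs)(
            delayed(_core_dist_query)(tree, chunk, min_samples) for chunk in chunks
        )
        knn_dist = np.vstack([dist for dist, _ in results])
    else:
        knn_dist, _ = tree.query(data, k=min_samples, dualtree=True, breadth_first=True)
    # The core distance of a point is the distance to its min_samples-th neighbor.
    return knn_dist[:, min_samples - 1]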
@@ -474,20 +475,12 @@ cdef class BoruvkaAlgorithm: edges to the min spanning tree and recomputing components via union find.""" - cdef intp_t source - cdef intp_t sink - cdef intp_t c - cdef intp_t component - cdef intp_t n - cdef intp_t i - cdef intp_t p - cdef intp_t current_component - cdef intp_t current_source_component - cdef intp_t current_sink_component - cdef intp_t child1 - cdef intp_t child2 - - cdef NodeData_t node_info + cdef: + intp_t sink, source, c, component, n, i, p + intp_t current_component, current_source_component + intp_t current_sink_component + intp_t child1, child2 + NodeData_t node_info # For each component there should be a: # - candidate point (a point in the component) @@ -597,62 +590,46 @@ cdef class BoruvkaAlgorithm: cdef intp_t[::1] point_indices1, point_indices2 - cdef intp_t i - cdef intp_t j - - cdef intp_t p - cdef intp_t q - - cdef intp_t parent - cdef intp_t child1 - cdef intp_t child2 - - cdef double node_dist + cdef intp_t i, j, p, q + cdef intp_t parent, child1, child2 + cdef intp_t component1, component2 cdef NodeData_t node1_info = self.node_data[node1] cdef NodeData_t node2_info = self.node_data[node2] - cdef NodeData_t parent_info - cdef NodeData_t left_info - cdef NodeData_t right_info - - cdef intp_t component1 - cdef intp_t component2 - - cdef float64_t d + cdef NodeData_t parent_info, left_info, right_info - cdef float64_t mr_dist - cdef float64_t _radius - cdef float64_t new_bound - cdef float64_t new_upper_bound - cdef float64_t new_lower_bound - cdef float64_t bound_max - cdef float64_t bound_min + cdef float64_t d, mr_dist, _radius, node_dist + cdef float64_t new_bound, new_upper_bound, new_lower_bound + cdef float64_t bound_max, bound_min - cdef intp_t left - cdef intp_t right - cdef float64_t left_dist - cdef float64_t right_dist + cdef intp_t left, right, left_dist, right_dist # Compute the distance between the query and reference nodes if self.is_KDTree: - node_dist = kdtree_min_rdist_dual(self.dist, - node1, node2, self.node_bounds, - self.num_features) + node_dist = kdtree_min_rdist_dual( + self.dist, + node1, node2, self.node_bounds, + self.num_features + ) else: #BallTree - node_dist = balltree_min_dist_dual(node1_info.radius, - node2_info.radius, - node1, node2, - self.centroid_distances) + node_dist = balltree_min_dist_dual( + node1_info.radius, + node2_info.radius, + node1, node2, + self.centroid_distances + ) # If the distance between the nodes is less than the current bound for # the query and the nodes are not in the same component continue; # otherwise we get to prune this branch and return early. 
if node_dist < self.bounds_ptr[node1]: - if (self.component_of_node_ptr[node1] == + if ( + self.component_of_node_ptr[node1] == self.component_of_node_ptr[node2] and - self.component_of_node_ptr[node1] >= 0): + self.component_of_node_ptr[node1] >= 0 + ): return 0 else: return 0 @@ -688,10 +665,12 @@ cdef class BoruvkaAlgorithm: new_upper_bound = 0.0 new_lower_bound = DBL_MAX - point_indices1 = self.idx_array[node1_info.idx_start: - node1_info.idx_end] - point_indices2 = self.idx_array[node2_info.idx_start: - node2_info.idx_end] + point_indices1 = self.idx_array[ + node1_info.idx_start:node1_info.idx_end + ] + point_indices2 = self.idx_array[ + node2_info.idx_start:node2_info.idx_end + ] for i in range(point_indices1.shape[0]): @@ -770,8 +749,10 @@ cdef class BoruvkaAlgorithm: left_info = self.node_data[left] right_info = self.node_data[right] - bound_max = max(self.bounds_ptr[left], - self.bounds_ptr[right]) + bound_max = max( + self.bounds_ptr[left], + self.bounds_ptr[right] + ) if self.is_KDTree: new_bound = bound_max @@ -805,25 +786,33 @@ cdef class BoruvkaAlgorithm: right = 2 * node2 + 2 if self.is_KDTree: - left_dist = kdtree_min_rdist_dual(self.dist, - node1, left, - self.node_bounds, - self.num_features) - right_dist = kdtree_min_rdist_dual(self.dist, - node1, right, - self.node_bounds, - self.num_features) + left_dist = kdtree_min_rdist_dual( + self.dist, + node1, left, + self.node_bounds, + self.num_features + ) + right_dist = kdtree_min_rdist_dual( + self.dist, + node1, right, + self.node_bounds, + self.num_features + ) else: node2_info = self.node_data[left] - left_dist = balltree_min_dist_dual(node1_info.radius, - node2_info.radius, - node1, left, - self.centroid_distances) + left_dist = balltree_min_dist_dual( + node1_info.radius, + node2_info.radius, + node1, left, + self.centroid_distances + ) node2_info = self.node_data[right] - right_dist = balltree_min_dist_dual(node1_info.radius, - node2_info.radius, - node1, right, - self.centroid_distances) + right_dist = balltree_min_dist_dual( + node1_info.radius, + node2_info.radius, + node1, right, + self.centroid_distances + ) if left_dist < right_dist: self.dual_tree_traversal(node1, left) @@ -843,25 +832,33 @@ cdef class BoruvkaAlgorithm: left = 2 * node1 + 1 right = 2 * node1 + 2 if self.is_KDTree: - left_dist = kdtree_min_rdist_dual(self.dist, - left, node2, - self.node_bounds, - self.num_features) - right_dist = kdtree_min_rdist_dual(self.dist, - right, node2, - self.node_bounds, - self.num_features) + left_dist = kdtree_min_rdist_dual( + self.dist, + left, node2, + self.node_bounds, + self.num_features + ) + right_dist = kdtree_min_rdist_dual( + self.dist, + right, node2, + self.node_bounds, + self.num_features + ) else: node1_info = self.node_data[left] - left_dist = balltree_min_dist_dual(node1_info.radius, - node2_info.radius, - left, node2, - self.centroid_distances) + left_dist = balltree_min_dist_dual( + node1_info.radius, + node2_info.radius, + left, node2, + self.centroid_distances + ) node1_info = self.node_data[right] - right_dist = balltree_min_dist_dual(node1_info.radius, - node2_info.radius, - right, node2, - self.centroid_distances) + right_dist = balltree_min_dist_dual( + node1_info.radius, + node2_info.radius, + right, node2, + self.centroid_distances + ) if left_dist < right_dist: From 3fa0a9bb204dc7ebad5b4c5aa89dde78b8cf9ca5 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 26 Sep 2023 10:08:49 -0400 Subject: [PATCH 132/160] Ndarray->memview refactor --- sklearn/cluster/_hdbscan/_boruvka.pyx | 67 
+++++---------------------- 1 file changed, 11 insertions(+), 56 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_boruvka.pyx b/sklearn/cluster/_hdbscan/_boruvka.pyx index 7ff4519ed6d9c..f3735806c3f8a 100644 --- a/sklearn/cluster/_hdbscan/_boruvka.pyx +++ b/sklearn/cluster/_hdbscan/_boruvka.pyx @@ -297,23 +297,7 @@ cdef class BoruvkaAlgorithm: cdef cnp.ndarray edges cdef intp_t num_edges - cdef intp_t *component_of_point_ptr - cdef intp_t *component_of_node_ptr - cdef float64_t *candidate_distance_ptr - cdef intp_t *candidate_neighbor_ptr - cdef intp_t *candidate_point_ptr - cdef float64_t *core_distance_ptr - cdef float64_t *bounds_ptr - cdef cnp.ndarray components - cdef cnp.ndarray core_distance_arr - cdef cnp.ndarray bounds_arr - cdef cnp.ndarray _centroid_distances_arr - cdef cnp.ndarray component_of_point_arr - cdef cnp.ndarray component_of_node_arr - cdef cnp.ndarray candidate_point_arr - cdef cnp.ndarray candidate_neighbor_arr - cdef cnp.ndarray candidate_distance_arr def __init__(self, tree, min_samples=5, metric='euclidean', leaf_size=20, alpha=1.0, approx_min_span_tree=False, n_jobs=4, **kwargs): @@ -335,12 +319,12 @@ cdef class BoruvkaAlgorithm: self.dist = DistanceMetric.get_metric(metric, **kwargs) self.components = np.arange(self.num_points) - self.bounds_arr = np.empty(self.num_nodes, np.double) - self.component_of_point_arr = np.empty(self.num_points, dtype=np.intp) - self.component_of_node_arr = np.empty(self.num_nodes, dtype=np.intp) - self.candidate_neighbor_arr = np.empty(self.num_points, dtype=np.intp) - self.candidate_point_arr = np.empty(self.num_points, dtype=np.intp) - self.candidate_distance_arr = np.empty(self.num_points, + self.bounds = np.empty(self.num_nodes, np.double) + self.component_of_point = np.empty(self.num_points, dtype=np.intp) + self.component_of_node = np.empty(self.num_nodes, dtype=np.intp) + self.candidate_neighbor = np.empty(self.num_points, dtype=np.intp) + self.candidate_point = np.empty(self.num_points, dtype=np.intp) + self.candidate_distance = np.empty(self.num_points, dtype=np.double) self.component_union_find = BoruvkaUnionFind(self.num_points) @@ -350,40 +334,13 @@ cdef class BoruvkaAlgorithm: self.idx_array = self.tree.idx_array self.node_data = self.tree.node_data - self.bounds = ( ( - self.bounds_arr.data)) - self.component_of_point = ( ( - self.component_of_point_arr.data)) - self.component_of_node = ( ( - self.component_of_node_arr.data)) - self.candidate_neighbor = ( ( - self.candidate_neighbor_arr.data)) - self.candidate_point = ( ( - self.candidate_point_arr.data)) - self.candidate_distance = ( ( - self.candidate_distance_arr.data)) - if not self.is_KDTree: # Compute centroids for BallTree - self._centroid_distances_arr = self.dist.pairwise(self.tree.node_bounds[0]) - self.centroid_distances = ( - ( - - self._centroid_distances_arr.data)) + self.centroid_distances = self.dist.pairwise(self.tree.node_bounds[0]) self._initialize_components() self._compute_bounds() - # Set up fast pointer access to arrays - self.component_of_point_ptr = &self.component_of_point[0] - self.component_of_node_ptr = &self.component_of_node[0] - self.candidate_distance_ptr = &self.candidate_distance[0] - self.candidate_neighbor_ptr = &self.candidate_neighbor[0] - self.candidate_point_ptr = &self.candidate_point[0] - self.core_distance_ptr = &self.core_distance[0] - self.bounds_ptr = &self.bounds[0] - cdef _compute_bounds(self): """Initialize core distances""" @@ -420,9 +377,7 @@ cdef class BoruvkaAlgorithm: dualtree=True, breadth_first=True) - 
self.core_distance_arr = knn_dist[:, self.min_samples - 1].copy() - self.core_distance = ( ( - self.core_distance_arr.data)) + self.core_distance = knn_dist[:, self.min_samples - 1].copy() if self.is_KDTree: @@ -452,7 +407,7 @@ cdef class BoruvkaAlgorithm: self.update_components() for n in range(self.num_nodes): - self.bounds_arr[n] = DBL_MAX + self.bounds[n] = DBL_MAX cdef _initialize_components(self): """Initialize components of the min spanning tree (eventually there @@ -571,12 +526,12 @@ cdef class BoruvkaAlgorithm: if self.components.shape[0] == last_num_components: # Reset bounds for n in range(self.num_nodes): - self.bounds_arr[n] = DBL_MAX + self.bounds[n] = DBL_MAX else: self.components = self.component_union_find.components() for n in range(self.num_nodes): - self.bounds_arr[n] = DBL_MAX + self.bounds[n] = DBL_MAX return self.components.shape[0] From d5eba10a14b102d92a68f31b2fd6262d2328c6d3 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 26 Sep 2023 10:25:16 -0400 Subject: [PATCH 133/160] Updated distancemetric typing --- sklearn/cluster/_hdbscan/_boruvka.pyx | 67 ++++++++++++++------------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_boruvka.pyx b/sklearn/cluster/_hdbscan/_boruvka.pyx index f3735806c3f8a..cf5f656eec222 100644 --- a/sklearn/cluster/_hdbscan/_boruvka.pyx +++ b/sklearn/cluster/_hdbscan/_boruvka.pyx @@ -58,7 +58,7 @@ from libc.math cimport fabs, pow from sklearn.neighbors import BallTree, KDTree -from ...metrics._dist_metrics cimport DistanceMetric +from ...metrics._dist_metrics cimport DistanceMetric, DistanceMetric64 from ...utils._typedefs cimport intp_t, float64_t, int64_t, uint8_t, int8_t from joblib import Parallel, delayed @@ -93,7 +93,7 @@ cdef inline float64_t balltree_min_dist_dual( # Define a function giving the minimum distance between two # nodes of a kd-tree cdef inline float64_t kdtree_min_dist_dual( - DistanceMetric metric, + DistanceMetric64 metric, intp_t node1, intp_t node2, float64_t[:, :, ::1] node_bounds, @@ -127,7 +127,7 @@ cdef inline float64_t kdtree_min_dist_dual( # implementation. This allows us to release the GIL over # larger sections of code cdef inline float64_t kdtree_min_rdist_dual( - DistanceMetric metric, + DistanceMetric64 metric, intp_t node1, intp_t node2, float64_t[:, :, ::1] node_bounds, @@ -270,7 +270,7 @@ cdef class BoruvkaAlgorithm: cdef object tree cdef object core_dist_tree - cdef DistanceMetric dist + cdef DistanceMetric64 dist cdef cnp.ndarray _data cdef readonly const float64_t[:, ::1] raw_data cdef float64_t[:, :, ::1] node_bounds @@ -558,7 +558,8 @@ cdef class BoruvkaAlgorithm: cdef float64_t new_bound, new_upper_bound, new_lower_bound cdef float64_t bound_max, bound_min - cdef intp_t left, right, left_dist, right_dist + cdef intp_t left, right + cdef float64_t left_dist, right_dist # Compute the distance between the query and reference nodes if self.is_KDTree: @@ -579,11 +580,11 @@ cdef class BoruvkaAlgorithm: # If the distance between the nodes is less than the current bound for # the query and the nodes are not in the same component continue; # otherwise we get to prune this branch and return early. 
- if node_dist < self.bounds_ptr[node1]: + if node_dist < self.bounds[node1]: if ( - self.component_of_node_ptr[node1] == - self.component_of_node_ptr[node2] and - self.component_of_node_ptr[node1] >= 0 + self.component_of_node[node1] == + self.component_of_node[node2] and + self.component_of_node[node1] >= 0 ): return 0 else: @@ -630,19 +631,19 @@ cdef class BoruvkaAlgorithm: for i in range(point_indices1.shape[0]): p = point_indices1[i] - component1 = self.component_of_point_ptr[p] + component1 = self.component_of_point[p] - if (self.core_distance_ptr[p] > - self.candidate_distance_ptr[component1]): + if (self.core_distance[p] > + self.candidate_distance[component1]): continue for j in range(point_indices2.shape[0]): q = point_indices2[j] - component2 = self.component_of_point_ptr[q] + component2 = self.component_of_point[q] - if (self.core_distance_ptr[q] > - self.candidate_distance_ptr[component1]): + if (self.core_distance[q] > + self.candidate_distance[component1]): continue if component1 != component2: @@ -661,26 +662,26 @@ cdef class BoruvkaAlgorithm: if self.alpha != 1.0: mr_dist = max( d / self.alpha, - self.core_distance_ptr[p], - self.core_distance_ptr[q] + self.core_distance[p], + self.core_distance[q] ) else: mr_dist = max( - d, self.core_distance_ptr[p], - self.core_distance_ptr[q] + d, self.core_distance[p], + self.core_distance[q] ) - if mr_dist < self.candidate_distance_ptr[component1]: - self.candidate_distance_ptr[component1] = mr_dist - self.candidate_neighbor_ptr[component1] = q - self.candidate_point_ptr[component1] = p + if mr_dist < self.candidate_distance[component1]: + self.candidate_distance[component1] = mr_dist + self.candidate_neighbor[component1] = q + self.candidate_point[component1] = p new_upper_bound = max( new_upper_bound, - self.candidate_distance_ptr[component1] + self.candidate_distance[component1] ) new_lower_bound = min( new_lower_bound, - self.candidate_distance_ptr[component1] + self.candidate_distance[component1] ) # Compute new bounds for the query node, and @@ -691,8 +692,8 @@ cdef class BoruvkaAlgorithm: new_upper_bound, new_lower_bound + 2 * _radius ) - if new_bound < self.bounds_ptr[node1]: - self.bounds_ptr[node1] = new_bound + if new_bound < self.bounds[node1]: + self.bounds[node1] = new_bound # Propagate bounds up the tree while node1 > 0: @@ -705,24 +706,24 @@ cdef class BoruvkaAlgorithm: right_info = self.node_data[right] bound_max = max( - self.bounds_ptr[left], - self.bounds_ptr[right] + self.bounds[left], + self.bounds[right] ) if self.is_KDTree: new_bound = bound_max else: bound_min = min( - self.bounds_ptr[left] + 2 * (parent_info.radius - left_info.radius), - self.bounds_ptr[right] + 2 * (parent_info.radius - right_info.radius) + self.bounds[left] + 2 * (parent_info.radius - left_info.radius), + self.bounds[right] + 2 * (parent_info.radius - right_info.radius) ) if bound_min > 0: new_bound = min(bound_max, bound_min) else: new_bound = bound_max - if new_bound < self.bounds_ptr[parent]: - self.bounds_ptr[parent] = new_bound + if new_bound < self.bounds[parent]: + self.bounds[parent] = new_bound node1 = parent else: break From 05276bd5fb5ba6c901a1d3a2c85f2ef27586e811 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 26 Sep 2023 10:26:14 -0400 Subject: [PATCH 134/160] Added prototype test --- sklearn/cluster/tests/test_hdbscan.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index 7a2bac12ef057..437333e721696 100644 --- 
a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -38,6 +38,18 @@ OUTLIER_SET = {-1} | {out["label"] for _, out in _OUTLIER_ENCODING.items()} +@pytest.mark.parametrize("tree", ["kd_tree", "ball_tree"]) +def test_hdbscan_boruvka_matches(tree): + hdb_prims = HDBSCAN(tree_algorithm=tree, mst_algorithm="prims").fit(X, y) + hdb_boruvka = HDBSCAN(tree_algorithm=tree, mst_algorithm="boruvka").fit(X, y) + labels_prims = hdb_prims.labels_ + labels_boruvka = hdb_boruvka.labels_ + + similarity = fowlkes_mallows_score(labels_prims, labels_boruvka) + + assert similarity > 0.85 + + @pytest.mark.parametrize("outlier_type", _OUTLIER_ENCODING) def test_outlier_data(outlier_type): """ From b3ac0d109c16cd877b20c5bc55c1cd299718ba66 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 26 Sep 2023 10:27:02 -0400 Subject: [PATCH 135/160] Corrected algo key-word --- sklearn/cluster/tests/test_hdbscan.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index 437333e721696..1d55f28bc4199 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -40,8 +40,8 @@ @pytest.mark.parametrize("tree", ["kd_tree", "ball_tree"]) def test_hdbscan_boruvka_matches(tree): - hdb_prims = HDBSCAN(tree_algorithm=tree, mst_algorithm="prims").fit(X, y) - hdb_boruvka = HDBSCAN(tree_algorithm=tree, mst_algorithm="boruvka").fit(X, y) + hdb_prims = HDBSCAN(algorithm=tree, mst_algorithm="prims").fit(X, y) + hdb_boruvka = HDBSCAN(algorithm=tree, mst_algorithm="boruvka").fit(X, y) labels_prims = hdb_prims.labels_ labels_boruvka = hdb_boruvka.labels_ From 27d593c80a14aaed40b9292b4b484a50d45c98fa Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 26 Sep 2023 11:38:46 -0400 Subject: [PATCH 136/160] Added partial dispatch for boruvka --- sklearn/cluster/_hdbscan/_boruvka.pyx | 13 +++++++--- sklearn/cluster/_hdbscan/hdbscan.py | 36 +++++++++++++++++++++------ sklearn/cluster/tests/test_hdbscan.py | 13 ++++++++++ 3 files changed, 51 insertions(+), 11 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_boruvka.pyx b/sklearn/cluster/_hdbscan/_boruvka.pyx index cf5f656eec222..2a54ea2b747b7 100644 --- a/sklearn/cluster/_hdbscan/_boruvka.pyx +++ b/sklearn/cluster/_hdbscan/_boruvka.pyx @@ -84,7 +84,7 @@ cdef inline float64_t balltree_min_dist_dual( intp_t node1, intp_t node2, float64_t[:, ::1] centroid_dist -) nogil except -1: +) except -1 nogil: cdef float64_t dist_pt = centroid_dist[node1, node2] return max(0, (dist_pt - radius1 - radius2)) @@ -132,7 +132,7 @@ cdef inline float64_t kdtree_min_rdist_dual( intp_t node2, float64_t[:, :, ::1] node_bounds, intp_t num_features -) nogil except -1: +) except -1 nogil: cdef float64_t d, d1, d2, rdist = 0.0 cdef float64_t zero = 0.0 @@ -311,6 +311,7 @@ cdef class BoruvkaAlgorithm: self.alpha = alpha self.approx_min_span_tree = approx_min_span_tree self.n_jobs = n_jobs + self.min_samples = min_samples self.num_points = self.tree.data.shape[0] self.num_features = self.tree.data.shape[1] @@ -377,6 +378,7 @@ cdef class BoruvkaAlgorithm: dualtree=True, breadth_first=True) + print(f"DEBUG *** self.min_samples={self.min_samples}") self.core_distance = knn_dist[:, self.min_samples - 1].copy() @@ -535,8 +537,11 @@ cdef class BoruvkaAlgorithm: return self.components.shape[0] - cdef int dual_tree_traversal(self, intp_t node1, - intp_t node2) nogil except -1: + cdef int dual_tree_traversal( + self, + intp_t node1, + intp_t node2 + ) except -1 
nogil: """Perform a dual tree traversal, pruning wherever possible, to find the nearest neighbor not in the same component for each component. This is akin to a standard dual tree NN search, but we also prune diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index e107489c6256d..d4f8ec87855e4 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -354,6 +354,7 @@ def _hdbscan_boruvka( X, algo, min_samples=5, + alpha=1.0, metric="euclidean", leaf_size=40, n_jobs=None, @@ -369,6 +370,7 @@ def _hdbscan_boruvka( min_samples=min_samples, metric=metric, leaf_size=leaf_size // 3, + alpha=alpha, approx_min_span_tree=True, n_jobs=n_jobs, **metric_params, @@ -504,7 +506,7 @@ class HDBSCAN(ClusterMixin, BaseEstimator): If the `X` passed during `fit` is sparse or `metric` is invalid for both :class:`~sklearn.neighbors.KDTree` and :class:`~sklearn.neighbors.BallTree`, then it resolves to use the - `"brute"` algorithm. + `"brute"` minimum-spanning tree algorithm. .. deprecated:: 1.4 The `'kdtree'` option was deprecated in version 1.4, @@ -514,6 +516,13 @@ class HDBSCAN(ClusterMixin, BaseEstimator): The `'balltree'` option was deprecated in version 1.4, and will be renamed to `'ball_tree'` in 1.6. + mst_algorithm : {"auto", "brute", "prims", "boruvka"}, default="auto" + Exactly which algorithm to use for building the minimum spanning tree; + by default this is set to `"auto"` which switches between `"prims"` and + `"boruvka"` based on a heuristic. + + .. versionadded:: 1.4 + leaf_size : int, default=40 Leaf size for trees responsible for fast nearest neighbour queries when a KDTree or a BallTree are used as core-distance algorithms. A large @@ -671,6 +680,7 @@ class HDBSCAN(ClusterMixin, BaseEstimator): deprecated={"kdtree", "balltree"}, ), ], + "mst_algorithm": [StrOptions({"auto", "brute", "prims", "boruvka"})], "leaf_size": [Interval(Integral, left=1, right=None, closed="left")], "n_jobs": [Integral, None], "cluster_selection_method": [StrOptions({"eom", "leaf"})], @@ -689,6 +699,7 @@ def __init__( metric_params=None, alpha=1.0, algorithm="auto", + mst_algorithm="auto", leaf_size=40, n_jobs=None, cluster_selection_method="eom", @@ -704,6 +715,7 @@ def __init__( self.metric = metric self.metric_params = metric_params self.algorithm = algorithm + self.mst_algorithm = mst_algorithm self.leaf_size = leaf_size self.n_jobs = n_jobs self.cluster_selection_method = cluster_selection_method @@ -796,6 +808,17 @@ def fit(self, X, y=None): f" samples in X ({X.shape[0]})" ) + algos = {self.algorithm, self.mst_algorithm} + if ( + "brute" in algos + and len({"kd_tree", "ball_tree", "prims", "boruvka"}.intersection(algos)) + > 0 + ): + raise ValueError( + "When setting either `algorithm='brute'` or `mst_algorithm='brute'`," + " both keyword arguments must only be set to either 'brute' or 'auto'." 
+ ) + # TODO(1.6): Remove if self.algorithm == "kdtree": warn( @@ -853,13 +876,12 @@ def fit(self, X, y=None): if self.algorithm == "brute": mst_func = _hdbscan_brute kwargs["copy"] = self.copy - elif self.algorithm == "kd_tree": - mst_func = _hdbscan_prims - kwargs["algo"] = "kd_tree" - kwargs["leaf_size"] = self.leaf_size else: - mst_func = _hdbscan_prims - kwargs["algo"] = "ball_tree" + if self.mst_algorithm == "prims": + mst_func = _hdbscan_prims + else: + mst_func = _hdbscan_boruvka + kwargs["algo"] = self.algorithm kwargs["leaf_size"] = self.leaf_size else: if issparse(X) or self.metric not in FAST_METRICS: diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index 1d55f28bc4199..da1b7a45465af 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -50,6 +50,19 @@ def test_hdbscan_boruvka_matches(tree): assert similarity > 0.85 +def test_hdbscan_mst_algorithm_errors(): + msg = "When setting either" + for tree in ["kd_tree", "ball_tree"]: + hdb = HDBSCAN(algorithm=tree, mst_algorithm="brute") + with pytest.raises(ValueError, match=msg): + hdb.fit(X, y) + + for mst_algo in ["prims", "boruvka"]: + hdb = HDBSCAN(algorithm="brute", mst_algorithm=mst_algo) + with pytest.raises(ValueError, match=msg): + hdb.fit(X, y) + + @pytest.mark.parametrize("outlier_type", _OUTLIER_ENCODING) def test_outlier_data(outlier_type): """ From 6a72efa0acb2b864b13075ed2f7d224b958eb556 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 26 Sep 2023 12:00:28 -0400 Subject: [PATCH 137/160] Updated boruvka formatting --- sklearn/cluster/_hdbscan/_boruvka.pyx | 47 +++++++++++---------------- sklearn/cluster/_hdbscan/_linkage.pxd | 8 +++++ sklearn/cluster/_hdbscan/_linkage.pyx | 9 +---- sklearn/cluster/_hdbscan/hdbscan.py | 3 +- 4 files changed, 30 insertions(+), 37 deletions(-) create mode 100644 sklearn/cluster/_hdbscan/_linkage.pxd diff --git a/sklearn/cluster/_hdbscan/_boruvka.pyx b/sklearn/cluster/_hdbscan/_boruvka.pyx index 2a54ea2b747b7..4eba9812b9f04 100644 --- a/sklearn/cluster/_hdbscan/_boruvka.pyx +++ b/sklearn/cluster/_hdbscan/_boruvka.pyx @@ -56,10 +56,12 @@ cimport numpy as cnp from libc.float cimport DBL_MAX from libc.math cimport fabs, pow -from sklearn.neighbors import BallTree, KDTree +from sklearn.neighbors import KDTree from ...metrics._dist_metrics cimport DistanceMetric, DistanceMetric64 -from ...utils._typedefs cimport intp_t, float64_t, int64_t, uint8_t, int8_t +from ...utils._typedefs cimport intp_t, float64_t, uint8_t, int8_t +from ._linkage cimport MST_edge_t +from ._linkage import MST_edge_dtype from joblib import Parallel, delayed @@ -101,7 +103,6 @@ cdef inline float64_t kdtree_min_dist_dual( ) except -1: cdef float64_t d, d1, d2, rdist = 0.0 - cdef float64_t zero = 0.0 cdef intp_t j if metric.p == INF: @@ -135,7 +136,6 @@ cdef inline float64_t kdtree_min_rdist_dual( ) except -1 nogil: cdef float64_t d, d1, d2, rdist = 0.0 - cdef float64_t zero = 0.0 cdef intp_t j if metric.p == INF: @@ -294,7 +294,7 @@ cdef class BoruvkaAlgorithm: cdef public intp_t[::1] idx_array cdef public NodeData_t[::1] node_data cdef BoruvkaUnionFind component_union_find - cdef cnp.ndarray edges + cdef MST_edge_t[::1] edges cdef intp_t num_edges cdef cnp.ndarray components @@ -325,11 +325,10 @@ cdef class BoruvkaAlgorithm: self.component_of_node = np.empty(self.num_nodes, dtype=np.intp) self.candidate_neighbor = np.empty(self.num_points, dtype=np.intp) self.candidate_point = np.empty(self.num_points, dtype=np.intp) - 
self.candidate_distance = np.empty(self.num_points, - dtype=np.double) + self.candidate_distance = np.empty(self.num_points, dtype=np.double) self.component_union_find = BoruvkaUnionFind(self.num_points) - self.edges = np.empty((self.num_points - 1, 3)) + self.edges = np.empty((self.num_points - 1,), dtype=MST_edge_dtype) self.num_edges = 0 self.idx_array = self.tree.idx_array @@ -381,7 +380,6 @@ cdef class BoruvkaAlgorithm: print(f"DEBUG *** self.min_samples={self.min_samples}") self.core_distance = knn_dist[:, self.min_samples - 1].copy() - if self.is_KDTree: # Since we do everything in terms of rdist to free up the GIL # we need to convert all the core distances beforehand @@ -463,13 +461,14 @@ cdef class BoruvkaAlgorithm: self.candidate_neighbor[component] = -1 self.candidate_distance[component] = DBL_MAX continue - self.edges[self.num_edges, 0] = source - self.edges[self.num_edges, 1] = sink + + self.edges[self.num_edges].current_node = source + self.edges[self.num_edges].next_node = sink if self.is_KDTree: - self.edges[self.num_edges, 2] = self.dist._rdist_to_dist( + self.edges[self.num_edges].distance = self.dist._rdist_to_dist( self.candidate_distance[component]) else: - self.edges[self.num_edges, 2] = self.candidate_distance[component] + self.edges[self.num_edges].distance = self.candidate_distance[component] self.num_edges += 1 self.component_union_find.union_(source, sink) @@ -551,14 +550,12 @@ cdef class BoruvkaAlgorithm: cdef intp_t[::1] point_indices1, point_indices2 cdef intp_t i, j, p, q - cdef intp_t parent, child1, child2 - cdef intp_t component1, component2 + cdef intp_t parent, component1, component2 cdef NodeData_t node1_info = self.node_data[node1] cdef NodeData_t node2_info = self.node_data[node2] cdef NodeData_t parent_info, left_info, right_info - cdef float64_t d, mr_dist, _radius, node_dist cdef float64_t new_bound, new_upper_bound, new_lower_bound cdef float64_t bound_max, bound_min @@ -573,7 +570,7 @@ cdef class BoruvkaAlgorithm: node1, node2, self.node_bounds, self.num_features ) - else: #BallTree + else: node_dist = balltree_min_dist_dual( node1_info.radius, node2_info.radius, @@ -581,7 +578,6 @@ cdef class BoruvkaAlgorithm: self.centroid_distances ) - # If the distance between the nodes is less than the current bound for # the query and the nodes are not in the same component continue; # otherwise we get to prune this branch and return early. 
@@ -816,12 +812,11 @@ cdef class BoruvkaAlgorithm: node1_info = self.node_data[right] right_dist = balltree_min_dist_dual( node1_info.radius, - node2_info.radius, - right, node2, - self.centroid_distances + node2_info.radius, + right, node2, + self.centroid_distances ) - if left_dist < right_dist: self.dual_tree_traversal(left, node2) self.dual_tree_traversal(right, node2) @@ -835,13 +830,9 @@ cdef class BoruvkaAlgorithm: """Compute the minimum spanning tree of the data held by the tree passed in at construction""" - cdef intp_t num_components - cdef intp_t num_nodes - - num_components = self.tree.data.shape[0] - num_nodes = self.tree.node_data.shape[0] + cdef intp_t num_components = self.tree.data.shape[0] while num_components > 1: self.dual_tree_traversal(0, 0) num_components = self.update_components() - return self.edges + return np.array(self.edges, dtype=MST_edge_dtype) diff --git a/sklearn/cluster/_hdbscan/_linkage.pxd b/sklearn/cluster/_hdbscan/_linkage.pxd new file mode 100644 index 0000000000000..2575441f9bd36 --- /dev/null +++ b/sklearn/cluster/_hdbscan/_linkage.pxd @@ -0,0 +1,8 @@ +from ...utils._typedefs cimport float64_t, int64_t + +# Packed shouldn't make a difference since they're all 8-byte quantities, +# but it's included just to be safe. +ctypedef packed struct MST_edge_t: + int64_t current_node + int64_t next_node + float64_t distance diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx index ee8025c8027aa..657211f58f066 100644 --- a/sklearn/cluster/_hdbscan/_linkage.pyx +++ b/sklearn/cluster/_hdbscan/_linkage.pyx @@ -39,7 +39,7 @@ from ...metrics._dist_metrics cimport DistanceMetric64 from ...cluster._hierarchical_fast cimport UnionFind from ...cluster._hdbscan._tree cimport HIERARCHY_t from ...cluster._hdbscan._tree import HIERARCHY_dtype -from ...utils._typedefs cimport intp_t, float64_t, int64_t, uint8_t +from ...utils._typedefs cimport intp_t, uint8_t cdef extern from "numpy/arrayobject.h": intp_t * PyArray_SHAPE(cnp.PyArrayObject *) @@ -51,13 +51,6 @@ MST_edge_dtype = np.dtype([ ("distance", np.float64), ]) -# Packed shouldn't make a difference since they're all 8-byte quantities, -# but it's included just to be safe. 
-ctypedef packed struct MST_edge_t: - int64_t current_node - int64_t next_node - float64_t distance - cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability( cnp.ndarray[float64_t, ndim=2] mutual_reachability ): diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index d4f8ec87855e4..279b6cf3020a6 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -877,7 +877,8 @@ def fit(self, X, y=None): mst_func = _hdbscan_brute kwargs["copy"] = self.copy else: - if self.mst_algorithm == "prims": + # TODO: Finalize dispatching, currently placeholder + if self.mst_algorithm in ("prims", "auto"): mst_func = _hdbscan_prims else: mst_func = _hdbscan_boruvka From be2b4e9c0d2f43d288d5b7efd561bd9eac79a05f Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Fri, 29 Sep 2023 17:14:28 -0400 Subject: [PATCH 138/160] Refactored NodeData_t and formatted code Included temporary addition of homogeneity measure from original library for debugging purposes --- sklearn/cluster/_hdbscan/_boruvka.pyx | 52 +++++++++++++-------------- sklearn/cluster/tests/test_hdbscan.py | 18 ++++++++++ sklearn/neighbors/_binary_tree.pxd | 7 ++++ sklearn/neighbors/_binary_tree.pxi.tp | 7 +--- 4 files changed, 51 insertions(+), 33 deletions(-) create mode 100644 sklearn/neighbors/_binary_tree.pxd diff --git a/sklearn/cluster/_hdbscan/_boruvka.pyx b/sklearn/cluster/_hdbscan/_boruvka.pyx index 4eba9812b9f04..ae84268634aa6 100644 --- a/sklearn/cluster/_hdbscan/_boruvka.pyx +++ b/sklearn/cluster/_hdbscan/_boruvka.pyx @@ -60,24 +60,15 @@ from sklearn.neighbors import KDTree from ...metrics._dist_metrics cimport DistanceMetric, DistanceMetric64 from ...utils._typedefs cimport intp_t, float64_t, uint8_t, int8_t +from ...neighbors._binary_tree cimport NodeData_t from ._linkage cimport MST_edge_t from ._linkage import MST_edge_dtype -from joblib import Parallel, delayed +from joblib import Parallel, delayed, effective_n_jobs cdef float64_t INF = np.inf - -# Define the NodeData struct used in sklearn trees for faster -# access to the node data internals in Cython. 
-cdef struct NodeData_t: - intp_t idx_start - intp_t idx_end - intp_t is_leaf - float64_t radius - - # Define a function giving the minimum distance between two # nodes of a ball tree cdef inline float64_t balltree_min_dist_dual( @@ -226,7 +217,12 @@ cdef class BoruvkaUnionFind(object): def _core_dist_query(tree, data, min_samples): - return tree.query(data, k=min_samples, dualtree=True, breadth_first=True) + return tree.query( + data, + k=min_samples, + dualtree=True, + breadth_first=True + ) cdef class BoruvkaAlgorithm: @@ -271,16 +267,12 @@ cdef class BoruvkaAlgorithm: cdef object tree cdef object core_dist_tree cdef DistanceMetric64 dist - cdef cnp.ndarray _data cdef readonly const float64_t[:, ::1] raw_data cdef float64_t[:, :, ::1] node_bounds cdef float64_t alpha cdef int8_t approx_min_span_tree - cdef intp_t n_jobs - cdef intp_t min_samples - cdef intp_t num_points - cdef intp_t num_nodes - cdef intp_t num_features + cdef intp_t n_jobs, min_samples + cdef intp_t num_points, num_nodes, num_features cdef bint is_KDTree cdef public float64_t[::1] core_distance @@ -299,18 +291,26 @@ cdef class BoruvkaAlgorithm: cdef cnp.ndarray components - def __init__(self, tree, min_samples=5, metric='euclidean', leaf_size=20, - alpha=1.0, approx_min_span_tree=False, n_jobs=4, **kwargs): + def __init__( + self, + tree, + min_samples=5, + metric='euclidean', + leaf_size=20, + alpha=1.0, + approx_min_span_tree=False, + n_jobs=None, + **kwargs + ): self.core_dist_tree = tree self.tree = tree self.is_KDTree = isinstance(tree, KDTree) - self._data = np.array(self.tree.data) self.raw_data = self.tree.data self.node_bounds = self.tree.node_bounds self.alpha = alpha self.approx_min_span_tree = approx_min_span_tree - self.n_jobs = n_jobs + self.n_jobs = effective_n_jobs(n_jobs) self.min_samples = min_samples self.num_points = self.tree.data.shape[0] @@ -344,9 +344,7 @@ cdef class BoruvkaAlgorithm: cdef _compute_bounds(self): """Initialize core distances""" - cdef intp_t n - cdef intp_t i - cdef intp_t m + cdef intp_t i, n, m cdef cnp.ndarray[float64_t, ndim=2] knn_dist cdef cnp.ndarray[intp_t, ndim=2] knn_indices @@ -377,7 +375,6 @@ cdef class BoruvkaAlgorithm: dualtree=True, breadth_first=True) - print(f"DEBUG *** self.min_samples={self.min_samples}") self.core_distance = knn_dist[:, self.min_samples - 1].copy() if self.is_KDTree: @@ -386,7 +383,8 @@ cdef class BoruvkaAlgorithm: # to make comparison feasible. 
for n in range(self.num_points): self.core_distance[n] = self.dist._dist_to_rdist( - self.core_distance[n]) + self.core_distance[n] + ) # Since we already computed NN distances for the min_samples closest # points we can use this to do the first round of boruvka -- we won't diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index da1b7a45465af..8a056c370c111 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -6,6 +6,7 @@ import pytest from scipy import stats from scipy.spatial import distance +from scipy.stats import mode from sklearn.cluster import HDBSCAN from sklearn.cluster._hdbscan._tree import ( @@ -38,6 +39,21 @@ OUTLIER_SET = {-1} | {out["label"] for _, out in _OUTLIER_ENCODING.items()} +def homogeneity(labels1, labels2): + num_missed = 0.0 + for label in set(labels1): + matches = labels2[labels1 == label] + match_mode, _ = mode(matches, keepdims=True) + num_missed += np.sum(matches != match_mode[0]) + + for label in set(labels2): + matches = labels1[labels2 == label] + match_mode, _ = mode(matches, keepdims=True) + num_missed += np.sum(matches != match_mode[0]) + + return num_missed / 2.0 + + @pytest.mark.parametrize("tree", ["kd_tree", "ball_tree"]) def test_hdbscan_boruvka_matches(tree): hdb_prims = HDBSCAN(algorithm=tree, mst_algorithm="prims").fit(X, y) @@ -46,7 +62,9 @@ def test_hdbscan_boruvka_matches(tree): labels_boruvka = hdb_boruvka.labels_ similarity = fowlkes_mallows_score(labels_prims, labels_boruvka) + error_rate = homogeneity(labels_prims, labels_boruvka) / X.shape[0] + print(f"DEBUG *** {error_rate=}") assert similarity > 0.85 diff --git a/sklearn/neighbors/_binary_tree.pxd b/sklearn/neighbors/_binary_tree.pxd new file mode 100644 index 0000000000000..fcba5f5c39919 --- /dev/null +++ b/sklearn/neighbors/_binary_tree.pxd @@ -0,0 +1,7 @@ +from ..utils._typedefs cimport float32_t, float64_t, intp_t + +cdef struct NodeData_t: + intp_t idx_start + intp_t idx_end + intp_t is_leaf + float64_t radius diff --git a/sklearn/neighbors/_binary_tree.pxi.tp b/sklearn/neighbors/_binary_tree.pxi.tp index fc59310b9acbf..26feadcc55556 100644 --- a/sklearn/neighbors/_binary_tree.pxi.tp +++ b/sklearn/neighbors/_binary_tree.pxi.tp @@ -188,6 +188,7 @@ from ..utils import check_array from ..utils._typedefs cimport float32_t, float64_t, intp_t from ..utils._heap cimport heap_push from ..utils._sorting cimport simultaneous_sort as _simultaneous_sort +from ._binary_tree cimport NodeData_t cnp.import_array() @@ -216,12 +217,6 @@ cdef struct NodeHeapData_t: cdef NodeHeapData_t nhd_tmp NodeHeapData = np.asarray((&nhd_tmp)).dtype -cdef struct NodeData_t: - intp_t idx_start - intp_t idx_end - intp_t is_leaf - float64_t radius - # build the corresponding numpy dtype for NodeData cdef NodeData_t nd_tmp NodeData = np.asarray((&nd_tmp)).dtype From 11e91f990115a7a42853eda725098f8370e92e0d Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Sat, 30 Sep 2023 11:35:45 -0400 Subject: [PATCH 139/160] Formatting and new Numpy API --- sklearn/cluster/_hdbscan/_boruvka.pyx | 26 ++++++++++++++------------ sklearn/cluster/_hdbscan/hdbscan.py | 11 +++++------ 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_boruvka.pyx b/sklearn/cluster/_hdbscan/_boruvka.pyx index ae84268634aa6..ab0d1fa6ee772 100644 --- a/sklearn/cluster/_hdbscan/_boruvka.pyx +++ b/sklearn/cluster/_hdbscan/_boruvka.pyx @@ -66,6 +66,8 @@ from ._linkage import MST_edge_dtype from joblib import Parallel, delayed, 
effective_n_jobs +cdef extern from "numpy/arrayobject.h": + intp_t * PyArray_SHAPE(cnp.PyArrayObject *) cdef float64_t INF = np.inf @@ -91,7 +93,7 @@ cdef inline float64_t kdtree_min_dist_dual( intp_t node2, float64_t[:, :, ::1] node_bounds, intp_t num_features -) except -1: +) except -1 nogil: cdef float64_t d, d1, d2, rdist = 0.0 cdef intp_t j @@ -405,7 +407,7 @@ cdef class BoruvkaAlgorithm: self.update_components() for n in range(self.num_nodes): - self.bounds[n] = DBL_MAX + self.bounds[n] = DBL_MAX cdef _initialize_components(self): """Initialize components of the min spanning tree (eventually there @@ -444,7 +446,7 @@ cdef class BoruvkaAlgorithm: # for each of these, and the union the two points # together in the union find structure - for c in range(self.components.shape[0]): + for c in range(PyArray_SHAPE( self.components)[0]): component = self.components[c] source = self.candidate_point[component] sink = self.candidate_neighbor[component] @@ -475,7 +477,7 @@ cdef class BoruvkaAlgorithm: self.candidate_distance[component] = DBL_MAX if self.num_edges == self.num_points - 1: self.components = self.component_union_find.components() - return self.components.shape[0] + return PyArray_SHAPE( self.components)[0] # After having joined everything in the union find data # structure we need to go through and determine the components @@ -494,7 +496,8 @@ cdef class BoruvkaAlgorithm: # in the node is of the same component if node_info.is_leaf: current_component = self.component_of_point[ - self.idx_array[node_info.idx_start]] + self.idx_array[node_info.idx_start] + ] for i in range(node_info.idx_start + 1, node_info.idx_end): p = self.idx_array[i] if self.component_of_point[p] != current_component: @@ -507,8 +510,7 @@ cdef class BoruvkaAlgorithm: else: child1 = 2 * n + 1 child2 = 2 * n + 2 - if (self.component_of_node[child1] == - self.component_of_node[child2]): + if self.component_of_node[child1] == self.component_of_node[child2]: self.component_of_node[n] = self.component_of_node[child1] # Since we're working with mutual reachability distance we often have @@ -519,20 +521,20 @@ cdef class BoruvkaAlgorithm: # produce a true min spanning tree, but only and approximation # Thus only do this if the caller is willing to accept such if self.approx_min_span_tree: - last_num_components = self.components.shape[0] + last_num_components = PyArray_SHAPE( self.components)[0] self.components = self.component_union_find.components() - if self.components.shape[0] == last_num_components: + if PyArray_SHAPE( self.components)[0] == last_num_components: # Reset bounds for n in range(self.num_nodes): - self.bounds[n] = DBL_MAX + self.bounds[n] = DBL_MAX else: self.components = self.component_union_find.components() for n in range(self.num_nodes): - self.bounds[n] = DBL_MAX + self.bounds[n] = DBL_MAX - return self.components.shape[0] + return PyArray_SHAPE( self.components)[0] cdef int dual_tree_traversal( self, diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 279b6cf3020a6..6bc4421c79207 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -264,7 +264,7 @@ def _hdbscan_brute( ), UserWarning, ) - return _process_mst(min_spanning_tree) + return min_spanning_tree, _process_mst(min_spanning_tree) def _hdbscan_prims( @@ -347,7 +347,7 @@ def _hdbscan_prims( # Mutual reachability distance is implicit in mst_from_data_matrix min_spanning_tree = mst_from_data_matrix(X, core_distances, dist_metric, alpha) - return 
_process_mst(min_spanning_tree) + return min_spanning_tree, _process_mst(min_spanning_tree) def _hdbscan_boruvka( @@ -371,13 +371,12 @@ def _hdbscan_boruvka( metric=metric, leaf_size=leaf_size // 3, alpha=alpha, - approx_min_span_tree=True, + approx_min_span_tree=False, n_jobs=n_jobs, **metric_params, ) min_spanning_tree = out.spanning_tree() - - return _process_mst(min_spanning_tree) + return min_spanning_tree, _process_mst(min_spanning_tree) def remap_single_linkage_tree(tree, internal_to_raw, non_finite): @@ -900,7 +899,7 @@ def fit(self, X, y=None): kwargs["algo"] = "ball_tree" kwargs["leaf_size"] = self.leaf_size - self._single_linkage_tree_ = mst_func(**kwargs) + self.mst, self._single_linkage_tree_ = mst_func(**kwargs) self.labels_, self.probabilities_ = tree_to_labels( self._single_linkage_tree_, From 014c168e5e7b962e526bc4d186be91ce7551611f Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Sat, 30 Sep 2023 13:07:09 -0400 Subject: [PATCH 140/160] Corrected indexing error --- sklearn/cluster/_hdbscan/_boruvka.pyx | 8 ++++---- sklearn/cluster/tests/test_hdbscan.py | 22 +++------------------- 2 files changed, 7 insertions(+), 23 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_boruvka.pyx b/sklearn/cluster/_hdbscan/_boruvka.pyx index ab0d1fa6ee772..cfb50fe119ae4 100644 --- a/sklearn/cluster/_hdbscan/_boruvka.pyx +++ b/sklearn/cluster/_hdbscan/_boruvka.pyx @@ -650,14 +650,14 @@ cdef class BoruvkaAlgorithm: if component1 != component2: if self.is_KDTree: d = self.dist.rdist( - &self.raw_data[self.num_features * p][0], - &self.raw_data[self.num_features * q][0], + &self.raw_data[p][0], + &self.raw_data[q][0], self.num_features ) else: d = self.dist.dist( - &self.raw_data[self.num_features * p][0], - &self.raw_data[self.num_features * q][0], + &self.raw_data[p][0], + &self.raw_data[q][0], self.num_features ) * self.alpha if self.alpha != 1.0: diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index 8a056c370c111..23c0333e88fe0 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -6,7 +6,6 @@ import pytest from scipy import stats from scipy.spatial import distance -from scipy.stats import mode from sklearn.cluster import HDBSCAN from sklearn.cluster._hdbscan._tree import ( @@ -39,21 +38,6 @@ OUTLIER_SET = {-1} | {out["label"] for _, out in _OUTLIER_ENCODING.items()} -def homogeneity(labels1, labels2): - num_missed = 0.0 - for label in set(labels1): - matches = labels2[labels1 == label] - match_mode, _ = mode(matches, keepdims=True) - num_missed += np.sum(matches != match_mode[0]) - - for label in set(labels2): - matches = labels1[labels2 == label] - match_mode, _ = mode(matches, keepdims=True) - num_missed += np.sum(matches != match_mode[0]) - - return num_missed / 2.0 - - @pytest.mark.parametrize("tree", ["kd_tree", "ball_tree"]) def test_hdbscan_boruvka_matches(tree): hdb_prims = HDBSCAN(algorithm=tree, mst_algorithm="prims").fit(X, y) @@ -62,10 +46,10 @@ def test_hdbscan_boruvka_matches(tree): labels_boruvka = hdb_boruvka.labels_ similarity = fowlkes_mallows_score(labels_prims, labels_boruvka) - error_rate = homogeneity(labels_prims, labels_boruvka) / X.shape[0] - print(f"DEBUG *** {error_rate=}") - assert similarity > 0.85 + # Although we can have tight guarantees, there can be cases where the + # labels differ slightly, hence we leave a small margin of error. 
+ assert similarity > 0.98 def test_hdbscan_mst_algorithm_errors(): From baa6a023e2278f2ce2e36e20bdf9166a7d269d38 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Mon, 2 Oct 2023 13:20:34 -0400 Subject: [PATCH 141/160] Added greater nogil support and started boruvka bug fix --- sklearn/cluster/_hdbscan/_boruvka.pyx | 204 +++++++++++--------------- sklearn/cluster/_hdbscan/hdbscan.py | 1 + sklearn/cluster/tests/test_hdbscan.py | 17 ++- 3 files changed, 102 insertions(+), 120 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_boruvka.pyx b/sklearn/cluster/_hdbscan/_boruvka.pyx index cfb50fe119ae4..f20e28b5faaec 100644 --- a/sklearn/cluster/_hdbscan/_boruvka.pyx +++ b/sklearn/cluster/_hdbscan/_boruvka.pyx @@ -63,17 +63,17 @@ from ...utils._typedefs cimport intp_t, float64_t, uint8_t, int8_t from ...neighbors._binary_tree cimport NodeData_t from ._linkage cimport MST_edge_t from ._linkage import MST_edge_dtype - from joblib import Parallel, delayed, effective_n_jobs cdef extern from "numpy/arrayobject.h": intp_t * PyArray_SHAPE(cnp.PyArrayObject *) + cdef float64_t INF = np.inf # Define a function giving the minimum distance between two # nodes of a ball tree -cdef inline float64_t balltree_min_dist_dual( +cdef inline float64_t ball_tree_min_dist_dual( float64_t radius1, float64_t radius2, intp_t node1, @@ -87,7 +87,7 @@ cdef inline float64_t balltree_min_dist_dual( # Define a function giving the minimum distance between two # nodes of a kd-tree -cdef inline float64_t kdtree_min_dist_dual( +cdef inline float64_t kd_tree_min_dist_dual( DistanceMetric64 metric, intp_t node1, intp_t node2, @@ -117,39 +117,6 @@ cdef inline float64_t kdtree_min_dist_dual( return metric._rdist_to_dist(rdist) -# As above, but this time we use the rdist as per the kdtree -# implementation. This allows us to release the GIL over -# larger sections of code -cdef inline float64_t kdtree_min_rdist_dual( - DistanceMetric64 metric, - intp_t node1, - intp_t node2, - float64_t[:, :, ::1] node_bounds, - intp_t num_features -) except -1 nogil: - - cdef float64_t d, d1, d2, rdist = 0.0 - cdef intp_t j - - if metric.p == INF: - for j in range(num_features): - d1 = node_bounds[0, node1, j] - node_bounds[1, node2, j] - d2 = node_bounds[0, node2, j] - node_bounds[1, node1, j] - d = (d1 + fabs(d1)) + (d2 + fabs(d2)) - - rdist = max(rdist, 0.5 * d) - else: - # here we'll use the fact that x + abs(x) = 2 * max(x, 0) - for j in range(num_features): - d1 = node_bounds[0, node1, j] - node_bounds[1, node2, j] - d2 = node_bounds[0, node2, j] - node_bounds[1, node1, j] - d = (d1 + fabs(d1)) + (d2 + fabs(d2)) - - rdist += pow(0.5 * d, metric.p) - - return rdist - - cdef class BoruvkaUnionFind(object): """Efficient union find implementation. 
@@ -171,14 +138,14 @@ cdef class BoruvkaUnionFind(object): cdef intp_t[::1] _parent cdef uint8_t[::1] _rank - cdef cnp.ndarray is_component + cdef uint8_t[::1] is_component def __init__(self, size): self._parent = np.arange(size, dtype=np.intp) self._rank = np.zeros(size, dtype=np.uint8) - self.is_component = np.ones(size, dtype=bool) + self.is_component = np.ones(size, dtype=np.uint8) - cdef int union_(self, intp_t x, intp_t y) except -1: + cdef int union_(self, intp_t x, intp_t y) except -1 nogil: """Union together elements x and y""" cdef intp_t x_root = self.find(x) cdef intp_t y_root = self.find(y) @@ -188,18 +155,18 @@ cdef class BoruvkaUnionFind(object): if self._rank[x_root] < self._rank[y_root]: self._parent[x_root] = y_root - self.is_component[x_root] = False + self.is_component[x_root] = 0 elif self._rank[x_root] > self._rank[y_root]: self._parent[y_root] = x_root - self.is_component[y_root] = False + self.is_component[y_root] = 0 else: self._rank[x_root] += 1 self._parent[y_root] = x_root - self.is_component[y_root] = False + self.is_component[y_root] = 0 return 0 - cdef intp_t find(self, intp_t x) except -1: + cdef intp_t find(self, intp_t x) except -1 nogil: """Find the root or identifier for the component that x is in""" cdef intp_t x_parent cdef intp_t x_grandparent @@ -215,7 +182,7 @@ cdef class BoruvkaUnionFind(object): cdef cnp.ndarray[intp_t, ndim=1] components(self): """Return an array of all component roots/identifiers""" - return self.is_component.nonzero()[0] + return np.array(self.is_component).nonzero()[0] def _core_dist_query(tree, data, min_samples): @@ -279,6 +246,7 @@ cdef class BoruvkaAlgorithm: cdef public float64_t[::1] core_distance cdef public float64_t[::1] bounds + cdef public intp_t[::1] components cdef public intp_t[::1] component_of_point cdef public intp_t[::1] component_of_node cdef public intp_t[::1] candidate_neighbor @@ -291,8 +259,6 @@ cdef class BoruvkaAlgorithm: cdef MST_edge_t[::1] edges cdef intp_t num_edges - cdef cnp.ndarray components - def __init__( self, tree, @@ -306,7 +272,9 @@ cdef class BoruvkaAlgorithm: ): self.core_dist_tree = tree - self.tree = tree + self.tree = KDTree(tree.data, metric=metric, leaf_size=leaf_size, + **kwargs) + print(np.array(self.tree.idx_array)) self.is_KDTree = isinstance(tree, KDTree) self.raw_data = self.tree.data self.node_bounds = self.tree.node_bounds @@ -321,13 +289,13 @@ cdef class BoruvkaAlgorithm: self.dist = DistanceMetric.get_metric(metric, **kwargs) - self.components = np.arange(self.num_points) - self.bounds = np.empty(self.num_nodes, np.double) + self.components = np.arange(self.num_points, dtype=np.intp) + self.bounds = np.empty(self.num_nodes, np.float64) self.component_of_point = np.empty(self.num_points, dtype=np.intp) self.component_of_node = np.empty(self.num_nodes, dtype=np.intp) self.candidate_neighbor = np.empty(self.num_points, dtype=np.intp) self.candidate_point = np.empty(self.num_points, dtype=np.intp) - self.candidate_distance = np.empty(self.num_points, dtype=np.double) + self.candidate_distance = np.empty(self.num_points, dtype=np.float64) self.component_union_find = BoruvkaUnionFind(self.num_points) self.edges = np.empty((self.num_points - 1,), dtype=MST_edge_dtype) @@ -354,7 +322,7 @@ cdef class BoruvkaAlgorithm: # A shortcut: if we have a lot of points then we can split the points # into four piles and query them in parallel. On multicore systems # (most systems) this amounts to a 2x-3x wall clock improvement. 
- if self.tree.data.shape[0] > 16384 and self.n_jobs > 1: + if self.num_points > 16384 and self.n_jobs > 1: split_cnt = self.num_points // self.n_jobs datasets = [] for i in range(self.n_jobs): @@ -375,19 +343,11 @@ cdef class BoruvkaAlgorithm: self.tree.data, k=self.min_samples, dualtree=True, - breadth_first=True) + breadth_first=True + ) self.core_distance = knn_dist[:, self.min_samples - 1].copy() - if self.is_KDTree: - # Since we do everything in terms of rdist to free up the GIL - # we need to convert all the core distances beforehand - # to make comparison feasible. - for n in range(self.num_points): - self.core_distance[n] = self.dist._dist_to_rdist( - self.core_distance[n] - ) - # Since we already computed NN distances for the min_samples closest # points we can use this to do the first round of boruvka -- we won't # get every point due to core_distance/mutual reachability distance @@ -424,7 +384,7 @@ cdef class BoruvkaAlgorithm: for n in range(self.num_nodes): self.component_of_node[n] = -(n+1) - cdef int update_components(self) except -1: + cdef int update_components(self) except -1 nogil: """Having found the nearest neighbor not in the same component for each current component (via tree traversal), run through adding edges to the min spanning tree and recomputing components via @@ -446,13 +406,12 @@ cdef class BoruvkaAlgorithm: # for each of these, and the union the two points # together in the union find structure - for c in range(PyArray_SHAPE( self.components)[0]): + for c in range(self.components.shape[0]): component = self.components[c] source = self.candidate_point[component] sink = self.candidate_neighbor[component] if source == -1 or sink == -1: continue - # raise ValueError('Source or sink of edge is not defined!') current_source_component = self.component_union_find.find(source) current_sink_component = self.component_union_find.find(sink) if current_source_component == current_sink_component: @@ -464,11 +423,7 @@ cdef class BoruvkaAlgorithm: self.edges[self.num_edges].current_node = source self.edges[self.num_edges].next_node = sink - if self.is_KDTree: - self.edges[self.num_edges].distance = self.dist._rdist_to_dist( - self.candidate_distance[component]) - else: - self.edges[self.num_edges].distance = self.candidate_distance[component] + self.edges[self.num_edges].distance = self.candidate_distance[component] self.num_edges += 1 self.component_union_find.union_(source, sink) @@ -476,20 +431,21 @@ cdef class BoruvkaAlgorithm: # Reset everything,and check if we're done self.candidate_distance[component] = DBL_MAX if self.num_edges == self.num_points - 1: - self.components = self.component_union_find.components() - return PyArray_SHAPE( self.components)[0] + with gil: + self.components = self.component_union_find.components() + return self.components.shape[0] # After having joined everything in the union find data - # structure we need to go through and determine the components + # structure, we need to go through and determine the components # of each point for easy lookup. # - # Have done that we then go through and set the component + # Having done that, we then go through and set the component # of each node, as this provides fast pruning in later # tree traversals. 
- for n in range(self.tree.data.shape[0]): + for n in range(self.num_points): self.component_of_point[n] = self.component_union_find.find(n) - for n in range(self.tree.node_data.shape[0] - 1, -1, -1): + for n in range(self.num_nodes - 1, -1, -1): node_info = self.node_data[n] # Case 1: # If the node is a leaf we need to check that every point @@ -521,20 +477,22 @@ cdef class BoruvkaAlgorithm: # produce a true min spanning tree, but only and approximation # Thus only do this if the caller is willing to accept such if self.approx_min_span_tree: - last_num_components = PyArray_SHAPE( self.components)[0] - self.components = self.component_union_find.components() + last_num_components = self.components.shape[0] + with gil: + self.components = self.component_union_find.components() - if PyArray_SHAPE( self.components)[0] == last_num_components: + if self.components.shape[0] == last_num_components: # Reset bounds for n in range(self.num_nodes): self.bounds[n] = DBL_MAX else: - self.components = self.component_union_find.components() + with gil: + self.components = self.component_union_find.components() for n in range(self.num_nodes): self.bounds[n] = DBL_MAX - return PyArray_SHAPE( self.components)[0] + return self.components.shape[0] cdef int dual_tree_traversal( self, @@ -563,21 +521,26 @@ cdef class BoruvkaAlgorithm: cdef intp_t left, right cdef float64_t left_dist, right_dist + cdef intp_t dist_cnt = 0 + # Compute the distance between the query and reference nodes if self.is_KDTree: - node_dist = kdtree_min_rdist_dual( + node_dist = kd_tree_min_dist_dual( self.dist, node1, node2, self.node_bounds, self.num_features ) else: - node_dist = balltree_min_dist_dual( + node_dist = ball_tree_min_dist_dual( node1_info.radius, node2_info.radius, node1, node2, self.centroid_distances ) + with gil: + print(f"DEBUG *** node_dist({node1}, {node2})={node_dist}") + print(f"DEBUG *** bounds ({self.bounds[node1]}, {self.bounds[node2]})") # If the distance between the nodes is less than the current bound for # the query and the nodes are not in the same component continue; # otherwise we get to prune this branch and return early. @@ -593,7 +556,7 @@ cdef class BoruvkaAlgorithm: # Case 1: Both nodes are leaves # for each pair of points in node1 x node2 we need - # to compute the distance and see if it better than + # to compute the distance and see if it's better than # the current nearest neighbor for the component of # the point in the query node. 
# @@ -628,14 +591,17 @@ cdef class BoruvkaAlgorithm: point_indices2 = self.idx_array[ node2_info.idx_start:node2_info.idx_end ] + # with gil: + # print(f"DEBUG *** idx_array[{node1_info.idx_start}:{node1_info.idx_end}] = {np.array(point_indices1)}") + # print(f"DEBUG *** idx_array[{node2_info.idx_start}:{node2_info.idx_end}] = {np.array(point_indices2)}") + # print(f"DEBUG *** component_of_point = {np.array(self.component_of_point)}") for i in range(point_indices1.shape[0]): p = point_indices1[i] component1 = self.component_of_point[p] - if (self.core_distance[p] > - self.candidate_distance[component1]): + if self.core_distance[p] > self.candidate_distance[component1]: continue for j in range(point_indices2.shape[0]): @@ -648,18 +614,12 @@ cdef class BoruvkaAlgorithm: continue if component1 != component2: - if self.is_KDTree: - d = self.dist.rdist( - &self.raw_data[p][0], - &self.raw_data[q][0], - self.num_features - ) - else: - d = self.dist.dist( - &self.raw_data[p][0], - &self.raw_data[q][0], - self.num_features - ) * self.alpha + d = self.dist.dist( + &self.raw_data[p][0], + &self.raw_data[q][0], + self.num_features + ) * self.alpha + dist_cnt += 1 if self.alpha != 1.0: mr_dist = max( d / self.alpha, @@ -688,7 +648,7 @@ cdef class BoruvkaAlgorithm: # Compute new bounds for the query node, and # then propagate the results of that computation # up the tree. - _radius = self.dist._dist_to_rdist(node1_info.radius) if self.is_KDTree else node1_info.radius + _radius = node1_info.radius new_bound = min( new_upper_bound, new_lower_bound + 2 * _radius @@ -743,13 +703,13 @@ cdef class BoruvkaAlgorithm: right = 2 * node2 + 2 if self.is_KDTree: - left_dist = kdtree_min_rdist_dual( + left_dist = kd_tree_min_dist_dual( self.dist, node1, left, self.node_bounds, self.num_features ) - right_dist = kdtree_min_rdist_dual( + right_dist = kd_tree_min_dist_dual( self.dist, node1, right, self.node_bounds, @@ -757,14 +717,14 @@ cdef class BoruvkaAlgorithm: ) else: node2_info = self.node_data[left] - left_dist = balltree_min_dist_dual( + left_dist = ball_tree_min_dist_dual( node1_info.radius, node2_info.radius, node1, left, self.centroid_distances ) node2_info = self.node_data[right] - right_dist = balltree_min_dist_dual( + right_dist = ball_tree_min_dist_dual( node1_info.radius, node2_info.radius, node1, right, @@ -772,11 +732,15 @@ cdef class BoruvkaAlgorithm: ) if left_dist < right_dist: - self.dual_tree_traversal(node1, left) - self.dual_tree_traversal(node1, right) + with gil: + print(f"DEBUG *** descending into ({node1}, {left}) | ({node1}, {right})") + dist_cnt += self.dual_tree_traversal(node1, left) + dist_cnt += self.dual_tree_traversal(node1, right) else: - self.dual_tree_traversal(node1, right) - self.dual_tree_traversal(node1, left) + with gil: + print(f"DEBUG *** descending into ({node1}, {right}) | ({node1}, {left})") + dist_cnt += self.dual_tree_traversal(node1, right) + dist_cnt += self.dual_tree_traversal(node1, left) # Case 2b: The reference node is a leaf, or is smaller than # the query node. 
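The candidate updates in the leaf-vs-leaf case above are driven by the mutual reachability distance: the (possibly alpha-scaled) pairwise distance clamped from below by both points' core distances, where a point's core distance is its distance to its min_samples-th nearest neighbour. A plain-Python sketch of that quantity (function and variable names are illustrative, not part of the patch):

    # Sketch of the mutual reachability distance compared against the
    # per-component candidate distances during the dual-tree traversal.
    def mutual_reachability(dist_pq, core_p, core_q):
        return max(dist_pq, core_p, core_q)

    # Two points that are close in raw distance but sit in sparse regions
    # (large core distances) are pushed apart:
    print(mutual_reachability(0.1, 0.8, 0.3))  # -> 0.8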
@@ -789,13 +753,13 @@ cdef class BoruvkaAlgorithm: left = 2 * node1 + 1 right = 2 * node1 + 2 if self.is_KDTree: - left_dist = kdtree_min_rdist_dual( + left_dist = kd_tree_min_dist_dual( self.dist, left, node2, self.node_bounds, self.num_features ) - right_dist = kdtree_min_rdist_dual( + right_dist = kd_tree_min_dist_dual( self.dist, right, node2, self.node_bounds, @@ -803,14 +767,14 @@ cdef class BoruvkaAlgorithm: ) else: node1_info = self.node_data[left] - left_dist = balltree_min_dist_dual( + left_dist = ball_tree_min_dist_dual( node1_info.radius, node2_info.radius, left, node2, self.centroid_distances ) node1_info = self.node_data[right] - right_dist = balltree_min_dist_dual( + right_dist = ball_tree_min_dist_dual( node1_info.radius, node2_info.radius, right, node2, @@ -818,21 +782,27 @@ cdef class BoruvkaAlgorithm: ) if left_dist < right_dist: - self.dual_tree_traversal(left, node2) - self.dual_tree_traversal(right, node2) + with gil: + print(f"DEBUG *** descending into ({left}, {node2}) | ({right}, {node2})") + dist_cnt += self.dual_tree_traversal(left, node2) + dist_cnt += self.dual_tree_traversal(right, node2) else: - self.dual_tree_traversal(right, node2) - self.dual_tree_traversal(left, node2) + with gil: + print(f"DEBUG *** descending into ({right}, {node2}) | ({left}, {node2})") + dist_cnt += self.dual_tree_traversal(right, node2) + dist_cnt += self.dual_tree_traversal(left, node2) - return 0 + return dist_cnt cpdef spanning_tree(self): """Compute the minimum spanning tree of the data held by the tree passed in at construction""" - cdef intp_t num_components = self.tree.data.shape[0] + cdef intp_t num_components = self.num_points + cdef intp_t dist_cnt while num_components > 1: - self.dual_tree_traversal(0, 0) + dist_cnt = self.dual_tree_traversal(0, 0) + print(f"DEBUG *** finished {num_components} with {dist_cnt} calcs") num_components = self.update_components() return np.array(self.edges, dtype=MST_edge_dtype) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 6bc4421c79207..32f334a5bca39 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -363,6 +363,7 @@ def _hdbscan_boruvka( leaf_size = max(leaf_size, 3) Tree = KDTree if algo == "kd_tree" else BallTree tree = Tree(X, metric=metric, leaf_size=leaf_size, **metric_params) + print(np.array(tree.idx_array)) n_jobs = effective_n_jobs(n_jobs) out = BoruvkaAlgorithm( diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index 23c0333e88fe0..7e50c9157c7e2 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -39,9 +39,20 @@ @pytest.mark.parametrize("tree", ["kd_tree", "ball_tree"]) -def test_hdbscan_boruvka_matches(tree): - hdb_prims = HDBSCAN(algorithm=tree, mst_algorithm="prims").fit(X, y) - hdb_boruvka = HDBSCAN(algorithm=tree, mst_algorithm="boruvka").fit(X, y) +@pytest.mark.parametrize("n_samples", [200, 16385]) +@pytest.mark.parametrize("n_jobs", [1, 4]) +def test_hdbscan_boruvka_matches(tree, n_samples, n_jobs): + if n_samples > 16384: + data, _ = make_blobs(n_samples=n_samples, random_state=10) + data = shuffle(X, random_state=7) + data = StandardScaler().fit_transform(X) + else: + data = X + + hdb_prims = HDBSCAN(algorithm=tree, mst_algorithm="prims", n_jobs=n_jobs).fit(data) + hdb_boruvka = HDBSCAN(algorithm=tree, mst_algorithm="boruvka", n_jobs=n_jobs).fit( + data + ) labels_prims = hdb_prims.labels_ labels_boruvka = hdb_boruvka.labels_ From 
ffc6b778c154c9ae624f7bc5af3f4caba35895f4 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Sun, 8 Oct 2023 14:29:44 -0400 Subject: [PATCH 142/160] Removed debug statements and improved test --- sklearn/cluster/_hdbscan/_boruvka.pyx | 41 +++++++-------------------- sklearn/cluster/_hdbscan/hdbscan.py | 21 ++++++++------ sklearn/cluster/tests/test_hdbscan.py | 14 +++++---- 3 files changed, 30 insertions(+), 46 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_boruvka.pyx b/sklearn/cluster/_hdbscan/_boruvka.pyx index f20e28b5faaec..ead483606ea5e 100644 --- a/sklearn/cluster/_hdbscan/_boruvka.pyx +++ b/sklearn/cluster/_hdbscan/_boruvka.pyx @@ -274,7 +274,6 @@ cdef class BoruvkaAlgorithm: self.core_dist_tree = tree self.tree = KDTree(tree.data, metric=metric, leaf_size=leaf_size, **kwargs) - print(np.array(self.tree.idx_array)) self.is_KDTree = isinstance(tree, KDTree) self.raw_data = self.tree.data self.node_bounds = self.tree.node_bounds @@ -521,8 +520,6 @@ cdef class BoruvkaAlgorithm: cdef intp_t left, right cdef float64_t left_dist, right_dist - cdef intp_t dist_cnt = 0 - # Compute the distance between the query and reference nodes if self.is_KDTree: node_dist = kd_tree_min_dist_dual( @@ -538,9 +535,6 @@ cdef class BoruvkaAlgorithm: self.centroid_distances ) - with gil: - print(f"DEBUG *** node_dist({node1}, {node2})={node_dist}") - print(f"DEBUG *** bounds ({self.bounds[node1]}, {self.bounds[node2]})") # If the distance between the nodes is less than the current bound for # the query and the nodes are not in the same component continue; # otherwise we get to prune this branch and return early. @@ -591,10 +585,6 @@ cdef class BoruvkaAlgorithm: point_indices2 = self.idx_array[ node2_info.idx_start:node2_info.idx_end ] - # with gil: - # print(f"DEBUG *** idx_array[{node1_info.idx_start}:{node1_info.idx_end}] = {np.array(point_indices1)}") - # print(f"DEBUG *** idx_array[{node2_info.idx_start}:{node2_info.idx_end}] = {np.array(point_indices2)}") - # print(f"DEBUG *** component_of_point = {np.array(self.component_of_point)}") for i in range(point_indices1.shape[0]): @@ -619,7 +609,6 @@ cdef class BoruvkaAlgorithm: &self.raw_data[q][0], self.num_features ) * self.alpha - dist_cnt += 1 if self.alpha != 1.0: mr_dist = max( d / self.alpha, @@ -732,15 +721,11 @@ cdef class BoruvkaAlgorithm: ) if left_dist < right_dist: - with gil: - print(f"DEBUG *** descending into ({node1}, {left}) | ({node1}, {right})") - dist_cnt += self.dual_tree_traversal(node1, left) - dist_cnt += self.dual_tree_traversal(node1, right) + self.dual_tree_traversal(node1, left) + self.dual_tree_traversal(node1, right) else: - with gil: - print(f"DEBUG *** descending into ({node1}, {right}) | ({node1}, {left})") - dist_cnt += self.dual_tree_traversal(node1, right) - dist_cnt += self.dual_tree_traversal(node1, left) + self.dual_tree_traversal(node1, right) + self.dual_tree_traversal(node1, left) # Case 2b: The reference node is a leaf, or is smaller than # the query node. 
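For orientation, the loop that spanning_tree(), dual_tree_traversal() and update_components() implement is Boruvka's algorithm: in each round every component selects its cheapest outgoing edge, all selected edges are added, and the joined components are merged. The brute-force sketch below runs the same rounds on a dense distance matrix (illustrative only; the patch replaces the quadratic edge scan with pruned dual-tree searches over mutual reachability distances):

    import numpy as np

    def boruvka_mst(dist):
        # Boruvka rounds on a dense, symmetric distance matrix: each round,
        # every component picks its cheapest edge leaving the component, the
        # picked edges are added, and the touched components are merged.
        n = dist.shape[0]
        component = np.arange(n)
        edges = []
        while np.unique(component).size > 1:
            best = {}  # component label -> (weight, i, j)
            for i in range(n):
                for j in range(i + 1, n):
                    ci, cj = component[i], component[j]
                    if ci == cj:
                        continue
                    w = dist[i, j]
                    for c in (ci, cj):
                        if c not in best or w < best[c][0]:
                            best[c] = (w, i, j)
            for w, i, j in best.values():
                ci, cj = component[i], component[j]
                if ci != cj:  # skip edges whose endpoints were merged this round
                    edges.append((i, j, w))
                    component[component == cj] = ci
        return edges

    rng = np.random.default_rng(0)
    pts = rng.random((6, 2))
    dist = np.linalg.norm(pts[:, None, :] - pts[None, :, :], axis=-1)
    print(boruvka_mst(dist))  # five edges forming the minimum spanning tree

Because the number of components at least halves per round, only O(log N) rounds are needed, which is what makes the dual-tree variant attractive over Prim's on tree-indexed data.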
@@ -782,27 +767,21 @@ cdef class BoruvkaAlgorithm: ) if left_dist < right_dist: - with gil: - print(f"DEBUG *** descending into ({left}, {node2}) | ({right}, {node2})") - dist_cnt += self.dual_tree_traversal(left, node2) - dist_cnt += self.dual_tree_traversal(right, node2) + self.dual_tree_traversal(left, node2) + self.dual_tree_traversal(right, node2) else: - with gil: - print(f"DEBUG *** descending into ({right}, {node2}) | ({left}, {node2})") - dist_cnt += self.dual_tree_traversal(right, node2) - dist_cnt += self.dual_tree_traversal(left, node2) + self.dual_tree_traversal(right, node2) + self.dual_tree_traversal(left, node2) - return dist_cnt + return 0 cpdef spanning_tree(self): """Compute the minimum spanning tree of the data held by the tree passed in at construction""" cdef intp_t num_components = self.num_points - cdef intp_t dist_cnt while num_components > 1: - dist_cnt = self.dual_tree_traversal(0, 0) - print(f"DEBUG *** finished {num_components} with {dist_cnt} calcs") + self.dual_tree_traversal(0, 0) num_components = self.update_components() return np.array(self.edges, dtype=MST_edge_dtype) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 32f334a5bca39..768c624e3b898 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -358,12 +358,12 @@ def _hdbscan_boruvka( metric="euclidean", leaf_size=40, n_jobs=None, + approx_min_span_tree=False, **metric_params, ): leaf_size = max(leaf_size, 3) Tree = KDTree if algo == "kd_tree" else BallTree tree = Tree(X, metric=metric, leaf_size=leaf_size, **metric_params) - print(np.array(tree.idx_array)) n_jobs = effective_n_jobs(n_jobs) out = BoruvkaAlgorithm( @@ -372,7 +372,7 @@ def _hdbscan_boruvka( metric=metric, leaf_size=leaf_size // 3, alpha=alpha, - approx_min_span_tree=False, + approx_min_span_tree=approx_min_span_tree, n_jobs=n_jobs, **metric_params, ) @@ -680,7 +680,9 @@ class HDBSCAN(ClusterMixin, BaseEstimator): deprecated={"kdtree", "balltree"}, ), ], - "mst_algorithm": [StrOptions({"auto", "brute", "prims", "boruvka"})], + "mst_algorithm": [ + StrOptions({"auto", "brute", "prims", "boruvka_exact", "boruvka_approx"}) + ], "leaf_size": [Interval(Integral, left=1, right=None, closed="left")], "n_jobs": [Integral, None], "cluster_selection_method": [StrOptions({"eom", "leaf"})], @@ -808,12 +810,10 @@ def fit(self, X, y=None): f" samples in X ({X.shape[0]})" ) - algos = {self.algorithm, self.mst_algorithm} - if ( - "brute" in algos - and len({"kd_tree", "ball_tree", "prims", "boruvka"}.intersection(algos)) - > 0 - ): + algorithms = {self.algorithm, self.mst_algorithm} + acceptable_algorithms = {"auto", "brute"} + + if "brute" in algorithms and not algorithms.issubset(acceptable_algorithms): raise ValueError( "When setting either `algorithm='brute'` or `mst_algorithm='brute'`," " both keyword arguments must only be set to either 'brute' or 'auto'." 
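Under the mst_algorithm keyword proposed by this patch series (not part of a released scikit-learn version), the validation above constrains how the two selectors may be combined. A usage sketch:

    from sklearn.cluster import HDBSCAN
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=100, random_state=0)

    # Consistent combinations: a tree-based neighbour search paired with a
    # tree-based MST construction.
    HDBSCAN(algorithm="kd_tree", mst_algorithm="boruvka_exact").fit(X)
    HDBSCAN(algorithm="ball_tree", mst_algorithm="prims").fit(X)

    # Mixing "brute" with a tree-based choice on either side is rejected.
    try:
        HDBSCAN(algorithm="brute", mst_algorithm="prims").fit(X)
    except ValueError as exc:
        print(exc)  # "When setting either `algorithm='brute'` or ..."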
@@ -882,6 +882,9 @@ def fit(self, X, y=None): mst_func = _hdbscan_prims else: mst_func = _hdbscan_boruvka + kwargs["approx_min_span_tree"] = ( + self.mst_algorithm == "boruvka_approx" + ) kwargs["algo"] = self.algorithm kwargs["leaf_size"] = self.leaf_size else: diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index 7e50c9157c7e2..91a75166dbca5 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -41,7 +41,8 @@ @pytest.mark.parametrize("tree", ["kd_tree", "ball_tree"]) @pytest.mark.parametrize("n_samples", [200, 16385]) @pytest.mark.parametrize("n_jobs", [1, 4]) -def test_hdbscan_boruvka_matches(tree, n_samples, n_jobs): +@pytest.mark.parametrize("mst_algo", ["boruvka_exact", "boruvka_approx"]) +def test_hdbscan_boruvka_matches(tree, n_samples, n_jobs, mst_algo): if n_samples > 16384: data, _ = make_blobs(n_samples=n_samples, random_state=10) data = shuffle(X, random_state=7) @@ -50,7 +51,7 @@ def test_hdbscan_boruvka_matches(tree, n_samples, n_jobs): data = X hdb_prims = HDBSCAN(algorithm=tree, mst_algorithm="prims", n_jobs=n_jobs).fit(data) - hdb_boruvka = HDBSCAN(algorithm=tree, mst_algorithm="boruvka", n_jobs=n_jobs).fit( + hdb_boruvka = HDBSCAN(algorithm=tree, mst_algorithm=mst_algo, n_jobs=n_jobs).fit( data ) labels_prims = hdb_prims.labels_ @@ -58,9 +59,10 @@ def test_hdbscan_boruvka_matches(tree, n_samples, n_jobs): similarity = fowlkes_mallows_score(labels_prims, labels_boruvka) - # Although we can have tight guarantees, there can be cases where the - # labels differ slightly, hence we leave a small margin of error. - assert similarity > 0.98 + # We should expect that the exact boruvka algorithm produces a correct mst, + # but the approximation will almost surely produce an incorrect tree, and + # hence differ from the exact labels. + assert similarity >= 0.91 if "approx" in mst_algo else 1 def test_hdbscan_mst_algorithm_errors(): @@ -70,7 +72,7 @@ def test_hdbscan_mst_algorithm_errors(): with pytest.raises(ValueError, match=msg): hdb.fit(X, y) - for mst_algo in ["prims", "boruvka"]: + for mst_algo in ["prims", "boruvka_exact", "boruvka_approx"]: hdb = HDBSCAN(algorithm="brute", mst_algorithm=mst_algo) with pytest.raises(ValueError, match=msg): hdb.fit(X, y) From 974673ed9cf98e21a0b8b1f7ee2b2fe04a320f8e Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 10 Oct 2023 15:58:19 -0400 Subject: [PATCH 143/160] Improved tests and hdbscan dispatch logic --- sklearn/cluster/_hdbscan/_boruvka.pyx | 60 +++++++++++++-------------- sklearn/cluster/_hdbscan/hdbscan.py | 52 ++++++++++++++--------- sklearn/cluster/tests/test_hdbscan.py | 31 ++++++++++---- 3 files changed, 84 insertions(+), 59 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_boruvka.pyx b/sklearn/cluster/_hdbscan/_boruvka.pyx index ead483606ea5e..7e942c7544e50 100644 --- a/sklearn/cluster/_hdbscan/_boruvka.pyx +++ b/sklearn/cluster/_hdbscan/_boruvka.pyx @@ -233,31 +233,31 @@ cdef class BoruvkaAlgorithm: Keyword args passed to the metric. 
""" - cdef object tree - cdef object core_dist_tree - cdef DistanceMetric64 dist - cdef readonly const float64_t[:, ::1] raw_data - cdef float64_t[:, :, ::1] node_bounds - cdef float64_t alpha - cdef int8_t approx_min_span_tree - cdef intp_t n_jobs, min_samples - cdef intp_t num_points, num_nodes, num_features - cdef bint is_KDTree - - cdef public float64_t[::1] core_distance - cdef public float64_t[::1] bounds - cdef public intp_t[::1] components - cdef public intp_t[::1] component_of_point - cdef public intp_t[::1] component_of_node - cdef public intp_t[::1] candidate_neighbor - cdef public intp_t[::1] candidate_point - cdef public float64_t[::1] candidate_distance - cdef public float64_t[:, ::1] centroid_distances - cdef public intp_t[::1] idx_array - cdef public NodeData_t[::1] node_data - cdef BoruvkaUnionFind component_union_find - cdef MST_edge_t[::1] edges - cdef intp_t num_edges + cdef: + object tree + DistanceMetric64 dist + readonly const float64_t[:, ::1] raw_data + float64_t[:, :, ::1] node_bounds + float64_t alpha + int8_t approx_min_span_tree + intp_t n_jobs, min_samples + intp_t num_points, num_nodes, num_features + bint is_KDTree + + public float64_t[::1] core_distance + public float64_t[::1] bounds + public intp_t[::1] components + public intp_t[::1] component_of_point + public intp_t[::1] component_of_node + public intp_t[::1] candidate_neighbor + public intp_t[::1] candidate_point + public float64_t[::1] candidate_distance + public float64_t[:, ::1] centroid_distances + public intp_t[::1] idx_array + public NodeData_t[::1] node_data + BoruvkaUnionFind component_union_find + MST_edge_t[::1] edges + intp_t num_edges def __init__( self, @@ -271,9 +271,7 @@ cdef class BoruvkaAlgorithm: **kwargs ): - self.core_dist_tree = tree - self.tree = KDTree(tree.data, metric=metric, leaf_size=leaf_size, - **kwargs) + self.tree =tree self.is_KDTree = isinstance(tree, KDTree) self.raw_data = self.tree.data self.node_bounds = self.tree.node_bounds @@ -319,7 +317,7 @@ cdef class BoruvkaAlgorithm: cdef cnp.ndarray[intp_t, ndim=2] knn_indices # A shortcut: if we have a lot of points then we can split the points - # into four piles and query them in parallel. On multicore systems + # into multiple piles and query them in parallel. On multicore systems # (most systems) this amounts to a 2x-3x wall clock improvement. if self.num_points > 16384 and self.n_jobs > 1: split_cnt = self.num_points // self.n_jobs @@ -332,13 +330,13 @@ cdef class BoruvkaAlgorithm: knn_data = Parallel(n_jobs=self.n_jobs, max_nbytes=None)( delayed(_core_dist_query) - (self.core_dist_tree, points, + (self.tree, points, self.min_samples) for points in datasets) knn_dist = np.vstack([x[0] for x in knn_data]) knn_indices = np.vstack([x[1] for x in knn_data]) else: - knn_dist, knn_indices = self.core_dist_tree.query( + knn_dist, knn_indices = self.tree.query( self.tree.data, k=self.min_samples, dualtree=True, diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 768c624e3b898..6bc704264f197 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -865,43 +865,55 @@ def fit(self, X, y=None): " Please select a different metric." 
) - if self.algorithm != "auto": + if algorithms != {"auto"}: if ( self.metric != "precomputed" and issparse(X) - and self.algorithm != "brute" + and "brute" not in algorithms ): raise ValueError("Sparse data matrices only support algorithm `brute`.") - - if self.algorithm == "brute": + if "brute" in algorithms: mst_func = _hdbscan_brute kwargs["copy"] = self.copy else: - # TODO: Finalize dispatching, currently placeholder - if self.mst_algorithm in ("prims", "auto"): - mst_func = _hdbscan_prims + kwargs["leaf_size"] = self.leaf_size + # We prefer KDTree unless otherwise specified + if self.algorithm != "auto": + tree_algorithm = self.algorithm else: - mst_func = _hdbscan_boruvka - kwargs["approx_min_span_tree"] = ( - self.mst_algorithm == "boruvka_approx" + tree_algorithm = ( + "kd_tree" + if self.metric in KDTree.valid_metrics + else "ball_tree" ) - kwargs["algo"] = self.algorithm - kwargs["leaf_size"] = self.leaf_size + kwargs["algo"] = tree_algorithm + + if self.mst_algorithm != "auto": + if self.mst_algorithm == "prims": + mst_func = _hdbscan_prims + else: + mst_func = _hdbscan_boruvka + kwargs["approx_min_span_tree"] = ( + self.mst_algorithm == "boruvka_approx" + ) + else: + # Approximate boruvka is always preferable + mst_func = _hdbscan_boruvka + kwargs["approx_min_span_tree"] = True + else: if issparse(X) or self.metric not in FAST_METRICS: # We can't do much with sparse matrices ... mst_func = _hdbscan_brute kwargs["copy"] = self.copy - elif self.metric in KDTree.valid_metrics: - # TODO: Benchmark KD vs Ball Tree efficiency - mst_func = _hdbscan_prims - kwargs["algo"] = "kd_tree" - kwargs["leaf_size"] = self.leaf_size else: - # Metric is a valid BallTree metric - mst_func = _hdbscan_prims - kwargs["algo"] = "ball_tree" + # Approximate boruvka is always preferable + mst_func = _hdbscan_boruvka + kwargs["approx_min_span_tree"] = True kwargs["leaf_size"] = self.leaf_size + kwargs["algo"] = ( + "kd_tree" if self.metric in KDTree.valid_metrics else "ball_tree" + ) self.mst, self._single_linkage_tree_ = mst_func(**kwargs) diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index 91a75166dbca5..5f4f7deabc2bb 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -326,28 +326,29 @@ def test_hdbscan_precomputed_non_brute(tree): @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) -def test_hdbscan_sparse(csr_container): +@pytest.mark.parametrize("mst_algorithm", ["boruvka_exact", "prims"]) +def test_hdbscan_sparse(csr_container, mst_algorithm): """ Tests that HDBSCAN works correctly when passing sparse feature data. Evaluates correctness by comparing against the same data passed as a dense array. """ - dense_labels = HDBSCAN().fit(X).labels_ + dense_labels = HDBSCAN(mst_algorithm=mst_algorithm).fit(X).labels_ n_clusters = len(set(dense_labels) - OUTLIER_SET) assert n_clusters == 3 _X_sparse = csr_container(X) X_sparse = _X_sparse.copy() sparse_labels = HDBSCAN().fit(X_sparse).labels_ - assert_array_equal(dense_labels, sparse_labels) + fowlkes_mallows_score(dense_labels, sparse_labels) == 1 # Compare that the sparse and dense non-precomputed routines return the same labels # where the 0th observation contains the outlier. 
for outlier_val, outlier_type in ((np.inf, "infinite"), (np.nan, "missing")): X_dense = X.copy() X_dense[0, 0] = outlier_val - dense_labels = HDBSCAN().fit(X_dense).labels_ + dense_labels = HDBSCAN(mst_algorithm=mst_algorithm).fit(X_dense).labels_ n_clusters = len(set(dense_labels) - OUTLIER_SET) assert n_clusters == 3 assert dense_labels[0] == _OUTLIER_ENCODING[outlier_type]["label"] @@ -355,7 +356,7 @@ def test_hdbscan_sparse(csr_container): X_sparse = _X_sparse.copy() X_sparse[0, 0] = outlier_val sparse_labels = HDBSCAN().fit(X_sparse).labels_ - assert_array_equal(dense_labels, sparse_labels) + fowlkes_mallows_score(dense_labels, sparse_labels) == 1 msg = "Sparse data matrices only support algorithm `brute`." with pytest.raises(ValueError, match=msg): @@ -363,22 +364,36 @@ def test_hdbscan_sparse(csr_container): @pytest.mark.parametrize("algorithm", ALGORITHMS) -def test_hdbscan_centers(algorithm): +@pytest.mark.parametrize("mst_algorithm", ["boruvka_exact", "prims"]) +def test_hdbscan_centers(algorithm, mst_algorithm): """ Tests that HDBSCAN centers are calculated and stored properly, and are accurate to the data. """ + mst_algorithm = "auto" if algorithm == "brute" else mst_algorithm + print(f"\nDEBUG *** {mst_algorithm=} | {algorithm=}") + if mst_algorithm == "auto" and algorithm == "auto": + pytest.xfail("We expect approximate boruvka to fail this closeness test") centers = [(0.0, 0.0), (3.0, 3.0)] H, _ = make_blobs(n_samples=1000, random_state=0, centers=centers, cluster_std=0.5) - hdb = HDBSCAN(store_centers="both").fit(H) + hdb = HDBSCAN( + algorithm=algorithm, mst_algorithm=mst_algorithm, store_centers="both" + ).fit(H) + # The boruvka algorithm tends to produce the centroids/medoids in opposite + # order, so we reverse for this test. + if mst_algorithm == "boruvka_exact": + centers = centers[::-1] for center, centroid, medoid in zip(centers, hdb.centroids_, hdb.medoids_): assert_allclose(center, centroid, rtol=1, atol=0.05) assert_allclose(center, medoid, rtol=1, atol=0.05) # Ensure that nothing is done for noise hdb = HDBSCAN( - algorithm=algorithm, store_centers="both", min_cluster_size=X.shape[0] + algorithm=algorithm, + mst_algorithm=mst_algorithm, + store_centers="both", + min_cluster_size=X.shape[0], ).fit(X) assert hdb.centroids_.shape[0] == 0 assert hdb.medoids_.shape[0] == 0 From 61aef6304c5aaf15e3894554cf3f2b30b5973d92 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 10 Oct 2023 16:25:43 -0400 Subject: [PATCH 144/160] Cleaned up cython file --- sklearn/cluster/_hdbscan/_boruvka.pyx | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_boruvka.pyx b/sklearn/cluster/_hdbscan/_boruvka.pyx index 7e942c7544e50..116d72a910f5b 100644 --- a/sklearn/cluster/_hdbscan/_boruvka.pyx +++ b/sklearn/cluster/_hdbscan/_boruvka.pyx @@ -65,10 +65,6 @@ from ._linkage cimport MST_edge_t from ._linkage import MST_edge_dtype from joblib import Parallel, delayed, effective_n_jobs -cdef extern from "numpy/arrayobject.h": - intp_t * PyArray_SHAPE(cnp.PyArrayObject *) - - cdef float64_t INF = np.inf # Define a function giving the minimum distance between two @@ -79,7 +75,7 @@ cdef inline float64_t ball_tree_min_dist_dual( intp_t node1, intp_t node2, float64_t[:, ::1] centroid_dist -) except -1 nogil: +) noexcept nogil: cdef float64_t dist_pt = centroid_dist[node1, node2] return max(0, (dist_pt - radius1 - radius2)) @@ -93,7 +89,7 @@ cdef inline float64_t kd_tree_min_dist_dual( intp_t node2, float64_t[:, :, ::1] 
node_bounds, intp_t num_features -) except -1 nogil: +) noexcept nogil: cdef float64_t d, d1, d2, rdist = 0.0 cdef intp_t j @@ -139,13 +135,15 @@ cdef class BoruvkaUnionFind(object): cdef intp_t[::1] _parent cdef uint8_t[::1] _rank cdef uint8_t[::1] is_component + cdef intp_t num_components def __init__(self, size): self._parent = np.arange(size, dtype=np.intp) self._rank = np.zeros(size, dtype=np.uint8) self.is_component = np.ones(size, dtype=np.uint8) + self.num_components = size - cdef int union_(self, intp_t x, intp_t y) except -1 nogil: + cdef int union_(self, intp_t x, intp_t y) noexcept nogil: """Union together elements x and y""" cdef intp_t x_root = self.find(x) cdef intp_t y_root = self.find(y) @@ -166,7 +164,7 @@ cdef class BoruvkaUnionFind(object): return 0 - cdef intp_t find(self, intp_t x) except -1 nogil: + cdef intp_t find(self, intp_t x) noexcept nogil: """Find the root or identifier for the component that x is in""" cdef intp_t x_parent cdef intp_t x_grandparent @@ -316,6 +314,8 @@ cdef class BoruvkaAlgorithm: cdef cnp.ndarray[float64_t, ndim=2] knn_dist cdef cnp.ndarray[intp_t, ndim=2] knn_indices + # TODO: Revisit n_jobs semantics in a follow-up PR. Specifically, consider + # replacing with OpenMP prange # A shortcut: if we have a lot of points then we can split the points # into multiple piles and query them in parallel. On multicore systems # (most systems) this amounts to a 2x-3x wall clock improvement. @@ -381,7 +381,7 @@ cdef class BoruvkaAlgorithm: for n in range(self.num_nodes): self.component_of_node[n] = -(n+1) - cdef int update_components(self) except -1 nogil: + cdef int update_components(self) noexcept nogil: """Having found the nearest neighbor not in the same component for each current component (via tree traversal), run through adding edges to the min spanning tree and recomputing components via @@ -495,7 +495,7 @@ cdef class BoruvkaAlgorithm: self, intp_t node1, intp_t node2 - ) except -1 nogil: + ) noexcept nogil: """Perform a dual tree traversal, pruning wherever possible, to find the nearest neighbor not in the same component for each component. 
This is akin to a standard dual tree NN search, but we also prune @@ -778,8 +778,9 @@ cdef class BoruvkaAlgorithm: the tree passed in at construction""" cdef intp_t num_components = self.num_points - while num_components > 1: - self.dual_tree_traversal(0, 0) - num_components = self.update_components() + with nogil: + while num_components > 1: + self.dual_tree_traversal(0, 0) + num_components = self.update_components() return np.array(self.edges, dtype=MST_edge_dtype) From 2634228dc38215c26b50b36cdfa8b345fdd0aabe Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 10 Oct 2023 16:44:53 -0400 Subject: [PATCH 145/160] Removed unnecessary public attributes --- sklearn/cluster/_hdbscan/_boruvka.pyx | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_boruvka.pyx b/sklearn/cluster/_hdbscan/_boruvka.pyx index 116d72a910f5b..24c15e63181bf 100644 --- a/sklearn/cluster/_hdbscan/_boruvka.pyx +++ b/sklearn/cluster/_hdbscan/_boruvka.pyx @@ -242,17 +242,17 @@ cdef class BoruvkaAlgorithm: intp_t num_points, num_nodes, num_features bint is_KDTree - public float64_t[::1] core_distance - public float64_t[::1] bounds - public intp_t[::1] components - public intp_t[::1] component_of_point - public intp_t[::1] component_of_node - public intp_t[::1] candidate_neighbor - public intp_t[::1] candidate_point - public float64_t[::1] candidate_distance - public float64_t[:, ::1] centroid_distances - public intp_t[::1] idx_array - public NodeData_t[::1] node_data + float64_t[::1] core_distance + float64_t[::1] bounds + intp_t[::1] components + intp_t[::1] component_of_point + intp_t[::1] component_of_node + intp_t[::1] candidate_neighbor + intp_t[::1] candidate_point + float64_t[::1] candidate_distance + float64_t[:, ::1] centroid_distances + intp_t[::1] idx_array + NodeData_t[::1] node_data BoruvkaUnionFind component_union_find MST_edge_t[::1] edges intp_t num_edges From 53a7ec617a2bf1e7fafaf1079d47895a85fc38f5 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 11 Oct 2023 12:51:17 -0400 Subject: [PATCH 146/160] Updated formatting and removed parallel-query schema --- sklearn/cluster/_hdbscan/_boruvka.pyx | 73 ++++++++++----------------- 1 file changed, 28 insertions(+), 45 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_boruvka.pyx b/sklearn/cluster/_hdbscan/_boruvka.pyx index 24c15e63181bf..d058b08cba77d 100644 --- a/sklearn/cluster/_hdbscan/_boruvka.pyx +++ b/sklearn/cluster/_hdbscan/_boruvka.pyx @@ -63,7 +63,7 @@ from ...utils._typedefs cimport intp_t, float64_t, uint8_t, int8_t from ...neighbors._binary_tree cimport NodeData_t from ._linkage cimport MST_edge_t from ._linkage import MST_edge_dtype -from joblib import Parallel, delayed, effective_n_jobs +from joblib import effective_n_jobs cdef float64_t INF = np.inf @@ -113,8 +113,10 @@ cdef inline float64_t kd_tree_min_dist_dual( return metric._rdist_to_dist(rdist) -cdef class BoruvkaUnionFind(object): - """Efficient union find implementation. +cdef class BoruvkaUnionFind: + """ + A union find implementation which avoids virtual nodes in order to keep track + of exact correspondence between initial elements and components. 
Parameters ---------- @@ -135,13 +137,11 @@ cdef class BoruvkaUnionFind(object): cdef intp_t[::1] _parent cdef uint8_t[::1] _rank cdef uint8_t[::1] is_component - cdef intp_t num_components def __init__(self, size): self._parent = np.arange(size, dtype=np.intp) self._rank = np.zeros(size, dtype=np.uint8) self.is_component = np.ones(size, dtype=np.uint8) - self.num_components = size cdef int union_(self, intp_t x, intp_t y) noexcept nogil: """Union together elements x and y""" @@ -240,7 +240,7 @@ cdef class BoruvkaAlgorithm: int8_t approx_min_span_tree intp_t n_jobs, min_samples intp_t num_points, num_nodes, num_features - bint is_KDTree + bint has_KDTree float64_t[::1] core_distance float64_t[::1] bounds @@ -270,7 +270,7 @@ cdef class BoruvkaAlgorithm: ): self.tree =tree - self.is_KDTree = isinstance(tree, KDTree) + self.has_KDTree = isinstance(tree, KDTree) self.raw_data = self.tree.data self.node_bounds = self.tree.node_bounds self.alpha = alpha @@ -299,7 +299,7 @@ cdef class BoruvkaAlgorithm: self.idx_array = self.tree.idx_array self.node_data = self.tree.node_data - if not self.is_KDTree: + if not self.has_KDTree: # Compute centroids for BallTree self.centroid_distances = self.dist.pairwise(self.tree.node_bounds[0]) @@ -314,34 +314,14 @@ cdef class BoruvkaAlgorithm: cdef cnp.ndarray[float64_t, ndim=2] knn_dist cdef cnp.ndarray[intp_t, ndim=2] knn_indices - # TODO: Revisit n_jobs semantics in a follow-up PR. Specifically, consider - # replacing with OpenMP prange - # A shortcut: if we have a lot of points then we can split the points - # into multiple piles and query them in parallel. On multicore systems - # (most systems) this amounts to a 2x-3x wall clock improvement. - if self.num_points > 16384 and self.n_jobs > 1: - split_cnt = self.num_points // self.n_jobs - datasets = [] - for i in range(self.n_jobs): - if i == self.n_jobs - 1: - datasets.append(np.asarray(self.tree.data[i*split_cnt:])) - else: - datasets.append(np.asarray(self.tree.data[i*split_cnt:(i+1)*split_cnt])) - - knn_data = Parallel(n_jobs=self.n_jobs, max_nbytes=None)( - delayed(_core_dist_query) - (self.tree, points, - self.min_samples) - for points in datasets) - knn_dist = np.vstack([x[0] for x in knn_data]) - knn_indices = np.vstack([x[1] for x in knn_data]) - else: - knn_dist, knn_indices = self.tree.query( - self.tree.data, - k=self.min_samples, - dualtree=True, - breadth_first=True - ) + # TODO: Evaluate query-parallelization featured in original HDBSCAN + # implementation. Removed for now for simplicity. 
+ knn_dist, knn_indices = self.tree.query( + self.tree.data, + k=self.min_samples, + dualtree=True, + breadth_first=True + ) self.core_distance = knn_dist[:, self.min_samples - 1].copy() @@ -519,7 +499,7 @@ cdef class BoruvkaAlgorithm: cdef float64_t left_dist, right_dist # Compute the distance between the query and reference nodes - if self.is_KDTree: + if self.has_KDTree: node_dist = kd_tree_min_dist_dual( self.dist, node1, node2, self.node_bounds, @@ -597,8 +577,7 @@ cdef class BoruvkaAlgorithm: q = point_indices2[j] component2 = self.component_of_point[q] - if (self.core_distance[q] > - self.candidate_distance[component1]): + if self.core_distance[q] > self.candidate_distance[component1]: continue if component1 != component2: @@ -658,7 +637,7 @@ cdef class BoruvkaAlgorithm: self.bounds[right] ) - if self.is_KDTree: + if self.has_KDTree: new_bound = bound_max else: bound_min = min( @@ -683,13 +662,17 @@ cdef class BoruvkaAlgorithm: # compute distances between nodes to determine # whether we should prioritise the left or # right branch in the reference tree. - elif node1_info.is_leaf or (not node2_info.is_leaf and - node2_info.radius > node1_info.radius): - + elif ( + node1_info.is_leaf or + ( + not node2_info.is_leaf and + node2_info.radius > node1_info.radius + ) + ): left = 2 * node2 + 1 right = 2 * node2 + 2 - if self.is_KDTree: + if self.has_KDTree: left_dist = kd_tree_min_dist_dual( self.dist, node1, left, @@ -735,7 +718,7 @@ cdef class BoruvkaAlgorithm: else: left = 2 * node1 + 1 right = 2 * node1 + 2 - if self.is_KDTree: + if self.has_KDTree: left_dist = kd_tree_min_dist_dual( self.dist, left, node2, From 37e5d5c566eae93b3a07eff0a147a0225c1761da Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 11 Oct 2023 12:54:29 -0400 Subject: [PATCH 147/160] Remove attribute used in debugging --- sklearn/cluster/_hdbscan/hdbscan.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 6bc704264f197..1b52b18c21027 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -264,7 +264,7 @@ def _hdbscan_brute( ), UserWarning, ) - return min_spanning_tree, _process_mst(min_spanning_tree) + return _process_mst(min_spanning_tree) def _hdbscan_prims( @@ -347,7 +347,7 @@ def _hdbscan_prims( # Mutual reachability distance is implicit in mst_from_data_matrix min_spanning_tree = mst_from_data_matrix(X, core_distances, dist_metric, alpha) - return min_spanning_tree, _process_mst(min_spanning_tree) + return _process_mst(min_spanning_tree) def _hdbscan_boruvka( @@ -377,7 +377,7 @@ def _hdbscan_boruvka( **metric_params, ) min_spanning_tree = out.spanning_tree() - return min_spanning_tree, _process_mst(min_spanning_tree) + return _process_mst(min_spanning_tree) def remap_single_linkage_tree(tree, internal_to_raw, non_finite): @@ -915,7 +915,7 @@ def fit(self, X, y=None): "kd_tree" if self.metric in KDTree.valid_metrics else "ball_tree" ) - self.mst, self._single_linkage_tree_ = mst_func(**kwargs) + self._single_linkage_tree_ = mst_func(**kwargs) self.labels_, self.probabilities_ = tree_to_labels( self._single_linkage_tree_, From 07882814fb262e30961a0f656f6fcfc2c392a943 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 11 Oct 2023 14:17:39 -0400 Subject: [PATCH 148/160] Improved tests --- sklearn/cluster/tests/test_hdbscan.py | 85 ++++++++++++++------------- 1 file changed, 44 insertions(+), 41 deletions(-) diff --git a/sklearn/cluster/tests/test_hdbscan.py 
b/sklearn/cluster/tests/test_hdbscan.py index 5f4f7deabc2bb..d5b0c37e71461 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -28,32 +28,29 @@ X, y = shuffle(X, y, random_state=7) X = StandardScaler().fit_transform(X) -ALGORITHMS = [ +# These are necessary options for both space-tree/MST algorithm selection +BRUTE_COMPATIBLE = {"auto", "brute"} + +ALGORITHMS = { "kd_tree", "ball_tree", - "brute", - "auto", -] +}.union(BRUTE_COMPATIBLE) + +EXACT_MST_ALGORITHMS = {"prims", "boruvka_exact"} +MST_ALGORITHMS = {"boruvka_approx"}.union(EXACT_MST_ALGORITHMS).union(BRUTE_COMPATIBLE) + OUTLIER_SET = {-1} | {out["label"] for _, out in _OUTLIER_ENCODING.items()} @pytest.mark.parametrize("tree", ["kd_tree", "ball_tree"]) -@pytest.mark.parametrize("n_samples", [200, 16385]) @pytest.mark.parametrize("n_jobs", [1, 4]) -@pytest.mark.parametrize("mst_algo", ["boruvka_exact", "boruvka_approx"]) -def test_hdbscan_boruvka_matches(tree, n_samples, n_jobs, mst_algo): - if n_samples > 16384: - data, _ = make_blobs(n_samples=n_samples, random_state=10) - data = shuffle(X, random_state=7) - data = StandardScaler().fit_transform(X) - else: - data = X - - hdb_prims = HDBSCAN(algorithm=tree, mst_algorithm="prims", n_jobs=n_jobs).fit(data) - hdb_boruvka = HDBSCAN(algorithm=tree, mst_algorithm=mst_algo, n_jobs=n_jobs).fit( - data - ) +@pytest.mark.parametrize("mst_algorithm", ["boruvka_exact", "boruvka_approx"]) +def test_hdbscan_boruvka_matches(tree, n_jobs, mst_algorithm): + hdb_prims = HDBSCAN(algorithm=tree, mst_algorithm="prims", n_jobs=n_jobs).fit(X) + hdb_boruvka = HDBSCAN( + algorithm=tree, mst_algorithm=mst_algorithm, n_jobs=n_jobs + ).fit(X) labels_prims = hdb_prims.labels_ labels_boruvka = hdb_boruvka.labels_ @@ -62,7 +59,7 @@ def test_hdbscan_boruvka_matches(tree, n_samples, n_jobs, mst_algo): # We should expect that the exact boruvka algorithm produces a correct mst, # but the approximation will almost surely produce an incorrect tree, and # hence differ from the exact labels. - assert similarity >= 0.91 if "approx" in mst_algo else 1 + assert similarity >= 0.91 if "approx" in mst_algorithm else 1 def test_hdbscan_mst_algorithm_errors(): @@ -72,17 +69,23 @@ def test_hdbscan_mst_algorithm_errors(): with pytest.raises(ValueError, match=msg): hdb.fit(X, y) - for mst_algo in ["prims", "boruvka_exact", "boruvka_approx"]: - hdb = HDBSCAN(algorithm="brute", mst_algorithm=mst_algo) + for mst_algorithm in MST_ALGORITHMS - BRUTE_COMPATIBLE: + hdb = HDBSCAN(algorithm="brute", mst_algorithm=mst_algorithm) with pytest.raises(ValueError, match=msg): hdb.fit(X, y) @pytest.mark.parametrize("outlier_type", _OUTLIER_ENCODING) -def test_outlier_data(outlier_type): +@pytest.mark.parametrize("mst_algorithm", MST_ALGORITHMS) +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_outlier_data(outlier_type, mst_algorithm, algorithm): """ Tests if np.inf and np.nan data are each treated as special outliers. 
""" + algos = {algorithm, mst_algorithm} + if "brute" in algos and not algos.issubset(BRUTE_COMPATIBLE): + pytest.skip("Incompatible algorithm configuration") + outlier = { "infinite": np.inf, "missing": np.nan, @@ -97,7 +100,7 @@ def test_outlier_data(outlier_type): X_outlier = X.copy() X_outlier[0] = [outlier, 1] X_outlier[5] = [outlier, outlier] - model = HDBSCAN().fit(X_outlier) + model = HDBSCAN(algorithm=algorithm, mst_algorithm=mst_algorithm).fit(X_outlier) (missing_labels_idx,) = (model.labels_ == label).nonzero() assert_array_equal(missing_labels_idx, [0, 5]) @@ -106,7 +109,9 @@ def test_outlier_data(outlier_type): assert_array_equal(missing_probs_idx, [0, 5]) clean_indices = list(range(1, 5)) + list(range(6, 200)) - clean_model = HDBSCAN().fit(X_outlier[clean_indices]) + clean_model = HDBSCAN(algorithm=algorithm, mst_algorithm=mst_algorithm).fit( + X_outlier[clean_indices] + ) assert_array_equal(clean_model.labels_, model.labels_[clean_indices]) @@ -174,19 +179,19 @@ def test_hdbscan_feature_array(): assert score >= 0.98 -@pytest.mark.parametrize("algo", ALGORITHMS) +@pytest.mark.parametrize("algorithm", ALGORITHMS) @pytest.mark.parametrize("metric", _VALID_METRICS) -def test_hdbscan_algorithms(algo, metric): +def test_hdbscan_algorithms(algorithm, metric): """ Tests that HDBSCAN works with the expected combinations of algorithms and metrics, or raises the expected errors. """ - labels = HDBSCAN(algorithm=algo).fit_predict(X) + labels = HDBSCAN(algorithm=algorithm).fit_predict(X) n_clusters = len(set(labels) - OUTLIER_SET) assert n_clusters == n_clusters_true # Validation for brute is handled by `pairwise_distances` - if algo in ("brute", "auto"): + if algorithm in ("brute", "auto"): return ALGOS_TREES = { @@ -201,12 +206,12 @@ def test_hdbscan_algorithms(algo, metric): }.get(metric, None) hdb = HDBSCAN( - algorithm=algo, + algorithm=algorithm, metric=metric, metric_params=metric_params, ) - if metric not in ALGOS_TREES[algo].valid_metrics: + if metric not in ALGOS_TREES[algorithm].valid_metrics: with pytest.raises(ValueError): hdb.fit(X) elif metric == "wminkowski": @@ -326,7 +331,7 @@ def test_hdbscan_precomputed_non_brute(tree): @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) -@pytest.mark.parametrize("mst_algorithm", ["boruvka_exact", "prims"]) +@pytest.mark.parametrize("mst_algorithm", EXACT_MST_ALGORITHMS) def test_hdbscan_sparse(csr_container, mst_algorithm): """ Tests that HDBSCAN works correctly when passing sparse feature data. @@ -364,27 +369,25 @@ def test_hdbscan_sparse(csr_container, mst_algorithm): @pytest.mark.parametrize("algorithm", ALGORITHMS) -@pytest.mark.parametrize("mst_algorithm", ["boruvka_exact", "prims"]) +@pytest.mark.parametrize("mst_algorithm", MST_ALGORITHMS) def test_hdbscan_centers(algorithm, mst_algorithm): """ Tests that HDBSCAN centers are calculated and stored properly, and are accurate to the data. 
""" - mst_algorithm = "auto" if algorithm == "brute" else mst_algorithm - print(f"\nDEBUG *** {mst_algorithm=} | {algorithm=}") - if mst_algorithm == "auto" and algorithm == "auto": - pytest.xfail("We expect approximate boruvka to fail this closeness test") + algos = {mst_algorithm, algorithm} + if "brute" in algos and not algos.issubset(BRUTE_COMPATIBLE): + pytest.skip("Incompatible algorithm configuration") + centers = [(0.0, 0.0), (3.0, 3.0)] H, _ = make_blobs(n_samples=1000, random_state=0, centers=centers, cluster_std=0.5) hdb = HDBSCAN( algorithm=algorithm, mst_algorithm=mst_algorithm, store_centers="both" ).fit(H) - # The boruvka algorithm tends to produce the centroids/medoids in opposite - # order, so we reverse for this test. - if mst_algorithm == "boruvka_exact": - centers = centers[::-1] - for center, centroid, medoid in zip(centers, hdb.centroids_, hdb.medoids_): + centroids = np.sort(hdb.centroids_, axis=0) + medoids = np.sort(hdb.medoids_, axis=0) + for center, centroid, medoid in zip(centers, centroids, medoids): assert_allclose(center, centroid, rtol=1, atol=0.05) assert_allclose(center, medoid, rtol=1, atol=0.05) From 3dd7c5af0f64f51d0a79f90f35de7f56e1478a77 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 11 Oct 2023 14:30:37 -0400 Subject: [PATCH 149/160] Updated changelog --- doc/whats_new/v1.4.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 3cdf299280ed7..9d1a214285bb1 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -217,6 +217,15 @@ Changelog `kdtree` and `balltree` values will be removed in 1.6. :pr:`26744` by :user:`Shreesha Kumar Bhat `. +- |Enhancement| : The `mst_algorithm` argument is introduced, allowing for the user to + select between `{"auto", "brute", "prims", "boruvka_exact", "boruvka_approx"}`. + Note that setting `mst_algorithm="prims"` recovers the same functionality as + before this change, except when setting `algorithm="brute"` in which case + both `"auto", "brute"` options for `mst_algorithm` recover current behavior. + This instead introduces `"boruvka_exact", "boruvka_approx"` which are both faster + MST building algorithms than the current `"prims"`. + :pr:`27572` by :user:`Meekail Zain `. + :mod:`sklearn.compose` ...................... From 5a9aebe1c12b339ef3561dfff0d08b6a72e341c3 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 11 Oct 2023 14:36:00 -0400 Subject: [PATCH 150/160] Changed default to preserve backwards compatability --- sklearn/cluster/_hdbscan/hdbscan.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index db8cf91b5572a..6e455c1a0d628 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -516,10 +516,10 @@ class HDBSCAN(ClusterMixin, BaseEstimator): The `'balltree'` option was deprecated in version 1.4, and will be renamed to `'ball_tree'` in 1.6. - mst_algorithm : {"auto", "brute", "prims", "boruvka"}, default="auto" - Exactly which algorithm to use for building the minimum spanning tree; - by default this is set to `"auto"` which switches between `"prims"` and - `"boruvka"` based on a heuristic. + mst_algorithm : {"auto", "brute", "prims", "boruvka"}, default="prims" + Exactly which algorithm to use for building the minimum spanning tree. + The `"auto"` option switches between `"brute"` and `"boruvka"` based on + the data and use of precomputed distances. .. 
versionadded:: 1.4 @@ -701,7 +701,7 @@ def __init__( metric_params=None, alpha=1.0, algorithm="auto", - mst_algorithm="auto", + mst_algorithm="prims", leaf_size=40, n_jobs=None, cluster_selection_method="eom", From 3b40f8a31052d915d8bcdede68c71627d8318699 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 12 Oct 2023 09:31:30 -0400 Subject: [PATCH 151/160] Improved tests, and adjusted auto option for backwards compatability --- sklearn/cluster/_hdbscan/hdbscan.py | 27 ++++++++++++------- sklearn/cluster/tests/test_hdbscan.py | 38 ++++++++++++++++++--------- 2 files changed, 43 insertions(+), 22 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 6e455c1a0d628..4912b4ce0ff02 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -516,10 +516,13 @@ class HDBSCAN(ClusterMixin, BaseEstimator): The `'balltree'` option was deprecated in version 1.4, and will be renamed to `'ball_tree'` in 1.6. - mst_algorithm : {"auto", "brute", "prims", "boruvka"}, default="prims" + mst_algorithm : {"auto", "brute", "prims", "boruvka"}, default="auto" Exactly which algorithm to use for building the minimum spanning tree. - The `"auto"` option switches between `"brute"` and `"boruvka"` based on - the data and use of precomputed distances. + The `"auto"` option switches between `"brute"` and `"boruvka_exact"` based + on the data and use of precomputed distances. If you can tolerate some + inexactness and would prefer a speedup, consider using `"boruvka_approx"`. + The speedup is especially dramatic when dealing with many features + (n_features > ~45) .. versionadded:: 1.4 @@ -701,7 +704,7 @@ def __init__( metric_params=None, alpha=1.0, algorithm="auto", - mst_algorithm="prims", + mst_algorithm="auto", leaf_size=40, n_jobs=None, cluster_selection_method="eom", @@ -812,12 +815,18 @@ def fit(self, X, y=None): algorithms = {self.algorithm, self.mst_algorithm} acceptable_algorithms = {"auto", "brute"} + using_brute_compat_algos = algorithms.issubset(acceptable_algorithms) - if "brute" in algorithms and not algorithms.issubset(acceptable_algorithms): + if "brute" in algorithms and not using_brute_compat_algos: raise ValueError( "When setting either `algorithm='brute'` or `mst_algorithm='brute'`," " both keyword arguments must only be set to either 'brute' or 'auto'." ) + if self.metric == "precomputed" and not using_brute_compat_algos: + raise ValueError( + "When setting `metric='precomputed'`, both `mst_algorithm` and" + " `algorithm` must be set to either 'brute' or 'auto'." 
+ ) # TODO(1.6): Remove if self.algorithm == "kdtree": @@ -897,9 +906,9 @@ def fit(self, X, y=None): self.mst_algorithm == "boruvka_approx" ) else: - # Approximate boruvka is always preferable + # Boruvka is always preferable mst_func = _hdbscan_boruvka - kwargs["approx_min_span_tree"] = True + kwargs["approx_min_span_tree"] = False else: if issparse(X) or self.metric not in FAST_METRICS: @@ -907,9 +916,9 @@ def fit(self, X, y=None): mst_func = _hdbscan_brute kwargs["copy"] = self.copy else: - # Approximate boruvka is always preferable + # Boruvka is always preferable mst_func = _hdbscan_boruvka - kwargs["approx_min_span_tree"] = True + kwargs["approx_min_span_tree"] = False kwargs["leaf_size"] = self.leaf_size kwargs["algo"] = ( "kd_tree" if self.metric in KDTree.valid_metrics else "ball_tree" diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index d5b0c37e71461..2d9824c47af32 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -34,15 +34,21 @@ ALGORITHMS = { "kd_tree", "ball_tree", -}.union(BRUTE_COMPATIBLE) +} | BRUTE_COMPATIBLE EXACT_MST_ALGORITHMS = {"prims", "boruvka_exact"} -MST_ALGORITHMS = {"boruvka_approx"}.union(EXACT_MST_ALGORITHMS).union(BRUTE_COMPATIBLE) +MST_ALGORITHMS = {"boruvka_approx"} | EXACT_MST_ALGORITHMS | BRUTE_COMPATIBLE OUTLIER_SET = {-1} | {out["label"] for _, out in _OUTLIER_ENCODING.items()} +def _validate_algorithms(algorithm, mst_algorithm): + algos = {algorithm, mst_algorithm} + if "brute" in algos and not algos.issubset(BRUTE_COMPATIBLE): + pytest.xfail("Incompatible algorithm configuration") + + @pytest.mark.parametrize("tree", ["kd_tree", "ball_tree"]) @pytest.mark.parametrize("n_jobs", [1, 4]) @pytest.mark.parametrize("mst_algorithm", ["boruvka_exact", "boruvka_approx"]) @@ -82,10 +88,7 @@ def test_outlier_data(outlier_type, mst_algorithm, algorithm): """ Tests if np.inf and np.nan data are each treated as special outliers. """ - algos = {algorithm, mst_algorithm} - if "brute" in algos and not algos.issubset(BRUTE_COMPATIBLE): - pytest.skip("Incompatible algorithm configuration") - + _validate_algorithms(algorithm, mst_algorithm) outlier = { "infinite": np.inf, "missing": np.nan, @@ -319,14 +322,25 @@ def test_hdbscan_callable_metric(): assert n_clusters == n_clusters_true -@pytest.mark.parametrize("tree", ["kd", "ball"]) -def test_hdbscan_precomputed_non_brute(tree): +@pytest.mark.parametrize("algorithm", sorted(ALGORITHMS)) +@pytest.mark.parametrize("mst_algorithm", sorted(MST_ALGORITHMS)) +def test_hdbscan_precomputed_non_brute(algorithm, mst_algorithm): """ Tests that HDBSCAN correctly raises an error when passing precomputed data while requesting a tree-based algorithm. """ - hdb = HDBSCAN(metric="precomputed", algorithm=f"prims_{tree}tree") - with pytest.raises(ValueError): + algos = {algorithm, mst_algorithm} + if algos.issubset(BRUTE_COMPATIBLE): + return + hdb = HDBSCAN( + metric="precomputed", algorithm=algorithm, mst_algorithm=mst_algorithm + ) + + if "brute" in algos: + msg = "When setting either `algorithm='brute'` or `mst_algorithm='brute'`" + else: + msg = "When setting `metric='precomputed'`, both `mst_algorithm` and" + with pytest.raises(ValueError, match=msg): hdb.fit(X) @@ -375,9 +389,7 @@ def test_hdbscan_centers(algorithm, mst_algorithm): Tests that HDBSCAN centers are calculated and stored properly, and are accurate to the data. 
""" - algos = {mst_algorithm, algorithm} - if "brute" in algos and not algos.issubset(BRUTE_COMPATIBLE): - pytest.skip("Incompatible algorithm configuration") + _validate_algorithms(algorithm, mst_algorithm) centers = [(0.0, 0.0), (3.0, 3.0)] H, _ = make_blobs(n_samples=1000, random_state=0, centers=centers, cluster_std=0.5) From 04e4007b4637828705154865fa9188082a55c261 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 12 Oct 2023 09:37:03 -0400 Subject: [PATCH 152/160] Corrected changelog entry --- doc/whats_new/v1.4.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 9d1a214285bb1..5856f29ebc839 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -226,6 +226,10 @@ Changelog MST building algorithms than the current `"prims"`. :pr:`27572` by :user:`Meekail Zain `. + This implementation is an adaptation from the original implementation of HDBSCAN in + `scikit-learn-contrib/hdbscan `_, + by :user:`Leland McInnes ` et al. + :mod:`sklearn.compose` ...................... From e06188fd71b03e8ab94a926387d1c207cde9576b Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 12 Oct 2023 09:40:02 -0400 Subject: [PATCH 153/160] Removed extraneous function --- sklearn/cluster/_hdbscan/_boruvka.pyx | 9 --------- 1 file changed, 9 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_boruvka.pyx b/sklearn/cluster/_hdbscan/_boruvka.pyx index d058b08cba77d..056077715f7c9 100644 --- a/sklearn/cluster/_hdbscan/_boruvka.pyx +++ b/sklearn/cluster/_hdbscan/_boruvka.pyx @@ -183,15 +183,6 @@ cdef class BoruvkaUnionFind: return np.array(self.is_component).nonzero()[0] -def _core_dist_query(tree, data, min_samples): - return tree.query( - data, - k=min_samples, - dualtree=True, - breadth_first=True - ) - - cdef class BoruvkaAlgorithm: """A Dual Tree Boruvka Algorithm implemented for the sklearn KDTree space tree implementation. From 6ef166849f70859942f803193d175f9a71e5df50 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 12 Oct 2023 10:00:27 -0400 Subject: [PATCH 154/160] Stabalized tests by using sorted lists --- sklearn/cluster/tests/test_hdbscan.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index 2d9824c47af32..cab0073bfe39f 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -82,8 +82,8 @@ def test_hdbscan_mst_algorithm_errors(): @pytest.mark.parametrize("outlier_type", _OUTLIER_ENCODING) -@pytest.mark.parametrize("mst_algorithm", MST_ALGORITHMS) -@pytest.mark.parametrize("algorithm", ALGORITHMS) +@pytest.mark.parametrize("mst_algorithm", sorted(MST_ALGORITHMS)) +@pytest.mark.parametrize("algorithm", sorted(ALGORITHMS)) def test_outlier_data(outlier_type, mst_algorithm, algorithm): """ Tests if np.inf and np.nan data are each treated as special outliers. 
@@ -182,7 +182,7 @@ def test_hdbscan_feature_array(): assert score >= 0.98 -@pytest.mark.parametrize("algorithm", ALGORITHMS) +@pytest.mark.parametrize("algorithm", sorted(ALGORITHMS)) @pytest.mark.parametrize("metric", _VALID_METRICS) def test_hdbscan_algorithms(algorithm, metric): """ @@ -345,7 +345,7 @@ def test_hdbscan_precomputed_non_brute(algorithm, mst_algorithm): @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) -@pytest.mark.parametrize("mst_algorithm", EXACT_MST_ALGORITHMS) +@pytest.mark.parametrize("mst_algorithm", sorted(EXACT_MST_ALGORITHMS)) def test_hdbscan_sparse(csr_container, mst_algorithm): """ Tests that HDBSCAN works correctly when passing sparse feature data. @@ -382,8 +382,8 @@ def test_hdbscan_sparse(csr_container, mst_algorithm): HDBSCAN(metric="euclidean", algorithm="ball_tree").fit(X_sparse) -@pytest.mark.parametrize("algorithm", ALGORITHMS) -@pytest.mark.parametrize("mst_algorithm", MST_ALGORITHMS) +@pytest.mark.parametrize("algorithm", sorted(ALGORITHMS)) +@pytest.mark.parametrize("mst_algorithm", sorted(MST_ALGORITHMS)) def test_hdbscan_centers(algorithm, mst_algorithm): """ Tests that HDBSCAN centers are calculated and stored properly, and are From 76713ff19dc590008a03e4db5ab5c4e9f353fcab Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 12 Oct 2023 11:48:47 -0400 Subject: [PATCH 155/160] Updated to include deprecation for auto heuristic --- sklearn/cluster/_hdbscan/hdbscan.py | 49 +++++++++++++---- sklearn/cluster/tests/test_hdbscan.py | 76 +++++++++++++++++---------- 2 files changed, 89 insertions(+), 36 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 4912b4ce0ff02..1670e18c32d59 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -35,6 +35,7 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
+import warnings from numbers import Integral, Real from warnings import warn @@ -656,7 +657,7 @@ class HDBSCAN(ClusterMixin, BaseEstimator): >>> from sklearn.cluster import HDBSCAN >>> from sklearn.datasets import load_digits >>> X, _ = load_digits(return_X_y=True) - >>> hdb = HDBSCAN(min_cluster_size=20) + >>> hdb = HDBSCAN(min_cluster_size=20, mst_algorithm="prims") >>> hdb.fit(X) HDBSCAN(min_cluster_size=20) >>> hdb.labels_ @@ -684,7 +685,10 @@ class HDBSCAN(ClusterMixin, BaseEstimator): ), ], "mst_algorithm": [ - StrOptions({"auto", "brute", "prims", "boruvka_exact", "boruvka_approx"}) + StrOptions( + {"auto", "brute", "prims", "boruvka_exact", "boruvka_approx", "warn"}, + deprecated={"warn"}, + ), ], "leaf_size": [Interval(Integral, left=1, right=None, closed="left")], "n_jobs": [Integral, None], @@ -704,7 +708,8 @@ def __init__( metric_params=None, alpha=1.0, algorithm="auto", - mst_algorithm="auto", + # TODO(1.6): Change default to "auto" + mst_algorithm="warn", leaf_size=40, n_jobs=None, cluster_selection_method="eom", @@ -813,9 +818,35 @@ def fit(self, X, y=None): f" samples in X ({X.shape[0]})" ) - algorithms = {self.algorithm, self.mst_algorithm} - acceptable_algorithms = {"auto", "brute"} - using_brute_compat_algos = algorithms.issubset(acceptable_algorithms) + # TODO(1.6): Remove and set `mst_algorithm` default to "auto" + if self.mst_algorithm == "warn": + if self.algorithm == "brute" or ( + self.algorithm == "auto" and self.metric == "precomputed" + ): + mst_algorithm = "brute" + else: + mst_algorithm = "prims" + warnings.warn( + ( + "In version 1.6 the default MST algorithm dispatch behavior will" + " change to include the new `boruvka_exact` and `boruvka_approx`" + " algorithms, resulting in some models potentially changing. To" + " suppress this warning, and to avoid unintended changes in" + " behavior, please manually set `mst_algorithm`. You can opt in to" + " the new behavior by manually setting `mst_algorithm='auto'`. You" + " can preserve old behavior by setting `mst_algorithm` to `'brute'`" + " or `'auto'` when `algorithm='brute'`, or `algorithm='auto'` and" + " `metric='precomputed'`; otherwise set" + " `mst_algorithm='prims'` to keep old behavior." 
+ ), + FutureWarning, + ) + else: + mst_algorithm = self.mst_algorithm + + algorithms = {self.algorithm, mst_algorithm} + brute_compat_algorithms = {"auto", "brute"} + using_brute_compat_algos = algorithms.issubset(brute_compat_algorithms) if "brute" in algorithms and not using_brute_compat_algos: raise ValueError( @@ -897,13 +928,13 @@ def fit(self, X, y=None): ) kwargs["algo"] = tree_algorithm - if self.mst_algorithm != "auto": - if self.mst_algorithm == "prims": + if mst_algorithm != "auto": + if mst_algorithm == "prims": mst_func = _hdbscan_prims else: mst_func = _hdbscan_boruvka kwargs["approx_min_span_tree"] = ( - self.mst_algorithm == "boruvka_approx" + mst_algorithm == "boruvka_approx" ) else: # Boruvka is always preferable diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index cab0073bfe39f..3160293312853 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -125,7 +125,9 @@ def test_hdbscan_distance_matrix(): """ D = euclidean_distances(X) D_original = D.copy() - labels = HDBSCAN(metric="precomputed", copy=True).fit_predict(D) + labels = HDBSCAN(metric="precomputed", copy=True, mst_algorithm="auto").fit_predict( + D + ) assert_allclose(D, D_original) n_clusters = len(set(labels) - OUTLIER_SET) @@ -138,14 +140,14 @@ def test_hdbscan_distance_matrix(): msg = r"The precomputed distance matrix.*has shape" with pytest.raises(ValueError, match=msg): - HDBSCAN(metric="precomputed", copy=True).fit_predict(X) + HDBSCAN(metric="precomputed", copy=True, mst_algorithm="auto").fit_predict(X) msg = r"The precomputed distance matrix.*values" # Ensure the matrix is not symmetric D[0, 1] = 10 D[1, 0] = 1 with pytest.raises(ValueError, match=msg): - HDBSCAN(metric="precomputed").fit_predict(D) + HDBSCAN(metric="precomputed", mst_algorithm="auto").fit_predict(D) @pytest.mark.parametrize("sparse_constructor", [*CSR_CONTAINERS, *CSC_CONTAINERS]) @@ -162,7 +164,7 @@ def test_hdbscan_sparse_distance_matrix(sparse_constructor): D = sparse_constructor(D) D.eliminate_zeros() - labels = HDBSCAN(metric="precomputed").fit_predict(D) + labels = HDBSCAN(metric="precomputed", mst_algorithm="auto").fit_predict(D) n_clusters = len(set(labels) - OUTLIER_SET) assert n_clusters == n_clusters_true @@ -172,7 +174,7 @@ def test_hdbscan_feature_array(): Tests that HDBSCAN works with feature array, including an arbitrary goodness of fit check. Note that the check is a simple heuristic. """ - labels = HDBSCAN().fit_predict(X) + labels = HDBSCAN(mst_algorithm="auto").fit_predict(X) n_clusters = len(set(labels) - OUTLIER_SET) assert n_clusters == n_clusters_true @@ -189,7 +191,7 @@ def test_hdbscan_algorithms(algorithm, metric): Tests that HDBSCAN works with the expected combinations of algorithms and metrics, or raises the expected errors. """ - labels = HDBSCAN(algorithm=algorithm).fit_predict(X) + labels = HDBSCAN(algorithm=algorithm, mst_algorithm="auto").fit_predict(X) n_clusters = len(set(labels) - OUTLIER_SET) assert n_clusters == n_clusters_true @@ -231,7 +233,7 @@ def test_dbscan_clustering(): TODO: Improve and strengthen this test if at all possible. 
""" - clusterer = HDBSCAN().fit(X) + clusterer = HDBSCAN(mst_algorithm="auto").fit(X) labels = clusterer.dbscan_clustering(0.3) n_clusters = len(set(labels) - OUTLIER_SET) assert n_clusters == n_clusters_true @@ -249,7 +251,7 @@ def test_dbscan_clustering_outlier_data(cut_distance): X_outlier[0] = [np.inf, 1] X_outlier[2] = [1, np.nan] X_outlier[5] = [np.inf, np.nan] - model = HDBSCAN().fit(X_outlier) + model = HDBSCAN(mst_algorithm="auto").fit(X_outlier) labels = model.dbscan_clustering(cut_distance=cut_distance) missing_labels_idx = np.flatnonzero(labels == missing_label) @@ -259,7 +261,7 @@ def test_dbscan_clustering_outlier_data(cut_distance): assert_array_equal(infinite_labels_idx, [0]) clean_idx = list(set(range(200)) - set(missing_labels_idx + infinite_labels_idx)) - clean_model = HDBSCAN().fit(X_outlier[clean_idx]) + clean_model = HDBSCAN(mst_algorithm="auto").fit(X_outlier[clean_idx]) clean_labels = clean_model.dbscan_clustering(cut_distance=cut_distance) assert_array_equal(clean_labels, labels[clean_idx]) @@ -295,7 +297,7 @@ def test_hdbscan_no_clusters(): Tests that HDBSCAN correctly does not generate a valid cluster when the `min_cluster_size` is too large for the data. """ - labels = HDBSCAN(min_cluster_size=len(X) - 1).fit_predict(X) + labels = HDBSCAN(min_cluster_size=len(X) - 1, mst_algorithm="auto").fit_predict(X) n_clusters = len(set(labels) - OUTLIER_SET) assert n_clusters == 0 @@ -306,7 +308,9 @@ def test_hdbscan_min_cluster_size(): many points """ for min_cluster_size in range(2, len(X), 1): - labels = HDBSCAN(min_cluster_size=min_cluster_size).fit_predict(X) + labels = HDBSCAN( + min_cluster_size=min_cluster_size, mst_algorithm="auto" + ).fit_predict(X) true_labels = [label for label in labels if label != -1] if len(true_labels) != 0: assert np.min(np.bincount(true_labels)) >= min_cluster_size @@ -317,7 +321,7 @@ def test_hdbscan_callable_metric(): Tests that HDBSCAN works when passed a callable metric. """ metric = distance.euclidean - labels = HDBSCAN(metric=metric).fit_predict(X) + labels = HDBSCAN(metric=metric, mst_algorithm="auto").fit_predict(X) n_clusters = len(set(labels) - OUTLIER_SET) assert n_clusters == n_clusters_true @@ -359,7 +363,7 @@ def test_hdbscan_sparse(csr_container, mst_algorithm): _X_sparse = csr_container(X) X_sparse = _X_sparse.copy() - sparse_labels = HDBSCAN().fit(X_sparse).labels_ + sparse_labels = HDBSCAN(mst_algorithm="auto").fit(X_sparse).labels_ fowlkes_mallows_score(dense_labels, sparse_labels) == 1 # Compare that the sparse and dense non-precomputed routines return the same labels @@ -374,12 +378,14 @@ def test_hdbscan_sparse(csr_container, mst_algorithm): X_sparse = _X_sparse.copy() X_sparse[0, 0] = outlier_val - sparse_labels = HDBSCAN().fit(X_sparse).labels_ + sparse_labels = HDBSCAN(mst_algorithm="auto").fit(X_sparse).labels_ fowlkes_mallows_score(dense_labels, sparse_labels) == 1 msg = "Sparse data matrices only support algorithm `brute`." 
with pytest.raises(ValueError, match=msg): - HDBSCAN(metric="euclidean", algorithm="ball_tree").fit(X_sparse) + HDBSCAN(metric="euclidean", algorithm="ball_tree", mst_algorithm="auto").fit( + X_sparse + ) @pytest.mark.parametrize("algorithm", sorted(ALGORITHMS)) @@ -460,7 +466,7 @@ def test_hdbscan_better_than_dbscan(): cluster_std=[0.2, 0.35, 1.35, 1.35], random_state=0, ) - hdb = HDBSCAN().fit(X) + hdb = HDBSCAN(mst_algorithm="auto").fit(X) n_clusters = len(set(hdb.labels_)) - int(-1 in hdb.labels_) assert n_clusters == 4 @@ -478,7 +484,7 @@ def test_hdbscan_usable_inputs(X, kwargs): Tests that HDBSCAN works correctly for array-likes and precomputed inputs with non-finite points. """ - HDBSCAN(min_samples=1, **kwargs).fit(X) + HDBSCAN(min_samples=1, mst_algorithm="auto", **kwargs).fit(X) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) @@ -491,10 +497,11 @@ def test_hdbscan_sparse_distances_too_few_nonzero(csr_container): msg = "There exists points with fewer than" with pytest.raises(ValueError, match=msg): - HDBSCAN(metric="precomputed").fit(X) + HDBSCAN(metric="precomputed", mst_algorithm="auto").fit(X) -def test_hdbscan_tree_invalid_metric(): +@pytest.mark.parametrize("mst_algorithm", sorted(MST_ALGORITHMS - {"brute"})) +def test_hdbscan_tree_invalid_metric(mst_algorithm): """ Tests that HDBSCAN correctly raises an error for invalid metric choices. """ @@ -506,16 +513,24 @@ def test_hdbscan_tree_invalid_metric(): # Callables are not supported for either with pytest.raises(ValueError, match=msg): - HDBSCAN(algorithm="kd_tree", metric=metric_callable).fit(X) + HDBSCAN( + algorithm="kd_tree", metric=metric_callable, mst_algorithm=mst_algorithm + ).fit(X) with pytest.raises(ValueError, match=msg): - HDBSCAN(algorithm="ball_tree", metric=metric_callable).fit(X) + HDBSCAN( + algorithm="ball_tree", metric=metric_callable, mst_algorithm=mst_algorithm + ).fit(X) # The set of valid metrics for KDTree at the time of writing this test is a # strict subset of those supported in BallTree metrics_not_kd = list(set(BallTree.valid_metrics) - set(KDTree.valid_metrics)) if len(metrics_not_kd) > 0: with pytest.raises(ValueError, match=msg): - HDBSCAN(algorithm="kd_tree", metric=metrics_not_kd[0]).fit(X) + HDBSCAN( + algorithm="kd_tree", + metric=metrics_not_kd[0], + mst_algorithm=mst_algorithm, + ).fit(X) def test_hdbscan_too_many_min_samples(): @@ -523,7 +538,7 @@ def test_hdbscan_too_many_min_samples(): Tests that HDBSCAN correctly raises an error when setting `min_samples` larger than the number of samples. """ - hdb = HDBSCAN(min_samples=len(X) + 1) + hdb = HDBSCAN(min_samples=len(X) + 1, mst_algorithm="auto") msg = r"min_samples (.*) must be at most" with pytest.raises(ValueError, match=msg): hdb.fit(X) @@ -537,7 +552,7 @@ def test_hdbscan_precomputed_dense_nan(): X_nan = X.copy() X_nan[0, 0] = np.nan msg = "np.nan values found in precomputed-dense" - hdb = HDBSCAN(metric="precomputed") + hdb = HDBSCAN(metric="precomputed", mst_algorithm="auto") with pytest.raises(ValueError, match=msg): hdb.fit(X_nan) @@ -560,7 +575,7 @@ def test_labelling_distinct(global_random_seed, allow_single_cluster, epsilon): ], ) - est = HDBSCAN().fit(X) + est = HDBSCAN(mst_algorithm="auto").fit(X) condensed_tree = _condense_tree( est._single_linkage_tree_, min_cluster_size=est.min_cluster_size ) @@ -629,7 +644,7 @@ def test_hdbscan_warning_on_deprecated_algorithm_name(): " to'kd_tree'`in 1.6. To keep the past behaviour, set `algorithm='kd_tree'`." 
) with pytest.warns(FutureWarning, match=msg): - HDBSCAN(algorithm="kdtree").fit(X) + HDBSCAN(algorithm="kdtree", mst_algorithm="auto").fit(X) # Test that warning message is shown when algorithm='balltree' msg = ( @@ -638,4 +653,11 @@ def test_hdbscan_warning_on_deprecated_algorithm_name(): " `algorithm='ball_tree'`." ) with pytest.warns(FutureWarning, match=msg): - HDBSCAN(algorithm="balltree").fit(X) + HDBSCAN(algorithm="balltree", mst_algorithm="auto").fit(X) + + +# TODO(1.6): Remove +def test_hdbscan_warning_on_mst_default(): + msg = "In version 1.6 the default MST algorithm dispatch behavior will" + with pytest.warns(FutureWarning, match=msg): + HDBSCAN().fit_predict(X) From de5d041e0cd855afcd88b4d4e8e6bbba63d0c28d Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 12 Oct 2023 11:50:39 -0400 Subject: [PATCH 156/160] Updated example in docstring --- sklearn/cluster/_hdbscan/hdbscan.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 1670e18c32d59..3e07c16212927 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -657,9 +657,9 @@ class HDBSCAN(ClusterMixin, BaseEstimator): >>> from sklearn.cluster import HDBSCAN >>> from sklearn.datasets import load_digits >>> X, _ = load_digits(return_X_y=True) - >>> hdb = HDBSCAN(min_cluster_size=20, mst_algorithm="prims") + >>> hdb = HDBSCAN(min_cluster_size=20, mst_algorithm='prims') >>> hdb.fit(X) - HDBSCAN(min_cluster_size=20) + HDBSCAN(min_cluster_size=20, mst_algorithm='prims') >>> hdb.labels_ array([ 2, 6, -1, ..., -1, -1, -1]) """ From 8ba71e8898e9f7e40208f49a79234b87c784dab1 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Thu, 12 Oct 2023 14:36:40 -0400 Subject: [PATCH 157/160] Updated centers test to use less adversarial data --- sklearn/cluster/tests/test_hdbscan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index 3160293312853..f62440450ec28 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -398,7 +398,7 @@ def test_hdbscan_centers(algorithm, mst_algorithm): _validate_algorithms(algorithm, mst_algorithm) centers = [(0.0, 0.0), (3.0, 3.0)] - H, _ = make_blobs(n_samples=1000, random_state=0, centers=centers, cluster_std=0.5) + H, _ = make_blobs(n_samples=1000, random_state=0, centers=centers, cluster_std=0.25) hdb = HDBSCAN( algorithm=algorithm, mst_algorithm=mst_algorithm, store_centers="both" ).fit(H) From 6a592f4383da9eb2894f715b14d12b258a083ae2 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Fri, 13 Oct 2023 13:27:11 -0400 Subject: [PATCH 158/160] Corrected test by making hdb model more noise-tolerant --- sklearn/cluster/tests/test_hdbscan.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index f62440450ec28..4188f1828bc28 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -398,9 +398,12 @@ def test_hdbscan_centers(algorithm, mst_algorithm): _validate_algorithms(algorithm, mst_algorithm) centers = [(0.0, 0.0), (3.0, 3.0)] - H, _ = make_blobs(n_samples=1000, random_state=0, centers=centers, cluster_std=0.25) + H, _ = make_blobs(n_samples=1000, random_state=0, centers=centers, cluster_std=0.5) hdb = HDBSCAN( - algorithm=algorithm, mst_algorithm=mst_algorithm, store_centers="both" + 
algorithm=algorithm, + mst_algorithm=mst_algorithm, + store_centers="both", + min_samples=10, ).fit(H) centroids = np.sort(hdb.centroids_, axis=0) From 68d4fd14d743b6bfe86cb79645de23a94c569254 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Fri, 13 Oct 2023 14:58:38 -0400 Subject: [PATCH 159/160] Avoid FutureWarning in tests --- sklearn/cluster/tests/test_hdbscan.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index 4188f1828bc28..22818ae702f7e 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -214,6 +214,7 @@ def test_hdbscan_algorithms(algorithm, metric): algorithm=algorithm, metric=metric, metric_params=metric_params, + mst_algorithm="auto", ) if metric not in ALGOS_TREES[algorithm].valid_metrics: @@ -275,6 +276,7 @@ def test_hdbscan_high_dimensional(): labels = HDBSCAN( algorithm="auto", metric="seuclidean", + mst_algorithm="auto", metric_params={"V": np.ones(H.shape[1])}, ).fit_predict(H) n_clusters = len(set(labels) - OUTLIER_SET) @@ -286,7 +288,9 @@ def test_hdbscan_best_balltree_metric(): Tests that HDBSCAN using `BallTree` works. """ labels = HDBSCAN( - metric="seuclidean", metric_params={"V": np.ones(X.shape[1])} + metric="seuclidean", + mst_algorithm="auto", + metric_params={"V": np.ones(X.shape[1])}, ).fit_predict(X) n_clusters = len(set(labels) - OUTLIER_SET) assert n_clusters == n_clusters_true @@ -435,6 +439,7 @@ def test_hdbscan_allow_single_cluster_with_epsilon(): cluster_selection_epsilon=0.0, cluster_selection_method="eom", allow_single_cluster=True, + mst_algorithm="auto", ).fit_predict(no_structure) unique_labels, counts = np.unique(labels, return_counts=True) assert len(unique_labels) == 2 @@ -450,6 +455,7 @@ def test_hdbscan_allow_single_cluster_with_epsilon(): cluster_selection_method="eom", allow_single_cluster=True, algorithm="kd_tree", + mst_algorithm="auto", ).fit_predict(no_structure) unique_labels, counts = np.unique(labels, return_counts=True) assert len(unique_labels) == 2 From 39aa99291d173f056f3f88878c509bd1b6897d19 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Sat, 14 Oct 2023 10:06:56 -0400 Subject: [PATCH 160/160] Fixed remaining FutureWarning --- sklearn/tests/test_docstring_parameters.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index e6d2ade736f4a..6adc1419b0ea0 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -253,6 +253,10 @@ def test_fit_docstring_attributes(name, Estimator): if Estimator.__name__ == "MDS": est.set_params(normalized_stress="auto") + # TODO(1.6): TO BE REMOVED for 1.6 (avoid FutureWarning) + if Estimator.__name__ == "HDBSCAN": + est.set_params(mst_algorithm="auto") + # Low max iter to speed up tests: we are only interested in checking the existence # of fitted attributes. This should be invariant to whether it has converged or not. if "max_iter" in est.get_params():
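For context, a minimal usage sketch of the `mst_algorithm` parameter introduced by this series (illustrative only: the blob parameters and the `kd_tree` choice are arbitrary, and the snippet assumes a build of this branch):

>>> from sklearn.cluster import HDBSCAN
>>> from sklearn.datasets import make_blobs
>>> X, _ = make_blobs(n_samples=500, centers=3, cluster_std=0.5, random_state=0)
>>> # Explicitly selecting an MST algorithm also avoids the 1.4 FutureWarning
>>> # emitted while the default dispatch behavior is being transitioned.
>>> exact = HDBSCAN(algorithm="kd_tree", mst_algorithm="boruvka_exact").fit(X)
>>> exact.labels_.shape
(500,)
>>> # The approximate variant trades an exact minimum spanning tree for speed;
>>> # its labels may differ slightly from "prims"/"boruvka_exact" results.
>>> approx = HDBSCAN(algorithm="kd_tree", mst_algorithm="boruvka_approx").fit(X)

As in the tests above, `"prims"` and `"boruvka_exact"` are expected to yield identical clusterings, while `"boruvka_approx"` only approximates them in exchange for faster MST construction.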