From 0ab847cdb8a792e812ee95a5adcc034d6523c031 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 19 Oct 2022 15:40:55 +0200 Subject: [PATCH 01/10] MAINT further style improvement --- sklearn/cluster/_hdbscan/_reachability.pyx | 179 +++++++++++++-------- sklearn/cluster/_hdbscan/hdbscan.py | 22 +-- 2 files changed, 127 insertions(+), 74 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index 64aa9573e103a..0cf3f99fdc2a7 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -1,44 +1,54 @@ -# mutual reachability distance compiutations +# mutual reachability distance computations # Authors: Leland McInnes # Meekail Zain # License: 3-clause BSD import numpy as np -from cython.parallel cimport prange +from scipy.sparse import issparse + +from ...neighbors import BallTree, KDTree + cimport numpy as cnp -from libc.math cimport isfinite +from cython.parallel cimport prange +from libc.math cimport isfinite, INFINITY -import gc -from scipy.sparse import issparse -from scipy.spatial.distance import pdist, squareform +def mutual_reachability_graph( + distance_matrix, n_neighbors=5, max_distance=0.0, copy=False +): + """Compute the weighted adjacency matrix of the mutual reachability graph. -from ...neighbors import BallTree, KDTree + The mutual reachability distance used to build the graph is defined as:: -def mutual_reachability(distance_matrix, min_points=5, max_dist=0.0): - """Compute the weighted adjacency matrix of the mutual reachability - graph of a distance matrix. Note that computation is performed in-place for - `distance_matrix`. If out-of-place computation is required, pass a copy to - this function. + max(d_core(x_p), d_core(x_q), d(x_p, x_q)) + + and the core distance `d_core` is defined as the distance between a point + `x_p` and its k-th nearest neighbor. Parameters ---------- - distance_matrix : ndarray or sparse matrix of shape (n_samples, n_samples) + distance_matrix : {ndarray, sparse matrix} of shape (n_samples, n_samples) Array of distances between samples. If sparse, the array must be in `LIL` format. - min_points : int, default=5 + n_neighbors : int, default=5 The number of points in a neighbourhood for a point to be considered a core point. - max_dist : float, default=0.0 + max_distance : float, default=0.0 The distance which `np.inf` is replaced with. When the true mutual- reachability distance is measured to be infinite, it is instead - truncated to `max_dist`. + truncated to `max_dist`. Only used when `distance_matrix` is a sparse + matrix. + + copy : bool, default=False + Whether or not to compute the mutual reachinbility graph in-place, i.e. + modifying directly `distance_matrix`. Returns ------- - mututal_reachability: ndarray of shape (n_samples, n_samples) + mututal_reachability_graph: {ndarray, sparse matrix} of shape \ + (n_samples, n_samples) Weighted adjacency matrix of the mutual reachability graph. References @@ -48,78 +58,121 @@ def mutual_reachability(distance_matrix, min_points=5, max_dist=0.0): In Pacific-Asia Conference on Knowledge Discovery and Data Mining (pp. 160-172). Springer Berlin Heidelberg. """ - # Account for index offset - min_points -= 1 + if copy: + distance_matrix = distance_matrix.copy() - # Note that in both routines `distance_matrix` is operated on in-place. At - # this point, if out-of-place operation is desired then this function - # should have been passed a copy. if issparse(distance_matrix): - return _sparse_mutual_reachability( - distance_matrix, - min_points=min_points, - max_dist=max_dist + # FIXME: since we convert to a CSR matrix then we do not make the operation + # in-place. + return _sparse_mutual_reachability_graph( + distance_matrix, n_neighbors=n_neighbors, max_distance=max_distance ).tocsr() - return _dense_mutual_reachability(distance_matrix, min_points=min_points) + return _dense_mutual_reachability_graph(distance_matrix, n_neighbors=n_neighbors) + -cdef _dense_mutual_reachability( +cdef _dense_mutual_reachability_graph( cnp.ndarray[dtype=cnp.float64_t, ndim=2] distance_matrix, - cnp.intp_t min_points=5 + cnp.intp_t n_neighbors=5 ): - cdef cnp.intp_t i, j, n_samples = distance_matrix.shape[0] - cdef cnp.float64_t mr_dist - cdef cnp.float64_t[:] core_distances + """Dense implementation of mutual reachability graph. + + The computation is done in-place, i.e. the distance matrix is modified + directly. + + Parameters + ---------- + distance_matrix : ndarray of shape (n_samples, n_samples) + Array of distances between samples. + + n_neighbors : int, default=5 + The number of points in a neighbourhood for a point to be considered + a core point. + + Returns + ------- + mututal_reachability_graph : ndarray of shape (n_samples, n_samples) + Weighted adjacency matrix of the mutual reachability graph. This object + is the same as `distance_matrix` since the operation is done in-place. + """ + cdef: + cnp.intp_t i, j, n_samples = distance_matrix.shape[0] + cnp.intp_t farther_neighbor_idx = n_neighbors - 1 + cnp.float64_t mutual_reachibility_distance + cnp.float64_t[:] core_distances - # Compute the core distances for all samples `x_p` corresponding - # to the distance of the k-th farthest neighbours (including - # `x_p`). core_distances = np.partition( - distance_matrix, - min_points, - axis=0, - )[min_points] + distance_matrix, farther_neighbor_idx, axis=0 + )[farther_neighbor_idx] with nogil: for i in range(n_samples): for j in prange(n_samples): - mr_dist = max( + mutual_reachibility_distance = max( core_distances[i], core_distances[j], - distance_matrix[i, j] + distance_matrix[i, j], ) - distance_matrix[i, j] = mr_dist + distance_matrix[i, j] = mutual_reachibility_distance return distance_matrix -# Assumes LIL format. + # TODO: Rewrite for CSR. -cdef _sparse_mutual_reachability( +cdef _sparse_mutual_reachability_graph( object distance_matrix, - cnp.intp_t min_points=5, - cnp.float64_t max_dist=0. + cnp.intp_t n_neighbors=5, + cnp.float64_t max_distance=0.0, ): - cdef cnp.intp_t i, j, n, n_samples = distance_matrix.shape[0] - cdef cnp.float64_t mr_dist - cdef cnp.float64_t[:] core_distances - cdef cnp.int32_t[:] nz_row_data, nz_col_data + """Sparse implementation of mutual reachability graph. + + The computation is done in-place, i.e. the distance matrix is modified + directly. This implementation only accepts `LIL` format sparse matrices. + + Parameters + ---------- + distance_matrix : sparse matrix of shape (n_samples, n_samples) + Sparse matrix of distances between samples. The sparse format should + be `LIL`. + + n_neighbors : int, default=5 + The number of points in a neighbourhood for a point to be considered + a core point. + + Returns + ------- + mututal_reachability_graph : sparse matrix of shape (n_samples, n_samples) + Weighted adjacency matrix of the mutual reachability graph. This object + is the same as `distance_matrix` since the operation is done in-place. + """ + cdef: + cnp.intp_t i, j, sample_idx, n_samples = distance_matrix.shape[0] + list row_distances + cnp.intp_t farther_neighbor_idx = n_neighbors - 1 + cnp.float64_t mutual_reachibility_distance + cnp.float64_t[:] core_distances + cnp.int32_t[:] nz_row_data, nz_col_data + core_distances = np.empty(n_samples, dtype=np.float64) for i in range(n_samples): - if min_points < len(distance_matrix.data[i]): + row_distances = distance_matrix.data[i] + if farther_neighbor_idx < len(row_distances): core_distances[i] = np.partition( - distance_matrix.data[i], - min_points - )[min_points] + row_distances, farther_neighbor_idx + )[farther_neighbor_idx] else: - core_distances[i] = np.infty + core_distances[i] = INFINITY nz_row_data, nz_col_data = distance_matrix.nonzero() - for n in range(nz_row_data.shape[0]): - i = nz_row_data[n] - j = nz_col_data[n] - mr_dist = max(core_distances[i], core_distances[j], distance_matrix[i, j]) - if isfinite(mr_dist): - distance_matrix[i, j] = mr_dist - elif max_dist > 0: - distance_matrix[i, j] = max_dist + for sample_idx in range(nz_row_data.shape[0]): + i, j = nz_row_data[sample_idx], nz_col_data[sample_idx] + mutual_reachibility_distance = max( + core_distances[i], core_distances[j], distance_matrix[i, j] + ) + if isfinite(mutual_reachibility_distance): + distance_matrix[i, j] = mutual_reachibility_distance + elif max_distance > 0: + # TODO: it seems that we assume that distance_matrix is initialized + # with zeros. + distance_matrix[i, j] = max_distance return distance_matrix diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 79beead943898..fe2a641bb07f5 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -22,7 +22,7 @@ from ...utils._param_validation import Interval, StrOptions from ...utils.validation import _assert_all_finite from ._linkage import label, mst_from_distance_matrix, mst_from_data_matrix -from ._reachability import mutual_reachability +from ._reachability import mutual_reachability_graph from ._tree import compute_stability, condense_tree, get_clusters, labelling_at_cut FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics @@ -63,7 +63,7 @@ def _brute_mst(mutual_reachability, min_samples, sparse=False): f"There exists points with fewer than {min_samples} neighbors. Ensure" " your distance matrix has non-zero values for at least" f" `min_sample`={min_samples} neighbors for each points (i.e. K-nn" - " graph), or specify a `max_dist` in `metric_params` to use when" + " graph), or specify a `max_distance` in `metric_params` to use when" " distances are missing." ) @@ -108,7 +108,7 @@ def _process_mst(min_spanning_tree): def _hdbscan_brute( X, - min_samples=5, + n_neighbors=5, alpha=None, metric="euclidean", n_jobs=None, @@ -128,17 +128,17 @@ def _hdbscan_brute( distance_matrix /= alpha # max_dist is only relevant for sparse and is ignored for dense - max_dist = metric_params.get("max_dist", 0.0) + max_distance = metric_params.get("max_distance", 0.0) sparse = issparse(distance_matrix) distance_matrix = distance_matrix.tolil() if sparse else distance_matrix # Note that `distance_matrix` is manipulated in-place, however we do not # need it for anything else past this point, hence the operation is safe. - mutual_reachability_ = mutual_reachability( - distance_matrix, min_points=min_samples, max_dist=max_dist + mutual_reachability_ = mutual_reachability_graph( + distance_matrix, n_neighbors=n_neighbors, max_distance=max_distance ) min_spanning_tree = _brute_mst( - mutual_reachability_, min_samples=min_samples, sparse=sparse + mutual_reachability_, min_samples=n_neighbors, sparse=sparse ) # Warn if the MST couldn't be constructed around the missing distances if np.isinf(min_spanning_tree.T[2]).any(): @@ -156,7 +156,7 @@ def _hdbscan_brute( def _hdbscan_prims( X, algo, - min_samples=5, + n_neighbors=5, alpha=1.0, metric="euclidean", leaf_size=40, @@ -168,7 +168,7 @@ def _hdbscan_prims( # Get distance to kth nearest neighbour nbrs = NearestNeighbors( - n_neighbors=min_samples, + n_neighbors=n_neighbors, algorithm=algo, leaf_size=leaf_size, metric=metric, @@ -177,7 +177,7 @@ def _hdbscan_prims( p=None, ).fit(X) - neighbors_distances, _ = nbrs.kneighbors(X, min_samples, return_distance=True) + neighbors_distances, _ = nbrs.kneighbors(X, n_neighbors, return_distance=True) core_distances = np.ascontiguousarray(neighbors_distances[:, -1]) dist_metric = DistanceMetric.get_metric(metric, **metric_params) @@ -590,7 +590,7 @@ def fit(self, X, y=None): mst_func = None kwargs = dict( X=X, - min_samples=self._min_samples, + n_neighbors=self._min_samples, alpha=self.alpha, metric=self.metric, n_jobs=self.n_jobs, From d6a59a53be9dd0ea60e8e4b2a17d65c8d9398e40 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 19 Oct 2022 15:54:33 +0200 Subject: [PATCH 02/10] FIX let's be consistent and call min_samples --- sklearn/cluster/_hdbscan/_reachability.pyx | 20 ++++++++++---------- sklearn/cluster/_hdbscan/hdbscan.py | 14 +++++++------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index 3c5a8b86a9f5a..fb9c288c039b1 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -12,7 +12,7 @@ from libc.math cimport isfinite, INFINITY def mutual_reachability_graph( - distance_matrix, n_neighbors=5, max_distance=0.0, copy=False + distance_matrix, min_samples=5, max_distance=0.0, copy=False ): """Compute the weighted adjacency matrix of the mutual reachability graph. @@ -29,7 +29,7 @@ def mutual_reachability_graph( Array of distances between samples. If sparse, the array must be in `LIL` format. - n_neighbors : int, default=5 + min_samples : int, default=5 The number of points in a neighbourhood for a point to be considered a core point. @@ -63,15 +63,15 @@ def mutual_reachability_graph( # FIXME: since we convert to a CSR matrix then we do not make the operation # in-place. return _sparse_mutual_reachability_graph( - distance_matrix, n_neighbors=n_neighbors, max_distance=max_distance + distance_matrix, min_samples=min_samples, max_distance=max_distance ).tocsr() - return _dense_mutual_reachability_graph(distance_matrix, n_neighbors=n_neighbors) + return _dense_mutual_reachability_graph(distance_matrix, min_samples=min_samples) cdef _dense_mutual_reachability_graph( cnp.ndarray[dtype=cnp.float64_t, ndim=2] distance_matrix, - cnp.intp_t n_neighbors=5 + cnp.intp_t min_samples=5 ): """Dense implementation of mutual reachability graph. @@ -83,7 +83,7 @@ cdef _dense_mutual_reachability_graph( distance_matrix : ndarray of shape (n_samples, n_samples) Array of distances between samples. - n_neighbors : int, default=5 + min_samples : int, default=5 The number of points in a neighbourhood for a point to be considered a core point. @@ -95,7 +95,7 @@ cdef _dense_mutual_reachability_graph( """ cdef: cnp.intp_t i, j, n_samples = distance_matrix.shape[0] - cnp.intp_t farther_neighbor_idx = n_neighbors - 1 + cnp.intp_t farther_neighbor_idx = min_samples - 1 cnp.float64_t mutual_reachibility_distance cnp.float64_t[:] core_distances @@ -118,7 +118,7 @@ cdef _dense_mutual_reachability_graph( # TODO: Rewrite for CSR. cdef _sparse_mutual_reachability_graph( object distance_matrix, - cnp.intp_t n_neighbors=5, + cnp.intp_t min_samples=5, cnp.float64_t max_distance=0.0, ): """Sparse implementation of mutual reachability graph. @@ -132,7 +132,7 @@ cdef _sparse_mutual_reachability_graph( Sparse matrix of distances between samples. The sparse format should be `LIL`. - n_neighbors : int, default=5 + min_samples : int, default=5 The number of points in a neighbourhood for a point to be considered a core point. @@ -145,7 +145,7 @@ cdef _sparse_mutual_reachability_graph( cdef: cnp.intp_t i, j, sample_idx, n_samples = distance_matrix.shape[0] list row_distances - cnp.intp_t farther_neighbor_idx = n_neighbors - 1 + cnp.intp_t farther_neighbor_idx = min_samples - 1 cnp.float64_t mutual_reachibility_distance cnp.float64_t[:] core_distances cnp.int32_t[:] nz_row_data, nz_col_data diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index d5da676409423..5ff89f68dcf8d 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -108,7 +108,7 @@ def _process_mst(min_spanning_tree): def _hdbscan_brute( X, - n_neighbors=5, + min_samples=5, alpha=None, metric="euclidean", n_jobs=None, @@ -144,10 +144,10 @@ def _hdbscan_brute( # Note that `distance_matrix` is manipulated in-place, however we do not # need it for anything else past this point, hence the operation is safe. mutual_reachability_ = mutual_reachability_graph( - distance_matrix, n_neighbors=n_neighbors, max_distance=max_distance + distance_matrix, min_samples=min_samples, max_distance=max_distance ) min_spanning_tree = _brute_mst( - mutual_reachability_, min_samples=n_neighbors, sparse=sparse + mutual_reachability_, min_samples=min_samples, sparse=sparse ) # Warn if the MST couldn't be constructed around the missing distances if np.isinf(min_spanning_tree.T[2]).any(): @@ -165,7 +165,7 @@ def _hdbscan_brute( def _hdbscan_prims( X, algo, - n_neighbors=5, + min_samples=5, alpha=1.0, metric="euclidean", leaf_size=40, @@ -177,7 +177,7 @@ def _hdbscan_prims( # Get distance to kth nearest neighbour nbrs = NearestNeighbors( - n_neighbors=n_neighbors, + n_neighbors=min_samples, algorithm=algo, leaf_size=leaf_size, metric=metric, @@ -186,7 +186,7 @@ def _hdbscan_prims( p=None, ).fit(X) - neighbors_distances, _ = nbrs.kneighbors(X, n_neighbors, return_distance=True) + neighbors_distances, _ = nbrs.kneighbors(X, min_samples, return_distance=True) core_distances = np.ascontiguousarray(neighbors_distances[:, -1]) dist_metric = DistanceMetric.get_metric(metric, **metric_params) @@ -599,7 +599,7 @@ def fit(self, X, y=None): mst_func = None kwargs = dict( X=X, - n_neighbors=self._min_samples, + min_samples=self._min_samples, alpha=self.alpha, metric=self.metric, n_jobs=self.n_jobs, From e09ece767de7523f1eb0f911dcef1e80cf3d95c9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 19 Oct 2022 18:07:14 +0200 Subject: [PATCH 03/10] TMP POC for CSC processing --- sklearn/cluster/_hdbscan/_reachability.pyx | 78 +++++++++++++--------- 1 file changed, 46 insertions(+), 32 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index fb9c288c039b1..be10d28ab8555 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -6,11 +6,17 @@ import numpy as np from scipy.sparse import issparse +cimport cython cimport numpy as cnp from cython.parallel cimport prange from libc.math cimport isfinite, INFINITY +ctypedef fused integral: + int + long long + + def mutual_reachability_graph( distance_matrix, min_samples=5, max_distance=0.0, copy=False ): @@ -59,19 +65,30 @@ def mutual_reachability_graph( if copy: distance_matrix = distance_matrix.copy() + further_neighbor_idx = min_samples - 1 if issparse(distance_matrix): # FIXME: since we convert to a CSR matrix then we do not make the operation # in-place. - return _sparse_mutual_reachability_graph( - distance_matrix, min_samples=min_samples, max_distance=max_distance - ).tocsr() + distance_matrix = distance_matrix.tocsc() + _sparse_mutual_reachability_graph( + distance_matrix.data, + distance_matrix.indices, + distance_matrix.indptr, + distance_matrix.shape, + further_neighbor_idx=further_neighbor_idx, + max_distance=max_distance, + ) + else: + _dense_mutual_reachability_graph( + distance_matrix, further_neighbor_idx=further_neighbor_idx + ) + return distance_matrix - return _dense_mutual_reachability_graph(distance_matrix, min_samples=min_samples) cdef _dense_mutual_reachability_graph( cnp.ndarray[dtype=cnp.float64_t, ndim=2] distance_matrix, - cnp.intp_t min_samples=5 + cnp.intp_t further_neighbor_idx=5 ): """Dense implementation of mutual reachability graph. @@ -95,13 +112,12 @@ cdef _dense_mutual_reachability_graph( """ cdef: cnp.intp_t i, j, n_samples = distance_matrix.shape[0] - cnp.intp_t farther_neighbor_idx = min_samples - 1 cnp.float64_t mutual_reachibility_distance cnp.float64_t[:] core_distances core_distances = np.partition( - distance_matrix, farther_neighbor_idx, axis=0 - )[farther_neighbor_idx] + distance_matrix, further_neighbor_idx, axis=0 + )[further_neighbor_idx] with nogil: for i in range(n_samples): @@ -112,13 +128,15 @@ cdef _dense_mutual_reachability_graph( distance_matrix[i, j], ) distance_matrix[i, j] = mutual_reachibility_distance - return distance_matrix # TODO: Rewrite for CSR. cdef _sparse_mutual_reachability_graph( - object distance_matrix, - cnp.intp_t min_samples=5, + cnp.ndarray[cnp.float64_t, ndim=1, mode="c"] data, + cnp.ndarray[cnp.int32_t, ndim=1, mode="c"] indices, + cnp.ndarray[cnp.int32_t, ndim=1, mode="c"] indptr, + cnp.intp_t n_samples, + cnp.intp_t further_neighbor_idx=5, cnp.float64_t max_distance=0.0, ): """Sparse implementation of mutual reachability graph. @@ -143,34 +161,30 @@ cdef _sparse_mutual_reachability_graph( is the same as `distance_matrix` since the operation is done in-place. """ cdef: - cnp.intp_t i, j, sample_idx, n_samples = distance_matrix.shape[0] - list row_distances - cnp.intp_t farther_neighbor_idx = min_samples - 1 + cnp.intp_t i, col_ind, row_ind cnp.float64_t mutual_reachibility_distance cnp.float64_t[:] core_distances - cnp.int32_t[:] nz_row_data, nz_col_data + cnp.float64_t[:] col_data + cnp.int32_t[:] row_indices core_distances = np.empty(n_samples, dtype=np.float64) for i in range(n_samples): - row_distances = distance_matrix.data[i] - if farther_neighbor_idx < len(row_distances): + col_data = data[indptr[i]:indptr[i + 1]] + if further_neighbor_idx < col_data.size: core_distances[i] = np.partition( - row_distances, farther_neighbor_idx - )[farther_neighbor_idx] + col_data, further_neighbor_idx + )[further_neighbor_idx] else: core_distances[i] = INFINITY - nz_row_data, nz_col_data = distance_matrix.nonzero() - for sample_idx in range(nz_row_data.shape[0]): - i, j = nz_row_data[sample_idx], nz_col_data[sample_idx] - mutual_reachibility_distance = max( - core_distances[i], core_distances[j], distance_matrix[i, j] - ) - if isfinite(mutual_reachibility_distance): - distance_matrix[i, j] = mutual_reachibility_distance - elif max_distance > 0: - # TODO: it seems that we assume that distance_matrix is initialized - # with zeros. - distance_matrix[i, j] = max_distance - return distance_matrix + for col_ind in range(n_samples): + for i in range(indptr[col_ind], indptr[col_ind + 1]): + row_ind = indices[i] + mutual_reachibility_distance = max( + core_distances[col_ind], core_distances[row_ind], data[i] + ) + if isfinite(mutual_reachibility_distance): + data[i] = mutual_reachibility_distance + elif max_distance > 0: + data[i] = max_distance From 1cb0db82f91e5bfee8cf445786aacf381e8911d8 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 20 Oct 2022 11:34:38 +0200 Subject: [PATCH 04/10] ENH CSR, fused type, no-copy --- sklearn/cluster/_hdbscan/_reachability.pyx | 101 ++++++++++----------- sklearn/cluster/_hdbscan/hdbscan.py | 35 ++----- 2 files changed, 59 insertions(+), 77 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index be10d28ab8555..d347882d5bc82 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -7,10 +7,12 @@ import numpy as np from scipy.sparse import issparse cimport cython +from cython cimport floating cimport numpy as cnp from cython.parallel cimport prange from libc.math cimport isfinite, INFINITY +cnp.import_array() ctypedef fused integral: int @@ -18,7 +20,7 @@ ctypedef fused integral: def mutual_reachability_graph( - distance_matrix, min_samples=5, max_distance=0.0, copy=False + distance_matrix, min_samples=5, max_distance=0.0 ): """Compute the weighted adjacency matrix of the mutual reachability graph. @@ -29,11 +31,13 @@ def mutual_reachability_graph( and the core distance `d_core` is defined as the distance between a point `x_p` and its k-th nearest neighbor. + Note that all computations are done in-place. + Parameters ---------- distance_matrix : {ndarray, sparse matrix} of shape (n_samples, n_samples) Array of distances between samples. If sparse, the array must be in - `LIL` format. + `CSR` format. min_samples : int, default=5 The number of points in a neighbourhood for a point to be considered @@ -45,10 +49,6 @@ def mutual_reachability_graph( truncated to `max_dist`. Only used when `distance_matrix` is a sparse matrix. - copy : bool, default=False - Whether or not to compute the mutual reachinbility graph in-place, i.e. - modifying directly `distance_matrix`. - Returns ------- mututal_reachability_graph: {ndarray, sparse matrix} of shape \ @@ -62,19 +62,17 @@ def mutual_reachability_graph( In Pacific-Asia Conference on Knowledge Discovery and Data Mining (pp. 160-172). Springer Berlin Heidelberg. """ - if copy: - distance_matrix = distance_matrix.copy() - further_neighbor_idx = min_samples - 1 if issparse(distance_matrix): - # FIXME: since we convert to a CSR matrix then we do not make the operation - # in-place. - distance_matrix = distance_matrix.tocsc() + if distance_matrix.format != "csr": + raise ValueError( + "Only sparse CSR matrices are supported for `distance_matrix`." + ) _sparse_mutual_reachability_graph( distance_matrix.data, distance_matrix.indices, distance_matrix.indptr, - distance_matrix.shape, + distance_matrix.shape[0], further_neighbor_idx=further_neighbor_idx, max_distance=max_distance, ) @@ -86,9 +84,9 @@ def mutual_reachability_graph( -cdef _dense_mutual_reachability_graph( - cnp.ndarray[dtype=cnp.float64_t, ndim=2] distance_matrix, - cnp.intp_t further_neighbor_idx=5 +def _dense_mutual_reachability_graph( + cnp.ndarray[dtype=floating, ndim=2] distance_matrix, + cnp.intp_t further_neighbor_idx, ): """Dense implementation of mutual reachability graph. @@ -100,24 +98,20 @@ cdef _dense_mutual_reachability_graph( distance_matrix : ndarray of shape (n_samples, n_samples) Array of distances between samples. - min_samples : int, default=5 - The number of points in a neighbourhood for a point to be considered - a core point. - - Returns - ------- - mututal_reachability_graph : ndarray of shape (n_samples, n_samples) - Weighted adjacency matrix of the mutual reachability graph. This object - is the same as `distance_matrix` since the operation is done in-place. + further_neighbor_idx : int + The index of the furthest neighbor to use to define the core distances. """ cdef: cnp.intp_t i, j, n_samples = distance_matrix.shape[0] - cnp.float64_t mutual_reachibility_distance - cnp.float64_t[:] core_distances + floating mutual_reachibility_distance + floating[:] core_distances + # We assume that the distance matrix is symmetric. We choose to sort every + # row to have the same implementation than the sparse case that requires + # CSR matrix. core_distances = np.partition( - distance_matrix, further_neighbor_idx, axis=0 - )[further_neighbor_idx] + distance_matrix, further_neighbor_idx, axis=1 + )[:, further_neighbor_idx] with nogil: for i in range(n_samples): @@ -130,44 +124,47 @@ cdef _dense_mutual_reachability_graph( distance_matrix[i, j] = mutual_reachibility_distance -# TODO: Rewrite for CSR. -cdef _sparse_mutual_reachability_graph( - cnp.ndarray[cnp.float64_t, ndim=1, mode="c"] data, - cnp.ndarray[cnp.int32_t, ndim=1, mode="c"] indices, - cnp.ndarray[cnp.int32_t, ndim=1, mode="c"] indptr, +def _sparse_mutual_reachability_graph( + cnp.ndarray[floating, ndim=1, mode="c"] data, + cnp.ndarray[integral, ndim=1, mode="c"] indices, + cnp.ndarray[integral, ndim=1, mode="c"] indptr, cnp.intp_t n_samples, - cnp.intp_t further_neighbor_idx=5, - cnp.float64_t max_distance=0.0, + cnp.intp_t further_neighbor_idx, + cnp.float64_t max_distance, ): """Sparse implementation of mutual reachability graph. The computation is done in-place, i.e. the distance matrix is modified - directly. This implementation only accepts `LIL` format sparse matrices. + directly. This implementation only accepts `CSR` format sparse matrices. Parameters ---------- distance_matrix : sparse matrix of shape (n_samples, n_samples) Sparse matrix of distances between samples. The sparse format should - be `LIL`. + be `CSR`. - min_samples : int, default=5 - The number of points in a neighbourhood for a point to be considered - a core point. + further_neighbor_idx : int + The index of the furthest neighbor to use to define the core distances. - Returns - ------- - mututal_reachability_graph : sparse matrix of shape (n_samples, n_samples) - Weighted adjacency matrix of the mutual reachability graph. This object - is the same as `distance_matrix` since the operation is done in-place. + max_distance : float + The distance which `np.inf` is replaced with. When the true mutual- + reachability distance is measured to be infinite, it is instead + truncated to `max_dist`. Only used when `distance_matrix` is a sparse + matrix. """ cdef: - cnp.intp_t i, col_ind, row_ind - cnp.float64_t mutual_reachibility_distance - cnp.float64_t[:] core_distances - cnp.float64_t[:] col_data - cnp.int32_t[:] row_indices + cnp.intp_t i, col_ind + integral row_ind + floating mutual_reachibility_distance + floating[:] core_distances + floating[:] col_data + + if floating is float: + dtype = np.float32 + else: + dtype = np.float64 - core_distances = np.empty(n_samples, dtype=np.float64) + core_distances = np.empty(n_samples, dtype=dtype) for i in range(n_samples): col_data = data[indptr[i]:indptr[i + 1]] diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 5ff89f68dcf8d..753c4145ccc70 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -46,8 +46,8 @@ } -def _brute_mst(mutual_reachability, min_samples, sparse=False): - if not sparse: +def _brute_mst(mutual_reachability, min_samples): + if not issparse(mutual_reachability): return mst_from_distance_matrix(mutual_reachability) # Check connected component on mutual reachability @@ -116,10 +116,6 @@ def _hdbscan_brute( **metric_params, ): if metric == "precomputed": - # Treating this case explicitly, instead of letting - # sklearn.metrics.pairwise_distances handle it, - # enables the usage of numpy.inf in the distance - # matrix to indicate missing distance information. distance_matrix = X.copy() if copy else X else: distance_matrix = pairwise_distances( @@ -127,28 +123,18 @@ def _hdbscan_brute( ) distance_matrix /= alpha - # max_dist is only relevant for sparse and is ignored for dense max_distance = metric_params.get("max_distance", 0.0) - sparse = issparse(distance_matrix) - - # TODO: Investigate whether it is worth implementing a PWD backend for the - # combined operations of: - # - The pairwise distance calculation - # - The element-wise mutual-reachability calculation - # I suspect this would be better handled as one composite Cython routine to - # minimize memory-movement, however I (@micky774) am unsure whether it is - # narrow enough of a scope for the current PWD backend, or if it is better - # as a separate utility. - distance_matrix = distance_matrix.tolil() if sparse else distance_matrix + if issparse(distance_matrix) and distance_matrix.format != "csr": + # we need CSR format to avoid a conversion in `_brute_mst` when calling + # `csgraph.connected_components` + distance_matrix = distance_matrix.tocsr() # Note that `distance_matrix` is manipulated in-place, however we do not # need it for anything else past this point, hence the operation is safe. mutual_reachability_ = mutual_reachability_graph( distance_matrix, min_samples=min_samples, max_distance=max_distance ) - min_spanning_tree = _brute_mst( - mutual_reachability_, min_samples=min_samples, sparse=sparse - ) + min_spanning_tree = _brute_mst(mutual_reachability_, min_samples=min_samples) # Warn if the MST couldn't be constructed around the missing distances if np.isinf(min_spanning_tree.T[2]).any(): warn( @@ -358,10 +344,9 @@ class HDBSCAN(ClusterMixin, BaseEstimator): copy : bool, default=False If `copy=True` then any time an in-place modifications would be made that would overwrite data passed to :term:`fit`, a copy will first be - made, guaranteeing that the original data will be unchanged. Currently - this only makes a difference when passing in a dense precomputed - distance array (i.e. when `metric="precomputed"`) and using the - `"brute"` algorithm (see `algorithm` for details). + made, guaranteeing that the original data will be unchanged. + Currently, it only applies with `metric="precomputed"`, passing a dense + array or a sparse matrix of format CSR and algorithm used is `"brute"`. Attributes ---------- From 8a38591a81308263d1943a807cff79621fc89167 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 20 Oct 2022 11:36:00 +0200 Subject: [PATCH 05/10] iter --- sklearn/cluster/_hdbscan/_reachability.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index d347882d5bc82..12fe7d5309152 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -1,6 +1,7 @@ # mutual reachability distance computations # Authors: Leland McInnes # Meekail Zain +# Guillaume Lemaitre # License: 3-clause BSD import numpy as np From 41cb21ed37a4debbf5207d0b4e207582a5e17589 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 20 Oct 2022 11:38:59 +0200 Subject: [PATCH 06/10] homogeneous dtype for max_distance --- sklearn/cluster/_hdbscan/_reachability.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index 12fe7d5309152..e451f124d270f 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -84,7 +84,6 @@ def mutual_reachability_graph( return distance_matrix - def _dense_mutual_reachability_graph( cnp.ndarray[dtype=floating, ndim=2] distance_matrix, cnp.intp_t further_neighbor_idx, @@ -131,7 +130,7 @@ def _sparse_mutual_reachability_graph( cnp.ndarray[integral, ndim=1, mode="c"] indptr, cnp.intp_t n_samples, cnp.intp_t further_neighbor_idx, - cnp.float64_t max_distance, + floating max_distance, ): """Sparse implementation of mutual reachability graph. From c510bf85ece258568f41ce38c45d414cd710e388 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 20 Oct 2022 16:33:05 +0200 Subject: [PATCH 07/10] TST add a couple of tests (wip) --- sklearn/cluster/_hdbscan/tests/__init__.py | 0 .../_hdbscan/tests/test_reachibility.py | 50 +++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 sklearn/cluster/_hdbscan/tests/__init__.py create mode 100644 sklearn/cluster/_hdbscan/tests/test_reachibility.py diff --git a/sklearn/cluster/_hdbscan/tests/__init__.py b/sklearn/cluster/_hdbscan/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sklearn/cluster/_hdbscan/tests/test_reachibility.py b/sklearn/cluster/_hdbscan/tests/test_reachibility.py new file mode 100644 index 0000000000000..9d610c9a10f2c --- /dev/null +++ b/sklearn/cluster/_hdbscan/tests/test_reachibility.py @@ -0,0 +1,50 @@ +import numpy as np +import pytest + +from sklearn.utils._testing import ( + _convert_container, + assert_allclose, +) + +from sklearn.cluster._hdbscan._reachability import mutual_reachability_graph + + +def test_mutual_reachability_graph_error_sparse_format(): + """Check that we raise an error if the sparse format is not CSR.""" + rng = np.random.RandomState(0) + X = rng.randn(10, 10) + X = X.T @ X + np.fill_diagonal(X, 0.0) + X = _convert_container(X, "sparse_csc") + + err_msg = "Only sparse CSR matrices are supported" + with pytest.raises(ValueError, match=err_msg): + mutual_reachability_graph(X) + + +@pytest.mark.parametrize("array_type", ["array", "sparse_csr"]) +def test_mutual_reachability_graph_inplace(array_type): + """Check that the operation is happening inplace.""" + rng = np.random.RandomState(0) + X = rng.randn(10, 10) + X = X.T @ X + np.fill_diagonal(X, 0.0) + X = _convert_container(X, array_type) + + mr_graph = mutual_reachability_graph(X) + + assert id(mr_graph) == id(X) + + +def test_mutual_reachability_graph_equivalence_dense_sparse(): + """Check that we get the same results for dense and sparse implementation.""" + rng = np.random.RandomState(0) + X = rng.randn(5, 5) + X_dense = X.T @ X + np.fill_diagonal(X_dense, 0.0) + X_sparse = _convert_container(X_dense, "sparse_csr") + + mr_graph_dense = mutual_reachability_graph(X_dense, min_samples=3) + mr_graph_sparse = mutual_reachability_graph(X_sparse, min_samples=3) + + assert_allclose(mr_graph_dense, mr_graph_sparse.A) From 85c1914afa9b6f9e6a48678fe4f07db51d82b20a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 20 Oct 2022 16:50:13 +0200 Subject: [PATCH 08/10] TST some more tests --- sklearn/cluster/_hdbscan/_reachability.pyx | 29 ++++++++++--------- .../_hdbscan/tests/test_reachibility.py | 16 +++++++++- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index e451f124d270f..d1716dd79e7fd 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -157,7 +157,7 @@ def _sparse_mutual_reachability_graph( integral row_ind floating mutual_reachibility_distance floating[:] core_distances - floating[:] col_data + floating[:] row_data if floating is float: dtype = np.float32 @@ -167,21 +167,22 @@ def _sparse_mutual_reachability_graph( core_distances = np.empty(n_samples, dtype=dtype) for i in range(n_samples): - col_data = data[indptr[i]:indptr[i + 1]] - if further_neighbor_idx < col_data.size: + row_data = data[indptr[i]:indptr[i + 1]] + if further_neighbor_idx < row_data.size: core_distances[i] = np.partition( - col_data, further_neighbor_idx + row_data, further_neighbor_idx )[further_neighbor_idx] else: core_distances[i] = INFINITY - for col_ind in range(n_samples): - for i in range(indptr[col_ind], indptr[col_ind + 1]): - row_ind = indices[i] - mutual_reachibility_distance = max( - core_distances[col_ind], core_distances[row_ind], data[i] - ) - if isfinite(mutual_reachibility_distance): - data[i] = mutual_reachibility_distance - elif max_distance > 0: - data[i] = max_distance + with nogil: + for col_ind in range(n_samples): + for i in range(indptr[col_ind], indptr[col_ind + 1]): + row_ind = indices[i] + mutual_reachibility_distance = max( + core_distances[col_ind], core_distances[row_ind], data[i] + ) + if isfinite(mutual_reachibility_distance): + data[i] = mutual_reachibility_distance + elif max_distance > 0: + data[i] = max_distance diff --git a/sklearn/cluster/_hdbscan/tests/test_reachibility.py b/sklearn/cluster/_hdbscan/tests/test_reachibility.py index 9d610c9a10f2c..c8ba28d0af25b 100644 --- a/sklearn/cluster/_hdbscan/tests/test_reachibility.py +++ b/sklearn/cluster/_hdbscan/tests/test_reachibility.py @@ -41,10 +41,24 @@ def test_mutual_reachability_graph_equivalence_dense_sparse(): rng = np.random.RandomState(0) X = rng.randn(5, 5) X_dense = X.T @ X - np.fill_diagonal(X_dense, 0.0) X_sparse = _convert_container(X_dense, "sparse_csr") mr_graph_dense = mutual_reachability_graph(X_dense, min_samples=3) mr_graph_sparse = mutual_reachability_graph(X_sparse, min_samples=3) assert_allclose(mr_graph_dense, mr_graph_sparse.A) + + +@pytest.mark.parametrize("array_type", ["array", "sparse_csr"]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_mutual_reachability_graph_preserve_dtype(array_type, dtype): + """Check that the computation preserve dtype thanks to fused types.""" + rng = np.random.RandomState(0) + X = rng.randn(10, 10) + X = (X.T @ X).astype(dtype) + np.fill_diagonal(X, 0.0) + X = _convert_container(X, array_type) + + assert X.dtype == dtype + mr_graph = mutual_reachability_graph(X) + assert mr_graph.dtype == dtype From 9ba964d14803852f095909d428adf94ada2374b2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 20 Oct 2022 16:53:01 +0200 Subject: [PATCH 09/10] fused type --- sklearn/cluster/_hdbscan/_reachability.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index d1716dd79e7fd..d925d1d22e62b 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -153,7 +153,7 @@ def _sparse_mutual_reachability_graph( matrix. """ cdef: - cnp.intp_t i, col_ind + integral i, col_ind integral row_ind floating mutual_reachibility_distance floating[:] core_distances From 0c65f8cc42217612654daf22805f28331e38fa95 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 20 Oct 2022 16:58:39 +0200 Subject: [PATCH 10/10] FIX put correct name on indices --- sklearn/cluster/_hdbscan/_reachability.pyx | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index d925d1d22e62b..c83fef742e82e 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -153,8 +153,7 @@ def _sparse_mutual_reachability_graph( matrix. """ cdef: - integral i, col_ind - integral row_ind + integral i, col_ind, row_ind floating mutual_reachibility_distance floating[:] core_distances floating[:] row_data @@ -176,11 +175,11 @@ def _sparse_mutual_reachability_graph( core_distances[i] = INFINITY with nogil: - for col_ind in range(n_samples): - for i in range(indptr[col_ind], indptr[col_ind + 1]): - row_ind = indices[i] + for row_ind in range(n_samples): + for i in range(indptr[row_ind], indptr[row_ind + 1]): + col_ind = indices[i] mutual_reachibility_distance = max( - core_distances[col_ind], core_distances[row_ind], data[i] + core_distances[row_ind], core_distances[col_ind], data[i] ) if isfinite(mutual_reachibility_distance): data[i] = mutual_reachibility_distance