From a068e51f65d8b8981224295ccb76812f462cda02 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 18 Oct 2022 19:35:46 -0400 Subject: [PATCH 01/25] Improved documentation --- sklearn/cluster/_hdbscan/_reachability.pyx | 11 ++++------- sklearn/cluster/_hdbscan/hdbscan.py | 8 ++++++++ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index 64aa9573e103a..4118732d6e623 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -4,16 +4,13 @@ # License: 3-clause BSD import numpy as np -from cython.parallel cimport prange +from scipy.sparse import issparse +from ...neighbors import BallTree, KDTree + cimport numpy as cnp +from cython.parallel cimport prange from libc.math cimport isfinite -import gc - -from scipy.sparse import issparse -from scipy.spatial.distance import pdist, squareform - -from ...neighbors import BallTree, KDTree def mutual_reachability(distance_matrix, min_points=5, max_dist=0.0): """Compute the weighted adjacency matrix of the mutual reachability diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 79beead943898..1658f61b52aed 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -130,6 +130,14 @@ def _hdbscan_brute( # max_dist is only relevant for sparse and is ignored for dense max_dist = metric_params.get("max_dist", 0.0) sparse = issparse(distance_matrix) + + # TODO: Investigate whether it is worth implementing a PWD backend for the + # combined operations of: + # - The pairwise distance calculation + # - The element-wise mutual-reachability calculation + # I suspect this would be better handled as one composite Cython routine to + # minimize memory-movement, however I (@micky774) am unsure whether it is + # narrow enough of a scope for the current PWD backend. 
distance_matrix = distance_matrix.tolil() if sparse else distance_matrix # Note that `distance_matrix` is manipulated in-place, however we do not From 06152a302916cb1459f9a1589aa6cfad74ccdff4 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 18 Oct 2022 20:20:25 -0400 Subject: [PATCH 02/25] Updated comment --- sklearn/cluster/_hdbscan/hdbscan.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 1658f61b52aed..4a8760503ae40 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -137,7 +137,8 @@ def _hdbscan_brute( # - The element-wise mutual-reachability calculation # I suspect this would be better handled as one composite Cython routine to # minimize memory-movement, however I (@micky774) am unsure whether it is - # narrow enough of a scope for the current PWD backend. + # narrow enough of a scope for the current PWD backend, or if it is better + # as a separate utility. 
distance_matrix = distance_matrix.tolil() if sparse else distance_matrix # Note that `distance_matrix` is manipulated in-place, however we do not From 0ab847cdb8a792e812ee95a5adcc034d6523c031 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 19 Oct 2022 15:40:55 +0200 Subject: [PATCH 03/25] MAINT further style improvement --- sklearn/cluster/_hdbscan/_reachability.pyx | 179 +++++++++++++-------- sklearn/cluster/_hdbscan/hdbscan.py | 22 +-- 2 files changed, 127 insertions(+), 74 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index 64aa9573e103a..0cf3f99fdc2a7 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -1,44 +1,54 @@ -# mutual reachability distance compiutations +# mutual reachability distance computations # Authors: Leland McInnes # Meekail Zain # License: 3-clause BSD import numpy as np -from cython.parallel cimport prange +from scipy.sparse import issparse + +from ...neighbors import BallTree, KDTree + cimport numpy as cnp -from libc.math cimport isfinite +from cython.parallel cimport prange +from libc.math cimport isfinite, INFINITY -import gc -from scipy.sparse import issparse -from scipy.spatial.distance import pdist, squareform +def mutual_reachability_graph( + distance_matrix, n_neighbors=5, max_distance=0.0, copy=False +): + """Compute the weighted adjacency matrix of the mutual reachability graph. -from ...neighbors import BallTree, KDTree + The mutual reachability distance used to build the graph is defined as:: -def mutual_reachability(distance_matrix, min_points=5, max_dist=0.0): - """Compute the weighted adjacency matrix of the mutual reachability - graph of a distance matrix. Note that computation is performed in-place for - `distance_matrix`. If out-of-place computation is required, pass a copy to - this function. 
+ max(d_core(x_p), d_core(x_q), d(x_p, x_q)) + + and the core distance `d_core` is defined as the distance between a point + `x_p` and its k-th nearest neighbor. Parameters ---------- - distance_matrix : ndarray or sparse matrix of shape (n_samples, n_samples) + distance_matrix : {ndarray, sparse matrix} of shape (n_samples, n_samples) Array of distances between samples. If sparse, the array must be in `LIL` format. - min_points : int, default=5 + n_neighbors : int, default=5 The number of points in a neighbourhood for a point to be considered a core point. - max_dist : float, default=0.0 + max_distance : float, default=0.0 The distance which `np.inf` is replaced with. When the true mutual- reachability distance is measured to be infinite, it is instead - truncated to `max_dist`. + truncated to `max_dist`. Only used when `distance_matrix` is a sparse + matrix. + + copy : bool, default=False + Whether or not to compute the mutual reachinbility graph in-place, i.e. + modifying directly `distance_matrix`. Returns ------- - mututal_reachability: ndarray of shape (n_samples, n_samples) + mututal_reachability_graph: {ndarray, sparse matrix} of shape \ + (n_samples, n_samples) Weighted adjacency matrix of the mutual reachability graph. References @@ -48,78 +58,121 @@ def mutual_reachability(distance_matrix, min_points=5, max_dist=0.0): In Pacific-Asia Conference on Knowledge Discovery and Data Mining (pp. 160-172). Springer Berlin Heidelberg. """ - # Account for index offset - min_points -= 1 + if copy: + distance_matrix = distance_matrix.copy() - # Note that in both routines `distance_matrix` is operated on in-place. At - # this point, if out-of-place operation is desired then this function - # should have been passed a copy. if issparse(distance_matrix): - return _sparse_mutual_reachability( - distance_matrix, - min_points=min_points, - max_dist=max_dist + # FIXME: since we convert to a CSR matrix then we do not make the operation + # in-place. 
+ return _sparse_mutual_reachability_graph( + distance_matrix, n_neighbors=n_neighbors, max_distance=max_distance ).tocsr() - return _dense_mutual_reachability(distance_matrix, min_points=min_points) + return _dense_mutual_reachability_graph(distance_matrix, n_neighbors=n_neighbors) + -cdef _dense_mutual_reachability( +cdef _dense_mutual_reachability_graph( cnp.ndarray[dtype=cnp.float64_t, ndim=2] distance_matrix, - cnp.intp_t min_points=5 + cnp.intp_t n_neighbors=5 ): - cdef cnp.intp_t i, j, n_samples = distance_matrix.shape[0] - cdef cnp.float64_t mr_dist - cdef cnp.float64_t[:] core_distances + """Dense implementation of mutual reachability graph. + + The computation is done in-place, i.e. the distance matrix is modified + directly. + + Parameters + ---------- + distance_matrix : ndarray of shape (n_samples, n_samples) + Array of distances between samples. + + n_neighbors : int, default=5 + The number of points in a neighbourhood for a point to be considered + a core point. + + Returns + ------- + mututal_reachability_graph : ndarray of shape (n_samples, n_samples) + Weighted adjacency matrix of the mutual reachability graph. This object + is the same as `distance_matrix` since the operation is done in-place. + """ + cdef: + cnp.intp_t i, j, n_samples = distance_matrix.shape[0] + cnp.intp_t farther_neighbor_idx = n_neighbors - 1 + cnp.float64_t mutual_reachibility_distance + cnp.float64_t[:] core_distances - # Compute the core distances for all samples `x_p` corresponding - # to the distance of the k-th farthest neighbours (including - # `x_p`). 
core_distances = np.partition( - distance_matrix, - min_points, - axis=0, - )[min_points] + distance_matrix, farther_neighbor_idx, axis=0 + )[farther_neighbor_idx] with nogil: for i in range(n_samples): for j in prange(n_samples): - mr_dist = max( + mutual_reachibility_distance = max( core_distances[i], core_distances[j], - distance_matrix[i, j] + distance_matrix[i, j], ) - distance_matrix[i, j] = mr_dist + distance_matrix[i, j] = mutual_reachibility_distance return distance_matrix -# Assumes LIL format. + # TODO: Rewrite for CSR. -cdef _sparse_mutual_reachability( +cdef _sparse_mutual_reachability_graph( object distance_matrix, - cnp.intp_t min_points=5, - cnp.float64_t max_dist=0. + cnp.intp_t n_neighbors=5, + cnp.float64_t max_distance=0.0, ): - cdef cnp.intp_t i, j, n, n_samples = distance_matrix.shape[0] - cdef cnp.float64_t mr_dist - cdef cnp.float64_t[:] core_distances - cdef cnp.int32_t[:] nz_row_data, nz_col_data + """Sparse implementation of mutual reachability graph. + + The computation is done in-place, i.e. the distance matrix is modified + directly. This implementation only accepts `LIL` format sparse matrices. + + Parameters + ---------- + distance_matrix : sparse matrix of shape (n_samples, n_samples) + Sparse matrix of distances between samples. The sparse format should + be `LIL`. + + n_neighbors : int, default=5 + The number of points in a neighbourhood for a point to be considered + a core point. + + Returns + ------- + mututal_reachability_graph : sparse matrix of shape (n_samples, n_samples) + Weighted adjacency matrix of the mutual reachability graph. This object + is the same as `distance_matrix` since the operation is done in-place. 
+ """ + cdef: + cnp.intp_t i, j, sample_idx, n_samples = distance_matrix.shape[0] + list row_distances + cnp.intp_t farther_neighbor_idx = n_neighbors - 1 + cnp.float64_t mutual_reachibility_distance + cnp.float64_t[:] core_distances + cnp.int32_t[:] nz_row_data, nz_col_data + core_distances = np.empty(n_samples, dtype=np.float64) for i in range(n_samples): - if min_points < len(distance_matrix.data[i]): + row_distances = distance_matrix.data[i] + if farther_neighbor_idx < len(row_distances): core_distances[i] = np.partition( - distance_matrix.data[i], - min_points - )[min_points] + row_distances, farther_neighbor_idx + )[farther_neighbor_idx] else: - core_distances[i] = np.infty + core_distances[i] = INFINITY nz_row_data, nz_col_data = distance_matrix.nonzero() - for n in range(nz_row_data.shape[0]): - i = nz_row_data[n] - j = nz_col_data[n] - mr_dist = max(core_distances[i], core_distances[j], distance_matrix[i, j]) - if isfinite(mr_dist): - distance_matrix[i, j] = mr_dist - elif max_dist > 0: - distance_matrix[i, j] = max_dist + for sample_idx in range(nz_row_data.shape[0]): + i, j = nz_row_data[sample_idx], nz_col_data[sample_idx] + mutual_reachibility_distance = max( + core_distances[i], core_distances[j], distance_matrix[i, j] + ) + if isfinite(mutual_reachibility_distance): + distance_matrix[i, j] = mutual_reachibility_distance + elif max_distance > 0: + # TODO: it seems that we assume that distance_matrix is initialized + # with zeros. 
+ distance_matrix[i, j] = max_distance return distance_matrix diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 79beead943898..fe2a641bb07f5 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -22,7 +22,7 @@ from ...utils._param_validation import Interval, StrOptions from ...utils.validation import _assert_all_finite from ._linkage import label, mst_from_distance_matrix, mst_from_data_matrix -from ._reachability import mutual_reachability +from ._reachability import mutual_reachability_graph from ._tree import compute_stability, condense_tree, get_clusters, labelling_at_cut FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics @@ -63,7 +63,7 @@ def _brute_mst(mutual_reachability, min_samples, sparse=False): f"There exists points with fewer than {min_samples} neighbors. Ensure" " your distance matrix has non-zero values for at least" f" `min_sample`={min_samples} neighbors for each points (i.e. K-nn" - " graph), or specify a `max_dist` in `metric_params` to use when" + " graph), or specify a `max_distance` in `metric_params` to use when" " distances are missing." ) @@ -108,7 +108,7 @@ def _process_mst(min_spanning_tree): def _hdbscan_brute( X, - min_samples=5, + n_neighbors=5, alpha=None, metric="euclidean", n_jobs=None, @@ -128,17 +128,17 @@ def _hdbscan_brute( distance_matrix /= alpha # max_dist is only relevant for sparse and is ignored for dense - max_dist = metric_params.get("max_dist", 0.0) + max_distance = metric_params.get("max_distance", 0.0) sparse = issparse(distance_matrix) distance_matrix = distance_matrix.tolil() if sparse else distance_matrix # Note that `distance_matrix` is manipulated in-place, however we do not # need it for anything else past this point, hence the operation is safe. 
- mutual_reachability_ = mutual_reachability( - distance_matrix, min_points=min_samples, max_dist=max_dist + mutual_reachability_ = mutual_reachability_graph( + distance_matrix, n_neighbors=n_neighbors, max_distance=max_distance ) min_spanning_tree = _brute_mst( - mutual_reachability_, min_samples=min_samples, sparse=sparse + mutual_reachability_, min_samples=n_neighbors, sparse=sparse ) # Warn if the MST couldn't be constructed around the missing distances if np.isinf(min_spanning_tree.T[2]).any(): @@ -156,7 +156,7 @@ def _hdbscan_brute( def _hdbscan_prims( X, algo, - min_samples=5, + n_neighbors=5, alpha=1.0, metric="euclidean", leaf_size=40, @@ -168,7 +168,7 @@ def _hdbscan_prims( # Get distance to kth nearest neighbour nbrs = NearestNeighbors( - n_neighbors=min_samples, + n_neighbors=n_neighbors, algorithm=algo, leaf_size=leaf_size, metric=metric, @@ -177,7 +177,7 @@ def _hdbscan_prims( p=None, ).fit(X) - neighbors_distances, _ = nbrs.kneighbors(X, min_samples, return_distance=True) + neighbors_distances, _ = nbrs.kneighbors(X, n_neighbors, return_distance=True) core_distances = np.ascontiguousarray(neighbors_distances[:, -1]) dist_metric = DistanceMetric.get_metric(metric, **metric_params) @@ -590,7 +590,7 @@ def fit(self, X, y=None): mst_func = None kwargs = dict( X=X, - min_samples=self._min_samples, + n_neighbors=self._min_samples, alpha=self.alpha, metric=self.metric, n_jobs=self.n_jobs, From d6a59a53be9dd0ea60e8e4b2a17d65c8d9398e40 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 19 Oct 2022 15:54:33 +0200 Subject: [PATCH 04/25] FIX let's be consistent and call min_samples --- sklearn/cluster/_hdbscan/_reachability.pyx | 20 ++++++++++---------- sklearn/cluster/_hdbscan/hdbscan.py | 14 +++++++------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index 3c5a8b86a9f5a..fb9c288c039b1 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ 
b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -12,7 +12,7 @@ from libc.math cimport isfinite, INFINITY def mutual_reachability_graph( - distance_matrix, n_neighbors=5, max_distance=0.0, copy=False + distance_matrix, min_samples=5, max_distance=0.0, copy=False ): """Compute the weighted adjacency matrix of the mutual reachability graph. @@ -29,7 +29,7 @@ def mutual_reachability_graph( Array of distances between samples. If sparse, the array must be in `LIL` format. - n_neighbors : int, default=5 + min_samples : int, default=5 The number of points in a neighbourhood for a point to be considered a core point. @@ -63,15 +63,15 @@ def mutual_reachability_graph( # FIXME: since we convert to a CSR matrix then we do not make the operation # in-place. return _sparse_mutual_reachability_graph( - distance_matrix, n_neighbors=n_neighbors, max_distance=max_distance + distance_matrix, min_samples=min_samples, max_distance=max_distance ).tocsr() - return _dense_mutual_reachability_graph(distance_matrix, n_neighbors=n_neighbors) + return _dense_mutual_reachability_graph(distance_matrix, min_samples=min_samples) cdef _dense_mutual_reachability_graph( cnp.ndarray[dtype=cnp.float64_t, ndim=2] distance_matrix, - cnp.intp_t n_neighbors=5 + cnp.intp_t min_samples=5 ): """Dense implementation of mutual reachability graph. @@ -83,7 +83,7 @@ cdef _dense_mutual_reachability_graph( distance_matrix : ndarray of shape (n_samples, n_samples) Array of distances between samples. - n_neighbors : int, default=5 + min_samples : int, default=5 The number of points in a neighbourhood for a point to be considered a core point. 
@@ -95,7 +95,7 @@ cdef _dense_mutual_reachability_graph( """ cdef: cnp.intp_t i, j, n_samples = distance_matrix.shape[0] - cnp.intp_t farther_neighbor_idx = n_neighbors - 1 + cnp.intp_t farther_neighbor_idx = min_samples - 1 cnp.float64_t mutual_reachibility_distance cnp.float64_t[:] core_distances @@ -118,7 +118,7 @@ cdef _dense_mutual_reachability_graph( # TODO: Rewrite for CSR. cdef _sparse_mutual_reachability_graph( object distance_matrix, - cnp.intp_t n_neighbors=5, + cnp.intp_t min_samples=5, cnp.float64_t max_distance=0.0, ): """Sparse implementation of mutual reachability graph. @@ -132,7 +132,7 @@ cdef _sparse_mutual_reachability_graph( Sparse matrix of distances between samples. The sparse format should be `LIL`. - n_neighbors : int, default=5 + min_samples : int, default=5 The number of points in a neighbourhood for a point to be considered a core point. @@ -145,7 +145,7 @@ cdef _sparse_mutual_reachability_graph( cdef: cnp.intp_t i, j, sample_idx, n_samples = distance_matrix.shape[0] list row_distances - cnp.intp_t farther_neighbor_idx = n_neighbors - 1 + cnp.intp_t farther_neighbor_idx = min_samples - 1 cnp.float64_t mutual_reachibility_distance cnp.float64_t[:] core_distances cnp.int32_t[:] nz_row_data, nz_col_data diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index d5da676409423..5ff89f68dcf8d 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -108,7 +108,7 @@ def _process_mst(min_spanning_tree): def _hdbscan_brute( X, - n_neighbors=5, + min_samples=5, alpha=None, metric="euclidean", n_jobs=None, @@ -144,10 +144,10 @@ def _hdbscan_brute( # Note that `distance_matrix` is manipulated in-place, however we do not # need it for anything else past this point, hence the operation is safe. 
mutual_reachability_ = mutual_reachability_graph( - distance_matrix, n_neighbors=n_neighbors, max_distance=max_distance + distance_matrix, min_samples=min_samples, max_distance=max_distance ) min_spanning_tree = _brute_mst( - mutual_reachability_, min_samples=n_neighbors, sparse=sparse + mutual_reachability_, min_samples=min_samples, sparse=sparse ) # Warn if the MST couldn't be constructed around the missing distances if np.isinf(min_spanning_tree.T[2]).any(): @@ -165,7 +165,7 @@ def _hdbscan_brute( def _hdbscan_prims( X, algo, - n_neighbors=5, + min_samples=5, alpha=1.0, metric="euclidean", leaf_size=40, @@ -177,7 +177,7 @@ def _hdbscan_prims( # Get distance to kth nearest neighbour nbrs = NearestNeighbors( - n_neighbors=n_neighbors, + n_neighbors=min_samples, algorithm=algo, leaf_size=leaf_size, metric=metric, @@ -186,7 +186,7 @@ def _hdbscan_prims( p=None, ).fit(X) - neighbors_distances, _ = nbrs.kneighbors(X, n_neighbors, return_distance=True) + neighbors_distances, _ = nbrs.kneighbors(X, min_samples, return_distance=True) core_distances = np.ascontiguousarray(neighbors_distances[:, -1]) dist_metric = DistanceMetric.get_metric(metric, **metric_params) @@ -599,7 +599,7 @@ def fit(self, X, y=None): mst_func = None kwargs = dict( X=X, - n_neighbors=self._min_samples, + min_samples=self._min_samples, alpha=self.alpha, metric=self.metric, n_jobs=self.n_jobs, From e09ece767de7523f1eb0f911dcef1e80cf3d95c9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 19 Oct 2022 18:07:14 +0200 Subject: [PATCH 05/25] TMP POC for CSC processing --- sklearn/cluster/_hdbscan/_reachability.pyx | 78 +++++++++++++--------- 1 file changed, 46 insertions(+), 32 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index fb9c288c039b1..be10d28ab8555 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -6,11 +6,17 @@ import numpy as np from scipy.sparse import 
issparse +cimport cython cimport numpy as cnp from cython.parallel cimport prange from libc.math cimport isfinite, INFINITY +ctypedef fused integral: + int + long long + + def mutual_reachability_graph( distance_matrix, min_samples=5, max_distance=0.0, copy=False ): @@ -59,19 +65,30 @@ def mutual_reachability_graph( if copy: distance_matrix = distance_matrix.copy() + further_neighbor_idx = min_samples - 1 if issparse(distance_matrix): # FIXME: since we convert to a CSR matrix then we do not make the operation # in-place. - return _sparse_mutual_reachability_graph( - distance_matrix, min_samples=min_samples, max_distance=max_distance - ).tocsr() + distance_matrix = distance_matrix.tocsc() + _sparse_mutual_reachability_graph( + distance_matrix.data, + distance_matrix.indices, + distance_matrix.indptr, + distance_matrix.shape, + further_neighbor_idx=further_neighbor_idx, + max_distance=max_distance, + ) + else: + _dense_mutual_reachability_graph( + distance_matrix, further_neighbor_idx=further_neighbor_idx + ) + return distance_matrix - return _dense_mutual_reachability_graph(distance_matrix, min_samples=min_samples) cdef _dense_mutual_reachability_graph( cnp.ndarray[dtype=cnp.float64_t, ndim=2] distance_matrix, - cnp.intp_t min_samples=5 + cnp.intp_t further_neighbor_idx=5 ): """Dense implementation of mutual reachability graph. 
@@ -95,13 +112,12 @@ cdef _dense_mutual_reachability_graph( """ cdef: cnp.intp_t i, j, n_samples = distance_matrix.shape[0] - cnp.intp_t farther_neighbor_idx = min_samples - 1 cnp.float64_t mutual_reachibility_distance cnp.float64_t[:] core_distances core_distances = np.partition( - distance_matrix, farther_neighbor_idx, axis=0 - )[farther_neighbor_idx] + distance_matrix, further_neighbor_idx, axis=0 + )[further_neighbor_idx] with nogil: for i in range(n_samples): @@ -112,13 +128,15 @@ cdef _dense_mutual_reachability_graph( distance_matrix[i, j], ) distance_matrix[i, j] = mutual_reachibility_distance - return distance_matrix # TODO: Rewrite for CSR. cdef _sparse_mutual_reachability_graph( - object distance_matrix, - cnp.intp_t min_samples=5, + cnp.ndarray[cnp.float64_t, ndim=1, mode="c"] data, + cnp.ndarray[cnp.int32_t, ndim=1, mode="c"] indices, + cnp.ndarray[cnp.int32_t, ndim=1, mode="c"] indptr, + cnp.intp_t n_samples, + cnp.intp_t further_neighbor_idx=5, cnp.float64_t max_distance=0.0, ): """Sparse implementation of mutual reachability graph. @@ -143,34 +161,30 @@ cdef _sparse_mutual_reachability_graph( is the same as `distance_matrix` since the operation is done in-place. 
""" cdef: - cnp.intp_t i, j, sample_idx, n_samples = distance_matrix.shape[0] - list row_distances - cnp.intp_t farther_neighbor_idx = min_samples - 1 + cnp.intp_t i, col_ind, row_ind cnp.float64_t mutual_reachibility_distance cnp.float64_t[:] core_distances - cnp.int32_t[:] nz_row_data, nz_col_data + cnp.float64_t[:] col_data + cnp.int32_t[:] row_indices core_distances = np.empty(n_samples, dtype=np.float64) for i in range(n_samples): - row_distances = distance_matrix.data[i] - if farther_neighbor_idx < len(row_distances): + col_data = data[indptr[i]:indptr[i + 1]] + if further_neighbor_idx < col_data.size: core_distances[i] = np.partition( - row_distances, farther_neighbor_idx - )[farther_neighbor_idx] + col_data, further_neighbor_idx + )[further_neighbor_idx] else: core_distances[i] = INFINITY - nz_row_data, nz_col_data = distance_matrix.nonzero() - for sample_idx in range(nz_row_data.shape[0]): - i, j = nz_row_data[sample_idx], nz_col_data[sample_idx] - mutual_reachibility_distance = max( - core_distances[i], core_distances[j], distance_matrix[i, j] - ) - if isfinite(mutual_reachibility_distance): - distance_matrix[i, j] = mutual_reachibility_distance - elif max_distance > 0: - # TODO: it seems that we assume that distance_matrix is initialized - # with zeros. 
- distance_matrix[i, j] = max_distance - return distance_matrix + for col_ind in range(n_samples): + for i in range(indptr[col_ind], indptr[col_ind + 1]): + row_ind = indices[i] + mutual_reachibility_distance = max( + core_distances[col_ind], core_distances[row_ind], data[i] + ) + if isfinite(mutual_reachibility_distance): + data[i] = mutual_reachibility_distance + elif max_distance > 0: + data[i] = max_distance From 1cb0db82f91e5bfee8cf445786aacf381e8911d8 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 20 Oct 2022 11:34:38 +0200 Subject: [PATCH 06/25] ENH CSR, fused type, no-copy --- sklearn/cluster/_hdbscan/_reachability.pyx | 101 ++++++++++----------- sklearn/cluster/_hdbscan/hdbscan.py | 35 ++----- 2 files changed, 59 insertions(+), 77 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index be10d28ab8555..d347882d5bc82 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -7,10 +7,12 @@ import numpy as np from scipy.sparse import issparse cimport cython +from cython cimport floating cimport numpy as cnp from cython.parallel cimport prange from libc.math cimport isfinite, INFINITY +cnp.import_array() ctypedef fused integral: int @@ -18,7 +20,7 @@ ctypedef fused integral: def mutual_reachability_graph( - distance_matrix, min_samples=5, max_distance=0.0, copy=False + distance_matrix, min_samples=5, max_distance=0.0 ): """Compute the weighted adjacency matrix of the mutual reachability graph. @@ -29,11 +31,13 @@ def mutual_reachability_graph( and the core distance `d_core` is defined as the distance between a point `x_p` and its k-th nearest neighbor. + Note that all computations are done in-place. + Parameters ---------- distance_matrix : {ndarray, sparse matrix} of shape (n_samples, n_samples) Array of distances between samples. If sparse, the array must be in - `LIL` format. + `CSR` format. 
min_samples : int, default=5 The number of points in a neighbourhood for a point to be considered @@ -45,10 +49,6 @@ def mutual_reachability_graph( truncated to `max_dist`. Only used when `distance_matrix` is a sparse matrix. - copy : bool, default=False - Whether or not to compute the mutual reachinbility graph in-place, i.e. - modifying directly `distance_matrix`. - Returns ------- mututal_reachability_graph: {ndarray, sparse matrix} of shape \ @@ -62,19 +62,17 @@ def mutual_reachability_graph( In Pacific-Asia Conference on Knowledge Discovery and Data Mining (pp. 160-172). Springer Berlin Heidelberg. """ - if copy: - distance_matrix = distance_matrix.copy() - further_neighbor_idx = min_samples - 1 if issparse(distance_matrix): - # FIXME: since we convert to a CSR matrix then we do not make the operation - # in-place. - distance_matrix = distance_matrix.tocsc() + if distance_matrix.format != "csr": + raise ValueError( + "Only sparse CSR matrices are supported for `distance_matrix`." + ) _sparse_mutual_reachability_graph( distance_matrix.data, distance_matrix.indices, distance_matrix.indptr, - distance_matrix.shape, + distance_matrix.shape[0], further_neighbor_idx=further_neighbor_idx, max_distance=max_distance, ) @@ -86,9 +84,9 @@ def mutual_reachability_graph( -cdef _dense_mutual_reachability_graph( - cnp.ndarray[dtype=cnp.float64_t, ndim=2] distance_matrix, - cnp.intp_t further_neighbor_idx=5 +def _dense_mutual_reachability_graph( + cnp.ndarray[dtype=floating, ndim=2] distance_matrix, + cnp.intp_t further_neighbor_idx, ): """Dense implementation of mutual reachability graph. @@ -100,24 +98,20 @@ cdef _dense_mutual_reachability_graph( distance_matrix : ndarray of shape (n_samples, n_samples) Array of distances between samples. - min_samples : int, default=5 - The number of points in a neighbourhood for a point to be considered - a core point. 
- - Returns - ------- - mututal_reachability_graph : ndarray of shape (n_samples, n_samples) - Weighted adjacency matrix of the mutual reachability graph. This object - is the same as `distance_matrix` since the operation is done in-place. + further_neighbor_idx : int + The index of the furthest neighbor to use to define the core distances. """ cdef: cnp.intp_t i, j, n_samples = distance_matrix.shape[0] - cnp.float64_t mutual_reachibility_distance - cnp.float64_t[:] core_distances + floating mutual_reachibility_distance + floating[:] core_distances + # We assume that the distance matrix is symmetric. We choose to sort every + # row to have the same implementation than the sparse case that requires + # CSR matrix. core_distances = np.partition( - distance_matrix, further_neighbor_idx, axis=0 - )[further_neighbor_idx] + distance_matrix, further_neighbor_idx, axis=1 + )[:, further_neighbor_idx] with nogil: for i in range(n_samples): @@ -130,44 +124,47 @@ cdef _dense_mutual_reachability_graph( distance_matrix[i, j] = mutual_reachibility_distance -# TODO: Rewrite for CSR. -cdef _sparse_mutual_reachability_graph( - cnp.ndarray[cnp.float64_t, ndim=1, mode="c"] data, - cnp.ndarray[cnp.int32_t, ndim=1, mode="c"] indices, - cnp.ndarray[cnp.int32_t, ndim=1, mode="c"] indptr, +def _sparse_mutual_reachability_graph( + cnp.ndarray[floating, ndim=1, mode="c"] data, + cnp.ndarray[integral, ndim=1, mode="c"] indices, + cnp.ndarray[integral, ndim=1, mode="c"] indptr, cnp.intp_t n_samples, - cnp.intp_t further_neighbor_idx=5, - cnp.float64_t max_distance=0.0, + cnp.intp_t further_neighbor_idx, + cnp.float64_t max_distance, ): """Sparse implementation of mutual reachability graph. The computation is done in-place, i.e. the distance matrix is modified - directly. This implementation only accepts `LIL` format sparse matrices. + directly. This implementation only accepts `CSR` format sparse matrices. 
Parameters ---------- distance_matrix : sparse matrix of shape (n_samples, n_samples) Sparse matrix of distances between samples. The sparse format should - be `LIL`. + be `CSR`. - min_samples : int, default=5 - The number of points in a neighbourhood for a point to be considered - a core point. + further_neighbor_idx : int + The index of the furthest neighbor to use to define the core distances. - Returns - ------- - mututal_reachability_graph : sparse matrix of shape (n_samples, n_samples) - Weighted adjacency matrix of the mutual reachability graph. This object - is the same as `distance_matrix` since the operation is done in-place. + max_distance : float + The distance which `np.inf` is replaced with. When the true mutual- + reachability distance is measured to be infinite, it is instead + truncated to `max_dist`. Only used when `distance_matrix` is a sparse + matrix. """ cdef: - cnp.intp_t i, col_ind, row_ind - cnp.float64_t mutual_reachibility_distance - cnp.float64_t[:] core_distances - cnp.float64_t[:] col_data - cnp.int32_t[:] row_indices + cnp.intp_t i, col_ind + integral row_ind + floating mutual_reachibility_distance + floating[:] core_distances + floating[:] col_data + + if floating is float: + dtype = np.float32 + else: + dtype = np.float64 - core_distances = np.empty(n_samples, dtype=np.float64) + core_distances = np.empty(n_samples, dtype=dtype) for i in range(n_samples): col_data = data[indptr[i]:indptr[i + 1]] diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 5ff89f68dcf8d..753c4145ccc70 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -46,8 +46,8 @@ } -def _brute_mst(mutual_reachability, min_samples, sparse=False): - if not sparse: +def _brute_mst(mutual_reachability, min_samples): + if not issparse(mutual_reachability): return mst_from_distance_matrix(mutual_reachability) # Check connected component on mutual reachability @@ -116,10 +116,6 @@ def 
_hdbscan_brute( **metric_params, ): if metric == "precomputed": - # Treating this case explicitly, instead of letting - # sklearn.metrics.pairwise_distances handle it, - # enables the usage of numpy.inf in the distance - # matrix to indicate missing distance information. distance_matrix = X.copy() if copy else X else: distance_matrix = pairwise_distances( @@ -127,28 +123,18 @@ def _hdbscan_brute( ) distance_matrix /= alpha - # max_dist is only relevant for sparse and is ignored for dense max_distance = metric_params.get("max_distance", 0.0) - sparse = issparse(distance_matrix) - - # TODO: Investigate whether it is worth implementing a PWD backend for the - # combined operations of: - # - The pairwise distance calculation - # - The element-wise mutual-reachability calculation - # I suspect this would be better handled as one composite Cython routine to - # minimize memory-movement, however I (@micky774) am unsure whether it is - # narrow enough of a scope for the current PWD backend, or if it is better - # as a separate utility. - distance_matrix = distance_matrix.tolil() if sparse else distance_matrix + if issparse(distance_matrix) and distance_matrix.format != "csr": + # we need CSR format to avoid a conversion in `_brute_mst` when calling + # `csgraph.connected_components` + distance_matrix = distance_matrix.tocsr() # Note that `distance_matrix` is manipulated in-place, however we do not # need it for anything else past this point, hence the operation is safe. 
mutual_reachability_ = mutual_reachability_graph( distance_matrix, min_samples=min_samples, max_distance=max_distance ) - min_spanning_tree = _brute_mst( - mutual_reachability_, min_samples=min_samples, sparse=sparse - ) + min_spanning_tree = _brute_mst(mutual_reachability_, min_samples=min_samples) # Warn if the MST couldn't be constructed around the missing distances if np.isinf(min_spanning_tree.T[2]).any(): warn( @@ -358,10 +344,9 @@ class HDBSCAN(ClusterMixin, BaseEstimator): copy : bool, default=False If `copy=True` then any time an in-place modifications would be made that would overwrite data passed to :term:`fit`, a copy will first be - made, guaranteeing that the original data will be unchanged. Currently - this only makes a difference when passing in a dense precomputed - distance array (i.e. when `metric="precomputed"`) and using the - `"brute"` algorithm (see `algorithm` for details). + made, guaranteeing that the original data will be unchanged. + Currently, it only applies with `metric="precomputed"`, passing a dense + array or a sparse matrix of format CSR and algorithm used is `"brute"`. 
Attributes ---------- From 8a38591a81308263d1943a807cff79621fc89167 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 20 Oct 2022 11:36:00 +0200 Subject: [PATCH 07/25] iter --- sklearn/cluster/_hdbscan/_reachability.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index d347882d5bc82..12fe7d5309152 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -1,6 +1,7 @@ # mutual reachability distance computations # Authors: Leland McInnes # Meekail Zain +# Guillaume Lemaitre # License: 3-clause BSD import numpy as np From 41cb21ed37a4debbf5207d0b4e207582a5e17589 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 20 Oct 2022 11:38:59 +0200 Subject: [PATCH 08/25] homogeneous dtype for max_distance --- sklearn/cluster/_hdbscan/_reachability.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index 12fe7d5309152..e451f124d270f 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -84,7 +84,6 @@ def mutual_reachability_graph( return distance_matrix - def _dense_mutual_reachability_graph( cnp.ndarray[dtype=floating, ndim=2] distance_matrix, cnp.intp_t further_neighbor_idx, @@ -131,7 +130,7 @@ def _sparse_mutual_reachability_graph( cnp.ndarray[integral, ndim=1, mode="c"] indptr, cnp.intp_t n_samples, cnp.intp_t further_neighbor_idx, - cnp.float64_t max_distance, + floating max_distance, ): """Sparse implementation of mutual reachability graph. 
From c510bf85ece258568f41ce38c45d414cd710e388 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 20 Oct 2022 16:33:05 +0200 Subject: [PATCH 09/25] TST add a couple of tests (wip) --- sklearn/cluster/_hdbscan/tests/__init__.py | 0 .../_hdbscan/tests/test_reachibility.py | 50 +++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 sklearn/cluster/_hdbscan/tests/__init__.py create mode 100644 sklearn/cluster/_hdbscan/tests/test_reachibility.py diff --git a/sklearn/cluster/_hdbscan/tests/__init__.py b/sklearn/cluster/_hdbscan/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sklearn/cluster/_hdbscan/tests/test_reachibility.py b/sklearn/cluster/_hdbscan/tests/test_reachibility.py new file mode 100644 index 0000000000000..9d610c9a10f2c --- /dev/null +++ b/sklearn/cluster/_hdbscan/tests/test_reachibility.py @@ -0,0 +1,50 @@ +import numpy as np +import pytest + +from sklearn.utils._testing import ( + _convert_container, + assert_allclose, +) + +from sklearn.cluster._hdbscan._reachability import mutual_reachability_graph + + +def test_mutual_reachability_graph_error_sparse_format(): + """Check that we raise an error if the sparse format is not CSR.""" + rng = np.random.RandomState(0) + X = rng.randn(10, 10) + X = X.T @ X + np.fill_diagonal(X, 0.0) + X = _convert_container(X, "sparse_csc") + + err_msg = "Only sparse CSR matrices are supported" + with pytest.raises(ValueError, match=err_msg): + mutual_reachability_graph(X) + + +@pytest.mark.parametrize("array_type", ["array", "sparse_csr"]) +def test_mutual_reachability_graph_inplace(array_type): + """Check that the operation is happening inplace.""" + rng = np.random.RandomState(0) + X = rng.randn(10, 10) + X = X.T @ X + np.fill_diagonal(X, 0.0) + X = _convert_container(X, array_type) + + mr_graph = mutual_reachability_graph(X) + + assert id(mr_graph) == id(X) + + +def test_mutual_reachability_graph_equivalence_dense_sparse(): + """Check that we get the same 
results for dense and sparse implementation.""" + rng = np.random.RandomState(0) + X = rng.randn(5, 5) + X_dense = X.T @ X + np.fill_diagonal(X_dense, 0.0) + X_sparse = _convert_container(X_dense, "sparse_csr") + + mr_graph_dense = mutual_reachability_graph(X_dense, min_samples=3) + mr_graph_sparse = mutual_reachability_graph(X_sparse, min_samples=3) + + assert_allclose(mr_graph_dense, mr_graph_sparse.A) From 85c1914afa9b6f9e6a48678fe4f07db51d82b20a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 20 Oct 2022 16:50:13 +0200 Subject: [PATCH 10/25] TST some more tests --- sklearn/cluster/_hdbscan/_reachability.pyx | 29 ++++++++++--------- .../_hdbscan/tests/test_reachibility.py | 16 +++++++++- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index e451f124d270f..d1716dd79e7fd 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -157,7 +157,7 @@ def _sparse_mutual_reachability_graph( integral row_ind floating mutual_reachibility_distance floating[:] core_distances - floating[:] col_data + floating[:] row_data if floating is float: dtype = np.float32 @@ -167,21 +167,22 @@ def _sparse_mutual_reachability_graph( core_distances = np.empty(n_samples, dtype=dtype) for i in range(n_samples): - col_data = data[indptr[i]:indptr[i + 1]] - if further_neighbor_idx < col_data.size: + row_data = data[indptr[i]:indptr[i + 1]] + if further_neighbor_idx < row_data.size: core_distances[i] = np.partition( - col_data, further_neighbor_idx + row_data, further_neighbor_idx )[further_neighbor_idx] else: core_distances[i] = INFINITY - for col_ind in range(n_samples): - for i in range(indptr[col_ind], indptr[col_ind + 1]): - row_ind = indices[i] - mutual_reachibility_distance = max( - core_distances[col_ind], core_distances[row_ind], data[i] - ) - if isfinite(mutual_reachibility_distance): - data[i] = 
mutual_reachibility_distance - elif max_distance > 0: - data[i] = max_distance + with nogil: + for col_ind in range(n_samples): + for i in range(indptr[col_ind], indptr[col_ind + 1]): + row_ind = indices[i] + mutual_reachibility_distance = max( + core_distances[col_ind], core_distances[row_ind], data[i] + ) + if isfinite(mutual_reachibility_distance): + data[i] = mutual_reachibility_distance + elif max_distance > 0: + data[i] = max_distance diff --git a/sklearn/cluster/_hdbscan/tests/test_reachibility.py b/sklearn/cluster/_hdbscan/tests/test_reachibility.py index 9d610c9a10f2c..c8ba28d0af25b 100644 --- a/sklearn/cluster/_hdbscan/tests/test_reachibility.py +++ b/sklearn/cluster/_hdbscan/tests/test_reachibility.py @@ -41,10 +41,24 @@ def test_mutual_reachability_graph_equivalence_dense_sparse(): rng = np.random.RandomState(0) X = rng.randn(5, 5) X_dense = X.T @ X - np.fill_diagonal(X_dense, 0.0) X_sparse = _convert_container(X_dense, "sparse_csr") mr_graph_dense = mutual_reachability_graph(X_dense, min_samples=3) mr_graph_sparse = mutual_reachability_graph(X_sparse, min_samples=3) assert_allclose(mr_graph_dense, mr_graph_sparse.A) + + +@pytest.mark.parametrize("array_type", ["array", "sparse_csr"]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_mutual_reachability_graph_preserve_dtype(array_type, dtype): + """Check that the computation preserve dtype thanks to fused types.""" + rng = np.random.RandomState(0) + X = rng.randn(10, 10) + X = (X.T @ X).astype(dtype) + np.fill_diagonal(X, 0.0) + X = _convert_container(X, array_type) + + assert X.dtype == dtype + mr_graph = mutual_reachability_graph(X) + assert mr_graph.dtype == dtype From 9ba964d14803852f095909d428adf94ada2374b2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 20 Oct 2022 16:53:01 +0200 Subject: [PATCH 11/25] fused type --- sklearn/cluster/_hdbscan/_reachability.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index d1716dd79e7fd..d925d1d22e62b 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -153,7 +153,7 @@ def _sparse_mutual_reachability_graph( matrix. """ cdef: - cnp.intp_t i, col_ind + integral i, col_ind integral row_ind floating mutual_reachibility_distance floating[:] core_distances From 0c65f8cc42217612654daf22805f28331e38fa95 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 20 Oct 2022 16:58:39 +0200 Subject: [PATCH 12/25] FIX put correct name on indices --- sklearn/cluster/_hdbscan/_reachability.pyx | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index d925d1d22e62b..c83fef742e82e 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -153,8 +153,7 @@ def _sparse_mutual_reachability_graph( matrix. 
""" cdef: - integral i, col_ind - integral row_ind + integral i, col_ind, row_ind floating mutual_reachibility_distance floating[:] core_distances floating[:] row_data @@ -176,11 +175,11 @@ def _sparse_mutual_reachability_graph( core_distances[i] = INFINITY with nogil: - for col_ind in range(n_samples): - for i in range(indptr[col_ind], indptr[col_ind + 1]): - row_ind = indices[i] + for row_ind in range(n_samples): + for i in range(indptr[row_ind], indptr[row_ind + 1]): + col_ind = indices[i] mutual_reachibility_distance = max( - core_distances[col_ind], core_distances[row_ind], data[i] + core_distances[row_ind], core_distances[col_ind], data[i] ) if isfinite(mutual_reachibility_distance): data[i] = mutual_reachibility_distance From b83f614cbce03728a0bbd164c985fdcdf8f6ce64 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Sat, 5 Nov 2022 14:08:21 -0400 Subject: [PATCH 13/25] Added validation for precomputed distance matrix --- sklearn/cluster/_hdbscan/hdbscan.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 753c4145ccc70..dda9ae8507236 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -116,6 +116,19 @@ def _hdbscan_brute( **metric_params, ): if metric == "precomputed": + if X.shape[0] != X.shape[1]: + raise ValueError( + "The precomputed distance matrix is expected to be symmetric, however" + f" it has shape {X.shape}. Please verify that the" + " distance matrix was constructed correctly." + ) + if np.allclose(X, X.T): + raise ValueError( + "The precomputed distance matrix is expected to be symmetric, however" + " its values appear to be asymmetric. Please verify that the distance" + " matrix was constructed correctly." 
+ ) + distance_matrix = X.copy() if copy else X else: distance_matrix = pairwise_distances( From 2310e1e5bbad472def197f1da926da5136f8a0f1 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Sat, 5 Nov 2022 14:34:40 -0400 Subject: [PATCH 14/25] Fixed typo --- sklearn/cluster/_hdbscan/hdbscan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index dda9ae8507236..bf58b10e3ccfd 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -122,7 +122,7 @@ def _hdbscan_brute( f" it has shape {X.shape}. Please verify that the" " distance matrix was constructed correctly." ) - if np.allclose(X, X.T): + if not np.allclose(X, X.T): raise ValueError( "The precomputed distance matrix is expected to be symmetric, however" " its values appear to be asymmetric. Please verify that the distance" From b99ba60368b0021376082be8368758ad13300404 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 9 Nov 2022 18:36:04 -0500 Subject: [PATCH 15/25] Updated symmetry check to account for sparse --- sklearn/cluster/_hdbscan/hdbscan.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index bf58b10e3ccfd..937eac39db926 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -20,7 +20,7 @@ from ...metrics._dist_metrics import DistanceMetric from ...neighbors import BallTree, KDTree, NearestNeighbors from ...utils._param_validation import Interval, StrOptions -from ...utils.validation import _assert_all_finite +from ...utils.validation import _assert_all_finite, _allclose_dense_sparse from ._linkage import label, mst_from_distance_matrix, mst_from_data_matrix from ._reachability import mutual_reachability_graph from ._tree import compute_stability, condense_tree, get_clusters, labelling_at_cut @@ -122,7 +122,7 @@ def _hdbscan_brute( f" it has 
shape {X.shape}. Please verify that the" " distance matrix was constructed correctly." ) - if not np.allclose(X, X.T): + if not _allclose_dense_sparse(X, X.T): raise ValueError( "The precomputed distance matrix is expected to be symmetric, however" " its values appear to be asymmetric. Please verify that the distance" From 2874340f467c1b8cccb65669f5252cb714161896 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 29 Nov 2022 18:03:59 -0500 Subject: [PATCH 16/25] Updated import order and used cython integral fused type --- sklearn/cluster/_hdbscan/_reachability.pyx | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index c83fef742e82e..0131c9ed1aca5 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -4,22 +4,17 @@ # Guillaume Lemaitre # License: 3-clause BSD -import numpy as np -from scipy.sparse import issparse - cimport cython -from cython cimport floating cimport numpy as cnp + +import numpy as np +from scipy.sparse import issparse +from cython cimport floating, integral from cython.parallel cimport prange from libc.math cimport isfinite, INFINITY cnp.import_array() -ctypedef fused integral: - int - long long - - def mutual_reachability_graph( distance_matrix, min_samples=5, max_distance=0.0 ): From c47a8ba61cd751c012fdaf7db07615abd87b3890 Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Wed, 30 Nov 2022 18:25:04 -0500 Subject: [PATCH 17/25] Update sklearn/cluster/_hdbscan/hdbscan.py Co-authored-by: Julien Jerphanion --- sklearn/cluster/_hdbscan/hdbscan.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 937eac39db926..74af44c9e88f5 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -358,8 +358,8 @@ 
class HDBSCAN(ClusterMixin, BaseEstimator): If `copy=True` then any time an in-place modifications would be made that would overwrite data passed to :term:`fit`, a copy will first be made, guaranteeing that the original data will be unchanged. - Currently, it only applies with `metric="precomputed"`, passing a dense - array or a sparse matrix of format CSR and algorithm used is `"brute"`. + Currently, it only applies when `metric="precomputed"`, when passing + a dense array or a CSR sparse matrix and when `algorithm="brute"`. Attributes ---------- From 557975af8375922516f09c0a4dd01f30c7310598 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 30 Nov 2022 18:25:17 -0500 Subject: [PATCH 18/25] Optimized loops and formatted imports --- sklearn/cluster/_hdbscan/_reachability.pyx | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index 0131c9ed1aca5..efc641df29e19 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -99,18 +99,20 @@ def _dense_mutual_reachability_graph( cdef: cnp.intp_t i, j, n_samples = distance_matrix.shape[0] floating mutual_reachibility_distance - floating[:] core_distances + floating[::1] core_distances # We assume that the distance matrix is symmetric. We choose to sort every # row to have the same implementation than the sparse case that requires # CSR matrix. 
- core_distances = np.partition( - distance_matrix, further_neighbor_idx, axis=1 - )[:, further_neighbor_idx] + core_distances = np.ascontiguousarray( + np.partition( + distance_matrix, further_neighbor_idx, axis=1 + )[:, further_neighbor_idx] + ) with nogil: - for i in range(n_samples): - for j in prange(n_samples): + for i in prange(n_samples): + for j in range(n_samples): mutual_reachibility_distance = max( core_distances[i], core_distances[j], @@ -118,7 +120,6 @@ def _dense_mutual_reachability_graph( ) distance_matrix[i, j] = mutual_reachibility_distance - def _sparse_mutual_reachability_graph( cnp.ndarray[floating, ndim=1, mode="c"] data, cnp.ndarray[integral, ndim=1, mode="c"] indices, From f34c121385f8b610fc7ed9a7bba7de5ededdac5e Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 30 Nov 2022 18:56:31 -0500 Subject: [PATCH 19/25] Updated sparse check --- sklearn/cluster/_hdbscan/hdbscan.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 74af44c9e88f5..7ec7ad56cd177 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -13,7 +13,7 @@ from warnings import warn import numpy as np -from scipy.sparse import csgraph, issparse +from scipy.sparse import csgraph, issparse, isspmatrix_csr from ...base import BaseEstimator, ClusterMixin from ...metrics import pairwise_distances @@ -137,7 +137,7 @@ def _hdbscan_brute( distance_matrix /= alpha max_distance = metric_params.get("max_distance", 0.0) - if issparse(distance_matrix) and distance_matrix.format != "csr": + if isspmatrix_csr(distance_matrix): # we need CSR format to avoid a conversion in `_brute_mst` when calling # `csgraph.connected_components` distance_matrix = distance_matrix.tocsr() From cc526317dc29cd7c92ff4fecb9c97e2a0b747b75 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Mon, 19 Dec 2022 13:00:24 -0500 Subject: [PATCH 20/25] Removed prange and added note --- 
sklearn/cluster/_hdbscan/_reachability.pyx | 5 +- .../_argkminlabels.pyx | 679 +++++++++++++ .../_engines.pxd | 347 +++++++ .../_engines.pyx | 940 ++++++++++++++++++ 4 files changed, 1969 insertions(+), 2 deletions(-) create mode 100644 sklearn/metrics/_pairwise_distances_reduction/_argkminlabels.pyx create mode 100644 sklearn/metrics/_pairwise_distances_reduction/_engines.pxd create mode 100644 sklearn/metrics/_pairwise_distances_reduction/_engines.pyx diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index efc641df29e19..dc4263694f89a 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -10,7 +10,6 @@ cimport numpy as cnp import numpy as np from scipy.sparse import issparse from cython cimport floating, integral -from cython.parallel cimport prange from libc.math cimport isfinite, INFINITY cnp.import_array() @@ -111,7 +110,9 @@ def _dense_mutual_reachability_graph( ) with nogil: - for i in prange(n_samples): + # TODO: Update w/ prange with thread count based on + # _openmp_effective_n_threads + for i in range(n_samples): for j in range(n_samples): mutual_reachibility_distance = max( core_distances[i], diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkminlabels.pyx b/sklearn/metrics/_pairwise_distances_reduction/_argkminlabels.pyx new file mode 100644 index 0000000000000..32e0a4d6d0546 --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkminlabels.pyx @@ -0,0 +1,679 @@ + +from cython cimport floating, integral +from cython.parallel cimport parallel, prange +from libcpp.map cimport map as cmap, pair +from libc.stdlib cimport free + +cimport numpy as cnp + +cnp.import_array() + +from ...utils._typedefs cimport ITYPE_t, DTYPE_t +from ...utils._typedefs import ITYPE, DTYPE +from ...utils._sorting cimport simultaneous_sort +import numpy as np +from scipy.sparse import issparse +from sklearn.utils.fixes import threadpool_limits + 
+cpdef enum WeightingStrategy: + uniform = 0 + distance = 1 + other = 2 +from ._argkmin cimport ArgKmin64, EuclideanArgKmin64 +from ._datasets_pair cimport DatasetsPair64 + +cdef class ArgKminLabels64(ArgKmin64): + """ + 64bit implementation of ArgKminLabel. + """ + cdef: + const ITYPE_t[:] labels, + DTYPE_t[:, :] label_weights + cmap[ITYPE_t, ITYPE_t] labels_to_index + WeightingStrategy weight_type + + @classmethod + def compute( + cls, + X, + Y, + ITYPE_t k, + weights, + labels, + str metric="euclidean", + chunk_size=None, + dict metric_kwargs=None, + str strategy=None, + ): + """Compute the argkmin reduction. + + This classmethod is responsible for introspecting the arguments + values to dispatch to the most appropriate implementation of + :class:`ArgKmin64`. + + This allows decoupling the API entirely from the implementation details + whilst maintaining RAII: all temporarily allocated datastructures necessary + for the concrete implementation are therefore freed when this classmethod + returns. + + No instance should directly be created outside of this class method. + """ + if ( + ( + metric in ("euclidean", "sqeuclidean") + or metric=="minkowski" and (metric_kwargs is None or metric_kwargs.get("p", 2)==2) + ) + and not (issparse(X) ^ issparse(Y)) # "^" is the XOR operator + ): + # Specialized implementation of ArgKminLabels for the Euclidean distance + # for the dense-dense and sparse-sparse cases. + # This implementation computes the distances by chunk using + # a decomposition of the Squared Euclidean distance. + # This specialisation has an improved arithmetic intensity for both + # the dense and sparse settings, allowing in most case speed-ups of + # several orders of magnitude compared to the generic ArgKmin + # implementation. + # For more information see MiddleTermComputer. 
+ use_squared_distances = metric == "sqeuclidean" + pda = EuclideanArgKminLabels64( + X=X, Y=Y, k=k, + use_squared_distances=use_squared_distances, + chunk_size=chunk_size, + strategy=strategy, + metric_kwargs=metric_kwargs, + weights=weights, + labels=labels, + ) + else: + # Fall back on a generic implementation that handles most scipy + # metrics by computing the distances between 2 vectors at a time. + pda = ArgKminLabels64( + datasets_pair=DatasetsPair64.get_for(X, Y, metric, metric_kwargs), + k=k, + chunk_size=chunk_size, + strategy=strategy, + weights=weights, + labels=labels, + ) + + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in GEMM for instance). + with threadpool_limits(limits=1, user_api="blas"): + if pda.execute_in_parallel_on_Y: + pda._parallel_on_Y() + else: + pda._parallel_on_X() + + return pda._finalize_results() + + def __init__( + self, + DatasetsPair64 datasets_pair, + const ITYPE_t[:] labels, + chunk_size=None, + strategy=None, + ITYPE_t k=1, + weights=None, + ): + super().__init__( + datasets_pair=datasets_pair, + chunk_size=chunk_size, + strategy=strategy, + k=k, + ) + + if weights == "uniform": + self.weight_type = WeightingStrategy.uniform + elif weights == "distance": + self.weight_type = WeightingStrategy.distance + else: + self.weight_type = WeightingStrategy.other + self.labels = labels + + cdef ITYPE_t[:] unique_labels = np.unique(labels) + + cdef ITYPE_t idx, label + # Map from set of unique labels to their indices in `label_weights` + for idx, label in enumerate(unique_labels): + self.labels_to_index.insert(pair[ITYPE_t, ITYPE_t](label, idx)) + + # Buffer used in building a histogram for one-pass weighted mode + self.label_weights = np.zeros((self.n_samples_X, len(unique_labels)), dtype=DTYPE) + + def _finalize_results(self): + probabilities = np.asarray(self.label_weights) + probabilities /= probabilities.sum(axis=1, keepdims=True) + return probabilities + + 
cdef inline void weighted_histogram_mode( + self, + ITYPE_t sample_index, + ITYPE_t* indices, + DTYPE_t* distances, + ) nogil: + cdef: + ITYPE_t y_idx, label, label_index, multi_output_index + DTYPE_t label_weight = 1 + + # Iterate through the sample k-nearest neighbours + for jdx in range(self.k): + # Absolute indice of the jdx-th Nearest Neighbors + # in range [0, n_samples_Y) + if self.weight_type == WeightingStrategy.distance: + label_weight = 1 / distances[jdx] + y_idx = indices[jdx] + label = self.labels[y_idx] + label_index = self.labels_to_index[label] + self.label_weights[sample_index][label_index] += label_weight + return + + cdef void _parallel_on_X_prange_iter_finalize( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + cdef: + ITYPE_t idx, sample_index + # Sorting the main heaps portion associated to `X[X_start:X_end]` + # in ascending order w.r.t the distances. + for idx in range(X_end - X_start): + simultaneous_sort( + self.heaps_r_distances_chunks[thread_num] + idx * self.k, + self.heaps_indices_chunks[thread_num] + idx * self.k, + self.k + ) + # One-pass top-one weighted mode + # Compute the absolute index in [0, n_samples_X) + sample_index = X_start + idx + max_label_weight = -1 + self.weighted_histogram_mode( + sample_index, + &self.heaps_indices_chunks[thread_num][0], + &self.heaps_r_distances_chunks[thread_num][0], + ) + return + + cdef void _parallel_on_Y_finalize( + self, + ) nogil: + cdef: + ITYPE_t sample_index, thread_num + + with nogil, parallel(num_threads=self.chunks_n_threads): + # Deallocating temporary datastructures + for thread_num in prange(self.chunks_n_threads, schedule='static'): + free(self.heaps_r_distances_chunks[thread_num]) + free(self.heaps_indices_chunks[thread_num]) + + # Sorting the main in ascending order w.r.t the distances. + # This is done in parallel sample-wise (no need for locks). 
+ for sample_index in prange(self.n_samples_X, schedule='static'): + simultaneous_sort( + &self.argkmin_distances[sample_index, 0], + &self.argkmin_indices[sample_index, 0], + self.k, + ) + self.weighted_histogram_mode( + sample_index, + &self.argkmin_indices[sample_index][0], + &self.argkmin_distances[sample_index][0], + ) + return + +cdef class EuclideanArgKminLabels64(EuclideanArgKmin64): + """ + 64bit implementation of ArgKminLabel. + """ + cdef: + const ITYPE_t[:] labels, + DTYPE_t[:, :] label_weights + cmap[ITYPE_t, ITYPE_t] labels_to_index + WeightingStrategy weight_type + + def __init__( + self, + X, + Y, + ITYPE_t k, + bint use_squared_distances=False, + chunk_size=None, + strategy=None, + metric_kwargs=None, + weights=None, + labels=None, + ): + super().__init__( + X=X, Y=Y, k=k, + use_squared_distances=use_squared_distances, + chunk_size=chunk_size, + strategy=strategy, + metric_kwargs=metric_kwargs, + ) + if weights == "uniform": + self.weight_type = WeightingStrategy.uniform + elif weights == "distance": + self.weight_type = WeightingStrategy.distance + else: + self.weight_type = WeightingStrategy.other + self.labels = labels + + cdef ITYPE_t[:] unique_labels = np.unique(labels) + + cdef ITYPE_t idx, label + # Map from set of unique labels to their indices in `label_weights` + for idx, label in enumerate(unique_labels): + self.labels_to_index.insert(pair[ITYPE_t, ITYPE_t](label, idx)) + + # Buffer used in building a histogram for one-pass weighted mode + self.label_weights = np.zeros((self.n_samples_X, len(unique_labels)), dtype=DTYPE) + + def _finalize_results(self): + probabilities = np.asarray(self.label_weights) + probabilities /= probabilities.sum(axis=1, keepdims=True) + return probabilities + + cdef inline void weighted_histogram_mode( + self, + ITYPE_t sample_index, + ITYPE_t* indices, + DTYPE_t* distances, + ) nogil: + cdef: + ITYPE_t y_idx, label, label_index, multi_output_index + DTYPE_t label_weight = 1 + + # Iterate through the sample 
k-nearest neighbours + for jdx in range(self.k): + # Absolute indice of the jdx-th Nearest Neighbors + # in range [0, n_samples_Y) + if self.weight_type == WeightingStrategy.distance: + label_weight = 1 / distances[jdx] + y_idx = indices[jdx] + label = self.labels[y_idx] + label_index = self.labels_to_index[label] + self.label_weights[sample_index][label_index] += label_weight + return + + cdef void _parallel_on_X_prange_iter_finalize( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + cdef: + ITYPE_t idx, sample_index + # Sorting the main heaps portion associated to `X[X_start:X_end]` + # in ascending order w.r.t the distances. + for idx in range(X_end - X_start): + simultaneous_sort( + self.heaps_r_distances_chunks[thread_num] + idx * self.k, + self.heaps_indices_chunks[thread_num] + idx * self.k, + self.k + ) + # One-pass top-one weighted mode + # Compute the absolute index in [0, n_samples_X) + sample_index = X_start + idx + max_label_weight = -1 + self.weighted_histogram_mode( + sample_index, + &self.heaps_indices_chunks[thread_num][0], + &self.heaps_r_distances_chunks[thread_num][0], + ) + return + + cdef void _parallel_on_Y_finalize( + self, + ) nogil: + cdef: + ITYPE_t sample_index, thread_num + + with nogil, parallel(num_threads=self.chunks_n_threads): + # Deallocating temporary datastructures + for thread_num in prange(self.chunks_n_threads, schedule='static'): + free(self.heaps_r_distances_chunks[thread_num]) + free(self.heaps_indices_chunks[thread_num]) + + # Sorting the main in ascending order w.r.t the distances. + # This is done in parallel sample-wise (no need for locks). 
+ for sample_index in prange(self.n_samples_X, schedule='static'): + simultaneous_sort( + &self.argkmin_distances[sample_index, 0], + &self.argkmin_indices[sample_index, 0], + self.k, + ) + self.weighted_histogram_mode( + sample_index, + &self.argkmin_indices[sample_index][0], + &self.argkmin_distances[sample_index][0], + ) + return +from ._argkmin cimport ArgKmin32, EuclideanArgKmin32 +from ._datasets_pair cimport DatasetsPair32 + +cdef class ArgKminLabels32(ArgKmin32): + """ + 32bit implementation of ArgKminLabel. + """ + cdef: + const ITYPE_t[:] labels, + DTYPE_t[:, :] label_weights + cmap[ITYPE_t, ITYPE_t] labels_to_index + WeightingStrategy weight_type + + @classmethod + def compute( + cls, + X, + Y, + ITYPE_t k, + weights, + labels, + str metric="euclidean", + chunk_size=None, + dict metric_kwargs=None, + str strategy=None, + ): + """Compute the argkmin reduction. + + This classmethod is responsible for introspecting the arguments + values to dispatch to the most appropriate implementation of + :class:`ArgKmin32`. + + This allows decoupling the API entirely from the implementation details + whilst maintaining RAII: all temporarily allocated datastructures necessary + for the concrete implementation are therefore freed when this classmethod + returns. + + No instance should directly be created outside of this class method. + """ + if ( + ( + metric in ("euclidean", "sqeuclidean") + or metric=="minkowski" and (metric_kwargs is None or metric_kwargs.get("p", 2)==2) + ) + and not (issparse(X) ^ issparse(Y)) # "^" is the XOR operator + ): + # Specialized implementation of ArgKminLabels for the Euclidean distance + # for the dense-dense and sparse-sparse cases. + # This implementation computes the distances by chunk using + # a decomposition of the Squared Euclidean distance. 
+ # This specialisation has an improved arithmetic intensity for both + # the dense and sparse settings, allowing in most case speed-ups of + # several orders of magnitude compared to the generic ArgKmin + # implementation. + # For more information see MiddleTermComputer. + use_squared_distances = metric == "sqeuclidean" + pda = EuclideanArgKminLabels32( + X=X, Y=Y, k=k, + use_squared_distances=use_squared_distances, + chunk_size=chunk_size, + strategy=strategy, + metric_kwargs=metric_kwargs, + weights=weights, + labels=labels, + ) + else: + # Fall back on a generic implementation that handles most scipy + # metrics by computing the distances between 2 vectors at a time. + pda = ArgKminLabels32( + datasets_pair=DatasetsPair32.get_for(X, Y, metric, metric_kwargs), + k=k, + chunk_size=chunk_size, + strategy=strategy, + weights=weights, + labels=labels, + ) + + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in GEMM for instance). 
+ with threadpool_limits(limits=1, user_api="blas"): + if pda.execute_in_parallel_on_Y: + pda._parallel_on_Y() + else: + pda._parallel_on_X() + + return pda._finalize_results() + + def __init__( + self, + DatasetsPair32 datasets_pair, + const ITYPE_t[:] labels, + chunk_size=None, + strategy=None, + ITYPE_t k=1, + weights=None, + ): + super().__init__( + datasets_pair=datasets_pair, + chunk_size=chunk_size, + strategy=strategy, + k=k, + ) + + if weights == "uniform": + self.weight_type = WeightingStrategy.uniform + elif weights == "distance": + self.weight_type = WeightingStrategy.distance + else: + self.weight_type = WeightingStrategy.other + self.labels = labels + + cdef ITYPE_t[:] unique_labels = np.unique(labels) + + cdef ITYPE_t idx, label + # Map from set of unique labels to their indices in `label_weights` + for idx, label in enumerate(unique_labels): + self.labels_to_index.insert(pair[ITYPE_t, ITYPE_t](label, idx)) + + # Buffer used in building a histogram for one-pass weighted mode + self.label_weights = np.zeros((self.n_samples_X, len(unique_labels)), dtype=DTYPE) + + def _finalize_results(self): + probabilities = np.asarray(self.label_weights) + probabilities /= probabilities.sum(axis=1, keepdims=True) + return probabilities + + cdef inline void weighted_histogram_mode( + self, + ITYPE_t sample_index, + ITYPE_t* indices, + DTYPE_t* distances, + ) nogil: + cdef: + ITYPE_t y_idx, label, label_index, multi_output_index + DTYPE_t label_weight = 1 + + # Iterate through the sample k-nearest neighbours + for jdx in range(self.k): + # Absolute indice of the jdx-th Nearest Neighbors + # in range [0, n_samples_Y) + if self.weight_type == WeightingStrategy.distance: + label_weight = 1 / distances[jdx] + y_idx = indices[jdx] + label = self.labels[y_idx] + label_index = self.labels_to_index[label] + self.label_weights[sample_index][label_index] += label_weight + return + + cdef void _parallel_on_X_prange_iter_finalize( + self, + ITYPE_t thread_num, + ITYPE_t X_start, 
+ ITYPE_t X_end, + ) nogil: + cdef: + ITYPE_t idx, sample_index + # Sorting the main heaps portion associated to `X[X_start:X_end]` + # in ascending order w.r.t the distances. + for idx in range(X_end - X_start): + simultaneous_sort( + self.heaps_r_distances_chunks[thread_num] + idx * self.k, + self.heaps_indices_chunks[thread_num] + idx * self.k, + self.k + ) + # One-pass top-one weighted mode + # Compute the absolute index in [0, n_samples_X) + sample_index = X_start + idx + max_label_weight = -1 + self.weighted_histogram_mode( + sample_index, + &self.heaps_indices_chunks[thread_num][0], + &self.heaps_r_distances_chunks[thread_num][0], + ) + return + + cdef void _parallel_on_Y_finalize( + self, + ) nogil: + cdef: + ITYPE_t sample_index, thread_num + + with nogil, parallel(num_threads=self.chunks_n_threads): + # Deallocating temporary datastructures + for thread_num in prange(self.chunks_n_threads, schedule='static'): + free(self.heaps_r_distances_chunks[thread_num]) + free(self.heaps_indices_chunks[thread_num]) + + # Sorting the main in ascending order w.r.t the distances. + # This is done in parallel sample-wise (no need for locks). + for sample_index in prange(self.n_samples_X, schedule='static'): + simultaneous_sort( + &self.argkmin_distances[sample_index, 0], + &self.argkmin_indices[sample_index, 0], + self.k, + ) + self.weighted_histogram_mode( + sample_index, + &self.argkmin_indices[sample_index][0], + &self.argkmin_distances[sample_index][0], + ) + return + +cdef class EuclideanArgKminLabels32(EuclideanArgKmin32): + """ + 32bit implementation of ArgKminLabel. 
+ """ + cdef: + const ITYPE_t[:] labels, + DTYPE_t[:, :] label_weights + cmap[ITYPE_t, ITYPE_t] labels_to_index + WeightingStrategy weight_type + + def __init__( + self, + X, + Y, + ITYPE_t k, + bint use_squared_distances=False, + chunk_size=None, + strategy=None, + metric_kwargs=None, + weights=None, + labels=None, + ): + super().__init__( + X=X, Y=Y, k=k, + use_squared_distances=use_squared_distances, + chunk_size=chunk_size, + strategy=strategy, + metric_kwargs=metric_kwargs, + ) + if weights == "uniform": + self.weight_type = WeightingStrategy.uniform + elif weights == "distance": + self.weight_type = WeightingStrategy.distance + else: + self.weight_type = WeightingStrategy.other + self.labels = labels + + cdef ITYPE_t[:] unique_labels = np.unique(labels) + + cdef ITYPE_t idx, label + # Map from set of unique labels to their indices in `label_weights` + for idx, label in enumerate(unique_labels): + self.labels_to_index.insert(pair[ITYPE_t, ITYPE_t](label, idx)) + + # Buffer used in building a histogram for one-pass weighted mode + self.label_weights = np.zeros((self.n_samples_X, len(unique_labels)), dtype=DTYPE) + + def _finalize_results(self): + probabilities = np.asarray(self.label_weights) + probabilities /= probabilities.sum(axis=1, keepdims=True) + return probabilities + + cdef inline void weighted_histogram_mode( + self, + ITYPE_t sample_index, + ITYPE_t* indices, + DTYPE_t* distances, + ) nogil: + cdef: + ITYPE_t y_idx, label, label_index, multi_output_index + DTYPE_t label_weight = 1 + + # Iterate through the sample k-nearest neighbours + for jdx in range(self.k): + # Absolute indice of the jdx-th Nearest Neighbors + # in range [0, n_samples_Y) + if self.weight_type == WeightingStrategy.distance: + label_weight = 1 / distances[jdx] + y_idx = indices[jdx] + label = self.labels[y_idx] + label_index = self.labels_to_index[label] + self.label_weights[sample_index][label_index] += label_weight + return + + cdef void _parallel_on_X_prange_iter_finalize( + 
self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + cdef: + ITYPE_t idx, sample_index + # Sorting the main heaps portion associated to `X[X_start:X_end]` + # in ascending order w.r.t the distances. + for idx in range(X_end - X_start): + simultaneous_sort( + self.heaps_r_distances_chunks[thread_num] + idx * self.k, + self.heaps_indices_chunks[thread_num] + idx * self.k, + self.k + ) + # One-pass top-one weighted mode + # Compute the absolute index in [0, n_samples_X) + sample_index = X_start + idx + max_label_weight = -1 + self.weighted_histogram_mode( + sample_index, + &self.heaps_indices_chunks[thread_num][0], + &self.heaps_r_distances_chunks[thread_num][0], + ) + return + + cdef void _parallel_on_Y_finalize( + self, + ) nogil: + cdef: + ITYPE_t sample_index, thread_num + + with nogil, parallel(num_threads=self.chunks_n_threads): + # Deallocating temporary datastructures + for thread_num in prange(self.chunks_n_threads, schedule='static'): + free(self.heaps_r_distances_chunks[thread_num]) + free(self.heaps_indices_chunks[thread_num]) + + # Sorting the main in ascending order w.r.t the distances. + # This is done in parallel sample-wise (no need for locks). 
+ for sample_index in prange(self.n_samples_X, schedule='static'): + simultaneous_sort( + &self.argkmin_distances[sample_index, 0], + &self.argkmin_indices[sample_index, 0], + self.k, + ) + self.weighted_histogram_mode( + sample_index, + &self.argkmin_indices[sample_index][0], + &self.argkmin_distances[sample_index][0], + ) + return diff --git a/sklearn/metrics/_pairwise_distances_reduction/_engines.pxd b/sklearn/metrics/_pairwise_distances_reduction/_engines.pxd new file mode 100644 index 0000000000000..33023cdb2a400 --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction/_engines.pxd @@ -0,0 +1,347 @@ +cimport numpy as cnp + +from libcpp.vector cimport vector + +from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t + + +cdef void _middle_term_sparse_sparse_64( + const DTYPE_t[:] X_data, + const SPARSE_INDEX_TYPE_t[:] X_indices, + const SPARSE_INDEX_TYPE_t[:] X_indptr, + ITYPE_t X_start, + ITYPE_t X_end, + const DTYPE_t[:] Y_data, + const SPARSE_INDEX_TYPE_t[:] Y_indices, + const SPARSE_INDEX_TYPE_t[:] Y_indptr, + ITYPE_t Y_start, + ITYPE_t Y_end, + DTYPE_t * D, +) nogil + +cdef class BaseEngine: + + cdef void _parallel_on_X_parallel_init( + self, + ITYPE_t thread_num, + ) nogil + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil + + cdef void _parallel_on_X_prange_iter_finalize( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil + + cdef void _parallel_on_X_parallel_finalize( + self, + ITYPE_t thread_num + ) nogil + + cdef void _parallel_on_Y_init( + self, + ) nogil + + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + 
ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil + + cdef void _parallel_on_Y_synchronize( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil + + cdef void _compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil + +cdef class EuclideanEngine64(BaseEngine): + cdef: + ITYPE_t effective_n_threads + ITYPE_t chunks_n_threads + ITYPE_t dist_middle_terms_chunks_size + ITYPE_t n_features + ITYPE_t chunk_size + DTYPE_t[::1] X_norm_squared, Y_norm_squared + + # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM + vector[vector[DTYPE_t]] dist_middle_terms_chunks + + + cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil + + cdef void _parallel_on_Y_init(self) nogil + + cdef DTYPE_t * _compute_dist_middle_terms( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil + + cdef DTYPE_t _compute_pair_distance( + self, + ITYPE_t i, # Index of X sample + ITYPE_t j, # Index of Y sample + ITYPE_t X_start, # Index offset + ITYPE_t Y_start, # Index offset + DTYPE_t * dist_middle_terms, # Array of pre-computeted middle terms + ) nogil + +cdef class DenseDenseEuclideanEngine64(EuclideanEngine64): + cdef: + const DTYPE_t[:, ::1] X + const DTYPE_t[:, ::1] Y + + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil + + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( 
+ self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num + ) nogil + + cdef DTYPE_t * _compute_dist_middle_terms( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil + + +cdef class SparseSparseEuclideanEngine64(EuclideanEngine64): + cdef: + const DTYPE_t[:] X_data + const SPARSE_INDEX_TYPE_t[:] X_indices + const SPARSE_INDEX_TYPE_t[:] X_indptr + + const DTYPE_t[:] Y_data + const SPARSE_INDEX_TYPE_t[:] Y_indices + const SPARSE_INDEX_TYPE_t[:] Y_indptr + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num + ) nogil + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num + ) nogil + + cdef DTYPE_t * _compute_dist_middle_terms( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil + +cdef class EuclideanEngine32(BaseEngine): + cdef: + ITYPE_t effective_n_threads + ITYPE_t chunks_n_threads + ITYPE_t dist_middle_terms_chunks_size + ITYPE_t n_features + ITYPE_t chunk_size + DTYPE_t[::1] X_norm_squared, Y_norm_squared + + # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM + vector[vector[DTYPE_t]] dist_middle_terms_chunks + + + cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil + + cdef void _parallel_on_Y_init(self) nogil + + cdef DTYPE_t * _compute_dist_middle_terms( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil + + cdef DTYPE_t _compute_pair_distance( + self, + ITYPE_t i, # Index of X sample + ITYPE_t j, # Index of Y sample + ITYPE_t X_start, # Index offset + ITYPE_t 
Y_start, # Index offset + DTYPE_t * dist_middle_terms, # Array of pre-computeted middle terms + ) nogil + +cdef class DenseDenseEuclideanEngine32(EuclideanEngine32): + cdef: + const cnp.float32_t[:, ::1] X + const cnp.float32_t[:, ::1] Y + + # Buffers for upcasting chunks of X and Y from 32bit to 64bit + vector[vector[DTYPE_t]] X_c_upcast + vector[vector[DTYPE_t]] Y_c_upcast + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil + + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num + ) nogil + + cdef DTYPE_t * _compute_dist_middle_terms( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil + + +cdef class SparseSparseEuclideanEngine32(EuclideanEngine32): + cdef: + const DTYPE_t[:] X_data + const SPARSE_INDEX_TYPE_t[:] X_indices + const SPARSE_INDEX_TYPE_t[:] X_indptr + + const DTYPE_t[:] Y_data + const SPARSE_INDEX_TYPE_t[:] Y_indices + const SPARSE_INDEX_TYPE_t[:] Y_indptr + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num + ) nogil + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num + ) nogil + + cdef DTYPE_t * _compute_dist_middle_terms( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil diff --git 
a/sklearn/metrics/_pairwise_distances_reduction/_engines.pyx b/sklearn/metrics/_pairwise_distances_reduction/_engines.pyx new file mode 100644 index 0000000000000..5e1fe8cb457b3 --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction/_engines.pyx @@ -0,0 +1,940 @@ +cimport numpy as cnp + +from libcpp.vector cimport vector + +from ...utils._cython_blas cimport ( + BLAS_Order, + BLAS_Trans, + NoTrans, + RowMajor, + Trans, + _gemm, +) +from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t + +# TODO: change for `libcpp.algorithm.fill` once Cython 3 is used +# Introduction in Cython: +# +# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L50 #noqa +cdef extern from "<algorithm>" namespace "std" nogil: + void fill[Iter, T](Iter first, Iter last, const T& value) except + #noqa + +import numpy as np +from scipy.sparse import issparse, csr_matrix +from ...utils._typedefs import DTYPE, SPARSE_INDEX_TYPE +from ...utils import check_array + +cdef class BaseEngine: + def __init__(self): + return + + cdef void _parallel_on_X_parallel_init( + self, + ITYPE_t thread_num, + ) nogil: + return + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + return + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + return + + cdef void _parallel_on_X_prange_iter_finalize( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + return + + cdef void _parallel_on_X_parallel_finalize( + self, + ITYPE_t thread_num + ) nogil: + return + + cdef void _parallel_on_Y_init( + self, + ) nogil: + return + + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + return + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + 
self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + return + + cdef void _parallel_on_Y_synchronize( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + return + + cdef void _compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + return + +# TODO: If possible optimize this routine to efficiently treat cases where +# `n_samples_X << n_samples_Y` met in practice when X_test consists of a +# few samples, and thus when there's a single chunk of X whose number of +# samples is less than the default chunk size. + +# TODO: compare this routine with the similar ones in SciPy, especially +# `csr_matmat` which might implement a better algorithm. +# See: https://github.com/scipy/scipy/blob/e58292e066ba2cb2f3d1e0563ca9314ff1f4f311/scipy/sparse/sparsetools/csr.h#L603-L669 # noqa +cdef void _middle_term_sparse_sparse_64( + const DTYPE_t[:] X_data, + const SPARSE_INDEX_TYPE_t[:] X_indices, + const SPARSE_INDEX_TYPE_t[:] X_indptr, + ITYPE_t X_start, + ITYPE_t X_end, + const DTYPE_t[:] Y_data, + const SPARSE_INDEX_TYPE_t[:] Y_indices, + const SPARSE_INDEX_TYPE_t[:] Y_indptr, + ITYPE_t Y_start, + ITYPE_t Y_end, + DTYPE_t * D, +) nogil: + # This routine assumes that D points to the first element of a + # zeroed buffer of length at least equal to n_X × n_Y, conceptually + # representing a 2-d C-ordered array. 
+ cdef: + ITYPE_t i, j, k + ITYPE_t n_X = X_end - X_start + ITYPE_t n_Y = Y_end - Y_start + ITYPE_t X_i_col_idx, X_i_ptr, Y_j_col_idx, Y_j_ptr + + for i in range(n_X): + for X_i_ptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]): + X_i_col_idx = X_indices[X_i_ptr] + for j in range(n_Y): + k = i * n_Y + j + for Y_j_ptr in range(Y_indptr[Y_start+j], Y_indptr[Y_start+j+1]): + Y_j_col_idx = Y_indices[Y_j_ptr] + if X_i_col_idx == Y_j_col_idx: + D[k] += -2 * X_data[X_i_ptr] * Y_data[Y_j_ptr] + + +from ._base cimport _sqeuclidean_row_norms64 + +cdef class EuclideanEngine64(BaseEngine): + """Helper class to compute a Euclidean distance matrix in chunks. + + This is an abstract base class that is further specialized depending + on the type of data (dense or sparse). + + `EuclideanDistance` subclasses relies on the squared Euclidean + distances between chunks of vectors X_c and Y_c using the + following decomposition for the (i,j) pair : + + + ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + + + This helper class is in charge of wrapping the common logic to compute + the middle term, i.e. `- 2 X_c_i.Y_c_j^T`. + """ + + @classmethod + def get_for( + cls, + X, + Y, + pda, + ) -> EuclideanEngine64: + """Return the DatasetsPair implementation for the given arguments. + + Parameters + ---------- + X : ndarray or CSR sparse matrix of shape (n_samples_X, n_features) + Input data. + If provided as a ndarray, it must be C-contiguous. + + Y : ndarray or CSR sparse matrix of shape (n_samples_Y, n_features) + Input data. + If provided as a ndarray, it must be C-contiguous. + + Returns + ------- + engine: EuclideanEngine64 + The suited EuclideanEngine64 implementation. 
+ """ + X_is_sparse = issparse(X) + Y_is_sparse = issparse(Y) + dist_middle_terms_chunks_size = pda.Y_n_samples_chunk * pda.X_n_samples_chunk + if not X_is_sparse and not Y_is_sparse: + return DenseDenseEuclideanEngine64( + X, + Y, + effective_n_threads=pda.effective_n_threads, + chunks_n_threads=pda.chunks_n_threads, + dist_middle_terms_chunks_size=dist_middle_terms_chunks_size, + chunk_size=pda.chunk_size, + metric_kwargs=pda.metric_kwargs, + ) + if X_is_sparse and Y_is_sparse: + return SparseSparseEuclideanEngine64( + X, + Y, + effective_n_threads=pda.effective_n_threads, + chunks_n_threads=pda.chunks_n_threads, + dist_middle_terms_chunks_size=dist_middle_terms_chunks_size, + chunk_size=pda.chunk_size, + metric_kwargs=pda.metric_kwargs, + ) + + raise NotImplementedError( + "X and Y must be both CSR sparse matrices or both numpy arrays." + ) + + + @classmethod + def unpack_csr_matrix(cls, X: csr_matrix): + """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE.""" + X_data = np.asarray(X.data, dtype=DTYPE) + X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE) + X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE) + return X_data, X_indices, X_indptr + + def __init__( + self, + X, + Y, + ITYPE_t effective_n_threads, + ITYPE_t chunks_n_threads, + ITYPE_t dist_middle_terms_chunks_size, + ITYPE_t chunk_size, + dict metric_kwargs=None, + ): + self.effective_n_threads = effective_n_threads + self.chunks_n_threads = chunks_n_threads + self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size + self.n_features = X.shape[1] + self.chunk_size = chunk_size + + self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads) + + if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: + self.Y_norm_squared = check_array( + metric_kwargs.pop("Y_norm_squared"), + ensure_2d=False, + input_name="Y_norm_squared", + dtype=np.float64, + ) + else: + self.Y_norm_squared = _sqeuclidean_row_norms64( + Y, + 
self.effective_n_threads, + ) + + if metric_kwargs is not None and "X_norm_squared" in metric_kwargs: + self.X_norm_squared = check_array( + metric_kwargs.pop("X_norm_squared"), + ensure_2d=False, + input_name="X_norm_squared", + dtype=np.float64, + ) + else: + # Do not recompute norms if datasets are identical. + self.X_norm_squared = ( + self.Y_norm_squared if X is Y else + _sqeuclidean_row_norms64( + X, + self.effective_n_threads, + ) + ) + + cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil: + self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size) + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + return + + cdef void _parallel_on_Y_init(self) nogil: + for thread_num in range(self.chunks_n_threads): + self.dist_middle_terms_chunks[thread_num].resize( + self.dist_middle_terms_chunks_size + ) + + cdef DTYPE_t * _compute_dist_middle_terms( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + return NULL + + cdef DTYPE_t _compute_pair_distance( + self, + ITYPE_t i, # Index of X sample + ITYPE_t j, # Index of Y sample + ITYPE_t X_start, # Index offset + ITYPE_t Y_start, # Index offset + DTYPE_t * dist_middle_terms, # Array of pre-computeted middle terms + ) nogil: + + cdef ITYPE_t n_Y = len(self.Y_norm_squared) + # Index of middle term + cdef ITYPE_t k = n_Y * i + j + cdef DTYPE_t val = ( + self.X_norm_squared[i + X_start] + + dist_middle_terms[i * n_Y + j] + + self.Y_norm_squared[j + Y_start] + ) + # Catastrophic cancellation might cause -0. to be present, + # e.g. when computing d(x_i, y_i) when X is Y. + return max(0., val) + + +cdef class DenseDenseEuclideanEngine64(EuclideanEngine64): + """Computes the middle term of the Euclidean distance between two chunked dense matrices + X_c and Y_c. 
+ + dist_middle_terms = - 2 X_c_i.Y_c_j^T + + This class use the BLAS gemm routine to perform the dot product of each chunks + of the distance matrix with improved arithmetic intensity and vector instruction (SIMD). + """ + + def __init__( + self, + const DTYPE_t[:, ::1] X, + const DTYPE_t[:, ::1] Y, + ITYPE_t effective_n_threads, + ITYPE_t chunks_n_threads, + ITYPE_t dist_middle_terms_chunks_size, + ITYPE_t n_features, + ITYPE_t chunk_size, + dict metric_kwargs=None, + ): + super().__init__( + X, Y, + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + metric_kwargs=None, + ) + self.X = X + self.Y = Y + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + return + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + return + + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + return + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num + ) nogil: + return + + cdef DTYPE_t * _compute_dist_middle_terms( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data() + + # Careful: LDA, LDB and LDC are given for F-ordered arrays + # in BLAS documentations, for instance: + # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa + # + # Here, we use their counterpart values to work with C-ordered arrays. 
+ BLAS_Order order = RowMajor + BLAS_Trans ta = NoTrans + BLAS_Trans tb = Trans + ITYPE_t m = X_end - X_start + ITYPE_t n = Y_end - Y_start + ITYPE_t K = self.n_features + DTYPE_t alpha = - 2. + # Casting for A and B to remove the const is needed because APIs exposed via + # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier. + # See: https://github.com/scipy/scipy/issues/14262 + DTYPE_t * A = &self.X[X_start, 0] + DTYPE_t * B = &self.Y[Y_start, 0] + ITYPE_t lda = self.n_features + ITYPE_t ldb = self.n_features + DTYPE_t beta = 0. + ITYPE_t ldc = Y_end - Y_start + + # dist_middle_terms = `-2 * X[X_start:X_end] @ Y[Y_start:Y_end].T` + _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc) + + return dist_middle_terms + + +cdef class SparseSparseEuclideanEngine64(EuclideanEngine64): + """Middle term of the Euclidean distance between two chunked CSR matrices. + + The result is return as a contiguous array. + + dist_middle_terms = - 2 X_c_i.Y_c_j^T + + The logic of the computation is wrapped in the routine _middle_term_sparse_sparse_64. + This routine iterates over the data, indices and indptr arrays of the sparse matrices without + densifying them. 
+ """ + + def __init__( + self, + X, + Y, + ITYPE_t effective_n_threads, + ITYPE_t chunks_n_threads, + ITYPE_t dist_middle_terms_chunks_size, + ITYPE_t n_features, + ITYPE_t chunk_size, + ): + super().__init__( + X, Y, + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + metric_kwargs=None, + ) + self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) + self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y) + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + # Flush the thread dist_middle_terms_chunks to 0.0 + fill( + self.dist_middle_terms_chunks[thread_num].begin(), + self.dist_middle_terms_chunks[thread_num].end(), + 0.0, + ) + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + # Flush the thread dist_middle_terms_chunks to 0.0 + fill( + self.dist_middle_terms_chunks[thread_num].begin(), + self.dist_middle_terms_chunks[thread_num].end(), + 0.0, + ) + + cdef DTYPE_t * _compute_dist_middle_terms( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + DTYPE_t *dist_middle_terms = ( + self.dist_middle_terms_chunks[thread_num].data() + ) + + _middle_term_sparse_sparse_64( + self.X_data, + self.X_indices, + self.X_indptr, + X_start, + X_end, + self.Y_data, + self.Y_indices, + self.Y_indptr, + Y_start, + Y_end, + dist_middle_terms, + ) + + return dist_middle_terms + +from ._base cimport _sqeuclidean_row_norms32 + +cdef class EuclideanEngine32(BaseEngine): + """Helper class to compute a Euclidean distance matrix in chunks. + + This is an abstract base class that is further specialized depending + on the type of data (dense or sparse). 
+ + `EuclideanDistance` subclasses relies on the squared Euclidean + distances between chunks of vectors X_c and Y_c using the + following decomposition for the (i,j) pair : + + + ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + + + This helper class is in charge of wrapping the common logic to compute + the middle term, i.e. `- 2 X_c_i.Y_c_j^T`. + """ + + @classmethod + def get_for( + cls, + X, + Y, + pda, + ) -> EuclideanEngine32: + """Return the DatasetsPair implementation for the given arguments. + + Parameters + ---------- + X : ndarray or CSR sparse matrix of shape (n_samples_X, n_features) + Input data. + If provided as a ndarray, it must be C-contiguous. + + Y : ndarray or CSR sparse matrix of shape (n_samples_Y, n_features) + Input data. + If provided as a ndarray, it must be C-contiguous. + + Returns + ------- + engine: EuclideanEngine32 + The suited EuclideanEngine32 implementation. + """ + X_is_sparse = issparse(X) + Y_is_sparse = issparse(Y) + dist_middle_terms_chunks_size = pda.Y_n_samples_chunk * pda.X_n_samples_chunk + if not X_is_sparse and not Y_is_sparse: + return DenseDenseEuclideanEngine32( + X, + Y, + effective_n_threads=pda.effective_n_threads, + chunks_n_threads=pda.chunks_n_threads, + dist_middle_terms_chunks_size=dist_middle_terms_chunks_size, + chunk_size=pda.chunk_size, + metric_kwargs=pda.metric_kwargs, + ) + if X_is_sparse and Y_is_sparse: + return SparseSparseEuclideanEngine32( + X, + Y, + effective_n_threads=pda.effective_n_threads, + chunks_n_threads=pda.chunks_n_threads, + dist_middle_terms_chunks_size=dist_middle_terms_chunks_size, + chunk_size=pda.chunk_size, + metric_kwargs=pda.metric_kwargs, + ) + + raise NotImplementedError( + "X and Y must be both CSR sparse matrices or both numpy arrays." 
+ ) + + + @classmethod + def unpack_csr_matrix(cls, X: csr_matrix): + """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE.""" + X_data = np.asarray(X.data, dtype=DTYPE) + X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE) + X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE) + return X_data, X_indices, X_indptr + + def __init__( + self, + X, + Y, + ITYPE_t effective_n_threads, + ITYPE_t chunks_n_threads, + ITYPE_t dist_middle_terms_chunks_size, + ITYPE_t chunk_size, + dict metric_kwargs=None, + ): + self.effective_n_threads = effective_n_threads + self.chunks_n_threads = chunks_n_threads + self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size + self.n_features = X.shape[1] + self.chunk_size = chunk_size + + self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads) + + if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: + self.Y_norm_squared = check_array( + metric_kwargs.pop("Y_norm_squared"), + ensure_2d=False, + input_name="Y_norm_squared", + dtype=np.float64, + ) + else: + self.Y_norm_squared = _sqeuclidean_row_norms32( + Y, + self.effective_n_threads, + ) + + if metric_kwargs is not None and "X_norm_squared" in metric_kwargs: + self.X_norm_squared = check_array( + metric_kwargs.pop("X_norm_squared"), + ensure_2d=False, + input_name="X_norm_squared", + dtype=np.float64, + ) + else: + # Do not recompute norms if datasets are identical. 
+ self.X_norm_squared = ( + self.Y_norm_squared if X is Y else + _sqeuclidean_row_norms32( + X, + self.effective_n_threads, + ) + ) + + cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil: + self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size) + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + return + + cdef void _parallel_on_Y_init(self) nogil: + for thread_num in range(self.chunks_n_threads): + self.dist_middle_terms_chunks[thread_num].resize( + self.dist_middle_terms_chunks_size + ) + + cdef DTYPE_t * _compute_dist_middle_terms( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + return NULL + + cdef DTYPE_t _compute_pair_distance( + self, + ITYPE_t i, # Index of X sample + ITYPE_t j, # Index of Y sample + ITYPE_t X_start, # Index offset + ITYPE_t Y_start, # Index offset + DTYPE_t * dist_middle_terms, # Array of pre-computeted middle terms + ) nogil: + + cdef ITYPE_t n_Y = len(self.Y_norm_squared) + # Index of middle term + cdef ITYPE_t k = n_Y * i + j + cdef DTYPE_t val = ( + self.X_norm_squared[i + X_start] + + dist_middle_terms[i * n_Y + j] + + self.Y_norm_squared[j + Y_start] + ) + # Catastrophic cancellation might cause -0. to be present, + # e.g. when computing d(x_i, y_i) when X is Y. + return max(0., val) + + +cdef class DenseDenseEuclideanEngine32(EuclideanEngine32): + """Computes the middle term of the Euclidean distance between two chunked dense matrices + X_c and Y_c. + + dist_middle_terms = - 2 X_c_i.Y_c_j^T + + This class use the BLAS gemm routine to perform the dot product of each chunks + of the distance matrix with improved arithmetic intensity and vector instruction (SIMD). 
+ """ + + def __init__( + self, + const cnp.float32_t[:, ::1] X, + const cnp.float32_t[:, ::1] Y, + ITYPE_t effective_n_threads, + ITYPE_t chunks_n_threads, + ITYPE_t dist_middle_terms_chunks_size, + ITYPE_t n_features, + ITYPE_t chunk_size, + dict metric_kwargs=None, + ): + super().__init__( + X, Y, + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + metric_kwargs=None, + ) + self.X = X + self.Y = Y + # We populate the buffer for upcasting chunks of X and Y from float32 to float64. + self.X_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads) + self.Y_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads) + + upcast_buffer_n_elements = self.chunk_size * n_features + + for thread_num in range(self.effective_n_threads): + self.X_c_upcast[thread_num].resize(upcast_buffer_n_elements) + self.Y_c_upcast[thread_num].resize(upcast_buffer_n_elements) + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + ITYPE_t i, j + ITYPE_t n_chunk_samples = Y_end - Y_start + + # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64 + for i in range(n_chunk_samples): + for j in range(self.n_features): + self.Y_c_upcast[thread_num][i * self.n_features + j] = self.Y[Y_start + i, j] + + cdef void _parallel_on_X_init_chunk( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + cdef: + ITYPE_t i, j + ITYPE_t n_chunk_samples = X_end - X_start + + # Upcasting X_c=X[X_start:X_end, :] from float32 to float64 + for i in range(n_chunk_samples): + for j in range(self.n_features): + self.X_c_upcast[thread_num][i * self.n_features + j] = self.X[X_start + i, j] + + cdef void _parallel_on_Y_parallel_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + cdef: + ITYPE_t i, j + ITYPE_t n_chunk_samples = X_end - X_start + + # Upcasting 
X_c=X[X_start:X_end, :] from float32 to float64 + for i in range(n_chunk_samples): + for j in range(self.n_features): + self.X_c_upcast[thread_num][i * self.n_features + j] = self.X[X_start + i, j] + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num + ) nogil: + cdef: + ITYPE_t i, j + ITYPE_t n_chunk_samples = Y_end - Y_start + + # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64 + for i in range(n_chunk_samples): + for j in range(self.n_features): + self.Y_c_upcast[thread_num][i * self.n_features + j] = self.Y[Y_start + i, j] + + cdef DTYPE_t * _compute_dist_middle_terms( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data() + + # Careful: LDA, LDB and LDC are given for F-ordered arrays + # in BLAS documentations, for instance: + # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa + # + # Here, we use their counterpart values to work with C-ordered arrays. + BLAS_Order order = RowMajor + BLAS_Trans ta = NoTrans + BLAS_Trans tb = Trans + ITYPE_t m = X_end - X_start + ITYPE_t n = Y_end - Y_start + ITYPE_t K = self.n_features + DTYPE_t alpha = - 2. + DTYPE_t * A = self.X_c_upcast[thread_num].data() + DTYPE_t * B = self.Y_c_upcast[thread_num].data() + ITYPE_t lda = self.n_features + ITYPE_t ldb = self.n_features + DTYPE_t beta = 0. + ITYPE_t ldc = Y_end - Y_start + + # dist_middle_terms = `-2 * X[X_start:X_end] @ Y[Y_start:Y_end].T` + _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc) + + return dist_middle_terms + + +cdef class SparseSparseEuclideanEngine32(EuclideanEngine32): + """Middle term of the Euclidean distance between two chunked CSR matrices. 
+ + The result is return as a contiguous array. + + dist_middle_terms = - 2 X_c_i.Y_c_j^T + + The logic of the computation is wrapped in the routine _middle_term_sparse_sparse_64. + This routine iterates over the data, indices and indptr arrays of the sparse matrices without + densifying them. + """ + + def __init__( + self, + X, + Y, + ITYPE_t effective_n_threads, + ITYPE_t chunks_n_threads, + ITYPE_t dist_middle_terms_chunks_size, + ITYPE_t n_features, + ITYPE_t chunk_size, + ): + super().__init__( + X, Y, + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + metric_kwargs=None, + ) + self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) + self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y) + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + # Flush the thread dist_middle_terms_chunks to 0.0 + fill( + self.dist_middle_terms_chunks[thread_num].begin(), + self.dist_middle_terms_chunks[thread_num].end(), + 0.0, + ) + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + # Flush the thread dist_middle_terms_chunks to 0.0 + fill( + self.dist_middle_terms_chunks[thread_num].begin(), + self.dist_middle_terms_chunks[thread_num].end(), + 0.0, + ) + + cdef DTYPE_t * _compute_dist_middle_terms( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + DTYPE_t *dist_middle_terms = ( + self.dist_middle_terms_chunks[thread_num].data() + ) + + _middle_term_sparse_sparse_64( + self.X_data, + self.X_indices, + self.X_indptr, + X_start, + X_end, + self.Y_data, + self.Y_indices, + self.Y_indptr, + Y_start, + Y_end, + dist_middle_terms, + ) + + return dist_middle_terms 
From 7eff5ba01e72b209cd3bfb0b2c14b28c9be778cb Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Mon, 19 Dec 2022 13:06:13 -0500 Subject: [PATCH 21/25] Revert "Removed prange and added note" This reverts commit cc526317dc29cd7c92ff4fecb9c97e2a0b747b75. --- sklearn/cluster/_hdbscan/_reachability.pyx | 5 +- .../_argkminlabels.pyx | 679 ------------- .../_engines.pxd | 347 ------- .../_engines.pyx | 940 ------------------ 4 files changed, 2 insertions(+), 1969 deletions(-) delete mode 100644 sklearn/metrics/_pairwise_distances_reduction/_argkminlabels.pyx delete mode 100644 sklearn/metrics/_pairwise_distances_reduction/_engines.pxd delete mode 100644 sklearn/metrics/_pairwise_distances_reduction/_engines.pyx diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index dc4263694f89a..efc641df29e19 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -10,6 +10,7 @@ cimport numpy as cnp import numpy as np from scipy.sparse import issparse from cython cimport floating, integral +from cython.parallel cimport prange from libc.math cimport isfinite, INFINITY cnp.import_array() @@ -110,9 +111,7 @@ def _dense_mutual_reachability_graph( ) with nogil: - # TODO: Update w/ prange with thread count based on - # _openmp_effective_n_threads - for i in range(n_samples): + for i in prange(n_samples): for j in range(n_samples): mutual_reachibility_distance = max( core_distances[i], diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkminlabels.pyx b/sklearn/metrics/_pairwise_distances_reduction/_argkminlabels.pyx deleted file mode 100644 index 32e0a4d6d0546..0000000000000 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkminlabels.pyx +++ /dev/null @@ -1,679 +0,0 @@ - -from cython cimport floating, integral -from cython.parallel cimport parallel, prange -from libcpp.map cimport map as cmap, pair -from libc.stdlib cimport free - -cimport numpy as cnp - -cnp.import_array() 
- -from ...utils._typedefs cimport ITYPE_t, DTYPE_t -from ...utils._typedefs import ITYPE, DTYPE -from ...utils._sorting cimport simultaneous_sort -import numpy as np -from scipy.sparse import issparse -from sklearn.utils.fixes import threadpool_limits - -cpdef enum WeightingStrategy: - uniform = 0 - distance = 1 - other = 2 -from ._argkmin cimport ArgKmin64, EuclideanArgKmin64 -from ._datasets_pair cimport DatasetsPair64 - -cdef class ArgKminLabels64(ArgKmin64): - """ - 64bit implementation of ArgKminLabel. - """ - cdef: - const ITYPE_t[:] labels, - DTYPE_t[:, :] label_weights - cmap[ITYPE_t, ITYPE_t] labels_to_index - WeightingStrategy weight_type - - @classmethod - def compute( - cls, - X, - Y, - ITYPE_t k, - weights, - labels, - str metric="euclidean", - chunk_size=None, - dict metric_kwargs=None, - str strategy=None, - ): - """Compute the argkmin reduction. - - This classmethod is responsible for introspecting the arguments - values to dispatch to the most appropriate implementation of - :class:`ArgKmin64`. - - This allows decoupling the API entirely from the implementation details - whilst maintaining RAII: all temporarily allocated datastructures necessary - for the concrete implementation are therefore freed when this classmethod - returns. - - No instance should directly be created outside of this class method. - """ - if ( - ( - metric in ("euclidean", "sqeuclidean") - or metric=="minkowski" and (metric_kwargs is None or metric_kwargs.get("p", 2)==2) - ) - and not (issparse(X) ^ issparse(Y)) # "^" is the XOR operator - ): - # Specialized implementation of ArgKminLabels for the Euclidean distance - # for the dense-dense and sparse-sparse cases. - # This implementation computes the distances by chunk using - # a decomposition of the Squared Euclidean distance. 
- # This specialisation has an improved arithmetic intensity for both - # the dense and sparse settings, allowing in most case speed-ups of - # several orders of magnitude compared to the generic ArgKmin - # implementation. - # For more information see MiddleTermComputer. - use_squared_distances = metric == "sqeuclidean" - pda = EuclideanArgKminLabels64( - X=X, Y=Y, k=k, - use_squared_distances=use_squared_distances, - chunk_size=chunk_size, - strategy=strategy, - metric_kwargs=metric_kwargs, - weights=weights, - labels=labels, - ) - else: - # Fall back on a generic implementation that handles most scipy - # metrics by computing the distances between 2 vectors at a time. - pda = ArgKminLabels64( - datasets_pair=DatasetsPair64.get_for(X, Y, metric, metric_kwargs), - k=k, - chunk_size=chunk_size, - strategy=strategy, - weights=weights, - labels=labels, - ) - - # Limit the number of threads in second level of nested parallelism for BLAS - # to avoid threads over-subscription (in GEMM for instance). 
- with threadpool_limits(limits=1, user_api="blas"): - if pda.execute_in_parallel_on_Y: - pda._parallel_on_Y() - else: - pda._parallel_on_X() - - return pda._finalize_results() - - def __init__( - self, - DatasetsPair64 datasets_pair, - const ITYPE_t[:] labels, - chunk_size=None, - strategy=None, - ITYPE_t k=1, - weights=None, - ): - super().__init__( - datasets_pair=datasets_pair, - chunk_size=chunk_size, - strategy=strategy, - k=k, - ) - - if weights == "uniform": - self.weight_type = WeightingStrategy.uniform - elif weights == "distance": - self.weight_type = WeightingStrategy.distance - else: - self.weight_type = WeightingStrategy.other - self.labels = labels - - cdef ITYPE_t[:] unique_labels = np.unique(labels) - - cdef ITYPE_t idx, label - # Map from set of unique labels to their indices in `label_weights` - for idx, label in enumerate(unique_labels): - self.labels_to_index.insert(pair[ITYPE_t, ITYPE_t](label, idx)) - - # Buffer used in building a histogram for one-pass weighted mode - self.label_weights = np.zeros((self.n_samples_X, len(unique_labels)), dtype=DTYPE) - - def _finalize_results(self): - probabilities = np.asarray(self.label_weights) - probabilities /= probabilities.sum(axis=1, keepdims=True) - return probabilities - - cdef inline void weighted_histogram_mode( - self, - ITYPE_t sample_index, - ITYPE_t* indices, - DTYPE_t* distances, - ) nogil: - cdef: - ITYPE_t y_idx, label, label_index, multi_output_index - DTYPE_t label_weight = 1 - - # Iterate through the sample k-nearest neighbours - for jdx in range(self.k): - # Absolute indice of the jdx-th Nearest Neighbors - # in range [0, n_samples_Y) - if self.weight_type == WeightingStrategy.distance: - label_weight = 1 / distances[jdx] - y_idx = indices[jdx] - label = self.labels[y_idx] - label_index = self.labels_to_index[label] - self.label_weights[sample_index][label_index] += label_weight - return - - cdef void _parallel_on_X_prange_iter_finalize( - self, - ITYPE_t thread_num, - ITYPE_t X_start, 
- ITYPE_t X_end, - ) nogil: - cdef: - ITYPE_t idx, sample_index - # Sorting the main heaps portion associated to `X[X_start:X_end]` - # in ascending order w.r.t the distances. - for idx in range(X_end - X_start): - simultaneous_sort( - self.heaps_r_distances_chunks[thread_num] + idx * self.k, - self.heaps_indices_chunks[thread_num] + idx * self.k, - self.k - ) - # One-pass top-one weighted mode - # Compute the absolute index in [0, n_samples_X) - sample_index = X_start + idx - max_label_weight = -1 - self.weighted_histogram_mode( - sample_index, - &self.heaps_indices_chunks[thread_num][0], - &self.heaps_r_distances_chunks[thread_num][0], - ) - return - - cdef void _parallel_on_Y_finalize( - self, - ) nogil: - cdef: - ITYPE_t sample_index, thread_num - - with nogil, parallel(num_threads=self.chunks_n_threads): - # Deallocating temporary datastructures - for thread_num in prange(self.chunks_n_threads, schedule='static'): - free(self.heaps_r_distances_chunks[thread_num]) - free(self.heaps_indices_chunks[thread_num]) - - # Sorting the main in ascending order w.r.t the distances. - # This is done in parallel sample-wise (no need for locks). - for sample_index in prange(self.n_samples_X, schedule='static'): - simultaneous_sort( - &self.argkmin_distances[sample_index, 0], - &self.argkmin_indices[sample_index, 0], - self.k, - ) - self.weighted_histogram_mode( - sample_index, - &self.argkmin_indices[sample_index][0], - &self.argkmin_distances[sample_index][0], - ) - return - -cdef class EuclideanArgKminLabels64(EuclideanArgKmin64): - """ - 64bit implementation of ArgKminLabel. 
- """ - cdef: - const ITYPE_t[:] labels, - DTYPE_t[:, :] label_weights - cmap[ITYPE_t, ITYPE_t] labels_to_index - WeightingStrategy weight_type - - def __init__( - self, - X, - Y, - ITYPE_t k, - bint use_squared_distances=False, - chunk_size=None, - strategy=None, - metric_kwargs=None, - weights=None, - labels=None, - ): - super().__init__( - X=X, Y=Y, k=k, - use_squared_distances=use_squared_distances, - chunk_size=chunk_size, - strategy=strategy, - metric_kwargs=metric_kwargs, - ) - if weights == "uniform": - self.weight_type = WeightingStrategy.uniform - elif weights == "distance": - self.weight_type = WeightingStrategy.distance - else: - self.weight_type = WeightingStrategy.other - self.labels = labels - - cdef ITYPE_t[:] unique_labels = np.unique(labels) - - cdef ITYPE_t idx, label - # Map from set of unique labels to their indices in `label_weights` - for idx, label in enumerate(unique_labels): - self.labels_to_index.insert(pair[ITYPE_t, ITYPE_t](label, idx)) - - # Buffer used in building a histogram for one-pass weighted mode - self.label_weights = np.zeros((self.n_samples_X, len(unique_labels)), dtype=DTYPE) - - def _finalize_results(self): - probabilities = np.asarray(self.label_weights) - probabilities /= probabilities.sum(axis=1, keepdims=True) - return probabilities - - cdef inline void weighted_histogram_mode( - self, - ITYPE_t sample_index, - ITYPE_t* indices, - DTYPE_t* distances, - ) nogil: - cdef: - ITYPE_t y_idx, label, label_index, multi_output_index - DTYPE_t label_weight = 1 - - # Iterate through the sample k-nearest neighbours - for jdx in range(self.k): - # Absolute indice of the jdx-th Nearest Neighbors - # in range [0, n_samples_Y) - if self.weight_type == WeightingStrategy.distance: - label_weight = 1 / distances[jdx] - y_idx = indices[jdx] - label = self.labels[y_idx] - label_index = self.labels_to_index[label] - self.label_weights[sample_index][label_index] += label_weight - return - - cdef void _parallel_on_X_prange_iter_finalize( - 
self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - cdef: - ITYPE_t idx, sample_index - # Sorting the main heaps portion associated to `X[X_start:X_end]` - # in ascending order w.r.t the distances. - for idx in range(X_end - X_start): - simultaneous_sort( - self.heaps_r_distances_chunks[thread_num] + idx * self.k, - self.heaps_indices_chunks[thread_num] + idx * self.k, - self.k - ) - # One-pass top-one weighted mode - # Compute the absolute index in [0, n_samples_X) - sample_index = X_start + idx - max_label_weight = -1 - self.weighted_histogram_mode( - sample_index, - &self.heaps_indices_chunks[thread_num][0], - &self.heaps_r_distances_chunks[thread_num][0], - ) - return - - cdef void _parallel_on_Y_finalize( - self, - ) nogil: - cdef: - ITYPE_t sample_index, thread_num - - with nogil, parallel(num_threads=self.chunks_n_threads): - # Deallocating temporary datastructures - for thread_num in prange(self.chunks_n_threads, schedule='static'): - free(self.heaps_r_distances_chunks[thread_num]) - free(self.heaps_indices_chunks[thread_num]) - - # Sorting the main in ascending order w.r.t the distances. - # This is done in parallel sample-wise (no need for locks). - for sample_index in prange(self.n_samples_X, schedule='static'): - simultaneous_sort( - &self.argkmin_distances[sample_index, 0], - &self.argkmin_indices[sample_index, 0], - self.k, - ) - self.weighted_histogram_mode( - sample_index, - &self.argkmin_indices[sample_index][0], - &self.argkmin_distances[sample_index][0], - ) - return -from ._argkmin cimport ArgKmin32, EuclideanArgKmin32 -from ._datasets_pair cimport DatasetsPair32 - -cdef class ArgKminLabels32(ArgKmin32): - """ - 32bit implementation of ArgKminLabel. 
- """ - cdef: - const ITYPE_t[:] labels, - DTYPE_t[:, :] label_weights - cmap[ITYPE_t, ITYPE_t] labels_to_index - WeightingStrategy weight_type - - @classmethod - def compute( - cls, - X, - Y, - ITYPE_t k, - weights, - labels, - str metric="euclidean", - chunk_size=None, - dict metric_kwargs=None, - str strategy=None, - ): - """Compute the argkmin reduction. - - This classmethod is responsible for introspecting the arguments - values to dispatch to the most appropriate implementation of - :class:`ArgKmin32`. - - This allows decoupling the API entirely from the implementation details - whilst maintaining RAII: all temporarily allocated datastructures necessary - for the concrete implementation are therefore freed when this classmethod - returns. - - No instance should directly be created outside of this class method. - """ - if ( - ( - metric in ("euclidean", "sqeuclidean") - or metric=="minkowski" and (metric_kwargs is None or metric_kwargs.get("p", 2)==2) - ) - and not (issparse(X) ^ issparse(Y)) # "^" is the XOR operator - ): - # Specialized implementation of ArgKminLabels for the Euclidean distance - # for the dense-dense and sparse-sparse cases. - # This implementation computes the distances by chunk using - # a decomposition of the Squared Euclidean distance. - # This specialisation has an improved arithmetic intensity for both - # the dense and sparse settings, allowing in most case speed-ups of - # several orders of magnitude compared to the generic ArgKmin - # implementation. - # For more information see MiddleTermComputer. - use_squared_distances = metric == "sqeuclidean" - pda = EuclideanArgKminLabels32( - X=X, Y=Y, k=k, - use_squared_distances=use_squared_distances, - chunk_size=chunk_size, - strategy=strategy, - metric_kwargs=metric_kwargs, - weights=weights, - labels=labels, - ) - else: - # Fall back on a generic implementation that handles most scipy - # metrics by computing the distances between 2 vectors at a time. 
- pda = ArgKminLabels32( - datasets_pair=DatasetsPair32.get_for(X, Y, metric, metric_kwargs), - k=k, - chunk_size=chunk_size, - strategy=strategy, - weights=weights, - labels=labels, - ) - - # Limit the number of threads in second level of nested parallelism for BLAS - # to avoid threads over-subscription (in GEMM for instance). - with threadpool_limits(limits=1, user_api="blas"): - if pda.execute_in_parallel_on_Y: - pda._parallel_on_Y() - else: - pda._parallel_on_X() - - return pda._finalize_results() - - def __init__( - self, - DatasetsPair32 datasets_pair, - const ITYPE_t[:] labels, - chunk_size=None, - strategy=None, - ITYPE_t k=1, - weights=None, - ): - super().__init__( - datasets_pair=datasets_pair, - chunk_size=chunk_size, - strategy=strategy, - k=k, - ) - - if weights == "uniform": - self.weight_type = WeightingStrategy.uniform - elif weights == "distance": - self.weight_type = WeightingStrategy.distance - else: - self.weight_type = WeightingStrategy.other - self.labels = labels - - cdef ITYPE_t[:] unique_labels = np.unique(labels) - - cdef ITYPE_t idx, label - # Map from set of unique labels to their indices in `label_weights` - for idx, label in enumerate(unique_labels): - self.labels_to_index.insert(pair[ITYPE_t, ITYPE_t](label, idx)) - - # Buffer used in building a histogram for one-pass weighted mode - self.label_weights = np.zeros((self.n_samples_X, len(unique_labels)), dtype=DTYPE) - - def _finalize_results(self): - probabilities = np.asarray(self.label_weights) - probabilities /= probabilities.sum(axis=1, keepdims=True) - return probabilities - - cdef inline void weighted_histogram_mode( - self, - ITYPE_t sample_index, - ITYPE_t* indices, - DTYPE_t* distances, - ) nogil: - cdef: - ITYPE_t y_idx, label, label_index, multi_output_index - DTYPE_t label_weight = 1 - - # Iterate through the sample k-nearest neighbours - for jdx in range(self.k): - # Absolute indice of the jdx-th Nearest Neighbors - # in range [0, n_samples_Y) - if self.weight_type == 
WeightingStrategy.distance: - label_weight = 1 / distances[jdx] - y_idx = indices[jdx] - label = self.labels[y_idx] - label_index = self.labels_to_index[label] - self.label_weights[sample_index][label_index] += label_weight - return - - cdef void _parallel_on_X_prange_iter_finalize( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - cdef: - ITYPE_t idx, sample_index - # Sorting the main heaps portion associated to `X[X_start:X_end]` - # in ascending order w.r.t the distances. - for idx in range(X_end - X_start): - simultaneous_sort( - self.heaps_r_distances_chunks[thread_num] + idx * self.k, - self.heaps_indices_chunks[thread_num] + idx * self.k, - self.k - ) - # One-pass top-one weighted mode - # Compute the absolute index in [0, n_samples_X) - sample_index = X_start + idx - max_label_weight = -1 - self.weighted_histogram_mode( - sample_index, - &self.heaps_indices_chunks[thread_num][0], - &self.heaps_r_distances_chunks[thread_num][0], - ) - return - - cdef void _parallel_on_Y_finalize( - self, - ) nogil: - cdef: - ITYPE_t sample_index, thread_num - - with nogil, parallel(num_threads=self.chunks_n_threads): - # Deallocating temporary datastructures - for thread_num in prange(self.chunks_n_threads, schedule='static'): - free(self.heaps_r_distances_chunks[thread_num]) - free(self.heaps_indices_chunks[thread_num]) - - # Sorting the main in ascending order w.r.t the distances. - # This is done in parallel sample-wise (no need for locks). - for sample_index in prange(self.n_samples_X, schedule='static'): - simultaneous_sort( - &self.argkmin_distances[sample_index, 0], - &self.argkmin_indices[sample_index, 0], - self.k, - ) - self.weighted_histogram_mode( - sample_index, - &self.argkmin_indices[sample_index][0], - &self.argkmin_distances[sample_index][0], - ) - return - -cdef class EuclideanArgKminLabels32(EuclideanArgKmin32): - """ - 32bit implementation of ArgKminLabel. 
- """ - cdef: - const ITYPE_t[:] labels, - DTYPE_t[:, :] label_weights - cmap[ITYPE_t, ITYPE_t] labels_to_index - WeightingStrategy weight_type - - def __init__( - self, - X, - Y, - ITYPE_t k, - bint use_squared_distances=False, - chunk_size=None, - strategy=None, - metric_kwargs=None, - weights=None, - labels=None, - ): - super().__init__( - X=X, Y=Y, k=k, - use_squared_distances=use_squared_distances, - chunk_size=chunk_size, - strategy=strategy, - metric_kwargs=metric_kwargs, - ) - if weights == "uniform": - self.weight_type = WeightingStrategy.uniform - elif weights == "distance": - self.weight_type = WeightingStrategy.distance - else: - self.weight_type = WeightingStrategy.other - self.labels = labels - - cdef ITYPE_t[:] unique_labels = np.unique(labels) - - cdef ITYPE_t idx, label - # Map from set of unique labels to their indices in `label_weights` - for idx, label in enumerate(unique_labels): - self.labels_to_index.insert(pair[ITYPE_t, ITYPE_t](label, idx)) - - # Buffer used in building a histogram for one-pass weighted mode - self.label_weights = np.zeros((self.n_samples_X, len(unique_labels)), dtype=DTYPE) - - def _finalize_results(self): - probabilities = np.asarray(self.label_weights) - probabilities /= probabilities.sum(axis=1, keepdims=True) - return probabilities - - cdef inline void weighted_histogram_mode( - self, - ITYPE_t sample_index, - ITYPE_t* indices, - DTYPE_t* distances, - ) nogil: - cdef: - ITYPE_t y_idx, label, label_index, multi_output_index - DTYPE_t label_weight = 1 - - # Iterate through the sample k-nearest neighbours - for jdx in range(self.k): - # Absolute indice of the jdx-th Nearest Neighbors - # in range [0, n_samples_Y) - if self.weight_type == WeightingStrategy.distance: - label_weight = 1 / distances[jdx] - y_idx = indices[jdx] - label = self.labels[y_idx] - label_index = self.labels_to_index[label] - self.label_weights[sample_index][label_index] += label_weight - return - - cdef void _parallel_on_X_prange_iter_finalize( - 
self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - cdef: - ITYPE_t idx, sample_index - # Sorting the main heaps portion associated to `X[X_start:X_end]` - # in ascending order w.r.t the distances. - for idx in range(X_end - X_start): - simultaneous_sort( - self.heaps_r_distances_chunks[thread_num] + idx * self.k, - self.heaps_indices_chunks[thread_num] + idx * self.k, - self.k - ) - # One-pass top-one weighted mode - # Compute the absolute index in [0, n_samples_X) - sample_index = X_start + idx - max_label_weight = -1 - self.weighted_histogram_mode( - sample_index, - &self.heaps_indices_chunks[thread_num][0], - &self.heaps_r_distances_chunks[thread_num][0], - ) - return - - cdef void _parallel_on_Y_finalize( - self, - ) nogil: - cdef: - ITYPE_t sample_index, thread_num - - with nogil, parallel(num_threads=self.chunks_n_threads): - # Deallocating temporary datastructures - for thread_num in prange(self.chunks_n_threads, schedule='static'): - free(self.heaps_r_distances_chunks[thread_num]) - free(self.heaps_indices_chunks[thread_num]) - - # Sorting the main in ascending order w.r.t the distances. - # This is done in parallel sample-wise (no need for locks). 
- for sample_index in prange(self.n_samples_X, schedule='static'): - simultaneous_sort( - &self.argkmin_distances[sample_index, 0], - &self.argkmin_indices[sample_index, 0], - self.k, - ) - self.weighted_histogram_mode( - sample_index, - &self.argkmin_indices[sample_index][0], - &self.argkmin_distances[sample_index][0], - ) - return diff --git a/sklearn/metrics/_pairwise_distances_reduction/_engines.pxd b/sklearn/metrics/_pairwise_distances_reduction/_engines.pxd deleted file mode 100644 index 33023cdb2a400..0000000000000 --- a/sklearn/metrics/_pairwise_distances_reduction/_engines.pxd +++ /dev/null @@ -1,347 +0,0 @@ -cimport numpy as cnp - -from libcpp.vector cimport vector - -from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t - - -cdef void _middle_term_sparse_sparse_64( - const DTYPE_t[:] X_data, - const SPARSE_INDEX_TYPE_t[:] X_indices, - const SPARSE_INDEX_TYPE_t[:] X_indptr, - ITYPE_t X_start, - ITYPE_t X_end, - const DTYPE_t[:] Y_data, - const SPARSE_INDEX_TYPE_t[:] Y_indices, - const SPARSE_INDEX_TYPE_t[:] Y_indptr, - ITYPE_t Y_start, - ITYPE_t Y_end, - DTYPE_t * D, -) nogil - -cdef class BaseEngine: - - cdef void _parallel_on_X_parallel_init( - self, - ITYPE_t thread_num, - ) nogil - - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil - - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil - - cdef void _parallel_on_X_prange_iter_finalize( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil - - cdef void _parallel_on_X_parallel_finalize( - self, - ITYPE_t thread_num - ) nogil - - cdef void _parallel_on_Y_init( - self, - ) nogil - - cdef void _parallel_on_Y_parallel_init( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil - - cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, 
- ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil - - cdef void _parallel_on_Y_synchronize( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil - - cdef void _compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil - -cdef class EuclideanEngine64(BaseEngine): - cdef: - ITYPE_t effective_n_threads - ITYPE_t chunks_n_threads - ITYPE_t dist_middle_terms_chunks_size - ITYPE_t n_features - ITYPE_t chunk_size - DTYPE_t[::1] X_norm_squared, Y_norm_squared - - # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM - vector[vector[DTYPE_t]] dist_middle_terms_chunks - - - cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil - - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil - - cdef void _parallel_on_Y_init(self) nogil - - cdef DTYPE_t * _compute_dist_middle_terms( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil - - cdef DTYPE_t _compute_pair_distance( - self, - ITYPE_t i, # Index of X sample - ITYPE_t j, # Index of Y sample - ITYPE_t X_start, # Index offset - ITYPE_t Y_start, # Index offset - DTYPE_t * dist_middle_terms, # Array of pre-computeted middle terms - ) nogil - -cdef class DenseDenseEuclideanEngine64(EuclideanEngine64): - cdef: - const DTYPE_t[:, ::1] X - const DTYPE_t[:, ::1] Y - - - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil - - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil - - cdef void _parallel_on_Y_parallel_init( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil - - cdef void 
_parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num - ) nogil - - cdef DTYPE_t * _compute_dist_middle_terms( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil - - -cdef class SparseSparseEuclideanEngine64(EuclideanEngine64): - cdef: - const DTYPE_t[:] X_data - const SPARSE_INDEX_TYPE_t[:] X_indices - const SPARSE_INDEX_TYPE_t[:] X_indptr - - const DTYPE_t[:] Y_data - const SPARSE_INDEX_TYPE_t[:] Y_indices - const SPARSE_INDEX_TYPE_t[:] Y_indptr - - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num - ) nogil - - cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num - ) nogil - - cdef DTYPE_t * _compute_dist_middle_terms( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil - -cdef class EuclideanEngine32(BaseEngine): - cdef: - ITYPE_t effective_n_threads - ITYPE_t chunks_n_threads - ITYPE_t dist_middle_terms_chunks_size - ITYPE_t n_features - ITYPE_t chunk_size - DTYPE_t[::1] X_norm_squared, Y_norm_squared - - # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM - vector[vector[DTYPE_t]] dist_middle_terms_chunks - - - cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil - - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil - - cdef void _parallel_on_Y_init(self) nogil - - cdef DTYPE_t * _compute_dist_middle_terms( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil - - cdef DTYPE_t _compute_pair_distance( - self, - ITYPE_t i, # Index of X sample - ITYPE_t j, # 
Index of Y sample - ITYPE_t X_start, # Index offset - ITYPE_t Y_start, # Index offset - DTYPE_t * dist_middle_terms, # Array of pre-computeted middle terms - ) nogil - -cdef class DenseDenseEuclideanEngine32(EuclideanEngine32): - cdef: - const cnp.float32_t[:, ::1] X - const cnp.float32_t[:, ::1] Y - - # Buffers for upcasting chunks of X and Y from 32bit to 64bit - vector[vector[DTYPE_t]] X_c_upcast - vector[vector[DTYPE_t]] Y_c_upcast - - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil - - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil - - cdef void _parallel_on_Y_parallel_init( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil - - cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num - ) nogil - - cdef DTYPE_t * _compute_dist_middle_terms( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil - - -cdef class SparseSparseEuclideanEngine32(EuclideanEngine32): - cdef: - const DTYPE_t[:] X_data - const SPARSE_INDEX_TYPE_t[:] X_indices - const SPARSE_INDEX_TYPE_t[:] X_indptr - - const DTYPE_t[:] Y_data - const SPARSE_INDEX_TYPE_t[:] Y_indices - const SPARSE_INDEX_TYPE_t[:] Y_indptr - - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num - ) nogil - - cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num - ) nogil - - cdef DTYPE_t * _compute_dist_middle_terms( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t 
thread_num, - ) nogil diff --git a/sklearn/metrics/_pairwise_distances_reduction/_engines.pyx b/sklearn/metrics/_pairwise_distances_reduction/_engines.pyx deleted file mode 100644 index 5e1fe8cb457b3..0000000000000 --- a/sklearn/metrics/_pairwise_distances_reduction/_engines.pyx +++ /dev/null @@ -1,940 +0,0 @@ -cimport numpy as cnp - -from libcpp.vector cimport vector - -from ...utils._cython_blas cimport ( - BLAS_Order, - BLAS_Trans, - NoTrans, - RowMajor, - Trans, - _gemm, -) -from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t - -# TODO: change for `libcpp.algorithm.fill` once Cython 3 is used -# Introduction in Cython: -# -# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L50 #noqa -cdef extern from "" namespace "std" nogil: - void fill[Iter, T](Iter first, Iter last, const T& value) except + #noqa - -import numpy as np -from scipy.sparse import issparse, csr_matrix -from ...utils._typedefs import DTYPE, SPARSE_INDEX_TYPE -from ...utils import check_array - -cdef class BaseEngine: - def __init__(self): - return - - cdef void _parallel_on_X_parallel_init( - self, - ITYPE_t thread_num, - ) nogil: - return - - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - return - - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - return - - cdef void _parallel_on_X_prange_iter_finalize( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - return - - cdef void _parallel_on_X_parallel_finalize( - self, - ITYPE_t thread_num - ) nogil: - return - - cdef void _parallel_on_Y_init( - self, - ) nogil: - return - - cdef void _parallel_on_Y_parallel_init( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - return - - cdef void 
_parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - return - - cdef void _parallel_on_Y_synchronize( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - return - - cdef void _compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - return - -# TODO: If possible optimize this routine to efficiently treat cases where -# `n_samples_X << n_samples_Y` met in practise when X_test consists of a -# few samples, and thus when there's a single chunk of X whose number of -# samples is less that the default chunk size. - -# TODO: compare this routine with the similar ones in SciPy, especially -# `csr_matmat` which might implement a better algorithm. -# See: https://github.com/scipy/scipy/blob/e58292e066ba2cb2f3d1e0563ca9314ff1f4f311/scipy/sparse/sparsetools/csr.h#L603-L669 # noqa -cdef void _middle_term_sparse_sparse_64( - const DTYPE_t[:] X_data, - const SPARSE_INDEX_TYPE_t[:] X_indices, - const SPARSE_INDEX_TYPE_t[:] X_indptr, - ITYPE_t X_start, - ITYPE_t X_end, - const DTYPE_t[:] Y_data, - const SPARSE_INDEX_TYPE_t[:] Y_indices, - const SPARSE_INDEX_TYPE_t[:] Y_indptr, - ITYPE_t Y_start, - ITYPE_t Y_end, - DTYPE_t * D, -) nogil: - # This routine assumes that D points to the first element of a - # zeroed buffer of length at least equal to n_X × n_Y, conceptually - # representing a 2-d C-ordered array. 
- cdef: - ITYPE_t i, j, k - ITYPE_t n_X = X_end - X_start - ITYPE_t n_Y = Y_end - Y_start - ITYPE_t X_i_col_idx, X_i_ptr, Y_j_col_idx, Y_j_ptr - - for i in range(n_X): - for X_i_ptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]): - X_i_col_idx = X_indices[X_i_ptr] - for j in range(n_Y): - k = i * n_Y + j - for Y_j_ptr in range(Y_indptr[Y_start+j], Y_indptr[Y_start+j+1]): - Y_j_col_idx = Y_indices[Y_j_ptr] - if X_i_col_idx == Y_j_col_idx: - D[k] += -2 * X_data[X_i_ptr] * Y_data[Y_j_ptr] - - -from ._base cimport _sqeuclidean_row_norms64 - -cdef class EuclideanEngine64(BaseEngine): - """Helper class to compute a Euclidean distance matrix in chunks. - - This is an abstract base class that is further specialized depending - on the type of data (dense or sparse). - - `EuclideanDistance` subclasses relies on the squared Euclidean - distances between chunks of vectors X_c and Y_c using the - following decomposition for the (i,j) pair : - - - ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² - - - This helper class is in charge of wrapping the common logic to compute - the middle term, i.e. `- 2 X_c_i.Y_c_j^T`. - """ - - @classmethod - def get_for( - cls, - X, - Y, - pda, - ) -> EuclideanEngine64: - """Return the DatasetsPair implementation for the given arguments. - - Parameters - ---------- - X : ndarray or CSR sparse matrix of shape (n_samples_X, n_features) - Input data. - If provided as a ndarray, it must be C-contiguous. - - Y : ndarray or CSR sparse matrix of shape (n_samples_Y, n_features) - Input data. - If provided as a ndarray, it must be C-contiguous. - - Returns - ------- - engine: EuclideanEngine64 - The suited EuclideanEngine64 implementation. 
- """ - X_is_sparse = issparse(X) - Y_is_sparse = issparse(Y) - dist_middle_terms_chunks_size = pda.Y_n_samples_chunk * pda.X_n_samples_chunk - if not X_is_sparse and not Y_is_sparse: - return DenseDenseEuclideanEngine64( - X, - Y, - effective_n_threads=pda.effective_n_threads, - chunks_n_threads=pda.chunks_n_threads, - dist_middle_terms_chunks_size=dist_middle_terms_chunks_size, - chunk_size=pda.chunk_size, - metric_kwargs=pda.metric_kwargs, - ) - if X_is_sparse and Y_is_sparse: - return SparseSparseEuclideanEngine64( - X, - Y, - effective_n_threads=pda.effective_n_threads, - chunks_n_threads=pda.chunks_n_threads, - dist_middle_terms_chunks_size=dist_middle_terms_chunks_size, - chunk_size=pda.chunk_size, - metric_kwargs=pda.metric_kwargs, - ) - - raise NotImplementedError( - "X and Y must be both CSR sparse matrices or both numpy arrays." - ) - - - @classmethod - def unpack_csr_matrix(cls, X: csr_matrix): - """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE.""" - X_data = np.asarray(X.data, dtype=DTYPE) - X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE) - X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE) - return X_data, X_indices, X_indptr - - def __init__( - self, - X, - Y, - ITYPE_t effective_n_threads, - ITYPE_t chunks_n_threads, - ITYPE_t dist_middle_terms_chunks_size, - ITYPE_t chunk_size, - dict metric_kwargs=None, - ): - self.effective_n_threads = effective_n_threads - self.chunks_n_threads = chunks_n_threads - self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size - self.n_features = X.shape[1] - self.chunk_size = chunk_size - - self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads) - - if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: - self.Y_norm_squared = check_array( - metric_kwargs.pop("Y_norm_squared"), - ensure_2d=False, - input_name="Y_norm_squared", - dtype=np.float64, - ) - else: - self.Y_norm_squared = _sqeuclidean_row_norms64( - Y, - 
self.effective_n_threads, - ) - - if metric_kwargs is not None and "X_norm_squared" in metric_kwargs: - self.X_norm_squared = check_array( - metric_kwargs.pop("X_norm_squared"), - ensure_2d=False, - input_name="X_norm_squared", - dtype=np.float64, - ) - else: - # Do not recompute norms if datasets are identical. - self.X_norm_squared = ( - self.Y_norm_squared if X is Y else - _sqeuclidean_row_norms64( - X, - self.effective_n_threads, - ) - ) - - cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil: - self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size) - - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - return - - cdef void _parallel_on_Y_init(self) nogil: - for thread_num in range(self.chunks_n_threads): - self.dist_middle_terms_chunks[thread_num].resize( - self.dist_middle_terms_chunks_size - ) - - cdef DTYPE_t * _compute_dist_middle_terms( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - return NULL - - cdef DTYPE_t _compute_pair_distance( - self, - ITYPE_t i, # Index of X sample - ITYPE_t j, # Index of Y sample - ITYPE_t X_start, # Index offset - ITYPE_t Y_start, # Index offset - DTYPE_t * dist_middle_terms, # Array of pre-computeted middle terms - ) nogil: - - cdef ITYPE_t n_Y = len(self.Y_norm_squared) - # Index of middle term - cdef ITYPE_t k = n_Y * i + j - cdef DTYPE_t val = ( - self.X_norm_squared[i + X_start] + - dist_middle_terms[i * n_Y + j] + - self.Y_norm_squared[j + Y_start] - ) - # Catastrophic cancellation might cause -0. to be present, - # e.g. when computing d(x_i, y_i) when X is Y. - return max(0., val) - - -cdef class DenseDenseEuclideanEngine64(EuclideanEngine64): - """Computes the middle term of the Euclidean distance between two chunked dense matrices - X_c and Y_c. 
- - dist_middle_terms = - 2 X_c_i.Y_c_j^T - - This class use the BLAS gemm routine to perform the dot product of each chunks - of the distance matrix with improved arithmetic intensity and vector instruction (SIMD). - """ - - def __init__( - self, - const DTYPE_t[:, ::1] X, - const DTYPE_t[:, ::1] Y, - ITYPE_t effective_n_threads, - ITYPE_t chunks_n_threads, - ITYPE_t dist_middle_terms_chunks_size, - ITYPE_t n_features, - ITYPE_t chunk_size, - dict metric_kwargs=None, - ): - super().__init__( - X, Y, - effective_n_threads, - chunks_n_threads, - dist_middle_terms_chunks_size, - n_features, - chunk_size, - metric_kwargs=None, - ) - self.X = X - self.Y = Y - - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - return - - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - return - - cdef void _parallel_on_Y_parallel_init( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - return - - cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num - ) nogil: - return - - cdef DTYPE_t * _compute_dist_middle_terms( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - cdef: - DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data() - - # Careful: LDA, LDB and LDC are given for F-ordered arrays - # in BLAS documentations, for instance: - # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa - # - # Here, we use their counterpart values to work with C-ordered arrays. 
- BLAS_Order order = RowMajor - BLAS_Trans ta = NoTrans - BLAS_Trans tb = Trans - ITYPE_t m = X_end - X_start - ITYPE_t n = Y_end - Y_start - ITYPE_t K = self.n_features - DTYPE_t alpha = - 2. - # Casting for A and B to remove the const is needed because APIs exposed via - # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier. - # See: https://github.com/scipy/scipy/issues/14262 - DTYPE_t * A = &self.X[X_start, 0] - DTYPE_t * B = &self.Y[Y_start, 0] - ITYPE_t lda = self.n_features - ITYPE_t ldb = self.n_features - DTYPE_t beta = 0. - ITYPE_t ldc = Y_end - Y_start - - # dist_middle_terms = `-2 * X[X_start:X_end] @ Y[Y_start:Y_end].T` - _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc) - - return dist_middle_terms - - -cdef class SparseSparseEuclideanEngine64(EuclideanEngine64): - """Middle term of the Euclidean distance between two chunked CSR matrices. - - The result is return as a contiguous array. - - dist_middle_terms = - 2 X_c_i.Y_c_j^T - - The logic of the computation is wrapped in the routine _middle_term_sparse_sparse_64. - This routine iterates over the data, indices and indptr arrays of the sparse matrices without - densifying them. 
- """ - - def __init__( - self, - X, - Y, - ITYPE_t effective_n_threads, - ITYPE_t chunks_n_threads, - ITYPE_t dist_middle_terms_chunks_size, - ITYPE_t n_features, - ITYPE_t chunk_size, - ): - super().__init__( - X, Y, - effective_n_threads, - chunks_n_threads, - dist_middle_terms_chunks_size, - n_features, - chunk_size, - metric_kwargs=None, - ) - self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) - self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y) - - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - # Flush the thread dist_middle_terms_chunks to 0.0 - fill( - self.dist_middle_terms_chunks[thread_num].begin(), - self.dist_middle_terms_chunks[thread_num].end(), - 0.0, - ) - - cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - # Flush the thread dist_middle_terms_chunks to 0.0 - fill( - self.dist_middle_terms_chunks[thread_num].begin(), - self.dist_middle_terms_chunks[thread_num].end(), - 0.0, - ) - - cdef DTYPE_t * _compute_dist_middle_terms( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - cdef: - DTYPE_t *dist_middle_terms = ( - self.dist_middle_terms_chunks[thread_num].data() - ) - - _middle_term_sparse_sparse_64( - self.X_data, - self.X_indices, - self.X_indptr, - X_start, - X_end, - self.Y_data, - self.Y_indices, - self.Y_indptr, - Y_start, - Y_end, - dist_middle_terms, - ) - - return dist_middle_terms - -from ._base cimport _sqeuclidean_row_norms32 - -cdef class EuclideanEngine32(BaseEngine): - """Helper class to compute a Euclidean distance matrix in chunks. - - This is an abstract base class that is further specialized depending - on the type of data (dense or sparse). 
- - `EuclideanDistance` subclasses relies on the squared Euclidean - distances between chunks of vectors X_c and Y_c using the - following decomposition for the (i,j) pair : - - - ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² - - - This helper class is in charge of wrapping the common logic to compute - the middle term, i.e. `- 2 X_c_i.Y_c_j^T`. - """ - - @classmethod - def get_for( - cls, - X, - Y, - pda, - ) -> EuclideanEngine32: - """Return the DatasetsPair implementation for the given arguments. - - Parameters - ---------- - X : ndarray or CSR sparse matrix of shape (n_samples_X, n_features) - Input data. - If provided as a ndarray, it must be C-contiguous. - - Y : ndarray or CSR sparse matrix of shape (n_samples_Y, n_features) - Input data. - If provided as a ndarray, it must be C-contiguous. - - Returns - ------- - engine: EuclideanEngine32 - The suited EuclideanEngine32 implementation. - """ - X_is_sparse = issparse(X) - Y_is_sparse = issparse(Y) - dist_middle_terms_chunks_size = pda.Y_n_samples_chunk * pda.X_n_samples_chunk - if not X_is_sparse and not Y_is_sparse: - return DenseDenseEuclideanEngine32( - X, - Y, - effective_n_threads=pda.effective_n_threads, - chunks_n_threads=pda.chunks_n_threads, - dist_middle_terms_chunks_size=dist_middle_terms_chunks_size, - chunk_size=pda.chunk_size, - metric_kwargs=pda.metric_kwargs, - ) - if X_is_sparse and Y_is_sparse: - return SparseSparseEuclideanEngine32( - X, - Y, - effective_n_threads=pda.effective_n_threads, - chunks_n_threads=pda.chunks_n_threads, - dist_middle_terms_chunks_size=dist_middle_terms_chunks_size, - chunk_size=pda.chunk_size, - metric_kwargs=pda.metric_kwargs, - ) - - raise NotImplementedError( - "X and Y must be both CSR sparse matrices or both numpy arrays." 
- ) - - - @classmethod - def unpack_csr_matrix(cls, X: csr_matrix): - """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE.""" - X_data = np.asarray(X.data, dtype=DTYPE) - X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE) - X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE) - return X_data, X_indices, X_indptr - - def __init__( - self, - X, - Y, - ITYPE_t effective_n_threads, - ITYPE_t chunks_n_threads, - ITYPE_t dist_middle_terms_chunks_size, - ITYPE_t chunk_size, - dict metric_kwargs=None, - ): - self.effective_n_threads = effective_n_threads - self.chunks_n_threads = chunks_n_threads - self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size - self.n_features = X.shape[1] - self.chunk_size = chunk_size - - self.dist_middle_terms_chunks = vector[vector[DTYPE_t]](self.effective_n_threads) - - if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: - self.Y_norm_squared = check_array( - metric_kwargs.pop("Y_norm_squared"), - ensure_2d=False, - input_name="Y_norm_squared", - dtype=np.float64, - ) - else: - self.Y_norm_squared = _sqeuclidean_row_norms32( - Y, - self.effective_n_threads, - ) - - if metric_kwargs is not None and "X_norm_squared" in metric_kwargs: - self.X_norm_squared = check_array( - metric_kwargs.pop("X_norm_squared"), - ensure_2d=False, - input_name="X_norm_squared", - dtype=np.float64, - ) - else: - # Do not recompute norms if datasets are identical. 
- self.X_norm_squared = ( - self.Y_norm_squared if X is Y else - _sqeuclidean_row_norms32( - X, - self.effective_n_threads, - ) - ) - - cdef void _parallel_on_X_parallel_init(self, ITYPE_t thread_num) nogil: - self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size) - - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - return - - cdef void _parallel_on_Y_init(self) nogil: - for thread_num in range(self.chunks_n_threads): - self.dist_middle_terms_chunks[thread_num].resize( - self.dist_middle_terms_chunks_size - ) - - cdef DTYPE_t * _compute_dist_middle_terms( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - return NULL - - cdef DTYPE_t _compute_pair_distance( - self, - ITYPE_t i, # Index of X sample - ITYPE_t j, # Index of Y sample - ITYPE_t X_start, # Index offset - ITYPE_t Y_start, # Index offset - DTYPE_t * dist_middle_terms, # Array of pre-computeted middle terms - ) nogil: - - cdef ITYPE_t n_Y = len(self.Y_norm_squared) - # Index of middle term - cdef ITYPE_t k = n_Y * i + j - cdef DTYPE_t val = ( - self.X_norm_squared[i + X_start] + - dist_middle_terms[i * n_Y + j] + - self.Y_norm_squared[j + Y_start] - ) - # Catastrophic cancellation might cause -0. to be present, - # e.g. when computing d(x_i, y_i) when X is Y. - return max(0., val) - - -cdef class DenseDenseEuclideanEngine32(EuclideanEngine32): - """Computes the middle term of the Euclidean distance between two chunked dense matrices - X_c and Y_c. - - dist_middle_terms = - 2 X_c_i.Y_c_j^T - - This class use the BLAS gemm routine to perform the dot product of each chunks - of the distance matrix with improved arithmetic intensity and vector instruction (SIMD). 
- """ - - def __init__( - self, - const cnp.float32_t[:, ::1] X, - const cnp.float32_t[:, ::1] Y, - ITYPE_t effective_n_threads, - ITYPE_t chunks_n_threads, - ITYPE_t dist_middle_terms_chunks_size, - ITYPE_t n_features, - ITYPE_t chunk_size, - dict metric_kwargs=None, - ): - super().__init__( - X, Y, - effective_n_threads, - chunks_n_threads, - dist_middle_terms_chunks_size, - n_features, - chunk_size, - metric_kwargs=None, - ) - self.X = X - self.Y = Y - # We populate the buffer for upcasting chunks of X and Y from float32 to float64. - self.X_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads) - self.Y_c_upcast = vector[vector[DTYPE_t]](self.effective_n_threads) - - upcast_buffer_n_elements = self.chunk_size * n_features - - for thread_num in range(self.effective_n_threads): - self.X_c_upcast[thread_num].resize(upcast_buffer_n_elements) - self.Y_c_upcast[thread_num].resize(upcast_buffer_n_elements) - - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - cdef: - ITYPE_t i, j - ITYPE_t n_chunk_samples = Y_end - Y_start - - # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64 - for i in range(n_chunk_samples): - for j in range(self.n_features): - self.Y_c_upcast[thread_num][i * self.n_features + j] = self.Y[Y_start + i, j] - - cdef void _parallel_on_X_init_chunk( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - cdef: - ITYPE_t i, j - ITYPE_t n_chunk_samples = X_end - X_start - - # Upcasting X_c=X[X_start:X_end, :] from float32 to float64 - for i in range(n_chunk_samples): - for j in range(self.n_features): - self.X_c_upcast[thread_num][i * self.n_features + j] = self.X[X_start + i, j] - - cdef void _parallel_on_Y_parallel_init( - self, - ITYPE_t thread_num, - ITYPE_t X_start, - ITYPE_t X_end, - ) nogil: - cdef: - ITYPE_t i, j - ITYPE_t n_chunk_samples = X_end - X_start - - # Upcasting 
X_c=X[X_start:X_end, :] from float32 to float64 - for i in range(n_chunk_samples): - for j in range(self.n_features): - self.X_c_upcast[thread_num][i * self.n_features + j] = self.X[X_start + i, j] - - cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num - ) nogil: - cdef: - ITYPE_t i, j - ITYPE_t n_chunk_samples = Y_end - Y_start - - # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64 - for i in range(n_chunk_samples): - for j in range(self.n_features): - self.Y_c_upcast[thread_num][i * self.n_features + j] = self.Y[Y_start + i, j] - - cdef DTYPE_t * _compute_dist_middle_terms( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - cdef: - DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data() - - # Careful: LDA, LDB and LDC are given for F-ordered arrays - # in BLAS documentations, for instance: - # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa - # - # Here, we use their counterpart values to work with C-ordered arrays. - BLAS_Order order = RowMajor - BLAS_Trans ta = NoTrans - BLAS_Trans tb = Trans - ITYPE_t m = X_end - X_start - ITYPE_t n = Y_end - Y_start - ITYPE_t K = self.n_features - DTYPE_t alpha = - 2. - DTYPE_t * A = self.X_c_upcast[thread_num].data() - DTYPE_t * B = self.Y_c_upcast[thread_num].data() - ITYPE_t lda = self.n_features - ITYPE_t ldb = self.n_features - DTYPE_t beta = 0. - ITYPE_t ldc = Y_end - Y_start - - # dist_middle_terms = `-2 * X[X_start:X_end] @ Y[Y_start:Y_end].T` - _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc) - - return dist_middle_terms - - -cdef class SparseSparseEuclideanEngine32(EuclideanEngine32): - """Middle term of the Euclidean distance between two chunked CSR matrices. 
- - The result is return as a contiguous array. - - dist_middle_terms = - 2 X_c_i.Y_c_j^T - - The logic of the computation is wrapped in the routine _middle_term_sparse_sparse_64. - This routine iterates over the data, indices and indptr arrays of the sparse matrices without - densifying them. - """ - - def __init__( - self, - X, - Y, - ITYPE_t effective_n_threads, - ITYPE_t chunks_n_threads, - ITYPE_t dist_middle_terms_chunks_size, - ITYPE_t n_features, - ITYPE_t chunk_size, - ): - super().__init__( - X, Y, - effective_n_threads, - chunks_n_threads, - dist_middle_terms_chunks_size, - n_features, - chunk_size, - metric_kwargs=None, - ) - self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) - self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y) - - cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - # Flush the thread dist_middle_terms_chunks to 0.0 - fill( - self.dist_middle_terms_chunks[thread_num].begin(), - self.dist_middle_terms_chunks[thread_num].end(), - 0.0, - ) - - cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - # Flush the thread dist_middle_terms_chunks to 0.0 - fill( - self.dist_middle_terms_chunks[thread_num].begin(), - self.dist_middle_terms_chunks[thread_num].end(), - 0.0, - ) - - cdef DTYPE_t * _compute_dist_middle_terms( - self, - ITYPE_t X_start, - ITYPE_t X_end, - ITYPE_t Y_start, - ITYPE_t Y_end, - ITYPE_t thread_num, - ) nogil: - cdef: - DTYPE_t *dist_middle_terms = ( - self.dist_middle_terms_chunks[thread_num].data() - ) - - _middle_term_sparse_sparse_64( - self.X_data, - self.X_indices, - self.X_indptr, - X_start, - X_end, - self.Y_data, - self.Y_indices, - self.Y_indptr, - Y_start, - Y_end, - dist_middle_terms, - ) - - return dist_middle_terms 
From 6bcc83d249d957cb781844b3f350530247107b61 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Mon, 19 Dec 2022 13:06:52 -0500 Subject: [PATCH 22/25] Removed excess files and removed prange --- sklearn/cluster/_hdbscan/_reachability.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_reachability.pyx b/sklearn/cluster/_hdbscan/_reachability.pyx index efc641df29e19..dc4263694f89a 100644 --- a/sklearn/cluster/_hdbscan/_reachability.pyx +++ b/sklearn/cluster/_hdbscan/_reachability.pyx @@ -10,7 +10,6 @@ cimport numpy as cnp import numpy as np from scipy.sparse import issparse from cython cimport floating, integral -from cython.parallel cimport prange from libc.math cimport isfinite, INFINITY cnp.import_array() @@ -111,7 +110,9 @@ def _dense_mutual_reachability_graph( ) with nogil: - for i in prange(n_samples): + # TODO: Update w/ prange with thread count based on + # _openmp_effective_n_threads + for i in range(n_samples): for j in range(n_samples): mutual_reachibility_distance = max( core_distances[i], From 9715b94242e738eeb4f60546ee38834fe0dccbc0 Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Fri, 3 Feb 2023 19:50:01 -0500 Subject: [PATCH 23/25] Update sklearn/cluster/_hdbscan/hdbscan.py Co-authored-by: Thomas J. 
Fan --- sklearn/cluster/_hdbscan/hdbscan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 7ec7ad56cd177..e7b22aa7aca97 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -137,7 +137,7 @@ def _hdbscan_brute( distance_matrix /= alpha max_distance = metric_params.get("max_distance", 0.0) - if isspmatrix_csr(distance_matrix): + if issparse(distance_matrix) and distance_matrix.format != "csr": # we need CSR format to avoid a conversion in `_brute_mst` when calling # `csgraph.connected_components` distance_matrix = distance_matrix.tocsr() From 4667952548fea4f8777f77ed4dc147885ee22af7 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Fri, 3 Feb 2023 19:55:45 -0500 Subject: [PATCH 24/25] Updated test to include value errors on asymmetric distance matrices --- sklearn/cluster/tests/test_hdbscan.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index 65f9829ffef5c..0a7704bcd85cb 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -78,6 +78,17 @@ def test_hdbscan_distance_matrix(): score = fowlkes_mallows_score(y, labels) assert score >= 0.98 + msg = r"The precomputed distance matrix.*has shape" + with pytest.raises(ValueError, match=msg): + HDBSCAN(metric="precomputed", copy=True).fit_predict(X) + + msg = r"The precomputed distance matrix.*values" + # Ensure the matrix is not symmetric + D[0, 1] = 10 + D[1, 0] = 1 + with pytest.raises(ValueError, match=msg): + HDBSCAN(metric="precomputed").fit_predict(D) + def test_hdbscan_sparse_distance_matrix(): D = distance.squareform(distance.pdist(X)) From ad83829c62957e2409a5efde7a3868c61b82d4f6 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Mon, 13 Feb 2023 18:13:09 -0500 Subject: [PATCH 25/25] Updated sparse distance matrix test ---
sklearn/cluster/tests/test_hdbscan.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index 0a7704bcd85cb..36fe8e5a6158c 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -90,14 +90,15 @@ def test_hdbscan_distance_matrix(): HDBSCAN(metric="precomputed").fit_predict(D) -def test_hdbscan_sparse_distance_matrix(): +@pytest.mark.parametrize("sparse_constructor", [sparse.csr_matrix, sparse.csc_matrix]) +def test_hdbscan_sparse_distance_matrix(sparse_constructor): D = distance.squareform(distance.pdist(X)) D /= np.max(D) threshold = stats.scoreatpercentile(D.flatten(), 50) D[D >= threshold] = 0.0 - D = sparse.csr_matrix(D) + D = sparse_constructor(D) D.eliminate_zeros() labels = HDBSCAN(metric="precomputed").fit_predict(D)