-
Notifications
You must be signed in to change notification settings - Fork 0
MAINT additional cleaning in reachability.pyx #5
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
0ab847c
b4c1660
d6a59a5
e09ece7
1cb0db8
8a38591
41cb21e
c510bf8
85c1914
9ba964d
0c65f8c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,41 +1,59 @@ | ||
| # mutual reachability distance compiutations | ||
| # mutual reachability distance computations | ||
| # Authors: Leland McInnes <[email protected]> | ||
| # Meekail Zain <[email protected]> | ||
| # Guillaume Lemaitre <[email protected]> | ||
| # License: 3-clause BSD | ||
|
|
||
| import numpy as np | ||
| from scipy.sparse import issparse | ||
| from ...neighbors import BallTree, KDTree | ||
|
|
||
| cimport cython | ||
| from cython cimport floating | ||
| cimport numpy as cnp | ||
| from cython.parallel cimport prange | ||
| from libc.math cimport isfinite | ||
| from libc.math cimport isfinite, INFINITY | ||
|
|
||
| cnp.import_array() | ||
|
|
||
| def mutual_reachability(distance_matrix, min_points=5, max_dist=0.0): | ||
| """Compute the weighted adjacency matrix of the mutual reachability | ||
| graph of a distance matrix. Note that computation is performed in-place for | ||
| `distance_matrix`. If out-of-place computation is required, pass a copy to | ||
| this function. | ||
| ctypedef fused integral: | ||
| int | ||
| long long | ||
|
|
||
|
|
||
| def mutual_reachability_graph( | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Since we build the graph, I prefer to make it explicit. |
||
| distance_matrix, min_samples=5, max_distance=0.0 | ||
| ): | ||
| """Compute the weighted adjacency matrix of the mutual reachability graph. | ||
|
|
||
| The mutual reachability distance used to build the graph is defined as:: | ||
|
|
||
| max(d_core(x_p), d_core(x_q), d(x_p, x_q)) | ||
|
|
||
| and the core distance `d_core` is defined as the distance between a point | ||
| `x_p` and its k-th nearest neighbor. | ||
|
|
||
| Note that all computations are done in-place. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| distance_matrix : ndarray or sparse matrix of shape (n_samples, n_samples) | ||
| distance_matrix : {ndarray, sparse matrix} of shape (n_samples, n_samples) | ||
| Array of distances between samples. If sparse, the array must be in | ||
| `LIL` format. | ||
| `CSR` format. | ||
|
|
||
| min_points : int, default=5 | ||
| min_samples : int, default=5 | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Change the name to be consistent with the high-level function and DBSCAN as well. |
||
| The number of points in a neighbourhood for a point to be considered | ||
| a core point. | ||
|
|
||
| max_dist : float, default=0.0 | ||
| max_distance : float, default=0.0 | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think that we can be explicit regarding the naming. |
||
| The distance which `np.inf` is replaced with. When the true mutual- | ||
| reachability distance is measured to be infinite, it is instead | ||
| truncated to `max_dist`. | ||
| truncated to `max_distance`. Only used when `distance_matrix` is a sparse | ||
| matrix. | ||
|
|
||
| Returns | ||
| ------- | ||
| mututal_reachability: ndarray of shape (n_samples, n_samples) | ||
| mutual_reachability_graph: {ndarray, sparse matrix} of shape \ | ||
| (n_samples, n_samples) | ||
| Weighted adjacency matrix of the mutual reachability graph. | ||
|
|
||
| References | ||
|
|
@@ -45,78 +63,125 @@ def mutual_reachability(distance_matrix, min_points=5, max_dist=0.0): | |
| In Pacific-Asia Conference on Knowledge Discovery and Data Mining | ||
| (pp. 160-172). Springer Berlin Heidelberg. | ||
| """ | ||
| # Account for index offset | ||
| min_points -= 1 | ||
|
|
||
| # Note that in both routines `distance_matrix` is operated on in-place. At | ||
| # this point, if out-of-place operation is desired then this function | ||
| # should have been passed a copy. | ||
| further_neighbor_idx = min_samples - 1 | ||
| if issparse(distance_matrix): | ||
| return _sparse_mutual_reachability( | ||
| distance_matrix, | ||
| min_points=min_points, | ||
| max_dist=max_dist | ||
| ).tocsr() | ||
| if distance_matrix.format != "csr": | ||
| raise ValueError( | ||
| "Only sparse CSR matrices are supported for `distance_matrix`." | ||
| ) | ||
| _sparse_mutual_reachability_graph( | ||
| distance_matrix.data, | ||
| distance_matrix.indices, | ||
| distance_matrix.indptr, | ||
| distance_matrix.shape[0], | ||
| further_neighbor_idx=further_neighbor_idx, | ||
| max_distance=max_distance, | ||
| ) | ||
| else: | ||
| _dense_mutual_reachability_graph( | ||
| distance_matrix, further_neighbor_idx=further_neighbor_idx | ||
| ) | ||
| return distance_matrix | ||
|
|
||
| return _dense_mutual_reachability(distance_matrix, min_points=min_points) | ||
|
|
||
| cdef _dense_mutual_reachability( | ||
| cnp.ndarray[dtype=cnp.float64_t, ndim=2] distance_matrix, | ||
| cnp.intp_t min_points=5 | ||
| def _dense_mutual_reachability_graph( | ||
| cnp.ndarray[dtype=floating, ndim=2] distance_matrix, | ||
| cnp.intp_t further_neighbor_idx, | ||
| ): | ||
| cdef cnp.intp_t i, j, n_samples = distance_matrix.shape[0] | ||
| cdef cnp.float64_t mr_dist | ||
| cdef cnp.float64_t[:] core_distances | ||
| """Dense implementation of mutual reachability graph. | ||
|
|
||
| The computation is done in-place, i.e. the distance matrix is modified | ||
| directly. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| distance_matrix : ndarray of shape (n_samples, n_samples) | ||
| Array of distances between samples. | ||
|
|
||
| # Compute the core distances for all samples `x_p` corresponding | ||
| # to the distance of the k-th farthest neighbours (including | ||
| # `x_p`). | ||
| further_neighbor_idx : int | ||
| The index of the furthest neighbor to use to define the core distances. | ||
| """ | ||
| cdef: | ||
| cnp.intp_t i, j, n_samples = distance_matrix.shape[0] | ||
| floating mutual_reachibility_distance | ||
| floating[:] core_distances | ||
|
|
||
| # We assume that the distance matrix is symmetric. We choose to sort every | ||
| # row to have the same implementation as the sparse case, which requires a | ||
| # CSR matrix. | ||
| core_distances = np.partition( | ||
| distance_matrix, | ||
| min_points, | ||
| axis=0, | ||
| )[min_points] | ||
| distance_matrix, further_neighbor_idx, axis=1 | ||
| )[:, further_neighbor_idx] | ||
|
|
||
| with nogil: | ||
| for i in range(n_samples): | ||
| for j in prange(n_samples): | ||
| mr_dist = max( | ||
| mutual_reachibility_distance = max( | ||
| core_distances[i], | ||
| core_distances[j], | ||
| distance_matrix[i, j] | ||
| distance_matrix[i, j], | ||
| ) | ||
| distance_matrix[i, j] = mr_dist | ||
| return distance_matrix | ||
| distance_matrix[i, j] = mutual_reachibility_distance | ||
|
|
||
|
|
||
| # Assumes LIL format. | ||
| # TODO: Rewrite for CSR. | ||
| cdef _sparse_mutual_reachability( | ||
| object distance_matrix, | ||
| cnp.intp_t min_points=5, | ||
| cnp.float64_t max_dist=0. | ||
| def _sparse_mutual_reachability_graph( | ||
| cnp.ndarray[floating, ndim=1, mode="c"] data, | ||
| cnp.ndarray[integral, ndim=1, mode="c"] indices, | ||
| cnp.ndarray[integral, ndim=1, mode="c"] indptr, | ||
| cnp.intp_t n_samples, | ||
| cnp.intp_t further_neighbor_idx, | ||
| floating max_distance, | ||
| ): | ||
| cdef cnp.intp_t i, j, n, n_samples = distance_matrix.shape[0] | ||
| cdef cnp.float64_t mr_dist | ||
| cdef cnp.float64_t[:] core_distances | ||
| cdef cnp.int32_t[:] nz_row_data, nz_col_data | ||
| core_distances = np.empty(n_samples, dtype=np.float64) | ||
| """Sparse implementation of mutual reachability graph. | ||
|
|
||
| The computation is done in-place, i.e. the distance matrix is modified | ||
| directly. This implementation only accepts `CSR` format sparse matrices. | ||
|
|
||
| Parameters | ||
| ---------- | ||
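| data : 1D ndarray | ||
| The `data` array of the `CSR` sparse matrix of distances between | ||
| samples. It is modified in-place. | ||
|
|
||
| indices : 1D ndarray | ||
| The `indices` array of the `CSR` sparse matrix of distances. | ||
|
|
||
| indptr : 1D ndarray | ||
| The `indptr` array of the `CSR` sparse matrix of distances. | ||
|
|
||
| n_samples : int | ||
| The number of samples, i.e. the number of rows of the distance matrix. | ||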
|
|
||
| further_neighbor_idx : int | ||
| The index of the furthest neighbor to use to define the core distances. | ||
|
|
||
| max_distance : float | ||
| The distance which `np.inf` is replaced with. When the true mutual- | ||
| reachability distance is measured to be infinite, it is instead | ||
| truncated to `max_distance`. Only used when `distance_matrix` is a sparse | ||
| matrix. | ||
| """ | ||
| cdef: | ||
| integral i, col_ind, row_ind | ||
| floating mutual_reachibility_distance | ||
| floating[:] core_distances | ||
| floating[:] row_data | ||
|
|
||
| if floating is float: | ||
| dtype = np.float32 | ||
| else: | ||
| dtype = np.float64 | ||
|
|
||
| core_distances = np.empty(n_samples, dtype=dtype) | ||
|
|
||
| for i in range(n_samples): | ||
| if min_points < len(distance_matrix.data[i]): | ||
| row_data = data[indptr[i]:indptr[i + 1]] | ||
| if further_neighbor_idx < row_data.size: | ||
| core_distances[i] = np.partition( | ||
| distance_matrix.data[i], | ||
| min_points | ||
| )[min_points] | ||
| row_data, further_neighbor_idx | ||
| )[further_neighbor_idx] | ||
| else: | ||
| core_distances[i] = np.infty | ||
|
|
||
| nz_row_data, nz_col_data = distance_matrix.nonzero() | ||
| for n in range(nz_row_data.shape[0]): | ||
| i = nz_row_data[n] | ||
| j = nz_col_data[n] | ||
| mr_dist = max(core_distances[i], core_distances[j], distance_matrix[i, j]) | ||
| if isfinite(mr_dist): | ||
| distance_matrix[i, j] = mr_dist | ||
| elif max_dist > 0: | ||
| distance_matrix[i, j] = max_dist | ||
| return distance_matrix | ||
| core_distances[i] = INFINITY | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This would avoid a Python interaction. |
||
|
|
||
| with nogil: | ||
| for row_ind in range(n_samples): | ||
| for i in range(indptr[row_ind], indptr[row_ind + 1]): | ||
| col_ind = indices[i] | ||
| mutual_reachibility_distance = max( | ||
| core_distances[row_ind], core_distances[col_ind], data[i] | ||
| ) | ||
| if isfinite(mutual_reachibility_distance): | ||
| data[i] = mutual_reachibility_distance | ||
| elif max_distance > 0: | ||
| data[i] = max_distance | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added fused type directly.